author     Baptiste Daroussin <bapt@FreeBSD.org>    2019-02-26 08:18:34 +0000
committer  Baptiste Daroussin <bapt@FreeBSD.org>    2019-02-26 08:18:34 +0000
commit     0b858c82d8a4fe3740870e2db04729d7c89aa505 (patch)
tree       6fb7b000a2f18b5667a69d77a6e14db6f6cbc103 /cddl
parent     8df8b2d3e51d1b816201d8a1fe8bc29fe192e562 (diff)
Implement parallel mounting for ZFS filesystem
It was first implemented on Illumos and then ported to ZoL; this patch is a port of the ZoL version to FreeBSD. It also includes the follow-up fix for a race condition in the original implementation.

With this change Delphix has seen a huge decrease in the latency of the mount phase (see https://github.com/openzfs/openzfs/commit/a3f0e2b569 for details), and Gandi has measured improvements on par with those reported by Delphix.

ZoL commits incorporated:
https://github.com/zfsonlinux/zfs/commit/a10d50f999511d304f910852c7825c70c9c9e303
https://github.com/zfsonlinux/zfs/commit/e63ac16d25fbe991a356489c86d4077567dfea21

Reviewed by:    avg, sef
Approved by:    avg, sef
Obtained from:  ZoL
MFC after:      1 month
Relnotes:       yes
Sponsored by:   Gandi.net
Differential Revision:  https://reviews.freebsd.org/D19098
Notes:
    svn path=/head/; revision=344569
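For context before the diff: the heart of this change is the new libzfs entry point zfs_foreach_mountpoint() (declared in libzfs.h below), which sorts a caller-supplied array of dataset handles by mountpoint and issues a callback for each, either serially or through a thread pool, so that parents are always mounted before their children. The following is a minimal consumer sketch of that flow, mirroring how the patched zpool_enable_datasets() drives it; the pool name "tank" and the collect_cb/mount_one helpers are hypothetical, and error handling is elided.

#include <stdlib.h>
#include <libzfs.h>

/* Hypothetical collector: add each filesystem and recurse into its children. */
static int
collect_cb(zfs_handle_t *zhp, void *arg)
{
	get_all_cb_t *cbp = arg;

	libzfs_add_handle(cbp, zhp);
	return (zfs_iter_filesystems(zhp, collect_cb, cbp));
}

/* Hypothetical per-filesystem callback: mount with default options. */
static int
mount_one(zfs_handle_t *zhp, void *arg)
{
	(void) arg;
	return (zfs_mount(zhp, NULL, 0));
}

int
main(void)
{
	libzfs_handle_t *hdl;
	get_all_cb_t cb = { 0 };
	zfs_handle_t *root;

	if ((hdl = libzfs_init()) == NULL)
		return (1);
	/* "tank" is a placeholder pool/root dataset name. */
	if ((root = zfs_open(hdl, "tank", ZFS_TYPE_FILESYSTEM)) == NULL)
		return (1);

	/* Gather the root filesystem and all of its descendants. */
	libzfs_add_handle(&cb, root);
	(void) zfs_iter_filesystems(root, collect_cb, &cb);

	/* Mount everything: parents before children, siblings in parallel. */
	zfs_foreach_mountpoint(hdl, cb.cb_handles, cb.cb_used,
	    mount_one, NULL, B_TRUE);

	for (size_t i = 0; i < cb.cb_used; i++)
		zfs_close(cb.cb_handles[i]);
	free(cb.cb_handles);
	libzfs_fini(hdl);
	return (0);
}

In the patch itself, share_mount() in zfs_main.c follows the same pattern but passes op == OP_MOUNT as the parallel flag, since libshare is not MT-safe and sharing must stay serial.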
Diffstat (limited to 'cddl')
-rw-r--r--  cddl/contrib/opensolaris/cmd/zfs/zfs_main.c                   124
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h             6
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c    31
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h        9
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c     424
5 files changed, 464 insertions, 130 deletions
diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
index bbf2d4ba262b..89f2bdcb2d21 100644
--- a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
+++ b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
@@ -5812,7 +5812,12 @@ zfs_do_holds(int argc, char **argv)
#define CHECK_SPINNER 30
#define SPINNER_TIME 3 /* seconds */
-#define MOUNT_TIME 5 /* seconds */
+#define MOUNT_TIME 1 /* seconds */
+
+typedef struct get_all_state {
+ boolean_t ga_verbose;
+ get_all_cb_t *ga_cbp;
+} get_all_state_t;
static int
get_one_dataset(zfs_handle_t *zhp, void *data)
@@ -5821,10 +5826,10 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
static int spinval = 0;
static int spincheck = 0;
static time_t last_spin_time = (time_t)0;
- get_all_cb_t *cbp = data;
+ get_all_state_t *state = data;
zfs_type_t type = zfs_get_type(zhp);
- if (cbp->cb_verbose) {
+ if (state->ga_verbose) {
if (--spincheck < 0) {
time_t now = time(NULL);
if (last_spin_time + SPINNER_TIME < now) {
@@ -5850,25 +5855,23 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
zfs_close(zhp);
return (0);
}
- libzfs_add_handle(cbp, zhp);
- assert(cbp->cb_used <= cbp->cb_alloc);
+ libzfs_add_handle(state->ga_cbp, zhp);
+ assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc);
return (0);
}
static void
-get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t verbose)
+get_all_datasets(get_all_cb_t *cbp, boolean_t verbose)
{
- get_all_cb_t cb = { 0 };
- cb.cb_verbose = verbose;
- cb.cb_getone = get_one_dataset;
+ get_all_state_t state = {
+ .ga_verbose = verbose,
+ .ga_cbp = cbp
+ };
if (verbose)
set_progress_header(gettext("Reading ZFS config"));
- (void) zfs_iter_root(g_zfs, get_one_dataset, &cb);
-
- *dslist = cb.cb_handles;
- *count = cb.cb_used;
+ (void) zfs_iter_root(g_zfs, get_one_dataset, &state);
if (verbose)
finish_progress(gettext("done."));
@@ -5879,8 +5882,19 @@ get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t verbose)
* similar, we have a common function with an extra parameter to determine which
* mode we are using.
*/
-#define OP_SHARE 0x1
-#define OP_MOUNT 0x2
+typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t;
+
+typedef struct share_mount_state {
+ share_mount_op_t sm_op;
+ boolean_t sm_verbose;
+ int sm_flags;
+ char *sm_options;
+ char *sm_proto; /* only valid for OP_SHARE */
+ pthread_mutex_t sm_lock; /* protects the remaining fields */
+ uint_t sm_total; /* number of filesystems to process */
+ uint_t sm_done; /* number of filesystems processed */
+ int sm_status; /* -1 if any of the share/mount operations failed */
+} share_mount_state_t;
/*
* Share or mount a dataset.
@@ -6122,6 +6136,29 @@ report_mount_progress(int current, int total)
update_progress(info);
}
+/*
+ * zfs_foreach_mountpoint() callback that mounts or shares one filesystem and
+ * updates the progress meter
+ */
+static int
+share_mount_one_cb(zfs_handle_t *zhp, void *arg)
+{
+ share_mount_state_t *sms = arg;
+ int ret;
+
+ ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto,
+ B_FALSE, sms->sm_options);
+
+ pthread_mutex_lock(&sms->sm_lock);
+ if (ret != 0)
+ sms->sm_status = ret;
+ sms->sm_done++;
+ if (sms->sm_verbose)
+ report_mount_progress(sms->sm_done, sms->sm_total);
+ pthread_mutex_unlock(&sms->sm_lock);
+ return (ret);
+}
+
static void
append_options(char *mntopts, char *newopts)
{
@@ -6194,8 +6231,6 @@ share_mount(int op, int argc, char **argv)
/* check number of arguments */
if (do_all) {
- zfs_handle_t **dslist = NULL;
- size_t i, count = 0;
char *protocol = NULL;
if (op == OP_SHARE && argc > 0) {
@@ -6216,35 +6251,48 @@ share_mount(int op, int argc, char **argv)
}
start_progress_timer();
- get_all_datasets(&dslist, &count, verbose);
+ get_all_cb_t cb = { 0 };
+ get_all_datasets(&cb, verbose);
- if (count == 0)
+ if (cb.cb_used == 0) {
+ if (options != NULL)
+ free(options);
return (0);
+ }
- qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp);
#ifdef illumos
- sa_init_selective_arg_t sharearg;
- sharearg.zhandle_arr = dslist;
- sharearg.zhandle_len = count;
- if ((ret = zfs_init_libshare_arg(zfs_get_handle(dslist[0]),
- SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != SA_OK) {
- (void) fprintf(stderr,
- gettext("Could not initialize libshare, %d"), ret);
- return (ret);
+ if (op == OP_SHARE) {
+ sa_init_selective_arg_t sharearg;
+ sharearg.zhandle_arr = cb.cb_handles;
+ sharearg.zhandle_len = cb.cb_used;
+ if ((ret = zfs_init_libshare_arg(g_zfs,
+ SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != SA_OK) {
+ (void) fprintf(stderr, gettext(
+ "Could not initialize libshare, %d"), ret);
+ return (ret);
+ }
}
#endif
+ share_mount_state_t share_mount_state = { 0 };
+ share_mount_state.sm_op = op;
+ share_mount_state.sm_verbose = verbose;
+ share_mount_state.sm_flags = flags;
+ share_mount_state.sm_options = options;
+ share_mount_state.sm_proto = protocol;
+ share_mount_state.sm_total = cb.cb_used;
+ pthread_mutex_init(&share_mount_state.sm_lock, NULL);
- for (i = 0; i < count; i++) {
- if (verbose)
- report_mount_progress(i, count);
-
- if (share_mount_one(dslist[i], op, flags, protocol,
- B_FALSE, options) != 0)
- ret = 1;
- zfs_close(dslist[i]);
- }
+ /*
+ * libshare isn't mt-safe, so only do the operation in parallel
+ * if we're mounting.
+ */
+ zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used,
+ share_mount_one_cb, &share_mount_state, op == OP_MOUNT);
+ ret = share_mount_state.sm_status;
- free(dslist);
+ for (int i = 0; i < cb.cb_used; i++)
+ zfs_close(cb.cb_handles[i]);
+ free(cb.cb_handles);
} else if (argc == 0) {
struct mnttab entry;
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
index 4f8a68e4d296..337fd97f8bd3 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
@@ -579,12 +579,12 @@ typedef struct get_all_cb {
zfs_handle_t **cb_handles;
size_t cb_alloc;
size_t cb_used;
- boolean_t cb_verbose;
- int (*cb_getone)(zfs_handle_t *, void *);
} get_all_cb_t;
+void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, size_t,
+ zfs_iter_f, void*, boolean_t);
+
void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *);
-int libzfs_dataset_cmp(const void *, const void *);
/*
* Functions to create and destroy datasets.
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
index fbcc14bcc8fe..fcef8bf16f16 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
@@ -799,6 +799,7 @@ libzfs_mnttab_cache_compare(const void *arg1, const void *arg2)
void
libzfs_mnttab_init(libzfs_handle_t *hdl)
{
+ pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL);
assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0);
avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare,
sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node));
@@ -839,6 +840,7 @@ libzfs_mnttab_fini(libzfs_handle_t *hdl)
free(mtn);
}
avl_destroy(&hdl->libzfs_mnttab_cache);
+ (void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock);
}
void
@@ -853,6 +855,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
{
mnttab_node_t find;
mnttab_node_t *mtn;
+ int ret = ENOENT;
if (!hdl->libzfs_mnttab_enable) {
struct mnttab srch = { 0 };
@@ -868,6 +871,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
return (ENOENT);
}
+ pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
libzfs_mnttab_update(hdl);
@@ -875,9 +879,10 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL);
if (mtn) {
*entry = mtn->mtn_mt;
- return (0);
+ ret = 0;
}
- return (ENOENT);
+ pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
+ return (ret);
}
void
@@ -886,15 +891,17 @@ libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special,
{
mnttab_node_t *mtn;
- if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
- return;
- mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
- mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
- mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
- mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
- mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
- avl_add(&hdl->libzfs_mnttab_cache, mtn);
-}
+ pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
+ if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) {
+ mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
+ mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
+ mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
+ mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
+ mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
+ avl_add(&hdl->libzfs_mnttab_cache, mtn);
+ }
+ pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
+}
void
libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
@@ -902,6 +909,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
mnttab_node_t find;
mnttab_node_t *ret;
+ pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
find.mtn_mt.mnt_special = (char *)fsname;
if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL))
!= NULL) {
@@ -912,6 +920,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
free(ret->mtn_mt.mnt_mntopts);
free(ret);
}
+ pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
}
int
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
index bdb9ffeadf75..0f66ac0aeadb 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
*/
@@ -73,6 +73,13 @@ struct libzfs_handle {
int libzfs_storeerr; /* stuff error messages into buffer */
void *libzfs_sharehdl; /* libshare handle */
boolean_t libzfs_mnttab_enable;
+ /*
+ * We need a lock to handle the case where parallel mount
+ * threads are populating the mnttab cache simultaneously. The
+ * lock only protects the integrity of the avl tree, and does
+ * not protect the contents of the mnttab entries themselves.
+ */
+ pthread_mutex_t libzfs_mnttab_cache_lock;
avl_tree_t libzfs_mnttab_cache;
int libzfs_pool_iter;
libzfs_fru_t **libzfs_fru_hash;
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
index d8723d37d385..27d98ba4fe12 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
@@ -26,6 +26,7 @@
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
* Copyright 2017 Joyent, Inc.
* Copyright 2017 RackTop Systems.
+ * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
*/
/*
@@ -34,25 +35,25 @@
* they are used by mount and unmount and when changing a filesystem's
* mountpoint.
*
- * zfs_is_mounted()
- * zfs_mount()
- * zfs_unmount()
- * zfs_unmountall()
+ * zfs_is_mounted()
+ * zfs_mount()
+ * zfs_unmount()
+ * zfs_unmountall()
*
* This file also contains the functions used to manage sharing filesystems via
* NFS and iSCSI:
*
- * zfs_is_shared()
- * zfs_share()
- * zfs_unshare()
+ * zfs_is_shared()
+ * zfs_share()
+ * zfs_unshare()
*
- * zfs_is_shared_nfs()
- * zfs_is_shared_smb()
- * zfs_share_proto()
- * zfs_shareall();
- * zfs_unshare_nfs()
- * zfs_unshare_smb()
- * zfs_unshareall_nfs()
+ * zfs_is_shared_nfs()
+ * zfs_is_shared_smb()
+ * zfs_share_proto()
+ * zfs_shareall();
+ * zfs_unshare_nfs()
+ * zfs_unshare_smb()
+ * zfs_unshareall_nfs()
* zfs_unshareall_smb()
* zfs_unshareall()
* zfs_unshareall_bypath()
@@ -60,8 +61,8 @@
* The following functions are available for pool consumers, and will
* mount/unmount and share/unshare all datasets within pool:
*
- * zpool_enable_datasets()
- * zpool_disable_datasets()
+ * zpool_enable_datasets()
+ * zpool_disable_datasets()
*/
#include <dirent.h>
@@ -83,10 +84,14 @@
#include <libzfs.h>
#include "libzfs_impl.h"
+#include <thread_pool.h>
#include <libshare.h>
#define MAXISALEN 257 /* based on sysinfo(2) man page */
+static int mount_tp_nthr = 512; /* tpool threads for multi-threaded mounting */
+
+static void zfs_mount_task(void *);
static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
zfs_share_proto_t);
@@ -1134,25 +1139,32 @@ remove_mountpoint(zfs_handle_t *zhp)
}
}
+/*
+ * Add the given zfs handle to the cb_handles array, dynamically reallocating
+ * the array if it is out of space
+ */
void
libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
{
if (cbp->cb_alloc == cbp->cb_used) {
size_t newsz;
- void *ptr;
+ zfs_handle_t **newhandles;
- newsz = cbp->cb_alloc ? cbp->cb_alloc * 2 : 64;
- ptr = zfs_realloc(zhp->zfs_hdl,
- cbp->cb_handles, cbp->cb_alloc * sizeof (void *),
- newsz * sizeof (void *));
- cbp->cb_handles = ptr;
+ newsz = cbp->cb_alloc != 0 ? cbp->cb_alloc * 2 : 64;
+ newhandles = zfs_realloc(zhp->zfs_hdl,
+ cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *),
+ newsz * sizeof (zfs_handle_t *));
+ cbp->cb_handles = newhandles;
cbp->cb_alloc = newsz;
}
cbp->cb_handles[cbp->cb_used++] = zhp;
}
+/*
+ * Recursive helper function used during file system enumeration
+ */
static int
-mount_cb(zfs_handle_t *zhp, void *data)
+zfs_iter_cb(zfs_handle_t *zhp, void *data)
{
get_all_cb_t *cbp = data;
@@ -1178,104 +1190,362 @@ mount_cb(zfs_handle_t *zhp, void *data)
}
libzfs_add_handle(cbp, zhp);
- if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) {
+ if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) {
zfs_close(zhp);
return (-1);
}
return (0);
}
-int
-libzfs_dataset_cmp(const void *a, const void *b)
+/*
+ * Sort comparator that compares two mountpoint paths. We sort these paths so
+ * that subdirectories immediately follow their parents. This means that we
+ * effectively treat the '/' character as the lowest value non-nul char.
+ * Since filesystems from non-global zones can have the same mountpoint
+ * as other filesystems, the comparator sorts global zone filesystems to
+ * the top of the list. This means that the global zone will traverse the
+ * filesystem list in the correct order and can stop when it sees the
+ * first zoned filesystem. In a non-global zone, only the delegated
+ * filesystems are seen.
+ *
+ * An example sorted list using this comparator would look like:
+ *
+ * /foo
+ * /foo/bar
+ * /foo/bar/baz
+ * /foo/baz
+ * /foo.bar
+ * /foo (NGZ1)
+ * /foo (NGZ2)
+ *
+ * The mount code depends on this ordering to deterministically iterate
+ * over filesystems in order to spawn parallel mount tasks.
+ */
+static int
+mountpoint_cmp(const void *arga, const void *argb)
{
- zfs_handle_t **za = (zfs_handle_t **)a;
- zfs_handle_t **zb = (zfs_handle_t **)b;
+ zfs_handle_t *const *zap = arga;
+ zfs_handle_t *za = *zap;
+ zfs_handle_t *const *zbp = argb;
+ zfs_handle_t *zb = *zbp;
char mounta[MAXPATHLEN];
char mountb[MAXPATHLEN];
+ const char *a = mounta;
+ const char *b = mountb;
boolean_t gota, gotb;
+ uint64_t zoneda, zonedb;
- if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
- verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
+ zoneda = zfs_prop_get_int(za, ZFS_PROP_ZONED);
+ zonedb = zfs_prop_get_int(zb, ZFS_PROP_ZONED);
+ if (zoneda && !zonedb)
+ return (1);
+ if (!zoneda && zonedb)
+ return (-1);
+ gota = (zfs_get_type(za) == ZFS_TYPE_FILESYSTEM);
+ if (gota)
+ verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta,
sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
- if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
- verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
+ gotb = (zfs_get_type(zb) == ZFS_TYPE_FILESYSTEM);
+ if (gotb)
+ verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb,
sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
- if (gota && gotb)
- return (strcmp(mounta, mountb));
+ if (gota && gotb) {
+ while (*a != '\0' && (*a == *b)) {
+ a++;
+ b++;
+ }
+ if (*a == *b)
+ return (0);
+ if (*a == '\0')
+ return (-1);
+ if (*b == '\0')
+ return (1);
+ if (*a == '/')
+ return (-1);
+ if (*b == '/')
+ return (1);
+ return (*a < *b ? -1 : *a > *b);
+ }
if (gota)
return (-1);
if (gotb)
return (1);
- return (strcmp(zfs_get_name(a), zfs_get_name(b)));
+ /*
+ * If neither filesystem has a mountpoint, revert to sorting by
+ * dataset name.
+ */
+ return (strcmp(zfs_get_name(za), zfs_get_name(zb)));
+}
+
+/*
+ * Return true if path2 is a child of path1
+ */
+static boolean_t
+libzfs_path_contains(const char *path1, const char *path2)
+{
+ return (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/');
+}
+
+
+static int
+non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int idx)
+{
+ char parent[ZFS_MAXPROPLEN];
+ char child[ZFS_MAXPROPLEN];
+ int i;
+
+ verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, parent,
+ sizeof (parent), NULL, NULL, 0, B_FALSE) == 0);
+
+ for (i = idx + 1; i < num_handles; i++) {
+ verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child,
+ sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
+ if (!libzfs_path_contains(parent, child))
+ break;
+ }
+ return (i);
+}
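/*
 * Illustrative trace (hypothetical mountpoints): with the handles sorted by
 * mountpoint_cmp() as
 *
 *	[0] /foo
 *	[1] /foo/bar
 *	[2] /foo/bar/baz
 *	[3] /foo/baz
 *	[4] /foo.bar
 *
 * non_descendant_idx() returns the index of the first entry whose mountpoint
 * is not underneath handles[idx]:
 *
 *	non_descendant_idx(handles, 5, 0) == 4	(skips everything under /foo)
 *	non_descendant_idx(handles, 5, 1) == 3	(only /foo/bar/baz is under /foo/bar)
 *	non_descendant_idx(handles, 5, 4) == 5	(/foo.bar has no descendants here)
 *
 * Both the top-level loop in zfs_foreach_mountpoint() and zfs_mount_task()
 * use this to step from one independent mountpoint subtree to the next.
 */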
+
+typedef struct mnt_param {
+ libzfs_handle_t *mnt_hdl;
+ tpool_t *mnt_tp;
+ zfs_handle_t **mnt_zhps; /* filesystems to mount */
+ size_t mnt_num_handles;
+ int mnt_idx; /* Index of selected entry to mount */
+ zfs_iter_f mnt_func;
+ void *mnt_data;
+} mnt_param_t;
+
+/*
+ * Allocate and populate the parameter struct for mount function, and
+ * schedule mounting of the entry selected by idx.
+ */
+static void
+zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles,
+ size_t num_handles, int idx, zfs_iter_f func, void *data, tpool_t *tp)
+{
+ mnt_param_t *mnt_param = zfs_alloc(hdl, sizeof (mnt_param_t));
+
+ mnt_param->mnt_hdl = hdl;
+ mnt_param->mnt_tp = tp;
+ mnt_param->mnt_zhps = handles;
+ mnt_param->mnt_num_handles = num_handles;
+ mnt_param->mnt_idx = idx;
+ mnt_param->mnt_func = func;
+ mnt_param->mnt_data = data;
+
+ (void) tpool_dispatch(tp, zfs_mount_task, (void*)mnt_param);
+}
+
+/*
+ * This is the structure used to keep state of mounting or sharing operations
+ * during a call to zpool_enable_datasets().
+ */
+typedef struct mount_state {
+ /*
+ * ms_mntstatus is set to -1 if any mount fails. While multiple threads
+ * could update this variable concurrently, no synchronization is
+ * needed as it's only ever set to -1.
+ */
+ int ms_mntstatus;
+ int ms_mntflags;
+ const char *ms_mntopts;
+} mount_state_t;
+
+static int
+zfs_mount_one(zfs_handle_t *zhp, void *arg)
+{
+ mount_state_t *ms = arg;
+ int ret = 0;
+
+ if (zfs_mount(zhp, ms->ms_mntopts, ms->ms_mntflags) != 0)
+ ret = ms->ms_mntstatus = -1;
+ return (ret);
+}
+
+static int
+zfs_share_one(zfs_handle_t *zhp, void *arg)
+{
+ mount_state_t *ms = arg;
+ int ret = 0;
+
+ if (zfs_share(zhp) != 0)
+ ret = ms->ms_mntstatus = -1;
+ return (ret);
+}
+
+/*
+ * Thread pool function to mount one file system. On completion, it finds and
+ * schedules its children to be mounted. This depends on the sorting done in
+ * zfs_foreach_mountpoint(). Note that the degenerate case (chain of entries
+ * each descending from the previous) will have no parallelism since we always
+ * have to wait for the parent to finish mounting before we can schedule
+ * its children.
+ */
+static void
+zfs_mount_task(void *arg)
+{
+ mnt_param_t *mp = arg;
+ int idx = mp->mnt_idx;
+ zfs_handle_t **handles = mp->mnt_zhps;
+ size_t num_handles = mp->mnt_num_handles;
+ char mountpoint[ZFS_MAXPROPLEN];
+
+ verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint,
+ sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
+
+ if (mp->mnt_func(handles[idx], mp->mnt_data) != 0)
+ return;
+
+ /*
+ * We dispatch tasks to mount filesystems with mountpoints underneath
+ * this one. We do this by dispatching the next filesystem with a
+ * descendant mountpoint of the one we just mounted, then skip all of
+ * its descendants, dispatch the next descendant mountpoint, and so on.
+ * The non_descendant_idx() function skips over filesystems that are
+ * descendants of the filesystem we just dispatched.
+ */
+ for (int i = idx + 1; i < num_handles;
+ i = non_descendant_idx(handles, num_handles, i)) {
+ char child[ZFS_MAXPROPLEN];
+ verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT,
+ child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
+
+ if (!libzfs_path_contains(mountpoint, child))
+ break; /* not a descendant, return */
+ zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i,
+ mp->mnt_func, mp->mnt_data, mp->mnt_tp);
+ }
+ free(mp);
+}
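/*
 * Illustrative dispatch order (same hypothetical mountpoints as above): the
 * top-level loop in zfs_foreach_mountpoint() dispatches /foo and /foo.bar.
 * The task mounting /foo then dispatches /foo/bar and /foo/baz (using
 * non_descendant_idx() to skip /foo/bar/baz, a descendant of /foo/bar), and
 * the /foo/bar task in turn dispatches /foo/bar/baz. Siblings such as
 * /foo/bar and /foo/baz mount in parallel, while each child still waits for
 * its parent's mount to complete.
 */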
+
+/*
+ * Issue the func callback for each ZFS handle contained in the handles
+ * array. This function is used to mount all datasets, and so this function
+ * guarantees that filesystems for parent mountpoints are called before their
+ * children. As such, before issuing any callbacks, we first sort the array
+ * of handles by mountpoint.
+ *
+ * Callbacks are issued in one of two ways:
+ *
+ * 1. Sequentially: If the parallel argument is B_FALSE or the ZFS_SERIAL_MOUNT
+ * environment variable is set, then we issue callbacks sequentially.
+ *
+ * 2. In parallel: If the parallel argument is B_TRUE and the ZFS_SERIAL_MOUNT
+ * environment variable is not set, then we use a tpool to dispatch threads
+ * to mount filesystems in parallel. This function dispatches tasks to mount
+ * the filesystems at the top-level mountpoints, and these tasks in turn
+ * are responsible for recursively mounting filesystems in their children
+ * mountpoints.
+ */
+void
+zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles,
+ size_t num_handles, zfs_iter_f func, void *data, boolean_t parallel)
+{
+ zoneid_t zoneid = getzoneid();
+
+ /*
+ * The ZFS_SERIAL_MOUNT environment variable is an undocumented
+ * variable that can be used as a convenience to do a/b comparison
+ * of serial vs. parallel mounting.
+ */
+ boolean_t serial_mount = !parallel ||
+ (getenv("ZFS_SERIAL_MOUNT") != NULL);
+
+ /*
+ * Sort the datasets by mountpoint. See mountpoint_cmp for details
+ * of how these are sorted.
+ */
+ qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp);
+
+ if (serial_mount) {
+ for (int i = 0; i < num_handles; i++) {
+ func(handles[i], data);
+ }
+ return;
+ }
+
+ /*
+ * Issue the callback function for each dataset using a parallel
+ * algorithm that uses a thread pool to manage threads.
+ */
+ tpool_t *tp = tpool_create(1, mount_tp_nthr, 0, NULL);
+
+ /*
+ * There may be multiple "top level" mountpoints outside of the pool's
+ * root mountpoint, e.g.: /foo /bar. Dispatch a mount task for each of
+ * these.
+ */
+ for (int i = 0; i < num_handles;
+ i = non_descendant_idx(handles, num_handles, i)) {
+ /*
+ * Since the mountpoints have been sorted so that the zoned
+ * filesystems are at the end, a zoned filesystem seen from
+ * the global zone means that we're done.
+ */
+ if (zoneid == GLOBAL_ZONEID &&
+ zfs_prop_get_int(handles[i], ZFS_PROP_ZONED))
+ break;
+ zfs_dispatch_mount(hdl, handles, num_handles, i, func, data,
+ tp);
+ }
+
+ tpool_wait(tp); /* wait for all scheduled mounts to complete */
+ tpool_destroy(tp);
}
/*
* Mount and share all datasets within the given pool. This assumes that no
- * datasets within the pool are currently mounted. Because users can create
- * complicated nested hierarchies of mountpoints, we first gather all the
- * datasets and mountpoints within the pool, and sort them by mountpoint. Once
- * we have the list of all filesystems, we iterate over them in order and mount
- * and/or share each one.
+ * datasets within the pool are currently mounted.
*/
#pragma weak zpool_mount_datasets = zpool_enable_datasets
int
zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
{
get_all_cb_t cb = { 0 };
- libzfs_handle_t *hdl = zhp->zpool_hdl;
+ mount_state_t ms = { 0 };
zfs_handle_t *zfsp;
- int i, ret = -1;
- int *good;
+ int ret = 0;
- /*
- * Gather all non-snap datasets within the pool.
- */
- if ((zfsp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) == NULL)
+ if ((zfsp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
+ ZFS_TYPE_DATASET)) == NULL)
goto out;
- libzfs_add_handle(&cb, zfsp);
- if (zfs_iter_filesystems(zfsp, mount_cb, &cb) != 0)
- goto out;
/*
- * Sort the datasets by mountpoint.
+ * Gather all non-snapshot datasets within the pool. Start by adding
+ * the root filesystem for this pool to the list, and then iterate
+ * over all child filesystems.
*/
- qsort(cb.cb_handles, cb.cb_used, sizeof (void *),
- libzfs_dataset_cmp);
+ libzfs_add_handle(&cb, zfsp);
+ if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) != 0)
+ goto out;
/*
- * And mount all the datasets, keeping track of which ones
- * succeeded or failed.
+ * Mount all filesystems
*/
- if ((good = zfs_alloc(zhp->zpool_hdl,
- cb.cb_used * sizeof (int))) == NULL)
- goto out;
-
- ret = 0;
- for (i = 0; i < cb.cb_used; i++) {
- if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0)
- ret = -1;
- else
- good[i] = 1;
- }
+ ms.ms_mntopts = mntopts;
+ ms.ms_mntflags = flags;
+ zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
+ zfs_mount_one, &ms, B_TRUE);
+ if (ms.ms_mntstatus != 0)
+ ret = ms.ms_mntstatus;
/*
- * Then share all the ones that need to be shared. This needs
- * to be a separate pass in order to avoid excessive reloading
- * of the configuration. Good should never be NULL since
- * zfs_alloc is supposed to exit if memory isn't available.
+ * Share all filesystems that need to be shared. This needs to be
+ * a separate pass because libshare is not mt-safe, and so we need
+ * to share serially.
*/
- for (i = 0; i < cb.cb_used; i++) {
- if (good[i] && zfs_share(cb.cb_handles[i]) != 0)
- ret = -1;
- }
-
- free(good);
+ ms.ms_mntstatus = 0;
+ zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
+ zfs_share_one, &ms, B_FALSE);
+ if (ms.ms_mntstatus != 0)
+ ret = ms.ms_mntstatus;
out:
- for (i = 0; i < cb.cb_used; i++)
+ for (int i = 0; i < cb.cb_used; i++)
zfs_close(cb.cb_handles[i]);
free(cb.cb_handles);