105 files changed, 71676 insertions, 0 deletions
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c b/sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c
new file mode 100644
index 000000000000..66e27cefa396
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c
@@ -0,0 +1,1709 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/avl.h>
+#include <sys/misc.h>
+#if defined(_KERNEL)
+#include <sys/kmem.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <acl/acl_common.h>
+#include <sys/debug.h>
+#else
+#include <errno.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <grp.h>
+#include <pwd.h>
+#include <acl_common.h>
+#define	ASSERT	assert
+#endif
+
+#define	ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \
+    ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \
+    ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL)
+
+
+#define	ACL_SYNCHRONIZE_SET_DENY		0x0000001
+#define	ACL_SYNCHRONIZE_SET_ALLOW		0x0000002
+#define	ACL_SYNCHRONIZE_ERR_DENY		0x0000004
+#define	ACL_SYNCHRONIZE_ERR_ALLOW		0x0000008
+
+#define	ACL_WRITE_OWNER_SET_DENY		0x0000010
+#define	ACL_WRITE_OWNER_SET_ALLOW		0x0000020
+#define	ACL_WRITE_OWNER_ERR_DENY		0x0000040
+#define	ACL_WRITE_OWNER_ERR_ALLOW		0x0000080
+
+#define	ACL_DELETE_SET_DENY			0x0000100
+#define	ACL_DELETE_SET_ALLOW			0x0000200
+#define	ACL_DELETE_ERR_DENY			0x0000400
+#define	ACL_DELETE_ERR_ALLOW			0x0000800
+
+#define	ACL_WRITE_ATTRS_OWNER_SET_DENY		0x0001000
+#define	ACL_WRITE_ATTRS_OWNER_SET_ALLOW		0x0002000
+#define	ACL_WRITE_ATTRS_OWNER_ERR_DENY		0x0004000
+#define	ACL_WRITE_ATTRS_OWNER_ERR_ALLOW		0x0008000
+
+#define	ACL_WRITE_ATTRS_WRITER_SET_DENY		0x0010000
+#define	ACL_WRITE_ATTRS_WRITER_SET_ALLOW	0x0020000
+#define	ACL_WRITE_ATTRS_WRITER_ERR_DENY		0x0040000
+#define	ACL_WRITE_ATTRS_WRITER_ERR_ALLOW	0x0080000
+
+#define	ACL_WRITE_NAMED_WRITER_SET_DENY		0x0100000
+#define	ACL_WRITE_NAMED_WRITER_SET_ALLOW	0x0200000
+#define	ACL_WRITE_NAMED_WRITER_ERR_DENY		0x0400000
+#define	ACL_WRITE_NAMED_WRITER_ERR_ALLOW	0x0800000
+
+#define	ACL_READ_NAMED_READER_SET_DENY		0x1000000
+#define	ACL_READ_NAMED_READER_SET_ALLOW		0x2000000
+#define	ACL_READ_NAMED_READER_ERR_DENY		0x4000000
+#define	ACL_READ_NAMED_READER_ERR_ALLOW		0x8000000
+
+
+#define	ACE_VALID_MASK_BITS (\
+    ACE_READ_DATA | \
+    ACE_LIST_DIRECTORY | \
+    ACE_WRITE_DATA | \
+    ACE_ADD_FILE | \
+    ACE_APPEND_DATA | \
+    ACE_ADD_SUBDIRECTORY | \
+    ACE_READ_NAMED_ATTRS | \
+    ACE_WRITE_NAMED_ATTRS | \
+    ACE_EXECUTE | \
+    ACE_DELETE_CHILD | \
+    ACE_READ_ATTRIBUTES | \
+    ACE_WRITE_ATTRIBUTES | \
+    ACE_DELETE | \
+    ACE_READ_ACL | \
+    ACE_WRITE_ACL | \
+    ACE_WRITE_OWNER | \
+    ACE_SYNCHRONIZE)
+
+#define	ACE_MASK_UNDEFINED			0x80000000
+
+#define	ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \
+    ACE_DIRECTORY_INHERIT_ACE | \
+    ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \
+    ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \
+    ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE)
+
+/*
+ * ACL conversion helpers
+ */
+
+typedef enum {
+	ace_unused,
+	ace_user_obj,
+	ace_user,
+	ace_group, /* includes GROUP and GROUP_OBJ */
+	ace_other_obj
+} ace_to_aent_state_t;
+
+typedef struct acevals {
+	uid_t key;
+	avl_node_t avl;
+	uint32_t mask;
+	uint32_t allowed;
+	uint32_t denied;
+	int aent_type;
+} acevals_t;
+
+typedef struct ace_list {
+	acevals_t user_obj;
+	avl_tree_t user;
+	int numusers;
+	acevals_t group_obj;
+	avl_tree_t group;
+	int numgroups;
+	acevals_t other_obj;
+	uint32_t acl_mask;
+	int hasmask;
+	int dfacl_flag;
+	ace_to_aent_state_t state;
+	int seen; /* bitmask of all aclent_t a_type values seen */
+} ace_list_t;
+
+/*
+ * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified.
+ * v = Ptr to array/vector of objs
+ * n = # objs in the array
+ * s = size of each obj (must be multiples of a word size)
+ * f = ptr to function to compare two objs
+ *	returns (-1 = less than, 0 = equal, 1 = greater than
+ */
+void
+ksort(caddr_t v, int n, int s, int (*f)(void *, void *))
+{
+	int g, i, j, ii;
+	unsigned int *p1, *p2;
+	unsigned int tmp;
+
+	/* No work to do */
+	if (v == NULL || n <= 1)
+		return;
+
+	/* Sanity check on arguments */
+	ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0);
+	ASSERT(s > 0);
+	for (g = n / 2; g > 0; g /= 2) {
+		for (i = g; i < n; i++) {
+			for (j = i - g; j >= 0 &&
+			    (*f)(v + j * s, v + (j + g) * s) == 1;
+			    j -= g) {
+				p1 = (void *)(v + j * s);
+				p2 = (void *)(v + (j + g) * s);
+				for (ii = 0; ii < s / 4; ii++) {
+					tmp = *p1;
+					*p1++ = *p2;
+					*p2++ = tmp;
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Compare two acls, all fields.  Returns:
+ * -1 (less than)
+ *  0 (equal)
+ * +1 (greater than)
+ */
+int
+cmp2acls(void *a, void *b)
+{
+	aclent_t *x = (aclent_t *)a;
+	aclent_t *y = (aclent_t *)b;
+
+	/* Compare types */
+	if (x->a_type < y->a_type)
+		return (-1);
+	if (x->a_type > y->a_type)
+		return (1);
+	/* Equal types; compare id's */
+	if (x->a_id < y->a_id)
+		return (-1);
+	if (x->a_id > y->a_id)
+		return (1);
+	/* Equal ids; compare perms */
+	if (x->a_perm < y->a_perm)
+		return (-1);
+	if (x->a_perm > y->a_perm)
+		return (1);
+	/* Totally equal */
+	return (0);
+}
+
+static int
+cacl_malloc(void **ptr, size_t size)
+{
+	*ptr = kmem_zalloc(size, KM_SLEEP);
+	return (0);
+}
+
+
+#if !defined(_KERNEL)
+acl_t *
+acl_alloc(enum acl_type type)
+{
+	acl_t *aclp;
+
+	if (cacl_malloc((void **)&aclp, sizeof (acl_t)) != 0)
+		return (NULL);
+
+	aclp->acl_aclp = NULL;
+	aclp->acl_cnt = 0;
+
+	switch (type) {
+	case ACE_T:
+		aclp->acl_type = ACE_T;
+		aclp->acl_entry_size = sizeof (ace_t);
+		break;
+	case ACLENT_T:
+		aclp->acl_type = ACLENT_T;
+		aclp->acl_entry_size = sizeof (aclent_t);
+		break;
+	default:
+		acl_free(aclp);
+		aclp = NULL;
+	}
+	return (aclp);
+}
+
+/*
+ * Free acl_t structure
+ */
+void
+acl_free(acl_t *aclp)
+{
+	int acl_size;
+
+	if (aclp == NULL)
+		return;
+
+	if (aclp->acl_aclp) {
+		acl_size = aclp->acl_cnt * aclp->acl_entry_size;
+		cacl_free(aclp->acl_aclp, acl_size);
+	}
+
+	cacl_free(aclp, sizeof (acl_t));
+}
+
+static uint32_t
+access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow)
+{
+	uint32_t access_mask = 0;
+	int acl_produce;
+	int synchronize_set = 0, write_owner_set = 0;
+	int delete_set = 0, write_attrs_set = 0;
+	int read_named_set = 0, write_named_set = 0;
+
+	acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW |
+	    ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+	    ACL_WRITE_ATTRS_WRITER_SET_DENY);
+
+	if (isallow) {
+		synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW;
+		write_owner_set = ACL_WRITE_OWNER_SET_ALLOW;
+		delete_set = ACL_DELETE_SET_ALLOW;
+		if (hasreadperm)
+			read_named_set = ACL_READ_NAMED_READER_SET_ALLOW;
+		if (haswriteperm)
+			write_named_set = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+		if (isowner)
+			write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+		else if (haswriteperm)
+			write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+	} else {
+
+		synchronize_set = ACL_SYNCHRONIZE_SET_DENY;
+		write_owner_set = ACL_WRITE_OWNER_SET_DENY;
+		delete_set = ACL_DELETE_SET_DENY;
+		if (hasreadperm)
+			read_named_set = ACL_READ_NAMED_READER_SET_DENY;
+		if (haswriteperm)
+			write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY;
+		if (isowner)
+			write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+		else if (haswriteperm)
+			write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+		else
+			/*
+			 * If the entity is not the owner and does not
+			 * have write permissions ACE_WRITE_ATTRIBUTES will
+			 * always go in the DENY ACE.
+			 */
+			access_mask |= ACE_WRITE_ATTRIBUTES;
+	}
+
+	if (acl_produce & synchronize_set)
+		access_mask |= ACE_SYNCHRONIZE;
+	if (acl_produce & write_owner_set)
+		access_mask |= ACE_WRITE_OWNER;
+	if (acl_produce & delete_set)
+		access_mask |= ACE_DELETE;
+	if (acl_produce & write_attrs_set)
+		access_mask |= ACE_WRITE_ATTRIBUTES;
+	if (acl_produce & read_named_set)
+		access_mask |= ACE_READ_NAMED_ATTRS;
+	if (acl_produce & write_named_set)
+		access_mask |= ACE_WRITE_NAMED_ATTRS;
+
+	return (access_mask);
+}
+
+/*
+ * Given an mode_t, convert it into an access_mask as used
+ * by nfsace, assuming aclent_t -> nfsace semantics.
+ */
+static uint32_t
+mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow)
+{
+	uint32_t access = 0;
+	int haswriteperm = 0;
+	int hasreadperm = 0;
+
+	if (isallow) {
+		haswriteperm = (mode & S_IWOTH);
+		hasreadperm = (mode & S_IROTH);
+	} else {
+		haswriteperm = !(mode & S_IWOTH);
+		hasreadperm = !(mode & S_IROTH);
+	}
+
+	/*
+	 * The following call takes care of correctly setting the following
+	 * mask bits in the access_mask:
+	 * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE,
+	 * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS
+	 */
+	access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow);
+
+	if (isallow) {
+		access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES;
+		if (isowner)
+			access |= ACE_WRITE_ACL;
+	} else {
+		if (! isowner)
+			access |= ACE_WRITE_ACL;
+	}
+
+	/* read */
+	if (mode & S_IROTH) {
+		access |= ACE_READ_DATA;
+	}
+	/* write */
+	if (mode & S_IWOTH) {
+		access |= ACE_WRITE_DATA |
+		    ACE_APPEND_DATA;
+		if (isdir)
+			access |= ACE_DELETE_CHILD;
+	}
+	/* exec */
+	if (mode & S_IXOTH) {
+		access |= ACE_EXECUTE;
+	}
+
+	return (access);
+}
+
+/*
+ * Given an nfsace (presumably an ALLOW entry), make a
+ * corresponding DENY entry at the address given.
+ */
+static void
+ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner)
+{
+	(void) memcpy(deny, allow, sizeof (ace_t));
+
+	deny->a_who = allow->a_who;
+
+	deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+	deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS;
+	if (isdir)
+		deny->a_access_mask ^= ACE_DELETE_CHILD;
+
+	deny->a_access_mask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER |
+	    ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS |
+	    ACE_WRITE_NAMED_ATTRS);
+	deny->a_access_mask |= access_mask_set((allow->a_access_mask &
+	    ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner,
+	    B_FALSE);
+}
+/*
+ * Make an initial pass over an array of aclent_t's.  Gather
+ * information such as an ACL_MASK (if any), number of users,
+ * number of groups, and whether the array needs to be sorted.
+ */
+static int
+ln_aent_preprocess(aclent_t *aclent, int n,
+    int *hasmask, mode_t *mask,
+    int *numuser, int *numgroup, int *needsort)
+{
+	int error = 0;
+	int i;
+	int curtype = 0;
+
+	*hasmask = 0;
+	*mask = 07;
+	*needsort = 0;
+	*numuser = 0;
+	*numgroup = 0;
+
+	for (i = 0; i < n; i++) {
+		if (aclent[i].a_type < curtype)
+			*needsort = 1;
+		else if (aclent[i].a_type > curtype)
+			curtype = aclent[i].a_type;
+		if (aclent[i].a_type & USER)
+			(*numuser)++;
+		if (aclent[i].a_type & (GROUP | GROUP_OBJ))
+			(*numgroup)++;
+		if (aclent[i].a_type & CLASS_OBJ) {
+			if (*hasmask) {
+				error = EINVAL;
+				goto out;
+			} else {
+				*hasmask = 1;
+				*mask = aclent[i].a_perm;
+			}
+		}
+	}
+
+	if ((! *hasmask) && (*numuser + *numgroup > 1)) {
+		error = EINVAL;
+		goto out;
+	}
+
+out:
+	return (error);
+}
+
+/*
+ * Convert an array of aclent_t into an array of nfsace entries,
+ * following POSIX draft -> nfsv4 conversion semantics as outlined in
+ * the IETF draft.
+ */
+static int
+ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir)
+{
+	int error = 0;
+	mode_t mask;
+	int numuser, numgroup, needsort;
+	int resultsize = 0;
+	int i, groupi = 0, skip;
+	ace_t *acep, *result = NULL;
+	int hasmask;
+
+	error = ln_aent_preprocess(aclent, n, &hasmask, &mask,
+	    &numuser, &numgroup, &needsort);
+	if (error != 0)
+		goto out;
+
+	/* allow + deny for each aclent */
+	resultsize = n * 2;
+	if (hasmask) {
+		/*
+		 * stick extra deny on the group_obj and on each
+		 * user|group for the mask (the group_obj was added
+		 * into the count for numgroup)
+		 */
+		resultsize += numuser + numgroup;
+		/* ... and don't count the mask itself */
+		resultsize -= 2;
+	}
+
+	/* sort the source if necessary */
+	if (needsort)
+		ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls);
+
+	if (cacl_malloc((void **)&result, resultsize * sizeof (ace_t)) != 0)
+		goto out;
+
+	acep = result;
+
+	for (i = 0; i < n; i++) {
+		/*
+		 * don't process CLASS_OBJ (mask); mask was grabbed in
+		 * ln_aent_preprocess()
+		 */
+		if (aclent[i].a_type & CLASS_OBJ)
+			continue;
+
+		/* If we need an ACL_MASK emulator, prepend it now */
+		if ((hasmask) &&
+		    (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) {
+			acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+			acep->a_flags = 0;
+			if (aclent[i].a_type & GROUP_OBJ) {
+				acep->a_who = (uid_t)-1;
+				acep->a_flags |=
+				    (ACE_IDENTIFIER_GROUP|ACE_GROUP);
+			} else if (aclent[i].a_type & USER) {
+				acep->a_who = aclent[i].a_id;
+			} else {
+				acep->a_who = aclent[i].a_id;
+				acep->a_flags |= ACE_IDENTIFIER_GROUP;
+			}
+			if (aclent[i].a_type & ACL_DEFAULT) {
+				acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+				    ACE_FILE_INHERIT_ACE |
+				    ACE_DIRECTORY_INHERIT_ACE;
+			}
+			/*
+			 * Set the access mask for the prepended deny
+			 * ace.  To do this, we invert the mask (found
+			 * in ln_aent_preprocess()) then convert it to an
+			 * DENY ace access_mask.
+			 */
+			acep->a_access_mask = mode_to_ace_access((mask ^ 07),
+			    isdir, 0, 0);
+			acep += 1;
+		}
+
+		/* handle a_perm -> access_mask */
+		acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm,
+		    isdir, aclent[i].a_type & USER_OBJ, 1);
+
+		/* emulate a default aclent */
+		if (aclent[i].a_type & ACL_DEFAULT) {
+			acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+			    ACE_FILE_INHERIT_ACE |
+			    ACE_DIRECTORY_INHERIT_ACE;
+		}
+
+		/*
+		 * handle a_perm and a_id
+		 *
+		 * this must be done last, since it involves the
+		 * corresponding deny aces, which are handled
+		 * differently for each different a_type.
+		 */
+		if (aclent[i].a_type & USER_OBJ) {
+			acep->a_who = (uid_t)-1;
+			acep->a_flags |= ACE_OWNER;
+			ace_make_deny(acep, acep + 1, isdir, B_TRUE);
+			acep += 2;
+		} else if (aclent[i].a_type & USER) {
+			acep->a_who = aclent[i].a_id;
+			ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+			acep += 2;
+		} else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) {
+			if (aclent[i].a_type & GROUP_OBJ) {
+				acep->a_who = (uid_t)-1;
+				acep->a_flags |= ACE_GROUP;
+			} else {
+				acep->a_who = aclent[i].a_id;
+			}
+			acep->a_flags |= ACE_IDENTIFIER_GROUP;
+			/*
+			 * Set the corresponding deny for the group ace.
+			 *
+			 * The deny aces go after all of the groups, unlike
+			 * everything else, where they immediately follow
+			 * the allow ace.
+			 *
+			 * We calculate "skip", the number of slots to
+			 * skip ahead for the deny ace, here.
+			 *
+			 * The pattern is:
+			 * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3
+			 * thus, skip is
+			 * (2 * numgroup) - 1 - groupi
+			 * (2 * numgroup) to account for MD + A
+			 * - 1 to account for the fact that we're on the
+			 * access (A), not the mask (MD)
+			 * - groupi to account for the fact that we have
+			 * passed up groupi number of MD's.
+			 */
+			skip = (2 * numgroup) - 1 - groupi;
+			ace_make_deny(acep, acep + skip, isdir, B_FALSE);
+			/*
+			 * If we just did the last group, skip acep past
+			 * all of the denies; else, just move ahead one.
+			 */
+			if (++groupi >= numgroup)
+				acep += numgroup + 1;
+			else
+				acep += 1;
+		} else if (aclent[i].a_type & OTHER_OBJ) {
+			acep->a_who = (uid_t)-1;
+			acep->a_flags |= ACE_EVERYONE;
+			ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+			acep += 2;
+		} else {
+			error = EINVAL;
+			goto out;
+		}
+	}
+
+	*acepp = result;
+	*rescount = resultsize;
+
+out:
+	if (error != 0) {
+		if ((result != NULL) && (resultsize > 0)) {
+			cacl_free(result, resultsize * sizeof (ace_t));
+		}
+	}
+
+	return (error);
+}
+
+static int
+convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir,
+    ace_t **retacep, int *retacecnt)
+{
+	ace_t *acep;
+	ace_t *dfacep;
+	int acecnt = 0;
+	int dfacecnt = 0;
+	int dfaclstart = 0;
+	int dfaclcnt = 0;
+	aclent_t *aclp;
+	int i;
+	int error;
+	int acesz, dfacesz;
+
+	ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls);
+
+	for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) {
+		if (aclp->a_type & ACL_DEFAULT)
+			break;
+	}
+
+	if (i < aclcnt) {
+		dfaclstart = i;
+		dfaclcnt = aclcnt - i;
+	}
+
+	if (dfaclcnt && !isdir) {
+		return (EINVAL);
+	}
+
+	error = ln_aent_to_ace(aclentp, i,  &acep, &acecnt, isdir);
+	if (error)
+		return (error);
+
+	if (dfaclcnt) {
+		error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt,
+		    &dfacep, &dfacecnt, isdir);
+		if (error) {
+			if (acep) {
+				cacl_free(acep, acecnt * sizeof (ace_t));
+			}
+			return (error);
+		}
+	}
+
+	if (dfacecnt != 0) {
+		acesz = sizeof (ace_t) * acecnt;
+		dfacesz = sizeof (ace_t) * dfacecnt;
+		acep = cacl_realloc(acep, acesz, acesz + dfacesz);
+		if (acep == NULL)
+			return (ENOMEM);
+		if (dfaclcnt) {
+			(void) memcpy(acep + acecnt, dfacep, dfacesz);
+		}
+	}
+	if (dfaclcnt)
+		cacl_free(dfacep, dfacecnt * sizeof (ace_t));
+
+	*retacecnt = acecnt + dfacecnt;
+	*retacep = acep;
+	return (0);
+}
+
+static int
+ace_mask_to_mode(uint32_t  mask, o_mode_t *modep, boolean_t isdir)
+{
+	int error = 0;
+	o_mode_t mode = 0;
+	uint32_t bits, wantbits;
+
+	/* read */
+	if (mask & ACE_READ_DATA)
+		mode |= S_IROTH;
+
+	/* write */
+	wantbits = (ACE_WRITE_DATA | ACE_APPEND_DATA);
+	if (isdir)
+		wantbits |= ACE_DELETE_CHILD;
+	bits = mask & wantbits;
+	if (bits != 0) {
+		if (bits != wantbits) {
+			error = ENOTSUP;
+			goto out;
+		}
+		mode |= S_IWOTH;
+	}
+
+	/* exec */
+	if (mask & ACE_EXECUTE) {
+		mode |= S_IXOTH;
+	}
+
+	*modep = mode;
+
+out:
+	return (error);
+}
+
+static void
+acevals_init(acevals_t *vals, uid_t key)
+{
+	bzero(vals, sizeof (*vals));
+	vals->allowed = ACE_MASK_UNDEFINED;
+	vals->denied = ACE_MASK_UNDEFINED;
+	vals->mask = ACE_MASK_UNDEFINED;
+	vals->key = key;
+}
+
+static void
+ace_list_init(ace_list_t *al, int dfacl_flag)
+{
+	acevals_init(&al->user_obj, 0);
+	acevals_init(&al->group_obj, 0);
+	acevals_init(&al->other_obj, 0);
+	al->numusers = 0;
+	al->numgroups = 0;
+	al->acl_mask = 0;
+	al->hasmask = 0;
+	al->state = ace_unused;
+	al->seen = 0;
+	al->dfacl_flag = dfacl_flag;
+}
+
+/*
+ * Find or create an acevals holder for a given id and avl tree.
+ *
+ * Note that only one thread will ever touch these avl trees, so
+ * there is no need for locking.
+ */
+static acevals_t *
+acevals_find(ace_t *ace, avl_tree_t *avl, int *num)
+{
+	acevals_t key, *rc;
+	avl_index_t where;
+
+	key.key = ace->a_who;
+	rc = avl_find(avl, &key, &where);
+	if (rc != NULL)
+		return (rc);
+
+	/* this memory is freed by ln_ace_to_aent()->ace_list_free() */
+	if (cacl_malloc((void **)&rc, sizeof (acevals_t)) != 0)
+		return (NULL);
+
+	acevals_init(rc, ace->a_who);
+	avl_insert(avl, rc, where);
+	(*num)++;
+
+	return (rc);
+}
+
+static int
+access_mask_check(ace_t *acep, int mask_bit, int isowner)
+{
+	int set_deny, err_deny;
+	int set_allow, err_allow;
+	int acl_consume;
+	int haswriteperm, hasreadperm;
+
+	if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
+		haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1;
+		hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1;
+	} else {
+		haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 1 : 0;
+		hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 1 : 0;
+	}
+
+	acl_consume = (ACL_SYNCHRONIZE_ERR_DENY |
+	    ACL_DELETE_ERR_DENY |
+	    ACL_WRITE_OWNER_ERR_DENY |
+	    ACL_WRITE_OWNER_ERR_ALLOW |
+	    ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+	    ACL_WRITE_ATTRS_OWNER_ERR_DENY |
+	    ACL_WRITE_ATTRS_WRITER_SET_DENY |
+	    ACL_WRITE_ATTRS_WRITER_ERR_ALLOW |
+	    ACL_WRITE_NAMED_WRITER_ERR_DENY |
+	    ACL_READ_NAMED_READER_ERR_DENY);
+
+	if (mask_bit == ACE_SYNCHRONIZE) {
+		set_deny = ACL_SYNCHRONIZE_SET_DENY;
+		err_deny =  ACL_SYNCHRONIZE_ERR_DENY;
+		set_allow = ACL_SYNCHRONIZE_SET_ALLOW;
+		err_allow = ACL_SYNCHRONIZE_ERR_ALLOW;
+	} else if (mask_bit == ACE_WRITE_OWNER) {
+		set_deny = ACL_WRITE_OWNER_SET_DENY;
+		err_deny =  ACL_WRITE_OWNER_ERR_DENY;
+		set_allow = ACL_WRITE_OWNER_SET_ALLOW;
+		err_allow = ACL_WRITE_OWNER_ERR_ALLOW;
+	} else if (mask_bit == ACE_DELETE) {
+		set_deny = ACL_DELETE_SET_DENY;
+		err_deny =  ACL_DELETE_ERR_DENY;
+		set_allow = ACL_DELETE_SET_ALLOW;
+		err_allow = ACL_DELETE_ERR_ALLOW;
+	} else if (mask_bit == ACE_WRITE_ATTRIBUTES) {
+		if (isowner) {
+			set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+			err_deny =  ACL_WRITE_ATTRS_OWNER_ERR_DENY;
+			set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+			err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW;
+		} else if (haswriteperm) {
+			set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+			err_deny =  ACL_WRITE_ATTRS_WRITER_ERR_DENY;
+			set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+			err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW;
+		} else {
+			if ((acep->a_access_mask & mask_bit) &&
+			    (acep->a_type & ACE_ACCESS_ALLOWED_ACE_TYPE)) {
+				return (ENOTSUP);
+			}
+			return (0);
+		}
+	} else if (mask_bit == ACE_READ_NAMED_ATTRS) {
+		if (!hasreadperm)
+			return (0);
+
+		set_deny = ACL_READ_NAMED_READER_SET_DENY;
+		err_deny = ACL_READ_NAMED_READER_ERR_DENY;
+		set_allow = ACL_READ_NAMED_READER_SET_ALLOW;
+		err_allow = ACL_READ_NAMED_READER_ERR_ALLOW;
+	} else if (mask_bit == ACE_WRITE_NAMED_ATTRS) {
+		if (!haswriteperm)
+			return (0);
+
+		set_deny = ACL_WRITE_NAMED_WRITER_SET_DENY;
+		err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY;
+		set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+		err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW;
+	} else {
+		return (EINVAL);
+	}
+
+	if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
+		if (acl_consume & set_deny) {
+			if (!(acep->a_access_mask & mask_bit)) {
+				return (ENOTSUP);
+			}
+		} else if (acl_consume & err_deny) {
+			if (acep->a_access_mask & mask_bit) {
+				return (ENOTSUP);
+			}
+		}
+	} else {
+		/* ACE_ACCESS_ALLOWED_ACE_TYPE */
+		if (acl_consume & set_allow) {
+			if (!(acep->a_access_mask & mask_bit)) {
+				return (ENOTSUP);
+			}
+		} else if (acl_consume & err_allow) {
+			if (acep->a_access_mask & mask_bit) {
+				return (ENOTSUP);
+			}
+		}
+	}
+	return (0);
+}
+
+static int
+ace_to_aent_legal(ace_t *acep)
+{
+	int error = 0;
+	int isowner;
+
+	/* only ALLOW or DENY */
+	if ((acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE) &&
+	    (acep->a_type != ACE_ACCESS_DENIED_ACE_TYPE)) {
+		error = ENOTSUP;
+		goto out;
+	}
+
+	/* check for invalid flags */
+	if (acep->a_flags & ~(ACE_VALID_FLAG_BITS)) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/* some flags are illegal */
+	if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG |
+	    ACE_FAILED_ACCESS_ACE_FLAG |
+	    ACE_NO_PROPAGATE_INHERIT_ACE)) {
+		error = ENOTSUP;
+		goto out;
+	}
+
+	/* check for invalid masks */
+	if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS)) {
+		error = EINVAL;
+		goto out;
+	}
+
+	if ((acep->a_flags & ACE_OWNER)) {
+		isowner = 1;
+	} else {
+		isowner = 0;
+	}
+
+	error = access_mask_check(acep, ACE_SYNCHRONIZE, isowner);
+	if (error)
+		goto out;
+
+	error = access_mask_check(acep, ACE_WRITE_OWNER, isowner);
+	if (error)
+		goto out;
+
+	error = access_mask_check(acep, ACE_DELETE, isowner);
+	if (error)
+		goto out;
+
+	error = access_mask_check(acep, ACE_WRITE_ATTRIBUTES, isowner);
+	if (error)
+		goto out;
+
+	error = access_mask_check(acep, ACE_READ_NAMED_ATTRS, isowner);
+	if (error)
+		goto out;
+
+	error = access_mask_check(acep, ACE_WRITE_NAMED_ATTRS, isowner);
+	if (error)
+		goto out;
+
+	/* more detailed checking of masks */
+	if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
+		if (! (acep->a_access_mask & ACE_READ_ATTRIBUTES)) {
+			error = ENOTSUP;
+			goto out;
+		}
+		if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+		    (! (acep->a_access_mask & ACE_APPEND_DATA))) {
+			error = ENOTSUP;
+			goto out;
+		}
+		if ((! (acep->a_access_mask & ACE_WRITE_DATA)) &&
+		    (acep->a_access_mask & ACE_APPEND_DATA)) {
+			error = ENOTSUP;
+			goto out;
+		}
+	}
+
+	/* ACL enforcement */
+	if ((acep->a_access_mask & ACE_READ_ACL) &&
+	    (acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE)) {
+		error = ENOTSUP;
+		goto out;
+	}
+	if (acep->a_access_mask & ACE_WRITE_ACL) {
+		if ((acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) &&
+		    (isowner)) {
+			error = ENOTSUP;
+			goto out;
+		}
+		if ((acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) &&
+		    (! isowner)) {
+			error = ENOTSUP;
+			goto out;
+		}
+	}
+
+out:
+	return (error);
+}
+
+static int
+ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir)
+{
+	/* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */
+	if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) !=
+	    (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) {
+		return (ENOTSUP);
+	}
+
+	return (ace_mask_to_mode(mask, modep, isdir));
+}
+
+static int
+acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list,
+    uid_t owner, gid_t group, boolean_t isdir)
+{
+	int error;
+	uint32_t  flips = ACE_POSIX_SUPPORTED_BITS;
+
+	if (isdir)
+		flips |= ACE_DELETE_CHILD;
+	if (vals->allowed != (vals->denied ^ flips)) {
+		error = ENOTSUP;
+		goto out;
+	}
+	if ((list->hasmask) && (list->acl_mask != vals->mask) &&
+	    (vals->aent_type & (USER | GROUP | GROUP_OBJ))) {
+		error = ENOTSUP;
+		goto out;
+	}
+	error = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir);
+	if (error != 0)
+		goto out;
+	dest->a_type = vals->aent_type;
+	if (dest->a_type & (USER | GROUP)) {
+		dest->a_id = vals->key;
+	} else if (dest->a_type & USER_OBJ) {
+		dest->a_id = owner;
+	} else if (dest->a_type & GROUP_OBJ) {
+		dest->a_id = group;
+	} else if (dest->a_type & OTHER_OBJ) {
+		dest->a_id = 0;
+	} else {
+		error = EINVAL;
+		goto out;
+	}
+
+out:
+	return (error);
+}
+
+
+static int
+ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt,
+    uid_t owner, gid_t group, boolean_t isdir)
+{
+	int error = 0;
+	aclent_t *aent, *result = NULL;
+	acevals_t *vals;
+	int resultcount;
+
+	if ((list->seen & (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) !=
+	    (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) {
+		error = ENOTSUP;
+		goto out;
+	}
+	if ((! list->hasmask) && (list->numusers + list->numgroups > 0)) {
+		error = ENOTSUP;
+		goto out;
+	}
+
+	resultcount = 3 + list->numusers + list->numgroups;
+	/*
+	 * This must be the same condition as below, when we add the CLASS_OBJ
+	 * (aka ACL mask)
+	 */
+	if ((list->hasmask) || (! list->dfacl_flag))
+		resultcount += 1;
+
+	if (cacl_malloc((void **)&result,
+	    resultcount * sizeof (aclent_t)) != 0) {
+		error = ENOMEM;
+		goto out;
+	}
+	aent = result;
+
+	/* USER_OBJ */
+	if (!(list->user_obj.aent_type & USER_OBJ)) {
+		error = EINVAL;
+		goto out;
+	}
+
+	error = acevals_to_aent(&list->user_obj, aent, list, owner, group,
+	    isdir);
+
+	if (error != 0)
+		goto out;
+	++aent;
+	/* USER */
+	vals = NULL;
+	for (vals = avl_first(&list->user); vals != NULL;
+	    vals = AVL_NEXT(&list->user, vals)) {
+		if (!(vals->aent_type & USER)) {
+			error = EINVAL;
+			goto out;
+		}
+		error = acevals_to_aent(vals, aent, list, owner, group,
+		    isdir);
+		if (error != 0)
+			goto out;
+		++aent;
+	}
+	/* GROUP_OBJ */
+	if (!(list->group_obj.aent_type & GROUP_OBJ)) {
+		error = EINVAL;
+		goto out;
+	}
+	error = acevals_to_aent(&list->group_obj, aent, list, owner, group,
+	    isdir);
+	if (error != 0)
+		goto out;
+	++aent;
+	/* GROUP */
+	vals = NULL;
+	for (vals = avl_first(&list->group); vals != NULL;
+	    vals = AVL_NEXT(&list->group, vals)) {
+		if (!(vals->aent_type & GROUP)) {
+			error = EINVAL;
+			goto out;
+		}
+		error = acevals_to_aent(vals, aent, list, owner, group,
+		    isdir);
+		if (error != 0)
+			goto out;
+		++aent;
+	}
+	/*
+	 * CLASS_OBJ (aka ACL_MASK)
+	 *
+	 * An ACL_MASK is not fabricated if the ACL is a default ACL.
+	 * This is to follow UFS's behavior.
+	 */
+	if ((list->hasmask) || (! list->dfacl_flag)) {
+		if (list->hasmask) {
+			uint32_t flips = ACE_POSIX_SUPPORTED_BITS;
+			if (isdir)
+				flips |= ACE_DELETE_CHILD;
+			error = ace_mask_to_mode(list->acl_mask ^ flips,
+			    &aent->a_perm, isdir);
+			if (error != 0)
+				goto out;
+		} else {
+			/* fabricate the ACL_MASK from the group permissions */
+			error = ace_mask_to_mode(list->group_obj.allowed,
+			    &aent->a_perm, isdir);
+			if (error != 0)
+				goto out;
+		}
+		aent->a_id = 0;
+		aent->a_type = CLASS_OBJ | list->dfacl_flag;
+		++aent;
+	}
+	/* OTHER_OBJ */
+	if (!(list->other_obj.aent_type & OTHER_OBJ)) {
+		error = EINVAL;
+		goto out;
+	}
+	error = acevals_to_aent(&list->other_obj, aent, list, owner, group,
+	    isdir);
+	if (error != 0)
+		goto out;
+	++aent;
+
+	*aclentp = result;
+	*aclcnt = resultcount;
+
+out:
+	if (error != 0) {
+		if (result != NULL)
+			cacl_free(result, resultcount * sizeof (aclent_t));
+	}
+
+	return (error);
+}
+
+
+/*
+ * free all data associated with an ace_list
+ */
+static void
+ace_list_free(ace_list_t *al)
+{
+	acevals_t *node;
+	void *cookie;
+
+	if (al == NULL)
+		return;
+
+	cookie = NULL;
+	while ((node = avl_destroy_nodes(&al->user, &cookie)) != NULL)
+		cacl_free(node, sizeof (acevals_t));
+	cookie = NULL;
+	while ((node = avl_destroy_nodes(&al->group, &cookie)) != NULL)
+		cacl_free(node, sizeof (acevals_t));
+
+	avl_destroy(&al->user);
+	avl_destroy(&al->group);
+
+	/* free the container itself */
+	cacl_free(al, sizeof (ace_list_t));
+}
+
+static int
+acevals_compare(const void *va, const void *vb)
+{
+	const acevals_t *a = va, *b = vb;
+
+	if (a->key == b->key)
+		return (0);
+
+	if (a->key > b->key)
+		return (1);
+
+	else
+		return (-1);
+}
+
+/*
+ * Convert a list of ace_t entries to equivalent regular and default
+ * aclent_t lists.  Return error (ENOTSUP) when conversion is not possible.
+ */
+static int
+ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group,
+    aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt,
+    boolean_t isdir)
+{
+	int error = 0;
+	ace_t *acep;
+	uint32_t bits;
+	int i;
+	ace_list_t *normacl = NULL, *dfacl = NULL, *acl;
+	acevals_t *vals;
+
+	*aclentp = NULL;
+	*aclcnt = 0;
+	*dfaclentp = NULL;
+	*dfaclcnt = 0;
+
+	/* we need at least user_obj, group_obj, and other_obj */
+	if (n < 6) {
+		error = ENOTSUP;
+		goto out;
+	}
+	if (ace == NULL) {
+		error = EINVAL;
+		goto out;
+	}
+
+	error = cacl_malloc((void **)&normacl, sizeof (ace_list_t));
+	if (error != 0)
+		goto out;
+
+	avl_create(&normacl->user, acevals_compare, sizeof (acevals_t),
+	    offsetof(acevals_t, avl));
+	avl_create(&normacl->group, acevals_compare, sizeof (acevals_t),
+	    offsetof(acevals_t, avl));
+
+	ace_list_init(normacl, 0);
+
+	error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t));
+	if (error != 0)
+		goto out;
+
+	avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t),
+	    offsetof(acevals_t, avl));
+	avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t),
+	    offsetof(acevals_t, avl));
+	ace_list_init(dfacl, ACL_DEFAULT);
+
+	/* process every ace_t... */
+	for (i = 0; i < n; i++) {
+		acep = &ace[i];
+
+		/* rule out certain cases quickly */
+		error = ace_to_aent_legal(acep);
+		if (error != 0)
+			goto out;
+
+		/*
+		 * Turn off these bits in order to not have to worry about
+		 * them when doing the checks for compliments.
+		 */
+		acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE |
+		    ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES |
+		    ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS);
+
+		/* see if this should be a regular or default acl */
+		bits = acep->a_flags &
+		    (ACE_INHERIT_ONLY_ACE |
+		    ACE_FILE_INHERIT_ACE |
+		    ACE_DIRECTORY_INHERIT_ACE);
+		if (bits != 0) {
+			/* all or nothing on these inherit bits */
+			if (bits != (ACE_INHERIT_ONLY_ACE |
+			    ACE_FILE_INHERIT_ACE |
+			    ACE_DIRECTORY_INHERIT_ACE)) {
+				error = ENOTSUP;
+				goto out;
+			}
+			acl = dfacl;
+		} else {
+			acl = normacl;
+		}
+
+		if ((acep->a_flags & ACE_OWNER)) {
+			if (acl->state > ace_user_obj) {
+				error = ENOTSUP;
+				goto out;
+			}
+			acl->state = ace_user_obj;
+			acl->seen |= USER_OBJ;
+			vals = &acl->user_obj;
+			vals->aent_type = USER_OBJ | acl->dfacl_flag;
+		} else if ((acep->a_flags & ACE_EVERYONE)) {
+			acl->state = ace_other_obj;
+			acl->seen |= OTHER_OBJ;
+			vals = &acl->other_obj;
+			vals->aent_type = OTHER_OBJ | acl->dfacl_flag;
+		} else if (acep->a_flags & ACE_IDENTIFIER_GROUP) {
+			if (acl->state > ace_group) {
+				error = ENOTSUP;
+				goto out;
+			}
+			if ((acep->a_flags & ACE_GROUP)) {
+				acl->seen |= GROUP_OBJ;
+				vals = &acl->group_obj;
+				vals->aent_type = GROUP_OBJ | acl->dfacl_flag;
+			} else {
+				acl->seen |= GROUP;
+				vals = acevals_find(acep, &acl->group,
+				    &acl->numgroups);
+				if (vals == NULL) {
+					error = ENOMEM;
+					goto out;
+				}
+				vals->aent_type = GROUP | acl->dfacl_flag;
+			}
+			acl->state = ace_group;
+		} else {
+			if (acl->state > ace_user) {
+				error = ENOTSUP;
+				goto out;
+			}
+			acl->state = ace_user;
+			acl->seen |= USER;
+			vals = acevals_find(acep, &acl->user,
+			    &acl->numusers);
+			if (vals == NULL) {
+				error = ENOMEM;
+				goto out;
+			}
+			vals->aent_type = USER | acl->dfacl_flag;
+		}
+
+		if (!(acl->state > ace_unused)) {
+			error = EINVAL;
+			goto out;
+		}
+
+		if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
+			/* no more than one allowed per aclent_t */
+			if (vals->allowed != ACE_MASK_UNDEFINED) {
+				error = ENOTSUP;
+				goto out;
+			}
+			vals->allowed = acep->a_access_mask;
+		} else {
+			/*
+			 * it's a DENY; if there was a previous DENY, it
+			 * must have been an ACL_MASK.
+			 */
+			if (vals->denied != ACE_MASK_UNDEFINED) {
+				/* ACL_MASK is for USER and GROUP only */
+				if ((acl->state != ace_user) &&
+				    (acl->state != ace_group)) {
+					error = ENOTSUP;
+					goto out;
+				}
+
+				if (! acl->hasmask) {
+					acl->hasmask = 1;
+					acl->acl_mask = vals->denied;
+				/* check for mismatched ACL_MASK emulations */
+				} else if (acl->acl_mask != vals->denied) {
+					error = ENOTSUP;
+					goto out;
+				}
+				vals->mask = vals->denied;
+			}
+			vals->denied = acep->a_access_mask;
+		}
+	}
+
+	/* done collating; produce the aclent_t lists */
+	if (normacl->state != ace_unused) {
+		error = ace_list_to_aent(normacl, aclentp, aclcnt,
+		    owner, group, isdir);
+		if (error != 0) {
+			goto out;
+		}
+	}
+	if (dfacl->state != ace_unused) {
+		error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt,
+		    owner, group, isdir);
+		if (error != 0) {
+			goto out;
+		}
+	}
+
+out:
+	if (normacl != NULL)
+		ace_list_free(normacl);
+	if (dfacl != NULL)
+		ace_list_free(dfacl);
+
+	return (error);
+}
+
+static int
+convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir,
+    uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt)
+{
+	int error = 0;
+	aclent_t *aclentp, *dfaclentp;
+	int aclcnt, dfaclcnt;
+	int aclsz, dfaclsz;
+
+	error = ln_ace_to_aent(acebufp, acecnt, owner, group,
+	    &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir);
+
+	if (error)
+		return (error);
+
+
+	if (dfaclcnt != 0) {
+		/*
+		 * Slap aclentp and dfaclentp into a single array.
+		 */
+		aclsz = sizeof (aclent_t) * aclcnt;
+		dfaclsz = sizeof (aclent_t) * dfaclcnt;
+		aclentp = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz);
+		if (aclentp != NULL) {
+			(void) memcpy(aclentp + aclcnt, dfaclentp, dfaclsz);
+		} else {
+			error = ENOMEM;
+		}
+	}
+
+	if (aclentp) {
+		*retaclentp = aclentp;
+		*retaclcnt = aclcnt + dfaclcnt;
+	}
+
+	if (dfaclentp)
+		cacl_free(dfaclentp, dfaclsz);
+
+	return (error);
+}
+
+
+int
+acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner,
+    gid_t group)
+{
+	int aclcnt;
+	void *acldata;
+	int error;
+
+	/*
+	 * See if we need to translate
+	 */
+	if ((target_flavor == _ACL_ACE_ENABLED && aclp->acl_type == ACE_T) ||
+	    (target_flavor == _ACL_ACLENT_ENABLED &&
+	    aclp->acl_type == ACLENT_T))
+		return (0);
+
+	if (target_flavor == -1) {
+		error = EINVAL;
+		goto out;
+	}
+
+	if (target_flavor ==  _ACL_ACE_ENABLED &&
+	    aclp->acl_type == ACLENT_T) {
+		error = convert_aent_to_ace(aclp->acl_aclp,
+		    aclp->acl_cnt, isdir, (ace_t **)&acldata, &aclcnt);
+		if (error)
+			goto out;
+
+	} else if (target_flavor == _ACL_ACLENT_ENABLED &&
+	    aclp->acl_type == ACE_T) {
+		error = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt,
+		    isdir, owner, group, (aclent_t **)&acldata, &aclcnt);
+		if (error)
+			goto out;
+	} else {
+		error = ENOTSUP;
+		goto out;
+	}
+
+	/*
+	 * replace old acl with newly translated acl
+	 */
+	cacl_free(aclp->acl_aclp, aclp->acl_cnt * aclp->acl_entry_size);
+	aclp->acl_aclp = acldata;
+	aclp->acl_cnt = aclcnt;
+	if (target_flavor == _ACL_ACE_ENABLED) {
+		aclp->acl_type = ACE_T;
+		aclp->acl_entry_size = sizeof (ace_t);
+	} else {
+		aclp->acl_type = ACLENT_T;
+		aclp->acl_entry_size = sizeof (aclent_t);
+	}
+	return (0);
+
+out:
+
+#if !defined(_KERNEL)
+	errno = error;
+	return (-1);
+#else
+	return (error);
+#endif
+}
+#endif /* !_KERNEL */
+
+#define	SET_ACE(acl, index, who, mask, type, flags) { \
+	acl[0][index].a_who = (uint32_t)who; \
+	acl[0][index].a_type = type; \
+	acl[0][index].a_flags = flags; \
+	acl[0][index++].a_access_mask = mask; \
+}
+
+void
+acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks)
+{
+	uint32_t read_mask = ACE_READ_DATA;
+	uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA;
+	uint32_t execute_mask = ACE_EXECUTE;
+
+	(void) isdir;	/* will need this later */
+
+	masks->deny1 = 0;
+	if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
+		masks->deny1 |= read_mask;
+	if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
+		masks->deny1 |= write_mask;
+	if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
+		masks->deny1 |= execute_mask;
+
+	masks->deny2 = 0;
+	if (!(mode & S_IRGRP) && (mode & S_IROTH))
+		masks->deny2 |= read_mask;
+	if (!(mode & S_IWGRP) && (mode & S_IWOTH))
+		masks->deny2 |= write_mask;
+	if (!(mode & S_IXGRP) && (mode & S_IXOTH))
+		masks->deny2 |= execute_mask;
+
+	masks->allow0 = 0;
+	if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH)))
+		masks->allow0 |= read_mask;
+	if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH)))
+		masks->allow0 |= write_mask;
+	if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH)))
+		masks->allow0 |= execute_mask;
+
+	masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL|
+	    ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES|
+	    ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE;
+	if (mode & S_IRUSR)
+		masks->owner |= read_mask;
+	if (mode & S_IWUSR)
+		masks->owner |= write_mask;
+	if (mode & S_IXUSR)
+		masks->owner |= execute_mask;
+
+	masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+	    ACE_SYNCHRONIZE;
+	if (mode & S_IRGRP)
+		masks->group |= read_mask;
+	if (mode & S_IWGRP)
+		masks->group |= write_mask;
+	if (mode & S_IXGRP)
+		masks->group |= execute_mask;
+
+	masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+	    ACE_SYNCHRONIZE;
+	if (mode & S_IROTH)
+		masks->everyone |= read_mask;
+	if (mode & S_IWOTH)
+		masks->everyone |= write_mask;
+	if (mode & S_IXOTH)
+		masks->everyone |= execute_mask;
+}
+
+int
+acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count)
+{
+	int		index = 0;
+	int		error;
+	trivial_acl_t	masks;
+
+	*count = 3;
+	acl_trivial_access_masks(mode, isdir, &masks);
+
+	if (masks.allow0)
+		(*count)++;
+	if (masks.deny1)
+		(*count)++;
+	if (masks.deny2)
+		(*count)++;
+
+	if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0)
+		return (error);
+
+	if (masks.allow0) {
+		SET_ACE(acl, index, -1, masks.allow0,
+		    ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER);
+	}
+	if (masks.deny1) {
+		SET_ACE(acl, index, -1, masks.deny1,
+		    ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER);
+	}
+	if (masks.deny2) {
+		SET_ACE(acl, index, -1, masks.deny2,
+		    ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP);
+	}
+
+	SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE,
+	    ACE_OWNER);
+	SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE,
+	    ACE_IDENTIFIER_GROUP|ACE_GROUP);
+	SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE,
+	    ACE_EVERYONE);
+
+	return (0);
+}
+
+/*
+ * ace_trivial:
+ * determine whether an ace_t acl is trivial
+ *
+ * Trivialness implies that the acl is composed of only
+ * owner, group, everyone entries.  ACL can't
+ * have read_acl denied, and write_owner/write_acl/write_attributes
+ * can only be owner@ entry.
+ */
+int
+ace_trivial_common(void *acep, int aclcnt,
+    uint64_t (*walk)(void *, uint64_t, int aclcnt,
+    uint16_t *, uint16_t *, uint32_t *))
+{
+	uint16_t flags;
+	uint32_t mask;
+	uint16_t type;
+	uint64_t cookie = 0;
+
+	while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) {
+		switch (flags & ACE_TYPE_FLAGS) {
+		case ACE_OWNER:
+		case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+		case ACE_EVERYONE:
+			break;
+		default:
+			return (1);
+
+		}
+
+		if (flags & (ACE_FILE_INHERIT_ACE|
+		    ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
+		    ACE_INHERIT_ONLY_ACE))
+			return (1);
+
+		/*
+		 * Special check for some special bits
+		 *
+		 * Don't allow anybody to deny reading basic
+		 * attributes or a files ACL.
+		 */
+		if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+		    (type == ACE_ACCESS_DENIED_ACE_TYPE))
+			return (1);
+
+		/*
+		 * Delete permissions are never set by default
+		 */
+		if (mask & (ACE_DELETE|ACE_DELETE_CHILD))
+			return (1);
+		/*
+		 * only allow owner@ to have
+		 * write_acl/write_owner/write_attributes/write_xattr/
+		 */
+		if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+		    (!(flags & ACE_OWNER) && (mask &
+		    (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES|
+		    ACE_WRITE_NAMED_ATTRS))))
+			return (1);
+
+	}
+	return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/callb.c b/sys/contrib/openzfs/module/os/freebsd/spl/callb.c
new file mode 100644
index 000000000000..fffa85b6b91b
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/callb.c
@@ -0,0 +1,373 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/mutex.h>
+#include <sys/condvar.h>
+#include <sys/callb.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/kobj.h>
+#include <sys/systm.h>	/* for delay() */
+#include <sys/taskq.h>  /* For TASKQ_NAMELEN */
+#include <sys/kernel.h>
+
+#define	CB_MAXNAME	TASKQ_NAMELEN
+
+/*
+ * The callb mechanism provides generic event scheduling/echoing.
+ * A callb function is registered and called on behalf of the event.
+ */
+typedef struct callb {
+	struct callb	*c_next; 	/* next in class or on freelist */
+	kthread_id_t	c_thread;	/* ptr to caller's thread struct */
+	char		c_flag;		/* info about the callb state */
+	uchar_t		c_class;	/* this callb's class */
+	kcondvar_t	c_done_cv;	/* signal callb completion */
+	boolean_t	(*c_func)(void *, int);
+					/* cb function: returns true if ok */
+	void		*c_arg;		/* arg to c_func */
+	char		c_name[CB_MAXNAME+1]; /* debug:max func name length */
+} callb_t;
+
+/*
+ * callb c_flag bitmap definitions
+ */
+#define	CALLB_FREE		0x0
+#define	CALLB_TAKEN		0x1
+#define	CALLB_EXECUTING		0x2
+
+/*
+ * Basic structure for a callb table.
+ * All callbs are organized into different class groups described
+ * by ct_class array.
+ * The callbs within a class are single-linked and normally run by a
+ * serial execution.
+ */
+typedef struct callb_table {
+	kmutex_t ct_lock;		/* protect all callb states */
+	callb_t	*ct_freelist; 		/* free callb structures */
+	int	ct_busy;		/* != 0 prevents additions */
+	kcondvar_t ct_busy_cv;		/* to wait for not busy    */
+	int	ct_ncallb; 		/* num of callbs allocated */
+	callb_t	*ct_first_cb[NCBCLASS];	/* ptr to 1st callb in a class */
+} callb_table_t;
+
+int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC;
+
+static callb_id_t callb_add_common(boolean_t (*)(void *, int),
+    void *, int, char *, kthread_id_t);
+
+static callb_table_t callb_table;	/* system level callback table */
+static callb_table_t *ct = &callb_table;
+static kmutex_t	callb_safe_mutex;
+callb_cpr_t	callb_cprinfo_safe = {
+	&callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, {0, 0} };
+
+/*
+ * Init all callb tables in the system.
+ */
+static void
+callb_init(void *dummy __unused)
+{
+	callb_table.ct_busy = 0;	/* mark table open for additions */
+	mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+static void
+callb_fini(void *dummy __unused)
+{
+	callb_t *cp;
+	int i;
+
+	mutex_enter(&ct->ct_lock);
+	for (i = 0; i < 16; i++) {
+		while ((cp = ct->ct_freelist) != NULL) {
+			ct->ct_freelist = cp->c_next;
+			ct->ct_ncallb--;
+			kmem_free(cp, sizeof (callb_t));
+		}
+		if (ct->ct_ncallb == 0)
+			break;
+		/* Not all callbacks finished, waiting for the rest. */
+		mutex_exit(&ct->ct_lock);
+		tsleep(ct, 0, "callb", hz / 4);
+		mutex_enter(&ct->ct_lock);
+	}
+	if (ct->ct_ncallb > 0)
+		printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb);
+	mutex_exit(&ct->ct_lock);
+	mutex_destroy(&callb_safe_mutex);
+	mutex_destroy(&callb_table.ct_lock);
+}
+
+/*
+ * callout_add() is called to register func() be called later.
+ */
+static callb_id_t
+callb_add_common(boolean_t (*func)(void *arg, int code),
+    void *arg, int class, char *name, kthread_id_t t)
+{
+	callb_t *cp;
+
+	ASSERT(class < NCBCLASS);
+
+	mutex_enter(&ct->ct_lock);
+	while (ct->ct_busy)
+		cv_wait(&ct->ct_busy_cv, &ct->ct_lock);
+	if ((cp = ct->ct_freelist) == NULL) {
+		ct->ct_ncallb++;
+		cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP);
+	}
+	ct->ct_freelist = cp->c_next;
+	cp->c_thread = t;
+	cp->c_func = func;
+	cp->c_arg = arg;
+	cp->c_class = (uchar_t)class;
+	cp->c_flag |= CALLB_TAKEN;
+#ifdef ZFS_DEBUG
+	if (strlen(name) > CB_MAXNAME)
+		cmn_err(CE_WARN, "callb_add: name of callback function '%s' "
+		    "too long -- truncated to %d chars",
+		    name, CB_MAXNAME);
+#endif
+	(void) strncpy(cp->c_name, name, CB_MAXNAME);
+	cp->c_name[CB_MAXNAME] = '\0';
+
+	/*
+	 * Insert the new callb at the head of its class list.
+	 */
+	cp->c_next = ct->ct_first_cb[class];
+	ct->ct_first_cb[class] = cp;
+
+	mutex_exit(&ct->ct_lock);
+	return ((callb_id_t)cp);
+}
+
+/*
+ * The default function to add an entry to the callback table.  Since
+ * it uses curthread as the thread identifier to store in the table,
+ * it should be used for the normal case of a thread which is calling
+ * to add ITSELF to the table.
+ */
+callb_id_t
+callb_add(boolean_t (*func)(void *arg, int code),
+    void *arg, int class, char *name)
+{
+	return (callb_add_common(func, arg, class, name, curthread));
+}
+
+/*
+ * A special version of callb_add() above for use by threads which
+ * might be adding an entry to the table on behalf of some other
+ * thread (for example, one which is constructed but not yet running).
+ * In this version the thread id is an argument.
+ */
+callb_id_t
+callb_add_thread(boolean_t (*func)(void *arg, int code),
+    void *arg, int class, char *name, kthread_id_t t)
+{
+	return (callb_add_common(func, arg, class, name, t));
+}
+
+/*
+ * callout_delete() is called to remove an entry identified by id
+ * that was originally placed there by a call to callout_add().
+ * return -1 if fail to delete a callb entry otherwise return 0.
+ */
+int
+callb_delete(callb_id_t id)
+{
+	callb_t **pp;
+	callb_t *me = (callb_t *)id;
+
+	mutex_enter(&ct->ct_lock);
+
+	for (;;) {
+		pp = &ct->ct_first_cb[me->c_class];
+		while (*pp != NULL && *pp != me)
+			pp = &(*pp)->c_next;
+
+#ifdef ZFS_DEBUG
+		if (*pp != me) {
+			cmn_err(CE_WARN, "callb delete bogus entry 0x%p",
+			    (void *)me);
+			mutex_exit(&ct->ct_lock);
+			return (-1);
+		}
+#endif /* DEBUG */
+
+		/*
+		 * It is not allowed to delete a callb in the middle of
+		 * executing otherwise, the callb_execute() will be confused.
+		 */
+		if (!(me->c_flag & CALLB_EXECUTING))
+			break;
+
+		cv_wait(&me->c_done_cv, &ct->ct_lock);
+	}
+	/* relink the class list */
+	*pp = me->c_next;
+
+	/* clean up myself and return the free callb to the head of freelist */
+	me->c_flag = CALLB_FREE;
+	me->c_next = ct->ct_freelist;
+	ct->ct_freelist = me;
+
+	mutex_exit(&ct->ct_lock);
+	return (0);
+}
+
+/*
+ * class:	indicates to execute all callbs in the same class;
+ * code:	optional argument for the callb functions.
+ * return:	 = 0: success
+ *		!= 0: ptr to string supplied when callback was registered
+ */
+void *
+callb_execute_class(int class, int code)
+{
+	callb_t *cp;
+	void *ret = NULL;
+
+	ASSERT(class < NCBCLASS);
+
+	mutex_enter(&ct->ct_lock);
+
+	for (cp = ct->ct_first_cb[class];
+	    cp != NULL && ret == 0; cp = cp->c_next) {
+		while (cp->c_flag & CALLB_EXECUTING)
+			cv_wait(&cp->c_done_cv, &ct->ct_lock);
+		/*
+		 * cont if the callb is deleted while we're sleeping
+		 */
+		if (cp->c_flag == CALLB_FREE)
+			continue;
+		cp->c_flag |= CALLB_EXECUTING;
+
+#ifdef CALLB_DEBUG
+		printf("callb_execute: name=%s func=%p arg=%p\n",
+		    cp->c_name, (void *)cp->c_func, (void *)cp->c_arg);
+#endif /* CALLB_DEBUG */
+
+		mutex_exit(&ct->ct_lock);
+		/* If callback function fails, pass back client's name */
+		if (!(*cp->c_func)(cp->c_arg, code))
+			ret = cp->c_name;
+		mutex_enter(&ct->ct_lock);
+
+		cp->c_flag &= ~CALLB_EXECUTING;
+		cv_broadcast(&cp->c_done_cv);
+	}
+	mutex_exit(&ct->ct_lock);
+	return (ret);
+}
+
+/*
+ * callers make sure no recursive entries to this func.
+ * dp->cc_lockp is registered by callb_add to protect callb_cpr_t structure.
+ *
+ * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we
+ * use a cv_timedwait() in case the kernel thread is blocked.
+ *
+ * Note that this is a generic callback handler for daemon CPR and
+ * should NOT be changed to accommodate any specific requirement in a daemon.
+ * Individual daemons that require changes to the handler shall write
+ * callback routines in their own daemon modules.
+ */
+boolean_t
+callb_generic_cpr(void *arg, int code)
+{
+	callb_cpr_t *cp = (callb_cpr_t *)arg;
+	clock_t ret = 0;			/* assume success */
+
+	mutex_enter(cp->cc_lockp);
+
+	switch (code) {
+	case CB_CODE_CPR_CHKPT:
+		cp->cc_events |= CALLB_CPR_START;
+#ifdef CPR_NOT_THREAD_SAFE
+		while (!(cp->cc_events & CALLB_CPR_SAFE))
+			/* cv_timedwait() returns -1 if it times out. */
+			if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
+			    cp->cc_lockp, (callb_timeout_sec * hz),
+			    TR_CLOCK_TICK)) == -1)
+				break;
+#endif
+		break;
+
+	case CB_CODE_CPR_RESUME:
+		cp->cc_events &= ~CALLB_CPR_START;
+		cv_signal(&cp->cc_stop_cv);
+		break;
+	}
+	mutex_exit(cp->cc_lockp);
+	return (ret != -1);
+}
+
+/*
+ * The generic callback function associated with kernel threads which
+ * are always considered safe.
+ */
+/* ARGSUSED */
+boolean_t
+callb_generic_cpr_safe(void *arg, int code)
+{
+	return (B_TRUE);
+}
+/*
+ * Prevent additions to callback table.
+ */
+void
+callb_lock_table(void)
+{
+	mutex_enter(&ct->ct_lock);
+	ASSERT(ct->ct_busy == 0);
+	ct->ct_busy = 1;
+	mutex_exit(&ct->ct_lock);
+}
+
+/*
+ * Allow additions to callback table.
+ */
+void
+callb_unlock_table(void)
+{
+	mutex_enter(&ct->ct_lock);
+	ASSERT(ct->ct_busy != 0);
+	ct->ct_busy = 0;
+	cv_broadcast(&ct->ct_busy_cv);
+	mutex_exit(&ct->ct_lock);
+}
+
+SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL);
+SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL);
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/list.c b/sys/contrib/openzfs/module/os/freebsd/spl/list.c
new file mode 100644
index 000000000000..21230b2adddb
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/list.c
@@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Generic doubly-linked list implementation
+ */
+
+#include <sys/list.h>
+#include <sys/list_impl.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+
+#define	list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset))
+#define	list_object(a, node) ((void *)(((char *)node) - (a)->list_offset))
+#define	list_empty(a) ((a)->list_head.list_next == &(a)->list_head)
+
+#define	list_insert_after_node(list, node, object) {	\
+	list_node_t *lnew = list_d2l(list, object);	\
+	lnew->list_prev = (node);			\
+	lnew->list_next = (node)->list_next;		\
+	(node)->list_next->list_prev = lnew;		\
+	(node)->list_next = lnew;			\
+}
+
+#define	list_insert_before_node(list, node, object) {	\
+	list_node_t *lnew = list_d2l(list, object);	\
+	lnew->list_next = (node);			\
+	lnew->list_prev = (node)->list_prev;		\
+	(node)->list_prev->list_next = lnew;		\
+	(node)->list_prev = lnew;			\
+}
+
+#define	list_remove_node(node)					\
+	(node)->list_prev->list_next = (node)->list_next;	\
+	(node)->list_next->list_prev = (node)->list_prev;	\
+	(node)->list_next = (node)->list_prev = NULL
+
+void
+list_create(list_t *list, size_t size, size_t offset)
+{
+	ASSERT(list);
+	ASSERT(size > 0);
+	ASSERT(size >= offset + sizeof (list_node_t));
+
+	list->list_size = size;
+	list->list_offset = offset;
+	list->list_head.list_next = list->list_head.list_prev =
+	    &list->list_head;
+}
+
+void
+list_destroy(list_t *list)
+{
+	list_node_t *node = &list->list_head;
+
+	ASSERT(list);
+	ASSERT(list->list_head.list_next == node);
+	ASSERT(list->list_head.list_prev == node);
+
+	node->list_next = node->list_prev = NULL;
+}
+
+void
+list_insert_after(list_t *list, void *object, void *nobject)
+{
+	if (object == NULL) {
+		list_insert_head(list, nobject);
+	} else {
+		list_node_t *lold = list_d2l(list, object);
+		list_insert_after_node(list, lold, nobject);
+	}
+}
+
+void
+list_insert_before(list_t *list, void *object, void *nobject)
+{
+	if (object == NULL) {
+		list_insert_tail(list, nobject);
+	} else {
+		list_node_t *lold = list_d2l(list, object);
+		list_insert_before_node(list, lold, nobject);
+	}
+}
+
+void
+list_insert_head(list_t *list, void *object)
+{
+	list_node_t *lold = &list->list_head;
+	list_insert_after_node(list, lold, object);
+}
+
+void
+list_insert_tail(list_t *list, void *object)
+{
+	list_node_t *lold = &list->list_head;
+	list_insert_before_node(list, lold, object);
+}
+
+void
+list_remove(list_t *list, void *object)
+{
+	list_node_t *lold = list_d2l(list, object);
+	ASSERT(!list_empty(list));
+	ASSERT(lold->list_next != NULL);
+	list_remove_node(lold);
+}
+
+void *
+list_remove_head(list_t *list)
+{
+	list_node_t *head = list->list_head.list_next;
+	if (head == &list->list_head)
+		return (NULL);
+	list_remove_node(head);
+	return (list_object(list, head));
+}
+
+void *
+list_remove_tail(list_t *list)
+{
+	list_node_t *tail = list->list_head.list_prev;
+	if (tail == &list->list_head)
+		return (NULL);
+	list_remove_node(tail);
+	return (list_object(list, tail));
+}
+
+void *
+list_head(list_t *list)
+{
+	if (list_empty(list))
+		return (NULL);
+	return (list_object(list, list->list_head.list_next));
+}
+
+void *
+list_tail(list_t *list)
+{
+	if (list_empty(list))
+		return (NULL);
+	return (list_object(list, list->list_head.list_prev));
+}
+
+void *
+list_next(list_t *list, void *object)
+{
+	list_node_t *node = list_d2l(list, object);
+
+	if (node->list_next != &list->list_head)
+		return (list_object(list, node->list_next));
+
+	return (NULL);
+}
+
+void *
+list_prev(list_t *list, void *object)
+{
+	list_node_t *node = list_d2l(list, object);
+
+	if (node->list_prev != &list->list_head)
+		return (list_object(list, node->list_prev));
+
+	return (NULL);
+}
+
+/*
+ *  Insert src list after dst list. Empty src list thereafter.
+ */
+void
+list_move_tail(list_t *dst, list_t *src)
+{
+	list_node_t *dstnode = &dst->list_head;
+	list_node_t *srcnode = &src->list_head;
+
+	ASSERT(dst->list_size == src->list_size);
+	ASSERT(dst->list_offset == src->list_offset);
+
+	if (list_empty(src))
+		return;
+
+	dstnode->list_prev->list_next = srcnode->list_next;
+	srcnode->list_next->list_prev = dstnode->list_prev;
+	dstnode->list_prev = srcnode->list_prev;
+	srcnode->list_prev->list_next = dstnode;
+
+	/* empty src list */
+	srcnode->list_next = srcnode->list_prev = srcnode;
+}
+
+void
+list_link_replace(list_node_t *lold, list_node_t *lnew)
+{
+	ASSERT(list_link_active(lold));
+	ASSERT(!list_link_active(lnew));
+
+	lnew->list_next = lold->list_next;
+	lnew->list_prev = lold->list_prev;
+	lold->list_prev->list_next = lnew;
+	lold->list_next->list_prev = lnew;
+	lold->list_next = lold->list_prev = NULL;
+}
+
+void
+list_link_init(list_node_t *link)
+{
+	link->list_next = NULL;
+	link->list_prev = NULL;
+}
+
+int
+list_link_active(list_node_t *link)
+{
+	EQUIV(link->list_next == NULL, link->list_prev == NULL);
+	return (link->list_next != NULL);
+}
+
+int
+list_is_empty(list_t *list)
+{
+	return (list_empty(list));
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha224.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha224.h
new file mode 100644
index 000000000000..0abd43068708
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha224.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef	_SHA224_H_
+#define	_SHA224_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define	SHA224_BLOCK_LENGTH		64
+#define	SHA224_DIGEST_LENGTH		28
+#define	SHA224_DIGEST_STRING_LENGTH	(SHA224_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA224Context {
+	uint32_t state[8];
+	uint64_t count;
+	uint8_t buf[SHA224_BLOCK_LENGTH];
+} SHA224_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+
+#ifndef SHA224_Init
+#define	SHA224_Init		_libmd_SHA224_Init
+#endif
+#ifndef SHA224_Update
+#define	SHA224_Update		_libmd_SHA224_Update
+#endif
+#ifndef SHA224_Final
+#define	SHA224_Final		_libmd_SHA224_Final
+#endif
+#ifndef SHA224_End
+#define	SHA224_End		_libmd_SHA224_End
+#endif
+#ifndef SHA224_Fd
+#define	SHA224_Fd		_libmd_SHA224_Fd
+#endif
+#ifndef SHA224_FdChunk
+#define	SHA224_FdChunk		_libmd_SHA224_FdChunk
+#endif
+#ifndef SHA224_File
+#define	SHA224_File		_libmd_SHA224_File
+#endif
+#ifndef SHA224_FileChunk
+#define	SHA224_FileChunk	_libmd_SHA224_FileChunk
+#endif
+#ifndef SHA224_Data
+#define	SHA224_Data		_libmd_SHA224_Data
+#endif
+
+#ifndef SHA224_version
+#define	SHA224_version		_libmd_SHA224_version
+#endif
+
+void	SHA224_Init(SHA224_CTX *);
+void	SHA224_Update(SHA224_CTX *, const void *, size_t);
+void	SHA224_Final(unsigned char [__min_size(SHA224_DIGEST_LENGTH)],
+    SHA224_CTX *);
+#ifndef _KERNEL
+char   *SHA224_End(SHA224_CTX *, char *);
+char   *SHA224_Data(const void *, unsigned int, char *);
+char   *SHA224_Fd(int, char *);
+char   *SHA224_FdChunk(int, char *, off_t, off_t);
+char   *SHA224_File(const char *, char *);
+char   *SHA224_FileChunk(const char *, char *, off_t, off_t);
+#endif
+__END_DECLS
+
+#endif /* !_SHA224_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha256.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha256.h
new file mode 100644
index 000000000000..193c0c025120
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha256.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA256_H_
+#define	_SHA256_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define	SHA256_BLOCK_LENGTH		64
+#define	SHA256_DIGEST_LENGTH		32
+#define	SHA256_DIGEST_STRING_LENGTH	(SHA256_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA256Context {
+	uint32_t state[8];
+	uint64_t count;
+	uint8_t buf[SHA256_BLOCK_LENGTH];
+} SHA256_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+
+#ifndef SHA256_Init
+#define	SHA256_Init		_libmd_SHA256_Init
+#endif
+#ifndef SHA256_Update
+#define	SHA256_Update		_libmd_SHA256_Update
+#endif
+#ifndef SHA256_Final
+#define	SHA256_Final		_libmd_SHA256_Final
+#endif
+#ifndef SHA256_End
+#define	SHA256_End		_libmd_SHA256_End
+#endif
+#ifndef SHA256_Fd
+#define	SHA256_Fd		_libmd_SHA256_Fd
+#endif
+#ifndef SHA256_FdChunk
+#define	SHA256_FdChunk		_libmd_SHA256_FdChunk
+#endif
+#ifndef SHA256_File
+#define	SHA256_File		_libmd_SHA256_File
+#endif
+#ifndef SHA256_FileChunk
+#define	SHA256_FileChunk	_libmd_SHA256_FileChunk
+#endif
+#ifndef SHA256_Data
+#define	SHA256_Data		_libmd_SHA256_Data
+#endif
+
+#ifndef SHA256_Transform
+#define	SHA256_Transform	_libmd_SHA256_Transform
+#endif
+#ifndef SHA256_version
+#define	SHA256_version		_libmd_SHA256_version
+#endif
+
+void	SHA256_Init(SHA256_CTX *);
+void	SHA256_Update(SHA256_CTX *, const void *, size_t);
+void	SHA256_Final(unsigned char [__min_size(SHA256_DIGEST_LENGTH)],
+    SHA256_CTX *);
+#ifndef _KERNEL
+char   *SHA256_End(SHA256_CTX *, char *);
+char   *SHA256_Data(const void *, unsigned int, char *);
+char   *SHA256_Fd(int, char *);
+char   *SHA256_FdChunk(int, char *, off_t, off_t);
+char   *SHA256_File(const char *, char *);
+char   *SHA256_FileChunk(const char *, char *, off_t, off_t);
+#endif
+__END_DECLS
+
+#endif /* !_SHA256_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha256c.c b/sys/contrib/openzfs/module/os/freebsd/spl/sha256c.c
new file mode 100644
index 000000000000..241cf8c9ae76
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha256c.c
@@ -0,0 +1,378 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+
+#include <sys/byteorder.h>
+#include <sys/endian.h>
+#include "sha224.h"
+#include "sha256.h"
+
+#if BYTE_ORDER == BIG_ENDIAN
+
+/* Copy a vector of big-endian uint32_t into a vector of bytes */
+#define	be32enc_vect(dst, src, len)	\
+	memcpy((void *)dst, (const void *)src, (size_t)len)
+
+/* Copy a vector of bytes into a vector of big-endian uint32_t */
+#define	be32dec_vect(dst, src, len)	\
+	memcpy((void *)dst, (const void *)src, (size_t)len)
+
+#else /* BYTE_ORDER != BIG_ENDIAN */
+
+/*
+ * Encode a length len/4 vector of (uint32_t) into a length len vector of
+ * (unsigned char) in big-endian form.  Assumes len is a multiple of 4.
+ */
+static void
+be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len / 4; i++)
+		be32enc(dst + i * 4, src[i]);
+}
+
+/*
+ * Decode a big-endian length len vector of (unsigned char) into a length
+ * len/4 vector of (uint32_t).  Assumes len is a multiple of 4.
+ */
+static void
+be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len / 4; i++)
+		dst[i] = be32dec(src + i * 4);
+}
+
+#endif /* BYTE_ORDER != BIG_ENDIAN */
+
+/* SHA256 round constants. */
+static const uint32_t K[64] = {
+	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+/* Elementary functions used by SHA256 */
+#define	Ch(x, y, z)	((x & (y ^ z)) ^ z)
+#define	Maj(x, y, z)	((x & (y | z)) | (y & z))
+#define	SHR(x, n)	(x >> n)
+#define	ROTR(x, n)	((x >> n) | (x << (32 - n)))
+#define	S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define	S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define	s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
+#define	s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
+
+/* SHA256 round function */
+#define	RND(a, b, c, d, e, f, g, h, k)			\
+	h += S1(e) + Ch(e, f, g) + k;			\
+	d += h;						\
+	h += S0(a) + Maj(a, b, c);
+
+/* Adjusted round function for rotating state */
+#define	RNDr(S, W, i, ii)			\
+	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
+	    S[(66 - i) % 8], S[(67 - i) % 8],	\
+	    S[(68 - i) % 8], S[(69 - i) % 8],	\
+	    S[(70 - i) % 8], S[(71 - i) % 8],	\
+	    W[i + ii] + K[i + ii])
+
+/* Message schedule computation */
+#define	MSCH(W, ii, i)				\
+	W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] +	\
+		s0(W[i + ii + 1]) + W[i + ii]
+
+/*
+ * SHA256 block compression function.  The 256-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+static void
+SHA256_Transform(uint32_t *state, const unsigned char block[64])
+{
+	uint32_t W[64];
+	uint32_t S[8];
+	int i;
+
+	/* 1. Prepare the first part of the message schedule W. */
+	be32dec_vect(W, block, 64);
+
+	/* 2. Initialize working variables. */
+	memcpy(S, state, 32);
+
+	/* 3. Mix. */
+	for (i = 0; i < 64; i += 16) {
+		RNDr(S, W, 0, i);
+		RNDr(S, W, 1, i);
+		RNDr(S, W, 2, i);
+		RNDr(S, W, 3, i);
+		RNDr(S, W, 4, i);
+		RNDr(S, W, 5, i);
+		RNDr(S, W, 6, i);
+		RNDr(S, W, 7, i);
+		RNDr(S, W, 8, i);
+		RNDr(S, W, 9, i);
+		RNDr(S, W, 10, i);
+		RNDr(S, W, 11, i);
+		RNDr(S, W, 12, i);
+		RNDr(S, W, 13, i);
+		RNDr(S, W, 14, i);
+		RNDr(S, W, 15, i);
+
+		if (i == 48)
+			break;
+		MSCH(W, 0, i);
+		MSCH(W, 1, i);
+		MSCH(W, 2, i);
+		MSCH(W, 3, i);
+		MSCH(W, 4, i);
+		MSCH(W, 5, i);
+		MSCH(W, 6, i);
+		MSCH(W, 7, i);
+		MSCH(W, 8, i);
+		MSCH(W, 9, i);
+		MSCH(W, 10, i);
+		MSCH(W, 11, i);
+		MSCH(W, 12, i);
+		MSCH(W, 13, i);
+		MSCH(W, 14, i);
+		MSCH(W, 15, i);
+	}
+
+	/* 4. Mix local working variables into global state */
+	for (i = 0; i < 8; i++)
+		state[i] += S[i];
+}
+
+static unsigned char PAD[64] = {
+	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Add padding and terminating bit-count. */
+static void
+SHA256_Pad(SHA256_CTX * ctx)
+{
+	size_t r;
+
+	/* Figure out how many bytes we have buffered. */
+	r = (ctx->count >> 3) & 0x3f;
+
+	/* Pad to 56 mod 64, transforming if we finish a block en route. */
+	if (r < 56) {
+		/* Pad to 56 mod 64. */
+		memcpy(&ctx->buf[r], PAD, 56 - r);
+	} else {
+		/* Finish the current block and mix. */
+		memcpy(&ctx->buf[r], PAD, 64 - r);
+		SHA256_Transform(ctx->state, ctx->buf);
+
+		/* The start of the final block is all zeroes. */
+		memset(&ctx->buf[0], 0, 56);
+	}
+
+	/* Add the terminating bit-count. */
+	be64enc(&ctx->buf[56], ctx->count);
+
+	/* Mix in the final block. */
+	SHA256_Transform(ctx->state, ctx->buf);
+}
+
+/* SHA-256 initialization.  Begins a SHA-256 operation. */
+void
+SHA256_Init(SHA256_CTX * ctx)
+{
+
+	/* Zero bits processed so far */
+	ctx->count = 0;
+
+	/* Magic initialization constants */
+	ctx->state[0] = 0x6A09E667;
+	ctx->state[1] = 0xBB67AE85;
+	ctx->state[2] = 0x3C6EF372;
+	ctx->state[3] = 0xA54FF53A;
+	ctx->state[4] = 0x510E527F;
+	ctx->state[5] = 0x9B05688C;
+	ctx->state[6] = 0x1F83D9AB;
+	ctx->state[7] = 0x5BE0CD19;
+}
+
+/* Add bytes into the hash */
+void
+SHA256_Update(SHA256_CTX * ctx, const void *in, size_t len)
+{
+	uint64_t bitlen;
+	uint32_t r;
+	const unsigned char *src = in;
+
+	/* Number of bytes left in the buffer from previous updates */
+	r = (ctx->count >> 3) & 0x3f;
+
+	/* Convert the length into a number of bits */
+	bitlen = len << 3;
+
+	/* Update number of bits */
+	ctx->count += bitlen;
+
+	/* Handle the case where we don't need to perform any transforms */
+	if (len < 64 - r) {
+		memcpy(&ctx->buf[r], src, len);
+		return;
+	}
+
+	/* Finish the current block */
+	memcpy(&ctx->buf[r], src, 64 - r);
+	SHA256_Transform(ctx->state, ctx->buf);
+	src += 64 - r;
+	len -= 64 - r;
+
+	/* Perform complete blocks */
+	while (len >= 64) {
+		SHA256_Transform(ctx->state, src);
+		src += 64;
+		len -= 64;
+	}
+
+	/* Copy left over data into buffer */
+	memcpy(ctx->buf, src, len);
+}
+
+/*
+ * SHA-256 finalization.  Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA256_Final(unsigned char digest[static SHA256_DIGEST_LENGTH], SHA256_CTX *ctx)
+{
+
+	/* Add padding */
+	SHA256_Pad(ctx);
+
+	/* Write the hash */
+	be32enc_vect(digest, ctx->state, SHA256_DIGEST_LENGTH);
+
+	/* Clear the context state */
+	explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* SHA-224: ******************************************************* */
+/*
+ * the SHA224 and SHA256 transforms are identical
+ */
+
+/* SHA-224 initialization.  Begins a SHA-224 operation. */
+void
+SHA224_Init(SHA224_CTX * ctx)
+{
+
+	/* Zero bits processed so far */
+	ctx->count = 0;
+
+	/* Magic initialization constants */
+	ctx->state[0] = 0xC1059ED8;
+	ctx->state[1] = 0x367CD507;
+	ctx->state[2] = 0x3070DD17;
+	ctx->state[3] = 0xF70E5939;
+	ctx->state[4] = 0xFFC00B31;
+	ctx->state[5] = 0x68581511;
+	ctx->state[6] = 0x64f98FA7;
+	ctx->state[7] = 0xBEFA4FA4;
+}
+
+/* Add bytes into the SHA-224 hash */
+void
+SHA224_Update(SHA224_CTX * ctx, const void *in, size_t len)
+{
+
+	SHA256_Update((SHA256_CTX *)ctx, in, len);
+}
+
+/*
+ * SHA-224 finalization.  Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA224_Final(unsigned char digest[static SHA224_DIGEST_LENGTH], SHA224_CTX *ctx)
+{
+
+	/* Add padding */
+	SHA256_Pad((SHA256_CTX *)ctx);
+
+	/* Write the hash */
+	be32enc_vect(digest, ctx->state, SHA224_DIGEST_LENGTH);
+
+	/* Clear the context state */
+	explicit_bzero(ctx, sizeof (*ctx));
+}
+
+#ifdef WEAK_REFS
+/*
+ * When building libmd, provide weak references. Note: this is not
+ * activated in the context of compiling these sources for internal
+ * use in libcrypt.
+ */
+#undef SHA256_Init
+__weak_reference(_libmd_SHA256_Init, SHA256_Init);
+#undef SHA256_Update
+__weak_reference(_libmd_SHA256_Update, SHA256_Update);
+#undef SHA256_Final
+__weak_reference(_libmd_SHA256_Final, SHA256_Final);
+#undef SHA256_Transform
+__weak_reference(_libmd_SHA256_Transform, SHA256_Transform);
+
+#undef SHA224_Init
+__weak_reference(_libmd_SHA224_Init, SHA224_Init);
+#undef SHA224_Update
+__weak_reference(_libmd_SHA224_Update, SHA224_Update);
+#undef SHA224_Final
+__weak_reference(_libmd_SHA224_Final, SHA224_Final);
+#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha384.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha384.h
new file mode 100644
index 000000000000..67250cee0313
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha384.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA384_H_
+#define	_SHA384_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define	SHA384_BLOCK_LENGTH		128
+#define	SHA384_DIGEST_LENGTH		48
+#define	SHA384_DIGEST_STRING_LENGTH	(SHA384_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA384Context {
+	uint64_t state[8];
+	uint64_t count[2];
+	uint8_t buf[SHA384_BLOCK_LENGTH];
+} SHA384_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+#ifndef SHA384_Init
+#define	SHA384_Init		_libmd_SHA384_Init
+#endif
+#ifndef SHA384_Update
+#define	SHA384_Update		_libmd_SHA384_Update
+#endif
+#ifndef SHA384_Final
+#define	SHA384_Final		_libmd_SHA384_Final
+#endif
+#ifndef SHA384_End
+#define	SHA384_End		_libmd_SHA384_End
+#endif
+#ifndef SHA384_Fd
+#define	SHA384_Fd		_libmd_SHA384_Fd
+#endif
+#ifndef SHA384_FdChunk
+#define	SHA384_FdChunk		_libmd_SHA384_FdChunk
+#endif
+#ifndef SHA384_File
+#define	SHA384_File		_libmd_SHA384_File
+#endif
+#ifndef SHA384_FileChunk
+#define	SHA384_FileChunk	_libmd_SHA384_FileChunk
+#endif
+#ifndef SHA384_Data
+#define	SHA384_Data		_libmd_SHA384_Data
+#endif
+
+#ifndef SHA384_version
+#define	SHA384_version		_libmd_SHA384_version
+#endif
+
+void	SHA384_Init(SHA384_CTX *);
+void	SHA384_Update(SHA384_CTX *, const void *, size_t);
+void	SHA384_Final(unsigned char [__min_size(SHA384_DIGEST_LENGTH)],
+    SHA384_CTX *);
+#ifndef _KERNEL
+char   *SHA384_End(SHA384_CTX *, char *);
+char   *SHA384_Data(const void *, unsigned int, char *);
+char   *SHA384_Fd(int, char *);
+char   *SHA384_FdChunk(int, char *, off_t, off_t);
+char   *SHA384_File(const char *, char *);
+char   *SHA384_FileChunk(const char *, char *, off_t, off_t);
+#endif
+
+__END_DECLS
+
+#endif /* !_SHA384_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha512.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha512.h
new file mode 100644
index 000000000000..b6fb733ca54e
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha512.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA512_H_
+#define	_SHA512_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define	SHA512_BLOCK_LENGTH		128
+#define	SHA512_DIGEST_LENGTH		64
+#define	SHA512_DIGEST_STRING_LENGTH	(SHA512_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA512Context {
+	uint64_t state[8];
+	uint64_t count[2];
+	uint8_t buf[SHA512_BLOCK_LENGTH];
+} SHA512_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+#if 0
+#ifndef SHA512_Init
+#define	SHA512_Init		_libmd_SHA512_Init
+#endif
+#ifndef SHA512_Update
+#define	SHA512_Update		_libmd_SHA512_Update
+#endif
+#ifndef SHA512_Final
+#define	SHA512_Final		_libmd_SHA512_Final
+#endif
+#endif
+#ifndef SHA512_End
+#define	SHA512_End		_libmd_SHA512_End
+#endif
+#ifndef SHA512_Fd
+#define	SHA512_Fd		_libmd_SHA512_Fd
+#endif
+#ifndef SHA512_FdChunk
+#define	SHA512_FdChunk		_libmd_SHA512_FdChunk
+#endif
+#ifndef SHA512_File
+#define	SHA512_File		_libmd_SHA512_File
+#endif
+#ifndef SHA512_FileChunk
+#define	SHA512_FileChunk	_libmd_SHA512_FileChunk
+#endif
+#ifndef SHA512_Data
+#define	SHA512_Data		_libmd_SHA512_Data
+#endif
+
+#ifndef SHA512_Transform
+#define	SHA512_Transform	_libmd_SHA512_Transform
+#endif
+#ifndef SHA512_version
+#define	SHA512_version		_libmd_SHA512_version
+#endif
+
+void	SHA512_Init(SHA512_CTX *);
+void	SHA512_Update(SHA512_CTX *, const void *, size_t);
+void	SHA512_Final(unsigned char [__min_size(SHA512_DIGEST_LENGTH)],
+    SHA512_CTX *);
+#ifndef _KERNEL
+char   *SHA512_End(SHA512_CTX *, char *);
+char   *SHA512_Data(const void *, unsigned int, char *);
+char   *SHA512_Fd(int, char *);
+char   *SHA512_FdChunk(int, char *, off_t, off_t);
+char   *SHA512_File(const char *, char *);
+char   *SHA512_FileChunk(const char *, char *, off_t, off_t);
+#endif
+
+__END_DECLS
+
+#endif /* !_SHA512_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha512c.c b/sys/contrib/openzfs/module/os/freebsd/spl/sha512c.c
new file mode 100644
index 000000000000..146f338f0ed4
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha512c.c
@@ -0,0 +1,508 @@
+/*
+ * Copyright 2005 Colin Percival
+ * Copyright (c) 2015 Allan Jude <allanjude@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/endian.h>
+#include <sys/types.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+#include "sha512.h"
+#include "sha512t.h"
+#include "sha384.h"
+
+#if BYTE_ORDER == BIG_ENDIAN
+
+/* Copy a vector of big-endian uint64_t into a vector of bytes */
+#define	be64enc_vect(dst, src, len)	\
+	memcpy((void *)dst, (const void *)src, (size_t)len)
+
+/* Copy a vector of bytes into a vector of big-endian uint64_t */
+#define	be64dec_vect(dst, src, len)	\
+	memcpy((void *)dst, (const void *)src, (size_t)len)
+
+#else /* BYTE_ORDER != BIG_ENDIAN */
+
+/*
+ * Encode a length len/4 vector of (uint64_t) into a length len vector of
+ * (unsigned char) in big-endian form.  Assumes len is a multiple of 8.
+ */
+static void
+be64enc_vect(unsigned char *dst, const uint64_t *src, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len / 8; i++)
+		be64enc(dst + i * 8, src[i]);
+}
+
+/*
+ * Decode a big-endian length len vector of (unsigned char) into a length
+ * len/4 vector of (uint64_t).  Assumes len is a multiple of 8.
+ */
+static void
+be64dec_vect(uint64_t *dst, const unsigned char *src, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len / 8; i++)
+		dst[i] = be64dec(src + i * 8);
+}
+
+#endif /* BYTE_ORDER != BIG_ENDIAN */
+
+/* SHA512 round constants. */
+static const uint64_t K[80] = {
+	0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
+	0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
+	0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+	0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
+	0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
+	0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+	0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
+	0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
+	0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+	0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
+	0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
+	0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+	0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
+	0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
+	0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+	0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
+	0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
+	0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+	0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
+	0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
+	0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+	0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
+	0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
+	0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+	0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
+	0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
+	0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+	0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
+	0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
+	0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+	0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
+	0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
+	0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+	0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
+	0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
+	0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+	0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
+	0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
+	0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+	0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+/* Elementary functions used by SHA512 */
+#define	Ch(x, y, z)	((x & (y ^ z)) ^ z)
+#define	Maj(x, y, z)	((x & (y | z)) | (y & z))
+#define	SHR(x, n)	(x >> n)
+#define	ROTR(x, n)	((x >> n) | (x << (64 - n)))
+#define	S0(x)		(ROTR(x, 28) ^ ROTR(x, 34) ^ ROTR(x, 39))
+#define	S1(x)		(ROTR(x, 14) ^ ROTR(x, 18) ^ ROTR(x, 41))
+#define	s0(x)		(ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x, 7))
+#define	s1(x)		(ROTR(x, 19) ^ ROTR(x, 61) ^ SHR(x, 6))
+
+/* SHA512 round function */
+#define	RND(a, b, c, d, e, f, g, h, k)			\
+	h += S1(e) + Ch(e, f, g) + k;			\
+	d += h;						\
+	h += S0(a) + Maj(a, b, c);
+
+/* Adjusted round function for rotating state */
+#define	RNDr(S, W, i, ii)			\
+	RND(S[(80 - i) % 8], S[(81 - i) % 8],	\
+	    S[(82 - i) % 8], S[(83 - i) % 8],	\
+	    S[(84 - i) % 8], S[(85 - i) % 8],	\
+	    S[(86 - i) % 8], S[(87 - i) % 8],	\
+	    W[i + ii] + K[i + ii])
+
+/* Message schedule computation */
+#define	MSCH(W, ii, i)				\
+	W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] +	\
+		s0(W[i + ii + 1]) + W[i + ii]
+
+/*
+ * SHA512 block compression function.  The 512-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+static void
+SHA512_Transform(uint64_t *state,
+    const unsigned char block[SHA512_BLOCK_LENGTH])
+{
+	uint64_t W[80];
+	uint64_t S[8];
+	int i;
+
+	/* 1. Prepare the first part of the message schedule W. */
+	be64dec_vect(W, block, SHA512_BLOCK_LENGTH);
+
+	/* 2. Initialize working variables. */
+	memcpy(S, state, SHA512_DIGEST_LENGTH);
+
+	/* 3. Mix. */
+	for (i = 0; i < 80; i += 16) {
+		RNDr(S, W, 0, i);
+		RNDr(S, W, 1, i);
+		RNDr(S, W, 2, i);
+		RNDr(S, W, 3, i);
+		RNDr(S, W, 4, i);
+		RNDr(S, W, 5, i);
+		RNDr(S, W, 6, i);
+		RNDr(S, W, 7, i);
+		RNDr(S, W, 8, i);
+		RNDr(S, W, 9, i);
+		RNDr(S, W, 10, i);
+		RNDr(S, W, 11, i);
+		RNDr(S, W, 12, i);
+		RNDr(S, W, 13, i);
+		RNDr(S, W, 14, i);
+		RNDr(S, W, 15, i);
+
+		if (i == 64)
+			break;
+		MSCH(W, 0, i);
+		MSCH(W, 1, i);
+		MSCH(W, 2, i);
+		MSCH(W, 3, i);
+		MSCH(W, 4, i);
+		MSCH(W, 5, i);
+		MSCH(W, 6, i);
+		MSCH(W, 7, i);
+		MSCH(W, 8, i);
+		MSCH(W, 9, i);
+		MSCH(W, 10, i);
+		MSCH(W, 11, i);
+		MSCH(W, 12, i);
+		MSCH(W, 13, i);
+		MSCH(W, 14, i);
+		MSCH(W, 15, i);
+	}
+
+	/* 4. Mix local working variables into global state */
+	for (i = 0; i < 8; i++)
+		state[i] += S[i];
+}
+
+static unsigned char PAD[SHA512_BLOCK_LENGTH] = {
+	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Add padding and terminating bit-count. */
+static void
+SHA512_Pad(SHA512_CTX * ctx)
+{
+	size_t r;
+
+	/* Figure out how many bytes we have buffered. */
+	r = (ctx->count[1] >> 3) & 0x7f;
+
+	/* Pad to 112 mod 128, transforming if we finish a block en route. */
+	if (r < 112) {
+		/* Pad to 112 mod 128. */
+		memcpy(&ctx->buf[r], PAD, 112 - r);
+	} else {
+		/* Finish the current block and mix. */
+		memcpy(&ctx->buf[r], PAD, 128 - r);
+		SHA512_Transform(ctx->state, ctx->buf);
+
+		/* The start of the final block is all zeroes. */
+		memset(&ctx->buf[0], 0, 112);
+	}
+
+	/* Add the terminating bit-count. */
+	be64enc_vect(&ctx->buf[112], ctx->count, 16);
+
+	/* Mix in the final block. */
+	SHA512_Transform(ctx->state, ctx->buf);
+}
+
+/* SHA-512 initialization.  Begins a SHA-512 operation. */
+void
+SHA512_Init(SHA512_CTX * ctx)
+{
+
+	/* Zero bits processed so far */
+	ctx->count[0] = ctx->count[1] = 0;
+
+	/* Magic initialization constants */
+	ctx->state[0] = 0x6a09e667f3bcc908ULL;
+	ctx->state[1] = 0xbb67ae8584caa73bULL;
+	ctx->state[2] = 0x3c6ef372fe94f82bULL;
+	ctx->state[3] = 0xa54ff53a5f1d36f1ULL;
+	ctx->state[4] = 0x510e527fade682d1ULL;
+	ctx->state[5] = 0x9b05688c2b3e6c1fULL;
+	ctx->state[6] = 0x1f83d9abfb41bd6bULL;
+	ctx->state[7] = 0x5be0cd19137e2179ULL;
+}
+
+/* Add bytes into the hash */
+void
+SHA512_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+	uint64_t bitlen[2];
+	uint64_t r;
+	const unsigned char *src = in;
+
+	/* Number of bytes left in the buffer from previous updates */
+	r = (ctx->count[1] >> 3) & 0x7f;
+
+	/* Convert the length into a number of bits */
+	bitlen[1] = ((uint64_t)len) << 3;
+	bitlen[0] = ((uint64_t)len) >> 61;
+
+	/* Update number of bits */
+	if ((ctx->count[1] += bitlen[1]) < bitlen[1])
+		ctx->count[0]++;
+	ctx->count[0] += bitlen[0];
+
+	/* Handle the case where we don't need to perform any transforms */
+	if (len < SHA512_BLOCK_LENGTH - r) {
+		memcpy(&ctx->buf[r], src, len);
+		return;
+	}
+
+	/* Finish the current block */
+	memcpy(&ctx->buf[r], src, SHA512_BLOCK_LENGTH - r);
+	SHA512_Transform(ctx->state, ctx->buf);
+	src += SHA512_BLOCK_LENGTH - r;
+	len -= SHA512_BLOCK_LENGTH - r;
+
+	/* Perform complete blocks */
+	while (len >= SHA512_BLOCK_LENGTH) {
+		SHA512_Transform(ctx->state, src);
+		src += SHA512_BLOCK_LENGTH;
+		len -= SHA512_BLOCK_LENGTH;
+	}
+
+	/* Copy left over data into buffer */
+	memcpy(ctx->buf, src, len);
+}
+
+/*
+ * SHA-512 finalization.  Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA512_Final(unsigned char digest[static SHA512_DIGEST_LENGTH], SHA512_CTX *ctx)
+{
+
+	/* Add padding */
+	SHA512_Pad(ctx);
+
+	/* Write the hash */
+	be64enc_vect(digest, ctx->state, SHA512_DIGEST_LENGTH);
+
+	/* Clear the context state */
+	explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* SHA-512t: ******************************************************** */
+/*
+ * the SHA512t transforms are identical to SHA512 so reuse the existing function
+ */
+void
+SHA512_224_Init(SHA512_CTX * ctx)
+{
+
+	/* Zero bits processed so far */
+	ctx->count[0] = ctx->count[1] = 0;
+
+	/* Magic initialization constants */
+	ctx->state[0] = 0x8c3d37c819544da2ULL;
+	ctx->state[1] = 0x73e1996689dcd4d6ULL;
+	ctx->state[2] = 0x1dfab7ae32ff9c82ULL;
+	ctx->state[3] = 0x679dd514582f9fcfULL;
+	ctx->state[4] = 0x0f6d2b697bd44da8ULL;
+	ctx->state[5] = 0x77e36f7304c48942ULL;
+	ctx->state[6] = 0x3f9d85a86a1d36c8ULL;
+	ctx->state[7] = 0x1112e6ad91d692a1ULL;
+}
+
+void
+SHA512_224_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+
+	SHA512_Update(ctx, in, len);
+}
+
+void
+SHA512_224_Final(unsigned char digest[static SHA512_224_DIGEST_LENGTH],
+    SHA512_CTX *ctx)
+{
+
+	/* Add padding */
+	SHA512_Pad(ctx);
+
+	/* Write the hash */
+	be64enc_vect(digest, ctx->state, SHA512_224_DIGEST_LENGTH);
+
+	/* Clear the context state */
+	explicit_bzero(ctx, sizeof (*ctx));
+}
+
+void
+SHA512_256_Init(SHA512_CTX * ctx)
+{
+
+	/* Zero bits processed so far */
+	ctx->count[0] = ctx->count[1] = 0;
+
+	/* Magic initialization constants */
+	ctx->state[0] = 0x22312194fc2bf72cULL;
+	ctx->state[1] = 0x9f555fa3c84c64c2ULL;
+	ctx->state[2] = 0x2393b86b6f53b151ULL;
+	ctx->state[3] = 0x963877195940eabdULL;
+	ctx->state[4] = 0x96283ee2a88effe3ULL;
+	ctx->state[5] = 0xbe5e1e2553863992ULL;
+	ctx->state[6] = 0x2b0199fc2c85b8aaULL;
+	ctx->state[7] = 0x0eb72ddc81c52ca2ULL;
+}
+
+void
+SHA512_256_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+
+	SHA512_Update(ctx, in, len);
+}
+
+void
+SHA512_256_Final(unsigned char digest[static SHA512_256_DIGEST_LENGTH],
+    SHA512_CTX * ctx)
+{
+
+	/* Add padding */
+	SHA512_Pad(ctx);
+
+	/* Write the hash */
+	be64enc_vect(digest, ctx->state, SHA512_256_DIGEST_LENGTH);
+
+	/* Clear the context state */
+	explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* ** SHA-384: ******************************************************** */
+/*
+ * the SHA384 and SHA512 transforms are identical, so SHA384 is skipped
+ */
+
+/* SHA-384 initialization.  Begins a SHA-384 operation. */
+void
+SHA384_Init(SHA384_CTX * ctx)
+{
+
+	/* Zero bits processed so far */
+	ctx->count[0] = ctx->count[1] = 0;
+
+	/* Magic initialization constants */
+	ctx->state[0] = 0xcbbb9d5dc1059ed8ULL;
+	ctx->state[1] = 0x629a292a367cd507ULL;
+	ctx->state[2] = 0x9159015a3070dd17ULL;
+	ctx->state[3] = 0x152fecd8f70e5939ULL;
+	ctx->state[4] = 0x67332667ffc00b31ULL;
+	ctx->state[5] = 0x8eb44a8768581511ULL;
+	ctx->state[6] = 0xdb0c2e0d64f98fa7ULL;
+	ctx->state[7] = 0x47b5481dbefa4fa4ULL;
+}
+
+/* Add bytes into the SHA-384 hash */
+void
+SHA384_Update(SHA384_CTX * ctx, const void *in, size_t len)
+{
+
+	SHA512_Update((SHA512_CTX *)ctx, in, len);
+}
+
+/*
+ * SHA-384 finalization.  Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA384_Final(unsigned char digest[static SHA384_DIGEST_LENGTH], SHA384_CTX *ctx)
+{
+
+	/* Add padding */
+	SHA512_Pad((SHA512_CTX *)ctx);
+
+	/* Write the hash */
+	be64enc_vect(digest, ctx->state, SHA384_DIGEST_LENGTH);
+
+	/* Clear the context state */
+	explicit_bzero(ctx, sizeof (*ctx));
+}
+
+#if 0
+/*
+ * When building libmd, provide weak references. Note: this is not
+ * activated in the context of compiling these sources for internal
+ * use in libcrypt.
+ */
+#undef SHA512_Init
+__weak_reference(_libmd_SHA512_Init, SHA512_Init);
+#undef SHA512_Update
+__weak_reference(_libmd_SHA512_Update, SHA512_Update);
+#undef SHA512_Final
+__weak_reference(_libmd_SHA512_Final, SHA512_Final);
+#undef SHA512_Transform
+__weak_reference(_libmd_SHA512_Transform, SHA512_Transform);
+
+#undef SHA512_224_Init
+__weak_reference(_libmd_SHA512_224_Init, SHA512_224_Init);
+#undef SHA512_224_Update
+__weak_reference(_libmd_SHA512_224_Update, SHA512_224_Update);
+#undef SHA512_224_Final
+__weak_reference(_libmd_SHA512_224_Final, SHA512_224_Final);
+
+#undef SHA512_256_Init
+__weak_reference(_libmd_SHA512_256_Init, SHA512_256_Init);
+#undef SHA512_256_Update
+__weak_reference(_libmd_SHA512_256_Update, SHA512_256_Update);
+#undef SHA512_256_Final
+__weak_reference(_libmd_SHA512_256_Final, SHA512_256_Final);
+
+#undef SHA384_Init
+__weak_reference(_libmd_SHA384_Init, SHA384_Init);
+#undef SHA384_Update
+__weak_reference(_libmd_SHA384_Update, SHA384_Update);
+#undef SHA384_Final
+__weak_reference(_libmd_SHA384_Final, SHA384_Final);
+#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha512t.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha512t.h
new file mode 100644
index 000000000000..703867fc0288
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha512t.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2015 Allan Jude <allanjude@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA512T_H_
+#define	_SHA512T_H_
+
+#include "sha512.h"
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define	SHA512_224_DIGEST_LENGTH	28
+#define	SHA512_224_DIGEST_STRING_LENGTH	(SHA512_224_DIGEST_LENGTH * 2 + 1)
+#define	SHA512_256_DIGEST_LENGTH	32
+#define	SHA512_256_DIGEST_STRING_LENGTH	(SHA512_256_DIGEST_LENGTH * 2 + 1)
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+#ifndef SHA512_224_Init
+#define	SHA512_224_Init		_libmd_SHA512_224_Init
+#endif
+#ifndef SHA512_224_Update
+#define	SHA512_224_Update	_libmd_SHA512_224_Update
+#endif
+#ifndef SHA512_224_Final
+#define	SHA512_224_Final	_libmd_SHA512_224_Final
+#endif
+#ifndef SHA512_224_End
+#define	SHA512_224_End		_libmd_SHA512_224_End
+#endif
+#ifndef SHA512_224_Fd
+#define	SHA512_224_Fd		_libmd_SHA512_224_Fd
+#endif
+#ifndef SHA512_224_FdChunk
+#define	SHA512_224_FdChunk	_libmd_SHA512_224_FdChunk
+#endif
+#ifndef SHA512_224_File
+#define	SHA512_224_File		_libmd_SHA512_224_File
+#endif
+#ifndef SHA512_224_FileChunk
+#define	SHA512_224_FileChunk	_libmd_SHA512_224_FileChunk
+#endif
+#ifndef SHA512_224_Data
+#define	SHA512_224_Data		_libmd_SHA512_224_Data
+#endif
+
+#ifndef SHA512_224_Transform
+#define	SHA512_224_Transform	_libmd_SHA512_224_Transform
+#endif
+#ifndef SHA512_224_version
+#define	SHA512_224_version	_libmd_SHA512_224_version
+#endif
+
+#ifndef SHA512_256_Init
+#define	SHA512_256_Init		_libmd_SHA512_256_Init
+#endif
+#ifndef SHA512_256_Update
+#define	SHA512_256_Update	_libmd_SHA512_256_Update
+#endif
+#ifndef SHA512_256_Final
+#define	SHA512_256_Final	_libmd_SHA512_256_Final
+#endif
+#ifndef SHA512_256_End
+#define	SHA512_256_End		_libmd_SHA512_256_End
+#endif
+#ifndef SHA512_256_Fd
+#define	SHA512_256_Fd		_libmd_SHA512_256_Fd
+#endif
+#ifndef SHA512_256_FdChunk
+#define	SHA512_256_FdChunk	_libmd_SHA512_256_FdChunk
+#endif
+#ifndef SHA512_256_File
+#define	SHA512_256_File		_libmd_SHA512_256_File
+#endif
+#ifndef SHA512_256_FileChunk
+#define	SHA512_256_FileChunk	_libmd_SHA512_256_FileChunk
+#endif
+#ifndef SHA512_256_Data
+#define	SHA512_256_Data		_libmd_SHA512_256_Data
+#endif
+
+#ifndef SHA512_256_Transform
+#define	SHA512_256_Transform	_libmd_SHA512_256_Transform
+#endif
+#ifndef SHA512_256_version
+#define	SHA512_256_version	_libmd_SHA512_256_version
+#endif
+
+void	SHA512_224_Init(SHA512_CTX *);
+void	SHA512_224_Update(SHA512_CTX *, const void *, size_t);
+void	SHA512_224_Final(unsigned char [__min_size(SHA512_224_DIGEST_LENGTH)],
+    SHA512_CTX *);
+#ifndef _KERNEL
+char   *SHA512_224_End(SHA512_CTX *, char *);
+char   *SHA512_224_Data(const void *, unsigned int, char *);
+char   *SHA512_224_Fd(int, char *);
+char   *SHA512_224_FdChunk(int, char *, off_t, off_t);
+char   *SHA512_224_File(const char *, char *);
+char   *SHA512_224_FileChunk(const char *, char *, off_t, off_t);
+#endif
+void	SHA512_256_Init(SHA512_CTX *);
+void	SHA512_256_Update(SHA512_CTX *, const void *, size_t);
+void	SHA512_256_Final(unsigned char [__min_size(SHA512_256_DIGEST_LENGTH)],
+    SHA512_CTX *);
+#ifndef _KERNEL
+char   *SHA512_256_End(SHA512_CTX *, char *);
+char   *SHA512_256_Data(const void *, unsigned int, char *);
+char   *SHA512_256_Fd(int, char *);
+char   *SHA512_256_FdChunk(int, char *, off_t, off_t);
+char   *SHA512_256_File(const char *, char *);
+char   *SHA512_256_FileChunk(const char *, char *, off_t, off_t);
+#endif
+
+__END_DECLS
+
+#endif /* !_SHA512T_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c
new file mode 100644
index 000000000000..18188ca0adec
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2008, 2009 Edward Tomasz Napierała <trasz@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/errno.h>
+#include <sys/zfs_acl.h>
+#include <sys/acl.h>
+
+struct zfs2bsd {
+	uint32_t	zb_zfs;
+	int		zb_bsd;
+};
+
+struct zfs2bsd perms[] = {{ACE_READ_DATA, ACL_READ_DATA},
+			{ACE_WRITE_DATA, ACL_WRITE_DATA},
+			{ACE_EXECUTE, ACL_EXECUTE},
+			{ACE_APPEND_DATA, ACL_APPEND_DATA},
+			{ACE_DELETE_CHILD, ACL_DELETE_CHILD},
+			{ACE_DELETE, ACL_DELETE},
+			{ACE_READ_ATTRIBUTES, ACL_READ_ATTRIBUTES},
+			{ACE_WRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES},
+			{ACE_READ_NAMED_ATTRS, ACL_READ_NAMED_ATTRS},
+			{ACE_WRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS},
+			{ACE_READ_ACL, ACL_READ_ACL},
+			{ACE_WRITE_ACL, ACL_WRITE_ACL},
+			{ACE_WRITE_OWNER, ACL_WRITE_OWNER},
+			{ACE_SYNCHRONIZE, ACL_SYNCHRONIZE},
+			{0, 0}};
+
+struct zfs2bsd flags[] = {{ACE_FILE_INHERIT_ACE,
+			    ACL_ENTRY_FILE_INHERIT},
+			{ACE_DIRECTORY_INHERIT_ACE,
+			    ACL_ENTRY_DIRECTORY_INHERIT},
+			{ACE_NO_PROPAGATE_INHERIT_ACE,
+			    ACL_ENTRY_NO_PROPAGATE_INHERIT},
+			{ACE_INHERIT_ONLY_ACE,
+			    ACL_ENTRY_INHERIT_ONLY},
+			{ACE_INHERITED_ACE,
+			    ACL_ENTRY_INHERITED},
+			{ACE_SUCCESSFUL_ACCESS_ACE_FLAG,
+			    ACL_ENTRY_SUCCESSFUL_ACCESS},
+			{ACE_FAILED_ACCESS_ACE_FLAG,
+			    ACL_ENTRY_FAILED_ACCESS},
+			{0, 0}};
+
+static int
+_bsd_from_zfs(uint32_t zfs, const struct zfs2bsd *table)
+{
+	const struct zfs2bsd *tmp;
+	int bsd = 0;
+
+	for (tmp = table; tmp->zb_zfs != 0; tmp++) {
+		if (zfs & tmp->zb_zfs)
+			bsd |= tmp->zb_bsd;
+	}
+
+	return (bsd);
+}
+
+static uint32_t
+_zfs_from_bsd(int bsd, const struct zfs2bsd *table)
+{
+	const struct zfs2bsd *tmp;
+	uint32_t zfs = 0;
+
+	for (tmp = table; tmp->zb_bsd != 0; tmp++) {
+		if (bsd & tmp->zb_bsd)
+			zfs |= tmp->zb_zfs;
+	}
+
+	return (zfs);
+}
+
+int
+acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries)
+{
+	int i;
+	struct acl_entry *entry;
+	const ace_t *ace;
+
+	if (nentries < 1) {
+		printf("acl_from_aces: empty ZFS ACL; returning EINVAL.\n");
+		return (EINVAL);
+	}
+
+	if (nentries > ACL_MAX_ENTRIES) {
+		/*
+		 * I believe it may happen only when moving a pool
+		 * from SunOS to FreeBSD.
+		 */
+		printf("acl_from_aces: ZFS ACL too big to fit "
+		    "into 'struct acl'; returning EINVAL.\n");
+		return (EINVAL);
+	}
+
+	bzero(aclp, sizeof (*aclp));
+	aclp->acl_maxcnt = ACL_MAX_ENTRIES;
+	aclp->acl_cnt = nentries;
+
+	for (i = 0; i < nentries; i++) {
+		entry = &(aclp->acl_entry[i]);
+		ace = &(aces[i]);
+
+		if (ace->a_flags & ACE_OWNER)
+			entry->ae_tag = ACL_USER_OBJ;
+		else if (ace->a_flags & ACE_GROUP)
+			entry->ae_tag = ACL_GROUP_OBJ;
+		else if (ace->a_flags & ACE_EVERYONE)
+			entry->ae_tag = ACL_EVERYONE;
+		else if (ace->a_flags & ACE_IDENTIFIER_GROUP)
+			entry->ae_tag = ACL_GROUP;
+		else
+			entry->ae_tag = ACL_USER;
+
+		if (entry->ae_tag == ACL_USER || entry->ae_tag == ACL_GROUP)
+			entry->ae_id = ace->a_who;
+		else
+			entry->ae_id = ACL_UNDEFINED_ID;
+
+		entry->ae_perm = _bsd_from_zfs(ace->a_access_mask, perms);
+		entry->ae_flags = _bsd_from_zfs(ace->a_flags, flags);
+
+		switch (ace->a_type) {
+		case ACE_ACCESS_ALLOWED_ACE_TYPE:
+			entry->ae_entry_type = ACL_ENTRY_TYPE_ALLOW;
+			break;
+		case ACE_ACCESS_DENIED_ACE_TYPE:
+			entry->ae_entry_type = ACL_ENTRY_TYPE_DENY;
+			break;
+		case ACE_SYSTEM_AUDIT_ACE_TYPE:
+			entry->ae_entry_type = ACL_ENTRY_TYPE_AUDIT;
+			break;
+		case ACE_SYSTEM_ALARM_ACE_TYPE:
+			entry->ae_entry_type = ACL_ENTRY_TYPE_ALARM;
+			break;
+		default:
+			panic("acl_from_aces: a_type is 0x%x", ace->a_type);
+		}
+	}
+
+	return (0);
+}
+
+void
+aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp)
+{
+	int i;
+	const struct acl_entry *entry;
+	ace_t *ace;
+
+	bzero(aces, sizeof (*aces) * aclp->acl_cnt);
+
+	*nentries = aclp->acl_cnt;
+
+	for (i = 0; i < aclp->acl_cnt; i++) {
+		entry = &(aclp->acl_entry[i]);
+		ace = &(aces[i]);
+
+		ace->a_who = entry->ae_id;
+
+		if (entry->ae_tag == ACL_USER_OBJ)
+			ace->a_flags = ACE_OWNER;
+		else if (entry->ae_tag == ACL_GROUP_OBJ)
+			ace->a_flags = (ACE_GROUP | ACE_IDENTIFIER_GROUP);
+		else if (entry->ae_tag == ACL_GROUP)
+			ace->a_flags = ACE_IDENTIFIER_GROUP;
+		else if (entry->ae_tag == ACL_EVERYONE)
+			ace->a_flags = ACE_EVERYONE;
+		else /* ACL_USER */
+			ace->a_flags = 0;
+
+		ace->a_access_mask = _zfs_from_bsd(entry->ae_perm, perms);
+		ace->a_flags |= _zfs_from_bsd(entry->ae_flags, flags);
+
+		switch (entry->ae_entry_type) {
+		case ACL_ENTRY_TYPE_ALLOW:
+			ace->a_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+			break;
+		case ACL_ENTRY_TYPE_DENY:
+			ace->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+			break;
+		case ACL_ENTRY_TYPE_ALARM:
+			ace->a_type = ACE_SYSTEM_ALARM_ACE_TYPE;
+			break;
+		case ACL_ENTRY_TYPE_AUDIT:
+			ace->a_type = ACE_SYSTEM_AUDIT_ACE_TYPE;
+			break;
+		default:
+			panic("aces_from_acl: ae_entry_type is 0x%x",
+			    entry->ae_entry_type);
+		}
+	}
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c
new file mode 100644
index 000000000000..80040fc6a3e3
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/atomic.h>
+
+#if !defined(__LP64__) && !defined(__mips_n32) && \
+	!defined(ARM_HAVE_ATOMIC64) && !defined(I386_HAVE_ATOMIC64) && \
+	!defined(HAS_EMULATED_ATOMIC64)
+
+#ifdef _KERNEL
+#include <sys/kernel.h>
+
+struct mtx atomic_mtx;
+MTX_SYSINIT(atomic, &atomic_mtx, "atomic", MTX_DEF);
+#else
+#include <pthread.h>
+
+#define	mtx_lock(lock)		pthread_mutex_lock(lock)
+#define	mtx_unlock(lock)	pthread_mutex_unlock(lock)
+
+static pthread_mutex_t atomic_mtx;
+
+static __attribute__((constructor)) void
+atomic_init(void)
+{
+	pthread_mutex_init(&atomic_mtx, NULL);
+}
+#endif
+
+void
+atomic_add_64(volatile uint64_t *target, int64_t delta)
+{
+
+	mtx_lock(&atomic_mtx);
+	*target += delta;
+	mtx_unlock(&atomic_mtx);
+}
+
+void
+atomic_dec_64(volatile uint64_t *target)
+{
+
+	mtx_lock(&atomic_mtx);
+	*target -= 1;
+	mtx_unlock(&atomic_mtx);
+}
+
+uint64_t
+atomic_swap_64(volatile uint64_t *a, uint64_t value)
+{
+	uint64_t ret;
+
+	mtx_lock(&atomic_mtx);
+	ret = *a;
+	*a = value;
+	mtx_unlock(&atomic_mtx);
+	return (ret);
+}
+
+uint64_t
+atomic_load_64(volatile uint64_t *a)
+{
+	uint64_t ret;
+
+	mtx_lock(&atomic_mtx);
+	ret = *a;
+	mtx_unlock(&atomic_mtx);
+	return (ret);
+}
+
+uint64_t
+atomic_add_64_nv(volatile uint64_t *target, int64_t delta)
+{
+	uint64_t newval;
+
+	mtx_lock(&atomic_mtx);
+	newval = (*target += delta);
+	mtx_unlock(&atomic_mtx);
+	return (newval);
+}
+
+uint64_t
+atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval)
+{
+	uint64_t oldval;
+
+	mtx_lock(&atomic_mtx);
+	oldval = *target;
+	if (oldval == cmp)
+		*target = newval;
+	mtx_unlock(&atomic_mtx);
+	return (oldval);
+}
+#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_cmn_err.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_cmn_err.c
new file mode 100644
index 000000000000..22c7338b7399
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_cmn_err.c
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD$
+ */
+/*
+ * Copyright 2007 John Birrell <jb@FreeBSD.org>. All rights reserved.
+ * Copyright 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cmn_err.h>
+
+void
+vcmn_err(int ce, const char *fmt, va_list adx)
+{
+	char buf[256];
+	const char *prefix;
+
+	prefix = NULL; /* silence unwitty compilers */
+	switch (ce) {
+	case CE_CONT:
+		prefix = "Solaris(cont): ";
+		break;
+	case CE_NOTE:
+		prefix = "Solaris: NOTICE: ";
+		break;
+	case CE_WARN:
+		prefix = "Solaris: WARNING: ";
+		break;
+	case CE_PANIC:
+		prefix = "Solaris(panic): ";
+		break;
+	case CE_IGNORE:
+		break;
+	default:
+		panic("Solaris: unknown severity level");
+	}
+	if (ce == CE_PANIC) {
+		vsnprintf(buf, sizeof (buf), fmt, adx);
+		panic("%s%s", prefix, buf);
+	}
+	if (ce != CE_IGNORE) {
+		printf("%s", prefix);
+		vprintf(fmt, adx);
+		printf("\n");
+	}
+}
+
+void
+cmn_err(int type, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	vcmn_err(type, fmt, ap);
+	va_end(ap);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c
new file mode 100644
index 000000000000..6b2872bcc066
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2014 The FreeBSD Project.
+ * All rights reserved.
+ *
+ * This software was developed by Steven Hartland.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/sdt.h>
+
+/* CSTYLED */
+SDT_PROBE_DEFINE1(sdt, , , set__error, "int");
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
new file mode 100644
index 000000000000..cfc61dd7fc2a
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2006-2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/byteorder.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/debug.h>
+#include <sys/mutex.h>
+#include <sys/vmmeter.h>
+
+
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+
+#ifdef KMEM_DEBUG
+#include <sys/queue.h>
+#include <sys/stack.h>
+#endif
+
+#ifdef _KERNEL
+MALLOC_DEFINE(M_SOLARIS, "solaris", "Solaris");
+#else
+#define	malloc(size, type, flags)	malloc(size)
+#define	free(addr, type)		free(addr)
+#endif
+
+#ifdef KMEM_DEBUG
+struct kmem_item {
+	struct stack	stack;
+	LIST_ENTRY(kmem_item) next;
+};
+static LIST_HEAD(, kmem_item) kmem_items;
+static struct mtx kmem_items_mtx;
+MTX_SYSINIT(kmem_items_mtx, &kmem_items_mtx, "kmem_items", MTX_DEF);
+#endif	/* KMEM_DEBUG */
+
+#include <sys/vmem.h>
+
+void *
+zfs_kmem_alloc(size_t size, int kmflags)
+{
+	void *p;
+#ifdef KMEM_DEBUG
+	struct kmem_item *i;
+
+	size += sizeof (struct kmem_item);
+#endif
+	p = malloc(MAX(size, 16), M_SOLARIS, kmflags);
+#ifndef _KERNEL
+	if (kmflags & KM_SLEEP)
+		assert(p != NULL);
+#endif
+#ifdef KMEM_DEBUG
+	if (p != NULL) {
+		i = p;
+		p = (uint8_t *)p + sizeof (struct kmem_item);
+		stack_save(&i->stack);
+		mtx_lock(&kmem_items_mtx);
+		LIST_INSERT_HEAD(&kmem_items, i, next);
+		mtx_unlock(&kmem_items_mtx);
+	}
+#endif
+	return (p);
+}
+
+void
+zfs_kmem_free(void *buf, size_t size __unused)
+{
+#ifdef KMEM_DEBUG
+	if (buf == NULL) {
+		printf("%s: attempt to free NULL\n", __func__);
+		return;
+	}
+	struct kmem_item *i;
+
+	buf = (uint8_t *)buf - sizeof (struct kmem_item);
+	mtx_lock(&kmem_items_mtx);
+	LIST_FOREACH(i, &kmem_items, next) {
+		if (i == buf)
+			break;
+	}
+	ASSERT(i != NULL);
+	LIST_REMOVE(i, next);
+	mtx_unlock(&kmem_items_mtx);
+	memset(buf, 0xDC, MAX(size, 16));
+#endif
+	free(buf, M_SOLARIS);
+}
+
+static uint64_t kmem_size_val;
+
+static void
+kmem_size_init(void *unused __unused)
+{
+
+	kmem_size_val = (uint64_t)vm_cnt.v_page_count * PAGE_SIZE;
+	if (kmem_size_val > vm_kmem_size)
+		kmem_size_val = vm_kmem_size;
+}
+SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL);
+
+uint64_t
+kmem_size(void)
+{
+
+	return (kmem_size_val);
+}
+
+static int
+kmem_std_constructor(void *mem, int size __unused, void *private, int flags)
+{
+	struct kmem_cache *cache = private;
+
+	return (cache->kc_constructor(mem, cache->kc_private, flags));
+}
+
+static void
+kmem_std_destructor(void *mem, int size __unused, void *private)
+{
+	struct kmem_cache *cache = private;
+
+	cache->kc_destructor(mem, cache->kc_private);
+}
+
+kmem_cache_t *
+kmem_cache_create(char *name, size_t bufsize, size_t align,
+    int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
+    void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags)
+{
+	kmem_cache_t *cache;
+
+	ASSERT(vmp == NULL);
+
+	cache = kmem_alloc(sizeof (*cache), KM_SLEEP);
+	strlcpy(cache->kc_name, name, sizeof (cache->kc_name));
+	cache->kc_constructor = constructor;
+	cache->kc_destructor = destructor;
+	cache->kc_private = private;
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+	cache->kc_zone = uma_zcreate(cache->kc_name, bufsize,
+	    constructor != NULL ? kmem_std_constructor : NULL,
+	    destructor != NULL ? kmem_std_destructor : NULL,
+	    NULL, NULL, align > 0 ? align - 1 : 0, cflags);
+#else
+	cache->kc_size = bufsize;
+#endif
+
+	return (cache);
+}
+
+void
+kmem_cache_destroy(kmem_cache_t *cache)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+	uma_zdestroy(cache->kc_zone);
+#endif
+	kmem_free(cache, sizeof (*cache));
+}
+
+void *
+kmem_cache_alloc(kmem_cache_t *cache, int flags)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+	return (uma_zalloc_arg(cache->kc_zone, cache, flags));
+#else
+	void *p;
+
+	p = kmem_alloc(cache->kc_size, flags);
+	if (p != NULL && cache->kc_constructor != NULL)
+		kmem_std_constructor(p, cache->kc_size, cache, flags);
+	return (p);
+#endif
+}
+
+void
+kmem_cache_free(kmem_cache_t *cache, void *buf)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+	uma_zfree_arg(cache->kc_zone, buf, cache);
+#else
+	if (cache->kc_destructor != NULL)
+		kmem_std_destructor(buf, cache->kc_size, cache);
+	kmem_free(buf, cache->kc_size);
+#endif
+}
+
+/*
+ * Allow our caller to determine if there are running reaps.
+ *
+ * This call is very conservative and may return B_TRUE even when
+ * reaping activity isn't active. If it returns B_FALSE, then reaping
+ * activity is definitely inactive.
+ */
+boolean_t
+kmem_cache_reap_active(void)
+{
+
+	return (B_FALSE);
+}
+
+/*
+ * Reap (almost) everything soon.
+ *
+ * Note: this does not wait for the reap-tasks to complete. Caller
+ * should use kmem_cache_reap_active() (above) and/or moderation to
+ * avoid scheduling too many reap-tasks.
+ */
+#ifdef _KERNEL
+void
+kmem_cache_reap_soon(kmem_cache_t *cache)
+{
+#ifndef KMEM_DEBUG
+#if __FreeBSD_version >= 1300043
+	uma_zone_reclaim(cache->kc_zone, UMA_RECLAIM_DRAIN);
+#else
+	zone_drain(cache->kc_zone);
+#endif
+#endif
+}
+
+void
+kmem_reap(void)
+{
+#if __FreeBSD_version >= 1300043
+	uma_reclaim(UMA_RECLAIM_TRIM);
+#else
+	uma_reclaim();
+#endif
+}
+#else
+void
+kmem_cache_reap_soon(kmem_cache_t *cache __unused)
+{
+}
+
+void
+kmem_reap(void)
+{
+}
+#endif
+
+int
+kmem_debugging(void)
+{
+	return (0);
+}
+
+void *
+calloc(size_t n, size_t s)
+{
+	return (kmem_zalloc(n * s, KM_NOSLEEP));
+}
+
+char *
+kmem_vasprintf(const char *fmt, va_list adx)
+{
+	char *msg;
+	va_list adx2;
+
+	va_copy(adx2, adx);
+	msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP);
+	(void) vsprintf(msg, fmt, adx2);
+	va_end(adx2);
+
+	return (msg);
+}
+
+#include <vm/uma.h>
+#include <vm/uma_int.h>
+#ifdef KMEM_DEBUG
+#error "KMEM_DEBUG not currently supported"
+#endif
+
+uint64_t
+spl_kmem_cache_inuse(kmem_cache_t *cache)
+{
+	return (uma_zone_get_cur(cache->kc_zone));
+}
+
+uint64_t
+spl_kmem_cache_entry_size(kmem_cache_t *cache)
+{
+	return (cache->kc_zone->uz_size);
+}
+
+/*
+ * Register a move callback for cache defragmentation.
+ * XXX: Unimplemented but harmless to stub out for now.
+ */
+void
+spl_kmem_cache_set_move(kmem_cache_t *skc,
+    kmem_cbrc_t (move)(void *, void *, size_t, void *))
+{
+	ASSERT(move != NULL);
+}
+
+#ifdef KMEM_DEBUG
+void kmem_show(void *);
+void
+kmem_show(void *dummy __unused)
+{
+	struct kmem_item *i;
+
+	mtx_lock(&kmem_items_mtx);
+	if (LIST_EMPTY(&kmem_items))
+		printf("KMEM_DEBUG: No leaked elements.\n");
+	else {
+		printf("KMEM_DEBUG: Leaked elements:\n\n");
+		LIST_FOREACH(i, &kmem_items, next) {
+			printf("address=%p\n", i);
+			stack_print_ddb(&i->stack);
+			printf("\n");
+		}
+	}
+	mtx_unlock(&kmem_items_mtx);
+}
+
+SYSUNINIT(sol_kmem, SI_SUB_CPU, SI_ORDER_FIRST, kmem_show, NULL);
+#endif	/* KMEM_DEBUG */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c
new file mode 100644
index 000000000000..756667045b17
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/sysctl.h>
+#include <sys/kstat.h>
+
+static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics");
+
+SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW, 0, "Kernel statistics");
+
+void
+__kstat_set_raw_ops(kstat_t *ksp,
+    int (*headers)(char *buf, size_t size),
+    int (*data)(char *buf, size_t size, void *data),
+    void *(*addr)(kstat_t *ksp, loff_t index))
+{
+	ksp->ks_raw_ops.headers = headers;
+	ksp->ks_raw_ops.data    = data;
+	ksp->ks_raw_ops.addr    = addr;
+}
+
+static int
+kstat_default_update(kstat_t *ksp, int rw)
+{
+	ASSERT(ksp != NULL);
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	return (0);
+}
+
+kstat_t *
+__kstat_create(const char *module, int instance, const char *name,
+    const char *class, uchar_t ks_type, uint_t ks_ndata, uchar_t flags)
+{
+	struct sysctl_oid *root;
+	kstat_t *ksp;
+
+	KASSERT(instance == 0, ("instance=%d", instance));
+	if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
+		ASSERT(ks_ndata == 1);
+
+	/*
+	 * Allocate the main structure. We don't need to copy module/class/name
+	 * stuff in here, because it is only used for sysctl node creation
+	 * done in this function.
+	 */
+	ksp = malloc(sizeof (*ksp), M_KSTAT, M_WAITOK|M_ZERO);
+
+	ksp->ks_crtime = gethrtime();
+	ksp->ks_snaptime = ksp->ks_crtime;
+	ksp->ks_instance = instance;
+	strncpy(ksp->ks_name, name, KSTAT_STRLEN);
+	strncpy(ksp->ks_class, class, KSTAT_STRLEN);
+	ksp->ks_type = ks_type;
+	ksp->ks_flags = flags;
+	ksp->ks_update = kstat_default_update;
+
+	switch (ksp->ks_type) {
+		case KSTAT_TYPE_RAW:
+			ksp->ks_ndata = 1;
+			ksp->ks_data_size = ks_ndata;
+			break;
+		case KSTAT_TYPE_NAMED:
+			ksp->ks_ndata = ks_ndata;
+			ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t);
+			break;
+		case KSTAT_TYPE_INTR:
+			ksp->ks_ndata = ks_ndata;
+			ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t);
+			break;
+		case KSTAT_TYPE_IO:
+			ksp->ks_ndata = ks_ndata;
+			ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t);
+			break;
+		case KSTAT_TYPE_TIMER:
+			ksp->ks_ndata = ks_ndata;
+			ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t);
+			break;
+		default:
+			panic("Undefined kstat type %d\n", ksp->ks_type);
+	}
+
+	if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) {
+		ksp->ks_data = NULL;
+	} else {
+		ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP);
+		if (ksp->ks_data == NULL) {
+			kmem_free(ksp, sizeof (*ksp));
+			ksp = NULL;
+		}
+	}
+	/*
+	 * Create sysctl tree for those statistics:
+	 *
+	 *	kstat.<module>.<class>.<name>.
+	 */
+	sysctl_ctx_init(&ksp->ks_sysctl_ctx);
+	root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx,
+	    SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, CTLFLAG_RW, 0,
+	    "");
+	if (root == NULL) {
+		printf("%s: Cannot create kstat.%s tree!\n", __func__, module);
+		sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+		free(ksp, M_KSTAT);
+		return (NULL);
+	}
+	root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root),
+	    OID_AUTO, class, CTLFLAG_RW, 0, "");
+	if (root == NULL) {
+		printf("%s: Cannot create kstat.%s.%s tree!\n", __func__,
+		    module, class);
+		sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+		free(ksp, M_KSTAT);
+		return (NULL);
+	}
+	root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root),
+	    OID_AUTO, name, CTLFLAG_RW, 0, "");
+	if (root == NULL) {
+		printf("%s: Cannot create kstat.%s.%s.%s tree!\n", __func__,
+		    module, class, name);
+		sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+		free(ksp, M_KSTAT);
+		return (NULL);
+	}
+	ksp->ks_sysctl_root = root;
+
+	return (ksp);
+}
+
+static int
+kstat_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	kstat_t *ksp = arg1;
+	kstat_named_t *ksent = ksp->ks_data;
+	uint64_t val;
+
+	/* Select the correct element */
+	ksent += arg2;
+	/* Update the aggsums before reading */
+	(void) ksp->ks_update(ksp, KSTAT_READ);
+	val = ksent->value.ui64;
+
+	return (sysctl_handle_64(oidp, &val, 0, req));
+}
+
+static int
+kstat_sysctl_string(SYSCTL_HANDLER_ARGS)
+{
+	kstat_t *ksp = arg1;
+	kstat_named_t *ksent = ksp->ks_data;
+	char *val;
+	uint32_t len = 0;
+
+	/* Select the correct element */
+	ksent += arg2;
+	/* Update the aggsums before reading */
+	(void) ksp->ks_update(ksp, KSTAT_READ);
+	val = KSTAT_NAMED_STR_PTR(ksent);
+	len = KSTAT_NAMED_STR_BUFLEN(ksent);
+	val[len-1] = '\0';
+
+	return (sysctl_handle_string(oidp, val, len, req));
+}
+
+void
+kstat_install(kstat_t *ksp)
+{
+	kstat_named_t *ksent;
+	char *namelast;
+	int typelast;
+
+	ksent = ksp->ks_data;
+	if (ksp->ks_ndata == UINT32_MAX) {
+#ifdef INVARIANTS
+		printf("can't handle raw ops yet!!!\n");
+#endif
+		return;
+	}
+	if (ksent == NULL) {
+		printf("%s ksp->ks_data == NULL!!!!\n", __func__);
+		return;
+	}
+	typelast = 0;
+	namelast = NULL;
+	for (int i = 0; i < ksp->ks_ndata; i++, ksent++) {
+		if (ksent->data_type != 0) {
+			typelast = ksent->data_type;
+			namelast = ksent->name;
+		}
+		switch (typelast) {
+			case KSTAT_DATA_CHAR:
+				/* Not Implemented */
+				break;
+			case KSTAT_DATA_INT32:
+				SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+				    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+				    OID_AUTO, namelast,
+				    CTLTYPE_S32 | CTLFLAG_RD, ksp, i,
+				    kstat_sysctl, "I", namelast);
+				break;
+			case KSTAT_DATA_UINT32:
+				SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+				    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+				    OID_AUTO, namelast,
+				    CTLTYPE_U32 | CTLFLAG_RD, ksp, i,
+				    kstat_sysctl, "IU", namelast);
+				break;
+			case KSTAT_DATA_INT64:
+				SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+				    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+				    OID_AUTO, namelast,
+				    CTLTYPE_S64 | CTLFLAG_RD, ksp, i,
+				    kstat_sysctl, "Q", namelast);
+				break;
+			case KSTAT_DATA_UINT64:
+				SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+				    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+				    OID_AUTO, namelast,
+				    CTLTYPE_U64 | CTLFLAG_RD, ksp, i,
+				    kstat_sysctl, "QU", namelast);
+				break;
+			case KSTAT_DATA_LONG:
+				SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+				    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+				    OID_AUTO, namelast,
+				    CTLTYPE_LONG | CTLFLAG_RD, ksp, i,
+				    kstat_sysctl, "L", namelast);
+				break;
+			case KSTAT_DATA_ULONG:
+				SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+				    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+				    OID_AUTO, namelast,
+				    CTLTYPE_ULONG | CTLFLAG_RD, ksp, i,
+				    kstat_sysctl, "LU", namelast);
+				break;
+			case KSTAT_DATA_STRING:
+				SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+				    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+				    OID_AUTO, namelast,
+				    CTLTYPE_STRING | CTLFLAG_RD, ksp, i,
+				    kstat_sysctl_string, "A", namelast);
+				break;
+			default:
+				panic("unsupported type: %d", typelast);
+		}
+
+	}
+}
+
+void
+kstat_delete(kstat_t *ksp)
+{
+
+	sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+	free(ksp, M_KSTAT);
+}
+
+void
+kstat_waitq_enter(kstat_io_t *kiop)
+{
+	hrtime_t new, delta;
+	ulong_t wcnt;
+
+	new = gethrtime();
+	delta = new - kiop->wlastupdate;
+	kiop->wlastupdate = new;
+	wcnt = kiop->wcnt++;
+	if (wcnt != 0) {
+		kiop->wlentime += delta * wcnt;
+		kiop->wtime += delta;
+	}
+}
+
+void
+kstat_waitq_exit(kstat_io_t *kiop)
+{
+	hrtime_t new, delta;
+	ulong_t wcnt;
+
+	new = gethrtime();
+	delta = new - kiop->wlastupdate;
+	kiop->wlastupdate = new;
+	wcnt = kiop->wcnt--;
+	ASSERT((int)wcnt > 0);
+	kiop->wlentime += delta * wcnt;
+	kiop->wtime += delta;
+}
+
+void
+kstat_runq_enter(kstat_io_t *kiop)
+{
+	hrtime_t new, delta;
+	ulong_t rcnt;
+
+	new = gethrtime();
+	delta = new - kiop->rlastupdate;
+	kiop->rlastupdate = new;
+	rcnt = kiop->rcnt++;
+	if (rcnt != 0) {
+		kiop->rlentime += delta * rcnt;
+		kiop->rtime += delta;
+	}
+}
+
+void
+kstat_runq_exit(kstat_io_t *kiop)
+{
+	hrtime_t new, delta;
+	ulong_t rcnt;
+
+	new = gethrtime();
+	delta = new - kiop->rlastupdate;
+	kiop->rlastupdate = new;
+	rcnt = kiop->rcnt--;
+	ASSERT((int)rcnt > 0);
+	kiop->rlentime += delta * rcnt;
+	kiop->rtime += delta;
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
new file mode 100644
index 000000000000..0354b986cd5f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/limits.h>
+#include <sys/misc.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <sys/zfs_context.h>
+
+static struct opensolaris_utsname hw_utsname = {
+	.machine = MACHINE
+};
+
+#ifndef KERNEL_STATIC
+char hw_serial[11] = "0";
+
+utsname_t *
+utsname(void)
+{
+	return (&hw_utsname);
+}
+#endif
+
+static void
+opensolaris_utsname_init(void *arg)
+{
+
+	hw_utsname.sysname = ostype;
+	hw_utsname.nodename = prison0.pr_hostname;
+	hw_utsname.release = osrelease;
+	snprintf(hw_utsname.version, sizeof (hw_utsname.version),
+	    "%d", osreldate);
+}
+
+char *
+kmem_strdup(const char *s)
+{
+	char *buf;
+
+	buf = kmem_alloc(strlen(s) + 1, KM_SLEEP);
+	strcpy(buf, s);
+	return (buf);
+}
+
+int
+ddi_copyin(const void *from, void *to, size_t len, int flags)
+{
+	/* Fake ioctl() issued by kernel, 'from' is a kernel address */
+	if (flags & FKIOCTL) {
+		memcpy(to, from, len);
+		return (0);
+	}
+
+	return (copyin(from, to, len));
+}
+
+int
+ddi_copyout(const void *from, void *to, size_t len, int flags)
+{
+	/* Fake ioctl() issued by kernel, 'from' is a kernel address */
+	if (flags & FKIOCTL) {
+		memcpy(to, from, len);
+		return (0);
+	}
+
+	return (copyout(from, to, len));
+}
+
+int
+spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	vpanic(fmt, ap);
+	va_end(ap);
+}
+
+
+SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
+    opensolaris_utsname_init, NULL);
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
new file mode 100644
index 000000000000..5cd5c69efa71
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/mntent.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/jail.h>
+#include <sys/policy.h>
+#include <sys/zfs_vfsops.h>
+
+
+int
+secpolicy_nfs(cred_t *cr)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_NFS_DAEMON));
+}
+
+int
+secpolicy_zfs(cred_t *cr)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT));
+}
+
+int
+secpolicy_zfs_proc(cred_t *cr, proc_t *proc)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT));
+}
+
+int
+secpolicy_sys_config(cred_t *cr, int checkonly __unused)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_ZFS_POOL_CONFIG));
+}
+
+int
+secpolicy_zinject(cred_t *cr)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_ZFS_INJECT));
+}
+
+int
+secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp __unused)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_VFS_UNMOUNT));
+}
+
+int
+secpolicy_fs_owner(struct mount *mp, cred_t *cr)
+{
+
+	if (zfs_super_owner) {
+		if (cr->cr_uid == mp->mnt_cred->cr_uid &&
+		    cr->cr_prison == mp->mnt_cred->cr_prison) {
+			return (0);
+		}
+	}
+	return (EPERM);
+}
+
+/*
+ * This check is done in kern_link(), so we could just return 0 here.
+ */
+extern int hardlink_check_uid;
+int
+secpolicy_basic_link(vnode_t *vp, cred_t *cr)
+{
+
+	if (!hardlink_check_uid)
+		return (0);
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+	return (spl_priv_check_cred(cr, PRIV_VFS_LINK));
+}
+
+int
+secpolicy_vnode_stky_modify(cred_t *cr)
+{
+
+	return (EPERM);
+}
+
+int
+secpolicy_vnode_remove(vnode_t *vp, cred_t *cr)
+{
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+	return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN));
+}
+
+int
+secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, accmode_t accmode)
+{
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+
+	if ((accmode & VREAD) && spl_priv_check_cred(cr, PRIV_VFS_READ) != 0)
+		return (EACCES);
+	if ((accmode & VWRITE) &&
+	    spl_priv_check_cred(cr, PRIV_VFS_WRITE) != 0) {
+		return (EACCES);
+	}
+	if (accmode & VEXEC) {
+		if (vp->v_type == VDIR) {
+			if (spl_priv_check_cred(cr, PRIV_VFS_LOOKUP) != 0)
+				return (EACCES);
+		} else {
+			if (spl_priv_check_cred(cr, PRIV_VFS_EXEC) != 0)
+				return (EACCES);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Like secpolicy_vnode_access() but we get the actual wanted mode and the
+ * current mode of the file, not the missing bits.
+ */
+int
+secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner,
+    accmode_t curmode, accmode_t wantmode)
+{
+	accmode_t mode;
+
+	mode = ~curmode & wantmode;
+
+	if (mode == 0)
+		return (0);
+
+	return (secpolicy_vnode_access(cr, vp, owner, mode));
+}
+
+int
+secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner)
+{
+	static int privs[] = {
+	    PRIV_VFS_ADMIN,
+	    PRIV_VFS_READ,
+	    PRIV_VFS_WRITE,
+	    PRIV_VFS_EXEC,
+	    PRIV_VFS_LOOKUP
+	};
+	int i;
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+
+	/* Same as secpolicy_vnode_setdac */
+	if (owner == cr->cr_uid)
+		return (0);
+
+	for (i = 0; i < sizeof (privs)/sizeof (int); i++) {
+		int priv;
+
+		switch (priv = privs[i]) {
+		case PRIV_VFS_EXEC:
+			if (vp->v_type == VDIR)
+				continue;
+			break;
+		case PRIV_VFS_LOOKUP:
+			if (vp->v_type != VDIR)
+				continue;
+			break;
+		}
+		if (spl_priv_check_cred(cr, priv) == 0)
+			return (0);
+	}
+	return (EPERM);
+}
+
+int
+secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+	if (owner == cr->cr_uid)
+		return (0);
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+	return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN));
+}
+
+int
+secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap,
+    const struct vattr *ovap, int flags,
+    int unlocked_access(void *, int, cred_t *), void *node)
+{
+	int mask = vap->va_mask;
+	int error;
+
+	if (mask & AT_SIZE) {
+		if (vp->v_type == VDIR)
+			return (EISDIR);
+		error = unlocked_access(node, VWRITE, cr);
+		if (error)
+			return (error);
+	}
+	if (mask & AT_MODE) {
+		/*
+		 * If not the owner of the file then check privilege
+		 * for two things: the privilege to set the mode at all
+		 * and, if we're setting setuid, we also need permissions
+		 * to add the set-uid bit, if we're not the owner.
+		 * In the specific case of creating a set-uid root
+		 * file, we need even more permissions.
+		 */
+		error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+		if (error)
+			return (error);
+		error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cr);
+		if (error)
+			return (error);
+	} else {
+		vap->va_mode = ovap->va_mode;
+	}
+	if (mask & (AT_UID | AT_GID)) {
+		error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+		if (error)
+			return (error);
+
+		/*
+		 * To change the owner of a file, or change the group of
+		 * a file to a group of which we are not a member, the
+		 * caller must have privilege.
+		 */
+		if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
+		    ((mask & AT_GID) && vap->va_gid != ovap->va_gid &&
+		    !groupmember(vap->va_gid, cr))) {
+			if (secpolicy_fs_owner(vp->v_mount, cr) != 0) {
+				error = spl_priv_check_cred(cr, PRIV_VFS_CHOWN);
+				if (error)
+					return (error);
+			}
+		}
+
+		if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
+		    ((mask & AT_GID) && vap->va_gid != ovap->va_gid)) {
+			secpolicy_setid_clear(vap, vp, cr);
+		}
+	}
+	if (mask & (AT_ATIME | AT_MTIME)) {
+		/*
+		 * From utimes(2):
+		 * If times is NULL, ... The caller must be the owner of
+		 * the file, have permission to write the file, or be the
+		 * super-user.
+		 * If times is non-NULL, ... The caller must be the owner of
+		 * the file or be the super-user.
+		 */
+		error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+		if (error && (vap->va_vaflags & VA_UTIMES_NULL))
+			error = unlocked_access(node, VWRITE, cr);
+		if (error)
+			return (error);
+	}
+	return (0);
+}
+
+int
+secpolicy_vnode_create_gid(cred_t *cr)
+{
+
+	return (EPERM);
+}
+
+int
+secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid)
+{
+
+	if (groupmember(gid, cr))
+		return (0);
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+	return (spl_priv_check_cred(cr, PRIV_VFS_SETGID));
+}
+
+int
+secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr,
+    boolean_t issuidroot __unused)
+{
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+	return (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID));
+}
+
+void
+secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr)
+{
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return;
+
+	if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) {
+		if (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)) {
+			vap->va_mask |= AT_MODE;
+			vap->va_mode &= ~(S_ISUID|S_ISGID);
+		}
+	}
+}
+
+int
+secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap,
+    const struct vattr *ovap, cred_t *cr)
+{
+	int error;
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+
+	/*
+	 * Privileged processes may set the sticky bit on non-directories,
+	 * as well as set the setgid bit on a file with a group that the process
+	 * is not a member of. Both of these are allowed in jail(8).
+	 */
+	if (vp->v_type != VDIR && (vap->va_mode & S_ISTXT)) {
+		if (spl_priv_check_cred(cr, PRIV_VFS_STICKYFILE))
+			return (EFTYPE);
+	}
+	/*
+	 * Check for privilege if attempting to set the
+	 * group-id bit.
+	 */
+	if ((vap->va_mode & S_ISGID) != 0) {
+		error = secpolicy_vnode_setids_setgids(vp, cr, ovap->va_gid);
+		if (error)
+			return (error);
+	}
+	/*
+	 * Deny setting setuid if we are not the file owner.
+	 */
+	if ((vap->va_mode & S_ISUID) && ovap->va_uid != cr->cr_uid) {
+		error = spl_priv_check_cred(cr, PRIV_VFS_ADMIN);
+		if (error)
+			return (error);
+	}
+	return (0);
+}
+
+int
+secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT));
+}
+
+int
+secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+	if (owner == cr->cr_uid)
+		return (0);
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+
+	/* XXX: vfs_suser()? */
+	return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_OWNER));
+}
+
+int
+secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+	return (spl_priv_check_cred(cr, PRIV_VFS_CHOWN));
+}
+
+void
+secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp)
+{
+
+	if (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_NONUSER) != 0) {
+		MNT_ILOCK(vfsp);
+		vfsp->vfs_flag |= VFS_NOSETUID | MNT_USER;
+		vfs_clearmntopt(vfsp, MNTOPT_SETUID);
+		vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0);
+		MNT_IUNLOCK(vfsp);
+	}
+}
+
+/*
+ * Check privileges for setting xvattr attributes
+ */
+int
+secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr,
+    vtype_t vtype)
+{
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+	return (spl_priv_check_cred(cr, PRIV_VFS_SYSFLAGS));
+}
+
+int
+secpolicy_smb(cred_t *cr)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_NETSMB));
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c
new file mode 100644
index 000000000000..7b4ae9d0e357
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/list.h>
+#include <sys/mutex.h>
+#include <sys/procfs_list.h>
+
+void
+seq_printf(struct seq_file *m, const char *fmt, ...)
+{}
+
+void
+procfs_list_install(const char *module,
+    const char *name,
+    mode_t mode,
+    procfs_list_t *procfs_list,
+    int (*show)(struct seq_file *f, void *p),
+    int (*show_header)(struct seq_file *f),
+    int (*clear)(procfs_list_t *procfs_list),
+    size_t procfs_list_node_off)
+{
+	mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&procfs_list->pl_list,
+	    procfs_list_node_off + sizeof (procfs_list_node_t),
+	    procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
+	procfs_list->pl_next_id = 1;
+	procfs_list->pl_node_offset = procfs_list_node_off;
+}
+
+void
+procfs_list_uninstall(procfs_list_t *procfs_list)
+{}
+
+void
+procfs_list_destroy(procfs_list_t *procfs_list)
+{
+	ASSERT(list_is_empty(&procfs_list->pl_list));
+	list_destroy(&procfs_list->pl_list);
+	mutex_destroy(&procfs_list->pl_lock);
+}
+
+#define	NODE_ID(procfs_list, obj) \
+		(((procfs_list_node_t *)(((char *)obj) + \
+		(procfs_list)->pl_node_offset))->pln_id)
+
+void
+procfs_list_add(procfs_list_t *procfs_list, void *p)
+{
+	ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+	NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
+	list_insert_tail(&procfs_list->pl_list, p);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c
new file mode 100644
index 000000000000..d13b64b4cd26
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD$
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/string.h>
+#include <sys/kmem.h>
+#include <machine/stdarg.h>
+
+#define	IS_DIGIT(c)	((c) >= '0' && (c) <= '9')
+
+#define	IS_ALPHA(c)	\
+	(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
+
+char *
+strpbrk(const char *s, const char *b)
+{
+	const char *p;
+
+	do {
+		for (p = b; *p != '\0' && *p != *s; ++p)
+			;
+		if (*p != '\0')
+			return ((char *)s);
+	} while (*s++);
+
+	return (NULL);
+}
+
+/*
+ * Convert a string into a valid C identifier by replacing invalid
+ * characters with '_'.  Also makes sure the string is nul-terminated
+ * and takes up at most n bytes.
+ */
+void
+strident_canon(char *s, size_t n)
+{
+	char c;
+	char *end = s + n - 1;
+
+	if ((c = *s) == 0)
+		return;
+
+	if (!IS_ALPHA(c) && c != '_')
+		*s = '_';
+
+	while (s < end && ((c = *(++s)) != 0)) {
+		if (!IS_ALPHA(c) && !IS_DIGIT(c) && c != '_')
+			*s = '_';
+	}
+	*s = 0;
+}
+
+/*
+ * Do not change the length of the returned string; it must be freed
+ * with strfree().
+ */
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+	int size;
+	va_list adx;
+	char *buf;
+
+	va_start(adx, fmt);
+	size = vsnprintf(NULL, 0, fmt, adx) + 1;
+	va_end(adx);
+
+	buf = kmem_alloc(size, KM_SLEEP);
+
+	va_start(adx, fmt);
+	(void) vsnprintf(buf, size, fmt, adx);
+	va_end(adx);
+
+	return (buf);
+}
+
+void
+kmem_strfree(char *str)
+{
+	ASSERT(str != NULL);
+	kmem_free(str, strlen(str) + 1);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c
new file mode 100644
index 000000000000..ebec77bdb37f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/limits.h>
+#include <sys/misc.h>
+#include <sys/sunddi.h>
+#include <sys/sysctl.h>
+
+int
+ddi_strtol(const char *str, char **nptr, int base, long *result)
+{
+
+	*result = strtol(str, nptr, base);
+	return (0);
+}
+
+int
+ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result)
+{
+
+	if (str == hw_serial) {
+		*result = prison0.pr_hostid;
+		return (0);
+	}
+
+	*result = strtoul(str, nptr, base);
+	return (0);
+}
+
+int
+ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result)
+{
+
+	*result = (unsigned long long)strtouq(str, nptr, base);
+	return (0);
+}
+
+int
+ddi_strtoll(const char *str, char **nptr, int base, long long *result)
+{
+
+	*result = (long long)strtoq(str, nptr, base);
+	return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
new file mode 100644
index 000000000000..8c0e495681e9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kmem.h>
+#include <sys/list.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/nvpair.h>
+#include <sys/sunddi.h>
+#include <sys/sysevent.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/bus.h>
+
+static int
+log_sysevent(nvlist_t *event)
+{
+	struct sbuf *sb;
+	const char *type;
+	char typestr[128];
+	nvpair_t *elem = NULL;
+
+	sb = sbuf_new_auto();
+	if (sb == NULL)
+		return (ENOMEM);
+	type = NULL;
+
+	while ((elem = nvlist_next_nvpair(event, elem)) != NULL) {
+		switch (nvpair_type(elem)) {
+		case DATA_TYPE_BOOLEAN:
+		{
+			boolean_t value;
+
+			(void) nvpair_value_boolean_value(elem, &value);
+			sbuf_printf(sb, " %s=%s", nvpair_name(elem),
+			    value ? "true" : "false");
+			break;
+		}
+		case DATA_TYPE_UINT8:
+		{
+			uint8_t value;
+
+			(void) nvpair_value_uint8(elem, &value);
+			sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value);
+			break;
+		}
+		case DATA_TYPE_INT32:
+		{
+			int32_t value;
+
+			(void) nvpair_value_int32(elem, &value);
+			sbuf_printf(sb, " %s=%jd", nvpair_name(elem),
+			    (intmax_t)value);
+			break;
+		}
+		case DATA_TYPE_UINT32:
+		{
+			uint32_t value;
+
+			(void) nvpair_value_uint32(elem, &value);
+			sbuf_printf(sb, " %s=%ju", nvpair_name(elem),
+			    (uintmax_t)value);
+			break;
+		}
+		case DATA_TYPE_INT64:
+		{
+			int64_t value;
+
+			(void) nvpair_value_int64(elem, &value);
+			sbuf_printf(sb, " %s=%jd", nvpair_name(elem),
+			    (intmax_t)value);
+			break;
+		}
+		case DATA_TYPE_UINT64:
+		{
+			uint64_t value;
+
+			(void) nvpair_value_uint64(elem, &value);
+			sbuf_printf(sb, " %s=%ju", nvpair_name(elem),
+			    (uintmax_t)value);
+			break;
+		}
+		case DATA_TYPE_STRING:
+		{
+			char *value;
+
+			(void) nvpair_value_string(elem, &value);
+			sbuf_printf(sb, " %s=%s", nvpair_name(elem), value);
+			if (strcmp(FM_CLASS, nvpair_name(elem)) == 0)
+				type = value;
+			break;
+		}
+		case DATA_TYPE_UINT8_ARRAY:
+		{
+			uint8_t *value;
+			uint_t ii, nelem;
+
+			(void) nvpair_value_uint8_array(elem, &value, &nelem);
+			sbuf_printf(sb, " %s=", nvpair_name(elem));
+			for (ii = 0; ii < nelem; ii++)
+				sbuf_printf(sb, "%02hhx", value[ii]);
+			break;
+		}
+		case DATA_TYPE_UINT16_ARRAY:
+		{
+			uint16_t *value;
+			uint_t ii, nelem;
+
+			(void) nvpair_value_uint16_array(elem, &value, &nelem);
+			sbuf_printf(sb, " %s=", nvpair_name(elem));
+			for (ii = 0; ii < nelem; ii++)
+				sbuf_printf(sb, "%04hx", value[ii]);
+			break;
+		}
+		case DATA_TYPE_UINT32_ARRAY:
+		{
+			uint32_t *value;
+			uint_t ii, nelem;
+
+			(void) nvpair_value_uint32_array(elem, &value, &nelem);
+			sbuf_printf(sb, " %s=", nvpair_name(elem));
+			for (ii = 0; ii < nelem; ii++)
+				sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]);
+			break;
+		}
+		case DATA_TYPE_INT64_ARRAY:
+		{
+			int64_t *value;
+			uint_t ii, nelem;
+
+			(void) nvpair_value_int64_array(elem, &value, &nelem);
+			sbuf_printf(sb, " %s=", nvpair_name(elem));
+			for (ii = 0; ii < nelem; ii++)
+				sbuf_printf(sb, "%016lld",
+				    (long long)value[ii]);
+			break;
+		}
+		case DATA_TYPE_UINT64_ARRAY:
+		{
+			uint64_t *value;
+			uint_t ii, nelem;
+
+			(void) nvpair_value_uint64_array(elem, &value, &nelem);
+			sbuf_printf(sb, " %s=", nvpair_name(elem));
+			for (ii = 0; ii < nelem; ii++)
+				sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]);
+			break;
+		}
+		case DATA_TYPE_STRING_ARRAY:
+		{
+			char **strarr;
+			uint_t ii, nelem;
+
+			(void) nvpair_value_string_array(elem, &strarr, &nelem);
+
+			for (ii = 0; ii < nelem; ii++) {
+				if (strarr[ii] == NULL)  {
+					sbuf_printf(sb, " <NULL>");
+					continue;
+				}
+
+				sbuf_printf(sb, " %s", strarr[ii]);
+				if (strcmp(FM_CLASS, strarr[ii]) == 0)
+					type = strarr[ii];
+			}
+			break;
+		}
+		case DATA_TYPE_NVLIST:
+			/* XXX - requires recursing in log_sysevent */
+			break;
+		default:
+			printf("%s: type %d is not implemented\n", __func__,
+			    nvpair_type(elem));
+			break;
+		}
+	}
+
+	if (sbuf_finish(sb) != 0) {
+		sbuf_delete(sb);
+		return (ENOMEM);
+	}
+
+	if (type == NULL)
+		type = "";
+	if (strncmp(type, "ESC_ZFS_", 8) == 0) {
+		snprintf(typestr, sizeof (typestr), "misc.fs.zfs.%s", type + 8);
+		type = typestr;
+	}
+	devctl_notify("ZFS", "ZFS", type, sbuf_data(sb));
+	sbuf_delete(sb);
+
+	return (0);
+}
+
+static void
+sysevent_worker(void *arg __unused)
+{
+	zfs_zevent_t *ze;
+	nvlist_t *event;
+	uint64_t dropped = 0;
+	uint64_t dst_size;
+	int error;
+
+	zfs_zevent_init(&ze);
+	for (;;) {
+		dst_size = 131072;
+		dropped = 0;
+		event = NULL;
+		error = zfs_zevent_next(ze, &event,
+		    &dst_size, &dropped);
+		if (error) {
+			error = zfs_zevent_wait(ze);
+			if (error == ESHUTDOWN)
+				break;
+		} else {
+			VERIFY(event != NULL);
+			log_sysevent(event);
+			nvlist_free(event);
+		}
+	}
+	zfs_zevent_destroy(ze);
+	kthread_exit();
+}
+
+void
+ddi_sysevent_init(void)
+{
+	kproc_kthread_add(sysevent_worker, NULL, &system_proc, NULL, 0, 0,
+	    "zfskern", "sysevent");
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c
new file mode 100644
index 000000000000..049e889cf304
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2009 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Copyright (c) 2012 Spectra Logic Corporation.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/kmem.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/taskqueue.h>
+#include <sys/taskq.h>
+#include <sys/zfs_context.h>
+#include <sys/ck.h>
+#include <sys/epoch.h>
+
+#include <vm/uma.h>
+
+#if __FreeBSD_version < 1201522
+#define	taskqueue_start_threads_in_proc(tqp, count, pri, proc, name, ...) \
+    taskqueue_start_threads(tqp, count, pri, name, __VA_ARGS__)
+#endif
+
+static uint_t taskq_tsd;
+static uma_zone_t taskq_zone;
+
+taskq_t *system_taskq = NULL;
+taskq_t *system_delay_taskq = NULL;
+taskq_t *dynamic_taskq = NULL;
+
+proc_t *system_proc;
+
+extern int uma_align_cache;
+
+static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures");
+
+static CK_LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl;
+static unsigned long tqenthash;
+static unsigned long tqenthashlock;
+static struct sx *tqenthashtbl_lock;
+
+static uint32_t tqidnext = 1;
+
+#define	TQIDHASH(tqid) (&tqenthashtbl[(tqid) & tqenthash])
+#define	TQIDHASHLOCK(tqid) (&tqenthashtbl_lock[((tqid) & tqenthashlock)])
+
+#define	TIMEOUT_TASK 1
+#define	NORMAL_TASK 2
+
+static void
+system_taskq_init(void *arg)
+{
+	int i;
+
+	tsd_create(&taskq_tsd, NULL);
+	tqenthashtbl = hashinit(mp_ncpus * 8, M_TASKQ, &tqenthash);
+	tqenthashlock = (tqenthash + 1) / 8;
+	if (tqenthashlock > 0)
+		tqenthashlock--;
+	tqenthashtbl_lock =
+	    malloc(sizeof (*tqenthashtbl_lock) * (tqenthashlock + 1),
+	    M_TASKQ, M_WAITOK | M_ZERO);
+	for (i = 0; i < tqenthashlock + 1; i++)
+		sx_init_flags(&tqenthashtbl_lock[i], "tqenthash", SX_DUPOK);
+	tqidnext = 1;
+	taskq_zone = uma_zcreate("taskq_zone", sizeof (taskq_ent_t),
+	    NULL, NULL, NULL, NULL,
+	    UMA_ALIGN_CACHE, 0);
+	system_taskq = taskq_create("system_taskq", mp_ncpus, minclsyspri,
+	    0, 0, 0);
+	system_delay_taskq = taskq_create("system_delay_taskq", mp_ncpus,
+	    minclsyspri, 0, 0, 0);
+}
+SYSINIT(system_taskq_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_init,
+    NULL);
+
+static void
+system_taskq_fini(void *arg)
+{
+	int i;
+
+	taskq_destroy(system_delay_taskq);
+	taskq_destroy(system_taskq);
+	uma_zdestroy(taskq_zone);
+	tsd_destroy(&taskq_tsd);
+	for (i = 0; i < tqenthashlock + 1; i++)
+		sx_destroy(&tqenthashtbl_lock[i]);
+	for (i = 0; i < tqenthash + 1; i++)
+		VERIFY(CK_LIST_EMPTY(&tqenthashtbl[i]));
+	free(tqenthashtbl_lock, M_TASKQ);
+	free(tqenthashtbl, M_TASKQ);
+}
+SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini,
+    NULL);
+
+static taskq_ent_t *
+taskq_lookup(taskqid_t tqid)
+{
+	taskq_ent_t *ent = NULL;
+
+	sx_xlock(TQIDHASHLOCK(tqid));
+	CK_LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) {
+		if (ent->tqent_id == tqid)
+			break;
+	}
+	if (ent != NULL)
+		refcount_acquire(&ent->tqent_rc);
+	sx_xunlock(TQIDHASHLOCK(tqid));
+	return (ent);
+}
+
+static taskqid_t
+taskq_insert(taskq_ent_t *ent)
+{
+	taskqid_t tqid = atomic_fetchadd_int(&tqidnext, 1);
+
+	ent->tqent_id = tqid;
+	ent->tqent_registered = B_TRUE;
+	sx_xlock(TQIDHASHLOCK(tqid));
+	CK_LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash);
+	sx_xunlock(TQIDHASHLOCK(tqid));
+	return (tqid);
+}
+
+static void
+taskq_remove(taskq_ent_t *ent)
+{
+	taskqid_t tqid = ent->tqent_id;
+
+	if (!ent->tqent_registered)
+		return;
+
+	sx_xlock(TQIDHASHLOCK(tqid));
+	CK_LIST_REMOVE(ent, tqent_hash);
+	sx_xunlock(TQIDHASHLOCK(tqid));
+	ent->tqent_registered = B_FALSE;
+}
+
+static void
+taskq_tsd_set(void *context)
+{
+	taskq_t *tq = context;
+
+	tsd_set(taskq_tsd, tq);
+}
+
+static taskq_t *
+taskq_create_impl(const char *name, int nthreads, pri_t pri,
+    proc_t *proc __maybe_unused, uint_t flags)
+{
+	taskq_t *tq;
+
+	if ((flags & TASKQ_THREADS_CPU_PCT) != 0)
+		nthreads = MAX((mp_ncpus * nthreads) / 100, 1);
+
+	tq = kmem_alloc(sizeof (*tq), KM_SLEEP);
+	tq->tq_queue = taskqueue_create(name, M_WAITOK,
+	    taskqueue_thread_enqueue, &tq->tq_queue);
+	taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT,
+	    taskq_tsd_set, tq);
+	taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN,
+	    taskq_tsd_set, NULL);
+	(void) taskqueue_start_threads_in_proc(&tq->tq_queue, nthreads, pri,
+	    proc, "%s", name);
+
+	return ((taskq_t *)tq);
+}
+
+taskq_t *
+taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused,
+    int maxalloc __unused, uint_t flags)
+{
+	return (taskq_create_impl(name, nthreads, pri, system_proc, flags));
+}
+
+taskq_t *
+taskq_create_proc(const char *name, int nthreads, pri_t pri,
+    int minalloc __unused, int maxalloc __unused, proc_t *proc, uint_t flags)
+{
+	return (taskq_create_impl(name, nthreads, pri, proc, flags));
+}
+
+void
+taskq_destroy(taskq_t *tq)
+{
+
+	taskqueue_free(tq->tq_queue);
+	kmem_free(tq, sizeof (*tq));
+}
+
+int
+taskq_member(taskq_t *tq, kthread_t *thread)
+{
+
+	return (taskqueue_member(tq->tq_queue, thread));
+}
+
+taskq_t *
+taskq_of_curthread(void)
+{
+	return (tsd_get(taskq_tsd));
+}
+
+static void
+taskq_free(taskq_ent_t *task)
+{
+	taskq_remove(task);
+	if (refcount_release(&task->tqent_rc))
+		uma_zfree(taskq_zone, task);
+}
+
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t tid)
+{
+	uint32_t pend;
+	int rc;
+	taskq_ent_t *ent;
+
+	if (tid == 0)
+		return (0);
+
+	if ((ent = taskq_lookup(tid)) == NULL)
+		return (0);
+
+	ent->tqent_cancelled = B_TRUE;
+	if (ent->tqent_type == TIMEOUT_TASK) {
+		rc = taskqueue_cancel_timeout(tq->tq_queue,
+		    &ent->tqent_timeout_task, &pend);
+	} else
+		rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
+	if (rc == EBUSY) {
+		taskqueue_drain(tq->tq_queue, &ent->tqent_task);
+	} else if (pend) {
+		/*
+		 * Tasks normally free themselves when run, but here the task
+		 * was cancelled so it did not free itself.
+		 */
+		taskq_free(ent);
+	}
+	/* Free the extra reference we added with taskq_lookup. */
+	taskq_free(ent);
+	return (rc);
+}
+
+static void
+taskq_run(void *arg, int pending __unused)
+{
+	taskq_ent_t *task = arg;
+
+	if (!task->tqent_cancelled)
+		task->tqent_func(task->tqent_arg);
+	taskq_free(task);
+}
+
+taskqid_t
+taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
+    uint_t flags, clock_t expire_time)
+{
+	taskq_ent_t *task;
+	taskqid_t tid;
+	clock_t timo;
+	int mflag;
+
+	timo = expire_time - ddi_get_lbolt();
+	if (timo <= 0)
+		return (taskq_dispatch(tq, func, arg, flags));
+
+	if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP)
+		mflag = M_WAITOK;
+	else
+		mflag = M_NOWAIT;
+
+	task = uma_zalloc(taskq_zone, mflag);
+	if (task == NULL)
+		return (0);
+	task->tqent_func = func;
+	task->tqent_arg = arg;
+	task->tqent_type = TIMEOUT_TASK;
+	task->tqent_cancelled = B_FALSE;
+	refcount_init(&task->tqent_rc, 1);
+	tid = taskq_insert(task);
+	TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0,
+	    taskq_run, task);
+
+	taskqueue_enqueue_timeout(tq->tq_queue, &task->tqent_timeout_task,
+	    timo);
+	return (tid);
+}
+
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
+{
+	taskq_ent_t *task;
+	int mflag, prio;
+	taskqid_t tid;
+
+	if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP)
+		mflag = M_WAITOK;
+	else
+		mflag = M_NOWAIT;
+	/*
+	 * If TQ_FRONT is given, we want higher priority for this task, so it
+	 * can go at the front of the queue.
+	 */
+	prio = !!(flags & TQ_FRONT);
+
+	task = uma_zalloc(taskq_zone, mflag);
+	if (task == NULL)
+		return (0);
+	refcount_init(&task->tqent_rc, 1);
+	task->tqent_func = func;
+	task->tqent_arg = arg;
+	task->tqent_cancelled = B_FALSE;
+	task->tqent_type = NORMAL_TASK;
+	tid = taskq_insert(task);
+	TASK_INIT(&task->tqent_task, prio, taskq_run, task);
+	taskqueue_enqueue(tq->tq_queue, &task->tqent_task);
+	VERIFY(tid);
+	return (tid);
+}
+
+static void
+taskq_run_ent(void *arg, int pending __unused)
+{
+	taskq_ent_t *task = arg;
+
+	task->tqent_func(task->tqent_arg);
+}
+
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags,
+    taskq_ent_t *task)
+{
+	int prio;
+
+	/*
+	 * If TQ_FRONT is given, we want higher priority for this task, so it
+	 * can go at the front of the queue.
+	 */
+	prio = !!(flags & TQ_FRONT);
+	task->tqent_cancelled = B_FALSE;
+	task->tqent_registered = B_FALSE;
+	task->tqent_id = 0;
+	task->tqent_func = func;
+	task->tqent_arg = arg;
+
+	TASK_INIT(&task->tqent_task, prio, taskq_run_ent, task);
+	taskqueue_enqueue(tq->tq_queue, &task->tqent_task);
+}
+
+void
+taskq_wait(taskq_t *tq)
+{
+	taskqueue_quiesce(tq->tq_queue);
+}
+
+void
+taskq_wait_id(taskq_t *tq, taskqid_t tid)
+{
+	taskq_ent_t *ent;
+
+	if (tid == 0)
+		return;
+	if ((ent = taskq_lookup(tid)) == NULL)
+		return;
+
+	taskqueue_drain(tq->tq_queue, &ent->tqent_task);
+	taskq_free(ent);
+}
+
+void
+taskq_wait_outstanding(taskq_t *tq, taskqid_t id __unused)
+{
+	taskqueue_drain_all(tq->tq_queue);
+}
+
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+	return (t->tqent_task.ta_pending == 0);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c
new file mode 100644
index 000000000000..c6b610394718
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c
@@ -0,0 +1,92 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved   */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+
+/*
+ * same as uiomove() but doesn't modify uio structure.
+ * return in cbytes how many bytes were copied.
+ */
+int
+uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes)
+{
+	struct iovec small_iovec[1];
+	struct uio small_uio_clone;
+	struct uio *uio_clone;
+	int error;
+
+	ASSERT3U(uio->uio_rw, ==, rw);
+	if (uio->uio_iovcnt == 1) {
+		small_uio_clone = *uio;
+		small_iovec[0] = *uio->uio_iov;
+		small_uio_clone.uio_iov = small_iovec;
+		uio_clone = &small_uio_clone;
+	} else {
+		uio_clone = cloneuio(uio);
+	}
+
+	error = vn_io_fault_uiomove(p, n, uio_clone);
+	*cbytes = uio->uio_resid - uio_clone->uio_resid;
+	if (uio_clone != &small_uio_clone)
+		free(uio_clone, M_IOV);
+	return (error);
+}
+
+/*
+ * Drop the next n chars out of *uiop.
+ */
+void
+uioskip(uio_t *uio, size_t n)
+{
+	enum uio_seg segflg;
+
+	/* For the full compatibility with illumos. */
+	if (n > uio->uio_resid)
+		return;
+
+	segflg = uio->uio_segflg;
+	uio->uio_segflg = UIO_NOCOPY;
+	uiomove(NULL, n, uio->uio_rw, uio);
+	uio->uio_segflg = segflg;
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c
new file mode 100644
index 000000000000..991a11fe2baf
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2006-2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/cred.h>
+#include <sys/vfs.h>
+#include <sys/priv.h>
+#include <sys/libkern.h>
+
+#include <sys/mutex.h>
+#include <sys/vnode.h>
+#include <sys/taskq.h>
+
+#include <sys/ccompat.h>
+
+MALLOC_DECLARE(M_MOUNT);
+
+void
+vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
+    int flags __unused)
+{
+	struct vfsopt *opt;
+	size_t namesize;
+	int locked;
+
+	if (!(locked = mtx_owned(MNT_MTX(vfsp))))
+		MNT_ILOCK(vfsp);
+
+	if (vfsp->mnt_opt == NULL) {
+		void *opts;
+
+		MNT_IUNLOCK(vfsp);
+		opts = malloc(sizeof (*vfsp->mnt_opt), M_MOUNT, M_WAITOK);
+		MNT_ILOCK(vfsp);
+		if (vfsp->mnt_opt == NULL) {
+			vfsp->mnt_opt = opts;
+			TAILQ_INIT(vfsp->mnt_opt);
+		} else {
+			free(opts, M_MOUNT);
+		}
+	}
+
+	MNT_IUNLOCK(vfsp);
+
+	opt = malloc(sizeof (*opt), M_MOUNT, M_WAITOK);
+	namesize = strlen(name) + 1;
+	opt->name = malloc(namesize, M_MOUNT, M_WAITOK);
+	strlcpy(opt->name, name, namesize);
+	opt->pos = -1;
+	opt->seen = 1;
+	if (arg == NULL) {
+		opt->value = NULL;
+		opt->len = 0;
+	} else {
+		opt->len = strlen(arg) + 1;
+		opt->value = malloc(opt->len, M_MOUNT, M_WAITOK);
+		bcopy(arg, opt->value, opt->len);
+	}
+
+	MNT_ILOCK(vfsp);
+	TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link);
+	if (!locked)
+		MNT_IUNLOCK(vfsp);
+}
+
+void
+vfs_clearmntopt(vfs_t *vfsp, const char *name)
+{
+	int locked;
+
+	if (!(locked = mtx_owned(MNT_MTX(vfsp))))
+		MNT_ILOCK(vfsp);
+	vfs_deleteopt(vfsp->mnt_opt, name);
+	if (!locked)
+		MNT_IUNLOCK(vfsp);
+}
+
+int
+vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp)
+{
+	struct vfsoptlist *opts = vfsp->mnt_optnew;
+	int error;
+
+	if (opts == NULL)
+		return (0);
+	error = vfs_getopt(opts, opt, (void **)argp, NULL);
+	return (error != 0 ? 0 : 1);
+}
+
+int
+mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
+    char *fspec, int fsflags)
+{
+	struct vfsconf *vfsp;
+	struct mount *mp;
+	vnode_t *vp, *mvp;
+	struct ucred *cr;
+	int error;
+
+	ASSERT_VOP_ELOCKED(*vpp, "mount_snapshot");
+
+	vp = *vpp;
+	*vpp = NULL;
+	error = 0;
+
+	/*
+	 * Be ultra-paranoid about making sure the type and fspath
+	 * variables will fit in our mp buffers, including the
+	 * terminating NUL.
+	 */
+	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
+		error = ENAMETOOLONG;
+	if (error == 0 && (vfsp = vfs_byname_kld(fstype, td, &error)) == NULL)
+		error = ENODEV;
+	if (error == 0 && vp->v_type != VDIR)
+		error = ENOTDIR;
+	/*
+	 * We need vnode lock to protect v_mountedhere and vnode interlock
+	 * to protect v_iflag.
+	 */
+	if (error == 0) {
+		VI_LOCK(vp);
+		if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
+			vp->v_iflag |= VI_MOUNT;
+		else
+			error = EBUSY;
+		VI_UNLOCK(vp);
+	}
+	if (error != 0) {
+		vput(vp);
+		return (error);
+	}
+	vn_seqc_write_begin(vp);
+	VOP_UNLOCK1(vp);
+
+	/*
+	 * Allocate and initialize the filesystem.
+	 * We don't want regular user that triggered snapshot mount to be able
+	 * to unmount it, so pass credentials of the parent mount.
+	 */
+	mp = vfs_mount_alloc(vp, vfsp, fspath, vp->v_mount->mnt_cred);
+
+	mp->mnt_optnew = NULL;
+	vfs_setmntopt(mp, "from", fspec, 0);
+	mp->mnt_optnew = mp->mnt_opt;
+	mp->mnt_opt = NULL;
+
+	/*
+	 * Set the mount level flags.
+	 */
+	mp->mnt_flag = fsflags & MNT_UPDATEMASK;
+	/*
+	 * Snapshots are always read-only.
+	 */
+	mp->mnt_flag |= MNT_RDONLY;
+	/*
+	 * We don't want snapshots to allow access to vulnerable setuid
+	 * programs, so we turn off setuid when mounting snapshots.
+	 */
+	mp->mnt_flag |= MNT_NOSUID;
+	/*
+	 * We don't want snapshots to be visible in regular
+	 * mount(8) and df(1) output.
+	 */
+	mp->mnt_flag |= MNT_IGNORE;
+	/*
+	 * XXX: This is evil, but we can't mount a snapshot as a regular user.
+	 * XXX: Is is safe when snapshot is mounted from within a jail?
+	 */
+	cr = td->td_ucred;
+	td->td_ucred = kcred;
+	error = VFS_MOUNT(mp);
+	td->td_ucred = cr;
+
+	if (error != 0) {
+		/*
+		 * Clear VI_MOUNT and decrement the use count "atomically",
+		 * under the vnode lock.  This is not strictly required,
+		 * but makes it easier to reason about the life-cycle and
+		 * ownership of the covered vnode.
+		 */
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		VI_LOCK(vp);
+		vp->v_iflag &= ~VI_MOUNT;
+		VI_UNLOCK(vp);
+		vn_seqc_write_end(vp);
+		vput(vp);
+		vfs_unbusy(mp);
+		vfs_freeopts(mp->mnt_optnew);
+		mp->mnt_vnodecovered = NULL;
+		vfs_mount_destroy(mp);
+		return (error);
+	}
+
+	if (mp->mnt_opt != NULL)
+		vfs_freeopts(mp->mnt_opt);
+	mp->mnt_opt = mp->mnt_optnew;
+	(void) VFS_STATFS(mp, &mp->mnt_stat);
+
+	/*
+	 * Prevent external consumers of mount options from reading
+	 * mnt_optnew.
+	 */
+	mp->mnt_optnew = NULL;
+
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef FREEBSD_NAMECACHE
+	cache_purge(vp);
+#endif
+	VI_LOCK(vp);
+	vp->v_iflag &= ~VI_MOUNT;
+	VI_UNLOCK(vp);
+
+	vp->v_mountedhere = mp;
+	/* Put the new filesystem on the mount list. */
+	mtx_lock(&mountlist_mtx);
+	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+	mtx_unlock(&mountlist_mtx);
+	vfs_event_signal(NULL, VQ_MOUNT, 0);
+	if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
+		panic("mount: lost mount");
+	vn_seqc_write_end(vp);
+	VOP_UNLOCK1(vp);
+#if __FreeBSD_version >= 1300048
+	vfs_op_exit(mp);
+#endif
+	vfs_unbusy(mp);
+	*vpp = mvp;
+	return (0);
+}
+
+/*
+ * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
+ * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
+ * the file system as a result of releasing the vnode. Note, file systems
+ * already have to handle the race where the vnode is incremented before the
+ * inactive routine is called and does its locking.
+ *
+ * Warning: Excessive use of this routine can lead to performance problems.
+ * This is because taskqs throttle back allocation if too many are created.
+ */
+void
+vn_rele_async(vnode_t *vp, taskq_t *taskq)
+{
+	VERIFY(vp->v_count > 0);
+	if (refcount_release_if_not_last(&vp->v_usecount)) {
+#if __FreeBSD_version < 1300045
+		vdrop(vp);
+#endif
+		return;
+	}
+	VERIFY(taskq_dispatch((taskq_t *)taskq,
+	    (task_func_t *)vrele, vp, TQ_SLEEP) != 0);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
new file mode 100644
index 000000000000..739ddb05e895
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/counter.h>
+
+#include <sys/byteorder.h>
+#include <sys/lock.h>
+#include <sys/freebsd_rwlock.h>
+#include <sys/vm.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+
+const int zfs_vm_pagerret_bad = VM_PAGER_BAD;
+const int zfs_vm_pagerret_error = VM_PAGER_ERROR;
+const int zfs_vm_pagerret_ok = VM_PAGER_OK;
+const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC;
+const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL;
+
+void
+zfs_vmobject_assert_wlocked(vm_object_t object)
+{
+
+	/*
+	 * This is not ideal because FILE/LINE used by assertions will not
+	 * be too helpful, but it must be an hard function for
+	 * compatibility reasons.
+	 */
+	VM_OBJECT_ASSERT_WLOCKED(object);
+}
+
+void
+zfs_vmobject_wlock(vm_object_t object)
+{
+
+	VM_OBJECT_WLOCK(object);
+}
+
+void
+zfs_vmobject_wunlock(vm_object_t object)
+{
+
+	VM_OBJECT_WUNLOCK(object);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c
new file mode 100644
index 000000000000..3644eba77ca1
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/zmod.h>
+#if __FreeBSD_version >= 1300041
+#include <contrib/zlib/zlib.h>
+#else
+#include <sys/zlib.h>
+#endif
+#include <sys/kobj.h>
+
+
+/*ARGSUSED*/
+static void *
+zcalloc(void *opaque, uint_t items, uint_t size)
+{
+
+	return (malloc((size_t)items*size, M_SOLARIS, M_NOWAIT));
+}
+
+/*ARGSUSED*/
+static void
+zcfree(void *opaque, void *ptr)
+{
+
+	free(ptr, M_SOLARIS);
+}
+
+static int
+zlib_deflateInit(z_stream *stream, int level)
+{
+
+	stream->zalloc = zcalloc;
+	stream->opaque = NULL;
+	stream->zfree = zcfree;
+
+	return (deflateInit(stream, level));
+}
+
+static int
+zlib_deflate(z_stream *stream, int flush)
+{
+	return (deflate(stream, flush));
+}
+
+static int
+zlib_deflateEnd(z_stream *stream)
+{
+	return (deflateEnd(stream));
+}
+
+static int
+zlib_inflateInit(z_stream *stream)
+{
+	stream->zalloc = zcalloc;
+	stream->opaque = NULL;
+	stream->zfree = zcfree;
+
+	return (inflateInit(stream));
+}
+
+static int
+zlib_inflate(z_stream *stream, int finish)
+{
+#if __FreeBSD_version >= 1300024
+	return (inflate(stream, finish));
+#else
+	return (_zlib104_inflate(stream, finish));
+#endif
+}
+
+
+static int
+zlib_inflateEnd(z_stream *stream)
+{
+	return (inflateEnd(stream));
+}
+
+/*
+ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc
+ * and vfree for every call.  Using a kmem_cache also has the advantage
+ * that improves the odds that the memory used will be local to this cpu.
+ * To further improve things it might be wise to create a dedicated per-cpu
+ * workspace for use.  This would take some additional care because we then
+ * must disable preemption around the critical section, and verify that
+ * zlib_deflate* and zlib_inflate* never internally call schedule().
+ */
+static void *
+zlib_workspace_alloc(int flags)
+{
+	// return (kmem_cache_alloc(zlib_workspace_cache, flags));
+	return (NULL);
+}
+
+static void
+zlib_workspace_free(void *workspace)
+{
+	// kmem_cache_free(zlib_workspace_cache, workspace);
+}
+
+/*
+ * Compresses the source buffer into the destination buffer. The level
+ * parameter has the same meaning as in deflateInit.  sourceLen is the byte
+ * length of the source buffer. Upon entry, destLen is the total size of the
+ * destination buffer, which must be at least 0.1% larger than sourceLen plus
+ * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
+ *
+ * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ * memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ * Z_STREAM_ERROR if the level parameter is invalid.
+ */
+int
+z_compress_level(void *dest, size_t *destLen, const void *source,
+    size_t sourceLen, int level)
+{
+	z_stream stream;
+	int err;
+
+	bzero(&stream, sizeof (stream));
+	stream.next_in = (Byte *)source;
+	stream.avail_in = (uInt)sourceLen;
+	stream.next_out = dest;
+	stream.avail_out = (uInt)*destLen;
+	stream.opaque = NULL;
+
+	if ((size_t)stream.avail_out != *destLen)
+		return (Z_BUF_ERROR);
+
+	stream.opaque = zlib_workspace_alloc(KM_SLEEP);
+#if 0
+	if (!stream.opaque)
+		return (Z_MEM_ERROR);
+#endif
+	err = zlib_deflateInit(&stream, level);
+	if (err != Z_OK) {
+		zlib_workspace_free(stream.opaque);
+		return (err);
+	}
+
+	err = zlib_deflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END) {
+		zlib_deflateEnd(&stream);
+		zlib_workspace_free(stream.opaque);
+		return (err == Z_OK ? Z_BUF_ERROR : err);
+	}
+	*destLen = stream.total_out;
+
+	err = zlib_deflateEnd(&stream);
+	zlib_workspace_free(stream.opaque);
+	return (err);
+}
+
+/*
+ * Decompresses the source buffer into the destination buffer.  sourceLen is
+ * the byte length of the source buffer. Upon entry, destLen is the total
+ * size of the destination buffer, which must be large enough to hold the
+ * entire uncompressed data. (The size of the uncompressed data must have
+ * been saved previously by the compressor and transmitted to the decompressor
+ * by some mechanism outside the scope of this compression library.)
+ * Upon exit, destLen is the actual size of the compressed buffer.
+ * This function can be used to decompress a whole file at once if the
+ * input file is mmap'ed.
+ *
+ * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ * enough memory, Z_BUF_ERROR if there was not enough room in the output
+ * buffer, or Z_DATA_ERROR if the input data was corrupted.
+ */
+int
+z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen)
+{
+	z_stream stream;
+	int err;
+
+	bzero(&stream, sizeof (stream));
+
+	stream.next_in = (Byte *)source;
+	stream.avail_in = (uInt)sourceLen;
+	stream.next_out = dest;
+	stream.avail_out = (uInt)*destLen;
+
+	if ((size_t)stream.avail_out != *destLen)
+		return (Z_BUF_ERROR);
+
+	stream.opaque = zlib_workspace_alloc(KM_SLEEP);
+#if 0
+	if (!stream.opaque)
+		return (Z_MEM_ERROR);
+#endif
+	err = zlib_inflateInit(&stream);
+	if (err != Z_OK) {
+		zlib_workspace_free(stream.opaque);
+		return (err);
+	}
+
+	err = zlib_inflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END) {
+		zlib_inflateEnd(&stream);
+		zlib_workspace_free(stream.opaque);
+
+		if (err == Z_NEED_DICT ||
+		    (err == Z_BUF_ERROR && stream.avail_in == 0))
+			return (Z_DATA_ERROR);
+
+		return (err);
+	}
+	*destLen = stream.total_out;
+
+	err = zlib_inflateEnd(&stream);
+	zlib_workspace_free(stream.opaque);
+
+	return (err);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c
new file mode 100644
index 000000000000..0b3b04d2a73e
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/malloc.h>
+#include <sys/queue.h>
+#include <sys/jail.h>
+#include <sys/osd.h>
+#include <sys/priv.h>
+#include <sys/zone.h>
+
+#include <sys/policy.h>
+
+static MALLOC_DEFINE(M_ZONES, "zones_data", "Zones data");
+
+/*
+ * Structure to record list of ZFS datasets exported to a zone.
+ */
+typedef struct zone_dataset {
+	LIST_ENTRY(zone_dataset) zd_next;
+	char	zd_dataset[0];
+} zone_dataset_t;
+
+LIST_HEAD(zone_dataset_head, zone_dataset);
+
+static int zone_slot;
+
+int
+zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid)
+{
+	struct zone_dataset_head *head;
+	zone_dataset_t *zd, *zd2;
+	struct prison *pr;
+	int dofree, error;
+
+	if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0)
+		return (error);
+
+	/* Allocate memory before we grab prison's mutex. */
+	zd = malloc(sizeof (*zd) + strlen(dataset) + 1, M_ZONES, M_WAITOK);
+
+	sx_slock(&allprison_lock);
+	pr = prison_find(jailid);	/* Locks &pr->pr_mtx. */
+	sx_sunlock(&allprison_lock);
+	if (pr == NULL) {
+		free(zd, M_ZONES);
+		return (ENOENT);
+	}
+
+	head = osd_jail_get(pr, zone_slot);
+	if (head != NULL) {
+		dofree = 0;
+		LIST_FOREACH(zd2, head, zd_next) {
+			if (strcmp(dataset, zd2->zd_dataset) == 0) {
+				free(zd, M_ZONES);
+				error = EEXIST;
+				goto end;
+			}
+		}
+	} else {
+		dofree = 1;
+		prison_hold_locked(pr);
+		mtx_unlock(&pr->pr_mtx);
+		head = malloc(sizeof (*head), M_ZONES, M_WAITOK);
+		LIST_INIT(head);
+		mtx_lock(&pr->pr_mtx);
+		error = osd_jail_set(pr, zone_slot, head);
+		KASSERT(error == 0, ("osd_jail_set() failed (error=%d)",
+		    error));
+	}
+	strcpy(zd->zd_dataset, dataset);
+	LIST_INSERT_HEAD(head, zd, zd_next);
+end:
+	if (dofree)
+		prison_free_locked(pr);
+	else
+		mtx_unlock(&pr->pr_mtx);
+	return (error);
+}
+
+int
+zone_dataset_detach(struct ucred *cred, const char *dataset, int jailid)
+{
+	struct zone_dataset_head *head;
+	zone_dataset_t *zd;
+	struct prison *pr;
+	int error;
+
+	if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0)
+		return (error);
+
+	sx_slock(&allprison_lock);
+	pr = prison_find(jailid);
+	sx_sunlock(&allprison_lock);
+	if (pr == NULL)
+		return (ENOENT);
+	head = osd_jail_get(pr, zone_slot);
+	if (head == NULL) {
+		error = ENOENT;
+		goto end;
+	}
+	LIST_FOREACH(zd, head, zd_next) {
+		if (strcmp(dataset, zd->zd_dataset) == 0)
+			break;
+	}
+	if (zd == NULL)
+		error = ENOENT;
+	else {
+		LIST_REMOVE(zd, zd_next);
+		free(zd, M_ZONES);
+		if (LIST_EMPTY(head))
+			osd_jail_del(pr, zone_slot);
+		error = 0;
+	}
+end:
+	mtx_unlock(&pr->pr_mtx);
+	return (error);
+}
+
+/*
+ * Returns true if the named dataset is visible in the current zone.
+ * The 'write' parameter is set to 1 if the dataset is also writable.
+ */
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+	struct zone_dataset_head *head;
+	zone_dataset_t *zd;
+	struct prison *pr;
+	size_t len;
+	int ret = 0;
+
+	if (dataset[0] == '\0')
+		return (0);
+	if (INGLOBALZONE(curproc)) {
+		if (write != NULL)
+			*write = 1;
+		return (1);
+	}
+	pr = curthread->td_ucred->cr_prison;
+	mtx_lock(&pr->pr_mtx);
+	head = osd_jail_get(pr, zone_slot);
+	if (head == NULL)
+		goto end;
+
+	/*
+	 * Walk the list once, looking for datasets which match exactly, or
+	 * specify a dataset underneath an exported dataset.  If found, return
+	 * true and note that it is writable.
+	 */
+	LIST_FOREACH(zd, head, zd_next) {
+		len = strlen(zd->zd_dataset);
+		if (strlen(dataset) >= len &&
+		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
+		    (dataset[len] == '\0' || dataset[len] == '/' ||
+		    dataset[len] == '@')) {
+			if (write)
+				*write = 1;
+			ret = 1;
+			goto end;
+		}
+	}
+
+	/*
+	 * Walk the list a second time, searching for datasets which are parents
+	 * of exported datasets.  These should be visible, but read-only.
+	 *
+	 * Note that we also have to support forms such as 'pool/dataset/', with
+	 * a trailing slash.
+	 */
+	LIST_FOREACH(zd, head, zd_next) {
+		len = strlen(dataset);
+		if (dataset[len - 1] == '/')
+			len--;	/* Ignore trailing slash */
+		if (len < strlen(zd->zd_dataset) &&
+		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
+		    zd->zd_dataset[len] == '/') {
+			if (write)
+				*write = 0;
+			ret = 1;
+			goto end;
+		}
+	}
+end:
+	mtx_unlock(&pr->pr_mtx);
+	return (ret);
+}
+
+static void
+zone_destroy(void *arg)
+{
+	struct zone_dataset_head *head;
+	zone_dataset_t *zd;
+
+	head = arg;
+	while ((zd = LIST_FIRST(head)) != NULL) {
+		LIST_REMOVE(zd, zd_next);
+		free(zd, M_ZONES);
+	}
+	free(head, M_ZONES);
+}
+
+uint32_t
+zone_get_hostid(void *ptr)
+{
+
+	KASSERT(ptr == NULL, ("only NULL pointer supported in %s", __func__));
+
+	return ((uint32_t)curthread->td_ucred->cr_prison->pr_hostid);
+}
+
+boolean_t
+in_globalzone(struct proc *p)
+{
+	return (!jailed(FIRST_THREAD_IN_PROC((p))->td_ucred));
+}
+
+static void
+zone_sysinit(void *arg __unused)
+{
+
+	zone_slot = osd_jail_register(zone_destroy, NULL);
+}
+
+static void
+zone_sysuninit(void *arg __unused)
+{
+
+	osd_jail_deregister(zone_slot);
+}
+
+SYSINIT(zone_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysinit, NULL);
+SYSUNINIT(zone_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysuninit, NULL);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
new file mode 100644
index 000000000000..a7bda509bf54
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
@@ -0,0 +1,498 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * See abd.c for a general overview of the arc buffered data (ABD).
+ *
+ * Using a large proportion of scattered ABDs decreases ARC fragmentation since
+ * when we are at the limit of allocatable space, using equal-size chunks will
+ * allow us to quickly reclaim enough space for a new large allocation (assuming
+ * it is also scattered).
+ *
+ * ABDs are allocated scattered by default unless the caller uses
+ * abd_alloc_linear() or zfs_abd_scatter_enabled is disabled.
+ */
+
+#include <sys/abd_impl.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+
+typedef struct abd_stats {
+	kstat_named_t abdstat_struct_size;
+	kstat_named_t abdstat_scatter_cnt;
+	kstat_named_t abdstat_scatter_data_size;
+	kstat_named_t abdstat_scatter_chunk_waste;
+	kstat_named_t abdstat_linear_cnt;
+	kstat_named_t abdstat_linear_data_size;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+	/* Amount of memory occupied by all of the abd_t struct allocations */
+	{ "struct_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The number of scatter ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset()).
+	 */
+	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The amount of space wasted at the end of the last chunk across all
+	 * scatter ABDs tracked by scatter_cnt.
+	 */
+	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
+	/*
+	 * The number of linear ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
+	 * ABD takes ownership of its buf then it will become tracked.
+	 */
+	{ "linear_cnt",				KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
+	{ "linear_data_size",			KSTAT_DATA_UINT64 },
+};
+
+/*
+ * The size of the chunks ABD allocates. Because the sizes allocated from the
+ * kmem_cache can't change, this tunable can only be modified at boot. Changing
+ * it at runtime would cause ABD iteration to work incorrectly for ABDs which
+ * were allocated with the old size, so a safeguard has been put in place which
+ * will cause the machine to panic if you change it and try to access the data
+ * within a scattered ABD.
+ */
+size_t zfs_abd_chunk_size = 4096;
+
+#if defined(_KERNEL)
+SYSCTL_DECL(_vfs_zfs);
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
+	&zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
+SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN,
+	&zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates");
+#endif
+
+kmem_cache_t *abd_chunk_cache;
+static kstat_t *abd_ksp;
+
+/*
+ * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose chunks are
+ * just a single zero'd sized zfs_abd_chunk_size buffer. This
+ * allows us to conserve memory by only using a single zero buffer
+ * for the scatter chunks.
+ */
+abd_t *abd_zero_scatter = NULL;
+static char *abd_zero_buf = NULL;
+
+static void
+abd_free_chunk(void *c)
+{
+	kmem_cache_free(abd_chunk_cache, c);
+}
+
+static size_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+	return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
+}
+
+static inline size_t
+abd_scatter_chunkcnt(abd_t *abd)
+{
+	ASSERT(!abd_is_linear(abd));
+	return (abd_chunkcnt_for_bytes(
+	    ABD_SCATTER(abd).abd_offset + abd->abd_size));
+}
+
+boolean_t
+abd_size_alloc_linear(size_t size)
+{
+	return (size <= zfs_abd_chunk_size ? B_TRUE : B_FALSE);
+}
+
+void
+abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
+{
+	size_t n = abd_scatter_chunkcnt(abd);
+	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+	int waste = n * zfs_abd_chunk_size - abd->abd_size;
+	if (op == ABDSTAT_INCR) {
+		ABDSTAT_BUMP(abdstat_scatter_cnt);
+		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
+		ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
+		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
+	} else {
+		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
+		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
+		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
+	}
+}
+
+void
+abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
+{
+	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+	if (op == ABDSTAT_INCR) {
+		ABDSTAT_BUMP(abdstat_linear_cnt);
+		ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+	} else {
+		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+	}
+}
+
+void
+abd_verify_scatter(abd_t *abd)
+{
+	/*
+	 * There is no scatter linear pages in FreeBSD so there is an
+	 * if an error if the ABD has been marked as a linear page.
+	 */
+	VERIFY(!abd_is_linear_page(abd));
+	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
+	    zfs_abd_chunk_size);
+	size_t n = abd_scatter_chunkcnt(abd);
+	for (int i = 0; i < n; i++) {
+		ASSERT3P(
+		    ABD_SCATTER(abd).abd_chunks[i], !=, NULL);
+	}
+}
+
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+	size_t n = abd_chunkcnt_for_bytes(size);
+	for (int i = 0; i < n; i++) {
+		void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
+		ASSERT3P(c, !=, NULL);
+		ABD_SCATTER(abd).abd_chunks[i] = c;
+	}
+	ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
+}
+
+void
+abd_free_chunks(abd_t *abd)
+{
+	size_t n = abd_scatter_chunkcnt(abd);
+	for (int i = 0; i < n; i++) {
+		abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]);
+	}
+}
+
+abd_t *
+abd_alloc_struct(size_t size)
+{
+	size_t chunkcnt = abd_chunkcnt_for_bytes(size);
+	/*
+	 * In the event we are allocating a gang ABD, the size passed in
+	 * will be 0. We must make sure to set abd_size to the size of an
+	 * ABD struct as opposed to an ABD scatter with 0 chunks. The gang
+	 * ABD struct allocation accounts for an additional 24 bytes over
+	 * a scatter ABD with 0 chunks.
+	 */
+	size_t abd_size = MAX(sizeof (abd_t),
+	    offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
+	abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE);
+	ASSERT3P(abd, !=, NULL);
+	list_link_init(&abd->abd_gang_link);
+	mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL);
+	ABDSTAT_INCR(abdstat_struct_size, abd_size);
+
+	return (abd);
+}
+
+void
+abd_free_struct(abd_t *abd)
+{
+	size_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 :
+	    abd_scatter_chunkcnt(abd);
+	int size = MAX(sizeof (abd_t),
+	    offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
+	mutex_destroy(&abd->abd_mtx);
+	ASSERT(!list_link_active(&abd->abd_gang_link));
+	kmem_free(abd, size);
+	ABDSTAT_INCR(abdstat_struct_size, -size);
+}
+
+/*
+ * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where
+ * each chunk in the scatterlist will be set to abd_zero_buf.
+ */
+static void
+abd_alloc_zero_scatter(void)
+{
+	size_t n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
+	abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP);
+	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
+
+	abd_zero_scatter->abd_flags = ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
+	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
+	abd_zero_scatter->abd_parent = NULL;
+	zfs_refcount_create(&abd_zero_scatter->abd_children);
+
+	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
+	ABD_SCATTER(abd_zero_scatter).abd_chunk_size =
+	    zfs_abd_chunk_size;
+
+	for (int i = 0; i < n; i++) {
+		ABD_SCATTER(abd_zero_scatter).abd_chunks[i] =
+		    abd_zero_buf;
+	}
+
+	ABDSTAT_BUMP(abdstat_scatter_cnt);
+	ABDSTAT_INCR(abdstat_scatter_data_size, zfs_abd_chunk_size);
+}
+
+static void
+abd_free_zero_scatter(void)
+{
+	zfs_refcount_destroy(&abd_zero_scatter->abd_children);
+	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)zfs_abd_chunk_size);
+
+	abd_free_struct(abd_zero_scatter);
+	abd_zero_scatter = NULL;
+	kmem_free(abd_zero_buf, zfs_abd_chunk_size);
+}
+
+void
+abd_init(void)
+{
+	abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
+	    NULL, NULL, NULL, NULL, 0, KMC_NODEBUG);
+
+	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+	if (abd_ksp != NULL) {
+		abd_ksp->ks_data = &abd_stats;
+		kstat_install(abd_ksp);
+	}
+
+	abd_alloc_zero_scatter();
+}
+
+void
+abd_fini(void)
+{
+	abd_free_zero_scatter();
+
+	if (abd_ksp != NULL) {
+		kstat_delete(abd_ksp);
+		abd_ksp = NULL;
+	}
+
+	kmem_cache_destroy(abd_chunk_cache);
+	abd_chunk_cache = NULL;
+}
+
+void
+abd_free_linear_page(abd_t *abd)
+{
+	/*
+	 * FreeBSD does not have have scatter linear pages
+	 * so there is an error.
+	 */
+	VERIFY(0);
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
+ * using a scatter/gather list we should switch to that and replace this call
+ * with vanilla abd_alloc().
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+	return (abd_alloc_linear(size, is_metadata));
+}
+
+/*
+ * This is just a helper function to abd_get_offset_scatter() to alloc a
+ * scatter ABD using the calculated chunkcnt based on the offset within the
+ * parent ABD.
+ */
+static abd_t *
+abd_alloc_scatter_offset_chunkcnt(size_t chunkcnt)
+{
+	size_t abd_size = offsetof(abd_t,
+	    abd_u.abd_scatter.abd_chunks[chunkcnt]);
+	abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE);
+	ASSERT3P(abd, !=, NULL);
+	list_link_init(&abd->abd_gang_link);
+	mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL);
+	ABDSTAT_INCR(abdstat_struct_size, abd_size);
+
+	return (abd);
+}
+
+abd_t *
+abd_get_offset_scatter(abd_t *sabd, size_t off)
+{
+	abd_t *abd = NULL;
+
+	abd_verify(sabd);
+	ASSERT3U(off, <=, sabd->abd_size);
+
+	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
+	size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
+	    (new_offset / zfs_abd_chunk_size);
+
+	abd = abd_alloc_scatter_offset_chunkcnt(chunkcnt);
+
+	/*
+	 * Even if this buf is filesystem metadata, we only track that
+	 * if we own the underlying data buffer, which is not true in
+	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
+	 */
+	abd->abd_flags = 0;
+
+	ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size;
+	ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
+
+	/* Copy the scatterlist starting at the correct offset */
+	(void) memcpy(&ABD_SCATTER(abd).abd_chunks,
+	    &ABD_SCATTER(sabd).abd_chunks[new_offset /
+	    zfs_abd_chunk_size],
+	    chunkcnt * sizeof (void *));
+
+	return (abd);
+}
+
+static inline size_t
+abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
+{
+	ASSERT(!abd_is_linear(aiter->iter_abd));
+	return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
+	    aiter->iter_pos) % zfs_abd_chunk_size);
+}
+
+static inline size_t
+abd_iter_scatter_chunk_index(struct abd_iter *aiter)
+{
+	ASSERT(!abd_is_linear(aiter->iter_abd));
+	return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
+	    aiter->iter_pos) / zfs_abd_chunk_size);
+}
+
+/*
+ * Initialize the abd_iter.
+ */
+void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd)
+{
+	ASSERT(!abd_is_gang(abd));
+	abd_verify(abd);
+	aiter->iter_abd = abd;
+	aiter->iter_pos = 0;
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_mapsize = 0;
+}
+
+/*
+ * This is just a helper function to see if we have exhausted the
+ * abd_iter and reached the end.
+ */
+boolean_t
+abd_iter_at_end(struct abd_iter *aiter)
+{
+	return (aiter->iter_pos == aiter->iter_abd->abd_size);
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter has already exhausted, in
+ * which case this does nothing.
+ */
+void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* There's nothing left to advance to, so do nothing */
+	if (abd_iter_at_end(aiter))
+		return;
+
+	aiter->iter_pos += amount;
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ */
+void
+abd_iter_map(struct abd_iter *aiter)
+{
+	void *paddr;
+	size_t offset = 0;
+
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* Panic if someone has changed zfs_abd_chunk_size */
+	IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
+	    ABD_SCATTER(aiter->iter_abd).abd_chunk_size);
+
+	/* There's nothing left to iterate over, so do nothing */
+	if (abd_iter_at_end(aiter))
+		return;
+
+	if (abd_is_linear(aiter->iter_abd)) {
+		offset = aiter->iter_pos;
+		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
+		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
+	} else {
+		size_t index = abd_iter_scatter_chunk_index(aiter);
+		offset = abd_iter_scatter_chunk_offset(aiter);
+		aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
+		    aiter->iter_abd->abd_size - aiter->iter_pos);
+		paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index];
+	}
+	aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ */
+void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+	/* There's nothing left to unmap, so do nothing */
+	if (abd_iter_at_end(aiter))
+		return;
+
+	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+	ASSERT3U(aiter->iter_mapsize, >, 0);
+
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_mapsize = 0;
+}
+
+void
+abd_cache_reap_now(void)
+{
+	kmem_cache_reap_soon(abd_chunk_cache);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
new file mode 100644
index 000000000000..94df750035a4
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
@@ -0,0 +1,245 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/counter.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/zfs_refcount.h>
+#include <sys/vdev.h>
+#include <sys/vdev_trim.h>
+#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/multilist.h>
+#include <sys/abd.h>
+#include <sys/zil.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/eventhandler.h>
+#include <sys/callb.h>
+#include <sys/kstat.h>
+#include <sys/zthr.h>
+#include <zfs_fletcher.h>
+#include <sys/arc_impl.h>
+#include <sys/sdt.h>
+#include <sys/aggsum.h>
+#include <sys/vnode.h>
+#include <cityhash.h>
+#include <machine/vmparam.h>
+#include <sys/vm.h>
+#include <sys/vmmeter.h>
+
+extern struct vfsops zfs_vfsops;
+
+uint_t zfs_arc_free_target = 0;
+
+static void
+arc_free_target_init(void *unused __unused)
+{
+	zfs_arc_free_target = vm_cnt.v_free_target;
+}
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+    arc_free_target_init, NULL);
+
+/*
+ * We don't have a tunable for arc_free_target due to the dependency on
+ * pagedaemon initialisation.
+ */
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+	uint_t val;
+	int err;
+
+	val = zfs_arc_free_target;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < minfree)
+		return (EINVAL);
+	if (val > vm_cnt.v_page_count)
+		return (EINVAL);
+
+	zfs_arc_free_target = val;
+
+	return (0);
+}
+SYSCTL_DECL(_vfs_zfs);
+/* BEGIN CSTYLED */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint_t),
+    sysctl_vfs_zfs_arc_free_target, "IU",
+    "Desired number of free pages below which ARC triggers reclaim");
+/* END CSTYLED */
+
+int64_t
+arc_available_memory(void)
+{
+	int64_t lowest = INT64_MAX;
+	int64_t n __unused;
+
+	/*
+	 * Cooperate with pagedaemon when it's time for it to scan
+	 * and reclaim some pages.
+	 */
+	n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
+	if (n < lowest) {
+		lowest = n;
+	}
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
+	/*
+	 * If we're on an i386 platform, it's possible that we'll exhaust the
+	 * kernel heap space before we ever run out of available physical
+	 * memory.  Most checks of the size of the heap_area compare against
+	 * tune.t_minarmem, which is the minimum available real memory that we
+	 * can have in the system.  However, this is generally fixed at 25 pages
+	 * which is so low that it's useless.  In this comparison, we seek to
+	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
+	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
+	 * free)
+	 */
+	n = uma_avail() - (long)(uma_limit() / 4);
+	if (n < lowest) {
+		lowest = n;
+	}
+#endif
+
+	DTRACE_PROBE1(arc__available_memory, int64_t, lowest);
+	return (lowest);
+}
+
+/*
+ * Return a default max arc size based on the amount of physical memory.
+ */
+uint64_t
+arc_default_max(uint64_t min, uint64_t allmem)
+{
+	uint64_t size;
+
+	if (allmem >= 1 << 30)
+		size = allmem - (1 << 30);
+	else
+		size = min;
+	return (MAX(allmem * 5 / 8, size));
+}
+
+/*
+ * Helper function for arc_prune_async() it is responsible for safely
+ * handling the execution of a registered arc_prune_func_t.
+ */
+static void
+arc_prune_task(void *arg)
+{
+	int64_t nr_scan = *(int64_t *)arg;
+
+	arc_reduce_target_size(ptob(nr_scan));
+	free(arg, M_TEMP);
+	vnlru_free(nr_scan, &zfs_vfsops);
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffered they reference.  This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers.  This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread().  A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+void
+arc_prune_async(int64_t adjust)
+{
+
+	int64_t *adjustptr;
+
+	if ((adjustptr = malloc(sizeof (int64_t), M_TEMP, M_NOWAIT)) == NULL)
+		return;
+
+	*adjustptr = adjust;
+	taskq_dispatch(arc_prune_taskq, arc_prune_task, adjustptr, TQ_SLEEP);
+	ARCSTAT_BUMP(arcstat_prune);
+}
+
+uint64_t
+arc_all_memory(void)
+{
+	return (ptob(physmem));
+}
+
+int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+	return (0);
+}
+
+uint64_t
+arc_free_memory(void)
+{
+	return (ptob(freemem));
+}
+
+static eventhandler_tag arc_event_lowmem = NULL;
+
+static void
+arc_lowmem(void *arg __unused, int howto __unused)
+{
+	int64_t free_memory, to_free;
+
+	arc_no_grow = B_TRUE;
+	arc_warm = B_TRUE;
+	arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+	free_memory = arc_available_memory();
+	to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0);
+	DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
+	arc_reduce_target_size(to_free);
+
+	/*
+	 * It is unsafe to block here in arbitrary threads, because we can come
+	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
+	 * with ARC reclaim thread.
+	 */
+	if (curproc == pageproc)
+		arc_wait_for_eviction(to_free);
+	else
+		arc_wait_for_eviction(0);
+}
+
+void
+arc_lowmem_init(void)
+{
+	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
+	    EVENTHANDLER_PRI_FIRST);
+
+}
+
+void
+arc_lowmem_fini(void)
+{
+	if (arc_event_lowmem != NULL)
+		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
new file mode 100644
index 000000000000..b86ffc59a21d
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
@@ -0,0 +1,611 @@
+/*
+ * Copyright (c) 2005-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright (c) 2018 Sean Eric Fagan <sef@ixsystems.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Portions of this file are derived from sys/geom/eli/g_eli_hmac.c
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+
+#ifdef _KERNEL
+#include <sys/libkern.h>
+#include <sys/malloc.h>
+#include <sys/sysctl.h>
+#include <opencrypto/cryptodev.h>
+#include <opencrypto/xform.h>
+#else
+#include <strings.h>
+#endif
+
+#include <sys/zio_crypt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+
+#include <sys/freebsd_crypto.h>
+
+#define	SHA512_HMAC_BLOCK_SIZE	128
+
+static int crypt_sessions = 0;
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, crypt_sessions, CTLFLAG_RD,
+	&crypt_sessions, 0, "Number of cryptographic sessions created");
+
+void
+crypto_mac_init(struct hmac_ctx *ctx, const crypto_key_t *c_key)
+{
+	uint8_t k_ipad[SHA512_HMAC_BLOCK_SIZE],
+	    k_opad[SHA512_HMAC_BLOCK_SIZE],
+	    key[SHA512_HMAC_BLOCK_SIZE];
+	SHA512_CTX lctx;
+	int i;
+	size_t cl_bytes = CRYPTO_BITS2BYTES(c_key->ck_length);
+
+	/*
+	 * This code is based on the similar code in geom/eli/g_eli_hmac.c
+	 */
+	explicit_bzero(key, sizeof (key));
+	if (c_key->ck_length  == 0)
+		/* do nothing */;
+	else if (cl_bytes <= SHA512_HMAC_BLOCK_SIZE)
+		bcopy(c_key->ck_data, key, cl_bytes);
+	else {
+		/*
+		 * If key is longer than 128 bytes reset it to
+		 * key = SHA512(key).
+		 */
+		SHA512_Init(&lctx);
+		SHA512_Update(&lctx, c_key->ck_data, cl_bytes);
+		SHA512_Final(key, &lctx);
+	}
+
+	/* XOR key with ipad and opad values. */
+	for (i = 0; i < sizeof (key); i++) {
+		k_ipad[i] = key[i] ^ 0x36;
+		k_opad[i] = key[i] ^ 0x5c;
+	}
+	explicit_bzero(key, sizeof (key));
+
+	/* Start inner SHA512. */
+	SHA512_Init(&ctx->innerctx);
+	SHA512_Update(&ctx->innerctx, k_ipad, sizeof (k_ipad));
+	explicit_bzero(k_ipad, sizeof (k_ipad));
+	/* Start outer SHA512. */
+	SHA512_Init(&ctx->outerctx);
+	SHA512_Update(&ctx->outerctx, k_opad, sizeof (k_opad));
+	explicit_bzero(k_opad, sizeof (k_opad));
+}
+
+void
+crypto_mac_update(struct hmac_ctx *ctx, const void *data, size_t datasize)
+{
+	SHA512_Update(&ctx->innerctx, data, datasize);
+}
+
+void
+crypto_mac_final(struct hmac_ctx *ctx, void *md, size_t mdsize)
+{
+	uint8_t digest[SHA512_DIGEST_LENGTH];
+
+	/* Complete inner hash */
+	SHA512_Final(digest, &ctx->innerctx);
+
+	/* Complete outer hash */
+	SHA512_Update(&ctx->outerctx, digest, sizeof (digest));
+	SHA512_Final(digest, &ctx->outerctx);
+
+	explicit_bzero(ctx, sizeof (*ctx));
+	/* mdsize == 0 means "Give me the whole hash!" */
+	if (mdsize == 0)
+		mdsize = SHA512_DIGEST_LENGTH;
+	bcopy(digest, md, mdsize);
+	explicit_bzero(digest, sizeof (digest));
+}
+
+void
+crypto_mac(const crypto_key_t *key, const void *in_data, size_t in_data_size,
+    void *out_data, size_t out_data_size)
+{
+	struct hmac_ctx ctx;
+
+	crypto_mac_init(&ctx, key);
+	crypto_mac_update(&ctx, in_data, in_data_size);
+	crypto_mac_final(&ctx, out_data, out_data_size);
+}
+
+static int
+freebsd_zfs_crypt_done(struct cryptop *crp)
+{
+	freebsd_crypt_session_t *ses;
+
+	ses = crp->crp_opaque;
+	mtx_lock(&ses->fs_lock);
+	ses->fs_done = true;
+	mtx_unlock(&ses->fs_lock);
+	wakeup(crp);
+	return (0);
+}
+
+void
+freebsd_crypt_freesession(freebsd_crypt_session_t *sess)
+{
+	mtx_destroy(&sess->fs_lock);
+	crypto_freesession(sess->fs_sid);
+	explicit_bzero(sess, sizeof (*sess));
+}
+
+static int
+zfs_crypto_dispatch(freebsd_crypt_session_t *session, 	struct cryptop *crp)
+{
+	int error;
+
+	crp->crp_opaque = session;
+	crp->crp_callback = freebsd_zfs_crypt_done;
+	for (;;) {
+		error = crypto_dispatch(crp);
+		if (error)
+			break;
+		mtx_lock(&session->fs_lock);
+		while (session->fs_done == false)
+			msleep(crp, &session->fs_lock, PRIBIO,
+			    "zfs_crypto", hz/5);
+		mtx_unlock(&session->fs_lock);
+
+		if (crp->crp_etype != EAGAIN) {
+			error = crp->crp_etype;
+			break;
+		}
+		crp->crp_etype = 0;
+		crp->crp_flags &= ~CRYPTO_F_DONE;
+		session->fs_done = false;
+#if __FreeBSD_version < 1300087
+		/*
+		 * Session ID changed, so we should record that,
+		 * and try again
+		 */
+		session->fs_sid = crp->crp_session;
+#endif
+	}
+	return (error);
+}
+static void
+freebsd_crypt_uio_debug_log(boolean_t encrypt,
+    freebsd_crypt_session_t *input_sessionp,
+    struct zio_crypt_info *c_info,
+    uio_t *data_uio,
+    crypto_key_t *key,
+    uint8_t *ivbuf,
+    size_t datalen,
+    size_t auth_len)
+{
+#ifdef FCRYPTO_DEBUG
+	struct cryptodesc *crd;
+	uint8_t *p = NULL;
+	size_t total = 0;
+
+	printf("%s(%s, %p, { %s, %d, %d, %s }, %p, { %d, %p, %u }, "
+	    "%p, %u, %u)\n",
+	    __FUNCTION__, encrypt ? "encrypt" : "decrypt", input_sessionp,
+	    c_info->ci_algname, c_info->ci_crypt_type,
+	    (unsigned int)c_info->ci_keylen, c_info->ci_name,
+	    data_uio, key->ck_format, key->ck_data,
+	    (unsigned int)key->ck_length,
+	    ivbuf, (unsigned int)datalen, (unsigned int)auth_len);
+	printf("\tkey = { ");
+	for (int i = 0; i < key->ck_length / 8; i++) {
+		uint8_t *b = (uint8_t *)key->ck_data;
+		printf("%02x ", b[i]);
+	}
+	printf("}\n");
+	for (int i = 0; i < data_uio->uio_iovcnt; i++) {
+		printf("\tiovec #%d: <%p, %u>\n", i,
+		    data_uio->uio_iov[i].iov_base,
+		    (unsigned int)data_uio->uio_iov[i].iov_len);
+		total += data_uio->uio_iov[i].iov_len;
+	}
+	data_uio->uio_resid = total;
+#endif
+}
+/*
+ * Create a new cryptographic session.  This should
+ * happen every time the key changes (including when
+ * it's first loaded).
+ */
+#if __FreeBSD_version >= 1300087
+int
+freebsd_crypt_newsession(freebsd_crypt_session_t *sessp,
+    struct zio_crypt_info *c_info, crypto_key_t *key)
+{
+	struct crypto_session_params csp;
+	int error = 0;
+
+#ifdef FCRYPTO_DEBUG
+	printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n",
+	    __FUNCTION__, sessp,
+	    c_info->ci_algname, c_info->ci_crypt_type,
+	    (unsigned int)c_info->ci_keylen, c_info->ci_name,
+	    key->ck_format, key->ck_data, (unsigned int)key->ck_length);
+	printf("\tkey = { ");
+	for (int i = 0; i < key->ck_length / 8; i++) {
+		uint8_t *b = (uint8_t *)key->ck_data;
+		printf("%02x ", b[i]);
+	}
+	printf("}\n");
+#endif
+	bzero(&csp, sizeof (csp));
+	csp.csp_mode = CSP_MODE_AEAD;
+	csp.csp_cipher_key = key->ck_data;
+	csp.csp_cipher_klen = key->ck_length / 8;
+	switch (c_info->ci_crypt_type) {
+		case ZC_TYPE_GCM:
+		csp.csp_cipher_alg = CRYPTO_AES_NIST_GCM_16;
+		csp.csp_ivlen = AES_GCM_IV_LEN;
+		switch (key->ck_length/8) {
+		case AES_128_GMAC_KEY_LEN:
+		case AES_192_GMAC_KEY_LEN:
+		case AES_256_GMAC_KEY_LEN:
+			break;
+		default:
+			error = EINVAL;
+			goto bad;
+		}
+		break;
+	case ZC_TYPE_CCM:
+		csp.csp_cipher_alg = CRYPTO_AES_CCM_16;
+		csp.csp_ivlen = AES_CCM_IV_LEN;
+		switch (key->ck_length/8) {
+		case AES_128_CBC_MAC_KEY_LEN:
+		case AES_192_CBC_MAC_KEY_LEN:
+		case AES_256_CBC_MAC_KEY_LEN:
+			break;
+		default:
+			error = EINVAL;
+			goto bad;
+			break;
+		}
+		break;
+	default:
+		error = ENOTSUP;
+		goto bad;
+	}
+	error = crypto_newsession(&sessp->fs_sid, &csp,
+	    CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE);
+	mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock",
+	    NULL, MTX_DEF);
+	crypt_sessions++;
+bad:
+#ifdef FCRYPTO_DEBUG
+	if (error)
+		printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+	return (error);
+}
+
+int
+freebsd_crypt_uio(boolean_t encrypt,
+    freebsd_crypt_session_t *input_sessionp,
+    struct zio_crypt_info *c_info,
+    uio_t *data_uio,
+    crypto_key_t *key,
+    uint8_t *ivbuf,
+    size_t datalen,
+    size_t auth_len)
+{
+	struct cryptop *crp;
+	freebsd_crypt_session_t *session = NULL;
+	int error = 0;
+	size_t total = 0;
+
+	freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio,
+	    key, ivbuf, datalen, auth_len);
+	for (int i = 0; i < data_uio->uio_iovcnt; i++)
+		total += data_uio->uio_iov[i].iov_len;
+	data_uio->uio_resid = total;
+	if (input_sessionp == NULL) {
+		session = kmem_zalloc(sizeof (*session), KM_SLEEP);
+		error = freebsd_crypt_newsession(session, c_info, key);
+		if (error)
+			goto out;
+	} else
+		session = input_sessionp;
+
+	crp = crypto_getreq(session->fs_sid, M_WAITOK);
+	if (encrypt) {
+		crp->crp_op = CRYPTO_OP_ENCRYPT |
+		    CRYPTO_OP_COMPUTE_DIGEST;
+	} else {
+		crp->crp_op = CRYPTO_OP_DECRYPT |
+		    CRYPTO_OP_VERIFY_DIGEST;
+	}
+	crp->crp_flags = CRYPTO_F_CBIFSYNC | CRYPTO_F_IV_SEPARATE;
+	crypto_use_uio(crp, data_uio);
+
+	crp->crp_aad_start = 0;
+	crp->crp_aad_length = auth_len;
+	crp->crp_payload_start = auth_len;
+	crp->crp_payload_length = datalen;
+	crp->crp_digest_start = auth_len + datalen;
+
+	bcopy(ivbuf, crp->crp_iv, ZIO_DATA_IV_LEN);
+	error = zfs_crypto_dispatch(session, crp);
+	crypto_freereq(crp);
+out:
+#ifdef FCRYPTO_DEBUG
+	if (error)
+		printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+	if (input_sessionp == NULL) {
+		freebsd_crypt_freesession(session);
+		kmem_free(session, sizeof (*session));
+	}
+	return (error);
+}
+
+#else
+int
+freebsd_crypt_newsession(freebsd_crypt_session_t *sessp,
+    struct zio_crypt_info *c_info, crypto_key_t *key)
+{
+	struct cryptoini cria, crie, *crip;
+	struct enc_xform *xform;
+	struct auth_hash *xauth;
+	int error = 0;
+	crypto_session_t sid;
+
+#ifdef FCRYPTO_DEBUG
+	printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n",
+	    __FUNCTION__, sessp,
+	    c_info->ci_algname, c_info->ci_crypt_type,
+	    (unsigned int)c_info->ci_keylen, c_info->ci_name,
+	    key->ck_format, key->ck_data, (unsigned int)key->ck_length);
+	printf("\tkey = { ");
+	for (int i = 0; i < key->ck_length / 8; i++) {
+		uint8_t *b = (uint8_t *)key->ck_data;
+		printf("%02x ", b[i]);
+	}
+	printf("}\n");
+#endif
+	switch (c_info->ci_crypt_type) {
+	case ZC_TYPE_GCM:
+		xform = &enc_xform_aes_nist_gcm;
+		switch (key->ck_length/8) {
+		case AES_128_GMAC_KEY_LEN:
+			xauth = &auth_hash_nist_gmac_aes_128;
+			break;
+		case AES_192_GMAC_KEY_LEN:
+			xauth = &auth_hash_nist_gmac_aes_192;
+			break;
+		case AES_256_GMAC_KEY_LEN:
+			xauth = &auth_hash_nist_gmac_aes_256;
+			break;
+		default:
+			error = EINVAL;
+			goto bad;
+		}
+		break;
+	case ZC_TYPE_CCM:
+		xform = &enc_xform_ccm;
+		switch (key->ck_length/8) {
+		case AES_128_CBC_MAC_KEY_LEN:
+			xauth = &auth_hash_ccm_cbc_mac_128;
+			break;
+		case AES_192_CBC_MAC_KEY_LEN:
+			xauth = &auth_hash_ccm_cbc_mac_192;
+			break;
+		case AES_256_CBC_MAC_KEY_LEN:
+			xauth = &auth_hash_ccm_cbc_mac_256;
+			break;
+		default:
+			error = EINVAL;
+			goto bad;
+			break;
+		}
+		break;
+	default:
+		error = ENOTSUP;
+		goto bad;
+	}
+#ifdef FCRYPTO_DEBUG
+	printf("%s(%d): Using crypt %s (key length %u [%u bytes]), "
+	    "auth %s (key length %d)\n",
+	    __FUNCTION__, __LINE__,
+	    xform->name, (unsigned int)key->ck_length,
+	    (unsigned int)key->ck_length/8,
+	    xauth->name, xauth->keysize);
+#endif
+
+	bzero(&crie, sizeof (crie));
+	bzero(&cria, sizeof (cria));
+
+	crie.cri_alg = xform->type;
+	crie.cri_key = key->ck_data;
+	crie.cri_klen = key->ck_length;
+
+	cria.cri_alg = xauth->type;
+	cria.cri_key = key->ck_data;
+	cria.cri_klen = key->ck_length;
+
+	cria.cri_next = &crie;
+	crie.cri_next = NULL;
+	crip = &cria;
+	// Everything else is bzero'd
+
+	error = crypto_newsession(&sid, crip,
+	    CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE);
+	if (error != 0) {
+		printf("%s(%d):  crypto_newsession failed with %d\n",
+		    __FUNCTION__, __LINE__, error);
+		goto bad;
+	}
+	sessp->fs_sid = sid;
+	mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock",
+	    NULL, MTX_DEF);
+	crypt_sessions++;
+bad:
+	return (error);
+}
+
+/*
+ * The meat of encryption/decryption.
+ * If sessp is NULL, then it will create a
+ * temporary cryptographic session, and release
+ * it when done.
+ */
+int
+freebsd_crypt_uio(boolean_t encrypt,
+    freebsd_crypt_session_t *input_sessionp,
+    struct zio_crypt_info *c_info,
+    uio_t *data_uio,
+    crypto_key_t *key,
+    uint8_t *ivbuf,
+    size_t datalen,
+    size_t auth_len)
+{
+	struct cryptop *crp;
+	struct cryptodesc *enc_desc, *auth_desc;
+	struct enc_xform *xform;
+	struct auth_hash *xauth;
+	freebsd_crypt_session_t *session = NULL;
+	int error;
+
+	freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio,
+	    key, ivbuf, datalen, auth_len);
+	switch (c_info->ci_crypt_type) {
+	case ZC_TYPE_GCM:
+		xform = &enc_xform_aes_nist_gcm;
+		switch (key->ck_length/8) {
+		case AES_128_GMAC_KEY_LEN:
+			xauth = &auth_hash_nist_gmac_aes_128;
+			break;
+		case AES_192_GMAC_KEY_LEN:
+			xauth = &auth_hash_nist_gmac_aes_192;
+			break;
+		case AES_256_GMAC_KEY_LEN:
+			xauth = &auth_hash_nist_gmac_aes_256;
+			break;
+		default:
+			error = EINVAL;
+			goto bad;
+		}
+		break;
+	case ZC_TYPE_CCM:
+		xform = &enc_xform_ccm;
+		switch (key->ck_length/8) {
+		case AES_128_CBC_MAC_KEY_LEN:
+			xauth = &auth_hash_ccm_cbc_mac_128;
+			break;
+		case AES_192_CBC_MAC_KEY_LEN:
+			xauth = &auth_hash_ccm_cbc_mac_192;
+			break;
+		case AES_256_CBC_MAC_KEY_LEN:
+			xauth = &auth_hash_ccm_cbc_mac_256;
+			break;
+		default:
+			error = EINVAL;
+			goto bad;
+			break;
+		}
+		break;
+	default:
+		error = ENOTSUP;
+		goto bad;
+	}
+
+#ifdef FCRYPTO_DEBUG
+	printf("%s(%d): Using crypt %s (key length %u [%u bytes]), "
+	    "auth %s (key length %d)\n",
+	    __FUNCTION__, __LINE__,
+	    xform->name, (unsigned int)key->ck_length,
+	    (unsigned int)key->ck_length/8,
+	    xauth->name, xauth->keysize);
+#endif
+
+	if (input_sessionp == NULL) {
+		session = kmem_zalloc(sizeof (*session), KM_SLEEP);
+		error = freebsd_crypt_newsession(session, c_info, key);
+		if (error)
+			goto out;
+	} else
+		session = input_sessionp;
+
+	crp = crypto_getreq(2);
+	if (crp == NULL) {
+		error = ENOMEM;
+		goto bad;
+	}
+
+	auth_desc = crp->crp_desc;
+	enc_desc = auth_desc->crd_next;
+
+	crp->crp_session = session->fs_sid;
+	crp->crp_ilen = auth_len + datalen;
+	crp->crp_buf = (void*)data_uio;
+	crp->crp_flags = CRYPTO_F_IOV | CRYPTO_F_CBIFSYNC;
+
+	auth_desc->crd_skip = 0;
+	auth_desc->crd_len = auth_len;
+	auth_desc->crd_inject = auth_len + datalen;
+	auth_desc->crd_alg = xauth->type;
+#ifdef FCRYPTO_DEBUG
+	printf("%s: auth: skip = %u, len = %u, inject = %u\n",
+	    __FUNCTION__, auth_desc->crd_skip, auth_desc->crd_len,
+	    auth_desc->crd_inject);
+#endif
+
+	enc_desc->crd_skip = auth_len;
+	enc_desc->crd_len = datalen;
+	enc_desc->crd_inject = auth_len;
+	enc_desc->crd_alg = xform->type;
+	enc_desc->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT;
+	bcopy(ivbuf, enc_desc->crd_iv, ZIO_DATA_IV_LEN);
+	enc_desc->crd_next = NULL;
+
+#ifdef FCRYPTO_DEBUG
+	printf("%s: enc: skip = %u, len = %u, inject = %u\n",
+	    __FUNCTION__, enc_desc->crd_skip, enc_desc->crd_len,
+	    enc_desc->crd_inject);
+#endif
+
+	if (encrypt)
+		enc_desc->crd_flags |= CRD_F_ENCRYPT;
+
+	error = zfs_crypto_dispatch(session, crp);
+	crypto_freereq(crp);
+out:
+	if (input_sessionp == NULL) {
+		freebsd_crypt_freesession(session);
+		kmem_free(session, sizeof (*session));
+	}
+bad:
+#ifdef FCRYPTO_DEBUG
+	if (error)
+		printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+	return (error);
+}
+#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
new file mode 100644
index 000000000000..8e412d9c1359
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_prop.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/sa.h>
+#include <sys/zfeature.h>
+#include <sys/abd.h>
+#include <sys/zfs_rlock.h>
+#include <sys/racct.h>
+#include <sys/vm.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+
+#include <sys/ccompat.h>
+
+#ifndef IDX_TO_OFF
+#define	IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
+#endif
+
+#if  __FreeBSD_version < 1300051
+#define	VM_ALLOC_BUSY_FLAGS VM_ALLOC_NOBUSY
+#else
+#define	VM_ALLOC_BUSY_FLAGS  VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY
+#endif
+
+
+#if __FreeBSD_version < 1300072
+#define	dmu_page_lock(m)	vm_page_lock(m)
+#define	dmu_page_unlock(m)	vm_page_unlock(m)
+#else
+#define	dmu_page_lock(m)
+#define	dmu_page_unlock(m)
+#endif
+
+static int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+	dnode_t *dn;
+	int err;
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
+
+	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+	    numbufsp, dbpp, DMU_READ_PREFETCH);
+
+	dnode_rele(dn, FTAG);
+
+	return (err);
+}
+
+int
+dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    vm_page_t *ma, dmu_tx_t *tx)
+{
+	dmu_buf_t **dbp;
+	struct sf_buf *sf;
+	int numbufs, i;
+	int err;
+
+	if (size == 0)
+		return (0);
+
+	err = dmu_buf_hold_array(os, object, offset, size,
+	    FALSE, FTAG, &numbufs, &dbp);
+	if (err)
+		return (err);
+
+	for (i = 0; i < numbufs; i++) {
+		int tocpy, copied, thiscpy;
+		int bufoff;
+		dmu_buf_t *db = dbp[i];
+		caddr_t va;
+
+		ASSERT(size > 0);
+		ASSERT3U(db->db_size, >=, PAGESIZE);
+
+		bufoff = offset - db->db_offset;
+		tocpy = (int)MIN(db->db_size - bufoff, size);
+
+		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+		if (tocpy == db->db_size)
+			dmu_buf_will_fill(db, tx);
+		else
+			dmu_buf_will_dirty(db, tx);
+
+		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
+			ASSERT3U(ptoa((*ma)->pindex), ==,
+			    db->db_offset + bufoff);
+			thiscpy = MIN(PAGESIZE, tocpy - copied);
+			va = zfs_map_page(*ma, &sf);
+			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
+			zfs_unmap_page(sf);
+			ma += 1;
+			bufoff += PAGESIZE;
+		}
+
+		if (tocpy == db->db_size)
+			dmu_buf_fill_done(db, tx);
+
+		offset += tocpy;
+		size -= tocpy;
+	}
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+	return (err);
+}
+
+int
+dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
+    int *rbehind, int *rahead, int last_size)
+{
+	struct sf_buf *sf;
+	vm_object_t vmobj;
+	vm_page_t m;
+	dmu_buf_t **dbp;
+	dmu_buf_t *db;
+	caddr_t va;
+	int numbufs, i;
+	int bufoff, pgoff, tocpy;
+	int mi, di;
+	int err;
+
+	ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
+	ASSERT(last_size <= PAGE_SIZE);
+
+	err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
+	    IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
+	if (err != 0)
+		return (err);
+
+#ifdef ZFS_DEBUG
+	IMPLY(last_size < PAGE_SIZE, *rahead == 0);
+	if (dbp[0]->db_offset != 0 || numbufs > 1) {
+		for (i = 0; i < numbufs; i++) {
+			ASSERT(ISP2(dbp[i]->db_size));
+			ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
+			ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
+		}
+	}
+#endif
+
+	vmobj = ma[0]->object;
+	zfs_vmobject_wlock_12(vmobj);
+
+	db = dbp[0];
+	for (i = 0; i < *rbehind; i++) {
+		m = vm_page_grab_unlocked(vmobj, ma[0]->pindex - 1 - i,
+		    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS);
+		if (m == NULL)
+			break;
+		if (!vm_page_none_valid(m)) {
+			ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+			vm_page_do_sunbusy(m);
+			break;
+		}
+		ASSERT(m->dirty == 0);
+		ASSERT(!pmap_page_is_write_mapped(m));
+
+		ASSERT(db->db_size > PAGE_SIZE);
+		bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+		va = zfs_map_page(m, &sf);
+		bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
+		zfs_unmap_page(sf);
+		vm_page_valid(m);
+		dmu_page_lock(m);
+		if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+			vm_page_activate(m);
+		else
+			vm_page_deactivate(m);
+		dmu_page_unlock(m);
+		vm_page_do_sunbusy(m);
+	}
+	*rbehind = i;
+
+	bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
+	pgoff = 0;
+	for (mi = 0, di = 0; mi < count && di < numbufs; ) {
+		if (pgoff == 0) {
+			m = ma[mi];
+			if (m != bogus_page) {
+				vm_page_assert_xbusied(m);
+				ASSERT(vm_page_none_valid(m));
+				ASSERT(m->dirty == 0);
+				ASSERT(!pmap_page_is_write_mapped(m));
+				va = zfs_map_page(m, &sf);
+			}
+		}
+		if (bufoff == 0)
+			db = dbp[di];
+
+		if (m != bogus_page) {
+			ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
+			    db->db_offset + bufoff);
+		}
+
+		/*
+		 * We do not need to clamp the copy size by the file
+		 * size as the last block is zero-filled beyond the
+		 * end of file anyway.
+		 */
+		tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
+		if (m != bogus_page)
+			bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);
+
+		pgoff += tocpy;
+		ASSERT(pgoff <= PAGESIZE);
+		if (pgoff == PAGESIZE) {
+			if (m != bogus_page) {
+				zfs_unmap_page(sf);
+				vm_page_valid(m);
+			}
+			ASSERT(mi < count);
+			mi++;
+			pgoff = 0;
+		}
+
+		bufoff += tocpy;
+		ASSERT(bufoff <= db->db_size);
+		if (bufoff == db->db_size) {
+			ASSERT(di < numbufs);
+			di++;
+			bufoff = 0;
+		}
+	}
+
+#ifdef ZFS_DEBUG
+	/*
+	 * Three possibilities:
+	 * - last requested page ends at a buffer boundary and , thus,
+	 *   all pages and buffers have been iterated;
+	 * - all requested pages are filled, but the last buffer
+	 *   has not been exhausted;
+	 *   the read-ahead is possible only in this case;
+	 * - all buffers have been read, but the last page has not been
+	 *   fully filled;
+	 *   this is only possible if the file has only a single buffer
+	 *   with a size that is not a multiple of the page size.
+	 */
+	if (mi == count) {
+		ASSERT(di >= numbufs - 1);
+		IMPLY(*rahead != 0, di == numbufs - 1);
+		IMPLY(*rahead != 0, bufoff != 0);
+		ASSERT(pgoff == 0);
+	}
+	if (di == numbufs) {
+		ASSERT(mi >= count - 1);
+		ASSERT(*rahead == 0);
+		IMPLY(pgoff == 0, mi == count);
+		if (pgoff != 0) {
+			ASSERT(mi == count - 1);
+			ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
+		}
+	}
+#endif
+	if (pgoff != 0) {
+		ASSERT(m != bogus_page);
+		bzero(va + pgoff, PAGESIZE - pgoff);
+		zfs_unmap_page(sf);
+		vm_page_valid(m);
+	}
+
+	for (i = 0; i < *rahead; i++) {
+		m = vm_page_grab_unlocked(vmobj, ma[count - 1]->pindex + 1 + i,
+		    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS);
+		if (m == NULL)
+			break;
+		if (!vm_page_none_valid(m)) {
+			ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+			vm_page_do_sunbusy(m);
+			break;
+		}
+		ASSERT(m->dirty == 0);
+		ASSERT(!pmap_page_is_mapped(m));
+
+		ASSERT(db->db_size > PAGE_SIZE);
+		bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+		tocpy = MIN(db->db_size - bufoff, PAGESIZE);
+		va = zfs_map_page(m, &sf);
+		bcopy((char *)db->db_data + bufoff, va, tocpy);
+		if (tocpy < PAGESIZE) {
+			ASSERT(i == *rahead - 1);
+			ASSERT((db->db_size & PAGE_MASK) != 0);
+			bzero(va + tocpy, PAGESIZE - tocpy);
+		}
+		zfs_unmap_page(sf);
+		vm_page_valid(m);
+		dmu_page_lock(m);
+		if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+			vm_page_activate(m);
+		else
+			vm_page_deactivate(m);
+		dmu_page_unlock(m);
+		vm_page_do_sunbusy(m);
+	}
+	*rahead = i;
+	zfs_vmobject_wunlock_12(vmobj);
+
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+	return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/hkdf.c b/sys/contrib/openzfs/module/os/freebsd/zfs/hkdf.c
new file mode 100644
index 000000000000..8324ff2319b6
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/hkdf.c
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/hkdf.h>
+#include <sys/freebsd_crypto.h>
+#include <sys/hkdf.h>
+
+static int
+hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material,
+    uint_t km_len, uint8_t *out_buf)
+{
+	crypto_key_t key;
+
+	/* initialize the salt as a crypto key */
+	key.ck_format = CRYPTO_KEY_RAW;
+	key.ck_length = CRYPTO_BYTES2BITS(salt_len);
+	key.ck_data = salt;
+
+	crypto_mac(&key, key_material, km_len, out_buf, SHA512_DIGEST_LENGTH);
+
+	return (0);
+}
+
+static int
+hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len,
+    uint8_t *out_buf, uint_t out_len)
+{
+	struct hmac_ctx ctx;
+	crypto_key_t key;
+	uint_t i, T_len = 0, pos = 0;
+	uint8_t c;
+	uint_t N = (out_len + SHA512_DIGEST_LENGTH) / SHA512_DIGEST_LENGTH;
+	uint8_t T[SHA512_DIGEST_LENGTH];
+
+	if (N > 255)
+		return (SET_ERROR(EINVAL));
+
+	/* initialize the salt as a crypto key */
+	key.ck_format = CRYPTO_KEY_RAW;
+	key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH);
+	key.ck_data = extract_key;
+
+	for (i = 1; i <= N; i++) {
+		c = i;
+
+		crypto_mac_init(&ctx, &key);
+		crypto_mac_update(&ctx, T, T_len);
+		crypto_mac_update(&ctx, info, info_len);
+		crypto_mac_update(&ctx, &c, 1);
+		crypto_mac_final(&ctx, T, SHA512_DIGEST_LENGTH);
+		bcopy(T, out_buf + pos,
+		    (i != N) ? SHA512_DIGEST_LENGTH : (out_len - pos));
+		pos += SHA512_DIGEST_LENGTH;
+	}
+
+	return (0);
+}
+
+/*
+ * HKDF is designed to be a relatively fast function for deriving keys from a
+ * master key + a salt. We use this function to generate new encryption keys
+ * so as to avoid hitting the cryptographic limits of the underlying
+ * encryption modes. Note that, for the sake of deriving encryption keys, the
+ * info parameter is called the "salt" everywhere else in the code.
+ */
+int
+hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt,
+    uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key,
+    uint_t out_len)
+{
+	int ret;
+	uint8_t extract_key[SHA512_DIGEST_LENGTH];
+
+	ret = hkdf_sha512_extract(salt, salt_len, key_material, km_len,
+	    extract_key);
+	if (ret != 0)
+		return (ret);
+
+	ret = hkdf_sha512_expand(extract_key, info, info_len, output_key,
+	    out_len);
+	if (ret != 0)
+		return (ret);
+
+	return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
new file mode 100644
index 000000000000..dce73577eacd
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/eventhandler.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/fm/util.h>
+#include <sys/sunddi.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/nvpair.h>
+#include <sys/mount.h>
+#include <sys/taskqueue.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zcp.h>
+#include <sys/zio_checksum.h>
+#include <sys/vdev_removal.h>
+#include <sys/dsl_crypt.h>
+
+#include <sys/zfs_ioctl_compat.h>
+#include <sys/zfs_ioctl_impl.h>
+
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+#include "zfs_comutil.h"
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_DECL(_vfs_zfs_vdev);
+
+
+static int zfs_version_ioctl = ZFS_IOCVER_OZFS;
+SYSCTL_DECL(_vfs_zfs_version);
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl,
+    0, "ZFS_IOCTL_VERSION");
+
+static struct cdev *zfsdev;
+
+static struct root_hold_token *zfs_root_token;
+
+extern uint_t rrw_tsd_key;
+extern uint_t zfs_allow_log_key;
+extern uint_t zfs_geom_probe_vdev_key;
+
+static int zfs__init(void);
+static int zfs__fini(void);
+static void zfs_shutdown(void *, int);
+
+static eventhandler_tag zfs_shutdown_event_tag;
+extern zfsdev_state_t *zfsdev_state_list;
+
+#define	ZFS_MIN_KSTACK_PAGES 4
+
+
+static int
+zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag,
+    struct thread *td)
+{
+	uint_t len;
+	int vecnum;
+	zfs_iocparm_t *zp;
+	zfs_cmd_t *zc;
+	zfs_cmd_legacy_t *zcl;
+	int rc, error;
+	void *uaddr;
+
+	len = IOCPARM_LEN(zcmd);
+	vecnum = zcmd & 0xff;
+	zp = (void *)arg;
+	uaddr = (void *)zp->zfs_cmd;
+	error = 0;
+	zcl = NULL;
+
+	if (len != sizeof (zfs_iocparm_t)) {
+		printf("len %d vecnum: %d sizeof (zfs_cmd_t) %ju\n",
+		    len, vecnum, (uintmax_t)sizeof (zfs_cmd_t));
+		return (EINVAL);
+	}
+
+	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+	/*
+	 * Remap ioctl code for legacy user binaries
+	 */
+	if (zp->zfs_ioctl_version == ZFS_IOCVER_LEGACY) {
+		vecnum = zfs_ioctl_legacy_to_ozfs(vecnum);
+		if (vecnum < 0) {
+			kmem_free(zc, sizeof (zfs_cmd_t));
+			return (ENOTSUP);
+		}
+		zcl = kmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP);
+		if (copyin(uaddr, zcl, sizeof (zfs_cmd_legacy_t))) {
+			error = SET_ERROR(EFAULT);
+			goto out;
+		}
+		zfs_cmd_legacy_to_ozfs(zcl, zc);
+	} else if (copyin(uaddr, zc, sizeof (zfs_cmd_t))) {
+		error = SET_ERROR(EFAULT);
+		goto out;
+	}
+	error = zfsdev_ioctl_common(vecnum, zc, 0);
+	if (zcl) {
+		zfs_cmd_ozfs_to_legacy(zc, zcl);
+		rc = copyout(zcl, uaddr, sizeof (*zcl));
+	} else {
+		rc = copyout(zc, uaddr, sizeof (*zc));
+	}
+	if (error == 0 && rc != 0)
+		error = SET_ERROR(EFAULT);
+out:
+	if (zcl)
+		kmem_free(zcl, sizeof (zfs_cmd_legacy_t));
+	kmem_free(zc, sizeof (zfs_cmd_t));
+	return (error);
+}
+
+static void
+zfsdev_close(void *data)
+{
+	zfsdev_state_t *zs, *zsp = data;
+
+	mutex_enter(&zfsdev_state_lock);
+	for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+		if (zs == zsp)
+			break;
+	}
+	if (zs == NULL || zs->zs_minor <= 0) {
+		mutex_exit(&zfsdev_state_lock);
+		return;
+	}
+	zs->zs_minor = -1;
+	zfs_onexit_destroy(zs->zs_onexit);
+	zfs_zevent_destroy(zs->zs_zevent);
+	mutex_exit(&zfsdev_state_lock);
+	zs->zs_onexit = NULL;
+	zs->zs_zevent = NULL;
+}
+
+static int
+zfs_ctldev_init(struct cdev *devp)
+{
+	boolean_t newzs = B_FALSE;
+	minor_t minor;
+	zfsdev_state_t *zs, *zsprev = NULL;
+
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+	minor = zfsdev_minor_alloc();
+	if (minor == 0)
+		return (SET_ERROR(ENXIO));
+
+	for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+		if (zs->zs_minor == -1)
+			break;
+		zsprev = zs;
+	}
+
+	if (!zs) {
+		zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP);
+		newzs = B_TRUE;
+	}
+
+	devfs_set_cdevpriv(zs, zfsdev_close);
+
+	zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit);
+	zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent);
+
+	if (newzs) {
+		zs->zs_minor = minor;
+		wmb();
+		zsprev->zs_next = zs;
+	} else {
+		wmb();
+		zs->zs_minor = minor;
+	}
+	return (0);
+}
+
+static int
+zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
+{
+	int error;
+
+	mutex_enter(&zfsdev_state_lock);
+	error = zfs_ctldev_init(devp);
+	mutex_exit(&zfsdev_state_lock);
+
+	return (error);
+}
+
+static struct cdevsw zfs_cdevsw = {
+	.d_version =	D_VERSION,
+	.d_open =	zfsdev_open,
+	.d_ioctl =	zfsdev_ioctl,
+	.d_name =	ZFS_DRIVER
+};
+
+int
+zfsdev_attach(void)
+{
+	zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666,
+	    ZFS_DRIVER);
+	return (0);
+}
+
+void
+zfsdev_detach(void)
+{
+	if (zfsdev != NULL)
+		destroy_dev(zfsdev);
+}
+
+int
+zfs__init(void)
+{
+	int error;
+
+#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES
+	printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack "
+	    "overflow panic!\nPlease consider adding "
+	    "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES,
+	    ZFS_MIN_KSTACK_PAGES);
+#endif
+	zfs_root_token = root_mount_hold("ZFS");
+	if ((error = zfs_kmod_init()) != 0) {
+		printf("ZFS: Failed to Load ZFS Filesystem"
+		    ", rc = %d\n", error);
+		root_mount_rel(zfs_root_token);
+		return (error);
+	}
+
+
+	tsd_create(&zfs_geom_probe_vdev_key, NULL);
+
+	printf("ZFS storage pool version: features support ("
+	    SPA_VERSION_STRING ")\n");
+	root_mount_rel(zfs_root_token);
+	ddi_sysevent_init();
+	return (0);
+}
+
+int
+zfs__fini(void)
+{
+	if (zfs_busy() || zvol_busy() ||
+	    zio_injection_enabled) {
+		return (EBUSY);
+	}
+	zfs_kmod_fini();
+	tsd_destroy(&zfs_geom_probe_vdev_key);
+	return (0);
+}
+
+static void
+zfs_shutdown(void *arg __unused, int howto __unused)
+{
+
+	/*
+	 * ZFS fini routines can not properly work in a panic-ed system.
+	 */
+	if (panicstr == NULL)
+		zfs__fini();
+}
+
+
+static int
+zfs_modevent(module_t mod, int type, void *unused __unused)
+{
+	int err;
+
+	switch (type) {
+	case MOD_LOAD:
+		err = zfs__init();
+		if (err == 0)
+			zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
+			    shutdown_post_sync, zfs_shutdown, NULL,
+			    SHUTDOWN_PRI_FIRST);
+		return (err);
+	case MOD_UNLOAD:
+		err = zfs__fini();
+		if (err == 0 && zfs_shutdown_event_tag != NULL)
+			EVENTHANDLER_DEREGISTER(shutdown_post_sync,
+			    zfs_shutdown_event_tag);
+		return (err);
+	case MOD_SHUTDOWN:
+		return (0);
+	default:
+		break;
+	}
+	return (EOPNOTSUPP);
+}
+
+static moduledata_t zfs_mod = {
+	"zfsctrl",
+	zfs_modevent,
+	0
+};
+
+#ifdef _KERNEL
+EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
+#endif
+
+DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_CLOCKS, SI_ORDER_ANY);
+MODULE_VERSION(zfsctrl, 1);
+#if __FreeBSD_version > 1300092
+MODULE_DEPEND(zfsctrl, xdr, 1, 1, 1);
+#else
+MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1);
+#endif
+MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1);
+MODULE_DEPEND(zfsctrl, crypto, 1, 1, 1);
+MODULE_DEPEND(zfsctrl, cryptodev, 1, 1, 1);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c
new file mode 100644
index 000000000000..2bc78cb451e8
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c
@@ -0,0 +1,281 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ */
+
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/ddt.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_os.h>
+#include <sys/vdev_removal.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/bpobj.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_objset.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/fs/zfs.h>
+#include <sys/arc.h>
+#include <sys/callb.h>
+#include <sys/spa_boot.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zvol.h>
+#include <sys/abd.h>
+#include <sys/callb.h>
+#include <sys/zone.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+static nvlist_t *
+spa_generate_rootconf(const char *name)
+{
+	nvlist_t **configs, **tops;
+	nvlist_t *config;
+	nvlist_t *best_cfg, *nvtop, *nvroot;
+	uint64_t *holes;
+	uint64_t best_txg;
+	uint64_t nchildren;
+	uint64_t pgid;
+	uint64_t count;
+	uint64_t i;
+	uint_t   nholes;
+
+	if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
+		return (NULL);
+
+	ASSERT3U(count, !=, 0);
+	best_txg = 0;
+	for (i = 0; i < count; i++) {
+		uint64_t txg;
+
+		VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
+		    &txg) == 0);
+		if (txg > best_txg) {
+			best_txg = txg;
+			best_cfg = configs[i];
+		}
+	}
+
+	nchildren = 1;
+	nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
+	holes = NULL;
+	nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
+	    &holes, &nholes);
+
+	tops = kmem_zalloc(nchildren * sizeof (void *), KM_SLEEP);
+	for (i = 0; i < nchildren; i++) {
+		if (i >= count)
+			break;
+		if (configs[i] == NULL)
+			continue;
+		VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
+		    &nvtop) == 0);
+		nvlist_dup(nvtop, &tops[i], KM_SLEEP);
+	}
+	for (i = 0; holes != NULL && i < nholes; i++) {
+		if (i >= nchildren)
+			continue;
+		if (tops[holes[i]] != NULL)
+			continue;
+		nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
+		VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
+		    VDEV_TYPE_HOLE) == 0);
+		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
+		    holes[i]) == 0);
+		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
+		    0) == 0);
+	}
+	for (i = 0; i < nchildren; i++) {
+		if (tops[i] != NULL)
+			continue;
+		nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
+		VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
+		    VDEV_TYPE_MISSING) == 0);
+		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
+		    i) == 0);
+		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
+		    0) == 0);
+	}
+
+	/*
+	 * Create pool config based on the best vdev config.
+	 */
+	nvlist_dup(best_cfg, &config, KM_SLEEP);
+
+	/*
+	 * Put this pool's top-level vdevs into a root vdev.
+	 */
+	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &pgid) == 0);
+	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_ROOT) == 0);
+	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
+	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
+	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    tops, nchildren) == 0);
+
+	/*
+	 * Replace the existing vdev_tree with the new root vdev in
+	 * this pool's configuration (remove the old, add the new).
+	 */
+	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+
+	/*
+	 * Drop vdev config elements that should not be present at pool level.
+	 */
+	nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
+	nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
+
+	for (i = 0; i < count; i++)
+		nvlist_free(configs[i]);
+	kmem_free(configs, count * sizeof (void *));
+	for (i = 0; i < nchildren; i++)
+		nvlist_free(tops[i]);
+	kmem_free(tops, nchildren * sizeof (void *));
+	nvlist_free(nvroot);
+	return (config);
+}
+
+int
+spa_import_rootpool(const char *name, bool checkpointrewind)
+{
+	spa_t *spa;
+	vdev_t *rvd;
+	nvlist_t *config, *nvtop;
+	uint64_t txg;
+	char *pname;
+	int error;
+
+	/*
+	 * Read the label from the boot device and generate a configuration.
+	 */
+	config = spa_generate_rootconf(name);
+
+	mutex_enter(&spa_namespace_lock);
+	if (config != NULL) {
+		VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+		    &pname) == 0 && strcmp(name, pname) == 0);
+		VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
+		    == 0);
+
+		if ((spa = spa_lookup(pname)) != NULL) {
+			/*
+			 * The pool could already be imported,
+			 * e.g., after reboot -r.
+			 */
+			if (spa->spa_state == POOL_STATE_ACTIVE) {
+				mutex_exit(&spa_namespace_lock);
+				nvlist_free(config);
+				return (0);
+			}
+
+			/*
+			 * Remove the existing root pool from the namespace so
+			 * that we can replace it with the correct config
+			 * we just read in.
+			 */
+			spa_remove(spa);
+		}
+		spa = spa_add(pname, config, NULL);
+
+		/*
+		 * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
+		 * via spa_version().
+		 */
+		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+		    &spa->spa_ubsync.ub_version) != 0)
+			spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+	} else if ((spa = spa_lookup(name)) == NULL) {
+		mutex_exit(&spa_namespace_lock);
+		nvlist_free(config);
+		cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
+		    name);
+		return (EIO);
+	} else {
+		VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
+	}
+	spa->spa_is_root = B_TRUE;
+	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
+	if (checkpointrewind) {
+		spa->spa_import_flags |= ZFS_IMPORT_CHECKPOINT;
+	}
+
+	/*
+	 * Build up a vdev tree based on the boot device's label config.
+	 */
+	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvtop) == 0);
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
+	    VDEV_ALLOC_ROOTPOOL);
+	spa_config_exit(spa, SCL_ALL, FTAG);
+	if (error) {
+		mutex_exit(&spa_namespace_lock);
+		nvlist_free(config);
+		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
+		    pname);
+		return (error);
+	}
+
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+	vdev_free(rvd);
+	spa_config_exit(spa, SCL_ALL, FTAG);
+	mutex_exit(&spa_namespace_lock);
+
+	nvlist_free(config);
+	return (0);
+}
+
+const char *
+spa_history_zone(void)
+{
+	return ("freebsd");
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/spa_stats.c b/sys/contrib/openzfs/module/os/freebsd/zfs/spa_stats.c
new file mode 100644
index 000000000000..45c880ada24d
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/spa_stats.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/spa.h>
+#include <zfs_comutil.h>
+
+void
+spa_stats_init(spa_t *spa)
+{
+
+}
+
+void
+spa_stats_destroy(spa_t *spa)
+{
+
+}
+
+void
+spa_iostats_trim_add(spa_t *spa, trim_type_t type,
+    uint64_t extents_written, uint64_t bytes_written,
+    uint64_t extents_skipped, uint64_t bytes_skipped,
+    uint64_t extents_failed, uint64_t bytes_failed)
+{
+}
+
+void
+spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
+{
+}
+
+void
+spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
+{
+
+}
+/*
+ * Set txg state completion time and increment current state.
+ */
+int
+spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
+    hrtime_t completed_time)
+{
+	return (0);
+}
+
+txg_stat_t *
+spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp)
+{
+	return (NULL);
+}
+
+void
+spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts)
+{
+
+}
+
+void
+spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
+{
+
+}
+
+void
+spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
+    uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id,
+    int error)
+{
+
+}
+
+int
+spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error,
+    hrtime_t duration)
+{
+	return (0);
+}
+
+int
+spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id)
+{
+	return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
new file mode 100644
index 000000000000..200bbf43d757
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
@@ -0,0 +1,693 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/sunddi.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/nvpair.h>
+#include <sys/mount.h>
+#include <sys/taskqueue.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zcp.h>
+#include <sys/zio_checksum.h>
+#include <sys/vdev_removal.h>
+#include <sys/dsl_crypt.h>
+
+#include <sys/zfs_ioctl_compat.h>
+#include <sys/zfs_context.h>
+
+#include <sys/arc_impl.h>
+#include <sys/dsl_pool.h>
+
+
+/* BEGIN CSTYLED */
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS adaptive replacement cache");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0, "ZFS disk buf cache");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, deadman, CTLFLAG_RW, 0, "ZFS deadman");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS dedup");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, l2arc, CTLFLAG_RW, 0, "ZFS l2arc");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, livelist, CTLFLAG_RW, 0, "ZFS livelist");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, lua, CTLFLAG_RW, 0, "ZFS lua");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, mg, CTLFLAG_RW, 0, "ZFS metaslab group");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, multihost, CTLFLAG_RW, 0, "ZFS multihost protection");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, prefetch, CTLFLAG_RW, 0, "ZFS prefetch");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, reconstruct, CTLFLAG_RW, 0, "ZFS reconstruct");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, recv, CTLFLAG_RW, 0, "ZFS receive");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, send, CTLFLAG_RW, 0, "ZFS send");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "ZFS space allocation");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS transaction group");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
+
+SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0,
+    "ZFS livelist condense");
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
+    "ZFS VDEV mirror");
+
+SYSCTL_DECL(_vfs_zfs_version);
+SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD,
+    (ZFS_META_VERSION "-" ZFS_META_RELEASE), "OpenZFS module version");
+
+extern arc_state_t ARC_anon;
+extern arc_state_t ARC_mru;
+extern arc_state_t ARC_mru_ghost;
+extern arc_state_t ARC_mfu;
+extern arc_state_t ARC_mfu_ghost;
+extern arc_state_t ARC_l2c_only;
+
+/*
+ * minimum lifespan of a prefetch block in clock ticks
+ * (initialized in arc_init())
+ */
+
+/* arc.c */
+
+/* legacy compat */
+extern uint64_t l2arc_write_max;	/* def max write size */
+extern uint64_t l2arc_write_boost;	/* extra warmup write */
+extern uint64_t l2arc_headroom;		/* # of dev writes */
+extern uint64_t l2arc_headroom_boost;
+extern uint64_t l2arc_feed_secs;	/* interval seconds */
+extern uint64_t l2arc_feed_min_ms;	/* min interval msecs */
+extern int l2arc_noprefetch;			/* don't cache prefetch bufs */
+extern int l2arc_feed_again;			/* turbo warmup */
+extern int l2arc_norw;			/* no reads during writes */
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
+    &l2arc_write_max, 0, "max write size (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
+    &l2arc_write_boost, 0, "extra write during warmup (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
+    &l2arc_headroom, 0, "number of dev writes (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
+    &l2arc_feed_secs, 0, "interval seconds (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
+    &l2arc_feed_min_ms, 0, "min interval milliseconds (LEGACY)");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
+    &l2arc_noprefetch, 0, "don't cache prefetch bufs (LEGACY)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
+    &l2arc_feed_again, 0, "turbo warmup (LEGACY)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
+    &l2arc_norw, 0, "no reads during writes (LEGACY)");
+#if 0
+extern int zfs_compressed_arc_enabled;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RW,
+    &zfs_compressed_arc_enabled, 1, "compressed arc buffers (LEGACY)");
+#endif
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
+    &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
+    &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
+    &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of anonymous state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
+    &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
+    &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
+    &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mru state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mru ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
+    &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
+    &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
+    &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mfu state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mfu ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
+    &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
+
+static int
+sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
+{
+	uint32_t val;
+	int err;
+
+	val = arc_no_grow_shift;
+	err = sysctl_handle_32(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+        if (val >= arc_shrink_shift)
+		return (EINVAL);
+
+	arc_no_grow_shift = val;
+	return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN,
+    0, sizeof (uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U",
+    "log2(fraction of ARC which must be free to allow growing)");
+
+int
+param_set_arc_long(SYSCTL_HANDLER_ARGS)
+{
+	int err;
+
+	err = sysctl_handle_long(oidp, arg1, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	arc_tuning_update(B_TRUE);
+
+	return (0);
+}
+
+int
+param_set_arc_int(SYSCTL_HANDLER_ARGS)
+{
+	int err;
+
+	err = sysctl_handle_int(oidp, arg1, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	arc_tuning_update(B_TRUE);
+
+	return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_ULONG | CTLFLAG_RWTUN,
+    &zfs_arc_min, sizeof (zfs_arc_min), param_set_arc_long, "LU",
+    "min arc size (LEGACY)");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_ULONG | CTLFLAG_RWTUN,
+    &zfs_arc_max, sizeof (zfs_arc_max), param_set_arc_long, "LU",
+    "max arc size (LEGACY)");
+
+/* dbuf.c */
+
+
+/* dmu.c */
+
+/* dmu_zfetch.c */
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)");
+
+/* max bytes to prefetch per stream (default 8MB) */
+extern uint32_t	zfetch_max_distance;
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN,
+    &zfetch_max_distance, 0, "Max bytes to prefetch per stream (LEGACY)");
+
+/* max bytes to prefetch indirects for per stream (default 64MB) */
+extern uint32_t	zfetch_max_idistance;
+SYSCTL_UINT(_vfs_zfs_prefetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN,
+    &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream");
+
+/* dsl_pool.c */
+
+/* dnode.c */
+extern int zfs_default_bs;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN,
+    &zfs_default_bs, 0, "Default dnode block shift");
+
+extern int zfs_default_ibs;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN,
+    &zfs_default_ibs, 0, "Default dnode indirect block shift");
+
+
+/* dsl_scan.c */
+
+/* metaslab.c */
+
+/*
+ * In pools where the log space map feature is not enabled we touch
+ * multiple metaslabs (and their respective space maps) with each
+ * transaction group. Thus, we benefit from having a small space map
+ * block size since it allows us to issue more I/O operations scattered
+ * around the disk. So a sane default for the space map block size
+ * is 8~16K.
+ */
+extern int zfs_metaslab_sm_blksz_no_log;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN,
+    &zfs_metaslab_sm_blksz_no_log, 0,
+    "Block size for space map in pools with log space map disabled.  "
+    "Power of 2 and greater than 4096.");
+
+/*
+ * When the log space map feature is enabled, we accumulate a lot of
+ * changes per metaslab that are flushed once in a while so we benefit
+ * from a bigger block size like 128K for the metaslab space maps.
+ */
+extern int zfs_metaslab_sm_blksz_with_log;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN,
+    &zfs_metaslab_sm_blksz_with_log, 0,
+    "Block size for space map in pools with log space map enabled.  "
+    "Power of 2 and greater than 4096.");
+
+/*
+ * The in-core space map representation is more compact than its on-disk form.
+ * The zfs_condense_pct determines how much more compact the in-core
+ * space map representation must be before we compact it on-disk.
+ * Values should be greater than or equal to 100.
+ */
+extern int zfs_condense_pct;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
+    &zfs_condense_pct, 0,
+    "Condense on-disk spacemap when it is more than this many percents"
+    " of in-memory counterpart");
+
+extern int zfs_remove_max_segment;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, remove_max_segment, CTLFLAG_RWTUN,
+    &zfs_remove_max_segment, 0, "Largest contiguous segment ZFS will attempt to"
+    " allocate when removing a device");
+
+extern int zfs_removal_suspend_progress;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN,
+    &zfs_removal_suspend_progress, 0, "Ensures certain actions can happen while"
+    " in the middle of a removal");
+
+
+/*
+ * Minimum size which forces the dynamic allocator to change
+ * it's allocation strategy.  Once the space map cannot satisfy
+ * an allocation of this size then it switches to using more
+ * aggressive strategy (i.e search by size rather than offset).
+ */
+extern uint64_t metaslab_df_alloc_threshold;
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
+    &metaslab_df_alloc_threshold, 0,
+    "Minimum size which forces the dynamic allocator to change it's allocation strategy");
+
+/*
+ * The minimum free space, in percent, which must be available
+ * in a space map to continue allocations in a first-fit fashion.
+ * Once the space map's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+extern int metaslab_df_free_pct;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
+    &metaslab_df_free_pct, 0,
+    "The minimum free space, in percent, which must be available in a "
+    "space map to continue allocations in a first-fit fashion");
+
+/*
+ * Percentage of all cpus that can be used by the metaslab taskq.
+ */
+extern int metaslab_load_pct;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
+    &metaslab_load_pct, 0,
+    "Percentage of cpus that can be used by the metaslab taskq");
+
+/*
+ * Max number of metaslabs per group to preload.
+ */
+extern int metaslab_preload_limit;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
+    &metaslab_preload_limit, 0,
+    "Max number of metaslabs per group to preload");
+
+/* refcount.c */
+extern int reference_tracking_enable;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN,
+    &reference_tracking_enable, 0,
+    "Track reference holders to refcount_t objects, used mostly by ZFS");
+
+/* spa.c */
+extern int zfs_ccw_retry_interval;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN,
+    &zfs_ccw_retry_interval, 0,
+    "Configuration cache file write, retry after failure, interval (seconds)");
+
+extern uint64_t zfs_max_missing_tvds_cachefile;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN,
+    &zfs_max_missing_tvds_cachefile, 0,
+    "allow importing pools with missing top-level vdevs in cache file");
+
+extern uint64_t zfs_max_missing_tvds_scan;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN,
+    &zfs_max_missing_tvds_scan, 0,
+    "allow importing pools with missing top-level vdevs during scan");
+
+/* spa_misc.c */
+extern int zfs_flags;
+static int
+sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
+{
+	int err, val;
+
+	val = zfs_flags;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	/*
+	 * ZFS_DEBUG_MODIFY must be enabled prior to boot so all
+	 * arc buffers in the system have the necessary additional
+	 * checksum data.  However, it is safe to disable at any
+	 * time.
+	 */
+	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+		val &= ~ZFS_DEBUG_MODIFY;
+	zfs_flags = val;
+
+	return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0,
+    sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
+
+int
+param_set_deadman_synctime(SYSCTL_HANDLER_ARGS)
+{
+	unsigned long val;
+	int err;
+
+	val = zfs_deadman_synctime_ms;
+	err = sysctl_handle_long(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+	zfs_deadman_synctime_ms = val;
+
+	spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms));
+
+	return (0);
+}
+
+int
+param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS)
+{
+	unsigned long val;
+	int err;
+
+	val = zfs_deadman_ziotime_ms;
+	err = sysctl_handle_long(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+	zfs_deadman_ziotime_ms = val;
+
+	spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_synctime_ms));
+
+	return (0);
+}
+
+int
+param_set_deadman_failmode(SYSCTL_HANDLER_ARGS)
+{
+	char buf[16];
+	int rc;
+
+	if (req->newptr == NULL)
+		strlcpy(buf, zfs_deadman_failmode, sizeof (buf));
+
+	rc = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+	if (rc || req->newptr == NULL)
+		return (rc);
+	if (strcmp(buf, zfs_deadman_failmode) == 0)
+		return (0);
+	if (!strcmp(buf,  "wait"))
+		zfs_deadman_failmode = "wait";
+	if (!strcmp(buf,  "continue"))
+		zfs_deadman_failmode = "continue";
+	if (!strcmp(buf,  "panic"))
+		zfs_deadman_failmode = "panic";
+
+	return (-param_set_deadman_failmode_common(buf));
+}
+
+
+/* spacemap.c */
+extern int space_map_ibs;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN,
+    &space_map_ibs, 0, "Space map indirect block shift");
+
+
+/* vdev.c */
+int
+param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
+
+	val = zfs_vdev_min_auto_ashift;
+	err = sysctl_handle_64(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (SET_ERROR(err));
+
+	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
+		return (SET_ERROR(EINVAL));
+
+	zfs_vdev_min_auto_ashift = val;
+
+	return (0);
+}
+
+int
+param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
+
+	val = zfs_vdev_max_auto_ashift;
+	err = sysctl_handle_64(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (SET_ERROR(err));
+
+	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
+		return (SET_ERROR(EINVAL));
+
+	zfs_vdev_max_auto_ashift = val;
+
+	return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, CTLTYPE_U64 | CTLFLAG_RWTUN,
+    &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift),
+    param_set_min_auto_ashift, "QU",
+    "Min ashift used when creating new top-level vdev. (LEGACY)");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, CTLTYPE_U64 | CTLFLAG_RWTUN,
+    &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift),
+    param_set_max_auto_ashift, "QU",
+    "Max ashift used when optimizing for logical -> physical sector size on "
+    "new top-level vdevs. (LEGACY)");
+
+/*
+ * Since the DTL space map of a vdev is not expected to have a lot of
+ * entries, we default its block size to 4K.
+ */
+extern int zfs_vdev_dtl_sm_blksz;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN,
+    &zfs_vdev_dtl_sm_blksz, 0,
+    "Block size for DTL space map.  Power of 2 and greater than 4096.");
+
+/*
+ * vdev-wide space maps that have lots of entries written to them at
+ * the end of each transaction can benefit from a higher I/O bandwidth
+ * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
+ */
+extern int zfs_vdev_standard_sm_blksz;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN,
+    &zfs_vdev_standard_sm_blksz, 0,
+    "Block size for standard space map.  Power of 2 and greater than 4096.");
+
+extern int vdev_validate_skip;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, CTLFLAG_RDTUN,
+    &vdev_validate_skip, 0,
+    "Enable to bypass vdev_validate().");
+
+
+/* vdev_cache.c */
+
+/* vdev_mirror.c */
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * non_rotating_seek_inc to 0 may well provide better results as it
+ * will direct more reads to the non-rotating vdevs which are more
+ * likely to have a higher performance.
+ */
+
+
+/* vdev_queue.c */
+#define	ZFS_VDEV_QUEUE_KNOB_MIN(name)					\
+extern uint32_t zfs_vdev_ ## name ## _min_active;				\
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\
+    &zfs_vdev_ ## name ## _min_active, 0,				\
+    "Initial number of I/O requests of type " #name			\
+    " active for each device");
+
+#define	ZFS_VDEV_QUEUE_KNOB_MAX(name)					\
+extern uint32_t zfs_vdev_ ## name ## _max_active;				\
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN, \
+    &zfs_vdev_ ## name ## _max_active, 0,				\
+    "Maximum number of I/O requests of type " #name			\
+    " active for each device");
+
+
+#undef ZFS_VDEV_QUEUE_KNOB
+
+extern uint32_t zfs_vdev_max_active;
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN,
+    &zfs_vdev_max_active, 0,
+    "The maximum number of I/Os of all types active for each device. (LEGACY)");
+
+extern int zfs_vdev_def_queue_depth;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN,
+    &zfs_vdev_def_queue_depth, 0,
+    "Default queue depth for each allocator");
+
+/*extern uint64_t zfs_multihost_history;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, multihost_history, CTLFLAG_RWTUN,
+    &zfs_multihost_history, 0,
+    "Historical staticists for the last N multihost updates");*/
+
+#ifdef notyet
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW,
+    &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation");
+#endif
+
+
+/* zio.c */
+#if defined(__LP64__)
+int zio_use_uma = 1;
+#else
+int zio_use_uma = 0;
+#endif
+
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
+    "Use uma(9) for ZIO allocations");
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
+    "Exclude metadata buffers from dumps as well");
+
+int
+param_set_slop_shift(SYSCTL_HANDLER_ARGS)
+{
+	int val;
+	int err;
+
+	val = *(int *)arg1;
+
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < 1 || val > 31)
+		return (EINVAL);
+
+	*(int *)arg1 = val;
+
+	return (0);
+}
+
+int
+param_set_multihost_interval(SYSCTL_HANDLER_ARGS)
+{
+	int err;
+
+	err = sysctl_handle_long(oidp, arg1, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (spa_mode_global != SPA_MODE_UNINIT)
+		mmp_signal_all_threads();
+
+	return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c
new file mode 100644
index 000000000000..4d27751c8893
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c
@@ -0,0 +1,328 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/file.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/abd.h>
+#include <sys/stat.h>
+
+/*
+ * Virtual device vector for files.
+ */
+
+static taskq_t *vdev_file_taskq;
+
+void
+vdev_file_init(void)
+{
+	vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16),
+	    minclsyspri, max_ncpus, INT_MAX, 0);
+}
+
+void
+vdev_file_fini(void)
+{
+	taskq_destroy(vdev_file_taskq);
+}
+
+static void
+vdev_file_hold(vdev_t *vd)
+{
+	ASSERT(vd->vdev_path != NULL);
+}
+
+static void
+vdev_file_rele(vdev_t *vd)
+{
+	ASSERT(vd->vdev_path != NULL);
+}
+
+static mode_t
+vdev_file_open_mode(spa_mode_t spa_mode)
+{
+	mode_t mode = 0;
+
+	if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) {
+		mode = O_RDWR;
+	} else if (spa_mode & SPA_MODE_READ) {
+		mode = O_RDONLY;
+	} else if (spa_mode & SPA_MODE_WRITE) {
+		mode = O_WRONLY;
+	}
+
+	return (mode | O_LARGEFILE);
+}
+
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+	vdev_file_t *vf;
+	zfs_file_t *fp;
+	zfs_file_attr_t zfa;
+	int error;
+
+	/*
+	 * Rotational optimizations only make sense on block devices.
+	 */
+	vd->vdev_nonrot = B_TRUE;
+
+	/*
+	 * Allow TRIM on file based vdevs.  This may not always be supported,
+	 * since it depends on your kernel version and underlying filesystem
+	 * type but it is always safe to attempt.
+	 */
+	vd->vdev_has_trim = B_TRUE;
+
+	/*
+	 * Disable secure TRIM on file based vdevs.  There is no way to
+	 * request this behavior from the underlying filesystem.
+	 */
+	vd->vdev_has_securetrim = B_FALSE;
+
+	/*
+	 * We must have a pathname, and it must be absolute.
+	 */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Reopen the device if it's not currently open.  Otherwise,
+	 * just update the physical size of the device.
+	 */
+	if (vd->vdev_tsd != NULL) {
+		ASSERT(vd->vdev_reopening);
+		vf = vd->vdev_tsd;
+		goto skip_open;
+	}
+
+	vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+
+	/*
+	 * We always open the files from the root of the global zone, even if
+	 * we're in a local zone.  If the user has gotten to this point, the
+	 * administrator has already decided that the pool should be available
+	 * to local zone users, so the underlying devices should be as well.
+	 */
+	ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+
+	error = zfs_file_open(vd->vdev_path,
+	    vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp);
+	if (error) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+
+	vf->vf_file = fp;
+
+#ifdef _KERNEL
+	/*
+	 * Make sure it's a regular file.
+	 */
+	if (zfs_file_getattr(fp, &zfa)) {
+		return (SET_ERROR(ENODEV));
+	}
+	if (!S_ISREG(zfa.zfa_mode)) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (SET_ERROR(ENODEV));
+	}
+#endif
+
+skip_open:
+
+	error =  zfs_file_getattr(vf->vf_file, &zfa);
+	if (error) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+
+	*max_psize = *psize = zfa.zfa_size;
+	*logical_ashift = SPA_MINBLOCKSHIFT;
+	*physical_ashift = SPA_MINBLOCKSHIFT;
+
+	return (0);
+}
+
+static void
+vdev_file_close(vdev_t *vd)
+{
+	vdev_file_t *vf = vd->vdev_tsd;
+
+	if (vd->vdev_reopening || vf == NULL)
+		return;
+
+	if (vf->vf_file != NULL) {
+		zfs_file_close(vf->vf_file);
+	}
+
+	vd->vdev_delayed_close = B_FALSE;
+	kmem_free(vf, sizeof (vdev_file_t));
+	vd->vdev_tsd = NULL;
+}
+
+/*
+ * Implements the interrupt side for file vdev types. This routine will be
+ * called when the I/O completes allowing us to transfer the I/O to the
+ * interrupt taskqs. For consistency, the code structure mimics disk vdev
+ * types.
+ */
+static void
+vdev_file_io_intr(zio_t *zio)
+{
+	zio_delay_interrupt(zio);
+}
+
+static void
+vdev_file_io_strategy(void *arg)
+{
+	zio_t *zio = arg;
+	vdev_t *vd = zio->io_vd;
+	vdev_file_t *vf;
+	void *buf;
+	ssize_t resid;
+	loff_t off;
+	ssize_t size;
+	int err;
+
+	off = zio->io_offset;
+	size = zio->io_size;
+	resid = 0;
+
+	vf = vd->vdev_tsd;
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+	if (zio->io_type == ZIO_TYPE_READ) {
+		buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+		err = zfs_file_pread(vf->vf_file, buf, size, off, &resid);
+		abd_return_buf_copy(zio->io_abd, buf, size);
+	} else {
+		buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+		err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
+		abd_return_buf(zio->io_abd, buf, size);
+	}
+	if (resid != 0 && zio->io_error == 0)
+		zio->io_error = ENOSPC;
+
+	vdev_file_io_intr(zio);
+}
+
+static void
+vdev_file_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_file_t *vf = vd->vdev_tsd;
+
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
+		/* XXPOLICY */
+		if (!vdev_readable(vd)) {
+			zio->io_error = SET_ERROR(ENXIO);
+			zio_interrupt(zio);
+			return;
+		}
+
+		switch (zio->io_cmd) {
+		case DKIOCFLUSHWRITECACHE:
+			zio->io_error = zfs_file_fsync(vf->vf_file,
+			    O_SYNC|O_DSYNC);
+			break;
+		default:
+			zio->io_error = SET_ERROR(ENOTSUP);
+		}
+
+		zio_execute(zio);
+		return;
+	} else if (zio->io_type == ZIO_TYPE_TRIM) {
+#ifdef notyet
+		int mode = 0;
+
+		ASSERT3U(zio->io_size, !=, 0);
+
+		/* XXX FreeBSD has no fallocate routine in file ops */
+		zio->io_error = zfs_file_fallocate(vf->vf_file,
+		    mode, zio->io_offset, zio->io_size);
+#endif
+		zio->io_error = SET_ERROR(ENOTSUP);
+		zio_execute(zio);
+		return;
+	}
+	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+	zio->io_target_timestamp = zio_handle_io_delay(zio);
+
+	VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
+	    TQ_SLEEP), !=, 0);
+}
+
+/* ARGSUSED */
+static void
+vdev_file_io_done(zio_t *zio)
+{
+}
+
+vdev_ops_t vdev_file_ops = {
+	vdev_file_open,
+	vdev_file_close,
+	vdev_default_asize,
+	vdev_file_io_start,
+	vdev_file_io_done,
+	NULL,
+	NULL,
+	vdev_file_hold,
+	vdev_file_rele,
+	NULL,
+	vdev_default_xlate,
+	VDEV_TYPE_FILE,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
+
+/*
+ * From userland we access disks just like files.
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+	vdev_file_open,
+	vdev_file_close,
+	vdev_default_asize,
+	vdev_file_io_start,
+	vdev_file_io_done,
+	NULL,
+	NULL,
+	vdev_file_hold,
+	vdev_file_rele,
+	NULL,
+	vdev_default_xlate,
+	VDEV_TYPE_DISK,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
+
+#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
new file mode 100644
index 000000000000..bf06f69192d9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
@@ -0,0 +1,1206 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/file.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_os.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <geom/geom.h>
+#include <geom/geom_disk.h>
+#include <geom/geom_int.h>
+
+#ifndef g_topology_locked
+#define	g_topology_locked()	sx_xlocked(&topology_lock)
+#endif
+
+/*
+ * Virtual device vector for GEOM.
+ */
+
+static g_attrchanged_t vdev_geom_attrchanged;
+struct g_class zfs_vdev_class = {
+	.name = "ZFS::VDEV",
+	.version = G_VERSION,
+	.attrchanged = vdev_geom_attrchanged,
+};
+
+struct consumer_vdev_elem {
+	SLIST_ENTRY(consumer_vdev_elem)	elems;
+	vdev_t	*vd;
+};
+
+SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
+/* BEGIN CSTYLED */
+_Static_assert(sizeof (((struct g_consumer *)NULL)->private)
+	== sizeof (struct consumer_priv_t*),
+	"consumer_priv_t* can't be stored in g_consumer.private");
+
+DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+/* Don't send BIO_FLUSH. */
+static int vdev_geom_bio_flush_disable;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
+    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
+/* Don't send BIO_DELETE. */
+static int vdev_geom_bio_delete_disable;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
+    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
+/* END CSTYLED */
+
+/* Declare local functions */
+static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
+
+/*
+ * Thread local storage used to indicate when a thread is probing geoms
+ * for their guids.  If NULL, this thread is not tasting geoms.  If non NULL,
+ * it is looking for a replacement for the vdev_t* that is its value.
+ */
+uint_t zfs_geom_probe_vdev_key;
+
+static void
+vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
+    boolean_t do_null_update)
+{
+	boolean_t needs_update = B_FALSE;
+	char *physpath;
+	int error, physpath_len;
+
+	physpath_len = MAXPATHLEN;
+	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
+	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
+	if (error == 0) {
+		char *old_physpath;
+
+		/* g_topology lock ensures that vdev has not been closed */
+		g_topology_assert();
+		old_physpath = vd->vdev_physpath;
+		vd->vdev_physpath = spa_strdup(physpath);
+
+		if (old_physpath != NULL) {
+			needs_update = (strcmp(old_physpath,
+			    vd->vdev_physpath) != 0);
+			spa_strfree(old_physpath);
+		} else
+			needs_update = do_null_update;
+	}
+	g_free(physpath);
+
+	/*
+	 * If the physical path changed, update the config.
+	 * Only request an update for previously unset physpaths if
+	 * requested by the caller.
+	 */
+	if (needs_update)
+		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
+
+}
+
+static void
+vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
+{
+	struct consumer_priv_t *priv;
+	struct consumer_vdev_elem *elem;
+
+	priv = (struct consumer_priv_t *)&cp->private;
+	if (SLIST_EMPTY(priv))
+		return;
+
+	SLIST_FOREACH(elem, priv, elems) {
+		vdev_t *vd = elem->vd;
+		if (strcmp(attr, "GEOM::physpath") == 0) {
+			vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE);
+			return;
+		}
+	}
+}
+
+static void
+vdev_geom_resize(struct g_consumer *cp)
+{
+	struct consumer_priv_t *priv;
+	struct consumer_vdev_elem *elem;
+	spa_t *spa;
+	vdev_t *vd;
+
+	priv = (struct consumer_priv_t *)&cp->private;
+	if (SLIST_EMPTY(priv))
+		return;
+
+	SLIST_FOREACH(elem, priv, elems) {
+		vd = elem->vd;
+		if (vd->vdev_state != VDEV_STATE_HEALTHY)
+			continue;
+		spa = vd->vdev_spa;
+		if (!spa->spa_autoexpand)
+			continue;
+		vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL);
+	}
+}
+
+static void
+vdev_geom_orphan(struct g_consumer *cp)
+{
+	struct consumer_priv_t *priv;
+	// cppcheck-suppress uninitvar
+	struct consumer_vdev_elem *elem;
+
+	g_topology_assert();
+
+	priv = (struct consumer_priv_t *)&cp->private;
+	if (SLIST_EMPTY(priv))
+		/* Vdev close in progress.  Ignore the event. */
+		return;
+
+	/*
+	 * Orphan callbacks occur from the GEOM event thread.
+	 * Concurrent with this call, new I/O requests may be
+	 * working their way through GEOM about to find out
+	 * (only once executed by the g_down thread) that we've
+	 * been orphaned from our disk provider.  These I/Os
+	 * must be retired before we can detach our consumer.
+	 * This is most easily achieved by acquiring the
+	 * SPA ZIO configuration lock as a writer, but doing
+	 * so with the GEOM topology lock held would cause
+	 * a lock order reversal.  Instead, rely on the SPA's
+	 * async removal support to invoke a close on this
+	 * vdev once it is safe to do so.
+	 */
+	// cppcheck-suppress All
+	SLIST_FOREACH(elem, priv, elems) {
+		// cppcheck-suppress uninitvar
+		vdev_t *vd = elem->vd;
+
+		vd->vdev_remove_wanted = B_TRUE;
+		spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
+	}
+}
+
+static struct g_consumer *
+vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
+{
+	struct g_geom *gp;
+	struct g_consumer *cp;
+	int error;
+
+	g_topology_assert();
+
+	ZFS_LOG(1, "Attaching to %s.", pp->name);
+
+	if (sanity) {
+		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
+			ZFS_LOG(1, "Failing attach of %s. "
+			    "Incompatible sectorsize %d\n",
+			    pp->name, pp->sectorsize);
+			return (NULL);
+		} else if (pp->mediasize < SPA_MINDEVSIZE) {
+			ZFS_LOG(1, "Failing attach of %s. "
+			    "Incompatible mediasize %ju\n",
+			    pp->name, pp->mediasize);
+			return (NULL);
+		}
+	}
+
+	/* Do we have geom already? No? Create one. */
+	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
+		if (gp->flags & G_GEOM_WITHER)
+			continue;
+		if (strcmp(gp->name, "zfs::vdev") != 0)
+			continue;
+		break;
+	}
+	if (gp == NULL) {
+		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
+		gp->orphan = vdev_geom_orphan;
+		gp->attrchanged = vdev_geom_attrchanged;
+		gp->resize = vdev_geom_resize;
+		cp = g_new_consumer(gp);
+		error = g_attach(cp, pp);
+		if (error != 0) {
+			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
+			    __LINE__, error);
+			vdev_geom_detach(cp, B_FALSE);
+			return (NULL);
+		}
+		error = g_access(cp, 1, 0, 1);
+		if (error != 0) {
+			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
+			    __LINE__, error);
+			vdev_geom_detach(cp, B_FALSE);
+			return (NULL);
+		}
+		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
+	} else {
+		/* Check if we are already connected to this provider. */
+		LIST_FOREACH(cp, &gp->consumer, consumer) {
+			if (cp->provider == pp) {
+				ZFS_LOG(1, "Found consumer for %s.", pp->name);
+				break;
+			}
+		}
+		if (cp == NULL) {
+			cp = g_new_consumer(gp);
+			error = g_attach(cp, pp);
+			if (error != 0) {
+				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
+				    __func__, __LINE__, error);
+				vdev_geom_detach(cp, B_FALSE);
+				return (NULL);
+			}
+			error = g_access(cp, 1, 0, 1);
+			if (error != 0) {
+				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+				    __func__, __LINE__, error);
+				vdev_geom_detach(cp, B_FALSE);
+				return (NULL);
+			}
+			ZFS_LOG(1, "Created consumer for %s.", pp->name);
+		} else {
+			error = g_access(cp, 1, 0, 1);
+			if (error != 0) {
+				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+				    __func__, __LINE__, error);
+				return (NULL);
+			}
+			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
+		}
+	}
+
+	if (vd != NULL)
+		vd->vdev_tsd = cp;
+
+	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
+	return (cp);
+}
+
+static void
+vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
+{
+	struct g_geom *gp;
+
+	g_topology_assert();
+
+	ZFS_LOG(1, "Detaching from %s.",
+	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");
+
+	gp = cp->geom;
+	if (open_for_read)
+		g_access(cp, -1, 0, -1);
+	/* Destroy consumer on last close. */
+	if (cp->acr == 0 && cp->ace == 0) {
+		if (cp->acw > 0)
+			g_access(cp, 0, -cp->acw, 0);
+		if (cp->provider != NULL) {
+			ZFS_LOG(1, "Destroying consumer for %s.",
+			    cp->provider->name ? cp->provider->name : "NULL");
+			g_detach(cp);
+		}
+		g_destroy_consumer(cp);
+	}
+	/* Destroy geom if there are no consumers left. */
+	if (LIST_EMPTY(&gp->consumer)) {
+		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
+		g_wither_geom(gp, ENXIO);
+	}
+}
+
+static void
+vdev_geom_close_locked(vdev_t *vd)
+{
+	struct g_consumer *cp;
+	struct consumer_priv_t *priv;
+	struct consumer_vdev_elem *elem, *elem_temp;
+
+	g_topology_assert();
+
+	cp = vd->vdev_tsd;
+	vd->vdev_delayed_close = B_FALSE;
+	if (cp == NULL)
+		return;
+
+	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
+	KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
+	priv = (struct consumer_priv_t *)&cp->private;
+	vd->vdev_tsd = NULL;
+	SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
+		if (elem->vd == vd) {
+			SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
+			g_free(elem);
+		}
+	}
+
+	vdev_geom_detach(cp, B_TRUE);
+}
+
+/*
+ * Issue one or more bios to the vdev in parallel
+ * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.  Each IO
+ * operation is described by parallel entries from each array.  There may be
+ * more bios actually issued than entries in the array
+ */
+static void
+vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
+    off_t *sizes, int *errors, int ncmds)
+{
+	struct bio **bios;
+	uint8_t *p;
+	off_t off, maxio, s, end;
+	int i, n_bios, j;
+	size_t bios_size;
+
+	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
+	n_bios = 0;
+
+	/* How many bios are required for all commands ? */
+	for (i = 0; i < ncmds; i++)
+		n_bios += (sizes[i] + maxio - 1) / maxio;
+
+	/* Allocate memory for the bios */
+	bios_size = n_bios * sizeof (struct bio *);
+	bios = kmem_zalloc(bios_size, KM_SLEEP);
+
+	/* Prepare and issue all of the bios */
+	for (i = j = 0; i < ncmds; i++) {
+		off = offsets[i];
+		p = datas[i];
+		s = sizes[i];
+		end = off + s;
+		ASSERT((off % cp->provider->sectorsize) == 0);
+		ASSERT((s % cp->provider->sectorsize) == 0);
+
+		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
+			bios[j] = g_alloc_bio();
+			bios[j]->bio_cmd = cmds[i];
+			bios[j]->bio_done = NULL;
+			bios[j]->bio_offset = off;
+			bios[j]->bio_length = MIN(s, maxio);
+			bios[j]->bio_data = (caddr_t)p;
+			g_io_request(bios[j], cp);
+		}
+	}
+	ASSERT(j == n_bios);
+
+	/* Wait for all of the bios to complete, and clean them up */
+	for (i = j = 0; i < ncmds; i++) {
+		off = offsets[i];
+		s = sizes[i];
+		end = off + s;
+
+		for (; off < end; off += maxio, s -= maxio, j++) {
+			errors[i] = biowait(bios[j], "vdev_geom_io") ||
+			    errors[i];
+			g_destroy_bio(bios[j]);
+		}
+	}
+	kmem_free(bios, bios_size);
+}
+
+/*
+ * Read the vdev config from a device.  Return the number of valid labels that
+ * were found.  The vdev config will be returned in config if and only if at
+ * least one valid label was found.
+ */
+static int
+vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
+{
+	struct g_provider *pp;
+	nvlist_t *config;
+	vdev_phys_t *vdev_lists[VDEV_LABELS];
+	char *buf;
+	size_t buflen;
+	uint64_t psize, state, txg;
+	off_t offsets[VDEV_LABELS];
+	off_t size;
+	off_t sizes[VDEV_LABELS];
+	int cmds[VDEV_LABELS];
+	int errors[VDEV_LABELS];
+	int l, nlabels;
+
+	g_topology_assert_not();
+
+	pp = cp->provider;
+	ZFS_LOG(1, "Reading config from %s...", pp->name);
+
+	psize = pp->mediasize;
+	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
+
+	size = sizeof (*vdev_lists[0]) + pp->sectorsize -
+	    ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
+
+	buflen = sizeof (vdev_lists[0]->vp_nvlist);
+
+	/* Create all of the IO requests */
+	for (l = 0; l < VDEV_LABELS; l++) {
+		cmds[l] = BIO_READ;
+		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
+		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
+		sizes[l] = size;
+		errors[l] = 0;
+		ASSERT(offsets[l] % pp->sectorsize == 0);
+	}
+
+	/* Issue the IO requests */
+	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
+	    VDEV_LABELS);
+
+	/* Parse the labels */
+	config = *configp = NULL;
+	nlabels = 0;
+	for (l = 0; l < VDEV_LABELS; l++) {
+		if (errors[l] != 0)
+			continue;
+
+		buf = vdev_lists[l]->vp_nvlist;
+
+		if (nvlist_unpack(buf, buflen, &config, 0) != 0)
+			continue;
+
+		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+		    &state) != 0 || state > POOL_STATE_L2CACHE) {
+			nvlist_free(config);
+			continue;
+		}
+
+		if (state != POOL_STATE_SPARE &&
+		    state != POOL_STATE_L2CACHE &&
+		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+		    &txg) != 0 || txg == 0)) {
+			nvlist_free(config);
+			continue;
+		}
+
+		if (*configp != NULL)
+			nvlist_free(*configp);
+		*configp = config;
+		nlabels++;
+	}
+
+	/* Free the label storage */
+	for (l = 0; l < VDEV_LABELS; l++)
+		kmem_free(vdev_lists[l], size);
+
+	return (nlabels);
+}
+
+static void
+resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
+{
+	nvlist_t **new_configs;
+	uint64_t i;
+
+	if (id < *count)
+		return;
+	new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *),
+	    KM_SLEEP);
+	for (i = 0; i < *count; i++)
+		new_configs[i] = (*configs)[i];
+	if (*configs != NULL)
+		kmem_free(*configs, *count * sizeof (void *));
+	*configs = new_configs;
+	*count = id + 1;
+}
+
+static void
+process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
+    const char *name, uint64_t *known_pool_guid)
+{
+	nvlist_t *vdev_tree;
+	uint64_t pool_guid;
+	uint64_t vdev_guid;
+	uint64_t id, txg, known_txg;
+	char *pname;
+
+	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
+	    strcmp(pname, name) != 0)
+		goto ignore;
+
+	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+		goto ignore;
+
+	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
+		goto ignore;
+
+	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
+		goto ignore;
+
+	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
+		goto ignore;
+
+	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
+
+	if (*known_pool_guid != 0) {
+		if (pool_guid != *known_pool_guid)
+			goto ignore;
+	} else
+		*known_pool_guid = pool_guid;
+
+	resize_configs(configs, count, id);
+
+	if ((*configs)[id] != NULL) {
+		VERIFY(nvlist_lookup_uint64((*configs)[id],
+		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
+		if (txg <= known_txg)
+			goto ignore;
+		nvlist_free((*configs)[id]);
+	}
+
+	(*configs)[id] = cfg;
+	return;
+
+ignore:
+	nvlist_free(cfg);
+}
+
+int
+vdev_geom_read_pool_label(const char *name,
+    nvlist_t ***configs, uint64_t *count)
+{
+	struct g_class *mp;
+	struct g_geom *gp;
+	struct g_provider *pp;
+	struct g_consumer *zcp;
+	nvlist_t *vdev_cfg;
+	uint64_t pool_guid;
+	int nlabels;
+
+	DROP_GIANT();
+	g_topology_lock();
+
+	*configs = NULL;
+	*count = 0;
+	pool_guid = 0;
+	LIST_FOREACH(mp, &g_classes, class) {
+		if (mp == &zfs_vdev_class)
+			continue;
+		LIST_FOREACH(gp, &mp->geom, geom) {
+			if (gp->flags & G_GEOM_WITHER)
+				continue;
+			LIST_FOREACH(pp, &gp->provider, provider) {
+				if (pp->flags & G_PF_WITHER)
+					continue;
+				zcp = vdev_geom_attach(pp, NULL, B_TRUE);
+				if (zcp == NULL)
+					continue;
+				g_topology_unlock();
+				nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
+				g_topology_lock();
+				vdev_geom_detach(zcp, B_TRUE);
+				if (nlabels == 0)
+					continue;
+				ZFS_LOG(1, "successfully read vdev config");
+
+				process_vdev_config(configs, count,
+				    vdev_cfg, name, &pool_guid);
+			}
+		}
+	}
+	g_topology_unlock();
+	PICKUP_GIANT();
+
+	return (*count > 0 ? 0 : ENOENT);
+}
+
+enum match {
+	NO_MATCH = 0,		/* No matching labels found */
+	TOPGUID_MATCH = 1,	/* Labels match top guid, not vdev guid */
+	ZERO_MATCH = 1,		/* Should never be returned */
+	ONE_MATCH = 2,		/* 1 label matching the vdev_guid */
+	TWO_MATCH = 3,		/* 2 label matching the vdev_guid */
+	THREE_MATCH = 4,	/* 3 label matching the vdev_guid */
+	FULL_MATCH = 5		/* all labels match the vdev_guid */
+};
+
+static enum match
+vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
+{
+	nvlist_t *config;
+	uint64_t pool_guid, top_guid, vdev_guid;
+	struct g_consumer *cp;
+	int nlabels;
+
+	cp = vdev_geom_attach(pp, NULL, B_TRUE);
+	if (cp == NULL) {
+		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
+		    pp->name);
+		return (NO_MATCH);
+	}
+	g_topology_unlock();
+	nlabels = vdev_geom_read_config(cp, &config);
+	g_topology_lock();
+	vdev_geom_detach(cp, B_TRUE);
+	if (nlabels == 0) {
+		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
+		return (NO_MATCH);
+	}
+
+	pool_guid = 0;
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
+	top_guid = 0;
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
+	vdev_guid = 0;
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+	nvlist_free(config);
+
+	/*
+	 * Check that the label's pool guid matches the desired guid.
+	 * Inactive spares and L2ARCs do not have any pool guid in the label.
+	 */
+	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
+		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
+		    pp->name,
+		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
+		return (NO_MATCH);
+	}
+
+	/*
+	 * Check that the label's vdev guid matches the desired guid.
+	 * The second condition handles possible race on vdev detach, when
+	 * remaining vdev receives GUID of destroyed top level mirror vdev.
+	 */
+	if (vdev_guid == vd->vdev_guid) {
+		ZFS_LOG(1, "guids match for provider %s.", pp->name);
+		return (ZERO_MATCH + nlabels);
+	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
+		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
+		return (TOPGUID_MATCH);
+	}
+	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
+	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
+	return (NO_MATCH);
+}
+
+static struct g_consumer *
+vdev_geom_attach_by_guids(vdev_t *vd)
+{
+	struct g_class *mp;
+	struct g_geom *gp;
+	struct g_provider *pp, *best_pp;
+	struct g_consumer *cp;
+	const char *vdpath;
+	enum match match, best_match;
+
+	g_topology_assert();
+
+	vdpath = vd->vdev_path + sizeof ("/dev/") - 1;
+	cp = NULL;
+	best_pp = NULL;
+	best_match = NO_MATCH;
+	LIST_FOREACH(mp, &g_classes, class) {
+		if (mp == &zfs_vdev_class)
+			continue;
+		LIST_FOREACH(gp, &mp->geom, geom) {
+			if (gp->flags & G_GEOM_WITHER)
+				continue;
+			LIST_FOREACH(pp, &gp->provider, provider) {
+				match = vdev_attach_ok(vd, pp);
+				if (match > best_match) {
+					best_match = match;
+					best_pp = pp;
+				} else if (match == best_match) {
+					if (strcmp(pp->name, vdpath) == 0) {
+						best_pp = pp;
+					}
+				}
+				if (match == FULL_MATCH)
+					goto out;
+			}
+		}
+	}
+
+out:
+	if (best_pp) {
+		cp = vdev_geom_attach(best_pp, vd, B_TRUE);
+		if (cp == NULL) {
+			printf("ZFS WARNING: Unable to attach to %s.\n",
+			    best_pp->name);
+		}
+	}
+	return (cp);
+}
+
+static struct g_consumer *
+vdev_geom_open_by_guids(vdev_t *vd)
+{
+	struct g_consumer *cp;
+	char *buf;
+	size_t len;
+
+	g_topology_assert();
+
+	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
+	    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
+	cp = vdev_geom_attach_by_guids(vd);
+	if (cp != NULL) {
+		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
+		buf = kmem_alloc(len, KM_SLEEP);
+
+		snprintf(buf, len, "/dev/%s", cp->provider->name);
+		spa_strfree(vd->vdev_path);
+		vd->vdev_path = buf;
+
+		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
+		    (uintmax_t)spa_guid(vd->vdev_spa),
+		    (uintmax_t)vd->vdev_guid, cp->provider->name);
+	} else {
+		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
+		    (uintmax_t)spa_guid(vd->vdev_spa),
+		    (uintmax_t)vd->vdev_guid);
+	}
+
+	return (cp);
+}
+
+static struct g_consumer *
+vdev_geom_open_by_path(vdev_t *vd, int check_guid)
+{
+	struct g_provider *pp;
+	struct g_consumer *cp;
+
+	g_topology_assert();
+
+	cp = NULL;
+	pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1);
+	if (pp != NULL) {
+		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
+		if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
+			cp = vdev_geom_attach(pp, vd, B_FALSE);
+	}
+
+	return (cp);
+}
+
+static int
+vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+	struct g_provider *pp;
+	struct g_consumer *cp;
+	int error, has_trim;
+	uint16_t rate;
+
+	/*
+	 * Set the TLS to indicate downstack that we
+	 * should not access zvols
+	 */
+	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
+
+	/*
+	 * We must have a pathname, and it must be absolute.
+	 */
+	if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	/*
+	 * Reopen the device if it's not currently open. Otherwise,
+	 * just update the physical size of the device.
+	 */
+	if ((cp = vd->vdev_tsd) != NULL) {
+		ASSERT(vd->vdev_reopening);
+		goto skip_open;
+	}
+
+	DROP_GIANT();
+	g_topology_lock();
+	error = 0;
+
+	if (vd->vdev_spa->spa_is_splitting ||
+	    ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
+	    (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
+	    vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) {
+		/*
+		 * We are dealing with a vdev that hasn't been previously
+		 * opened (since boot), and we are not loading an
+		 * existing pool configuration.  This looks like a
+		 * vdev add operation to a new or existing pool.
+		 * Assume the user really wants to do this, and find
+		 * GEOM provider by its name, ignoring GUID mismatches.
+		 *
+		 * XXPOLICY: It would be safer to only allow a device
+		 *           that is unlabeled or labeled but missing
+		 *           GUID information to be opened in this fashion,
+		 *           unless we are doing a split, in which case we
+		 *           should allow any guid.
+		 */
+		cp = vdev_geom_open_by_path(vd, 0);
+	} else {
+		/*
+		 * Try using the recorded path for this device, but only
+		 * accept it if its label data contains the expected GUIDs.
+		 */
+		cp = vdev_geom_open_by_path(vd, 1);
+		if (cp == NULL) {
+			/*
+			 * The device at vd->vdev_path doesn't have the
+			 * expected GUIDs. The disks might have merely
+			 * moved around so try all other GEOM providers
+			 * to find one with the right GUIDs.
+			 */
+			cp = vdev_geom_open_by_guids(vd);
+		}
+	}
+
+	/* Clear the TLS now that tasting is done */
+	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
+
+	if (cp == NULL) {
+		ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
+		error = ENOENT;
+	} else {
+		struct consumer_priv_t *priv;
+		struct consumer_vdev_elem *elem;
+		int spamode;
+
+		priv = (struct consumer_priv_t *)&cp->private;
+		if (cp->private == NULL)
+			SLIST_INIT(priv);
+		elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO);
+		elem->vd = vd;
+		SLIST_INSERT_HEAD(priv, elem, elems);
+
+		spamode = spa_mode(vd->vdev_spa);
+		if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
+		    !ISP2(cp->provider->sectorsize)) {
+			ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
+			    cp->provider->name);
+
+			vdev_geom_close_locked(vd);
+			error = EINVAL;
+			cp = NULL;
+		} else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
+			int i;
+
+			for (i = 0; i < 5; i++) {
+				error = g_access(cp, 0, 1, 0);
+				if (error == 0)
+					break;
+				g_topology_unlock();
+				tsleep(vd, 0, "vdev", hz / 2);
+				g_topology_lock();
+			}
+			if (error != 0) {
+				printf("ZFS WARNING: Unable to open %s for "
+				    "writing (error=%d).\n",
+				    cp->provider->name, error);
+				vdev_geom_close_locked(vd);
+				cp = NULL;
+			}
+		}
+	}
+
+	/* Fetch initial physical path information for this device. */
+	if (cp != NULL) {
+		vdev_geom_attrchanged(cp, "GEOM::physpath");
+
+		/* Set other GEOM characteristics */
+		vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE);
+	}
+
+	g_topology_unlock();
+	PICKUP_GIANT();
+	if (cp == NULL) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]",
+		    error);
+		return (error);
+	}
+skip_open:
+	pp = cp->provider;
+
+	/*
+	 * Determine the actual size of the device.
+	 */
+	*max_psize = *psize = pp->mediasize;
+
+	/*
+	 * Determine the device's minimum transfer size and preferred
+	 * transfer size.
+	 */
+	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
+	*physical_ashift = 0;
+	if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) &&
+	    ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) &&
+	    pp->stripeoffset == 0)
+		*physical_ashift = highbit(pp->stripesize) - 1;
+
+	/*
+	 * Clear the nowritecache settings, so that on a vdev_reopen()
+	 * we will try again.
+	 */
+	vd->vdev_nowritecache = B_FALSE;
+
+	/* Inform the ZIO pipeline that we are non-rotational. */
+	error = g_getattr("GEOM::rotation_rate", cp, &rate);
+	if (error == 0 && rate == DISK_RR_NON_ROTATING)
+		vd->vdev_nonrot = B_TRUE;
+	else
+		vd->vdev_nonrot = B_FALSE;
+
+	/* Set when device reports it supports TRIM. */
+	error = g_getattr("GEOM::candelete", cp, &has_trim);
+	vd->vdev_has_trim = (error == 0 && has_trim);
+
+	/* Set when device reports it supports secure TRIM. */
+	/* unavailable on FreeBSD */
+	vd->vdev_has_securetrim = B_FALSE;
+
+	return (0);
+}
+
+static void
+vdev_geom_close(vdev_t *vd)
+{
+	struct g_consumer *cp;
+	boolean_t locked;
+
+	cp = vd->vdev_tsd;
+
+	DROP_GIANT();
+	locked = g_topology_locked();
+	if (!locked)
+		g_topology_lock();
+
+	if (!vd->vdev_reopening ||
+	    (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
+	    (cp->provider != NULL && cp->provider->error != 0))))
+		vdev_geom_close_locked(vd);
+
+	if (!locked)
+		g_topology_unlock();
+	PICKUP_GIANT();
+}
+
+static void
+vdev_geom_io_intr(struct bio *bp)
+{
+	vdev_t *vd;
+	zio_t *zio;
+
+	zio = bp->bio_caller1;
+	vd = zio->io_vd;
+	zio->io_error = bp->bio_error;
+	if (zio->io_error == 0 && bp->bio_resid != 0)
+		zio->io_error = SET_ERROR(EIO);
+
+	switch (zio->io_error) {
+	case ENOTSUP:
+		/*
+		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
+		 * that future attempts will never succeed. In this case
+		 * we set a persistent flag so that we don't bother with
+		 * requests in the future.
+		 */
+		switch (bp->bio_cmd) {
+		case BIO_FLUSH:
+			vd->vdev_nowritecache = B_TRUE;
+			break;
+		case BIO_DELETE:
+			break;
+		}
+		break;
+	case ENXIO:
+		if (!vd->vdev_remove_wanted) {
+			/*
+			 * If provider's error is set we assume it is being
+			 * removed.
+			 */
+			if (bp->bio_to->error != 0) {
+				vd->vdev_remove_wanted = B_TRUE;
+				spa_async_request(zio->io_spa,
+				    SPA_ASYNC_REMOVE);
+			} else if (!vd->vdev_delayed_close) {
+				vd->vdev_delayed_close = B_TRUE;
+			}
+		}
+		break;
+	}
+
+	/*
+	 * We have to split bio freeing into two parts, because the ABD code
+	 * cannot be called in this context and vdev_op_io_done is not called
+	 * for ZIO_TYPE_IOCTL zio-s.
+	 */
+	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
+		g_destroy_bio(bp);
+		zio->io_bio = NULL;
+	}
+	zio_delay_interrupt(zio);
+}
+
+static void
+vdev_geom_io_start(zio_t *zio)
+{
+	vdev_t *vd;
+	struct g_consumer *cp;
+	struct bio *bp;
+
+	vd = zio->io_vd;
+
+	switch (zio->io_type) {
+	case ZIO_TYPE_IOCTL:
+		/* XXPOLICY */
+		if (!vdev_readable(vd)) {
+			zio->io_error = SET_ERROR(ENXIO);
+			zio_interrupt(zio);
+			return;
+		} else {
+			switch (zio->io_cmd) {
+			case DKIOCFLUSHWRITECACHE:
+				if (zfs_nocacheflush ||
+				    vdev_geom_bio_flush_disable)
+					break;
+				if (vd->vdev_nowritecache) {
+					zio->io_error = SET_ERROR(ENOTSUP);
+					break;
+				}
+				goto sendreq;
+			default:
+				zio->io_error = SET_ERROR(ENOTSUP);
+			}
+		}
+
+		zio_execute(zio);
+		return;
+	case ZIO_TYPE_TRIM:
+		if (!vdev_geom_bio_delete_disable) {
+			goto sendreq;
+		}
+		zio_execute(zio);
+		return;
+	default:
+			;
+		/* PASSTHROUGH --- placate compiler */
+	}
+sendreq:
+	ASSERT(zio->io_type == ZIO_TYPE_READ ||
+	    zio->io_type == ZIO_TYPE_WRITE ||
+	    zio->io_type == ZIO_TYPE_TRIM ||
+	    zio->io_type == ZIO_TYPE_IOCTL);
+
+	cp = vd->vdev_tsd;
+	if (cp == NULL) {
+		zio->io_error = SET_ERROR(ENXIO);
+		zio_interrupt(zio);
+		return;
+	}
+	bp = g_alloc_bio();
+	bp->bio_caller1 = zio;
+	switch (zio->io_type) {
+	case ZIO_TYPE_READ:
+	case ZIO_TYPE_WRITE:
+		zio->io_target_timestamp = zio_handle_io_delay(zio);
+		bp->bio_offset = zio->io_offset;
+		bp->bio_length = zio->io_size;
+		if (zio->io_type == ZIO_TYPE_READ) {
+			bp->bio_cmd = BIO_READ;
+			bp->bio_data =
+			    abd_borrow_buf(zio->io_abd, zio->io_size);
+		} else {
+			bp->bio_cmd = BIO_WRITE;
+			bp->bio_data =
+			    abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+		}
+		break;
+	case ZIO_TYPE_TRIM:
+		bp->bio_cmd = BIO_DELETE;
+		bp->bio_data = NULL;
+		bp->bio_offset = zio->io_offset;
+		bp->bio_length = zio->io_size;
+		break;
+	case ZIO_TYPE_IOCTL:
+		bp->bio_cmd = BIO_FLUSH;
+		bp->bio_flags |= BIO_ORDERED;
+		bp->bio_data = NULL;
+		bp->bio_offset = cp->provider->mediasize;
+		bp->bio_length = 0;
+		break;
+	default:
+		panic("invalid zio->io_type: %d\n", zio->io_type);
+	}
+	bp->bio_done = vdev_geom_io_intr;
+	zio->io_bio = bp;
+
+	g_io_request(bp, cp);
+}
+
+static void
+vdev_geom_io_done(zio_t *zio)
+{
+	struct bio *bp = zio->io_bio;
+
+	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
+		ASSERT(bp == NULL);
+		return;
+	}
+
+	if (bp == NULL) {
+		ASSERT3S(zio->io_error, ==, ENXIO);
+		return;
+	}
+
+	if (zio->io_type == ZIO_TYPE_READ)
+		abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size);
+	else
+		abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size);
+
+	g_destroy_bio(bp);
+	zio->io_bio = NULL;
+}
+
+static void
+vdev_geom_hold(vdev_t *vd)
+{
+}
+
+static void
+vdev_geom_rele(vdev_t *vd)
+{
+}
+
+vdev_ops_t vdev_disk_ops = {
+	vdev_geom_open,
+	vdev_geom_close,
+	vdev_default_asize,
+	vdev_geom_io_start,
+	vdev_geom_io_done,
+	NULL,
+	NULL,
+	vdev_geom_hold,
+	vdev_geom_rele,
+	NULL,
+	vdev_default_xlate,
+	VDEV_TYPE_DISK,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c
new file mode 100644
index 000000000000..97cb201934dc
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/vdev.h>
+#include <sys/vdev_os.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/zio.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+
+int
+vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size)
+{
+	spa_t *spa = vd->vdev_spa;
+	zio_t *zio;
+	abd_t *pad2;
+	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+	int error;
+
+	if (size > VDEV_PAD_SIZE)
+		return (EINVAL);
+
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return (ENODEV);
+	if (vdev_is_dead(vd))
+		return (ENXIO);
+
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+	pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+	abd_zero(pad2, VDEV_PAD_SIZE);
+	abd_copy_from_buf(pad2, buf, size);
+
+retry:
+	zio = zio_root(spa, NULL, NULL, flags);
+	vdev_label_write(zio, vd, 0, pad2,
+	    offsetof(vdev_label_t, vl_be),
+	    VDEV_PAD_SIZE, NULL, NULL, flags);
+	error = zio_wait(zio);
+	if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+		flags |= ZIO_FLAG_TRYHARD;
+		goto retry;
+	}
+
+	abd_free(pad2);
+	return (error);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
new file mode 100644
index 000000000000..018120c82ab3
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
@@ -0,0 +1,2700 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <acl/acl_common.h>
+
+
+#define	ALLOW	ACE_ACCESS_ALLOWED_ACE_TYPE
+#define	DENY	ACE_ACCESS_DENIED_ACE_TYPE
+#define	MAX_ACE_TYPE	ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE
+#define	MIN_ACE_TYPE	ALLOW
+
+#define	OWNING_GROUP		(ACE_GROUP|ACE_IDENTIFIER_GROUP)
+#define	EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
+    ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
+#define	EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
+    ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define	OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
+    ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+
+#define	ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
+    ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
+    ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
+    ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
+
+#define	WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
+#define	WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
+    ACE_DELETE|ACE_DELETE_CHILD)
+#define	WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
+
+#define	OGE_CLEAR	(ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+    ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define	OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+    ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define	ALL_INHERIT	(ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
+    ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE)
+
+#define	RESTRICTED_CLEAR	(ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define	V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\
+    ZFS_ACL_PROTECTED)
+
+#define	ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
+    ZFS_ACL_OBJ_ACE)
+
+#define	ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH)
+
+static uint16_t
+zfs_ace_v0_get_type(void *acep)
+{
+	return (((zfs_oldace_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_v0_get_flags(void *acep)
+{
+	return (((zfs_oldace_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_v0_get_mask(void *acep)
+{
+	return (((zfs_oldace_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_v0_get_who(void *acep)
+{
+	return (((zfs_oldace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_v0_set_type(void *acep, uint16_t type)
+{
+	((zfs_oldace_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_v0_set_flags(void *acep, uint16_t flags)
+{
+	((zfs_oldace_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_v0_set_mask(void *acep, uint32_t mask)
+{
+	((zfs_oldace_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_v0_set_who(void *acep, uint64_t who)
+{
+	((zfs_oldace_t *)acep)->z_fuid = who;
+}
+
+/*ARGSUSED*/
+static size_t
+zfs_ace_v0_size(void *acep)
+{
+	return (sizeof (zfs_oldace_t));
+}
+
+static size_t
+zfs_ace_v0_abstract_size(void)
+{
+	return (sizeof (zfs_oldace_t));
+}
+
+static int
+zfs_ace_v0_mask_off(void)
+{
+	return (offsetof(zfs_oldace_t, z_access_mask));
+}
+
+/*ARGSUSED*/
+static int
+zfs_ace_v0_data(void *acep, void **datap)
+{
+	*datap = NULL;
+	return (0);
+}
+
+static acl_ops_t zfs_acl_v0_ops = {
+	zfs_ace_v0_get_mask,
+	zfs_ace_v0_set_mask,
+	zfs_ace_v0_get_flags,
+	zfs_ace_v0_set_flags,
+	zfs_ace_v0_get_type,
+	zfs_ace_v0_set_type,
+	zfs_ace_v0_get_who,
+	zfs_ace_v0_set_who,
+	zfs_ace_v0_size,
+	zfs_ace_v0_abstract_size,
+	zfs_ace_v0_mask_off,
+	zfs_ace_v0_data
+};
+
+static uint16_t
+zfs_ace_fuid_get_type(void *acep)
+{
+	return (((zfs_ace_hdr_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_fuid_get_flags(void *acep)
+{
+	return (((zfs_ace_hdr_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_fuid_get_mask(void *acep)
+{
+	return (((zfs_ace_hdr_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_fuid_get_who(void *args)
+{
+	uint16_t entry_type;
+	zfs_ace_t *acep = args;
+
+	entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+	if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+	    entry_type == ACE_EVERYONE)
+		return (-1);
+	return (((zfs_ace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_fuid_set_type(void *acep, uint16_t type)
+{
+	((zfs_ace_hdr_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_fuid_set_flags(void *acep, uint16_t flags)
+{
+	((zfs_ace_hdr_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_fuid_set_mask(void *acep, uint32_t mask)
+{
+	((zfs_ace_hdr_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_fuid_set_who(void *arg, uint64_t who)
+{
+	zfs_ace_t *acep = arg;
+
+	uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+	if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+	    entry_type == ACE_EVERYONE)
+		return;
+	acep->z_fuid = who;
+}
+
+static size_t
+zfs_ace_fuid_size(void *acep)
+{
+	zfs_ace_hdr_t *zacep = acep;
+	uint16_t entry_type;
+
+	switch (zacep->z_type) {
+	case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+	case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+		return (sizeof (zfs_object_ace_t));
+	case ALLOW:
+	case DENY:
+		entry_type =
+		    (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
+		if (entry_type == ACE_OWNER ||
+		    entry_type == OWNING_GROUP ||
+		    entry_type == ACE_EVERYONE)
+			return (sizeof (zfs_ace_hdr_t));
+		/*FALLTHROUGH*/
+	default:
+		return (sizeof (zfs_ace_t));
+	}
+}
+
+static size_t
+zfs_ace_fuid_abstract_size(void)
+{
+	return (sizeof (zfs_ace_hdr_t));
+}
+
+static int
+zfs_ace_fuid_mask_off(void)
+{
+	return (offsetof(zfs_ace_hdr_t, z_access_mask));
+}
+
+static int
+zfs_ace_fuid_data(void *acep, void **datap)
+{
+	zfs_ace_t *zacep = acep;
+	zfs_object_ace_t *zobjp;
+
+	switch (zacep->z_hdr.z_type) {
+	case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+	case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+		zobjp = acep;
+		*datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
+		return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
+	default:
+		*datap = NULL;
+		return (0);
+	}
+}
+
+static acl_ops_t zfs_acl_fuid_ops = {
+	zfs_ace_fuid_get_mask,
+	zfs_ace_fuid_set_mask,
+	zfs_ace_fuid_get_flags,
+	zfs_ace_fuid_set_flags,
+	zfs_ace_fuid_get_type,
+	zfs_ace_fuid_set_type,
+	zfs_ace_fuid_get_who,
+	zfs_ace_fuid_set_who,
+	zfs_ace_fuid_size,
+	zfs_ace_fuid_abstract_size,
+	zfs_ace_fuid_mask_off,
+	zfs_ace_fuid_data
+};
+
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL version in order to determine if the file use to have
+ * an external ACL and what version of ACL previously existed on the
+ * file.  Would really be nice to not need this, sigh.
+ */
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+	zfs_acl_phys_t acl_phys;
+	int error;
+
+	if (zp->z_is_sa)
+		return (0);
+
+	/*
+	 * Need to deal with a potential
+	 * race where zfs_sa_upgrade could cause
+	 * z_isa_sa to change.
+	 *
+	 * If the lookup fails then the state of z_is_sa should have
+	 * changed.
+	 */
+
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+	    &acl_phys, sizeof (acl_phys))) == 0)
+		return (acl_phys.z_acl_extern_obj);
+	else {
+		/*
+		 * after upgrade the SA_ZPL_ZNODE_ACL should have been
+		 * removed
+		 */
+		VERIFY(zp->z_is_sa && error == ENOENT);
+		return (0);
+	}
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+    zfs_acl_phys_t *aclphys)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint64_t acl_count;
+	int size;
+	int error;
+
+	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+	if (zp->z_is_sa) {
+		if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+		    &size)) != 0)
+			return (error);
+		*aclsize = size;
+		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+		    &acl_count, sizeof (acl_count))) != 0)
+			return (error);
+		*aclcount = acl_count;
+	} else {
+		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+		    aclphys, sizeof (*aclphys))) != 0)
+			return (error);
+
+		if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+			*aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+			*aclcount = aclphys->z_acl_size;
+		} else {
+			*aclsize = aclphys->z_acl_size;
+			*aclcount = aclphys->z_acl_count;
+		}
+	}
+	return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+	zfs_acl_phys_t acl_phys;
+
+	if (zp->z_is_sa)
+		return (ZFS_ACL_VERSION_FUID);
+	else {
+		int error;
+
+		/*
+		 * Need to deal with a potential
+		 * race where zfs_sa_upgrade could cause
+		 * z_isa_sa to change.
+		 *
+		 * If the lookup fails then the state of z_is_sa should have
+		 * changed.
+		 */
+		if ((error = sa_lookup(zp->z_sa_hdl,
+		    SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+		    &acl_phys, sizeof (acl_phys))) == 0)
+			return (acl_phys.z_acl_version);
+		else {
+			/*
+			 * After upgrade SA_ZPL_ZNODE_ACL should have
+			 * been removed.
+			 */
+			VERIFY(zp->z_is_sa && error == ENOENT);
+			return (ZFS_ACL_VERSION_FUID);
+		}
+	}
+}
+
+static int
+zfs_acl_version(int version)
+{
+	if (version < ZPL_VERSION_FUID)
+		return (ZFS_ACL_VERSION_INITIAL);
+	else
+		return (ZFS_ACL_VERSION_FUID);
+}
+
+static int
+zfs_acl_version_zp(znode_t *zp)
+{
+	return (zfs_acl_version(zp->z_zfsvfs->z_version));
+}
+
+zfs_acl_t *
+zfs_acl_alloc(int vers)
+{
+	zfs_acl_t *aclp;
+
+	aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
+	list_create(&aclp->z_acl, sizeof (zfs_acl_node_t),
+	    offsetof(zfs_acl_node_t, z_next));
+	aclp->z_version = vers;
+	if (vers == ZFS_ACL_VERSION_FUID)
+		aclp->z_ops = &zfs_acl_fuid_ops;
+	else
+		aclp->z_ops = &zfs_acl_v0_ops;
+	return (aclp);
+}
+
+zfs_acl_node_t *
+zfs_acl_node_alloc(size_t bytes)
+{
+	zfs_acl_node_t *aclnode;
+
+	aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP);
+	if (bytes) {
+		aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP);
+		aclnode->z_allocdata = aclnode->z_acldata;
+		aclnode->z_allocsize = bytes;
+		aclnode->z_size = bytes;
+	}
+
+	return (aclnode);
+}
+
+static void
+zfs_acl_node_free(zfs_acl_node_t *aclnode)
+{
+	if (aclnode->z_allocsize)
+		kmem_free(aclnode->z_allocdata, aclnode->z_allocsize);
+	kmem_free(aclnode, sizeof (zfs_acl_node_t));
+}
+
+static void
+zfs_acl_release_nodes(zfs_acl_t *aclp)
+{
+	zfs_acl_node_t *aclnode;
+
+	while ((aclnode = list_head(&aclp->z_acl))) {
+		list_remove(&aclp->z_acl, aclnode);
+		zfs_acl_node_free(aclnode);
+	}
+	aclp->z_acl_count = 0;
+	aclp->z_acl_bytes = 0;
+}
+
+void
+zfs_acl_free(zfs_acl_t *aclp)
+{
+	zfs_acl_release_nodes(aclp);
+	list_destroy(&aclp->z_acl);
+	kmem_free(aclp, sizeof (zfs_acl_t));
+}
+
+static boolean_t
+zfs_acl_valid_ace_type(uint_t type, uint_t flags)
+{
+	uint16_t entry_type;
+
+	switch (type) {
+	case ALLOW:
+	case DENY:
+	case ACE_SYSTEM_AUDIT_ACE_TYPE:
+	case ACE_SYSTEM_ALARM_ACE_TYPE:
+		entry_type = flags & ACE_TYPE_FLAGS;
+		return (entry_type == ACE_OWNER ||
+		    entry_type == OWNING_GROUP ||
+		    entry_type == ACE_EVERYONE || entry_type == 0 ||
+		    entry_type == ACE_IDENTIFIER_GROUP);
+	default:
+		if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE)
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+static boolean_t
+zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags)
+{
+	/*
+	 * first check type of entry
+	 */
+
+	if (!zfs_acl_valid_ace_type(type, iflags))
+		return (B_FALSE);
+
+	switch (type) {
+	case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+	case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+		if (aclp->z_version < ZFS_ACL_VERSION_FUID)
+			return (B_FALSE);
+		aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+	}
+
+	/*
+	 * next check inheritance level flags
+	 */
+
+	if (obj_type == VDIR &&
+	    (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+		aclp->z_hints |= ZFS_INHERIT_ACE;
+
+	if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+		if ((iflags & (ACE_FILE_INHERIT_ACE|
+		    ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+			return (B_FALSE);
+		}
+	}
+
+	return (B_TRUE);
+}
+
+static void *
+zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
+    uint32_t *access_mask, uint16_t *iflags, uint16_t *type)
+{
+	zfs_acl_node_t *aclnode;
+
+	ASSERT(aclp);
+
+	if (start == NULL) {
+		aclnode = list_head(&aclp->z_acl);
+		if (aclnode == NULL)
+			return (NULL);
+
+		aclp->z_next_ace = aclnode->z_acldata;
+		aclp->z_curr_node = aclnode;
+		aclnode->z_ace_idx = 0;
+	}
+
+	aclnode = aclp->z_curr_node;
+
+	if (aclnode == NULL)
+		return (NULL);
+
+	if (aclnode->z_ace_idx >= aclnode->z_ace_count) {
+		aclnode = list_next(&aclp->z_acl, aclnode);
+		if (aclnode == NULL)
+			return (NULL);
+		else {
+			aclp->z_curr_node = aclnode;
+			aclnode->z_ace_idx = 0;
+			aclp->z_next_ace = aclnode->z_acldata;
+		}
+	}
+
+	if (aclnode->z_ace_idx < aclnode->z_ace_count) {
+		void *acep = aclp->z_next_ace;
+		size_t ace_size;
+
+		/*
+		 * Make sure we don't overstep our bounds
+		 */
+		ace_size = aclp->z_ops->ace_size(acep);
+
+		if (((caddr_t)acep + ace_size) >
+		    ((caddr_t)aclnode->z_acldata + aclnode->z_size)) {
+			return (NULL);
+		}
+
+		*iflags = aclp->z_ops->ace_flags_get(acep);
+		*type = aclp->z_ops->ace_type_get(acep);
+		*access_mask = aclp->z_ops->ace_mask_get(acep);
+		*who = aclp->z_ops->ace_who_get(acep);
+		aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
+		aclnode->z_ace_idx++;
+
+		return ((void *)acep);
+	}
+	return (NULL);
+}
+
+/*ARGSUSED*/
+static uint64_t
+zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt,
+    uint16_t *flags, uint16_t *type, uint32_t *mask)
+{
+	zfs_acl_t *aclp = datap;
+	zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie;
+	uint64_t who;
+
+	acep = zfs_acl_next_ace(aclp, acep, &who, mask,
+	    flags, type);
+	return ((uint64_t)(uintptr_t)acep);
+}
+
+/*
+ * Copy ACE to internal ZFS format.
+ * While processing the ACL each ACE will be validated for correctness.
+ * ACE FUIDs will be created later.
+ */
+static int
+zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
+    void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
+    zfs_fuid_info_t **fuidp, cred_t *cr)
+{
+	int i;
+	uint16_t entry_type;
+	zfs_ace_t *aceptr = z_acl;
+	ace_t *acep = datap;
+	zfs_object_ace_t *zobjacep;
+	ace_object_t *aceobjp;
+
+	for (i = 0; i != aclcnt; i++) {
+		aceptr->z_hdr.z_access_mask = acep->a_access_mask;
+		aceptr->z_hdr.z_flags = acep->a_flags;
+		aceptr->z_hdr.z_type = acep->a_type;
+		entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
+		if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
+		    entry_type != ACE_EVERYONE) {
+			aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
+			    cr, (entry_type == 0) ?
+			    ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
+		}
+
+		/*
+		 * Make sure ACE is valid
+		 */
+		if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type,
+		    aceptr->z_hdr.z_flags) != B_TRUE)
+			return (SET_ERROR(EINVAL));
+
+		switch (acep->a_type) {
+		case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+		case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+		case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+		case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+			zobjacep = (zfs_object_ace_t *)aceptr;
+			aceobjp = (ace_object_t *)acep;
+
+			bcopy(aceobjp->a_obj_type, zobjacep->z_object_type,
+			    sizeof (aceobjp->a_obj_type));
+			bcopy(aceobjp->a_inherit_obj_type,
+			    zobjacep->z_inherit_type,
+			    sizeof (aceobjp->a_inherit_obj_type));
+			acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t));
+			break;
+		default:
+			acep = (ace_t *)((caddr_t)acep + sizeof (ace_t));
+		}
+
+		aceptr = (zfs_ace_t *)((caddr_t)aceptr +
+		    aclp->z_ops->ace_size(aceptr));
+	}
+
+	*size = (caddr_t)aceptr - (caddr_t)z_acl;
+
+	return (0);
+}
+
+/*
+ * Copy ZFS ACEs to fixed size ace_t layout
+ */
+static void
+zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr,
+    void *datap, int filter)
+{
+	uint64_t who;
+	uint32_t access_mask;
+	uint16_t iflags, type;
+	zfs_ace_hdr_t *zacep = NULL;
+	ace_t *acep = datap;
+	ace_object_t *objacep;
+	zfs_object_ace_t *zobjacep;
+	size_t ace_size;
+	uint16_t entry_type;
+
+	while ((zacep = zfs_acl_next_ace(aclp, zacep,
+	    &who, &access_mask, &iflags, &type))) {
+
+		switch (type) {
+		case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+		case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+		case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+		case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+			if (filter) {
+				continue;
+			}
+			zobjacep = (zfs_object_ace_t *)zacep;
+			objacep = (ace_object_t *)acep;
+			bcopy(zobjacep->z_object_type,
+			    objacep->a_obj_type,
+			    sizeof (zobjacep->z_object_type));
+			bcopy(zobjacep->z_inherit_type,
+			    objacep->a_inherit_obj_type,
+			    sizeof (zobjacep->z_inherit_type));
+			ace_size = sizeof (ace_object_t);
+			break;
+		default:
+			ace_size = sizeof (ace_t);
+			break;
+		}
+
+		entry_type = (iflags & ACE_TYPE_FLAGS);
+		if ((entry_type != ACE_OWNER &&
+		    entry_type != OWNING_GROUP &&
+		    entry_type != ACE_EVERYONE)) {
+			acep->a_who = zfs_fuid_map_id(zfsvfs, who,
+			    cr, (entry_type & ACE_IDENTIFIER_GROUP) ?
+			    ZFS_ACE_GROUP : ZFS_ACE_USER);
+		} else {
+			acep->a_who = (uid_t)(int64_t)who;
+		}
+		acep->a_access_mask = access_mask;
+		acep->a_flags = iflags;
+		acep->a_type = type;
+		acep = (ace_t *)((caddr_t)acep + ace_size);
+	}
+}
+
+static int
+zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep,
+    zfs_oldace_t *z_acl, int aclcnt, size_t *size)
+{
+	int i;
+	zfs_oldace_t *aceptr = z_acl;
+
+	for (i = 0; i != aclcnt; i++, aceptr++) {
+		aceptr->z_access_mask = acep[i].a_access_mask;
+		aceptr->z_type = acep[i].a_type;
+		aceptr->z_flags = acep[i].a_flags;
+		aceptr->z_fuid = acep[i].a_who;
+		/*
+		 * Make sure ACE is valid
+		 */
+		if (zfs_ace_valid(obj_type, aclp, aceptr->z_type,
+		    aceptr->z_flags) != B_TRUE)
+			return (SET_ERROR(EINVAL));
+	}
+	*size = (caddr_t)aceptr - (caddr_t)z_acl;
+	return (0);
+}
+
+/*
+ * convert old ACL format to new
+ */
+void
+zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
+{
+	zfs_oldace_t *oldaclp;
+	int i;
+	uint16_t type, iflags;
+	uint32_t access_mask;
+	uint64_t who;
+	void *cookie = NULL;
+	zfs_acl_node_t *newaclnode;
+
+	ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL);
+	/*
+	 * First create the ACE in a contiguous piece of memory
+	 * for zfs_copy_ace_2_fuid().
+	 *
+	 * We only convert an ACL once, so this won't happen
+	 * everytime.
+	 */
+	oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count,
+	    KM_SLEEP);
+	i = 0;
+	while ((cookie = zfs_acl_next_ace(aclp, cookie, &who,
+	    &access_mask, &iflags, &type))) {
+		oldaclp[i].z_flags = iflags;
+		oldaclp[i].z_type = type;
+		oldaclp[i].z_fuid = who;
+		oldaclp[i++].z_access_mask = access_mask;
+	}
+
+	newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
+	    sizeof (zfs_object_ace_t));
+	aclp->z_ops = &zfs_acl_fuid_ops;
+	VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp,
+	    oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
+	    &newaclnode->z_size, NULL, cr) == 0);
+	newaclnode->z_ace_count = aclp->z_acl_count;
+	aclp->z_version = ZFS_ACL_VERSION;
+	kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
+
+	/*
+	 * Release all previous ACL nodes
+	 */
+
+	zfs_acl_release_nodes(aclp);
+
+	list_insert_head(&aclp->z_acl, newaclnode);
+
+	aclp->z_acl_bytes = newaclnode->z_size;
+	aclp->z_acl_count = newaclnode->z_ace_count;
+
+}
+
+/*
+ * Convert unix access mask to v4 access mask
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+	uint32_t new_mask = 0;
+
+	if (access_mask & S_IXOTH)
+		new_mask |= ACE_EXECUTE;
+	if (access_mask & S_IWOTH)
+		new_mask |= ACE_WRITE_DATA;
+	if (access_mask & S_IROTH)
+		new_mask |= ACE_READ_DATA;
+	return (new_mask);
+}
+
+static void
+zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
+    uint16_t access_type, uint64_t fuid, uint16_t entry_type)
+{
+	uint16_t type = entry_type & ACE_TYPE_FLAGS;
+
+	aclp->z_ops->ace_mask_set(acep, access_mask);
+	aclp->z_ops->ace_type_set(acep, access_type);
+	aclp->z_ops->ace_flags_set(acep, entry_type);
+	if ((type != ACE_OWNER && type != OWNING_GROUP &&
+	    type != ACE_EVERYONE))
+		aclp->z_ops->ace_who_set(acep, fuid);
+}
+
+/*
+ * Determine mode of file based on ACL.
+ */
+uint64_t
+zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
+    uint64_t *pflags, uint64_t fuid, uint64_t fgid)
+{
+	int		entry_type;
+	mode_t		mode;
+	mode_t		seen = 0;
+	zfs_ace_hdr_t 	*acep = NULL;
+	uint64_t	who;
+	uint16_t	iflags, type;
+	uint32_t	access_mask;
+	boolean_t	an_exec_denied = B_FALSE;
+
+	mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+
+	while ((acep = zfs_acl_next_ace(aclp, acep, &who,
+	    &access_mask, &iflags, &type))) {
+
+		if (!zfs_acl_valid_ace_type(type, iflags))
+			continue;
+
+		entry_type = (iflags & ACE_TYPE_FLAGS);
+
+		/*
+		 * Skip over any inherit_only ACEs
+		 */
+		if (iflags & ACE_INHERIT_ONLY_ACE)
+			continue;
+
+		if (entry_type == ACE_OWNER || (entry_type == 0 &&
+		    who == fuid)) {
+			if ((access_mask & ACE_READ_DATA) &&
+			    (!(seen & S_IRUSR))) {
+				seen |= S_IRUSR;
+				if (type == ALLOW) {
+					mode |= S_IRUSR;
+				}
+			}
+			if ((access_mask & ACE_WRITE_DATA) &&
+			    (!(seen & S_IWUSR))) {
+				seen |= S_IWUSR;
+				if (type == ALLOW) {
+					mode |= S_IWUSR;
+				}
+			}
+			if ((access_mask & ACE_EXECUTE) &&
+			    (!(seen & S_IXUSR))) {
+				seen |= S_IXUSR;
+				if (type == ALLOW) {
+					mode |= S_IXUSR;
+				}
+			}
+		} else if (entry_type == OWNING_GROUP ||
+		    (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
+			if ((access_mask & ACE_READ_DATA) &&
+			    (!(seen & S_IRGRP))) {
+				seen |= S_IRGRP;
+				if (type == ALLOW) {
+					mode |= S_IRGRP;
+				}
+			}
+			if ((access_mask & ACE_WRITE_DATA) &&
+			    (!(seen & S_IWGRP))) {
+				seen |= S_IWGRP;
+				if (type == ALLOW) {
+					mode |= S_IWGRP;
+				}
+			}
+			if ((access_mask & ACE_EXECUTE) &&
+			    (!(seen & S_IXGRP))) {
+				seen |= S_IXGRP;
+				if (type == ALLOW) {
+					mode |= S_IXGRP;
+				}
+			}
+		} else if (entry_type == ACE_EVERYONE) {
+			if ((access_mask & ACE_READ_DATA)) {
+				if (!(seen & S_IRUSR)) {
+					seen |= S_IRUSR;
+					if (type == ALLOW) {
+						mode |= S_IRUSR;
+					}
+				}
+				if (!(seen & S_IRGRP)) {
+					seen |= S_IRGRP;
+					if (type == ALLOW) {
+						mode |= S_IRGRP;
+					}
+				}
+				if (!(seen & S_IROTH)) {
+					seen |= S_IROTH;
+					if (type == ALLOW) {
+						mode |= S_IROTH;
+					}
+				}
+			}
+			if ((access_mask & ACE_WRITE_DATA)) {
+				if (!(seen & S_IWUSR)) {
+					seen |= S_IWUSR;
+					if (type == ALLOW) {
+						mode |= S_IWUSR;
+					}
+				}
+				if (!(seen & S_IWGRP)) {
+					seen |= S_IWGRP;
+					if (type == ALLOW) {
+						mode |= S_IWGRP;
+					}
+				}
+				if (!(seen & S_IWOTH)) {
+					seen |= S_IWOTH;
+					if (type == ALLOW) {
+						mode |= S_IWOTH;
+					}
+				}
+			}
+			if ((access_mask & ACE_EXECUTE)) {
+				if (!(seen & S_IXUSR)) {
+					seen |= S_IXUSR;
+					if (type == ALLOW) {
+						mode |= S_IXUSR;
+					}
+				}
+				if (!(seen & S_IXGRP)) {
+					seen |= S_IXGRP;
+					if (type == ALLOW) {
+						mode |= S_IXGRP;
+					}
+				}
+				if (!(seen & S_IXOTH)) {
+					seen |= S_IXOTH;
+					if (type == ALLOW) {
+						mode |= S_IXOTH;
+					}
+				}
+			}
+		} else {
+			/*
+			 * Only care if this IDENTIFIER_GROUP or
+			 * USER ACE denies execute access to someone,
+			 * mode is not affected
+			 */
+			if ((access_mask & ACE_EXECUTE) && type == DENY)
+				an_exec_denied = B_TRUE;
+		}
+	}
+
+	/*
+	 * Failure to allow is effectively a deny, so execute permission
+	 * is denied if it was never mentioned or if we explicitly
+	 * weren't allowed it.
+	 */
+	if (!an_exec_denied &&
+	    ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS ||
+	    (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS))
+		an_exec_denied = B_TRUE;
+
+	if (an_exec_denied)
+		*pflags &= ~ZFS_NO_EXECS_DENIED;
+	else
+		*pflags |= ZFS_NO_EXECS_DENIED;
+
+	return (mode);
+}
+
+/*
+ * Read an external acl object.  If the intent is to modify, always
+ * create a new acl and leave any cached acl in place.
+ */
+int
+zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
+    boolean_t will_modify)
+{
+	zfs_acl_t	*aclp;
+	int		aclsize;
+	int		acl_count;
+	zfs_acl_node_t	*aclnode;
+	zfs_acl_phys_t	znode_acl;
+	int		version;
+	int		error;
+
+	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+	if (zp->z_zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+
+	if (zp->z_acl_cached && !will_modify) {
+		*aclpp = zp->z_acl_cached;
+		return (0);
+	}
+
+	version = zfs_znode_acl_version(zp);
+
+	if ((error = zfs_acl_znode_info(zp, &aclsize,
+	    &acl_count, &znode_acl)) != 0) {
+		goto done;
+	}
+
+	aclp = zfs_acl_alloc(version);
+
+	aclp->z_acl_count = acl_count;
+	aclp->z_acl_bytes = aclsize;
+
+	aclnode = zfs_acl_node_alloc(aclsize);
+	aclnode->z_ace_count = aclp->z_acl_count;
+	aclnode->z_size = aclsize;
+
+	if (!zp->z_is_sa) {
+		if (znode_acl.z_acl_extern_obj) {
+			error = dmu_read(zp->z_zfsvfs->z_os,
+			    znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+			    aclnode->z_acldata, DMU_READ_PREFETCH);
+		} else {
+			bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+			    aclnode->z_size);
+		}
+	} else {
+		error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs),
+		    aclnode->z_acldata, aclnode->z_size);
+	}
+
+	if (error != 0) {
+		zfs_acl_free(aclp);
+		zfs_acl_node_free(aclnode);
+		/* convert checksum errors into IO errors */
+		if (error == ECKSUM)
+			error = SET_ERROR(EIO);
+		goto done;
+	}
+
+	list_insert_head(&aclp->z_acl, aclnode);
+
+	*aclpp = aclp;
+	if (!will_modify)
+		zp->z_acl_cached = aclp;
+done:
+	return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+    boolean_t start, void *userdata)
+{
+	zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+	if (start) {
+		cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+	} else {
+		cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+		    cb->cb_acl_node);
+	}
+	*dataptr = cb->cb_acl_node->z_acldata;
+	*length = cb->cb_acl_node->z_size;
+}
+
+int
+zfs_acl_chown_setattr(znode_t *zp)
+{
+	int error;
+	zfs_acl_t *aclp;
+
+	if (zp->z_zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+	ASSERT_VOP_IN_SEQC(ZTOV(zp));
+
+	if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0)
+		zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
+		    &zp->z_pflags, zp->z_uid, zp->z_gid);
+	return (error);
+}
+
+/*
+ * common code for setting ACLs.
+ *
+ * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
+ * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
+ * already checked the acl and knows whether to inherit.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
+{
+	int			error;
+	zfsvfs_t		*zfsvfs = zp->z_zfsvfs;
+	dmu_object_type_t	otype;
+	zfs_acl_locator_cb_t	locate = { 0 };
+	uint64_t		mode;
+	sa_bulk_attr_t		bulk[5];
+	uint64_t		ctime[2];
+	int			count = 0;
+	zfs_acl_phys_t		acl_phys;
+
+	ASSERT_VOP_IN_SEQC(ZTOV(zp));
+
+	mode = zp->z_mode;
+
+	mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
+	    zp->z_uid, zp->z_gid);
+
+	zp->z_mode = mode;
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+	    &mode, sizeof (mode));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+	    &ctime, sizeof (ctime));
+
+	if (zp->z_acl_cached) {
+		zfs_acl_free(zp->z_acl_cached);
+		zp->z_acl_cached = NULL;
+	}
+
+	/*
+	 * Upgrade needed?
+	 */
+	if (!zfsvfs->z_use_fuids) {
+		otype = DMU_OT_OLDACL;
+	} else {
+		if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
+		    (zfsvfs->z_version >= ZPL_VERSION_FUID))
+			zfs_acl_xform(zp, aclp, cr);
+		ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
+		otype = DMU_OT_ACL;
+	}
+
+	/*
+	 * Arrgh, we have to handle old on disk format
+	 * as well as newer (preferred) SA format.
+	 */
+
+	if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
+		locate.cb_aclp = aclp;
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
+		    zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
+		    NULL, &aclp->z_acl_count, sizeof (uint64_t));
+	} else { /* Painful legacy way */
+		zfs_acl_node_t *aclnode;
+		uint64_t off = 0;
+		uint64_t aoid;
+
+		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+		    &acl_phys, sizeof (acl_phys))) != 0)
+			return (error);
+
+		aoid = acl_phys.z_acl_extern_obj;
+
+		if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+			/*
+			 * If ACL was previously external and we are now
+			 * converting to new ACL format then release old
+			 * ACL object and create a new one.
+			 */
+			if (aoid &&
+			    aclp->z_version != acl_phys.z_acl_version) {
+				error = dmu_object_free(zfsvfs->z_os, aoid, tx);
+				if (error)
+					return (error);
+				aoid = 0;
+			}
+			if (aoid == 0) {
+				aoid = dmu_object_alloc(zfsvfs->z_os,
+				    otype, aclp->z_acl_bytes,
+				    otype == DMU_OT_ACL ?
+				    DMU_OT_SYSACL : DMU_OT_NONE,
+				    otype == DMU_OT_ACL ?
+				    DN_OLD_MAX_BONUSLEN : 0, tx);
+			} else {
+				(void) dmu_object_set_blocksize(zfsvfs->z_os,
+				    aoid, aclp->z_acl_bytes, 0, tx);
+			}
+			acl_phys.z_acl_extern_obj = aoid;
+			for (aclnode = list_head(&aclp->z_acl); aclnode;
+			    aclnode = list_next(&aclp->z_acl, aclnode)) {
+				if (aclnode->z_ace_count == 0)
+					continue;
+				dmu_write(zfsvfs->z_os, aoid, off,
+				    aclnode->z_size, aclnode->z_acldata, tx);
+				off += aclnode->z_size;
+			}
+		} else {
+			void *start = acl_phys.z_ace_data;
+			/*
+			 * Migrating back embedded?
+			 */
+			if (acl_phys.z_acl_extern_obj) {
+				error = dmu_object_free(zfsvfs->z_os,
+				    acl_phys.z_acl_extern_obj, tx);
+				if (error)
+					return (error);
+				acl_phys.z_acl_extern_obj = 0;
+			}
+
+			for (aclnode = list_head(&aclp->z_acl); aclnode;
+			    aclnode = list_next(&aclp->z_acl, aclnode)) {
+				if (aclnode->z_ace_count == 0)
+					continue;
+				bcopy(aclnode->z_acldata, start,
+				    aclnode->z_size);
+				start = (caddr_t)start + aclnode->z_size;
+			}
+		}
+		/*
+		 * If Old version then swap count/bytes to match old
+		 * layout of znode_acl_phys_t.
+		 */
+		if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+			acl_phys.z_acl_size = aclp->z_acl_count;
+			acl_phys.z_acl_count = aclp->z_acl_bytes;
+		} else {
+			acl_phys.z_acl_size = aclp->z_acl_bytes;
+			acl_phys.z_acl_count = aclp->z_acl_count;
+		}
+		acl_phys.z_acl_version = aclp->z_version;
+
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+		    &acl_phys, sizeof (acl_phys));
+	}
+
+	/*
+	 * Replace ACL wide bits, but first clear them.
+	 */
+	zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
+
+	zp->z_pflags |= aclp->z_hints;
+
+	if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
+		zp->z_pflags |= ZFS_ACL_TRIVIAL;
+
+	zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime);
+	return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
+}
+
+static void
+zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim,
+    zfs_acl_t *aclp)
+{
+	void		*acep = NULL;
+	uint64_t	who;
+	int		new_count, new_bytes;
+	int		ace_size;
+	int 		entry_type;
+	uint16_t	iflags, type;
+	uint32_t	access_mask;
+	zfs_acl_node_t	*newnode;
+	size_t 		abstract_size = aclp->z_ops->ace_abstract_size();
+	void 		*zacep;
+	boolean_t	isdir;
+	trivial_acl_t	masks;
+
+	new_count = new_bytes = 0;
+
+	isdir = (vtype == VDIR);
+
+	acl_trivial_access_masks((mode_t)mode, isdir, &masks);
+
+	newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
+
+	zacep = newnode->z_acldata;
+	if (masks.allow0) {
+		zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER);
+		zacep = (void *)((uintptr_t)zacep + abstract_size);
+		new_count++;
+		new_bytes += abstract_size;
+	}
+	if (masks.deny1) {
+		zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER);
+		zacep = (void *)((uintptr_t)zacep + abstract_size);
+		new_count++;
+		new_bytes += abstract_size;
+	}
+	if (masks.deny2) {
+		zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP);
+		zacep = (void *)((uintptr_t)zacep + abstract_size);
+		new_count++;
+		new_bytes += abstract_size;
+	}
+
+	while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+	    &iflags, &type))) {
+		entry_type = (iflags & ACE_TYPE_FLAGS);
+		/*
+		 * ACEs used to represent the file mode may be divided
+		 * into an equivalent pair of inherit-only and regular
+		 * ACEs, if they are inheritable.
+		 * Skip regular ACEs, which are replaced by the new mode.
+		 */
+		if (split && (entry_type == ACE_OWNER ||
+		    entry_type == OWNING_GROUP ||
+		    entry_type == ACE_EVERYONE)) {
+			if (!isdir || !(iflags &
+			    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+				continue;
+			/*
+			 * We preserve owner@, group@, or @everyone
+			 * permissions, if they are inheritable, by
+			 * copying them to inherit_only ACEs. This
+			 * prevents inheritable permissions from being
+			 * altered along with the file mode.
+			 */
+			iflags |= ACE_INHERIT_ONLY_ACE;
+		}
+
+		/*
+		 * If this ACL has any inheritable ACEs, mark that in
+		 * the hints (which are later masked into the pflags)
+		 * so create knows to do inheritance.
+		 */
+		if (isdir && (iflags &
+		    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+			aclp->z_hints |= ZFS_INHERIT_ACE;
+
+		if ((type != ALLOW && type != DENY) ||
+		    (iflags & ACE_INHERIT_ONLY_ACE)) {
+			switch (type) {
+			case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+			case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+			case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+			case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+				aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+				break;
+			}
+		} else {
+			/*
+			 * Limit permissions granted by ACEs to be no greater
+			 * than permissions of the requested group mode.
+			 * Applies when the "aclmode" property is set to
+			 * "groupmask".
+			 */
+			if ((type == ALLOW) && trim)
+				access_mask &= masks.group;
+		}
+		zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
+		ace_size = aclp->z_ops->ace_size(acep);
+		zacep = (void *)((uintptr_t)zacep + ace_size);
+		new_count++;
+		new_bytes += ace_size;
+	}
+	zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER);
+	zacep = (void *)((uintptr_t)zacep + abstract_size);
+	zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP);
+	zacep = (void *)((uintptr_t)zacep + abstract_size);
+	zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE);
+
+	new_count += 3;
+	new_bytes += abstract_size * 3;
+	zfs_acl_release_nodes(aclp);
+	aclp->z_acl_count = new_count;
+	aclp->z_acl_bytes = new_bytes;
+	newnode->z_ace_count = new_count;
+	newnode->z_size = new_bytes;
+	list_insert_tail(&aclp->z_acl, newnode);
+}
+
+int
+zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
+{
+	int error = 0;
+
+	mutex_enter(&zp->z_acl_lock);
+	if (zp->z_zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+	if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
+		*aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+	else
+		error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE);
+
+	if (error == 0) {
+		(*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
+		zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE,
+		    (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
+	}
+	mutex_exit(&zp->z_acl_lock);
+
+	return (error);
+}
+
+/*
+ * Should ACE be inherited?
+ */
+static int
+zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags)
+{
+	int	iflags = (acep_flags & 0xf);
+
+	if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
+		return (1);
+	else if (iflags & ACE_FILE_INHERIT_ACE)
+		return (!((vtype == VDIR) &&
+		    (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
+	return (0);
+}
+
+/*
+ * inherit inheritable ACEs from parent
+ */
+static zfs_acl_t *
+zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
+    uint64_t mode, boolean_t *need_chmod)
+{
+	void		*pacep = NULL;
+	void		*acep;
+	zfs_acl_node_t  *aclnode;
+	zfs_acl_t	*aclp = NULL;
+	uint64_t	who;
+	uint32_t	access_mask;
+	uint16_t	iflags, newflags, type;
+	size_t		ace_size;
+	void		*data1, *data2;
+	size_t		data1sz, data2sz;
+	uint_t		aclinherit;
+	boolean_t	isdir = (vtype == VDIR);
+	boolean_t	isreg = (vtype == VREG);
+
+	*need_chmod = B_TRUE;
+
+	aclp = zfs_acl_alloc(paclp->z_version);
+	aclinherit = zfsvfs->z_acl_inherit;
+	if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK)
+		return (aclp);
+
+	while ((pacep = zfs_acl_next_ace(paclp, pacep, &who,
+	    &access_mask, &iflags, &type))) {
+
+		/*
+		 * don't inherit bogus ACEs
+		 */
+		if (!zfs_acl_valid_ace_type(type, iflags))
+			continue;
+
+		/*
+		 * Check if ACE is inheritable by this vnode
+		 */
+		if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) ||
+		    !zfs_ace_can_use(vtype, iflags))
+			continue;
+
+		/*
+		 * If owner@, group@, or everyone@ inheritable
+		 * then zfs_acl_chmod() isn't needed.
+		 */
+		if ((aclinherit == ZFS_ACL_PASSTHROUGH ||
+		    aclinherit == ZFS_ACL_PASSTHROUGH_X) &&
+		    ((iflags & (ACE_OWNER|ACE_EVERYONE)) ||
+		    ((iflags & OWNING_GROUP) == OWNING_GROUP)) &&
+		    (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE))))
+			*need_chmod = B_FALSE;
+
+		/*
+		 * Strip inherited execute permission from file if
+		 * not in mode
+		 */
+		if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW &&
+		    !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) {
+			access_mask &= ~ACE_EXECUTE;
+		}
+
+		/*
+		 * Strip write_acl and write_owner from permissions
+		 * when inheriting an ACE
+		 */
+		if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) {
+			access_mask &= ~RESTRICTED_CLEAR;
+		}
+
+		ace_size = aclp->z_ops->ace_size(pacep);
+		aclnode = zfs_acl_node_alloc(ace_size);
+		list_insert_tail(&aclp->z_acl, aclnode);
+		acep = aclnode->z_acldata;
+
+		zfs_set_ace(aclp, acep, access_mask, type,
+		    who, iflags|ACE_INHERITED_ACE);
+
+		/*
+		 * Copy special opaque data if any
+		 */
+		if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) {
+			VERIFY((data2sz = aclp->z_ops->ace_data(acep,
+			    &data2)) == data1sz);
+			bcopy(data1, data2, data2sz);
+		}
+
+		aclp->z_acl_count++;
+		aclnode->z_ace_count++;
+		aclp->z_acl_bytes += aclnode->z_size;
+		newflags = aclp->z_ops->ace_flags_get(acep);
+
+		/*
+		 * If ACE is not to be inherited further, or if the vnode is
+		 * not a directory, remove all inheritance flags
+		 */
+		if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) {
+			newflags &= ~ALL_INHERIT;
+			aclp->z_ops->ace_flags_set(acep,
+			    newflags|ACE_INHERITED_ACE);
+			continue;
+		}
+
+		/*
+		 * This directory has an inheritable ACE
+		 */
+		aclp->z_hints |= ZFS_INHERIT_ACE;
+
+		/*
+		 * If only FILE_INHERIT is set then turn on
+		 * inherit_only
+		 */
+		if ((iflags & (ACE_FILE_INHERIT_ACE |
+		    ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
+			newflags |= ACE_INHERIT_ONLY_ACE;
+			aclp->z_ops->ace_flags_set(acep,
+			    newflags|ACE_INHERITED_ACE);
+		} else {
+			newflags &= ~ACE_INHERIT_ONLY_ACE;
+			aclp->z_ops->ace_flags_set(acep,
+			    newflags|ACE_INHERITED_ACE);
+		}
+	}
+	if (zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
+	    aclp->z_acl_count != 0) {
+		*need_chmod = B_FALSE;
+	}
+
+	return (aclp);
+}
+
+/*
+ * Create file system object initial permissions
+ * including inheritable ACEs.
+ * Also, create FUIDs for owner and group.
+ */
+int
+zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
+    vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
+{
+	int		error;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zfs_acl_t	*paclp;
+	gid_t		gid;
+	boolean_t	need_chmod = B_TRUE;
+	boolean_t	trim = B_FALSE;
+	boolean_t	inherited = B_FALSE;
+
+	if ((flag & IS_ROOT_NODE) == 0) {
+		if (zfsvfs->z_replay == B_FALSE)
+			ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+	} else
+		ASSERT(dzp->z_vnode == NULL);
+	bzero(acl_ids, sizeof (zfs_acl_ids_t));
+	acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+
+	if (vsecp)
+		if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
+		    &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
+			return (error);
+	/*
+	 * Determine uid and gid.
+	 */
+	if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
+	    ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
+		acl_ids->z_fuid = zfs_fuid_create(zfsvfs,
+		    (uint64_t)vap->va_uid, cr,
+		    ZFS_OWNER, &acl_ids->z_fuidp);
+		acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+		    (uint64_t)vap->va_gid, cr,
+		    ZFS_GROUP, &acl_ids->z_fuidp);
+		gid = vap->va_gid;
+	} else {
+		acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
+		    cr, &acl_ids->z_fuidp);
+		acl_ids->z_fgid = 0;
+		if (vap->va_mask & AT_GID)  {
+			acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+			    (uint64_t)vap->va_gid,
+			    cr, ZFS_GROUP, &acl_ids->z_fuidp);
+			gid = vap->va_gid;
+			if (acl_ids->z_fgid != dzp->z_gid &&
+			    !groupmember(vap->va_gid, cr) &&
+			    secpolicy_vnode_create_gid(cr) != 0)
+				acl_ids->z_fgid = 0;
+		}
+		if (acl_ids->z_fgid == 0) {
+			char		*domain;
+			uint32_t	rid;
+
+			acl_ids->z_fgid = dzp->z_gid;
+			gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
+			    cr, ZFS_GROUP);
+
+			if (zfsvfs->z_use_fuids &&
+			    IS_EPHEMERAL(acl_ids->z_fgid)) {
+				domain =
+				    zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx,
+				    FUID_INDEX(acl_ids->z_fgid));
+				rid = FUID_RID(acl_ids->z_fgid);
+				zfs_fuid_node_add(&acl_ids->z_fuidp,
+				    domain, rid, FUID_INDEX(acl_ids->z_fgid),
+				    acl_ids->z_fgid, ZFS_GROUP);
+			}
+		}
+	}
+
+	/*
+	 * If we're creating a directory, and the parent directory has the
+	 * set-GID bit set, set in on the new directory.
+	 * Otherwise, if the user is neither privileged nor a member of the
+	 * file's new group, clear the file's set-GID bit.
+	 */
+
+	if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
+	    (vap->va_type == VDIR)) {
+		acl_ids->z_mode |= S_ISGID;
+	} else {
+		if ((acl_ids->z_mode & S_ISGID) &&
+		    secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0)
+			acl_ids->z_mode &= ~S_ISGID;
+	}
+
+	if (acl_ids->z_aclp == NULL) {
+		mutex_enter(&dzp->z_acl_lock);
+		if (!(flag & IS_ROOT_NODE) &&
+		    (dzp->z_pflags & ZFS_INHERIT_ACE) &&
+		    !(dzp->z_pflags & ZFS_XATTR)) {
+			VERIFY0(zfs_acl_node_read(dzp, B_TRUE,
+			    &paclp, B_FALSE));
+			acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
+			    vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
+			inherited = B_TRUE;
+		} else {
+			acl_ids->z_aclp =
+			    zfs_acl_alloc(zfs_acl_version_zp(dzp));
+			acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+		}
+		mutex_exit(&dzp->z_acl_lock);
+
+		if (need_chmod) {
+			if (vap->va_type == VDIR)
+				acl_ids->z_aclp->z_hints |=
+				    ZFS_ACL_AUTO_INHERIT;
+
+			if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK &&
+			    zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH &&
+			    zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X)
+				trim = B_TRUE;
+			zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE,
+			    trim, acl_ids->z_aclp);
+		}
+	}
+
+	if (inherited || vsecp) {
+		acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
+		    acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
+		    acl_ids->z_fuid, acl_ids->z_fgid);
+		if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
+			acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+	}
+
+	return (0);
+}
+
+/*
+ * Free ACL and fuid_infop, but not the acl_ids structure
+ */
+void
+zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
+{
+	if (acl_ids->z_aclp)
+		zfs_acl_free(acl_ids->z_aclp);
+	if (acl_ids->z_fuidp)
+		zfs_fuid_info_free(acl_ids->z_fuidp);
+	acl_ids->z_aclp = NULL;
+	acl_ids->z_fuidp = NULL;
+}
+
+boolean_t
+zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid)
+{
+	return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) ||
+	    zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) ||
+	    (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID &&
+	    zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid)));
+}
+
+/*
+ * Retrieve a file's ACL
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+	zfs_acl_t	*aclp;
+	ulong_t		mask;
+	int		error;
+	int 		count = 0;
+	int		largeace = 0;
+
+	mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
+	    VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
+
+	if (mask == 0)
+		return (SET_ERROR(ENOSYS));
+
+	if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)))
+		return (error);
+
+	mutex_enter(&zp->z_acl_lock);
+
+	if (zp->z_zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+	error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE);
+	if (error != 0) {
+		mutex_exit(&zp->z_acl_lock);
+		return (error);
+	}
+
+	/*
+	 * Scan ACL to determine number of ACEs
+	 */
+	if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
+		void *zacep = NULL;
+		uint64_t who;
+		uint32_t access_mask;
+		uint16_t type, iflags;
+
+		while ((zacep = zfs_acl_next_ace(aclp, zacep,
+		    &who, &access_mask, &iflags, &type))) {
+			switch (type) {
+			case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+			case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+			case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+			case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+				largeace++;
+				continue;
+			default:
+				count++;
+			}
+		}
+		vsecp->vsa_aclcnt = count;
+	} else
+		count = (int)aclp->z_acl_count;
+
+	if (mask & VSA_ACECNT) {
+		vsecp->vsa_aclcnt = count;
+	}
+
+	if (mask & VSA_ACE) {
+		size_t aclsz;
+
+		aclsz = count * sizeof (ace_t) +
+		    sizeof (ace_object_t) * largeace;
+
+		vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP);
+		vsecp->vsa_aclentsz = aclsz;
+
+		if (aclp->z_version == ZFS_ACL_VERSION_FUID)
+			zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr,
+			    vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
+		else {
+			zfs_acl_node_t *aclnode;
+			void *start = vsecp->vsa_aclentp;
+
+			for (aclnode = list_head(&aclp->z_acl); aclnode;
+			    aclnode = list_next(&aclp->z_acl, aclnode)) {
+				bcopy(aclnode->z_acldata, start,
+				    aclnode->z_size);
+				start = (caddr_t)start + aclnode->z_size;
+			}
+			ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp ==
+			    aclp->z_acl_bytes);
+		}
+	}
+	if (mask & VSA_ACE_ACLFLAGS) {
+		vsecp->vsa_aclflags = 0;
+		if (zp->z_pflags & ZFS_ACL_DEFAULTED)
+			vsecp->vsa_aclflags |= ACL_DEFAULTED;
+		if (zp->z_pflags & ZFS_ACL_PROTECTED)
+			vsecp->vsa_aclflags |= ACL_PROTECTED;
+		if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
+			vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
+	}
+
+	mutex_exit(&zp->z_acl_lock);
+
+	return (0);
+}
+
+int
+zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_type,
+    vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
+{
+	zfs_acl_t *aclp;
+	zfs_acl_node_t *aclnode;
+	int aclcnt = vsecp->vsa_aclcnt;
+	int error;
+
+	if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0)
+		return (SET_ERROR(EINVAL));
+
+	aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
+
+	aclp->z_hints = 0;
+	aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t));
+	if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+		if ((error = zfs_copy_ace_2_oldace(obj_type, aclp,
+		    (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata,
+		    aclcnt, &aclnode->z_size)) != 0) {
+			zfs_acl_free(aclp);
+			zfs_acl_node_free(aclnode);
+			return (error);
+		}
+	} else {
+		if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp,
+		    vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
+		    &aclnode->z_size, fuidp, cr)) != 0) {
+			zfs_acl_free(aclp);
+			zfs_acl_node_free(aclnode);
+			return (error);
+		}
+	}
+	aclp->z_acl_bytes = aclnode->z_size;
+	aclnode->z_ace_count = aclcnt;
+	aclp->z_acl_count = aclcnt;
+	list_insert_head(&aclp->z_acl, aclnode);
+
+	/*
+	 * If flags are being set then add them to z_hints
+	 */
+	if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) {
+		if (vsecp->vsa_aclflags & ACL_PROTECTED)
+			aclp->z_hints |= ZFS_ACL_PROTECTED;
+		if (vsecp->vsa_aclflags & ACL_DEFAULTED)
+			aclp->z_hints |= ZFS_ACL_DEFAULTED;
+		if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
+			aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
+	}
+
+	*zaclp = aclp;
+
+	return (0);
+}
+
+/*
+ * Set a file's ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	zilog_t		*zilog = zfsvfs->z_log;
+	ulong_t		mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+	dmu_tx_t	*tx;
+	int		error;
+	zfs_acl_t	*aclp;
+	zfs_fuid_info_t	*fuidp = NULL;
+	boolean_t	fuid_dirtied;
+	uint64_t	acl_obj;
+
+	if (zp->z_zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+	if (mask == 0)
+		return (SET_ERROR(ENOSYS));
+
+	if (zp->z_pflags & ZFS_IMMUTABLE)
+		return (SET_ERROR(EPERM));
+
+	if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)))
+		return (error);
+
+	error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp,
+	    &aclp);
+	if (error)
+		return (error);
+
+	/*
+	 * If ACL wide flags aren't being set then preserve any
+	 * existing flags.
+	 */
+	if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
+		aclp->z_hints |=
+		    (zp->z_pflags & V4_ACL_WIDE_FLAGS);
+	}
+top:
+	mutex_enter(&zp->z_acl_lock);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+
+	/*
+	 * If old version and ACL won't fit in bonus and we aren't
+	 * upgrading then take out necessary DMU holds
+	 */
+
+	if ((acl_obj = zfs_external_acl(zp)) != 0) {
+		if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+		    zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+			dmu_tx_hold_free(tx, acl_obj, 0,
+			    DMU_OBJECT_END);
+			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+			    aclp->z_acl_bytes);
+		} else {
+			dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
+		}
+	} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
+	}
+
+	zfs_sa_upgrade_txholds(tx, zp);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	if (error) {
+		mutex_exit(&zp->z_acl_lock);
+
+		if (error == ERESTART) {
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
+		dmu_tx_abort(tx);
+		zfs_acl_free(aclp);
+		return (error);
+	}
+
+	error = zfs_aclset_common(zp, aclp, cr, tx);
+	ASSERT(error == 0);
+	ASSERT(zp->z_acl_cached == NULL);
+	zp->z_acl_cached = aclp;
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
+
+	if (fuidp)
+		zfs_fuid_info_free(fuidp);
+	dmu_tx_commit(tx);
+	mutex_exit(&zp->z_acl_lock);
+
+	return (error);
+}
+
+/*
+ * Check accesses of interest (AoI) against attributes of the dataset
+ * such as read-only.  Returns zero if no AoI conflict with dataset
+ * attributes, otherwise an appropriate errno is returned.
+ */
+static int
+zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
+{
+	if ((v4_mode & WRITE_MASK) &&
+	    (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+	    (!IS_DEVVP(ZTOV(zp)) ||
+	    (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) {
+		return (SET_ERROR(EROFS));
+	}
+
+	/*
+	 * Intentionally allow ZFS_READONLY through here.
+	 * See zfs_zaccess_common().
+	 */
+	if ((v4_mode & WRITE_MASK_DATA) &&
+	    (zp->z_pflags & ZFS_IMMUTABLE)) {
+		return (SET_ERROR(EPERM));
+	}
+
+	/*
+	 * In FreeBSD we allow to modify directory's content is ZFS_NOUNLINK
+	 * (sunlnk) is set. We just don't allow directory removal, which is
+	 * handled in zfs_zaccess_delete().
+	 */
+	if ((v4_mode & ACE_DELETE) &&
+	    (zp->z_pflags & ZFS_NOUNLINK)) {
+		return (EPERM);
+	}
+
+	if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
+	    (zp->z_pflags & ZFS_AV_QUARANTINED))) {
+		return (SET_ERROR(EACCES));
+	}
+
+	return (0);
+}
+
+/*
+ * The primary usage of this function is to loop through all of the
+ * ACEs in the znode, determining what accesses of interest (AoI) to
+ * the caller are allowed or denied.  The AoI are expressed as bits in
+ * the working_mode parameter.  As each ACE is processed, bits covered
+ * by that ACE are removed from the working_mode.  This removal
+ * facilitates two things.  The first is that when the working mode is
+ * empty (= 0), we know we've looked at all the AoI. The second is
+ * that the ACE interpretation rules don't allow a later ACE to undo
+ * something granted or denied by an earlier ACE.  Removing the
+ * discovered access or denial enforces this rule.  At the end of
+ * processing the ACEs, all AoI that were found to be denied are
+ * placed into the working_mode, giving the caller a mask of denied
+ * accesses.  Returns:
+ *	0		if all AoI granted
+ *	EACCESS 	if the denied mask is non-zero
+ *	other error	if abnormal failure (e.g., IO error)
+ *
+ * A secondary usage of the function is to determine if any of the
+ * AoI are granted.  If an ACE grants any access in
+ * the working_mode, we immediately short circuit out of the function.
+ * This mode is chosen by setting anyaccess to B_TRUE.  The
+ * working_mode is not a denied access mask upon exit if the function
+ * is used in this manner.
+ */
+static int
+zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
+    boolean_t anyaccess, cred_t *cr)
+{
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	zfs_acl_t	*aclp;
+	int		error;
+	uid_t		uid = crgetuid(cr);
+	uint64_t 	who;
+	uint16_t	type, iflags;
+	uint16_t	entry_type;
+	uint32_t	access_mask;
+	uint32_t	deny_mask = 0;
+	zfs_ace_hdr_t	*acep = NULL;
+	boolean_t	checkit;
+	uid_t		gowner;
+	uid_t		fowner;
+
+	zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
+
+	mutex_enter(&zp->z_acl_lock);
+
+	if (zp->z_zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+	error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE);
+	if (error != 0) {
+		mutex_exit(&zp->z_acl_lock);
+		return (error);
+	}
+
+	ASSERT(zp->z_acl_cached);
+
+	while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+	    &iflags, &type))) {
+		uint32_t mask_matched;
+
+		if (!zfs_acl_valid_ace_type(type, iflags))
+			continue;
+
+		if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE))
+			continue;
+
+		/* Skip ACE if it does not affect any AoI */
+		mask_matched = (access_mask & *working_mode);
+		if (!mask_matched)
+			continue;
+
+		entry_type = (iflags & ACE_TYPE_FLAGS);
+
+		checkit = B_FALSE;
+
+		switch (entry_type) {
+		case ACE_OWNER:
+			if (uid == fowner)
+				checkit = B_TRUE;
+			break;
+		case OWNING_GROUP:
+			who = gowner;
+			/*FALLTHROUGH*/
+		case ACE_IDENTIFIER_GROUP:
+			checkit = zfs_groupmember(zfsvfs, who, cr);
+			break;
+		case ACE_EVERYONE:
+			checkit = B_TRUE;
+			break;
+
+		/* USER Entry */
+		default:
+			if (entry_type == 0) {
+				uid_t newid;
+
+				newid = zfs_fuid_map_id(zfsvfs, who, cr,
+				    ZFS_ACE_USER);
+				if (newid !=  UID_NOBODY &&
+				    uid == newid)
+					checkit = B_TRUE;
+				break;
+			} else {
+				mutex_exit(&zp->z_acl_lock);
+				return (SET_ERROR(EIO));
+			}
+		}
+
+		if (checkit) {
+			if (type == DENY) {
+				DTRACE_PROBE3(zfs__ace__denies,
+				    znode_t *, zp,
+				    zfs_ace_hdr_t *, acep,
+				    uint32_t, mask_matched);
+				deny_mask |= mask_matched;
+			} else {
+				DTRACE_PROBE3(zfs__ace__allows,
+				    znode_t *, zp,
+				    zfs_ace_hdr_t *, acep,
+				    uint32_t, mask_matched);
+				if (anyaccess) {
+					mutex_exit(&zp->z_acl_lock);
+					return (0);
+				}
+			}
+			*working_mode &= ~mask_matched;
+		}
+
+		/* Are we done? */
+		if (*working_mode == 0)
+			break;
+	}
+
+	mutex_exit(&zp->z_acl_lock);
+
+	/* Put the found 'denies' back on the working mode */
+	if (deny_mask) {
+		*working_mode |= deny_mask;
+		return (SET_ERROR(EACCES));
+	} else if (*working_mode) {
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Return true if any access whatsoever granted, we don't actually
+ * care what access is granted.
+ */
+boolean_t
+zfs_has_access(znode_t *zp, cred_t *cr)
+{
+	uint32_t have = ACE_ALL_PERMS;
+
+	if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
+		uid_t owner;
+
+		owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+		return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0);
+	}
+	return (B_TRUE);
+}
+
+static int
+zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
+    boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int err;
+
+	*working_mode = v4_mode;
+	*check_privs = B_TRUE;
+
+	/*
+	 * Short circuit empty requests
+	 */
+	if (v4_mode == 0 || zfsvfs->z_replay) {
+		*working_mode = 0;
+		return (0);
+	}
+
+	if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) {
+		*check_privs = B_FALSE;
+		return (err);
+	}
+
+	/*
+	 * The caller requested that the ACL check be skipped.  This
+	 * would only happen if the caller checked VOP_ACCESS() with a
+	 * 32 bit ACE mask and already had the appropriate permissions.
+	 */
+	if (skipaclchk) {
+		*working_mode = 0;
+		return (0);
+	}
+
+	/*
+	 * Note: ZFS_READONLY represents the "DOS R/O" attribute.
+	 * When that flag is set, we should behave as if write access
+	 * were not granted by anything in the ACL.  In particular:
+	 * We _must_ allow writes after opening the file r/w, then
+	 * setting the DOS R/O attribute, and writing some more.
+	 * (Similar to how you can write after fchmod(fd, 0444).)
+	 *
+	 * Therefore ZFS_READONLY is ignored in the dataset check
+	 * above, and checked here as if part of the ACL check.
+	 * Also note: DOS R/O is ignored for directories.
+	 */
+	if ((v4_mode & WRITE_MASK_DATA) &&
+	    (ZTOV(zp)->v_type != VDIR) &&
+	    (zp->z_pflags & ZFS_READONLY)) {
+		return (SET_ERROR(EPERM));
+	}
+
+	return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
+}
+
+static int
+zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
+    cred_t *cr)
+{
+	if (*working_mode != ACE_WRITE_DATA)
+		return (SET_ERROR(EACCES));
+
+	return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
+	    check_privs, B_FALSE, cr));
+}
+
+/*
+ * Check if VEXEC is allowed.
+ *
+ * This routine is based on zfs_fastaccesschk_execute which has slowpath
+ * calling zfs_zaccess. This would be incorrect on FreeBSD (see
+ * zfs_freebsd_access for the difference). Thus this variant let's the
+ * caller handle the slowpath (if necessary).
+ *
+ * On top of that we perform a lockless check for ZFS_NO_EXECS_DENIED.
+ *
+ * Safe access to znode_t is provided by the vnode lock.
+ */
+int
+zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
+{
+	boolean_t is_attr;
+
+	if (zdp->z_pflags & ZFS_AV_QUARANTINED)
+		return (1);
+
+	is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
+	    (ZTOV(zdp)->v_type == VDIR));
+	if (is_attr)
+		return (1);
+
+	if (zdp->z_pflags & ZFS_NO_EXECS_DENIED)
+		return (0);
+
+	return (1);
+}
+
+
+/*
+ * Determine whether Access should be granted/denied.
+ *
+ * The least priv subsystem is always consulted as a basic privilege
+ * can define any form of access.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
+{
+	uint32_t	working_mode;
+	int		error;
+	int		is_attr;
+	boolean_t 	check_privs;
+	znode_t		*xzp = NULL;
+	znode_t 	*check_zp = zp;
+	mode_t		needed_bits;
+	uid_t		owner;
+
+	is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
+
+#ifdef __FreeBSD_kernel__
+	/*
+	 * In FreeBSD, we don't care about permissions of individual ADS.
+	 * Note that not checking them is not just an optimization - without
+	 * this shortcut, EA operations may bogusly fail with EACCES.
+	 */
+	if (zp->z_pflags & ZFS_XATTR)
+		return (0);
+#else
+	/*
+	 * If attribute then validate against base file
+	 */
+	if (is_attr) {
+		uint64_t	parent;
+
+		if ((error = sa_lookup(zp->z_sa_hdl,
+		    SA_ZPL_PARENT(zp->z_zfsvfs), &parent,
+		    sizeof (parent))) != 0)
+			return (error);
+
+		if ((error = zfs_zget(zp->z_zfsvfs,
+		    parent, &xzp)) != 0)	{
+			return (error);
+		}
+
+		check_zp = xzp;
+
+		/*
+		 * fixup mode to map to xattr perms
+		 */
+
+		if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+			mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+			mode |= ACE_WRITE_NAMED_ATTRS;
+		}
+
+		if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+			mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+			mode |= ACE_READ_NAMED_ATTRS;
+		}
+	}
+#endif
+
+	owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+	/*
+	 * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC
+	 * in needed_bits.  Map the bits mapped by working_mode (currently
+	 * missing) in missing_bits.
+	 * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode),
+	 * needed_bits.
+	 */
+	needed_bits = 0;
+
+	working_mode = mode;
+	if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+	    owner == crgetuid(cr))
+		working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+	if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+	    ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+		needed_bits |= VREAD;
+	if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+	    ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+		needed_bits |= VWRITE;
+	if (working_mode & ACE_EXECUTE)
+		needed_bits |= VEXEC;
+
+	if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
+	    &check_privs, skipaclchk, cr)) == 0) {
+		if (is_attr)
+			VN_RELE(ZTOV(xzp));
+		return (secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+		    needed_bits, needed_bits));
+	}
+
+	if (error && !check_privs) {
+		if (is_attr)
+			VN_RELE(ZTOV(xzp));
+		return (error);
+	}
+
+	if (error && (flags & V_APPEND)) {
+		error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
+	}
+
+	if (error && check_privs) {
+		mode_t		checkmode = 0;
+		vnode_t *check_vp = ZTOV(check_zp);
+
+		/*
+		 * First check for implicit owner permission on
+		 * read_acl/read_attributes
+		 */
+
+		error = 0;
+		ASSERT(working_mode != 0);
+
+		if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
+		    owner == crgetuid(cr)))
+			working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+		if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+		    ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+			checkmode |= VREAD;
+		if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+		    ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+			checkmode |= VWRITE;
+		if (working_mode & ACE_EXECUTE)
+			checkmode |= VEXEC;
+
+		error = secpolicy_vnode_access2(cr, check_vp, owner,
+		    needed_bits & ~checkmode, needed_bits);
+
+		if (error == 0 && (working_mode & ACE_WRITE_OWNER))
+			error = secpolicy_vnode_chown(check_vp, cr, owner);
+		if (error == 0 && (working_mode & ACE_WRITE_ACL))
+			error = secpolicy_vnode_setdac(check_vp, cr, owner);
+
+		if (error == 0 && (working_mode &
+		    (ACE_DELETE|ACE_DELETE_CHILD)))
+			error = secpolicy_vnode_remove(check_vp, cr);
+
+		if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
+			error = secpolicy_vnode_chown(check_vp, cr, owner);
+		}
+		if (error == 0) {
+			/*
+			 * See if any bits other than those already checked
+			 * for are still present.  If so then return EACCES
+			 */
+			if (working_mode & ~(ZFS_CHECKED_MASKS)) {
+				error = SET_ERROR(EACCES);
+			}
+		}
+	} else if (error == 0) {
+		error = secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+		    needed_bits, needed_bits);
+	}
+
+
+	if (is_attr)
+		VN_RELE(ZTOV(xzp));
+
+	return (error);
+}
+
+/*
+ * Translate traditional unix VREAD/VWRITE/VEXEC mode into
+ * native ACL format and call zfs_zaccess()
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
+{
+	return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
+}
+
+/*
+ * Access function for secpolicy_vnode_setattr
+ */
+int
+zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
+{
+	int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+	return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
+}
+
+static int
+zfs_delete_final_check(znode_t *zp, znode_t *dzp,
+    mode_t available_perms, cred_t *cr)
+{
+	int error;
+	uid_t downer;
+
+	downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER);
+
+	error = secpolicy_vnode_access2(cr, ZTOV(dzp),
+	    downer, available_perms, VWRITE|VEXEC);
+
+	if (error == 0)
+		error = zfs_sticky_remove_access(dzp, zp, cr);
+
+	return (error);
+}
+
+/*
+ * Determine whether Access should be granted/deny, without
+ * consulting least priv subsystem.
+ *
+ * The following chart is the recommended NFSv4 enforcement for
+ * ability to delete an object.
+ *
+ *      -------------------------------------------------------
+ *      |   Parent Dir  |           Target Object Permissions |
+ *      |  permissions  |                                     |
+ *      -------------------------------------------------------
+ *      |               | ACL Allows | ACL Denies| Delete     |
+ *      |               |  Delete    |  Delete   | unspecified|
+ *      -------------------------------------------------------
+ *      |  ACL Allows   | Permit     | Permit    | Permit     |
+ *      |  DELETE_CHILD |                                     |
+ *      -------------------------------------------------------
+ *      |  ACL Denies   | Permit     | Deny      | Deny       |
+ *      |  DELETE_CHILD |            |           |            |
+ *      -------------------------------------------------------
+ *      | ACL specifies |            |           |            |
+ *      | only allow    | Permit     | Permit    | Permit     |
+ *      | write and     |            |           |            |
+ *      | execute       |            |           |            |
+ *      -------------------------------------------------------
+ *      | ACL denies    |            |           |            |
+ *      | write and     | Permit     | Deny      | Deny       |
+ *      | execute       |            |           |            |
+ *      -------------------------------------------------------
+ *         ^
+ *         |
+ *         No search privilege, can't even look up file?
+ *
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+	uint32_t dzp_working_mode = 0;
+	uint32_t zp_working_mode = 0;
+	int dzp_error, zp_error;
+	mode_t available_perms;
+	boolean_t dzpcheck_privs = B_TRUE;
+	boolean_t zpcheck_privs = B_TRUE;
+
+	/*
+	 * We want specific DELETE permissions to
+	 * take precedence over WRITE/EXECUTE.  We don't
+	 * want an ACL such as this to mess us up.
+	 * user:joe:write_data:deny,user:joe:delete:allow
+	 *
+	 * However, deny permissions may ultimately be overridden
+	 * by secpolicy_vnode_access().
+	 *
+	 * We will ask for all of the necessary permissions and then
+	 * look at the working modes from the directory and target object
+	 * to determine what was found.
+	 */
+
+	if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+		return (SET_ERROR(EPERM));
+
+	/*
+	 * First row
+	 * If the directory permissions allow the delete, we are done.
+	 */
+	if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
+	    &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
+		return (0);
+
+	/*
+	 * If target object has delete permission then we are done
+	 */
+	if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
+	    &zpcheck_privs, B_FALSE, cr)) == 0)
+		return (0);
+
+	ASSERT(dzp_error && zp_error);
+
+	if (!dzpcheck_privs)
+		return (dzp_error);
+	if (!zpcheck_privs)
+		return (zp_error);
+
+	/*
+	 * Second row
+	 *
+	 * If directory returns EACCES then delete_child was denied
+	 * due to deny delete_child.  In this case send the request through
+	 * secpolicy_vnode_remove().  We don't use zfs_delete_final_check()
+	 * since that *could* allow the delete based on write/execute permission
+	 * and we want delete permissions to override write/execute.
+	 */
+
+	if (dzp_error == EACCES) {
+		/* XXXPJD: s/dzp/zp/ ? */
+		return (secpolicy_vnode_remove(ZTOV(dzp), cr));
+	}
+	/*
+	 * Third Row
+	 * only need to see if we have write/execute on directory.
+	 */
+
+	dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
+	    &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
+
+	if (dzp_error != 0 && !dzpcheck_privs)
+		return (dzp_error);
+
+	/*
+	 * Fourth row
+	 */
+
+	available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE;
+	available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC;
+
+	return (zfs_delete_final_check(zp, dzp, available_perms, cr));
+
+}
+
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+    znode_t *tzp, cred_t *cr)
+{
+	int add_perm;
+	int error;
+
+	if (szp->z_pflags & ZFS_AV_QUARANTINED)
+		return (SET_ERROR(EACCES));
+
+	add_perm = (ZTOV(szp)->v_type == VDIR) ?
+	    ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+	/*
+	 * Rename permissions are combination of delete permission +
+	 * add file/subdir permission.
+	 *
+	 * BSD operating systems also require write permission
+	 * on the directory being moved from one parent directory
+	 * to another.
+	 */
+	if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) {
+		if ((error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr)))
+			return (error);
+	}
+
+	/*
+	 * first make sure we do the delete portion.
+	 *
+	 * If that succeeds then check for add_file/add_subdir permissions
+	 */
+
+	if ((error = zfs_zaccess_delete(sdzp, szp, cr)))
+		return (error);
+
+	/*
+	 * If we have a tzp, see if we can delete it?
+	 */
+	if (tzp && (error = zfs_zaccess_delete(tdzp, tzp, cr)))
+		return (error);
+
+	/*
+	 * Now check for add permissions
+	 */
+	error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
+
+	return (error);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
new file mode 100644
index 000000000000..0fe32b19520c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
@@ -0,0 +1,1350 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ */
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' directory, but this may expand in the
+ * future.  The elements are built using the GFS primitives, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab.  We have three
+ * types of objects:
+ *
+ * 	ctldir ------> snapshotdir -------> snapshot
+ *                                             |
+ *                                             |
+ *                                             V
+ *                                         mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding vnode.
+ *
+ * All mounts are handled automatically by the kernel, but unmounts are
+ * (currently) handled from user land.  The main reason is that there is no
+ * reliable way to auto-unmount the filesystem when it's "no longer in use".
+ * When the user unmounts a filesystem, we call zfsctl_unmount(), which
+ * unmounts any snapshots within the snapshot directory.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
+ * share the same vfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted ontop of the GFS nodes '.zfs/snapshot/<snapname>'
+ * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
+ * However, vnodes within these mounted on file systems have their v_vfsp
+ * fields set to the head filesystem to make NFS happy (see
+ * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
+ * so that it cannot be freed until all snapshots have been unmounted.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/libkern.h>
+#include <sys/dirent.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/namei.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_deleg.h>
+#include <sys/mount.h>
+#include <sys/zap.h>
+#include <sys/sysproto.h>
+
+#include "zfs_namecheck.h"
+
+#include <sys/kernel.h>
+#include <sys/ccompat.h>
+
+/* Common access mode for all virtual directories under the ctldir */
+const uint16_t zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
+    S_IROTH | S_IXOTH;
+
+/*
+ * "Synthetic" filesystem implementation.
+ */
+
+/*
+ * Assert that A implies B.
+ */
+#define	KASSERT_IMPLY(A, B, msg)	KASSERT(!(A) || (B), (msg));
+
+static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes");
+
+typedef struct sfs_node {
+	char		sn_name[ZFS_MAX_DATASET_NAME_LEN];
+	uint64_t	sn_parent_id;
+	uint64_t	sn_id;
+} sfs_node_t;
+
+/*
+ * Check the parent's ID as well as the node's to account for a chance
+ * that IDs originating from different domains (snapshot IDs, artificial
+ * IDs, znode IDs) may clash.
+ */
+static int
+sfs_compare_ids(struct vnode *vp, void *arg)
+{
+	sfs_node_t *n1 = vp->v_data;
+	sfs_node_t *n2 = arg;
+	bool equal;
+
+	equal = n1->sn_id == n2->sn_id &&
+	    n1->sn_parent_id == n2->sn_parent_id;
+
+	/* Zero means equality. */
+	return (!equal);
+}
+
+static int
+sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id,
+    uint64_t id, struct vnode **vpp)
+{
+	sfs_node_t search;
+	int err;
+
+	search.sn_id = id;
+	search.sn_parent_id = parent_id;
+	err = vfs_hash_get(mp, (uint32_t)id, flags, curthread, vpp,
+	    sfs_compare_ids, &search);
+	return (err);
+}
+
+static int
+sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id,
+    uint64_t id, struct vnode **vpp)
+{
+	int err;
+
+	KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data"));
+	err = vfs_hash_insert(vp, (uint32_t)id, flags, curthread, vpp,
+	    sfs_compare_ids, vp->v_data);
+	return (err);
+}
+
+static void
+sfs_vnode_remove(struct vnode *vp)
+{
+	vfs_hash_remove(vp);
+}
+
+typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg);
+
+static int
+sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id,
+    const char *tag, struct vop_vector *vops,
+    sfs_vnode_setup_fn setup, void *arg,
+    struct vnode **vpp)
+{
+	struct vnode *vp;
+	int error;
+
+	error = sfs_vnode_get(mp, flags, parent_id, id, vpp);
+	if (error != 0 || *vpp != NULL) {
+		KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+		    "sfs vnode with no data");
+		return (error);
+	}
+
+	/* Allocate a new vnode/inode. */
+	error = getnewvnode(tag, mp, vops, &vp);
+	if (error != 0) {
+		*vpp = NULL;
+		return (error);
+	}
+
+	/*
+	 * Exclusively lock the vnode vnode while it's being constructed.
+	 */
+	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
+	error = insmntque(vp, mp);
+	if (error != 0) {
+		*vpp = NULL;
+		return (error);
+	}
+
+	setup(vp, arg);
+
+	error = sfs_vnode_insert(vp, flags, parent_id, id, vpp);
+	if (error != 0 || *vpp != NULL) {
+		KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+		    "sfs vnode with no data");
+		return (error);
+	}
+
+	*vpp = vp;
+	return (0);
+}
+
+static void
+sfs_print_node(sfs_node_t *node)
+{
+	printf("\tname = %s\n", node->sn_name);
+	printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id);
+	printf("\tid = %ju\n", (uintmax_t)node->sn_id);
+}
+
+static sfs_node_t *
+sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id)
+{
+	struct sfs_node *node;
+
+	KASSERT(strlen(name) < sizeof (node->sn_name),
+	    ("sfs node name is too long"));
+	KASSERT(size >= sizeof (*node), ("sfs node size is too small"));
+	node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO);
+	strlcpy(node->sn_name, name, sizeof (node->sn_name));
+	node->sn_parent_id = parent_id;
+	node->sn_id = id;
+
+	return (node);
+}
+
+static void
+sfs_destroy_node(sfs_node_t *node)
+{
+	free(node, M_SFSNODES);
+}
+
+static void *
+sfs_reclaim_vnode(vnode_t *vp)
+{
+	void *data;
+
+	sfs_vnode_remove(vp);
+	data = vp->v_data;
+	vp->v_data = NULL;
+	return (data);
+}
+
+static int
+sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap,
+    uio_t *uio, off_t *offp)
+{
+	struct dirent entry;
+	int error;
+
+	/* Reset ncookies for subsequent use of vfs_read_dirent. */
+	if (ap->a_ncookies != NULL)
+		*ap->a_ncookies = 0;
+
+	if (uio->uio_resid < sizeof (entry))
+		return (SET_ERROR(EINVAL));
+
+	if (uio->uio_offset < 0)
+		return (SET_ERROR(EINVAL));
+	if (uio->uio_offset == 0) {
+		entry.d_fileno = id;
+		entry.d_type = DT_DIR;
+		entry.d_name[0] = '.';
+		entry.d_name[1] = '\0';
+		entry.d_namlen = 1;
+		entry.d_reclen = sizeof (entry);
+		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+		if (error != 0)
+			return (SET_ERROR(error));
+	}
+
+	if (uio->uio_offset < sizeof (entry))
+		return (SET_ERROR(EINVAL));
+	if (uio->uio_offset == sizeof (entry)) {
+		entry.d_fileno = parent_id;
+		entry.d_type = DT_DIR;
+		entry.d_name[0] = '.';
+		entry.d_name[1] = '.';
+		entry.d_name[2] = '\0';
+		entry.d_namlen = 2;
+		entry.d_reclen = sizeof (entry);
+		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+		if (error != 0)
+			return (SET_ERROR(error));
+	}
+
+	if (offp != NULL)
+		*offp = 2 * sizeof (entry);
+	return (0);
+}
+
+
+/*
+ * .zfs inode namespace
+ *
+ * We need to generate unique inode numbers for all files and directories
+ * within the .zfs pseudo-filesystem.  We use the following scheme:
+ *
+ * 	ENTRY			ZFSCTL_INODE
+ * 	.zfs			1
+ * 	.zfs/snapshot		2
+ * 	.zfs/snapshot/<snap>	objectid(snap)
+ */
+#define	ZFSCTL_INO_SNAP(id)	(id)
+
+static struct vop_vector zfsctl_ops_root;
+static struct vop_vector zfsctl_ops_snapdir;
+static struct vop_vector zfsctl_ops_snapshot;
+static struct vop_vector zfsctl_ops_shares_dir;
+
+void
+zfsctl_init(void)
+{
+}
+
+void
+zfsctl_fini(void)
+{
+}
+
+boolean_t
+zfsctl_is_node(vnode_t *vp)
+{
+	return (vn_matchops(vp, zfsctl_ops_root) ||
+	    vn_matchops(vp, zfsctl_ops_snapdir) ||
+	    vn_matchops(vp, zfsctl_ops_snapshot) ||
+	    vn_matchops(vp, zfsctl_ops_shares_dir));
+
+}
+
+typedef struct zfsctl_root {
+	sfs_node_t	node;
+	sfs_node_t	*snapdir;
+	timestruc_t	cmtime;
+} zfsctl_root_t;
+
+
+/*
+ * Create the '.zfs' directory.
+ */
+void
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+	zfsctl_root_t *dot_zfs;
+	sfs_node_t *snapdir;
+	vnode_t *rvp;
+	uint64_t crtime[2];
+
+	ASSERT(zfsvfs->z_ctldir == NULL);
+
+	snapdir = sfs_alloc_node(sizeof (*snapdir), "snapshot", ZFSCTL_INO_ROOT,
+	    ZFSCTL_INO_SNAPDIR);
+	dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof (*dot_zfs), ".zfs", 0,
+	    ZFSCTL_INO_ROOT);
+	dot_zfs->snapdir = snapdir;
+
+	VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
+	VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+	    &crtime, sizeof (crtime)));
+	ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime);
+	vput(rvp);
+
+	zfsvfs->z_ctldir = dot_zfs;
+}
+
+/*
+ * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
+ * The nodes must not have any associated vnodes by now as they should be
+ * vflush-ed.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+	sfs_destroy_node(zfsvfs->z_ctldir->snapdir);
+	sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir);
+	zfsvfs->z_ctldir = NULL;
+}
+
+static int
+zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags,
+    struct vnode **vpp)
+{
+	return (VFS_ROOT(mp, flags, vpp));
+}
+
+static void
+zfsctl_common_vnode_setup(vnode_t *vp, void *arg)
+{
+	ASSERT_VOP_ELOCKED(vp, __func__);
+
+	/* We support shared locking. */
+	VN_LOCK_ASHARE(vp);
+	vp->v_type = VDIR;
+	vp->v_data = arg;
+}
+
+static int
+zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags,
+    struct vnode **vpp)
+{
+	void *node;
+	int err;
+
+	node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir;
+	err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root,
+	    zfsctl_common_vnode_setup, node, vpp);
+	return (err);
+}
+
+static int
+zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags,
+    struct vnode **vpp)
+{
+	void *node;
+	int err;
+
+	node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir->snapdir;
+	err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs",
+	    &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp);
+	return (err);
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the vnode and return it.
+ */
+int
+zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp)
+{
+	int error;
+
+	error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp);
+	return (error);
+}
+
+/*
+ * Common open routine.  Disallow any write access.
+ */
+static int
+zfsctl_common_open(struct vop_open_args *ap)
+{
+	int flags = ap->a_mode;
+
+	if (flags & FWRITE)
+		return (SET_ERROR(EACCES));
+
+	return (0);
+}
+
+/*
+ * Common close routine.  Nothing to do here.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_close(struct vop_close_args *ap)
+{
+	return (0);
+}
+
+/*
+ * Common access routine.  Disallow writes.
+ */
+static int
+zfsctl_common_access(struct vop_access_args *ap)
+{
+	accmode_t accmode = ap->a_accmode;
+
+	if (accmode & VWRITE)
+		return (SET_ERROR(EACCES));
+	return (0);
+}
+
+/*
+ * Common getattr function.  Fill in basic information.
+ */
+static void
+zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+{
+	timestruc_t	now;
+	sfs_node_t *node;
+
+	node = vp->v_data;
+
+	vap->va_uid = 0;
+	vap->va_gid = 0;
+	vap->va_rdev = 0;
+	/*
+	 * We are a purely virtual object, so we have no
+	 * blocksize or allocated blocks.
+	 */
+	vap->va_blksize = 0;
+	vap->va_nblocks = 0;
+	vap->va_seq = 0;
+	vn_fsid(vp, vap);
+	vap->va_mode = zfsctl_ctldir_mode;
+	vap->va_type = VDIR;
+	/*
+	 * We live in the now (for atime).
+	 */
+	gethrestime(&now);
+	vap->va_atime = now;
+	/* FreeBSD: Reset chflags(2) flags. */
+	vap->va_flags = 0;
+
+	vap->va_nodeid = node->sn_id;
+
+	/* At least '.' and '..'. */
+	vap->va_nlink = 2;
+}
+
+#ifndef _OPENSOLARIS_SYS_VNODE_H_
+struct vop_fid_args {
+	struct vnode *a_vp;
+	struct fid *a_fid;
+};
+#endif
+
+static int
+zfsctl_common_fid(struct vop_fid_args *ap)
+{
+	vnode_t		*vp = ap->a_vp;
+	fid_t		*fidp = (void *)ap->a_fid;
+	sfs_node_t	*node = vp->v_data;
+	uint64_t	object = node->sn_id;
+	zfid_short_t	*zfid;
+	int		i;
+
+	zfid = (zfid_short_t *)fidp;
+	zfid->zf_len = SHORT_FID_LEN;
+
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	/* .zfs nodes always have a generation number of 0 */
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = 0;
+
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_reclaim_args {
+	struct vnode *a_vp;
+	struct thread *a_td;
+};
+#endif
+
+static int
+zfsctl_common_reclaim(struct vop_reclaim_args *ap)
+{
+	vnode_t *vp = ap->a_vp;
+
+	(void) sfs_reclaim_vnode(vp);
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_print_args {
+	struct vnode *a_vp;
+};
+#endif
+
+static int
+zfsctl_common_print(struct vop_print_args *ap)
+{
+	sfs_print_node(ap->a_vp->v_data);
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getattr_args {
+	struct vnode *a_vp;
+	struct vattr *a_vap;
+	struct ucred *a_cred;
+};
+#endif
+
+/*
+ * Get root directory attributes.
+ */
+static int
+zfsctl_root_getattr(struct vop_getattr_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct vattr *vap = ap->a_vap;
+	zfsctl_root_t *node = vp->v_data;
+
+	zfsctl_common_getattr(vp, vap);
+	vap->va_ctime = node->cmtime;
+	vap->va_mtime = vap->va_ctime;
+	vap->va_birthtime = vap->va_ctime;
+	vap->va_nlink += 1; /* snapdir */
+	vap->va_size = vap->va_nlink;
+	return (0);
+}
+
+/*
+ * When we lookup "." we still can be asked to lock it
+ * differently, can't we?
+ */
+static int
+zfsctl_relock_dot(vnode_t *dvp, int ltype)
+{
+	vref(dvp);
+	if (ltype != VOP_ISLOCKED(dvp)) {
+		if (ltype == LK_EXCLUSIVE)
+			vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+		else /* if (ltype == LK_SHARED) */
+			vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+		/* Relock for the "." case may left us with reclaimed vnode. */
+		if (VN_IS_DOOMED(dvp)) {
+			vrele(dvp);
+			return (SET_ERROR(ENOENT));
+		}
+	}
+	return (0);
+}
+
+/*
+ * Special case the handling of "..".
+ */
+static int
+zfsctl_root_lookup(struct vop_lookup_args *ap)
+{
+	struct componentname *cnp = ap->a_cnp;
+	vnode_t *dvp = ap->a_dvp;
+	vnode_t **vpp = ap->a_vpp;
+	int flags = ap->a_cnp->cn_flags;
+	int lkflags = ap->a_cnp->cn_lkflags;
+	int nameiop = ap->a_cnp->cn_nameiop;
+	int err;
+
+	ASSERT(dvp->v_type == VDIR);
+
+	if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+		return (SET_ERROR(ENOTSUP));
+
+	if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+		err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+		if (err == 0)
+			*vpp = dvp;
+	} else if ((flags & ISDOTDOT) != 0) {
+		err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL,
+		    lkflags, vpp);
+	} else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) {
+		err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp);
+	} else {
+		err = SET_ERROR(ENOENT);
+	}
+	if (err != 0)
+		*vpp = NULL;
+	return (err);
+}
+
+static int
+zfsctl_root_readdir(struct vop_readdir_args *ap)
+{
+	struct dirent entry;
+	vnode_t *vp = ap->a_vp;
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	zfsctl_root_t *node = vp->v_data;
+	uio_t *uio = ap->a_uio;
+	int *eofp = ap->a_eofflag;
+	off_t dots_offset;
+	int error;
+
+	ASSERT(vp->v_type == VDIR);
+
+	error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio,
+	    &dots_offset);
+	if (error != 0) {
+		if (error == ENAMETOOLONG) /* ran out of destination space */
+			error = 0;
+		return (error);
+	}
+	if (uio->uio_offset != dots_offset)
+		return (SET_ERROR(EINVAL));
+
+	CTASSERT(sizeof (node->snapdir->sn_name) <= sizeof (entry.d_name));
+	entry.d_fileno = node->snapdir->sn_id;
+	entry.d_type = DT_DIR;
+	strcpy(entry.d_name, node->snapdir->sn_name);
+	entry.d_namlen = strlen(entry.d_name);
+	entry.d_reclen = sizeof (entry);
+	error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+	if (error != 0) {
+		if (error == ENAMETOOLONG)
+			error = 0;
+		return (SET_ERROR(error));
+	}
+	if (eofp != NULL)
+		*eofp = 1;
+	return (0);
+}
+
+static int
+zfsctl_root_vptocnp(struct vop_vptocnp_args *ap)
+{
+	static const char dotzfs_name[4] = ".zfs";
+	vnode_t *dvp;
+	int error;
+
+	if (*ap->a_buflen < sizeof (dotzfs_name))
+		return (SET_ERROR(ENOMEM));
+
+	error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL,
+	    LK_SHARED, &dvp);
+	if (error != 0)
+		return (SET_ERROR(error));
+
+	VOP_UNLOCK1(dvp);
+	*ap->a_vpp = dvp;
+	*ap->a_buflen -= sizeof (dotzfs_name);
+	bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name));
+	return (0);
+}
+
+static int
+zfsctl_common_pathconf(struct vop_pathconf_args *ap)
+{
+	/*
+	 * We care about ACL variables so that user land utilities like ls
+	 * can display them correctly.  Since the ctldir's st_dev is set to be
+	 * the same as the parent dataset, we must support all variables that
+	 * it supports.
+	 */
+	switch (ap->a_name) {
+	case _PC_LINK_MAX:
+		*ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX);
+		return (0);
+
+	case _PC_FILESIZEBITS:
+		*ap->a_retval = 64;
+		return (0);
+
+	case _PC_MIN_HOLE_SIZE:
+		*ap->a_retval = (int)SPA_MINBLOCKSIZE;
+		return (0);
+
+	case _PC_ACL_EXTENDED:
+		*ap->a_retval = 0;
+		return (0);
+
+	case _PC_ACL_NFS4:
+		*ap->a_retval = 1;
+		return (0);
+
+	case _PC_ACL_PATH_MAX:
+		*ap->a_retval = ACL_MAX_ENTRIES;
+		return (0);
+
+	case _PC_NAME_MAX:
+		*ap->a_retval = NAME_MAX;
+		return (0);
+
+	default:
+		return (vop_stdpathconf(ap));
+	}
+}
+
+/*
+ * Returns a trivial ACL
+ */
+static int
+zfsctl_common_getacl(struct vop_getacl_args *ap)
+{
+	int i;
+
+	if (ap->a_type != ACL_TYPE_NFS4)
+		return (EINVAL);
+
+	acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0);
+	/*
+	 * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify
+	 * attributes.  That is not the case for the ctldir, so we must clear
+	 * those bits.  We also must clear ACL_READ_NAMED_ATTRS, because xattrs
+	 * aren't supported by the ctldir.
+	 */
+	for (i = 0; i < ap->a_aclp->acl_cnt; i++) {
+		struct acl_entry *entry;
+		entry = &(ap->a_aclp->acl_entry[i]);
+		entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER |
+		    ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS |
+		    ACL_READ_NAMED_ATTRS);
+	}
+
+	return (0);
+}
+
+static struct vop_vector zfsctl_ops_root = {
+	.vop_default =	&default_vnodeops,
+	.vop_open =	zfsctl_common_open,
+	.vop_close =	zfsctl_common_close,
+	.vop_ioctl =	VOP_EINVAL,
+	.vop_getattr =	zfsctl_root_getattr,
+	.vop_access =	zfsctl_common_access,
+	.vop_readdir =	zfsctl_root_readdir,
+	.vop_lookup =	zfsctl_root_lookup,
+	.vop_inactive =	VOP_NULL,
+	.vop_reclaim =	zfsctl_common_reclaim,
+	.vop_fid =	zfsctl_common_fid,
+	.vop_print =	zfsctl_common_print,
+	.vop_vptocnp =	zfsctl_root_vptocnp,
+	.vop_pathconf =	zfsctl_common_pathconf,
+	.vop_getacl =	zfsctl_common_getacl,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root);
+
+static int
+zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+{
+	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+
+	dmu_objset_name(os, zname);
+	if (strlen(zname) + 1 + strlen(name) >= len)
+		return (SET_ERROR(ENAMETOOLONG));
+	(void) strcat(zname, "@");
+	(void) strcat(zname, name);
+	return (0);
+}
+
+static int
+zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id)
+{
+	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+	int err;
+
+	err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id);
+	return (err);
+}
+
+/*
+ * Given a vnode get a root vnode of a filesystem mounted on top of
+ * the vnode, if any.  The root vnode is referenced and locked.
+ * If no filesystem is mounted then the orinal vnode remains referenced
+ * and locked.  If any error happens the orinal vnode is unlocked and
+ * released.
+ */
+static int
+zfsctl_mounted_here(vnode_t **vpp, int flags)
+{
+	struct mount *mp;
+	int err;
+
+	ASSERT_VOP_LOCKED(*vpp, __func__);
+	ASSERT3S((*vpp)->v_type, ==, VDIR);
+
+	if ((mp = (*vpp)->v_mountedhere) != NULL) {
+		err = vfs_busy(mp, 0);
+		KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err));
+		KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint"));
+		vput(*vpp);
+		err = VFS_ROOT(mp, flags, vpp);
+		vfs_unbusy(mp);
+		return (err);
+	}
+	return (EJUSTRETURN);
+}
+
+typedef struct {
+	const char *snap_name;
+	uint64_t    snap_id;
+} snapshot_setup_arg_t;
+
+static void
+zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg)
+{
+	snapshot_setup_arg_t *ssa = arg;
+	sfs_node_t *node;
+
+	ASSERT_VOP_ELOCKED(vp, __func__);
+
+	node = sfs_alloc_node(sizeof (sfs_node_t),
+	    ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id);
+	zfsctl_common_vnode_setup(vp, node);
+
+	/* We have to support recursive locking. */
+	VN_LOCK_AREC(vp);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory.  Try to open the
+ * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
+ * Perform a mount of the associated dataset on top of the vnode.
+ * There are four possibilities:
+ * - the snapshot node and vnode do not exist
+ * - the snapshot vnode is covered by the mounted snapshot
+ * - the snapshot vnode is not covered yet, the mount operation is in progress
+ * - the snapshot vnode is not covered, because the snapshot has been unmounted
+ * The last two states are transient and should be relatively short-lived.
+ */
+static int
+zfsctl_snapdir_lookup(struct vop_lookup_args *ap)
+{
+	vnode_t *dvp = ap->a_dvp;
+	vnode_t **vpp = ap->a_vpp;
+	struct componentname *cnp = ap->a_cnp;
+	char name[NAME_MAX + 1];
+	char fullname[ZFS_MAX_DATASET_NAME_LEN];
+	char *mountpoint;
+	size_t mountpoint_len;
+	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+	uint64_t snap_id;
+	int nameiop = cnp->cn_nameiop;
+	int lkflags = cnp->cn_lkflags;
+	int flags = cnp->cn_flags;
+	int err;
+
+	ASSERT(dvp->v_type == VDIR);
+
+	if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+		return (SET_ERROR(ENOTSUP));
+
+	if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+		err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+		if (err == 0)
+			*vpp = dvp;
+		return (err);
+	}
+	if (flags & ISDOTDOT) {
+		err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags,
+		    vpp);
+		return (err);
+	}
+
+	if (cnp->cn_namelen >= sizeof (name))
+		return (SET_ERROR(ENAMETOOLONG));
+
+	strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
+	err = zfsctl_snapshot_lookup(dvp, name, &snap_id);
+	if (err != 0)
+		return (SET_ERROR(ENOENT));
+
+	for (;;) {
+		snapshot_setup_arg_t ssa;
+
+		ssa.snap_name = name;
+		ssa.snap_id = snap_id;
+		err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR,
+		    snap_id, "zfs", &zfsctl_ops_snapshot,
+		    zfsctl_snapshot_vnode_setup, &ssa, vpp);
+		if (err != 0)
+			return (err);
+
+		/* Check if a new vnode has just been created. */
+		if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE)
+			break;
+
+		/*
+		 * Check if a snapshot is already mounted on top of the vnode.
+		 */
+		err = zfsctl_mounted_here(vpp, lkflags);
+		if (err != EJUSTRETURN)
+			return (err);
+
+		/*
+		 * If the vnode is not covered, then either the mount operation
+		 * is in progress or the snapshot has already been unmounted
+		 * but the vnode hasn't been inactivated and reclaimed yet.
+		 * We can try to re-use the vnode in the latter case.
+		 */
+		VI_LOCK(*vpp);
+		if (((*vpp)->v_iflag & VI_MOUNT) == 0) {
+			/*
+			 * Upgrade to exclusive lock in order to:
+			 * - avoid race conditions
+			 * - satisfy the contract of mount_snapshot()
+			 */
+			err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK);
+			if (err == 0)
+				break;
+		} else {
+			VI_UNLOCK(*vpp);
+		}
+
+		/*
+		 * In this state we can loop on uncontested locks and starve
+		 * the thread doing the lengthy, non-trivial mount operation.
+		 * So, yield to prevent that from happening.
+		 */
+		vput(*vpp);
+		kern_yield(PRI_USER);
+	}
+
+	VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof (fullname), fullname));
+
+	mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
+	    strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1;
+	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
+	(void) snprintf(mountpoint, mountpoint_len,
+	    "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
+	    dvp->v_vfsp->mnt_stat.f_mntonname, name);
+
+	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
+	kmem_free(mountpoint, mountpoint_len);
+	if (err == 0) {
+		/*
+		 * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
+		 *
+		 * This is where we lie about our v_vfsp in order to
+		 * make .zfs/snapshot/<snapname> accessible over NFS
+		 * without requiring manual mounts of <snapname>.
+		 */
+		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
+		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
+
+		/* Clear the root flag (set via VFS_ROOT) as well. */
+		(*vpp)->v_vflag &= ~VV_ROOT;
+	}
+
+	if (err != 0)
+		*vpp = NULL;
+	return (err);
+}
+
+static int
+zfsctl_snapdir_readdir(struct vop_readdir_args *ap)
+{
+	char snapname[ZFS_MAX_DATASET_NAME_LEN];
+	struct dirent entry;
+	vnode_t *vp = ap->a_vp;
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	uio_t *uio = ap->a_uio;
+	int *eofp = ap->a_eofflag;
+	off_t dots_offset;
+	int error;
+
+	ASSERT(vp->v_type == VDIR);
+
+	error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio,
+	    &dots_offset);
+	if (error != 0) {
+		if (error == ENAMETOOLONG) /* ran out of destination space */
+			error = 0;
+		return (error);
+	}
+
+	ZFS_ENTER(zfsvfs);
+	for (;;) {
+		uint64_t cookie;
+		uint64_t id;
+
+		cookie = uio->uio_offset - dots_offset;
+
+		dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname),
+		    snapname, &id, &cookie, NULL);
+		dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		if (error != 0) {
+			if (error == ENOENT) {
+				if (eofp != NULL)
+					*eofp = 1;
+				error = 0;
+			}
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+
+		entry.d_fileno = id;
+		entry.d_type = DT_DIR;
+		strcpy(entry.d_name, snapname);
+		entry.d_namlen = strlen(entry.d_name);
+		entry.d_reclen = sizeof (entry);
+		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+		if (error != 0) {
+			if (error == ENAMETOOLONG)
+				error = 0;
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(error));
+		}
+		uio->uio_offset = cookie + dots_offset;
+	}
+	/* NOTREACHED */
+}
+
+static int
+zfsctl_snapdir_getattr(struct vop_getattr_args *ap)
+{
+	vnode_t *vp = ap->a_vp;
+	vattr_t *vap = ap->a_vap;
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	dsl_dataset_t *ds;
+	uint64_t snap_count;
+	int err;
+
+	ZFS_ENTER(zfsvfs);
+	ds = dmu_objset_ds(zfsvfs->z_os);
+	zfsctl_common_getattr(vp, vap);
+	vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+	vap->va_mtime = vap->va_ctime;
+	vap->va_birthtime = vap->va_ctime;
+	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
+		err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset,
+		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+		if (err != 0) {
+			ZFS_EXIT(zfsvfs);
+			return (err);
+		}
+		vap->va_nlink += snap_count;
+	}
+	vap->va_size = vap->va_nlink;
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+static struct vop_vector zfsctl_ops_snapdir = {
+	.vop_default =	&default_vnodeops,
+	.vop_open =	zfsctl_common_open,
+	.vop_close =	zfsctl_common_close,
+	.vop_getattr =	zfsctl_snapdir_getattr,
+	.vop_access =	zfsctl_common_access,
+	.vop_readdir =	zfsctl_snapdir_readdir,
+	.vop_lookup =	zfsctl_snapdir_lookup,
+	.vop_reclaim =	zfsctl_common_reclaim,
+	.vop_fid =	zfsctl_common_fid,
+	.vop_print =	zfsctl_common_print,
+	.vop_pathconf =	zfsctl_common_pathconf,
+	.vop_getacl =	zfsctl_common_getacl,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir);
+
+
+static int
+zfsctl_snapshot_inactive(struct vop_inactive_args *ap)
+{
+	vnode_t *vp = ap->a_vp;
+
+	VERIFY(vrecycle(vp) == 1);
+	return (0);
+}
+
+static int
+zfsctl_snapshot_reclaim(struct vop_reclaim_args *ap)
+{
+	vnode_t *vp = ap->a_vp;
+	void *data = vp->v_data;
+
+	sfs_reclaim_vnode(vp);
+	sfs_destroy_node(data);
+	return (0);
+}
+
+static int
+zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
+{
+	struct mount *mp;
+	vnode_t *dvp;
+	vnode_t *vp;
+	sfs_node_t *node;
+	size_t len;
+	int locked;
+	int error;
+
+	vp = ap->a_vp;
+	node = vp->v_data;
+	len = strlen(node->sn_name);
+	if (*ap->a_buflen < len)
+		return (SET_ERROR(ENOMEM));
+
+	/*
+	 * Prevent unmounting of the snapshot while the vnode lock
+	 * is not held.  That is not strictly required, but allows
+	 * us to assert that an uncovered snapshot vnode is never
+	 * "leaked".
+	 */
+	mp = vp->v_mountedhere;
+	if (mp == NULL)
+		return (SET_ERROR(ENOENT));
+	error = vfs_busy(mp, 0);
+	KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error));
+
+	/*
+	 * We can vput the vnode as we can now depend on the reference owned
+	 * by the busied mp.  But we also need to hold the vnode, because
+	 * the reference may go after vfs_unbusy() which has to be called
+	 * before we can lock the vnode again.
+	 */
+	locked = VOP_ISLOCKED(vp);
+#if __FreeBSD_version >= 1300045
+	enum vgetstate vs = vget_prep(vp);
+#else
+	vhold(vp);
+#endif
+	vput(vp);
+
+	/* Look up .zfs/snapshot, our parent. */
+	error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp);
+	if (error == 0) {
+		VOP_UNLOCK1(dvp);
+		*ap->a_vpp = dvp;
+		*ap->a_buflen -= len;
+		bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len);
+	}
+	vfs_unbusy(mp);
+#if __FreeBSD_version >= 1300045
+	vget_finish(vp, locked | LK_RETRY, vs);
+#else
+	vget(vp, locked | LK_VNHELD | LK_RETRY, curthread);
+#endif
+	return (error);
+}
+
+/*
+ * These VP's should never see the light of day.  They should always
+ * be covered.
+ */
+static struct vop_vector zfsctl_ops_snapshot = {
+	.vop_default =		NULL, /* ensure very restricted access */
+	.vop_inactive =		zfsctl_snapshot_inactive,
+#if __FreeBSD_version >= 1300045
+	.vop_need_inactive = vop_stdneed_inactive,
+#endif
+	.vop_reclaim =		zfsctl_snapshot_reclaim,
+	.vop_vptocnp =		zfsctl_snapshot_vptocnp,
+	.vop_lock1 =		vop_stdlock,
+	.vop_unlock =		vop_stdunlock,
+	.vop_islocked =		vop_stdislocked,
+	.vop_advlockpurge =	vop_stdadvlockpurge, /* called by vgone */
+	.vop_print =		zfsctl_common_print,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot);
+
+int
+zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+{
+	zfsvfs_t *zfsvfs __unused = vfsp->vfs_data;
+	vnode_t *vp;
+	int error;
+
+	ASSERT(zfsvfs->z_ctldir != NULL);
+	*zfsvfsp = NULL;
+	error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+	    ZFSCTL_INO_SNAPDIR, objsetid, &vp);
+	if (error == 0 && vp != NULL) {
+		/*
+		 * XXX Probably need to at least reference, if not busy, the mp.
+		 */
+		if (vp->v_mountedhere != NULL)
+			*zfsvfsp = vp->v_mountedhere->mnt_data;
+		vput(vp);
+	}
+	if (*zfsvfsp == NULL)
+		return (SET_ERROR(EINVAL));
+	return (0);
+}
+
+/*
+ * Unmount any snapshots for the given filesystem.  This is called from
+ * zfs_umount() - if we have a ctldir, then go through and unmount all the
+ * snapshots.
+ */
+int
+zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+{
+	char snapname[ZFS_MAX_DATASET_NAME_LEN];
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	struct mount *mp;
+	vnode_t *vp;
+	uint64_t cookie;
+	int error;
+
+	ASSERT(zfsvfs->z_ctldir != NULL);
+
+	cookie = 0;
+	for (;;) {
+		uint64_t id;
+
+		dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname),
+		    snapname, &id, &cookie, NULL);
+		dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		if (error != 0) {
+			if (error == ENOENT)
+				error = 0;
+			break;
+		}
+
+		for (;;) {
+			error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+			    ZFSCTL_INO_SNAPDIR, id, &vp);
+			if (error != 0 || vp == NULL)
+				break;
+
+			mp = vp->v_mountedhere;
+
+			/*
+			 * v_mountedhere being NULL means that the
+			 * (uncovered) vnode is in a transient state
+			 * (mounting or unmounting), so loop until it
+			 * settles down.
+			 */
+			if (mp != NULL)
+				break;
+			vput(vp);
+		}
+		if (error != 0)
+			break;
+		if (vp == NULL)
+			continue;	/* no mountpoint, nothing to do */
+
+		/*
+		 * The mount-point vnode is kept locked to avoid spurious EBUSY
+		 * from a concurrent umount.
+		 * The vnode lock must have recursive locking enabled.
+		 */
+		vfs_ref(mp);
+		error = dounmount(mp, fflags, curthread);
+		KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1,
+		    ("extra references after unmount"));
+		vput(vp);
+		if (error != 0)
+			break;
+	}
+	KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0,
+	    ("force unmounting failed"));
+	return (error);
+}
+
+int
+zfsctl_snapshot_unmount(char *snapname, int flags __unused)
+{
+	vfs_t *vfsp = NULL;
+	zfsvfs_t *zfsvfs = NULL;
+
+	if (strchr(snapname, '@') == NULL)
+		return (0);
+
+	int err = getzfsvfs(snapname, &zfsvfs);
+	if (err != 0) {
+		ASSERT3P(zfsvfs, ==, NULL);
+		return (0);
+	}
+	vfsp = zfsvfs->z_vfs;
+
+	ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
+
+	vfs_ref(vfsp);
+	vfs_unbusy(vfsp);
+	return (dounmount(vfsp, MS_FORCE, curthread));
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_debug.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_debug.c
new file mode 100644
index 000000000000..74742ad3669f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_debug.c
@@ -0,0 +1,251 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/kstat.h>
+
+typedef struct zfs_dbgmsg {
+	list_node_t zdm_node;
+	time_t zdm_timestamp;
+	int zdm_size;
+	char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size = 0;
+kmutex_t zfs_dbgmsgs_lock;
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
+kstat_t *zfs_dbgmsg_kstat;
+
+/*
+ * Internal ZFS debug messages are enabled by default.
+ *
+ * # Print debug messages
+ * dtrace -n 'zfs-dbgmsg { print(stringof(arg0)); }'
+ *
+ * # Disable the kernel debug message log.
+ * sysctl vfs.zfs.dbgmsg_enable=0
+ */
+int zfs_dbgmsg_enable = 1;
+
+static int
+zfs_dbgmsg_headers(char *buf, size_t size)
+{
+	(void) snprintf(buf, size, "%-12s %-8s\n", "timestamp", "message");
+
+	return (0);
+}
+
+static int
+zfs_dbgmsg_data(char *buf, size_t size, void *data)
+{
+	zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)data;
+
+	(void) snprintf(buf, size, "%-12llu %-s\n",
+	    (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg);
+
+	return (0);
+}
+
+static void *
+zfs_dbgmsg_addr(kstat_t *ksp, loff_t n)
+{
+	zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)ksp->ks_private;
+
+	ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock));
+
+	if (n == 0)
+		ksp->ks_private = list_head(&zfs_dbgmsgs);
+	else if (zdm)
+		ksp->ks_private = list_next(&zfs_dbgmsgs, zdm);
+
+	return (ksp->ks_private);
+}
+
+static void
+zfs_dbgmsg_purge(int max_size)
+{
+	zfs_dbgmsg_t *zdm;
+	int size;
+
+	ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock));
+
+	while (zfs_dbgmsg_size > max_size) {
+		zdm = list_remove_head(&zfs_dbgmsgs);
+		if (zdm == NULL)
+			return;
+
+		size = zdm->zdm_size;
+		kmem_free(zdm, size);
+		zfs_dbgmsg_size -= size;
+	}
+}
+
+static int
+zfs_dbgmsg_update(kstat_t *ksp, int rw)
+{
+	if (rw == KSTAT_WRITE)
+		zfs_dbgmsg_purge(0);
+
+	return (0);
+}
+
+void
+zfs_dbgmsg_init(void)
+{
+	list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
+	    offsetof(zfs_dbgmsg_t, zdm_node));
+	mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	zfs_dbgmsg_kstat = kstat_create("zfs", 0, "dbgmsg", "misc",
+	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+	if (zfs_dbgmsg_kstat) {
+		zfs_dbgmsg_kstat->ks_lock = &zfs_dbgmsgs_lock;
+		zfs_dbgmsg_kstat->ks_ndata = UINT32_MAX;
+		zfs_dbgmsg_kstat->ks_private = NULL;
+		zfs_dbgmsg_kstat->ks_update = zfs_dbgmsg_update;
+		kstat_set_raw_ops(zfs_dbgmsg_kstat, zfs_dbgmsg_headers,
+		    zfs_dbgmsg_data, zfs_dbgmsg_addr);
+		kstat_install(zfs_dbgmsg_kstat);
+	}
+}
+
+void
+zfs_dbgmsg_fini(void)
+{
+	if (zfs_dbgmsg_kstat)
+		kstat_delete(zfs_dbgmsg_kstat);
+	/*
+	 * TODO - decide how to make this permanent
+	 */
+#ifdef _KERNEL
+	mutex_enter(&zfs_dbgmsgs_lock);
+	zfs_dbgmsg_purge(0);
+	mutex_exit(&zfs_dbgmsgs_lock);
+	mutex_destroy(&zfs_dbgmsgs_lock);
+#endif
+}
+
+void
+__zfs_dbgmsg(char *buf)
+{
+	zfs_dbgmsg_t *zdm;
+	int size;
+
+	DTRACE_PROBE1(zfs__dbgmsg, char *, buf);
+
+	size = sizeof (zfs_dbgmsg_t) + strlen(buf);
+	zdm = kmem_zalloc(size, KM_SLEEP);
+	zdm->zdm_size = size;
+	zdm->zdm_timestamp = gethrestime_sec();
+	strcpy(zdm->zdm_msg, buf);
+
+	mutex_enter(&zfs_dbgmsgs_lock);
+	list_insert_tail(&zfs_dbgmsgs, zdm);
+	zfs_dbgmsg_size += size;
+	zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
+	mutex_exit(&zfs_dbgmsgs_lock);
+}
+
+void
+__set_error(const char *file, const char *func, int line, int err)
+{
+	/*
+	 * To enable this:
+	 *
+	 * $ echo 512 >/sys/module/zfs/parameters/zfs_flags
+	 */
+	if (zfs_flags & ZFS_DEBUG_SET_ERROR)
+		__dprintf(B_FALSE, file, func, line, "error %lu", err);
+}
+
+#ifdef _KERNEL
+void
+__dprintf(boolean_t dprint, const char *file, const char *func,
+    int line, const char *fmt, ...)
+{
+	const char *newfile;
+	va_list adx;
+	size_t size;
+	char *buf;
+	char *nl;
+	int i;
+
+	size = 1024;
+	buf = kmem_alloc(size, KM_SLEEP);
+
+	/*
+	 * Get rid of annoying prefix to filename.
+	 */
+	newfile = strrchr(file, '/');
+	if (newfile != NULL) {
+		newfile = newfile + 1; /* Get rid of leading / */
+	} else {
+		newfile = file;
+	}
+
+	i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func);
+
+	if (i < size) {
+		va_start(adx, fmt);
+		(void) vsnprintf(buf + i, size - i, fmt, adx);
+		va_end(adx);
+	}
+
+	/*
+	 * Get rid of trailing newline.
+	 */
+	nl = strrchr(buf, '\n');
+	if (nl != NULL)
+		*nl = '\0';
+
+	__zfs_dbgmsg(buf);
+
+	kmem_free(buf, size);
+}
+
+#else
+
+void
+zfs_dbgmsg_print(const char *tag)
+{
+	zfs_dbgmsg_t *zdm;
+
+	(void) printf("ZFS_DBGMSG(%s):\n", tag);
+	mutex_enter(&zfs_dbgmsgs_lock);
+	for (zdm = list_head(&zfs_dbgmsgs); zdm;
+	    zdm = list_next(&zfs_dbgmsgs, zdm))
+		(void) printf("%s\n", zdm->zdm_msg);
+	mutex_exit(&zfs_dbgmsgs_lock);
+}
+#endif /* _KERNEL */
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, dbgmsg_enable, INT, ZMOD_RW,
+    "Enable ZFS debug message log");
+
+ZFS_MODULE_PARAM(zfs, zfs_, dbgmsg_maxsize, INT, ZMOD_RW,
+    "Maximum ZFS debug log size");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
new file mode 100644
index 000000000000..4b1f4a8832e0
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
@@ -0,0 +1,967 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/extdirent.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/sunddi.h>
+#include <sys/random.h>
+#include <sys/policy.h>
+#include <sys/condvar.h>
+#include <sys/callb.h>
+#include <sys/smp.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+
+#include <sys/ccompat.h>
+
+/*
+ * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups
+ * of names after deciding which is the appropriate lookup interface.
+ */
+static int
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
+    matchtype_t mt, uint64_t *zoid)
+{
+	int error;
+
+	if (zfsvfs->z_norm) {
+
+		/*
+		 * In the non-mixed case we only expect there would ever
+		 * be one match, but we need to use the normalizing lookup.
+		 */
+		error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
+		    zoid, mt, NULL, 0, NULL);
+	} else {
+		error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
+	}
+	*zoid = ZFS_DIRENT_OBJ(*zoid);
+
+	return (error);
+}
+
+/*
+ * Look up a directory entry under a locked vnode.
+ * dvp being locked gives us a guarantee that there are no concurrent
+ * modification of the directory and, thus, if a node can be found in
+ * the directory, then it must not be unlinked.
+ *
+ * Input arguments:
+ *	dzp	- znode for directory
+ *	name	- name of entry to lock
+ *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
+ *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
+ *		  ZXATTR: we want dzp's xattr directory
+ *
+ * Output arguments:
+ *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ */
+int
+zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
+{
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	znode_t		*zp;
+	matchtype_t	mt = 0;
+	uint64_t	zoid;
+	int		error = 0;
+
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+
+	*zpp = NULL;
+
+	/*
+	 * Verify that we are not trying to lock '.', '..', or '.zfs'
+	 */
+	if (name[0] == '.' &&
+	    (((name[1] == '\0') || (name[1] == '.' && name[2] == '\0')) ||
+	    (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)))
+		return (SET_ERROR(EEXIST));
+
+	/*
+	 * Case sensitivity and normalization preferences are set when
+	 * the file system is created.  These are stored in the
+	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
+	 * affect how we perform zap lookups.
+	 *
+	 * When matching we may need to normalize & change case according to
+	 * FS settings.
+	 *
+	 * Note that a normalized match is necessary for a case insensitive
+	 * filesystem when the lookup request is not exact because normalization
+	 * can fold case independent of normalizing code point sequences.
+	 *
+	 * See the table above zfs_dropname().
+	 */
+	if (zfsvfs->z_norm != 0) {
+		mt = MT_NORMALIZE;
+
+		/*
+		 * Determine if the match needs to honor the case specified in
+		 * lookup, and if so keep track of that so that during
+		 * normalization we don't fold case.
+		 */
+		if (zfsvfs->z_case == ZFS_CASE_MIXED) {
+			mt |= MT_MATCH_CASE;
+		}
+	}
+
+	/*
+	 * Only look in or update the DNLC if we are looking for the
+	 * name on a file system that does not require normalization
+	 * or case folding.  We can also look there if we happen to be
+	 * on a non-normalizing, mixed sensitivity file system IF we
+	 * are looking for the exact name.
+	 *
+	 * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
+	 * because in that case MT_EXACT and MT_FIRST should produce exactly
+	 * the same result.
+	 */
+
+	if (dzp->z_unlinked && !(flag & ZXATTR))
+		return (ENOENT);
+	if (flag & ZXATTR) {
+		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+		    sizeof (zoid));
+		if (error == 0)
+			error = (zoid == 0 ? ENOENT : 0);
+	} else {
+		error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid);
+	}
+	if (error) {
+		if (error != ENOENT || (flag & ZEXISTS)) {
+			return (error);
+		}
+	} else {
+		if (flag & ZNEW) {
+			return (SET_ERROR(EEXIST));
+		}
+		error = zfs_zget(zfsvfs, zoid, &zp);
+		if (error)
+			return (error);
+		ASSERT(!zp->z_unlinked);
+		*zpp = zp;
+	}
+
+	return (0);
+}
+
+static int
+zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
+{
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	znode_t *zp;
+	uint64_t parent;
+	int error;
+
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
+
+	if (dzp->z_unlinked)
+		return (ENOENT);
+
+	if ((error = sa_lookup(dzp->z_sa_hdl,
+	    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+		return (error);
+
+	error = zfs_zget(zfsvfs, parent, &zp);
+	if (error == 0)
+		*zpp = zp;
+	return (error);
+}
+
+int
+zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
+{
+	zfsvfs_t *zfsvfs __unused = dzp->z_zfsvfs;
+	znode_t *zp = NULL;
+	int error = 0;
+
+#ifdef ZFS_DEBUG
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
+#endif
+	if (dzp->z_unlinked)
+		return (SET_ERROR(ENOENT));
+
+	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+		*zpp = dzp;
+	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+		error = zfs_dd_lookup(dzp, &zp);
+		if (error == 0)
+			*zpp = zp;
+	} else {
+		error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
+		if (error == 0) {
+			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+			*zpp = zp;
+		}
+	}
+	return (error);
+}
+
+/*
+ * unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating.  We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem).  So on the small
+ * chance that the nlink list is using a fat zap (ie. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error.  On a
+ * nondebug system, this will result in the space being leaked.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	ASSERT(zp->z_unlinked);
+	ASSERT(zp->z_links == 0);
+
+	VERIFY3U(0, ==,
+	    zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+	dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+	zap_cursor_t	zc;
+	zap_attribute_t zap;
+	dmu_object_info_t doi;
+	znode_t		*zp;
+	dmu_tx_t	*tx;
+	int		error;
+
+	/*
+	 * Iterate over the contents of the unlinked set.
+	 */
+	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+	    zap_cursor_retrieve(&zc, &zap) == 0;
+	    zap_cursor_advance(&zc)) {
+
+		/*
+		 * See what kind of object we have in list
+		 */
+
+		error = dmu_object_info(zfsvfs->z_os,
+		    zap.za_first_integer, &doi);
+		if (error != 0)
+			continue;
+
+		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+		/*
+		 * We need to re-mark these list entries for deletion,
+		 * so we pull them back into core and set zp->z_unlinked.
+		 */
+		error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+		/*
+		 * We may pick up znodes that are already marked for deletion.
+		 * This could happen during the purge of an extended attribute
+		 * directory.  All we need to do is skip over them, since they
+		 * are already in the system marked z_unlinked.
+		 */
+		if (error != 0)
+			continue;
+
+		vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
+
+		/*
+		 * Due to changes in zfs_rmnode we need to make sure the
+		 * link count is set to zero here.
+		 */
+		if (zp->z_links != 0) {
+			tx = dmu_tx_create(zfsvfs->z_os);
+			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+			error = dmu_tx_assign(tx, TXG_WAIT);
+			if (error != 0) {
+				dmu_tx_abort(tx);
+				vput(ZTOV(zp));
+				continue;
+			}
+			zp->z_links = 0;
+			VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+			    &zp->z_links, sizeof (zp->z_links), tx));
+			dmu_tx_commit(tx);
+		}
+
+		zp->z_unlinked = B_TRUE;
+		vput(ZTOV(zp));
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * Delete the entire contents of a directory.  Return a count
+ * of the number of entries that could not be deleted. If we encounter
+ * an error, return a count of at least one so that the directory stays
+ * in the unlinked set.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ *	so there is no need to lock its entries before deletion.
+ *	Also, it assumes the directory contents is *only* regular
+ *	files.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+	zap_cursor_t	zc;
+	zap_attribute_t	zap;
+	znode_t		*xzp;
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	int skipped = 0;
+	int error;
+
+	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+	    (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+	    zap_cursor_advance(&zc)) {
+		error = zfs_zget(zfsvfs,
+		    ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
+		if (error) {
+			skipped += 1;
+			continue;
+		}
+
+		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
+		ASSERT((ZTOV(xzp)->v_type == VREG) ||
+		    (ZTOV(xzp)->v_type == VLNK));
+
+		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
+		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+		/* Is this really needed ? */
+		zfs_sa_upgrade_txholds(tx, xzp);
+		dmu_tx_mark_netfree(tx);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			vput(ZTOV(xzp));
+			skipped += 1;
+			continue;
+		}
+
+		error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
+		if (error)
+			skipped += 1;
+		dmu_tx_commit(tx);
+
+		vput(ZTOV(xzp));
+	}
+	zap_cursor_fini(&zc);
+	if (error != ENOENT)
+		skipped += 1;
+	return (skipped);
+}
+
+extern taskq_t *zfsvfs_taskq;
+
+void
+zfs_rmnode(znode_t *zp)
+{
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	objset_t	*os = zfsvfs->z_os;
+	dmu_tx_t	*tx;
+	uint64_t	acl_obj;
+	uint64_t	xattr_obj;
+	uint64_t	count;
+	int		error;
+
+	ASSERT(zp->z_links == 0);
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+
+	/*
+	 * If this is an attribute directory, purge its contents.
+	 */
+	if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
+	    (zp->z_pflags & ZFS_XATTR)) {
+		if (zfs_purgedir(zp) != 0) {
+			/*
+			 * Not enough space to delete some xattrs.
+			 * Leave it in the unlinked set.
+			 */
+			zfs_znode_dmu_fini(zp);
+			zfs_znode_free(zp);
+			return;
+		}
+	} else {
+		/*
+		 * Free up all the data in the file.  We don't do this for
+		 * XATTR directories because we need truncate and remove to be
+		 * in the same tx, like in zfs_znode_delete(). Otherwise, if
+		 * we crash here we'll end up with an inconsistent truncated
+		 * zap object in the delete queue.  Note a truncated file is
+		 * harmless since it only contains user data.
+		 */
+		error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
+		if (error) {
+			/*
+			 * Not enough space or we were interrupted by unmount.
+			 * Leave the file in the unlinked set.
+			 */
+			zfs_znode_dmu_fini(zp);
+			zfs_znode_free(zp);
+			return;
+		}
+	}
+
+	/*
+	 * If the file has extended attributes, we're going to unlink
+	 * the xattr dir.
+	 */
+	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+	    &xattr_obj, sizeof (xattr_obj));
+	if (error)
+		xattr_obj = 0;
+
+	acl_obj = zfs_external_acl(zp);
+
+	/*
+	 * Set up the final transaction.
+	 */
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+	if (xattr_obj)
+		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+	if (acl_obj)
+		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+	zfs_sa_upgrade_txholds(tx, zp);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		/*
+		 * Not enough space to delete the file.  Leave it in the
+		 * unlinked set, leaking it until the fs is remounted (at
+		 * which point we'll call zfs_unlinked_drain() to process it).
+		 */
+		dmu_tx_abort(tx);
+		zfs_znode_dmu_fini(zp);
+		zfs_znode_free(zp);
+		return;
+	}
+
+	/*
+	 * FreeBSD's implementation of zfs_zget requires a vnode to back it.
+	 * This means that we could end up calling into getnewvnode while
+	 * calling zfs_rmnode as a result of a prior call to getnewvnode
+	 * trying to clear vnodes out of the cache. If this repeats we can
+	 * recurse enough that we overflow our stack. To avoid this, we
+	 * avoid calling zfs_zget on the xattr znode and instead simply add
+	 * it to the unlinked set and schedule a call to zfs_unlinked_drain.
+	 */
+	if (xattr_obj) {
+		/* Add extended attribute directory to the unlinked set. */
+		VERIFY3U(0, ==,
+		    zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx));
+	}
+
+	mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+	/* Remove this znode from the unlinked set */
+	VERIFY3U(0, ==,
+	    zap_remove_int(os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+	if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
+		cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
+	}
+
+	mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+	dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
+
+	zfs_znode_delete(zp, tx);
+
+	dmu_tx_commit(tx);
+
+	if (xattr_obj) {
+		/*
+		 * We're using the FreeBSD taskqueue API here instead of
+		 * the Solaris taskq API since the FreeBSD API allows for a
+		 * task to be enqueued multiple times but executed once.
+		 */
+		taskqueue_enqueue(zfsvfs_taskq->tq_queue,
+		    &zfsvfs->z_unlinked_drain_task);
+	}
+}
+
+static uint64_t
+zfs_dirent(znode_t *zp, uint64_t mode)
+{
+	uint64_t de = zp->z_id;
+
+	if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
+		de |= IFTODT(mode) << 60;
+	return (de);
+}
+
+/*
+ * Link zp into dzp.  Can only fail if zp has been unlinked.
+ */
+int
+zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	vnode_t *vp = ZTOV(zp);
+	uint64_t value;
+	int zp_is_dir = (vp->v_type == VDIR);
+	sa_bulk_attr_t bulk[5];
+	uint64_t mtime[2], ctime[2];
+	int count = 0;
+	int error;
+
+	if (zfsvfs->z_replay == B_FALSE) {
+		ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+	}
+	if (zp_is_dir) {
+		if (dzp->z_links >= ZFS_LINK_MAX)
+			return (SET_ERROR(EMLINK));
+	}
+	if (!(flag & ZRENAMING)) {
+		if (zp->z_unlinked) {	/* no new links to unlinked zp */
+			ASSERT(!(flag & (ZNEW | ZEXISTS)));
+			return (SET_ERROR(ENOENT));
+		}
+		if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) {
+			return (SET_ERROR(EMLINK));
+		}
+		zp->z_links++;
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+		    &zp->z_links, sizeof (zp->z_links));
+
+	} else {
+		ASSERT(zp->z_unlinked == 0);
+	}
+	value = zfs_dirent(zp, zp->z_mode);
+	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
+	    8, 1, &value, tx);
+
+	/*
+	 * zap_add could fail to add the entry if it exceeds the capacity of the
+	 * leaf-block and zap_leaf_split() failed to help.
+	 * The caller of this routine is responsible for failing the transaction
+	 * which will rollback the SA updates done above.
+	 */
+	if (error != 0) {
+		if (!(flag & ZRENAMING) && !(flag & ZNEW))
+			zp->z_links--;
+		return (error);
+	}
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+	    &dzp->z_id, sizeof (dzp->z_id));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+
+	if (!(flag & ZNEW)) {
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    ctime, sizeof (ctime));
+		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+		    ctime);
+	}
+	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+	ASSERT0(error);
+
+	dzp->z_size++;
+	dzp->z_links += zp_is_dir;
+	count = 0;
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &dzp->z_size, sizeof (dzp->z_size));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+	    &dzp->z_links, sizeof (dzp->z_links));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+	    mtime, sizeof (mtime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+	    ctime, sizeof (ctime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &dzp->z_pflags, sizeof (dzp->z_pflags));
+	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+	ASSERT0(error);
+	return (0);
+}
+
+/*
+ * The match type in the code for this function should conform to:
+ *
+ * ------------------------------------------------------------------------
+ * fs type  | z_norm      | lookup type | match type
+ * ---------|-------------|-------------|----------------------------------
+ * CS !norm | 0           |           0 | 0 (exact)
+ * CS  norm | formX       |           0 | MT_NORMALIZE
+ * CI !norm | upper       |   !ZCIEXACT | MT_NORMALIZE
+ * CI !norm | upper       |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CI  norm | upper|formX |   !ZCIEXACT | MT_NORMALIZE
+ * CI  norm | upper|formX |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper       |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper       |     ZCILOOK | MT_NORMALIZE
+ * CM  norm | upper|formX |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM  norm | upper|formX |     ZCILOOK | MT_NORMALIZE
+ *
+ * Abbreviations:
+ *    CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
+ *    upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
+ *    formX = unicode normalization form set on fs creation
+ */
+static int
+zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag)
+{
+	int error;
+
+	if (zp->z_zfsvfs->z_norm) {
+		matchtype_t mt = MT_NORMALIZE;
+
+		if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) {
+			mt |= MT_MATCH_CASE;
+		}
+
+		error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id,
+		    name, mt, tx);
+	} else {
+		error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx);
+	}
+
+	return (error);
+}
+
+/*
+ * Unlink zp from dzp, and mark zp for deletion if this was the last link.
+ * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
+ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
+ * If it's non-NULL, we use it to indicate whether the znode needs deletion,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag, boolean_t *unlinkedp)
+{
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	vnode_t *vp = ZTOV(zp);
+	int zp_is_dir = (vp->v_type == VDIR);
+	boolean_t unlinked = B_FALSE;
+	sa_bulk_attr_t bulk[5];
+	uint64_t mtime[2], ctime[2];
+	int count = 0;
+	int error;
+
+	if (zfsvfs->z_replay == B_FALSE) {
+		ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+	}
+	if (!(flag & ZRENAMING)) {
+
+		if (zp_is_dir && !zfs_dirempty(zp))
+			return (SET_ERROR(ENOTEMPTY));
+
+		/*
+		 * If we get here, we are going to try to remove the object.
+		 * First try removing the name from the directory; if that
+		 * fails, return the error.
+		 */
+		error = zfs_dropname(dzp, name, zp, tx, flag);
+		if (error != 0) {
+			return (error);
+		}
+
+		if (zp->z_links <= zp_is_dir) {
+			zfs_panic_recover("zfs: link count on vnode %p is %u, "
+			    "should be at least %u", zp->z_vnode,
+			    (int)zp->z_links,
+			    zp_is_dir + 1);
+			zp->z_links = zp_is_dir + 1;
+		}
+		if (--zp->z_links == zp_is_dir) {
+			zp->z_unlinked = B_TRUE;
+			zp->z_links = 0;
+			unlinked = B_TRUE;
+		} else {
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+			    NULL, &ctime, sizeof (ctime));
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+			    NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+			zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+			    ctime);
+		}
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+		    NULL, &zp->z_links, sizeof (zp->z_links));
+		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		count = 0;
+		ASSERT0(error);
+	} else {
+		ASSERT(zp->z_unlinked == 0);
+		error = zfs_dropname(dzp, name, zp, tx, flag);
+		if (error != 0)
+			return (error);
+	}
+
+	dzp->z_size--;		/* one dirent removed */
+	dzp->z_links -= zp_is_dir;	/* ".." link from zp */
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+	    NULL, &dzp->z_links, sizeof (dzp->z_links));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+	    NULL, &dzp->z_size, sizeof (dzp->z_size));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+	    NULL, ctime, sizeof (ctime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+	    NULL, mtime, sizeof (mtime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+	ASSERT0(error);
+
+	if (unlinkedp != NULL)
+		*unlinkedp = unlinked;
+	else if (unlinked)
+		zfs_unlinked_add(zp, tx);
+
+	return (0);
+}
+
+/*
+ * Indicate whether the directory is empty.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+	return (dzp->z_size == 2);
+}
+
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xvpp, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	znode_t *xzp;
+	dmu_tx_t *tx;
+	int error;
+	zfs_acl_ids_t acl_ids;
+	boolean_t fuid_dirtied;
+	uint64_t parent __unused;
+
+	*xvpp = NULL;
+
+	if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
+	    &acl_ids)) != 0)
+		return (error);
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0)) {
+		zfs_acl_ids_free(&acl_ids);
+		return (SET_ERROR(EDQUOT));
+	}
+
+	getnewvnode_reserve_();
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
+		return (error);
+	}
+	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+#ifdef ZFS_DEBUG
+	error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (parent));
+	ASSERT(error == 0 && parent == zp->z_id);
+#endif
+
+	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+	    sizeof (xzp->z_id), tx));
+
+	(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
+	    xzp, "", NULL, acl_ids.z_fuidp, vap);
+
+	zfs_acl_ids_free(&acl_ids);
+	dmu_tx_commit(tx);
+
+	getnewvnode_drop_reserve();
+
+	*xvpp = xzp;
+
+	return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ *	IN:	zp	- znode to obtain attribute directory from
+ *		cr	- credentials of caller
+ *		flags	- flags from the VOP_LOOKUP call
+ *
+ *	OUT:	xzpp	- pointer to extended attribute znode
+ *
+ *	RETURN:	0 on success
+ *		error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)
+{
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	znode_t		*xzp;
+	vattr_t		va;
+	int		error;
+top:
+	error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
+	if (error)
+		return (error);
+
+	if (xzp != NULL) {
+		*xzpp = xzp;
+		return (0);
+	}
+
+
+	if (!(flags & CREATE_XATTR_DIR))
+		return (SET_ERROR(ENOATTR));
+
+	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+		return (SET_ERROR(EROFS));
+	}
+
+	/*
+	 * The ability to 'create' files in an attribute
+	 * directory comes from the write_xattr permission on the base file.
+	 *
+	 * The ability to 'search' an attribute directory requires
+	 * read_xattr permission on the base file.
+	 *
+	 * Once in a directory the ability to read/write attributes
+	 * is controlled by the permissions on the attribute file.
+	 */
+	va.va_mask = AT_MODE | AT_UID | AT_GID;
+	va.va_type = VDIR;
+	va.va_mode = S_IFDIR | S_ISVTX | 0777;
+	zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
+
+	error = zfs_make_xattrdir(zp, &va, xzpp, cr);
+
+	if (error == ERESTART) {
+		/* NB: we already did dmu_tx_wait() if necessary */
+		goto top;
+	}
+	if (error == 0)
+		VOP_UNLOCK1(ZTOV(*xzpp));
+
+	return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ *	you own the directory,
+ *	you own the entry,
+ *	the entry is a plain file and you have write access,
+ *	or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+	uid_t  		uid;
+	uid_t		downer;
+	uid_t		fowner;
+	zfsvfs_t	*zfsvfs = zdp->z_zfsvfs;
+
+	if (zdp->z_zfsvfs->z_replay)
+		return (0);
+
+	if ((zdp->z_mode & S_ISVTX) == 0)
+		return (0);
+
+	downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
+	fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+
+	if ((uid = crgetuid(cr)) == downer || uid == fowner ||
+	    (ZTOV(zp)->v_type == VREG &&
+	    zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
+		return (0);
+	else
+		return (secpolicy_vnode_remove(ZTOV(zp), cr));
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
new file mode 100644
index 000000000000..ec7c04717c84
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_recv.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_file.h>
+#include <sys/buf.h>
+#include <sys/stat.h>
+
+int
+zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
+{
+	struct thread *td;
+	int rc, fd;
+
+	td = curthread;
+	pwd_ensure_dirs();
+	/* 12.x doesn't take a const char * */
+	rc = kern_openat(td, AT_FDCWD, __DECONST(char *, path),
+	    UIO_SYSSPACE, flags, mode);
+	if (rc)
+		return (SET_ERROR(rc));
+	fd = td->td_retval[0];
+	td->td_retval[0] = 0;
+	if (fget(curthread, fd, &cap_no_rights, fpp))
+		kern_close(td, fd);
+	return (0);
+}
+
+void
+zfs_file_close(zfs_file_t *fp)
+{
+	fo_close(fp, curthread);
+}
+
+static int
+zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *offp,
+    ssize_t *resid)
+{
+	ssize_t rc;
+	struct uio auio;
+	struct thread *td;
+	struct iovec aiov;
+
+	td = curthread;
+	aiov.iov_base = (void *)(uintptr_t)buf;
+	aiov.iov_len = count;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_resid = count;
+	auio.uio_rw = UIO_WRITE;
+	auio.uio_td = td;
+	auio.uio_offset = *offp;
+
+	if ((fp->f_flag & FWRITE) == 0)
+		return (SET_ERROR(EBADF));
+
+	if (fp->f_type == DTYPE_VNODE)
+		bwillwrite();
+
+	rc = fo_write(fp, &auio, td->td_ucred, FOF_OFFSET, td);
+	if (rc)
+		return (SET_ERROR(rc));
+	if (resid)
+		*resid = auio.uio_resid;
+	else if (auio.uio_resid)
+		return (SET_ERROR(EIO));
+	*offp += count - auio.uio_resid;
+	return (rc);
+}
+
+int
+zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
+{
+	loff_t off = fp->f_offset;
+	ssize_t rc;
+
+	rc = zfs_file_write_impl(fp, buf, count, &off, resid);
+	if (rc == 0)
+		fp->f_offset = off;
+
+	return (SET_ERROR(rc));
+}
+
+int
+zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
+    ssize_t *resid)
+{
+	return (zfs_file_write_impl(fp, buf, count, &off, resid));
+}
+
+static int
+zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp,
+    ssize_t *resid)
+{
+	ssize_t rc;
+	struct uio auio;
+	struct thread *td;
+	struct iovec aiov;
+
+	td = curthread;
+	aiov.iov_base = (void *)(uintptr_t)buf;
+	aiov.iov_len = count;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_resid = count;
+	auio.uio_rw = UIO_READ;
+	auio.uio_td = td;
+	auio.uio_offset = *offp;
+
+	if ((fp->f_flag & FREAD) == 0)
+		return (SET_ERROR(EBADF));
+
+	rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td);
+	if (rc)
+		return (SET_ERROR(rc));
+	*resid = auio.uio_resid;
+	*offp += count - auio.uio_resid;
+	return (SET_ERROR(0));
+}
+
+int
+zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
+{
+	loff_t off = fp->f_offset;
+	ssize_t rc;
+
+	rc = zfs_file_read_impl(fp, buf, count, &off, resid);
+	if (rc == 0)
+		fp->f_offset = off;
+	return (rc);
+}
+
+int
+zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
+    ssize_t *resid)
+{
+	return (zfs_file_read_impl(fp, buf, count, &off, resid));
+}
+
+int
+zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
+{
+	int rc;
+	struct thread *td;
+
+	td = curthread;
+	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0)
+		return (SET_ERROR(ESPIPE));
+	rc = fo_seek(fp, *offp, whence, td);
+	if (rc == 0)
+		*offp = td->td_uretoff.tdu_off;
+	return (SET_ERROR(rc));
+}
+
+int
+zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr)
+{
+	struct thread *td;
+	struct stat sb;
+	int rc;
+
+	td = curthread;
+
+	rc = fo_stat(fp, &sb, td->td_ucred, td);
+	if (rc)
+		return (SET_ERROR(rc));
+	zfattr->zfa_size = sb.st_size;
+	zfattr->zfa_mode = sb.st_mode;
+
+	return (0);
+}
+
+static __inline int
+zfs_vop_fsync(vnode_t *vp)
+{
+	struct mount *mp;
+	int error;
+
+	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+		goto drop;
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	error = VOP_FSYNC(vp, MNT_WAIT, curthread);
+	VOP_UNLOCK1(vp);
+	vn_finished_write(mp);
+drop:
+	return (SET_ERROR(error));
+}
+
+int
+zfs_file_fsync(zfs_file_t *fp, int flags)
+{
+	struct vnode *v;
+
+	if (fp->f_type != DTYPE_VNODE)
+		return (EINVAL);
+
+	v = fp->f_data;
+	return (zfs_vop_fsync(v));
+}
+
+int
+zfs_file_get(int fd, zfs_file_t **fpp)
+{
+	struct file *fp;
+
+	if (fget(curthread, fd, &cap_no_rights, &fp))
+		return (SET_ERROR(EBADF));
+
+	*fpp = fp;
+	return (0);
+}
+
+void
+zfs_file_put(int fd)
+{
+	struct file *fp;
+
+	/* No CAP_ rights required, as we're only releasing. */
+	if (fget(curthread, fd, &cap_no_rights, &fp) == 0) {
+		fdrop(fp, curthread);
+		fdrop(fp, curthread);
+	}
+}
+
+loff_t
+zfs_file_off(zfs_file_t *fp)
+{
+	return (fp->f_offset);
+}
+
+void *
+zfs_file_private(zfs_file_t *fp)
+{
+	file_t *tmpfp;
+	void *data;
+	int error;
+
+	tmpfp = curthread->td_fpop;
+	curthread->td_fpop = fp;
+	error = devfs_get_cdevpriv(&data);
+	curthread->td_fpop = tmpfp;
+	if (error != 0)
+		return (NULL);
+	return (data);
+}
+
+int
+zfs_file_unlink(const char *fnamep)
+{
+	enum uio_seg seg = UIO_SYSSPACE;
+	int rc;
+
+#if __FreeBSD_version >= 1300018
+	rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0);
+#else
+#ifdef AT_BENEATH
+	rc = kern_unlinkat(curthread, AT_FDCWD, fnamep, seg, 0, 0);
+#else
+	rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep),
+	    seg, 0);
+#endif
+#endif
+	return (SET_ERROR(rc));
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c
new file mode 100644
index 000000000000..8dec8644c06e
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_ioctl_compat.h>
+
+enum zfs_ioc_legacy {
+	ZFS_IOC_LEGACY_NONE =	-1,
+	ZFS_IOC_LEGACY_FIRST =	0,
+	ZFS_LEGACY_IOC = ZFS_IOC_LEGACY_FIRST,
+	ZFS_IOC_LEGACY_POOL_CREATE = ZFS_IOC_LEGACY_FIRST,
+	ZFS_IOC_LEGACY_POOL_DESTROY,
+	ZFS_IOC_LEGACY_POOL_IMPORT,
+	ZFS_IOC_LEGACY_POOL_EXPORT,
+	ZFS_IOC_LEGACY_POOL_CONFIGS,
+	ZFS_IOC_LEGACY_POOL_STATS,
+	ZFS_IOC_LEGACY_POOL_TRYIMPORT,
+	ZFS_IOC_LEGACY_POOL_SCAN,
+	ZFS_IOC_LEGACY_POOL_FREEZE,
+	ZFS_IOC_LEGACY_POOL_UPGRADE,
+	ZFS_IOC_LEGACY_POOL_GET_HISTORY,
+	ZFS_IOC_LEGACY_VDEV_ADD,
+	ZFS_IOC_LEGACY_VDEV_REMOVE,
+	ZFS_IOC_LEGACY_VDEV_SET_STATE,
+	ZFS_IOC_LEGACY_VDEV_ATTACH,
+	ZFS_IOC_LEGACY_VDEV_DETACH,
+	ZFS_IOC_LEGACY_VDEV_SETPATH,
+	ZFS_IOC_LEGACY_VDEV_SETFRU,
+	ZFS_IOC_LEGACY_OBJSET_STATS,
+	ZFS_IOC_LEGACY_OBJSET_ZPLPROPS,
+	ZFS_IOC_LEGACY_DATASET_LIST_NEXT,
+	ZFS_IOC_LEGACY_SNAPSHOT_LIST_NEXT,
+	ZFS_IOC_LEGACY_SET_PROP,
+	ZFS_IOC_LEGACY_CREATE,
+	ZFS_IOC_LEGACY_DESTROY,
+	ZFS_IOC_LEGACY_ROLLBACK,
+	ZFS_IOC_LEGACY_RENAME,
+	ZFS_IOC_LEGACY_RECV,
+	ZFS_IOC_LEGACY_SEND,
+	ZFS_IOC_LEGACY_INJECT_FAULT,
+	ZFS_IOC_LEGACY_CLEAR_FAULT,
+	ZFS_IOC_LEGACY_INJECT_LIST_NEXT,
+	ZFS_IOC_LEGACY_ERROR_LOG,
+	ZFS_IOC_LEGACY_CLEAR,
+	ZFS_IOC_LEGACY_PROMOTE,
+	ZFS_IOC_LEGACY_DESTROY_SNAPS,
+	ZFS_IOC_LEGACY_SNAPSHOT,
+	ZFS_IOC_LEGACY_DSOBJ_TO_DSNAME,
+	ZFS_IOC_LEGACY_OBJ_TO_PATH,
+	ZFS_IOC_LEGACY_POOL_SET_PROPS,
+	ZFS_IOC_LEGACY_POOL_GET_PROPS,
+	ZFS_IOC_LEGACY_SET_FSACL,
+	ZFS_IOC_LEGACY_GET_FSACL,
+	ZFS_IOC_LEGACY_SHARE,
+	ZFS_IOC_LEGACY_INHERIT_PROP,
+	ZFS_IOC_LEGACY_SMB_ACL,
+	ZFS_IOC_LEGACY_USERSPACE_ONE,
+	ZFS_IOC_LEGACY_USERSPACE_MANY,
+	ZFS_IOC_LEGACY_USERSPACE_UPGRADE,
+	ZFS_IOC_LEGACY_HOLD,
+	ZFS_IOC_LEGACY_RELEASE,
+	ZFS_IOC_LEGACY_GET_HOLDS,
+	ZFS_IOC_LEGACY_OBJSET_RECVD_PROPS,
+	ZFS_IOC_LEGACY_VDEV_SPLIT,
+	ZFS_IOC_LEGACY_NEXT_OBJ,
+	ZFS_IOC_LEGACY_DIFF,
+	ZFS_IOC_LEGACY_TMP_SNAPSHOT,
+	ZFS_IOC_LEGACY_OBJ_TO_STATS,
+	ZFS_IOC_LEGACY_JAIL,
+	ZFS_IOC_LEGACY_UNJAIL,
+	ZFS_IOC_LEGACY_POOL_REGUID,
+	ZFS_IOC_LEGACY_SPACE_WRITTEN,
+	ZFS_IOC_LEGACY_SPACE_SNAPS,
+	ZFS_IOC_LEGACY_SEND_PROGRESS,
+	ZFS_IOC_LEGACY_POOL_REOPEN,
+	ZFS_IOC_LEGACY_LOG_HISTORY,
+	ZFS_IOC_LEGACY_SEND_NEW,
+	ZFS_IOC_LEGACY_SEND_SPACE,
+	ZFS_IOC_LEGACY_CLONE,
+	ZFS_IOC_LEGACY_BOOKMARK,
+	ZFS_IOC_LEGACY_GET_BOOKMARKS,
+	ZFS_IOC_LEGACY_DESTROY_BOOKMARKS,
+	ZFS_IOC_LEGACY_NEXTBOOT,
+	ZFS_IOC_LEGACY_CHANNEL_PROGRAM,
+	ZFS_IOC_LEGACY_REMAP,
+	ZFS_IOC_LEGACY_POOL_CHECKPOINT,
+	ZFS_IOC_LEGACY_POOL_DISCARD_CHECKPOINT,
+	ZFS_IOC_LEGACY_POOL_INITIALIZE,
+	ZFS_IOC_LEGACY_POOL_SYNC,
+	ZFS_IOC_LEGACY_LAST
+};
+
+unsigned static long zfs_ioctl_legacy_to_ozfs_[] = {
+	ZFS_IOC_POOL_CREATE,			/* 0x00 */
+	ZFS_IOC_POOL_DESTROY,			/* 0x01 */
+	ZFS_IOC_POOL_IMPORT,			/* 0x02 */
+	ZFS_IOC_POOL_EXPORT,			/* 0x03 */
+	ZFS_IOC_POOL_CONFIGS,			/* 0x04 */
+	ZFS_IOC_POOL_STATS,			/* 0x05 */
+	ZFS_IOC_POOL_TRYIMPORT,			/* 0x06 */
+	ZFS_IOC_POOL_SCAN,			/* 0x07 */
+	ZFS_IOC_POOL_FREEZE,			/* 0x08 */
+	ZFS_IOC_POOL_UPGRADE,			/* 0x09 */
+	ZFS_IOC_POOL_GET_HISTORY,		/* 0x0a */
+	ZFS_IOC_VDEV_ADD,			/* 0x0b */
+	ZFS_IOC_VDEV_REMOVE,			/* 0x0c */
+	ZFS_IOC_VDEV_SET_STATE,			/* 0x0d */
+	ZFS_IOC_VDEV_ATTACH,			/* 0x0e */
+	ZFS_IOC_VDEV_DETACH,			/* 0x0f */
+	ZFS_IOC_VDEV_SETPATH,			/* 0x10 */
+	ZFS_IOC_VDEV_SETFRU,			/* 0x11 */
+	ZFS_IOC_OBJSET_STATS,			/* 0x12 */
+	ZFS_IOC_OBJSET_ZPLPROPS,		/* 0x13 */
+	ZFS_IOC_DATASET_LIST_NEXT,		/* 0x14 */
+	ZFS_IOC_SNAPSHOT_LIST_NEXT,		/* 0x15 */
+	ZFS_IOC_SET_PROP,			/* 0x16 */
+	ZFS_IOC_CREATE,				/* 0x17 */
+	ZFS_IOC_DESTROY,			/* 0x18 */
+	ZFS_IOC_ROLLBACK,			/* 0x19 */
+	ZFS_IOC_RENAME,				/* 0x1a */
+	ZFS_IOC_RECV,				/* 0x1b */
+	ZFS_IOC_SEND,				/* 0x1c */
+	ZFS_IOC_INJECT_FAULT,			/* 0x1d */
+	ZFS_IOC_CLEAR_FAULT,			/* 0x1e */
+	ZFS_IOC_INJECT_LIST_NEXT,		/* 0x1f */
+	ZFS_IOC_ERROR_LOG,			/* 0x20 */
+	ZFS_IOC_CLEAR,				/* 0x21 */
+	ZFS_IOC_PROMOTE,			/* 0x22 */
+	/* start of mismatch */
+
+	ZFS_IOC_DESTROY_SNAPS,			/* 0x23:0x3b */
+	ZFS_IOC_SNAPSHOT,			/* 0x24:0x23 */
+	ZFS_IOC_DSOBJ_TO_DSNAME,		/* 0x25:0x24 */
+	ZFS_IOC_OBJ_TO_PATH,			/* 0x26:0x25 */
+	ZFS_IOC_POOL_SET_PROPS,			/* 0x27:0x26 */
+	ZFS_IOC_POOL_GET_PROPS,			/* 0x28:0x27 */
+	ZFS_IOC_SET_FSACL,			/* 0x29:0x28 */
+	ZFS_IOC_GET_FSACL,			/* 0x30:0x29 */
+	ZFS_IOC_SHARE,				/* 0x2b:0x2a */
+	ZFS_IOC_INHERIT_PROP,			/* 0x2c:0x2b */
+	ZFS_IOC_SMB_ACL,			/* 0x2d:0x2c */
+	ZFS_IOC_USERSPACE_ONE,			/* 0x2e:0x2d */
+	ZFS_IOC_USERSPACE_MANY,			/* 0x2f:0x2e */
+	ZFS_IOC_USERSPACE_UPGRADE,		/* 0x30:0x2f */
+	ZFS_IOC_HOLD,				/* 0x31:0x30 */
+	ZFS_IOC_RELEASE,			/* 0x32:0x31 */
+	ZFS_IOC_GET_HOLDS,			/* 0x33:0x32 */
+	ZFS_IOC_OBJSET_RECVD_PROPS,		/* 0x34:0x33 */
+	ZFS_IOC_VDEV_SPLIT,			/* 0x35:0x34 */
+	ZFS_IOC_NEXT_OBJ,			/* 0x36:0x35 */
+	ZFS_IOC_DIFF,				/* 0x37:0x36 */
+	ZFS_IOC_TMP_SNAPSHOT,			/* 0x38:0x37 */
+	ZFS_IOC_OBJ_TO_STATS,			/* 0x39:0x38 */
+	ZFS_IOC_JAIL,			/* 0x3a:0xc2 */
+	ZFS_IOC_UNJAIL,			/* 0x3b:0xc3 */
+	ZFS_IOC_POOL_REGUID,			/* 0x3c:0x3c */
+	ZFS_IOC_SPACE_WRITTEN,			/* 0x3d:0x39 */
+	ZFS_IOC_SPACE_SNAPS,			/* 0x3e:0x3a */
+	ZFS_IOC_SEND_PROGRESS,			/* 0x3f:0x3e */
+	ZFS_IOC_POOL_REOPEN,			/* 0x40:0x3d */
+	ZFS_IOC_LOG_HISTORY,			/* 0x41:0x3f */
+	ZFS_IOC_SEND_NEW,			/* 0x42:0x40 */
+	ZFS_IOC_SEND_SPACE,			/* 0x43:0x41 */
+	ZFS_IOC_CLONE,				/* 0x44:0x42 */
+	ZFS_IOC_BOOKMARK,			/* 0x45:0x43 */
+	ZFS_IOC_GET_BOOKMARKS,			/* 0x46:0x44 */
+	ZFS_IOC_DESTROY_BOOKMARKS,		/* 0x47:0x45 */
+	ZFS_IOC_NEXTBOOT,			/* 0x48:0xc1 */
+	ZFS_IOC_CHANNEL_PROGRAM,		/* 0x49:0x48 */
+	ZFS_IOC_REMAP,				/* 0x4a:0x4c */
+	ZFS_IOC_POOL_CHECKPOINT,		/* 0x4b:0x4d */
+	ZFS_IOC_POOL_DISCARD_CHECKPOINT,	/* 0x4c:0x4e */
+	ZFS_IOC_POOL_INITIALIZE,		/* 0x4d:0x4f */
+};
+
+unsigned static long zfs_ioctl_ozfs_to_legacy_common_[] = {
+	ZFS_IOC_POOL_CREATE,			/* 0x00 */
+	ZFS_IOC_POOL_DESTROY,			/* 0x01 */
+	ZFS_IOC_POOL_IMPORT,			/* 0x02 */
+	ZFS_IOC_POOL_EXPORT,			/* 0x03 */
+	ZFS_IOC_POOL_CONFIGS,			/* 0x04 */
+	ZFS_IOC_POOL_STATS,			/* 0x05 */
+	ZFS_IOC_POOL_TRYIMPORT,			/* 0x06 */
+	ZFS_IOC_POOL_SCAN,			/* 0x07 */
+	ZFS_IOC_POOL_FREEZE,			/* 0x08 */
+	ZFS_IOC_POOL_UPGRADE,			/* 0x09 */
+	ZFS_IOC_POOL_GET_HISTORY,		/* 0x0a */
+	ZFS_IOC_VDEV_ADD,			/* 0x0b */
+	ZFS_IOC_VDEV_REMOVE,			/* 0x0c */
+	ZFS_IOC_VDEV_SET_STATE,			/* 0x0d */
+	ZFS_IOC_VDEV_ATTACH,			/* 0x0e */
+	ZFS_IOC_VDEV_DETACH,			/* 0x0f */
+	ZFS_IOC_VDEV_SETPATH,			/* 0x10 */
+	ZFS_IOC_VDEV_SETFRU,			/* 0x11 */
+	ZFS_IOC_OBJSET_STATS,			/* 0x12 */
+	ZFS_IOC_OBJSET_ZPLPROPS,		/* 0x13 */
+	ZFS_IOC_DATASET_LIST_NEXT,		/* 0x14 */
+	ZFS_IOC_SNAPSHOT_LIST_NEXT,		/* 0x15 */
+	ZFS_IOC_SET_PROP,			/* 0x16 */
+	ZFS_IOC_CREATE,				/* 0x17 */
+	ZFS_IOC_DESTROY,			/* 0x18 */
+	ZFS_IOC_ROLLBACK,			/* 0x19 */
+	ZFS_IOC_RENAME,				/* 0x1a */
+	ZFS_IOC_RECV,				/* 0x1b */
+	ZFS_IOC_SEND,				/* 0x1c */
+	ZFS_IOC_INJECT_FAULT,			/* 0x1d */
+	ZFS_IOC_CLEAR_FAULT,			/* 0x1e */
+	ZFS_IOC_INJECT_LIST_NEXT,		/* 0x1f */
+	ZFS_IOC_ERROR_LOG,			/* 0x20 */
+	ZFS_IOC_CLEAR,				/* 0x21 */
+	ZFS_IOC_PROMOTE,			/* 0x22 */
+	/* start of mismatch */
+	ZFS_IOC_LEGACY_SNAPSHOT,		/* 0x23 */
+	ZFS_IOC_LEGACY_DSOBJ_TO_DSNAME,		/* 0x24 */
+	ZFS_IOC_LEGACY_OBJ_TO_PATH,		/* 0x25 */
+	ZFS_IOC_LEGACY_POOL_SET_PROPS,		/* 0x26 */
+	ZFS_IOC_LEGACY_POOL_GET_PROPS,		/* 0x27 */
+	ZFS_IOC_LEGACY_SET_FSACL,		/* 0x28 */
+	ZFS_IOC_LEGACY_GET_FSACL,		/* 0x29 */
+	ZFS_IOC_LEGACY_SHARE,			/* 0x2a */
+	ZFS_IOC_LEGACY_INHERIT_PROP,		/* 0x2b */
+	ZFS_IOC_LEGACY_SMB_ACL,			/* 0x2c */
+	ZFS_IOC_LEGACY_USERSPACE_ONE,		/* 0x2d */
+	ZFS_IOC_LEGACY_USERSPACE_MANY,		/* 0x2e */
+	ZFS_IOC_LEGACY_USERSPACE_UPGRADE,	/* 0x2f */
+	ZFS_IOC_LEGACY_HOLD,			/* 0x30 */
+	ZFS_IOC_LEGACY_RELEASE,			/* 0x31 */
+	ZFS_IOC_LEGACY_GET_HOLDS,		/* 0x32 */
+	ZFS_IOC_LEGACY_OBJSET_RECVD_PROPS,	/* 0x33 */
+	ZFS_IOC_LEGACY_VDEV_SPLIT,		/* 0x34 */
+	ZFS_IOC_LEGACY_NEXT_OBJ,		/* 0x35 */
+	ZFS_IOC_LEGACY_DIFF,			/* 0x36 */
+	ZFS_IOC_LEGACY_TMP_SNAPSHOT,		/* 0x37 */
+	ZFS_IOC_LEGACY_OBJ_TO_STATS,		/* 0x38 */
+	ZFS_IOC_LEGACY_SPACE_WRITTEN,		/* 0x39 */
+	ZFS_IOC_LEGACY_SPACE_SNAPS,		/* 0x3a */
+	ZFS_IOC_LEGACY_DESTROY_SNAPS,		/* 0x3b */
+	ZFS_IOC_LEGACY_POOL_REGUID,		/* 0x3c */
+	ZFS_IOC_LEGACY_POOL_REOPEN,		/* 0x3d */
+	ZFS_IOC_LEGACY_SEND_PROGRESS,		/* 0x3e */
+	ZFS_IOC_LEGACY_LOG_HISTORY,		/* 0x3f */
+	ZFS_IOC_LEGACY_SEND_NEW,		/* 0x40 */
+	ZFS_IOC_LEGACY_SEND_SPACE,		/* 0x41 */
+	ZFS_IOC_LEGACY_CLONE,			/* 0x42 */
+	ZFS_IOC_LEGACY_BOOKMARK,		/* 0x43 */
+	ZFS_IOC_LEGACY_GET_BOOKMARKS,		/* 0x44 */
+	ZFS_IOC_LEGACY_DESTROY_BOOKMARKS,	/* 0x45 */
+	ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_RECV_NEW */
+	ZFS_IOC_LEGACY_POOL_SYNC,		/* 0x47 */
+	ZFS_IOC_LEGACY_CHANNEL_PROGRAM,		/* 0x48 */
+	ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_LOAD_KEY */
+	ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_UNLOAD_KEY */
+	ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_CHANGE_KEY */
+	ZFS_IOC_LEGACY_REMAP,			/* 0x4c */
+	ZFS_IOC_LEGACY_POOL_CHECKPOINT,		/* 0x4d */
+	ZFS_IOC_LEGACY_POOL_DISCARD_CHECKPOINT,	/* 0x4e */
+	ZFS_IOC_LEGACY_POOL_INITIALIZE,		/* 0x4f  */
+	ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_POOL_TRIM */
+	ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_REDACT */
+	ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_GET_BOOKMARK_PROPS */
+	ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_WAIT */
+	ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_WAIT_FS */
+};
+
+unsigned static long zfs_ioctl_ozfs_to_legacy_platform_[] = {
+	ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_NEXT */
+	ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_CLEAR */
+	ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_SEEK */
+	ZFS_IOC_LEGACY_NEXTBOOT,
+	ZFS_IOC_LEGACY_JAIL,
+	ZFS_IOC_LEGACY_UNJAIL,
+	ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_SET_BOOTENV */
+	ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_GET_BOOTENV */
+};
+
+int
+zfs_ioctl_legacy_to_ozfs(int request)
+{
+	if (request >= sizeof (zfs_ioctl_legacy_to_ozfs_)/sizeof (long))
+		return (-1);
+	return (zfs_ioctl_legacy_to_ozfs_[request]);
+}
+
+int
+zfs_ioctl_ozfs_to_legacy(int request)
+{
+	if (request > ZFS_IOC_LAST)
+		return (-1);
+
+	if (request > ZFS_IOC_PLATFORM)
+		return (zfs_ioctl_ozfs_to_legacy_platform_[request]);
+	if (request >= sizeof (zfs_ioctl_ozfs_to_legacy_common_)/sizeof (long))
+		return (-1);
+	return (zfs_ioctl_ozfs_to_legacy_common_[request]);
+}
+
+void
+zfs_cmd_legacy_to_ozfs(zfs_cmd_legacy_t *src, zfs_cmd_t *dst)
+{
+	memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats));
+	*&dst->zc_objset_stats = *&src->zc_objset_stats;
+	memcpy(&dst->zc_begin_record, &src->zc_begin_record,
+	    offsetof(zfs_cmd_t, zc_sendobj) -
+	    offsetof(zfs_cmd_t, zc_begin_record));
+	memcpy(&dst->zc_sendobj, &src->zc_sendobj,
+	    sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj));
+	dst->zc_zoneid = src->zc_jailid;
+}
+
+void
+zfs_cmd_ozfs_to_legacy(zfs_cmd_t *src, zfs_cmd_legacy_t *dst)
+{
+	memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats));
+	*&dst->zc_objset_stats = *&src->zc_objset_stats;
+	*&dst->zc_begin_record.drr_u.drr_begin = *&src->zc_begin_record;
+	dst->zc_begin_record.drr_payloadlen = 0;
+	dst->zc_begin_record.drr_type = 0;
+
+	memcpy(&dst->zc_inject_record, &src->zc_inject_record,
+	    offsetof(zfs_cmd_t, zc_sendobj) -
+	    offsetof(zfs_cmd_t, zc_inject_record));
+	dst->zc_resumable = B_FALSE;
+	memcpy(&dst->zc_sendobj, &src->zc_sendobj,
+	    sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj));
+	dst->zc_jailid = src->zc_zoneid;
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c
new file mode 100644
index 000000000000..0e0c16033b15
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/nvpair.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_os.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zone.h>
+#include <vm/vm_pageout.h>
+
+#include <sys/zfs_ioctl_impl.h>
+
+#if __FreeBSD_version < 1201517
+#define	vm_page_max_user_wired	vm_page_max_wired
+#endif
+
+int
+zfs_vfs_ref(zfsvfs_t **zfvp)
+{
+	int error = 0;
+
+	if (*zfvp == NULL)
+		return (SET_ERROR(ESRCH));
+
+	error = vfs_busy((*zfvp)->z_vfs, 0);
+	if (error != 0) {
+		*zfvp = NULL;
+		error = SET_ERROR(ESRCH);
+	}
+	return (error);
+}
+
+int
+zfs_vfs_held(zfsvfs_t *zfsvfs)
+{
+	return (zfsvfs->z_vfs != NULL);
+}
+
+void
+zfs_vfs_rele(zfsvfs_t *zfsvfs)
+{
+	vfs_unbusy(zfsvfs->z_vfs);
+}
+
+static const zfs_ioc_key_t zfs_keys_nextboot[] = {
+	{"command",		DATA_TYPE_STRING,	0},
+	{ ZPOOL_CONFIG_POOL_GUID,		DATA_TYPE_UINT64,	0},
+	{ ZPOOL_CONFIG_GUID,		DATA_TYPE_UINT64,	0}
+};
+
+static int
+zfs_ioc_jail(zfs_cmd_t *zc)
+{
+
+	return (zone_dataset_attach(curthread->td_ucred, zc->zc_name,
+	    (int)zc->zc_zoneid));
+}
+
+static int
+zfs_ioc_unjail(zfs_cmd_t *zc)
+{
+
+	return (zone_dataset_detach(curthread->td_ucred, zc->zc_name,
+	    (int)zc->zc_zoneid));
+}
+
+static int
+zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	char name[MAXNAMELEN];
+	spa_t *spa;
+	vdev_t *vd;
+	char *command;
+	uint64_t pool_guid;
+	uint64_t vdev_guid;
+	int error;
+
+	if (nvlist_lookup_uint64(innvl,
+	    ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+		return (EINVAL);
+	if (nvlist_lookup_uint64(innvl,
+	    ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
+		return (EINVAL);
+	if (nvlist_lookup_string(innvl,
+	    "command", &command) != 0)
+		return (EINVAL);
+
+	mutex_enter(&spa_namespace_lock);
+	spa = spa_by_guid(pool_guid, vdev_guid);
+	if (spa != NULL)
+		strcpy(name, spa_name(spa));
+	mutex_exit(&spa_namespace_lock);
+	if (spa == NULL)
+		return (ENOENT);
+
+	if ((error = spa_open(name, &spa, FTAG)) != 0)
+		return (error);
+	spa_vdev_state_enter(spa, SCL_ALL);
+	vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
+	if (vd == NULL) {
+		(void) spa_vdev_state_exit(spa, NULL, ENXIO);
+		spa_close(spa, FTAG);
+		return (ENODEV);
+	}
+	error = vdev_label_write_pad2(vd, command, strlen(command));
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+	txg_wait_synced(spa->spa_dsl_pool, 0);
+	spa_close(spa, FTAG);
+	return (error);
+}
+
+uint64_t
+zfs_max_nvlist_src_size_os(void)
+{
+	if (zfs_max_nvlist_src_size != 0)
+		return (zfs_max_nvlist_src_size);
+
+	return (ptob(vm_page_max_user_wired) / 4);
+}
+
+void
+zfs_ioctl_init_os(void)
+{
+	zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail,
+	    zfs_secpolicy_config, POOL_CHECK_NONE);
+	zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail,
+	    zfs_secpolicy_config, POOL_CHECK_NONE);
+	zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT,
+	    zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME,
+	    POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_nextboot, 3);
+
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c
new file mode 100644
index 000000000000..8b22f2fdc3b3
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c
@@ -0,0 +1,70 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_onexit.h>
+
+static int
+zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
+{
+	*zo = zfsdev_get_state(minor, ZST_ONEXIT);
+	if (*zo == NULL)
+		return (SET_ERROR(EBADF));
+
+	return (0);
+}
+
+int
+zfs_onexit_fd_hold(int fd, minor_t *minorp)
+{
+	file_t *fp, *tmpfp;
+	zfs_onexit_t *zo;
+	void *data;
+	int error;
+
+	if ((error = zfs_file_get(fd, &fp)))
+		return (error);
+
+	tmpfp = curthread->td_fpop;
+	curthread->td_fpop = fp;
+	error = devfs_get_cdevpriv(&data);
+	if (error == 0)
+		*minorp = (minor_t)(uintptr_t)data;
+	curthread->td_fpop = tmpfp;
+	if (error != 0)
+		return (SET_ERROR(EBADF));
+	return (zfs_onexit_minor_to_state(*minorp, &zo));
+}
+
+void
+zfs_onexit_fd_rele(int fd)
+{
+	zfs_file_put(fd);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
new file mode 100644
index 000000000000..f94ea44335c6
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
@@ -0,0 +1,2482 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/acl.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/mntent.h>
+#include <sys/mount.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/policy.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sunddi.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/spa_boot.h>
+#include <sys/jail.h>
+#include <ufs/ufs/quota.h>
+#include <sys/zfs_quota.h>
+
+#include "zfs_comutil.h"
+
+#ifndef	MNTK_VMSETSIZE_BUG
+#define	MNTK_VMSETSIZE_BUG	0
+#endif
+#ifndef	MNTK_NOMSYNC
+#define	MNTK_NOMSYNC	8
+#endif
+
+/* BEGIN CSTYLED */
+struct mtx zfs_debug_mtx;
+MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
+
+SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
+
+int zfs_super_owner;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
+    "File system owner can perform privileged operation on his file systems");
+
+int zfs_debug_level;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
+	"Debug level");
+
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
+static int zfs_version_acl = ZFS_ACL_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
+    "ZFS_ACL_VERSION");
+static int zfs_version_spa = SPA_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
+    "SPA_VERSION");
+static int zfs_version_zpl = ZPL_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
+    "ZPL_VERSION");
+/* END CSTYLED */
+
+static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
+static int zfs_mount(vfs_t *vfsp);
+static int zfs_umount(vfs_t *vfsp, int fflag);
+static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
+static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
+static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
+static int zfs_sync(vfs_t *vfsp, int waitfor);
+#if __FreeBSD_version >= 1300098
+static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
+    struct ucred **credanonp, int *numsecflavors, int *secflavors);
+#else
+static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
+    struct ucred **credanonp, int *numsecflavors, int **secflavors);
+#endif
+static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
+static void zfs_freevfs(vfs_t *vfsp);
+
+struct vfsops zfs_vfsops = {
+	.vfs_mount =		zfs_mount,
+	.vfs_unmount =		zfs_umount,
+#if __FreeBSD_version >= 1300049
+	.vfs_root =		vfs_cache_root,
+	.vfs_cachedroot = zfs_root,
+#else
+	.vfs_root =		zfs_root,
+#endif
+	.vfs_statfs =		zfs_statfs,
+	.vfs_vget =		zfs_vget,
+	.vfs_sync =		zfs_sync,
+	.vfs_checkexp =		zfs_checkexp,
+	.vfs_fhtovp =		zfs_fhtovp,
+	.vfs_quotactl =		zfs_quotactl,
+};
+
+VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
+
+/*
+ * We need to keep a count of active fs's.
+ * This is necessary to prevent our module
+ * from being unloaded after a umount -f
+ */
+static uint32_t	zfs_active_fs_count = 0;
+
+int
+zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
+    char *setpoint)
+{
+	int error;
+	zfsvfs_t *zfvp;
+	vfs_t *vfsp;
+	objset_t *os;
+	uint64_t tmp = *val;
+
+	error = dmu_objset_from_ds(ds, &os);
+	if (error != 0)
+		return (error);
+
+	error = getzfsvfs_impl(os, &zfvp);
+	if (error != 0)
+		return (error);
+	if (zfvp == NULL)
+		return (ENOENT);
+	vfsp = zfvp->z_vfs;
+	switch (zfs_prop) {
+	case ZFS_PROP_ATIME:
+		if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
+			tmp = 0;
+		if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
+			tmp = 1;
+		break;
+	case ZFS_PROP_DEVICES:
+		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
+			tmp = 0;
+		if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
+			tmp = 1;
+		break;
+	case ZFS_PROP_EXEC:
+		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
+			tmp = 0;
+		if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
+			tmp = 1;
+		break;
+	case ZFS_PROP_SETUID:
+		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
+			tmp = 0;
+		if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
+			tmp = 1;
+		break;
+	case ZFS_PROP_READONLY:
+		if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
+			tmp = 0;
+		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
+			tmp = 1;
+		break;
+	case ZFS_PROP_XATTR:
+		if (zfvp->z_flags & ZSB_XATTR)
+			tmp = zfvp->z_xattr;
+		break;
+	case ZFS_PROP_NBMAND:
+		if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
+			tmp = 0;
+		if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
+			tmp = 1;
+		break;
+	default:
+		vfs_unbusy(vfsp);
+		return (ENOENT);
+	}
+
+	vfs_unbusy(vfsp);
+	if (tmp != *val) {
+		(void) strcpy(setpoint, "temporary");
+		*val = tmp;
+	}
+	return (0);
+}
+
+static int
+zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
+{
+	int error = 0;
+	char buf[32];
+	uint64_t usedobj, quotaobj;
+	uint64_t quota, used = 0;
+	timespec_t now;
+
+	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
+	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+	if (quotaobj == 0 || zfsvfs->z_replay) {
+		error = ENOENT;
+		goto done;
+	}
+	(void) sprintf(buf, "%llx", (longlong_t)id);
+	if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
+	    buf, sizeof (quota), 1, &quota)) != 0) {
+		dprintf("%s(%d): quotaobj lookup failed\n",
+		    __FUNCTION__, __LINE__);
+		goto done;
+	}
+	/*
+	 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
+	 * So we set them to be the same.
+	 */
+	dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
+	error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
+	if (error && error != ENOENT) {
+		dprintf("%s(%d):  usedobj failed; %d\n",
+		    __FUNCTION__, __LINE__, error);
+		goto done;
+	}
+	dqp->dqb_curblocks = btodb(used);
+	dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
+	vfs_timestamp(&now);
+	/*
+	 * Setting this to 0 causes FreeBSD quota(8) to print
+	 * the number of days since the epoch, which isn't
+	 * particularly useful.
+	 */
+	dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
+done:
+	return (error);
+}
+
+static int
+zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	struct thread *td;
+	int cmd, type, error = 0;
+	int bitsize;
+	zfs_userquota_prop_t quota_type;
+	struct dqblk64 dqblk = { 0 };
+
+	td = curthread;
+	cmd = cmds >> SUBCMDSHIFT;
+	type = cmds & SUBCMDMASK;
+
+	ZFS_ENTER(zfsvfs);
+	if (id == -1) {
+		switch (type) {
+		case USRQUOTA:
+			id = td->td_ucred->cr_ruid;
+			break;
+		case GRPQUOTA:
+			id = td->td_ucred->cr_rgid;
+			break;
+		default:
+			error = EINVAL;
+			if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
+				vfs_unbusy(vfsp);
+			goto done;
+		}
+	}
+	/*
+	 * Map BSD type to:
+	 * ZFS_PROP_USERUSED,
+	 * ZFS_PROP_USERQUOTA,
+	 * ZFS_PROP_GROUPUSED,
+	 * ZFS_PROP_GROUPQUOTA
+	 */
+	switch (cmd) {
+	case Q_SETQUOTA:
+	case Q_SETQUOTA32:
+		if (type == USRQUOTA)
+			quota_type = ZFS_PROP_USERQUOTA;
+		else if (type == GRPQUOTA)
+			quota_type = ZFS_PROP_GROUPQUOTA;
+		else
+			error = EINVAL;
+		break;
+	case Q_GETQUOTA:
+	case Q_GETQUOTA32:
+		if (type == USRQUOTA)
+			quota_type = ZFS_PROP_USERUSED;
+		else if (type == GRPQUOTA)
+			quota_type = ZFS_PROP_GROUPUSED;
+		else
+			error = EINVAL;
+		break;
+	}
+
+	/*
+	 * Depending on the cmd, we may need to get
+	 * the ruid and domain (see fuidstr_to_sid?),
+	 * the fuid (how?), or other information.
+	 * Create fuid using zfs_fuid_create(zfsvfs, id,
+	 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
+	 * I think I can use just the id?
+	 *
+	 * Look at zfs_id_overquota() to look up a quota.
+	 * zap_lookup(something, quotaobj, fuidstring,
+	 *     sizeof (long long), 1, &quota)
+	 *
+	 * See zfs_set_userquota() to set a quota.
+	 */
+	if ((uint32_t)type >= MAXQUOTAS) {
+		error = EINVAL;
+		goto done;
+	}
+
+	switch (cmd) {
+	case Q_GETQUOTASIZE:
+		bitsize = 64;
+		error = copyout(&bitsize, arg, sizeof (int));
+		break;
+	case Q_QUOTAON:
+		// As far as I can tell, you can't turn quotas on or off on zfs
+		error = 0;
+		vfs_unbusy(vfsp);
+		break;
+	case Q_QUOTAOFF:
+		error = ENOTSUP;
+		vfs_unbusy(vfsp);
+		break;
+	case Q_SETQUOTA:
+		error = copyin(arg, &dqblk, sizeof (dqblk));
+		if (error == 0)
+			error = zfs_set_userquota(zfsvfs, quota_type,
+			    "", id, dbtob(dqblk.dqb_bhardlimit));
+		break;
+	case Q_GETQUOTA:
+		error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
+		if (error == 0)
+			error = copyout(&dqblk, arg, sizeof (dqblk));
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+done:
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+
+boolean_t
+zfs_is_readonly(zfsvfs_t *zfsvfs)
+{
+	return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
+}
+
+/*ARGSUSED*/
+static int
+zfs_sync(vfs_t *vfsp, int waitfor)
+{
+
+	/*
+	 * Data integrity is job one.  We don't want a compromised kernel
+	 * writing to the storage pool, so we never sync during panic.
+	 */
+	if (panicstr)
+		return (0);
+
+	/*
+	 * Ignore the system syncher.  ZFS already commits async data
+	 * at zfs_txg_timeout intervals.
+	 */
+	if (waitfor == MNT_LAZY)
+		return (0);
+
+	if (vfsp != NULL) {
+		/*
+		 * Sync a specific filesystem.
+		 */
+		zfsvfs_t *zfsvfs = vfsp->vfs_data;
+		dsl_pool_t *dp;
+		int error;
+
+		error = vfs_stdsync(vfsp, waitfor);
+		if (error != 0)
+			return (error);
+
+		ZFS_ENTER(zfsvfs);
+		dp = dmu_objset_pool(zfsvfs->z_os);
+
+		/*
+		 * If the system is shutting down, then skip any
+		 * filesystems which may exist on a suspended pool.
+		 */
+		if (rebooting && spa_suspended(dp->dp_spa)) {
+			ZFS_EXIT(zfsvfs);
+			return (0);
+		}
+
+		if (zfsvfs->z_log != NULL)
+			zil_commit(zfsvfs->z_log, 0);
+
+		ZFS_EXIT(zfsvfs);
+	} else {
+		/*
+		 * Sync all ZFS filesystems.  This is what happens when you
+		 * run sync(1M).  Unlike other filesystems, ZFS honors the
+		 * request by waiting for all pools to commit all dirty data.
+		 */
+		spa_sync_allpools();
+	}
+
+	return (0);
+}
+
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == TRUE) {
+		zfsvfs->z_atime = TRUE;
+		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
+	} else {
+		zfsvfs->z_atime = FALSE;
+		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
+	}
+}
+
+static void
+xattr_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == ZFS_XATTR_OFF) {
+		zfsvfs->z_flags &= ~ZSB_XATTR;
+	} else {
+		zfsvfs->z_flags |= ZSB_XATTR;
+
+		if (newval == ZFS_XATTR_SA)
+			zfsvfs->z_xattr_sa = B_TRUE;
+		else
+			zfsvfs->z_xattr_sa = B_FALSE;
+	}
+}
+
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+	ASSERT(ISP2(newval));
+
+	zfsvfs->z_max_blksz = newval;
+	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
+}
+
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval) {
+		/* XXX locking on vfs_flag? */
+		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
+	} else {
+		/* XXX locking on vfs_flag? */
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
+	}
+}
+
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == FALSE) {
+		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
+	} else {
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
+	}
+}
+
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == FALSE) {
+		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
+	} else {
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
+	}
+}
+
+/*
+ * The nbmand mount option can be changed at mount time.
+ * We can't allow it to be toggled on live file systems or incorrect
+ * behavior may be seen from cifs clients
+ *
+ * This property isn't registered via dsl_prop_register(), but this callback
+ * will be called when a file system is first mounted
+ */
+static void
+nbmand_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+	if (newval == FALSE) {
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
+	} else {
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
+	}
+}
+
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_show_ctldir = newval;
+}
+
+static void
+vscan_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_vscan = newval;
+}
+
+static void
+acl_mode_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_acl_mode = newval;
+}
+
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_acl_inherit = newval;
+}
+
+static int
+zfs_register_callbacks(vfs_t *vfsp)
+{
+	struct dsl_dataset *ds = NULL;
+	objset_t *os = NULL;
+	zfsvfs_t *zfsvfs = NULL;
+	uint64_t nbmand;
+	boolean_t readonly = B_FALSE;
+	boolean_t do_readonly = B_FALSE;
+	boolean_t setuid = B_FALSE;
+	boolean_t do_setuid = B_FALSE;
+	boolean_t exec = B_FALSE;
+	boolean_t do_exec = B_FALSE;
+	boolean_t xattr = B_FALSE;
+	boolean_t atime = B_FALSE;
+	boolean_t do_atime = B_FALSE;
+	boolean_t do_xattr = B_FALSE;
+	int error = 0;
+
+	ASSERT(vfsp);
+	zfsvfs = vfsp->vfs_data;
+	ASSERT(zfsvfs);
+	os = zfsvfs->z_os;
+
+	/*
+	 * This function can be called for a snapshot when we update snapshot's
+	 * mount point, which isn't really supported.
+	 */
+	if (dmu_objset_is_snapshot(os))
+		return (EOPNOTSUPP);
+
+	/*
+	 * The act of registering our callbacks will destroy any mount
+	 * options we may have.  In order to enable temporary overrides
+	 * of mount options, we stash away the current values and
+	 * restore them after we register the callbacks.
+	 */
+	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
+	    !spa_writeable(dmu_objset_spa(os))) {
+		readonly = B_TRUE;
+		do_readonly = B_TRUE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+		readonly = B_FALSE;
+		do_readonly = B_TRUE;
+	}
+	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
+		setuid = B_FALSE;
+		do_setuid = B_TRUE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
+		setuid = B_TRUE;
+		do_setuid = B_TRUE;
+	}
+	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
+		exec = B_FALSE;
+		do_exec = B_TRUE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
+		exec = B_TRUE;
+		do_exec = B_TRUE;
+	}
+	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
+		zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
+		do_xattr = B_TRUE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
+		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
+		do_xattr = B_TRUE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
+		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
+		do_xattr = B_TRUE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
+		zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
+		do_xattr = B_TRUE;
+	}
+	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
+		atime = B_FALSE;
+		do_atime = B_TRUE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
+		atime = B_TRUE;
+		do_atime = B_TRUE;
+	}
+
+	/*
+	 * We need to enter pool configuration here, so that we can use
+	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
+	 * dsl_prop_get_integer() can not be used, because it has to acquire
+	 * spa_namespace_lock and we can not do that because we already hold
+	 * z_teardown_lock.  The problem is that spa_write_cachefile() is called
+	 * with spa_namespace_lock held and the function calls ZFS vnode
+	 * operations to write the cache file and thus z_teardown_lock is
+	 * acquired after spa_namespace_lock.
+	 */
+	ds = dmu_objset_ds(os);
+	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+
+	/*
+	 * nbmand is a special property.  It can only be changed at
+	 * mount time.
+	 *
+	 * This is weird, but it is documented to only be changeable
+	 * at mount time.
+	 */
+	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
+		nbmand = B_FALSE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
+		nbmand = B_TRUE;
+	} else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) {
+		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+		return (error);
+	}
+
+	/*
+	 * Register property callbacks.
+	 *
+	 * It would probably be fine to just check for i/o error from
+	 * the first prop_register(), but I guess I like to go
+	 * overboard...
+	 */
+	error = dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
+	    zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
+	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+	if (error)
+		goto unregister;
+
+	/*
+	 * Invoke our callbacks to restore temporary mount options.
+	 */
+	if (do_readonly)
+		readonly_changed_cb(zfsvfs, readonly);
+	if (do_setuid)
+		setuid_changed_cb(zfsvfs, setuid);
+	if (do_exec)
+		exec_changed_cb(zfsvfs, exec);
+	if (do_xattr)
+		xattr_changed_cb(zfsvfs, xattr);
+	if (do_atime)
+		atime_changed_cb(zfsvfs, atime);
+
+	nbmand_changed_cb(zfsvfs, nbmand);
+
+	return (0);
+
+unregister:
+	dsl_prop_unregister_all(ds, zfsvfs);
+	return (error);
+}
+
+/*
+ * Associate this zfsvfs with the given objset, which must be owned.
+ * This will cache a bunch of on-disk state from the objset in the
+ * zfsvfs.
+ */
+static int
+zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
+{
+	int error;
+	uint64_t val;
+
+	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
+	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
+	zfsvfs->z_os = os;
+
+	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
+	if (error != 0)
+		return (error);
+	if (zfsvfs->z_version >
+	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+		(void) printf("Can't mount a version %lld file system "
+		    "on a version %lld pool\n. Pool must be upgraded to mount "
+		    "this file system.", (u_longlong_t)zfsvfs->z_version,
+		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
+		return (SET_ERROR(ENOTSUP));
+	}
+	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_norm = (int)val;
+
+	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_utf8 = (val != 0);
+
+	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_case = (uint_t)val;
+
+	/*
+	 * Fold case on file systems that are always or sometimes case
+	 * insensitive.
+	 */
+	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+	    zfsvfs->z_case == ZFS_CASE_MIXED)
+		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+	uint64_t sa_obj = 0;
+	if (zfsvfs->z_use_sa) {
+		/* should either have both of these objects or none */
+		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+		    &sa_obj);
+		if (error != 0)
+			return (error);
+	}
+
+	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+	    &zfsvfs->z_attr_table);
+	if (error != 0)
+		return (error);
+
+	if (zfsvfs->z_version >= ZPL_VERSION_SA)
+		sa_register_update_callback(os, zfs_sa_upgrade);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+	    &zfsvfs->z_root);
+	if (error != 0)
+		return (error);
+	ASSERT(zfsvfs->z_root != 0);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+	    &zfsvfs->z_unlinkedobj);
+	if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ,
+	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
+	    8, 1, &zfsvfs->z_userquota_obj);
+	if (error == ENOENT)
+		zfsvfs->z_userquota_obj = 0;
+	else if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ,
+	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
+	    8, 1, &zfsvfs->z_groupquota_obj);
+	if (error == ENOENT)
+		zfsvfs->z_groupquota_obj = 0;
+	else if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ,
+	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
+	    8, 1, &zfsvfs->z_projectquota_obj);
+	if (error == ENOENT)
+		zfsvfs->z_projectquota_obj = 0;
+	else if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ,
+	    zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
+	    8, 1, &zfsvfs->z_userobjquota_obj);
+	if (error == ENOENT)
+		zfsvfs->z_userobjquota_obj = 0;
+	else if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ,
+	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
+	    8, 1, &zfsvfs->z_groupobjquota_obj);
+	if (error == ENOENT)
+		zfsvfs->z_groupobjquota_obj = 0;
+	else if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ,
+	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
+	    8, 1, &zfsvfs->z_projectobjquota_obj);
+	if (error == ENOENT)
+		zfsvfs->z_projectobjquota_obj = 0;
+	else if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
+	    &zfsvfs->z_fuid_obj);
+	if (error == ENOENT)
+		zfsvfs->z_fuid_obj = 0;
+	else if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
+	    &zfsvfs->z_shares_dir);
+	if (error == ENOENT)
+		zfsvfs->z_shares_dir = 0;
+	else if (error != 0)
+		return (error);
+
+	/*
+	 * Only use the name cache if we are looking for a
+	 * name on a file system that does not require normalization
+	 * or case folding.  We can also look there if we happen to be
+	 * on a non-normalizing, mixed sensitivity file system IF we
+	 * are looking for the exact name (which is always the case on
+	 * FreeBSD).
+	 */
+	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
+	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
+	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
+
+	return (0);
+}
+
+taskq_t *zfsvfs_taskq;
+
+static void
+zfsvfs_task_unlinked_drain(void *context, int pending __unused)
+{
+
+	zfs_unlinked_drain((zfsvfs_t *)context);
+}
+
+int
+zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
+{
+	objset_t *os;
+	zfsvfs_t *zfsvfs;
+	int error;
+	boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
+
+	/*
+	 * XXX: Fix struct statfs so this isn't necessary!
+	 *
+	 * The 'osname' is used as the filesystem's special node, which means
+	 * it must fit in statfs.f_mntfromname, or else it can't be
+	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
+	 * 'zfs unmount' to think it's not mounted when it is.
+	 */
+	if (strlen(osname) >= MNAMELEN)
+		return (SET_ERROR(ENAMETOOLONG));
+
+	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+	error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
+	    &os);
+	if (error != 0) {
+		kmem_free(zfsvfs, sizeof (zfsvfs_t));
+		return (error);
+	}
+
+	error = zfsvfs_create_impl(zfvp, zfsvfs, os);
+
+	return (error);
+}
+
+
+int
+zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
+{
+	int error;
+
+	zfsvfs->z_vfs = NULL;
+	zfsvfs->z_parent = zfsvfs;
+
+	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+	    offsetof(znode_t, z_link_node));
+	TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
+	    zfsvfs_task_unlinked_drain, zfsvfs);
+#ifdef DIAGNOSTIC
+	rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
+#else
+	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
+#endif
+	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
+	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
+	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+	error = zfsvfs_init(zfsvfs, os);
+	if (error != 0) {
+		dmu_objset_disown(os, B_TRUE, zfsvfs);
+		*zfvp = NULL;
+		kmem_free(zfsvfs, sizeof (zfsvfs_t));
+		return (error);
+	}
+
+	*zfvp = zfsvfs;
+	return (0);
+}
+
+static int
+zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
+{
+	int error;
+
+	/*
+	 * Check for a bad on-disk format version now since we
+	 * lied about owning the dataset readonly before.
+	 */
+	if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+	    dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
+		return (SET_ERROR(EROFS));
+
+	error = zfs_register_callbacks(zfsvfs->z_vfs);
+	if (error)
+		return (error);
+
+	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+
+	/*
+	 * If we are not mounting (ie: online recv), then we don't
+	 * have to worry about replaying the log as we blocked all
+	 * operations out since we closed the ZIL.
+	 */
+	if (mounting) {
+		boolean_t readonly;
+
+		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+		dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
+
+		/*
+		 * During replay we remove the read only flag to
+		 * allow replays to succeed.
+		 */
+		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
+		if (readonly != 0) {
+			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+		} else {
+			dsl_dir_t *dd;
+			zap_stats_t zs;
+
+			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+			    &zs) == 0) {
+				dataset_kstats_update_nunlinks_kstat(
+				    &zfsvfs->z_kstat, zs.zs_num_entries);
+				dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+				    "num_entries in unlinked set: %llu",
+				    zs.zs_num_entries);
+			}
+
+			zfs_unlinked_drain(zfsvfs);
+			dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
+			dd->dd_activity_cancelled = B_FALSE;
+		}
+
+		/*
+		 * Parse and replay the intent log.
+		 *
+		 * Because of ziltest, this must be done after
+		 * zfs_unlinked_drain().  (Further note: ziltest
+		 * doesn't use readonly mounts, where
+		 * zfs_unlinked_drain() isn't called.)  This is because
+		 * ziltest causes spa_sync() to think it's committed,
+		 * but actually it is not, so the intent log contains
+		 * many txg's worth of changes.
+		 *
+		 * In particular, if object N is in the unlinked set in
+		 * the last txg to actually sync, then it could be
+		 * actually freed in a later txg and then reallocated
+		 * in a yet later txg.  This would write a "create
+		 * object N" record to the intent log.  Normally, this
+		 * would be fine because the spa_sync() would have
+		 * written out the fact that object N is free, before
+		 * we could write the "create object N" intent log
+		 * record.
+		 *
+		 * But when we are in ziltest mode, we advance the "open
+		 * txg" without actually spa_sync()-ing the changes to
+		 * disk.  So we would see that object N is still
+		 * allocated and in the unlinked set, and there is an
+		 * intent log record saying to allocate it.
+		 */
+		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+			if (zil_replay_disable) {
+				zil_destroy(zfsvfs->z_log, B_FALSE);
+			} else {
+				boolean_t use_nc = zfsvfs->z_use_namecache;
+				zfsvfs->z_use_namecache = B_FALSE;
+				zfsvfs->z_replay = B_TRUE;
+				zil_replay(zfsvfs->z_os, zfsvfs,
+				    zfs_replay_vector);
+				zfsvfs->z_replay = B_FALSE;
+				zfsvfs->z_use_namecache = use_nc;
+			}
+		}
+
+		/* restore readonly bit */
+		if (readonly != 0)
+			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
+	}
+
+	/*
+	 * Set the objset user_ptr to track its zfsvfs.
+	 */
+	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+
+	return (0);
+}
+
+extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
+
+void
+zfsvfs_free(zfsvfs_t *zfsvfs)
+{
+	int i;
+
+	/*
+	 * This is a barrier to prevent the filesystem from going away in
+	 * zfs_znode_move() until we can safely ensure that the filesystem is
+	 * not unmounted. We consider the filesystem valid before the barrier
+	 * and invalid after the barrier.
+	 */
+	rw_enter(&zfsvfs_lock, RW_READER);
+	rw_exit(&zfsvfs_lock);
+
+	zfs_fuid_destroy(zfsvfs);
+
+	mutex_destroy(&zfsvfs->z_znodes_lock);
+	mutex_destroy(&zfsvfs->z_lock);
+	ASSERT(zfsvfs->z_nr_znodes == 0);
+	list_destroy(&zfsvfs->z_all_znodes);
+	rrm_destroy(&zfsvfs->z_teardown_lock);
+	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
+	rw_destroy(&zfsvfs->z_fuid_lock);
+	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+	dataset_kstats_destroy(&zfsvfs->z_kstat);
+	kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+
+static void
+zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
+{
+	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+	if (zfsvfs->z_vfs) {
+		if (zfsvfs->z_use_fuids) {
+			vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+			vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+			vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+		} else {
+			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+		}
+	}
+	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+}
+
+static int
+zfs_domount(vfs_t *vfsp, char *osname)
+{
+	uint64_t recordsize, fsid_guid;
+	int error = 0;
+	zfsvfs_t *zfsvfs;
+
+	ASSERT(vfsp);
+	ASSERT(osname);
+
+	error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
+	if (error)
+		return (error);
+	zfsvfs->z_vfs = vfsp;
+
+	if ((error = dsl_prop_get_integer(osname,
+	    "recordsize", &recordsize, NULL)))
+		goto out;
+	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
+	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
+
+	vfsp->vfs_data = zfsvfs;
+	vfsp->mnt_flag |= MNT_LOCAL;
+	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
+	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
+	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
+	/*
+	 * This can cause a loss of coherence between ARC and page cache
+	 * on ZoF - unclear if the problem is in FreeBSD or ZoF
+	 */
+	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
+	vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
+	vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;
+
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+	vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
+#endif
+	/*
+	 * The fsid is 64 bits, composed of an 8-bit fs type, which
+	 * separates our fsid from any other filesystem types, and a
+	 * 56-bit objset unique ID.  The objset unique ID is unique to
+	 * all objsets open on this system, provided by unique_create().
+	 * The 8-bit fs type must be put in the low bits of fsid[1]
+	 * because that's where other Solaris filesystems put it.
+	 */
+	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
+	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
+	vfsp->vfs_fsid.val[0] = fsid_guid;
+	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
+	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);
+
+	/*
+	 * Set features for file system.
+	 */
+	zfs_set_fuid_feature(zfsvfs);
+	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
+		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
+		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
+	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
+		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
+		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
+	}
+	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
+
+	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
+		uint64_t pval;
+
+		atime_changed_cb(zfsvfs, B_FALSE);
+		readonly_changed_cb(zfsvfs, B_TRUE);
+		if ((error = dsl_prop_get_integer(osname,
+		    "xattr", &pval, NULL)))
+			goto out;
+		xattr_changed_cb(zfsvfs, pval);
+		zfsvfs->z_issnap = B_TRUE;
+		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
+
+		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+	} else {
+		if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
+			goto out;
+	}
+
+	vfs_mountedfrom(vfsp, osname);
+
+	if (!zfsvfs->z_issnap)
+		zfsctl_create(zfsvfs);
+out:
+	if (error) {
+		dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
+		zfsvfs_free(zfsvfs);
+	} else {
+		atomic_inc_32(&zfs_active_fs_count);
+	}
+
+	return (error);
+}
+
+static void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+	objset_t *os = zfsvfs->z_os;
+
+	if (!dmu_objset_is_snapshot(os))
+		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
+}
+
+#ifdef SECLABEL
+/*
+ * Convert a decimal digit string to a uint64_t integer.
+ */
+static int
+str_to_uint64(char *str, uint64_t *objnum)
+{
+	uint64_t num = 0;
+
+	while (*str) {
+		if (*str < '0' || *str > '9')
+			return (SET_ERROR(EINVAL));
+
+		num = num*10 + *str++ - '0';
+	}
+
+	*objnum = num;
+	return (0);
+}
+
+/*
+ * The boot path passed from the boot loader is in the form of
+ * "rootpool-name/root-filesystem-object-number'. Convert this
+ * string to a dataset name: "rootpool-name/root-filesystem-name".
+ */
+static int
+zfs_parse_bootfs(char *bpath, char *outpath)
+{
+	char *slashp;
+	uint64_t objnum;
+	int error;
+
+	if (*bpath == 0 || *bpath == '/')
+		return (SET_ERROR(EINVAL));
+
+	(void) strcpy(outpath, bpath);
+
+	slashp = strchr(bpath, '/');
+
+	/* if no '/', just return the pool name */
+	if (slashp == NULL) {
+		return (0);
+	}
+
+	/* if not a number, just return the root dataset name */
+	if (str_to_uint64(slashp+1, &objnum)) {
+		return (0);
+	}
+
+	*slashp = '\0';
+	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
+	*slashp = '/';
+
+	return (error);
+}
+
+/*
+ * Check that the hex label string is appropriate for the dataset being
+ * mounted into the global_zone proper.
+ *
+ * Return an error if the hex label string is not default or
+ * admin_low/admin_high.  For admin_low labels, the corresponding
+ * dataset must be readonly.
+ */
+int
+zfs_check_global_label(const char *dsname, const char *hexsl)
+{
+	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+		return (0);
+	if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
+		return (0);
+	if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
+		/* must be readonly */
+		uint64_t rdonly;
+
+		if (dsl_prop_get_integer(dsname,
+		    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
+			return (SET_ERROR(EACCES));
+		return (rdonly ? 0 : EACCES);
+	}
+	return (SET_ERROR(EACCES));
+}
+
+/*
+ * Determine whether the mount is allowed according to MAC check.
+ * by comparing (where appropriate) label of the dataset against
+ * the label of the zone being mounted into.  If the dataset has
+ * no label, create one.
+ *
+ * Returns 0 if access allowed, error otherwise (e.g. EACCES)
+ */
+static int
+zfs_mount_label_policy(vfs_t *vfsp, char *osname)
+{
+	int		error, retv;
+	zone_t		*mntzone = NULL;
+	ts_label_t	*mnt_tsl;
+	bslabel_t	*mnt_sl;
+	bslabel_t	ds_sl;
+	char		ds_hexsl[MAXNAMELEN];
+
+	retv = EACCES;				/* assume the worst */
+
+	/*
+	 * Start by getting the dataset label if it exists.
+	 */
+	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
+	if (error)
+		return (SET_ERROR(EACCES));
+
+	/*
+	 * If labeling is NOT enabled, then disallow the mount of datasets
+	 * which have a non-default label already.  No other label checks
+	 * are needed.
+	 */
+	if (!is_system_labeled()) {
+		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+			return (0);
+		return (SET_ERROR(EACCES));
+	}
+
+	/*
+	 * Get the label of the mountpoint.  If mounting into the global
+	 * zone (i.e. mountpoint is not within an active zone and the
+	 * zoned property is off), the label must be default or
+	 * admin_low/admin_high only; no other checks are needed.
+	 */
+	mntzone = zone_find_by_any_path(vfsp->vfs_mntpt, B_FALSE);
+	if (mntzone->zone_id == GLOBAL_ZONEID) {
+		uint64_t zoned;
+
+		zone_rele(mntzone);
+
+		if (dsl_prop_get_integer(osname,
+		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
+			return (SET_ERROR(EACCES));
+		if (!zoned)
+			return (zfs_check_global_label(osname, ds_hexsl));
+		else
+			/*
+			 * This is the case of a zone dataset being mounted
+			 * initially, before the zone has been fully created;
+			 * allow this mount into global zone.
+			 */
+			return (0);
+	}
+
+	mnt_tsl = mntzone->zone_slabel;
+	ASSERT(mnt_tsl != NULL);
+	label_hold(mnt_tsl);
+	mnt_sl = label2bslabel(mnt_tsl);
+
+	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
+		/*
+		 * The dataset doesn't have a real label, so fabricate one.
+		 */
+		char *str = NULL;
+
+		if (l_to_str_internal(mnt_sl, &str) == 0 &&
+		    dsl_prop_set_string(osname,
+		    zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+		    ZPROP_SRC_LOCAL, str) == 0)
+			retv = 0;
+		if (str != NULL)
+			kmem_free(str, strlen(str) + 1);
+	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
+		/*
+		 * Now compare labels to complete the MAC check.  If the
+		 * labels are equal then allow access.  If the mountpoint
+		 * label dominates the dataset label, allow readonly access.
+		 * Otherwise, access is denied.
+		 */
+		if (blequal(mnt_sl, &ds_sl))
+			retv = 0;
+		else if (bldominates(mnt_sl, &ds_sl)) {
+			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
+			retv = 0;
+		}
+	}
+
+	label_rele(mnt_tsl);
+	zone_rele(mntzone);
+	return (retv);
+}
+#endif	/* SECLABEL */
+
+static int
+getpoolname(const char *osname, char *poolname)
+{
+	char *p;
+
+	p = strchr(osname, '/');
+	if (p == NULL) {
+		if (strlen(osname) >= MAXNAMELEN)
+			return (ENAMETOOLONG);
+		(void) strcpy(poolname, osname);
+	} else {
+		if (p - osname >= MAXNAMELEN)
+			return (ENAMETOOLONG);
+		(void) strncpy(poolname, osname, p - osname);
+		poolname[p - osname] = '\0';
+	}
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_mount(vfs_t *vfsp)
+{
+	kthread_t	*td = curthread;
+	vnode_t		*mvp = vfsp->mnt_vnodecovered;
+	cred_t		*cr = td->td_ucred;
+	char		*osname;
+	int		error = 0;
+	int		canwrite;
+
+	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
+		return (SET_ERROR(EINVAL));
+
+	/*
+	 * If full-owner-access is enabled and delegated administration is
+	 * turned on, we must set nosuid.
+	 */
+	if (zfs_super_owner &&
+	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
+		secpolicy_fs_mount_clearopts(cr, vfsp);
+	}
+
+	/*
+	 * Check for mount privilege?
+	 *
+	 * If we don't have privilege then see if
+	 * we have local permission to allow it
+	 */
+	error = secpolicy_fs_mount(cr, mvp, vfsp);
+	if (error) {
+		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
+			goto out;
+
+		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
+			vattr_t		vattr;
+
+			/*
+			 * Make sure user is the owner of the mount point
+			 * or has sufficient privileges.
+			 */
+
+			vattr.va_mask = AT_UID;
+
+			vn_lock(mvp, LK_SHARED | LK_RETRY);
+			if (VOP_GETATTR(mvp, &vattr, cr)) {
+				VOP_UNLOCK1(mvp);
+				goto out;
+			}
+
+			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
+			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
+				VOP_UNLOCK1(mvp);
+				goto out;
+			}
+			VOP_UNLOCK1(mvp);
+		}
+
+		secpolicy_fs_mount_clearopts(cr, vfsp);
+	}
+
+	/*
+	 * Refuse to mount a filesystem if we are in a local zone and the
+	 * dataset is not visible.
+	 */
+	if (!INGLOBALZONE(curproc) &&
+	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
+		error = SET_ERROR(EPERM);
+		goto out;
+	}
+
+#ifdef SECLABEL
+	error = zfs_mount_label_policy(vfsp, osname);
+	if (error)
+		goto out;
+#endif
+
+	vfsp->vfs_flag |= MNT_NFS4ACLS;
+
+	/*
+	 * When doing a remount, we simply refresh our temporary properties
+	 * according to those options set in the current VFS options.
+	 */
+	if (vfsp->vfs_flag & MS_REMOUNT) {
+		zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+		/*
+		 * Refresh mount options with z_teardown_lock blocking I/O while
+		 * the filesystem is in an inconsistent state.
+		 * The lock also serializes this code with filesystem
+		 * manipulations between entry to zfs_suspend_fs() and return
+		 * from zfs_resume_fs().
+		 */
+		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+		zfs_unregister_callbacks(zfsvfs);
+		error = zfs_register_callbacks(vfsp);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+		goto out;
+	}
+
+	/* Initial root mount: try hard to import the requested root pool. */
+	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
+	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
+		char pname[MAXNAMELEN];
+
+		error = getpoolname(osname, pname);
+		if (error == 0)
+			error = spa_import_rootpool(pname, false);
+		if (error)
+			goto out;
+	}
+	DROP_GIANT();
+	error = zfs_domount(vfsp, osname);
+	PICKUP_GIANT();
+
+out:
+	return (error);
+}
+
+static int
+zfs_statfs(vfs_t *vfsp, struct statfs *statp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	uint64_t refdbytes, availbytes, usedobjs, availobjs;
+
+	statp->f_version = STATFS_VERSION;
+
+	ZFS_ENTER(zfsvfs);
+
+	dmu_objset_space(zfsvfs->z_os,
+	    &refdbytes, &availbytes, &usedobjs, &availobjs);
+
+	/*
+	 * The underlying storage pool actually uses multiple block sizes.
+	 * We report the fragsize as the smallest block size we support,
+	 * and we report our blocksize as the filesystem's maximum blocksize.
+	 */
+	statp->f_bsize = SPA_MINBLOCKSIZE;
+	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
+
+	/*
+	 * The following report "total" blocks of various kinds in the
+	 * file system, but reported in terms of f_frsize - the
+	 * "fragment" size.
+	 */
+
+	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
+	statp->f_bfree = availbytes / statp->f_bsize;
+	statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+	/*
+	 * statvfs() should really be called statufs(), because it assumes
+	 * static metadata.  ZFS doesn't preallocate files, so the best
+	 * we can do is report the max that could possibly fit in f_files,
+	 * and that minus the number actually used in f_ffree.
+	 * For f_ffree, report the smaller of the number of object available
+	 * and the number of blocks (each object will take at least a block).
+	 */
+	statp->f_ffree = MIN(availobjs, statp->f_bfree);
+	statp->f_files = statp->f_ffree + usedobjs;
+
+	/*
+	 * We're a zfs filesystem.
+	 */
+	strlcpy(statp->f_fstypename, "zfs",
+	    sizeof (statp->f_fstypename));
+
+	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
+	    sizeof (statp->f_mntfromname));
+	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
+	    sizeof (statp->f_mntonname));
+
+	statp->f_namemax = MAXNAMELEN - 1;
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+static int
+zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	znode_t *rootzp;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+	if (error == 0)
+		*vpp = ZTOV(rootzp);
+
+	ZFS_EXIT(zfsvfs);
+
+	if (error == 0) {
+		error = vn_lock(*vpp, flags);
+		if (error != 0) {
+			VN_RELE(*vpp);
+			*vpp = NULL;
+		}
+	}
+	return (error);
+}
+
+/*
+ * Teardown the zfsvfs::z_os.
+ *
+ * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
+ * and 'z_teardown_inactive_lock' held.
+ */
+static int
+zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
+{
+	znode_t	*zp;
+	dsl_dir_t *dd;
+
+	/*
+	 * If someone has not already unmounted this file system,
+	 * drain the zrele_taskq to ensure all active references to the
+	 * zfsvfs_t have been handled only then can it be safely destroyed.
+	 */
+	if (zfsvfs->z_os) {
+		/*
+		 * If we're unmounting we have to wait for the list to
+		 * drain completely.
+		 *
+		 * If we're not unmounting there's no guarantee the list
+		 * will drain completely, but zreles run from the taskq
+		 * may add the parents of dir-based xattrs to the taskq
+		 * so we want to wait for these.
+		 *
+		 * We can safely read z_nr_znodes without locking because the
+		 * VFS has already blocked operations which add to the
+		 * z_all_znodes list and thus increment z_nr_znodes.
+		 */
+		int round = 0;
+		while (zfsvfs->z_nr_znodes > 0) {
+			taskq_wait_outstanding(dsl_pool_zrele_taskq(
+			    dmu_objset_pool(zfsvfs->z_os)), 0);
+			if (++round > 1 && !unmounting)
+				break;
+		}
+	}
+	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+
+	if (!unmounting) {
+		/*
+		 * We purge the parent filesystem's vfsp as the parent
+		 * filesystem and all of its snapshots have their vnode's
+		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
+		 * 'z_parent' is self referential for non-snapshots.
+		 */
+#ifdef FREEBSD_NAMECACHE
+		cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
+#endif
+	}
+
+	/*
+	 * Close the zil. NB: Can't close the zil while zfs_inactive
+	 * threads are blocked as zil_close can call zfs_inactive.
+	 */
+	if (zfsvfs->z_log) {
+		zil_close(zfsvfs->z_log);
+		zfsvfs->z_log = NULL;
+	}
+
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
+
+	/*
+	 * If we are not unmounting (ie: online recv) and someone already
+	 * unmounted this file system while we were doing the switcheroo,
+	 * or a reopen of z_os failed then just bail out now.
+	 */
+	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+		return (SET_ERROR(EIO));
+	}
+
+	/*
+	 * At this point there are no vops active, and any new vops will
+	 * fail with EIO since we have z_teardown_lock for writer (only
+	 * relevant for forced unmount).
+	 *
+	 * Release all holds on dbufs.
+	 */
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
+	    zp = list_next(&zfsvfs->z_all_znodes, zp))
+		if (zp->z_sa_hdl) {
+			ASSERT(ZTOV(zp)->v_count >= 0);
+			zfs_znode_dmu_fini(zp);
+		}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	/*
+	 * If we are unmounting, set the unmounted flag and let new vops
+	 * unblock.  zfs_inactive will have the unmounted behavior, and all
+	 * other vops will fail with EIO.
+	 */
+	if (unmounting) {
+		zfsvfs->z_unmounted = B_TRUE;
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+	}
+
+	/*
+	 * z_os will be NULL if there was an error in attempting to reopen
+	 * zfsvfs, so just return as the properties had already been
+	 * unregistered and cached data had been evicted before.
+	 */
+	if (zfsvfs->z_os == NULL)
+		return (0);
+
+	/*
+	 * Unregister properties.
+	 */
+	zfs_unregister_callbacks(zfsvfs);
+
+	/*
+	 * Evict cached data
+	 */
+	if (!zfs_is_readonly(zfsvfs))
+		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+	dmu_objset_evict_dbufs(zfsvfs->z_os);
+	dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
+	dsl_dir_cancel_waiters(dd);
+
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_umount(vfs_t *vfsp, int fflag)
+{
+	kthread_t *td = curthread;
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	objset_t *os;
+	cred_t *cr = td->td_ucred;
+	int ret;
+
+	ret = secpolicy_fs_unmount(cr, vfsp);
+	if (ret) {
+		if (dsl_deleg_access((char *)vfsp->vfs_resource,
+		    ZFS_DELEG_PERM_MOUNT, cr))
+			return (ret);
+	}
+
+	/*
+	 * Unmount any snapshots mounted under .zfs before unmounting the
+	 * dataset itself.
+	 */
+	if (zfsvfs->z_ctldir != NULL) {
+		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
+			return (ret);
+	}
+
+	if (fflag & MS_FORCE) {
+		/*
+		 * Mark file system as unmounted before calling
+		 * vflush(FORCECLOSE). This way we ensure no future vnops
+		 * will be called and risk operating on DOOMED vnodes.
+		 */
+		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+		zfsvfs->z_unmounted = B_TRUE;
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+	}
+
+	/*
+	 * Flush all the files.
+	 */
+	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
+	if (ret != 0)
+		return (ret);
+	while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
+	    &zfsvfs->z_unlinked_drain_task, NULL) != 0)
+		taskqueue_drain(zfsvfs_taskq->tq_queue,
+		    &zfsvfs->z_unlinked_drain_task);
+
+	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+	os = zfsvfs->z_os;
+
+	/*
+	 * z_os will be NULL if there was an error in
+	 * attempting to reopen zfsvfs.
+	 */
+	if (os != NULL) {
+		/*
+		 * Unset the objset user_ptr.
+		 */
+		mutex_enter(&os->os_user_ptr_lock);
+		dmu_objset_set_user(os, NULL);
+		mutex_exit(&os->os_user_ptr_lock);
+
+		/*
+		 * Finally release the objset
+		 */
+		dmu_objset_disown(os, B_TRUE, zfsvfs);
+	}
+
+	/*
+	 * We can now safely destroy the '.zfs' directory node.
+	 */
+	if (zfsvfs->z_ctldir != NULL)
+		zfsctl_destroy(zfsvfs);
+	zfs_freevfs(vfsp);
+
+	return (0);
+}
+
+static int
+zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
+{
+	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
+	znode_t		*zp;
+	int 		err;
+
+	/*
+	 * zfs_zget() can't operate on virtual entries like .zfs/ or
+	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
+	 * This will make NFS to switch to LOOKUP instead of using VGET.
+	 */
+	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
+	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
+		return (EOPNOTSUPP);
+
+	ZFS_ENTER(zfsvfs);
+	err = zfs_zget(zfsvfs, ino, &zp);
+	if (err == 0 && zp->z_unlinked) {
+		vrele(ZTOV(zp));
+		err = EINVAL;
+	}
+	if (err == 0)
+		*vpp = ZTOV(zp);
+	ZFS_EXIT(zfsvfs);
+	if (err == 0) {
+		err = vn_lock(*vpp, flags);
+		if (err != 0)
+			vrele(*vpp);
+	}
+	if (err != 0)
+		*vpp = NULL;
+	return (err);
+}
+
+static int
+#if __FreeBSD_version >= 1300098
+zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
+    struct ucred **credanonp, int *numsecflavors, int *secflavors)
+#else
+zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
+    struct ucred **credanonp, int *numsecflavors, int **secflavors)
+#endif
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+	/*
+	 * If this is regular file system vfsp is the same as
+	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
+	 * zfsvfs->z_parent->z_vfs represents parent file system
+	 * which we have to use here, because only this file system
+	 * has mnt_export configured.
+	 */
+	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
+	    credanonp, numsecflavors, secflavors));
+}
+
+CTASSERT(SHORT_FID_LEN <= sizeof (struct fid));
+CTASSERT(LONG_FID_LEN <= sizeof (struct fid));
+
+static int
+zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
+{
+	struct componentname cn;
+	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
+	znode_t		*zp;
+	vnode_t		*dvp;
+	uint64_t	object = 0;
+	uint64_t	fid_gen = 0;
+	uint64_t	gen_mask;
+	uint64_t	zp_gen;
+	int 		i, err;
+
+	*vpp = NULL;
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * On FreeBSD we can get snapshot's mount point or its parent file
+	 * system mount point depending if snapshot is already mounted or not.
+	 */
+	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
+		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
+		uint64_t	objsetid = 0;
+		uint64_t	setgen = 0;
+
+		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+
+		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+		ZFS_EXIT(zfsvfs);
+
+		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
+		if (err)
+			return (SET_ERROR(EINVAL));
+		ZFS_ENTER(zfsvfs);
+	}
+
+	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+		zfid_short_t	*zfid = (zfid_short_t *)fidp;
+
+		for (i = 0; i < sizeof (zfid->zf_object); i++)
+			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+
+		for (i = 0; i < sizeof (zfid->zf_gen); i++)
+			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+	} else {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
+	 * directory tree. If the object == zfsvfs->z_shares_dir, then
+	 * we are in the .zfs/shares directory tree.
+	 */
+	if ((fid_gen == 0 &&
+	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
+	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
+		ZFS_EXIT(zfsvfs);
+		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
+		if (object == ZFSCTL_INO_SNAPDIR) {
+			cn.cn_nameptr = "snapshot";
+			cn.cn_namelen = strlen(cn.cn_nameptr);
+			cn.cn_nameiop = LOOKUP;
+			cn.cn_flags = ISLASTCN | LOCKLEAF;
+			cn.cn_lkflags = flags;
+			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+			vput(dvp);
+		} else if (object == zfsvfs->z_shares_dir) {
+			/*
+			 * XXX This branch must not be taken,
+			 * if it is, then the lookup below will
+			 * explode.
+			 */
+			cn.cn_nameptr = "shares";
+			cn.cn_namelen = strlen(cn.cn_nameptr);
+			cn.cn_nameiop = LOOKUP;
+			cn.cn_flags = ISLASTCN;
+			cn.cn_lkflags = flags;
+			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+			vput(dvp);
+		} else {
+			*vpp = dvp;
+		}
+		return (err);
+	}
+
+	gen_mask = -1ULL >> (64 - 8 * i);
+
+	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
+	if ((err = zfs_zget(zfsvfs, object, &zp))) {
+		ZFS_EXIT(zfsvfs);
+		return (err);
+	}
+	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+	    sizeof (uint64_t));
+	zp_gen = zp_gen & gen_mask;
+	if (zp_gen == 0)
+		zp_gen = 1;
+	if (zp->z_unlinked || zp_gen != fid_gen) {
+		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
+		vrele(ZTOV(zp));
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	*vpp = ZTOV(zp);
+	ZFS_EXIT(zfsvfs);
+	err = vn_lock(*vpp, flags);
+	if (err == 0)
+		vnode_create_vobject(*vpp, zp->z_size, curthread);
+	else
+		*vpp = NULL;
+	return (err);
+}
+
+/*
+ * Block out VOPs and close zfsvfs_t::z_os
+ *
+ * Note, if successful, then we return with the 'z_teardown_lock' and
+ * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
+ */
+int
+zfs_suspend_fs(zfsvfs_t *zfsvfs)
+{
+	int error;
+
+	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+		return (error);
+
+	return (0);
+}
+
+/*
+ * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
+ * is an invariant across any of the operations that can be performed while the
+ * filesystem was suspended.  Whether it succeeded or failed, the preconditions
+ * are the same: the relevant objset and associated dataset are owned by
+ * zfsvfs, held, and long held on entry.
+ */
+int
+zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+	int err;
+	znode_t *zp;
+
+	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
+	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+	/*
+	 * We already own this, so just update the objset_t, as the one we
+	 * had before may have been evicted.
+	 */
+	objset_t *os;
+	VERIFY3P(ds->ds_owner, ==, zfsvfs);
+	VERIFY(dsl_dataset_long_held(ds));
+	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+	dsl_pool_config_enter(dp, FTAG);
+	VERIFY0(dmu_objset_from_ds(ds, &os));
+	dsl_pool_config_exit(dp, FTAG);
+
+	err = zfsvfs_init(zfsvfs, os);
+	if (err != 0)
+		goto bail;
+
+	ds->ds_dir->dd_activity_cancelled = B_FALSE;
+	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+
+	zfs_set_fuid_feature(zfsvfs);
+
+	/*
+	 * Attempt to re-establish all the active znodes with
+	 * their dbufs.  If a zfs_rezget() fails, then we'll let
+	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
+	 * when they try to use their znode.
+	 */
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+		(void) zfs_rezget(zp);
+	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+bail:
+	/* release the VOPs */
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+
+	if (err) {
+		/*
+		 * Since we couldn't setup the sa framework, try to force
+		 * unmount this file system.
+		 */
+		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
+			vfs_ref(zfsvfs->z_vfs);
+			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
+		}
+	}
+	return (err);
+}
+
+static void
+zfs_freevfs(vfs_t *vfsp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+	zfsvfs_free(zfsvfs);
+
+	atomic_dec_32(&zfs_active_fs_count);
+}
+
+#ifdef __i386__
+static int desiredvnodes_backup;
+#include <sys/vmmeter.h>
+
+
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#endif
+
+static void
+zfs_vnodes_adjust(void)
+{
+#ifdef __i386__
+	int newdesiredvnodes;
+
+	desiredvnodes_backup = desiredvnodes;
+
+	/*
+	 * We calculate newdesiredvnodes the same way it is done in
+	 * vntblinit(). If it is equal to desiredvnodes, it means that
+	 * it wasn't tuned by the administrator and we can tune it down.
+	 */
+	newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
+	    vm_kmem_size / (5 * (sizeof (struct vm_object) +
+	    sizeof (struct vnode))));
+	if (newdesiredvnodes == desiredvnodes)
+		desiredvnodes = (3 * newdesiredvnodes) / 4;
+#endif
+}
+
+static void
+zfs_vnodes_adjust_back(void)
+{
+
+#ifdef __i386__
+	desiredvnodes = desiredvnodes_backup;
+#endif
+}
+
+void
+zfs_init(void)
+{
+
+	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
+
+	/*
+	 * Initialize .zfs directory structures
+	 */
+	zfsctl_init();
+
+	/*
+	 * Initialize znode cache, vnode ops, etc...
+	 */
+	zfs_znode_init();
+
+	/*
+	 * Reduce number of vnodes. Originally number of vnodes is calculated
+	 * with UFS inode in mind. We reduce it here, because it's too big for
+	 * ZFS/i386.
+	 */
+	zfs_vnodes_adjust();
+
+	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
+
+	zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
+}
+
+void
+zfs_fini(void)
+{
+	taskq_destroy(zfsvfs_taskq);
+	zfsctl_fini();
+	zfs_znode_fini();
+	zfs_vnodes_adjust_back();
+}
+
+int
+zfs_busy(void)
+{
+	return (zfs_active_fs_count != 0);
+}
+
+/*
+ * Release VOPs and unmount a suspended filesystem.
+ */
+int
+zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
+	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+	/*
+	 * We already own this, so just hold and rele it to update the
+	 * objset_t, as the one we had before may have been evicted.
+	 */
+	objset_t *os;
+	VERIFY3P(ds->ds_owner, ==, zfsvfs);
+	VERIFY(dsl_dataset_long_held(ds));
+	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+	dsl_pool_config_enter(dp, FTAG);
+	VERIFY0(dmu_objset_from_ds(ds, &os));
+	dsl_pool_config_exit(dp, FTAG);
+	zfsvfs->z_os = os;
+
+	/* release the VOPs */
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+
+	/*
+	 * Try to force unmount this file system.
+	 */
+	(void) zfs_umount(zfsvfs->z_vfs, 0);
+	zfsvfs->z_unmounted = B_TRUE;
+	return (0);
+}
+
+int
+zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
+{
+	int error;
+	objset_t *os = zfsvfs->z_os;
+	dmu_tx_t *tx;
+
+	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
+		return (SET_ERROR(EINVAL));
+
+	if (newvers < zfsvfs->z_version)
+		return (SET_ERROR(EINVAL));
+
+	if (zfs_spa_version_map(newvers) >
+	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
+		return (SET_ERROR(ENOTSUP));
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+		    ZFS_SA_ATTRS);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+	}
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		return (error);
+	}
+
+	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+	    8, 1, &newvers, tx);
+
+	if (error) {
+		dmu_tx_commit(tx);
+		return (error);
+	}
+
+	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+		uint64_t sa_obj;
+
+		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+		    SPA_VERSION_SA);
+		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+		    DMU_OT_NONE, 0, tx);
+
+		error = zap_add(os, MASTER_NODE_OBJ,
+		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+		ASSERT0(error);
+
+		VERIFY(0 == sa_set_sa_object(os, sa_obj));
+		sa_register_update_callback(os, zfs_sa_upgrade);
+	}
+
+	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
+	    "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
+	    (uintmax_t)newvers);
+	dmu_tx_commit(tx);
+
+	zfsvfs->z_version = newvers;
+	os->os_version = newvers;
+
+	zfs_set_fuid_feature(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * Read a property stored within the master node.
+ */
+int
+zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
+{
+	uint64_t *cached_copy = NULL;
+
+	/*
+	 * Figure out where in the objset_t the cached copy would live, if it
+	 * is available for the requested property.
+	 */
+	if (os != NULL) {
+		switch (prop) {
+		case ZFS_PROP_VERSION:
+			cached_copy = &os->os_version;
+			break;
+		case ZFS_PROP_NORMALIZE:
+			cached_copy = &os->os_normalization;
+			break;
+		case ZFS_PROP_UTF8ONLY:
+			cached_copy = &os->os_utf8only;
+			break;
+		case ZFS_PROP_CASE:
+			cached_copy = &os->os_casesensitivity;
+			break;
+		default:
+			break;
+		}
+	}
+	if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+		*value = *cached_copy;
+		return (0);
+	}
+
+	/*
+	 * If the property wasn't cached, look up the file system's value for
+	 * the property. For the version property, we look up a slightly
+	 * different string.
+	 */
+	const char *pname;
+	int error = ENOENT;
+	if (prop == ZFS_PROP_VERSION) {
+		pname = ZPL_VERSION_STR;
+	} else {
+		pname = zfs_prop_to_name(prop);
+	}
+
+	if (os != NULL) {
+		ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
+	}
+
+	if (error == ENOENT) {
+		/* No value set, use the default value */
+		switch (prop) {
+		case ZFS_PROP_VERSION:
+			*value = ZPL_VERSION;
+			break;
+		case ZFS_PROP_NORMALIZE:
+		case ZFS_PROP_UTF8ONLY:
+			*value = 0;
+			break;
+		case ZFS_PROP_CASE:
+			*value = ZFS_CASE_SENSITIVE;
+			break;
+		default:
+			return (error);
+		}
+		error = 0;
+	}
+
+	/*
+	 * If one of the methods for getting the property value above worked,
+	 * copy it into the objset_t's cache.
+	 */
+	if (error == 0 && cached_copy != NULL) {
+		*cached_copy = *value;
+	}
+
+	return (error);
+}
+
+/*
+ * Return true if the corresponding vfs's unmounted flag is set.
+ * Otherwise return false.
+ * If this function returns true we know VFS unmount has been initiated.
+ */
+boolean_t
+zfs_get_vfs_flag_unmounted(objset_t *os)
+{
+	zfsvfs_t *zfvp;
+	boolean_t unmounted = B_FALSE;
+
+	ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
+
+	mutex_enter(&os->os_user_ptr_lock);
+	zfvp = dmu_objset_get_user(os);
+	if (zfvp != NULL && zfvp->z_vfs != NULL &&
+	    (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
+		unmounted = B_TRUE;
+	mutex_exit(&os->os_user_ptr_lock);
+
+	return (unmounted);
+}
+
+#ifdef _KERNEL
+void
+zfsvfs_update_fromname(const char *oldname, const char *newname)
+{
+	char tmpbuf[MAXPATHLEN];
+	struct mount *mp;
+	char *fromname;
+	size_t oldlen;
+
+	oldlen = strlen(oldname);
+
+	mtx_lock(&mountlist_mtx);
+	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+		fromname = mp->mnt_stat.f_mntfromname;
+		if (strcmp(fromname, oldname) == 0) {
+			(void) strlcpy(fromname, newname,
+			    sizeof (mp->mnt_stat.f_mntfromname));
+			continue;
+		}
+		if (strncmp(fromname, oldname, oldlen) == 0 &&
+		    (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
+			(void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s",
+			    newname, fromname + oldlen);
+			(void) strlcpy(fromname, tmpbuf,
+			    sizeof (mp->mnt_stat.f_mntfromname));
+			continue;
+		}
+	}
+	mtx_unlock(&mountlist_mtx);
+}
+#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c
new file mode 100644
index 000000000000..2a4acf21582f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c
@@ -0,0 +1,6629 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/endian.h>
+#include <sys/vm.h>
+#include <sys/vnode.h>
+#if __FreeBSD_version >= 1300102
+#include <sys/smr.h>
+#endif
+#include <sys/dirent.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/uio.h>
+#include <sys/atomic.h>
+#include <sys/namei.h>
+#include <sys/mman.h>
+#include <sys/cmn_err.h>
+#include <sys/kdb.h>
+#include <sys/sysproto.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/policy.h>
+#include <sys/sunddi.h>
+#include <sys/filio.h>
+#include <sys/sid.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_rlock.h>
+#include <sys/extdirent.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/sched.h>
+#include <sys/acl.h>
+#include <sys/vmmeter.h>
+#include <vm/vm_param.h>
+#include <sys/zil.h>
+#include <sys/zfs_vnops.h>
+
+#include <vm/vm_object.h>
+
+#include <sys/extattr.h>
+#include <sys/priv.h>
+
+#ifndef VN_OPEN_INVFS
+#define	VN_OPEN_INVFS	0x0
+#endif
+
+VFS_SMR_DECLARE;
+
+#if __FreeBSD_version >= 1300047
+#define	vm_page_wire_lock(pp)
+#define	vm_page_wire_unlock(pp)
+#else
+#define	vm_page_wire_lock(pp) vm_page_lock(pp)
+#define	vm_page_wire_unlock(pp) vm_page_unlock(pp)
+#endif
+
+static int
+zfs_u8_validate(const char *u8str, size_t n, char **list, int flag, int *errnum)
+{
+
+	return (u8_validate(__DECONST(char *, u8str), n, list, flag, errnum));
+}
+#define	u8_validate zfs_u8_validate
+
+#ifdef DEBUG_VFS_LOCKS
+#define	VNCHECKREF(vp)				  \
+	VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp,	\
+	    ("%s: wrong ref counts", __func__));
+#else
+#define	VNCHECKREF(vp)
+#endif
+
+/*
+ * Programming rules.
+ *
+ * Each vnode op performs some logical unit of work.  To do this, the ZPL must
+ * properly lock its in-core state, create a DMU transaction, do the work,
+ * record this work in the intent log (ZIL), commit the DMU transaction,
+ * and wait for the intent log to commit if it is a synchronous operation.
+ * Moreover, the vnode ops must work in both normal and log replay context.
+ * The ordering of events is important to avoid deadlocks and references
+ * to freed memory.  The example below illustrates the following Big Rules:
+ *
+ *  (1)	A check must be made in each zfs thread for a mounted file system.
+ *	This is done avoiding races using ZFS_ENTER(zfsvfs).
+ *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
+ *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
+ *	can return EIO from the calling function.
+ *
+ *  (2)	VN_RELE() should always be the last thing except for zil_commit()
+ *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
+ *	First, if it's the last reference, the vnode/znode
+ *	can be freed, so the zp may point to freed memory.  Second, the last
+ *	reference will call zfs_zinactive(), which may induce a lot of work --
+ *	pushing cached pages (which acquires range locks) and syncing out
+ *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
+ *	which could deadlock the system if you were already holding one.
+ *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
+ *
+ *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
+ *	as they can span dmu_tx_assign() calls.
+ *
+ *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
+ *      dmu_tx_assign().  This is critical because we don't want to block
+ *      while holding locks.
+ *
+ *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
+ *	reduces lock contention and CPU usage when we must wait (note that if
+ *	throughput is constrained by the storage, nearly every transaction
+ *	must wait).
+ *
+ *      Note, in particular, that if a lock is sometimes acquired before
+ *      the tx assigns, and sometimes after (e.g. z_lock), then failing
+ *      to use a non-blocking assign can deadlock the system.  The scenario:
+ *
+ *	Thread A has grabbed a lock before calling dmu_tx_assign().
+ *	Thread B is in an already-assigned tx, and blocks for this lock.
+ *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ *	forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
+ *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
+ *	to indicate that this operation has already called dmu_tx_wait().
+ *	This will ensure that we don't retry forever, waiting a short bit
+ *	each time.
+ *
+ *  (5)	If the operation succeeded, generate the intent log entry for it
+ *	before dropping locks.  This ensures that the ordering of events
+ *	in the intent log matches the order in which they actually occurred.
+ *	During ZIL replay the zfs_log_* functions will update the sequence
+ *	number to indicate the zil transaction has replayed.
+ *
+ *  (6)	At the end of each vnode op, the DMU tx must always commit,
+ *	regardless of whether there were any errors.
+ *
+ *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
+ *	to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ *	ZFS_ENTER(zfsvfs);		// exit if unmounted
+ * top:
+ *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
+ *	rw_enter(...);			// grab any other locks you need
+ *	tx = dmu_tx_create(...);	// get DMU tx
+ *	dmu_tx_hold_*();		// hold each object you might modify
+ *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ *	if (error) {
+ *		rw_exit(...);		// drop locks
+ *		zfs_dirent_unlock(dl);	// unlock directory entry
+ *		VN_RELE(...);		// release held vnodes
+ *		if (error == ERESTART) {
+ *			waited = B_TRUE;
+ *			dmu_tx_wait(tx);
+ *			dmu_tx_abort(tx);
+ *			goto top;
+ *		}
+ *		dmu_tx_abort(tx);	// abort DMU tx
+ *		ZFS_EXIT(zfsvfs);	// finished in zfs
+ *		return (error);		// really out of space
+ *	}
+ *	error = do_real_work();		// do whatever this VOP does
+ *	if (error == 0)
+ *		zfs_log_*(...);		// on success, make ZIL entry
+ *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
+ *	rw_exit(...);			// drop locks
+ *	zfs_dirent_unlock(dl);		// unlock directory entry
+ *	VN_RELE(...);			// release held vnodes
+ *	zil_commit(zilog, foid);	// synchronous when necessary
+ *	ZFS_EXIT(zfsvfs);		// finished in zfs
+ *	return (error);			// done, report error
+ */
+
+/* ARGSUSED */
+static int
+zfs_open(vnode_t **vpp, int flag, cred_t *cr)
+{
+	znode_t	*zp = VTOZ(*vpp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
+	    ((flag & FAPPEND) == 0)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
+	    ZTOV(zp)->v_type == VREG &&
+	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
+		if (fs_vscan(*vpp, cr, 0) != 0) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EACCES));
+		}
+	}
+
+	/* Keep a count of the synchronous opens in the znode */
+	if (flag & (FSYNC | FDSYNC))
+		atomic_inc_32(&zp->z_sync_cnt);
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
+{
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	/* Decrement the synchronous opens in the znode */
+	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
+		atomic_dec_32(&zp->z_sync_cnt);
+
+	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
+	    ZTOV(zp)->v_type == VREG &&
+	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
+		VERIFY(fs_vscan(vp, cr, 1) == 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
+ * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
+ */
+static int
+zfs_holey(vnode_t *vp, ulong_t cmd, offset_t *off)
+{
+	znode_t	*zp = VTOZ(vp);
+	uint64_t noff = (uint64_t)*off; /* new offset */
+	uint64_t file_sz;
+	int error;
+	boolean_t hole;
+
+	file_sz = zp->z_size;
+	if (noff >= file_sz)  {
+		return (SET_ERROR(ENXIO));
+	}
+
+	if (cmd == _FIO_SEEK_HOLE)
+		hole = B_TRUE;
+	else
+		hole = B_FALSE;
+
+	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
+
+	if (error == ESRCH)
+		return (SET_ERROR(ENXIO));
+
+	/* file was dirty, so fall back to using generic logic */
+	if (error == EBUSY) {
+		if (hole)
+			*off = file_sz;
+
+		return (0);
+	}
+
+	/*
+	 * We could find a hole that begins after the logical end-of-file,
+	 * because dmu_offset_next() only works on whole blocks.  If the
+	 * EOF falls mid-block, then indicate that the "virtual hole"
+	 * at the end of the file begins at the logical EOF, rather than
+	 * at the end of the last block.
+	 */
+	if (noff > file_sz) {
+		ASSERT(hole);
+		noff = file_sz;
+	}
+
+	if (noff < *off)
+		return (error);
+	*off = noff;
+	return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
+    int *rvalp)
+{
+	offset_t off;
+	int error;
+	zfsvfs_t *zfsvfs;
+	znode_t *zp;
+
+	switch (com) {
+	case _FIOFFS:
+	{
+		return (0);
+
+		/*
+		 * The following two ioctls are used by bfu.  Faking out,
+		 * necessary to avoid bfu errors.
+		 */
+	}
+	case _FIOGDIO:
+	case _FIOSDIO:
+	{
+		return (0);
+	}
+
+	case _FIO_SEEK_DATA:
+	case _FIO_SEEK_HOLE:
+	{
+		off = *(offset_t *)data;
+		zp = VTOZ(vp);
+		zfsvfs = zp->z_zfsvfs;
+		ZFS_ENTER(zfsvfs);
+		ZFS_VERIFY_ZP(zp);
+
+		/* offset parameter is in/out */
+		error = zfs_holey(vp, com, &off);
+		ZFS_EXIT(zfsvfs);
+		if (error)
+			return (error);
+		*(offset_t *)data = off;
+		return (0);
+	}
+	}
+	return (SET_ERROR(ENOTTY));
+}
+
+static vm_page_t
+page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
+{
+	vm_object_t obj;
+	vm_page_t pp;
+	int64_t end;
+
+	/*
+	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
+	 * aligned boundaries, if the range is not aligned.  As a result a
+	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
+	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
+	 * the whole page would be considered clean despite have some
+	 * dirty data.
+	 * For this reason we should shrink the range to DEV_BSIZE aligned
+	 * boundaries before calling vm_page_clear_dirty.
+	 */
+	end = rounddown2(off + nbytes, DEV_BSIZE);
+	off = roundup2(off, DEV_BSIZE);
+	nbytes = end - off;
+
+	obj = vp->v_object;
+	zfs_vmobject_assert_wlocked_12(obj);
+#if __FreeBSD_version < 1300050
+	for (;;) {
+		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+		    pp->valid) {
+			if (vm_page_xbusied(pp)) {
+				/*
+				 * Reference the page before unlocking and
+				 * sleeping so that the page daemon is less
+				 * likely to reclaim it.
+				 */
+				vm_page_reference(pp);
+				vm_page_lock(pp);
+				zfs_vmobject_wunlock(obj);
+				vm_page_busy_sleep(pp, "zfsmwb", true);
+				zfs_vmobject_wlock(obj);
+				continue;
+			}
+			vm_page_sbusy(pp);
+		} else if (pp != NULL) {
+			ASSERT(!pp->valid);
+			pp = NULL;
+		}
+		if (pp != NULL) {
+			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+			vm_object_pip_add(obj, 1);
+			pmap_remove_write(pp);
+			if (nbytes != 0)
+				vm_page_clear_dirty(pp, off, nbytes);
+		}
+		break;
+	}
+#else
+	vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start),
+	    VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL |
+	    VM_ALLOC_IGN_SBUSY);
+	if (pp != NULL) {
+		ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+		vm_object_pip_add(obj, 1);
+		pmap_remove_write(pp);
+		if (nbytes != 0)
+			vm_page_clear_dirty(pp, off, nbytes);
+	}
+#endif
+	return (pp);
+}
+
+static void
+page_unbusy(vm_page_t pp)
+{
+
+	vm_page_sunbusy(pp);
+#if __FreeBSD_version >= 1300041
+	vm_object_pip_wakeup(pp->object);
+#else
+	vm_object_pip_subtract(pp->object, 1);
+#endif
+}
+
+#if __FreeBSD_version > 1300051
+static vm_page_t
+page_hold(vnode_t *vp, int64_t start)
+{
+	vm_object_t obj;
+	vm_page_t m;
+
+	obj = vp->v_object;
+	vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start),
+	    VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
+	    VM_ALLOC_NOBUSY);
+	return (m);
+}
+#else
+static vm_page_t
+page_hold(vnode_t *vp, int64_t start)
+{
+	vm_object_t obj;
+	vm_page_t pp;
+
+	obj = vp->v_object;
+	zfs_vmobject_assert_wlocked(obj);
+
+	for (;;) {
+		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+		    pp->valid) {
+			if (vm_page_xbusied(pp)) {
+				/*
+				 * Reference the page before unlocking and
+				 * sleeping so that the page daemon is less
+				 * likely to reclaim it.
+				 */
+				vm_page_reference(pp);
+				vm_page_lock(pp);
+				zfs_vmobject_wunlock(obj);
+				vm_page_busy_sleep(pp, "zfsmwb", true);
+				zfs_vmobject_wlock(obj);
+				continue;
+			}
+
+			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+			vm_page_wire_lock(pp);
+			vm_page_hold(pp);
+			vm_page_wire_unlock(pp);
+
+		} else
+			pp = NULL;
+		break;
+	}
+	return (pp);
+}
+#endif
+
+static void
+page_unhold(vm_page_t pp)
+{
+
+	vm_page_wire_lock(pp);
+#if __FreeBSD_version >= 1300035
+	vm_page_unwire(pp, PQ_ACTIVE);
+#else
+	vm_page_unhold(pp);
+#endif
+	vm_page_wire_unlock(pp);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages.  What this means:
+ *
+ * On Write:	If we find a memory mapped page, we write to *both*
+ *		the page and the dmu buffer.
+ */
+static void
+update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
+    int segflg, dmu_tx_t *tx)
+{
+	vm_object_t obj;
+	struct sf_buf *sf;
+	caddr_t va;
+	int off;
+
+	ASSERT(segflg != UIO_NOCOPY);
+	ASSERT(vp->v_mount != NULL);
+	obj = vp->v_object;
+	ASSERT(obj != NULL);
+
+	off = start & PAGEOFFSET;
+	zfs_vmobject_wlock_12(obj);
+#if __FreeBSD_version >= 1300041
+	vm_object_pip_add(obj, 1);
+#endif
+	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+		vm_page_t pp;
+		int nbytes = imin(PAGESIZE - off, len);
+
+		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
+			zfs_vmobject_wunlock_12(obj);
+
+			va = zfs_map_page(pp, &sf);
+			(void) dmu_read(os, oid, start+off, nbytes,
+			    va+off, DMU_READ_PREFETCH);
+			zfs_unmap_page(sf);
+
+			zfs_vmobject_wlock_12(obj);
+			page_unbusy(pp);
+		}
+		len -= nbytes;
+		off = 0;
+	}
+#if __FreeBSD_version >= 1300041
+	vm_object_pip_wakeup(obj);
+#else
+	vm_object_pip_wakeupn(obj, 0);
+#endif
+	zfs_vmobject_wunlock_12(obj);
+}
+
+/*
+ * Read with UIO_NOCOPY flag means that sendfile(2) requests
+ * ZFS to populate a range of page cache pages with data.
+ *
+ * NOTE: this function could be optimized to pre-allocate
+ * all pages in advance, drain exclusive busy on all of them,
+ * map them into contiguous KVA region and populate them
+ * in one single dmu_read() call.
+ */
+static int
+mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
+{
+	znode_t *zp = VTOZ(vp);
+	objset_t *os = zp->z_zfsvfs->z_os;
+	struct sf_buf *sf;
+	vm_object_t obj;
+	vm_page_t pp;
+	int64_t start;
+	caddr_t va;
+	int len = nbytes;
+	int error = 0;
+
+	ASSERT(uio->uio_segflg == UIO_NOCOPY);
+	ASSERT(vp->v_mount != NULL);
+	obj = vp->v_object;
+	ASSERT(obj != NULL);
+	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
+
+	zfs_vmobject_wlock_12(obj);
+	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
+		int bytes = MIN(PAGESIZE, len);
+
+		pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start),
+		    VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
+		if (vm_page_none_valid(pp)) {
+			zfs_vmobject_wunlock_12(obj);
+			va = zfs_map_page(pp, &sf);
+			error = dmu_read(os, zp->z_id, start, bytes, va,
+			    DMU_READ_PREFETCH);
+			if (bytes != PAGESIZE && error == 0)
+				bzero(va + bytes, PAGESIZE - bytes);
+			zfs_unmap_page(sf);
+			zfs_vmobject_wlock_12(obj);
+#if  __FreeBSD_version >= 1300081
+			if (error == 0) {
+				vm_page_valid(pp);
+				vm_page_activate(pp);
+				vm_page_do_sunbusy(pp);
+			} else {
+				zfs_vmobject_wlock(obj);
+				if (!vm_page_wired(pp) && pp->valid == 0 &&
+				    vm_page_busy_tryupgrade(pp))
+					vm_page_free(pp);
+				else
+					vm_page_sunbusy(pp);
+				zfs_vmobject_wunlock(obj);
+			}
+#else
+			vm_page_do_sunbusy(pp);
+			vm_page_lock(pp);
+			if (error) {
+				if (pp->wire_count == 0 && pp->valid == 0 &&
+				    !vm_page_busied(pp))
+					vm_page_free(pp);
+			} else {
+				pp->valid = VM_PAGE_BITS_ALL;
+				vm_page_activate(pp);
+			}
+			vm_page_unlock(pp);
+#endif
+		} else {
+			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+			vm_page_do_sunbusy(pp);
+		}
+		if (error)
+			break;
+		uio->uio_resid -= bytes;
+		uio->uio_offset += bytes;
+		len -= bytes;
+	}
+	zfs_vmobject_wunlock_12(obj);
+	return (error);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages.  What this means:
+ *
+ * On Read:	We "read" preferentially from memory mapped pages,
+ *		else we default from the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ *	 the file is memory mapped.
+ */
+static int
+mappedread(vnode_t *vp, int nbytes, uio_t *uio)
+{
+	znode_t *zp = VTOZ(vp);
+	vm_object_t obj;
+	int64_t start;
+	int len = nbytes;
+	int off;
+	int error = 0;
+
+	ASSERT(vp->v_mount != NULL);
+	obj = vp->v_object;
+	ASSERT(obj != NULL);
+
+	start = uio->uio_loffset;
+	off = start & PAGEOFFSET;
+	zfs_vmobject_wlock_12(obj);
+	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+		vm_page_t pp;
+		uint64_t bytes = MIN(PAGESIZE - off, len);
+
+		if ((pp = page_hold(vp, start))) {
+			struct sf_buf *sf;
+			caddr_t va;
+
+			zfs_vmobject_wunlock_12(obj);
+			va = zfs_map_page(pp, &sf);
+			error = vn_io_fault_uiomove(va + off, bytes, uio);
+			zfs_unmap_page(sf);
+			zfs_vmobject_wlock_12(obj);
+			page_unhold(pp);
+		} else {
+			zfs_vmobject_wunlock_12(obj);
+			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, bytes);
+			zfs_vmobject_wlock_12(obj);
+		}
+		len -= bytes;
+		off = 0;
+		if (error)
+			break;
+	}
+	zfs_vmobject_wunlock_12(obj);
+	return (error);
+}
+
+offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
+
+/*
+ * Read bytes from specified file into supplied buffer.
+ *
+ *	IN:	vp	- vnode of file to be read from.
+ *		uio	- structure supplying read location, range info,
+ *			  and return buffer.
+ *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *
+ *	OUT:	uio	- updated offset and range, buffer filled.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Side Effects:
+ *	vp - atime updated if byte count > 0
+ */
+/* ARGSUSED */
+static int
+zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	ssize_t		n, nbytes, start_resid;
+	int		error = 0;
+	int64_t		nread;
+	zfs_locked_range_t		*lr;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	/* We don't copy out anything useful for directories. */
+	if (vp->v_type == VDIR) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EISDIR));
+	}
+
+	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EACCES));
+	}
+
+	/*
+	 * Validate file offset
+	 */
+	if (uio->uio_loffset < (offset_t)0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Fasttrack empty reads
+	 */
+	if (uio->uio_resid == 0) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/*
+	 * If we're in FRSYNC mode, sync out this znode before reading it.
+	 */
+	if (zfsvfs->z_log &&
+	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
+		zil_commit(zfsvfs->z_log, zp->z_id);
+
+	/*
+	 * Lock the range against changes.
+	 */
+	lr = zfs_rangelock_enter(&zp->z_rangelock, uio->uio_loffset,
+	    uio->uio_resid, RL_READER);
+
+	/*
+	 * If we are reading past end-of-file we can skip
+	 * to the end; but we might still need to set atime.
+	 */
+	if (uio->uio_loffset >= zp->z_size) {
+		error = 0;
+		goto out;
+	}
+
+	ASSERT(uio->uio_loffset < zp->z_size);
+	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+	start_resid = n;
+
+	while (n > 0) {
+		nbytes = MIN(n, zfs_read_chunk_size -
+		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
+
+		if (uio->uio_segflg == UIO_NOCOPY)
+			error = mappedread_sf(vp, nbytes, uio);
+		else if (vn_has_cached_data(vp)) {
+			error = mappedread(vp, nbytes, uio);
+		} else {
+			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, nbytes);
+		}
+		if (error) {
+			/* convert checksum errors into IO errors */
+			if (error == ECKSUM)
+				error = SET_ERROR(EIO);
+			break;
+		}
+
+		n -= nbytes;
+	}
+
+	nread = start_resid - n;
+	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
+
+out:
+	zfs_rangelock_exit(lr);
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Write the bytes to a file.
+ *
+ *	IN:	vp	- vnode of file to be written to.
+ *		uio	- structure supplying write location, range info,
+ *			  and data buffer.
+ *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
+ *			  set if in append mode.
+ *		cr	- credentials of caller.
+ *		ct	- caller context (NFS/CIFS fem monitor only)
+ *
+ *	OUT:	uio	- updated offset and range.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	vp - ctime|mtime updated if byte count > 0
+ */
+
+/* ARGSUSED */
+static int
+zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr)
+{
+	znode_t		*zp = VTOZ(vp);
+	rlim64_t	limit = MAXOFFSET_T;
+	ssize_t		start_resid = uio->uio_resid;
+	ssize_t		tx_bytes;
+	uint64_t	end_size;
+	dmu_buf_impl_t	*db;
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	zilog_t		*zilog;
+	offset_t	woff;
+	ssize_t		n, nbytes;
+	zfs_locked_range_t		*lr;
+	int		max_blksz = zfsvfs->z_max_blksz;
+	int		error = 0;
+	arc_buf_t	*abuf;
+	iovec_t		*aiov = NULL;
+	xuio_t		*xuio = NULL;
+	int		i_iov = 0;
+	int		iovcnt __unused = uio->uio_iovcnt;
+	iovec_t		*iovp = uio->uio_iov;
+	int		write_eof;
+	int		count = 0;
+	sa_bulk_attr_t	bulk[4];
+	uint64_t	mtime[2], ctime[2];
+	uint64_t	uid, gid, projid;
+	int64_t		nwritten;
+
+	/*
+	 * Fasttrack empty write
+	 */
+	n = start_resid;
+	if (n == 0)
+		return (0);
+
+	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
+		limit = MAXOFFSET_T;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &zp->z_size, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, 8);
+
+	/*
+	 * Callers might not be able to detect properly that we are read-only,
+	 * so check it explicitly here.
+	 */
+	if (zfs_is_readonly(zfsvfs)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EROFS));
+	}
+
+	/*
+	 * If immutable or not appending then return EPERM.
+	 * Intentionally allow ZFS_READONLY through here.
+	 * See zfs_zaccess_common()
+	 */
+	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
+	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
+	    (uio->uio_loffset < zp->z_size))) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	zilog = zfsvfs->z_log;
+
+	/*
+	 * Validate file offset
+	 */
+	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
+	if (woff < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * If in append mode, set the io offset pointer to eof.
+	 */
+	if (ioflag & FAPPEND) {
+		/*
+		 * Obtain an appending range lock to guarantee file append
+		 * semantics.  We reset the write offset once we have the lock.
+		 */
+		lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
+		woff = lr->lr_offset;
+		if (lr->lr_length == UINT64_MAX) {
+			/*
+			 * We overlocked the file because this write will cause
+			 * the file block size to increase.
+			 * Note that zp_size cannot change with this lock held.
+			 */
+			woff = zp->z_size;
+		}
+		uio->uio_loffset = woff;
+	} else {
+		/*
+		 * Note that if the file block size will change as a result of
+		 * this write, then this range lock will lock the entire file
+		 * so that we can re-write the block safely.
+		 */
+		lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
+	}
+
+	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
+		zfs_rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (EFBIG);
+	}
+
+	if (woff >= limit) {
+		zfs_rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EFBIG));
+	}
+
+	if ((woff + n) > limit || woff > (limit - n))
+		n = limit - woff;
+
+	/* Will this write extend the file length? */
+	write_eof = (woff + n > zp->z_size);
+
+	end_size = MAX(zp->z_size, woff + n);
+
+	uid = zp->z_uid;
+	gid = zp->z_gid;
+	projid = zp->z_projid;
+
+	/*
+	 * Write the file in reasonable size chunks.  Each chunk is written
+	 * in a separate transaction; this keeps the intent log records small
+	 * and allows us to do more fine-grained space accounting.
+	 */
+	while (n > 0) {
+		woff = uio->uio_loffset;
+
+		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
+		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
+		    (projid != ZFS_DEFAULT_PROJID &&
+		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+		    projid))) {
+			error = SET_ERROR(EDQUOT);
+			break;
+		}
+
+		abuf = NULL;
+		if (xuio) {
+			ASSERT(i_iov < iovcnt);
+			aiov = &iovp[i_iov];
+			abuf = dmu_xuio_arcbuf(xuio, i_iov);
+			dmu_xuio_clear(xuio, i_iov);
+			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
+			    iovec_t *, aiov, arc_buf_t *, abuf);
+			ASSERT((aiov->iov_base == abuf->b_data) ||
+			    ((char *)aiov->iov_base - (char *)abuf->b_data +
+			    aiov->iov_len == arc_buf_size(abuf)));
+			i_iov++;
+		} else if (n >= max_blksz &&
+		    woff >= zp->z_size &&
+		    P2PHASE(woff, max_blksz) == 0 &&
+		    zp->z_blksz == max_blksz) {
+			/*
+			 * This write covers a full block.  "Borrow" a buffer
+			 * from the dmu so that we can fill it before we enter
+			 * a transaction.  This avoids the possibility of
+			 * holding up the transaction if the data copy hangs
+			 * up on a pagefault (e.g., from an NFS server mapping).
+			 */
+			size_t cbytes;
+
+			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+			    max_blksz);
+			ASSERT(abuf != NULL);
+			ASSERT(arc_buf_size(abuf) == max_blksz);
+			if ((error = uiocopy(abuf->b_data, max_blksz,
+			    UIO_WRITE, uio, &cbytes))) {
+				dmu_return_arcbuf(abuf);
+				break;
+			}
+			ASSERT(cbytes == max_blksz);
+		}
+
+		/*
+		 * Start a transaction.
+		 */
+		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+		db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+		DB_DNODE_ENTER(db);
+		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
+		    MIN(n, max_blksz));
+		DB_DNODE_EXIT(db);
+		zfs_sa_upgrade_txholds(tx, zp);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			if (abuf != NULL)
+				dmu_return_arcbuf(abuf);
+			break;
+		}
+
+		/*
+		 * If zfs_range_lock() over-locked we grow the blocksize
+		 * and then reduce the lock range.  This will only happen
+		 * on the first iteration since zfs_range_reduce() will
+		 * shrink down r_len to the appropriate size.
+		 */
+		if (lr->lr_length == UINT64_MAX) {
+			uint64_t new_blksz;
+
+			if (zp->z_blksz > max_blksz) {
+				/*
+				 * File's blocksize is already larger than the
+				 * "recordsize" property.  Only let it grow to
+				 * the next power of 2.
+				 */
+				ASSERT(!ISP2(zp->z_blksz));
+				new_blksz = MIN(end_size,
+				    1 << highbit64(zp->z_blksz));
+			} else {
+				new_blksz = MIN(end_size, max_blksz);
+			}
+			zfs_grow_blocksize(zp, new_blksz, tx);
+			zfs_rangelock_reduce(lr, woff, n);
+		}
+
+		/*
+		 * XXX - should we really limit each write to z_max_blksz?
+		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+		 */
+		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+
+		if (woff + nbytes > zp->z_size)
+			vnode_pager_setsize(vp, woff + nbytes);
+
+		if (abuf == NULL) {
+			tx_bytes = uio->uio_resid;
+			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, nbytes, tx);
+			tx_bytes -= uio->uio_resid;
+		} else {
+			tx_bytes = nbytes;
+			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
+			/*
+			 * If this is not a full block write, but we are
+			 * extending the file past EOF and this data starts
+			 * block-aligned, use assign_arcbuf().  Otherwise,
+			 * write via dmu_write().
+			 */
+			if (tx_bytes < max_blksz && (!write_eof ||
+			    aiov->iov_base != abuf->b_data)) {
+				ASSERT(xuio);
+				dmu_write(zfsvfs->z_os, zp->z_id, woff,
+				    aiov->iov_len, aiov->iov_base, tx);
+				dmu_return_arcbuf(abuf);
+				xuio_stat_wbuf_copied();
+			} else {
+				ASSERT(xuio || tx_bytes == max_blksz);
+				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff,
+				    abuf, tx);
+			}
+			ASSERT(tx_bytes <= uio->uio_resid);
+			uioskip(uio, tx_bytes);
+		}
+		if (tx_bytes && vn_has_cached_data(vp)) {
+			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
+			    zp->z_id, uio->uio_segflg, tx);
+		}
+
+		/*
+		 * If we made no progress, we're done.  If we made even
+		 * partial progress, update the znode and ZIL accordingly.
+		 */
+		if (tx_bytes == 0) {
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+			    (void *)&zp->z_size, sizeof (uint64_t), tx);
+			dmu_tx_commit(tx);
+			ASSERT(error != 0);
+			break;
+		}
+
+		/*
+		 * Clear Set-UID/Set-GID bits on successful write if not
+		 * privileged and at least one of the execute bits is set.
+		 *
+		 * It would be nice to to this after all writes have
+		 * been done, but that would still expose the ISUID/ISGID
+		 * to another app after the partial write is committed.
+		 *
+		 * Note: we don't call zfs_fuid_map_id() here because
+		 * user 0 is not an ephemeral uid.
+		 */
+		mutex_enter(&zp->z_acl_lock);
+		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
+		    (S_IXUSR >> 6))) != 0 &&
+		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
+		    secpolicy_vnode_setid_retain(vp, cr,
+		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
+			uint64_t newmode;
+			zp->z_mode &= ~(S_ISUID | S_ISGID);
+			newmode = zp->z_mode;
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+			    (void *)&newmode, sizeof (uint64_t), tx);
+		}
+		mutex_exit(&zp->z_acl_lock);
+
+		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+
+		/*
+		 * Update the file size (zp_size) if it has changed;
+		 * account for possible concurrent updates.
+		 */
+		while ((end_size = zp->z_size) < uio->uio_loffset) {
+			(void) atomic_cas_64(&zp->z_size, end_size,
+			    uio->uio_loffset);
+			ASSERT(error == 0 || error == EFAULT);
+		}
+		/*
+		 * If we are replaying and eof is non zero then force
+		 * the file size to the specified eof. Note, there's no
+		 * concurrency during replay.
+		 */
+		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+			zp->z_size = zfsvfs->z_replay_eof;
+
+		if (error == 0)
+			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		else
+			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
+		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes,
+		    ioflag, NULL, NULL);
+		dmu_tx_commit(tx);
+
+		if (error != 0)
+			break;
+		ASSERT(tx_bytes == nbytes);
+		n -= nbytes;
+
+	}
+
+	zfs_rangelock_exit(lr);
+
+	/*
+	 * If we're in replay mode, or we made no progress, return error.
+	 * Otherwise, it's at least a partial write, so it's successful.
+	 */
+	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * EFAULT means that at least one page of the source buffer was not
+	 * available.  VFS will re-try remaining I/O upon this error.
+	 */
+	if (error == EFAULT) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (ioflag & (FSYNC | FDSYNC) ||
+	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, zp->z_id);
+
+	nwritten = start_resid - uio->uio_resid;
+	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+int
+zfs_write_simple(znode_t *zp, const void *data, size_t len,
+    loff_t pos, size_t *presid)
+{
+	int error = 0;
+	ssize_t resid;
+
+	error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos,
+	    UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread);
+
+	if (error) {
+		return (SET_ERROR(error));
+	} else if (presid == NULL) {
+		if (resid != 0) {
+			error = SET_ERROR(EIO);
+		}
+	} else {
+		*presid = resid;
+	}
+	return (error);
+}
+
+static void
+zfs_get_done(zgd_t *zgd, int error)
+{
+	znode_t *zp = zgd->zgd_private;
+	objset_t *os = zp->z_zfsvfs->z_os;
+
+	if (zgd->zgd_db)
+		dmu_buf_rele(zgd->zgd_db, zgd);
+
+	zfs_rangelock_exit(zgd->zgd_lr);
+
+	/*
+	 * Release the vnode asynchronously as we currently have the
+	 * txg stopped from syncing.
+	 */
+	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os)));
+
+	kmem_free(zgd, sizeof (zgd_t));
+}
+
+#ifdef ZFS_DEBUG
+static int zil_fault_io = 0;
+#endif
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+	zfsvfs_t *zfsvfs = arg;
+	objset_t *os = zfsvfs->z_os;
+	znode_t *zp;
+	uint64_t object = lr->lr_foid;
+	uint64_t offset = lr->lr_offset;
+	uint64_t size = lr->lr_length;
+	dmu_buf_t *db;
+	zgd_t *zgd;
+	int error = 0;
+
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3P(zio, !=, NULL);
+	ASSERT3U(size, !=, 0);
+
+	/*
+	 * Nothing to do if the file has been removed
+	 */
+	if (zfs_zget(zfsvfs, object, &zp) != 0)
+		return (SET_ERROR(ENOENT));
+	if (zp->z_unlinked) {
+		/*
+		 * Release the vnode asynchronously as we currently have the
+		 * txg stopped from syncing.
+		 */
+		VN_RELE_ASYNC(ZTOV(zp),
+		    dsl_pool_zrele_taskq(dmu_objset_pool(os)));
+		return (SET_ERROR(ENOENT));
+	}
+
+	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+	zgd->zgd_lwb = lwb;
+	zgd->zgd_private = zp;
+
+	/*
+	 * Write records come in two flavors: immediate and indirect.
+	 * For small writes it's cheaper to store the data with the
+	 * log record (immediate); for large writes it's cheaper to
+	 * sync the data and get a pointer to it (indirect) so that
+	 * we don't have to write the data twice.
+	 */
+	if (buf != NULL) { /* immediate write */
+		zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
+		    size, RL_READER);
+		/* test for truncation needs to be done while range locked */
+		if (offset >= zp->z_size) {
+			error = SET_ERROR(ENOENT);
+		} else {
+			error = dmu_read(os, object, offset, size, buf,
+			    DMU_READ_NO_PREFETCH);
+		}
+		ASSERT(error == 0 || error == ENOENT);
+	} else { /* indirect write */
+		/*
+		 * Have to lock the whole block to ensure when it's
+		 * written out and its checksum is being calculated
+		 * that no one can change the data. We need to re-check
+		 * blocksize after we get the lock in case it's changed!
+		 */
+		for (;;) {
+			uint64_t blkoff;
+			size = zp->z_blksz;
+			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+			offset -= blkoff;
+			zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+			    offset, size, RL_READER);
+			if (zp->z_blksz == size)
+				break;
+			offset += blkoff;
+			zfs_rangelock_exit(zgd->zgd_lr);
+		}
+		/* test for truncation needs to be done while range locked */
+		if (lr->lr_offset >= zp->z_size)
+			error = SET_ERROR(ENOENT);
+#ifdef ZFS_DEBUG
+		if (zil_fault_io) {
+			error = SET_ERROR(EIO);
+			zil_fault_io = 0;
+		}
+#endif
+		if (error == 0)
+			error = dmu_buf_hold(os, object, offset, zgd, &db,
+			    DMU_READ_NO_PREFETCH);
+
+		if (error == 0) {
+			blkptr_t *bp = &lr->lr_blkptr;
+
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
+
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    zfs_get_done, zgd);
+			ASSERT(error || lr->lr_length <= size);
+
+			/*
+			 * On success, we need to wait for the write I/O
+			 * initiated by dmu_sync() to complete before we can
+			 * release this dbuf.  We will finish everything up
+			 * in the zfs_get_done() callback.
+			 */
+			if (error == 0)
+				return (0);
+
+			if (error == EALREADY) {
+				lr->lr_common.lrc_txtype = TX_WRITE2;
+				/*
+				 * TX_WRITE2 relies on the data previously
+				 * written by the TX_WRITE that caused
+				 * EALREADY.  We zero out the BP because
+				 * it is the old, currently-on-disk BP,
+				 * so there's no need to zio_flush() its
+				 * vdevs (flushing would needlesly hurt
+				 * performance, and doesn't work on
+				 * indirect vdevs).
+				 */
+				zgd->zgd_bp = NULL;
+				BP_ZERO(bp);
+				error = 0;
+			}
+		}
+	}
+
+	zfs_get_done(zgd, error);
+
+	return (error);
+}
+
+/*ARGSUSED*/
+static int
+zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
+    caller_context_t *ct)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (flag & V_ACE_MASK)
+		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
+	else
+		error = zfs_zaccess_rwx(zp, mode, flag, cr);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+static int
+zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
+{
+	int error;
+
+	*vpp = arg;
+	error = vn_lock(*vpp, lkflags);
+	if (error != 0)
+		vrele(*vpp);
+	return (error);
+}
+
+static int
+zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
+{
+	znode_t *zdp = VTOZ(dvp);
+	zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs;
+	int error;
+	int ltype;
+
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(dvp, __func__);
+#ifdef DIAGNOSTIC
+	if ((zdp->z_pflags & ZFS_XATTR) == 0)
+		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
+#endif
+
+	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+		ASSERT3P(dvp, ==, vp);
+		vref(dvp);
+		ltype = lkflags & LK_TYPE_MASK;
+		if (ltype != VOP_ISLOCKED(dvp)) {
+			if (ltype == LK_EXCLUSIVE)
+				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+			else /* if (ltype == LK_SHARED) */
+				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+			/*
+			 * Relock for the "." case could leave us with
+			 * reclaimed vnode.
+			 */
+			if (VN_IS_DOOMED(dvp)) {
+				vrele(dvp);
+				return (SET_ERROR(ENOENT));
+			}
+		}
+		return (0);
+	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+		/*
+		 * Note that in this case, dvp is the child vnode, and we
+		 * are looking up the parent vnode - exactly reverse from
+		 * normal operation.  Unlocking dvp requires some rather
+		 * tricky unlock/relock dance to prevent mp from being freed;
+		 * use vn_vget_ino_gen() which takes care of all that.
+		 *
+		 * XXX Note that there is a time window when both vnodes are
+		 * unlocked.  It is possible, although highly unlikely, that
+		 * during that window the parent-child relationship between
+		 * the vnodes may change, for example, get reversed.
+		 * In that case we would have a wrong lock order for the vnodes.
+		 * All other filesystems seem to ignore this problem, so we
+		 * do the same here.
+		 * A potential solution could be implemented as follows:
+		 * - using LK_NOWAIT when locking the second vnode and retrying
+		 *   if necessary
+		 * - checking that the parent-child relationship still holds
+		 *   after locking both vnodes and retrying if it doesn't
+		 */
+		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
+		return (error);
+	} else {
+		error = vn_lock(vp, lkflags);
+		if (error != 0)
+			vrele(vp);
+		return (error);
+	}
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held vnode reference for it.
+ *
+ *	IN:	dvp	- vnode of directory to search.
+ *		nm	- name of entry to lookup.
+ *		pnp	- full pathname to lookup [UNUSED].
+ *		flags	- LOOKUP_XATTR set if looking for an attribute.
+ *		rdir	- root directory vnode [UNUSED].
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *
+ *	OUT:	vpp	- vnode of located entry, NULL if not found.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	NA
+ */
+/* ARGSUSED */
+static int
+zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
+    int nameiop, cred_t *cr, kthread_t *td, int flags, boolean_t cached)
+{
+	znode_t *zdp = VTOZ(dvp);
+	znode_t *zp;
+	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+	int	error = 0;
+
+	/*
+	 * Fast path lookup, however we must skip DNLC lookup
+	 * for case folding or normalizing lookups because the
+	 * DNLC code only stores the passed in name.  This means
+	 * creating 'a' and removing 'A' on a case insensitive
+	 * file system would work, but DNLC still thinks 'a'
+	 * exists and won't let you create it again on the next
+	 * pass through fast path.
+	 */
+	if (!(flags & LOOKUP_XATTR)) {
+		if (dvp->v_type != VDIR) {
+			return (SET_ERROR(ENOTDIR));
+		} else if (zdp->z_sa_hdl == NULL) {
+			return (SET_ERROR(EIO));
+		}
+	}
+
+	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zdp);
+
+	*vpp = NULL;
+
+	if (flags & LOOKUP_XATTR) {
+		/*
+		 * If the xattr property is off, refuse the lookup request.
+		 */
+		if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EOPNOTSUPP));
+		}
+
+		/*
+		 * We don't allow recursive attributes..
+		 * Maybe someday we will.
+		 */
+		if (zdp->z_pflags & ZFS_XATTR) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EINVAL));
+		}
+
+		if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+		*vpp = ZTOV(zp);
+
+		/*
+		 * Do we have permission to get into attribute directory?
+		 */
+		error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr);
+		if (error) {
+			vrele(ZTOV(zp));
+		}
+
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Check accessibility of directory if we're not coming in via
+	 * VOP_CACHEDLOOKUP.
+	 */
+	if (!cached) {
+#ifdef NOEXECCHECK
+		if ((cnp->cn_flags & NOEXECCHECK) != 0) {
+			cnp->cn_flags &= ~NOEXECCHECK;
+		} else
+#endif
+		if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+
+	/*
+	 * First handle the special cases.
+	 */
+	if ((cnp->cn_flags & ISDOTDOT) != 0) {
+		/*
+		 * If we are a snapshot mounted under .zfs, return
+		 * the vp for the snapshot directory.
+		 */
+		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
+			struct componentname cn;
+			vnode_t *zfsctl_vp;
+			int ltype;
+
+			ZFS_EXIT(zfsvfs);
+			ltype = VOP_ISLOCKED(dvp);
+			VOP_UNLOCK1(dvp);
+			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
+			    &zfsctl_vp);
+			if (error == 0) {
+				cn.cn_nameptr = "snapshot";
+				cn.cn_namelen = strlen(cn.cn_nameptr);
+				cn.cn_nameiop = cnp->cn_nameiop;
+				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
+				cn.cn_lkflags = cnp->cn_lkflags;
+				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
+				vput(zfsctl_vp);
+			}
+			vn_lock(dvp, ltype | LK_RETRY);
+			return (error);
+		}
+	}
+	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
+		ZFS_EXIT(zfsvfs);
+		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+			return (SET_ERROR(ENOTSUP));
+		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
+		return (error);
+	}
+
+	/*
+	 * The loop is retry the lookup if the parent-child relationship
+	 * changes during the dot-dot locking complexities.
+	 */
+	for (;;) {
+		uint64_t parent;
+
+		error = zfs_dirlook(zdp, nm, &zp);
+		if (error == 0)
+			*vpp = ZTOV(zp);
+
+		ZFS_EXIT(zfsvfs);
+		if (error != 0)
+			break;
+
+		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+		if (error != 0) {
+			/*
+			 * If we've got a locking error, then the vnode
+			 * got reclaimed because of a force unmount.
+			 * We never enter doomed vnodes into the name cache.
+			 */
+			*vpp = NULL;
+			return (error);
+		}
+
+		if ((cnp->cn_flags & ISDOTDOT) == 0)
+			break;
+
+		ZFS_ENTER(zfsvfs);
+		if (zdp->z_sa_hdl == NULL) {
+			error = SET_ERROR(EIO);
+		} else {
+			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+			    &parent, sizeof (parent));
+		}
+		if (error != 0) {
+			ZFS_EXIT(zfsvfs);
+			vput(ZTOV(zp));
+			break;
+		}
+		if (zp->z_id == parent) {
+			ZFS_EXIT(zfsvfs);
+			break;
+		}
+		vput(ZTOV(zp));
+	}
+
+	if (error != 0)
+		*vpp = NULL;
+
+	/* Translate errors and add SAVENAME when needed. */
+	if (cnp->cn_flags & ISLASTCN) {
+		switch (nameiop) {
+		case CREATE:
+		case RENAME:
+			if (error == ENOENT) {
+				error = EJUSTRETURN;
+				cnp->cn_flags |= SAVENAME;
+				break;
+			}
+			/* FALLTHROUGH */
+		case DELETE:
+			if (error == 0)
+				cnp->cn_flags |= SAVENAME;
+			break;
+		}
+	}
+
+	/* Insert name into cache (as non-existent) if appropriate. */
+	if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
+	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
+		cache_enter(dvp, NULL, cnp);
+
+	/* Insert name into cache if appropriate. */
+	if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
+	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
+		if (!(cnp->cn_flags & ISLASTCN) ||
+		    (nameiop != DELETE && nameiop != RENAME)) {
+			cache_enter(dvp, *vpp, cnp);
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory.  If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error.  Return the vp of the created or trunc'd file.
+ *
+ *	IN:	dvp	- vnode of directory to put new file entry in.
+ *		name	- name of new file entry.
+ *		vap	- attributes of new file.
+ *		excl	- flag indicating exclusive or non-exclusive mode.
+ *		mode	- mode to open file with.
+ *		cr	- credentials of caller.
+ *		flag	- large file flag [UNUSED].
+ *		ct	- caller context
+ *		vsecp	- ACL to be set
+ *
+ *	OUT:	vpp	- vnode of created or trunc'd entry.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime updated if new entry created
+ *	 vp - ctime|mtime always, atime if new
+ */
+
+/* ARGSUSED */
+int
+zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode,
+    znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+	znode_t		*zp;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog;
+	objset_t	*os;
+	dmu_tx_t	*tx;
+	int		error;
+	ksid_t		*ksid;
+	uid_t		uid;
+	gid_t		gid = crgetgid(cr);
+	uint64_t	projid = ZFS_DEFAULT_PROJID;
+	zfs_acl_ids_t   acl_ids;
+	boolean_t	fuid_dirtied;
+	uint64_t	txtype;
+#ifdef DEBUG_VFS_LOCKS
+	vnode_t	*dvp = ZTOV(dzp);
+#endif
+
+	/*
+	 * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure file system is at proper version
+	 */
+
+	ksid = crgetsid(cr, KSID_OWNER);
+	if (ksid)
+		uid = ksid_getid(ksid);
+	else
+		uid = crgetuid(cr);
+
+	if (zfsvfs->z_use_fuids == B_FALSE &&
+	    (vsecp || (vap->va_mask & AT_XVATTR) ||
+	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	os = zfsvfs->z_os;
+	zilog = zfsvfs->z_log;
+
+	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	if (vap->va_mask & AT_XVATTR) {
+		if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
+		    crgetuid(cr), cr, vap->va_type)) != 0) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	*zpp = NULL;
+
+	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
+		vap->va_mode &= ~S_ISVTX;
+
+	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+	if (error) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	ASSERT3P(zp, ==, NULL);
+
+	/*
+	 * Create a new file object and update the directory
+	 * to reference it.
+	 */
+	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		goto out;
+	}
+
+	/*
+	 * We only support the creation of regular files in
+	 * extended attribute directories.
+	 */
+
+	if ((dzp->z_pflags & ZFS_XATTR) &&
+	    (vap->va_type != VREG)) {
+		error = SET_ERROR(EINVAL);
+		goto out;
+	}
+
+	if ((error = zfs_acl_ids_create(dzp, 0, vap,
+	    cr, vsecp, &acl_ids)) != 0)
+		goto out;
+
+	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+		projid = zfs_inherit_projid(dzp);
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+		zfs_acl_ids_free(&acl_ids);
+		error = SET_ERROR(EDQUOT);
+		goto out;
+	}
+
+	getnewvnode_reserve_();
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
+
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+	if (!zfsvfs->z_use_sa &&
+	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+		    0, acl_ids.z_aclp->z_acl_bytes);
+	}
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+	    vsecp, acl_ids.z_fuidp, vap);
+	zfs_acl_ids_free(&acl_ids);
+	dmu_tx_commit(tx);
+
+	getnewvnode_drop_reserve();
+
+out:
+	VNCHECKREF(dvp);
+	if (error == 0) {
+		*zpp = zp;
+	}
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ *	IN:	dvp	- vnode of directory to remove entry from.
+ *		name	- name of entry to remove.
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *		flags	- case flags
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime
+ *	 vp - ctime (if nlink > 0)
+ */
+
+/*ARGSUSED*/
+static int
+zfs_remove_(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
+{
+	znode_t		*dzp = VTOZ(dvp);
+	znode_t		*zp;
+	znode_t		*xzp;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog;
+	uint64_t	xattr_obj;
+	uint64_t	obj = 0;
+	dmu_tx_t	*tx;
+	boolean_t	unlinked;
+	uint64_t	txtype;
+	int		error;
+
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	zp = VTOZ(vp);
+	ZFS_VERIFY_ZP(zp);
+	zilog = zfsvfs->z_log;
+
+	xattr_obj = 0;
+	xzp = NULL;
+
+	if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+		goto out;
+	}
+
+	/*
+	 * Need to use rmdir for removing directories.
+	 */
+	if (vp->v_type == VDIR) {
+		error = SET_ERROR(EPERM);
+		goto out;
+	}
+
+	vnevent_remove(vp, dvp, name, ct);
+
+	obj = zp->z_id;
+
+	/* are there any extended attributes? */
+	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+	    &xattr_obj, sizeof (xattr_obj));
+	if (error == 0 && xattr_obj) {
+		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+		ASSERT0(error);
+	}
+
+	/*
+	 * We may delete the znode now, or we may put it in the unlinked set;
+	 * it depends on whether we're the last link, and on whether there are
+	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
+	 * allow for either case.
+	 */
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	zfs_sa_upgrade_txholds(tx, dzp);
+
+	if (xzp) {
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+	}
+
+	/* charge as an update -- would be nice not to charge at all */
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+	/*
+	 * Mark this transaction as typically resulting in a net free of space
+	 */
+	dmu_tx_mark_netfree(tx);
+
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Remove the directory entry.
+	 */
+	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
+
+	if (error) {
+		dmu_tx_commit(tx);
+		goto out;
+	}
+
+	if (unlinked) {
+		zfs_unlinked_add(zp, tx);
+		vp->v_vflag |= VV_NOSYNC;
+	}
+	/* XXX check changes to linux vnops */
+	txtype = TX_REMOVE;
+	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
+
+	dmu_tx_commit(tx);
+out:
+
+	if (xzp)
+		vrele(ZTOV(xzp));
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+
+static int
+zfs_lookup_internal(znode_t *dzp, char *name, vnode_t **vpp,
+    struct componentname *cnp, int nameiop)
+{
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	int error;
+
+	cnp->cn_nameptr = name;
+	cnp->cn_namelen = strlen(name);
+	cnp->cn_nameiop = nameiop;
+	cnp->cn_flags = ISLASTCN | SAVENAME;
+	cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
+	cnp->cn_cred = kcred;
+	cnp->cn_thread = curthread;
+
+	if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) {
+		struct vop_lookup_args a;
+
+		a.a_gen.a_desc = &vop_lookup_desc;
+		a.a_dvp = ZTOV(dzp);
+		a.a_vpp = vpp;
+		a.a_cnp = cnp;
+		error = vfs_cache_lookup(&a);
+	} else {
+		error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred,
+		    curthread, 0, B_FALSE);
+	}
+#ifdef ZFS_DEBUG
+	if (error) {
+		printf("got error %d on name %s on op %d\n", error, name,
+		    nameiop);
+		kdb_backtrace();
+	}
+#endif
+	return (error);
+}
+
+int
+zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
+{
+	vnode_t *vp;
+	int error;
+	struct componentname cn;
+
+	if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
+		return (error);
+
+	error = zfs_remove_(ZTOV(dzp), vp, name, cr);
+	vput(vp);
+	return (error);
+}
+/*
+ * Create a new directory and insert it into dvp using the name
+ * provided.  Return a pointer to the inserted directory.
+ *
+ *	IN:	dvp	- vnode of directory to add subdir to.
+ *		dirname	- name of new directory.
+ *		vap	- attributes of new directory.
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *		flags	- case flags
+ *		vsecp	- ACL to be set
+ *
+ *	OUT:	vpp	- vnode of created directory.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime updated
+ *	 vp - ctime|mtime|atime updated
+ */
+/*ARGSUSED*/
+int
+zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr,
+    int flags, vsecattr_t *vsecp)
+{
+	znode_t		*zp;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog;
+	uint64_t	txtype;
+	dmu_tx_t	*tx;
+	int		error;
+	ksid_t		*ksid;
+	uid_t		uid;
+	gid_t		gid = crgetgid(cr);
+	zfs_acl_ids_t   acl_ids;
+	boolean_t	fuid_dirtied;
+
+	ASSERT(vap->va_type == VDIR);
+
+	/*
+	 * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure file system is at proper version
+	 */
+
+	ksid = crgetsid(cr, KSID_OWNER);
+	if (ksid)
+		uid = ksid_getid(ksid);
+	else
+		uid = crgetuid(cr);
+	if (zfsvfs->z_use_fuids == B_FALSE &&
+	    ((vap->va_mask & AT_XVATTR) ||
+	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	zilog = zfsvfs->z_log;
+
+	if (dzp->z_pflags & ZFS_XATTR) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(dirname,
+	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	if (vap->va_mask & AT_XVATTR) {
+		if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
+		    crgetuid(cr), cr, vap->va_type)) != 0) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
+	    NULL, &acl_ids)) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * First make sure the new directory doesn't exist.
+	 *
+	 * Existence is checked first to make sure we don't return
+	 * EACCES instead of EEXIST which can cause some applications
+	 * to fail.
+	 */
+	*zpp = NULL;
+
+	if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	ASSERT3P(zp, ==, NULL);
+
+	if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EDQUOT));
+	}
+
+	/*
+	 * Add a new entry to the directory.
+	 */
+	getnewvnode_reserve_();
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+		    acl_ids.z_aclp->z_acl_bytes);
+	}
+
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
+
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Create new node.
+	 */
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	/*
+	 * Now put new name in parent dir.
+	 */
+	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
+
+	*zpp = zp;
+
+	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
+	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
+	    acl_ids.z_fuidp, vap);
+
+	zfs_acl_ids_free(&acl_ids);
+
+	dmu_tx_commit(tx);
+
+	getnewvnode_drop_reserve();
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Remove a directory subdir entry.  If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ *	IN:	dvp	- vnode of directory to remove from.
+ *		name	- name of directory to be removed.
+ *		cwd	- vnode of current working directory.
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *		flags	- case flags
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_rmdir_(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
+{
+	znode_t		*dzp = VTOZ(dvp);
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog;
+	dmu_tx_t	*tx;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	ZFS_VERIFY_ZP(zp);
+	zilog = zfsvfs->z_log;
+
+
+	if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+		goto out;
+	}
+
+	if (vp->v_type != VDIR) {
+		error = SET_ERROR(ENOTDIR);
+		goto out;
+	}
+
+	vnevent_rmdir(vp, dvp, name, ct);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+	zfs_sa_upgrade_txholds(tx, zp);
+	zfs_sa_upgrade_txholds(tx, dzp);
+	dmu_tx_mark_netfree(tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	cache_purge(dvp);
+
+	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
+
+	if (error == 0) {
+		uint64_t txtype = TX_RMDIR;
+		zfs_log_remove(zilog, tx, txtype, dzp, name,
+		    ZFS_NO_OBJECT, B_FALSE);
+	}
+
+	dmu_tx_commit(tx);
+
+	cache_purge(vp);
+out:
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+int
+zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags)
+{
+	struct componentname cn;
+	vnode_t *vp;
+	int error;
+
+	if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
+		return (error);
+
+	error = zfs_rmdir_(ZTOV(dzp), vp, name, cr);
+	vput(vp);
+	return (error);
+}
+
+/*
+ * Read as many directory entries as will fit into the provided
+ * buffer from the given directory cursor position (specified in
+ * the uio structure).
+ *
+ *	IN:	vp	- vnode of directory to read.
+ *		uio	- structure supplying read location, range info,
+ *			  and return buffer.
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *		flags	- case flags
+ *
+ *	OUT:	uio	- updated offset and range, buffer filled.
+ *		eofp	- set to true if end-of-file detected.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	vp - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap is always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+static int
+zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
+    int *ncookies, ulong_t **cookies)
+{
+	znode_t		*zp = VTOZ(vp);
+	iovec_t		*iovp;
+	edirent_t	*eodp;
+	dirent64_t	*odp;
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	objset_t	*os;
+	caddr_t		outbuf;
+	size_t		bufsize;
+	zap_cursor_t	zc;
+	zap_attribute_t	zap;
+	uint_t		bytes_wanted;
+	uint64_t	offset; /* must be unsigned; checks for < 1 */
+	uint64_t	parent;
+	int		local_eof;
+	int		outcount;
+	int		error;
+	uint8_t		prefetch;
+	boolean_t	check_sysattrs;
+	uint8_t		type;
+	int		ncooks;
+	ulong_t		*cooks = NULL;
+	int		flags = 0;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (parent))) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * If we are not given an eof variable,
+	 * use a local one.
+	 */
+	if (eofp == NULL)
+		eofp = &local_eof;
+
+	/*
+	 * Check for valid iov_len.
+	 */
+	if (uio->uio_iov->iov_len <= 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Quit if directory has been removed (posix)
+	 */
+	if ((*eofp = zp->z_unlinked) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	error = 0;
+	os = zfsvfs->z_os;
+	offset = uio->uio_loffset;
+	prefetch = zp->z_zn_prefetch;
+
+	/*
+	 * Initialize the iterator cursor.
+	 */
+	if (offset <= 3) {
+		/*
+		 * Start iteration from the beginning of the directory.
+		 */
+		zap_cursor_init(&zc, os, zp->z_id);
+	} else {
+		/*
+		 * The offset is a serialized cursor.
+		 */
+		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
+	}
+
+	/*
+	 * Get space to change directory entries into fs independent format.
+	 */
+	iovp = uio->uio_iov;
+	bytes_wanted = iovp->iov_len;
+	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
+		bufsize = bytes_wanted;
+		outbuf = kmem_alloc(bufsize, KM_SLEEP);
+		odp = (struct dirent64 *)outbuf;
+	} else {
+		bufsize = bytes_wanted;
+		outbuf = NULL;
+		odp = (struct dirent64 *)iovp->iov_base;
+	}
+	eodp = (struct edirent *)odp;
+
+	if (ncookies != NULL) {
+		/*
+		 * Minimum entry size is dirent size and 1 byte for a file name.
+		 */
+		ncooks = uio->uio_resid / (sizeof (struct dirent) -
+		    sizeof (((struct dirent *)NULL)->d_name) + 1);
+		cooks = malloc(ncooks * sizeof (ulong_t), M_TEMP, M_WAITOK);
+		*cookies = cooks;
+		*ncookies = ncooks;
+	}
+	/*
+	 * If this VFS supports the system attribute view interface; and
+	 * we're looking at an extended attribute directory; and we care
+	 * about normalization conflicts on this vfs; then we must check
+	 * for normalization conflicts with the sysattr name space.
+	 */
+#ifdef TODO
+	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
+	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
+	    (flags & V_RDDIR_ENTFLAGS);
+#else
+	check_sysattrs = 0;
+#endif
+
+	/*
+	 * Transform to file-system independent format
+	 */
+	outcount = 0;
+	while (outcount < bytes_wanted) {
+		ino64_t objnum;
+		ushort_t reclen;
+		off64_t *next = NULL;
+
+		/*
+		 * Special case `.', `..', and `.zfs'.
+		 */
+		if (offset == 0) {
+			(void) strcpy(zap.za_name, ".");
+			zap.za_normalization_conflict = 0;
+			objnum = zp->z_id;
+			type = DT_DIR;
+		} else if (offset == 1) {
+			(void) strcpy(zap.za_name, "..");
+			zap.za_normalization_conflict = 0;
+			objnum = parent;
+			type = DT_DIR;
+		} else if (offset == 2 && zfs_show_ctldir(zp)) {
+			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+			zap.za_normalization_conflict = 0;
+			objnum = ZFSCTL_INO_ROOT;
+			type = DT_DIR;
+		} else {
+			/*
+			 * Grab next entry.
+			 */
+			if ((error = zap_cursor_retrieve(&zc, &zap))) {
+				if ((*eofp = (error == ENOENT)) != 0)
+					break;
+				else
+					goto update;
+			}
+
+			if (zap.za_integer_length != 8 ||
+			    zap.za_num_integers != 1) {
+				cmn_err(CE_WARN, "zap_readdir: bad directory "
+				    "entry, obj = %lld, offset = %lld\n",
+				    (u_longlong_t)zp->z_id,
+				    (u_longlong_t)offset);
+				error = SET_ERROR(ENXIO);
+				goto update;
+			}
+
+			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
+			/*
+			 * MacOS X can extract the object type here such as:
+			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+			 */
+			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+
+			if (check_sysattrs && !zap.za_normalization_conflict) {
+#ifdef TODO
+				zap.za_normalization_conflict =
+				    xattr_sysattr_casechk(zap.za_name);
+#else
+				panic("%s:%u: TODO", __func__, __LINE__);
+#endif
+			}
+		}
+
+		if (flags & V_RDDIR_ACCFILTER) {
+			/*
+			 * If we have no access at all, don't include
+			 * this entry in the returned information
+			 */
+			znode_t	*ezp;
+			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
+				goto skip_entry;
+			if (!zfs_has_access(ezp, cr)) {
+				vrele(ZTOV(ezp));
+				goto skip_entry;
+			}
+			vrele(ZTOV(ezp));
+		}
+
+		if (flags & V_RDDIR_ENTFLAGS)
+			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
+		else
+			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
+
+		/*
+		 * Will this entry fit in the buffer?
+		 */
+		if (outcount + reclen > bufsize) {
+			/*
+			 * Did we manage to fit anything in the buffer?
+			 */
+			if (!outcount) {
+				error = SET_ERROR(EINVAL);
+				goto update;
+			}
+			break;
+		}
+		if (flags & V_RDDIR_ENTFLAGS) {
+			/*
+			 * Add extended flag entry:
+			 */
+			eodp->ed_ino = objnum;
+			eodp->ed_reclen = reclen;
+			/* NOTE: ed_off is the offset for the *next* entry */
+			next = &(eodp->ed_off);
+			eodp->ed_eflags = zap.za_normalization_conflict ?
+			    ED_CASE_CONFLICT : 0;
+			(void) strncpy(eodp->ed_name, zap.za_name,
+			    EDIRENT_NAMELEN(reclen));
+			eodp = (edirent_t *)((intptr_t)eodp + reclen);
+		} else {
+			/*
+			 * Add normal entry:
+			 */
+			odp->d_ino = objnum;
+			odp->d_reclen = reclen;
+			odp->d_namlen = strlen(zap.za_name);
+			/* NOTE: d_off is the offset for the *next* entry. */
+			next = &odp->d_off;
+			strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
+			odp->d_type = type;
+			dirent_terminate(odp);
+			odp = (dirent64_t *)((intptr_t)odp + reclen);
+		}
+		outcount += reclen;
+
+		ASSERT(outcount <= bufsize);
+
+		/* Prefetch znode */
+		if (prefetch)
+			dmu_prefetch(os, objnum, 0, 0, 0,
+			    ZIO_PRIORITY_SYNC_READ);
+
+	skip_entry:
+		/*
+		 * Move to the next entry, fill in the previous offset.
+		 */
+		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+			zap_cursor_advance(&zc);
+			offset = zap_cursor_serialize(&zc);
+		} else {
+			offset += 1;
+		}
+
+		/* Fill the offset right after advancing the cursor. */
+		if (next != NULL)
+			*next = offset;
+		if (cooks != NULL) {
+			*cooks++ = offset;
+			ncooks--;
+			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
+		}
+	}
+	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
+
+	/* Subtract unused cookies */
+	if (ncookies != NULL)
+		*ncookies -= ncooks;
+
+	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
+		iovp->iov_base += outcount;
+		iovp->iov_len -= outcount;
+		uio->uio_resid -= outcount;
+	} else if ((error = uiomove(outbuf, (long)outcount, UIO_READ, uio))) {
+		/*
+		 * Reset the pointer.
+		 */
+		offset = uio->uio_loffset;
+	}
+
+update:
+	zap_cursor_fini(&zc);
+	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
+		kmem_free(outbuf, bufsize);
+
+	if (error == ENOENT)
+		error = 0;
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+	uio->uio_loffset = offset;
+	ZFS_EXIT(zfsvfs);
+	if (error != 0 && cookies != NULL) {
+		free(*cookies, M_TEMP);
+		*cookies = NULL;
+		*ncookies = 0;
+	}
+	return (error);
+}
+
+ulong_t zfs_fsync_sync_cnt = 4;
+
+static int
+zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
+{
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
+
+	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+		ZFS_ENTER(zfsvfs);
+		ZFS_VERIFY_ZP(zp);
+		zil_commit(zfsvfs->z_log, zp->z_id);
+		ZFS_EXIT(zfsvfs);
+	}
+	tsd_set(zfs_fsyncer_key, NULL);
+	return (0);
+}
+
+
+/*
+ * Get the requested file attributes and place them in the provided
+ * vattr structure.
+ *
+ *	IN:	vp	- vnode of file.
+ *		vap	- va_mask identifies requested attributes.
+ *			  If AT_XVATTR set, then optional attrs are requested
+ *		flags	- ATTR_NOACLCHECK (CIFS server context)
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	vap	- attribute values.
+ *
+ *	RETURN:	0 (always succeeds).
+ */
+/* ARGSUSED */
+static int
+zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int	error = 0;
+	uint32_t blksize;
+	u_longlong_t nblocks;
+	uint64_t mtime[2], ctime[2], crtime[2], rdev;
+	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
+	xoptattr_t *xoap = NULL;
+	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+	sa_bulk_attr_t bulk[4];
+	int count = 0;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+	if (vp->v_type == VBLK || vp->v_type == VCHR)
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
+		    &rdev, 8);
+
+	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
+	 * Also, if we are the owner don't bother, since owner should
+	 * always be allowed to read basic attributes of file.
+	 */
+	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
+	    (vap->va_uid != crgetuid(cr))) {
+		if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
+		    skipaclchk, cr))) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	/*
+	 * Return all attributes.  It's cheaper to provide the answer
+	 * than to determine whether we were asked the question.
+	 */
+
+	vap->va_type = IFTOVT(zp->z_mode);
+	vap->va_mode = zp->z_mode & ~S_IFMT;
+	vn_fsid(vp, vap);
+	vap->va_nodeid = zp->z_id;
+	vap->va_nlink = zp->z_links;
+	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
+	    zp->z_links < ZFS_LINK_MAX)
+		vap->va_nlink++;
+	vap->va_size = zp->z_size;
+	if (vp->v_type == VBLK || vp->v_type == VCHR)
+		vap->va_rdev = zfs_cmpldev(rdev);
+	vap->va_seq = zp->z_seq;
+	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
+	vap->va_filerev = zp->z_seq;
+
+	/*
+	 * Add in any requested optional attributes and the create time.
+	 * Also set the corresponding bits in the returned attribute bitmap.
+	 */
+	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
+		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+			xoap->xoa_archive =
+			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
+			XVA_SET_RTN(xvap, XAT_ARCHIVE);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+			xoap->xoa_readonly =
+			    ((zp->z_pflags & ZFS_READONLY) != 0);
+			XVA_SET_RTN(xvap, XAT_READONLY);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+			xoap->xoa_system =
+			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
+			XVA_SET_RTN(xvap, XAT_SYSTEM);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+			xoap->xoa_hidden =
+			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
+			XVA_SET_RTN(xvap, XAT_HIDDEN);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+			xoap->xoa_nounlink =
+			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
+			XVA_SET_RTN(xvap, XAT_NOUNLINK);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+			xoap->xoa_immutable =
+			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
+			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+			xoap->xoa_appendonly =
+			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
+			XVA_SET_RTN(xvap, XAT_APPENDONLY);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+			xoap->xoa_nodump =
+			    ((zp->z_pflags & ZFS_NODUMP) != 0);
+			XVA_SET_RTN(xvap, XAT_NODUMP);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+			xoap->xoa_opaque =
+			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
+			XVA_SET_RTN(xvap, XAT_OPAQUE);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+			xoap->xoa_av_quarantined =
+			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
+			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+			xoap->xoa_av_modified =
+			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
+			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
+		    vp->v_type == VREG) {
+			zfs_sa_get_scanstamp(zp, xvap);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
+			XVA_SET_RTN(xvap, XAT_REPARSE);
+		}
+		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
+			xoap->xoa_generation = zp->z_gen;
+			XVA_SET_RTN(xvap, XAT_GEN);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+			xoap->xoa_offline =
+			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
+			XVA_SET_RTN(xvap, XAT_OFFLINE);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+			xoap->xoa_sparse =
+			    ((zp->z_pflags & ZFS_SPARSE) != 0);
+			XVA_SET_RTN(xvap, XAT_SPARSE);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+			xoap->xoa_projinherit =
+			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
+			XVA_SET_RTN(xvap, XAT_PROJINHERIT);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+			xoap->xoa_projid = zp->z_projid;
+			XVA_SET_RTN(xvap, XAT_PROJID);
+		}
+	}
+
+	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
+	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
+	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
+	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
+
+
+	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+	vap->va_blksize = blksize;
+	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
+
+	if (zp->z_blksz == 0) {
+		/*
+		 * Block size hasn't been set; suggest maximal I/O transfers.
+		 */
+		vap->va_blksize = zfsvfs->z_max_blksz;
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ *	IN:	zp	- znode of file to be modified.
+ *		vap	- new attribute values.
+ *			  If AT_XVATTR set, then optional attrs are being set
+ *		flags	- ATTR_UTIME set if non-default time values provided.
+ *			- ATTR_NOACLCHECK (CIFS context only).
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	vp - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+int
+zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
+{
+	vnode_t		*vp = ZTOV(zp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	objset_t	*os = zfsvfs->z_os;
+	zilog_t		*zilog;
+	dmu_tx_t	*tx;
+	vattr_t		oldva;
+	xvattr_t	tmpxvattr;
+	uint_t		mask = vap->va_mask;
+	uint_t		saved_mask = 0;
+	uint64_t	saved_mode;
+	int		trim_mask = 0;
+	uint64_t	new_mode;
+	uint64_t	new_uid, new_gid;
+	uint64_t	xattr_obj;
+	uint64_t	mtime[2], ctime[2];
+	uint64_t	projid = ZFS_INVALID_PROJID;
+	znode_t		*attrzp;
+	int		need_policy = FALSE;
+	int		err, err2;
+	zfs_fuid_info_t *fuidp = NULL;
+	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
+	xoptattr_t	*xoap;
+	zfs_acl_t	*aclp;
+	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+	boolean_t	fuid_dirtied = B_FALSE;
+	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
+	int		count = 0, xattr_count = 0;
+
+	if (mask == 0)
+		return (0);
+
+	if (mask & AT_NOSET)
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	zilog = zfsvfs->z_log;
+
+	/*
+	 * Make sure that if we have ephemeral uid/gid or xvattr specified
+	 * that file system is at proper version level
+	 */
+
+	if (zfsvfs->z_use_fuids == B_FALSE &&
+	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
+	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
+	    (mask & AT_XVATTR))) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	if (mask & AT_SIZE && vp->v_type == VDIR) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EISDIR));
+	}
+
+	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * If this is an xvattr_t, then get a pointer to the structure of
+	 * optional attributes.  If this is NULL, then we have a vattr_t.
+	 */
+	xoap = xva_getxoptattr(xvap);
+
+	xva_init(&tmpxvattr);
+
+	/*
+	 * Immutable files can only alter immutable bit and atime
+	 */
+	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
+	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
+	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	/*
+	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
+	 */
+
+	/*
+	 * Verify timestamps doesn't overflow 32 bits.
+	 * ZFS can handle large timestamps, but 32bit syscalls can't
+	 * handle times greater than 2039.  This check should be removed
+	 * once large timestamps are fully supported.
+	 */
+	if (mask & (AT_ATIME | AT_MTIME)) {
+		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
+		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EOVERFLOW));
+		}
+	}
+	if (xoap != NULL && (mask & AT_XVATTR)) {
+		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
+		    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EOVERFLOW));
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+			if (!dmu_objset_projectquota_enabled(os) ||
+			    (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) {
+				ZFS_EXIT(zfsvfs);
+				return (SET_ERROR(EOPNOTSUPP));
+			}
+
+			projid = xoap->xoa_projid;
+			if (unlikely(projid == ZFS_INVALID_PROJID)) {
+				ZFS_EXIT(zfsvfs);
+				return (SET_ERROR(EINVAL));
+			}
+
+			if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
+				projid = ZFS_INVALID_PROJID;
+			else
+				need_policy = TRUE;
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
+		    (xoap->xoa_projinherit !=
+		    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
+		    (!dmu_objset_projectquota_enabled(os) ||
+		    (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EOPNOTSUPP));
+		}
+	}
+
+	attrzp = NULL;
+	aclp = NULL;
+
+	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EROFS));
+	}
+
+	/*
+	 * First validate permissions
+	 */
+
+	if (mask & AT_SIZE) {
+		/*
+		 * XXX - Note, we are not providing any open
+		 * mode flags here (like FNDELAY), so we may
+		 * block if there are locks present... this
+		 * should be addressed in openat().
+		 */
+		/* XXX - would it be OK to generate a log record here? */
+		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
+		if (err) {
+			ZFS_EXIT(zfsvfs);
+			return (err);
+		}
+	}
+
+	if (mask & (AT_ATIME|AT_MTIME) ||
+	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
+	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
+	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
+	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
+	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
+	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
+	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
+		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
+		    skipaclchk, cr);
+	}
+
+	if (mask & (AT_UID|AT_GID)) {
+		int	idmask = (mask & (AT_UID|AT_GID));
+		int	take_owner;
+		int	take_group;
+
+		/*
+		 * NOTE: even if a new mode is being set,
+		 * we may clear S_ISUID/S_ISGID bits.
+		 */
+
+		if (!(mask & AT_MODE))
+			vap->va_mode = zp->z_mode;
+
+		/*
+		 * Take ownership or chgrp to group we are a member of
+		 */
+
+		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
+		take_group = (mask & AT_GID) &&
+		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
+
+		/*
+		 * If both AT_UID and AT_GID are set then take_owner and
+		 * take_group must both be set in order to allow taking
+		 * ownership.
+		 *
+		 * Otherwise, send the check through secpolicy_vnode_setattr()
+		 *
+		 */
+
+		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
+		    ((idmask == AT_UID) && take_owner) ||
+		    ((idmask == AT_GID) && take_group)) {
+			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
+			    skipaclchk, cr) == 0) {
+				/*
+				 * Remove setuid/setgid for non-privileged users
+				 */
+				secpolicy_setid_clear(vap, vp, cr);
+				trim_mask = (mask & (AT_UID|AT_GID));
+			} else {
+				need_policy =  TRUE;
+			}
+		} else {
+			need_policy =  TRUE;
+		}
+	}
+
+	oldva.va_mode = zp->z_mode;
+	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
+	if (mask & AT_XVATTR) {
+		/*
+		 * Update xvattr mask to include only those attributes
+		 * that are actually changing.
+		 *
+		 * the bits will be restored prior to actually setting
+		 * the attributes so the caller thinks they were set.
+		 */
+		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+			if (xoap->xoa_appendonly !=
+			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
+				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+			if (xoap->xoa_projinherit !=
+			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
+				XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+			if (xoap->xoa_nounlink !=
+			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
+				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+			if (xoap->xoa_immutable !=
+			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
+				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+			if (xoap->xoa_nodump !=
+			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_NODUMP);
+				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+			if (xoap->xoa_av_modified !=
+			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
+				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+			if ((vp->v_type != VREG &&
+			    xoap->xoa_av_quarantined) ||
+			    xoap->xoa_av_quarantined !=
+			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
+				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EPERM));
+		}
+
+		if (need_policy == FALSE &&
+		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
+		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
+			need_policy = TRUE;
+		}
+	}
+
+	if (mask & AT_MODE) {
+		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
+			err = secpolicy_setid_setsticky_clear(vp, vap,
+			    &oldva, cr);
+			if (err) {
+				ZFS_EXIT(zfsvfs);
+				return (err);
+			}
+			trim_mask |= AT_MODE;
+		} else {
+			need_policy = TRUE;
+		}
+	}
+
+	if (need_policy) {
+		/*
+		 * If trim_mask is set then take ownership
+		 * has been granted or write_acl is present and user
+		 * has the ability to modify mode.  In that case remove
+		 * UID|GID and or MODE from mask so that
+		 * secpolicy_vnode_setattr() doesn't revoke it.
+		 */
+
+		if (trim_mask) {
+			saved_mask = vap->va_mask;
+			vap->va_mask &= ~trim_mask;
+			if (trim_mask & AT_MODE) {
+				/*
+				 * Save the mode, as secpolicy_vnode_setattr()
+				 * will overwrite it with ova.va_mode.
+				 */
+				saved_mode = vap->va_mode;
+			}
+		}
+		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
+		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
+		if (err) {
+			ZFS_EXIT(zfsvfs);
+			return (err);
+		}
+
+		if (trim_mask) {
+			vap->va_mask |= saved_mask;
+			if (trim_mask & AT_MODE) {
+				/*
+				 * Recover the mode after
+				 * secpolicy_vnode_setattr().
+				 */
+				vap->va_mode = saved_mode;
+			}
+		}
+	}
+
+	/*
+	 * secpolicy_vnode_setattr, or take ownership may have
+	 * changed va_mask
+	 */
+	mask = vap->va_mask;
+
+	if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) {
+		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+		    &xattr_obj, sizeof (xattr_obj));
+
+		if (err == 0 && xattr_obj) {
+			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
+			if (err == 0) {
+				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
+				if (err != 0)
+					vrele(ZTOV(attrzp));
+			}
+			if (err)
+				goto out2;
+		}
+		if (mask & AT_UID) {
+			new_uid = zfs_fuid_create(zfsvfs,
+			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+			if (new_uid != zp->z_uid &&
+			    zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
+			    new_uid)) {
+				if (attrzp)
+					vput(ZTOV(attrzp));
+				err = SET_ERROR(EDQUOT);
+				goto out2;
+			}
+		}
+
+		if (mask & AT_GID) {
+			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+			    cr, ZFS_GROUP, &fuidp);
+			if (new_gid != zp->z_gid &&
+			    zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+			    new_gid)) {
+				if (attrzp)
+					vput(ZTOV(attrzp));
+				err = SET_ERROR(EDQUOT);
+				goto out2;
+			}
+		}
+
+		if (projid != ZFS_INVALID_PROJID &&
+		    zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
+			if (attrzp)
+				vput(ZTOV(attrzp));
+			err = SET_ERROR(EDQUOT);
+			goto out2;
+		}
+	}
+	tx = dmu_tx_create(os);
+
+	if (mask & AT_MODE) {
+		uint64_t pmode = zp->z_mode;
+		uint64_t acl_obj;
+		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
+		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
+			err = SET_ERROR(EPERM);
+			goto out;
+		}
+
+		if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
+			goto out;
+
+		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
+			/*
+			 * Are we upgrading ACL from old V0 format
+			 * to V1 format?
+			 */
+			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+			    zfs_znode_acl_version(zp) ==
+			    ZFS_ACL_VERSION_INITIAL) {
+				dmu_tx_hold_free(tx, acl_obj, 0,
+				    DMU_OBJECT_END);
+				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+				    0, aclp->z_acl_bytes);
+			} else {
+				dmu_tx_hold_write(tx, acl_obj, 0,
+				    aclp->z_acl_bytes);
+			}
+		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+			    0, aclp->z_acl_bytes);
+		}
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+	} else {
+		if (((mask & AT_XVATTR) &&
+		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
+		    (projid != ZFS_INVALID_PROJID &&
+		    !(zp->z_pflags & ZFS_PROJID)))
+			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+		else
+			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	}
+
+	if (attrzp) {
+		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
+	}
+
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+
+	zfs_sa_upgrade_txholds(tx, zp);
+
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err)
+		goto out;
+
+	count = 0;
+	/*
+	 * Set each attribute requested.
+	 * We group settings according to the locks they need to acquire.
+	 *
+	 * Note: you cannot set ctime directly, although it will be
+	 * updated as a side-effect of calling this function.
+	 */
+
+	if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
+		/*
+		 * For the existed object that is upgraded from old system,
+		 * its on-disk layout has no slot for the project ID attribute.
+		 * But quota accounting logic needs to access related slots by
+		 * offset directly. So we need to adjust old objects' layout
+		 * to make the project ID to some unified and fixed offset.
+		 */
+		if (attrzp)
+			err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
+		if (err == 0)
+			err = sa_add_projid(zp->z_sa_hdl, tx, projid);
+
+		if (unlikely(err == EEXIST))
+			err = 0;
+		else if (err != 0)
+			goto out;
+		else
+			projid = ZFS_INVALID_PROJID;
+	}
+
+	if (mask & (AT_UID|AT_GID|AT_MODE))
+		mutex_enter(&zp->z_acl_lock);
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+
+	if (attrzp) {
+		if (mask & (AT_UID|AT_GID|AT_MODE))
+			mutex_enter(&attrzp->z_acl_lock);
+		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+		    sizeof (attrzp->z_pflags));
+		if (projid != ZFS_INVALID_PROJID) {
+			attrzp->z_projid = projid;
+			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+			    SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
+			    sizeof (attrzp->z_projid));
+		}
+	}
+
+	if (mask & (AT_UID|AT_GID)) {
+
+		if (mask & AT_UID) {
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+			    &new_uid, sizeof (new_uid));
+			zp->z_uid = new_uid;
+			if (attrzp) {
+				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+				    sizeof (new_uid));
+				attrzp->z_uid = new_uid;
+			}
+		}
+
+		if (mask & AT_GID) {
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+			    NULL, &new_gid, sizeof (new_gid));
+			zp->z_gid = new_gid;
+			if (attrzp) {
+				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+				    sizeof (new_gid));
+				attrzp->z_gid = new_gid;
+			}
+		}
+		if (!(mask & AT_MODE)) {
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+			    NULL, &new_mode, sizeof (new_mode));
+			new_mode = zp->z_mode;
+		}
+		err = zfs_acl_chown_setattr(zp);
+		ASSERT(err == 0);
+		if (attrzp) {
+			err = zfs_acl_chown_setattr(attrzp);
+			ASSERT(err == 0);
+		}
+	}
+
+	if (mask & AT_MODE) {
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+		    &new_mode, sizeof (new_mode));
+		zp->z_mode = new_mode;
+		ASSERT3U((uintptr_t)aclp, !=, 0);
+		err = zfs_aclset_common(zp, aclp, cr, tx);
+		ASSERT0(err);
+		if (zp->z_acl_cached)
+			zfs_acl_free(zp->z_acl_cached);
+		zp->z_acl_cached = aclp;
+		aclp = NULL;
+	}
+
+
+	if (mask & AT_ATIME) {
+		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+		    &zp->z_atime, sizeof (zp->z_atime));
+	}
+
+	if (mask & AT_MTIME) {
+		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+		    mtime, sizeof (mtime));
+	}
+
+	if (projid != ZFS_INVALID_PROJID) {
+		zp->z_projid = projid;
+		SA_ADD_BULK_ATTR(bulk, count,
+		    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
+		    sizeof (zp->z_projid));
+	}
+
+	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
+	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+		    NULL, mtime, sizeof (mtime));
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    &ctime, sizeof (ctime));
+		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+	} else if (mask != 0) {
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    &ctime, sizeof (ctime));
+		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime);
+		if (attrzp) {
+			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+			    SA_ZPL_CTIME(zfsvfs), NULL,
+			    &ctime, sizeof (ctime));
+			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
+			    mtime, ctime);
+		}
+	}
+
+	/*
+	 * Do this after setting timestamps to prevent timestamp
+	 * update from toggling bit
+	 */
+
+	if (xoap && (mask & AT_XVATTR)) {
+
+		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+			xoap->xoa_createtime = vap->va_birthtime;
+		/*
+		 * restore trimmed off masks
+		 * so that return masks can be set for caller.
+		 */
+
+		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
+			XVA_SET_REQ(xvap, XAT_APPENDONLY);
+		}
+		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
+			XVA_SET_REQ(xvap, XAT_NOUNLINK);
+		}
+		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
+			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
+		}
+		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
+			XVA_SET_REQ(xvap, XAT_NODUMP);
+		}
+		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
+			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
+		}
+		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
+			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
+		}
+		if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) {
+			XVA_SET_REQ(xvap, XAT_PROJINHERIT);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+			ASSERT(vp->v_type == VREG);
+
+		zfs_xvattr_set(zp, xvap, tx);
+	}
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	if (mask != 0)
+		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
+
+	if (mask & (AT_UID|AT_GID|AT_MODE))
+		mutex_exit(&zp->z_acl_lock);
+
+	if (attrzp) {
+		if (mask & (AT_UID|AT_GID|AT_MODE))
+			mutex_exit(&attrzp->z_acl_lock);
+	}
+out:
+	if (err == 0 && attrzp) {
+		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
+		    xattr_count, tx);
+		ASSERT(err2 == 0);
+	}
+
+	if (attrzp)
+		vput(ZTOV(attrzp));
+
+	if (aclp)
+		zfs_acl_free(aclp);
+
+	if (fuidp) {
+		zfs_fuid_info_free(fuidp);
+		fuidp = NULL;
+	}
+
+	if (err) {
+		dmu_tx_abort(tx);
+	} else {
+		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		dmu_tx_commit(tx);
+	}
+
+out2:
+	if (os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
+/*
+ * We acquire all but fdvp locks using non-blocking acquisitions.  If we
+ * fail to acquire any lock in the path we will drop all held locks,
+ * acquire the new lock in a blocking fashion, and then release it and
+ * restart the rename.  This acquire/release step ensures that we do not
+ * spin on a lock waiting for release.  On error release all vnode locks
+ * and decrement references the way tmpfs_rename() would do.
+ */
+static int
+zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
+    struct vnode *tdvp, struct vnode **tvpp,
+    const struct componentname *scnp, const struct componentname *tcnp)
+{
+	zfsvfs_t	*zfsvfs;
+	struct vnode	*nvp, *svp, *tvp;
+	znode_t		*sdzp, *tdzp, *szp, *tzp;
+	const char	*snm = scnp->cn_nameptr;
+	const char	*tnm = tcnp->cn_nameptr;
+	int error;
+
+	VOP_UNLOCK1(tdvp);
+	if (*tvpp != NULL && *tvpp != tdvp)
+		VOP_UNLOCK1(*tvpp);
+
+relock:
+	error = vn_lock(sdvp, LK_EXCLUSIVE);
+	if (error)
+		goto out;
+	sdzp = VTOZ(sdvp);
+
+	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
+	if (error != 0) {
+		VOP_UNLOCK1(sdvp);
+		if (error != EBUSY)
+			goto out;
+		error = vn_lock(tdvp, LK_EXCLUSIVE);
+		if (error)
+			goto out;
+		VOP_UNLOCK1(tdvp);
+		goto relock;
+	}
+	tdzp = VTOZ(tdvp);
+
+	/*
+	 * Before using sdzp and tdzp we must ensure that they are live.
+	 * As a porting legacy from illumos we have two things to worry
+	 * about.  One is typical for FreeBSD and it is that the vnode is
+	 * not reclaimed (doomed).  The other is that the znode is live.
+	 * The current code can invalidate the znode without acquiring the
+	 * corresponding vnode lock if the object represented by the znode
+	 * and vnode is no longer valid after a rollback or receive operation.
+	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
+	 * that protects the znodes from the invalidation.
+	 */
+	zfsvfs = sdzp->z_zfsvfs;
+	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
+	 * bypassing the cleanup code in the case of an error.
+	 */
+	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+		ZFS_EXIT(zfsvfs);
+		VOP_UNLOCK1(sdvp);
+		VOP_UNLOCK1(tdvp);
+		error = SET_ERROR(EIO);
+		goto out;
+	}
+
+	/*
+	 * Re-resolve svp to be certain it still exists and fetch the
+	 * correct vnode.
+	 */
+	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
+	if (error != 0) {
+		/* Source entry invalid or not there. */
+		ZFS_EXIT(zfsvfs);
+		VOP_UNLOCK1(sdvp);
+		VOP_UNLOCK1(tdvp);
+		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
+		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
+			error = SET_ERROR(EINVAL);
+		goto out;
+	}
+	svp = ZTOV(szp);
+
+	/*
+	 * Re-resolve tvp, if it disappeared we just carry on.
+	 */
+	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		VOP_UNLOCK1(sdvp);
+		VOP_UNLOCK1(tdvp);
+		vrele(svp);
+		if ((tcnp->cn_flags & ISDOTDOT) != 0)
+			error = SET_ERROR(EINVAL);
+		goto out;
+	}
+	if (tzp != NULL)
+		tvp = ZTOV(tzp);
+	else
+		tvp = NULL;
+
+	/*
+	 * At present the vnode locks must be acquired before z_teardown_lock,
+	 * although it would be more logical to use the opposite order.
+	 */
+	ZFS_EXIT(zfsvfs);
+
+	/*
+	 * Now try acquire locks on svp and tvp.
+	 */
+	nvp = svp;
+	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+	if (error != 0) {
+		VOP_UNLOCK1(sdvp);
+		VOP_UNLOCK1(tdvp);
+		if (tvp != NULL)
+			vrele(tvp);
+		if (error != EBUSY) {
+			vrele(nvp);
+			goto out;
+		}
+		error = vn_lock(nvp, LK_EXCLUSIVE);
+		if (error != 0) {
+			vrele(nvp);
+			goto out;
+		}
+		VOP_UNLOCK1(nvp);
+		/*
+		 * Concurrent rename race.
+		 * XXX ?
+		 */
+		if (nvp == tdvp) {
+			vrele(nvp);
+			error = SET_ERROR(EINVAL);
+			goto out;
+		}
+		vrele(*svpp);
+		*svpp = nvp;
+		goto relock;
+	}
+	vrele(*svpp);
+	*svpp = nvp;
+
+	if (*tvpp != NULL)
+		vrele(*tvpp);
+	*tvpp = NULL;
+	if (tvp != NULL) {
+		nvp = tvp;
+		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+		if (error != 0) {
+			VOP_UNLOCK1(sdvp);
+			VOP_UNLOCK1(tdvp);
+			VOP_UNLOCK1(*svpp);
+			if (error != EBUSY) {
+				vrele(nvp);
+				goto out;
+			}
+			error = vn_lock(nvp, LK_EXCLUSIVE);
+			if (error != 0) {
+				vrele(nvp);
+				goto out;
+			}
+			vput(nvp);
+			goto relock;
+		}
+		*tvpp = nvp;
+	}
+
+	return (0);
+
+out:
+	return (error);
+}
+
+/*
+ * Note that we must use VRELE_ASYNC in this function as it walks
+ * up the directory tree and vrele may need to acquire an exclusive
+ * lock if a last reference to a vnode is dropped.
+ */
+static int
+zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
+{
+	zfsvfs_t	*zfsvfs;
+	znode_t		*zp, *zp1;
+	uint64_t	parent;
+	int		error;
+
+	zfsvfs = tdzp->z_zfsvfs;
+	if (tdzp == szp)
+		return (SET_ERROR(EINVAL));
+	if (tdzp == sdzp)
+		return (0);
+	if (tdzp->z_id == zfsvfs->z_root)
+		return (0);
+	zp = tdzp;
+	for (;;) {
+		ASSERT(!zp->z_unlinked);
+		if ((error = sa_lookup(zp->z_sa_hdl,
+		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+			break;
+
+		if (parent == szp->z_id) {
+			error = SET_ERROR(EINVAL);
+			break;
+		}
+		if (parent == zfsvfs->z_root)
+			break;
+		if (parent == sdzp->z_id)
+			break;
+
+		error = zfs_zget(zfsvfs, parent, &zp1);
+		if (error != 0)
+			break;
+
+		if (zp != tdzp)
+			VN_RELE_ASYNC(ZTOV(zp),
+			    dsl_pool_zrele_taskq(
+			    dmu_objset_pool(zfsvfs->z_os)));
+		zp = zp1;
+	}
+
+	if (error == ENOTDIR)
+		panic("checkpath: .. not a directory\n");
+	if (zp != tdzp)
+		VN_RELE_ASYNC(ZTOV(zp),
+		    dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+	return (error);
+}
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory.  Change the entry name as indicated.
+ *
+ *	IN:	sdvp	- Source directory containing the "old entry".
+ *		snm	- Old entry name.
+ *		tdvp	- Target directory to contain the "new entry".
+ *		tnm	- New entry name.
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *		flags	- case flags
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	sdvp,tdvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_rename_(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
+    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
+    cred_t *cr, int log)
+{
+	zfsvfs_t	*zfsvfs;
+	znode_t		*sdzp, *tdzp, *szp, *tzp;
+	zilog_t		*zilog = NULL;
+	dmu_tx_t	*tx;
+	char		*snm = scnp->cn_nameptr;
+	char		*tnm = tcnp->cn_nameptr;
+	int		error = 0;
+	bool	want_seqc_end __maybe_unused = false;
+
+	/* Reject renames across filesystems. */
+	if ((*svpp)->v_mount != tdvp->v_mount ||
+	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
+		error = SET_ERROR(EXDEV);
+		goto out;
+	}
+
+	if (zfsctl_is_node(tdvp)) {
+		error = SET_ERROR(EXDEV);
+		goto out;
+	}
+
+	/*
+	 * Lock all four vnodes to ensure safety and semantics of renaming.
+	 */
+	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
+	if (error != 0) {
+		/* no vnodes are locked in the case of error here */
+		return (error);
+	}
+
+	tdzp = VTOZ(tdvp);
+	sdzp = VTOZ(sdvp);
+	zfsvfs = tdzp->z_zfsvfs;
+	zilog = zfsvfs->z_log;
+
+	/*
+	 * After we re-enter ZFS_ENTER() we will have to revalidate all
+	 * znodes involved.
+	 */
+	ZFS_ENTER(zfsvfs);
+
+	if (zfsvfs->z_utf8 && u8_validate(tnm,
+	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		error = SET_ERROR(EILSEQ);
+		goto unlockout;
+	}
+
+	/* If source and target are the same file, there is nothing to do. */
+	if ((*svpp) == (*tvpp)) {
+		error = 0;
+		goto unlockout;
+	}
+
+	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
+	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
+	    (*tvpp)->v_mountedhere != NULL)) {
+		error = SET_ERROR(EXDEV);
+		goto unlockout;
+	}
+
+	/*
+	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
+	 * bypassing the cleanup code in the case of an error.
+	 */
+	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+		error = SET_ERROR(EIO);
+		goto unlockout;
+	}
+
+	szp = VTOZ(*svpp);
+	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
+	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
+		error = SET_ERROR(EIO);
+		goto unlockout;
+	}
+
+	/*
+	 * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into/outof an attribute directory.
+	 * See the comment in zfs_link() for why this is considered bad.
+	 */
+	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+		error = SET_ERROR(EINVAL);
+		goto unlockout;
+	}
+
+	/*
+	 * If we are using project inheritance, means if the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
+	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
+	 * such case, we only allow renames into our tree when the project
+	 * IDs are the same.
+	 */
+	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+	    tdzp->z_projid != szp->z_projid) {
+		error = SET_ERROR(EXDEV);
+		goto unlockout;
+	}
+
+	/*
+	 * Must have write access at the source to remove the old entry
+	 * and write access at the target to create the new entry.
+	 * Note that if target and source are the same, this can be
+	 * done in a single check.
+	 */
+	if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
+		goto unlockout;
+
+	if ((*svpp)->v_type == VDIR) {
+		/*
+		 * Avoid ".", "..", and aliases of "." for obvious reasons.
+		 */
+		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
+		    sdzp == szp ||
+		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
+			error = EINVAL;
+			goto unlockout;
+		}
+
+		/*
+		 * Check to make sure rename is valid.
+		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+		 */
+		if ((error = zfs_rename_check(szp, sdzp, tdzp)))
+			goto unlockout;
+	}
+
+	/*
+	 * Does target exist?
+	 */
+	if (tzp) {
+		/*
+		 * Source and target must be the same type.
+		 */
+		if ((*svpp)->v_type == VDIR) {
+			if ((*tvpp)->v_type != VDIR) {
+				error = SET_ERROR(ENOTDIR);
+				goto unlockout;
+			} else {
+				cache_purge(tdvp);
+				if (sdvp != tdvp)
+					cache_purge(sdvp);
+			}
+		} else {
+			if ((*tvpp)->v_type == VDIR) {
+				error = SET_ERROR(EISDIR);
+				goto unlockout;
+			}
+		}
+	}
+
+	vn_seqc_write_begin(*svpp);
+	vn_seqc_write_begin(sdvp);
+	if (*tvpp != NULL)
+		vn_seqc_write_begin(*tvpp);
+	if (tdvp != *tvpp)
+		vn_seqc_write_begin(tdvp);
+#if	__FreeBSD_version >= 1300102
+	want_seqc_end = true;
+#endif
+	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
+	if (tzp)
+		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
+
+	/*
+	 * notify the target directory if it is not the same
+	 * as source directory.
+	 */
+	if (tdvp != sdvp) {
+		vnevent_rename_dest_dir(tdvp, ct);
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+	if (sdzp != tdzp) {
+		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, tdzp);
+	}
+	if (tzp) {
+		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, tzp);
+	}
+
+	zfs_sa_upgrade_txholds(tx, szp);
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		goto unlockout;
+	}
+
+
+	if (tzp)	/* Attempt to remove the existing target */
+		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
+
+	if (error == 0) {
+		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
+		if (error == 0) {
+			szp->z_pflags |= ZFS_AV_MODIFIED;
+
+			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+			ASSERT0(error);
+
+			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
+			    NULL);
+			if (error == 0) {
+				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
+				    snm, tdzp, tnm, szp);
+
+				/*
+				 * Update path information for the target vnode
+				 */
+				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
+			} else {
+				/*
+				 * At this point, we have successfully created
+				 * the target name, but have failed to remove
+				 * the source name.  Since the create was done
+				 * with the ZRENAMING flag, there are
+				 * complications; for one, the link count is
+				 * wrong.  The easiest way to deal with this
+				 * is to remove the newly created target, and
+				 * return the original error.  This must
+				 * succeed; fortunately, it is very unlikely to
+				 * fail, since we just created it.
+				 */
+				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
+				    ZRENAMING, NULL), ==, 0);
+			}
+		}
+		if (error == 0) {
+			cache_purge(*svpp);
+			if (*tvpp != NULL)
+				cache_purge(*tvpp);
+			cache_purge_negative(tdvp);
+		}
+	}
+
+	dmu_tx_commit(tx);
+
+unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
+	ZFS_EXIT(zfsvfs);
+	if (want_seqc_end) {
+		vn_seqc_write_end(*svpp);
+		vn_seqc_write_end(sdvp);
+		if (*tvpp != NULL)
+			vn_seqc_write_end(*tvpp);
+		if (tdvp != *tvpp)
+			vn_seqc_write_end(tdvp);
+		want_seqc_end = false;
+	}
+	VOP_UNLOCK1(*svpp);
+	VOP_UNLOCK1(sdvp);
+
+out:				/* original two vnodes are locked */
+	MPASS(!want_seqc_end);
+	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	if (*tvpp != NULL)
+		VOP_UNLOCK1(*tvpp);
+	if (tdvp != *tvpp)
+		VOP_UNLOCK1(tdvp);
+	return (error);
+}
+
+int
+zfs_rename(znode_t *sdzp, char *sname, znode_t *tdzp, char *tname,
+    cred_t *cr, int flags)
+{
+	struct componentname scn, tcn;
+	vnode_t *sdvp, *tdvp;
+	vnode_t *svp, *tvp;
+	int error;
+	svp = tvp = NULL;
+
+	sdvp = ZTOV(sdzp);
+	tdvp = ZTOV(tdzp);
+	error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE);
+	if (sdzp->z_zfsvfs->z_replay == B_FALSE)
+		VOP_UNLOCK1(sdvp);
+	if (error != 0)
+		goto fail;
+	VOP_UNLOCK1(svp);
+
+	vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
+	error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME);
+	if (error == EJUSTRETURN)
+		tvp = NULL;
+	else if (error != 0) {
+		VOP_UNLOCK1(tdvp);
+		goto fail;
+	}
+
+	error = zfs_rename_(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr, 0);
+fail:
+	if (svp != NULL)
+		vrele(svp);
+	if (tvp != NULL)
+		vrele(tvp);
+
+	return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ *	IN:	dvp	- Directory to contain new symbolic link.
+ *		link	- Name for new symlink entry.
+ *		vap	- Attributes of new entry.
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *		flags	- case flags
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
+    const char *link, znode_t **zpp, cred_t *cr, int flags)
+{
+	znode_t		*zp;
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog;
+	uint64_t	len = strlen(link);
+	int		error;
+	zfs_acl_ids_t	acl_ids;
+	boolean_t	fuid_dirtied;
+	uint64_t	txtype = TX_SYMLINK;
+
+	ASSERT(vap->va_type == VLNK);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	zilog = zfsvfs->z_log;
+
+	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	if (len > MAXPATHLEN) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENAMETOOLONG));
+	}
+
+	if ((error = zfs_acl_ids_create(dzp, 0,
+	    vap, cr, NULL, &acl_ids)) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
+	    0 /* projid */)) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EDQUOT));
+	}
+
+	getnewvnode_reserve_();
+	tx = dmu_tx_create(zfsvfs->z_os);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE + len);
+	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+		    acl_ids.z_aclp->z_acl_bytes);
+	}
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Create a new object for the symlink.
+	 * for version 4 ZPL datsets the symlink will be an SA attribute
+	 */
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	if (zp->z_is_sa)
+		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+		    __DECONST(void *, link), len, tx);
+	else
+		zfs_sa_symlink(zp, __DECONST(char *, link), len, tx);
+
+	zp->z_size = len;
+	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+	    &zp->z_size, sizeof (zp->z_size), tx);
+	/*
+	 * Insert the new object into the directory.
+	 */
+	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+
+	zfs_log_symlink(zilog, tx, txtype, dzp, zp,
+	    __DECONST(char *, name), __DECONST(char *, link));
+	*zpp = zp;
+
+	zfs_acl_ids_free(&acl_ids);
+
+	dmu_tx_commit(tx);
+
+	getnewvnode_drop_reserve();
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ *	IN:	vp	- vnode of symbolic link.
+ *		uio	- structure to contain the link path.
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *
+ *	OUT:	uio	- structure containing the link path.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (zp->z_is_sa)
+		error = sa_lookup_uio(zp->z_sa_hdl,
+		    SA_ZPL_SYMLINK(zfsvfs), uio);
+	else
+		error = zfs_sa_readlink(zp, uio);
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Insert a new entry into directory tdvp referencing svp.
+ *
+ *	IN:	tdvp	- Directory to contain new entry.
+ *		svp	- vnode of new entry.
+ *		name	- name of new entry.
+ *		cr	- credentials of caller.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	tdvp - ctime|mtime updated
+ *	 svp - ctime updated
+ */
+/* ARGSUSED */
+int
+zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
+    int flags)
+{
+	znode_t		*tzp;
+	zfsvfs_t	*zfsvfs = tdzp->z_zfsvfs;
+	zilog_t		*zilog;
+	dmu_tx_t	*tx;
+	int		error;
+	uint64_t	parent;
+	uid_t		owner;
+
+	ASSERT(ZTOV(tdzp)->v_type == VDIR);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(tdzp);
+	zilog = zfsvfs->z_log;
+
+	/*
+	 * POSIX dictates that we return EPERM here.
+	 * Better choices include ENOTSUP or EISDIR.
+	 */
+	if (ZTOV(szp)->v_type == VDIR) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	ZFS_VERIFY_ZP(szp);
+
+	/*
+	 * If we are using project inheritance, means if the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
+	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
+	 * such case, we only allow hard link creation in our tree when the
+	 * project IDs are the same.
+	 */
+	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+	    tdzp->z_projid != szp->z_projid) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EXDEV));
+	}
+
+	if (szp->z_pflags & (ZFS_APPENDONLY |
+	    ZFS_IMMUTABLE | ZFS_READONLY)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	/* Prevent links to .zfs/shares files */
+
+	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (uint64_t))) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	if (parent == zfsvfs->z_shares_dir) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(name,
+	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	/*
+	 * We do not support links between attributes and non-attributes
+	 * because of the potential security risk of creating links
+	 * into "normal" file space in order to circumvent restrictions
+	 * imposed in attribute space.
+	 */
+	if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+
+	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
+	if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW);
+	if (error) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
+	zfs_sa_upgrade_txholds(tx, szp);
+	zfs_sa_upgrade_txholds(tx, tdzp);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	error = zfs_link_create(tdzp, name, szp, tx, 0);
+
+	if (error == 0) {
+		uint64_t txtype = TX_LINK;
+		zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
+	}
+
+	dmu_tx_commit(tx);
+
+	if (error == 0) {
+		vnevent_link(ZTOV(szp), ct);
+	}
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Free or allocate space in a file.  Currently, this function only
+ * supports the `F_FREESP' command.  However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ *	IN:	ip	- inode of file to free data in.
+ *		cmd	- action to take (only F_FREESP supported).
+ *		bfp	- section of file to free/alloc.
+ *		flag	- current file open mode flags.
+ *		offset	- current file offset.
+ *		cr	- credentials of caller.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	ip - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
+    offset_t offset, cred_t *cr)
+{
+	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
+	uint64_t	off, len;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (cmd != F_FREESP) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Callers might not be able to detect properly that we are read-only,
+	 * so check it explicitly here.
+	 */
+	if (zfs_is_readonly(zfsvfs)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EROFS));
+	}
+
+	if (bfp->l_len < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Permissions aren't checked on Solaris because on this OS
+	 * zfs_space() can only be called with an opened file handle.
+	 * On Linux we can get here through truncate_range() which
+	 * operates directly on inodes, so we need to check access rights.
+	 */
+	if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	off = bfp->l_start;
+	len = bfp->l_len; /* 0 means from off to end of file */
+
+	error = zfs_freesp(zp, off, len, flag, TRUE);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*ARGSUSED*/
+static void
+zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+	if (zp->z_sa_hdl == NULL) {
+		/*
+		 * The fs has been unmounted, or we did a
+		 * suspend/resume and this file no longer exists.
+		 */
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		vrecycle(vp);
+		return;
+	}
+
+	if (zp->z_unlinked) {
+		/*
+		 * Fast path to recycle a vnode of a removed file.
+		 */
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		vrecycle(vp);
+		return;
+	}
+
+	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, zp);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+		} else {
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
+			zp->z_atime_dirty = 0;
+			dmu_tx_commit(tx);
+		}
+	}
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+}
+
+
+CTASSERT(sizeof (struct zfid_short) <= sizeof (struct fid));
+CTASSERT(sizeof (struct zfid_long) <= sizeof (struct fid));
+
+/*ARGSUSED*/
+static int
+zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	uint32_t	gen;
+	uint64_t	gen64;
+	uint64_t	object = zp->z_id;
+	zfid_short_t	*zfid;
+	int		size, i, error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+	    &gen64, sizeof (uint64_t))) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	gen = (uint32_t)gen64;
+
+	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
+	fidp->fid_len = size;
+
+	zfid = (zfid_short_t *)fidp;
+
+	zfid->zf_len = size;
+
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	/* Must have a non-zero generation number to distinguish from .zfs */
+	if (gen == 0)
+		gen = 1;
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+	if (size == LONG_FID_LEN) {
+		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
+		zfid_long_t	*zlfid;
+
+		zlfid = (zfid_long_t *)fidp;
+
+		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+		/* XXX - this should be the generation number for the objset */
+		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+			zlfid->zf_setgen[i] = 0;
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+static int
+zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
+    caller_context_t *ct)
+{
+
+	switch (cmd) {
+	case _PC_LINK_MAX:
+		*valp = MIN(LONG_MAX, ZFS_LINK_MAX);
+		return (0);
+
+	case _PC_FILESIZEBITS:
+		*valp = 64;
+		return (0);
+	case _PC_MIN_HOLE_SIZE:
+		*valp = (int)SPA_MINBLOCKSIZE;
+		return (0);
+	case _PC_ACL_EXTENDED:
+		*valp = 0;
+		return (0);
+
+	case _PC_ACL_NFS4:
+		*valp = 1;
+		return (0);
+
+	case _PC_ACL_PATH_MAX:
+		*valp = ACL_MAX_ENTRIES;
+		return (0);
+
+	default:
+		return (EOPNOTSUPP);
+	}
+}
+
+/*ARGSUSED*/
+static int
+zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
+    caller_context_t *ct)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+	zilog_t	*zilog = zfsvfs->z_log;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+static int
+zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
+    int *rahead)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	objset_t *os = zp->z_zfsvfs->z_os;
+	zfs_locked_range_t *lr;
+	vm_object_t object;
+	off_t start, end, obj_size;
+	uint_t blksz;
+	int pgsin_b, pgsin_a;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	start = IDX_TO_OFF(ma[0]->pindex);
+	end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
+
+	/*
+	 * Lock a range covering all required and optional pages.
+	 * Note that we need to handle the case of the block size growing.
+	 */
+	for (;;) {
+		blksz = zp->z_blksz;
+		lr = zfs_rangelock_tryenter(&zp->z_rangelock,
+		    rounddown(start, blksz),
+		    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
+		if (lr == NULL) {
+			if (rahead != NULL) {
+				*rahead = 0;
+				rahead = NULL;
+			}
+			if (rbehind != NULL) {
+				*rbehind = 0;
+				rbehind = NULL;
+			}
+			break;
+		}
+		if (blksz == zp->z_blksz)
+			break;
+		zfs_rangelock_exit(lr);
+	}
+
+	object = ma[0]->object;
+	zfs_vmobject_wlock(object);
+	obj_size = object->un_pager.vnp.vnp_size;
+	zfs_vmobject_wunlock(object);
+	if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
+		zfs_rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (zfs_vm_pagerret_bad);
+	}
+
+	pgsin_b = 0;
+	if (rbehind != NULL) {
+		pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
+		pgsin_b = MIN(*rbehind, pgsin_b);
+	}
+
+	pgsin_a = 0;
+	if (rahead != NULL) {
+		pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
+		if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
+			pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
+		pgsin_a = MIN(*rahead, pgsin_a);
+	}
+
+	/*
+	 * NB: we need to pass the exact byte size of the data that we expect
+	 * to read after accounting for the file size.  This is required because
+	 * ZFS will panic if we request DMU to read beyond the end of the last
+	 * allocated block.
+	 */
+	error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
+	    MIN(end, obj_size) - (end - PAGE_SIZE));
+
+	zfs_rangelock_exit(lr);
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+	ZFS_EXIT(zfsvfs);
+
+	if (error != 0)
+		return (zfs_vm_pagerret_error);
+
+	VM_CNT_INC(v_vnodein);
+	VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
+	if (rbehind != NULL)
+		*rbehind = pgsin_b;
+	if (rahead != NULL)
+		*rahead = pgsin_a;
+	return (zfs_vm_pagerret_ok);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getpages_args {
+	struct vnode *a_vp;
+	vm_page_t *a_m;
+	int a_count;
+	int *a_rbehind;
+	int *a_rahead;
+};
+#endif
+
+static int
+zfs_freebsd_getpages(struct vop_getpages_args *ap)
+{
+
+	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
+	    ap->a_rahead));
+}
+
+static int
+zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
+    int *rtvals)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	zfs_locked_range_t		*lr;
+	dmu_tx_t	*tx;
+	struct sf_buf	*sf;
+	vm_object_t	object;
+	vm_page_t	m;
+	caddr_t		va;
+	size_t		tocopy;
+	size_t		lo_len;
+	vm_ooffset_t	lo_off;
+	vm_ooffset_t	off;
+	uint_t		blksz;
+	int		ncount;
+	int		pcount;
+	int		err;
+	int		i;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	object = vp->v_object;
+	pcount = btoc(len);
+	ncount = pcount;
+
+	KASSERT(ma[0]->object == object, ("mismatching object"));
+	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
+
+	for (i = 0; i < pcount; i++)
+		rtvals[i] = zfs_vm_pagerret_error;
+
+	off = IDX_TO_OFF(ma[0]->pindex);
+	blksz = zp->z_blksz;
+	lo_off = rounddown(off, blksz);
+	lo_len = roundup(len + (off - lo_off), blksz);
+	lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);
+
+	zfs_vmobject_wlock(object);
+	if (len + off > object->un_pager.vnp.vnp_size) {
+		if (object->un_pager.vnp.vnp_size > off) {
+			int pgoff;
+
+			len = object->un_pager.vnp.vnp_size - off;
+			ncount = btoc(len);
+			if ((pgoff = (int)len & PAGE_MASK) != 0) {
+				/*
+				 * If the object is locked and the following
+				 * conditions hold, then the page's dirty
+				 * field cannot be concurrently changed by a
+				 * pmap operation.
+				 */
+				m = ma[ncount - 1];
+				vm_page_assert_sbusied(m);
+				KASSERT(!pmap_page_is_write_mapped(m),
+				    ("zfs_putpages: page %p is not read-only",
+				    m));
+				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
+				    pgoff);
+			}
+		} else {
+			len = 0;
+			ncount = 0;
+		}
+		if (ncount < pcount) {
+			for (i = ncount; i < pcount; i++) {
+				rtvals[i] = zfs_vm_pagerret_bad;
+			}
+		}
+	}
+	zfs_vmobject_wunlock(object);
+
+	if (ncount == 0)
+		goto out;
+
+	if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) ||
+	    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) ||
+	    (zp->z_projid != ZFS_DEFAULT_PROJID &&
+	    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+	    zp->z_projid))) {
+		goto out;
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_write(tx, zp->z_id, off, len);
+
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		goto out;
+	}
+
+	if (zp->z_blksz < PAGE_SIZE) {
+		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
+			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
+			va = zfs_map_page(ma[i], &sf);
+			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
+			zfs_unmap_page(sf);
+		}
+	} else {
+		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
+	}
+
+	if (err == 0) {
+		uint64_t mtime[2], ctime[2];
+		sa_bulk_attr_t bulk[3];
+		int count = 0;
+
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+		    &mtime, 16);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    &ctime, 16);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+		    &zp->z_pflags, 8);
+		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		ASSERT0(err);
+		/*
+		 * XXX we should be passing a callback to undirty
+		 * but that would make the locking messier
+		 */
+		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
+		    len, 0, NULL, NULL);
+
+		zfs_vmobject_wlock(object);
+		for (i = 0; i < ncount; i++) {
+			rtvals[i] = zfs_vm_pagerret_ok;
+			vm_page_undirty(ma[i]);
+		}
+		zfs_vmobject_wunlock(object);
+		VM_CNT_INC(v_vnodeout);
+		VM_CNT_ADD(v_vnodepgsout, ncount);
+	}
+	dmu_tx_commit(tx);
+
+out:
+	zfs_rangelock_exit(lr);
+	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
+	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zfsvfs->z_log, zp->z_id);
+	ZFS_EXIT(zfsvfs);
+	return (rtvals[0]);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_putpages_args {
+	struct vnode *a_vp;
+	vm_page_t *a_m;
+	int a_count;
+	int a_sync;
+	int *a_rtvals;
+};
+#endif
+
+static int
+zfs_freebsd_putpages(struct vop_putpages_args *ap)
+{
+
+	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
+	    ap->a_rtvals));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_bmap_args {
+	struct vnode *a_vp;
+	daddr_t  a_bn;
+	struct bufobj **a_bop;
+	daddr_t *a_bnp;
+	int *a_runp;
+	int *a_runb;
+};
+#endif
+
+static int
+zfs_freebsd_bmap(struct vop_bmap_args *ap)
+{
+
+	if (ap->a_bop != NULL)
+		*ap->a_bop = &ap->a_vp->v_bufobj;
+	if (ap->a_bnp != NULL)
+		*ap->a_bnp = ap->a_bn;
+	if (ap->a_runp != NULL)
+		*ap->a_runp = 0;
+	if (ap->a_runb != NULL)
+		*ap->a_runb = 0;
+
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_open_args {
+	struct vnode *a_vp;
+	int a_mode;
+	struct ucred *a_cred;
+	struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_open(struct vop_open_args *ap)
+{
+	vnode_t	*vp = ap->a_vp;
+	znode_t *zp = VTOZ(vp);
+	int error;
+
+	error = zfs_open(&vp, ap->a_mode, ap->a_cred);
+	if (error == 0)
+		vnode_create_vobject(vp, zp->z_size, ap->a_td);
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_close_args {
+	struct vnode *a_vp;
+	int  a_fflag;
+	struct ucred *a_cred;
+	struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_close(struct vop_close_args *ap)
+{
+
+	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_ioctl_args {
+	struct vnode *a_vp;
+	ulong_t a_command;
+	caddr_t a_data;
+	int a_fflag;
+	struct ucred *cred;
+	struct thread *td;
+};
+#endif
+
+static int
+zfs_freebsd_ioctl(struct vop_ioctl_args *ap)
+{
+
+	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
+	    ap->a_fflag, ap->a_cred, NULL));
+}
+
+static int
+ioflags(int ioflags)
+{
+	int flags = 0;
+
+	if (ioflags & IO_APPEND)
+		flags |= FAPPEND;
+	if (ioflags & IO_NDELAY)
+		flags |= FNONBLOCK;
+	if (ioflags & IO_SYNC)
+		flags |= (FSYNC | FDSYNC | FRSYNC);
+
+	return (flags);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_read_args {
+	struct vnode *a_vp;
+	struct uio *a_uio;
+	int a_ioflag;
+	struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_read(struct vop_read_args *ap)
+{
+
+	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
+	    ap->a_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_write_args {
+	struct vnode *a_vp;
+	struct uio *a_uio;
+	int a_ioflag;
+	struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_write(struct vop_write_args *ap)
+{
+
+	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
+	    ap->a_cred));
+}
+
+#if __FreeBSD_version >= 1300102
+/*
+ * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
+ * the comment above cache_fplookup for details.
+ */
+static int
+zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v)
+{
+	vnode_t *vp;
+	znode_t *zp;
+	uint64_t pflags;
+
+	vp = v->a_vp;
+	zp = VTOZ_SMR(vp);
+	if (__predict_false(zp == NULL))
+		return (EAGAIN);
+	pflags = atomic_load_64(&zp->z_pflags);
+	if (pflags & ZFS_AV_QUARANTINED)
+		return (EAGAIN);
+	if (pflags & ZFS_XATTR)
+		return (EAGAIN);
+	if ((pflags & ZFS_NO_EXECS_DENIED) == 0)
+		return (EAGAIN);
+	return (0);
+}
+#endif
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_access_args {
+	struct vnode *a_vp;
+	accmode_t a_accmode;
+	struct ucred *a_cred;
+	struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_access(struct vop_access_args *ap)
+{
+	vnode_t *vp = ap->a_vp;
+	znode_t *zp = VTOZ(vp);
+	accmode_t accmode;
+	int error = 0;
+
+
+	if (ap->a_accmode == VEXEC) {
+		if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0)
+			return (0);
+	}
+
+	/*
+	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
+	 */
+	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
+	if (accmode != 0)
+		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
+
+	/*
+	 * VADMIN has to be handled by vaccess().
+	 */
+	if (error == 0) {
+		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
+		if (accmode != 0) {
+#if __FreeBSD_version >= 1300105
+			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+			    zp->z_gid, accmode, ap->a_cred);
+#else
+			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+			    zp->z_gid, accmode, ap->a_cred, NULL);
+#endif
+		}
+	}
+
+	/*
+	 * For VEXEC, ensure that at least one execute bit is set for
+	 * non-directories.
+	 */
+	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
+	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
+		error = EACCES;
+	}
+
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_lookup_args {
+	struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
+{
+	struct componentname *cnp = ap->a_cnp;
+	char nm[NAME_MAX + 1];
+
+	ASSERT(cnp->cn_namelen < sizeof (nm));
+	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm)));
+
+	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
+	    cnp->cn_cred, cnp->cn_thread, 0, cached));
+}
+
+static int
+zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
+{
+
+	return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_lookup_args {
+	struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_cache_lookup(struct vop_lookup_args *ap)
+{
+	zfsvfs_t *zfsvfs;
+
+	zfsvfs = ap->a_dvp->v_mount->mnt_data;
+	if (zfsvfs->z_use_namecache)
+		return (vfs_cache_lookup(ap));
+	else
+		return (zfs_freebsd_lookup(ap, B_FALSE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_create_args {
+	struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+	struct vattr *a_vap;
+};
+#endif
+
+static int
+zfs_freebsd_create(struct vop_create_args *ap)
+{
+	zfsvfs_t *zfsvfs;
+	struct componentname *cnp = ap->a_cnp;
+	vattr_t *vap = ap->a_vap;
+	znode_t *zp = NULL;
+	int rc, mode;
+
+	ASSERT(cnp->cn_flags & SAVENAME);
+
+	vattr_init_mask(vap);
+	mode = vap->va_mode & ALLPERMS;
+	zfsvfs = ap->a_dvp->v_mount->mnt_data;
+	*ap->a_vpp = NULL;
+
+	rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, !EXCL, mode,
+	    &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */);
+	if (rc == 0)
+		*ap->a_vpp = ZTOV(zp);
+	if (zfsvfs->z_use_namecache &&
+	    rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
+		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
+
+	return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_remove_args {
+	struct vnode *a_dvp;
+	struct vnode *a_vp;
+	struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_remove(struct vop_remove_args *ap)
+{
+
+	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+	return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+	    ap->a_cnp->cn_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_mkdir_args {
+	struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+	struct vattr *a_vap;
+};
+#endif
+
+static int
+zfs_freebsd_mkdir(struct vop_mkdir_args *ap)
+{
+	vattr_t *vap = ap->a_vap;
+	znode_t *zp = NULL;
+	int rc;
+
+	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+	vattr_init_mask(vap);
+	*ap->a_vpp = NULL;
+
+	rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp,
+	    ap->a_cnp->cn_cred, 0, NULL);
+
+	if (rc == 0)
+		*ap->a_vpp = ZTOV(zp);
+	return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_rmdir_args {
+	struct vnode *a_dvp;
+	struct vnode *a_vp;
+	struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_rmdir(struct vop_rmdir_args *ap)
+{
+	struct componentname *cnp = ap->a_cnp;
+
+	ASSERT(cnp->cn_flags & SAVENAME);
+
+	return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_readdir_args {
+	struct vnode *a_vp;
+	struct uio *a_uio;
+	struct ucred *a_cred;
+	int *a_eofflag;
+	int *a_ncookies;
+	ulong_t **a_cookies;
+};
+#endif
+
+static int
+zfs_freebsd_readdir(struct vop_readdir_args *ap)
+{
+
+	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
+	    ap->a_ncookies, ap->a_cookies));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_fsync_args {
+	struct vnode *a_vp;
+	int a_waitfor;
+	struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_fsync(struct vop_fsync_args *ap)
+{
+
+	vop_stdfsync(ap);
+	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getattr_args {
+	struct vnode *a_vp;
+	struct vattr *a_vap;
+	struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_getattr(struct vop_getattr_args *ap)
+{
+	vattr_t *vap = ap->a_vap;
+	xvattr_t xvap;
+	ulong_t fflags = 0;
+	int error;
+
+	xva_init(&xvap);
+	xvap.xva_vattr = *vap;
+	xvap.xva_vattr.va_mask |= AT_XVATTR;
+
+	/* Convert chflags into ZFS-type flags. */
+	/* XXX: what about SF_SETTABLE?. */
+	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
+	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
+	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
+	XVA_SET_REQ(&xvap, XAT_NODUMP);
+	XVA_SET_REQ(&xvap, XAT_READONLY);
+	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
+	XVA_SET_REQ(&xvap, XAT_SYSTEM);
+	XVA_SET_REQ(&xvap, XAT_HIDDEN);
+	XVA_SET_REQ(&xvap, XAT_REPARSE);
+	XVA_SET_REQ(&xvap, XAT_OFFLINE);
+	XVA_SET_REQ(&xvap, XAT_SPARSE);
+
+	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred);
+	if (error != 0)
+		return (error);
+
+	/* Convert ZFS xattr into chflags. */
+#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
+	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
+		fflags |= (fflag);					\
+} while (0)
+	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
+	    xvap.xva_xoptattrs.xoa_immutable);
+	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
+	    xvap.xva_xoptattrs.xoa_appendonly);
+	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
+	    xvap.xva_xoptattrs.xoa_nounlink);
+	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
+	    xvap.xva_xoptattrs.xoa_archive);
+	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
+	    xvap.xva_xoptattrs.xoa_nodump);
+	FLAG_CHECK(UF_READONLY, XAT_READONLY,
+	    xvap.xva_xoptattrs.xoa_readonly);
+	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
+	    xvap.xva_xoptattrs.xoa_system);
+	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
+	    xvap.xva_xoptattrs.xoa_hidden);
+	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
+	    xvap.xva_xoptattrs.xoa_reparse);
+	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
+	    xvap.xva_xoptattrs.xoa_offline);
+	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
+	    xvap.xva_xoptattrs.xoa_sparse);
+
+#undef	FLAG_CHECK
+	*vap = xvap.xva_vattr;
+	vap->va_flags = fflags;
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_setattr_args {
+	struct vnode *a_vp;
+	struct vattr *a_vap;
+	struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_setattr(struct vop_setattr_args *ap)
+{
+	vnode_t *vp = ap->a_vp;
+	vattr_t *vap = ap->a_vap;
+	cred_t *cred = ap->a_cred;
+	xvattr_t xvap;
+	ulong_t fflags;
+	uint64_t zflags;
+
+	vattr_init_mask(vap);
+	vap->va_mask &= ~AT_NOSET;
+
+	xva_init(&xvap);
+	xvap.xva_vattr = *vap;
+
+	zflags = VTOZ(vp)->z_pflags;
+
+	if (vap->va_flags != VNOVAL) {
+		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
+		int error;
+
+		if (zfsvfs->z_use_fuids == B_FALSE)
+			return (EOPNOTSUPP);
+
+		fflags = vap->va_flags;
+		/*
+		 * XXX KDM
+		 * We need to figure out whether it makes sense to allow
+		 * UF_REPARSE through, since we don't really have other
+		 * facilities to handle reparse points and zfs_setattr()
+		 * doesn't currently allow setting that attribute anyway.
+		 */
+		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
+		    UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
+		    UF_OFFLINE|UF_SPARSE)) != 0)
+			return (EOPNOTSUPP);
+		/*
+		 * Unprivileged processes are not permitted to unset system
+		 * flags, or modify flags if any system flags are set.
+		 * Privileged non-jail processes may not modify system flags
+		 * if securelevel > 0 and any existing system flags are set.
+		 * Privileged jail processes behave like privileged non-jail
+		 * processes if the PR_ALLOW_CHFLAGS permission bit is set;
+		 * otherwise, they behave like unprivileged processes.
+		 */
+		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
+		    spl_priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
+			if (zflags &
+			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
+				error = securelevel_gt(cred, 0);
+				if (error != 0)
+					return (error);
+			}
+		} else {
+			/*
+			 * Callers may only modify the file flags on
+			 * objects they have VADMIN rights for.
+			 */
+			if ((error = VOP_ACCESS(vp, VADMIN, cred,
+			    curthread)) != 0)
+				return (error);
+			if (zflags &
+			    (ZFS_IMMUTABLE | ZFS_APPENDONLY |
+			    ZFS_NOUNLINK)) {
+				return (EPERM);
+			}
+			if (fflags &
+			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
+				return (EPERM);
+			}
+		}
+
+#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
+	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
+	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
+		XVA_SET_REQ(&xvap, (xflag));				\
+		(xfield) = ((fflags & (fflag)) != 0);			\
+	}								\
+} while (0)
+		/* Convert chflags into ZFS-type flags. */
+		/* XXX: what about SF_SETTABLE?. */
+		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
+		    xvap.xva_xoptattrs.xoa_immutable);
+		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
+		    xvap.xva_xoptattrs.xoa_appendonly);
+		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
+		    xvap.xva_xoptattrs.xoa_nounlink);
+		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
+		    xvap.xva_xoptattrs.xoa_archive);
+		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
+		    xvap.xva_xoptattrs.xoa_nodump);
+		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
+		    xvap.xva_xoptattrs.xoa_readonly);
+		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
+		    xvap.xva_xoptattrs.xoa_system);
+		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
+		    xvap.xva_xoptattrs.xoa_hidden);
+		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
+		    xvap.xva_xoptattrs.xoa_reparse);
+		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
+		    xvap.xva_xoptattrs.xoa_offline);
+		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
+		    xvap.xva_xoptattrs.xoa_sparse);
+#undef	FLAG_CHANGE
+	}
+	if (vap->va_birthtime.tv_sec != VNOVAL) {
+		xvap.xva_vattr.va_mask |= AT_XVATTR;
+		XVA_SET_REQ(&xvap, XAT_CREATETIME);
+	}
+	return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_rename_args {
+	struct vnode *a_fdvp;
+	struct vnode *a_fvp;
+	struct componentname *a_fcnp;
+	struct vnode *a_tdvp;
+	struct vnode *a_tvp;
+	struct componentname *a_tcnp;
+};
+#endif
+
+static int
+zfs_freebsd_rename(struct vop_rename_args *ap)
+{
+	vnode_t *fdvp = ap->a_fdvp;
+	vnode_t *fvp = ap->a_fvp;
+	vnode_t *tdvp = ap->a_tdvp;
+	vnode_t *tvp = ap->a_tvp;
+	int error;
+
+	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
+	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
+
+	error = zfs_rename_(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+	    ap->a_tcnp, ap->a_fcnp->cn_cred, 1);
+
+	vrele(fdvp);
+	vrele(fvp);
+	vrele(tdvp);
+	if (tvp != NULL)
+		vrele(tvp);
+
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_symlink_args {
+	struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+	struct vattr *a_vap;
+	char *a_target;
+};
+#endif
+
+static int
+zfs_freebsd_symlink(struct vop_symlink_args *ap)
+{
+	struct componentname *cnp = ap->a_cnp;
+	vattr_t *vap = ap->a_vap;
+	znode_t *zp = NULL;
+	int rc;
+
+	ASSERT(cnp->cn_flags & SAVENAME);
+
+	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
+	vattr_init_mask(vap);
+	*ap->a_vpp = NULL;
+
+	rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap,
+	    ap->a_target, &zp, cnp->cn_cred, 0 /* flags */);
+	if (rc == 0)
+		*ap->a_vpp = ZTOV(zp);
+	return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_readlink_args {
+	struct vnode *a_vp;
+	struct uio *a_uio;
+	struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_readlink(struct vop_readlink_args *ap)
+{
+
+	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_link_args {
+	struct vnode *a_tdvp;
+	struct vnode *a_vp;
+	struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_link(struct vop_link_args *ap)
+{
+	struct componentname *cnp = ap->a_cnp;
+	vnode_t *vp = ap->a_vp;
+	vnode_t *tdvp = ap->a_tdvp;
+
+	if (tdvp->v_mount != vp->v_mount)
+		return (EXDEV);
+
+	ASSERT(cnp->cn_flags & SAVENAME);
+
+	return (zfs_link(VTOZ(tdvp), VTOZ(vp),
+	    cnp->cn_nameptr, cnp->cn_cred, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_inactive_args {
+	struct vnode *a_vp;
+	struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_inactive(struct vop_inactive_args *ap)
+{
+	vnode_t *vp = ap->a_vp;
+
+	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
+	return (0);
+}
+
+#if __FreeBSD_version >= 1300042
+#ifndef _SYS_SYSPROTO_H_
+struct vop_need_inactive_args {
+	struct vnode *a_vp;
+	struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap)
+{
+	vnode_t *vp = ap->a_vp;
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int need;
+
+	if (vn_need_pageq_flush(vp))
+		return (1);
+
+	if (!rw_tryenter(&zfsvfs->z_teardown_inactive_lock, RW_READER))
+		return (1);
+	need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty);
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+
+	return (need);
+}
+#endif
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_reclaim_args {
+	struct vnode *a_vp;
+	struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_reclaim(struct vop_reclaim_args *ap)
+{
+	vnode_t	*vp = ap->a_vp;
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	ASSERT(zp != NULL);
+
+#if __FreeBSD_version < 1300042
+	/* Destroy the vm object and flush associated pages. */
+	vnode_destroy_vobject(vp);
+#endif
+	/*
+	 * z_teardown_inactive_lock protects from a race with
+	 * zfs_znode_dmu_fini in zfsvfs_teardown during
+	 * force unmount.
+	 */
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+	if (zp->z_sa_hdl == NULL)
+		zfs_znode_free(zp);
+	else
+		zfs_zinactive(zp);
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+
+	vp->v_data = NULL;
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_fid_args {
+	struct vnode *a_vp;
+	struct fid *a_fid;
+};
+#endif
+
+static int
+zfs_freebsd_fid(struct vop_fid_args *ap)
+{
+
+	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
+}
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_pathconf_args {
+	struct vnode *a_vp;
+	int a_name;
+	register_t *a_retval;
+} *ap;
+#endif
+
+static int
+zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
+{
+	ulong_t val;
+	int error;
+
+	error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
+	    curthread->td_ucred, NULL);
+	if (error == 0) {
+		*ap->a_retval = val;
+		return (error);
+	}
+	if (error != EOPNOTSUPP)
+		return (error);
+
+	switch (ap->a_name) {
+	case _PC_NAME_MAX:
+		*ap->a_retval = NAME_MAX;
+		return (0);
+	case _PC_PIPE_BUF:
+		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
+			*ap->a_retval = PIPE_BUF;
+			return (0);
+		}
+		return (EINVAL);
+	default:
+		return (vop_stdpathconf(ap));
+	}
+}
+
+/*
+ * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
+ * extended attribute name:
+ *
+ *	NAMESPACE	PREFIX
+ *	system		freebsd:system:
+ *	user		(none, can be used to access ZFS fsattr(5) attributes
+ *			created on Solaris)
+ */
+static int
+zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
+    size_t size)
+{
+	const char *namespace, *prefix, *suffix;
+
+	/* We don't allow '/' character in attribute name. */
+	if (strchr(name, '/') != NULL)
+		return (EINVAL);
+	/* We don't allow attribute names that start with "freebsd:" string. */
+	if (strncmp(name, "freebsd:", 8) == 0)
+		return (EINVAL);
+
+	bzero(attrname, size);
+
+	switch (attrnamespace) {
+	case EXTATTR_NAMESPACE_USER:
+#if 0
+		prefix = "freebsd:";
+		namespace = EXTATTR_NAMESPACE_USER_STRING;
+		suffix = ":";
+#else
+		/*
+		 * This is the default namespace by which we can access all
+		 * attributes created on Solaris.
+		 */
+		prefix = namespace = suffix = "";
+#endif
+		break;
+	case EXTATTR_NAMESPACE_SYSTEM:
+		prefix = "freebsd:";
+		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
+		suffix = ":";
+		break;
+	case EXTATTR_NAMESPACE_EMPTY:
+	default:
+		return (EINVAL);
+	}
+	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
+	    name) >= size) {
+		return (ENAMETOOLONG);
+	}
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getextattr {
+	IN struct vnode *a_vp;
+	IN int a_attrnamespace;
+	IN const char *a_name;
+	INOUT struct uio *a_uio;
+	OUT size_t *a_size;
+	IN struct ucred *a_cred;
+	IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operating to retrieve a named extended attribute.
+ */
+static int
+zfs_getextattr(struct vop_getextattr_args *ap)
+{
+	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+	struct thread *td = ap->a_td;
+	struct nameidata nd;
+	char attrname[255];
+	struct vattr va;
+	vnode_t *xvp = NULL, *vp;
+	int error, flags;
+
+	/*
+	 * If the xattr property is off, refuse the request.
+	 */
+	if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+		return (SET_ERROR(EOPNOTSUPP));
+	}
+
+	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+	    ap->a_cred, ap->a_td, VREAD);
+	if (error != 0)
+		return (error);
+
+	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+	    sizeof (attrname));
+	if (error != 0)
+		return (error);
+
+	ZFS_ENTER(zfsvfs);
+
+	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+	    LOOKUP_XATTR, B_FALSE);
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	flags = FREAD;
+	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
+	    xvp, td);
+	error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL);
+	vp = nd.ni_vp;
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		if (error == ENOENT)
+			error = ENOATTR;
+		return (error);
+	}
+
+	if (ap->a_size != NULL) {
+		error = VOP_GETATTR(vp, &va, ap->a_cred);
+		if (error == 0)
+			*ap->a_size = (size_t)va.va_size;
+	} else if (ap->a_uio != NULL)
+		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
+
+	VOP_UNLOCK1(vp);
+	vn_close(vp, flags, ap->a_cred, td);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_deleteextattr {
+	IN struct vnode *a_vp;
+	IN int a_attrnamespace;
+	IN const char *a_name;
+	IN struct ucred *a_cred;
+	IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to remove a named attribute.
+ */
+static int
+zfs_deleteextattr(struct vop_deleteextattr_args *ap)
+{
+	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+	struct thread *td = ap->a_td;
+	struct nameidata nd;
+	char attrname[255];
+	vnode_t *xvp = NULL, *vp;
+	int error;
+
+	/*
+	 * If the xattr property is off, refuse the request.
+	 */
+	if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+		return (SET_ERROR(EOPNOTSUPP));
+	}
+
+	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+	    ap->a_cred, ap->a_td, VWRITE);
+	if (error != 0)
+		return (error);
+
+	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+	    sizeof (attrname));
+	if (error != 0)
+		return (error);
+
+	ZFS_ENTER(zfsvfs);
+
+	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+	    LOOKUP_XATTR, B_FALSE);
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
+	    UIO_SYSSPACE, attrname, xvp, td);
+	error = namei(&nd);
+	vp = nd.ni_vp;
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		NDFREE(&nd, NDF_ONLY_PNBUF);
+		if (error == ENOENT)
+			error = ENOATTR;
+		return (error);
+	}
+
+	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+
+	vput(nd.ni_dvp);
+	if (vp == nd.ni_dvp)
+		vrele(vp);
+	else
+		vput(vp);
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_setextattr {
+	IN struct vnode *a_vp;
+	IN int a_attrnamespace;
+	IN const char *a_name;
+	INOUT struct uio *a_uio;
+	IN struct ucred *a_cred;
+	IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to set a named attribute.
+ */
+static int
+zfs_setextattr(struct vop_setextattr_args *ap)
+{
+	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+	struct thread *td = ap->a_td;
+	struct nameidata nd;
+	char attrname[255];
+	struct vattr va;
+	vnode_t *xvp = NULL, *vp;
+	int error, flags;
+
+	/*
+	 * If the xattr property is off, refuse the request.
+	 */
+	if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+		return (SET_ERROR(EOPNOTSUPP));
+	}
+
+	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+	    ap->a_cred, ap->a_td, VWRITE);
+	if (error != 0)
+		return (error);
+	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+	    sizeof (attrname));
+	if (error != 0)
+		return (error);
+
+	ZFS_ENTER(zfsvfs);
+
+	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+	    LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	flags = FFLAGS(O_WRONLY | O_CREAT);
+	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
+	    xvp, td);
+	error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
+	    NULL);
+	vp = nd.ni_vp;
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	VATTR_NULL(&va);
+	va.va_size = 0;
+	error = VOP_SETATTR(vp, &va, ap->a_cred);
+	if (error == 0)
+		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
+
+	VOP_UNLOCK1(vp);
+	vn_close(vp, flags, ap->a_cred, td);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_listextattr {
+	IN struct vnode *a_vp;
+	IN int a_attrnamespace;
+	INOUT struct uio *a_uio;
+	OUT size_t *a_size;
+	IN struct ucred *a_cred;
+	IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to retrieve extended attributes on a vnode.
+ */
+static int
+zfs_listextattr(struct vop_listextattr_args *ap)
+{
+	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+	struct thread *td = ap->a_td;
+	struct nameidata nd;
+	char attrprefix[16];
+	uint8_t dirbuf[sizeof (struct dirent)];
+	struct dirent *dp;
+	struct iovec aiov;
+	struct uio auio, *uio = ap->a_uio;
+	size_t *sizep = ap->a_size;
+	size_t plen;
+	vnode_t *xvp = NULL, *vp;
+	int done, error, eof, pos;
+
+	/*
+	 * If the xattr property is off, refuse the request.
+	 */
+	if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+		return (SET_ERROR(EOPNOTSUPP));
+	}
+
+	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+	    ap->a_cred, ap->a_td, VREAD);
+	if (error != 0)
+		return (error);
+
+	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
+	    sizeof (attrprefix));
+	if (error != 0)
+		return (error);
+	plen = strlen(attrprefix);
+
+	ZFS_ENTER(zfsvfs);
+
+	if (sizep != NULL)
+		*sizep = 0;
+
+	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+	    LOOKUP_XATTR, B_FALSE);
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		/*
+		 * ENOATTR means that the EA directory does not yet exist,
+		 * i.e. there are no extended attributes there.
+		 */
+		if (error == ENOATTR)
+			error = 0;
+		return (error);
+	}
+
+	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
+	    UIO_SYSSPACE, ".", xvp, td);
+	error = namei(&nd);
+	vp = nd.ni_vp;
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_td = td;
+	auio.uio_rw = UIO_READ;
+	auio.uio_offset = 0;
+
+	do {
+		uint8_t nlen;
+
+		aiov.iov_base = (void *)dirbuf;
+		aiov.iov_len = sizeof (dirbuf);
+		auio.uio_resid = sizeof (dirbuf);
+		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
+		done = sizeof (dirbuf) - auio.uio_resid;
+		if (error != 0)
+			break;
+		for (pos = 0; pos < done; ) {
+			dp = (struct dirent *)(dirbuf + pos);
+			pos += dp->d_reclen;
+			/*
+			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
+			 * is what we get when attribute was created on Solaris.
+			 */
+			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
+				continue;
+			if (plen == 0 &&
+			    strncmp(dp->d_name, "freebsd:", 8) == 0)
+				continue;
+			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
+				continue;
+			nlen = dp->d_namlen - plen;
+			if (sizep != NULL)
+				*sizep += 1 + nlen;
+			else if (uio != NULL) {
+				/*
+				 * Format of extattr name entry is one byte for
+				 * length and the rest for name.
+				 */
+				error = uiomove(&nlen, 1, uio->uio_rw, uio);
+				if (error == 0) {
+					error = uiomove(dp->d_name + plen, nlen,
+					    uio->uio_rw, uio);
+				}
+				if (error != 0)
+					break;
+			}
+		}
+	} while (!eof && error == 0);
+
+	vput(vp);
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getacl_args {
+	struct vnode *vp;
+	acl_type_t type;
+	struct acl *aclp;
+	struct ucred *cred;
+	struct thread *td;
+};
+#endif
+
+static int
+zfs_freebsd_getacl(struct vop_getacl_args *ap)
+{
+	int		error;
+	vsecattr_t	vsecattr;
+
+	if (ap->a_type != ACL_TYPE_NFS4)
+		return (EINVAL);
+
+	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
+	if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)))
+		return (error);
+
+	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp,
+	    vsecattr.vsa_aclcnt);
+	if (vsecattr.vsa_aclentp != NULL)
+		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
+
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_setacl_args {
+	struct vnode *vp;
+	acl_type_t type;
+	struct acl *aclp;
+	struct ucred *cred;
+	struct thread *td;
+};
+#endif
+
+static int
+zfs_freebsd_setacl(struct vop_setacl_args *ap)
+{
+	int		error;
+	vsecattr_t vsecattr;
+	int		aclbsize;	/* size of acl list in bytes */
+	aclent_t	*aaclp;
+
+	if (ap->a_type != ACL_TYPE_NFS4)
+		return (EINVAL);
+
+	if (ap->a_aclp == NULL)
+		return (EINVAL);
+
+	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
+		return (EINVAL);
+
+	/*
+	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
+	 * splitting every entry into two and appending "canonical six"
+	 * entries at the end.  Don't allow for setting an ACL that would
+	 * cause chmod(2) to run out of ACL entries.
+	 */
+	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
+		return (ENOSPC);
+
+	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
+	if (error != 0)
+		return (error);
+
+	vsecattr.vsa_mask = VSA_ACE;
+	aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t);
+	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
+	aaclp = vsecattr.vsa_aclentp;
+	vsecattr.vsa_aclentsz = aclbsize;
+
+	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
+	error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred);
+	kmem_free(aaclp, aclbsize);
+
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_aclcheck_args {
+	struct vnode *vp;
+	acl_type_t type;
+	struct acl *aclp;
+	struct ucred *cred;
+	struct thread *td;
+};
+#endif
+
+static int
+zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
+{
+
+	return (EOPNOTSUPP);
+}
+
+static int
+zfs_vptocnp(struct vop_vptocnp_args *ap)
+{
+	vnode_t *covered_vp;
+	vnode_t *vp = ap->a_vp;
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	znode_t *zp = VTOZ(vp);
+	int ltype;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	/*
+	 * If we are a snapshot mounted under .zfs, run the operation
+	 * on the covered vnode.
+	 */
+	if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
+		char name[MAXNAMLEN + 1];
+		znode_t *dzp;
+		size_t len;
+
+		error = zfs_znode_parent_and_name(zp, &dzp, name);
+		if (error == 0) {
+			len = strlen(name);
+			if (*ap->a_buflen < len)
+				error = SET_ERROR(ENOMEM);
+		}
+		if (error == 0) {
+			*ap->a_buflen -= len;
+			bcopy(name, ap->a_buf + *ap->a_buflen, len);
+			*ap->a_vpp = ZTOV(dzp);
+		}
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	ZFS_EXIT(zfsvfs);
+
+	covered_vp = vp->v_mount->mnt_vnodecovered;
+#if __FreeBSD_version >= 1300045
+	enum vgetstate vs = vget_prep(covered_vp);
+#else
+	vhold(covered_vp);
+#endif
+	ltype = VOP_ISLOCKED(vp);
+	VOP_UNLOCK1(vp);
+#if __FreeBSD_version >= 1300045
+	error = vget_finish(covered_vp, LK_SHARED, vs);
+#else
+	error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread);
+#endif
+	if (error == 0) {
+		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
+		    ap->a_buf, ap->a_buflen);
+		vput(covered_vp);
+	}
+	vn_lock(vp, ltype | LK_RETRY);
+	if (VN_IS_DOOMED(vp))
+		error = SET_ERROR(ENOENT);
+	return (error);
+}
+
+#ifdef DIAGNOSTIC
+#ifndef _SYS_SYSPROTO_H_
+struct vop_lock1_args {
+	struct vnode *a_vp;
+	int a_flags;
+	char *file;
+	int line;
+};
+#endif
+
+static int
+zfs_lock(struct vop_lock1_args *ap)
+{
+	vnode_t *vp;
+	znode_t *zp;
+	int err;
+
+#if __FreeBSD_version >= 1300064
+	err = vop_lock(ap);
+#else
+	err = vop_stdlock(ap);
+#endif
+	if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
+		vp = ap->a_vp;
+		zp = vp->v_data;
+		if (vp->v_mount != NULL && !VN_IS_DOOMED(vp) &&
+		    zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
+			VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
+	}
+	return (err);
+}
+#endif
+
+struct vop_vector zfs_vnodeops;
+struct vop_vector zfs_fifoops;
+struct vop_vector zfs_shareops;
+
+struct vop_vector zfs_vnodeops = {
+	.vop_default =		&default_vnodeops,
+	.vop_inactive =		zfs_freebsd_inactive,
+#if __FreeBSD_version >= 1300042
+	.vop_need_inactive =	zfs_freebsd_need_inactive,
+#endif
+	.vop_reclaim =		zfs_freebsd_reclaim,
+#if __FreeBSD_version >= 1300102
+	.vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
+#endif
+	.vop_access =		zfs_freebsd_access,
+	.vop_allocate =		VOP_EINVAL,
+	.vop_lookup =		zfs_cache_lookup,
+	.vop_cachedlookup =	zfs_freebsd_cachedlookup,
+	.vop_getattr =		zfs_freebsd_getattr,
+	.vop_setattr =		zfs_freebsd_setattr,
+	.vop_create =		zfs_freebsd_create,
+	.vop_mknod =		(vop_mknod_t *)zfs_freebsd_create,
+	.vop_mkdir =		zfs_freebsd_mkdir,
+	.vop_readdir =		zfs_freebsd_readdir,
+	.vop_fsync =		zfs_freebsd_fsync,
+	.vop_open =		zfs_freebsd_open,
+	.vop_close =		zfs_freebsd_close,
+	.vop_rmdir =		zfs_freebsd_rmdir,
+	.vop_ioctl =		zfs_freebsd_ioctl,
+	.vop_link =		zfs_freebsd_link,
+	.vop_symlink =		zfs_freebsd_symlink,
+	.vop_readlink =		zfs_freebsd_readlink,
+	.vop_read =		zfs_freebsd_read,
+	.vop_write =		zfs_freebsd_write,
+	.vop_remove =		zfs_freebsd_remove,
+	.vop_rename =		zfs_freebsd_rename,
+	.vop_pathconf =		zfs_freebsd_pathconf,
+	.vop_bmap =		zfs_freebsd_bmap,
+	.vop_fid =		zfs_freebsd_fid,
+	.vop_getextattr =	zfs_getextattr,
+	.vop_deleteextattr =	zfs_deleteextattr,
+	.vop_setextattr =	zfs_setextattr,
+	.vop_listextattr =	zfs_listextattr,
+	.vop_getacl =		zfs_freebsd_getacl,
+	.vop_setacl =		zfs_freebsd_setacl,
+	.vop_aclcheck =		zfs_freebsd_aclcheck,
+	.vop_getpages =		zfs_freebsd_getpages,
+	.vop_putpages =		zfs_freebsd_putpages,
+	.vop_vptocnp =		zfs_vptocnp,
+#if __FreeBSD_version >= 1300064
+#ifdef DIAGNOSTIC
+	.vop_lock1 =		zfs_lock,
+#else
+	.vop_lock1 =		vop_lock,
+#endif
+	.vop_unlock =		vop_unlock,
+	.vop_islocked =		vop_islocked,
+#else
+#ifdef DIAGNOSTIC
+	.vop_lock1 =		zfs_lock,
+#endif
+#endif
+};
+VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
+
+struct vop_vector zfs_fifoops = {
+	.vop_default =		&fifo_specops,
+	.vop_fsync =		zfs_freebsd_fsync,
+#if __FreeBSD_version >= 1300102
+	.vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
+#endif
+	.vop_access =		zfs_freebsd_access,
+	.vop_getattr =		zfs_freebsd_getattr,
+	.vop_inactive =		zfs_freebsd_inactive,
+	.vop_read =		VOP_PANIC,
+	.vop_reclaim =		zfs_freebsd_reclaim,
+	.vop_setattr =		zfs_freebsd_setattr,
+	.vop_write =		VOP_PANIC,
+	.vop_pathconf = 	zfs_freebsd_pathconf,
+	.vop_fid =		zfs_freebsd_fid,
+	.vop_getacl =		zfs_freebsd_getacl,
+	.vop_setacl =		zfs_freebsd_setacl,
+	.vop_aclcheck =		zfs_freebsd_aclcheck,
+};
+VFS_VOP_VECTOR_REGISTER(zfs_fifoops);
+
+/*
+ * special share hidden files vnode operations template
+ */
+struct vop_vector zfs_shareops = {
+	.vop_default =		&default_vnodeops,
+	.vop_access =		zfs_freebsd_access,
+	.vop_inactive =		zfs_freebsd_inactive,
+	.vop_reclaim =		zfs_freebsd_reclaim,
+	.vop_fid =		zfs_freebsd_fid,
+	.vop_pathconf =		zfs_freebsd_pathconf,
+};
+VFS_VOP_VECTOR_REGISTER(zfs_shareops);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
new file mode 100644
index 000000000000..76e24b1bdf51
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
@@ -0,0 +1,2067 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/mntent.h>
+#include <sys/u8_textprep.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/atomic.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_rlock.h>
+#include <sys/zfs_fuid.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#endif /* _KERNEL */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfs_refcount.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+/* Used by fstat(1). */
+SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD,
+	SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)");
+
+/*
+ * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
+ * turned on when DEBUG is also defined.
+ */
+#ifdef	ZFS_DEBUG
+#define	ZNODE_STATS
+#endif	/* DEBUG */
+
+#ifdef	ZNODE_STATS
+#define	ZNODE_STAT_ADD(stat)			((stat)++)
+#else
+#define	ZNODE_STAT_ADD(stat)			/* nothing */
+#endif	/* ZNODE_STATS */
+
+/*
+ * Functions needed for userland (ie: libzpool) are not put under
+ * #ifdef_KERNEL; the rest of the functions have dependencies
+ * (such as VFS logic) that will not compile easily in userland.
+ */
+#ifdef _KERNEL
+/*
+ * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
+ * be freed before it can be safely accessed.
+ */
+krwlock_t zfsvfs_lock;
+
+#if defined(_KERNEL) && !defined(KMEM_DEBUG) && \
+    __FreeBSD_version >= 1300102
+#define	_ZFS_USE_SMR
+static uma_zone_t znode_uma_zone;
+#else
+static kmem_cache_t *znode_cache = NULL;
+#endif
+
+extern struct vop_vector zfs_vnodeops;
+extern struct vop_vector zfs_fifoops;
+extern struct vop_vector zfs_shareops;
+
+
+/*
+ * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
+ * z_rangelock. It will modify the offset and length of the lock to reflect
+ * znode-specific information, and convert RL_APPEND to RL_WRITER.  This is
+ * called with the rangelock_t's rl_lock held, which avoids races.
+ */
+static void
+zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
+{
+	znode_t *zp = arg;
+
+	/*
+	 * If in append mode, convert to writer and lock starting at the
+	 * current end of file.
+	 */
+	if (new->lr_type == RL_APPEND) {
+		new->lr_offset = zp->z_size;
+		new->lr_type = RL_WRITER;
+	}
+
+	/*
+	 * If we need to grow the block size then lock the whole file range.
+	 */
+	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+	    zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
+		new->lr_offset = 0;
+		new->lr_length = UINT64_MAX;
+	}
+}
+
+static int
+zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
+{
+	znode_t *zp = buf;
+
+	POINTER_INVALIDATE(&zp->z_zfsvfs);
+
+	list_link_init(&zp->z_link_node);
+
+	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
+
+	zp->z_acl_cached = NULL;
+	zp->z_vnode = NULL;
+	zp->z_moved = 0;
+	return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *arg)
+{
+	znode_t *zp = buf;
+
+	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+	ASSERT3P(zp->z_vnode, ==, NULL);
+	ASSERT(!list_link_active(&zp->z_link_node));
+	mutex_destroy(&zp->z_acl_lock);
+	zfs_rangelock_fini(&zp->z_rangelock);
+
+	ASSERT(zp->z_acl_cached == NULL);
+}
+
+
+#ifdef _ZFS_USE_SMR
+VFS_SMR_DECLARE;
+
+static int
+zfs_znode_cache_constructor_smr(void *mem, int size __unused, void *private,
+    int flags)
+{
+
+	return (zfs_znode_cache_constructor(mem, private, flags));
+}
+
+static void
+zfs_znode_cache_destructor_smr(void *mem, int size __unused, void *private)
+{
+
+	zfs_znode_cache_destructor(mem, private);
+}
+
+void
+zfs_znode_init(void)
+{
+	/*
+	 * Initialize zcache
+	 */
+	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
+	ASSERT(znode_uma_zone == NULL);
+	znode_uma_zone = uma_zcreate("zfs_znode_cache",
+	    sizeof (znode_t), zfs_znode_cache_constructor_smr,
+	    zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0);
+	VFS_SMR_ZONE_SET(znode_uma_zone);
+}
+
+static znode_t *
+zfs_znode_alloc_kmem(int flags)
+{
+
+	return (uma_zalloc_smr(znode_uma_zone, flags));
+}
+
+static void
+zfs_znode_free_kmem(znode_t *zp)
+{
+
+	uma_zfree_smr(znode_uma_zone, zp);
+}
+#else
+void
+zfs_znode_init(void)
+{
+	/*
+	 * Initialize zcache
+	 */
+	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
+	ASSERT(znode_cache == NULL);
+	znode_cache = kmem_cache_create("zfs_znode_cache",
+	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
+	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+static znode_t *
+zfs_znode_alloc_kmem(int flags)
+{
+
+	return (kmem_cache_alloc(znode_cache, flags));
+}
+
+static void
+zfs_znode_free_kmem(znode_t *zp)
+{
+
+	kmem_cache_free(znode_cache, zp);
+}
+#endif
+
+void
+zfs_znode_fini(void)
+{
+	/*
+	 * Cleanup zcache
+	 */
+#ifdef _ZFS_USE_SMR
+	if (znode_uma_zone) {
+		uma_zdestroy(znode_uma_zone);
+		znode_uma_zone = NULL;
+	}
+#else
+	if (znode_cache) {
+		kmem_cache_destroy(znode_cache);
+		znode_cache = NULL;
+	}
+#endif
+	rw_destroy(&zfsvfs_lock);
+}
+
+
+static int
+zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+	zfs_acl_ids_t acl_ids;
+	vattr_t vattr;
+	znode_t *sharezp;
+	znode_t *zp;
+	int error;
+
+	vattr.va_mask = AT_MODE|AT_UID|AT_GID;
+	vattr.va_type = VDIR;
+	vattr.va_mode = S_IFDIR|0555;
+	vattr.va_uid = crgetuid(kcred);
+	vattr.va_gid = crgetgid(kcred);
+
+	sharezp = zfs_znode_alloc_kmem(KM_SLEEP);
+	ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
+	sharezp->z_moved = 0;
+	sharezp->z_unlinked = 0;
+	sharezp->z_atime_dirty = 0;
+	sharezp->z_zfsvfs = zfsvfs;
+	sharezp->z_is_sa = zfsvfs->z_use_sa;
+
+	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
+	    kcred, NULL, &acl_ids));
+	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
+	ASSERT3P(zp, ==, sharezp);
+	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
+	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
+	zfsvfs->z_shares_dir = sharezp->z_id;
+
+	zfs_acl_ids_free(&acl_ids);
+	sa_handle_destroy(sharezp->z_sa_hdl);
+	zfs_znode_free_kmem(sharezp);
+
+	return (error);
+}
+
+/*
+ * define a couple of values we need available
+ * for both 64 and 32 bit environments.
+ */
+#ifndef NBITSMINOR64
+#define	NBITSMINOR64	32
+#endif
+#ifndef MAXMAJ64
+#define	MAXMAJ64	0xffffffffUL
+#endif
+#ifndef	MAXMIN64
+#define	MAXMIN64	0xffffffffUL
+#endif
+
+/*
+ * Create special expldev for ZFS private use.
+ * Can't use standard expldev since it doesn't do
+ * what we want.  The standard expldev() takes a
+ * dev32_t in LP64 and expands it to a long dev_t.
+ * We need an interface that takes a dev32_t in ILP32
+ * and expands it to a long dev_t.
+ */
+static uint64_t
+zfs_expldev(dev_t dev)
+{
+	return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
+}
+/*
+ * Special cmpldev for ZFS private use.
+ * Can't use standard cmpldev since it takes
+ * a long dev_t and compresses it to dev32_t in
+ * LP64.  We need to do a compaction of a long dev_t
+ * to a dev32_t in ILP32.
+ */
+dev_t
+zfs_cmpldev(uint64_t dev)
+{
+	return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
+}
+
+static void
+zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
+    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
+{
+	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
+	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
+
+	ASSERT(zp->z_sa_hdl == NULL);
+	ASSERT(zp->z_acl_cached == NULL);
+	if (sa_hdl == NULL) {
+		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+		    SA_HDL_SHARED, &zp->z_sa_hdl));
+	} else {
+		zp->z_sa_hdl = sa_hdl;
+		sa_set_userp(sa_hdl, zp);
+	}
+
+	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
+
+	/*
+	 * Slap on VROOT if we are the root znode unless we are the root
+	 * node of a snapshot mounted under .zfs.
+	 */
+	if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs)
+		ZTOV(zp)->v_flag |= VROOT;
+
+	vn_exists(ZTOV(zp));
+}
+
+void
+zfs_znode_dmu_fini(znode_t *zp)
+{
+	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
+	    zp->z_unlinked ||
+	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
+
+	sa_handle_destroy(zp->z_sa_hdl);
+	zp->z_sa_hdl = NULL;
+}
+
+static void
+zfs_vnode_forget(vnode_t *vp)
+{
+
+	/* copied from insmntque_stddtr */
+	vp->v_data = NULL;
+	vp->v_op = &dead_vnodeops;
+	vgone(vp);
+	vput(vp);
+}
+
+/*
+ * Construct a new znode/vnode and initialize.
+ *
+ * This does not do a call to dmu_set_user() that is
+ * up to the caller to do, in case you don't want to
+ * return the znode
+ */
+static znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
+    dmu_object_type_t obj_type, sa_handle_t *hdl)
+{
+	znode_t	*zp;
+	vnode_t *vp;
+	uint64_t mode;
+	uint64_t parent;
+#ifdef notyet
+	uint64_t mtime[2], ctime[2];
+#endif
+	uint64_t projid = ZFS_DEFAULT_PROJID;
+	sa_bulk_attr_t bulk[9];
+	int count = 0;
+	int error;
+
+	zp = zfs_znode_alloc_kmem(KM_SLEEP);
+
+#ifndef _ZFS_USE_SMR
+	KASSERT((zfsvfs->z_parent->z_vfs->mnt_kern_flag & MNTK_FPLOOKUP) == 0,
+	    ("%s: fast path lookup enabled without smr", __func__));
+#endif
+
+#if __FreeBSD_version >= 1300076
+	KASSERT(curthread->td_vp_reserved != NULL,
+	    ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
+#else
+	KASSERT(curthread->td_vp_reserv > 0,
+	    ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
+#endif
+	error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp);
+	if (error != 0) {
+		zfs_znode_free_kmem(zp);
+		return (NULL);
+	}
+	zp->z_vnode = vp;
+	vp->v_data = zp;
+
+	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+	zp->z_moved = 0;
+
+	/*
+	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
+	 * the zfs_znode_move() callback.
+	 */
+	zp->z_sa_hdl = NULL;
+	zp->z_unlinked = 0;
+	zp->z_atime_dirty = 0;
+	zp->z_mapcnt = 0;
+	zp->z_id = db->db_object;
+	zp->z_blksz = blksz;
+	zp->z_seq = 0x7A4653;
+	zp->z_sync_cnt = 0;
+
+	vp = ZTOV(zp);
+
+	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &zp->z_size, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+	    &zp->z_links, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+	    &zp->z_atime, 16);
+#ifdef notyet
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+	    &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+	    &ctime, 16);
+#endif
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+	    &zp->z_uid, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+	    &zp->z_gid, 8);
+
+	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 ||
+	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+	    (zp->z_pflags & ZFS_PROJID) &&
+	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
+		if (hdl == NULL)
+			sa_handle_destroy(zp->z_sa_hdl);
+		zfs_vnode_forget(vp);
+		zp->z_vnode = NULL;
+		zfs_znode_free_kmem(zp);
+		return (NULL);
+	}
+
+	zp->z_projid = projid;
+	zp->z_mode = mode;
+
+	/* Cache the xattr parent id */
+	if (zp->z_pflags & ZFS_XATTR)
+		zp->z_xattr_parent = parent;
+
+	vp->v_type = IFTOVT((mode_t)mode);
+
+	switch (vp->v_type) {
+	case VDIR:
+		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
+		break;
+	case VFIFO:
+		vp->v_op = &zfs_fifoops;
+		break;
+	case VREG:
+		if (parent == zfsvfs->z_shares_dir) {
+			ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
+			vp->v_op = &zfs_shareops;
+		}
+		break;
+	default:
+			break;
+	}
+
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	list_insert_tail(&zfsvfs->z_all_znodes, zp);
+	zfsvfs->z_nr_znodes++;
+	membar_producer();
+	/*
+	 * Everything else must be valid before assigning z_zfsvfs makes the
+	 * znode eligible for zfs_znode_move().
+	 */
+	zp->z_zfsvfs = zfsvfs;
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	/*
+	 * Acquire vnode lock before making it available to the world.
+	 */
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	VN_LOCK_AREC(vp);
+	if (vp->v_type != VFIFO)
+		VN_LOCK_ASHARE(vp);
+
+	return (zp);
+}
+
+static uint64_t empty_xattr;
+static uint64_t pad[4];
+static zfs_acl_phys_t acl_phys;
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ *	IN:	dzp	- parent directory for new znode
+ *		vap	- file attributes for new znode
+ *		tx	- dmu transaction id for zap operations
+ *		cr	- credentials of caller
+ *		flag	- flags:
+ *			  IS_ROOT_NODE	- new object will be root
+ *			  IS_XATTR	- new object is an attribute
+ *		bonuslen - length of bonus buffer
+ *		setaclp  - File/Dir initial ACL
+ *		fuidp	 - Tracks fuid allocation.
+ *
+ *	OUT:	zpp	- allocated znode
+ *
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
+    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
+{
+	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
+	uint64_t	mode, size, links, parent, pflags;
+	uint64_t	dzp_pflags = 0;
+	uint64_t	rdev = 0;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	dmu_buf_t	*db;
+	timestruc_t	now;
+	uint64_t	gen, obj;
+	int		err;
+	int		bonuslen;
+	int		dnodesize;
+	sa_handle_t	*sa_hdl;
+	dmu_object_type_t obj_type;
+	sa_bulk_attr_t	*sa_attrs;
+	int		cnt = 0;
+	zfs_acl_locator_cb_t locate = { 0 };
+
+	ASSERT(vap && ((vap->va_mask & AT_MODE) == AT_MODE));
+
+	if (zfsvfs->z_replay) {
+		obj = vap->va_nodeid;
+		now = vap->va_ctime;		/* see zfs_replay_create() */
+		gen = vap->va_nblocks;		/* ditto */
+		dnodesize = vap->va_fsid;	/* ditto */
+	} else {
+		obj = 0;
+		vfs_timestamp(&now);
+		gen = dmu_tx_get_txg(tx);
+		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
+	}
+
+	if (dnodesize == 0)
+		dnodesize = DNODE_MIN_SIZE;
+
+	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+	bonuslen = (obj_type == DMU_OT_SA) ?
+	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
+
+	/*
+	 * Create a new DMU object.
+	 */
+	/*
+	 * There's currently no mechanism for pre-reading the blocks that will
+	 * be needed to allocate a new object, so we accept the small chance
+	 * that there will be an i/o error and we will fail one of the
+	 * assertions below.
+	 */
+	if (vap->va_type == VDIR) {
+		if (zfsvfs->z_replay) {
+			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
+			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+			    obj_type, bonuslen, dnodesize, tx));
+		} else {
+			obj = zap_create_norm_dnsize(zfsvfs->z_os,
+			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+			    obj_type, bonuslen, dnodesize, tx);
+		}
+	} else {
+		if (zfsvfs->z_replay) {
+			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
+			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
+			    obj_type, bonuslen, dnodesize, tx));
+		} else {
+			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
+			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
+			    obj_type, bonuslen, dnodesize, tx);
+		}
+	}
+
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
+	VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
+
+	/*
+	 * If this is the root, fix up the half-initialized parent pointer
+	 * to reference the just-allocated physical data area.
+	 */
+	if (flag & IS_ROOT_NODE) {
+		dzp->z_id = obj;
+	} else {
+		dzp_pflags = dzp->z_pflags;
+	}
+
+	/*
+	 * If parent is an xattr, so am I.
+	 */
+	if (dzp_pflags & ZFS_XATTR) {
+		flag |= IS_XATTR;
+	}
+
+	if (zfsvfs->z_use_fuids)
+		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+	else
+		pflags = 0;
+
+	if (vap->va_type == VDIR) {
+		size = 2;		/* contents ("." and "..") */
+		links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+	} else {
+		size = links = 0;
+	}
+
+	if (vap->va_type == VBLK || vap->va_type == VCHR) {
+		rdev = zfs_expldev(vap->va_rdev);
+	}
+
+	parent = dzp->z_id;
+	mode = acl_ids->z_mode;
+	if (flag & IS_XATTR)
+		pflags |= ZFS_XATTR;
+
+	/*
+	 * No execs denied will be determined when zfs_mode_compute() is called.
+	 */
+	pflags |= acl_ids->z_aclp->z_hints &
+	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
+	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
+
+	ZFS_TIME_ENCODE(&now, crtime);
+	ZFS_TIME_ENCODE(&now, ctime);
+
+	if (vap->va_mask & AT_ATIME) {
+		ZFS_TIME_ENCODE(&vap->va_atime, atime);
+	} else {
+		ZFS_TIME_ENCODE(&now, atime);
+	}
+
+	if (vap->va_mask & AT_MTIME) {
+		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+	} else {
+		ZFS_TIME_ENCODE(&now, mtime);
+	}
+
+	/* Now add in all of the "SA" attributes */
+	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+	    &sa_hdl));
+
+	/*
+	 * Setup the array of attributes to be replaced/set on the new file
+	 *
+	 * order for  DMU_OT_ZNODE is critical since it needs to be constructed
+	 * in the old znode_phys_t format.  Don't change this ordering
+	 */
+	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+
+	if (obj_type == DMU_OT_ZNODE) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+		    NULL, &atime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+		    NULL, &mtime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+		    NULL, &ctime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+		    NULL, &crtime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+		    NULL, &gen, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+		    NULL, &mode, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+		    NULL, &size, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+		    NULL, &parent, 8);
+	} else {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+		    NULL, &mode, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+		    NULL, &size, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+		    NULL, &gen, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
+		    NULL, &acl_ids->z_fuid, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
+		    NULL, &acl_ids->z_fgid, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+		    NULL, &parent, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+		    NULL, &pflags, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+		    NULL, &atime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+		    NULL, &mtime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+		    NULL, &ctime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+		    NULL, &crtime, 16);
+	}
+
+	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+
+	if (obj_type == DMU_OT_ZNODE) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
+		    &empty_xattr, 8);
+	}
+	if (obj_type == DMU_OT_ZNODE ||
+	    (vap->va_type == VBLK || vap->va_type == VCHR)) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
+		    NULL, &rdev, 8);
+
+	}
+	if (obj_type == DMU_OT_ZNODE) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+		    NULL, &pflags, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+		    &acl_ids->z_fuid, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+		    &acl_ids->z_fgid, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
+		    sizeof (uint64_t) * 4);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+		    &acl_phys, sizeof (zfs_acl_phys_t));
+	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+		    &acl_ids->z_aclp->z_acl_count, 8);
+		locate.cb_aclp = acl_ids->z_aclp;
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
+		    zfs_acl_data_locator, &locate,
+		    acl_ids->z_aclp->z_acl_bytes);
+		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
+		    acl_ids->z_fuid, acl_ids->z_fgid);
+	}
+
+	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+
+	if (!(flag & IS_ROOT_NODE)) {
+		*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+		ASSERT(*zpp != NULL);
+	} else {
+		/*
+		 * If we are creating the root node, the "parent" we
+		 * passed in is the znode for the root.
+		 */
+		*zpp = dzp;
+
+		(*zpp)->z_sa_hdl = sa_hdl;
+	}
+
+	(*zpp)->z_pflags = pflags;
+	(*zpp)->z_mode = mode;
+	(*zpp)->z_dnodesize = dnodesize;
+
+	if (vap->va_mask & AT_XVATTR)
+		zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
+
+	if (obj_type == DMU_OT_ZNODE ||
+	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+	}
+	if (!(flag & IS_ROOT_NODE)) {
+		vnode_t *vp;
+
+		vp = ZTOV(*zpp);
+		vp->v_vflag |= VV_FORCEINSMQ;
+		err = insmntque(vp, zfsvfs->z_vfs);
+		vp->v_vflag &= ~VV_FORCEINSMQ;
+		KASSERT(err == 0, ("insmntque() failed: error %d", err));
+	}
+	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+}
+
+/*
+ * Update in-core attributes.  It is assumed the caller will be doing an
+ * sa_bulk_update to push the changes out.
+ */
+void
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+	xoptattr_t *xoap;
+
+	xoap = xva_getxoptattr(xvap);
+	ASSERT(xoap);
+
+	ASSERT_VOP_IN_SEQC(ZTOV(zp));
+
+	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
+		uint64_t times[2];
+		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+		    &times, sizeof (times), tx);
+		XVA_SET_RTN(xvap, XAT_CREATETIME);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_READONLY);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_HIDDEN);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_SYSTEM);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_ARCHIVE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_NOUNLINK);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_APPENDONLY);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_NODUMP);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_OPAQUE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
+		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+		zfs_sa_set_scanstamp(zp, xvap, tx);
+		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_REPARSE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_OFFLINE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_SPARSE);
+	}
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+	dmu_object_info_t doi;
+	dmu_buf_t	*db;
+	znode_t		*zp;
+	vnode_t		*vp;
+	sa_handle_t	*hdl;
+	struct thread	*td;
+	int locked;
+	int err;
+
+	td = curthread;
+	getnewvnode_reserve_();
+again:
+	*zpp = NULL;
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+	if (err) {
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		getnewvnode_drop_reserve();
+		return (err);
+	}
+
+	dmu_object_info_from_db(db, &doi);
+	if (doi.doi_bonus_type != DMU_OT_SA &&
+	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
+	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
+	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+		sa_buf_rele(db, NULL);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		getnewvnode_drop_reserve();
+		return (SET_ERROR(EINVAL));
+	}
+
+	hdl = dmu_buf_get_user(db);
+	if (hdl != NULL) {
+		zp  = sa_get_userdata(hdl);
+
+		/*
+		 * Since "SA" does immediate eviction we
+		 * should never find a sa handle that doesn't
+		 * know about the znode.
+		 */
+		ASSERT3P(zp, !=, NULL);
+		ASSERT3U(zp->z_id, ==, obj_num);
+		if (zp->z_unlinked) {
+			err = SET_ERROR(ENOENT);
+		} else {
+			vp = ZTOV(zp);
+			/*
+			 * Don't let the vnode disappear after
+			 * ZFS_OBJ_HOLD_EXIT.
+			 */
+			VN_HOLD(vp);
+			*zpp = zp;
+			err = 0;
+		}
+
+		sa_buf_rele(db, NULL);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+		if (err) {
+			getnewvnode_drop_reserve();
+			return (err);
+		}
+
+		locked = VOP_ISLOCKED(vp);
+		VI_LOCK(vp);
+		if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) {
+			/*
+			 * The vnode is doomed and this thread doesn't
+			 * hold the exclusive lock on it, so the vnode
+			 * must be being reclaimed by another thread.
+			 * Otherwise the doomed vnode is being reclaimed
+			 * by this thread and zfs_zget is called from
+			 * ZIL internals.
+			 */
+			VI_UNLOCK(vp);
+
+			/*
+			 * XXX vrele() locks the vnode when the last reference
+			 * is dropped.  Although in this case the vnode is
+			 * doomed / dead and so no inactivation is required,
+			 * the vnode lock is still acquired.  That could result
+			 * in a LOR with z_teardown_lock if another thread holds
+			 * the vnode's lock and tries to take z_teardown_lock.
+			 * But that is only possible if the other thread peforms
+			 * a ZFS vnode operation on the vnode.  That either
+			 * should not happen if the vnode is dead or the thread
+			 * should also have a reference to the vnode and thus
+			 * our reference is not last.
+			 */
+			VN_RELE(vp);
+			goto again;
+		}
+		VI_UNLOCK(vp);
+		getnewvnode_drop_reserve();
+		return (err);
+	}
+
+	/*
+	 * Not found create new znode/vnode
+	 * but only if file exists.
+	 *
+	 * There is a small window where zfs_vget() could
+	 * find this object while a file create is still in
+	 * progress.  This is checked for in zfs_znode_alloc()
+	 *
+	 * if zfs_znode_alloc() fails it will drop the hold on the
+	 * bonus buffer.
+	 */
+	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
+	    doi.doi_bonus_type, NULL);
+	if (zp == NULL) {
+		err = SET_ERROR(ENOENT);
+	} else {
+		*zpp = zp;
+	}
+	if (err == 0) {
+		vnode_t *vp = ZTOV(zp);
+
+		err = insmntque(vp, zfsvfs->z_vfs);
+		if (err == 0) {
+			vp->v_hash = obj_num;
+			VOP_UNLOCK1(vp);
+		} else {
+			zp->z_vnode = NULL;
+			zfs_znode_dmu_fini(zp);
+			zfs_znode_free(zp);
+			*zpp = NULL;
+		}
+	}
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+	getnewvnode_drop_reserve();
+	return (err);
+}
+
+int
+zfs_rezget(znode_t *zp)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	dmu_object_info_t doi;
+	dmu_buf_t *db;
+	vnode_t *vp;
+	uint64_t obj_num = zp->z_id;
+	uint64_t mode, size;
+	sa_bulk_attr_t bulk[8];
+	int err;
+	int count = 0;
+	uint64_t gen;
+
+	/*
+	 * Remove cached pages before reloading the znode, so that they are not
+	 * lingering after we run into any error.  Ideally, we should vgone()
+	 * the vnode in case of error, but currently we cannot do that
+	 * because of the LOR between the vnode lock and z_teardown_lock.
+	 * So, instead, we have to "doom" the znode in the illumos style.
+	 */
+	vp = ZTOV(zp);
+	vn_pages_remove(vp, 0, 0);
+
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+	mutex_enter(&zp->z_acl_lock);
+	if (zp->z_acl_cached) {
+		zfs_acl_free(zp->z_acl_cached);
+		zp->z_acl_cached = NULL;
+	}
+
+	mutex_exit(&zp->z_acl_lock);
+	ASSERT(zp->z_sa_hdl == NULL);
+	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+	if (err) {
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (err);
+	}
+
+	dmu_object_info_from_db(db, &doi);
+	if (doi.doi_bonus_type != DMU_OT_SA &&
+	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
+	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
+	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+		sa_buf_rele(db, NULL);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (SET_ERROR(EINVAL));
+	}
+
+	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
+	size = zp->z_size;
+
+	/* reload cached values */
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+	    &gen, sizeof (gen));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &zp->z_size, sizeof (zp->z_size));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+	    &zp->z_links, sizeof (zp->z_links));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+	    &zp->z_atime, sizeof (zp->z_atime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+	    &zp->z_uid, sizeof (zp->z_uid));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+	    &zp->z_gid, sizeof (zp->z_gid));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+	    &mode, sizeof (mode));
+
+	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
+		zfs_znode_dmu_fini(zp);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (SET_ERROR(EIO));
+	}
+
+	zp->z_mode = mode;
+
+	if (gen != zp->z_gen) {
+		zfs_znode_dmu_fini(zp);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (SET_ERROR(EIO));
+	}
+
+	/*
+	 * It is highly improbable but still quite possible that two
+	 * objects in different datasets are created with the same
+	 * object numbers and in transaction groups with the same
+	 * numbers.  znodes corresponding to those objects would
+	 * have the same z_id and z_gen, but their other attributes
+	 * may be different.
+	 * zfs recv -F may replace one of such objects with the other.
+	 * As a result file properties recorded in the replaced
+	 * object's vnode may no longer match the received object's
+	 * properties.  At present the only cached property is the
+	 * files type recorded in v_type.
+	 * So, handle this case by leaving the old vnode and znode
+	 * disassociated from the actual object.  A new vnode and a
+	 * znode will be created if the object is accessed
+	 * (e.g. via a look-up).  The old vnode and znode will be
+	 * recycled when the last vnode reference is dropped.
+	 */
+	if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
+		zfs_znode_dmu_fini(zp);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (SET_ERROR(EIO));
+	}
+
+	/*
+	 * If the file has zero links, then it has been unlinked on the send
+	 * side and it must be in the received unlinked set.
+	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
+	 * stale data and to prevent automatically removal of the file in
+	 * zfs_zinactive().  The file will be removed either when it is removed
+	 * on the send side and the next incremental stream is received or
+	 * when the unlinked set gets processed.
+	 */
+	zp->z_unlinked = (zp->z_links == 0);
+	if (zp->z_unlinked) {
+		zfs_znode_dmu_fini(zp);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (0);
+	}
+
+	zp->z_blksz = doi.doi_data_block_size;
+	if (zp->z_size != size)
+		vnode_pager_setsize(vp, zp->z_size);
+
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+	return (0);
+}
+
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	objset_t *os = zfsvfs->z_os;
+	uint64_t obj = zp->z_id;
+	uint64_t acl_obj = zfs_external_acl(zp);
+
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
+	if (acl_obj) {
+		VERIFY(!zp->z_is_sa);
+		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+	}
+	VERIFY(0 == dmu_object_free(os, obj, tx));
+	zfs_znode_dmu_fini(zp);
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+	zfs_znode_free(zp);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint64_t z_id = zp->z_id;
+
+	ASSERT(zp->z_sa_hdl);
+
+	/*
+	 * Don't allow a zfs_zget() while were trying to release this znode
+	 */
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
+
+	/*
+	 * If this was the last reference to a file with no links, remove
+	 * the file from the file system unless the file system is mounted
+	 * read-only.  That can happen, for example, if the file system was
+	 * originally read-write, the file was opened, then unlinked and
+	 * the file system was made read-only before the file was finally
+	 * closed.  The file will remain in the unlinked set.
+	 */
+	if (zp->z_unlinked) {
+		ASSERT(!zfsvfs->z_issnap);
+		if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) {
+			ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+			zfs_rmnode(zp);
+			return;
+		}
+	}
+
+	zfs_znode_dmu_fini(zp);
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+	zfs_znode_free(zp);
+}
+
+void
+zfs_znode_free(znode_t *zp)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	ASSERT(zp->z_sa_hdl == NULL);
+	zp->z_vnode = NULL;
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	POINTER_INVALIDATE(&zp->z_zfsvfs);
+	list_remove(&zfsvfs->z_all_znodes, zp);
+	zfsvfs->z_nr_znodes--;
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	if (zp->z_acl_cached) {
+		zfs_acl_free(zp->z_acl_cached);
+		zp->z_acl_cached = NULL;
+	}
+
+	zfs_znode_free_kmem(zp);
+}
+
+void
+zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2],
+    uint64_t ctime[2], boolean_t have_tx)
+{
+	timestruc_t	now;
+
+	vfs_timestamp(&now);
+
+	if (have_tx) {	/* will sa_bulk_update happen really soon? */
+		zp->z_atime_dirty = 0;
+		zp->z_seq++;
+	} else {
+		zp->z_atime_dirty = 1;
+	}
+
+	if (flag & AT_ATIME) {
+		ZFS_TIME_ENCODE(&now, zp->z_atime);
+	}
+
+	if (flag & AT_MTIME) {
+		ZFS_TIME_ENCODE(&now, mtime);
+		if (zp->z_zfsvfs->z_use_fuids) {
+			zp->z_pflags |= (ZFS_ARCHIVE |
+			    ZFS_AV_MODIFIED);
+		}
+	}
+
+	if (flag & AT_CTIME) {
+		ZFS_TIME_ENCODE(&now, ctime);
+		if (zp->z_zfsvfs->z_use_fuids)
+			zp->z_pflags |= ZFS_ARCHIVE;
+	}
+}
+
+
+void
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+    uint64_t ctime[2])
+{
+	zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE);
+}
+/*
+ * Grow the block size for a file.
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		size	- requested block size
+ *		tx	- open transaction.
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+void
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+	int		error;
+	u_longlong_t	dummy;
+
+	if (size <= zp->z_blksz)
+		return;
+	/*
+	 * If the file size is already greater than the current blocksize,
+	 * we will not grow.  If there is more than one block in a file,
+	 * the blocksize cannot change.
+	 */
+	if (zp->z_blksz && zp->z_size > zp->z_blksz)
+		return;
+
+	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
+	    size, 0, tx);
+
+	if (error == ENOTSUP)
+		return;
+	ASSERT0(error);
+
+	/* What blocksize did we actually get? */
+	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
+}
+
+/*
+ * Increase the file length
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		end	- new end-of-file
+ *
+ *	RETURN:	0 on success, error code on failure
+ */
+static int
+zfs_extend(znode_t *zp, uint64_t end)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	dmu_tx_t *tx;
+	zfs_locked_range_t *lr;
+	uint64_t newblksz;
+	int error;
+
+	/*
+	 * We will change zp_size, lock the whole file.
+	 */
+	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+	/*
+	 * Nothing to do if file already at desired length.
+	 */
+	if (end <= zp->z_size) {
+		zfs_rangelock_exit(lr);
+		return (0);
+	}
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	if (end > zp->z_blksz &&
+	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
+		/*
+		 * We are growing the file past the current block size.
+		 */
+		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+			/*
+			 * File's blocksize is already larger than the
+			 * "recordsize" property.  Only let it grow to
+			 * the next power of 2.
+			 */
+			ASSERT(!ISP2(zp->z_blksz));
+			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
+		} else {
+			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
+		}
+		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
+	} else {
+		newblksz = 0;
+	}
+
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		zfs_rangelock_exit(lr);
+		return (error);
+	}
+
+	if (newblksz)
+		zfs_grow_blocksize(zp, newblksz, tx);
+
+	zp->z_size = end;
+
+	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
+	    &zp->z_size, sizeof (zp->z_size), tx));
+
+	vnode_pager_setsize(ZTOV(zp), end);
+
+	zfs_rangelock_exit(lr);
+
+	dmu_tx_commit(tx);
+
+	return (0);
+}
+
+/*
+ * Free space in a file.
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		off	- start of section to free.
+ *		len	- length of section to free.
+ *
+ *	RETURN:	0 on success, error code on failure
+ */
+static int
+zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	zfs_locked_range_t *lr;
+	int error;
+
+	/*
+	 * Lock the range being freed.
+	 */
+	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
+
+	/*
+	 * Nothing to do if file already at desired length.
+	 */
+	if (off >= zp->z_size) {
+		zfs_rangelock_exit(lr);
+		return (0);
+	}
+
+	if (off + len > zp->z_size)
+		len = zp->z_size - off;
+
+	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
+
+	if (error == 0) {
+		/*
+		 * In FreeBSD we cannot free block in the middle of a file,
+		 * but only at the end of a file, so this code path should
+		 * never happen.
+		 */
+		vnode_pager_setsize(ZTOV(zp), off);
+	}
+
+	zfs_rangelock_exit(lr);
+
+	return (error);
+}
+
+/*
+ * Truncate a file
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		end	- new end-of-file.
+ *
+ *	RETURN:	0 on success, error code on failure
+ */
+static int
+zfs_trunc(znode_t *zp, uint64_t end)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	vnode_t *vp = ZTOV(zp);
+	dmu_tx_t *tx;
+	zfs_locked_range_t *lr;
+	int error;
+	sa_bulk_attr_t bulk[2];
+	int count = 0;
+
+	/*
+	 * We will change zp_size, lock the whole file.
+	 */
+	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+	/*
+	 * Nothing to do if file already at desired length.
+	 */
+	if (end >= zp->z_size) {
+		zfs_rangelock_exit(lr);
+		return (0);
+	}
+
+	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
+	    DMU_OBJECT_END);
+	if (error) {
+		zfs_rangelock_exit(lr);
+		return (error);
+	}
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	dmu_tx_mark_netfree(tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		zfs_rangelock_exit(lr);
+		return (error);
+	}
+
+	zp->z_size = end;
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+	    NULL, &zp->z_size, sizeof (zp->z_size));
+
+	if (end == 0) {
+		zp->z_pflags &= ~ZFS_SPARSE;
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+		    NULL, &zp->z_pflags, 8);
+	}
+	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
+
+	dmu_tx_commit(tx);
+
+	/*
+	 * Clear any mapped pages in the truncated region.  This has to
+	 * happen outside of the transaction to avoid the possibility of
+	 * a deadlock with someone trying to push a page that we are
+	 * about to invalidate.
+	 */
+	vnode_pager_setsize(vp, end);
+
+	zfs_rangelock_exit(lr);
+
+	return (0);
+}
+
+/*
+ * Free space in a file
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		off	- start of range
+ *		len	- end of range (0 => EOF)
+ *		flag	- current file open mode flags.
+ *		log	- TRUE if this action should be logged
+ *
+ *	RETURN:	0 on success, error code on failure
+ */
+int
+zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+{
+	dmu_tx_t *tx;
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	zilog_t *zilog = zfsvfs->z_log;
+	uint64_t mode;
+	uint64_t mtime[2], ctime[2];
+	sa_bulk_attr_t bulk[3];
+	int count = 0;
+	int error;
+
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
+	    sizeof (mode))) != 0)
+		return (error);
+
+	if (off > zp->z_size) {
+		error =  zfs_extend(zp, off+len);
+		if (error == 0 && log)
+			goto log;
+		else
+			return (error);
+	}
+
+	if (len == 0) {
+		error = zfs_trunc(zp, off);
+	} else {
+		if ((error = zfs_free_range(zp, off, len)) == 0 &&
+		    off + len > zp->z_size)
+			error = zfs_extend(zp, off+len);
+	}
+	if (error || !log)
+		return (error);
+log:
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		return (error);
+	}
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+	    NULL, &zp->z_pflags, 8);
+	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+	ASSERT(error == 0);
+
+	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+void
+zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
+{
+	uint64_t	moid, obj, sa_obj, version;
+	uint64_t	sense = ZFS_CASE_SENSITIVE;
+	uint64_t	norm = 0;
+	nvpair_t	*elem;
+	int		error;
+	int		i;
+	znode_t		*rootzp = NULL;
+	zfsvfs_t	*zfsvfs;
+	vattr_t		vattr;
+	znode_t		*zp;
+	zfs_acl_ids_t	acl_ids;
+
+	/*
+	 * First attempt to create master node.
+	 */
+	/*
+	 * In an empty objset, there are no blocks to read and thus
+	 * there can be no i/o errors (which we assert below).
+	 */
+	moid = MASTER_NODE_OBJ;
+	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+	    DMU_OT_NONE, 0, tx);
+	ASSERT(error == 0);
+
+	/*
+	 * Set starting attributes.
+	 */
+	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
+	elem = NULL;
+	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
+		/* For the moment we expect all zpl props to be uint64_ts */
+		uint64_t val;
+		char *name;
+
+		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
+		VERIFY(nvpair_value_uint64(elem, &val) == 0);
+		name = nvpair_name(elem);
+		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
+			if (val < version)
+				version = val;
+		} else {
+			error = zap_update(os, moid, name, 8, 1, &val, tx);
+		}
+		ASSERT(error == 0);
+		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
+			norm = val;
+		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
+			sense = val;
+	}
+	ASSERT(version != 0);
+	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
+
+	/*
+	 * Create zap object used for SA attribute registration
+	 */
+
+	if (version >= ZPL_VERSION_SA) {
+		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+		    DMU_OT_NONE, 0, tx);
+		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+		ASSERT(error == 0);
+	} else {
+		sa_obj = 0;
+	}
+	/*
+	 * Create a delete queue.
+	 */
+	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+
+	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
+	ASSERT(error == 0);
+
+	/*
+	 * Create root znode.  Create minimal znode/vnode/zfsvfs
+	 * to allow zfs_mknode to work.
+	 */
+	VATTR_NULL(&vattr);
+	vattr.va_mask = AT_MODE|AT_UID|AT_GID;
+	vattr.va_type = VDIR;
+	vattr.va_mode = S_IFDIR|0755;
+	vattr.va_uid = crgetuid(cr);
+	vattr.va_gid = crgetgid(cr);
+
+	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+	rootzp = zfs_znode_alloc_kmem(KM_SLEEP);
+	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
+	rootzp->z_moved = 0;
+	rootzp->z_unlinked = 0;
+	rootzp->z_atime_dirty = 0;
+	rootzp->z_is_sa = USE_SA(version, os);
+
+	zfsvfs->z_os = os;
+	zfsvfs->z_parent = zfsvfs;
+	zfsvfs->z_version = version;
+	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
+	zfsvfs->z_use_sa = USE_SA(version, os);
+	zfsvfs->z_norm = norm;
+
+	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+	    &zfsvfs->z_attr_table);
+
+	ASSERT(error == 0);
+
+	/*
+	 * Fold case on file systems that are always or sometimes case
+	 * insensitive.
+	 */
+	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
+		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+	    offsetof(znode_t, z_link_node));
+
+	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+	rootzp->z_zfsvfs = zfsvfs;
+	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+	    cr, NULL, &acl_ids));
+	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
+	ASSERT3P(zp, ==, rootzp);
+	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
+	ASSERT(error == 0);
+	zfs_acl_ids_free(&acl_ids);
+	POINTER_INVALIDATE(&rootzp->z_zfsvfs);
+
+	sa_handle_destroy(rootzp->z_sa_hdl);
+	zfs_znode_free_kmem(rootzp);
+
+	/*
+	 * Create shares directory
+	 */
+
+	error = zfs_create_share_dir(zfsvfs, tx);
+
+	ASSERT(error == 0);
+
+	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+	kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+#endif /* _KERNEL */
+
+static int
+zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
+{
+	uint64_t sa_obj = 0;
+	int error;
+
+	error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+	if (error != 0 && error != ENOENT)
+		return (error);
+
+	error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
+	return (error);
+}
+
+static int
+zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
+    dmu_buf_t **db, void *tag)
+{
+	dmu_object_info_t doi;
+	int error;
+
+	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
+		return (error);
+
+	dmu_object_info_from_db(*db, &doi);
+	if ((doi.doi_bonus_type != DMU_OT_SA &&
+	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
+	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
+	    doi.doi_bonus_size < sizeof (znode_phys_t))) {
+		sa_buf_rele(*db, tag);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
+	if (error != 0) {
+		sa_buf_rele(*db, tag);
+		return (error);
+	}
+
+	return (0);
+}
+
+static void
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
+{
+	sa_handle_destroy(hdl);
+	sa_buf_rele(db, tag);
+}
+
+/*
+ * Given an object number, return its parent object number and whether
+ * or not the object is an extended attribute directory.
+ */
+static int
+zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
+    uint64_t *pobjp, int *is_xattrdir)
+{
+	uint64_t parent;
+	uint64_t pflags;
+	uint64_t mode;
+	uint64_t parent_mode;
+	sa_bulk_attr_t bulk[3];
+	sa_handle_t *sa_hdl;
+	dmu_buf_t *sa_db;
+	int count = 0;
+	int error;
+
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+	    &parent, sizeof (parent));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+	    &pflags, sizeof (pflags));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+	    &mode, sizeof (mode));
+
+	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
+		return (error);
+
+	/*
+	 * When a link is removed its parent pointer is not changed and will
+	 * be invalid.  There are two cases where a link is removed but the
+	 * file stays around, when it goes to the delete queue and when there
+	 * are additional links.
+	 */
+	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
+	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+	if (error != 0)
+		return (error);
+
+	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+
+	/*
+	 * Extended attributes can be applied to files, directories, etc.
+	 * Otherwise the parent must be a directory.
+	 */
+	if (!*is_xattrdir && !S_ISDIR(parent_mode))
+		return (SET_ERROR(EINVAL));
+
+	*pobjp = parent;
+
+	return (0);
+}
+
+/*
+ * Given an object number, return some zpl level statistics
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+    zfs_stat_t *sb)
+{
+	sa_bulk_attr_t bulk[4];
+	int count = 0;
+
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+	    &sb->zs_mode, sizeof (sb->zs_mode));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+	    &sb->zs_gen, sizeof (sb->zs_gen));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+	    &sb->zs_links, sizeof (sb->zs_links));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+	    &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+	return (sa_bulk_lookup(hdl, bulk, count));
+}
+
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+    sa_attr_type_t *sa_table, char *buf, int len)
+{
+	sa_handle_t *sa_hdl;
+	sa_handle_t *prevhdl = NULL;
+	dmu_buf_t *prevdb = NULL;
+	dmu_buf_t *sa_db = NULL;
+	char *path = buf + len - 1;
+	int error;
+
+	*path = '\0';
+	sa_hdl = hdl;
+
+	uint64_t deleteq_obj;
+	VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
+	    ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+	error = zap_lookup_int(osp, deleteq_obj, obj);
+	if (error == 0) {
+		return (ESTALE);
+	} else if (error != ENOENT) {
+		return (error);
+	}
+	error = 0;
+
+	for (;;) {
+		uint64_t pobj;
+		char component[MAXNAMELEN + 2];
+		size_t complen;
+		int is_xattrdir;
+
+		if (prevdb)
+			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
+
+		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
+		    &is_xattrdir)) != 0)
+			break;
+
+		if (pobj == obj) {
+			if (path[0] != '/')
+				*--path = '/';
+			break;
+		}
+
+		component[0] = '/';
+		if (is_xattrdir) {
+			(void) sprintf(component + 1, "<xattrdir>");
+		} else {
+			error = zap_value_search(osp, pobj, obj,
+			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
+			if (error != 0)
+				break;
+		}
+
+		complen = strlen(component);
+		path -= complen;
+		ASSERT(path >= buf);
+		bcopy(component, path, complen);
+		obj = pobj;
+
+		if (sa_hdl != hdl) {
+			prevhdl = sa_hdl;
+			prevdb = sa_db;
+		}
+		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
+		if (error != 0) {
+			sa_hdl = prevhdl;
+			sa_db = prevdb;
+			break;
+		}
+	}
+
+	if (sa_hdl != NULL && sa_hdl != hdl) {
+		ASSERT(sa_db != NULL);
+		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+	}
+
+	if (error == 0)
+		(void) memmove(buf, path, buf + len - path);
+
+	return (error);
+}
+
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+	sa_attr_type_t *sa_table;
+	sa_handle_t *hdl;
+	dmu_buf_t *db;
+	int error;
+
+	error = zfs_sa_setup(osp, &sa_table);
+	if (error != 0)
+		return (error);
+
+	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+	zfs_release_sa_handle(hdl, db, FTAG);
+	return (error);
+}
+
+int
+zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+    char *buf, int len)
+{
+	char *path = buf + len - 1;
+	sa_attr_type_t *sa_table;
+	sa_handle_t *hdl;
+	dmu_buf_t *db;
+	int error;
+
+	*path = '\0';
+
+	error = zfs_sa_setup(osp, &sa_table);
+	if (error != 0)
+		return (error);
+
+	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
+	if (error != 0) {
+		zfs_release_sa_handle(hdl, db, FTAG);
+		return (error);
+	}
+
+	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+	zfs_release_sa_handle(hdl, db, FTAG);
+	return (error);
+}
+
+#ifdef _KERNEL
+int
+zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint64_t parent;
+	int is_xattrdir;
+	int err;
+
+	/* Extended attributes should not be visible as regular files. */
+	if ((zp->z_pflags & ZFS_XATTR) != 0)
+		return (SET_ERROR(EINVAL));
+
+	err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table,
+	    &parent, &is_xattrdir);
+	if (err != 0)
+		return (err);
+	ASSERT0(is_xattrdir);
+
+	/* No name as this is a root object. */
+	if (parent == zp->z_id)
+		return (SET_ERROR(EINVAL));
+
+	err = zap_value_search(zfsvfs->z_os, parent, zp->z_id,
+	    ZFS_DIRENT_OBJ(-1ULL), buf);
+	if (err != 0)
+		return (err);
+	err = zfs_zget(zfsvfs, parent, dzpp);
+	return (err);
+}
+#endif /* _KERNEL */
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
new file mode 100644
index 000000000000..d89ef80edd66
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
@@ -0,0 +1,1882 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/zio_crypt.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+#include <sys/sha2.h>
+#include <sys/hkdf.h>
+
+/*
+ * This file is responsible for handling all of the details of generating
+ * encryption parameters and performing encryption and authentication.
+ *
+ * BLOCK ENCRYPTION PARAMETERS:
+ * Encryption /Authentication Algorithm Suite (crypt):
+ * The encryption algorithm, mode, and key length we are going to use. We
+ * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit
+ * keys. All authentication is currently done with SHA512-HMAC.
+ *
+ * Plaintext:
+ * The unencrypted data that we want to encrypt.
+ *
+ * Initialization Vector (IV):
+ * An initialization vector for the encryption algorithms. This is used to
+ * "tweak" the encryption algorithms so that two blocks of the same data are
+ * encrypted into different ciphertext outputs, thus obfuscating block patterns.
+ * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
+ * never reused with the same encryption key. This value is stored unencrypted
+ * and must simply be provided to the decryption function. We use a 96 bit IV
+ * (as recommended by NIST) for all block encryption. For non-dedup blocks we
+ * derive the IV randomly. The first 64 bits of the IV are stored in the second
+ * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
+ * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
+ * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
+ * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
+ * level 0 blocks is the number of allocated dnodes in that block. The on-disk
+ * format supports at most 2^15 slots per L0 dnode block, because the maximum
+ * block size is 16MB (2^24). In either case, for level 0 blocks this number
+ * will still be smaller than UINT32_MAX so it is safe to store the IV in the
+ * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
+ * for the dnode code.
+ *
+ * Master key:
+ * This is the most important secret data of an encrypted dataset. It is used
+ * along with the salt to generate that actual encryption keys via HKDF. We
+ * do not use the master key to directly encrypt any data because there are
+ * theoretical limits on how much data can actually be safely encrypted with
+ * any encryption mode. The master key is stored encrypted on disk with the
+ * user's wrapping key. Its length is determined by the encryption algorithm.
+ * For details on how this is stored see the block comment in dsl_crypt.c
+ *
+ * Salt:
+ * Used as an input to the HKDF function, along with the master key. We use a
+ * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
+ * can be used for encrypting many blocks, so we cache the current salt and the
+ * associated derived key in zio_crypt_t so we do not need to derive it again
+ * needlessly.
+ *
+ * Encryption Key:
+ * A secret binary key, generated from an HKDF function used to encrypt and
+ * decrypt data.
+ *
+ * Message Authentication Code (MAC)
+ * The MAC is an output of authenticated encryption modes such as AES-GCM and
+ * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
+ * data on disk and return garbage to the application. Effectively, it is a
+ * checksum that can not be reproduced by an attacker. We store the MAC in the
+ * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
+ * regular checksum of the ciphertext which can be used for scrubbing.
+ *
+ * OBJECT AUTHENTICATION:
+ * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because
+ * they contain some info that always needs to be readable. To prevent this
+ * data from being altered, we authenticate this data using SHA512-HMAC. This
+ * will produce a MAC (similar to the one produced via encryption) which can
+ * be used to verify the object was not modified. HMACs do not require key
+ * rotation or IVs, so we can keep up to the full 3 copies of authenticated
+ * data.
+ *
+ * ZIL ENCRYPTION:
+ * ZIL blocks have their bp written to disk ahead of the associated data, so we
+ * cannot store the MAC there as we normally do. For these blocks the MAC is
+ * stored in the embedded checksum within the zil_chain_t header. The salt and
+ * IV are generated for the block on bp allocation instead of at encryption
+ * time. In addition, ZIL blocks have some pieces that must be left in plaintext
+ * for claiming even though all of the sensitive user data still needs to be
+ * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
+ * pieces of the block need to be encrypted. All data that is not encrypted is
+ * authenticated using the AAD mechanisms that the supported encryption modes
+ * provide for. In order to preserve the semantics of the ZIL for encrypted
+ * datasets, the ZIL is not protected at the objset level as described below.
+ *
+ * DNODE ENCRYPTION:
+ * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
+ * in plaintext for scrubbing and claiming, but the bonus buffers might contain
+ * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
+ * which which pieces of the block need to be encrypted. For more details about
+ * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
+ *
+ * OBJECT SET AUTHENTICATION:
+ * Up to this point, everything we have encrypted and authenticated has been
+ * at level 0 (or -2 for the ZIL). If we did not do any further work the
+ * on-disk format would be susceptible to attacks that deleted or rearranged
+ * the order of level 0 blocks. Ideally, the cleanest solution would be to
+ * maintain a tree of authentication MACs going up the bp tree. However, this
+ * presents a problem for raw sends. Send files do not send information about
+ * indirect blocks so there would be no convenient way to transfer the MACs and
+ * they cannot be recalculated on the receive side without the master key which
+ * would defeat one of the purposes of raw sends in the first place. Instead,
+ * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
+ * from the level below. We also include some portable fields from blk_prop such
+ * as the lsize and compression algorithm to prevent the data from being
+ * misinterpreted.
+ *
+ * At the objset level, we maintain 2 separate 256 bit MACs in the
+ * objset_phys_t. The first one is "portable" and is the logical root of the
+ * MAC tree maintained in the metadnode's bps. The second, is "local" and is
+ * used as the root MAC for the user accounting objects, which are also not
+ * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
+ * of the send file. The useraccounting code ensures that the useraccounting
+ * info is not present upon a receive, so the local MAC can simply be cleared
+ * out at that time. For more info about objset_phys_t authentication, see
+ * zio_crypt_do_objset_hmacs().
+ *
+ * CONSIDERATIONS FOR DEDUP:
+ * In order for dedup to work, blocks that we want to dedup with one another
+ * need to use the same IV and encryption key, so that they will have the same
+ * ciphertext. Normally, one should never reuse an IV with the same encryption
+ * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
+ * blocks. In this case, however, since we are using the same plaintext as
+ * well all that we end up with is a duplicate of the original ciphertext we
+ * already had. As a result, an attacker with read access to the raw disk will
+ * be able to tell which blocks are the same but this information is given away
+ * by dedup anyway. In order to get the same IVs and encryption keys for
+ * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC
+ * here so that a reproducible checksum of the plaintext is never available to
+ * the attacker. The HMAC key is kept alongside the master key, encrypted on
+ * disk. The first 64 bits of the HMAC are used in place of the random salt, and
+ * the next 96 bits are used as the IV. As a result of this mechanism, dedup
+ * will only work within a clone family since encrypted dedup requires use of
+ * the same master and HMAC keys.
+ */
+
+/*
+ * After encrypting many blocks with the same key we may start to run up
+ * against the theoretical limits of how much data can securely be encrypted
+ * with a single key using the supported encryption modes. The most obvious
+ * limitation is that our risk of generating 2 equivalent 96 bit IVs increases
+ * the more IVs we generate (which both GCM and CCM modes strictly forbid).
+ * This risk actually grows surprisingly quickly over time according to the
+ * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have
+ * generated n IVs with a cryptographically secure RNG, the approximate
+ * probability p(n) of a collision is given as:
+ *
+ * p(n) ~= e^(-n*(n-1)/(2*(2^96)))
+ *
+ * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html]
+ *
+ * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion
+ * we must not write more than 398,065,730 blocks with the same encryption key.
+ * Therefore, we rotate our keys after 400,000,000 blocks have been written by
+ * generating a new random 64 bit salt for our HKDF encryption key generation
+ * function.
+ */
+#define	ZFS_KEY_MAX_SALT_USES_DEFAULT	400000000
+#define	ZFS_CURRENT_MAX_SALT_USES	\
+	(MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT))
+unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT;
+
+/*
+ * Set to a nonzero value to cause zio_do_crypt_uio() to fail 1/this many
+ * calls, to test decryption error handling code paths.
+ */
+uint64_t zio_decrypt_fail_fraction = 0;
+
+typedef struct blkptr_auth_buf {
+	uint64_t bab_prop;			/* blk_prop - portable mask */
+	uint8_t bab_mac[ZIO_DATA_MAC_LEN];	/* MAC from blk_cksum */
+	uint64_t bab_pad;			/* reserved for future use */
+} blkptr_auth_buf_t;
+
+zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = {
+	{"",			ZC_TYPE_NONE,	0,	"inherit"},
+	{"",			ZC_TYPE_NONE,	0,	"on"},
+	{"",			ZC_TYPE_NONE,	0,	"off"},
+	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	16,	"aes-128-ccm"},
+	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	24,	"aes-192-ccm"},
+	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	32,	"aes-256-ccm"},
+	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	16,	"aes-128-gcm"},
+	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	24,	"aes-192-gcm"},
+	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	32,	"aes-256-gcm"}
+};
+
+static void
+zio_crypt_key_destroy_early(zio_crypt_key_t *key)
+{
+	rw_destroy(&key->zk_salt_lock);
+
+	/* free crypto templates */
+	bzero(&key->zk_session, sizeof (key->zk_session));
+
+	/* zero out sensitive data */
+	bzero(key, sizeof (zio_crypt_key_t));
+}
+
+void
+zio_crypt_key_destroy(zio_crypt_key_t *key)
+{
+
+	freebsd_crypt_freesession(&key->zk_session);
+	zio_crypt_key_destroy_early(key);
+}
+
+int
+zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
+{
+	int ret;
+	crypto_mechanism_t mech __unused;
+	uint_t keydata_len;
+	zio_crypt_info_t *ci = NULL;
+
+	ASSERT(key != NULL);
+	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+
+	ci = &zio_crypt_table[crypt];
+	if (ci->ci_crypt_type != ZC_TYPE_GCM &&
+	    ci->ci_crypt_type != ZC_TYPE_CCM)
+		return (ENOTSUP);
+
+	keydata_len = zio_crypt_table[crypt].ci_keylen;
+	bzero(key, sizeof (zio_crypt_key_t));
+	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+	/* fill keydata buffers and salt with random data */
+	ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
+	if (ret != 0)
+		goto error;
+
+	ret = random_get_bytes(key->zk_master_keydata, keydata_len);
+	if (ret != 0)
+		goto error;
+
+	ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
+	if (ret != 0)
+		goto error;
+
+	ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+	if (ret != 0)
+		goto error;
+
+	/* derive the current key from the master key */
+	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+	    key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+	    keydata_len);
+	if (ret != 0)
+		goto error;
+
+	/* initialize keys for the ICP */
+	key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+	key->zk_current_key.ck_data = key->zk_current_keydata;
+	key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+	key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+	key->zk_hmac_key.ck_data = &key->zk_hmac_key;
+	key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+	ci = &zio_crypt_table[crypt];
+	if (ci->ci_crypt_type != ZC_TYPE_GCM &&
+	    ci->ci_crypt_type != ZC_TYPE_CCM)
+		return (ENOTSUP);
+
+	ret = freebsd_crypt_newsession(&key->zk_session, ci,
+	    &key->zk_current_key);
+	if (ret)
+		goto error;
+
+	key->zk_crypt = crypt;
+	key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
+	key->zk_salt_count = 0;
+
+	return (0);
+
+error:
+	zio_crypt_key_destroy_early(key);
+	return (ret);
+}
+
+static int
+zio_crypt_key_change_salt(zio_crypt_key_t *key)
+{
+	int ret = 0;
+	uint8_t salt[ZIO_DATA_SALT_LEN];
+	crypto_mechanism_t mech __unused;
+
+	uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen;
+
+	/* generate a new salt */
+	ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN);
+	if (ret != 0)
+		goto error;
+
+	rw_enter(&key->zk_salt_lock, RW_WRITER);
+
+	/* someone beat us to the salt rotation, just unlock and return */
+	if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES)
+		goto out_unlock;
+
+	/* derive the current key from the master key and the new salt */
+	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+	    salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len);
+	if (ret != 0)
+		goto out_unlock;
+
+	/* assign the salt and reset the usage count */
+	bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
+	key->zk_salt_count = 0;
+
+	freebsd_crypt_freesession(&key->zk_session);
+	ret = freebsd_crypt_newsession(&key->zk_session,
+	    &zio_crypt_table[key->zk_crypt], &key->zk_current_key);
+	if (ret != 0)
+		goto out_unlock;
+
+	rw_exit(&key->zk_salt_lock);
+
+	return (0);
+
+out_unlock:
+	rw_exit(&key->zk_salt_lock);
+error:
+	return (ret);
+}
+
+/* See comment above zfs_key_max_salt_uses definition for details */
+int
+zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
+{
+	int ret;
+	boolean_t salt_change;
+
+	rw_enter(&key->zk_salt_lock, RW_READER);
+
+	bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN);
+	salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >=
+	    ZFS_CURRENT_MAX_SALT_USES);
+
+	rw_exit(&key->zk_salt_lock);
+
+	if (salt_change) {
+		ret = zio_crypt_key_change_salt(key);
+		if (ret != 0)
+			goto error;
+	}
+
+	return (0);
+
+error:
+	return (ret);
+}
+
+void *failed_decrypt_buf;
+int failed_decrypt_size;
+
+/*
+ * This function handles all encryption and decryption in zfs. When
+ * encrypting it expects puio to reference the plaintext and cuio to
+ * reference the ciphertext. cuio must have enough space for the
+ * ciphertext + room for a MAC. datalen should be the length of the
+ * plaintext / ciphertext alone.
+ */
+/*
+ * The implementation for FreeBSD's OpenCrypto.
+ *
+ * The big difference between ICP and FOC is that FOC uses a single
+ * buffer for input and output.  This means that (for AES-GCM, the
+ * only one supported right now) the source must be copied into the
+ * destination, and the destination must have the AAD, and the tag/MAC,
+ * already associated with it.  (Both implementations can use a uio.)
+ *
+ * Since the auth data is part of the iovec array, all we need to know
+ * is the length:  0 means there's no AAD.
+ *
+ */
+static int
+zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess,
+    uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen,
+    uio_t *uio, uint_t auth_len)
+{
+	zio_crypt_info_t *ci;
+	int ret;
+
+	ci = &zio_crypt_table[crypt];
+	if (ci->ci_crypt_type != ZC_TYPE_GCM &&
+	    ci->ci_crypt_type != ZC_TYPE_CCM)
+		return (ENOTSUP);
+
+
+	ret = freebsd_crypt_uio(encrypt, sess, ci, uio, key, ivbuf,
+	    datalen, auth_len);
+	if (ret != 0) {
+#ifdef FCRYPTO_DEBUG
+		printf("%s(%d):  Returning error %s\n",
+		    __FUNCTION__, __LINE__, encrypt ? "EIO" : "ECKSUM");
+#endif
+		ret = SET_ERROR(encrypt ? EIO : ECKSUM);
+	}
+
+	return (ret);
+}
+
+int
+zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
+    uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
+{
+	int ret;
+	uint64_t aad[3];
+	/*
+	 * With OpenCrypto in FreeBSD, the same buffer is used for
+	 * input and output.  Also, the AAD (for AES-GMC at least)
+	 * needs to logically go in front.
+	 */
+	uio_t cuio;
+	iovec_t iovecs[4];
+	uint64_t crypt = key->zk_crypt;
+	uint_t enc_len, keydata_len, aad_len;
+
+	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+	ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+	keydata_len = zio_crypt_table[crypt].ci_keylen;
+
+	/* generate iv for wrapping the master and hmac key */
+	ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN);
+	if (ret != 0)
+		goto error;
+
+	/*
+	 * Since we only support one buffer, we need to copy
+	 * the plain text (source) to the cipher buffer (dest).
+	 * We set iovecs[0] -- the authentication data -- below.
+	 */
+	bcopy((void*)key->zk_master_keydata, keydata_out, keydata_len);
+	bcopy((void*)key->zk_hmac_keydata, hmac_keydata_out,
+	    SHA512_HMAC_KEYLEN);
+	iovecs[1].iov_base = keydata_out;
+	iovecs[1].iov_len = keydata_len;
+	iovecs[2].iov_base = hmac_keydata_out;
+	iovecs[2].iov_len = SHA512_HMAC_KEYLEN;
+	iovecs[3].iov_base = mac;
+	iovecs[3].iov_len = WRAPPING_MAC_LEN;
+
+	/*
+	 * Although we don't support writing to the old format, we do
+	 * support rewrapping the key so that the user can move and
+	 * quarantine datasets on the old format.
+	 */
+	if (key->zk_version == 0) {
+		aad_len = sizeof (uint64_t);
+		aad[0] = LE_64(key->zk_guid);
+	} else {
+		ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+		aad_len = sizeof (uint64_t) * 3;
+		aad[0] = LE_64(key->zk_guid);
+		aad[1] = LE_64(crypt);
+		aad[2] = LE_64(key->zk_version);
+	}
+
+	iovecs[0].iov_base = aad;
+	iovecs[0].iov_len = aad_len;
+	enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN;
+
+	cuio.uio_iov = iovecs;
+	cuio.uio_iovcnt = 4;
+	cuio.uio_segflg = UIO_SYSSPACE;
+
+	/* encrypt the keys and store the resulting ciphertext and mac */
+	ret = zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey,
+	    iv, enc_len, &cuio, aad_len);
+	if (ret != 0)
+		goto error;
+
+	return (0);
+
+error:
+	return (ret);
+}
+
+int
+zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
+    uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
+    uint8_t *mac, zio_crypt_key_t *key)
+{
+	int ret;
+	uint64_t aad[3];
+	/*
+	 * With OpenCrypto in FreeBSD, the same buffer is used for
+	 * input and output.  Also, the AAD (for AES-GMC at least)
+	 * needs to logically go in front.
+	 */
+	uio_t cuio;
+	iovec_t iovecs[4];
+	void *src, *dst;
+	uint_t enc_len, keydata_len, aad_len;
+
+	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+	ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+	keydata_len = zio_crypt_table[crypt].ci_keylen;
+	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+	/*
+	 * Since we only support one buffer, we need to copy
+	 * the encrypted buffer (source) to the plain buffer
+	 * (dest).  We set iovecs[0] -- the authentication data --
+	 * below.
+	 */
+	dst = key->zk_master_keydata;
+	src = keydata;
+
+	bcopy(src, dst, keydata_len);
+
+	dst = key->zk_hmac_keydata;
+	src = hmac_keydata;
+	bcopy(src, dst, SHA512_HMAC_KEYLEN);
+
+	iovecs[1].iov_base = key->zk_master_keydata;
+	iovecs[1].iov_len = keydata_len;
+	iovecs[2].iov_base = key->zk_hmac_keydata;
+	iovecs[2].iov_len = SHA512_HMAC_KEYLEN;
+	iovecs[3].iov_base = mac;
+	iovecs[3].iov_len = WRAPPING_MAC_LEN;
+
+	if (version == 0) {
+		aad_len = sizeof (uint64_t);
+		aad[0] = LE_64(guid);
+	} else {
+		ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+		aad_len = sizeof (uint64_t) * 3;
+		aad[0] = LE_64(guid);
+		aad[1] = LE_64(crypt);
+		aad[2] = LE_64(version);
+	}
+
+	enc_len = keydata_len + SHA512_HMAC_KEYLEN;
+	iovecs[0].iov_base = aad;
+	iovecs[0].iov_len = aad_len;
+
+	cuio.uio_iov = iovecs;
+	cuio.uio_iovcnt = 4;
+	cuio.uio_segflg = UIO_SYSSPACE;
+
+	/* decrypt the keys and store the result in the output buffers */
+	ret = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey,
+	    iv, enc_len, &cuio, aad_len);
+
+	if (ret != 0)
+		goto error;
+
+	/* generate a fresh salt */
+	ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+	if (ret != 0)
+		goto error;
+
+	/* derive the current key from the master key */
+	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+	    key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+	    keydata_len);
+	if (ret != 0)
+		goto error;
+
+	/* initialize keys for ICP */
+	key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+	key->zk_current_key.ck_data = key->zk_current_keydata;
+	key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+	key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+	key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+	key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+	ret = freebsd_crypt_newsession(&key->zk_session,
+	    &zio_crypt_table[crypt], &key->zk_current_key);
+	if (ret != 0)
+		goto error;
+
+	key->zk_crypt = crypt;
+	key->zk_version = version;
+	key->zk_guid = guid;
+	key->zk_salt_count = 0;
+
+	return (0);
+
+error:
+	zio_crypt_key_destroy_early(key);
+	return (ret);
+}
+
+int
+zio_crypt_generate_iv(uint8_t *ivbuf)
+{
+	int ret;
+
+	/* randomly generate the IV */
+	ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN);
+	if (ret != 0)
+		goto error;
+
+	return (0);
+
+error:
+	bzero(ivbuf, ZIO_DATA_IV_LEN);
+	return (ret);
+}
+
+int
+zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen,
+    uint8_t *digestbuf, uint_t digestlen)
+{
+	uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH];
+
+	ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH);
+
+	crypto_mac(&key->zk_hmac_key, data, datalen,
+	    raw_digestbuf, SHA512_DIGEST_LENGTH);
+
+	bcopy(raw_digestbuf, digestbuf, digestlen);
+
+	return (0);
+}
+
+int
+zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data,
+    uint_t datalen, uint8_t *ivbuf, uint8_t *salt)
+{
+	int ret;
+	uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+	ret = zio_crypt_do_hmac(key, data, datalen,
+	    digestbuf, SHA512_DIGEST_LENGTH);
+	if (ret != 0)
+		return (ret);
+
+	bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN);
+	bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN);
+
+	return (0);
+}
+
+/*
+ * The following functions are used to encode and decode encryption parameters
+ * into blkptr_t and zil_header_t. The ICP wants to use these parameters as
+ * byte strings, which normally means that these strings would not need to deal
+ * with byteswapping at all. However, both blkptr_t and zil_header_t may be
+ * byteswapped by lower layers and so we must "undo" that byteswap here upon
+ * decoding and encoding in a non-native byteorder. These functions require
+ * that the byteorder bit is correct before being called.
+ */
+void
+zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+	uint64_t val64;
+	uint32_t val32;
+
+	ASSERT(BP_IS_ENCRYPTED(bp));
+
+	if (!BP_SHOULD_BYTESWAP(bp)) {
+		bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t));
+		bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t));
+		bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+		BP_SET_IV2(bp, val32);
+	} else {
+		bcopy(salt, &val64, sizeof (uint64_t));
+		bp->blk_dva[2].dva_word[0] = BSWAP_64(val64);
+
+		bcopy(iv, &val64, sizeof (uint64_t));
+		bp->blk_dva[2].dva_word[1] = BSWAP_64(val64);
+
+		bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+		BP_SET_IV2(bp, BSWAP_32(val32));
+	}
+}
+
+void
+zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+	uint64_t val64;
+	uint32_t val32;
+
+	ASSERT(BP_IS_PROTECTED(bp));
+
+	/* for convenience, so callers don't need to check */
+	if (BP_IS_AUTHENTICATED(bp)) {
+		bzero(salt, ZIO_DATA_SALT_LEN);
+		bzero(iv, ZIO_DATA_IV_LEN);
+		return;
+	}
+
+	if (!BP_SHOULD_BYTESWAP(bp)) {
+		bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t));
+		bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t));
+
+		val32 = (uint32_t)BP_GET_IV2(bp);
+		bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+	} else {
+		val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]);
+		bcopy(&val64, salt, sizeof (uint64_t));
+
+		val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]);
+		bcopy(&val64, iv, sizeof (uint64_t));
+
+		val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp));
+		bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+	}
+}
+
+void
+zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac)
+{
+	uint64_t val64;
+
+	ASSERT(BP_USES_CRYPT(bp));
+	ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET);
+
+	if (!BP_SHOULD_BYTESWAP(bp)) {
+		bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t));
+		bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3],
+		    sizeof (uint64_t));
+	} else {
+		bcopy(mac, &val64, sizeof (uint64_t));
+		bp->blk_cksum.zc_word[2] = BSWAP_64(val64);
+
+		bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t));
+		bp->blk_cksum.zc_word[3] = BSWAP_64(val64);
+	}
+}
+
+void
+zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac)
+{
+	uint64_t val64;
+
+	ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp));
+
+	/* for convenience, so callers don't need to check */
+	if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+		bzero(mac, ZIO_DATA_MAC_LEN);
+		return;
+	}
+
+	if (!BP_SHOULD_BYTESWAP(bp)) {
+		bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t));
+		bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t),
+		    sizeof (uint64_t));
+	} else {
+		val64 = BSWAP_64(bp->blk_cksum.zc_word[2]);
+		bcopy(&val64, mac, sizeof (uint64_t));
+
+		val64 = BSWAP_64(bp->blk_cksum.zc_word[3]);
+		bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t));
+	}
+}
+
+void
+zio_crypt_encode_mac_zil(void *data, uint8_t *mac)
+{
+	zil_chain_t *zilc = data;
+
+	bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t));
+	bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3],
+	    sizeof (uint64_t));
+}
+
+void
+zio_crypt_decode_mac_zil(const void *data, uint8_t *mac)
+{
+	/*
+	 * The ZIL MAC is embedded in the block it protects, which will
+	 * not have been byteswapped by the time this function has been called.
+	 * As a result, we don't need to worry about byteswapping the MAC.
+	 */
+	const zil_chain_t *zilc = data;
+
+	bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t));
+	bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t),
+	    sizeof (uint64_t));
+}
+
+/*
+ * This routine takes a block of dnodes (src_abd) and copies only the bonus
+ * buffers to the same offsets in the dst buffer. datalen should be the size
+ * of both the src_abd and the dst buffer (not just the length of the bonus
+ * buffers).
+ */
+void
+zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen)
+{
+	uint_t i, max_dnp = datalen >> DNODE_SHIFT;
+	uint8_t *src;
+	dnode_phys_t *dnp, *sdnp, *ddnp;
+
+	src = abd_borrow_buf_copy(src_abd, datalen);
+
+	sdnp = (dnode_phys_t *)src;
+	ddnp = (dnode_phys_t *)dst;
+
+	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+		dnp = &sdnp[i];
+		if (dnp->dn_type != DMU_OT_NONE &&
+		    DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+		    dnp->dn_bonuslen != 0) {
+			bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]),
+			    DN_MAX_BONUS_LEN(dnp));
+		}
+	}
+
+	abd_return_buf(src_abd, src, datalen);
+}
+
+/*
+ * This function decides what fields from blk_prop are included in
+ * the on-disk various MAC algorithms.
+ */
+static void
+zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version)
+{
+	int avoidlint = SPA_MINBLOCKSIZE;
+	/*
+	 * Version 0 did not properly zero out all non-portable fields
+	 * as it should have done. We maintain this code so that we can
+	 * do read-only imports of pools on this version.
+	 */
+	if (version == 0) {
+		BP_SET_DEDUP(bp, 0);
+		BP_SET_CHECKSUM(bp, 0);
+		BP_SET_PSIZE(bp, avoidlint);
+		return;
+	}
+
+	ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+
+	/*
+	 * The hole_birth feature might set these fields even if this bp
+	 * is a hole. We zero them out here to guarantee that raw sends
+	 * will function with or without the feature.
+	 */
+	if (BP_IS_HOLE(bp)) {
+		bp->blk_prop = 0ULL;
+		return;
+	}
+
+	/*
+	 * At L0 we want to verify these fields to ensure that data blocks
+	 * can not be reinterpreted. For instance, we do not want an attacker
+	 * to trick us into returning raw lz4 compressed data to the user
+	 * by modifying the compression bits. At higher levels, we cannot
+	 * enforce this policy since raw sends do not convey any information
+	 * about indirect blocks, so these values might be different on the
+	 * receive side. Fortunately, this does not open any new attack
+	 * vectors, since any alterations that can be made to a higher level
+	 * bp must still verify the correct order of the layer below it.
+	 */
+	if (BP_GET_LEVEL(bp) != 0) {
+		BP_SET_BYTEORDER(bp, 0);
+		BP_SET_COMPRESS(bp, 0);
+
+		/*
+		 * psize cannot be set to zero or it will trigger
+		 * asserts, but the value doesn't really matter as
+		 * long as it is constant.
+		 */
+		BP_SET_PSIZE(bp, avoidlint);
+	}
+
+	BP_SET_DEDUP(bp, 0);
+	BP_SET_CHECKSUM(bp, 0);
+}
+
+static void
+zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp,
+    blkptr_auth_buf_t *bab, uint_t *bab_len)
+{
+	blkptr_t tmpbp = *bp;
+
+	if (should_bswap)
+		byteswap_uint64_array(&tmpbp, sizeof (blkptr_t));
+
+	ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp));
+	ASSERT0(BP_IS_EMBEDDED(&tmpbp));
+
+	zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac);
+
+	/*
+	 * We always MAC blk_prop in LE to ensure portability. This
+	 * must be done after decoding the mac, since the endianness
+	 * will get zero'd out here.
+	 */
+	zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version);
+	bab->bab_prop = LE_64(tmpbp.blk_prop);
+	bab->bab_pad = 0ULL;
+
+	/* version 0 did not include the padding */
+	*bab_len = sizeof (blkptr_auth_buf_t);
+	if (version == 0)
+		*bab_len -= sizeof (uint64_t);
+}
+
+static int
+zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version,
+    boolean_t should_bswap, blkptr_t *bp)
+{
+	uint_t bab_len;
+	blkptr_auth_buf_t bab;
+
+	zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+	crypto_mac_update(ctx, &bab, bab_len);
+
+	return (0);
+}
+
+static void
+zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version,
+    boolean_t should_bswap, blkptr_t *bp)
+{
+	uint_t bab_len;
+	blkptr_auth_buf_t bab;
+
+	zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+	SHA2Update(ctx, &bab, bab_len);
+}
+
+static void
+zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version,
+    boolean_t should_bswap, blkptr_t *bp)
+{
+	uint_t bab_len;
+	blkptr_auth_buf_t bab;
+
+	zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+	bcopy(&bab, *aadp, bab_len);
+	*aadp += bab_len;
+	*aad_len += bab_len;
+}
+
+static int
+zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
+    boolean_t should_bswap, dnode_phys_t *dnp)
+{
+	int ret, i;
+	dnode_phys_t *adnp;
+	boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+	uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)];
+
+	/* authenticate the core dnode (masking out non-portable bits) */
+	bcopy(dnp, tmp_dncore, sizeof (tmp_dncore));
+	adnp = (dnode_phys_t *)tmp_dncore;
+	if (le_bswap) {
+		adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec);
+		adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen);
+		adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid);
+		adnp->dn_used = BSWAP_64(adnp->dn_used);
+	}
+	adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+	adnp->dn_used = 0;
+
+	crypto_mac_update(ctx, adnp, sizeof (tmp_dncore));
+
+	for (i = 0; i < dnp->dn_nblkptr; i++) {
+		ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+		    should_bswap, &dnp->dn_blkptr[i]);
+		if (ret != 0)
+			goto error;
+	}
+
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+		    should_bswap, DN_SPILL_BLKPTR(dnp));
+		if (ret != 0)
+			goto error;
+	}
+
+	return (0);
+
+error:
+	return (ret);
+}
+
+/*
+ * objset_phys_t blocks introduce a number of exceptions to the normal
+ * authentication process. objset_phys_t's contain 2 separate HMACS for
+ * protecting the integrity of their data. The portable_mac protects the
+ * metadnode. This MAC can be sent with a raw send and protects against
+ * reordering of data within the metadnode. The local_mac protects the user
+ * accounting objects which are not sent from one system to another.
+ *
+ * In addition, objset blocks are the only blocks that can be modified and
+ * written to disk without the key loaded under certain circumstances. During
+ * zil_claim() we need to be able to update the zil_header_t to complete
+ * claiming log blocks and during raw receives we need to write out the
+ * portable_mac from the send file. Both of these actions are possible
+ * because these fields are not protected by either MAC so neither one will
+ * need to modify the MACs without the key. However, when the modified blocks
+ * are written out they will be byteswapped into the host machine's native
+ * endianness which will modify fields protected by the MAC. As a result, MAC
+ * calculation for objset blocks works slightly differently from other block
+ * types. Where other block types MAC the data in whatever endianness is
+ * written to disk, objset blocks always MAC little endian version of their
+ * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP()
+ * and le_bswap indicates whether a byteswap is needed to get this block
+ * into little endian format.
+ */
+/* ARGSUSED */
+int
+zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
+    boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac)
+{
+	int ret;
+	struct hmac_ctx hash_ctx;
+	struct hmac_ctx *ctx = &hash_ctx;
+	objset_phys_t *osp = data;
+	uint64_t intval;
+	boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+	uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH];
+	uint8_t raw_local_mac[SHA512_DIGEST_LENGTH];
+
+
+	/* calculate the portable MAC from the portable fields and metadnode */
+	crypto_mac_init(ctx, &key->zk_hmac_key);
+
+	/* add in the os_type */
+	intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type);
+	crypto_mac_update(ctx, &intval, sizeof (uint64_t));
+
+	/* add in the portable os_flags */
+	intval = osp->os_flags;
+	if (should_bswap)
+		intval = BSWAP_64(intval);
+	intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+	/* CONSTCOND */
+	if (!ZFS_HOST_BYTEORDER)
+		intval = BSWAP_64(intval);
+
+	crypto_mac_update(ctx, &intval, sizeof (uint64_t));
+
+	/* add in fields from the metadnode */
+	ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+	    should_bswap, &osp->os_meta_dnode);
+	if (ret)
+		goto error;
+
+	crypto_mac_final(ctx, raw_portable_mac, SHA512_DIGEST_LENGTH);
+
+	bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN);
+
+	/*
+	 * The local MAC protects the user, group and project accounting.
+	 * If these objects are not present, the local MAC is zeroed out.
+	 */
+	if ((datalen >= OBJSET_PHYS_SIZE_V3 &&
+	    osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+	    osp->os_groupused_dnode.dn_type == DMU_OT_NONE &&
+	    osp->os_projectused_dnode.dn_type == DMU_OT_NONE) ||
+	    (datalen >= OBJSET_PHYS_SIZE_V2 &&
+	    osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+	    osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
+	    (datalen <= OBJSET_PHYS_SIZE_V1)) {
+		bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+		return (0);
+	}
+
+	/* calculate the local MAC from the userused and groupused dnodes */
+	crypto_mac_init(ctx, &key->zk_hmac_key);
+
+	/* add in the non-portable os_flags */
+	intval = osp->os_flags;
+	if (should_bswap)
+		intval = BSWAP_64(intval);
+	intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+	/* CONSTCOND */
+	if (!ZFS_HOST_BYTEORDER)
+		intval = BSWAP_64(intval);
+
+	crypto_mac_update(ctx, &intval, sizeof (uint64_t));
+
+	/* XXX check dnode type ... */
+	/* add in fields from the user accounting dnodes */
+	if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) {
+		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+		    should_bswap, &osp->os_userused_dnode);
+		if (ret)
+			goto error;
+	}
+
+	if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) {
+		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+		    should_bswap, &osp->os_groupused_dnode);
+		if (ret)
+			goto error;
+	}
+
+	if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE &&
+	    datalen >= OBJSET_PHYS_SIZE_V3) {
+		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+		    should_bswap, &osp->os_projectused_dnode);
+		if (ret)
+			goto error;
+	}
+
+	crypto_mac_final(ctx, raw_local_mac, SHA512_DIGEST_LENGTH);
+
+	bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN);
+
+	return (0);
+
+error:
+	bzero(portable_mac, ZIO_OBJSET_MAC_LEN);
+	bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+	return (ret);
+}
+
+static void
+zio_crypt_destroy_uio(uio_t *uio)
+{
+	if (uio->uio_iov)
+		kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t));
+}
+
+/*
+ * This function parses an uncompressed indirect block and returns a checksum
+ * of all the portable fields from all of the contained bps. The portable
+ * fields are the MAC and all of the fields from blk_prop except for the dedup,
+ * checksum, and psize bits. For an explanation of the purpose of this, see
+ * the comment block on object set authentication.
+ */
+static int
+zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf,
+    uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum)
+{
+	blkptr_t *bp;
+	int i, epb = datalen >> SPA_BLKPTRSHIFT;
+	SHA2_CTX ctx;
+	uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+	/* checksum all of the MACs from the layer below */
+	SHA2Init(SHA512, &ctx);
+	for (i = 0, bp = buf; i < epb; i++, bp++) {
+		zio_crypt_bp_do_indrect_checksum_updates(&ctx, version,
+		    byteswap, bp);
+	}
+	SHA2Final(digestbuf, &ctx);
+
+	if (generate) {
+		bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN);
+		return (0);
+	}
+
+	if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) {
+#ifdef FCRYPTO_DEBUG
+		printf("%s(%d): Setting ECKSUM\n", __FUNCTION__, __LINE__);
+#endif
+		return (SET_ERROR(ECKSUM));
+	}
+	return (0);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
+    uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+	int ret;
+
+	/*
+	 * Unfortunately, callers of this function will not always have
+	 * easy access to the on-disk format version. This info is
+	 * normally found in the DSL Crypto Key, but the checksum-of-MACs
+	 * is expected to be verifiable even when the key isn't loaded.
+	 * Here, instead of doing a ZAP lookup for the version for each
+	 * zio, we simply try both existing formats.
+	 */
+	ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
+	    datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum);
+	if (ret == ECKSUM) {
+		ASSERT(!generate);
+		ret = zio_crypt_do_indirect_mac_checksum_impl(generate,
+		    buf, datalen, 0, byteswap, cksum);
+	}
+
+	return (ret);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
+    uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+	int ret;
+	void *buf;
+
+	buf = abd_borrow_buf_copy(abd, datalen);
+	ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen,
+	    byteswap, cksum);
+	abd_return_buf(abd, buf, datalen);
+
+	return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting ZIL blocks.
+ * We do not check for the older ZIL chain because the encryption feature
+ * was not available before the newer ZIL chain was introduced. The goal
+ * here is to encrypt everything except the blkptr_t of a lr_write_t and
+ * the zil_chain_t header. Everything that is not encrypted is authenticated.
+ */
+/*
+ * The OpenCrypto used in FreeBSD does not use separate source and
+ * destination buffers; instead, the same buffer is used.  Further, to
+ * accommodate some of the drivers, the authbuf needs to be logically before
+ * the data.  This means that we need to copy the source to the destination,
+ * and set up an extra iovec_t at the beginning to handle the authbuf.
+ * It also means we'll only return one uio_t, which we do via the clumsy
+ * ifdef in the function declaration.
+ */
+
+/* ARGSUSED */
+static int
+zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
+    uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio,
+    uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
+    boolean_t *no_crypt)
+{
+	int ret;
+	uint64_t txtype, lr_len;
+	uint_t nr_src, nr_dst, crypt_len;
+	uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+	iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+	uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
+	zil_chain_t *zilc;
+	lr_t *lr;
+	uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+	/* cipherbuf always needs an extra iovec for the MAC */
+	if (encrypt) {
+		src = plainbuf;
+		dst = cipherbuf;
+		nr_src = 0;
+		nr_dst = 1;
+	} else {
+		src = cipherbuf;
+		dst = plainbuf;
+		nr_src = 1;
+		nr_dst = 0;
+	}
+
+	/*
+	 * We need at least two iovecs -- one for the AAD,
+	 * one for the MAC.
+	 */
+	bcopy(src, dst, datalen);
+	nr_dst = 2;
+
+	/* find the start and end record of the log block */
+	zilc = (zil_chain_t *)src;
+	slrp = src + sizeof (zil_chain_t);
+	aadp = aadbuf;
+	blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+
+	/* calculate the number of encrypted iovecs we will need */
+	for (; slrp < blkend; slrp += lr_len) {
+		lr = (lr_t *)slrp;
+
+		if (!byteswap) {
+			txtype = lr->lrc_txtype;
+			lr_len = lr->lrc_reclen;
+		} else {
+			txtype = BSWAP_64(lr->lrc_txtype);
+			lr_len = BSWAP_64(lr->lrc_reclen);
+		}
+
+		nr_iovecs++;
+		if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
+			nr_iovecs++;
+	}
+
+	nr_src = 0;
+	nr_dst += nr_iovecs;
+
+	/* allocate the iovec arrays */
+	if (nr_src != 0) {
+		src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+		if (src_iovecs == NULL) {
+			ret = SET_ERROR(ENOMEM);
+			goto error;
+		}
+		bzero(src_iovecs, nr_src * sizeof (iovec_t));
+	}
+
+	if (nr_dst != 0) {
+		dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+		if (dst_iovecs == NULL) {
+			ret = SET_ERROR(ENOMEM);
+			goto error;
+		}
+		bzero(dst_iovecs, nr_dst * sizeof (iovec_t));
+	}
+
+	/*
+	 * Copy the plain zil header over and authenticate everything except
+	 * the checksum that will store our MAC. If we are writing the data
+	 * the embedded checksum will not have been calculated yet, so we don't
+	 * authenticate that.
+	 */
+	bcopy(src, dst, sizeof (zil_chain_t));
+	bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t));
+	aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+	aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+
+	/* loop over records again, filling in iovecs */
+	/* The first one will contain the authbuf */
+	nr_iovecs = 1;
+
+	slrp = src + sizeof (zil_chain_t);
+	dlrp = dst + sizeof (zil_chain_t);
+
+	for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
+		lr = (lr_t *)slrp;
+
+		if (!byteswap) {
+			txtype = lr->lrc_txtype;
+			lr_len = lr->lrc_reclen;
+		} else {
+			txtype = BSWAP_64(lr->lrc_txtype);
+			lr_len = BSWAP_64(lr->lrc_reclen);
+		}
+
+		/* copy the common lr_t */
+		bcopy(slrp, dlrp, sizeof (lr_t));
+		bcopy(slrp, aadp, sizeof (lr_t));
+		aadp += sizeof (lr_t);
+		aad_len += sizeof (lr_t);
+
+		ASSERT3P(dst_iovecs, !=, NULL);
+
+		/*
+		 * If this is a TX_WRITE record we want to encrypt everything
+		 * except the bp if exists. If the bp does exist we want to
+		 * authenticate it.
+		 */
+		if (txtype == TX_WRITE) {
+			crypt_len = sizeof (lr_write_t) -
+			    sizeof (lr_t) - sizeof (blkptr_t);
+			dst_iovecs[nr_iovecs].iov_base = (char *)dlrp +
+			    sizeof (lr_t);
+			dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+			/* copy the bp now since it will not be encrypted */
+			bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+			    dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+			    sizeof (blkptr_t));
+			bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+			    aadp, sizeof (blkptr_t));
+			aadp += sizeof (blkptr_t);
+			aad_len += sizeof (blkptr_t);
+			nr_iovecs++;
+			total_len += crypt_len;
+
+			if (lr_len != sizeof (lr_write_t)) {
+				crypt_len = lr_len - sizeof (lr_write_t);
+				dst_iovecs[nr_iovecs].iov_base = (char *)
+				    dlrp + sizeof (lr_write_t);
+				dst_iovecs[nr_iovecs].iov_len = crypt_len;
+				nr_iovecs++;
+				total_len += crypt_len;
+			}
+		} else {
+			crypt_len = lr_len - sizeof (lr_t);
+			dst_iovecs[nr_iovecs].iov_base = (char *)dlrp +
+			    sizeof (lr_t);
+			dst_iovecs[nr_iovecs].iov_len = crypt_len;
+			nr_iovecs++;
+			total_len += crypt_len;
+		}
+	}
+
+	*no_crypt = (nr_iovecs == 0);
+	*enc_len = total_len;
+	*authbuf = aadbuf;
+	*auth_len = aad_len;
+	dst_iovecs[0].iov_base = aadbuf;
+	dst_iovecs[0].iov_len = aad_len;
+
+	out_uio->uio_iov = dst_iovecs;
+	out_uio->uio_iovcnt = nr_dst;
+
+	return (0);
+
+error:
+	zio_buf_free(aadbuf, datalen);
+	if (src_iovecs != NULL)
+		kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+	if (dst_iovecs != NULL)
+		kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+	*enc_len = 0;
+	*authbuf = NULL;
+	*auth_len = 0;
+	*no_crypt = B_FALSE;
+	puio->uio_iov = NULL;
+	puio->uio_iovcnt = 0;
+	out_uio->uio_iov = NULL;
+	out_uio->uio_iovcnt = 0;
+
+	return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting dnode blocks.
+ */
+static int
+zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
+    uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+    uio_t *puio, uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf,
+    uint_t *auth_len, boolean_t *no_crypt)
+{
+	int ret;
+	uint_t nr_src, nr_dst, crypt_len;
+	uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+	uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
+	iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+	uint8_t *src, *dst, *aadp;
+	dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
+	uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+	if (encrypt) {
+		src = plainbuf;
+		dst = cipherbuf;
+		nr_src = 0;
+		nr_dst = 1;
+	} else {
+		src = cipherbuf;
+		dst = plainbuf;
+		nr_src = 1;
+		nr_dst = 0;
+	}
+
+	bcopy(src, dst, datalen);
+	nr_dst = 2;
+
+	sdnp = (dnode_phys_t *)src;
+	ddnp = (dnode_phys_t *)dst;
+	aadp = aadbuf;
+
+	/*
+	 * Count the number of iovecs we will need to do the encryption by
+	 * counting the number of bonus buffers that need to be encrypted.
+	 */
+	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+		/*
+		 * This block may still be byteswapped. However, all of the
+		 * values we use are either uint8_t's (for which byteswapping
+		 * is a noop) or a * != 0 check, which will work regardless
+		 * of whether or not we byteswap.
+		 */
+		if (sdnp[i].dn_type != DMU_OT_NONE &&
+		    DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
+		    sdnp[i].dn_bonuslen != 0) {
+			nr_iovecs++;
+		}
+	}
+
+	nr_src = 0;
+	nr_dst += nr_iovecs;
+
+	if (nr_src != 0) {
+		src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+		if (src_iovecs == NULL) {
+			ret = SET_ERROR(ENOMEM);
+			goto error;
+		}
+		bzero(src_iovecs, nr_src * sizeof (iovec_t));
+	}
+
+	if (nr_dst != 0) {
+		dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+		if (dst_iovecs == NULL) {
+			ret = SET_ERROR(ENOMEM);
+			goto error;
+		}
+		bzero(dst_iovecs, nr_dst * sizeof (iovec_t));
+	}
+
+	nr_iovecs = 1;
+
+	/*
+	 * Iterate through the dnodes again, this time filling in the uios
+	 * we allocated earlier. We also concatenate any data we want to
+	 * authenticate onto aadbuf.
+	 */
+	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+		dnp = &sdnp[i];
+
+		/* copy over the core fields and blkptrs (kept as plaintext) */
+		bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
+
+		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+			bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]),
+			    sizeof (blkptr_t));
+		}
+
+		/*
+		 * Handle authenticated data. We authenticate everything in
+		 * the dnode that can be brought over when we do a raw send.
+		 * This includes all of the core fields as well as the MACs
+		 * stored in the bp checksums and all of the portable bits
+		 * from blk_prop. We include the dnode padding here in case it
+		 * ever gets used in the future. Some dn_flags and dn_used are
+		 * not portable so we mask those out values out of the
+		 * authenticated data.
+		 */
+		crypt_len = offsetof(dnode_phys_t, dn_blkptr);
+		bcopy(dnp, aadp, crypt_len);
+		adnp = (dnode_phys_t *)aadp;
+		adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+		adnp->dn_used = 0;
+		aadp += crypt_len;
+		aad_len += crypt_len;
+
+		for (j = 0; j < dnp->dn_nblkptr; j++) {
+			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+			    version, byteswap, &dnp->dn_blkptr[j]);
+		}
+
+		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+			    version, byteswap, DN_SPILL_BLKPTR(dnp));
+		}
+
+		/*
+		 * If this bonus buffer needs to be encrypted, we prepare an
+		 * iovec_t. The encryption / decryption functions will fill
+		 * this in for us with the encrypted or decrypted data.
+		 * Otherwise we add the bonus buffer to the authenticated
+		 * data buffer and copy it over to the destination. The
+		 * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
+		 * we can guarantee alignment with the AES block size
+		 * (128 bits).
+		 */
+		crypt_len = DN_MAX_BONUS_LEN(dnp);
+		if (dnp->dn_type != DMU_OT_NONE &&
+		    DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+		    dnp->dn_bonuslen != 0) {
+			ASSERT3U(nr_iovecs, <, nr_dst);
+			ASSERT3P(dst_iovecs, !=, NULL);
+			dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]);
+			dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+			nr_iovecs++;
+			total_len += crypt_len;
+		} else {
+			bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len);
+			bcopy(DN_BONUS(dnp), aadp, crypt_len);
+			aadp += crypt_len;
+			aad_len += crypt_len;
+		}
+	}
+
+	*no_crypt = (nr_iovecs == 0);
+	*enc_len = total_len;
+	*authbuf = aadbuf;
+	*auth_len = aad_len;
+
+	dst_iovecs[0].iov_base = aadbuf;
+	dst_iovecs[0].iov_len = aad_len;
+	out_uio->uio_iov = dst_iovecs;
+	out_uio->uio_iovcnt = nr_dst;
+
+	return (0);
+
+error:
+	zio_buf_free(aadbuf, datalen);
+	if (src_iovecs != NULL)
+		kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+	if (dst_iovecs != NULL)
+		kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+	*enc_len = 0;
+	*authbuf = NULL;
+	*auth_len = 0;
+	*no_crypt = B_FALSE;
+	out_uio->uio_iov = NULL;
+	out_uio->uio_iovcnt = 0;
+
+	return (ret);
+}
+
+/* ARGSUSED */
+static int
+zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
+    uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *out_uio,
+    uint_t *enc_len)
+{
+	int ret;
+	uint_t nr_plain = 1, nr_cipher = 2;
+	iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL;
+	void *src, *dst;
+
+	cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t),
+	    KM_SLEEP);
+	if (!cipher_iovecs) {
+		ret = SET_ERROR(ENOMEM);
+		goto error;
+	}
+	bzero(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+	if (encrypt) {
+		src = plainbuf;
+		dst = cipherbuf;
+	} else {
+		src = cipherbuf;
+		dst = plainbuf;
+	}
+	bcopy(src, dst, datalen);
+	cipher_iovecs[0].iov_base = dst;
+	cipher_iovecs[0].iov_len = datalen;
+
+	*enc_len = datalen;
+	out_uio->uio_iov = cipher_iovecs;
+	out_uio->uio_iovcnt = nr_cipher;
+
+	return (0);
+
+error:
+	if (plain_iovecs != NULL)
+		kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t));
+	if (cipher_iovecs != NULL)
+		kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+	*enc_len = 0;
+	out_uio->uio_iov = NULL;
+	out_uio->uio_iovcnt = 0;
+
+	return (ret);
+}
+
+/*
+ * This function builds up the plaintext (puio) and ciphertext (cuio) uios so
+ * that they can be used for encryption and decryption by zio_do_crypt_uio().
+ * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks
+ * requiring special handling to parse out pieces that are to be encrypted. The
+ * authbuf is used by these special cases to store additional authenticated
+ * data (AAD) for the encryption modes.
+ */
+static int
+zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
+    uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+    uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf,
+    uint_t *auth_len, boolean_t *no_crypt)
+{
+	int ret;
+	iovec_t *mac_iov;
+
+	ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
+
+	/* route to handler */
+	switch (ot) {
+	case DMU_OT_INTENT_LOG:
+		ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
+		    datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
+		    no_crypt);
+		break;
+	case DMU_OT_DNODE:
+		ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
+		    cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
+		    auth_len, no_crypt);
+		break;
+	default:
+		ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
+		    datalen, puio, cuio, enc_len);
+		*authbuf = NULL;
+		*auth_len = 0;
+		*no_crypt = B_FALSE;
+		break;
+	}
+
+	if (ret != 0)
+		goto error;
+
+	/* populate the uios */
+	cuio->uio_segflg = UIO_SYSSPACE;
+
+	mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]);
+	mac_iov->iov_base = (void *)mac;
+	mac_iov->iov_len = ZIO_DATA_MAC_LEN;
+
+	return (0);
+
+error:
+	return (ret);
+}
+
+void *failed_decrypt_buf;
+int faile_decrypt_size;
+
+/*
+ * Primary encryption / decryption entrypoint for zio data.
+ */
+int
+zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
+    dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
+    uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
+    boolean_t *no_crypt)
+{
+	int ret;
+	boolean_t locked = B_FALSE;
+	uint64_t crypt = key->zk_crypt;
+	uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
+	uint_t enc_len, auth_len;
+	uio_t puio, cuio;
+	uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
+	crypto_key_t tmp_ckey, *ckey = NULL;
+	freebsd_crypt_session_t *tmpl = NULL;
+	uint8_t *authbuf = NULL;
+
+	bzero(&puio, sizeof (uio_t));
+	bzero(&cuio, sizeof (uio_t));
+
+#ifdef FCRYPTO_DEBUG
+	printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n",
+	    __FUNCTION__,
+	    encrypt ? "encrypt" : "decrypt",
+	    key, salt, ot, iv, mac, datalen,
+	    byteswap ? "byteswap" : "native_endian", plainbuf,
+	    cipherbuf, no_crypt);
+
+	printf("\tkey = {");
+	for (int i = 0; i < key->zk_current_key.ck_length/8; i++)
+		printf("%02x ", ((uint8_t *)key->zk_current_key.ck_data)[i]);
+	printf("}\n");
+#endif
+	/* create uios for encryption */
+	ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
+	    cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
+	    &authbuf, &auth_len, no_crypt);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * If the needed key is the current one, just use it. Otherwise we
+	 * need to generate a temporary one from the given salt + master key.
+	 * If we are encrypting, we must return a copy of the current salt
+	 * so that it can be stored in the blkptr_t.
+	 */
+	rw_enter(&key->zk_salt_lock, RW_READER);
+	locked = B_TRUE;
+
+	if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
+		ckey = &key->zk_current_key;
+		tmpl = &key->zk_session;
+	} else {
+		rw_exit(&key->zk_salt_lock);
+		locked = B_FALSE;
+
+		ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+		    salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
+		if (ret != 0)
+			goto error;
+		tmp_ckey.ck_format = CRYPTO_KEY_RAW;
+		tmp_ckey.ck_data = enc_keydata;
+		tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+		ckey = &tmp_ckey;
+		tmpl = NULL;
+	}
+
+	/* perform the encryption / decryption */
+	ret = zio_do_crypt_uio_opencrypto(encrypt, tmpl, key->zk_crypt,
+	    ckey, iv, enc_len, &cuio, auth_len);
+	if (ret != 0)
+		goto error;
+	if (locked) {
+		rw_exit(&key->zk_salt_lock);
+		locked = B_FALSE;
+	}
+
+	if (authbuf != NULL)
+		zio_buf_free(authbuf, datalen);
+	if (ckey == &tmp_ckey)
+		bzero(enc_keydata, keydata_len);
+	zio_crypt_destroy_uio(&puio);
+	zio_crypt_destroy_uio(&cuio);
+
+	return (0);
+
+error:
+	if (!encrypt) {
+		if (failed_decrypt_buf != NULL)
+			kmem_free(failed_decrypt_buf, failed_decrypt_size);
+		failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP);
+		failed_decrypt_size = datalen;
+		bcopy(cipherbuf, failed_decrypt_buf, datalen);
+	}
+	if (locked)
+		rw_exit(&key->zk_salt_lock);
+	if (authbuf != NULL)
+		zio_buf_free(authbuf, datalen);
+	if (ckey == &tmp_ckey)
+		bzero(enc_keydata, keydata_len);
+	zio_crypt_destroy_uio(&puio);
+	zio_crypt_destroy_uio(&cuio);
+	return (SET_ERROR(ret));
+}
+
+/*
+ * Simple wrapper around zio_do_crypt_data() to work with abd's instead of
+ * linear buffers.
+ */
+int
+zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
+    boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
+    uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
+{
+	int ret;
+	void *ptmp, *ctmp;
+
+	if (encrypt) {
+		ptmp = abd_borrow_buf_copy(pabd, datalen);
+		ctmp = abd_borrow_buf(cabd, datalen);
+	} else {
+		ptmp = abd_borrow_buf(pabd, datalen);
+		ctmp = abd_borrow_buf_copy(cabd, datalen);
+	}
+
+	ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
+	    datalen, ptmp, ctmp, no_crypt);
+	if (ret != 0)
+		goto error;
+
+	if (encrypt) {
+		abd_return_buf(pabd, ptmp, datalen);
+		abd_return_buf_copy(cabd, ctmp, datalen);
+	} else {
+		abd_return_buf_copy(pabd, ptmp, datalen);
+		abd_return_buf(cabd, ctmp, datalen);
+	}
+
+	return (0);
+
+error:
+	if (encrypt) {
+		abd_return_buf(pabd, ptmp, datalen);
+		abd_return_buf_copy(cabd, ctmp, datalen);
+	} else {
+		abd_return_buf_copy(pabd, ptmp, datalen);
+		abd_return_buf(cabd, ctmp, datalen);
+	}
+
+	return (SET_ERROR(ret));
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+/* BEGIN CSTYLED */
+module_param(zfs_key_max_salt_uses, ulong, 0644);
+MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
+	"can be used for generating encryption keys before it is rotated");
+/* END CSTYLED */
+#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
new file mode 100644
index 000000000000..113733a5c11a
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@ -0,0 +1,1454 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions Copyright 2010 Robert Milkowski
+ *
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
+
+/*
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ * /dev/zvol/<pool_name>/<dataset_name>
+ *
+ * Volumes are persistent through reboot.  No user command needs to be
+ * run before opening and using a device.
+ *
+ * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
+ * in the system. Except when they're simply character devices (volmode=dev).
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/proc.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/disk.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dnode.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/byteorder.h>
+#include <sys/sunddi.h>
+#include <sys/dirent.h>
+#include <sys/policy.h>
+#include <sys/queue.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zil.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_rlock.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
+#include <sys/zvol.h>
+#include <sys/zil_impl.h>
+#include <sys/dataset_kstats.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
+#include <sys/zil_impl.h>
+#include <sys/filio.h>
+
+#include <geom/geom.h>
+#include <sys/zvol.h>
+#include <sys/zvol_impl.h>
+
+#include "zfs_namecheck.h"
+
+#define	ZVOL_DUMPSIZE		"dumpsize"
+
+#ifdef ZVOL_LOCK_DEBUG
+#define	ZVOL_RW_READER		RW_WRITER
+#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
+#else
+#define	ZVOL_RW_READER		RW_READER
+#define	ZVOL_RW_READ_HELD	RW_READ_HELD
+#endif
+
+enum zvol_geom_state {
+	ZVOL_GEOM_UNINIT,
+	ZVOL_GEOM_STOPPED,
+	ZVOL_GEOM_RUNNING,
+};
+
+struct zvol_state_os {
+	int zso_volmode;
+#define	zso_dev		_zso_state._zso_dev
+#define	zso_geom	_zso_state._zso_geom
+	union {
+		/* volmode=dev */
+		struct zvol_state_dev {
+			struct cdev *zsd_cdev;
+			uint64_t zsd_sync_cnt;
+		} _zso_dev;
+
+		/* volmode=geom */
+		struct zvol_state_geom {
+			struct g_provider *zsg_provider;
+			struct bio_queue_head zsg_queue;
+			struct mtx zsg_queue_mtx;
+			enum zvol_geom_state zsg_state;
+		} _zso_geom;
+	} _zso_state;
+};
+
+static uint32_t zvol_minors;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
+	"Expose as GEOM providers (1), device files (2) or neither");
+static boolean_t zpool_on_zvol = B_FALSE;
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
+	"Allow zpools to use zvols as vdevs (DANGEROUS)");
+
+/*
+ * Toggle unmap functionality.
+ */
+boolean_t zvol_unmap_enabled = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
+	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");
+
+/*
+ * zvol maximum transfer in one DMU tx.
+ */
+int zvol_maxphys = DMU_MAX_ACCESS / 2;
+
+static void zvol_ensure_zilog(zvol_state_t *zv);
+
+static d_open_t		zvol_cdev_open;
+static d_close_t	zvol_cdev_close;
+static d_ioctl_t	zvol_cdev_ioctl;
+static d_read_t		zvol_cdev_read;
+static d_write_t	zvol_cdev_write;
+static d_strategy_t	zvol_geom_bio_strategy;
+
+static struct cdevsw zvol_cdevsw = {
+	.d_name =	"zvol",
+	.d_version =	D_VERSION,
+	.d_flags =	D_DISK | D_TRACKCLOSE,
+	.d_open =	zvol_cdev_open,
+	.d_close =	zvol_cdev_close,
+	.d_ioctl =	zvol_cdev_ioctl,
+	.d_read =	zvol_cdev_read,
+	.d_write =	zvol_cdev_write,
+	.d_strategy =	zvol_geom_bio_strategy,
+};
+
+extern uint_t zfs_geom_probe_vdev_key;
+
+struct g_class zfs_zvol_class = {
+	.name = "ZFS::ZVOL",
+	.version = G_VERSION,
+};
+
+DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
+
+static int zvol_geom_open(struct g_provider *pp, int flag, int count);
+static int zvol_geom_close(struct g_provider *pp, int flag, int count);
+static void zvol_geom_run(zvol_state_t *zv);
+static void zvol_geom_destroy(zvol_state_t *zv);
+static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
+static void zvol_geom_worker(void *arg);
+static void zvol_geom_bio_start(struct bio *bp);
+static int zvol_geom_bio_getattr(struct bio *bp);
+/* static d_strategy_t	zvol_geom_bio_strategy; (declared elsewhere) */
+
+/*
+ * GEOM mode implementation
+ */
+
+/*ARGSUSED*/
+static int
+zvol_geom_open(struct g_provider *pp, int flag, int count)
+{
+	zvol_state_t *zv;
+	int err = 0;
+	boolean_t drop_suspend = B_TRUE;
+
+	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
+		/*
+		 * if zfs_geom_probe_vdev_key is set, that means that zfs is
+		 * attempting to probe geom providers while looking for a
+		 * replacement for a missing VDEV.  In this case, the
+		 * spa_namespace_lock will not be held, but it is still illegal
+		 * to use a zvol as a vdev.  Deadlocks can result if another
+		 * thread has spa_namespace_lock
+		 */
+		return (SET_ERROR(EOPNOTSUPP));
+	}
+
+	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+	zv = pp->private;
+	if (zv == NULL) {
+		rw_exit(&zvol_state_lock);
+		return (SET_ERROR(ENXIO));
+	}
+
+	mutex_enter(&zv->zv_state_lock);
+
+	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+
+	/*
+	 * make sure zvol is not suspended during first open
+	 * (hold zv_suspend_lock) and respect proper lock acquisition
+	 * ordering - zv_suspend_lock before zv_state_lock
+	 */
+	if (zv->zv_open_count == 0) {
+		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+			mutex_exit(&zv->zv_state_lock);
+			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+			mutex_enter(&zv->zv_state_lock);
+			/* check to see if zv_suspend_lock is needed */
+			if (zv->zv_open_count != 0) {
+				rw_exit(&zv->zv_suspend_lock);
+				drop_suspend = B_FALSE;
+			}
+		}
+	} else {
+		drop_suspend = B_FALSE;
+	}
+	rw_exit(&zvol_state_lock);
+
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+	if (zv->zv_open_count == 0) {
+		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+		err = zvol_first_open(zv, !(flag & FWRITE));
+		if (err)
+			goto out_mutex;
+		pp->mediasize = zv->zv_volsize;
+		pp->stripeoffset = 0;
+		pp->stripesize = zv->zv_volblocksize;
+	}
+
+	/*
+	 * Check for a bad on-disk format version now since we
+	 * lied about owning the dataset readonly before.
+	 */
+	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
+	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
+		err = EROFS;
+		goto out_open_count;
+	}
+	if (zv->zv_flags & ZVOL_EXCL) {
+		err = EBUSY;
+		goto out_open_count;
+	}
+#ifdef FEXCL
+	if (flag & FEXCL) {
+		if (zv->zv_open_count != 0) {
+			err = EBUSY;
+			goto out_open_count;
+		}
+		zv->zv_flags |= ZVOL_EXCL;
+	}
+#endif
+
+	zv->zv_open_count += count;
+	mutex_exit(&zv->zv_state_lock);
+	if (drop_suspend)
+		rw_exit(&zv->zv_suspend_lock);
+	return (0);
+
+out_open_count:
+	if (zv->zv_open_count == 0)
+		zvol_last_close(zv);
+out_mutex:
+	mutex_exit(&zv->zv_state_lock);
+	if (drop_suspend)
+		rw_exit(&zv->zv_suspend_lock);
+	return (SET_ERROR(err));
+}
+
+/*ARGSUSED*/
+static int
+zvol_geom_close(struct g_provider *pp, int flag, int count)
+{
+	zvol_state_t *zv;
+	boolean_t drop_suspend = B_TRUE;
+
+	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+	zv = pp->private;
+	if (zv == NULL) {
+		rw_exit(&zvol_state_lock);
+		return (SET_ERROR(ENXIO));
+	}
+
+	mutex_enter(&zv->zv_state_lock);
+	if (zv->zv_flags & ZVOL_EXCL) {
+		ASSERT(zv->zv_open_count == 1);
+		zv->zv_flags &= ~ZVOL_EXCL;
+	}
+
+	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+
+	/*
+	 * If the open count is zero, this is a spurious close.
+	 * That indicates a bug in the kernel / DDI framework.
+	 */
+	ASSERT(zv->zv_open_count > 0);
+
+	/*
+	 * make sure zvol is not suspended during last close
+	 * (hold zv_suspend_lock) and respect proper lock acquisition
+	 * ordering - zv_suspend_lock before zv_state_lock
+	 */
+	if ((zv->zv_open_count - count) == 0) {
+		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+			mutex_exit(&zv->zv_state_lock);
+			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+			mutex_enter(&zv->zv_state_lock);
+			/* check to see if zv_suspend_lock is needed */
+			if (zv->zv_open_count != 1) {
+				rw_exit(&zv->zv_suspend_lock);
+				drop_suspend = B_FALSE;
+			}
+		}
+	} else {
+		drop_suspend = B_FALSE;
+	}
+	rw_exit(&zvol_state_lock);
+
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+	/*
+	 * You may get multiple opens, but only one close.
+	 */
+	zv->zv_open_count -= count;
+
+	if (zv->zv_open_count == 0) {
+		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+		zvol_last_close(zv);
+	}
+
+	mutex_exit(&zv->zv_state_lock);
+
+	if (drop_suspend)
+		rw_exit(&zv->zv_suspend_lock);
+	return (0);
+}
+
+static void
+zvol_geom_run(zvol_state_t *zv)
+{
+	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+	struct g_provider *pp = zsg->zsg_provider;
+
+	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+
+	g_error_provider(pp, 0);
+
+	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
+	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
+}
+
+static void
+zvol_geom_destroy(zvol_state_t *zv)
+{
+	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+	struct g_provider *pp = zsg->zsg_provider;
+
+	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+
+	g_topology_assert();
+
+	mutex_enter(&zv->zv_state_lock);
+	VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
+	mutex_exit(&zv->zv_state_lock);
+	zsg->zsg_provider = NULL;
+	pp->private = NULL;
+	g_wither_geom(pp->geom, ENXIO);
+}
+
+static int
+zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
+{
+	int count, error, flags;
+
+	g_topology_assert();
+
+	/*
+	 * To make it easier we expect either open or close, but not both
+	 * at the same time.
+	 */
+	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
+	    (acr <= 0 && acw <= 0 && ace <= 0),
+	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
+	    pp->name, acr, acw, ace));
+
+	if (pp->private == NULL) {
+		if (acr <= 0 && acw <= 0 && ace <= 0)
+			return (0);
+		return (pp->error);
+	}
+
+	/*
+	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
+	 * ace != 0, because GEOM already handles that and handles it a bit
+	 * differently. GEOM allows for multiple read/exclusive consumers and
+	 * ZFS allows only one exclusive consumer, no matter if it is reader or
+	 * writer. I like better the way GEOM works so I'll leave it for GEOM
+	 * to decide what to do.
+	 */
+
+	count = acr + acw + ace;
+	if (count == 0)
+		return (0);
+
+	flags = 0;
+	if (acr != 0 || ace != 0)
+		flags |= FREAD;
+	if (acw != 0)
+		flags |= FWRITE;
+
+	g_topology_unlock();
+	if (count > 0)
+		error = zvol_geom_open(pp, flags, count);
+	else
+		error = zvol_geom_close(pp, flags, -count);
+	g_topology_lock();
+	return (error);
+}
+
+static void
+zvol_geom_worker(void *arg)
+{
+	zvol_state_t *zv = arg;
+	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+	struct bio *bp;
+
+	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+
+	thread_lock(curthread);
+	sched_prio(curthread, PRIBIO);
+	thread_unlock(curthread);
+
+	for (;;) {
+		mtx_lock(&zsg->zsg_queue_mtx);
+		bp = bioq_takefirst(&zsg->zsg_queue);
+		if (bp == NULL) {
+			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
+				zsg->zsg_state = ZVOL_GEOM_RUNNING;
+				wakeup(&zsg->zsg_state);
+				mtx_unlock(&zsg->zsg_queue_mtx);
+				kthread_exit();
+			}
+			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
+			    PRIBIO | PDROP, "zvol:io", 0);
+			continue;
+		}
+		mtx_unlock(&zsg->zsg_queue_mtx);
+		zvol_geom_bio_strategy(bp);
+	}
+}
+
+static void
+zvol_geom_bio_start(struct bio *bp)
+{
+	zvol_state_t *zv = bp->bio_to->private;
+	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+	boolean_t first;
+
+	if (bp->bio_cmd == BIO_GETATTR) {
+		if (zvol_geom_bio_getattr(bp))
+			g_io_deliver(bp, EOPNOTSUPP);
+		return;
+	}
+
+	if (!THREAD_CAN_SLEEP()) {
+		mtx_lock(&zsg->zsg_queue_mtx);
+		first = (bioq_first(&zsg->zsg_queue) == NULL);
+		bioq_insert_tail(&zsg->zsg_queue, bp);
+		mtx_unlock(&zsg->zsg_queue_mtx);
+		if (first)
+			wakeup_one(&zsg->zsg_queue);
+		return;
+	}
+
+	zvol_geom_bio_strategy(bp);
+}
+
+static int
+zvol_geom_bio_getattr(struct bio *bp)
+{
+	zvol_state_t *zv;
+
+	zv = bp->bio_to->private;
+	ASSERT(zv != NULL);
+
+	spa_t *spa = dmu_objset_spa(zv->zv_objset);
+	uint64_t refd, avail, usedobjs, availobjs;
+
+	if (g_handleattr_int(bp, "GEOM::candelete", 1))
+		return (0);
+	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
+		dmu_objset_space(zv->zv_objset, &refd, &avail,
+		    &usedobjs, &availobjs);
+		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
+			return (0);
+	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
+		dmu_objset_space(zv->zv_objset, &refd, &avail,
+		    &usedobjs, &availobjs);
+		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
+			return (0);
+	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
+		avail = metaslab_class_get_space(spa_normal_class(spa));
+		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
+		if (g_handleattr_off_t(bp, "poolblocksavail",
+		    avail / DEV_BSIZE))
+			return (0);
+	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
+		refd = metaslab_class_get_alloc(spa_normal_class(spa));
+		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
+			return (0);
+	}
+	return (1);
+}
+
+static void
+zvol_geom_bio_strategy(struct bio *bp)
+{
+	zvol_state_t *zv;
+	uint64_t off, volsize;
+	size_t resid;
+	char *addr;
+	objset_t *os;
+	zfs_locked_range_t *lr;
+	int error = 0;
+	boolean_t doread = B_FALSE;
+	boolean_t is_dumpified;
+	boolean_t sync;
+
+	if (bp->bio_to)
+		zv = bp->bio_to->private;
+	else
+		zv = bp->bio_dev->si_drv2;
+
+	if (zv == NULL) {
+		error = SET_ERROR(ENXIO);
+		goto out;
+	}
+
+	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+
+	switch (bp->bio_cmd) {
+	case BIO_READ:
+		doread = B_TRUE;
+		break;
+	case BIO_WRITE:
+	case BIO_FLUSH:
+	case BIO_DELETE:
+		if (zv->zv_flags & ZVOL_RDONLY) {
+			error = SET_ERROR(EROFS);
+			goto resume;
+		}
+		zvol_ensure_zilog(zv);
+		if (bp->bio_cmd == BIO_FLUSH)
+			goto sync;
+		break;
+	default:
+		error = EOPNOTSUPP;
+		goto resume;
+	}
+
+	off = bp->bio_offset;
+	volsize = zv->zv_volsize;
+
+	os = zv->zv_objset;
+	ASSERT(os != NULL);
+
+	addr = bp->bio_data;
+	resid = bp->bio_length;
+
+	if (resid > 0 && (off < 0 || off >= volsize)) {
+		error = SET_ERROR(EIO);
+		goto resume;
+	}
+
+	is_dumpified = B_FALSE;
+	sync = !doread && !is_dumpified &&
+	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+
+	/*
+	 * There must be no buffer changes when doing a dmu_sync() because
+	 * we can't change the data whilst calculating the checksum.
+	 */
+	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
+	    doread ? RL_READER : RL_WRITER);
+
+	if (bp->bio_cmd == BIO_DELETE) {
+		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error != 0) {
+			dmu_tx_abort(tx);
+		} else {
+			zvol_log_truncate(zv, tx, off, resid, sync);
+			dmu_tx_commit(tx);
+			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+			    off, resid);
+			resid = 0;
+		}
+		goto unlock;
+	}
+	while (resid != 0 && off < volsize) {
+		size_t size = MIN(resid, zvol_maxphys);
+		if (doread) {
+			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
+			    DMU_READ_PREFETCH);
+		} else {
+			dmu_tx_t *tx = dmu_tx_create(os);
+			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
+			error = dmu_tx_assign(tx, TXG_WAIT);
+			if (error) {
+				dmu_tx_abort(tx);
+			} else {
+				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
+				zvol_log_write(zv, tx, off, size, sync);
+				dmu_tx_commit(tx);
+			}
+		}
+		if (error) {
+			/* convert checksum errors into IO errors */
+			if (error == ECKSUM)
+				error = SET_ERROR(EIO);
+			break;
+		}
+		off += size;
+		addr += size;
+		resid -= size;
+	}
+unlock:
+	zfs_rangelock_exit(lr);
+
+	bp->bio_completed = bp->bio_length - resid;
+	if (bp->bio_completed < bp->bio_length && off > volsize)
+		error = EINVAL;
+
+	switch (bp->bio_cmd) {
+	case BIO_FLUSH:
+		break;
+	case BIO_READ:
+		dataset_kstats_update_read_kstats(&zv->zv_kstat,
+		    bp->bio_completed);
+		break;
+	case BIO_WRITE:
+		dataset_kstats_update_write_kstats(&zv->zv_kstat,
+		    bp->bio_completed);
+		break;
+	case BIO_DELETE:
+		break;
+	default:
+		break;
+	}
+
+	if (sync) {
+sync:
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+	}
+resume:
+	rw_exit(&zv->zv_suspend_lock);
+out:
+	if (bp->bio_to)
+		g_io_deliver(bp, error);
+	else
+		biofinish(bp, NULL, error);
+}
+
+/*
+ * Character device mode implementation
+ */
+
+static int
+zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag)
+{
+	zvol_state_t *zv;
+	uint64_t volsize;
+	zfs_locked_range_t *lr;
+	int error = 0;
+
+	zv = dev->si_drv2;
+
+	volsize = zv->zv_volsize;
+	/*
+	 * uio_loffset == volsize isn't an error as
+	 * its required for EOF processing.
+	 */
+	if (uio->uio_resid > 0 &&
+	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
+		return (SET_ERROR(EIO));
+
+	lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
+	    uio->uio_resid, RL_READER);
+	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
+		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
+
+		/* don't read past the end */
+		if (bytes > volsize - uio->uio_loffset)
+			bytes = volsize - uio->uio_loffset;
+
+		error =  dmu_read_uio_dnode(zv->zv_dn, uio, bytes);
+		if (error) {
+			/* convert checksum errors into IO errors */
+			if (error == ECKSUM)
+				error = SET_ERROR(EIO);
+			break;
+		}
+	}
+	zfs_rangelock_exit(lr);
+
+	return (error);
+}
+
+static int
+zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag)
+{
+	zvol_state_t *zv;
+	uint64_t volsize;
+	zfs_locked_range_t *lr;
+	int error = 0;
+	boolean_t sync;
+
+	zv = dev->si_drv2;
+
+	volsize = zv->zv_volsize;
+
+	if (uio->uio_resid > 0 &&
+	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
+		return (SET_ERROR(EIO));
+
+	sync = (ioflag & IO_SYNC) ||
+	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+
+	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+	zvol_ensure_zilog(zv);
+
+	lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
+	    uio->uio_resid, RL_WRITER);
+	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
+		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
+		uint64_t off = uio->uio_loffset;
+		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+
+		if (bytes > volsize - off)	/* don't write past the end */
+			bytes = volsize - off;
+
+		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			break;
+		}
+		error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
+		if (error == 0)
+			zvol_log_write(zv, tx, off, bytes, sync);
+		dmu_tx_commit(tx);
+
+		if (error)
+			break;
+	}
+	zfs_rangelock_exit(lr);
+	if (sync)
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+	rw_exit(&zv->zv_suspend_lock);
+	return (error);
+}
+
+static int
+zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+	zvol_state_t *zv;
+	struct zvol_state_dev *zsd;
+	int err = 0;
+	boolean_t drop_suspend = B_TRUE;
+
+	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+	zv = dev->si_drv2;
+	if (zv == NULL) {
+		rw_exit(&zvol_state_lock);
+		return (SET_ERROR(ENXIO));
+	}
+
+	mutex_enter(&zv->zv_state_lock);
+
+	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);
+
+	/*
+	 * make sure zvol is not suspended during first open
+	 * (hold zv_suspend_lock) and respect proper lock acquisition
+	 * ordering - zv_suspend_lock before zv_state_lock
+	 */
+	if (zv->zv_open_count == 0) {
+		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+			mutex_exit(&zv->zv_state_lock);
+			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+			mutex_enter(&zv->zv_state_lock);
+			/* check to see if zv_suspend_lock is needed */
+			if (zv->zv_open_count != 0) {
+				rw_exit(&zv->zv_suspend_lock);
+				drop_suspend = B_FALSE;
+			}
+		}
+	} else {
+		drop_suspend = B_FALSE;
+	}
+	rw_exit(&zvol_state_lock);
+
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+	if (zv->zv_open_count == 0) {
+		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+		err = zvol_first_open(zv, !(flags & FWRITE));
+		if (err)
+			goto out_locked;
+	}
+
+	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+		err = EROFS;
+		goto out_opened;
+	}
+	if (zv->zv_flags & ZVOL_EXCL) {
+		err = EBUSY;
+		goto out_opened;
+	}
+#ifdef FEXCL
+	if (flags & FEXCL) {
+		if (zv->zv_open_count != 0) {
+			err = EBUSY;
+			goto out_opened;
+		}
+		zv->zv_flags |= ZVOL_EXCL;
+	}
+#endif
+
+	zv->zv_open_count++;
+	if (flags & (FSYNC | FDSYNC)) {
+		zsd = &zv->zv_zso->zso_dev;
+		zsd->zsd_sync_cnt++;
+		if (zsd->zsd_sync_cnt == 1)
+			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
+	}
+
+	mutex_exit(&zv->zv_state_lock);
+	if (drop_suspend)
+		rw_exit(&zv->zv_suspend_lock);
+	return (0);
+
+out_opened:
+	if (zv->zv_open_count == 0)
+		zvol_last_close(zv);
+out_locked:
+	mutex_exit(&zv->zv_state_lock);
+	if (drop_suspend)
+		rw_exit(&zv->zv_suspend_lock);
+	return (SET_ERROR(err));
+}
+
+static int
+zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+	zvol_state_t *zv;
+	struct zvol_state_dev *zsd;
+	boolean_t drop_suspend = B_TRUE;
+
+	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+	zv = dev->si_drv2;
+	if (zv == NULL) {
+		rw_exit(&zvol_state_lock);
+		return (SET_ERROR(ENXIO));
+	}
+
+	mutex_enter(&zv->zv_state_lock);
+	if (zv->zv_flags & ZVOL_EXCL) {
+		ASSERT(zv->zv_open_count == 1);
+		zv->zv_flags &= ~ZVOL_EXCL;
+	}
+
+	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);
+
+	/*
+	 * If the open count is zero, this is a spurious close.
+	 * That indicates a bug in the kernel / DDI framework.
+	 */
+	ASSERT(zv->zv_open_count > 0);
+	/*
+	 * make sure zvol is not suspended during last close
+	 * (hold zv_suspend_lock) and respect proper lock acquisition
+	 * ordering - zv_suspend_lock before zv_state_lock
+	 */
+	if (zv->zv_open_count == 1) {
+		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+			mutex_exit(&zv->zv_state_lock);
+			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+			mutex_enter(&zv->zv_state_lock);
+			/* check to see if zv_suspend_lock is needed */
+			if (zv->zv_open_count != 1) {
+				rw_exit(&zv->zv_suspend_lock);
+				drop_suspend = B_FALSE;
+			}
+		}
+	} else {
+		drop_suspend = B_FALSE;
+	}
+	rw_exit(&zvol_state_lock);
+
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+	/*
+	 * You may get multiple opens, but only one close.
+	 */
+	zv->zv_open_count--;
+	if (flags & (FSYNC | FDSYNC)) {
+		zsd = &zv->zv_zso->zso_dev;
+		zsd->zsd_sync_cnt--;
+	}
+
+	if (zv->zv_open_count == 0) {
+		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+		zvol_last_close(zv);
+	}
+
+	mutex_exit(&zv->zv_state_lock);
+
+	if (drop_suspend)
+		rw_exit(&zv->zv_suspend_lock);
+	return (0);
+}
+
+static int
+zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
+    int fflag, struct thread *td)
+{
+	zvol_state_t *zv;
+	zfs_locked_range_t *lr;
+	off_t offset, length;
+	int i, error;
+	boolean_t sync;
+
+	zv = dev->si_drv2;
+
+	error = 0;
+	KASSERT(zv->zv_open_count > 0,
+	    ("Device with zero access count in %s", __func__));
+
+	i = IOCPARM_LEN(cmd);
+	switch (cmd) {
+	case DIOCGSECTORSIZE:
+		*(uint32_t *)data = DEV_BSIZE;
+		break;
+	case DIOCGMEDIASIZE:
+		*(off_t *)data = zv->zv_volsize;
+		break;
+	case DIOCGFLUSH:
+		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+		if (zv->zv_zilog != NULL)
+			zil_commit(zv->zv_zilog, ZVOL_OBJ);
+		rw_exit(&zv->zv_suspend_lock);
+		break;
+	case DIOCGDELETE:
+		if (!zvol_unmap_enabled)
+			break;
+
+		offset = ((off_t *)data)[0];
+		length = ((off_t *)data)[1];
+		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
+		    offset < 0 || offset >= zv->zv_volsize ||
+		    length <= 0) {
+			printf("%s: offset=%jd length=%jd\n", __func__, offset,
+			    length);
+			error = EINVAL;
+			break;
+		}
+		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+		zvol_ensure_zilog(zv);
+		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
+		    RL_WRITER);
+		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error != 0) {
+			sync = FALSE;
+			dmu_tx_abort(tx);
+		} else {
+			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+			zvol_log_truncate(zv, tx, offset, length, sync);
+			dmu_tx_commit(tx);
+			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+			    offset, length);
+		}
+		zfs_rangelock_exit(lr);
+		if (sync)
+			zil_commit(zv->zv_zilog, ZVOL_OBJ);
+		rw_exit(&zv->zv_suspend_lock);
+		break;
+	case DIOCGSTRIPESIZE:
+		*(off_t *)data = zv->zv_volblocksize;
+		break;
+	case DIOCGSTRIPEOFFSET:
+		*(off_t *)data = 0;
+		break;
+	case DIOCGATTR: {
+		spa_t *spa = dmu_objset_spa(zv->zv_objset);
+		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
+		uint64_t refd, avail, usedobjs, availobjs;
+
+		if (strcmp(arg->name, "GEOM::candelete") == 0)
+			arg->value.i = 1;
+		else if (strcmp(arg->name, "blocksavail") == 0) {
+			dmu_objset_space(zv->zv_objset, &refd, &avail,
+			    &usedobjs, &availobjs);
+			arg->value.off = avail / DEV_BSIZE;
+		} else if (strcmp(arg->name, "blocksused") == 0) {
+			dmu_objset_space(zv->zv_objset, &refd, &avail,
+			    &usedobjs, &availobjs);
+			arg->value.off = refd / DEV_BSIZE;
+		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
+			avail = metaslab_class_get_space(spa_normal_class(spa));
+			avail -= metaslab_class_get_alloc(
+			    spa_normal_class(spa));
+			arg->value.off = avail / DEV_BSIZE;
+		} else if (strcmp(arg->name, "poolblocksused") == 0) {
+			refd = metaslab_class_get_alloc(spa_normal_class(spa));
+			arg->value.off = refd / DEV_BSIZE;
+		} else
+			error = ENOIOCTL;
+		break;
+	}
+	case FIOSEEKHOLE:
+	case FIOSEEKDATA: {
+		off_t *off = (off_t *)data;
+		uint64_t noff;
+		boolean_t hole;
+
+		hole = (cmd == FIOSEEKHOLE);
+		noff = *off;
+		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
+		*off = noff;
+		break;
+	}
+	default:
+		error = ENOIOCTL;
+	}
+
+	return (error);
+}
+
+/*
+ * Misc. helpers
+ */
+
+static void
+zvol_ensure_zilog(zvol_state_t *zv)
+{
+	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+
+	/*
+	 * Open a ZIL if this is the first time we have written to this
+	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
+	 * than zv_state_lock so that we don't need to acquire an
+	 * additional lock in this path.
+	 */
+	if (zv->zv_zilog == NULL) {
+		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
+			rw_exit(&zv->zv_suspend_lock);
+			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+		}
+		if (zv->zv_zilog == NULL) {
+			zv->zv_zilog = zil_open(zv->zv_objset,
+			    zvol_get_data);
+			zv->zv_flags |= ZVOL_WRITTEN_TO;
+		}
+		rw_downgrade(&zv->zv_suspend_lock);
+	}
+}
+
+static boolean_t
+zvol_is_zvol_impl(const char *device)
+{
+	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
+}
+
+static void
+zvol_rename_minor(zvol_state_t *zv, const char *newname)
+{
+	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+	/* move to new hashtable entry  */
+	zv->zv_hash = zvol_name_hash(zv->zv_name);
+	hlist_del(&zv->zv_hlink);
+	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
+
+	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+		struct g_provider *pp = zsg->zsg_provider;
+		struct g_geom *gp;
+
+		g_topology_lock();
+		gp = pp->geom;
+		ASSERT(gp != NULL);
+
+		zsg->zsg_provider = NULL;
+		g_wither_provider(pp, ENXIO);
+
+		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
+		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+		pp->sectorsize = DEV_BSIZE;
+		pp->mediasize = zv->zv_volsize;
+		pp->private = zv;
+		zsg->zsg_provider = pp;
+		g_error_provider(pp, 0);
+		g_topology_unlock();
+	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
+		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+		struct cdev *dev;
+		struct make_dev_args args;
+
+		dev = zsd->zsd_cdev;
+		if (dev != NULL) {
+			destroy_dev(dev);
+			dev = zsd->zsd_cdev = NULL;
+			if (zv->zv_open_count > 0) {
+				zv->zv_flags &= ~ZVOL_EXCL;
+				zv->zv_open_count = 0;
+				/* XXX  need suspend lock but lock order */
+				zvol_last_close(zv);
+			}
+		}
+
+		make_dev_args_init(&args);
+		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+		args.mda_devsw = &zvol_cdevsw;
+		args.mda_cr = NULL;
+		args.mda_uid = UID_ROOT;
+		args.mda_gid = GID_OPERATOR;
+		args.mda_mode = 0640;
+		args.mda_si_drv2 = zv;
+		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
+		    == 0) {
+			dev->si_iosize_max = MAXPHYS;
+			zsd->zsd_cdev = dev;
+		}
+	}
+	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
+}
+
+/*
+ * Remove minor node for the specified volume.
+ */
+static void
+zvol_free(zvol_state_t *zv)
+{
+	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+	ASSERT(zv->zv_open_count == 0);
+
+	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+
+	rw_destroy(&zv->zv_suspend_lock);
+	zfs_rangelock_fini(&zv->zv_rangelock);
+
+	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+
+		g_topology_lock();
+		zvol_geom_destroy(zv);
+		g_topology_unlock();
+		mtx_destroy(&zsg->zsg_queue_mtx);
+	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
+		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+		struct cdev *dev = zsd->zsd_cdev;
+
+		if (dev != NULL)
+			destroy_dev(dev);
+	}
+
+	mutex_destroy(&zv->zv_state_lock);
+	dataset_kstats_destroy(&zv->zv_kstat);
+	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+	kmem_free(zv, sizeof (zvol_state_t));
+	zvol_minors--;
+}
+
+/*
+ * Create a minor node (plus a whole lot more) for the specified volume.
+ */
+static int
+zvol_create_minor_impl(const char *name)
+{
+	zvol_state_t *zv;
+	objset_t *os;
+	dmu_object_info_t *doi;
+	uint64_t volsize;
+	uint64_t volmode, hash;
+	int error;
+
+	ZFS_LOG(1, "Creating ZVOL %s...", name);
+
+	hash = zvol_name_hash(name);
+	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
+		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+		mutex_exit(&zv->zv_state_lock);
+		return (SET_ERROR(EEXIST));
+	}
+
+	DROP_GIANT();
+	/* lie and say we're read-only */
+	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
+	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+
+	if (error)
+		goto out_doi;
+
+	error = dmu_object_info(os, ZVOL_OBJ, doi);
+	if (error)
+		goto out_dmu_objset_disown;
+
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+	if (error)
+		goto out_dmu_objset_disown;
+
+	error = dsl_prop_get_integer(name,
+	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
+	if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT)
+		volmode = zvol_volmode;
+	/*
+	 * zvol_alloc equivalent ...
+	 */
+	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
+	zv->zv_hash = hash;
+	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
+	zv->zv_zso->zso_volmode = volmode;
+	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+		struct g_provider *pp;
+		struct g_geom *gp;
+
+		zsg->zsg_state = ZVOL_GEOM_UNINIT;
+		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);
+
+		g_topology_lock();
+		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+		gp->start = zvol_geom_bio_start;
+		gp->access = zvol_geom_access;
+		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+		/* TODO: NULL check? */
+		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+		pp->sectorsize = DEV_BSIZE;
+		pp->mediasize = 0;
+		pp->private = zv;
+
+		zsg->zsg_provider = pp;
+		bioq_init(&zsg->zsg_queue);
+	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
+		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+		struct cdev *dev;
+		struct make_dev_args args;
+
+		make_dev_args_init(&args);
+		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+		args.mda_devsw = &zvol_cdevsw;
+		args.mda_cr = NULL;
+		args.mda_uid = UID_ROOT;
+		args.mda_gid = GID_OPERATOR;
+		args.mda_mode = 0640;
+		args.mda_si_drv2 = zv;
+		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
+		if (error != 0) {
+			mutex_destroy(&zv->zv_state_lock);
+			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+			kmem_free(zv, sizeof (*zv));
+			dmu_objset_disown(os, B_TRUE, FTAG);
+			goto out_giant;
+		}
+		dev->si_iosize_max = MAXPHYS;
+		zsd->zsd_cdev = dev;
+	}
+	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
+	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
+
+	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
+		zv->zv_flags |= ZVOL_RDONLY;
+
+	zv->zv_volblocksize = doi->doi_data_block_size;
+	zv->zv_volsize = volsize;
+	zv->zv_objset = os;
+
+	if (spa_writeable(dmu_objset_spa(os))) {
+		if (zil_replay_disable)
+			zil_destroy(dmu_objset_zil(os), B_FALSE);
+		else
+			zil_replay(os, zv, zvol_replay_vector);
+	}
+	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
+
+	/* XXX do prefetch */
+
+	zv->zv_objset = NULL;
+out_dmu_objset_disown:
+	dmu_objset_disown(os, B_TRUE, FTAG);
+
+	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+		if (error == 0)
+			zvol_geom_run(zv);
+		g_topology_unlock();
+	}
+out_doi:
+	kmem_free(doi, sizeof (dmu_object_info_t));
+	if (error == 0) {
+		rw_enter(&zvol_state_lock, RW_WRITER);
+		zvol_insert(zv);
+		zvol_minors++;
+		rw_exit(&zvol_state_lock);
+	}
+	ZFS_LOG(1, "ZVOL %s created.", name);
+out_giant:
+	PICKUP_GIANT();
+	return (error);
+}
+
+static void
+zvol_clear_private(zvol_state_t *zv)
+{
+	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+		struct g_provider *pp = zsg->zsg_provider;
+
+		if (pp == NULL) /* XXX when? */
+			return;
+
+		mtx_lock(&zsg->zsg_queue_mtx);
+		zsg->zsg_state = ZVOL_GEOM_STOPPED;
+		pp->private = NULL;
+		wakeup_one(&zsg->zsg_queue);
+		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
+			msleep(&zsg->zsg_state,
+			    &zsg->zsg_queue_mtx,
+			    0, "zvol:w", 0);
+		mtx_unlock(&zsg->zsg_queue_mtx);
+		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+	}
+}
+
+static int
+zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
+{
+	zv->zv_volsize = volsize;
+	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+		struct g_provider *pp = zsg->zsg_provider;
+
+		if (pp == NULL) /* XXX when? */
+			return (0);
+
+		g_topology_lock();
+
+		/*
+		 * Do not invoke resize event when initial size was zero.
+		 * ZVOL initializes the size on first open, this is not
+		 * real resizing.
+		 */
+		if (pp->mediasize == 0)
+			pp->mediasize = zv->zv_volsize;
+		else
+			g_resize_provider(pp, zv->zv_volsize);
+
+		g_topology_unlock();
+	}
+	return (0);
+}
+
+static void
+zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
+{
+	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
+}
+
+static void
+zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
+{
+	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
+}
+
+const static zvol_platform_ops_t zvol_freebsd_ops = {
+	.zv_free = zvol_free,
+	.zv_rename_minor = zvol_rename_minor,
+	.zv_create_minor = zvol_create_minor_impl,
+	.zv_update_volsize = zvol_update_volsize,
+	.zv_clear_private = zvol_clear_private,
+	.zv_is_zvol = zvol_is_zvol_impl,
+	.zv_set_disk_ro = zvol_set_disk_ro_impl,
+	.zv_set_capacity = zvol_set_capacity_impl,
+};
+
+/*
+ * Public interfaces
+ */
+
+int
+zvol_busy(void)
+{
+	return (zvol_minors != 0);
+}
+
+int
+zvol_init(void)
+{
+	zvol_init_impl();
+	zvol_register_ops(&zvol_freebsd_ops);
+	return (0);
+}
+
+void
+zvol_fini(void)
+{
+	zvol_fini_impl();
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/Makefile.in b/sys/contrib/openzfs/module/os/linux/spl/Makefile.in
new file mode 100644
index 000000000000..b2325f91b4a7
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/Makefile.in
@@ -0,0 +1,17 @@
+$(MODULE)-objs += ../os/linux/spl/spl-atomic.o
+$(MODULE)-objs += ../os/linux/spl/spl-condvar.o
+$(MODULE)-objs += ../os/linux/spl/spl-cred.o
+$(MODULE)-objs += ../os/linux/spl/spl-err.o
+$(MODULE)-objs += ../os/linux/spl/spl-generic.o
+$(MODULE)-objs += ../os/linux/spl/spl-kmem.o
+$(MODULE)-objs += ../os/linux/spl/spl-kmem-cache.o
+$(MODULE)-objs += ../os/linux/spl/spl-kstat.o
+$(MODULE)-objs += ../os/linux/spl/spl-proc.o
+$(MODULE)-objs += ../os/linux/spl/spl-procfs-list.o
+$(MODULE)-objs += ../os/linux/spl/spl-taskq.o
+$(MODULE)-objs += ../os/linux/spl/spl-thread.o
+$(MODULE)-objs += ../os/linux/spl/spl-trace.o
+$(MODULE)-objs += ../os/linux/spl/spl-tsd.o
+$(MODULE)-objs += ../os/linux/spl/spl-vmem.o
+$(MODULE)-objs += ../os/linux/spl/spl-xdr.o
+$(MODULE)-objs += ../os/linux/spl/spl-zlib.o
diff --git a/sys/contrib/openzfs/module/os/linux/spl/README.md b/sys/contrib/openzfs/module/os/linux/spl/README.md
new file mode 100644
index 000000000000..51166425f063
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/README.md
@@ -0,0 +1,16 @@
+The Solaris Porting Layer, SPL, is a Linux kernel module which provides a
+compatibility layer used by the [ZFS on Linux](https://zfsonlinux.org) project.
+
+# Installation
+
+The latest version of the SPL is maintained as part of this repository.
+Only when building ZFS version 0.7.x or earlier must an external SPL release
+be used.  These releases can be found at:
+
+  * Version 0.7.x: https://github.com/zfsonlinux/spl/tree/spl-0.7-release  
+  * Version 0.6.5.x: https://github.com/zfsonlinux/spl/tree/spl-0.6.5-release  
+
+# Release
+
+The SPL is released under a GPLv2 license.  
+For more details see the NOTICE and THIRDPARTYLICENSE files; `UCRL-CODE-235197`
diff --git a/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2 b/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2
new file mode 100644
index 000000000000..d159169d1050
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2
@@ -0,0 +1,339 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                            NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip b/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip
new file mode 100644
index 000000000000..78535a8ee133
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip
@@ -0,0 +1 @@
+COMPATIBILITY LAYER FOR OPENZFS ON LINUX
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c
new file mode 100644
index 000000000000..47ed1886e157
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c
@@ -0,0 +1,36 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Solaris Porting Layer (SPL) Atomic Implementation.
+ */
+
+#include <sys/atomic.h>
+
+#ifdef ATOMIC_SPINLOCK
+/* Global atomic lock declarations */
+DEFINE_SPINLOCK(atomic32_lock);
+DEFINE_SPINLOCK(atomic64_lock);
+
+EXPORT_SYMBOL(atomic32_lock);
+EXPORT_SYMBOL(atomic64_lock);
+#endif /* ATOMIC_SPINLOCK */
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
new file mode 100644
index 000000000000..9d045e3e8a66
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
@@ -0,0 +1,466 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Solaris Porting Layer (SPL) Credential Implementation.
+ */
+
+#include <sys/condvar.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <linux/hrtimer.h>
+#include <linux/compiler_compat.h>
+#include <linux/mod_compat.h>
+
+#include <linux/sched.h>
+
+#ifdef HAVE_SCHED_SIGNAL_HEADER
+#include <linux/sched/signal.h>
+#endif
+
+#define	MAX_HRTIMEOUT_SLACK_US	1000
+unsigned int spl_schedule_hrtimeout_slack_us = 0;
+
+static int
+param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp)
+{
+	unsigned long val;
+	int error;
+
+	error = kstrtoul(buf, 0, &val);
+	if (error)
+		return (error);
+
+	if (val > MAX_HRTIMEOUT_SLACK_US)
+		return (-EINVAL);
+
+	error = param_set_uint(buf, kp);
+	if (error < 0)
+		return (error);
+
+	return (0);
+}
+
+module_param_call(spl_schedule_hrtimeout_slack_us, param_set_hrtimeout_slack,
+	param_get_uint, &spl_schedule_hrtimeout_slack_us, 0644);
+MODULE_PARM_DESC(spl_schedule_hrtimeout_slack_us,
+	"schedule_hrtimeout_range() delta/slack value in us, default(0)");
+
+void
+__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg)
+{
+	ASSERT(cvp);
+	ASSERT(name == NULL);
+	ASSERT(type == CV_DEFAULT);
+	ASSERT(arg == NULL);
+
+	cvp->cv_magic = CV_MAGIC;
+	init_waitqueue_head(&cvp->cv_event);
+	init_waitqueue_head(&cvp->cv_destroy);
+	atomic_set(&cvp->cv_waiters, 0);
+	atomic_set(&cvp->cv_refs, 1);
+	cvp->cv_mutex = NULL;
+}
+EXPORT_SYMBOL(__cv_init);
+
+static int
+cv_destroy_wakeup(kcondvar_t *cvp)
+{
+	if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) {
+		ASSERT(cvp->cv_mutex == NULL);
+		ASSERT(!waitqueue_active(&cvp->cv_event));
+		return (1);
+	}
+
+	return (0);
+}
+
+void
+__cv_destroy(kcondvar_t *cvp)
+{
+	ASSERT(cvp);
+	ASSERT(cvp->cv_magic == CV_MAGIC);
+
+	cvp->cv_magic = CV_DESTROY;
+	atomic_dec(&cvp->cv_refs);
+
+	/* Block until all waiters are woken and references dropped. */
+	while (cv_destroy_wakeup(cvp) == 0)
+		wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1);
+
+	ASSERT3P(cvp->cv_mutex, ==, NULL);
+	ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0);
+	ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0);
+	ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0);
+}
+EXPORT_SYMBOL(__cv_destroy);
+
+static void
+cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state, int io)
+{
+	DEFINE_WAIT(wait);
+	kmutex_t *m;
+
+	ASSERT(cvp);
+	ASSERT(mp);
+	ASSERT(cvp->cv_magic == CV_MAGIC);
+	ASSERT(mutex_owned(mp));
+	atomic_inc(&cvp->cv_refs);
+
+	m = READ_ONCE(cvp->cv_mutex);
+	if (!m)
+		m = xchg(&cvp->cv_mutex, mp);
+	/* Ensure the same mutex is used by all callers */
+	ASSERT(m == NULL || m == mp);
+
+	prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+	atomic_inc(&cvp->cv_waiters);
+
+	/*
+	 * Mutex should be dropped after prepare_to_wait() this
+	 * ensures we're linked in to the waiters list and avoids the
+	 * race where 'cvp->cv_waiters > 0' but the list is empty.
+	 */
+	mutex_exit(mp);
+	if (io)
+		io_schedule();
+	else
+		schedule();
+
+	/* No more waiters a different mutex could be used */
+	if (atomic_dec_and_test(&cvp->cv_waiters)) {
+		/*
+		 * This is set without any lock, so it's racy. But this is
+		 * just for debug anyway, so make it best-effort
+		 */
+		cvp->cv_mutex = NULL;
+		wake_up(&cvp->cv_destroy);
+	}
+
+	finish_wait(&cvp->cv_event, &wait);
+	atomic_dec(&cvp->cv_refs);
+
+	/*
+	 * Hold mutex after we release the cvp, otherwise we could dead lock
+	 * with a thread holding the mutex and call cv_destroy.
+	 */
+	mutex_enter(mp);
+}
+
+void
+__cv_wait(kcondvar_t *cvp, kmutex_t *mp)
+{
+	cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL(__cv_wait);
+
+void
+__cv_wait_io(kcondvar_t *cvp, kmutex_t *mp)
+{
+	cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 1);
+}
+EXPORT_SYMBOL(__cv_wait_io);
+
+int
+__cv_wait_io_sig(kcondvar_t *cvp, kmutex_t *mp)
+{
+	cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 1);
+
+	return (signal_pending(current) ? 0 : 1);
+}
+EXPORT_SYMBOL(__cv_wait_io_sig);
+
+int
+__cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp)
+{
+	cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0);
+
+	return (signal_pending(current) ? 0 : 1);
+}
+EXPORT_SYMBOL(__cv_wait_sig);
+
+#if defined(HAVE_IO_SCHEDULE_TIMEOUT)
+#define	spl_io_schedule_timeout(t)	io_schedule_timeout(t)
+#else
+
+struct spl_task_timer {
+	struct timer_list timer;
+	struct task_struct *task;
+};
+
+static void
+__cv_wakeup(spl_timer_list_t t)
+{
+	struct timer_list *tmr = (struct timer_list *)t;
+	struct spl_task_timer *task_timer = from_timer(task_timer, tmr, timer);
+
+	wake_up_process(task_timer->task);
+}
+
+static long
+spl_io_schedule_timeout(long time_left)
+{
+	long expire_time = jiffies + time_left;
+	struct spl_task_timer task_timer;
+	struct timer_list *timer = &task_timer.timer;
+
+	task_timer.task = current;
+
+	timer_setup(timer, __cv_wakeup, 0);
+
+	timer->expires = expire_time;
+	add_timer(timer);
+
+	io_schedule();
+
+	del_timer_sync(timer);
+
+	time_left = expire_time - jiffies;
+
+	return (time_left < 0 ? 0 : time_left);
+}
+#endif
+
+/*
+ * 'expire_time' argument is an absolute wall clock time in jiffies.
+ * Return value is time left (expire_time - now) or -1 if timeout occurred.
+ */
+static clock_t
+__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time,
+    int state, int io)
+{
+	DEFINE_WAIT(wait);
+	kmutex_t *m;
+	clock_t time_left;
+
+	ASSERT(cvp);
+	ASSERT(mp);
+	ASSERT(cvp->cv_magic == CV_MAGIC);
+	ASSERT(mutex_owned(mp));
+
+	/* XXX - Does not handle jiffie wrap properly */
+	time_left = expire_time - jiffies;
+	if (time_left <= 0)
+		return (-1);
+
+	atomic_inc(&cvp->cv_refs);
+	m = READ_ONCE(cvp->cv_mutex);
+	if (!m)
+		m = xchg(&cvp->cv_mutex, mp);
+	/* Ensure the same mutex is used by all callers */
+	ASSERT(m == NULL || m == mp);
+
+	prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+	atomic_inc(&cvp->cv_waiters);
+
+	/*
+	 * Mutex should be dropped after prepare_to_wait() this
+	 * ensures we're linked in to the waiters list and avoids the
+	 * race where 'cvp->cv_waiters > 0' but the list is empty.
+	 */
+	mutex_exit(mp);
+	if (io)
+		time_left = spl_io_schedule_timeout(time_left);
+	else
+		time_left = schedule_timeout(time_left);
+
+	/* No more waiters a different mutex could be used */
+	if (atomic_dec_and_test(&cvp->cv_waiters)) {
+		/*
+		 * This is set without any lock, so it's racy. But this is
+		 * just for debug anyway, so make it best-effort
+		 */
+		cvp->cv_mutex = NULL;
+		wake_up(&cvp->cv_destroy);
+	}
+
+	finish_wait(&cvp->cv_event, &wait);
+	atomic_dec(&cvp->cv_refs);
+
+	/*
+	 * Hold mutex after we release the cvp, otherwise we could dead lock
+	 * with a thread holding the mutex and call cv_destroy.
+	 */
+	mutex_enter(mp);
+	return (time_left > 0 ? 1 : -1);
+}
+
+int
+__cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+	return (__cv_timedwait_common(cvp, mp, exp_time,
+	    TASK_UNINTERRUPTIBLE, 0));
+}
+EXPORT_SYMBOL(__cv_timedwait);
+
+int
+__cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+	return (__cv_timedwait_common(cvp, mp, exp_time,
+	    TASK_UNINTERRUPTIBLE, 1));
+}
+EXPORT_SYMBOL(__cv_timedwait_io);
+
+int
+__cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+	int rc;
+
+	rc = __cv_timedwait_common(cvp, mp, exp_time, TASK_INTERRUPTIBLE, 0);
+	return (signal_pending(current) ? 0 : rc);
+}
+EXPORT_SYMBOL(__cv_timedwait_sig);
+
+/*
+ * 'expire_time' argument is an absolute clock time in nanoseconds.
+ * Return value is time left (expire_time - now) or -1 if timeout occurred.
+ */
+static clock_t
+__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time,
+    hrtime_t res, int state)
+{
+	DEFINE_WAIT(wait);
+	kmutex_t *m;
+	hrtime_t time_left;
+	ktime_t ktime_left;
+	u64 slack = 0;
+	int rc;
+
+	ASSERT(cvp);
+	ASSERT(mp);
+	ASSERT(cvp->cv_magic == CV_MAGIC);
+	ASSERT(mutex_owned(mp));
+
+	time_left = expire_time - gethrtime();
+	if (time_left <= 0)
+		return (-1);
+
+	atomic_inc(&cvp->cv_refs);
+	m = READ_ONCE(cvp->cv_mutex);
+	if (!m)
+		m = xchg(&cvp->cv_mutex, mp);
+	/* Ensure the same mutex is used by all callers */
+	ASSERT(m == NULL || m == mp);
+
+	prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+	atomic_inc(&cvp->cv_waiters);
+
+	/*
+	 * Mutex should be dropped after prepare_to_wait() this
+	 * ensures we're linked in to the waiters list and avoids the
+	 * race where 'cvp->cv_waiters > 0' but the list is empty.
+	 */
+	mutex_exit(mp);
+
+	ktime_left = ktime_set(0, time_left);
+	slack = MIN(MAX(res, spl_schedule_hrtimeout_slack_us * NSEC_PER_USEC),
+	    MAX_HRTIMEOUT_SLACK_US * NSEC_PER_USEC);
+	rc = schedule_hrtimeout_range(&ktime_left, slack, HRTIMER_MODE_REL);
+
+	/* No more waiters a different mutex could be used */
+	if (atomic_dec_and_test(&cvp->cv_waiters)) {
+		/*
+		 * This is set without any lock, so it's racy. But this is
+		 * just for debug anyway, so make it best-effort
+		 */
+		cvp->cv_mutex = NULL;
+		wake_up(&cvp->cv_destroy);
+	}
+
+	finish_wait(&cvp->cv_event, &wait);
+	atomic_dec(&cvp->cv_refs);
+
+	mutex_enter(mp);
+	return (rc == -EINTR ? 1 : -1);
+}
+
+/*
+ * Compatibility wrapper for the cv_timedwait_hires() Illumos interface.
+ */
+static int
+cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+    hrtime_t res, int flag, int state)
+{
+	if (!(flag & CALLOUT_FLAG_ABSOLUTE))
+		tim += gethrtime();
+
+	return (__cv_timedwait_hires(cvp, mp, tim, res, state));
+}
+
+int
+cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res,
+    int flag)
+{
+	return (cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+	    TASK_UNINTERRUPTIBLE));
+}
+EXPORT_SYMBOL(cv_timedwait_hires);
+
+int
+cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+    hrtime_t res, int flag)
+{
+	int rc;
+
+	rc = cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+	    TASK_INTERRUPTIBLE);
+	return (signal_pending(current) ? 0 : rc);
+}
+EXPORT_SYMBOL(cv_timedwait_sig_hires);
+
+void
+__cv_signal(kcondvar_t *cvp)
+{
+	ASSERT(cvp);
+	ASSERT(cvp->cv_magic == CV_MAGIC);
+	atomic_inc(&cvp->cv_refs);
+
+	/*
+	 * All waiters are added with WQ_FLAG_EXCLUSIVE so only one
+	 * waiter will be set runnable with each call to wake_up().
+	 * Additionally wake_up() holds a spin_lock associated with
+	 * the wait queue to ensure we don't race waking up processes.
+	 */
+	if (atomic_read(&cvp->cv_waiters) > 0)
+		wake_up(&cvp->cv_event);
+
+	atomic_dec(&cvp->cv_refs);
+}
+EXPORT_SYMBOL(__cv_signal);
+
+void
+__cv_broadcast(kcondvar_t *cvp)
+{
+	ASSERT(cvp);
+	ASSERT(cvp->cv_magic == CV_MAGIC);
+	atomic_inc(&cvp->cv_refs);
+
+	/*
+	 * Wake_up_all() will wake up all waiters even those which
+	 * have the WQ_FLAG_EXCLUSIVE flag set.
+	 */
+	if (atomic_read(&cvp->cv_waiters) > 0)
+		wake_up_all(&cvp->cv_event);
+
+	atomic_dec(&cvp->cv_refs);
+}
+EXPORT_SYMBOL(__cv_broadcast);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c b/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c
new file mode 100644
index 000000000000..6e93a32e60d7
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c
@@ -0,0 +1,196 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Solaris Porting Layer (SPL) Credential Implementation.
+ */
+
+#include <sys/cred.h>
+
+static int
+cr_groups_search(const struct group_info *group_info, kgid_t grp)
+{
+	unsigned int left, right, mid;
+	int cmp;
+
+	if (!group_info)
+		return (0);
+
+	left = 0;
+	right = group_info->ngroups;
+	while (left < right) {
+		mid = (left + right) / 2;
+		cmp = KGID_TO_SGID(grp) -
+		    KGID_TO_SGID(GROUP_AT(group_info, mid));
+
+		if (cmp > 0)
+			left = mid + 1;
+		else if (cmp < 0)
+			right = mid;
+		else
+			return (1);
+	}
+	return (0);
+}
+
+/* Hold a reference on the credential */
+void
+crhold(cred_t *cr)
+{
+	(void) get_cred((const cred_t *)cr);
+}
+
+/* Free a reference on the credential */
+void
+crfree(cred_t *cr)
+{
+	put_cred((const cred_t *)cr);
+}
+
+/* Return the number of supplemental groups */
+int
+crgetngroups(const cred_t *cr)
+{
+	struct group_info *gi;
+	int rc;
+
+	gi = cr->group_info;
+	rc = gi->ngroups;
+#ifndef HAVE_GROUP_INFO_GID
+	/*
+	 * For Linux <= 4.8,
+	 * crgetgroups will only returns gi->blocks[0], which contains only
+	 * the first NGROUPS_PER_BLOCK groups.
+	 */
+	if (rc > NGROUPS_PER_BLOCK) {
+		WARN_ON_ONCE(1);
+		rc = NGROUPS_PER_BLOCK;
+	}
+#endif
+	return (rc);
+}
+
+/*
+ * Return an array of supplemental gids.  The returned address is safe
+ * to use as long as the caller has taken a reference with crhold().
+ *
+ * Linux 4.9 API change, group_info changed from 2d array via ->blocks to 1d
+ * array via ->gid.
+ */
+gid_t *
+crgetgroups(const cred_t *cr)
+{
+	struct group_info *gi;
+	gid_t *gids = NULL;
+
+	gi = cr->group_info;
+#ifdef HAVE_GROUP_INFO_GID
+	gids = KGIDP_TO_SGIDP(gi->gid);
+#else
+	if (gi->nblocks > 0)
+		gids = KGIDP_TO_SGIDP(gi->blocks[0]);
+#endif
+	return (gids);
+}
+
+/* Check if the passed gid is available in supplied credential. */
+int
+groupmember(gid_t gid, const cred_t *cr)
+{
+	struct group_info *gi;
+	int rc;
+
+	gi = cr->group_info;
+	rc = cr_groups_search(gi, SGID_TO_KGID(gid));
+
+	return (rc);
+}
+
+/* Return the effective user id */
+uid_t
+crgetuid(const cred_t *cr)
+{
+	return (KUID_TO_SUID(cr->euid));
+}
+
+/* Return the real user id */
+uid_t
+crgetruid(const cred_t *cr)
+{
+	return (KUID_TO_SUID(cr->uid));
+}
+
+/* Return the saved user id */
+uid_t
+crgetsuid(const cred_t *cr)
+{
+	return (KUID_TO_SUID(cr->suid));
+}
+
+/* Return the filesystem user id */
+uid_t
+crgetfsuid(const cred_t *cr)
+{
+	return (KUID_TO_SUID(cr->fsuid));
+}
+
+/* Return the effective group id */
+gid_t
+crgetgid(const cred_t *cr)
+{
+	return (KGID_TO_SGID(cr->egid));
+}
+
+/* Return the real group id */
+gid_t
+crgetrgid(const cred_t *cr)
+{
+	return (KGID_TO_SGID(cr->gid));
+}
+
+/* Return the saved group id */
+gid_t
+crgetsgid(const cred_t *cr)
+{
+	return (KGID_TO_SGID(cr->sgid));
+}
+
+/* Return the filesystem group id */
+gid_t
+crgetfsgid(const cred_t *cr)
+{
+	return (KGID_TO_SGID(cr->fsgid));
+}
+
+EXPORT_SYMBOL(crhold);
+EXPORT_SYMBOL(crfree);
+EXPORT_SYMBOL(crgetuid);
+EXPORT_SYMBOL(crgetruid);
+EXPORT_SYMBOL(crgetsuid);
+EXPORT_SYMBOL(crgetfsuid);
+EXPORT_SYMBOL(crgetgid);
+EXPORT_SYMBOL(crgetrgid);
+EXPORT_SYMBOL(crgetsgid);
+EXPORT_SYMBOL(crgetfsgid);
+EXPORT_SYMBOL(crgetngroups);
+EXPORT_SYMBOL(crgetgroups);
+EXPORT_SYMBOL(groupmember);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-err.c b/sys/contrib/openzfs/module/os/linux/spl/spl-err.c
new file mode 100644
index 000000000000..3c0bb71c0629
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-err.c
@@ -0,0 +1,124 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Solaris Porting Layer (SPL) Error Implementation.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+
+/*
+ * It is often useful to actually have the panic crash the node so you
+ * can then get notified of the event, get the crashdump for later
+ * analysis and other such goodies.
+ * But we would still default to the current default of not to do that.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_panic_halt;
+module_param(spl_panic_halt, uint, 0644);
+MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures");
+/* END CSTYLED */
+
+void
+spl_dumpstack(void)
+{
+	printk("Showing stack for process %d\n", current->pid);
+	dump_stack();
+}
+EXPORT_SYMBOL(spl_dumpstack);
+
+int
+spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
+{
+	const char *newfile;
+	char msg[MAXMSGLEN];
+	va_list ap;
+
+	newfile = strrchr(file, '/');
+	if (newfile != NULL)
+		newfile = newfile + 1;
+	else
+		newfile = file;
+
+	va_start(ap, fmt);
+	(void) vsnprintf(msg, sizeof (msg), fmt, ap);
+	va_end(ap);
+
+	printk(KERN_EMERG "%s", msg);
+	printk(KERN_EMERG "PANIC at %s:%d:%s()\n", newfile, line, func);
+	if (spl_panic_halt)
+		panic("%s", msg);
+
+	spl_dumpstack();
+
+	/* Halt the thread to facilitate further debugging */
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	while (1)
+		schedule();
+
+	/* Unreachable */
+	return (1);
+}
+EXPORT_SYMBOL(spl_panic);
+
+void
+vcmn_err(int ce, const char *fmt, va_list ap)
+{
+	char msg[MAXMSGLEN];
+
+	vsnprintf(msg, MAXMSGLEN, fmt, ap);
+
+	switch (ce) {
+	case CE_IGNORE:
+		break;
+	case CE_CONT:
+		printk("%s", msg);
+		break;
+	case CE_NOTE:
+		printk(KERN_NOTICE "NOTICE: %s\n", msg);
+		break;
+	case CE_WARN:
+		printk(KERN_WARNING "WARNING: %s\n", msg);
+		break;
+	case CE_PANIC:
+		printk(KERN_EMERG "PANIC: %s\n", msg);
+		spl_dumpstack();
+
+		/* Halt the thread to facilitate further debugging */
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		while (1)
+			schedule();
+	}
+} /* vcmn_err() */
+EXPORT_SYMBOL(vcmn_err);
+
+void
+cmn_err(int ce, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	vcmn_err(ce, fmt, ap);
+	va_end(ap);
+} /* cmn_err() */
+EXPORT_SYMBOL(cmn_err);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
new file mode 100644
index 000000000000..820fb86c3c7d
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
@@ -0,0 +1,844 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Solaris Porting Layer (SPL) Generic Implementation.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/systeminfo.h>
+#include <sys/vmsystm.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/taskq.h>
+#include <sys/tsd.h>
+#include <sys/zmod.h>
+#include <sys/debug.h>
+#include <sys/proc.h>
+#include <sys/kstat.h>
+#include <sys/file.h>
+#include <sys/sunddi.h>
+#include <linux/ctype.h>
+#include <sys/disp.h>
+#include <sys/random.h>
+#include <sys/strings.h>
+#include <linux/kmod.h>
+#include "zfs_gitrev.h"
+#include <linux/mod_compat.h>
+#include <sys/cred.h>
+#include <sys/vnode.h>
+
+char spl_gitrev[64] = ZFS_META_GITREV;
+
+/* BEGIN CSTYLED */
+unsigned long spl_hostid = 0;
+EXPORT_SYMBOL(spl_hostid);
+/* BEGIN CSTYLED */
+module_param(spl_hostid, ulong, 0644);
+MODULE_PARM_DESC(spl_hostid, "The system hostid.");
+/* END CSTYLED */
+
+proc_t p0;
+EXPORT_SYMBOL(p0);
+
+/*
+ * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna
+ *
+ * "Further scramblings of Marsaglia's xorshift generators"
+ * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf
+ *
+ * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose
+ * is to provide bytes containing random numbers. It is mapped to /dev/urandom
+ * on Illumos, which uses a "FIPS 186-2 algorithm". No user of the SPL's
+ * random_get_pseudo_bytes() needs bytes that are of cryptographic quality, so
+ * we can implement it using a fast PRNG that we seed using Linux' actual
+ * equivalent to random_get_pseudo_bytes(). We do this by providing each CPU
+ * with an independent seed so that all calls to random_get_pseudo_bytes() are
+ * free of atomic instructions.
+ *
+ * A consequence of using a fast PRNG is that using random_get_pseudo_bytes()
+ * to generate words larger than 128 bits will paradoxically be limited to
+ * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1`
+ * 128-bit words and selecting the first will implicitly select the second. If
+ * a caller finds this behavior undesirable, random_get_bytes() should be used
+ * instead.
+ *
+ * XXX: Linux interrupt handlers that trigger within the critical section
+ * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will
+ * see the same numbers. Nothing in the code currently calls this in an
+ * interrupt handler, so this is considered to be okay. If that becomes a
+ * problem, we could create a set of per-cpu variables for interrupt handlers
+ * and use them when in_interrupt() from linux/preempt_mask.h evaluates to
+ * true.
+ */
+void __percpu *spl_pseudo_entropy;
+
+/*
+ * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed
+ * file:
+ *
+ * http://xorshift.di.unimi.it/xorshift128plus.c
+ */
+
+static inline uint64_t
+spl_rand_next(uint64_t *s)
+{
+	uint64_t s1 = s[0];
+	const uint64_t s0 = s[1];
+	s[0] = s0;
+	s1 ^= s1 << 23; // a
+	s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
+	return (s[1] + s0);
+}
+
+static inline void
+spl_rand_jump(uint64_t *s)
+{
+	static const uint64_t JUMP[] =
+	    { 0x8a5cd789635d2dff, 0x121fd2155c472f96 };
+
+	uint64_t s0 = 0;
+	uint64_t s1 = 0;
+	int i, b;
+	for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++)
+		for (b = 0; b < 64; b++) {
+			if (JUMP[i] & 1ULL << b) {
+				s0 ^= s[0];
+				s1 ^= s[1];
+			}
+			(void) spl_rand_next(s);
+		}
+
+	s[0] = s0;
+	s[1] = s1;
+}
+
+int
+random_get_pseudo_bytes(uint8_t *ptr, size_t len)
+{
+	uint64_t *xp, s[2];
+
+	ASSERT(ptr);
+
+	xp = get_cpu_ptr(spl_pseudo_entropy);
+
+	s[0] = xp[0];
+	s[1] = xp[1];
+
+	while (len) {
+		union {
+			uint64_t ui64;
+			uint8_t byte[sizeof (uint64_t)];
+		}entropy;
+		int i = MIN(len, sizeof (uint64_t));
+
+		len -= i;
+		entropy.ui64 = spl_rand_next(s);
+
+		while (i--)
+			*ptr++ = entropy.byte[i];
+	}
+
+	xp[0] = s[0];
+	xp[1] = s[1];
+
+	put_cpu_ptr(spl_pseudo_entropy);
+
+	return (0);
+}
+
+
+EXPORT_SYMBOL(random_get_pseudo_bytes);
+
+#if BITS_PER_LONG == 32
+
+/*
+ * Support 64/64 => 64 division on a 32-bit platform.  While the kernel
+ * provides a div64_u64() function for this we do not use it because the
+ * implementation is flawed.  There are cases which return incorrect
+ * results as late as linux-2.6.35.  Until this is fixed upstream the
+ * spl must provide its own implementation.
+ *
+ * This implementation is a slightly modified version of the algorithm
+ * proposed by the book 'Hacker's Delight'.  The original source can be
+ * found here and is available for use without restriction.
+ *
+ * http://www.hackersdelight.org/HDcode/newCode/divDouble.c
+ */
+
+/*
+ * Calculate number of leading of zeros for a 64-bit value.
+ */
+static int
+nlz64(uint64_t x)
+{
+	register int n = 0;
+
+	if (x == 0)
+		return (64);
+
+	if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; }
+	if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; }
+	if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n +  8; x = x <<  8; }
+	if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n +  4; x = x <<  4; }
+	if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n +  2; x = x <<  2; }
+	if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n +  1; }
+
+	return (n);
+}
+
+/*
+ * Newer kernels have a div_u64() function but we define our own
+ * to simplify portability between kernel versions.
+ */
+static inline uint64_t
+__div_u64(uint64_t u, uint32_t v)
+{
+	(void) do_div(u, v);
+	return (u);
+}
+
+/*
+ * Turn off missing prototypes warning for these functions. They are
+ * replacements for libgcc-provided functions and will never be called
+ * directly.
+ */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmissing-prototypes"
+
+/*
+ * Implementation of 64-bit unsigned division for 32-bit machines.
+ *
+ * First the procedure takes care of the case in which the divisor is a
+ * 32-bit quantity. There are two subcases: (1) If the left half of the
+ * dividend is less than the divisor, one execution of do_div() is all that
+ * is required (overflow is not possible). (2) Otherwise it does two
+ * divisions, using the grade school method.
+ */
+uint64_t
+__udivdi3(uint64_t u, uint64_t v)
+{
+	uint64_t u0, u1, v1, q0, q1, k;
+	int n;
+
+	if (v >> 32 == 0) {			// If v < 2**32:
+		if (u >> 32 < v) {		// If u/v cannot overflow,
+			return (__div_u64(u, v)); // just do one division.
+		} else {			// If u/v would overflow:
+			u1 = u >> 32;		// Break u into two halves.
+			u0 = u & 0xFFFFFFFF;
+			q1 = __div_u64(u1, v);	// First quotient digit.
+			k  = u1 - q1 * v;	// First remainder, < v.
+			u0 += (k << 32);
+			q0 = __div_u64(u0, v);	// Seconds quotient digit.
+			return ((q1 << 32) + q0);
+		}
+	} else {				// If v >= 2**32:
+		n = nlz64(v);			// 0 <= n <= 31.
+		v1 = (v << n) >> 32;		// Normalize divisor, MSB is 1.
+		u1 = u >> 1;			// To ensure no overflow.
+		q1 = __div_u64(u1, v1);		// Get quotient from
+		q0 = (q1 << n) >> 31;		// Undo normalization and
+						// division of u by 2.
+		if (q0 != 0)			// Make q0 correct or
+			q0 = q0 - 1;		// too small by 1.
+		if ((u - q0 * v) >= v)
+			q0 = q0 + 1;		// Now q0 is correct.
+
+		return (q0);
+	}
+}
+EXPORT_SYMBOL(__udivdi3);
+
+/* BEGIN CSTYLED */
+#ifndef abs64
+#define	abs64(x)	({ uint64_t t = (x) >> 63; ((x) ^ t) - t; })
+#endif
+/* END CSTYLED */
+
+/*
+ * Implementation of 64-bit signed division for 32-bit machines.
+ */
+int64_t
+__divdi3(int64_t u, int64_t v)
+{
+	int64_t q, t;
+	// cppcheck-suppress shiftTooManyBitsSigned
+	q = __udivdi3(abs64(u), abs64(v));
+	// cppcheck-suppress shiftTooManyBitsSigned
+	t = (u ^ v) >> 63;	// If u, v have different
+	return ((q ^ t) - t);	// signs, negate q.
+}
+EXPORT_SYMBOL(__divdi3);
+
+/*
+ * Implementation of 64-bit unsigned modulo for 32-bit machines.
+ */
+uint64_t
+__umoddi3(uint64_t dividend, uint64_t divisor)
+{
+	return (dividend - (divisor * __udivdi3(dividend, divisor)));
+}
+EXPORT_SYMBOL(__umoddi3);
+
+/* 64-bit signed modulo for 32-bit machines. */
+int64_t
+__moddi3(int64_t n, int64_t d)
+{
+	int64_t q;
+	boolean_t nn = B_FALSE;
+
+	if (n < 0) {
+		nn = B_TRUE;
+		n = -n;
+	}
+	if (d < 0)
+		d = -d;
+
+	q = __umoddi3(n, d);
+
+	return (nn ? -q : q);
+}
+EXPORT_SYMBOL(__moddi3);
+
+/*
+ * Implementation of 64-bit unsigned division/modulo for 32-bit machines.
+ */
+uint64_t
+__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r)
+{
+	uint64_t q = __udivdi3(n, d);
+	if (r)
+		*r = n - d * q;
+	return (q);
+}
+EXPORT_SYMBOL(__udivmoddi4);
+
+/*
+ * Implementation of 64-bit signed division/modulo for 32-bit machines.
+ */
+int64_t
+__divmoddi4(int64_t n, int64_t d, int64_t *r)
+{
+	int64_t q, rr;
+	boolean_t nn = B_FALSE;
+	boolean_t nd = B_FALSE;
+	if (n < 0) {
+		nn = B_TRUE;
+		n = -n;
+	}
+	if (d < 0) {
+		nd = B_TRUE;
+		d = -d;
+	}
+
+	q = __udivmoddi4(n, d, (uint64_t *)&rr);
+
+	if (nn != nd)
+		q = -q;
+	if (nn)
+		rr = -rr;
+	if (r)
+		*r = rr;
+	return (q);
+}
+EXPORT_SYMBOL(__divmoddi4);
+
+#if defined(__arm) || defined(__arm__)
+/*
+ * Implementation of 64-bit (un)signed division for 32-bit arm machines.
+ *
+ * Run-time ABI for the ARM Architecture (page 20).  A pair of (unsigned)
+ * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1},
+ * and the remainder in {r2, r3}.  The return type is specifically left
+ * set to 'void' to ensure the compiler does not overwrite these registers
+ * during the return.  All results are in registers as per ABI
+ */
+void
+__aeabi_uldivmod(uint64_t u, uint64_t v)
+{
+	uint64_t res;
+	uint64_t mod;
+
+	res = __udivdi3(u, v);
+	mod = __umoddi3(u, v);
+	{
+		register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+		register uint32_t r1 asm("r1") = (res >> 32);
+		register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+		register uint32_t r3 asm("r3") = (mod >> 32);
+
+		/* BEGIN CSTYLED */
+		asm volatile(""
+		    : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3)  /* output */
+		    : "r"(r0), "r"(r1), "r"(r2), "r"(r3));   /* input */
+		/* END CSTYLED */
+
+		return; /* r0; */
+	}
+}
+EXPORT_SYMBOL(__aeabi_uldivmod);
+
+void
+__aeabi_ldivmod(int64_t u, int64_t v)
+{
+	int64_t res;
+	uint64_t mod;
+
+	res =  __divdi3(u, v);
+	mod = __umoddi3(u, v);
+	{
+		register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+		register uint32_t r1 asm("r1") = (res >> 32);
+		register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+		register uint32_t r3 asm("r3") = (mod >> 32);
+
+		/* BEGIN CSTYLED */
+		asm volatile(""
+		    : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3)  /* output */
+		    : "r"(r0), "r"(r1), "r"(r2), "r"(r3));   /* input */
+		/* END CSTYLED */
+
+		return; /* r0; */
+	}
+}
+EXPORT_SYMBOL(__aeabi_ldivmod);
+#endif /* __arm || __arm__ */
+
+#pragma GCC diagnostic pop
+
+#endif /* BITS_PER_LONG */
+
+/*
+ * NOTE: The strtoxx behavior is solely based on my reading of the Solaris
+ * ddi_strtol(9F) man page.  I have not verified the behavior of these
+ * functions against their Solaris counterparts.  It is possible that I
+ * may have misinterpreted the man page or the man page is incorrect.
+ */
+int ddi_strtoul(const char *, char **, int, unsigned long *);
+int ddi_strtol(const char *, char **, int, long *);
+int ddi_strtoull(const char *, char **, int, unsigned long long *);
+int ddi_strtoll(const char *, char **, int, long long *);
+
+#define	define_ddi_strtoux(type, valtype)				\
+int ddi_strtou##type(const char *str, char **endptr,			\
+    int base, valtype *result)						\
+{									\
+	valtype last_value, value = 0;					\
+	char *ptr = (char *)str;					\
+	int flag = 1, digit;						\
+									\
+	if (strlen(ptr) == 0)						\
+		return (EINVAL);					\
+									\
+	/* Auto-detect base based on prefix */				\
+	if (!base) {							\
+		if (str[0] == '0') {					\
+			if (tolower(str[1]) == 'x' && isxdigit(str[2])) { \
+				base = 16; /* hex */			\
+				ptr += 2;				\
+			} else if (str[1] >= '0' && str[1] < 8) {	\
+				base = 8; /* octal */			\
+				ptr += 1;				\
+			} else {					\
+				return (EINVAL);			\
+			}						\
+		} else {						\
+			base = 10; /* decimal */			\
+		}							\
+	}								\
+									\
+	while (1) {							\
+		if (isdigit(*ptr))					\
+			digit = *ptr - '0';				\
+		else if (isalpha(*ptr))					\
+			digit = tolower(*ptr) - 'a' + 10;		\
+		else							\
+			break;						\
+									\
+		if (digit >= base)					\
+			break;						\
+									\
+		last_value = value;					\
+		value = value * base + digit;				\
+		if (last_value > value) /* Overflow */			\
+			return (ERANGE);				\
+									\
+		flag = 1;						\
+		ptr++;							\
+	}								\
+									\
+	if (flag)							\
+		*result = value;					\
+									\
+	if (endptr)							\
+		*endptr = (char *)(flag ? ptr : str);			\
+									\
+	return (0);							\
+}									\
+
+#define	define_ddi_strtox(type, valtype)				\
+int ddi_strto##type(const char *str, char **endptr,			\
+    int base, valtype *result)						\
+{									\
+	int rc;								\
+									\
+	if (*str == '-') {						\
+		rc = ddi_strtou##type(str + 1, endptr, base, result);	\
+		if (!rc) {						\
+			if (*endptr == str + 1)				\
+				*endptr = (char *)str;			\
+			else						\
+				*result = -*result;			\
+		}							\
+	} else {							\
+		rc = ddi_strtou##type(str, endptr, base, result);	\
+	}								\
+									\
+	return (rc);							\
+}
+
+define_ddi_strtoux(l, unsigned long)
+define_ddi_strtox(l, long)
+define_ddi_strtoux(ll, unsigned long long)
+define_ddi_strtox(ll, long long)
+
+EXPORT_SYMBOL(ddi_strtoul);
+EXPORT_SYMBOL(ddi_strtol);
+EXPORT_SYMBOL(ddi_strtoll);
+EXPORT_SYMBOL(ddi_strtoull);
+
+int
+ddi_copyin(const void *from, void *to, size_t len, int flags)
+{
+	/* Fake ioctl() issued by kernel, 'from' is a kernel address */
+	if (flags & FKIOCTL) {
+		memcpy(to, from, len);
+		return (0);
+	}
+
+	return (copyin(from, to, len));
+}
+EXPORT_SYMBOL(ddi_copyin);
+
+int
+ddi_copyout(const void *from, void *to, size_t len, int flags)
+{
+	/* Fake ioctl() issued by kernel, 'from' is a kernel address */
+	if (flags & FKIOCTL) {
+		memcpy(to, from, len);
+		return (0);
+	}
+
+	return (copyout(from, to, len));
+}
+EXPORT_SYMBOL(ddi_copyout);
+
+static ssize_t
+spl_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
+{
+#if defined(HAVE_KERNEL_READ_PPOS)
+	return (kernel_read(file, buf, count, pos));
+#else
+	mm_segment_t saved_fs;
+	ssize_t ret;
+
+	saved_fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	ret = vfs_read(file, (void __user *)buf, count, pos);
+
+	set_fs(saved_fs);
+
+	return (ret);
+#endif
+}
+
+static int
+spl_getattr(struct file *filp, struct kstat *stat)
+{
+	int rc;
+
+	ASSERT(filp);
+	ASSERT(stat);
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+	rc = vfs_getattr(&filp->f_path, stat, STATX_BASIC_STATS,
+	    AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+	rc = vfs_getattr(&filp->f_path, stat);
+#else
+	rc = vfs_getattr(filp->f_path.mnt, filp->f_dentry, stat);
+#endif
+	if (rc)
+		return (-rc);
+
+	return (0);
+}
+
+/*
+ * Read the unique system identifier from the /etc/hostid file.
+ *
+ * The behavior of /usr/bin/hostid on Linux systems with the
+ * regular eglibc and coreutils is:
+ *
+ *   1. Generate the value if the /etc/hostid file does not exist
+ *      or if the /etc/hostid file is less than four bytes in size.
+ *
+ *   2. If the /etc/hostid file is at least 4 bytes, then return
+ *      the first four bytes [0..3] in native endian order.
+ *
+ *   3. Always ignore bytes [4..] if they exist in the file.
+ *
+ * Only the first four bytes are significant, even on systems that
+ * have a 64-bit word size.
+ *
+ * See:
+ *
+ *   eglibc: sysdeps/unix/sysv/linux/gethostid.c
+ *   coreutils: src/hostid.c
+ *
+ * Notes:
+ *
+ * The /etc/hostid file on Solaris is a text file that often reads:
+ *
+ *   # DO NOT EDIT
+ *   "0123456789"
+ *
+ * Directly copying this file to Linux results in a constant
+ * hostid of 4f442023 because the default comment constitutes
+ * the first four bytes of the file.
+ *
+ */
+
+char *spl_hostid_path = HW_HOSTID_PATH;
+module_param(spl_hostid_path, charp, 0444);
+MODULE_PARM_DESC(spl_hostid_path, "The system hostid file (/etc/hostid)");
+
+static int
+hostid_read(uint32_t *hostid)
+{
+	uint64_t size;
+	uint32_t value = 0;
+	int error;
+	loff_t off;
+	struct file *filp;
+	struct kstat stat;
+
+	filp = filp_open(spl_hostid_path, 0, 0);
+
+	if (IS_ERR(filp))
+		return (ENOENT);
+
+	error = spl_getattr(filp, &stat);
+	if (error) {
+		filp_close(filp, 0);
+		return (error);
+	}
+	size = stat.size;
+	if (size < sizeof (HW_HOSTID_MASK)) {
+		filp_close(filp, 0);
+		return (EINVAL);
+	}
+
+	off = 0;
+	/*
+	 * Read directly into the variable like eglibc does.
+	 * Short reads are okay; native behavior is preserved.
+	 */
+	error = spl_kernel_read(filp, &value, sizeof (value), &off);
+	if (error < 0) {
+		filp_close(filp, 0);
+		return (EIO);
+	}
+
+	/* Mask down to 32 bits like coreutils does. */
+	*hostid = (value & HW_HOSTID_MASK);
+	filp_close(filp, 0);
+
+	return (0);
+}
+
+/*
+ * Return the system hostid.  Preferentially use the spl_hostid module option
+ * when set, otherwise use the value in the /etc/hostid file.
+ */
+uint32_t
+zone_get_hostid(void *zone)
+{
+	uint32_t hostid;
+
+	ASSERT3P(zone, ==, NULL);
+
+	if (spl_hostid != 0)
+		return ((uint32_t)(spl_hostid & HW_HOSTID_MASK));
+
+	if (hostid_read(&hostid) == 0)
+		return (hostid);
+
+	return (0);
+}
+EXPORT_SYMBOL(zone_get_hostid);
+
+static int
+spl_kvmem_init(void)
+{
+	int rc = 0;
+
+	rc = spl_kmem_init();
+	if (rc)
+		return (rc);
+
+	rc = spl_vmem_init();
+	if (rc) {
+		spl_kmem_fini();
+		return (rc);
+	}
+
+	return (rc);
+}
+
+/*
+ * We initialize the random number generator with 128 bits of entropy from the
+ * system random number generator. In the improbable case that we have a zero
+ * seed, we fallback to the system jiffies, unless it is also zero, in which
+ * situation we use a preprogrammed seed. We step forward by 2^64 iterations to
+ * initialize each of the per-cpu seeds so that the sequences generated on each
+ * CPU are guaranteed to never overlap in practice.
+ */
+static void __init
+spl_random_init(void)
+{
+	uint64_t s[2];
+	int i = 0;
+
+	spl_pseudo_entropy = __alloc_percpu(2 * sizeof (uint64_t),
+	    sizeof (uint64_t));
+
+	get_random_bytes(s, sizeof (s));
+
+	if (s[0] == 0 && s[1] == 0) {
+		if (jiffies != 0) {
+			s[0] = jiffies;
+			s[1] = ~0 - jiffies;
+		} else {
+			(void) memcpy(s, "improbable seed", sizeof (s));
+		}
+		printk("SPL: get_random_bytes() returned 0 "
+		    "when generating random seed. Setting initial seed to "
+		    "0x%016llx%016llx.\n", cpu_to_be64(s[0]),
+		    cpu_to_be64(s[1]));
+	}
+
+	for_each_possible_cpu(i) {
+		uint64_t *wordp = per_cpu_ptr(spl_pseudo_entropy, i);
+
+		spl_rand_jump(s);
+
+		wordp[0] = s[0];
+		wordp[1] = s[1];
+	}
+}
+
+static void
+spl_random_fini(void)
+{
+	free_percpu(spl_pseudo_entropy);
+}
+
+static void
+spl_kvmem_fini(void)
+{
+	spl_vmem_fini();
+	spl_kmem_fini();
+}
+
+static int __init
+spl_init(void)
+{
+	int rc = 0;
+
+	bzero(&p0, sizeof (proc_t));
+	spl_random_init();
+
+	if ((rc = spl_kvmem_init()))
+		goto out1;
+
+	if ((rc = spl_tsd_init()))
+		goto out2;
+
+	if ((rc = spl_taskq_init()))
+		goto out3;
+
+	if ((rc = spl_kmem_cache_init()))
+		goto out4;
+
+	if ((rc = spl_proc_init()))
+		goto out5;
+
+	if ((rc = spl_kstat_init()))
+		goto out6;
+
+	if ((rc = spl_zlib_init()))
+		goto out7;
+
+	return (rc);
+
+out7:
+	spl_kstat_fini();
+out6:
+	spl_proc_fini();
+out5:
+	spl_kmem_cache_fini();
+out4:
+	spl_taskq_fini();
+out3:
+	spl_tsd_fini();
+out2:
+	spl_kvmem_fini();
+out1:
+	return (rc);
+}
+
+static void __exit
+spl_fini(void)
+{
+	spl_zlib_fini();
+	spl_kstat_fini();
+	spl_proc_fini();
+	spl_kmem_cache_fini();
+	spl_taskq_fini();
+	spl_tsd_fini();
+	spl_kvmem_fini();
+	spl_random_fini();
+}
+
+module_init(spl_init);
+module_exit(spl_fini);
+
+ZFS_MODULE_DESCRIPTION("Solaris Porting Layer");
+ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
+ZFS_MODULE_LICENSE("GPL");
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
new file mode 100644
index 000000000000..15dc27624c55
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
@@ -0,0 +1,1469 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/percpu_compat.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/taskq.h>
+#include <sys/timer.h>
+#include <sys/vmem.h>
+#include <sys/wait.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/prefetch.h>
+
+/*
+ * Within the scope of spl-kmem.c file the kmem_cache_* definitions
+ * are removed to allow access to the real Linux slab allocator.
+ */
+#undef kmem_cache_destroy
+#undef kmem_cache_create
+#undef kmem_cache_alloc
+#undef kmem_cache_free
+
+
+/*
+ * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}()
+ * with smp_mb__{before,after}_atomic() because they were redundant. This is
+ * only used inside our SLAB allocator, so we implement an internal wrapper
+ * here to give us smp_mb__{before,after}_atomic() on older kernels.
+ */
+#ifndef smp_mb__before_atomic
+#define	smp_mb__before_atomic(x) smp_mb__before_clear_bit(x)
+#endif
+
+#ifndef smp_mb__after_atomic
+#define	smp_mb__after_atomic(x) smp_mb__after_clear_bit(x)
+#endif
+
+/* BEGIN CSTYLED */
+
+/*
+ * Cache magazines are an optimization designed to minimize the cost of
+ * allocating memory.  They do this by keeping a per-cpu cache of recently
+ * freed objects, which can then be reallocated without taking a lock. This
+ * can improve performance on highly contended caches.  However, because
+ * objects in magazines will prevent otherwise empty slabs from being
+ * immediately released this may not be ideal for low memory machines.
+ *
+ * For this reason spl_kmem_cache_magazine_size can be used to set a maximum
+ * magazine size.  When this value is set to 0 the magazine size will be
+ * automatically determined based on the object size.  Otherwise magazines
+ * will be limited to 2-256 objects per magazine (i.e per cpu).  Magazines
+ * may never be entirely disabled in this implementation.
+ */
+unsigned int spl_kmem_cache_magazine_size = 0;
+module_param(spl_kmem_cache_magazine_size, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
+	"Default magazine size (2-256), set automatically (0)");
+
+/*
+ * The default behavior is to report the number of objects remaining in the
+ * cache.  This allows the Linux VM to repeatedly reclaim objects from the
+ * cache when memory is low satisfy other memory allocations.  Alternately,
+ * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
+ * is reclaimed.  This may increase the likelihood of out of memory events.
+ */
+unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
+module_param(spl_kmem_cache_reclaim, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
+
+unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
+module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
+
+unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
+module_param(spl_kmem_cache_max_size, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
+
+/*
+ * For small objects the Linux slab allocator should be used to make the most
+ * efficient use of the memory.  However, large objects are not supported by
+ * the Linux slab and therefore the SPL implementation is preferred.  A cutoff
+ * of 16K was determined to be optimal for architectures using 4K pages.
+ */
+#if PAGE_SIZE == 4096
+unsigned int spl_kmem_cache_slab_limit = 16384;
+#else
+unsigned int spl_kmem_cache_slab_limit = 0;
+#endif
+module_param(spl_kmem_cache_slab_limit, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
+	"Objects less than N bytes use the Linux slab");
+
+/*
+ * The number of threads available to allocate new slabs for caches.  This
+ * should not need to be tuned but it is available for performance analysis.
+ */
+unsigned int spl_kmem_cache_kmem_threads = 4;
+module_param(spl_kmem_cache_kmem_threads, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
+	"Number of spl_kmem_cache threads");
+/* END CSTYLED */
+
+/*
+ * Slab allocation interfaces
+ *
+ * While the Linux slab implementation was inspired by the Solaris
+ * implementation I cannot use it to emulate the Solaris APIs.  I
+ * require two features which are not provided by the Linux slab.
+ *
+ * 1) Constructors AND destructors.  Recent versions of the Linux
+ *    kernel have removed support for destructors.  This is a deal
+ *    breaker for the SPL which contains particularly expensive
+ *    initializers for mutex's, condition variables, etc.  We also
+ *    require a minimal level of cleanup for these data types unlike
+ *    many Linux data types which do need to be explicitly destroyed.
+ *
+ * 2) Virtual address space backed slab.  Callers of the Solaris slab
+ *    expect it to work well for both small are very large allocations.
+ *    Because of memory fragmentation the Linux slab which is backed
+ *    by kmalloc'ed memory performs very badly when confronted with
+ *    large numbers of large allocations.  Basing the slab on the
+ *    virtual address space removes the need for contiguous pages
+ *    and greatly improve performance for large allocations.
+ *
+ * For these reasons, the SPL has its own slab implementation with
+ * the needed features.  It is not as highly optimized as either the
+ * Solaris or Linux slabs, but it should get me most of what is
+ * needed until it can be optimized or obsoleted by another approach.
+ *
+ * One serious concern I do have about this method is the relatively
+ * small virtual address space on 32bit arches.  This will seriously
+ * constrain the size of the slab caches and their performance.
+ */
+
+struct list_head spl_kmem_cache_list;   /* List of caches */
+struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
+taskq_t *spl_kmem_cache_taskq;		/* Task queue for aging / reclaim */
+
+static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
+
+static void *
+kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
+{
+	gfp_t lflags = kmem_flags_convert(flags);
+	void *ptr;
+
+	ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
+
+	/* Resulting allocated memory will be page aligned */
+	ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
+
+	return (ptr);
+}
+
+static void
+kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
+{
+	ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
+
+	/*
+	 * The Linux direct reclaim path uses this out of band value to
+	 * determine if forward progress is being made.  Normally this is
+	 * incremented by kmem_freepages() which is part of the various
+	 * Linux slab implementations.  However, since we are using none
+	 * of that infrastructure we are responsible for incrementing it.
+	 */
+	if (current->reclaim_state)
+		current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
+
+	vfree(ptr);
+}
+
+/*
+ * Required space for each aligned sks.
+ */
+static inline uint32_t
+spl_sks_size(spl_kmem_cache_t *skc)
+{
+	return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
+	    skc->skc_obj_align, uint32_t));
+}
+
+/*
+ * Required space for each aligned object.
+ */
+static inline uint32_t
+spl_obj_size(spl_kmem_cache_t *skc)
+{
+	uint32_t align = skc->skc_obj_align;
+
+	return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
+	    P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t));
+}
+
+uint64_t
+spl_kmem_cache_inuse(kmem_cache_t *cache)
+{
+	return (cache->skc_obj_total);
+}
+EXPORT_SYMBOL(spl_kmem_cache_inuse);
+
+uint64_t
+spl_kmem_cache_entry_size(kmem_cache_t *cache)
+{
+	return (cache->skc_obj_size);
+}
+EXPORT_SYMBOL(spl_kmem_cache_entry_size);
+
+/*
+ * Lookup the spl_kmem_object_t for an object given that object.
+ */
+static inline spl_kmem_obj_t *
+spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
+{
+	return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
+	    skc->skc_obj_align, uint32_t));
+}
+
+/*
+ * It's important that we pack the spl_kmem_obj_t structure and the
+ * actual objects in to one large address space to minimize the number
+ * of calls to the allocator.  It is far better to do a few large
+ * allocations and then subdivide it ourselves.  Now which allocator
+ * we use requires balancing a few trade offs.
+ *
+ * For small objects we use kmem_alloc() because as long as you are
+ * only requesting a small number of pages (ideally just one) its cheap.
+ * However, when you start requesting multiple pages with kmem_alloc()
+ * it gets increasingly expensive since it requires contiguous pages.
+ * For this reason we shift to vmem_alloc() for slabs of large objects
+ * which removes the need for contiguous pages.  We do not use
+ * vmem_alloc() in all cases because there is significant locking
+ * overhead in __get_vm_area_node().  This function takes a single
+ * global lock when acquiring an available virtual address range which
+ * serializes all vmem_alloc()'s for all slab caches.  Using slightly
+ * different allocation functions for small and large objects should
+ * give us the best of both worlds.
+ *
+ * +------------------------+
+ * | spl_kmem_slab_t --+-+  |
+ * | skc_obj_size    <-+ |  |
+ * | spl_kmem_obj_t      |  |
+ * | skc_obj_size    <---+  |
+ * | spl_kmem_obj_t      |  |
+ * | ...                 v  |
+ * +------------------------+
+ */
+static spl_kmem_slab_t *
+spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
+{
+	spl_kmem_slab_t *sks;
+	void *base;
+	uint32_t obj_size;
+
+	base = kv_alloc(skc, skc->skc_slab_size, flags);
+	if (base == NULL)
+		return (NULL);
+
+	sks = (spl_kmem_slab_t *)base;
+	sks->sks_magic = SKS_MAGIC;
+	sks->sks_objs = skc->skc_slab_objs;
+	sks->sks_age = jiffies;
+	sks->sks_cache = skc;
+	INIT_LIST_HEAD(&sks->sks_list);
+	INIT_LIST_HEAD(&sks->sks_free_list);
+	sks->sks_ref = 0;
+	obj_size = spl_obj_size(skc);
+
+	for (int i = 0; i < sks->sks_objs; i++) {
+		void *obj = base + spl_sks_size(skc) + (i * obj_size);
+
+		ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+		spl_kmem_obj_t *sko = spl_sko_from_obj(skc, obj);
+		sko->sko_addr = obj;
+		sko->sko_magic = SKO_MAGIC;
+		sko->sko_slab = sks;
+		INIT_LIST_HEAD(&sko->sko_list);
+		list_add_tail(&sko->sko_list, &sks->sks_free_list);
+	}
+
+	return (sks);
+}
+
+/*
+ * Remove a slab from complete or partial list, it must be called with
+ * the 'skc->skc_lock' held but the actual free must be performed
+ * outside the lock to prevent deadlocking on vmem addresses.
+ */
+static void
+spl_slab_free(spl_kmem_slab_t *sks,
+    struct list_head *sks_list, struct list_head *sko_list)
+{
+	spl_kmem_cache_t *skc;
+
+	ASSERT(sks->sks_magic == SKS_MAGIC);
+	ASSERT(sks->sks_ref == 0);
+
+	skc = sks->sks_cache;
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+
+	/*
+	 * Update slab/objects counters in the cache, then remove the
+	 * slab from the skc->skc_partial_list.  Finally add the slab
+	 * and all its objects in to the private work lists where the
+	 * destructors will be called and the memory freed to the system.
+	 */
+	skc->skc_obj_total -= sks->sks_objs;
+	skc->skc_slab_total--;
+	list_del(&sks->sks_list);
+	list_add(&sks->sks_list, sks_list);
+	list_splice_init(&sks->sks_free_list, sko_list);
+}
+
+/*
+ * Reclaim empty slabs at the end of the partial list.
+ */
+static void
+spl_slab_reclaim(spl_kmem_cache_t *skc)
+{
+	spl_kmem_slab_t *sks = NULL, *m = NULL;
+	spl_kmem_obj_t *sko = NULL, *n = NULL;
+	LIST_HEAD(sks_list);
+	LIST_HEAD(sko_list);
+
+	/*
+	 * Empty slabs and objects must be moved to a private list so they
+	 * can be safely freed outside the spin lock.  All empty slabs are
+	 * at the end of skc->skc_partial_list, therefore once a non-empty
+	 * slab is found we can stop scanning.
+	 */
+	spin_lock(&skc->skc_lock);
+	list_for_each_entry_safe_reverse(sks, m,
+	    &skc->skc_partial_list, sks_list) {
+
+		if (sks->sks_ref > 0)
+			break;
+
+		spl_slab_free(sks, &sks_list, &sko_list);
+	}
+	spin_unlock(&skc->skc_lock);
+
+	/*
+	 * The following two loops ensure all the object destructors are run,
+	 * and the slabs themselves are freed.  This is all done outside the
+	 * skc->skc_lock since this allows the destructor to sleep, and
+	 * allows us to perform a conditional reschedule when a freeing a
+	 * large number of objects and slabs back to the system.
+	 */
+
+	list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
+		ASSERT(sko->sko_magic == SKO_MAGIC);
+	}
+
+	list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
+		ASSERT(sks->sks_magic == SKS_MAGIC);
+		kv_free(skc, sks, skc->skc_slab_size);
+	}
+}
+
+static spl_kmem_emergency_t *
+spl_emergency_search(struct rb_root *root, void *obj)
+{
+	struct rb_node *node = root->rb_node;
+	spl_kmem_emergency_t *ske;
+	unsigned long address = (unsigned long)obj;
+
+	while (node) {
+		ske = container_of(node, spl_kmem_emergency_t, ske_node);
+
+		if (address < ske->ske_obj)
+			node = node->rb_left;
+		else if (address > ske->ske_obj)
+			node = node->rb_right;
+		else
+			return (ske);
+	}
+
+	return (NULL);
+}
+
+static int
+spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	spl_kmem_emergency_t *ske_tmp;
+	unsigned long address = ske->ske_obj;
+
+	while (*new) {
+		ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
+
+		parent = *new;
+		if (address < ske_tmp->ske_obj)
+			new = &((*new)->rb_left);
+		else if (address > ske_tmp->ske_obj)
+			new = &((*new)->rb_right);
+		else
+			return (0);
+	}
+
+	rb_link_node(&ske->ske_node, parent, new);
+	rb_insert_color(&ske->ske_node, root);
+
+	return (1);
+}
+
+/*
+ * Allocate a single emergency object and track it in a red black tree.
+ */
+static int
+spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+	gfp_t lflags = kmem_flags_convert(flags);
+	spl_kmem_emergency_t *ske;
+	int order = get_order(skc->skc_obj_size);
+	int empty;
+
+	/* Last chance use a partial slab if one now exists */
+	spin_lock(&skc->skc_lock);
+	empty = list_empty(&skc->skc_partial_list);
+	spin_unlock(&skc->skc_lock);
+	if (!empty)
+		return (-EEXIST);
+
+	ske = kmalloc(sizeof (*ske), lflags);
+	if (ske == NULL)
+		return (-ENOMEM);
+
+	ske->ske_obj = __get_free_pages(lflags, order);
+	if (ske->ske_obj == 0) {
+		kfree(ske);
+		return (-ENOMEM);
+	}
+
+	spin_lock(&skc->skc_lock);
+	empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
+	if (likely(empty)) {
+		skc->skc_obj_total++;
+		skc->skc_obj_emergency++;
+		if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
+			skc->skc_obj_emergency_max = skc->skc_obj_emergency;
+	}
+	spin_unlock(&skc->skc_lock);
+
+	if (unlikely(!empty)) {
+		free_pages(ske->ske_obj, order);
+		kfree(ske);
+		return (-EINVAL);
+	}
+
+	*obj = (void *)ske->ske_obj;
+
+	return (0);
+}
+
+/*
+ * Locate the passed object in the red black tree and free it.
+ */
+static int
+spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
+{
+	spl_kmem_emergency_t *ske;
+	int order = get_order(skc->skc_obj_size);
+
+	spin_lock(&skc->skc_lock);
+	ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
+	if (ske) {
+		rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
+		skc->skc_obj_emergency--;
+		skc->skc_obj_total--;
+	}
+	spin_unlock(&skc->skc_lock);
+
+	if (ske == NULL)
+		return (-ENOENT);
+
+	free_pages(ske->ske_obj, order);
+	kfree(ske);
+
+	return (0);
+}
+
+/*
+ * Release objects from the per-cpu magazine back to their slab.  The flush
+ * argument contains the max number of entries to remove from the magazine.
+ */
+static void
+spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
+{
+	spin_lock(&skc->skc_lock);
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(skm->skm_magic == SKM_MAGIC);
+
+	int count = MIN(flush, skm->skm_avail);
+	for (int i = 0; i < count; i++)
+		spl_cache_shrink(skc, skm->skm_objs[i]);
+
+	skm->skm_avail -= count;
+	memmove(skm->skm_objs, &(skm->skm_objs[count]),
+	    sizeof (void *) * skm->skm_avail);
+
+	spin_unlock(&skc->skc_lock);
+}
+
+/*
+ * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
+ * When on-slab we want to target spl_kmem_cache_obj_per_slab.  However,
+ * for very small objects we may end up with more than this so as not
+ * to waste space in the minimal allocation of a single page.  Also for
+ * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min,
+ * lower than this and we will fail.
+ */
+static int
+spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
+{
+	uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs;
+
+	sks_size = spl_sks_size(skc);
+	obj_size = spl_obj_size(skc);
+	max_size = (spl_kmem_cache_max_size * 1024 * 1024);
+	tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size);
+
+	if (tgt_size <= max_size) {
+		tgt_objs = (tgt_size - sks_size) / obj_size;
+	} else {
+		tgt_objs = (max_size - sks_size) / obj_size;
+		tgt_size = (tgt_objs * obj_size) + sks_size;
+	}
+
+	if (tgt_objs == 0)
+		return (-ENOSPC);
+
+	*objs = tgt_objs;
+	*size = tgt_size;
+
+	return (0);
+}
+
+/*
+ * Make a guess at reasonable per-cpu magazine size based on the size of
+ * each object and the cost of caching N of them in each magazine.  Long
+ * term this should really adapt based on an observed usage heuristic.
+ */
+static int
+spl_magazine_size(spl_kmem_cache_t *skc)
+{
+	uint32_t obj_size = spl_obj_size(skc);
+	int size;
+
+	if (spl_kmem_cache_magazine_size > 0)
+		return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2));
+
+	/* Per-magazine sizes below assume a 4Kib page size */
+	if (obj_size > (PAGE_SIZE * 256))
+		size = 4;  /* Minimum 4Mib per-magazine */
+	else if (obj_size > (PAGE_SIZE * 32))
+		size = 16; /* Minimum 2Mib per-magazine */
+	else if (obj_size > (PAGE_SIZE))
+		size = 64; /* Minimum 256Kib per-magazine */
+	else if (obj_size > (PAGE_SIZE / 4))
+		size = 128; /* Minimum 128Kib per-magazine */
+	else
+		size = 256;
+
+	return (size);
+}
+
+/*
+ * Allocate a per-cpu magazine to associate with a specific core.
+ */
+static spl_kmem_magazine_t *
+spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
+{
+	spl_kmem_magazine_t *skm;
+	int size = sizeof (spl_kmem_magazine_t) +
+	    sizeof (void *) * skc->skc_mag_size;
+
+	skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+	if (skm) {
+		skm->skm_magic = SKM_MAGIC;
+		skm->skm_avail = 0;
+		skm->skm_size = skc->skc_mag_size;
+		skm->skm_refill = skc->skc_mag_refill;
+		skm->skm_cache = skc;
+		skm->skm_cpu = cpu;
+	}
+
+	return (skm);
+}
+
+/*
+ * Free a per-cpu magazine associated with a specific core.
+ */
+static void
+spl_magazine_free(spl_kmem_magazine_t *skm)
+{
+	ASSERT(skm->skm_magic == SKM_MAGIC);
+	ASSERT(skm->skm_avail == 0);
+	kfree(skm);
+}
+
+/*
+ * Create all pre-cpu magazines of reasonable sizes.
+ */
+static int
+spl_magazine_create(spl_kmem_cache_t *skc)
+{
+	int i = 0;
+
+	ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+
+	skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
+	    num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
+	skc->skc_mag_size = spl_magazine_size(skc);
+	skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
+
+	for_each_possible_cpu(i) {
+		skc->skc_mag[i] = spl_magazine_alloc(skc, i);
+		if (!skc->skc_mag[i]) {
+			for (i--; i >= 0; i--)
+				spl_magazine_free(skc->skc_mag[i]);
+
+			kfree(skc->skc_mag);
+			return (-ENOMEM);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Destroy all pre-cpu magazines.
+ */
+static void
+spl_magazine_destroy(spl_kmem_cache_t *skc)
+{
+	spl_kmem_magazine_t *skm;
+	int i = 0;
+
+	ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+
+	for_each_possible_cpu(i) {
+		skm = skc->skc_mag[i];
+		spl_cache_flush(skc, skm, skm->skm_avail);
+		spl_magazine_free(skm);
+	}
+
+	kfree(skc->skc_mag);
+}
+
+/*
+ * Create a object cache based on the following arguments:
+ * name		cache name
+ * size		cache object size
+ * align	cache object alignment
+ * ctor		cache object constructor
+ * dtor		cache object destructor
+ * reclaim	cache object reclaim
+ * priv		cache private data for ctor/dtor/reclaim
+ * vmp		unused must be NULL
+ * flags
+ *	KMC_KVMEM       Force kvmem backed SPL cache
+ *	KMC_SLAB        Force Linux slab backed cache
+ *	KMC_NODEBUG	Disable debugging (unsupported)
+ */
+spl_kmem_cache_t *
+spl_kmem_cache_create(char *name, size_t size, size_t align,
+    spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim,
+    void *priv, void *vmp, int flags)
+{
+	gfp_t lflags = kmem_flags_convert(KM_SLEEP);
+	spl_kmem_cache_t *skc;
+	int rc;
+
+	/*
+	 * Unsupported flags
+	 */
+	ASSERT(vmp == NULL);
+	ASSERT(reclaim == NULL);
+
+	might_sleep();
+
+	skc = kzalloc(sizeof (*skc), lflags);
+	if (skc == NULL)
+		return (NULL);
+
+	skc->skc_magic = SKC_MAGIC;
+	skc->skc_name_size = strlen(name) + 1;
+	skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags);
+	if (skc->skc_name == NULL) {
+		kfree(skc);
+		return (NULL);
+	}
+	strncpy(skc->skc_name, name, skc->skc_name_size);
+
+	skc->skc_ctor = ctor;
+	skc->skc_dtor = dtor;
+	skc->skc_private = priv;
+	skc->skc_vmp = vmp;
+	skc->skc_linux_cache = NULL;
+	skc->skc_flags = flags;
+	skc->skc_obj_size = size;
+	skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
+	atomic_set(&skc->skc_ref, 0);
+
+	INIT_LIST_HEAD(&skc->skc_list);
+	INIT_LIST_HEAD(&skc->skc_complete_list);
+	INIT_LIST_HEAD(&skc->skc_partial_list);
+	skc->skc_emergency_tree = RB_ROOT;
+	spin_lock_init(&skc->skc_lock);
+	init_waitqueue_head(&skc->skc_waitq);
+	skc->skc_slab_fail = 0;
+	skc->skc_slab_create = 0;
+	skc->skc_slab_destroy = 0;
+	skc->skc_slab_total = 0;
+	skc->skc_slab_alloc = 0;
+	skc->skc_slab_max = 0;
+	skc->skc_obj_total = 0;
+	skc->skc_obj_alloc = 0;
+	skc->skc_obj_max = 0;
+	skc->skc_obj_deadlock = 0;
+	skc->skc_obj_emergency = 0;
+	skc->skc_obj_emergency_max = 0;
+
+	rc = percpu_counter_init_common(&skc->skc_linux_alloc, 0,
+	    GFP_KERNEL);
+	if (rc != 0) {
+		kfree(skc);
+		return (NULL);
+	}
+
+	/*
+	 * Verify the requested alignment restriction is sane.
+	 */
+	if (align) {
+		VERIFY(ISP2(align));
+		VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
+		VERIFY3U(align, <=, PAGE_SIZE);
+		skc->skc_obj_align = align;
+	}
+
+	/*
+	 * When no specific type of slab is requested (kmem, vmem, or
+	 * linuxslab) then select a cache type based on the object size
+	 * and default tunables.
+	 */
+	if (!(skc->skc_flags & (KMC_SLAB | KMC_KVMEM))) {
+		if (spl_kmem_cache_slab_limit &&
+		    size <= (size_t)spl_kmem_cache_slab_limit) {
+			/*
+			 * Objects smaller than spl_kmem_cache_slab_limit can
+			 * use the Linux slab for better space-efficiency.
+			 */
+			skc->skc_flags |= KMC_SLAB;
+		} else {
+			/*
+			 * All other objects are considered large and are
+			 * placed on kvmem backed slabs.
+			 */
+			skc->skc_flags |= KMC_KVMEM;
+		}
+	}
+
+	/*
+	 * Given the type of slab allocate the required resources.
+	 */
+	if (skc->skc_flags & KMC_KVMEM) {
+		rc = spl_slab_size(skc,
+		    &skc->skc_slab_objs, &skc->skc_slab_size);
+		if (rc)
+			goto out;
+
+		rc = spl_magazine_create(skc);
+		if (rc)
+			goto out;
+	} else {
+		unsigned long slabflags = 0;
+
+		if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) {
+			rc = EINVAL;
+			goto out;
+		}
+
+#if defined(SLAB_USERCOPY)
+		/*
+		 * Required for PAX-enabled kernels if the slab is to be
+		 * used for copying between user and kernel space.
+		 */
+		slabflags |= SLAB_USERCOPY;
+#endif
+
+#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY)
+		/*
+		 * Newer grsec patchset uses kmem_cache_create_usercopy()
+		 * instead of SLAB_USERCOPY flag
+		 */
+		skc->skc_linux_cache = kmem_cache_create_usercopy(
+		    skc->skc_name, size, align, slabflags, 0, size, NULL);
+#else
+		skc->skc_linux_cache = kmem_cache_create(
+		    skc->skc_name, size, align, slabflags, NULL);
+#endif
+		if (skc->skc_linux_cache == NULL) {
+			rc = ENOMEM;
+			goto out;
+		}
+	}
+
+	down_write(&spl_kmem_cache_sem);
+	list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
+	up_write(&spl_kmem_cache_sem);
+
+	return (skc);
+out:
+	kfree(skc->skc_name);
+	percpu_counter_destroy(&skc->skc_linux_alloc);
+	kfree(skc);
+	return (NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_create);
+
+/*
+ * Register a move callback for cache defragmentation.
+ * XXX: Unimplemented but harmless to stub out for now.
+ */
+void
+spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
+    kmem_cbrc_t (move)(void *, void *, size_t, void *))
+{
+	ASSERT(move != NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_set_move);
+
+/*
+ * Destroy a cache and all objects associated with the cache.
+ */
+void
+spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
+{
+	DECLARE_WAIT_QUEUE_HEAD(wq);
+	taskqid_t id;
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(skc->skc_flags & (KMC_KVMEM | KMC_SLAB));
+
+	down_write(&spl_kmem_cache_sem);
+	list_del_init(&skc->skc_list);
+	up_write(&spl_kmem_cache_sem);
+
+	/* Cancel any and wait for any pending delayed tasks */
+	VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+	spin_lock(&skc->skc_lock);
+	id = skc->skc_taskqid;
+	spin_unlock(&skc->skc_lock);
+
+	taskq_cancel_id(spl_kmem_cache_taskq, id);
+
+	/*
+	 * Wait until all current callers complete, this is mainly
+	 * to catch the case where a low memory situation triggers a
+	 * cache reaping action which races with this destroy.
+	 */
+	wait_event(wq, atomic_read(&skc->skc_ref) == 0);
+
+	if (skc->skc_flags & KMC_KVMEM) {
+		spl_magazine_destroy(skc);
+		spl_slab_reclaim(skc);
+	} else {
+		ASSERT(skc->skc_flags & KMC_SLAB);
+		kmem_cache_destroy(skc->skc_linux_cache);
+	}
+
+	spin_lock(&skc->skc_lock);
+
+	/*
+	 * Validate there are no objects in use and free all the
+	 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
+	 */
+	ASSERT3U(skc->skc_slab_alloc, ==, 0);
+	ASSERT3U(skc->skc_obj_alloc, ==, 0);
+	ASSERT3U(skc->skc_slab_total, ==, 0);
+	ASSERT3U(skc->skc_obj_total, ==, 0);
+	ASSERT3U(skc->skc_obj_emergency, ==, 0);
+	ASSERT(list_empty(&skc->skc_complete_list));
+
+	ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0);
+	percpu_counter_destroy(&skc->skc_linux_alloc);
+
+	spin_unlock(&skc->skc_lock);
+
+	kfree(skc->skc_name);
+	kfree(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_destroy);
+
+/*
+ * Allocate an object from a slab attached to the cache.  This is used to
+ * repopulate the per-cpu magazine caches in batches when they run low.
+ */
+static void *
+spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
+{
+	spl_kmem_obj_t *sko;
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(sks->sks_magic == SKS_MAGIC);
+
+	sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
+	ASSERT(sko->sko_magic == SKO_MAGIC);
+	ASSERT(sko->sko_addr != NULL);
+
+	/* Remove from sks_free_list */
+	list_del_init(&sko->sko_list);
+
+	sks->sks_age = jiffies;
+	sks->sks_ref++;
+	skc->skc_obj_alloc++;
+
+	/* Track max obj usage statistics */
+	if (skc->skc_obj_alloc > skc->skc_obj_max)
+		skc->skc_obj_max = skc->skc_obj_alloc;
+
+	/* Track max slab usage statistics */
+	if (sks->sks_ref == 1) {
+		skc->skc_slab_alloc++;
+
+		if (skc->skc_slab_alloc > skc->skc_slab_max)
+			skc->skc_slab_max = skc->skc_slab_alloc;
+	}
+
+	return (sko->sko_addr);
+}
+
+/*
+ * Generic slab allocation function to run by the global work queues.
+ * It is responsible for allocating a new slab, linking it in to the list
+ * of partial slabs, and then waking any waiters.
+ */
+static int
+__spl_cache_grow(spl_kmem_cache_t *skc, int flags)
+{
+	spl_kmem_slab_t *sks;
+
+	fstrans_cookie_t cookie = spl_fstrans_mark();
+	sks = spl_slab_alloc(skc, flags);
+	spl_fstrans_unmark(cookie);
+
+	spin_lock(&skc->skc_lock);
+	if (sks) {
+		skc->skc_slab_total++;
+		skc->skc_obj_total += sks->sks_objs;
+		list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+
+		smp_mb__before_atomic();
+		clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+		smp_mb__after_atomic();
+	}
+	spin_unlock(&skc->skc_lock);
+
+	return (sks == NULL ? -ENOMEM : 0);
+}
+
+static void
+spl_cache_grow_work(void *data)
+{
+	spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
+	spl_kmem_cache_t *skc = ska->ska_cache;
+
+	int error = __spl_cache_grow(skc, ska->ska_flags);
+
+	atomic_dec(&skc->skc_ref);
+	smp_mb__before_atomic();
+	clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+	smp_mb__after_atomic();
+	if (error == 0)
+		wake_up_all(&skc->skc_waitq);
+
+	kfree(ska);
+}
+
+/*
+ * Returns non-zero when a new slab should be available.
+ */
+static int
+spl_cache_grow_wait(spl_kmem_cache_t *skc)
+{
+	return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags));
+}
+
+/*
+ * No available objects on any slabs, create a new slab.  Note that this
+ * functionality is disabled for KMC_SLAB caches which are backed by the
+ * Linux slab.
+ */
+static int
+spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+	int remaining, rc = 0;
+
+	ASSERT0(flags & ~KM_PUBLIC_MASK);
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+	might_sleep();
+	*obj = NULL;
+
+	/*
+	 * Before allocating a new slab wait for any reaping to complete and
+	 * then return so the local magazine can be rechecked for new objects.
+	 */
+	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+		rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
+		    TASK_UNINTERRUPTIBLE);
+		return (rc ? rc : -EAGAIN);
+	}
+
+	/*
+	 * Note: It would be nice to reduce the overhead of context switch
+	 * and improve NUMA locality, by trying to allocate a new slab in the
+	 * current process context with KM_NOSLEEP flag.
+	 *
+	 * However, this can't be applied to vmem/kvmem due to a bug that
+	 * spl_vmalloc() doesn't honor gfp flags in page table allocation.
+	 */
+
+	/*
+	 * This is handled by dispatching a work request to the global work
+	 * queue.  This allows us to asynchronously allocate a new slab while
+	 * retaining the ability to safely fall back to a smaller synchronous
+	 * allocations to ensure forward progress is always maintained.
+	 */
+	if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
+		spl_kmem_alloc_t *ska;
+
+		ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
+		if (ska == NULL) {
+			clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags);
+			smp_mb__after_atomic();
+			wake_up_all(&skc->skc_waitq);
+			return (-ENOMEM);
+		}
+
+		atomic_inc(&skc->skc_ref);
+		ska->ska_cache = skc;
+		ska->ska_flags = flags;
+		taskq_init_ent(&ska->ska_tqe);
+		taskq_dispatch_ent(spl_kmem_cache_taskq,
+		    spl_cache_grow_work, ska, 0, &ska->ska_tqe);
+	}
+
+	/*
+	 * The goal here is to only detect the rare case where a virtual slab
+	 * allocation has deadlocked.  We must be careful to minimize the use
+	 * of emergency objects which are more expensive to track.  Therefore,
+	 * we set a very long timeout for the asynchronous allocation and if
+	 * the timeout is reached the cache is flagged as deadlocked.  From
+	 * this point only new emergency objects will be allocated until the
+	 * asynchronous allocation completes and clears the deadlocked flag.
+	 */
+	if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
+		rc = spl_emergency_alloc(skc, flags, obj);
+	} else {
+		remaining = wait_event_timeout(skc->skc_waitq,
+		    spl_cache_grow_wait(skc), HZ / 10);
+
+		if (!remaining) {
+			spin_lock(&skc->skc_lock);
+			if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
+				set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+				skc->skc_obj_deadlock++;
+			}
+			spin_unlock(&skc->skc_lock);
+		}
+
+		rc = -ENOMEM;
+	}
+
+	return (rc);
+}
+
+/*
+ * Refill a per-cpu magazine with objects from the slabs for this cache.
+ * Ideally the magazine can be repopulated using existing objects which have
+ * been released, however if we are unable to locate enough free objects new
+ * slabs of objects will be created.  On success NULL is returned, otherwise
+ * the address of a single emergency object is returned for use by the caller.
+ */
+static void *
+spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
+{
+	spl_kmem_slab_t *sks;
+	int count = 0, rc, refill;
+	void *obj = NULL;
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(skm->skm_magic == SKM_MAGIC);
+
+	refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
+	spin_lock(&skc->skc_lock);
+
+	while (refill > 0) {
+		/* No slabs available we may need to grow the cache */
+		if (list_empty(&skc->skc_partial_list)) {
+			spin_unlock(&skc->skc_lock);
+
+			local_irq_enable();
+			rc = spl_cache_grow(skc, flags, &obj);
+			local_irq_disable();
+
+			/* Emergency object for immediate use by caller */
+			if (rc == 0 && obj != NULL)
+				return (obj);
+
+			if (rc)
+				goto out;
+
+			/* Rescheduled to different CPU skm is not local */
+			if (skm != skc->skc_mag[smp_processor_id()])
+				goto out;
+
+			/*
+			 * Potentially rescheduled to the same CPU but
+			 * allocations may have occurred from this CPU while
+			 * we were sleeping so recalculate max refill.
+			 */
+			refill = MIN(refill, skm->skm_size - skm->skm_avail);
+
+			spin_lock(&skc->skc_lock);
+			continue;
+		}
+
+		/* Grab the next available slab */
+		sks = list_entry((&skc->skc_partial_list)->next,
+		    spl_kmem_slab_t, sks_list);
+		ASSERT(sks->sks_magic == SKS_MAGIC);
+		ASSERT(sks->sks_ref < sks->sks_objs);
+		ASSERT(!list_empty(&sks->sks_free_list));
+
+		/*
+		 * Consume as many objects as needed to refill the requested
+		 * cache.  We must also be careful not to overfill it.
+		 */
+		while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
+		    ++count) {
+			ASSERT(skm->skm_avail < skm->skm_size);
+			ASSERT(count < skm->skm_size);
+			skm->skm_objs[skm->skm_avail++] =
+			    spl_cache_obj(skc, sks);
+		}
+
+		/* Move slab to skc_complete_list when full */
+		if (sks->sks_ref == sks->sks_objs) {
+			list_del(&sks->sks_list);
+			list_add(&sks->sks_list, &skc->skc_complete_list);
+		}
+	}
+
+	spin_unlock(&skc->skc_lock);
+out:
+	return (NULL);
+}
+
+/*
+ * Release an object back to the slab from which it came.
+ */
+static void
+spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
+{
+	spl_kmem_slab_t *sks = NULL;
+	spl_kmem_obj_t *sko = NULL;
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+
+	sko = spl_sko_from_obj(skc, obj);
+	ASSERT(sko->sko_magic == SKO_MAGIC);
+	sks = sko->sko_slab;
+	ASSERT(sks->sks_magic == SKS_MAGIC);
+	ASSERT(sks->sks_cache == skc);
+	list_add(&sko->sko_list, &sks->sks_free_list);
+
+	sks->sks_age = jiffies;
+	sks->sks_ref--;
+	skc->skc_obj_alloc--;
+
+	/*
+	 * Move slab to skc_partial_list when no longer full.  Slabs
+	 * are added to the head to keep the partial list is quasi-full
+	 * sorted order.  Fuller at the head, emptier at the tail.
+	 */
+	if (sks->sks_ref == (sks->sks_objs - 1)) {
+		list_del(&sks->sks_list);
+		list_add(&sks->sks_list, &skc->skc_partial_list);
+	}
+
+	/*
+	 * Move empty slabs to the end of the partial list so
+	 * they can be easily found and freed during reclamation.
+	 */
+	if (sks->sks_ref == 0) {
+		list_del(&sks->sks_list);
+		list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+		skc->skc_slab_alloc--;
+	}
+}
+
+/*
+ * Allocate an object from the per-cpu magazine, or if the magazine
+ * is empty directly allocate from a slab and repopulate the magazine.
+ */
+void *
+spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
+{
+	spl_kmem_magazine_t *skm;
+	void *obj = NULL;
+
+	ASSERT0(flags & ~KM_PUBLIC_MASK);
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+	/*
+	 * Allocate directly from a Linux slab.  All optimizations are left
+	 * to the underlying cache we only need to guarantee that KM_SLEEP
+	 * callers will never fail.
+	 */
+	if (skc->skc_flags & KMC_SLAB) {
+		struct kmem_cache *slc = skc->skc_linux_cache;
+		do {
+			obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
+		} while ((obj == NULL) && !(flags & KM_NOSLEEP));
+
+		if (obj != NULL) {
+			/*
+			 * Even though we leave everything up to the
+			 * underlying cache we still keep track of
+			 * how many objects we've allocated in it for
+			 * better debuggability.
+			 */
+			percpu_counter_inc(&skc->skc_linux_alloc);
+		}
+		goto ret;
+	}
+
+	local_irq_disable();
+
+restart:
+	/*
+	 * Safe to update per-cpu structure without lock, but
+	 * in the restart case we must be careful to reacquire
+	 * the local magazine since this may have changed
+	 * when we need to grow the cache.
+	 */
+	skm = skc->skc_mag[smp_processor_id()];
+	ASSERT(skm->skm_magic == SKM_MAGIC);
+
+	if (likely(skm->skm_avail)) {
+		/* Object available in CPU cache, use it */
+		obj = skm->skm_objs[--skm->skm_avail];
+	} else {
+		obj = spl_cache_refill(skc, skm, flags);
+		if ((obj == NULL) && !(flags & KM_NOSLEEP))
+			goto restart;
+
+		local_irq_enable();
+		goto ret;
+	}
+
+	local_irq_enable();
+	ASSERT(obj);
+	ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+
+ret:
+	/* Pre-emptively migrate object to CPU L1 cache */
+	if (obj) {
+		if (obj && skc->skc_ctor)
+			skc->skc_ctor(obj, skc->skc_private, flags);
+		else
+			prefetchw(obj);
+	}
+
+	return (obj);
+}
+EXPORT_SYMBOL(spl_kmem_cache_alloc);
+
+/*
+ * Free an object back to the local per-cpu magazine, there is no
+ * guarantee that this is the same magazine the object was originally
+ * allocated from.  We may need to flush entire from the magazine
+ * back to the slabs to make space.
+ */
+void
+spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
+{
+	spl_kmem_magazine_t *skm;
+	unsigned long flags;
+	int do_reclaim = 0;
+	int do_emergency = 0;
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+	/*
+	 * Run the destructor
+	 */
+	if (skc->skc_dtor)
+		skc->skc_dtor(obj, skc->skc_private);
+
+	/*
+	 * Free the object from the Linux underlying Linux slab.
+	 */
+	if (skc->skc_flags & KMC_SLAB) {
+		kmem_cache_free(skc->skc_linux_cache, obj);
+		percpu_counter_dec(&skc->skc_linux_alloc);
+		return;
+	}
+
+	/*
+	 * While a cache has outstanding emergency objects all freed objects
+	 * must be checked.  However, since emergency objects will never use
+	 * a virtual address these objects can be safely excluded as an
+	 * optimization.
+	 */
+	if (!is_vmalloc_addr(obj)) {
+		spin_lock(&skc->skc_lock);
+		do_emergency = (skc->skc_obj_emergency > 0);
+		spin_unlock(&skc->skc_lock);
+
+		if (do_emergency && (spl_emergency_free(skc, obj) == 0))
+			return;
+	}
+
+	local_irq_save(flags);
+
+	/*
+	 * Safe to update per-cpu structure without lock, but
+	 * no remote memory allocation tracking is being performed
+	 * it is entirely possible to allocate an object from one
+	 * CPU cache and return it to another.
+	 */
+	skm = skc->skc_mag[smp_processor_id()];
+	ASSERT(skm->skm_magic == SKM_MAGIC);
+
+	/*
+	 * Per-CPU cache full, flush it to make space for this object,
+	 * this may result in an empty slab which can be reclaimed once
+	 * interrupts are re-enabled.
+	 */
+	if (unlikely(skm->skm_avail >= skm->skm_size)) {
+		spl_cache_flush(skc, skm, skm->skm_refill);
+		do_reclaim = 1;
+	}
+
+	/* Available space in cache, use it */
+	skm->skm_objs[skm->skm_avail++] = obj;
+
+	local_irq_restore(flags);
+
+	if (do_reclaim)
+		spl_slab_reclaim(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_free);
+
+/*
+ * Depending on how many and which objects are released it may simply
+ * repopulate the local magazine which will then need to age-out.  Objects
+ * which cannot fit in the magazine will be released back to their slabs
+ * which will also need to age out before being released.  This is all just
+ * best effort and we do not want to thrash creating and destroying slabs.
+ */
+void
+spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
+{
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+	if (skc->skc_flags & KMC_SLAB)
+		return;
+
+	atomic_inc(&skc->skc_ref);
+
+	/*
+	 * Prevent concurrent cache reaping when contended.
+	 */
+	if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
+		goto out;
+
+	/* Reclaim from the magazine and free all now empty slabs. */
+	unsigned long irq_flags;
+	local_irq_save(irq_flags);
+	spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
+	spl_cache_flush(skc, skm, skm->skm_avail);
+	local_irq_restore(irq_flags);
+
+	spl_slab_reclaim(skc);
+	clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
+out:
+	atomic_dec(&skc->skc_ref);
+}
+EXPORT_SYMBOL(spl_kmem_cache_reap_now);
+
+/*
+ * This is stubbed out for code consistency with other platforms.  There
+ * is existing logic to prevent concurrent reaping so while this is ugly
+ * it should do no harm.
+ */
+int
+spl_kmem_cache_reap_active()
+{
+	return (0);
+}
+EXPORT_SYMBOL(spl_kmem_cache_reap_active);
+
+/*
+ * Reap all free slabs from all registered caches.
+ */
+void
+spl_kmem_reap(void)
+{
+	spl_kmem_cache_t *skc = NULL;
+
+	down_read(&spl_kmem_cache_sem);
+	list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+		spl_kmem_cache_reap_now(skc);
+	}
+	up_read(&spl_kmem_cache_sem);
+}
+EXPORT_SYMBOL(spl_kmem_reap);
+
+int
+spl_kmem_cache_init(void)
+{
+	init_rwsem(&spl_kmem_cache_sem);
+	INIT_LIST_HEAD(&spl_kmem_cache_list);
+	spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
+	    spl_kmem_cache_kmem_threads, maxclsyspri,
+	    spl_kmem_cache_kmem_threads * 8, INT_MAX,
+	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
+	return (0);
+}
+
+void
+spl_kmem_cache_fini(void)
+{
+	taskq_destroy(spl_kmem_cache_taskq);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
new file mode 100644
index 000000000000..f19421cfcc03
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
@@ -0,0 +1,618 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/vmem.h>
+
+/*
+ * As a general rule kmem_alloc() allocations should be small, preferably
+ * just a few pages since they must by physically contiguous.  Therefore, a
+ * rate limited warning will be printed to the console for any kmem_alloc()
+ * which exceeds a reasonable threshold.
+ *
+ * The default warning threshold is set to sixteen pages but capped at 64K to
+ * accommodate systems using large pages.  This value was selected to be small
+ * enough to ensure the largest allocations are quickly noticed and fixed.
+ * But large enough to avoid logging any warnings when a allocation size is
+ * larger than optimal but not a serious concern.  Since this value is tunable,
+ * developers are encouraged to set it lower when testing so any new largish
+ * allocations are quickly caught.  These warnings may be disabled by setting
+ * the threshold to zero.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
+module_param(spl_kmem_alloc_warn, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_warn,
+	"Warning threshold in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_warn);
+
+/*
+ * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
+ * Allocations which are marginally smaller than this limit may succeed but
+ * should still be avoided due to the expense of locating a contiguous range
+ * of free pages.  Therefore, a maximum kmem size with reasonable safely
+ * margin of 4x is set.  Kmem_alloc() allocations larger than this maximum
+ * will quickly fail.  Vmem_alloc() allocations less than or equal to this
+ * value will use kmalloc(), but shift to vmalloc() when exceeding this value.
+ */
+unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
+module_param(spl_kmem_alloc_max, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_max,
+	"Maximum size in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_max);
+/* END CSTYLED */
+
+int
+kmem_debugging(void)
+{
+	return (0);
+}
+EXPORT_SYMBOL(kmem_debugging);
+
+char *
+kmem_vasprintf(const char *fmt, va_list ap)
+{
+	va_list aq;
+	char *ptr;
+
+	do {
+		va_copy(aq, ap);
+		ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq);
+		va_end(aq);
+	} while (ptr == NULL);
+
+	return (ptr);
+}
+EXPORT_SYMBOL(kmem_vasprintf);
+
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+	va_list ap;
+	char *ptr;
+
+	do {
+		va_start(ap, fmt);
+		ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
+		va_end(ap);
+	} while (ptr == NULL);
+
+	return (ptr);
+}
+EXPORT_SYMBOL(kmem_asprintf);
+
+static char *
+__strdup(const char *str, int flags)
+{
+	char *ptr;
+	int n;
+
+	n = strlen(str);
+	ptr = kmalloc(n + 1, kmem_flags_convert(flags));
+	if (ptr)
+		memcpy(ptr, str, n + 1);
+
+	return (ptr);
+}
+
+char *
+kmem_strdup(const char *str)
+{
+	return (__strdup(str, KM_SLEEP));
+}
+EXPORT_SYMBOL(kmem_strdup);
+
+void
+kmem_strfree(char *str)
+{
+	kfree(str);
+}
+EXPORT_SYMBOL(kmem_strfree);
+
+void *
+spl_kvmalloc(size_t size, gfp_t lflags)
+{
+#ifdef HAVE_KVMALLOC
+	/*
+	 * GFP_KERNEL allocations can safely use kvmalloc which may
+	 * improve performance by avoiding a) high latency caused by
+	 * vmalloc's on-access allocation, b) performance loss due to
+	 * MMU memory address mapping and c) vmalloc locking overhead.
+	 * This has the side-effect that the slab statistics will
+	 * incorrectly report this as a vmem allocation, but that is
+	 * purely cosmetic.
+	 */
+	if ((lflags & GFP_KERNEL) == GFP_KERNEL)
+		return (kvmalloc(size, lflags));
+#endif
+
+	gfp_t kmalloc_lflags = lflags;
+
+	if (size > PAGE_SIZE) {
+		/*
+		 * We need to set __GFP_NOWARN here since spl_kvmalloc is not
+		 * only called by spl_kmem_alloc_impl but can be called
+		 * directly with custom lflags, too. In that case
+		 * kmem_flags_convert does not get called, which would
+		 * implicitly set __GFP_NOWARN.
+		 */
+		kmalloc_lflags |= __GFP_NOWARN;
+
+		/*
+		 * N.B. __GFP_RETRY_MAYFAIL is supported only for large
+		 * e (>32kB) allocations.
+		 *
+		 * We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY
+		 * for !costly requests because there is no other way to tell
+		 * the allocator that we want to fail rather than retry
+		 * endlessly.
+		 */
+		if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) ||
+		    (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+			kmalloc_lflags |= __GFP_NORETRY;
+		}
+	}
+
+	/*
+	 * We first try kmalloc - even for big sizes - and fall back to
+	 * spl_vmalloc if that fails.
+	 *
+	 * For non-__GFP-RECLAIM allocations we always stick to
+	 * kmalloc_node, and fail when kmalloc is not successful (returns
+	 * NULL).
+	 * We cannot fall back to spl_vmalloc in this case because spl_vmalloc
+	 * internally uses GPF_KERNEL allocations.
+	 */
+	void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE);
+	if (ptr || size <= PAGE_SIZE ||
+	    (lflags & __GFP_RECLAIM) != __GFP_RECLAIM) {
+		return (ptr);
+	}
+
+	return (spl_vmalloc(size, lflags | __GFP_HIGHMEM));
+}
+
+/*
+ * General purpose unified implementation of kmem_alloc(). It is an
+ * amalgamation of Linux and Illumos allocator design. It should never be
+ * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
+ * relatively portable.  Consumers may only access this function through
+ * wrappers that enforce the common flags to ensure portability.
+ */
+inline void *
+spl_kmem_alloc_impl(size_t size, int flags, int node)
+{
+	gfp_t lflags = kmem_flags_convert(flags);
+	void *ptr;
+
+	/*
+	 * Log abnormally large allocations and rate limit the console output.
+	 * Allocations larger than spl_kmem_alloc_warn should be performed
+	 * through the vmem_alloc()/vmem_zalloc() interfaces.
+	 */
+	if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
+	    !(flags & KM_VMEM)) {
+		printk(KERN_WARNING
+		    "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
+		    "https://github.com/zfsonlinux/zfs/issues/new\n",
+		    (unsigned long)size, flags);
+		dump_stack();
+	}
+
+	/*
+	 * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
+	 * unlike kmem_alloc() with KM_SLEEP on Illumos.
+	 */
+	do {
+		/*
+		 * Calling kmalloc_node() when the size >= spl_kmem_alloc_max
+		 * is unsafe.  This must fail for all for kmem_alloc() and
+		 * kmem_zalloc() callers.
+		 *
+		 * For vmem_alloc() and vmem_zalloc() callers it is permissible
+		 * to use spl_vmalloc().  However, in general use of
+		 * spl_vmalloc() is strongly discouraged because a global lock
+		 * must be acquired.  Contention on this lock can significantly
+		 * impact performance so frequently manipulating the virtual
+		 * address space is strongly discouraged.
+		 */
+		if (size > spl_kmem_alloc_max) {
+			if (flags & KM_VMEM) {
+				ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
+			} else {
+				return (NULL);
+			}
+		} else {
+			if (flags & KM_VMEM) {
+				ptr = spl_kvmalloc(size, lflags);
+			} else {
+				ptr = kmalloc_node(size, lflags, node);
+			}
+		}
+
+		if (likely(ptr) || (flags & KM_NOSLEEP))
+			return (ptr);
+
+		/*
+		 * Try hard to satisfy the allocation. However, when progress
+		 * cannot be made, the allocation is allowed to fail.
+		 */
+		if ((lflags & GFP_KERNEL) == GFP_KERNEL)
+			lflags |= __GFP_RETRY_MAYFAIL;
+
+		/*
+		 * Use cond_resched() instead of congestion_wait() to avoid
+		 * deadlocking systems where there are no block devices.
+		 */
+		cond_resched();
+	} while (1);
+
+	return (NULL);
+}
+
+inline void
+spl_kmem_free_impl(const void *buf, size_t size)
+{
+	if (is_vmalloc_addr(buf))
+		vfree(buf);
+	else
+		kfree(buf);
+}
+
+/*
+ * Memory allocation and accounting for kmem_* * style allocations.  When
+ * DEBUG_KMEM is enabled the total memory allocated will be tracked and
+ * any memory leaked will be reported during module unload.
+ *
+ * ./configure --enable-debug-kmem
+ */
+#ifdef DEBUG_KMEM
+
+/* Shim layer memory accounting */
+#ifdef HAVE_ATOMIC64_T
+atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
+unsigned long long kmem_alloc_max = 0;
+#else  /* HAVE_ATOMIC64_T */
+atomic_t kmem_alloc_used = ATOMIC_INIT(0);
+unsigned long long kmem_alloc_max = 0;
+#endif /* HAVE_ATOMIC64_T */
+
+EXPORT_SYMBOL(kmem_alloc_used);
+EXPORT_SYMBOL(kmem_alloc_max);
+
+inline void *
+spl_kmem_alloc_debug(size_t size, int flags, int node)
+{
+	void *ptr;
+
+	ptr = spl_kmem_alloc_impl(size, flags, node);
+	if (ptr) {
+		kmem_alloc_used_add(size);
+		if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
+			kmem_alloc_max = kmem_alloc_used_read();
+	}
+
+	return (ptr);
+}
+
+inline void
+spl_kmem_free_debug(const void *ptr, size_t size)
+{
+	kmem_alloc_used_sub(size);
+	spl_kmem_free_impl(ptr, size);
+}
+
+/*
+ * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
+ * but also the location of every alloc and free.  When the SPL module is
+ * unloaded a list of all leaked addresses and where they were allocated
+ * will be dumped to the console.  Enabling this feature has a significant
+ * impact on performance but it makes finding memory leaks straight forward.
+ *
+ * Not surprisingly with debugging enabled the xmem_locks are very highly
+ * contended particularly on xfree().  If we want to run with this detailed
+ * debugging enabled for anything other than debugging  we need to minimize
+ * the contention by moving to a lock per xmem_table entry model.
+ *
+ * ./configure --enable-debug-kmem-tracking
+ */
+#ifdef DEBUG_KMEM_TRACKING
+
+#include <linux/hash.h>
+#include <linux/ctype.h>
+
+#define	KMEM_HASH_BITS		10
+#define	KMEM_TABLE_SIZE		(1 << KMEM_HASH_BITS)
+
+typedef struct kmem_debug {
+	struct hlist_node kd_hlist;	/* Hash node linkage */
+	struct list_head kd_list;	/* List of all allocations */
+	void *kd_addr;			/* Allocation pointer */
+	size_t kd_size;			/* Allocation size */
+	const char *kd_func;		/* Allocation function */
+	int kd_line;			/* Allocation line */
+} kmem_debug_t;
+
+static spinlock_t kmem_lock;
+static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
+static struct list_head kmem_list;
+
+static kmem_debug_t *
+kmem_del_init(spinlock_t *lock, struct hlist_head *table,
+    int bits, const void *addr)
+{
+	struct hlist_head *head;
+	struct hlist_node *node = NULL;
+	struct kmem_debug *p;
+	unsigned long flags;
+
+	spin_lock_irqsave(lock, flags);
+
+	head = &table[hash_ptr((void *)addr, bits)];
+	hlist_for_each(node, head) {
+		p = list_entry(node, struct kmem_debug, kd_hlist);
+		if (p->kd_addr == addr) {
+			hlist_del_init(&p->kd_hlist);
+			list_del_init(&p->kd_list);
+			spin_unlock_irqrestore(lock, flags);
+			return (p);
+		}
+	}
+
+	spin_unlock_irqrestore(lock, flags);
+
+	return (NULL);
+}
+
+inline void *
+spl_kmem_alloc_track(size_t size, int flags,
+    const char *func, int line, int node)
+{
+	void *ptr = NULL;
+	kmem_debug_t *dptr;
+	unsigned long irq_flags;
+
+	dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
+	if (dptr == NULL)
+		return (NULL);
+
+	dptr->kd_func = __strdup(func, flags);
+	if (dptr->kd_func == NULL) {
+		kfree(dptr);
+		return (NULL);
+	}
+
+	ptr = spl_kmem_alloc_debug(size, flags, node);
+	if (ptr == NULL) {
+		kfree(dptr->kd_func);
+		kfree(dptr);
+		return (NULL);
+	}
+
+	INIT_HLIST_NODE(&dptr->kd_hlist);
+	INIT_LIST_HEAD(&dptr->kd_list);
+
+	dptr->kd_addr = ptr;
+	dptr->kd_size = size;
+	dptr->kd_line = line;
+
+	spin_lock_irqsave(&kmem_lock, irq_flags);
+	hlist_add_head(&dptr->kd_hlist,
+	    &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
+	list_add_tail(&dptr->kd_list, &kmem_list);
+	spin_unlock_irqrestore(&kmem_lock, irq_flags);
+
+	return (ptr);
+}
+
+inline void
+spl_kmem_free_track(const void *ptr, size_t size)
+{
+	kmem_debug_t *dptr;
+
+	/* Ignore NULL pointer since we haven't tracked it at all */
+	if (ptr == NULL)
+		return;
+
+	/* Must exist in hash due to kmem_alloc() */
+	dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
+	ASSERT3P(dptr, !=, NULL);
+	ASSERT3S(dptr->kd_size, ==, size);
+
+	kfree(dptr->kd_func);
+	kfree(dptr);
+
+	spl_kmem_free_debug(ptr, size);
+}
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+
+/*
+ * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
+ */
+void *
+spl_kmem_alloc(size_t size, int flags, const char *func, int line)
+{
+	ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+#if !defined(DEBUG_KMEM)
+	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_alloc);
+
+void *
+spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+	ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+	flags |= KM_ZERO;
+
+#if !defined(DEBUG_KMEM)
+	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_zalloc);
+
+void
+spl_kmem_free(const void *buf, size_t size)
+{
+#if !defined(DEBUG_KMEM)
+	return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+	return (spl_kmem_free_debug(buf, size));
+#else
+	return (spl_kmem_free_track(buf, size));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_free);
+
+#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
+static char *
+spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
+{
+	int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
+	int i, flag = 1;
+
+	ASSERT(str != NULL && len >= 17);
+	memset(str, 0, len);
+
+	/*
+	 * Check for a fully printable string, and while we are at
+	 * it place the printable characters in the passed buffer.
+	 */
+	for (i = 0; i < size; i++) {
+		str[i] = ((char *)(kd->kd_addr))[i];
+		if (isprint(str[i])) {
+			continue;
+		} else {
+			/*
+			 * Minimum number of printable characters found
+			 * to make it worthwhile to print this as ascii.
+			 */
+			if (i > min)
+				break;
+
+			flag = 0;
+			break;
+		}
+	}
+
+	if (!flag) {
+		sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
+		    *((uint8_t *)kd->kd_addr),
+		    *((uint8_t *)kd->kd_addr + 2),
+		    *((uint8_t *)kd->kd_addr + 4),
+		    *((uint8_t *)kd->kd_addr + 6),
+		    *((uint8_t *)kd->kd_addr + 8),
+		    *((uint8_t *)kd->kd_addr + 10),
+		    *((uint8_t *)kd->kd_addr + 12),
+		    *((uint8_t *)kd->kd_addr + 14));
+	}
+
+	return (str);
+}
+
+static int
+spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
+{
+	int i;
+
+	spin_lock_init(lock);
+	INIT_LIST_HEAD(list);
+
+	for (i = 0; i < size; i++)
+		INIT_HLIST_HEAD(&kmem_table[i]);
+
+	return (0);
+}
+
+static void
+spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
+{
+	unsigned long flags;
+	kmem_debug_t *kd = NULL;
+	char str[17];
+
+	spin_lock_irqsave(lock, flags);
+	if (!list_empty(list))
+		printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
+		    "size", "data", "func", "line");
+
+	list_for_each_entry(kd, list, kd_list) {
+		printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
+		    (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
+		    kd->kd_func, kd->kd_line);
+	}
+
+	spin_unlock_irqrestore(lock, flags);
+}
+#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
+
+int
+spl_kmem_init(void)
+{
+
+#ifdef DEBUG_KMEM
+	kmem_alloc_used_set(0);
+
+
+
+#ifdef DEBUG_KMEM_TRACKING
+	spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+
+	return (0);
+}
+
+void
+spl_kmem_fini(void)
+{
+#ifdef DEBUG_KMEM
+	/*
+	 * Display all unreclaimed memory addresses, including the
+	 * allocation size and the first few bytes of what's located
+	 * at that address to aid in debugging.  Performance is not
+	 * a serious concern here since it is module unload time.
+	 */
+	if (kmem_alloc_used_read() != 0)
+		printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
+		    (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
+
+#ifdef DEBUG_KMEM_TRACKING
+	spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
new file mode 100644
index 000000000000..b971b4498cc6
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
@@ -0,0 +1,778 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Solaris Porting Layer (SPL) Kstat Implementation.
+ */
+
+#include <linux/seq_file.h>
+#include <sys/kstat.h>
+#include <sys/vmem.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+
+static kmutex_t kstat_module_lock;
+static struct list_head kstat_module_list;
+static kid_t kstat_id;
+
+static int
+kstat_resize_raw(kstat_t *ksp)
+{
+	if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX)
+		return (ENOMEM);
+
+	vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+	ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX);
+	ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP);
+
+	return (0);
+}
+
+void
+kstat_waitq_enter(kstat_io_t *kiop)
+{
+	hrtime_t new, delta;
+	ulong_t wcnt;
+
+	new = gethrtime();
+	delta = new - kiop->wlastupdate;
+	kiop->wlastupdate = new;
+	wcnt = kiop->wcnt++;
+	if (wcnt != 0) {
+		kiop->wlentime += delta * wcnt;
+		kiop->wtime += delta;
+	}
+}
+EXPORT_SYMBOL(kstat_waitq_enter);
+
+void
+kstat_waitq_exit(kstat_io_t *kiop)
+{
+	hrtime_t new, delta;
+	ulong_t wcnt;
+
+	new = gethrtime();
+	delta = new - kiop->wlastupdate;
+	kiop->wlastupdate = new;
+	wcnt = kiop->wcnt--;
+	ASSERT((int)wcnt > 0);
+	kiop->wlentime += delta * wcnt;
+	kiop->wtime += delta;
+}
+EXPORT_SYMBOL(kstat_waitq_exit);
+
+void
+kstat_runq_enter(kstat_io_t *kiop)
+{
+	hrtime_t new, delta;
+	ulong_t rcnt;
+
+	new = gethrtime();
+	delta = new - kiop->rlastupdate;
+	kiop->rlastupdate = new;
+	rcnt = kiop->rcnt++;
+	if (rcnt != 0) {
+		kiop->rlentime += delta * rcnt;
+		kiop->rtime += delta;
+	}
+}
+EXPORT_SYMBOL(kstat_runq_enter);
+
+void
+kstat_runq_exit(kstat_io_t *kiop)
+{
+	hrtime_t new, delta;
+	ulong_t rcnt;
+
+	new = gethrtime();
+	delta = new - kiop->rlastupdate;
+	kiop->rlastupdate = new;
+	rcnt = kiop->rcnt--;
+	ASSERT((int)rcnt > 0);
+	kiop->rlentime += delta * rcnt;
+	kiop->rtime += delta;
+}
+EXPORT_SYMBOL(kstat_runq_exit);
+
+static int
+kstat_seq_show_headers(struct seq_file *f)
+{
+	kstat_t *ksp = (kstat_t *)f->private;
+	int rc = 0;
+
+	ASSERT(ksp->ks_magic == KS_MAGIC);
+
+	seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n",
+	    ksp->ks_kid, ksp->ks_type, ksp->ks_flags,
+	    ksp->ks_ndata, (int)ksp->ks_data_size,
+	    ksp->ks_crtime, ksp->ks_snaptime);
+
+	switch (ksp->ks_type) {
+		case KSTAT_TYPE_RAW:
+restart:
+			if (ksp->ks_raw_ops.headers) {
+				rc = ksp->ks_raw_ops.headers(
+				    ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+				if (rc == ENOMEM && !kstat_resize_raw(ksp))
+					goto restart;
+				if (!rc)
+					seq_puts(f, ksp->ks_raw_buf);
+			} else {
+				seq_printf(f, "raw data\n");
+			}
+			break;
+		case KSTAT_TYPE_NAMED:
+			seq_printf(f, "%-31s %-4s %s\n",
+			    "name", "type", "data");
+			break;
+		case KSTAT_TYPE_INTR:
+			seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n",
+			    "hard", "soft", "watchdog",
+			    "spurious", "multsvc");
+			break;
+		case KSTAT_TYPE_IO:
+			seq_printf(f,
+			    "%-8s %-8s %-8s %-8s %-8s %-8s "
+			    "%-8s %-8s %-8s %-8s %-8s %-8s\n",
+			    "nread", "nwritten", "reads", "writes",
+			    "wtime", "wlentime", "wupdate",
+			    "rtime", "rlentime", "rupdate",
+			    "wcnt", "rcnt");
+			break;
+		case KSTAT_TYPE_TIMER:
+			seq_printf(f,
+			    "%-31s %-8s "
+			    "%-8s %-8s %-8s %-8s %-8s\n",
+			    "name", "events", "elapsed",
+			    "min", "max", "start", "stop");
+			break;
+		default:
+			PANIC("Undefined kstat type %d\n", ksp->ks_type);
+	}
+
+	return (-rc);
+}
+
+static int
+kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l)
+{
+	int i, j;
+
+	for (i = 0; ; i++) {
+		seq_printf(f, "%03x:", i);
+
+		for (j = 0; j < 16; j++) {
+			if (i * 16 + j >= l) {
+				seq_printf(f, "\n");
+				goto out;
+			}
+
+			seq_printf(f, " %02x", (unsigned char)p[i * 16 + j]);
+		}
+		seq_printf(f, "\n");
+	}
+out:
+	return (0);
+}
+
+static int
+kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp)
+{
+	seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type);
+
+	switch (knp->data_type) {
+		case KSTAT_DATA_CHAR:
+			knp->value.c[15] = '\0'; /* NULL terminate */
+			seq_printf(f, "%-16s", knp->value.c);
+			break;
+		/*
+		 * NOTE - We need to be more careful able what tokens are
+		 * used for each arch, for now this is correct for x86_64.
+		 */
+		case KSTAT_DATA_INT32:
+			seq_printf(f, "%d", knp->value.i32);
+			break;
+		case KSTAT_DATA_UINT32:
+			seq_printf(f, "%u", knp->value.ui32);
+			break;
+		case KSTAT_DATA_INT64:
+			seq_printf(f, "%lld", (signed long long)knp->value.i64);
+			break;
+		case KSTAT_DATA_UINT64:
+			seq_printf(f, "%llu",
+			    (unsigned long long)knp->value.ui64);
+			break;
+		case KSTAT_DATA_LONG:
+			seq_printf(f, "%ld", knp->value.l);
+			break;
+		case KSTAT_DATA_ULONG:
+			seq_printf(f, "%lu", knp->value.ul);
+			break;
+		case KSTAT_DATA_STRING:
+			KSTAT_NAMED_STR_PTR(knp)
+				[KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0';
+			seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp));
+			break;
+		default:
+			PANIC("Undefined kstat data type %d\n", knp->data_type);
+	}
+
+	seq_printf(f, "\n");
+
+	return (0);
+}
+
+static int
+kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip)
+{
+	seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n",
+	    kip->intrs[KSTAT_INTR_HARD],
+	    kip->intrs[KSTAT_INTR_SOFT],
+	    kip->intrs[KSTAT_INTR_WATCHDOG],
+	    kip->intrs[KSTAT_INTR_SPURIOUS],
+	    kip->intrs[KSTAT_INTR_MULTSVC]);
+
+	return (0);
+}
+
+static int
+kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip)
+{
+	/* though wlentime & friends are signed, they will never be negative */
+	seq_printf(f,
+	    "%-8llu %-8llu %-8u %-8u %-8llu %-8llu "
+	    "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n",
+	    kip->nread, kip->nwritten,
+	    kip->reads, kip->writes,
+	    kip->wtime, kip->wlentime, kip->wlastupdate,
+	    kip->rtime, kip->rlentime, kip->rlastupdate,
+	    kip->wcnt,  kip->rcnt);
+
+	return (0);
+}
+
+static int
+kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp)
+{
+	seq_printf(f,
+	    "%-31s %-8llu %-8llu %-8llu %-8llu %-8llu %-8llu\n",
+	    ktp->name, ktp->num_events, ktp->elapsed_time,
+	    ktp->min_time, ktp->max_time,
+	    ktp->start_time, ktp->stop_time);
+
+	return (0);
+}
+
+static int
+kstat_seq_show(struct seq_file *f, void *p)
+{
+	kstat_t *ksp = (kstat_t *)f->private;
+	int rc = 0;
+
+	ASSERT(ksp->ks_magic == KS_MAGIC);
+
+	switch (ksp->ks_type) {
+		case KSTAT_TYPE_RAW:
+restart:
+			if (ksp->ks_raw_ops.data) {
+				rc = ksp->ks_raw_ops.data(
+				    ksp->ks_raw_buf, ksp->ks_raw_bufsize, p);
+				if (rc == ENOMEM && !kstat_resize_raw(ksp))
+					goto restart;
+				if (!rc)
+					seq_puts(f, ksp->ks_raw_buf);
+			} else {
+				ASSERT(ksp->ks_ndata == 1);
+				rc = kstat_seq_show_raw(f, ksp->ks_data,
+				    ksp->ks_data_size);
+			}
+			break;
+		case KSTAT_TYPE_NAMED:
+			rc = kstat_seq_show_named(f, (kstat_named_t *)p);
+			break;
+		case KSTAT_TYPE_INTR:
+			rc = kstat_seq_show_intr(f, (kstat_intr_t *)p);
+			break;
+		case KSTAT_TYPE_IO:
+			rc = kstat_seq_show_io(f, (kstat_io_t *)p);
+			break;
+		case KSTAT_TYPE_TIMER:
+			rc = kstat_seq_show_timer(f, (kstat_timer_t *)p);
+			break;
+		default:
+			PANIC("Undefined kstat type %d\n", ksp->ks_type);
+	}
+
+	return (-rc);
+}
+
+static int
+kstat_default_update(kstat_t *ksp, int rw)
+{
+	ASSERT(ksp != NULL);
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	return (0);
+}
+
+static void *
+kstat_seq_data_addr(kstat_t *ksp, loff_t n)
+{
+	void *rc = NULL;
+
+	switch (ksp->ks_type) {
+		case KSTAT_TYPE_RAW:
+			if (ksp->ks_raw_ops.addr)
+				rc = ksp->ks_raw_ops.addr(ksp, n);
+			else
+				rc = ksp->ks_data;
+			break;
+		case KSTAT_TYPE_NAMED:
+			rc = ksp->ks_data + n * sizeof (kstat_named_t);
+			break;
+		case KSTAT_TYPE_INTR:
+			rc = ksp->ks_data + n * sizeof (kstat_intr_t);
+			break;
+		case KSTAT_TYPE_IO:
+			rc = ksp->ks_data + n * sizeof (kstat_io_t);
+			break;
+		case KSTAT_TYPE_TIMER:
+			rc = ksp->ks_data + n * sizeof (kstat_timer_t);
+			break;
+		default:
+			PANIC("Undefined kstat type %d\n", ksp->ks_type);
+	}
+
+	return (rc);
+}
+
+static void *
+kstat_seq_start(struct seq_file *f, loff_t *pos)
+{
+	loff_t n = *pos;
+	kstat_t *ksp = (kstat_t *)f->private;
+	ASSERT(ksp->ks_magic == KS_MAGIC);
+
+	mutex_enter(ksp->ks_lock);
+
+	if (ksp->ks_type == KSTAT_TYPE_RAW) {
+		ksp->ks_raw_bufsize = PAGE_SIZE;
+		ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP);
+	}
+
+	/* Dynamically update kstat, on error existing kstats are used */
+	(void) ksp->ks_update(ksp, KSTAT_READ);
+
+	ksp->ks_snaptime = gethrtime();
+
+	if (!(ksp->ks_flags & KSTAT_FLAG_NO_HEADERS) && !n &&
+	    kstat_seq_show_headers(f))
+		return (NULL);
+
+	if (n >= ksp->ks_ndata)
+		return (NULL);
+
+	return (kstat_seq_data_addr(ksp, n));
+}
+
+static void *
+kstat_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+	kstat_t *ksp = (kstat_t *)f->private;
+	ASSERT(ksp->ks_magic == KS_MAGIC);
+
+	++*pos;
+	if (*pos >= ksp->ks_ndata)
+		return (NULL);
+
+	return (kstat_seq_data_addr(ksp, *pos));
+}
+
+static void
+kstat_seq_stop(struct seq_file *f, void *v)
+{
+	kstat_t *ksp = (kstat_t *)f->private;
+	ASSERT(ksp->ks_magic == KS_MAGIC);
+
+	if (ksp->ks_type == KSTAT_TYPE_RAW)
+		vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+
+	mutex_exit(ksp->ks_lock);
+}
+
+static struct seq_operations kstat_seq_ops = {
+	.show  = kstat_seq_show,
+	.start = kstat_seq_start,
+	.next  = kstat_seq_next,
+	.stop  = kstat_seq_stop,
+};
+
+static kstat_module_t *
+kstat_find_module(char *name)
+{
+	kstat_module_t *module = NULL;
+
+	list_for_each_entry(module, &kstat_module_list, ksm_module_list) {
+		if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0)
+			return (module);
+	}
+
+	return (NULL);
+}
+
+static kstat_module_t *
+kstat_create_module(char *name)
+{
+	kstat_module_t *module;
+	struct proc_dir_entry *pde;
+
+	pde = proc_mkdir(name, proc_spl_kstat);
+	if (pde == NULL)
+		return (NULL);
+
+	module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP);
+	module->ksm_proc = pde;
+	strlcpy(module->ksm_name, name, KSTAT_STRLEN+1);
+	INIT_LIST_HEAD(&module->ksm_kstat_list);
+	list_add_tail(&module->ksm_module_list, &kstat_module_list);
+
+	return (module);
+
+}
+
+static void
+kstat_delete_module(kstat_module_t *module)
+{
+	ASSERT(list_empty(&module->ksm_kstat_list));
+	remove_proc_entry(module->ksm_name, proc_spl_kstat);
+	list_del(&module->ksm_module_list);
+	kmem_free(module, sizeof (kstat_module_t));
+}
+
+static int
+proc_kstat_open(struct inode *inode, struct file *filp)
+{
+	struct seq_file *f;
+	int rc;
+
+	rc = seq_open(filp, &kstat_seq_ops);
+	if (rc)
+		return (rc);
+
+	f = filp->private_data;
+	f->private = PDE_DATA(inode);
+
+	return (rc);
+}
+
+static ssize_t
+proc_kstat_write(struct file *filp, const char __user *buf, size_t len,
+    loff_t *ppos)
+{
+	struct seq_file *f = filp->private_data;
+	kstat_t *ksp = f->private;
+	int rc;
+
+	ASSERT(ksp->ks_magic == KS_MAGIC);
+
+	mutex_enter(ksp->ks_lock);
+	rc = ksp->ks_update(ksp, KSTAT_WRITE);
+	mutex_exit(ksp->ks_lock);
+
+	if (rc)
+		return (-rc);
+
+	*ppos += len;
+	return (len);
+}
+
+static const kstat_proc_op_t proc_kstat_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+	.proc_open	= proc_kstat_open,
+	.proc_write	= proc_kstat_write,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= seq_release,
+#else
+	.open		= proc_kstat_open,
+	.write		= proc_kstat_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+#endif
+};
+
+void
+__kstat_set_raw_ops(kstat_t *ksp,
+    int (*headers)(char *buf, size_t size),
+    int (*data)(char *buf, size_t size, void *data),
+    void *(*addr)(kstat_t *ksp, loff_t index))
+{
+	ksp->ks_raw_ops.headers = headers;
+	ksp->ks_raw_ops.data    = data;
+	ksp->ks_raw_ops.addr    = addr;
+}
+EXPORT_SYMBOL(__kstat_set_raw_ops);
+
+void
+kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module,
+    const char *name)
+{
+	kpep->kpe_owner = NULL;
+	kpep->kpe_proc = NULL;
+	INIT_LIST_HEAD(&kpep->kpe_list);
+	strncpy(kpep->kpe_module, module, KSTAT_STRLEN);
+	strncpy(kpep->kpe_name, name, KSTAT_STRLEN);
+}
+EXPORT_SYMBOL(kstat_proc_entry_init);
+
+kstat_t *
+__kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
+    const char *ks_class, uchar_t ks_type, uint_t ks_ndata,
+    uchar_t ks_flags)
+{
+	kstat_t *ksp;
+
+	ASSERT(ks_module);
+	ASSERT(ks_instance == 0);
+	ASSERT(ks_name);
+
+	if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
+		ASSERT(ks_ndata == 1);
+
+	ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP);
+	if (ksp == NULL)
+		return (ksp);
+
+	mutex_enter(&kstat_module_lock);
+	ksp->ks_kid = kstat_id;
+	kstat_id++;
+	mutex_exit(&kstat_module_lock);
+
+	ksp->ks_magic = KS_MAGIC;
+	mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL);
+	ksp->ks_lock = &ksp->ks_private_lock;
+
+	ksp->ks_crtime = gethrtime();
+	ksp->ks_snaptime = ksp->ks_crtime;
+	ksp->ks_instance = ks_instance;
+	strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN);
+	ksp->ks_type = ks_type;
+	ksp->ks_flags = ks_flags;
+	ksp->ks_update = kstat_default_update;
+	ksp->ks_private = NULL;
+	ksp->ks_raw_ops.headers = NULL;
+	ksp->ks_raw_ops.data = NULL;
+	ksp->ks_raw_ops.addr = NULL;
+	ksp->ks_raw_buf = NULL;
+	ksp->ks_raw_bufsize = 0;
+	kstat_proc_entry_init(&ksp->ks_proc, ks_module, ks_name);
+
+	switch (ksp->ks_type) {
+		case KSTAT_TYPE_RAW:
+			ksp->ks_ndata = 1;
+			ksp->ks_data_size = ks_ndata;
+			break;
+		case KSTAT_TYPE_NAMED:
+			ksp->ks_ndata = ks_ndata;
+			ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t);
+			break;
+		case KSTAT_TYPE_INTR:
+			ksp->ks_ndata = ks_ndata;
+			ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t);
+			break;
+		case KSTAT_TYPE_IO:
+			ksp->ks_ndata = ks_ndata;
+			ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t);
+			break;
+		case KSTAT_TYPE_TIMER:
+			ksp->ks_ndata = ks_ndata;
+			ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t);
+			break;
+		default:
+			PANIC("Undefined kstat type %d\n", ksp->ks_type);
+	}
+
+	if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) {
+		ksp->ks_data = NULL;
+	} else {
+		ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP);
+		if (ksp->ks_data == NULL) {
+			kmem_free(ksp, sizeof (*ksp));
+			ksp = NULL;
+		}
+	}
+
+	return (ksp);
+}
+EXPORT_SYMBOL(__kstat_create);
+
+static int
+kstat_detect_collision(kstat_proc_entry_t *kpep)
+{
+	kstat_module_t *module;
+	kstat_proc_entry_t *tmp = NULL;
+	char *parent;
+	char *cp;
+
+	parent = kmem_asprintf("%s", kpep->kpe_module);
+
+	if ((cp = strrchr(parent, '/')) == NULL) {
+		kmem_strfree(parent);
+		return (0);
+	}
+
+	cp[0] = '\0';
+	if ((module = kstat_find_module(parent)) != NULL) {
+		list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) {
+			if (strncmp(tmp->kpe_name, cp+1, KSTAT_STRLEN) == 0) {
+				kmem_strfree(parent);
+				return (EEXIST);
+			}
+		}
+	}
+
+	kmem_strfree(parent);
+	return (0);
+}
+
+/*
+ * Add a file to the proc filesystem under the kstat namespace (i.e.
+ * /proc/spl/kstat/). The file need not necessarily be implemented as a
+ * kstat.
+ */
+void
+kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode,
+    const kstat_proc_op_t *proc_ops, void *data)
+{
+	kstat_module_t *module;
+	kstat_proc_entry_t *tmp = NULL;
+
+	ASSERT(kpep);
+
+	mutex_enter(&kstat_module_lock);
+
+	module = kstat_find_module(kpep->kpe_module);
+	if (module == NULL) {
+		if (kstat_detect_collision(kpep) != 0) {
+			cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \
+			    " collision", kpep->kpe_module, kpep->kpe_name);
+			goto out;
+		}
+		module = kstat_create_module(kpep->kpe_module);
+		if (module == NULL)
+			goto out;
+	}
+
+	/*
+	 * Only one entry by this name per-module, on failure the module
+	 * shouldn't be deleted because we know it has at least one entry.
+	 */
+	list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) {
+		if (strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0)
+			goto out;
+	}
+
+	list_add_tail(&kpep->kpe_list, &module->ksm_kstat_list);
+
+	kpep->kpe_owner = module;
+	kpep->kpe_proc = proc_create_data(kpep->kpe_name, mode,
+	    module->ksm_proc, proc_ops, data);
+	if (kpep->kpe_proc == NULL) {
+		list_del_init(&kpep->kpe_list);
+		if (list_empty(&module->ksm_kstat_list))
+			kstat_delete_module(module);
+	}
+out:
+	mutex_exit(&kstat_module_lock);
+
+}
+EXPORT_SYMBOL(kstat_proc_entry_install);
+
+void
+__kstat_install(kstat_t *ksp)
+{
+	ASSERT(ksp);
+	mode_t mode;
+	/* Specify permission modes for different kstats */
+	if (strncmp(ksp->ks_proc.kpe_name, "dbufs", KSTAT_STRLEN) == 0) {
+		mode = 0600;
+	} else {
+		mode = 0644;
+	}
+	kstat_proc_entry_install(
+	    &ksp->ks_proc, mode, &proc_kstat_operations, ksp);
+}
+EXPORT_SYMBOL(__kstat_install);
+
+void
+kstat_proc_entry_delete(kstat_proc_entry_t *kpep)
+{
+	kstat_module_t *module = kpep->kpe_owner;
+	if (kpep->kpe_proc)
+		remove_proc_entry(kpep->kpe_name, module->ksm_proc);
+
+	mutex_enter(&kstat_module_lock);
+	list_del_init(&kpep->kpe_list);
+
+	/*
+	 * Remove top level module directory if it wasn't empty before, but now
+	 * is.
+	 */
+	if (kpep->kpe_proc && list_empty(&module->ksm_kstat_list))
+		kstat_delete_module(module);
+	mutex_exit(&kstat_module_lock);
+
+}
+EXPORT_SYMBOL(kstat_proc_entry_delete);
+
+void
+__kstat_delete(kstat_t *ksp)
+{
+	kstat_proc_entry_delete(&ksp->ks_proc);
+
+	if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL))
+		kmem_free(ksp->ks_data, ksp->ks_data_size);
+
+	ksp->ks_lock = NULL;
+	mutex_destroy(&ksp->ks_private_lock);
+	kmem_free(ksp, sizeof (*ksp));
+}
+EXPORT_SYMBOL(__kstat_delete);
+
+int
+spl_kstat_init(void)
+{
+	mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL);
+	INIT_LIST_HEAD(&kstat_module_list);
+	kstat_id = 0;
+	return (0);
+}
+
+void
+spl_kstat_fini(void)
+{
+	ASSERT(list_empty(&kstat_module_list));
+	mutex_destroy(&kstat_module_lock);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
new file mode 100644
index 000000000000..6936db5d6466
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
@@ -0,0 +1,791 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Solaris Porting Layer (SPL) Proc Implementation.
+ */
+
+#include <sys/systeminfo.h>
+#include <sys/kstat.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <sys/taskq.h>
+#include <sys/proc.h>
+#include <linux/ctype.h>
+#include <linux/kmod.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/version.h>
+
+#if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)
+typedef struct ctl_table __no_const spl_ctl_table;
+#else
+typedef struct ctl_table spl_ctl_table;
+#endif
+
+static unsigned long table_min = 0;
+static unsigned long table_max = ~0;
+
+static struct ctl_table_header *spl_header = NULL;
+static struct proc_dir_entry *proc_spl = NULL;
+static struct proc_dir_entry *proc_spl_kmem = NULL;
+static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
+static struct proc_dir_entry *proc_spl_taskq_all = NULL;
+static struct proc_dir_entry *proc_spl_taskq = NULL;
+struct proc_dir_entry *proc_spl_kstat = NULL;
+
+static int
+proc_copyin_string(char *kbuffer, int kbuffer_size, const char *ubuffer,
+    int ubuffer_size)
+{
+	int size;
+
+	if (ubuffer_size > kbuffer_size)
+		return (-EOVERFLOW);
+
+	if (copy_from_user((void *)kbuffer, (void *)ubuffer, ubuffer_size))
+		return (-EFAULT);
+
+	/* strip trailing whitespace */
+	size = strnlen(kbuffer, ubuffer_size);
+	while (size-- >= 0)
+		if (!isspace(kbuffer[size]))
+			break;
+
+	/* empty string */
+	if (size < 0)
+		return (-EINVAL);
+
+	/* no space to terminate */
+	if (size == kbuffer_size)
+		return (-EOVERFLOW);
+
+	kbuffer[size + 1] = 0;
+	return (0);
+}
+
+static int
+proc_copyout_string(char *ubuffer, int ubuffer_size, const char *kbuffer,
+    char *append)
+{
+	/*
+	 * NB if 'append' != NULL, it's a single character to append to the
+	 * copied out string - usually "\n", for /proc entries and
+	 * (i.e. a terminating zero byte) for sysctl entries
+	 */
+	int size = MIN(strlen(kbuffer), ubuffer_size);
+
+	if (copy_to_user(ubuffer, kbuffer, size))
+		return (-EFAULT);
+
+	if (append != NULL && size < ubuffer_size) {
+		if (copy_to_user(ubuffer + size, append, 1))
+			return (-EFAULT);
+
+		size++;
+	}
+
+	return (size);
+}
+
+#ifdef DEBUG_KMEM
+static int
+proc_domemused(struct ctl_table *table, int write,
+    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int rc = 0;
+	unsigned long min = 0, max = ~0, val;
+	spl_ctl_table dummy = *table;
+
+	dummy.data = &val;
+	dummy.proc_handler = &proc_dointvec;
+	dummy.extra1 = &min;
+	dummy.extra2 = &max;
+
+	if (write) {
+		*ppos += *lenp;
+	} else {
+#ifdef HAVE_ATOMIC64_T
+		val = atomic64_read((atomic64_t *)table->data);
+#else
+		val = atomic_read((atomic_t *)table->data);
+#endif /* HAVE_ATOMIC64_T */
+		rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
+	}
+
+	return (rc);
+}
+#endif /* DEBUG_KMEM */
+
+static int
+proc_doslab(struct ctl_table *table, int write,
+    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int rc = 0;
+	unsigned long min = 0, max = ~0, val = 0, mask;
+	spl_ctl_table dummy = *table;
+	spl_kmem_cache_t *skc = NULL;
+
+	dummy.data = &val;
+	dummy.proc_handler = &proc_dointvec;
+	dummy.extra1 = &min;
+	dummy.extra2 = &max;
+
+	if (write) {
+		*ppos += *lenp;
+	} else {
+		down_read(&spl_kmem_cache_sem);
+		mask = (unsigned long)table->data;
+
+		list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+
+			/* Only use slabs of the correct kmem/vmem type */
+			if (!(skc->skc_flags & mask))
+				continue;
+
+			/* Sum the specified field for selected slabs */
+			switch (mask & (KMC_TOTAL | KMC_ALLOC | KMC_MAX)) {
+			case KMC_TOTAL:
+				val += skc->skc_slab_size * skc->skc_slab_total;
+				break;
+			case KMC_ALLOC:
+				val += skc->skc_obj_size * skc->skc_obj_alloc;
+				break;
+			case KMC_MAX:
+				val += skc->skc_obj_size * skc->skc_obj_max;
+				break;
+			}
+		}
+
+		up_read(&spl_kmem_cache_sem);
+		rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
+	}
+
+	return (rc);
+}
+
+static int
+proc_dohostid(struct ctl_table *table, int write,
+    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int len, rc = 0;
+	char *end, str[32];
+
+	if (write) {
+		/*
+		 * We can't use proc_doulongvec_minmax() in the write
+		 * case here because hostid while a hex value has no
+		 * leading 0x which confuses the helper function.
+		 */
+		rc = proc_copyin_string(str, sizeof (str), buffer, *lenp);
+		if (rc < 0)
+			return (rc);
+
+		spl_hostid = simple_strtoul(str, &end, 16);
+		if (str == end)
+			return (-EINVAL);
+
+	} else {
+		len = snprintf(str, sizeof (str), "%lx",
+		    (unsigned long) zone_get_hostid(NULL));
+		if (*ppos >= len)
+			rc = 0;
+		else
+			rc = proc_copyout_string(buffer,
+			    *lenp, str + *ppos, "\n");
+
+		if (rc >= 0) {
+			*lenp = rc;
+			*ppos += rc;
+		}
+	}
+
+	return (rc);
+}
+
+static void
+taskq_seq_show_headers(struct seq_file *f)
+{
+	seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n",
+	    "taskq", "act", "nthr", "spwn", "maxt", "pri",
+	    "mina", "maxa", "cura", "flags");
+}
+
+/* indices into the lheads array below */
+#define	LHEAD_PEND	0
+#define	LHEAD_PRIO	1
+#define	LHEAD_DELAY	2
+#define	LHEAD_WAIT	3
+#define	LHEAD_ACTIVE	4
+#define	LHEAD_SIZE	5
+
+/* BEGIN CSTYLED */
+static unsigned int spl_max_show_tasks = 512;
+module_param(spl_max_show_tasks, uint, 0644);
+MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc");
+/* END CSTYLED */
+
+static int
+taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag)
+{
+	taskq_t *tq = p;
+	taskq_thread_t *tqt = NULL;
+	spl_wait_queue_entry_t *wq;
+	struct task_struct *tsk;
+	taskq_ent_t *tqe;
+	char name[100];
+	struct list_head *lheads[LHEAD_SIZE], *lh;
+	static char *list_names[LHEAD_SIZE] =
+	    {"pend", "prio", "delay", "wait", "active" };
+	int i, j, have_lheads = 0;
+	unsigned long wflags, flags;
+
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+	spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags);
+
+	/* get the various lists and check whether they're empty */
+	lheads[LHEAD_PEND] = &tq->tq_pend_list;
+	lheads[LHEAD_PRIO] = &tq->tq_prio_list;
+	lheads[LHEAD_DELAY] = &tq->tq_delay_list;
+#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
+	lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head;
+#else
+	lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list;
+#endif
+	lheads[LHEAD_ACTIVE] = &tq->tq_active_list;
+
+	for (i = 0; i < LHEAD_SIZE; ++i) {
+		if (list_empty(lheads[i]))
+			lheads[i] = NULL;
+		else
+			++have_lheads;
+	}
+
+	/* early return in non-"all" mode if lists are all empty */
+	if (!allflag && !have_lheads) {
+		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+		spin_unlock_irqrestore(&tq->tq_lock, flags);
+		return (0);
+	}
+
+	/* unlock the waitq quickly */
+	if (!lheads[LHEAD_WAIT])
+		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+
+	/* show the base taskq contents */
+	snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance);
+	seq_printf(f, "%-25s ", name);
+	seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n",
+	    tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn,
+	    tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc,
+	    tq->tq_nalloc, tq->tq_flags);
+
+	/* show the active list */
+	if (lheads[LHEAD_ACTIVE]) {
+		j = 0;
+		list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) {
+			if (j == 0)
+				seq_printf(f, "\t%s:",
+				    list_names[LHEAD_ACTIVE]);
+			else if (j == 2) {
+				seq_printf(f, "\n\t       ");
+				j = 0;
+			}
+			seq_printf(f, " [%d]%pf(%ps)",
+			    tqt->tqt_thread->pid,
+			    tqt->tqt_task->tqent_func,
+			    tqt->tqt_task->tqent_arg);
+			++j;
+		}
+		seq_printf(f, "\n");
+	}
+
+	for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i)
+		if (lheads[i]) {
+			j = 0;
+			list_for_each(lh, lheads[i]) {
+				if (spl_max_show_tasks != 0 &&
+				    j >= spl_max_show_tasks) {
+					seq_printf(f, "\n\t(truncated)");
+					break;
+				}
+				/* show the wait waitq list */
+				if (i == LHEAD_WAIT) {
+#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
+					wq = list_entry(lh,
+					    spl_wait_queue_entry_t, entry);
+#else
+					wq = list_entry(lh,
+					    spl_wait_queue_entry_t, task_list);
+#endif
+					if (j == 0)
+						seq_printf(f, "\t%s:",
+						    list_names[i]);
+					else if (j % 8 == 0)
+						seq_printf(f, "\n\t     ");
+
+					tsk = wq->private;
+					seq_printf(f, " %d", tsk->pid);
+				/* pend, prio and delay lists */
+				} else {
+					tqe = list_entry(lh, taskq_ent_t,
+					    tqent_list);
+					if (j == 0)
+						seq_printf(f, "\t%s:",
+						    list_names[i]);
+					else if (j % 2 == 0)
+						seq_printf(f, "\n\t     ");
+
+					seq_printf(f, " %pf(%ps)",
+					    tqe->tqent_func,
+					    tqe->tqent_arg);
+				}
+				++j;
+			}
+			seq_printf(f, "\n");
+		}
+	if (lheads[LHEAD_WAIT])
+		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	return (0);
+}
+
+static int
+taskq_all_seq_show(struct seq_file *f, void *p)
+{
+	return (taskq_seq_show_impl(f, p, B_TRUE));
+}
+
+static int
+taskq_seq_show(struct seq_file *f, void *p)
+{
+	return (taskq_seq_show_impl(f, p, B_FALSE));
+}
+
+static void *
+taskq_seq_start(struct seq_file *f, loff_t *pos)
+{
+	struct list_head *p;
+	loff_t n = *pos;
+
+	down_read(&tq_list_sem);
+	if (!n)
+		taskq_seq_show_headers(f);
+
+	p = tq_list.next;
+	while (n--) {
+		p = p->next;
+		if (p == &tq_list)
+		return (NULL);
+	}
+
+	return (list_entry(p, taskq_t, tq_taskqs));
+}
+
+static void *
+taskq_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+	taskq_t *tq = p;
+
+	++*pos;
+	return ((tq->tq_taskqs.next == &tq_list) ?
+	    NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs));
+}
+
+static void
+slab_seq_show_headers(struct seq_file *f)
+{
+	seq_printf(f,
+	    "--------------------- cache ----------"
+	    "---------------------------------------------  "
+	    "----- slab ------  "
+	    "---- object -----  "
+	    "--- emergency ---\n");
+	seq_printf(f,
+	    "name                                  "
+	    "  flags      size     alloc slabsize  objsize  "
+	    "total alloc   max  "
+	    "total alloc   max  "
+	    "dlock alloc   max\n");
+}
+
+static int
+slab_seq_show(struct seq_file *f, void *p)
+{
+	spl_kmem_cache_t *skc = p;
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+
+	if (skc->skc_flags & KMC_SLAB) {
+		/*
+		 * This cache is backed by a generic Linux kmem cache which
+		 * has its own accounting. For these caches we only track
+		 * the number of active allocated objects that exist within
+		 * the underlying Linux slabs. For the overall statistics of
+		 * the underlying Linux cache please refer to /proc/slabinfo.
+		 */
+		spin_lock(&skc->skc_lock);
+		uint64_t objs_allocated =
+		    percpu_counter_sum(&skc->skc_linux_alloc);
+		seq_printf(f, "%-36s  ", skc->skc_name);
+		seq_printf(f, "0x%05lx %9s %9lu %8s %8u  "
+		    "%5s %5s %5s  %5s %5lu %5s  %5s %5s %5s\n",
+		    (long unsigned)skc->skc_flags,
+		    "-",
+		    (long unsigned)(skc->skc_obj_size * objs_allocated),
+		    "-",
+		    (unsigned)skc->skc_obj_size,
+		    "-", "-", "-", "-",
+		    (long unsigned)objs_allocated,
+		    "-", "-", "-", "-");
+		spin_unlock(&skc->skc_lock);
+		return (0);
+	}
+
+	spin_lock(&skc->skc_lock);
+	seq_printf(f, "%-36s  ", skc->skc_name);
+	seq_printf(f, "0x%05lx %9lu %9lu %8u %8u  "
+	    "%5lu %5lu %5lu  %5lu %5lu %5lu  %5lu %5lu %5lu\n",
+	    (long unsigned)skc->skc_flags,
+	    (long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
+	    (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
+	    (unsigned)skc->skc_slab_size,
+	    (unsigned)skc->skc_obj_size,
+	    (long unsigned)skc->skc_slab_total,
+	    (long unsigned)skc->skc_slab_alloc,
+	    (long unsigned)skc->skc_slab_max,
+	    (long unsigned)skc->skc_obj_total,
+	    (long unsigned)skc->skc_obj_alloc,
+	    (long unsigned)skc->skc_obj_max,
+	    (long unsigned)skc->skc_obj_deadlock,
+	    (long unsigned)skc->skc_obj_emergency,
+	    (long unsigned)skc->skc_obj_emergency_max);
+	spin_unlock(&skc->skc_lock);
+	return (0);
+}
+
+static void *
+slab_seq_start(struct seq_file *f, loff_t *pos)
+{
+	struct list_head *p;
+	loff_t n = *pos;
+
+	down_read(&spl_kmem_cache_sem);
+	if (!n)
+		slab_seq_show_headers(f);
+
+	p = spl_kmem_cache_list.next;
+	while (n--) {
+		p = p->next;
+		if (p == &spl_kmem_cache_list)
+			return (NULL);
+	}
+
+	return (list_entry(p, spl_kmem_cache_t, skc_list));
+}
+
+static void *
+slab_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+	spl_kmem_cache_t *skc = p;
+
+	++*pos;
+	return ((skc->skc_list.next == &spl_kmem_cache_list) ?
+	    NULL : list_entry(skc->skc_list.next, spl_kmem_cache_t, skc_list));
+}
+
+static void
+slab_seq_stop(struct seq_file *f, void *v)
+{
+	up_read(&spl_kmem_cache_sem);
+}
+
+static struct seq_operations slab_seq_ops = {
+	.show  = slab_seq_show,
+	.start = slab_seq_start,
+	.next  = slab_seq_next,
+	.stop  = slab_seq_stop,
+};
+
+static int
+proc_slab_open(struct inode *inode, struct file *filp)
+{
+	return (seq_open(filp, &slab_seq_ops));
+}
+
+static const kstat_proc_op_t proc_slab_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+	.proc_open	= proc_slab_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= seq_release,
+#else
+	.open		= proc_slab_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+#endif
+};
+
+static void
+taskq_seq_stop(struct seq_file *f, void *v)
+{
+	up_read(&tq_list_sem);
+}
+
+static struct seq_operations taskq_all_seq_ops = {
+	.show	= taskq_all_seq_show,
+	.start	= taskq_seq_start,
+	.next	= taskq_seq_next,
+	.stop	= taskq_seq_stop,
+};
+
+static struct seq_operations taskq_seq_ops = {
+	.show	= taskq_seq_show,
+	.start	= taskq_seq_start,
+	.next	= taskq_seq_next,
+	.stop	= taskq_seq_stop,
+};
+
+static int
+proc_taskq_all_open(struct inode *inode, struct file *filp)
+{
+	return (seq_open(filp, &taskq_all_seq_ops));
+}
+
+static int
+proc_taskq_open(struct inode *inode, struct file *filp)
+{
+	return (seq_open(filp, &taskq_seq_ops));
+}
+
+static const kstat_proc_op_t proc_taskq_all_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+	.proc_open	= proc_taskq_all_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= seq_release,
+#else
+	.open		= proc_taskq_all_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+#endif
+};
+
+static const kstat_proc_op_t proc_taskq_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+	.proc_open	= proc_taskq_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= seq_release,
+#else
+	.open		= proc_taskq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+#endif
+};
+
+static struct ctl_table spl_kmem_table[] = {
+#ifdef DEBUG_KMEM
+	{
+		.procname	= "kmem_used",
+		.data		= &kmem_alloc_used,
+#ifdef HAVE_ATOMIC64_T
+		.maxlen		= sizeof (atomic64_t),
+#else
+		.maxlen		= sizeof (atomic_t),
+#endif /* HAVE_ATOMIC64_T */
+		.mode		= 0444,
+		.proc_handler	= &proc_domemused,
+	},
+	{
+		.procname	= "kmem_max",
+		.data		= &kmem_alloc_max,
+		.maxlen		= sizeof (unsigned long),
+		.extra1		= &table_min,
+		.extra2		= &table_max,
+		.mode		= 0444,
+		.proc_handler	= &proc_doulongvec_minmax,
+	},
+#endif /* DEBUG_KMEM */
+	{
+		.procname	= "slab_kvmem_total",
+		.data		= (void *)(KMC_KVMEM | KMC_TOTAL),
+		.maxlen		= sizeof (unsigned long),
+		.extra1		= &table_min,
+		.extra2		= &table_max,
+		.mode		= 0444,
+		.proc_handler	= &proc_doslab,
+	},
+	{
+		.procname	= "slab_kvmem_alloc",
+		.data		= (void *)(KMC_KVMEM | KMC_ALLOC),
+		.maxlen		= sizeof (unsigned long),
+		.extra1		= &table_min,
+		.extra2		= &table_max,
+		.mode		= 0444,
+		.proc_handler	= &proc_doslab,
+	},
+	{
+		.procname	= "slab_kvmem_max",
+		.data		= (void *)(KMC_KVMEM | KMC_MAX),
+		.maxlen		= sizeof (unsigned long),
+		.extra1		= &table_min,
+		.extra2		= &table_max,
+		.mode		= 0444,
+		.proc_handler	= &proc_doslab,
+	},
+	{},
+};
+
+static struct ctl_table spl_kstat_table[] = {
+	{},
+};
+
+static struct ctl_table spl_table[] = {
+	/*
+	 * NB No .strategy entries have been provided since
+	 * sysctl(8) prefers to go via /proc for portability.
+	 */
+	{
+		.procname	= "gitrev",
+		.data		= spl_gitrev,
+		.maxlen		= sizeof (spl_gitrev),
+		.mode		= 0444,
+		.proc_handler	= &proc_dostring,
+	},
+	{
+		.procname	= "hostid",
+		.data		= &spl_hostid,
+		.maxlen		= sizeof (unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_dohostid,
+	},
+	{
+		.procname	= "kmem",
+		.mode		= 0555,
+		.child		= spl_kmem_table,
+	},
+	{
+		.procname	= "kstat",
+		.mode		= 0555,
+		.child		= spl_kstat_table,
+	},
+	{},
+};
+
+static struct ctl_table spl_dir[] = {
+	{
+		.procname	= "spl",
+		.mode		= 0555,
+		.child		= spl_table,
+	},
+	{}
+};
+
+static struct ctl_table spl_root[] = {
+	{
+	.procname = "kernel",
+	.mode = 0555,
+	.child = spl_dir,
+	},
+	{}
+};
+
+int
+spl_proc_init(void)
+{
+	int rc = 0;
+
+	spl_header = register_sysctl_table(spl_root);
+	if (spl_header == NULL)
+		return (-EUNATCH);
+
+	proc_spl = proc_mkdir("spl", NULL);
+	if (proc_spl == NULL) {
+		rc = -EUNATCH;
+		goto out;
+	}
+
+	proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl,
+	    &proc_taskq_all_operations, NULL);
+	if (proc_spl_taskq_all == NULL) {
+		rc = -EUNATCH;
+		goto out;
+	}
+
+	proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl,
+	    &proc_taskq_operations, NULL);
+	if (proc_spl_taskq == NULL) {
+		rc = -EUNATCH;
+		goto out;
+	}
+
+	proc_spl_kmem = proc_mkdir("kmem", proc_spl);
+	if (proc_spl_kmem == NULL) {
+		rc = -EUNATCH;
+		goto out;
+	}
+
+	proc_spl_kmem_slab = proc_create_data("slab", 0444, proc_spl_kmem,
+	    &proc_slab_operations, NULL);
+	if (proc_spl_kmem_slab == NULL) {
+		rc = -EUNATCH;
+		goto out;
+	}
+
+	proc_spl_kstat = proc_mkdir("kstat", proc_spl);
+	if (proc_spl_kstat == NULL) {
+		rc = -EUNATCH;
+		goto out;
+	}
+out:
+	if (rc) {
+		remove_proc_entry("kstat", proc_spl);
+		remove_proc_entry("slab", proc_spl_kmem);
+		remove_proc_entry("kmem", proc_spl);
+		remove_proc_entry("taskq-all", proc_spl);
+		remove_proc_entry("taskq", proc_spl);
+		remove_proc_entry("spl", NULL);
+		unregister_sysctl_table(spl_header);
+	}
+
+	return (rc);
+}
+
+void
+spl_proc_fini(void)
+{
+	remove_proc_entry("kstat", proc_spl);
+	remove_proc_entry("slab", proc_spl_kmem);
+	remove_proc_entry("kmem", proc_spl);
+	remove_proc_entry("taskq-all", proc_spl);
+	remove_proc_entry("taskq", proc_spl);
+	remove_proc_entry("spl", NULL);
+
+	ASSERT(spl_header != NULL);
+	unregister_sysctl_table(spl_header);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c b/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c
new file mode 100644
index 000000000000..189d6a7c6082
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c
@@ -0,0 +1,264 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/list.h>
+#include <sys/mutex.h>
+#include <sys/procfs_list.h>
+#include <linux/proc_fs.h>
+
+/*
+ * A procfs_list is a wrapper around a linked list which implements the seq_file
+ * interface, allowing the contents of the list to be exposed through procfs.
+ * The kernel already has some utilities to help implement the seq_file
+ * interface for linked lists (seq_list_*), but they aren't appropriate for use
+ * with lists that have many entries, because seq_list_start walks the list at
+ * the start of each read syscall to find where it left off, so reading a file
+ * ends up being quadratic in the number of entries in the list.
+ *
+ * This implementation avoids this penalty by maintaining a separate cursor into
+ * the list per instance of the file that is open. It also maintains some extra
+ * information in each node of the list to prevent reads of entries that have
+ * been dropped from the list.
+ *
+ * Callers should only add elements to the list using procfs_list_add, which
+ * adds an element to the tail of the list. Other operations can be performed
+ * directly on the wrapped list using the normal list manipulation functions,
+ * but elements should only be removed from the head of the list.
+ */
+
+#define	NODE_ID(procfs_list, obj) \
+		(((procfs_list_node_t *)(((char *)obj) + \
+		(procfs_list)->pl_node_offset))->pln_id)
+
+typedef struct procfs_list_cursor {
+	procfs_list_t	*procfs_list;	/* List into which this cursor points */
+	void		*cached_node;	/* Most recently accessed node */
+	loff_t		cached_pos;	/* Position of cached_node */
+} procfs_list_cursor_t;
+
+static int
+procfs_list_seq_show(struct seq_file *f, void *p)
+{
+	procfs_list_cursor_t *cursor = f->private;
+	procfs_list_t *procfs_list = cursor->procfs_list;
+
+	ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+	if (p == SEQ_START_TOKEN) {
+		if (procfs_list->pl_show_header != NULL)
+			return (procfs_list->pl_show_header(f));
+		else
+			return (0);
+	}
+	return (procfs_list->pl_show(f, p));
+}
+
+static void *
+procfs_list_next_node(procfs_list_cursor_t *cursor, loff_t *pos)
+{
+	void *next_node;
+	procfs_list_t *procfs_list = cursor->procfs_list;
+
+	if (cursor->cached_node == SEQ_START_TOKEN)
+		next_node = list_head(&procfs_list->pl_list);
+	else
+		next_node = list_next(&procfs_list->pl_list,
+		    cursor->cached_node);
+
+	if (next_node != NULL) {
+		cursor->cached_node = next_node;
+		cursor->cached_pos = NODE_ID(procfs_list, cursor->cached_node);
+		*pos = cursor->cached_pos;
+	}
+	return (next_node);
+}
+
+static void *
+procfs_list_seq_start(struct seq_file *f, loff_t *pos)
+{
+	procfs_list_cursor_t *cursor = f->private;
+	procfs_list_t *procfs_list = cursor->procfs_list;
+
+	mutex_enter(&procfs_list->pl_lock);
+
+	if (*pos == 0) {
+		cursor->cached_node = SEQ_START_TOKEN;
+		cursor->cached_pos = 0;
+		return (SEQ_START_TOKEN);
+	}
+
+	/*
+	 * Check if our cached pointer has become stale, which happens if the
+	 * the message where we left off has been dropped from the list since
+	 * the last read syscall completed.
+	 */
+	void *oldest_node = list_head(&procfs_list->pl_list);
+	if (cursor->cached_node != SEQ_START_TOKEN && (oldest_node == NULL ||
+	    NODE_ID(procfs_list, oldest_node) > cursor->cached_pos))
+		return (ERR_PTR(-EIO));
+
+	/*
+	 * If it isn't starting from the beginning of the file, the seq_file
+	 * code will either pick up at the same position it visited last or the
+	 * following one.
+	 */
+	if (*pos == cursor->cached_pos) {
+		return (cursor->cached_node);
+	} else {
+		ASSERT3U(*pos, ==, cursor->cached_pos + 1);
+		return (procfs_list_next_node(cursor, pos));
+	}
+}
+
+static void *
+procfs_list_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+	procfs_list_cursor_t *cursor = f->private;
+	ASSERT(MUTEX_HELD(&cursor->procfs_list->pl_lock));
+	return (procfs_list_next_node(cursor, pos));
+}
+
+static void
+procfs_list_seq_stop(struct seq_file *f, void *p)
+{
+	procfs_list_cursor_t *cursor = f->private;
+	procfs_list_t *procfs_list = cursor->procfs_list;
+	mutex_exit(&procfs_list->pl_lock);
+}
+
+static struct seq_operations procfs_list_seq_ops = {
+	.show  = procfs_list_seq_show,
+	.start = procfs_list_seq_start,
+	.next  = procfs_list_seq_next,
+	.stop  = procfs_list_seq_stop,
+};
+
+static int
+procfs_list_open(struct inode *inode, struct file *filp)
+{
+	int rc = seq_open_private(filp, &procfs_list_seq_ops,
+	    sizeof (procfs_list_cursor_t));
+	if (rc != 0)
+		return (rc);
+
+	struct seq_file *f = filp->private_data;
+	procfs_list_cursor_t *cursor = f->private;
+	cursor->procfs_list = PDE_DATA(inode);
+	cursor->cached_node = NULL;
+	cursor->cached_pos = 0;
+
+	return (0);
+}
+
+static ssize_t
+procfs_list_write(struct file *filp, const char __user *buf, size_t len,
+    loff_t *ppos)
+{
+	struct seq_file *f = filp->private_data;
+	procfs_list_cursor_t *cursor = f->private;
+	procfs_list_t *procfs_list = cursor->procfs_list;
+	int rc;
+
+	if (procfs_list->pl_clear != NULL &&
+	    (rc = procfs_list->pl_clear(procfs_list)) != 0)
+		return (-rc);
+	return (len);
+}
+
+static const kstat_proc_op_t procfs_list_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+	.proc_open	= procfs_list_open,
+	.proc_write	= procfs_list_write,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= seq_release_private,
+#else
+	.open		= procfs_list_open,
+	.write		= procfs_list_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+#endif
+};
+
+/*
+ * Initialize a procfs_list and create a file for it in the proc filesystem
+ * under the kstat namespace.
+ */
+void
+procfs_list_install(const char *module,
+    const char *name,
+    mode_t mode,
+    procfs_list_t *procfs_list,
+    int (*show)(struct seq_file *f, void *p),
+    int (*show_header)(struct seq_file *f),
+    int (*clear)(procfs_list_t *procfs_list),
+    size_t procfs_list_node_off)
+{
+	mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&procfs_list->pl_list,
+	    procfs_list_node_off + sizeof (procfs_list_node_t),
+	    procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
+	procfs_list->pl_next_id = 1; /* Save id 0 for SEQ_START_TOKEN */
+	procfs_list->pl_show = show;
+	procfs_list->pl_show_header = show_header;
+	procfs_list->pl_clear = clear;
+	procfs_list->pl_node_offset = procfs_list_node_off;
+
+	kstat_proc_entry_init(&procfs_list->pl_kstat_entry, module, name);
+	kstat_proc_entry_install(&procfs_list->pl_kstat_entry, mode,
+	    &procfs_list_operations, procfs_list);
+}
+EXPORT_SYMBOL(procfs_list_install);
+
+/* Remove the proc filesystem file corresponding to the given list */
+void
+procfs_list_uninstall(procfs_list_t *procfs_list)
+{
+	kstat_proc_entry_delete(&procfs_list->pl_kstat_entry);
+}
+EXPORT_SYMBOL(procfs_list_uninstall);
+
+void
+procfs_list_destroy(procfs_list_t *procfs_list)
+{
+	ASSERT(list_is_empty(&procfs_list->pl_list));
+	list_destroy(&procfs_list->pl_list);
+	mutex_destroy(&procfs_list->pl_lock);
+}
+EXPORT_SYMBOL(procfs_list_destroy);
+
+/*
+ * Add a new node to the tail of the list. While the standard list manipulation
+ * functions can be use for all other operation, adding elements to the list
+ * should only be done using this helper so that the id of the new node is set
+ * correctly.
+ */
+void
+procfs_list_add(procfs_list_t *procfs_list, void *p)
+{
+	ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+	NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
+	list_insert_tail(&procfs_list->pl_list, p);
+}
+EXPORT_SYMBOL(procfs_list_add);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
new file mode 100644
index 000000000000..9cbf3e38137c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
@@ -0,0 +1,1308 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Solaris Porting Layer (SPL) Task Queue Implementation.
+ */
+
+#include <sys/timer.h>
+#include <sys/taskq.h>
+#include <sys/kmem.h>
+#include <sys/tsd.h>
+#include <sys/trace_spl.h>
+
+int spl_taskq_thread_bind = 0;
+module_param(spl_taskq_thread_bind, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
+
+
+int spl_taskq_thread_dynamic = 1;
+module_param(spl_taskq_thread_dynamic, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads");
+
+int spl_taskq_thread_priority = 1;
+module_param(spl_taskq_thread_priority, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_priority,
+	"Allow non-default priority for taskq threads");
+
+int spl_taskq_thread_sequential = 4;
+module_param(spl_taskq_thread_sequential, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_sequential,
+	"Create new taskq threads after N sequential tasks");
+
+/* Global system-wide dynamic task queue available for all consumers */
+taskq_t *system_taskq;
+EXPORT_SYMBOL(system_taskq);
+/* Global dynamic task queue for long delay */
+taskq_t *system_delay_taskq;
+EXPORT_SYMBOL(system_delay_taskq);
+
+/* Private dedicated taskq for creating new taskq threads on demand. */
+static taskq_t *dynamic_taskq;
+static taskq_thread_t *taskq_thread_create(taskq_t *);
+
+/* List of all taskqs */
+LIST_HEAD(tq_list);
+struct rw_semaphore tq_list_sem;
+static uint_t taskq_tsd;
+
+static int
+task_km_flags(uint_t flags)
+{
+	if (flags & TQ_NOSLEEP)
+		return (KM_NOSLEEP);
+
+	if (flags & TQ_PUSHPAGE)
+		return (KM_PUSHPAGE);
+
+	return (KM_SLEEP);
+}
+
+/*
+ * taskq_find_by_name - Find the largest instance number of a named taskq.
+ */
+static int
+taskq_find_by_name(const char *name)
+{
+	struct list_head *tql = NULL;
+	taskq_t *tq;
+
+	list_for_each_prev(tql, &tq_list) {
+		tq = list_entry(tql, taskq_t, tq_taskqs);
+		if (strcmp(name, tq->tq_name) == 0)
+			return (tq->tq_instance);
+	}
+	return (-1);
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, returns a list_t which
+ * is not attached to the free, work, or pending taskq lists.
+ */
+static taskq_ent_t *
+task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags)
+{
+	taskq_ent_t *t;
+	int count = 0;
+
+	ASSERT(tq);
+retry:
+	/* Acquire taskq_ent_t's from free list if available */
+	if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) {
+		t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+		ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+		ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL));
+		ASSERT(!timer_pending(&t->tqent_timer));
+
+		list_del_init(&t->tqent_list);
+		return (t);
+	}
+
+	/* Free list is empty and memory allocations are prohibited */
+	if (flags & TQ_NOALLOC)
+		return (NULL);
+
+	/* Hit maximum taskq_ent_t pool size */
+	if (tq->tq_nalloc >= tq->tq_maxalloc) {
+		if (flags & TQ_NOSLEEP)
+			return (NULL);
+
+		/*
+		 * Sleep periodically polling the free list for an available
+		 * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed
+		 * but we cannot block forever waiting for an taskq_ent_t to
+		 * show up in the free list, otherwise a deadlock can happen.
+		 *
+		 * Therefore, we need to allocate a new task even if the number
+		 * of allocated tasks is above tq->tq_maxalloc, but we still
+		 * end up delaying the task allocation by one second, thereby
+		 * throttling the task dispatch rate.
+		 */
+		spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
+		schedule_timeout(HZ / 100);
+		spin_lock_irqsave_nested(&tq->tq_lock, *irqflags,
+		    tq->tq_lock_class);
+		if (count < 100) {
+			count++;
+			goto retry;
+		}
+	}
+
+	spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
+	t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags));
+	spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class);
+
+	if (t) {
+		taskq_init_ent(t);
+		tq->tq_nalloc++;
+	}
+
+	return (t);
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t
+ * to already be removed from the free, work, or pending taskq lists.
+ */
+static void
+task_free(taskq_t *tq, taskq_ent_t *t)
+{
+	ASSERT(tq);
+	ASSERT(t);
+	ASSERT(list_empty(&t->tqent_list));
+	ASSERT(!timer_pending(&t->tqent_timer));
+
+	kmem_free(t, sizeof (taskq_ent_t));
+	tq->tq_nalloc--;
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, either destroys the
+ * taskq_ent_t if too many exist or moves it to the free list for later use.
+ */
+static void
+task_done(taskq_t *tq, taskq_ent_t *t)
+{
+	ASSERT(tq);
+	ASSERT(t);
+
+	/* Wake tasks blocked in taskq_wait_id() */
+	wake_up_all(&t->tqent_waitq);
+
+	list_del_init(&t->tqent_list);
+
+	if (tq->tq_nalloc <= tq->tq_minalloc) {
+		t->tqent_id = TASKQID_INVALID;
+		t->tqent_func = NULL;
+		t->tqent_arg = NULL;
+		t->tqent_flags = 0;
+
+		list_add_tail(&t->tqent_list, &tq->tq_free_list);
+	} else {
+		task_free(tq, t);
+	}
+}
+
+/*
+ * When a delayed task timer expires remove it from the delay list and
+ * add it to the priority list in order for immediate processing.
+ */
+static void
+task_expire_impl(taskq_ent_t *t)
+{
+	taskq_ent_t *w;
+	taskq_t *tq = t->tqent_taskq;
+	struct list_head *l = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+
+	if (t->tqent_flags & TQENT_FLAG_CANCEL) {
+		ASSERT(list_empty(&t->tqent_list));
+		spin_unlock_irqrestore(&tq->tq_lock, flags);
+		return;
+	}
+
+	t->tqent_birth = jiffies;
+	DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
+
+	/*
+	 * The priority list must be maintained in strict task id order
+	 * from lowest to highest for lowest_id to be easily calculable.
+	 */
+	list_del(&t->tqent_list);
+	list_for_each_prev(l, &tq->tq_prio_list) {
+		w = list_entry(l, taskq_ent_t, tqent_list);
+		if (w->tqent_id < t->tqent_id) {
+			list_add(&t->tqent_list, l);
+			break;
+		}
+	}
+	if (l == &tq->tq_prio_list)
+		list_add(&t->tqent_list, &tq->tq_prio_list);
+
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	wake_up(&tq->tq_work_waitq);
+}
+
+static void
+task_expire(spl_timer_list_t tl)
+{
+	struct timer_list *tmr = (struct timer_list *)tl;
+	taskq_ent_t *t = from_timer(t, tmr, tqent_timer);
+	task_expire_impl(t);
+}
+
+/*
+ * Returns the lowest incomplete taskqid_t.  The taskqid_t may
+ * be queued on the pending list, on the priority list, on the
+ * delay list, or on the work list currently being handled, but
+ * it is not 100% complete yet.
+ */
+static taskqid_t
+taskq_lowest_id(taskq_t *tq)
+{
+	taskqid_t lowest_id = tq->tq_next_id;
+	taskq_ent_t *t;
+	taskq_thread_t *tqt;
+
+	ASSERT(tq);
+
+	if (!list_empty(&tq->tq_pend_list)) {
+		t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list);
+		lowest_id = MIN(lowest_id, t->tqent_id);
+	}
+
+	if (!list_empty(&tq->tq_prio_list)) {
+		t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list);
+		lowest_id = MIN(lowest_id, t->tqent_id);
+	}
+
+	if (!list_empty(&tq->tq_delay_list)) {
+		t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list);
+		lowest_id = MIN(lowest_id, t->tqent_id);
+	}
+
+	if (!list_empty(&tq->tq_active_list)) {
+		tqt = list_entry(tq->tq_active_list.next, taskq_thread_t,
+		    tqt_active_list);
+		ASSERT(tqt->tqt_id != TASKQID_INVALID);
+		lowest_id = MIN(lowest_id, tqt->tqt_id);
+	}
+
+	return (lowest_id);
+}
+
+/*
+ * Insert a task into a list keeping the list sorted by increasing taskqid.
+ */
+static void
+taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt)
+{
+	taskq_thread_t *w;
+	struct list_head *l = NULL;
+
+	ASSERT(tq);
+	ASSERT(tqt);
+
+	list_for_each_prev(l, &tq->tq_active_list) {
+		w = list_entry(l, taskq_thread_t, tqt_active_list);
+		if (w->tqt_id < tqt->tqt_id) {
+			list_add(&tqt->tqt_active_list, l);
+			break;
+		}
+	}
+	if (l == &tq->tq_active_list)
+		list_add(&tqt->tqt_active_list, &tq->tq_active_list);
+}
+
+/*
+ * Find and return a task from the given list if it exists.  The list
+ * must be in lowest to highest task id order.
+ */
+static taskq_ent_t *
+taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id)
+{
+	struct list_head *l = NULL;
+	taskq_ent_t *t;
+
+	list_for_each(l, lh) {
+		t = list_entry(l, taskq_ent_t, tqent_list);
+
+		if (t->tqent_id == id)
+			return (t);
+
+		if (t->tqent_id > id)
+			break;
+	}
+
+	return (NULL);
+}
+
+/*
+ * Find an already dispatched task given the task id regardless of what
+ * state it is in.  If a task is still pending it will be returned.
+ * If a task is executing, then -EBUSY will be returned instead.
+ * If the task has already been run then NULL is returned.
+ */
+static taskq_ent_t *
+taskq_find(taskq_t *tq, taskqid_t id)
+{
+	taskq_thread_t *tqt;
+	struct list_head *l = NULL;
+	taskq_ent_t *t;
+
+	t = taskq_find_list(tq, &tq->tq_delay_list, id);
+	if (t)
+		return (t);
+
+	t = taskq_find_list(tq, &tq->tq_prio_list, id);
+	if (t)
+		return (t);
+
+	t = taskq_find_list(tq, &tq->tq_pend_list, id);
+	if (t)
+		return (t);
+
+	list_for_each(l, &tq->tq_active_list) {
+		tqt = list_entry(l, taskq_thread_t, tqt_active_list);
+		if (tqt->tqt_id == id) {
+			/*
+			 * Instead of returning tqt_task, we just return a non
+			 * NULL value to prevent misuse, since tqt_task only
+			 * has two valid fields.
+			 */
+			return (ERR_PTR(-EBUSY));
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and
+ * taskq_wait() functions below.
+ *
+ * Taskq waiting is accomplished by tracking the lowest outstanding task
+ * id and the next available task id.  As tasks are dispatched they are
+ * added to the tail of the pending, priority, or delay lists.  As worker
+ * threads become available the tasks are removed from the heads of these
+ * lists and linked to the worker threads.  This ensures the lists are
+ * kept sorted by lowest to highest task id.
+ *
+ * Therefore the lowest outstanding task id can be quickly determined by
+ * checking the head item from all of these lists.  This value is stored
+ * with the taskq as the lowest id.  It only needs to be recalculated when
+ * either the task with the current lowest id completes or is canceled.
+ *
+ * By blocking until the lowest task id exceeds the passed task id the
+ * taskq_wait_outstanding() function can be easily implemented.  Similarly,
+ * by blocking until the lowest task id matches the next task id taskq_wait()
+ * can be implemented.
+ *
+ * Callers should be aware that when there are multiple worked threads it
+ * is possible for larger task ids to complete before smaller ones.  Also
+ * when the taskq contains delay tasks with small task ids callers may
+ * block for a considerable length of time waiting for them to expire and
+ * execute.
+ */
+static int
+taskq_wait_id_check(taskq_t *tq, taskqid_t id)
+{
+	int rc;
+	unsigned long flags;
+
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+	rc = (taskq_find(tq, id) == NULL);
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	return (rc);
+}
+
+/*
+ * The taskq_wait_id() function blocks until the passed task id completes.
+ * This does not guarantee that all lower task ids have completed.
+ */
+void
+taskq_wait_id(taskq_t *tq, taskqid_t id)
+{
+	wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_id);
+
+static int
+taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id)
+{
+	int rc;
+	unsigned long flags;
+
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+	rc = (id < tq->tq_lowest_id);
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	return (rc);
+}
+
+/*
+ * The taskq_wait_outstanding() function will block until all tasks with a
+ * lower taskqid than the passed 'id' have been completed.  Note that all
+ * task id's are assigned monotonically at dispatch time.  Zero may be
+ * passed for the id to indicate all tasks dispatch up to this point,
+ * but not after, should be waited for.
+ */
+void
+taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
+{
+	id = id ? id : tq->tq_next_id - 1;
+	wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_outstanding);
+
+static int
+taskq_wait_check(taskq_t *tq)
+{
+	int rc;
+	unsigned long flags;
+
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+	rc = (tq->tq_lowest_id == tq->tq_next_id);
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	return (rc);
+}
+
+/*
+ * The taskq_wait() function will block until the taskq is empty.
+ * This means that if a taskq re-dispatches work to itself taskq_wait()
+ * callers will block indefinitely.
+ */
+void
+taskq_wait(taskq_t *tq)
+{
+	wait_event(tq->tq_wait_waitq, taskq_wait_check(tq));
+}
+EXPORT_SYMBOL(taskq_wait);
+
+int
+taskq_member(taskq_t *tq, kthread_t *t)
+{
+	return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t));
+}
+EXPORT_SYMBOL(taskq_member);
+
+taskq_t *
+taskq_of_curthread(void)
+{
+	return (tsd_get(taskq_tsd));
+}
+EXPORT_SYMBOL(taskq_of_curthread);
+
+/*
+ * Cancel an already dispatched task given the task id.  Still pending tasks
+ * will be immediately canceled, and if the task is active the function will
+ * block until it completes.  Preallocated tasks which are canceled must be
+ * freed by the caller.
+ */
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t id)
+{
+	taskq_ent_t *t;
+	int rc = ENOENT;
+	unsigned long flags;
+
+	ASSERT(tq);
+
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+	t = taskq_find(tq, id);
+	if (t && t != ERR_PTR(-EBUSY)) {
+		list_del_init(&t->tqent_list);
+		t->tqent_flags |= TQENT_FLAG_CANCEL;
+
+		/*
+		 * When canceling the lowest outstanding task id we
+		 * must recalculate the new lowest outstanding id.
+		 */
+		if (tq->tq_lowest_id == t->tqent_id) {
+			tq->tq_lowest_id = taskq_lowest_id(tq);
+			ASSERT3S(tq->tq_lowest_id, >, t->tqent_id);
+		}
+
+		/*
+		 * The task_expire() function takes the tq->tq_lock so drop
+		 * drop the lock before synchronously cancelling the timer.
+		 */
+		if (timer_pending(&t->tqent_timer)) {
+			spin_unlock_irqrestore(&tq->tq_lock, flags);
+			del_timer_sync(&t->tqent_timer);
+			spin_lock_irqsave_nested(&tq->tq_lock, flags,
+			    tq->tq_lock_class);
+		}
+
+		if (!(t->tqent_flags & TQENT_FLAG_PREALLOC))
+			task_done(tq, t);
+
+		rc = 0;
+	}
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	if (t == ERR_PTR(-EBUSY)) {
+		taskq_wait_id(tq, id);
+		rc = EBUSY;
+	}
+
+	return (rc);
+}
+EXPORT_SYMBOL(taskq_cancel_id);
+
+static int taskq_thread_spawn(taskq_t *tq);
+
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
+{
+	taskq_ent_t *t;
+	taskqid_t rc = TASKQID_INVALID;
+	unsigned long irqflags;
+
+	ASSERT(tq);
+	ASSERT(func);
+
+	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
+
+	/* Taskq being destroyed and all tasks drained */
+	if (!(tq->tq_flags & TASKQ_ACTIVE))
+		goto out;
+
+	/* Do not queue the task unless there is idle thread for it */
+	ASSERT(tq->tq_nactive <= tq->tq_nthreads);
+	if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+		/* Dynamic taskq may be able to spawn another thread */
+		if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+		    taskq_thread_spawn(tq) == 0)
+			goto out;
+	}
+
+	if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
+		goto out;
+
+	spin_lock(&t->tqent_lock);
+
+	/* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
+	if (flags & TQ_NOQUEUE)
+		list_add(&t->tqent_list, &tq->tq_prio_list);
+	/* Queue to the priority list instead of the pending list */
+	else if (flags & TQ_FRONT)
+		list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+	else
+		list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+	t->tqent_id = rc = tq->tq_next_id;
+	tq->tq_next_id++;
+	t->tqent_func = func;
+	t->tqent_arg = arg;
+	t->tqent_taskq = tq;
+	t->tqent_timer.function = NULL;
+	t->tqent_timer.expires = 0;
+
+	t->tqent_birth = jiffies;
+	DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
+
+	ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+	spin_unlock(&t->tqent_lock);
+
+	wake_up(&tq->tq_work_waitq);
+out:
+	/* Spawn additional taskq threads if required. */
+	if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
+		(void) taskq_thread_spawn(tq);
+
+	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+	return (rc);
+}
+EXPORT_SYMBOL(taskq_dispatch);
+
+taskqid_t
+taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
+    uint_t flags, clock_t expire_time)
+{
+	taskqid_t rc = TASKQID_INVALID;
+	taskq_ent_t *t;
+	unsigned long irqflags;
+
+	ASSERT(tq);
+	ASSERT(func);
+
+	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
+
+	/* Taskq being destroyed and all tasks drained */
+	if (!(tq->tq_flags & TASKQ_ACTIVE))
+		goto out;
+
+	if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
+		goto out;
+
+	spin_lock(&t->tqent_lock);
+
+	/* Queue to the delay list for subsequent execution */
+	list_add_tail(&t->tqent_list, &tq->tq_delay_list);
+
+	t->tqent_id = rc = tq->tq_next_id;
+	tq->tq_next_id++;
+	t->tqent_func = func;
+	t->tqent_arg = arg;
+	t->tqent_taskq = tq;
+	t->tqent_timer.function = task_expire;
+	t->tqent_timer.expires = (unsigned long)expire_time;
+	add_timer(&t->tqent_timer);
+
+	ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+	spin_unlock(&t->tqent_lock);
+out:
+	/* Spawn additional taskq threads if required. */
+	if (tq->tq_nactive == tq->tq_nthreads)
+		(void) taskq_thread_spawn(tq);
+	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+	return (rc);
+}
+EXPORT_SYMBOL(taskq_dispatch_delay);
+
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+    taskq_ent_t *t)
+{
+	unsigned long irqflags;
+	ASSERT(tq);
+	ASSERT(func);
+
+	spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
+	    tq->tq_lock_class);
+
+	/* Taskq being destroyed and all tasks drained */
+	if (!(tq->tq_flags & TASKQ_ACTIVE)) {
+		t->tqent_id = TASKQID_INVALID;
+		goto out;
+	}
+
+	if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+		/* Dynamic taskq may be able to spawn another thread */
+		if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+		    taskq_thread_spawn(tq) == 0)
+			goto out2;
+		flags |= TQ_FRONT;
+	}
+
+	spin_lock(&t->tqent_lock);
+
+	/*
+	 * Make sure the entry is not on some other taskq; it is important to
+	 * ASSERT() under lock
+	 */
+	ASSERT(taskq_empty_ent(t));
+
+	/*
+	 * Mark it as a prealloc'd task.  This is important
+	 * to ensure that we don't free it later.
+	 */
+	t->tqent_flags |= TQENT_FLAG_PREALLOC;
+
+	/* Queue to the priority list instead of the pending list */
+	if (flags & TQ_FRONT)
+		list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+	else
+		list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+	t->tqent_id = tq->tq_next_id;
+	tq->tq_next_id++;
+	t->tqent_func = func;
+	t->tqent_arg = arg;
+	t->tqent_taskq = tq;
+
+	t->tqent_birth = jiffies;
+	DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
+
+	spin_unlock(&t->tqent_lock);
+
+	wake_up(&tq->tq_work_waitq);
+out:
+	/* Spawn additional taskq threads if required. */
+	if (tq->tq_nactive == tq->tq_nthreads)
+		(void) taskq_thread_spawn(tq);
+out2:
+	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+}
+EXPORT_SYMBOL(taskq_dispatch_ent);
+
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+	return (list_empty(&t->tqent_list));
+}
+EXPORT_SYMBOL(taskq_empty_ent);
+
+void
+taskq_init_ent(taskq_ent_t *t)
+{
+	spin_lock_init(&t->tqent_lock);
+	init_waitqueue_head(&t->tqent_waitq);
+	timer_setup(&t->tqent_timer, NULL, 0);
+	INIT_LIST_HEAD(&t->tqent_list);
+	t->tqent_id = 0;
+	t->tqent_func = NULL;
+	t->tqent_arg = NULL;
+	t->tqent_flags = 0;
+	t->tqent_taskq = NULL;
+}
+EXPORT_SYMBOL(taskq_init_ent);
+
+/*
+ * Return the next pending task, preference is given to tasks on the
+ * priority list which were dispatched with TQ_FRONT.
+ */
+static taskq_ent_t *
+taskq_next_ent(taskq_t *tq)
+{
+	struct list_head *list;
+
+	if (!list_empty(&tq->tq_prio_list))
+		list = &tq->tq_prio_list;
+	else if (!list_empty(&tq->tq_pend_list))
+		list = &tq->tq_pend_list;
+	else
+		return (NULL);
+
+	return (list_entry(list->next, taskq_ent_t, tqent_list));
+}
+
+/*
+ * Spawns a new thread for the specified taskq.
+ */
+static void
+taskq_thread_spawn_task(void *arg)
+{
+	taskq_t *tq = (taskq_t *)arg;
+	unsigned long flags;
+
+	if (taskq_thread_create(tq) == NULL) {
+		/* restore spawning count if failed */
+		spin_lock_irqsave_nested(&tq->tq_lock, flags,
+		    tq->tq_lock_class);
+		tq->tq_nspawn--;
+		spin_unlock_irqrestore(&tq->tq_lock, flags);
+	}
+}
+
+/*
+ * Spawn addition threads for dynamic taskqs (TASKQ_DYNAMIC) the current
+ * number of threads is insufficient to handle the pending tasks.  These
+ * new threads must be created by the dedicated dynamic_taskq to avoid
+ * deadlocks between thread creation and memory reclaim.  The system_taskq
+ * which is also a dynamic taskq cannot be safely used for this.
+ */
+static int
+taskq_thread_spawn(taskq_t *tq)
+{
+	int spawning = 0;
+
+	if (!(tq->tq_flags & TASKQ_DYNAMIC))
+		return (0);
+
+	if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) &&
+	    (tq->tq_flags & TASKQ_ACTIVE)) {
+		spawning = (++tq->tq_nspawn);
+		taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task,
+		    tq, TQ_NOSLEEP);
+	}
+
+	return (spawning);
+}
+
+/*
+ * Threads in a dynamic taskq should only exit once it has been completely
+ * drained and no other threads are actively servicing tasks.  This prevents
+ * threads from being created and destroyed more than is required.
+ *
+ * The first thread is the thread list is treated as the primary thread.
+ * There is nothing special about the primary thread but in order to avoid
+ * all the taskq pids from changing we opt to make it long running.
+ */
+static int
+taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
+{
+	if (!(tq->tq_flags & TASKQ_DYNAMIC))
+		return (0);
+
+	if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t,
+	    tqt_thread_list) == tqt)
+		return (0);
+
+	return
+	    ((tq->tq_nspawn == 0) &&	/* No threads are being spawned */
+	    (tq->tq_nactive == 0) &&	/* No threads are handling tasks */
+	    (tq->tq_nthreads > 1) &&	/* More than 1 thread is running */
+	    (!taskq_next_ent(tq)) &&	/* There are no pending tasks */
+	    (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */
+}
+
+static int
+taskq_thread(void *args)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	sigset_t blocked;
+	taskq_thread_t *tqt = args;
+	taskq_t *tq;
+	taskq_ent_t *t;
+	int seq_tasks = 0;
+	unsigned long flags;
+	taskq_ent_t dup_task = {};
+
+	ASSERT(tqt);
+	ASSERT(tqt->tqt_tq);
+	tq = tqt->tqt_tq;
+	current->flags |= PF_NOFREEZE;
+
+	(void) spl_fstrans_mark();
+
+	sigfillset(&blocked);
+	sigprocmask(SIG_BLOCK, &blocked, NULL);
+	flush_signals(current);
+
+	tsd_set(taskq_tsd, tq);
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+	/*
+	 * If we are dynamically spawned, decrease spawning count. Note that
+	 * we could be created during taskq_create, in which case we shouldn't
+	 * do the decrement. But it's fine because taskq_create will reset
+	 * tq_nspawn later.
+	 */
+	if (tq->tq_flags & TASKQ_DYNAMIC)
+		tq->tq_nspawn--;
+
+	/* Immediately exit if more threads than allowed were created. */
+	if (tq->tq_nthreads >= tq->tq_maxthreads)
+		goto error;
+
+	tq->tq_nthreads++;
+	list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list);
+	wake_up(&tq->tq_wait_waitq);
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop()) {
+
+		if (list_empty(&tq->tq_pend_list) &&
+		    list_empty(&tq->tq_prio_list)) {
+
+			if (taskq_thread_should_stop(tq, tqt)) {
+				wake_up_all(&tq->tq_wait_waitq);
+				break;
+			}
+
+			add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
+			spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+			schedule();
+			seq_tasks = 0;
+
+			spin_lock_irqsave_nested(&tq->tq_lock, flags,
+			    tq->tq_lock_class);
+			remove_wait_queue(&tq->tq_work_waitq, &wait);
+		} else {
+			__set_current_state(TASK_RUNNING);
+		}
+
+		if ((t = taskq_next_ent(tq)) != NULL) {
+			list_del_init(&t->tqent_list);
+
+			/*
+			 * A TQENT_FLAG_PREALLOC task may be reused or freed
+			 * during the task function call. Store tqent_id and
+			 * tqent_flags here.
+			 *
+			 * Also use an on stack taskq_ent_t for tqt_task
+			 * assignment in this case; we want to make sure
+			 * to duplicate all fields, so the values are
+			 * correct when it's accessed via DTRACE_PROBE*.
+			 */
+			tqt->tqt_id = t->tqent_id;
+			tqt->tqt_flags = t->tqent_flags;
+
+			if (t->tqent_flags & TQENT_FLAG_PREALLOC) {
+				dup_task = *t;
+				t = &dup_task;
+			}
+			tqt->tqt_task = t;
+
+			taskq_insert_in_order(tq, tqt);
+			tq->tq_nactive++;
+			spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+			DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t);
+
+			/* Perform the requested task */
+			t->tqent_func(t->tqent_arg);
+
+			DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t);
+
+			spin_lock_irqsave_nested(&tq->tq_lock, flags,
+			    tq->tq_lock_class);
+			tq->tq_nactive--;
+			list_del_init(&tqt->tqt_active_list);
+			tqt->tqt_task = NULL;
+
+			/* For prealloc'd tasks, we don't free anything. */
+			if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC))
+				task_done(tq, t);
+
+			/*
+			 * When the current lowest outstanding taskqid is
+			 * done calculate the new lowest outstanding id
+			 */
+			if (tq->tq_lowest_id == tqt->tqt_id) {
+				tq->tq_lowest_id = taskq_lowest_id(tq);
+				ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id);
+			}
+
+			/* Spawn additional taskq threads if required. */
+			if ((++seq_tasks) > spl_taskq_thread_sequential &&
+			    taskq_thread_spawn(tq))
+				seq_tasks = 0;
+
+			tqt->tqt_id = TASKQID_INVALID;
+			tqt->tqt_flags = 0;
+			wake_up_all(&tq->tq_wait_waitq);
+		} else {
+			if (taskq_thread_should_stop(tq, tqt))
+				break;
+		}
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+	}
+
+	__set_current_state(TASK_RUNNING);
+	tq->tq_nthreads--;
+	list_del_init(&tqt->tqt_thread_list);
+error:
+	kmem_free(tqt, sizeof (taskq_thread_t));
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	tsd_set(taskq_tsd, NULL);
+
+	return (0);
+}
+
+static taskq_thread_t *
+taskq_thread_create(taskq_t *tq)
+{
+	static int last_used_cpu = 0;
+	taskq_thread_t *tqt;
+
+	tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE);
+	INIT_LIST_HEAD(&tqt->tqt_thread_list);
+	INIT_LIST_HEAD(&tqt->tqt_active_list);
+	tqt->tqt_tq = tq;
+	tqt->tqt_id = TASKQID_INVALID;
+
+	tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt,
+	    "%s", tq->tq_name);
+	if (tqt->tqt_thread == NULL) {
+		kmem_free(tqt, sizeof (taskq_thread_t));
+		return (NULL);
+	}
+
+	if (spl_taskq_thread_bind) {
+		last_used_cpu = (last_used_cpu + 1) % num_online_cpus();
+		kthread_bind(tqt->tqt_thread, last_used_cpu);
+	}
+
+	if (spl_taskq_thread_priority)
+		set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri));
+
+	wake_up_process(tqt->tqt_thread);
+
+	return (tqt);
+}
+
+taskq_t *
+taskq_create(const char *name, int nthreads, pri_t pri,
+    int minalloc, int maxalloc, uint_t flags)
+{
+	taskq_t *tq;
+	taskq_thread_t *tqt;
+	int count = 0, rc = 0, i;
+	unsigned long irqflags;
+
+	ASSERT(name != NULL);
+	ASSERT(minalloc >= 0);
+	ASSERT(maxalloc <= INT_MAX);
+	ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */
+
+	/* Scale the number of threads using nthreads as a percentage */
+	if (flags & TASKQ_THREADS_CPU_PCT) {
+		ASSERT(nthreads <= 100);
+		ASSERT(nthreads >= 0);
+		nthreads = MIN(nthreads, 100);
+		nthreads = MAX(nthreads, 0);
+		nthreads = MAX((num_online_cpus() * nthreads) / 100, 1);
+	}
+
+	tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE);
+	if (tq == NULL)
+		return (NULL);
+
+	spin_lock_init(&tq->tq_lock);
+	INIT_LIST_HEAD(&tq->tq_thread_list);
+	INIT_LIST_HEAD(&tq->tq_active_list);
+	tq->tq_name = kmem_strdup(name);
+	tq->tq_nactive = 0;
+	tq->tq_nthreads = 0;
+	tq->tq_nspawn = 0;
+	tq->tq_maxthreads = nthreads;
+	tq->tq_pri = pri;
+	tq->tq_minalloc = minalloc;
+	tq->tq_maxalloc = maxalloc;
+	tq->tq_nalloc = 0;
+	tq->tq_flags = (flags | TASKQ_ACTIVE);
+	tq->tq_next_id = TASKQID_INITIAL;
+	tq->tq_lowest_id = TASKQID_INITIAL;
+	INIT_LIST_HEAD(&tq->tq_free_list);
+	INIT_LIST_HEAD(&tq->tq_pend_list);
+	INIT_LIST_HEAD(&tq->tq_prio_list);
+	INIT_LIST_HEAD(&tq->tq_delay_list);
+	init_waitqueue_head(&tq->tq_work_waitq);
+	init_waitqueue_head(&tq->tq_wait_waitq);
+	tq->tq_lock_class = TQ_LOCK_GENERAL;
+	INIT_LIST_HEAD(&tq->tq_taskqs);
+
+	if (flags & TASKQ_PREPOPULATE) {
+		spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
+		    tq->tq_lock_class);
+
+		for (i = 0; i < minalloc; i++)
+			task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW,
+			    &irqflags));
+
+		spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+	}
+
+	if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic)
+		nthreads = 1;
+
+	for (i = 0; i < nthreads; i++) {
+		tqt = taskq_thread_create(tq);
+		if (tqt == NULL)
+			rc = 1;
+		else
+			count++;
+	}
+
+	/* Wait for all threads to be started before potential destroy */
+	wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count);
+	/*
+	 * taskq_thread might have touched nspawn, but we don't want them to
+	 * because they're not dynamically spawned. So we reset it to 0
+	 */
+	tq->tq_nspawn = 0;
+
+	if (rc) {
+		taskq_destroy(tq);
+		tq = NULL;
+	} else {
+		down_write(&tq_list_sem);
+		tq->tq_instance = taskq_find_by_name(name) + 1;
+		list_add_tail(&tq->tq_taskqs, &tq_list);
+		up_write(&tq_list_sem);
+	}
+
+	return (tq);
+}
+EXPORT_SYMBOL(taskq_create);
+
+void
+taskq_destroy(taskq_t *tq)
+{
+	struct task_struct *thread;
+	taskq_thread_t *tqt;
+	taskq_ent_t *t;
+	unsigned long flags;
+
+	ASSERT(tq);
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+	tq->tq_flags &= ~TASKQ_ACTIVE;
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	/*
+	 * When TASKQ_ACTIVE is clear new tasks may not be added nor may
+	 * new worker threads be spawned for dynamic taskq.
+	 */
+	if (dynamic_taskq != NULL)
+		taskq_wait_outstanding(dynamic_taskq, 0);
+
+	taskq_wait(tq);
+
+	/* remove taskq from global list used by the kstats */
+	down_write(&tq_list_sem);
+	list_del(&tq->tq_taskqs);
+	up_write(&tq_list_sem);
+
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+	/* wait for spawning threads to insert themselves to the list */
+	while (tq->tq_nspawn) {
+		spin_unlock_irqrestore(&tq->tq_lock, flags);
+		schedule_timeout_interruptible(1);
+		spin_lock_irqsave_nested(&tq->tq_lock, flags,
+		    tq->tq_lock_class);
+	}
+
+	/*
+	 * Signal each thread to exit and block until it does.  Each thread
+	 * is responsible for removing itself from the list and freeing its
+	 * taskq_thread_t.  This allows for idle threads to opt to remove
+	 * themselves from the taskq.  They can be recreated as needed.
+	 */
+	while (!list_empty(&tq->tq_thread_list)) {
+		tqt = list_entry(tq->tq_thread_list.next,
+		    taskq_thread_t, tqt_thread_list);
+		thread = tqt->tqt_thread;
+		spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+		kthread_stop(thread);
+
+		spin_lock_irqsave_nested(&tq->tq_lock, flags,
+		    tq->tq_lock_class);
+	}
+
+	while (!list_empty(&tq->tq_free_list)) {
+		t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+		ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+		list_del_init(&t->tqent_list);
+		task_free(tq, t);
+	}
+
+	ASSERT0(tq->tq_nthreads);
+	ASSERT0(tq->tq_nalloc);
+	ASSERT0(tq->tq_nspawn);
+	ASSERT(list_empty(&tq->tq_thread_list));
+	ASSERT(list_empty(&tq->tq_active_list));
+	ASSERT(list_empty(&tq->tq_free_list));
+	ASSERT(list_empty(&tq->tq_pend_list));
+	ASSERT(list_empty(&tq->tq_prio_list));
+	ASSERT(list_empty(&tq->tq_delay_list));
+
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	kmem_strfree(tq->tq_name);
+	kmem_free(tq, sizeof (taskq_t));
+}
+EXPORT_SYMBOL(taskq_destroy);
+
+
+static unsigned int spl_taskq_kick = 0;
+
+/*
+ * 2.6.36 API Change
+ * module_param_cb is introduced to take kernel_param_ops and
+ * module_param_call is marked as obsolete. Also set and get operations
+ * were changed to take a 'const struct kernel_param *'.
+ */
+static int
+#ifdef module_param_cb
+param_set_taskq_kick(const char *val, const struct kernel_param *kp)
+#else
+param_set_taskq_kick(const char *val, struct kernel_param *kp)
+#endif
+{
+	int ret;
+	taskq_t *tq = NULL;
+	taskq_ent_t *t;
+	unsigned long flags;
+
+	ret = param_set_uint(val, kp);
+	if (ret < 0 || !spl_taskq_kick)
+		return (ret);
+	/* reset value */
+	spl_taskq_kick = 0;
+
+	down_read(&tq_list_sem);
+	list_for_each_entry(tq, &tq_list, tq_taskqs) {
+		spin_lock_irqsave_nested(&tq->tq_lock, flags,
+		    tq->tq_lock_class);
+		/* Check if the first pending is older than 5 seconds */
+		t = taskq_next_ent(tq);
+		if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) {
+			(void) taskq_thread_spawn(tq);
+			printk(KERN_INFO "spl: Kicked taskq %s/%d\n",
+			    tq->tq_name, tq->tq_instance);
+		}
+		spin_unlock_irqrestore(&tq->tq_lock, flags);
+	}
+	up_read(&tq_list_sem);
+	return (ret);
+}
+
+#ifdef module_param_cb
+static const struct kernel_param_ops param_ops_taskq_kick = {
+	.set = param_set_taskq_kick,
+	.get = param_get_uint,
+};
+module_param_cb(spl_taskq_kick, &param_ops_taskq_kick, &spl_taskq_kick, 0644);
+#else
+module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint,
+	&spl_taskq_kick, 0644);
+#endif
+MODULE_PARM_DESC(spl_taskq_kick,
+	"Write nonzero to kick stuck taskqs to spawn more threads");
+
+int
+spl_taskq_init(void)
+{
+	init_rwsem(&tq_list_sem);
+	tsd_create(&taskq_tsd, NULL);
+
+	system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
+	    maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+	if (system_taskq == NULL)
+		return (1);
+
+	system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4),
+	    maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+	if (system_delay_taskq == NULL) {
+		taskq_destroy(system_taskq);
+		return (1);
+	}
+
+	dynamic_taskq = taskq_create("spl_dynamic_taskq", 1,
+	    maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
+	if (dynamic_taskq == NULL) {
+		taskq_destroy(system_taskq);
+		taskq_destroy(system_delay_taskq);
+		return (1);
+	}
+
+	/*
+	 * This is used to annotate tq_lock, so
+	 *   taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch
+	 * does not trigger a lockdep warning re: possible recursive locking
+	 */
+	dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;
+
+	return (0);
+}
+
+void
+spl_taskq_fini(void)
+{
+	taskq_destroy(dynamic_taskq);
+	dynamic_taskq = NULL;
+
+	taskq_destroy(system_delay_taskq);
+	system_delay_taskq = NULL;
+
+	taskq_destroy(system_taskq);
+	system_taskq = NULL;
+
+	tsd_destroy(&taskq_tsd);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
new file mode 100644
index 000000000000..0352a31ea835
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
@@ -0,0 +1,161 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Solaris Porting Layer (SPL) Thread Implementation.
+ */
+
+#include <sys/thread.h>
+#include <sys/kmem.h>
+#include <sys/tsd.h>
+
+/*
+ * Thread interfaces
+ */
+typedef struct thread_priv_s {
+	unsigned long tp_magic;		/* Magic */
+	int tp_name_size;		/* Name size */
+	char *tp_name;			/* Name (without _thread suffix) */
+	void (*tp_func)(void *);	/* Registered function */
+	void *tp_args;			/* Args to be passed to function */
+	size_t tp_len;			/* Len to be passed to function */
+	int tp_state;			/* State to start thread at */
+	pri_t tp_pri;			/* Priority to start threat at */
+} thread_priv_t;
+
+static int
+thread_generic_wrapper(void *arg)
+{
+	thread_priv_t *tp = (thread_priv_t *)arg;
+	void (*func)(void *);
+	void *args;
+
+	ASSERT(tp->tp_magic == TP_MAGIC);
+	func = tp->tp_func;
+	args = tp->tp_args;
+	set_current_state(tp->tp_state);
+	set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
+	kmem_free(tp->tp_name, tp->tp_name_size);
+	kmem_free(tp, sizeof (thread_priv_t));
+
+	if (func)
+		func(args);
+
+	return (0);
+}
+
+void
+__thread_exit(void)
+{
+	tsd_exit();
+	complete_and_exit(NULL, 0);
+	/* Unreachable */
+}
+EXPORT_SYMBOL(__thread_exit);
+
+/*
+ * thread_create() may block forever if it cannot create a thread or
+ * allocate memory.  This is preferable to returning a NULL which Solaris
+ * style callers likely never check for... since it can't fail.
+ */
+kthread_t *
+__thread_create(caddr_t stk, size_t  stksize, thread_func_t func,
+    const char *name, void *args, size_t len, proc_t *pp, int state, pri_t pri)
+{
+	thread_priv_t *tp;
+	struct task_struct *tsk;
+	char *p;
+
+	/* Option pp is simply ignored */
+	/* Variable stack size unsupported */
+	ASSERT(stk == NULL);
+
+	tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE);
+	if (tp == NULL)
+		return (NULL);
+
+	tp->tp_magic = TP_MAGIC;
+	tp->tp_name_size = strlen(name) + 1;
+
+	tp->tp_name = kmem_alloc(tp->tp_name_size, KM_PUSHPAGE);
+	if (tp->tp_name == NULL) {
+		kmem_free(tp, sizeof (thread_priv_t));
+		return (NULL);
+	}
+
+	strncpy(tp->tp_name, name, tp->tp_name_size);
+
+	/*
+	 * Strip trailing "_thread" from passed name which will be the func
+	 * name since the exposed API has no parameter for passing a name.
+	 */
+	p = strstr(tp->tp_name, "_thread");
+	if (p)
+		p[0] = '\0';
+
+	tp->tp_func  = func;
+	tp->tp_args  = args;
+	tp->tp_len   = len;
+	tp->tp_state = state;
+	tp->tp_pri   = pri;
+
+	tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp,
+	    "%s", tp->tp_name);
+	if (IS_ERR(tsk))
+		return (NULL);
+
+	wake_up_process(tsk);
+	return ((kthread_t *)tsk);
+}
+EXPORT_SYMBOL(__thread_create);
+
+/*
+ * spl_kthread_create - Wrapper providing pre-3.13 semantics for
+ * kthread_create() in which it is not killable and less likely
+ * to return -ENOMEM.
+ */
+struct task_struct *
+spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...)
+{
+	struct task_struct *tsk;
+	va_list args;
+	char name[TASK_COMM_LEN];
+
+	va_start(args, namefmt);
+	vsnprintf(name, sizeof (name), namefmt, args);
+	va_end(args);
+	do {
+		tsk = kthread_create(func, data, "%s", name);
+		if (IS_ERR(tsk)) {
+			if (signal_pending(current)) {
+				clear_thread_flag(TIF_SIGPENDING);
+				continue;
+			}
+			if (PTR_ERR(tsk) == -ENOMEM)
+				continue;
+			return (NULL);
+		} else {
+			return (tsk);
+		}
+	} while (1);
+}
+EXPORT_SYMBOL(spl_kthread_create);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c b/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c
new file mode 100644
index 000000000000..7912a381294d
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c
@@ -0,0 +1,33 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Each DTRACE_PROBE must define its trace point in one (and only one)
+ * source file, so this dummy file exists for that purpose.
+ */
+
+#include <sys/taskq.h>
+
+#ifdef _KERNEL
+#define	CREATE_TRACE_POINTS
+#include <sys/trace.h>
+#include <sys/trace_taskq.h>
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
new file mode 100644
index 000000000000..b955ed65470f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
@@ -0,0 +1,720 @@
+/*
+ *  Copyright (C) 2010 Lawrence Livermore National Security, LLC.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ *  Solaris Porting Layer (SPL) Thread Specific Data Implementation.
+ *
+ *  Thread specific data has implemented using a hash table, this avoids
+ *  the need to add a member to the task structure and allows maximum
+ *  portability between kernels.  This implementation has been optimized
+ *  to keep the tsd_set() and tsd_get() times as small as possible.
+ *
+ *  The majority of the entries in the hash table are for specific tsd
+ *  entries.  These entries are hashed by the product of their key and
+ *  pid because by design the key and pid are guaranteed to be unique.
+ *  Their product also has the desirable properly that it will be uniformly
+ *  distributed over the hash bins providing neither the pid nor key is zero.
+ *  Under linux the zero pid is always the init process and thus won't be
+ *  used, and this implementation is careful to never to assign a zero key.
+ *  By default the hash table is sized to 512 bins which is expected to
+ *  be sufficient for light to moderate usage of thread specific data.
+ *
+ *  The hash table contains two additional type of entries.  They first
+ *  type is entry is called a 'key' entry and it is added to the hash during
+ *  tsd_create().  It is used to store the address of the destructor function
+ *  and it is used as an anchor point.  All tsd entries which use the same
+ *  key will be linked to this entry.  This is used during tsd_destroy() to
+ *  quickly call the destructor function for all tsd associated with the key.
+ *  The 'key' entry may be looked up with tsd_hash_search() by passing the
+ *  key you wish to lookup and DTOR_PID constant as the pid.
+ *
+ *  The second type of entry is called a 'pid' entry and it is added to the
+ *  hash the first time a process set a key.  The 'pid' entry is also used
+ *  as an anchor and all tsd for the process will be linked to it.  This
+ *  list is using during tsd_exit() to ensure all registered destructors
+ *  are run for the process.  The 'pid' entry may be looked up with
+ *  tsd_hash_search() by passing the PID_KEY constant as the key, and
+ *  the process pid.  Note that tsd_exit() is called by thread_exit()
+ *  so if your using the Solaris thread API you should not need to call
+ *  tsd_exit() directly.
+ *
+ */
+
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/tsd.h>
+#include <linux/hash.h>
+
+typedef struct tsd_hash_bin {
+	spinlock_t		hb_lock;
+	struct hlist_head	hb_head;
+} tsd_hash_bin_t;
+
+typedef struct tsd_hash_table {
+	spinlock_t		ht_lock;
+	uint_t			ht_bits;
+	uint_t			ht_key;
+	tsd_hash_bin_t		*ht_bins;
+} tsd_hash_table_t;
+
+typedef struct tsd_hash_entry {
+	uint_t			he_key;
+	pid_t			he_pid;
+	dtor_func_t		he_dtor;
+	void			*he_value;
+	struct hlist_node	he_list;
+	struct list_head	he_key_list;
+	struct list_head	he_pid_list;
+} tsd_hash_entry_t;
+
+static tsd_hash_table_t *tsd_hash_table = NULL;
+
+
+/*
+ * tsd_hash_search - searches hash table for tsd_hash_entry
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ */
+static tsd_hash_entry_t *
+tsd_hash_search(tsd_hash_table_t *table, uint_t key, pid_t pid)
+{
+	struct hlist_node *node = NULL;
+	tsd_hash_entry_t *entry;
+	tsd_hash_bin_t *bin;
+	ulong_t hash;
+
+	hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+	bin = &table->ht_bins[hash];
+	spin_lock(&bin->hb_lock);
+	hlist_for_each(node, &bin->hb_head) {
+		entry = list_entry(node, tsd_hash_entry_t, he_list);
+		if ((entry->he_key == key) && (entry->he_pid == pid)) {
+			spin_unlock(&bin->hb_lock);
+			return (entry);
+		}
+	}
+
+	spin_unlock(&bin->hb_lock);
+	return (NULL);
+}
+
+/*
+ * tsd_hash_dtor - call the destructor and free all entries on the list
+ * @work: list of hash entries
+ *
+ * For a list of entries which have all already been removed from the
+ * hash call their registered destructor then free the associated memory.
+ */
+static void
+tsd_hash_dtor(struct hlist_head *work)
+{
+	tsd_hash_entry_t *entry;
+
+	while (!hlist_empty(work)) {
+		entry = hlist_entry(work->first, tsd_hash_entry_t, he_list);
+		hlist_del(&entry->he_list);
+
+		if (entry->he_dtor && entry->he_pid != DTOR_PID)
+			entry->he_dtor(entry->he_value);
+
+		kmem_free(entry, sizeof (tsd_hash_entry_t));
+	}
+}
+
+/*
+ * tsd_hash_add - adds an entry to hash table
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ *
+ * The caller is responsible for ensuring the unique key/pid do not
+ * already exist in the hash table.  This possible because all entries
+ * are thread specific thus a concurrent thread will never attempt to
+ * add this key/pid.  Because multiple bins must be checked to add
+ * links to the dtor and pid entries the entire table is locked.
+ */
+static int
+tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value)
+{
+	tsd_hash_entry_t *entry, *dtor_entry, *pid_entry;
+	tsd_hash_bin_t *bin;
+	ulong_t hash;
+	int rc = 0;
+
+	ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL);
+
+	/* New entry allocate structure, set value, and add to hash */
+	entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+	if (entry == NULL)
+		return (ENOMEM);
+
+	entry->he_key = key;
+	entry->he_pid = pid;
+	entry->he_value = value;
+	INIT_HLIST_NODE(&entry->he_list);
+	INIT_LIST_HEAD(&entry->he_key_list);
+	INIT_LIST_HEAD(&entry->he_pid_list);
+
+	spin_lock(&table->ht_lock);
+
+	/* Destructor entry must exist for all valid keys */
+	dtor_entry = tsd_hash_search(table, entry->he_key, DTOR_PID);
+	ASSERT3P(dtor_entry, !=, NULL);
+	entry->he_dtor = dtor_entry->he_dtor;
+
+	/* Process entry must exist for all valid processes */
+	pid_entry = tsd_hash_search(table, PID_KEY, entry->he_pid);
+	ASSERT3P(pid_entry, !=, NULL);
+
+	hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+	bin = &table->ht_bins[hash];
+	spin_lock(&bin->hb_lock);
+
+	/* Add to the hash, key, and pid lists */
+	hlist_add_head(&entry->he_list, &bin->hb_head);
+	list_add(&entry->he_key_list, &dtor_entry->he_key_list);
+	list_add(&entry->he_pid_list, &pid_entry->he_pid_list);
+
+	spin_unlock(&bin->hb_lock);
+	spin_unlock(&table->ht_lock);
+
+	return (rc);
+}
+
+/*
+ * tsd_hash_add_key - adds a destructor entry to the hash table
+ * @table: hash table
+ * @keyp: search key
+ * @dtor: key destructor
+ *
+ * For every unique key there is a single entry in the hash which is used
+ * as anchor.  All other thread specific entries for this key are linked
+ * to this anchor via the 'he_key_list' list head.  On return they keyp
+ * will be set to the next available key for the hash table.
+ */
+static int
+tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor)
+{
+	tsd_hash_entry_t *tmp_entry, *entry;
+	tsd_hash_bin_t *bin;
+	ulong_t hash;
+	int keys_checked = 0;
+
+	ASSERT3P(table, !=, NULL);
+
+	/* Allocate entry to be used as a destructor for this key */
+	entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+	if (entry == NULL)
+		return (ENOMEM);
+
+	/* Determine next available key value */
+	spin_lock(&table->ht_lock);
+	do {
+		/* Limited to TSD_KEYS_MAX concurrent unique keys */
+		if (table->ht_key++ > TSD_KEYS_MAX)
+			table->ht_key = 1;
+
+		/* Ensure failure when all TSD_KEYS_MAX keys are in use */
+		if (keys_checked++ >= TSD_KEYS_MAX) {
+			spin_unlock(&table->ht_lock);
+			return (ENOENT);
+		}
+
+		tmp_entry = tsd_hash_search(table, table->ht_key, DTOR_PID);
+	} while (tmp_entry);
+
+	/* Add destructor entry in to hash table */
+	entry->he_key = *keyp = table->ht_key;
+	entry->he_pid = DTOR_PID;
+	entry->he_dtor = dtor;
+	entry->he_value = NULL;
+	INIT_HLIST_NODE(&entry->he_list);
+	INIT_LIST_HEAD(&entry->he_key_list);
+	INIT_LIST_HEAD(&entry->he_pid_list);
+
+	hash = hash_long((ulong_t)*keyp * (ulong_t)DTOR_PID, table->ht_bits);
+	bin = &table->ht_bins[hash];
+	spin_lock(&bin->hb_lock);
+
+	hlist_add_head(&entry->he_list, &bin->hb_head);
+
+	spin_unlock(&bin->hb_lock);
+	spin_unlock(&table->ht_lock);
+
+	return (0);
+}
+
+/*
+ * tsd_hash_add_pid - adds a process entry to the hash table
+ * @table: hash table
+ * @pid: search pid
+ *
+ * For every process there is a single entry in the hash which is used
+ * as anchor.  All other thread specific entries for this process are
+ * linked to this anchor via the 'he_pid_list' list head.
+ */
+static int
+tsd_hash_add_pid(tsd_hash_table_t *table, pid_t pid)
+{
+	tsd_hash_entry_t *entry;
+	tsd_hash_bin_t *bin;
+	ulong_t hash;
+
+	/* Allocate entry to be used as the process reference */
+	entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+	if (entry == NULL)
+		return (ENOMEM);
+
+	spin_lock(&table->ht_lock);
+	entry->he_key = PID_KEY;
+	entry->he_pid = pid;
+	entry->he_dtor = NULL;
+	entry->he_value = NULL;
+	INIT_HLIST_NODE(&entry->he_list);
+	INIT_LIST_HEAD(&entry->he_key_list);
+	INIT_LIST_HEAD(&entry->he_pid_list);
+
+	hash = hash_long((ulong_t)PID_KEY * (ulong_t)pid, table->ht_bits);
+	bin = &table->ht_bins[hash];
+	spin_lock(&bin->hb_lock);
+
+	hlist_add_head(&entry->he_list, &bin->hb_head);
+
+	spin_unlock(&bin->hb_lock);
+	spin_unlock(&table->ht_lock);
+
+	return (0);
+}
+
+/*
+ * tsd_hash_del - delete an entry from hash table, key, and pid lists
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ */
+static void
+tsd_hash_del(tsd_hash_table_t *table, tsd_hash_entry_t *entry)
+{
+	hlist_del(&entry->he_list);
+	list_del_init(&entry->he_key_list);
+	list_del_init(&entry->he_pid_list);
+}
+
+/*
+ * tsd_hash_table_init - allocate a hash table
+ * @bits: hash table size
+ *
+ * A hash table with 2^bits bins will be created, it may not be resized
+ * after the fact and must be free'd with tsd_hash_table_fini().
+ */
+static tsd_hash_table_t *
+tsd_hash_table_init(uint_t bits)
+{
+	tsd_hash_table_t *table;
+	int hash, size = (1 << bits);
+
+	table = kmem_zalloc(sizeof (tsd_hash_table_t), KM_SLEEP);
+	if (table == NULL)
+		return (NULL);
+
+	table->ht_bins = kmem_zalloc(sizeof (tsd_hash_bin_t) * size, KM_SLEEP);
+	if (table->ht_bins == NULL) {
+		kmem_free(table, sizeof (tsd_hash_table_t));
+		return (NULL);
+	}
+
+	for (hash = 0; hash < size; hash++) {
+		spin_lock_init(&table->ht_bins[hash].hb_lock);
+		INIT_HLIST_HEAD(&table->ht_bins[hash].hb_head);
+	}
+
+	spin_lock_init(&table->ht_lock);
+	table->ht_bits = bits;
+	table->ht_key = 1;
+
+	return (table);
+}
+
+/*
+ * tsd_hash_table_fini - free a hash table
+ * @table: hash table
+ *
+ * Free a hash table allocated by tsd_hash_table_init().  If the hash
+ * table is not empty this function will call the proper destructor for
+ * all remaining entries before freeing the memory used by those entries.
+ */
+static void
+tsd_hash_table_fini(tsd_hash_table_t *table)
+{
+	HLIST_HEAD(work);
+	tsd_hash_bin_t *bin;
+	tsd_hash_entry_t *entry;
+	int size, i;
+
+	ASSERT3P(table, !=, NULL);
+	spin_lock(&table->ht_lock);
+	for (i = 0, size = (1 << table->ht_bits); i < size; i++) {
+		bin = &table->ht_bins[i];
+		spin_lock(&bin->hb_lock);
+		while (!hlist_empty(&bin->hb_head)) {
+			entry = hlist_entry(bin->hb_head.first,
+			    tsd_hash_entry_t, he_list);
+			tsd_hash_del(table, entry);
+			hlist_add_head(&entry->he_list, &work);
+		}
+		spin_unlock(&bin->hb_lock);
+	}
+	spin_unlock(&table->ht_lock);
+
+	tsd_hash_dtor(&work);
+	kmem_free(table->ht_bins, sizeof (tsd_hash_bin_t)*(1<<table->ht_bits));
+	kmem_free(table, sizeof (tsd_hash_table_t));
+}
+
+/*
+ * tsd_remove_entry - remove a tsd entry for this thread
+ * @entry: entry to remove
+ *
+ * Remove the thread specific data @entry for this thread.
+ * If this is the last entry for this thread, also remove the PID entry.
+ */
+static void
+tsd_remove_entry(tsd_hash_entry_t *entry)
+{
+	HLIST_HEAD(work);
+	tsd_hash_table_t *table;
+	tsd_hash_entry_t *pid_entry;
+	tsd_hash_bin_t *pid_entry_bin, *entry_bin;
+	ulong_t hash;
+
+	table = tsd_hash_table;
+	ASSERT3P(table, !=, NULL);
+	ASSERT3P(entry, !=, NULL);
+
+	spin_lock(&table->ht_lock);
+
+	hash = hash_long((ulong_t)entry->he_key *
+	    (ulong_t)entry->he_pid, table->ht_bits);
+	entry_bin = &table->ht_bins[hash];
+
+	/* save the possible pid_entry */
+	pid_entry = list_entry(entry->he_pid_list.next, tsd_hash_entry_t,
+	    he_pid_list);
+
+	/* remove entry */
+	spin_lock(&entry_bin->hb_lock);
+	tsd_hash_del(table, entry);
+	hlist_add_head(&entry->he_list, &work);
+	spin_unlock(&entry_bin->hb_lock);
+
+	/* if pid_entry is indeed pid_entry, then remove it if it's empty */
+	if (pid_entry->he_key == PID_KEY &&
+	    list_empty(&pid_entry->he_pid_list)) {
+		hash = hash_long((ulong_t)pid_entry->he_key *
+		    (ulong_t)pid_entry->he_pid, table->ht_bits);
+		pid_entry_bin = &table->ht_bins[hash];
+
+		spin_lock(&pid_entry_bin->hb_lock);
+		tsd_hash_del(table, pid_entry);
+		hlist_add_head(&pid_entry->he_list, &work);
+		spin_unlock(&pid_entry_bin->hb_lock);
+	}
+
+	spin_unlock(&table->ht_lock);
+
+	tsd_hash_dtor(&work);
+}
+
+/*
+ * tsd_set - set thread specific data
+ * @key: lookup key
+ * @value: value to set
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(), protected
+ * from racing tsd_get() or tsd_set() because it is thread specific.
+ * This function has been optimized to be fast for the update case.
+ * When setting the tsd initially it will be slower due to additional
+ * required locking and potential memory allocations.
+ */
+int
+tsd_set(uint_t key, void *value)
+{
+	tsd_hash_table_t *table;
+	tsd_hash_entry_t *entry;
+	pid_t pid;
+	int rc;
+	/* mark remove if value is NULL */
+	boolean_t remove = (value == NULL);
+
+	table = tsd_hash_table;
+	pid = curthread->pid;
+	ASSERT3P(table, !=, NULL);
+
+	if ((key == 0) || (key > TSD_KEYS_MAX))
+		return (EINVAL);
+
+	/* Entry already exists in hash table update value */
+	entry = tsd_hash_search(table, key, pid);
+	if (entry) {
+		entry->he_value = value;
+		/* remove the entry */
+		if (remove)
+			tsd_remove_entry(entry);
+		return (0);
+	}
+
+	/* don't create entry if value is NULL */
+	if (remove)
+		return (0);
+
+	/* Add a process entry to the hash if not yet exists */
+	entry = tsd_hash_search(table, PID_KEY, pid);
+	if (entry == NULL) {
+		rc = tsd_hash_add_pid(table, pid);
+		if (rc)
+			return (rc);
+	}
+
+	rc = tsd_hash_add(table, key, pid, value);
+	return (rc);
+}
+EXPORT_SYMBOL(tsd_set);
+
+/*
+ * tsd_get - get thread specific data
+ * @key: lookup key
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy().  This
+ * implementation is designed to be fast and scalable, it does not
+ * lock the entire table only a single hash bin.
+ */
+void *
+tsd_get(uint_t key)
+{
+	tsd_hash_entry_t *entry;
+
+	ASSERT3P(tsd_hash_table, !=, NULL);
+
+	if ((key == 0) || (key > TSD_KEYS_MAX))
+		return (NULL);
+
+	entry = tsd_hash_search(tsd_hash_table, key, curthread->pid);
+	if (entry == NULL)
+		return (NULL);
+
+	return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get);
+
+/*
+ * tsd_get_by_thread - get thread specific data for specified thread
+ * @key: lookup key
+ * @thread: thread to lookup
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy().  This
+ * implementation is designed to be fast and scalable, it does not
+ * lock the entire table only a single hash bin.
+ */
+void *
+tsd_get_by_thread(uint_t key, kthread_t *thread)
+{
+	tsd_hash_entry_t *entry;
+
+	ASSERT3P(tsd_hash_table, !=, NULL);
+
+	if ((key == 0) || (key > TSD_KEYS_MAX))
+		return (NULL);
+
+	entry = tsd_hash_search(tsd_hash_table, key, thread->pid);
+	if (entry == NULL)
+		return (NULL);
+
+	return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get_by_thread);
+
+/*
+ * tsd_create - create thread specific data key
+ * @keyp: lookup key address
+ * @dtor: destructor called during tsd_destroy() or tsd_exit()
+ *
+ * Provided key must be set to 0 or it assumed to be already in use.
+ * The dtor is allowed to be NULL in which case no additional cleanup
+ * for the data is performed during tsd_destroy() or tsd_exit().
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(), this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_create(uint_t *keyp, dtor_func_t dtor)
+{
+	ASSERT3P(keyp, !=, NULL);
+	if (*keyp)
+		return;
+
+	(void) tsd_hash_add_key(tsd_hash_table, keyp, dtor);
+}
+EXPORT_SYMBOL(tsd_create);
+
+/*
+ * tsd_destroy - destroy thread specific data
+ * @keyp: lookup key address
+ *
+ * Destroys the thread specific data on all threads which use this key.
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(), this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_destroy(uint_t *keyp)
+{
+	HLIST_HEAD(work);
+	tsd_hash_table_t *table;
+	tsd_hash_entry_t *dtor_entry, *entry;
+	tsd_hash_bin_t *dtor_entry_bin, *entry_bin;
+	ulong_t hash;
+
+	table = tsd_hash_table;
+	ASSERT3P(table, !=, NULL);
+
+	spin_lock(&table->ht_lock);
+	dtor_entry = tsd_hash_search(table, *keyp, DTOR_PID);
+	if (dtor_entry == NULL) {
+		spin_unlock(&table->ht_lock);
+		return;
+	}
+
+	/*
+	 * All threads which use this key must be linked off of the
+	 * DTOR_PID entry.  They are removed from the hash table and
+	 * linked in to a private working list to be destroyed.
+	 */
+	while (!list_empty(&dtor_entry->he_key_list)) {
+		entry = list_entry(dtor_entry->he_key_list.next,
+		    tsd_hash_entry_t, he_key_list);
+		ASSERT3U(dtor_entry->he_key, ==, entry->he_key);
+		ASSERT3P(dtor_entry->he_dtor, ==, entry->he_dtor);
+
+		hash = hash_long((ulong_t)entry->he_key *
+		    (ulong_t)entry->he_pid, table->ht_bits);
+		entry_bin = &table->ht_bins[hash];
+
+		spin_lock(&entry_bin->hb_lock);
+		tsd_hash_del(table, entry);
+		hlist_add_head(&entry->he_list, &work);
+		spin_unlock(&entry_bin->hb_lock);
+	}
+
+	hash = hash_long((ulong_t)dtor_entry->he_key *
+	    (ulong_t)dtor_entry->he_pid, table->ht_bits);
+	dtor_entry_bin = &table->ht_bins[hash];
+
+	spin_lock(&dtor_entry_bin->hb_lock);
+	tsd_hash_del(table, dtor_entry);
+	hlist_add_head(&dtor_entry->he_list, &work);
+	spin_unlock(&dtor_entry_bin->hb_lock);
+	spin_unlock(&table->ht_lock);
+
+	tsd_hash_dtor(&work);
+	*keyp = 0;
+}
+EXPORT_SYMBOL(tsd_destroy);
+
+/*
+ * tsd_exit - destroys all thread specific data for this thread
+ *
+ * Destroys all the thread specific data for this thread.
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(), this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_exit(void)
+{
+	HLIST_HEAD(work);
+	tsd_hash_table_t *table;
+	tsd_hash_entry_t *pid_entry, *entry;
+	tsd_hash_bin_t *pid_entry_bin, *entry_bin;
+	ulong_t hash;
+
+	table = tsd_hash_table;
+	ASSERT3P(table, !=, NULL);
+
+	spin_lock(&table->ht_lock);
+	pid_entry = tsd_hash_search(table, PID_KEY, curthread->pid);
+	if (pid_entry == NULL) {
+		spin_unlock(&table->ht_lock);
+		return;
+	}
+
+	/*
+	 * All keys associated with this pid must be linked off of the
+	 * PID_KEY entry.  They are removed from the hash table and
+	 * linked in to a private working list to be destroyed.
+	 */
+
+	while (!list_empty(&pid_entry->he_pid_list)) {
+		entry = list_entry(pid_entry->he_pid_list.next,
+		    tsd_hash_entry_t, he_pid_list);
+		ASSERT3U(pid_entry->he_pid, ==, entry->he_pid);
+
+		hash = hash_long((ulong_t)entry->he_key *
+		    (ulong_t)entry->he_pid, table->ht_bits);
+		entry_bin = &table->ht_bins[hash];
+
+		spin_lock(&entry_bin->hb_lock);
+		tsd_hash_del(table, entry);
+		hlist_add_head(&entry->he_list, &work);
+		spin_unlock(&entry_bin->hb_lock);
+	}
+
+	hash = hash_long((ulong_t)pid_entry->he_key *
+	    (ulong_t)pid_entry->he_pid, table->ht_bits);
+	pid_entry_bin = &table->ht_bins[hash];
+
+	spin_lock(&pid_entry_bin->hb_lock);
+	tsd_hash_del(table, pid_entry);
+	hlist_add_head(&pid_entry->he_list, &work);
+	spin_unlock(&pid_entry_bin->hb_lock);
+	spin_unlock(&table->ht_lock);
+
+	tsd_hash_dtor(&work);
+}
+EXPORT_SYMBOL(tsd_exit);
+
+int
+spl_tsd_init(void)
+{
+	tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT);
+	if (tsd_hash_table == NULL)
+		return (1);
+
+	return (0);
+}
+
+void
+spl_tsd_fini(void)
+{
+	tsd_hash_table_fini(tsd_hash_table);
+	tsd_hash_table = NULL;
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-vmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-vmem.c
new file mode 100644
index 000000000000..32372e6f2b60
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-vmem.c
@@ -0,0 +1,91 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/percpu_compat.h>
+#include <sys/debug.h>
+#include <sys/vmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/shrinker.h>
+#include <linux/module.h>
+
+/*
+ * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces.
+ */
+void *
+spl_vmem_alloc(size_t size, int flags, const char *func, int line)
+{
+	ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+	flags |= KM_VMEM;
+
+#if !defined(DEBUG_KMEM)
+	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_alloc);
+
+void *
+spl_vmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+	ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+	flags |= (KM_VMEM | KM_ZERO);
+
+#if !defined(DEBUG_KMEM)
+	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_zalloc);
+
+void
+spl_vmem_free(const void *buf, size_t size)
+{
+#if !defined(DEBUG_KMEM)
+	return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+	return (spl_kmem_free_debug(buf, size));
+#else
+	return (spl_kmem_free_track(buf, size));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_free);
+
+int
+spl_vmem_init(void)
+{
+	return (0);
+}
+
+void
+spl_vmem_fini(void)
+{
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c b/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c
new file mode 100644
index 000000000000..1dd31ffc1483
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c
@@ -0,0 +1,513 @@
+/*
+ *  Copyright (c) 2008-2010 Sun Microsystems, Inc.
+ *  Written by Ricardo Correia <Ricardo.M.Correia@Sun.COM>
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Solaris Porting Layer (SPL) XDR Implementation.
+ */
+
+#include <linux/string.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <rpc/xdr.h>
+
+/*
+ * SPL's XDR mem implementation.
+ *
+ * This is used by libnvpair to serialize/deserialize the name-value pair data
+ * structures into byte arrays in a well-defined and portable manner.
+ *
+ * These data structures are used by the DMU/ZFS to flexibly manipulate various
+ * information in memory and later serialize it/deserialize it to disk.
+ * Examples of usages include the pool configuration, lists of pool and dataset
+ * properties, etc.
+ *
+ * Reference documentation for the XDR representation and XDR operations can be
+ * found in RFC 1832 and xdr(3), respectively.
+ *
+ * ===  Implementation shortcomings ===
+ *
+ * It is assumed that the following C types have the following sizes:
+ *
+ * char/unsigned char:      1 byte
+ * short/unsigned short:    2 bytes
+ * int/unsigned int:        4 bytes
+ * longlong_t/u_longlong_t: 8 bytes
+ *
+ * The C standard allows these types to be larger (and in the case of ints,
+ * shorter), so if that is the case on some compiler/architecture, the build
+ * will fail (on purpose).
+ *
+ * If someone wants to fix the code to work properly on such environments, then:
+ *
+ * 1) Preconditions should be added to xdrmem_enc functions to make sure the
+ *    caller doesn't pass arguments which exceed the expected range.
+ * 2) Functions which take signed integers should be changed to properly do
+ *    sign extension.
+ * 3) For ints with less than 32 bits, well.. I suspect you'll have bigger
+ *    problems than this implementation.
+ *
+ * It is also assumed that:
+ *
+ * 1) Chars have 8 bits.
+ * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned
+ *    memcpy, memset and memcmp.
+ * 3) Arrays passed to xdr_array() are packed and the compiler/architecture
+ *    supports element-sized-aligned memory accesses.
+ * 4) Negative integers are natively stored in two's complement binary
+ *    representation.
+ *
+ * No checks are done for the 4 assumptions above, though.
+ *
+ * === Caller expectations ===
+ *
+ * Existing documentation does not describe the semantics of XDR operations very
+ * well.  Therefore, some assumptions about failure semantics will be made and
+ * will be described below:
+ *
+ * 1) If any encoding operation fails (e.g., due to lack of buffer space), the
+ * the stream should be considered valid only up to the encoding operation
+ * previous to the one that first failed. However, the stream size as returned
+ * by xdr_control() cannot be considered to be strictly correct (it may be
+ * bigger).
+ *
+ * Putting it another way, if there is an encoding failure it's undefined
+ * whether anything is added to the stream in that operation and therefore
+ * neither xdr_control() nor future encoding operations on the same stream can
+ * be relied upon to produce correct results.
+ *
+ * 2) If a decoding operation fails, it's undefined whether anything will be
+ * decoded into passed buffers/pointers during that operation, or what the
+ * values on those buffers will look like.
+ *
+ * Future decoding operations on the same stream will also have similar
+ * undefined behavior.
+ *
+ * 3) When the first decoding operation fails it is OK to trust the results of
+ * previous decoding operations on the same stream, as long as the caller
+ * expects a failure to be possible (e.g. due to end-of-stream).
+ *
+ * However, this is highly discouraged because the caller should know the
+ * stream size and should be coded to expect any decoding failure to be data
+ * corruption due to hardware, accidental or even malicious causes, which should
+ * be handled gracefully in all cases.
+ *
+ * In very rare situations where there are strong reasons to believe the data
+ * can be trusted to be valid and non-tampered with, then the caller may assume
+ * a decoding failure to be a bug (e.g. due to mismatched data types) and may
+ * fail non-gracefully.
+ *
+ * 4) Non-zero padding bytes will cause the decoding operation to fail.
+ *
+ * 5) Zero bytes on string types will also cause the decoding operation to fail.
+ *
+ * 6) It is assumed that either the pointer to the stream buffer given by the
+ * caller is 32-bit aligned or the architecture supports non-32-bit-aligned int
+ * memory accesses.
+ *
+ * 7) The stream buffer and encoding/decoding buffers/ptrs should not overlap.
+ *
+ * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to user
+ * space or MMIO space), the computer may explode.
+ */
+
+static struct xdr_ops xdrmem_encode_ops;
+static struct xdr_ops xdrmem_decode_ops;
+
+void
+xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size,
+    const enum xdr_op op)
+{
+	switch (op) {
+		case XDR_ENCODE:
+			xdrs->x_ops = &xdrmem_encode_ops;
+			break;
+		case XDR_DECODE:
+			xdrs->x_ops = &xdrmem_decode_ops;
+			break;
+		default:
+			xdrs->x_ops = NULL; /* Let the caller know we failed */
+			return;
+	}
+
+	xdrs->x_op = op;
+	xdrs->x_addr = addr;
+	xdrs->x_addr_end = addr + size;
+
+	if (xdrs->x_addr_end < xdrs->x_addr) {
+		xdrs->x_ops = NULL;
+	}
+}
+EXPORT_SYMBOL(xdrmem_create);
+
+static bool_t
+xdrmem_control(XDR *xdrs, int req, void *info)
+{
+	struct xdr_bytesrec *rec = (struct xdr_bytesrec *)info;
+
+	if (req != XDR_GET_BYTES_AVAIL)
+		return (FALSE);
+
+	rec->xc_is_last_record = TRUE; /* always TRUE in xdrmem streams */
+	rec->xc_num_avail = xdrs->x_addr_end - xdrs->x_addr;
+
+	return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+	uint_t size = roundup(cnt, 4);
+	uint_t pad;
+
+	if (size < cnt)
+		return (FALSE); /* Integer overflow */
+
+	if (xdrs->x_addr > xdrs->x_addr_end)
+		return (FALSE);
+
+	if (xdrs->x_addr_end - xdrs->x_addr < size)
+		return (FALSE);
+
+	memcpy(xdrs->x_addr, cp, cnt);
+
+	xdrs->x_addr += cnt;
+
+	pad = size - cnt;
+	if (pad > 0) {
+		memset(xdrs->x_addr, 0, pad);
+		xdrs->x_addr += pad;
+	}
+
+	return (TRUE);
+}
+
+static bool_t
+xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+	static uint32_t zero = 0;
+	uint_t size = roundup(cnt, 4);
+	uint_t pad;
+
+	if (size < cnt)
+		return (FALSE); /* Integer overflow */
+
+	if (xdrs->x_addr > xdrs->x_addr_end)
+		return (FALSE);
+
+	if (xdrs->x_addr_end - xdrs->x_addr < size)
+		return (FALSE);
+
+	memcpy(cp, xdrs->x_addr, cnt);
+	xdrs->x_addr += cnt;
+
+	pad = size - cnt;
+	if (pad > 0) {
+		/* An inverted memchr() would be useful here... */
+		if (memcmp(&zero, xdrs->x_addr, pad) != 0)
+			return (FALSE);
+
+		xdrs->x_addr += pad;
+	}
+
+	return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_uint32(XDR *xdrs, uint32_t val)
+{
+	if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end)
+		return (FALSE);
+
+	*((uint32_t *)xdrs->x_addr) = cpu_to_be32(val);
+
+	xdrs->x_addr += sizeof (uint32_t);
+
+	return (TRUE);
+}
+
+static bool_t
+xdrmem_dec_uint32(XDR *xdrs, uint32_t *val)
+{
+	if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end)
+		return (FALSE);
+
+	*val = be32_to_cpu(*((uint32_t *)xdrs->x_addr));
+
+	xdrs->x_addr += sizeof (uint32_t);
+
+	return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_char(XDR *xdrs, char *cp)
+{
+	uint32_t val;
+
+	BUILD_BUG_ON(sizeof (char) != 1);
+	val = *((unsigned char *) cp);
+
+	return (xdrmem_enc_uint32(xdrs, val));
+}
+
+static bool_t
+xdrmem_dec_char(XDR *xdrs, char *cp)
+{
+	uint32_t val;
+
+	BUILD_BUG_ON(sizeof (char) != 1);
+
+	if (!xdrmem_dec_uint32(xdrs, &val))
+		return (FALSE);
+
+	/*
+	 * If any of the 3 other bytes are non-zero then val will be greater
+	 * than 0xff and we fail because according to the RFC, this block does
+	 * not have a char encoded in it.
+	 */
+	if (val > 0xff)
+		return (FALSE);
+
+	*((unsigned char *) cp) = val;
+
+	return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp)
+{
+	BUILD_BUG_ON(sizeof (unsigned short) != 2);
+
+	return (xdrmem_enc_uint32(xdrs, *usp));
+}
+
+static bool_t
+xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp)
+{
+	uint32_t val;
+
+	BUILD_BUG_ON(sizeof (unsigned short) != 2);
+
+	if (!xdrmem_dec_uint32(xdrs, &val))
+		return (FALSE);
+
+	/*
+	 * Short ints are not in the RFC, but we assume similar logic as in
+	 * xdrmem_dec_char().
+	 */
+	if (val > 0xffff)
+		return (FALSE);
+
+	*usp = val;
+
+	return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_uint(XDR *xdrs, unsigned *up)
+{
+	BUILD_BUG_ON(sizeof (unsigned) != 4);
+
+	return (xdrmem_enc_uint32(xdrs, *up));
+}
+
+static bool_t
+xdrmem_dec_uint(XDR *xdrs, unsigned *up)
+{
+	BUILD_BUG_ON(sizeof (unsigned) != 4);
+
+	return (xdrmem_dec_uint32(xdrs, (uint32_t *)up));
+}
+
+static bool_t
+xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp)
+{
+	BUILD_BUG_ON(sizeof (u_longlong_t) != 8);
+
+	if (!xdrmem_enc_uint32(xdrs, *ullp >> 32))
+		return (FALSE);
+
+	return (xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff));
+}
+
+static bool_t
+xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp)
+{
+	uint32_t low, high;
+
+	BUILD_BUG_ON(sizeof (u_longlong_t) != 8);
+
+	if (!xdrmem_dec_uint32(xdrs, &high))
+		return (FALSE);
+	if (!xdrmem_dec_uint32(xdrs, &low))
+		return (FALSE);
+
+	*ullp = ((u_longlong_t)high << 32) | low;
+
+	return (TRUE);
+}
+
+static bool_t
+xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize,
+    const uint_t elsize, const xdrproc_t elproc)
+{
+	uint_t i;
+	caddr_t addr = *arrp;
+
+	if (*sizep > maxsize || *sizep > UINT_MAX / elsize)
+		return (FALSE);
+
+	if (!xdrmem_enc_uint(xdrs, sizep))
+		return (FALSE);
+
+	for (i = 0; i < *sizep; i++) {
+		if (!elproc(xdrs, addr))
+			return (FALSE);
+		addr += elsize;
+	}
+
+	return (TRUE);
+}
+
+static bool_t
+xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize,
+    const uint_t elsize, const xdrproc_t elproc)
+{
+	uint_t i, size;
+	bool_t alloc = FALSE;
+	caddr_t addr;
+
+	if (!xdrmem_dec_uint(xdrs, sizep))
+		return (FALSE);
+
+	size = *sizep;
+
+	if (size > maxsize || size > UINT_MAX / elsize)
+		return (FALSE);
+
+	/*
+	 * The Solaris man page says: "If *arrp is NULL when decoding,
+	 * xdr_array() allocates memory and *arrp points to it".
+	 */
+	if (*arrp == NULL) {
+		BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t));
+
+		*arrp = kmem_alloc(size * elsize, KM_NOSLEEP);
+		if (*arrp == NULL)
+			return (FALSE);
+
+		alloc = TRUE;
+	}
+
+	addr = *arrp;
+
+	for (i = 0; i < size; i++) {
+		if (!elproc(xdrs, addr)) {
+			if (alloc)
+				kmem_free(*arrp, size * elsize);
+			return (FALSE);
+		}
+		addr += elsize;
+	}
+
+	return (TRUE);
+}
+
+static bool_t
+xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize)
+{
+	size_t slen = strlen(*sp);
+	uint_t len;
+
+	if (slen > maxsize)
+		return (FALSE);
+
+	len = slen;
+
+	if (!xdrmem_enc_uint(xdrs, &len))
+		return (FALSE);
+
+	return (xdrmem_enc_bytes(xdrs, *sp, len));
+}
+
+static bool_t
+xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize)
+{
+	uint_t size;
+	bool_t alloc = FALSE;
+
+	if (!xdrmem_dec_uint(xdrs, &size))
+		return (FALSE);
+
+	if (size > maxsize || size > UINT_MAX - 1)
+		return (FALSE);
+
+	/*
+	 * Solaris man page: "If *sp is NULL when decoding, xdr_string()
+	 * allocates memory and *sp points to it".
+	 */
+	if (*sp == NULL) {
+		BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t));
+
+		*sp = kmem_alloc(size + 1, KM_NOSLEEP);
+		if (*sp == NULL)
+			return (FALSE);
+
+		alloc = TRUE;
+	}
+
+	if (!xdrmem_dec_bytes(xdrs, *sp, size))
+		goto fail;
+
+	if (memchr(*sp, 0, size) != NULL)
+		goto fail;
+
+	(*sp)[size] = '\0';
+
+	return (TRUE);
+
+fail:
+	if (alloc)
+		kmem_free(*sp, size + 1);
+
+	return (FALSE);
+}
+
+static struct xdr_ops xdrmem_encode_ops = {
+	.xdr_control		= xdrmem_control,
+	.xdr_char		= xdrmem_enc_char,
+	.xdr_u_short		= xdrmem_enc_ushort,
+	.xdr_u_int		= xdrmem_enc_uint,
+	.xdr_u_longlong_t	= xdrmem_enc_ulonglong,
+	.xdr_opaque		= xdrmem_enc_bytes,
+	.xdr_string		= xdr_enc_string,
+	.xdr_array		= xdr_enc_array
+};
+
+static struct xdr_ops xdrmem_decode_ops = {
+	.xdr_control		= xdrmem_control,
+	.xdr_char		= xdrmem_dec_char,
+	.xdr_u_short		= xdrmem_dec_ushort,
+	.xdr_u_int		= xdrmem_dec_uint,
+	.xdr_u_longlong_t	= xdrmem_dec_ulonglong,
+	.xdr_opaque		= xdrmem_dec_bytes,
+	.xdr_string		= xdr_dec_string,
+	.xdr_array		= xdr_dec_array
+};
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c
new file mode 100644
index 000000000000..db05e28925b5
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c
@@ -0,0 +1,218 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ *  z_compress_level/z_uncompress are nearly identical copies of the
+ *  compress2/uncompress functions provided by the official zlib package
+ *  available at http://zlib.net/.  The only changes made we to slightly
+ *  adapt the functions called to match the linux kernel implementation
+ *  of zlib.  The full zlib license follows:
+ *
+ *  zlib.h -- interface of the 'zlib' general purpose compression library
+ *  version 1.2.5, April 19th, 2010
+ *
+ *  Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
+ *
+ *  This software is provided 'as-is', without any express or implied
+ *  warranty.  In no event will the authors be held liable for any damages
+ *  arising from the use of this software.
+ *
+ *  Permission is granted to anyone to use this software for any purpose,
+ *  including commercial applications, and to alter it and redistribute it
+ *  freely, subject to the following restrictions:
+ *
+ *  1. The origin of this software must not be misrepresented; you must not
+ *     claim that you wrote the original software. If you use this software
+ *     in a product, an acknowledgment in the product documentation would be
+ *     appreciated but is not required.
+ *  2. Altered source versions must be plainly marked as such, and must not be
+ *     misrepresented as being the original software.
+ *  3. This notice may not be removed or altered from any source distribution.
+ *
+ *  Jean-loup Gailly
+ *  Mark Adler
+ */
+
+
+#include <linux/percpu_compat.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/zmod.h>
+
+static spl_kmem_cache_t *zlib_workspace_cache;
+
+/*
+ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc
+ * and vfree for every call.  Using a kmem_cache also has the advantage
+ * that improves the odds that the memory used will be local to this cpu.
+ * To further improve things it might be wise to create a dedicated per-cpu
+ * workspace for use.  This would take some additional care because we then
+ * must disable preemption around the critical section, and verify that
+ * zlib_deflate* and zlib_inflate* never internally call schedule().
+ */
+static void *
+zlib_workspace_alloc(int flags)
+{
+	return (kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS)));
+}
+
+static void
+zlib_workspace_free(void *workspace)
+{
+	kmem_cache_free(zlib_workspace_cache, workspace);
+}
+
+/*
+ * Compresses the source buffer into the destination buffer. The level
+ * parameter has the same meaning as in deflateInit.  sourceLen is the byte
+ * length of the source buffer. Upon entry, destLen is the total size of the
+ * destination buffer, which must be at least 0.1% larger than sourceLen plus
+ * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
+ *
+ * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ * memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ * Z_STREAM_ERROR if the level parameter is invalid.
+ */
+int
+z_compress_level(void *dest, size_t *destLen, const void *source,
+    size_t sourceLen, int level)
+{
+	z_stream stream;
+	int err;
+
+	stream.next_in = (Byte *)source;
+	stream.avail_in = (uInt)sourceLen;
+	stream.next_out = dest;
+	stream.avail_out = (uInt)*destLen;
+
+	if ((size_t)stream.avail_out != *destLen)
+		return (Z_BUF_ERROR);
+
+	stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+	if (!stream.workspace)
+		return (Z_MEM_ERROR);
+
+	err = zlib_deflateInit(&stream, level);
+	if (err != Z_OK) {
+		zlib_workspace_free(stream.workspace);
+		return (err);
+	}
+
+	err = zlib_deflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END) {
+		zlib_deflateEnd(&stream);
+		zlib_workspace_free(stream.workspace);
+		return (err == Z_OK ? Z_BUF_ERROR : err);
+	}
+	*destLen = stream.total_out;
+
+	err = zlib_deflateEnd(&stream);
+	zlib_workspace_free(stream.workspace);
+
+	return (err);
+}
+EXPORT_SYMBOL(z_compress_level);
+
+/*
+ * Decompresses the source buffer into the destination buffer.  sourceLen is
+ * the byte length of the source buffer. Upon entry, destLen is the total
+ * size of the destination buffer, which must be large enough to hold the
+ * entire uncompressed data. (The size of the uncompressed data must have
+ * been saved previously by the compressor and transmitted to the decompressor
+ * by some mechanism outside the scope of this compression library.)
+ * Upon exit, destLen is the actual size of the compressed buffer.
+ * This function can be used to decompress a whole file at once if the
+ * input file is mmap'ed.
+ *
+ * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ * enough memory, Z_BUF_ERROR if there was not enough room in the output
+ * buffer, or Z_DATA_ERROR if the input data was corrupted.
+ */
+int
+z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen)
+{
+	z_stream stream;
+	int err;
+
+	stream.next_in = (Byte *)source;
+	stream.avail_in = (uInt)sourceLen;
+	stream.next_out = dest;
+	stream.avail_out = (uInt)*destLen;
+
+	if ((size_t)stream.avail_out != *destLen)
+		return (Z_BUF_ERROR);
+
+	stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+	if (!stream.workspace)
+		return (Z_MEM_ERROR);
+
+	err = zlib_inflateInit(&stream);
+	if (err != Z_OK) {
+		zlib_workspace_free(stream.workspace);
+		return (err);
+	}
+
+	err = zlib_inflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END) {
+		zlib_inflateEnd(&stream);
+		zlib_workspace_free(stream.workspace);
+
+		if (err == Z_NEED_DICT ||
+		    (err == Z_BUF_ERROR && stream.avail_in == 0))
+			return (Z_DATA_ERROR);
+
+		return (err);
+	}
+	*destLen = stream.total_out;
+
+	err = zlib_inflateEnd(&stream);
+	zlib_workspace_free(stream.workspace);
+
+	return (err);
+}
+EXPORT_SYMBOL(z_uncompress);
+
+int
+spl_zlib_init(void)
+{
+	int size;
+
+	size = MAX(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+	    zlib_inflate_workspacesize());
+
+	zlib_workspace_cache = kmem_cache_create(
+	    "spl_zlib_workspace_cache",
+	    size, 0, NULL, NULL, NULL, NULL, NULL,
+	    KMC_KVMEM);
+	if (!zlib_workspace_cache)
+		return (1);
+
+	return (0);
+}
+
+void
+spl_zlib_fini(void)
+{
+	kmem_cache_destroy(zlib_workspace_cache);
+	zlib_workspace_cache = NULL;
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in b/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in
new file mode 100644
index 000000000000..9f493ef16de5
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in
@@ -0,0 +1,37 @@
+#
+# Linux specific sources included from module/zfs/Makefile.in
+#
+
+# Suppress unused-value warnings in sparc64 architecture headers
+ccflags-$(CONFIG_SPARC64) += -Wno-unused-value
+
+$(MODULE)-objs += ../os/linux/zfs/abd_os.o
+$(MODULE)-objs += ../os/linux/zfs/arc_os.o
+$(MODULE)-objs += ../os/linux/zfs/mmp_os.o
+$(MODULE)-objs += ../os/linux/zfs/policy.o
+$(MODULE)-objs += ../os/linux/zfs/trace.o
+$(MODULE)-objs += ../os/linux/zfs/qat.o
+$(MODULE)-objs += ../os/linux/zfs/qat_compress.o
+$(MODULE)-objs += ../os/linux/zfs/qat_crypt.o
+$(MODULE)-objs += ../os/linux/zfs/spa_misc_os.o
+$(MODULE)-objs += ../os/linux/zfs/spa_stats.o
+$(MODULE)-objs += ../os/linux/zfs/vdev_disk.o
+$(MODULE)-objs += ../os/linux/zfs/vdev_file.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_acl.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_ctldir.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_debug.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_dir.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_file_os.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_ioctl_os.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_sysfs.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_vfsops.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_vnops.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_znode.o
+$(MODULE)-objs += ../os/linux/zfs/zio_crypt.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_ctldir.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_export.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_file.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_inode.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_super.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_xattr.o
+$(MODULE)-objs += ../os/linux/zfs/zvol_os.o
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
new file mode 100644
index 000000000000..c2281449ed12
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
@@ -0,0 +1,1074 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+/*
+ * See abd.c for a general overview of the arc buffered data (ABD).
+ *
+ * Linear buffers act exactly like normal buffers and are always mapped into the
+ * kernel's virtual memory space, while scattered ABD data chunks are allocated
+ * as physical pages and then mapped in only while they are actually being
+ * accessed through one of the abd_* library functions. Using scattered ABDs
+ * provides several benefits:
+ *
+ *  (1) They avoid use of kmem_*, preventing performance problems where running
+ *      kmem_reap on very large memory systems never finishes and causes
+ *      constant TLB shootdowns.
+ *
+ *  (2) Fragmentation is less of an issue since when we are at the limit of
+ *      allocatable space, we won't have to search around for a long free
+ *      hole in the VA space for large ARC allocations. Each chunk is mapped in
+ *      individually, so even if we are using HIGHMEM (see next point) we
+ *      wouldn't need to worry about finding a contiguous address range.
+ *
+ *  (3) If we are not using HIGHMEM, then all physical memory is always
+ *      mapped into the kernel's address space, so we also avoid the map /
+ *      unmap costs on each ABD access.
+ *
+ * If we are not using HIGHMEM, scattered buffers which have only one chunk
+ * can be treated as linear buffers, because they are contiguous in the
+ * kernel's virtual address space.  See abd_alloc_chunks() for details.
+ */
+
+#include <sys/abd_impl.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+#ifdef _KERNEL
+#include <linux/kmap_compat.h>
+#include <linux/scatterlist.h>
+#else
+#define	MAX_ORDER	1
+#endif
+
+typedef struct abd_stats {
+	kstat_named_t abdstat_struct_size;
+	kstat_named_t abdstat_linear_cnt;
+	kstat_named_t abdstat_linear_data_size;
+	kstat_named_t abdstat_scatter_cnt;
+	kstat_named_t abdstat_scatter_data_size;
+	kstat_named_t abdstat_scatter_chunk_waste;
+	kstat_named_t abdstat_scatter_orders[MAX_ORDER];
+	kstat_named_t abdstat_scatter_page_multi_chunk;
+	kstat_named_t abdstat_scatter_page_multi_zone;
+	kstat_named_t abdstat_scatter_page_alloc_retry;
+	kstat_named_t abdstat_scatter_sg_table_retry;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+	/* Amount of memory occupied by all of the abd_t struct allocations */
+	{ "struct_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The number of linear ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
+	 * ABD takes ownership of its buf then it will become tracked.
+	 */
+	{ "linear_cnt",				KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
+	{ "linear_data_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The number of scatter ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset()).
+	 */
+	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The amount of space wasted at the end of the last chunk across all
+	 * scatter ABDs tracked by scatter_cnt.
+	 */
+	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
+	/*
+	 * The number of compound allocations of a given order.  These
+	 * allocations are spread over all currently allocated ABDs, and
+	 * act as a measure of memory fragmentation.
+	 */
+	{ { "scatter_order_N",			KSTAT_DATA_UINT64 } },
+	/*
+	 * The number of scatter ABDs which contain multiple chunks.
+	 * ABDs are preferentially allocated from the minimum number of
+	 * contiguous multi-page chunks, a single chunk is optimal.
+	 */
+	{ "scatter_page_multi_chunk",		KSTAT_DATA_UINT64 },
+	/*
+	 * The number of scatter ABDs which are split across memory zones.
+	 * ABDs are preferentially allocated using pages from a single zone.
+	 */
+	{ "scatter_page_multi_zone",		KSTAT_DATA_UINT64 },
+	/*
+	 *  The total number of retries encountered when attempting to
+	 *  allocate the pages to populate the scatter ABD.
+	 */
+	{ "scatter_page_alloc_retry",		KSTAT_DATA_UINT64 },
+	/*
+	 *  The total number of retries encountered when attempting to
+	 *  allocate the sg table for an ABD.
+	 */
+	{ "scatter_sg_table_retry",		KSTAT_DATA_UINT64 },
+};
+
+#define	abd_for_each_sg(abd, sg, n, i)	\
+	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
+
+unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;
+
+/*
+ * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
+ * ABD's.  Smaller allocations will use linear ABD's which uses
+ * zio_[data_]buf_alloc().
+ *
+ * Scatter ABD's use at least one page each, so sub-page allocations waste
+ * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
+ * half of each page).  Using linear ABD's for small allocations means that
+ * they will be put on slabs which contain many allocations.  This can
+ * improve memory efficiency, but it also makes it much harder for ARC
+ * evictions to actually free pages, because all the buffers on one slab need
+ * to be freed in order for the slab (and underlying pages) to be freed.
+ * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
+ * possible for them to actually waste more memory than scatter (one page per
+ * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
+ *
+ * Spill blocks are typically 512B and are heavily used on systems running
+ * selinux with the default dnode size and the `xattr=sa` property set.
+ *
+ * By default we use linear allocations for 512B and 1KB, and scatter
+ * allocations for larger (1.5KB and up).
+ */
+int zfs_abd_scatter_min_size = 512 * 3;
+
+/*
+ * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
+ * just a single zero'd page. This allows us to conserve memory by
+ * only using a single zero page for the scatterlist.
+ */
+abd_t *abd_zero_scatter = NULL;
+
+struct page;
+/*
+ * abd_zero_page we will be an allocated zero'd PAGESIZE buffer, which is
+ * assigned to set each of the pages of abd_zero_scatter.
+ */
+static struct page *abd_zero_page = NULL;
+
+static kmem_cache_t *abd_cache = NULL;
+static kstat_t *abd_ksp;
+
+static size_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+	return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
+}
+
+abd_t *
+abd_alloc_struct(size_t size)
+{
+	/*
+	 * In Linux we do not use the size passed in during ABD
+	 * allocation, so we just ignore it.
+	 */
+	abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
+	ASSERT3P(abd, !=, NULL);
+	list_link_init(&abd->abd_gang_link);
+	mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL);
+	ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
+
+	return (abd);
+}
+
+void
+abd_free_struct(abd_t *abd)
+{
+	mutex_destroy(&abd->abd_mtx);
+	ASSERT(!list_link_active(&abd->abd_gang_link));
+	kmem_cache_free(abd_cache, abd);
+	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
+}
+
+#ifdef _KERNEL
+/*
+ * Mark zfs data pages so they can be excluded from kernel crash dumps
+ */
+#ifdef _LP64
+#define	ABD_FILE_CACHE_PAGE	0x2F5ABDF11ECAC4E
+
+static inline void
+abd_mark_zfs_page(struct page *page)
+{
+	get_page(page);
+	SetPagePrivate(page);
+	set_page_private(page, ABD_FILE_CACHE_PAGE);
+}
+
+static inline void
+abd_unmark_zfs_page(struct page *page)
+{
+	set_page_private(page, 0UL);
+	ClearPagePrivate(page);
+	put_page(page);
+}
+#else
+#define	abd_mark_zfs_page(page)
+#define	abd_unmark_zfs_page(page)
+#endif /* _LP64 */
+
+#ifndef CONFIG_HIGHMEM
+
+#ifndef __GFP_RECLAIM
+#define	__GFP_RECLAIM		__GFP_WAIT
+#endif
+
+/*
+ * The goal is to minimize fragmentation by preferentially populating ABDs
+ * with higher order compound pages from a single zone.  Allocation size is
+ * progressively decreased until it can be satisfied without performing
+ * reclaim or compaction.  When necessary this function will degenerate to
+ * allocating individual pages and allowing reclaim to satisfy allocations.
+ */
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+	struct list_head pages;
+	struct sg_table table;
+	struct scatterlist *sg;
+	struct page *page, *tmp_page = NULL;
+	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+	gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
+	int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1);
+	int nr_pages = abd_chunkcnt_for_bytes(size);
+	int chunks = 0, zones = 0;
+	size_t remaining_size;
+	int nid = NUMA_NO_NODE;
+	int alloc_pages = 0;
+
+	INIT_LIST_HEAD(&pages);
+
+	while (alloc_pages < nr_pages) {
+		unsigned chunk_pages;
+		int order;
+
+		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
+		chunk_pages = (1U << order);
+
+		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
+		if (page == NULL) {
+			if (order == 0) {
+				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
+				schedule_timeout_interruptible(1);
+			} else {
+				max_order = MAX(0, order - 1);
+			}
+			continue;
+		}
+
+		list_add_tail(&page->lru, &pages);
+
+		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
+			zones++;
+
+		nid = page_to_nid(page);
+		ABDSTAT_BUMP(abdstat_scatter_orders[order]);
+		chunks++;
+		alloc_pages += chunk_pages;
+	}
+
+	ASSERT3S(alloc_pages, ==, nr_pages);
+
+	while (sg_alloc_table(&table, chunks, gfp)) {
+		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+		schedule_timeout_interruptible(1);
+	}
+
+	sg = table.sgl;
+	remaining_size = size;
+	list_for_each_entry_safe(page, tmp_page, &pages, lru) {
+		size_t sg_size = MIN(PAGESIZE << compound_order(page),
+		    remaining_size);
+		sg_set_page(sg, page, sg_size, 0);
+		abd_mark_zfs_page(page);
+		remaining_size -= sg_size;
+
+		sg = sg_next(sg);
+		list_del(&page->lru);
+	}
+
+	/*
+	 * These conditions ensure that a possible transformation to a linear
+	 * ABD would be valid.
+	 */
+	ASSERT(!PageHighMem(sg_page(table.sgl)));
+	ASSERT0(ABD_SCATTER(abd).abd_offset);
+
+	if (table.nents == 1) {
+		/*
+		 * Since there is only one entry, this ABD can be represented
+		 * as a linear buffer.  All single-page (4K) ABD's can be
+		 * represented this way.  Some multi-page ABD's can also be
+		 * represented this way, if we were able to allocate a single
+		 * "chunk" (higher-order "page" which represents a power-of-2
+		 * series of physically-contiguous pages).  This is often the
+		 * case for 2-page (8K) ABD's.
+		 *
+		 * Representing a single-entry scatter ABD as a linear ABD
+		 * has the performance advantage of avoiding the copy (and
+		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
+		 * A performance increase of around 5% has been observed for
+		 * ARC-cached reads (of small blocks which can take advantage
+		 * of this).
+		 *
+		 * Note that this optimization is only possible because the
+		 * pages are always mapped into the kernel's address space.
+		 * This is not the case for highmem pages, so the
+		 * optimization can not be made there.
+		 */
+		abd->abd_flags |= ABD_FLAG_LINEAR;
+		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
+		abd->abd_u.abd_linear.abd_sgl = table.sgl;
+		ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
+	} else if (table.nents > 1) {
+		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
+
+		if (zones) {
+			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
+			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
+		}
+
+		ABD_SCATTER(abd).abd_sgl = table.sgl;
+		ABD_SCATTER(abd).abd_nents = table.nents;
+	}
+}
+#else
+
+/*
+ * Allocate N individual pages to construct a scatter ABD.  This function
+ * makes no attempt to request contiguous pages and requires the minimal
+ * number of kernel interfaces.  It's designed for maximum compatibility.
+ */
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+	struct scatterlist *sg = NULL;
+	struct sg_table table;
+	struct page *page;
+	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+	int nr_pages = abd_chunkcnt_for_bytes(size);
+	int i = 0;
+
+	while (sg_alloc_table(&table, nr_pages, gfp)) {
+		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+		schedule_timeout_interruptible(1);
+	}
+
+	ASSERT3U(table.nents, ==, nr_pages);
+	ABD_SCATTER(abd).abd_sgl = table.sgl;
+	ABD_SCATTER(abd).abd_nents = nr_pages;
+
+	abd_for_each_sg(abd, sg, nr_pages, i) {
+		while ((page = __page_cache_alloc(gfp)) == NULL) {
+			ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
+			schedule_timeout_interruptible(1);
+		}
+
+		ABDSTAT_BUMP(abdstat_scatter_orders[0]);
+		sg_set_page(sg, page, PAGESIZE, 0);
+		abd_mark_zfs_page(page);
+	}
+
+	if (nr_pages > 1) {
+		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
+	}
+}
+#endif /* !CONFIG_HIGHMEM */
+
+/*
+ * This must be called if any of the sg_table allocation functions
+ * are called.
+ */
+static void
+abd_free_sg_table(abd_t *abd)
+{
+	struct sg_table table;
+
+	table.sgl = ABD_SCATTER(abd).abd_sgl;
+	table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
+	sg_free_table(&table);
+}
+
+void
+abd_free_chunks(abd_t *abd)
+{
+	struct scatterlist *sg = NULL;
+	struct page *page;
+	int nr_pages = ABD_SCATTER(abd).abd_nents;
+	int order, i = 0;
+
+	if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
+		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);
+
+	if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
+		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
+
+	abd_for_each_sg(abd, sg, nr_pages, i) {
+		page = sg_page(sg);
+		abd_unmark_zfs_page(page);
+		order = compound_order(page);
+		__free_pages(page, order);
+		ASSERT3U(sg->length, <=, PAGE_SIZE << order);
+		ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
+	}
+	abd_free_sg_table(abd);
+}
+
+/*
+ * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
+ * the scatterlist will be set to the zero'd out buffer abd_zero_page.
+ */
+static void
+abd_alloc_zero_scatter(void)
+{
+	struct scatterlist *sg = NULL;
+	struct sg_table table;
+	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+	gfp_t gfp_zero_page = gfp | __GFP_ZERO;
+	int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
+	int i = 0;
+
+	while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
+		ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
+		schedule_timeout_interruptible(1);
+	}
+	abd_mark_zfs_page(abd_zero_page);
+
+	while (sg_alloc_table(&table, nr_pages, gfp)) {
+		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+		schedule_timeout_interruptible(1);
+	}
+	ASSERT3U(table.nents, ==, nr_pages);
+
+	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
+	abd_zero_scatter->abd_flags = ABD_FLAG_OWNER;
+	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
+	ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
+	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
+	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
+	abd_zero_scatter->abd_parent = NULL;
+	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
+	zfs_refcount_create(&abd_zero_scatter->abd_children);
+
+	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
+		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
+	}
+
+	ABDSTAT_BUMP(abdstat_scatter_cnt);
+	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
+	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+}
+
+#else /* _KERNEL */
+
+#ifndef PAGE_SHIFT
+#define	PAGE_SHIFT (highbit64(PAGESIZE)-1)
+#endif
+
+#define	zfs_kmap_atomic(chunk, km)	((void *)chunk)
+#define	zfs_kunmap_atomic(addr, km)	do { (void)(addr); } while (0)
+#define	local_irq_save(flags)		do { (void)(flags); } while (0)
+#define	local_irq_restore(flags)	do { (void)(flags); } while (0)
+#define	nth_page(pg, i) \
+	((struct page *)((void *)(pg) + (i) * PAGESIZE))
+
+struct scatterlist {
+	struct page *page;
+	int length;
+	int end;
+};
+
+static void
+sg_init_table(struct scatterlist *sg, int nr)
+{
+	memset(sg, 0, nr * sizeof (struct scatterlist));
+	sg[nr - 1].end = 1;
+}
+
+/*
+ * This must be called if any of the sg_table allocation functions
+ * are called.
+ */
+static void
+abd_free_sg_table(abd_t *abd)
+{
+	int nents = ABD_SCATTER(abd).abd_nents;
+	vmem_free(ABD_SCATTER(abd).abd_sgl,
+	    nents * sizeof (struct scatterlist));
+}
+
+#define	for_each_sg(sgl, sg, nr, i)	\
+	for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
+
+static inline void
+sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
+    unsigned int offset)
+{
+	/* currently we don't use offset */
+	ASSERT(offset == 0);
+	sg->page = page;
+	sg->length = len;
+}
+
+static inline struct page *
+sg_page(struct scatterlist *sg)
+{
+	return (sg->page);
+}
+
+static inline struct scatterlist *
+sg_next(struct scatterlist *sg)
+{
+	if (sg->end)
+		return (NULL);
+
+	return (sg + 1);
+}
+
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+	unsigned nr_pages = abd_chunkcnt_for_bytes(size);
+	struct scatterlist *sg;
+	int i;
+
+	ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
+	    sizeof (struct scatterlist), KM_SLEEP);
+	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
+
+	abd_for_each_sg(abd, sg, nr_pages, i) {
+		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
+		sg_set_page(sg, p, PAGESIZE, 0);
+	}
+	ABD_SCATTER(abd).abd_nents = nr_pages;
+}
+
+void
+abd_free_chunks(abd_t *abd)
+{
+	int i, n = ABD_SCATTER(abd).abd_nents;
+	struct scatterlist *sg;
+
+	abd_for_each_sg(abd, sg, n, i) {
+		for (int j = 0; j < sg->length; j += PAGESIZE) {
+			struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
+			umem_free(p, PAGESIZE);
+		}
+	}
+	abd_free_sg_table(abd);
+}
+
+static void
+abd_alloc_zero_scatter(void)
+{
+	unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
+	struct scatterlist *sg;
+	int i;
+
+	abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
+	memset(abd_zero_page, 0, PAGESIZE);
+	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
+	abd_zero_scatter->abd_flags = ABD_FLAG_OWNER;
+	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
+	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
+	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
+	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
+	abd_zero_scatter->abd_parent = NULL;
+	zfs_refcount_create(&abd_zero_scatter->abd_children);
+	ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
+	    sizeof (struct scatterlist), KM_SLEEP);
+
+	sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);
+
+	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
+		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
+	}
+
+	ABDSTAT_BUMP(abdstat_scatter_cnt);
+	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
+	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+}
+
+#endif /* _KERNEL */
+
+boolean_t
+abd_size_alloc_linear(size_t size)
+{
+	return (size < zfs_abd_scatter_min_size ? B_TRUE : B_FALSE);
+}
+
+void
+abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
+{
+	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+	int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
+	if (op == ABDSTAT_INCR) {
+		ABDSTAT_BUMP(abdstat_scatter_cnt);
+		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
+		ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
+		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
+	} else {
+		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
+		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
+		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
+	}
+}
+
+void
+abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
+{
+	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+	if (op == ABDSTAT_INCR) {
+		ABDSTAT_BUMP(abdstat_linear_cnt);
+		ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+	} else {
+		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+	}
+}
+
+void
+abd_verify_scatter(abd_t *abd)
+{
+	size_t n;
+	int i = 0;
+	struct scatterlist *sg = NULL;
+
+	ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
+	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
+	    ABD_SCATTER(abd).abd_sgl->length);
+	n = ABD_SCATTER(abd).abd_nents;
+	abd_for_each_sg(abd, sg, n, i) {
+		ASSERT3P(sg_page(sg), !=, NULL);
+	}
+}
+
+static void
+abd_free_zero_scatter(void)
+{
+	zfs_refcount_destroy(&abd_zero_scatter->abd_children);
+	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
+	ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
+
+	abd_free_sg_table(abd_zero_scatter);
+	abd_free_struct(abd_zero_scatter);
+	abd_zero_scatter = NULL;
+	ASSERT3P(abd_zero_page, !=, NULL);
+#if defined(_KERNEL)
+	abd_unmark_zfs_page(abd_zero_page);
+	__free_page(abd_zero_page);
+#else
+	umem_free(abd_zero_page, PAGESIZE);
+#endif /* _KERNEL */
+}
+
+void
+abd_init(void)
+{
+	int i;
+
+	abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+	if (abd_ksp != NULL) {
+		for (i = 0; i < MAX_ORDER; i++) {
+			snprintf(abd_stats.abdstat_scatter_orders[i].name,
+			    KSTAT_STRLEN, "scatter_order_%d", i);
+			abd_stats.abdstat_scatter_orders[i].data_type =
+			    KSTAT_DATA_UINT64;
+		}
+		abd_ksp->ks_data = &abd_stats;
+		kstat_install(abd_ksp);
+	}
+
+	abd_alloc_zero_scatter();
+}
+
+void
+abd_fini(void)
+{
+	abd_free_zero_scatter();
+
+	if (abd_ksp != NULL) {
+		kstat_delete(abd_ksp);
+		abd_ksp = NULL;
+	}
+
+	if (abd_cache) {
+		kmem_cache_destroy(abd_cache);
+		abd_cache = NULL;
+	}
+}
+
+void
+abd_free_linear_page(abd_t *abd)
+{
+	/* Transform it back into a scatter ABD for freeing */
+	struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
+	abd->abd_flags &= ~ABD_FLAG_LINEAR;
+	abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
+	ABD_SCATTER(abd).abd_nents = 1;
+	ABD_SCATTER(abd).abd_offset = 0;
+	ABD_SCATTER(abd).abd_sgl = sg;
+	abd_free_chunks(abd);
+
+	zfs_refcount_destroy(&abd->abd_children);
+	abd_update_scatter_stats(abd, ABDSTAT_DECR);
+	abd_free_struct(abd);
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * On Linux the optimal thing to do would be to use abd_get_offset() and
+ * construct a new ABD which shares the original pages thereby eliminating
+ * the copy.  But for the moment a new linear ABD is allocated until this
+ * performance optimization can be implemented.
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+	return (abd_alloc(size, is_metadata));
+}
+
+abd_t *
+abd_get_offset_scatter(abd_t *sabd, size_t off)
+{
+	abd_t *abd = NULL;
+	int i = 0;
+	struct scatterlist *sg = NULL;
+
+	abd_verify(sabd);
+	ASSERT3U(off, <=, sabd->abd_size);
+
+	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
+
+	abd = abd_alloc_struct(0);
+
+	/*
+	 * Even if this buf is filesystem metadata, we only track that
+	 * if we own the underlying data buffer, which is not true in
+	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
+	 */
+	abd->abd_flags = 0;
+
+	abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
+		if (new_offset < sg->length)
+			break;
+		new_offset -= sg->length;
+	}
+
+	ABD_SCATTER(abd).abd_sgl = sg;
+	ABD_SCATTER(abd).abd_offset = new_offset;
+	ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
+
+	return (abd);
+}
+
+/*
+ * Initialize the abd_iter.
+ */
+void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd)
+{
+	ASSERT(!abd_is_gang(abd));
+	abd_verify(abd);
+	aiter->iter_abd = abd;
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_mapsize = 0;
+	aiter->iter_pos = 0;
+	if (abd_is_linear(abd)) {
+		aiter->iter_offset = 0;
+		aiter->iter_sg = NULL;
+	} else {
+		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
+		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
+	}
+}
+
+/*
+ * This is just a helper function to see if we have exhausted the
+ * abd_iter and reached the end.
+ */
+boolean_t
+abd_iter_at_end(struct abd_iter *aiter)
+{
+	return (aiter->iter_pos == aiter->iter_abd->abd_size);
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter has already exhausted, in
+ * which case this does nothing.
+ */
+void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* There's nothing left to advance to, so do nothing */
+	if (abd_iter_at_end(aiter))
+		return;
+
+	aiter->iter_pos += amount;
+	aiter->iter_offset += amount;
+	if (!abd_is_linear(aiter->iter_abd)) {
+		while (aiter->iter_offset >= aiter->iter_sg->length) {
+			aiter->iter_offset -= aiter->iter_sg->length;
+			aiter->iter_sg = sg_next(aiter->iter_sg);
+			if (aiter->iter_sg == NULL) {
+				ASSERT0(aiter->iter_offset);
+				break;
+			}
+		}
+	}
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ */
+void
+abd_iter_map(struct abd_iter *aiter)
+{
+	void *paddr;
+	size_t offset = 0;
+
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* There's nothing left to iterate over, so do nothing */
+	if (abd_iter_at_end(aiter))
+		return;
+
+	if (abd_is_linear(aiter->iter_abd)) {
+		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
+		offset = aiter->iter_offset;
+		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
+		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
+	} else {
+		offset = aiter->iter_offset;
+		aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
+		    aiter->iter_abd->abd_size - aiter->iter_pos);
+
+		paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg),
+		    km_table[aiter->iter_km]);
+	}
+
+	aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ */
+void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+	/* There's nothing left to unmap, so do nothing */
+	if (abd_iter_at_end(aiter))
+		return;
+
+	if (!abd_is_linear(aiter->iter_abd)) {
+		/* LINTED E_FUNC_SET_NOT_USED */
+		zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset,
+		    km_table[aiter->iter_km]);
+	}
+
+	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+	ASSERT3U(aiter->iter_mapsize, >, 0);
+
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_mapsize = 0;
+}
+
+void
+abd_cache_reap_now(void)
+{
+}
+
+#if defined(_KERNEL)
+/*
+ * bio_nr_pages for ABD.
+ * @off is the offset in @abd
+ */
+unsigned long
+abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
+{
+	unsigned long pos;
+
+	while (abd_is_gang(abd))
+		abd = abd_gang_get_offset(abd, &off);
+
+	ASSERT(!abd_is_gang(abd));
+	if (abd_is_linear(abd))
+		pos = (unsigned long)abd_to_buf(abd) + off;
+	else
+		pos = ABD_SCATTER(abd).abd_offset + off;
+
+	return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
+	    (pos >> PAGE_SHIFT);
+}
+
+static unsigned int
+bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size)
+{
+	unsigned int offset, size, i;
+	struct page *page;
+
+	offset = offset_in_page(buf_ptr);
+	for (i = 0; i < bio->bi_max_vecs; i++) {
+		size = PAGE_SIZE - offset;
+
+		if (bio_size <= 0)
+			break;
+
+		if (size > bio_size)
+			size = bio_size;
+
+		if (is_vmalloc_addr(buf_ptr))
+			page = vmalloc_to_page(buf_ptr);
+		else
+			page = virt_to_page(buf_ptr);
+
+		/*
+		 * Some network related block device uses tcp_sendpage, which
+		 * doesn't behave well when using 0-count page, this is a
+		 * safety net to catch them.
+		 */
+		ASSERT3S(page_count(page), >, 0);
+
+		if (bio_add_page(bio, page, size, offset) != size)
+			break;
+
+		buf_ptr += size;
+		bio_size -= size;
+		offset = 0;
+	}
+
+	return (bio_size);
+}
+
+/*
+ * bio_map for gang ABD.
+ */
+static unsigned int
+abd_gang_bio_map_off(struct bio *bio, abd_t *abd,
+    unsigned int io_size, size_t off)
+{
+	ASSERT(abd_is_gang(abd));
+
+	for (abd_t *cabd = abd_gang_get_offset(abd, &off);
+	    cabd != NULL;
+	    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+		ASSERT3U(off, <, cabd->abd_size);
+		int size = MIN(io_size, cabd->abd_size - off);
+		int remainder = abd_bio_map_off(bio, cabd, size, off);
+		io_size -= (size - remainder);
+		if (io_size == 0 || remainder > 0)
+			return (io_size);
+		off = 0;
+	}
+	ASSERT0(io_size);
+	return (io_size);
+}
+
+/*
+ * bio_map for ABD.
+ * @off is the offset in @abd
+ * Remaining IO size is returned
+ */
+unsigned int
+abd_bio_map_off(struct bio *bio, abd_t *abd,
+    unsigned int io_size, size_t off)
+{
+	int i;
+	struct abd_iter aiter;
+
+	ASSERT3U(io_size, <=, abd->abd_size - off);
+	if (abd_is_linear(abd))
+		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size));
+
+	ASSERT(!abd_is_linear(abd));
+	if (abd_is_gang(abd))
+		return (abd_gang_bio_map_off(bio, abd, io_size, off));
+
+	abd_iter_init(&aiter, abd);
+	abd_iter_advance(&aiter, off);
+
+	for (i = 0; i < bio->bi_max_vecs; i++) {
+		struct page *pg;
+		size_t len, sgoff, pgoff;
+		struct scatterlist *sg;
+
+		if (io_size <= 0)
+			break;
+
+		sg = aiter.iter_sg;
+		sgoff = aiter.iter_offset;
+		pgoff = sgoff & (PAGESIZE - 1);
+		len = MIN(io_size, PAGESIZE - pgoff);
+		ASSERT(len > 0);
+
+		pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
+		if (bio_add_page(bio, pg, len, pgoff) != len)
+			break;
+
+		io_size -= len;
+		abd_iter_advance(&aiter, len);
+	}
+
+	return (io_size);
+}
+
+/* Tunable Parameters */
+module_param(zfs_abd_scatter_enabled, int, 0644);
+MODULE_PARM_DESC(zfs_abd_scatter_enabled,
+	"Toggle whether ABD allocations must be linear.");
+module_param(zfs_abd_scatter_min_size, int, 0644);
+MODULE_PARM_DESC(zfs_abd_scatter_min_size,
+	"Minimum size of scatter allocations.");
+/* CSTYLED */
+module_param(zfs_abd_scatter_max_order, uint, 0644);
+MODULE_PARM_DESC(zfs_abd_scatter_max_order,
+	"Maximum order allocation used for a scatter ABD.");
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
new file mode 100644
index 000000000000..92f9bae8ccd3
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
@@ -0,0 +1,465 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/zfs_refcount.h>
+#include <sys/vdev.h>
+#include <sys/vdev_trim.h>
+#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/multilist.h>
+#include <sys/abd.h>
+#include <sys/zil.h>
+#include <sys/fm/fs/zfs.h>
+#ifdef _KERNEL
+#include <sys/shrinker.h>
+#include <sys/vmsystm.h>
+#include <sys/zpl.h>
+#include <linux/page_compat.h>
+#endif
+#include <sys/callb.h>
+#include <sys/kstat.h>
+#include <sys/zthr.h>
+#include <zfs_fletcher.h>
+#include <sys/arc_impl.h>
+#include <sys/trace_zfs.h>
+#include <sys/aggsum.h>
+
+/*
+ * This is a limit on how many pages the ARC shrinker makes available for
+ * eviction in response to one page allocation attempt.  Note that in
+ * practice, the kernel's shrinker can ask us to evict up to about 4x this
+ * for one allocation attempt.
+ *
+ * The default limit of 10,000 (in practice, 160MB per allocation attempt
+ * with 4K pages) limits the amount of time spent attempting to reclaim ARC
+ * memory to less than 100ms per allocation attempt, even with a small
+ * average compressed block size of ~8KB.
+ *
+ * See also the comment in arc_shrinker_count().
+ * Set to 0 to disable limit.
+ */
+int zfs_arc_shrinker_limit = 10000;
+
+
+/*
+ * Return a default max arc size based on the amount of physical memory.
+ */
+uint64_t
+arc_default_max(uint64_t min, uint64_t allmem)
+{
+	/* Default to 1/2 of all memory. */
+	return (MAX(allmem / 2, min));
+}
+
+#ifdef _KERNEL
+/*
+ * Return maximum amount of memory that we could possibly use.  Reduced
+ * to half of all memory in user space which is primarily used for testing.
+ */
+uint64_t
+arc_all_memory(void)
+{
+#ifdef CONFIG_HIGHMEM
+	return (ptob(zfs_totalram_pages - zfs_totalhigh_pages));
+#else
+	return (ptob(zfs_totalram_pages));
+#endif /* CONFIG_HIGHMEM */
+}
+
+/*
+ * Return the amount of memory that is considered free.  In user space
+ * which is primarily used for testing we pretend that free memory ranges
+ * from 0-20% of all memory.
+ */
+uint64_t
+arc_free_memory(void)
+{
+#ifdef CONFIG_HIGHMEM
+	struct sysinfo si;
+	si_meminfo(&si);
+	return (ptob(si.freeram - si.freehigh));
+#else
+	return (ptob(nr_free_pages() +
+	    nr_inactive_file_pages() +
+	    nr_slab_reclaimable_pages()));
+#endif /* CONFIG_HIGHMEM */
+}
+
+/*
+ * Return the amount of memory that can be consumed before reclaim will be
+ * needed.  Positive if there is sufficient free memory, negative indicates
+ * the amount of memory that needs to be freed up.
+ */
+int64_t
+arc_available_memory(void)
+{
+	return (arc_free_memory() - arc_sys_free);
+}
+
+static uint64_t
+arc_evictable_memory(void)
+{
+	int64_t asize = aggsum_value(&arc_size);
+	uint64_t arc_clean =
+	    zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) +
+	    zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) +
+	    zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) +
+	    zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+	uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0);
+
+	/*
+	 * Scale reported evictable memory in proportion to page cache, cap
+	 * at specified min/max.
+	 */
+	uint64_t min = (ptob(nr_file_pages()) / 100) * zfs_arc_pc_percent;
+	min = MAX(arc_c_min, MIN(arc_c_max, min));
+
+	if (arc_dirty >= min)
+		return (arc_clean);
+
+	return (MAX((int64_t)asize - (int64_t)min, 0));
+}
+
+/*
+ * The _count() function returns the number of free-able objects.
+ * The _scan() function returns the number of objects that were freed.
+ */
+static unsigned long
+arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	/*
+	 * __GFP_FS won't be set if we are called from ZFS code (see
+	 * kmem_flags_convert(), which removes it).  To avoid a deadlock, we
+	 * don't allow evicting in this case.  We return 0 rather than
+	 * SHRINK_STOP so that the shrinker logic doesn't accumulate a
+	 * deficit against us.
+	 */
+	if (!(sc->gfp_mask & __GFP_FS)) {
+		return (0);
+	}
+
+	/*
+	 * This code is reached in the "direct reclaim" case, where the
+	 * kernel (outside ZFS) is trying to allocate a page, and the system
+	 * is low on memory.
+	 *
+	 * The kernel's shrinker code doesn't understand how many pages the
+	 * ARC's callback actually frees, so it may ask the ARC to shrink a
+	 * lot for one page allocation. This is problematic because it may
+	 * take a long time, thus delaying the page allocation, and because
+	 * it may force the ARC to unnecessarily shrink very small.
+	 *
+	 * Therefore, we limit the amount of data that we say is evictable,
+	 * which limits the amount that the shrinker will ask us to evict for
+	 * one page allocation attempt.
+	 *
+	 * In practice, we may be asked to shrink 4x the limit to satisfy one
+	 * page allocation, before the kernel's shrinker code gives up on us.
+	 * When that happens, we rely on the kernel code to find the pages
+	 * that we freed before invoking the OOM killer.  This happens in
+	 * __alloc_pages_slowpath(), which retries and finds the pages we
+	 * freed when it calls get_page_from_freelist().
+	 *
+	 * See also the comment above zfs_arc_shrinker_limit.
+	 */
+	int64_t limit = zfs_arc_shrinker_limit != 0 ?
+	    zfs_arc_shrinker_limit : INT64_MAX;
+	return (MIN(limit, btop((int64_t)arc_evictable_memory())));
+}
+
+static unsigned long
+arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	ASSERT((sc->gfp_mask & __GFP_FS) != 0);
+
+	/* The arc is considered warm once reclaim has occurred */
+	if (unlikely(arc_warm == B_FALSE))
+		arc_warm = B_TRUE;
+
+	/*
+	 * Evict the requested number of pages by reducing arc_c and waiting
+	 * for the requested amount of data to be evicted.
+	 */
+	arc_reduce_target_size(ptob(sc->nr_to_scan));
+	arc_wait_for_eviction(ptob(sc->nr_to_scan));
+	if (current->reclaim_state != NULL)
+		current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
+
+	/*
+	 * We are experiencing memory pressure which the arc_evict_zthr was
+	 * unable to keep up with. Set arc_no_grow to briefly pause arc
+	 * growth to avoid compounding the memory pressure.
+	 */
+	arc_no_grow = B_TRUE;
+
+	/*
+	 * When direct reclaim is observed it usually indicates a rapid
+	 * increase in memory pressure.  This occurs because the kswapd
+	 * threads were unable to asynchronously keep enough free memory
+	 * available.
+	 */
+	if (current_is_kswapd()) {
+		ARCSTAT_BUMP(arcstat_memory_indirect_count);
+	} else {
+		ARCSTAT_BUMP(arcstat_memory_direct_count);
+	}
+
+	return (sc->nr_to_scan);
+}
+
+SPL_SHRINKER_DECLARE(arc_shrinker,
+    arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);
+
+int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+	uint64_t free_memory = arc_free_memory();
+
+	if (free_memory > arc_all_memory() * arc_lotsfree_percent / 100)
+		return (0);
+
+	if (txg > spa->spa_lowmem_last_txg) {
+		spa->spa_lowmem_last_txg = txg;
+		spa->spa_lowmem_page_load = 0;
+	}
+	/*
+	 * If we are in pageout, we know that memory is already tight,
+	 * the arc is already going to be evicting, so we just want to
+	 * continue to let page writes occur as quickly as possible.
+	 */
+	if (current_is_kswapd()) {
+		if (spa->spa_lowmem_page_load >
+		    MAX(arc_sys_free / 4, free_memory) / 4) {
+			DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
+			return (SET_ERROR(ERESTART));
+		}
+		/* Note: reserve is inflated, so we deflate */
+		atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
+		return (0);
+	} else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
+		/* memory is low, delay before restarting */
+		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+		DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
+		return (SET_ERROR(EAGAIN));
+	}
+	spa->spa_lowmem_page_load = 0;
+	return (0);
+}
+
+void
+arc_lowmem_init(void)
+{
+	uint64_t allmem = arc_all_memory();
+
+	/*
+	 * Register a shrinker to support synchronous (direct) memory
+	 * reclaim from the arc.  This is done to prevent kswapd from
+	 * swapping out pages when it is preferable to shrink the arc.
+	 */
+	spl_register_shrinker(&arc_shrinker);
+
+	/*
+	 * The ARC tries to keep at least this much memory available for the
+	 * system.  This gives the ARC time to shrink in response to memory
+	 * pressure, before running completely out of memory and invoking the
+	 * direct-reclaim ARC shrinker.
+	 *
+	 * This should be more than twice high_wmark_pages(), so that
+	 * arc_wait_for_eviction() will wait until at least the
+	 * high_wmark_pages() are free (see arc_evict_state_impl()).
+	 *
+	 * Note: Even when the system is very low on memory, the kernel's
+	 * shrinker code may only ask for one "batch" of pages (512KB) to be
+	 * evicted.  If concurrent allocations consume these pages, there may
+	 * still be insufficient free pages, and the OOM killer takes action.
+	 *
+	 * By setting arc_sys_free large enough, and having
+	 * arc_wait_for_eviction() wait until there is at least arc_sys_free/2
+	 * free memory, it is much less likely that concurrent allocations can
+	 * consume all the memory that was evicted before checking for
+	 * OOM.
+	 *
+	 * It's hard to iterate the zones from a linux kernel module, which
+	 * makes it difficult to determine the watermark dynamically. Instead
+	 * we compute the maximum high watermark for this system, based
+	 * on the amount of memory, assuming default parameters on Linux kernel
+	 * 5.3.
+	 */
+
+	/*
+	 * Base wmark_low is 4 * the square root of Kbytes of RAM.
+	 */
+	long wmark = 4 * int_sqrt(allmem/1024) * 1024;
+
+	/*
+	 * Clamp to between 128K and 64MB.
+	 */
+	wmark = MAX(wmark, 128 * 1024);
+	wmark = MIN(wmark, 64 * 1024 * 1024);
+
+	/*
+	 * watermark_boost can increase the wmark by up to 150%.
+	 */
+	wmark += wmark * 150 / 100;
+
+	/*
+	 * arc_sys_free needs to be more than 2x the watermark, because
+	 * arc_wait_for_eviction() waits for half of arc_sys_free.  Bump this up
+	 * to 3x to ensure we're above it.
+	 */
+	arc_sys_free = wmark * 3 + allmem / 32;
+}
+
+void
+arc_lowmem_fini(void)
+{
+	spl_unregister_shrinker(&arc_shrinker);
+}
+
+int
+param_set_arc_long(const char *buf, zfs_kernel_param_t *kp)
+{
+	int error;
+
+	error = param_set_long(buf, kp);
+	if (error < 0)
+		return (SET_ERROR(error));
+
+	arc_tuning_update(B_TRUE);
+
+	return (0);
+}
+
+int
+param_set_arc_int(const char *buf, zfs_kernel_param_t *kp)
+{
+	int error;
+
+	error = param_set_int(buf, kp);
+	if (error < 0)
+		return (SET_ERROR(error));
+
+	arc_tuning_update(B_TRUE);
+
+	return (0);
+}
+#else /* _KERNEL */
+int64_t
+arc_available_memory(void)
+{
+	int64_t lowest = INT64_MAX;
+
+	/* Every 100 calls, free a small amount */
+	if (spa_get_random(100) == 0)
+		lowest = -1024;
+
+	return (lowest);
+}
+
+int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+	return (0);
+}
+
+uint64_t
+arc_all_memory(void)
+{
+	return (ptob(physmem) / 2);
+}
+
+uint64_t
+arc_free_memory(void)
+{
+	return (spa_get_random(arc_all_memory() * 20 / 100));
+}
+#endif /* _KERNEL */
+
+/*
+ * Helper function for arc_prune_async() it is responsible for safely
+ * handling the execution of a registered arc_prune_func_t.
+ */
+static void
+arc_prune_task(void *ptr)
+{
+	arc_prune_t *ap = (arc_prune_t *)ptr;
+	arc_prune_func_t *func = ap->p_pfunc;
+
+	if (func != NULL)
+		func(ap->p_adjust, ap->p_private);
+
+	zfs_refcount_remove(&ap->p_refcnt, func);
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffered they reference.  This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers.  This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread().  A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+void
+arc_prune_async(int64_t adjust)
+{
+	arc_prune_t *ap;
+
+	mutex_enter(&arc_prune_mtx);
+	for (ap = list_head(&arc_prune_list); ap != NULL;
+	    ap = list_next(&arc_prune_list, ap)) {
+
+		if (zfs_refcount_count(&ap->p_refcnt) >= 2)
+			continue;
+
+		zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
+		ap->p_adjust = adjust;
+		if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
+		    ap, TQ_SLEEP) == TASKQID_INVALID) {
+			zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
+			continue;
+		}
+		ARCSTAT_BUMP(arcstat_prune);
+	}
+	mutex_exit(&arc_prune_mtx);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW,
+	"Limit on number of pages that ARC shrinker can reclaim at once");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c b/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c
new file mode 100644
index 000000000000..ff3ef1bf6ad9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/mmp.h>
+
+int
+param_set_multihost_interval(const char *val, zfs_kernel_param_t *kp)
+{
+	int ret;
+
+	ret = param_set_ulong(val, kp);
+	if (ret < 0)
+		return (ret);
+
+	if (spa_mode_global != SPA_MODE_UNINIT)
+		mmp_signal_all_threads();
+
+	return (ret);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/policy.c b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
new file mode 100644
index 000000000000..5267d67eea82
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
@@ -0,0 +1,374 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright (C) 2016 Lawrence Livermore National Security, LLC.
+ *
+ * For Linux the vast majority of this enforcement is already handled via
+ * the standard Linux VFS permission checks.  However certain administrative
+ * commands which bypass the standard mechanisms may need to make use of
+ * this functionality.
+ */
+
+#include <sys/policy.h>
+#include <linux/security.h>
+#include <linux/vfs_compat.h>
+
+/*
+ * The passed credentials cannot be directly verified because Linux only
+ * provides and interface to check the *current* process credentials.  In
+ * order to handle this the capable() test is only run when the passed
+ * credentials match the current process credentials or the kcred.  In
+ * all other cases this function must fail and return the passed err.
+ */
+static int
+priv_policy_ns(const cred_t *cr, int capability, int err,
+    struct user_namespace *ns)
+{
+	if (cr != CRED() && (cr != kcred))
+		return (err);
+
+#if defined(CONFIG_USER_NS)
+	if (!(ns ? ns_capable(ns, capability) : capable(capability)))
+#else
+	if (!capable(capability))
+#endif
+		return (err);
+
+	return (0);
+}
+
+static int
+priv_policy(const cred_t *cr, int capability, int err)
+{
+	return (priv_policy_ns(cr, capability, err, NULL));
+}
+
+static int
+priv_policy_user(const cred_t *cr, int capability, int err)
+{
+	/*
+	 * All priv_policy_user checks are preceded by kuid/kgid_has_mapping()
+	 * checks. If we cannot do them, we shouldn't be using ns_capable()
+	 * since we don't know whether the affected files are valid in our
+	 * namespace.
+	 */
+#if defined(CONFIG_USER_NS)
+	return (priv_policy_ns(cr, capability, err, cr->user_ns));
+#else
+	return (priv_policy_ns(cr, capability, err, NULL));
+#endif
+}
+
+/*
+ * Checks for operations that are either client-only or are used by
+ * both clients and servers.
+ */
+int
+secpolicy_nfs(const cred_t *cr)
+{
+	return (priv_policy(cr, CAP_SYS_ADMIN, EPERM));
+}
+
+/*
+ * Catch all system configuration.
+ */
+int
+secpolicy_sys_config(const cred_t *cr, boolean_t checkonly)
+{
+	return (priv_policy(cr, CAP_SYS_ADMIN, EPERM));
+}
+
+/*
+ * Like secpolicy_vnode_access() but we get the actual wanted mode and the
+ * current mode of the file, not the missing bits.
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner,
+    mode_t curmode, mode_t wantmode)
+{
+	return (0);
+}
+
+/*
+ * This is a special routine for ZFS; it is used to determine whether
+ * any of the privileges in effect allow any form of access to the
+ * file.  There's no reason to audit this or any reason to record
+ * this.  More work is needed to do the "KPLD" stuff.
+ */
+int
+secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner)
+{
+	if (crgetfsuid(cr) == owner)
+		return (0);
+
+	if (inode_owner_or_capable(ip))
+		return (0);
+
+#if defined(CONFIG_USER_NS)
+	if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+		return (EPERM);
+#endif
+
+	if (priv_policy_user(cr, CAP_DAC_OVERRIDE, EPERM) == 0)
+		return (0);
+
+	if (priv_policy_user(cr, CAP_DAC_READ_SEARCH, EPERM) == 0)
+		return (0);
+
+	return (EPERM);
+}
+
+/*
+ * Determine if subject can chown owner of a file.
+ */
+int
+secpolicy_vnode_chown(const cred_t *cr, uid_t owner)
+{
+	if (crgetfsuid(cr) == owner)
+		return (0);
+
+#if defined(CONFIG_USER_NS)
+	if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+		return (EPERM);
+#endif
+
+	return (priv_policy_user(cr, CAP_FOWNER, EPERM));
+}
+
+/*
+ * Determine if subject can change group ownership of a file.
+ */
+int
+secpolicy_vnode_create_gid(const cred_t *cr)
+{
+	return (priv_policy(cr, CAP_SETGID, EPERM));
+}
+
+/*
+ * Policy determines whether we can remove an entry from a directory,
+ * regardless of permission bits.
+ */
+int
+secpolicy_vnode_remove(const cred_t *cr)
+{
+	return (priv_policy(cr, CAP_FOWNER, EPERM));
+}
+
+/*
+ * Determine that subject can modify the mode of a file.  allzone privilege
+ * needed when modifying root owned object.
+ */
+int
+secpolicy_vnode_setdac(const cred_t *cr, uid_t owner)
+{
+	if (crgetfsuid(cr) == owner)
+		return (0);
+
+#if defined(CONFIG_USER_NS)
+	if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+		return (EPERM);
+#endif
+
+	return (priv_policy_user(cr, CAP_FOWNER, EPERM));
+}
+
+/*
+ * Are we allowed to retain the set-uid/set-gid bits when
+ * changing ownership or when writing to a file?
+ * "issuid" should be true when set-uid; only in that case
+ * root ownership is checked (setgid is assumed).
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot)
+{
+	return (priv_policy_user(cr, CAP_FSETID, EPERM));
+}
+
+/*
+ * Determine that subject can set the file setgid flag.
+ */
+int
+secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid)
+{
+#if defined(CONFIG_USER_NS)
+	if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid)))
+		return (EPERM);
+#endif
+	if (crgetfsgid(cr) != gid && !groupmember(gid, cr))
+		return (priv_policy_user(cr, CAP_FSETID, EPERM));
+
+	return (0);
+}
+
+/*
+ * Determine if the subject can inject faults in the ZFS fault injection
+ * framework.  Requires all privileges.
+ */
+int
+secpolicy_zinject(const cred_t *cr)
+{
+	return (priv_policy(cr, CAP_SYS_ADMIN, EACCES));
+}
+
+/*
+ * Determine if the subject has permission to manipulate ZFS datasets
+ * (not pools).  Equivalent to the SYS_MOUNT privilege.
+ */
+int
+secpolicy_zfs(const cred_t *cr)
+{
+	return (priv_policy(cr, CAP_SYS_ADMIN, EACCES));
+}
+
+/*
+ * Equivalent to secpolicy_zfs(), but works even if the cred_t is not that of
+ * the current process.  Takes both cred_t and proc_t so that this can work
+ * easily on all platforms.
+ *
+ * The has_capability() function was first exported in the 4.10 Linux kernel
+ * then backported to some LTS kernels.  Prior to this change there was no
+ * mechanism to perform this check therefore EACCES is returned when the
+ * functionality is not present in the kernel.
+ */
+int
+secpolicy_zfs_proc(const cred_t *cr, proc_t *proc)
+{
+#if defined(HAVE_HAS_CAPABILITY)
+	if (!has_capability(proc, CAP_SYS_ADMIN))
+		return (EACCES);
+	return (0);
+#else
+	return (EACCES);
+#endif
+}
+
+void
+secpolicy_setid_clear(vattr_t *vap, cred_t *cr)
+{
+	if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&
+	    secpolicy_vnode_setid_retain(cr,
+	    (vap->va_mode & S_ISUID) != 0 &&
+	    (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) {
+		vap->va_mask |= AT_MODE;
+		vap->va_mode &= ~(S_ISUID|S_ISGID);
+	}
+}
+
+/*
+ * Determine that subject can set the file setid flags.
+ */
+static int
+secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner)
+{
+	if (crgetfsuid(cr) == owner)
+		return (0);
+
+#if defined(CONFIG_USER_NS)
+	if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+		return (EPERM);
+#endif
+
+	return (priv_policy_user(cr, CAP_FSETID, EPERM));
+}
+
+/*
+ * Determine that subject can make a file a "sticky".
+ *
+ * Enforced in the Linux VFS.
+ */
+static int
+secpolicy_vnode_stky_modify(const cred_t *cr)
+{
+	return (0);
+}
+
+int
+secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap,
+    const vattr_t *ovap, cred_t *cr)
+{
+	int error;
+
+	if ((vap->va_mode & S_ISUID) != 0 &&
+	    (error = secpolicy_vnode_setid_modify(cr,
+	    ovap->va_uid)) != 0) {
+		return (error);
+	}
+
+	/*
+	 * Check privilege if attempting to set the
+	 * sticky bit on a non-directory.
+	 */
+	if (!S_ISDIR(ip->i_mode) && (vap->va_mode & S_ISVTX) != 0 &&
+	    secpolicy_vnode_stky_modify(cr) != 0) {
+		vap->va_mode &= ~S_ISVTX;
+	}
+
+	/*
+	 * Check for privilege if attempting to set the
+	 * group-id bit.
+	 */
+	if ((vap->va_mode & S_ISGID) != 0 &&
+	    secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) {
+		vap->va_mode &= ~S_ISGID;
+	}
+
+	return (0);
+}
+
+/*
+ * Check privileges for setting xvattr attributes
+ */
+int
+secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, mode_t type)
+{
+	return (secpolicy_vnode_chown(cr, owner));
+}
+
+/*
+ * Check privileges for setattr attributes.
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_vnode_setattr(cred_t *cr, struct inode *ip, struct vattr *vap,
+    const struct vattr *ovap, int flags,
+    int unlocked_access(void *, int, cred_t *), void *node)
+{
+	return (0);
+}
+
+/*
+ * Check privileges for links.
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_basic_link(const cred_t *cr)
+{
+	return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat.c b/sys/contrib/openzfs/module/os/linux/zfs/qat.c
new file mode 100644
index 000000000000..08613b3a2042
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/qat.c
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <sys/zfs_context.h>
+#include <sys/qat.h>
+
+qat_stats_t qat_stats = {
+	{ "comp_requests",			KSTAT_DATA_UINT64 },
+	{ "comp_total_in_bytes",		KSTAT_DATA_UINT64 },
+	{ "comp_total_out_bytes",		KSTAT_DATA_UINT64 },
+	{ "decomp_requests",			KSTAT_DATA_UINT64 },
+	{ "decomp_total_in_bytes",		KSTAT_DATA_UINT64 },
+	{ "decomp_total_out_bytes",		KSTAT_DATA_UINT64 },
+	{ "dc_fails",				KSTAT_DATA_UINT64 },
+	{ "encrypt_requests",			KSTAT_DATA_UINT64 },
+	{ "encrypt_total_in_bytes",		KSTAT_DATA_UINT64 },
+	{ "encrypt_total_out_bytes",		KSTAT_DATA_UINT64 },
+	{ "decrypt_requests",			KSTAT_DATA_UINT64 },
+	{ "decrypt_total_in_bytes",		KSTAT_DATA_UINT64 },
+	{ "decrypt_total_out_bytes",		KSTAT_DATA_UINT64 },
+	{ "crypt_fails",			KSTAT_DATA_UINT64 },
+	{ "cksum_requests",			KSTAT_DATA_UINT64 },
+	{ "cksum_total_in_bytes",		KSTAT_DATA_UINT64 },
+	{ "cksum_fails",			KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *qat_ksp = NULL;
+
+CpaStatus
+qat_mem_alloc_contig(void **pp_mem_addr, Cpa32U size_bytes)
+{
+	*pp_mem_addr = kmalloc(size_bytes, GFP_KERNEL);
+	if (*pp_mem_addr == NULL)
+		return (CPA_STATUS_RESOURCE);
+	return (CPA_STATUS_SUCCESS);
+}
+
+void
+qat_mem_free_contig(void **pp_mem_addr)
+{
+	if (*pp_mem_addr != NULL) {
+		kfree(*pp_mem_addr);
+		*pp_mem_addr = NULL;
+	}
+}
+
+int
+qat_init(void)
+{
+	qat_ksp = kstat_create("zfs", 0, "qat", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (qat_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+	if (qat_ksp != NULL) {
+		qat_ksp->ks_data = &qat_stats;
+		kstat_install(qat_ksp);
+	}
+
+	/*
+	 * Just set the disable flag when qat init failed, qat can be
+	 * turned on again in post-process after zfs module is loaded, e.g.:
+	 * echo 0 > /sys/module/zfs/parameters/zfs_qat_compress_disable
+	 */
+	if (qat_dc_init() != 0)
+		zfs_qat_compress_disable = 1;
+
+	if (qat_cy_init() != 0) {
+		zfs_qat_checksum_disable = 1;
+		zfs_qat_encrypt_disable = 1;
+	}
+
+	return (0);
+}
+
+void
+qat_fini(void)
+{
+	if (qat_ksp != NULL) {
+		kstat_delete(qat_ksp);
+		qat_ksp = NULL;
+	}
+
+	qat_cy_fini();
+	qat_dc_fini();
+}
+
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c b/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c
new file mode 100644
index 000000000000..ad3ead3b16e3
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c
@@ -0,0 +1,569 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/completion.h>
+#include <sys/zfs_context.h>
+#include <sys/byteorder.h>
+#include <sys/zio.h>
+#include <sys/qat.h>
+
+/*
+ * Max instances in a QAT device, each instance is a channel to submit
+ * jobs to QAT hardware, this is only for pre-allocating instance and
+ * session arrays; the actual number of instances are defined in the
+ * QAT driver's configuration file.
+ */
+#define	QAT_DC_MAX_INSTANCES	48
+
+/*
+ * ZLIB head and foot size
+ */
+#define	ZLIB_HEAD_SZ		2
+#define	ZLIB_FOOT_SZ		4
+
+static CpaInstanceHandle dc_inst_handles[QAT_DC_MAX_INSTANCES];
+static CpaDcSessionHandle session_handles[QAT_DC_MAX_INSTANCES];
+static CpaBufferList **buffer_array[QAT_DC_MAX_INSTANCES];
+static Cpa16U num_inst = 0;
+static Cpa32U inst_num = 0;
+static boolean_t qat_dc_init_done = B_FALSE;
+int zfs_qat_compress_disable = 0;
+
+boolean_t
+qat_dc_use_accel(size_t s_len)
+{
+	return (!zfs_qat_compress_disable &&
+	    qat_dc_init_done &&
+	    s_len >= QAT_MIN_BUF_SIZE &&
+	    s_len <= QAT_MAX_BUF_SIZE);
+}
+
+static void
+qat_dc_callback(void *p_callback, CpaStatus status)
+{
+	if (p_callback != NULL)
+		complete((struct completion *)p_callback);
+}
+
+static void
+qat_dc_clean(void)
+{
+	Cpa16U buff_num = 0;
+	Cpa16U num_inter_buff_lists = 0;
+
+	for (Cpa16U i = 0; i < num_inst; i++) {
+		cpaDcStopInstance(dc_inst_handles[i]);
+		QAT_PHYS_CONTIG_FREE(session_handles[i]);
+		/* free intermediate buffers  */
+		if (buffer_array[i] != NULL) {
+			cpaDcGetNumIntermediateBuffers(
+			    dc_inst_handles[i], &num_inter_buff_lists);
+			for (buff_num = 0; buff_num < num_inter_buff_lists;
+			    buff_num++) {
+				CpaBufferList *buffer_inter =
+				    buffer_array[i][buff_num];
+				if (buffer_inter->pBuffers) {
+					QAT_PHYS_CONTIG_FREE(
+					    buffer_inter->pBuffers->pData);
+					QAT_PHYS_CONTIG_FREE(
+					    buffer_inter->pBuffers);
+				}
+				QAT_PHYS_CONTIG_FREE(
+				    buffer_inter->pPrivateMetaData);
+				QAT_PHYS_CONTIG_FREE(buffer_inter);
+			}
+		}
+	}
+
+	num_inst = 0;
+	qat_dc_init_done = B_FALSE;
+}
+
+int
+qat_dc_init(void)
+{
+	CpaStatus status = CPA_STATUS_SUCCESS;
+	Cpa32U sess_size = 0;
+	Cpa32U ctx_size = 0;
+	Cpa16U num_inter_buff_lists = 0;
+	Cpa16U buff_num = 0;
+	Cpa32U buff_meta_size = 0;
+	CpaDcSessionSetupData sd = {0};
+
+	if (qat_dc_init_done)
+		return (0);
+
+	status = cpaDcGetNumInstances(&num_inst);
+	if (status != CPA_STATUS_SUCCESS)
+		return (-1);
+
+	/* if the user has configured no QAT compression units just return */
+	if (num_inst == 0)
+		return (0);
+
+	if (num_inst > QAT_DC_MAX_INSTANCES)
+		num_inst = QAT_DC_MAX_INSTANCES;
+
+	status = cpaDcGetInstances(num_inst, &dc_inst_handles[0]);
+	if (status != CPA_STATUS_SUCCESS)
+		return (-1);
+
+	for (Cpa16U i = 0; i < num_inst; i++) {
+		cpaDcSetAddressTranslation(dc_inst_handles[i],
+		    (void*)virt_to_phys);
+
+		status = cpaDcBufferListGetMetaSize(dc_inst_handles[i],
+		    1, &buff_meta_size);
+
+		if (status == CPA_STATUS_SUCCESS)
+			status = cpaDcGetNumIntermediateBuffers(
+			    dc_inst_handles[i], &num_inter_buff_lists);
+
+		if (status == CPA_STATUS_SUCCESS && num_inter_buff_lists != 0)
+			status = QAT_PHYS_CONTIG_ALLOC(&buffer_array[i],
+			    num_inter_buff_lists *
+			    sizeof (CpaBufferList *));
+
+		for (buff_num = 0; buff_num < num_inter_buff_lists;
+		    buff_num++) {
+			if (status == CPA_STATUS_SUCCESS)
+				status = QAT_PHYS_CONTIG_ALLOC(
+				    &buffer_array[i][buff_num],
+				    sizeof (CpaBufferList));
+
+			if (status == CPA_STATUS_SUCCESS)
+				status = QAT_PHYS_CONTIG_ALLOC(
+				    &buffer_array[i][buff_num]->
+				    pPrivateMetaData,
+				    buff_meta_size);
+
+			if (status == CPA_STATUS_SUCCESS)
+				status = QAT_PHYS_CONTIG_ALLOC(
+				    &buffer_array[i][buff_num]->pBuffers,
+				    sizeof (CpaFlatBuffer));
+
+			if (status == CPA_STATUS_SUCCESS) {
+				/*
+				 *  implementation requires an intermediate
+				 *  buffer approximately twice the size of
+				 *  output buffer, which is 2x max buffer
+				 *  size here.
+				 */
+				status = QAT_PHYS_CONTIG_ALLOC(
+				    &buffer_array[i][buff_num]->pBuffers->
+				    pData, 2 * QAT_MAX_BUF_SIZE);
+				if (status != CPA_STATUS_SUCCESS)
+					goto fail;
+
+				buffer_array[i][buff_num]->numBuffers = 1;
+				buffer_array[i][buff_num]->pBuffers->
+				    dataLenInBytes = 2 * QAT_MAX_BUF_SIZE;
+			}
+		}
+
+		status = cpaDcStartInstance(dc_inst_handles[i],
+		    num_inter_buff_lists, buffer_array[i]);
+		if (status != CPA_STATUS_SUCCESS)
+			goto fail;
+
+		sd.compLevel = CPA_DC_L1;
+		sd.compType = CPA_DC_DEFLATE;
+		sd.huffType = CPA_DC_HT_FULL_DYNAMIC;
+		sd.sessDirection = CPA_DC_DIR_COMBINED;
+		sd.sessState = CPA_DC_STATELESS;
+		sd.deflateWindowSize = 7;
+		sd.checksum = CPA_DC_ADLER32;
+		status = cpaDcGetSessionSize(dc_inst_handles[i],
+		    &sd, &sess_size, &ctx_size);
+		if (status != CPA_STATUS_SUCCESS)
+			goto fail;
+
+		QAT_PHYS_CONTIG_ALLOC(&session_handles[i], sess_size);
+		if (session_handles[i] == NULL)
+			goto fail;
+
+		status = cpaDcInitSession(dc_inst_handles[i],
+		    session_handles[i],
+		    &sd, NULL, qat_dc_callback);
+		if (status != CPA_STATUS_SUCCESS)
+			goto fail;
+	}
+
+	qat_dc_init_done = B_TRUE;
+	return (0);
+fail:
+	qat_dc_clean();
+	return (-1);
+}
+
+void
+qat_dc_fini(void)
+{
+	if (!qat_dc_init_done)
+		return;
+
+	qat_dc_clean();
+}
+
+/*
+ * The "add" parameter is an additional buffer which is passed
+ * to QAT as a scratch buffer alongside the destination buffer
+ * in case the "compressed" data ends up being larger than the
+ * original source data. This is necessary to prevent QAT from
+ * generating buffer overflow warnings for incompressible data.
+ */
+static int
+qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len,
+    char *dst, int dst_len, char *add, int add_len, size_t *c_len)
+{
+	CpaInstanceHandle dc_inst_handle;
+	CpaDcSessionHandle session_handle;
+	CpaBufferList *buf_list_src = NULL;
+	CpaBufferList *buf_list_dst = NULL;
+	CpaFlatBuffer *flat_buf_src = NULL;
+	CpaFlatBuffer *flat_buf_dst = NULL;
+	Cpa8U *buffer_meta_src = NULL;
+	Cpa8U *buffer_meta_dst = NULL;
+	Cpa32U buffer_meta_size = 0;
+	CpaDcRqResults dc_results;
+	CpaStatus status = CPA_STATUS_FAIL;
+	Cpa32U hdr_sz = 0;
+	Cpa32U compressed_sz;
+	Cpa32U num_src_buf = (src_len >> PAGE_SHIFT) + 2;
+	Cpa32U num_dst_buf = (dst_len >> PAGE_SHIFT) + 2;
+	Cpa32U num_add_buf = (add_len >> PAGE_SHIFT) + 2;
+	Cpa32U bytes_left;
+	Cpa32U dst_pages = 0;
+	Cpa32U adler32 = 0;
+	char *data;
+	struct page *page;
+	struct page **in_pages = NULL;
+	struct page **out_pages = NULL;
+	struct page **add_pages = NULL;
+	Cpa32U page_off = 0;
+	struct completion complete;
+	Cpa32U page_num = 0;
+	Cpa16U i;
+
+	/*
+	 * We increment num_src_buf and num_dst_buf by 2 to allow
+	 * us to handle non page-aligned buffer addresses and buffers
+	 * whose sizes are not divisible by PAGE_SIZE.
+	 */
+	Cpa32U src_buffer_list_mem_size = sizeof (CpaBufferList) +
+	    (num_src_buf * sizeof (CpaFlatBuffer));
+	Cpa32U dst_buffer_list_mem_size = sizeof (CpaBufferList) +
+	    ((num_dst_buf + num_add_buf) * sizeof (CpaFlatBuffer));
+
+	status = QAT_PHYS_CONTIG_ALLOC(&in_pages,
+	    num_src_buf * sizeof (struct page *));
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	status = QAT_PHYS_CONTIG_ALLOC(&out_pages,
+	    num_dst_buf * sizeof (struct page *));
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	status = QAT_PHYS_CONTIG_ALLOC(&add_pages,
+	    num_add_buf * sizeof (struct page *));
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+	dc_inst_handle = dc_inst_handles[i];
+	session_handle = session_handles[i];
+
+	cpaDcBufferListGetMetaSize(dc_inst_handle, num_src_buf,
+	    &buffer_meta_size);
+	status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_src, buffer_meta_size);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	cpaDcBufferListGetMetaSize(dc_inst_handle, num_dst_buf + num_add_buf,
+	    &buffer_meta_size);
+	status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_dst, buffer_meta_size);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	/* build source buffer list */
+	status = QAT_PHYS_CONTIG_ALLOC(&buf_list_src, src_buffer_list_mem_size);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	flat_buf_src = (CpaFlatBuffer *)(buf_list_src + 1);
+
+	buf_list_src->pBuffers = flat_buf_src; /* always point to first one */
+
+	/* build destination buffer list */
+	status = QAT_PHYS_CONTIG_ALLOC(&buf_list_dst, dst_buffer_list_mem_size);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1);
+
+	buf_list_dst->pBuffers = flat_buf_dst; /* always point to first one */
+
+	buf_list_src->numBuffers = 0;
+	buf_list_src->pPrivateMetaData = buffer_meta_src;
+	bytes_left = src_len;
+	data = src;
+	page_num = 0;
+	while (bytes_left > 0) {
+		page_off = ((long)data & ~PAGE_MASK);
+		page = qat_mem_to_page(data);
+		in_pages[page_num] = page;
+		flat_buf_src->pData = kmap(page) + page_off;
+		flat_buf_src->dataLenInBytes =
+		    min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+		bytes_left -= flat_buf_src->dataLenInBytes;
+		data += flat_buf_src->dataLenInBytes;
+		flat_buf_src++;
+		buf_list_src->numBuffers++;
+		page_num++;
+	}
+
+	buf_list_dst->numBuffers = 0;
+	buf_list_dst->pPrivateMetaData = buffer_meta_dst;
+	bytes_left = dst_len;
+	data = dst;
+	page_num = 0;
+	while (bytes_left > 0) {
+		page_off = ((long)data & ~PAGE_MASK);
+		page = qat_mem_to_page(data);
+		flat_buf_dst->pData = kmap(page) + page_off;
+		out_pages[page_num] = page;
+		flat_buf_dst->dataLenInBytes =
+		    min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+		bytes_left -= flat_buf_dst->dataLenInBytes;
+		data += flat_buf_dst->dataLenInBytes;
+		flat_buf_dst++;
+		buf_list_dst->numBuffers++;
+		page_num++;
+		dst_pages++;
+	}
+
+	/* map additional scratch pages into the destination buffer list */
+	bytes_left = add_len;
+	data = add;
+	page_num = 0;
+	while (bytes_left > 0) {
+		page_off = ((long)data & ~PAGE_MASK);
+		page = qat_mem_to_page(data);
+		flat_buf_dst->pData = kmap(page) + page_off;
+		add_pages[page_num] = page;
+		flat_buf_dst->dataLenInBytes =
+		    min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+		bytes_left -= flat_buf_dst->dataLenInBytes;
+		data += flat_buf_dst->dataLenInBytes;
+		flat_buf_dst++;
+		buf_list_dst->numBuffers++;
+		page_num++;
+	}
+
+	init_completion(&complete);
+
+	if (dir == QAT_COMPRESS) {
+		QAT_STAT_BUMP(comp_requests);
+		QAT_STAT_INCR(comp_total_in_bytes, src_len);
+
+		cpaDcGenerateHeader(session_handle,
+		    buf_list_dst->pBuffers, &hdr_sz);
+		buf_list_dst->pBuffers->pData += hdr_sz;
+		buf_list_dst->pBuffers->dataLenInBytes -= hdr_sz;
+		status = cpaDcCompressData(
+		    dc_inst_handle, session_handle,
+		    buf_list_src, buf_list_dst,
+		    &dc_results, CPA_DC_FLUSH_FINAL,
+		    &complete);
+		if (status != CPA_STATUS_SUCCESS) {
+			goto fail;
+		}
+
+		/* we now wait until the completion of the operation. */
+		wait_for_completion(&complete);
+
+		if (dc_results.status != CPA_STATUS_SUCCESS) {
+			status = CPA_STATUS_FAIL;
+			goto fail;
+		}
+
+		compressed_sz = dc_results.produced;
+		if (compressed_sz + hdr_sz + ZLIB_FOOT_SZ > dst_len) {
+			status = CPA_STATUS_INCOMPRESSIBLE;
+			goto fail;
+		}
+
+		flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1);
+		/* move to the last page */
+		flat_buf_dst += (compressed_sz + hdr_sz) >> PAGE_SHIFT;
+
+		/* no space for gzip footer in the last page */
+		if (((compressed_sz + hdr_sz) % PAGE_SIZE)
+		    + ZLIB_FOOT_SZ > PAGE_SIZE) {
+			status = CPA_STATUS_INCOMPRESSIBLE;
+			goto fail;
+		}
+
+		/* jump to the end of the buffer and append footer */
+		flat_buf_dst->pData =
+		    (char *)((unsigned long)flat_buf_dst->pData & PAGE_MASK)
+		    + ((compressed_sz + hdr_sz) % PAGE_SIZE);
+		flat_buf_dst->dataLenInBytes = ZLIB_FOOT_SZ;
+
+		dc_results.produced = 0;
+		status = cpaDcGenerateFooter(session_handle,
+		    flat_buf_dst, &dc_results);
+		if (status != CPA_STATUS_SUCCESS)
+			goto fail;
+
+		*c_len = compressed_sz + dc_results.produced + hdr_sz;
+		QAT_STAT_INCR(comp_total_out_bytes, *c_len);
+	} else {
+		ASSERT3U(dir, ==, QAT_DECOMPRESS);
+		QAT_STAT_BUMP(decomp_requests);
+		QAT_STAT_INCR(decomp_total_in_bytes, src_len);
+
+		buf_list_src->pBuffers->pData += ZLIB_HEAD_SZ;
+		buf_list_src->pBuffers->dataLenInBytes -= ZLIB_HEAD_SZ;
+		status = cpaDcDecompressData(dc_inst_handle, session_handle,
+		    buf_list_src, buf_list_dst, &dc_results, CPA_DC_FLUSH_FINAL,
+		    &complete);
+
+		if (CPA_STATUS_SUCCESS != status) {
+			status = CPA_STATUS_FAIL;
+			goto fail;
+		}
+
+		/* we now wait until the completion of the operation. */
+		wait_for_completion(&complete);
+
+		if (dc_results.status != CPA_STATUS_SUCCESS) {
+			status = CPA_STATUS_FAIL;
+			goto fail;
+		}
+
+		/* verify adler checksum */
+		adler32 = *(Cpa32U *)(src + dc_results.consumed + ZLIB_HEAD_SZ);
+		if (adler32 != BSWAP_32(dc_results.checksum)) {
+			status = CPA_STATUS_FAIL;
+			goto fail;
+		}
+		*c_len = dc_results.produced;
+		QAT_STAT_INCR(decomp_total_out_bytes, *c_len);
+	}
+
+fail:
+	if (status != CPA_STATUS_SUCCESS && status != CPA_STATUS_INCOMPRESSIBLE)
+		QAT_STAT_BUMP(dc_fails);
+
+	if (in_pages) {
+		for (page_num = 0;
+		    page_num < buf_list_src->numBuffers;
+		    page_num++) {
+			kunmap(in_pages[page_num]);
+		}
+		QAT_PHYS_CONTIG_FREE(in_pages);
+	}
+
+	if (out_pages) {
+		for (page_num = 0; page_num < dst_pages; page_num++) {
+			kunmap(out_pages[page_num]);
+		}
+		QAT_PHYS_CONTIG_FREE(out_pages);
+	}
+
+	if (add_pages) {
+		for (page_num = 0;
+		    page_num < buf_list_dst->numBuffers - dst_pages;
+		    page_num++) {
+			kunmap(add_pages[page_num]);
+		}
+		QAT_PHYS_CONTIG_FREE(add_pages);
+	}
+
+	QAT_PHYS_CONTIG_FREE(buffer_meta_src);
+	QAT_PHYS_CONTIG_FREE(buffer_meta_dst);
+	QAT_PHYS_CONTIG_FREE(buf_list_src);
+	QAT_PHYS_CONTIG_FREE(buf_list_dst);
+
+	return (status);
+}
+
+/*
+ * Entry point for QAT accelerated compression / decompression.
+ */
+int
+qat_compress(qat_compress_dir_t dir, char *src, int src_len,
+    char *dst, int dst_len, size_t *c_len)
+{
+	int ret;
+	size_t add_len = 0;
+	void *add = NULL;
+
+	if (dir == QAT_COMPRESS) {
+		add_len = dst_len;
+		add = zio_data_buf_alloc(add_len);
+	}
+
+	ret = qat_compress_impl(dir, src, src_len, dst,
+	    dst_len, add, add_len, c_len);
+
+	if (dir == QAT_COMPRESS)
+		zio_data_buf_free(add, add_len);
+
+	return (ret);
+}
+
+static int
+param_set_qat_compress(const char *val, zfs_kernel_param_t *kp)
+{
+	int ret;
+	int *pvalue = kp->arg;
+	ret = param_set_int(val, kp);
+	if (ret)
+		return (ret);
+	/*
+	 * zfs_qat_compress_disable = 0: enable qat compress
+	 * try to initialize qat instance if it has not been done
+	 */
+	if (*pvalue == 0 && !qat_dc_init_done) {
+		ret = qat_dc_init();
+		if (ret != 0) {
+			zfs_qat_compress_disable = 1;
+			return (ret);
+		}
+	}
+	return (ret);
+}
+
+module_param_call(zfs_qat_compress_disable, param_set_qat_compress,
+    param_get_int, &zfs_qat_compress_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_compress_disable, "Enable/Disable QAT compression");
+
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c
new file mode 100644
index 000000000000..4771b2f3bec5
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c
@@ -0,0 +1,630 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * This file represents the QAT implementation of checksums and encryption.
+ * Internally, QAT shares the same cryptographic instances for both of these
+ * operations, so the code has been combined here. QAT data compression uses
+ * compression instances, so that code is separated into qat_compress.c
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/completion.h>
+#include <sys/zfs_context.h>
+#include <sys/zio_crypt.h>
+#include "lac/cpa_cy_im.h"
+#include "lac/cpa_cy_common.h"
+#include <sys/qat.h>
+
+/*
+ * Max instances in a QAT device, each instance is a channel to submit
+ * jobs to QAT hardware, this is only for pre-allocating instances
+ * and session arrays; the actual number of instances are defined in
+ * the QAT driver's configure file.
+ */
+#define	QAT_CRYPT_MAX_INSTANCES		48
+
+#define	MAX_PAGE_NUM			1024
+
+static Cpa32U inst_num = 0;
+static Cpa16U num_inst = 0;
+static CpaInstanceHandle cy_inst_handles[QAT_CRYPT_MAX_INSTANCES];
+static boolean_t qat_cy_init_done = B_FALSE;
+int zfs_qat_encrypt_disable = 0;
+int zfs_qat_checksum_disable = 0;
+
+typedef struct cy_callback {
+	CpaBoolean verify_result;
+	struct completion complete;
+} cy_callback_t;
+
+static void
+symcallback(void *p_callback, CpaStatus status, const CpaCySymOp operation,
+    void *op_data, CpaBufferList *buf_list_dst, CpaBoolean verify)
+{
+	cy_callback_t *cb = p_callback;
+
+	if (cb != NULL) {
+		/* indicate that the function has been called */
+		cb->verify_result = verify;
+		complete(&cb->complete);
+	}
+}
+
+boolean_t
+qat_crypt_use_accel(size_t s_len)
+{
+	return (!zfs_qat_encrypt_disable &&
+	    qat_cy_init_done &&
+	    s_len >= QAT_MIN_BUF_SIZE &&
+	    s_len <= QAT_MAX_BUF_SIZE);
+}
+
+boolean_t
+qat_checksum_use_accel(size_t s_len)
+{
+	return (!zfs_qat_checksum_disable &&
+	    qat_cy_init_done &&
+	    s_len >= QAT_MIN_BUF_SIZE &&
+	    s_len <= QAT_MAX_BUF_SIZE);
+}
+
+void
+qat_cy_clean(void)
+{
+	for (Cpa16U i = 0; i < num_inst; i++)
+		cpaCyStopInstance(cy_inst_handles[i]);
+
+	num_inst = 0;
+	qat_cy_init_done = B_FALSE;
+}
+
+int
+qat_cy_init(void)
+{
+	CpaStatus status = CPA_STATUS_FAIL;
+
+	if (qat_cy_init_done)
+		return (0);
+
+	status = cpaCyGetNumInstances(&num_inst);
+	if (status != CPA_STATUS_SUCCESS)
+		return (-1);
+
+	/* if the user has configured no QAT encryption units just return */
+	if (num_inst == 0)
+		return (0);
+
+	if (num_inst > QAT_CRYPT_MAX_INSTANCES)
+		num_inst = QAT_CRYPT_MAX_INSTANCES;
+
+	status = cpaCyGetInstances(num_inst, &cy_inst_handles[0]);
+	if (status != CPA_STATUS_SUCCESS)
+		return (-1);
+
+	for (Cpa16U i = 0; i < num_inst; i++) {
+		status = cpaCySetAddressTranslation(cy_inst_handles[i],
+		    (void *)virt_to_phys);
+		if (status != CPA_STATUS_SUCCESS)
+			goto error;
+
+		status = cpaCyStartInstance(cy_inst_handles[i]);
+		if (status != CPA_STATUS_SUCCESS)
+			goto error;
+	}
+
+	qat_cy_init_done = B_TRUE;
+	return (0);
+
+error:
+	qat_cy_clean();
+	return (-1);
+}
+
+void
+qat_cy_fini(void)
+{
+	if (!qat_cy_init_done)
+		return;
+
+	qat_cy_clean();
+}
+
+static CpaStatus
+qat_init_crypt_session_ctx(qat_encrypt_dir_t dir, CpaInstanceHandle inst_handle,
+    CpaCySymSessionCtx **cy_session_ctx, crypto_key_t *key,
+    Cpa64U crypt, Cpa32U aad_len)
+{
+	CpaStatus status = CPA_STATUS_SUCCESS;
+	Cpa32U ctx_size;
+	Cpa32U ciper_algorithm;
+	Cpa32U hash_algorithm;
+	CpaCySymSessionSetupData sd = { 0 };
+
+	if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_CCM) {
+		return (CPA_STATUS_FAIL);
+	} else {
+		ciper_algorithm = CPA_CY_SYM_CIPHER_AES_GCM;
+		hash_algorithm = CPA_CY_SYM_HASH_AES_GCM;
+	}
+
+	sd.cipherSetupData.cipherAlgorithm = ciper_algorithm;
+	sd.cipherSetupData.pCipherKey = key->ck_data;
+	sd.cipherSetupData.cipherKeyLenInBytes = key->ck_length / 8;
+	sd.hashSetupData.hashAlgorithm = hash_algorithm;
+	sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_AUTH;
+	sd.hashSetupData.digestResultLenInBytes = ZIO_DATA_MAC_LEN;
+	sd.hashSetupData.authModeSetupData.aadLenInBytes = aad_len;
+	sd.sessionPriority = CPA_CY_PRIORITY_NORMAL;
+	sd.symOperation = CPA_CY_SYM_OP_ALGORITHM_CHAINING;
+	sd.digestIsAppended = CPA_FALSE;
+	sd.verifyDigest = CPA_FALSE;
+
+	if (dir == QAT_ENCRYPT) {
+		sd.cipherSetupData.cipherDirection =
+		    CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT;
+		sd.algChainOrder =
+		    CPA_CY_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER;
+	} else {
+		ASSERT3U(dir, ==, QAT_DECRYPT);
+		sd.cipherSetupData.cipherDirection =
+		    CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT;
+		sd.algChainOrder =
+		    CPA_CY_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH;
+	}
+
+	status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size);
+	if (status != CPA_STATUS_SUCCESS)
+		return (status);
+
+	status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size);
+	if (status != CPA_STATUS_SUCCESS)
+		return (status);
+
+	status = cpaCySymInitSession(inst_handle, symcallback, &sd,
+	    *cy_session_ctx);
+	if (status != CPA_STATUS_SUCCESS) {
+		QAT_PHYS_CONTIG_FREE(*cy_session_ctx);
+		return (status);
+	}
+
+	return (CPA_STATUS_SUCCESS);
+}
+
+static CpaStatus
+qat_init_checksum_session_ctx(CpaInstanceHandle inst_handle,
+    CpaCySymSessionCtx **cy_session_ctx, Cpa64U cksum)
+{
+	CpaStatus status = CPA_STATUS_SUCCESS;
+	Cpa32U ctx_size;
+	Cpa32U hash_algorithm;
+	CpaCySymSessionSetupData sd = { 0 };
+
+	/*
+	 * ZFS's SHA512 checksum is actually SHA512/256, which uses
+	 * a different IV from standard SHA512. QAT does not support
+	 * SHA512/256, so we can only support SHA256.
+	 */
+	if (cksum == ZIO_CHECKSUM_SHA256)
+		hash_algorithm = CPA_CY_SYM_HASH_SHA256;
+	else
+		return (CPA_STATUS_FAIL);
+
+	sd.sessionPriority = CPA_CY_PRIORITY_NORMAL;
+	sd.symOperation = CPA_CY_SYM_OP_HASH;
+	sd.hashSetupData.hashAlgorithm = hash_algorithm;
+	sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_PLAIN;
+	sd.hashSetupData.digestResultLenInBytes = sizeof (zio_cksum_t);
+	sd.digestIsAppended = CPA_FALSE;
+	sd.verifyDigest = CPA_FALSE;
+
+	status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size);
+	if (status != CPA_STATUS_SUCCESS)
+		return (status);
+
+	status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size);
+	if (status != CPA_STATUS_SUCCESS)
+		return (status);
+
+	status = cpaCySymInitSession(inst_handle, symcallback, &sd,
+	    *cy_session_ctx);
+	if (status != CPA_STATUS_SUCCESS) {
+		QAT_PHYS_CONTIG_FREE(*cy_session_ctx);
+		return (status);
+	}
+
+	return (CPA_STATUS_SUCCESS);
+}
+
+static CpaStatus
+qat_init_cy_buffer_lists(CpaInstanceHandle inst_handle, uint32_t nr_bufs,
+    CpaBufferList *src, CpaBufferList *dst)
+{
+	CpaStatus status = CPA_STATUS_SUCCESS;
+	Cpa32U meta_size = 0;
+
+	status = cpaCyBufferListGetMetaSize(inst_handle, nr_bufs, &meta_size);
+	if (status != CPA_STATUS_SUCCESS)
+		return (status);
+
+	status = QAT_PHYS_CONTIG_ALLOC(&src->pPrivateMetaData, meta_size);
+	if (status != CPA_STATUS_SUCCESS)
+		goto error;
+
+	if (src != dst) {
+		status = QAT_PHYS_CONTIG_ALLOC(&dst->pPrivateMetaData,
+		    meta_size);
+		if (status != CPA_STATUS_SUCCESS)
+			goto error;
+	}
+
+	return (CPA_STATUS_SUCCESS);
+
+error:
+	QAT_PHYS_CONTIG_FREE(src->pPrivateMetaData);
+	if (src != dst)
+		QAT_PHYS_CONTIG_FREE(dst->pPrivateMetaData);
+
+	return (status);
+}
+
+int
+qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf,
+    uint8_t *aad_buf, uint32_t aad_len, uint8_t *iv_buf, uint8_t *digest_buf,
+    crypto_key_t *key, uint64_t crypt, uint32_t enc_len)
+{
+	CpaStatus status = CPA_STATUS_SUCCESS;
+	Cpa16U i;
+	CpaInstanceHandle cy_inst_handle;
+	Cpa16U nr_bufs = (enc_len >> PAGE_SHIFT) + 2;
+	Cpa32U bytes_left = 0;
+	Cpa8S *data = NULL;
+	CpaCySymSessionCtx *cy_session_ctx = NULL;
+	cy_callback_t cb;
+	CpaCySymOpData op_data = { 0 };
+	CpaBufferList src_buffer_list = { 0 };
+	CpaBufferList dst_buffer_list = { 0 };
+	CpaFlatBuffer *flat_src_buf_array = NULL;
+	CpaFlatBuffer *flat_src_buf = NULL;
+	CpaFlatBuffer *flat_dst_buf_array = NULL;
+	CpaFlatBuffer *flat_dst_buf = NULL;
+	struct page *in_pages[MAX_PAGE_NUM];
+	struct page *out_pages[MAX_PAGE_NUM];
+	Cpa32U in_page_num = 0;
+	Cpa32U out_page_num = 0;
+	Cpa32U in_page_off = 0;
+	Cpa32U out_page_off = 0;
+
+	if (dir == QAT_ENCRYPT) {
+		QAT_STAT_BUMP(encrypt_requests);
+		QAT_STAT_INCR(encrypt_total_in_bytes, enc_len);
+	} else {
+		QAT_STAT_BUMP(decrypt_requests);
+		QAT_STAT_INCR(decrypt_total_in_bytes, enc_len);
+	}
+
+	i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+	cy_inst_handle = cy_inst_handles[i];
+
+	status = qat_init_crypt_session_ctx(dir, cy_inst_handle,
+	    &cy_session_ctx, key, crypt, aad_len);
+	if (status != CPA_STATUS_SUCCESS) {
+		/* don't count CCM as a failure since it's not supported */
+		if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_GCM)
+			QAT_STAT_BUMP(crypt_fails);
+		return (status);
+	}
+
+	/*
+	 * We increment nr_bufs by 2 to allow us to handle non
+	 * page-aligned buffer addresses and buffers whose sizes
+	 * are not divisible by PAGE_SIZE.
+	 */
+	status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs,
+	    &src_buffer_list, &dst_buffer_list);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array,
+	    nr_bufs * sizeof (CpaFlatBuffer));
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+	status = QAT_PHYS_CONTIG_ALLOC(&flat_dst_buf_array,
+	    nr_bufs * sizeof (CpaFlatBuffer));
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+	status = QAT_PHYS_CONTIG_ALLOC(&op_data.pDigestResult,
+	    ZIO_DATA_MAC_LEN);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+	status = QAT_PHYS_CONTIG_ALLOC(&op_data.pIv,
+	    ZIO_DATA_IV_LEN);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+	if (aad_len > 0) {
+		status = QAT_PHYS_CONTIG_ALLOC(&op_data.pAdditionalAuthData,
+		    aad_len);
+		if (status != CPA_STATUS_SUCCESS)
+			goto fail;
+		bcopy(aad_buf, op_data.pAdditionalAuthData, aad_len);
+	}
+
+	bytes_left = enc_len;
+	data = src_buf;
+	flat_src_buf = flat_src_buf_array;
+	while (bytes_left > 0) {
+		in_page_off = ((long)data & ~PAGE_MASK);
+		in_pages[in_page_num] = qat_mem_to_page(data);
+		flat_src_buf->pData = kmap(in_pages[in_page_num]) + in_page_off;
+		flat_src_buf->dataLenInBytes =
+		    min((long)PAGE_SIZE - in_page_off, (long)bytes_left);
+		data += flat_src_buf->dataLenInBytes;
+		bytes_left -= flat_src_buf->dataLenInBytes;
+		flat_src_buf++;
+		in_page_num++;
+	}
+	src_buffer_list.pBuffers = flat_src_buf_array;
+	src_buffer_list.numBuffers = in_page_num;
+
+	bytes_left = enc_len;
+	data = dst_buf;
+	flat_dst_buf = flat_dst_buf_array;
+	while (bytes_left > 0) {
+		out_page_off = ((long)data & ~PAGE_MASK);
+		out_pages[out_page_num] = qat_mem_to_page(data);
+		flat_dst_buf->pData = kmap(out_pages[out_page_num]) +
+		    out_page_off;
+		flat_dst_buf->dataLenInBytes =
+		    min((long)PAGE_SIZE - out_page_off, (long)bytes_left);
+		data += flat_dst_buf->dataLenInBytes;
+		bytes_left -= flat_dst_buf->dataLenInBytes;
+		flat_dst_buf++;
+		out_page_num++;
+	}
+	dst_buffer_list.pBuffers = flat_dst_buf_array;
+	dst_buffer_list.numBuffers = out_page_num;
+
+	op_data.sessionCtx = cy_session_ctx;
+	op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
+	op_data.cryptoStartSrcOffsetInBytes = 0;
+	op_data.messageLenToCipherInBytes = 0;
+	op_data.hashStartSrcOffsetInBytes = 0;
+	op_data.messageLenToHashInBytes = 0;
+	op_data.messageLenToCipherInBytes = enc_len;
+	op_data.ivLenInBytes = ZIO_DATA_IV_LEN;
+	bcopy(iv_buf, op_data.pIv, ZIO_DATA_IV_LEN);
+	/* if dir is QAT_DECRYPT, copy digest_buf to pDigestResult */
+	if (dir == QAT_DECRYPT)
+		bcopy(digest_buf, op_data.pDigestResult, ZIO_DATA_MAC_LEN);
+
+	cb.verify_result = CPA_FALSE;
+	init_completion(&cb.complete);
+	status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data,
+	    &src_buffer_list, &dst_buffer_list, NULL);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	/* we now wait until the completion of the operation. */
+	wait_for_completion(&cb.complete);
+
+	if (cb.verify_result == CPA_FALSE) {
+		status = CPA_STATUS_FAIL;
+		goto fail;
+	}
+
+	if (dir == QAT_ENCRYPT) {
+		/* if dir is QAT_ENCRYPT, save pDigestResult to digest_buf */
+		bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN);
+		QAT_STAT_INCR(encrypt_total_out_bytes, enc_len);
+	} else {
+		QAT_STAT_INCR(decrypt_total_out_bytes, enc_len);
+	}
+
+fail:
+	if (status != CPA_STATUS_SUCCESS)
+		QAT_STAT_BUMP(crypt_fails);
+
+	for (i = 0; i < in_page_num; i++)
+		kunmap(in_pages[i]);
+	for (i = 0; i < out_page_num; i++)
+		kunmap(out_pages[i]);
+
+	cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx);
+	if (aad_len > 0)
+		QAT_PHYS_CONTIG_FREE(op_data.pAdditionalAuthData);
+	QAT_PHYS_CONTIG_FREE(op_data.pIv);
+	QAT_PHYS_CONTIG_FREE(op_data.pDigestResult);
+	QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData);
+	QAT_PHYS_CONTIG_FREE(dst_buffer_list.pPrivateMetaData);
+	QAT_PHYS_CONTIG_FREE(cy_session_ctx);
+	QAT_PHYS_CONTIG_FREE(flat_src_buf_array);
+	QAT_PHYS_CONTIG_FREE(flat_dst_buf_array);
+
+	return (status);
+}
+
+int
+qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	CpaStatus status;
+	Cpa16U i;
+	CpaInstanceHandle cy_inst_handle;
+	Cpa16U nr_bufs = (size >> PAGE_SHIFT) + 2;
+	Cpa32U bytes_left = 0;
+	Cpa8S *data = NULL;
+	CpaCySymSessionCtx *cy_session_ctx = NULL;
+	cy_callback_t cb;
+	Cpa8U *digest_buffer = NULL;
+	CpaCySymOpData op_data = { 0 };
+	CpaBufferList src_buffer_list = { 0 };
+	CpaFlatBuffer *flat_src_buf_array = NULL;
+	CpaFlatBuffer *flat_src_buf = NULL;
+	struct page *in_pages[MAX_PAGE_NUM];
+	Cpa32U page_num = 0;
+	Cpa32U page_off = 0;
+
+	QAT_STAT_BUMP(cksum_requests);
+	QAT_STAT_INCR(cksum_total_in_bytes, size);
+
+	i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+	cy_inst_handle = cy_inst_handles[i];
+
+	status = qat_init_checksum_session_ctx(cy_inst_handle,
+	    &cy_session_ctx, cksum);
+	if (status != CPA_STATUS_SUCCESS) {
+		/* don't count unsupported checksums as a failure */
+		if (cksum == ZIO_CHECKSUM_SHA256 ||
+		    cksum == ZIO_CHECKSUM_SHA512)
+			QAT_STAT_BUMP(cksum_fails);
+		return (status);
+	}
+
+	/*
+	 * We increment nr_bufs by 2 to allow us to handle non
+	 * page-aligned buffer addresses and buffers whose sizes
+	 * are not divisible by PAGE_SIZE.
+	 */
+	status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs,
+	    &src_buffer_list, &src_buffer_list);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array,
+	    nr_bufs * sizeof (CpaFlatBuffer));
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+	status = QAT_PHYS_CONTIG_ALLOC(&digest_buffer,
+	    sizeof (zio_cksum_t));
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	bytes_left = size;
+	data = buf;
+	flat_src_buf = flat_src_buf_array;
+	while (bytes_left > 0) {
+		page_off = ((long)data & ~PAGE_MASK);
+		in_pages[page_num] = qat_mem_to_page(data);
+		flat_src_buf->pData = kmap(in_pages[page_num]) + page_off;
+		flat_src_buf->dataLenInBytes =
+		    min((long)PAGE_SIZE - page_off, (long)bytes_left);
+		data += flat_src_buf->dataLenInBytes;
+		bytes_left -= flat_src_buf->dataLenInBytes;
+		flat_src_buf++;
+		page_num++;
+	}
+	src_buffer_list.pBuffers = flat_src_buf_array;
+	src_buffer_list.numBuffers = page_num;
+
+	op_data.sessionCtx = cy_session_ctx;
+	op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
+	op_data.hashStartSrcOffsetInBytes = 0;
+	op_data.messageLenToHashInBytes = size;
+	op_data.pDigestResult = digest_buffer;
+
+	cb.verify_result = CPA_FALSE;
+	init_completion(&cb.complete);
+	status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data,
+	    &src_buffer_list, &src_buffer_list, NULL);
+	if (status != CPA_STATUS_SUCCESS)
+		goto fail;
+
+	/* we now wait until the completion of the operation. */
+	wait_for_completion(&cb.complete);
+
+	if (cb.verify_result == CPA_FALSE) {
+		status = CPA_STATUS_FAIL;
+		goto fail;
+	}
+
+	bcopy(digest_buffer, zcp, sizeof (zio_cksum_t));
+
+fail:
+	if (status != CPA_STATUS_SUCCESS)
+		QAT_STAT_BUMP(cksum_fails);
+
+	for (i = 0; i < page_num; i++)
+		kunmap(in_pages[i]);
+
+	cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx);
+	QAT_PHYS_CONTIG_FREE(digest_buffer);
+	QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData);
+	QAT_PHYS_CONTIG_FREE(cy_session_ctx);
+	QAT_PHYS_CONTIG_FREE(flat_src_buf_array);
+
+	return (status);
+}
+
+static int
+param_set_qat_encrypt(const char *val, zfs_kernel_param_t *kp)
+{
+	int ret;
+	int *pvalue = kp->arg;
+	ret = param_set_int(val, kp);
+	if (ret)
+		return (ret);
+	/*
+	 * zfs_qat_encrypt_disable = 0: enable qat encrypt
+	 * try to initialize qat instance if it has not been done
+	 */
+	if (*pvalue == 0 && !qat_cy_init_done) {
+		ret = qat_cy_init();
+		if (ret != 0) {
+			zfs_qat_encrypt_disable = 1;
+			return (ret);
+		}
+	}
+	return (ret);
+}
+
+static int
+param_set_qat_checksum(const char *val, zfs_kernel_param_t *kp)
+{
+	int ret;
+	int *pvalue = kp->arg;
+	ret = param_set_int(val, kp);
+	if (ret)
+		return (ret);
+	/*
+	 * set_checksum_param_ops = 0: enable qat checksum
+	 * try to initialize qat instance if it has not been done
+	 */
+	if (*pvalue == 0 && !qat_cy_init_done) {
+		ret = qat_cy_init();
+		if (ret != 0) {
+			zfs_qat_checksum_disable = 1;
+			return (ret);
+		}
+	}
+	return (ret);
+}
+
+module_param_call(zfs_qat_encrypt_disable, param_set_qat_encrypt,
+    param_get_int, &zfs_qat_encrypt_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_encrypt_disable, "Enable/Disable QAT encryption");
+
+module_param_call(zfs_qat_checksum_disable, param_set_qat_checksum,
+    param_get_int, &zfs_qat_checksum_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_checksum_disable, "Enable/Disable QAT checksumming");
+
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c
new file mode 100644
index 000000000000..5672cd6d5c5e
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c
@@ -0,0 +1,110 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/fm/util.h>
+#include <sys/dsl_scan.h>
+#include <sys/fs/zfs.h>
+#include <sys/kstat.h>
+#include "zfs_prop.h"
+
+
+int
+param_set_deadman_failmode(const char *val, zfs_kernel_param_t *kp)
+{
+	int error;
+
+	error = -param_set_deadman_failmode_common(val);
+	if (error == 0)
+		error = param_set_charp(val, kp);
+
+	return (error);
+}
+
+int
+param_set_deadman_ziotime(const char *val, zfs_kernel_param_t *kp)
+{
+	int error;
+
+	error = param_set_ulong(val, kp);
+	if (error < 0)
+		return (SET_ERROR(error));
+
+	spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_ziotime_ms));
+
+	return (0);
+}
+
+int
+param_set_deadman_synctime(const char *val, zfs_kernel_param_t *kp)
+{
+	int error;
+
+	error = param_set_ulong(val, kp);
+	if (error < 0)
+		return (SET_ERROR(error));
+
+	spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms));
+
+	return (0);
+}
+
+int
+param_set_slop_shift(const char *buf, zfs_kernel_param_t *kp)
+{
+	unsigned long val;
+	int error;
+
+	error = kstrtoul(buf, 0, &val);
+	if (error)
+		return (SET_ERROR(error));
+
+	if (val < 1 || val > 31)
+		return (SET_ERROR(-EINVAL));
+
+	error = param_set_int(buf, kp);
+	if (error < 0)
+		return (SET_ERROR(error));
+
+	return (0);
+}
+
+const char *
+spa_history_zone(void)
+{
+	return ("linux");
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/spa_stats.c b/sys/contrib/openzfs/module/os/linux/zfs/spa_stats.c
new file mode 100644
index 000000000000..86cefa6dddab
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/spa_stats.c
@@ -0,0 +1,1047 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/spa.h>
+#include <zfs_comutil.h>
+
+/*
+ * Keeps stats on last N reads per spa_t, disabled by default.
+ */
+int zfs_read_history = 0;
+
+/*
+ * Include cache hits in history, disabled by default.
+ */
+int zfs_read_history_hits = 0;
+
+/*
+ * Keeps stats on the last 100 txgs by default.
+ */
+int zfs_txg_history = 100;
+
+/*
+ * Keeps stats on the last N MMP updates, disabled by default.
+ */
+int zfs_multihost_history = 0;
+
+/*
+ * ==========================================================================
+ * SPA Read History Routines
+ * ==========================================================================
+ */
+
+/*
+ * Read statistics - Information exported regarding each arc_read call
+ */
+typedef struct spa_read_history {
+	hrtime_t	start;		/* time read completed */
+	uint64_t	objset;		/* read from this objset */
+	uint64_t	object;		/* read of this object number */
+	uint64_t	level;		/* block's indirection level */
+	uint64_t	blkid;		/* read of this block id */
+	char		origin[24];	/* read originated from here */
+	uint32_t	aflags;		/* ARC flags (cached, prefetch, etc.) */
+	pid_t		pid;		/* PID of task doing read */
+	char		comm[16];	/* process name of task doing read */
+	procfs_list_node_t	srh_node;
+} spa_read_history_t;
+
+static int
+spa_read_history_show_header(struct seq_file *f)
+{
+	seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
+	    "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
+	    "level", "blkid", "aflags", "origin", "pid", "process");
+
+	return (0);
+}
+
+static int
+spa_read_history_show(struct seq_file *f, void *data)
+{
+	spa_read_history_t *srh = (spa_read_history_t *)data;
+
+	seq_printf(f, "%-8llu %-16llu 0x%-6llx "
+	    "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
+	    (u_longlong_t)srh->srh_node.pln_id, srh->start,
+	    (longlong_t)srh->objset, (longlong_t)srh->object,
+	    (longlong_t)srh->level, (longlong_t)srh->blkid,
+	    srh->aflags, srh->origin, srh->pid, srh->comm);
+
+	return (0);
+}
+
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_read_history_truncate(spa_history_list_t *shl, unsigned int size)
+{
+	spa_read_history_t *srh;
+	while (shl->size > size) {
+		srh = list_remove_head(&shl->procfs_list.pl_list);
+		ASSERT3P(srh, !=, NULL);
+		kmem_free(srh, sizeof (spa_read_history_t));
+		shl->size--;
+	}
+
+	if (size == 0)
+		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
+}
+
+static int
+spa_read_history_clear(procfs_list_t *procfs_list)
+{
+	spa_history_list_t *shl = procfs_list->pl_private;
+	mutex_enter(&procfs_list->pl_lock);
+	spa_read_history_truncate(shl, 0);
+	mutex_exit(&procfs_list->pl_lock);
+	return (0);
+}
+
+static void
+spa_read_history_init(spa_t *spa)
+{
+	spa_history_list_t *shl = &spa->spa_stats.read_history;
+	char *module;
+
+	shl->size = 0;
+
+	module = kmem_asprintf("zfs/%s", spa_name(spa));
+
+	shl->procfs_list.pl_private = shl;
+	procfs_list_install(module,
+	    "reads",
+	    0600,
+	    &shl->procfs_list,
+	    spa_read_history_show,
+	    spa_read_history_show_header,
+	    spa_read_history_clear,
+	    offsetof(spa_read_history_t, srh_node));
+
+	kmem_strfree(module);
+}
+
+static void
+spa_read_history_destroy(spa_t *spa)
+{
+	spa_history_list_t *shl = &spa->spa_stats.read_history;
+	procfs_list_uninstall(&shl->procfs_list);
+	spa_read_history_truncate(shl, 0);
+	procfs_list_destroy(&shl->procfs_list);
+}
+
+void
+spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
+{
+	spa_history_list_t *shl = &spa->spa_stats.read_history;
+	spa_read_history_t *srh;
+
+	ASSERT3P(spa, !=, NULL);
+	ASSERT3P(zb,  !=, NULL);
+
+	if (zfs_read_history == 0 && shl->size == 0)
+		return;
+
+	if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED))
+		return;
+
+	srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP);
+	strlcpy(srh->comm, getcomm(), sizeof (srh->comm));
+	srh->start  = gethrtime();
+	srh->objset = zb->zb_objset;
+	srh->object = zb->zb_object;
+	srh->level  = zb->zb_level;
+	srh->blkid  = zb->zb_blkid;
+	srh->aflags = aflags;
+	srh->pid    = getpid();
+
+	mutex_enter(&shl->procfs_list.pl_lock);
+
+	procfs_list_add(&shl->procfs_list, srh);
+	shl->size++;
+
+	spa_read_history_truncate(shl, zfs_read_history);
+
+	mutex_exit(&shl->procfs_list.pl_lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA TXG History Routines
+ * ==========================================================================
+ */
+
+/*
+ * Txg statistics - Information exported regarding each txg sync
+ */
+
+typedef struct spa_txg_history {
+	uint64_t	txg;		/* txg id */
+	txg_state_t	state;		/* active txg state */
+	uint64_t	nread;		/* number of bytes read */
+	uint64_t	nwritten;	/* number of bytes written */
+	uint64_t	reads;		/* number of read operations */
+	uint64_t	writes;		/* number of write operations */
+	uint64_t	ndirty;		/* number of dirty bytes */
+	hrtime_t	times[TXG_STATE_COMMITTED]; /* completion times */
+	procfs_list_node_t	sth_node;
+} spa_txg_history_t;
+
+static int
+spa_txg_history_show_header(struct seq_file *f)
+{
+	seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s "
+	    "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state",
+	    "ndirty", "nread", "nwritten", "reads", "writes",
+	    "otime", "qtime", "wtime", "stime");
+	return (0);
+}
+
+static int
+spa_txg_history_show(struct seq_file *f, void *data)
+{
+	spa_txg_history_t *sth = (spa_txg_history_t *)data;
+	uint64_t open = 0, quiesce = 0, wait = 0, sync = 0;
+	char state;
+
+	switch (sth->state) {
+		case TXG_STATE_BIRTH:		state = 'B';	break;
+		case TXG_STATE_OPEN:		state = 'O';	break;
+		case TXG_STATE_QUIESCED:	state = 'Q';	break;
+		case TXG_STATE_WAIT_FOR_SYNC:	state = 'W';	break;
+		case TXG_STATE_SYNCED:		state = 'S';	break;
+		case TXG_STATE_COMMITTED:	state = 'C';	break;
+		default:			state = '?';	break;
+	}
+
+	if (sth->times[TXG_STATE_OPEN])
+		open = sth->times[TXG_STATE_OPEN] -
+		    sth->times[TXG_STATE_BIRTH];
+
+	if (sth->times[TXG_STATE_QUIESCED])
+		quiesce = sth->times[TXG_STATE_QUIESCED] -
+		    sth->times[TXG_STATE_OPEN];
+
+	if (sth->times[TXG_STATE_WAIT_FOR_SYNC])
+		wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] -
+		    sth->times[TXG_STATE_QUIESCED];
+
+	if (sth->times[TXG_STATE_SYNCED])
+		sync = sth->times[TXG_STATE_SYNCED] -
+		    sth->times[TXG_STATE_WAIT_FOR_SYNC];
+
+	seq_printf(f, "%-8llu %-16llu %-5c %-12llu "
+	    "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n",
+	    (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
+	    (u_longlong_t)sth->ndirty,
+	    (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
+	    (u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
+	    (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait,
+	    (u_longlong_t)sync);
+
+	return (0);
+}
+
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size)
+{
+	spa_txg_history_t *sth;
+	while (shl->size > size) {
+		sth = list_remove_head(&shl->procfs_list.pl_list);
+		ASSERT3P(sth, !=, NULL);
+		kmem_free(sth, sizeof (spa_txg_history_t));
+		shl->size--;
+	}
+
+	if (size == 0)
+		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
+
+}
+
+static int
+spa_txg_history_clear(procfs_list_t *procfs_list)
+{
+	spa_history_list_t *shl = procfs_list->pl_private;
+	mutex_enter(&procfs_list->pl_lock);
+	spa_txg_history_truncate(shl, 0);
+	mutex_exit(&procfs_list->pl_lock);
+	return (0);
+}
+
+static void
+spa_txg_history_init(spa_t *spa)
+{
+	spa_history_list_t *shl = &spa->spa_stats.txg_history;
+	char *module;
+
+	shl->size = 0;
+
+	module = kmem_asprintf("zfs/%s", spa_name(spa));
+
+	shl->procfs_list.pl_private = shl;
+	procfs_list_install(module,
+	    "txgs",
+	    0644,
+	    &shl->procfs_list,
+	    spa_txg_history_show,
+	    spa_txg_history_show_header,
+	    spa_txg_history_clear,
+	    offsetof(spa_txg_history_t, sth_node));
+
+	kmem_strfree(module);
+}
+
+static void
+spa_txg_history_destroy(spa_t *spa)
+{
+	spa_history_list_t *shl = &spa->spa_stats.txg_history;
+	procfs_list_uninstall(&shl->procfs_list);
+	spa_txg_history_truncate(shl, 0);
+	procfs_list_destroy(&shl->procfs_list);
+}
+
+/*
+ * Add a new txg to historical record.
+ */
+void
+spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
+{
+	spa_history_list_t *shl = &spa->spa_stats.txg_history;
+	spa_txg_history_t *sth;
+
+	if (zfs_txg_history == 0 && shl->size == 0)
+		return;
+
+	sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP);
+	sth->txg = txg;
+	sth->state = TXG_STATE_OPEN;
+	sth->times[TXG_STATE_BIRTH] = birth_time;
+
+	mutex_enter(&shl->procfs_list.pl_lock);
+	procfs_list_add(&shl->procfs_list, sth);
+	shl->size++;
+	spa_txg_history_truncate(shl, zfs_txg_history);
+	mutex_exit(&shl->procfs_list.pl_lock);
+}
+
+/*
+ * Set txg state completion time and increment current state.
+ */
+int
+spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
+    hrtime_t completed_time)
+{
+	spa_history_list_t *shl = &spa->spa_stats.txg_history;
+	spa_txg_history_t *sth;
+	int error = ENOENT;
+
+	if (zfs_txg_history == 0)
+		return (0);
+
+	mutex_enter(&shl->procfs_list.pl_lock);
+	for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
+	    sth = list_prev(&shl->procfs_list.pl_list, sth)) {
+		if (sth->txg == txg) {
+			sth->times[completed_state] = completed_time;
+			sth->state++;
+			error = 0;
+			break;
+		}
+	}
+	mutex_exit(&shl->procfs_list.pl_lock);
+
+	return (error);
+}
+
+/*
+ * Set txg IO stats.
+ */
+static int
+spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
+    uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty)
+{
+	spa_history_list_t *shl = &spa->spa_stats.txg_history;
+	spa_txg_history_t *sth;
+	int error = ENOENT;
+
+	if (zfs_txg_history == 0)
+		return (0);
+
+	mutex_enter(&shl->procfs_list.pl_lock);
+	for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
+	    sth = list_prev(&shl->procfs_list.pl_list, sth)) {
+		if (sth->txg == txg) {
+			sth->nread = nread;
+			sth->nwritten = nwritten;
+			sth->reads = reads;
+			sth->writes = writes;
+			sth->ndirty = ndirty;
+			error = 0;
+			break;
+		}
+	}
+	mutex_exit(&shl->procfs_list.pl_lock);
+
+	return (error);
+}
+
+txg_stat_t *
+spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp)
+{
+	txg_stat_t *ts;
+
+	if (zfs_txg_history == 0)
+		return (NULL);
+
+	ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP);
+
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+	vdev_get_stats(spa->spa_root_vdev, &ts->vs1);
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+	ts->txg = txg;
+	ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
+
+	spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime());
+
+	return (ts);
+}
+
+void
+spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts)
+{
+	if (ts == NULL)
+		return;
+
+	if (zfs_txg_history == 0) {
+		kmem_free(ts, sizeof (txg_stat_t));
+		return;
+	}
+
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+	vdev_get_stats(spa->spa_root_vdev, &ts->vs2);
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+	spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime());
+	spa_txg_history_set_io(spa, ts->txg,
+	    ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ],
+	    ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE],
+	    ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ],
+	    ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE],
+	    ts->ndirty);
+
+	kmem_free(ts, sizeof (txg_stat_t));
+}
+
+/*
+ * ==========================================================================
+ * SPA TX Assign Histogram Routines
+ * ==========================================================================
+ */
+
+/*
+ * Tx statistics - Information exported regarding dmu_tx_assign time.
+ */
+
+/*
+ * When the kstat is written zero all buckets.  When the kstat is read
+ * count the number of trailing buckets set to zero and update ks_ndata
+ * such that they are not output.
+ */
+static int
+spa_tx_assign_update(kstat_t *ksp, int rw)
+{
+	spa_t *spa = ksp->ks_private;
+	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+	int i;
+
+	if (rw == KSTAT_WRITE) {
+		for (i = 0; i < shk->count; i++)
+			((kstat_named_t *)shk->priv)[i].value.ui64 = 0;
+	}
+
+	for (i = shk->count; i > 0; i--)
+		if (((kstat_named_t *)shk->priv)[i-1].value.ui64 != 0)
+			break;
+
+	ksp->ks_ndata = i;
+	ksp->ks_data_size = i * sizeof (kstat_named_t);
+
+	return (0);
+}
+
+static void
+spa_tx_assign_init(spa_t *spa)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+	char *name;
+	kstat_named_t *ks;
+	kstat_t *ksp;
+	int i;
+
+	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+	shk->count = 42; /* power of two buckets for 1ns to 2,199s */
+	shk->size = shk->count * sizeof (kstat_named_t);
+	shk->priv = kmem_alloc(shk->size, KM_SLEEP);
+
+	name = kmem_asprintf("zfs/%s", spa_name(spa));
+
+	for (i = 0; i < shk->count; i++) {
+		ks = &((kstat_named_t *)shk->priv)[i];
+		ks->data_type = KSTAT_DATA_UINT64;
+		ks->value.ui64 = 0;
+		(void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
+		    (u_longlong_t)1 << i);
+	}
+
+	ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
+	    KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
+	shk->kstat = ksp;
+
+	if (ksp) {
+		ksp->ks_lock = &shk->lock;
+		ksp->ks_data = shk->priv;
+		ksp->ks_ndata = shk->count;
+		ksp->ks_data_size = shk->size;
+		ksp->ks_private = spa;
+		ksp->ks_update = spa_tx_assign_update;
+		kstat_install(ksp);
+	}
+	kmem_strfree(name);
+}
+
+static void
+spa_tx_assign_destroy(spa_t *spa)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+	kstat_t *ksp;
+
+	ksp = shk->kstat;
+	if (ksp)
+		kstat_delete(ksp);
+
+	kmem_free(shk->priv, shk->size);
+	mutex_destroy(&shk->lock);
+}
+
+void
+spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+	uint64_t idx = 0;
+
+	while (((1ULL << idx) < nsecs) && (idx < shk->size - 1))
+		idx++;
+
+	atomic_inc_64(&((kstat_named_t *)shk->priv)[idx].value.ui64);
+}
+
+/*
+ * ==========================================================================
+ * SPA IO History Routines
+ * ==========================================================================
+ */
+static int
+spa_io_history_update(kstat_t *ksp, int rw)
+{
+	if (rw == KSTAT_WRITE)
+		memset(ksp->ks_data, 0, ksp->ks_data_size);
+
+	return (0);
+}
+
+static void
+spa_io_history_init(spa_t *spa)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+	char *name;
+	kstat_t *ksp;
+
+	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+	name = kmem_asprintf("zfs/%s", spa_name(spa));
+
+	ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0);
+	shk->kstat = ksp;
+
+	if (ksp) {
+		ksp->ks_lock = &shk->lock;
+		ksp->ks_private = spa;
+		ksp->ks_update = spa_io_history_update;
+		kstat_install(ksp);
+	}
+	kmem_strfree(name);
+}
+
+static void
+spa_io_history_destroy(spa_t *spa)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+
+	if (shk->kstat)
+		kstat_delete(shk->kstat);
+
+	mutex_destroy(&shk->lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA MMP History Routines
+ * ==========================================================================
+ */
+
+/*
+ * MMP statistics - Information exported regarding attempted MMP writes
+ *   For MMP writes issued, fields used as per comments below.
+ *   For MMP writes skipped, an entry represents a span of time when
+ *      writes were skipped for same reason (error from mmp_random_leaf).
+ *      Differences are:
+ *      timestamp	time first write skipped, if >1 skipped in a row
+ *      mmp_delay	delay value at timestamp
+ *      vdev_guid	number of writes skipped
+ *      io_error	one of enum mmp_error
+ *      duration	time span (ns) of skipped writes
+ */
+
+typedef struct spa_mmp_history {
+	uint64_t	mmp_node_id;	/* unique # for updates */
+	uint64_t	txg;		/* txg of last sync */
+	uint64_t	timestamp;	/* UTC time MMP write issued */
+	uint64_t	mmp_delay;	/* mmp_thread.mmp_delay at timestamp */
+	uint64_t	vdev_guid;	/* unique ID of leaf vdev */
+	char		*vdev_path;
+	int		vdev_label;	/* vdev label */
+	int		io_error;	/* error status of MMP write */
+	hrtime_t	error_start;	/* hrtime of start of error period */
+	hrtime_t	duration;	/* time from submission to completion */
+	procfs_list_node_t	smh_node;
+} spa_mmp_history_t;
+
+static int
+spa_mmp_history_show_header(struct seq_file *f)
+{
+	seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s "
+	    "%-10s %s\n", "id", "txg", "timestamp", "error", "duration",
+	    "mmp_delay", "vdev_guid", "vdev_label", "vdev_path");
+	return (0);
+}
+
+static int
+spa_mmp_history_show(struct seq_file *f, void *data)
+{
+	spa_mmp_history_t *smh = (spa_mmp_history_t *)data;
+	char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu "
+	    "%-10lld %s\n";
+	char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu "
+	    "%-10lld %s\n";
+
+	seq_printf(f, (smh->error_start ? skip_fmt : write_fmt),
+	    (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg,
+	    (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error,
+	    (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay,
+	    (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label,
+	    (smh->vdev_path ? smh->vdev_path : "-"));
+
+	return (0);
+}
+
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size)
+{
+	spa_mmp_history_t *smh;
+	while (shl->size > size) {
+		smh = list_remove_head(&shl->procfs_list.pl_list);
+		if (smh->vdev_path)
+			kmem_strfree(smh->vdev_path);
+		kmem_free(smh, sizeof (spa_mmp_history_t));
+		shl->size--;
+	}
+
+	if (size == 0)
+		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
+
+}
+
+static int
+spa_mmp_history_clear(procfs_list_t *procfs_list)
+{
+	spa_history_list_t *shl = procfs_list->pl_private;
+	mutex_enter(&procfs_list->pl_lock);
+	spa_mmp_history_truncate(shl, 0);
+	mutex_exit(&procfs_list->pl_lock);
+	return (0);
+}
+
+static void
+spa_mmp_history_init(spa_t *spa)
+{
+	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+	char *module;
+
+	shl->size = 0;
+
+	module = kmem_asprintf("zfs/%s", spa_name(spa));
+
+	shl->procfs_list.pl_private = shl;
+	procfs_list_install(module,
+	    "multihost",
+	    0644,
+	    &shl->procfs_list,
+	    spa_mmp_history_show,
+	    spa_mmp_history_show_header,
+	    spa_mmp_history_clear,
+	    offsetof(spa_mmp_history_t, smh_node));
+
+	kmem_strfree(module);
+}
+
+static void
+spa_mmp_history_destroy(spa_t *spa)
+{
+	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+	procfs_list_uninstall(&shl->procfs_list);
+	spa_mmp_history_truncate(shl, 0);
+	procfs_list_destroy(&shl->procfs_list);
+}
+
+/*
+ * Set duration in existing "skip" record to how long we have waited for a leaf
+ * vdev to become available.
+ *
+ * Important that we start search at the tail of the list where new
+ * records are inserted, so this is normally an O(1) operation.
+ */
+int
+spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id)
+{
+	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+	spa_mmp_history_t *smh;
+	int error = ENOENT;
+
+	if (zfs_multihost_history == 0 && shl->size == 0)
+		return (0);
+
+	mutex_enter(&shl->procfs_list.pl_lock);
+	for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
+	    smh = list_prev(&shl->procfs_list.pl_list, smh)) {
+		if (smh->mmp_node_id == mmp_node_id) {
+			ASSERT3U(smh->io_error, !=, 0);
+			smh->duration = gethrtime() - smh->error_start;
+			smh->vdev_guid++;
+			error = 0;
+			break;
+		}
+	}
+	mutex_exit(&shl->procfs_list.pl_lock);
+
+	return (error);
+}
+
+/*
+ * Set MMP write duration and error status in existing record.
+ * See comment re: search order above spa_mmp_history_set_skip().
+ */
+int
+spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error,
+    hrtime_t duration)
+{
+	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+	spa_mmp_history_t *smh;
+	int error = ENOENT;
+
+	if (zfs_multihost_history == 0 && shl->size == 0)
+		return (0);
+
+	mutex_enter(&shl->procfs_list.pl_lock);
+	for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
+	    smh = list_prev(&shl->procfs_list.pl_list, smh)) {
+		if (smh->mmp_node_id == mmp_node_id) {
+			ASSERT(smh->io_error == 0);
+			smh->io_error = io_error;
+			smh->duration = duration;
+			error = 0;
+			break;
+		}
+	}
+	mutex_exit(&shl->procfs_list.pl_lock);
+
+	return (error);
+}
+
+/*
+ * Add a new MMP historical record.
+ * error == 0 : a write was issued.
+ * error != 0 : a write was not issued because no leaves were found.
+ */
+void
+spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
+    uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id,
+    int error)
+{
+	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+	spa_mmp_history_t *smh;
+
+	if (zfs_multihost_history == 0 && shl->size == 0)
+		return;
+
+	smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP);
+	smh->txg = txg;
+	smh->timestamp = timestamp;
+	smh->mmp_delay = mmp_delay;
+	if (vd) {
+		smh->vdev_guid = vd->vdev_guid;
+		if (vd->vdev_path)
+			smh->vdev_path = kmem_strdup(vd->vdev_path);
+	}
+	smh->vdev_label = label;
+	smh->mmp_node_id = mmp_node_id;
+
+	if (error) {
+		smh->io_error = error;
+		smh->error_start = gethrtime();
+		smh->vdev_guid = 1;
+	}
+
+	mutex_enter(&shl->procfs_list.pl_lock);
+	procfs_list_add(&shl->procfs_list, smh);
+	shl->size++;
+	spa_mmp_history_truncate(shl, zfs_multihost_history);
+	mutex_exit(&shl->procfs_list.pl_lock);
+}
+
+static void *
+spa_state_addr(kstat_t *ksp, loff_t n)
+{
+	return (ksp->ks_private);	/* return the spa_t */
+}
+
+static int
+spa_state_data(char *buf, size_t size, void *data)
+{
+	spa_t *spa = (spa_t *)data;
+	(void) snprintf(buf, size, "%s\n", spa_state_to_name(spa));
+	return (0);
+}
+
+/*
+ * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state.
+ *
+ * This is a lock-less read of the pool's state (unlike using 'zpool', which
+ * can potentially block for seconds).  Because it doesn't block, it can useful
+ * as a pool heartbeat value.
+ */
+static void
+spa_state_init(spa_t *spa)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.state;
+	char *name;
+	kstat_t *ksp;
+
+	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+	name = kmem_asprintf("zfs/%s", spa_name(spa));
+	ksp = kstat_create(name, 0, "state", "misc",
+	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+
+	shk->kstat = ksp;
+	if (ksp) {
+		ksp->ks_lock = &shk->lock;
+		ksp->ks_data = NULL;
+		ksp->ks_private = spa;
+		ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS;
+		kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr);
+		kstat_install(ksp);
+	}
+
+	kmem_strfree(name);
+}
+
+static void
+spa_health_destroy(spa_t *spa)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.state;
+	kstat_t *ksp = shk->kstat;
+	if (ksp)
+		kstat_delete(ksp);
+
+	mutex_destroy(&shk->lock);
+}
+
+static spa_iostats_t spa_iostats_template = {
+	{ "trim_extents_written",		KSTAT_DATA_UINT64 },
+	{ "trim_bytes_written",			KSTAT_DATA_UINT64 },
+	{ "trim_extents_skipped",		KSTAT_DATA_UINT64 },
+	{ "trim_bytes_skipped",			KSTAT_DATA_UINT64 },
+	{ "trim_extents_failed",		KSTAT_DATA_UINT64 },
+	{ "trim_bytes_failed",			KSTAT_DATA_UINT64 },
+	{ "autotrim_extents_written",		KSTAT_DATA_UINT64 },
+	{ "autotrim_bytes_written",		KSTAT_DATA_UINT64 },
+	{ "autotrim_extents_skipped",		KSTAT_DATA_UINT64 },
+	{ "autotrim_bytes_skipped",		KSTAT_DATA_UINT64 },
+	{ "autotrim_extents_failed",		KSTAT_DATA_UINT64 },
+	{ "autotrim_bytes_failed",		KSTAT_DATA_UINT64 },
+	{ "simple_trim_extents_written",	KSTAT_DATA_UINT64 },
+	{ "simple_trim_bytes_written",		KSTAT_DATA_UINT64 },
+	{ "simple_trim_extents_skipped",	KSTAT_DATA_UINT64 },
+	{ "simple_trim_bytes_skipped",		KSTAT_DATA_UINT64 },
+	{ "simple_trim_extents_failed",		KSTAT_DATA_UINT64 },
+	{ "simple_trim_bytes_failed",		KSTAT_DATA_UINT64 },
+};
+
+#define	SPA_IOSTATS_ADD(stat, val) \
+    atomic_add_64(&iostats->stat.value.ui64, (val));
+
+void
+spa_iostats_trim_add(spa_t *spa, trim_type_t type,
+    uint64_t extents_written, uint64_t bytes_written,
+    uint64_t extents_skipped, uint64_t bytes_skipped,
+    uint64_t extents_failed, uint64_t bytes_failed)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+	kstat_t *ksp = shk->kstat;
+	spa_iostats_t *iostats;
+
+	if (ksp == NULL)
+		return;
+
+	iostats = ksp->ks_data;
+	if (type == TRIM_TYPE_MANUAL) {
+		SPA_IOSTATS_ADD(trim_extents_written, extents_written);
+		SPA_IOSTATS_ADD(trim_bytes_written, bytes_written);
+		SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped);
+		SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped);
+		SPA_IOSTATS_ADD(trim_extents_failed, extents_failed);
+		SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed);
+	} else if (type == TRIM_TYPE_AUTO) {
+		SPA_IOSTATS_ADD(autotrim_extents_written, extents_written);
+		SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written);
+		SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped);
+		SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped);
+		SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed);
+		SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed);
+	} else {
+		SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written);
+		SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written);
+		SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped);
+		SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped);
+		SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed);
+		SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed);
+	}
+}
+
+static int
+spa_iostats_update(kstat_t *ksp, int rw)
+{
+	if (rw == KSTAT_WRITE) {
+		memcpy(ksp->ks_data, &spa_iostats_template,
+		    sizeof (spa_iostats_t));
+	}
+
+	return (0);
+}
+
+static void
+spa_iostats_init(spa_t *spa)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+
+	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+	char *name = kmem_asprintf("zfs/%s", spa_name(spa));
+	kstat_t *ksp = kstat_create(name, 0, "iostats", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	shk->kstat = ksp;
+	if (ksp) {
+		int size = sizeof (spa_iostats_t);
+		ksp->ks_lock = &shk->lock;
+		ksp->ks_private = spa;
+		ksp->ks_update = spa_iostats_update;
+		ksp->ks_data = kmem_alloc(size, KM_SLEEP);
+		memcpy(ksp->ks_data, &spa_iostats_template, size);
+		kstat_install(ksp);
+	}
+
+	kmem_strfree(name);
+}
+
+static void
+spa_iostats_destroy(spa_t *spa)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+	kstat_t *ksp = shk->kstat;
+	if (ksp) {
+		kmem_free(ksp->ks_data, sizeof (spa_iostats_t));
+		kstat_delete(ksp);
+	}
+
+	mutex_destroy(&shk->lock);
+}
+
+void
+spa_stats_init(spa_t *spa)
+{
+	spa_read_history_init(spa);
+	spa_txg_history_init(spa);
+	spa_tx_assign_init(spa);
+	spa_io_history_init(spa);
+	spa_mmp_history_init(spa);
+	spa_state_init(spa);
+	spa_iostats_init(spa);
+}
+
+void
+spa_stats_destroy(spa_t *spa)
+{
+	spa_iostats_destroy(spa);
+	spa_health_destroy(spa);
+	spa_tx_assign_destroy(spa);
+	spa_txg_history_destroy(spa);
+	spa_read_history_destroy(spa);
+	spa_io_history_destroy(spa);
+	spa_mmp_history_destroy(spa);
+}
+
+#if defined(_KERNEL)
+/* CSTYLED */
+module_param(zfs_read_history, int, 0644);
+MODULE_PARM_DESC(zfs_read_history,
+	"Historical statistics for the last N reads");
+
+module_param(zfs_read_history_hits, int, 0644);
+MODULE_PARM_DESC(zfs_read_history_hits,
+	"Include cache hits in read history");
+
+module_param(zfs_txg_history, int, 0644);
+MODULE_PARM_DESC(zfs_txg_history,
+	"Historical statistics for the last N txgs");
+
+module_param(zfs_multihost_history, int, 0644);
+MODULE_PARM_DESC(zfs_multihost_history,
+	"Historical statistics for last N multihost writes");
+/* END CSTYLED */
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/trace.c b/sys/contrib/openzfs/module/os/linux/zfs/trace.c
new file mode 100644
index 000000000000..a690822ae14c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/trace.c
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Each DTRACE_PROBE must define its trace point in one (and only one)
+ * source file, so this dummy file exists for that purpose.
+ */
+
+#include <sys/multilist.h>
+#include <sys/arc_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+#include <sys/zfs_znode.h>
+#include <sys/zil_impl.h>
+
+#ifdef _KERNEL
+#define	CREATE_TRACE_POINTS
+#include <sys/trace.h>
+#include <sys/trace_acl.h>
+#include <sys/trace_arc.h>
+#include <sys/trace_dbgmsg.h>
+#include <sys/trace_dbuf.h>
+#include <sys/trace_dmu.h>
+#include <sys/trace_dnode.h>
+#include <sys/trace_multilist.h>
+#include <sys/trace_rrwlock.h>
+#include <sys/trace_txg.h>
+#include <sys/trace_vdev.h>
+#include <sys/trace_zil.h>
+#include <sys/trace_zio.h>
+#include <sys/trace_zrlock.h>
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
new file mode 100644
index 000000000000..5a2245436c72
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@@ -0,0 +1,873 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * LLNL-CODE-403049.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <linux/msdos_fs.h>
+#include <linux/vfs_compat.h>
+
+typedef struct vdev_disk {
+	struct block_device		*vd_bdev;
+	krwlock_t			vd_lock;
+} vdev_disk_t;
+
+/*
+ * Unique identifier for the exclusive vdev holder.
+ */
+static void *zfs_vdev_holder = VDEV_HOLDER;
+
+/*
+ * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
+ * device is missing. The missing path may be transient since the links
+ * can be briefly removed and recreated in response to udev events.
+ */
+static unsigned zfs_vdev_open_timeout_ms = 1000;
+
+/*
+ * Size of the "reserved" partition, in blocks.
+ */
+#define	EFI_MIN_RESV_SIZE	(16 * 1024)
+
+/*
+ * Virtual device vector for disks.
+ */
+typedef struct dio_request {
+	zio_t			*dr_zio;	/* Parent ZIO */
+	atomic_t		dr_ref;		/* References */
+	int			dr_error;	/* Bio error */
+	int			dr_bio_count;	/* Count of bio's */
+	struct bio		*dr_bio[0];	/* Attached bio's */
+} dio_request_t;
+
+static fmode_t
+vdev_bdev_mode(spa_mode_t spa_mode)
+{
+	fmode_t mode = 0;
+
+	if (spa_mode & SPA_MODE_READ)
+		mode |= FMODE_READ;
+
+	if (spa_mode & SPA_MODE_WRITE)
+		mode |= FMODE_WRITE;
+
+	return (mode);
+}
+
+/*
+ * Returns the usable capacity (in bytes) for the partition or disk.
+ */
+static uint64_t
+bdev_capacity(struct block_device *bdev)
+{
+	return (i_size_read(bdev->bd_inode));
+}
+
+/*
+ * Returns the maximum expansion capacity of the block device (in bytes).
+ *
+ * It is possible to expand a vdev when it has been created as a wholedisk
+ * and the containing block device has increased in capacity.  Or when the
+ * partition containing the pool has been manually increased in size.
+ *
+ * This function is only responsible for calculating the potential expansion
+ * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
+ * responsible for verifying the expected partition layout in the wholedisk
+ * case, and updating the partition table if appropriate.  Once the partition
+ * size has been increased the additional capacity will be visible using
+ * bdev_capacity().
+ *
+ * The returned maximum expansion capacity is always expected to be larger, or
+ * at the very least equal, to its usable capacity to prevent overestimating
+ * the pool expandsize.
+ */
+static uint64_t
+bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
+{
+	uint64_t psize;
+	int64_t available;
+
+	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
+		/*
+		 * When reporting maximum expansion capacity for a wholedisk
+		 * deduct any capacity which is expected to be lost due to
+		 * alignment restrictions.  Over reporting this value isn't
+		 * harmful and would only result in slightly less capacity
+		 * than expected post expansion.
+		 * The estimated available space may be slightly smaller than
+		 * bdev_capacity() for devices where the number of sectors is
+		 * not a multiple of the alignment size and the partition layout
+		 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
+		 * "reserved" EFI partition: in such cases return the device
+		 * usable capacity.
+		 */
+		available = i_size_read(bdev->bd_contains->bd_inode) -
+		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
+		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
+		psize = MAX(available, bdev_capacity(bdev));
+	} else {
+		psize = bdev_capacity(bdev);
+	}
+
+	return (psize);
+}
+
+static void
+vdev_disk_error(zio_t *zio)
+{
+	/*
+	 * This function can be called in interrupt context, for instance while
+	 * handling IRQs coming from a misbehaving disk device; use printk()
+	 * which is safe from any context.
+	 */
+	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
+	    "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
+	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
+	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
+	    zio->io_flags);
+}
+
+static int
+vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+	struct block_device *bdev;
+	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
+	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
+	vdev_disk_t *vd;
+
+	/* Must have a pathname and it must be absolute. */
+	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
+		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		vdev_dbgmsg(v, "invalid vdev_path");
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Reopen the device if it is currently open.  When expanding a
+	 * partition force re-scanning the partition table while closed
+	 * in order to get an accurate updated block device size.  Then
+	 * since udev may need to recreate the device links increase the
+	 * open retry timeout before reporting the device as unavailable.
+	 */
+	vd = v->vdev_tsd;
+	if (vd) {
+		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
+		boolean_t reread_part = B_FALSE;
+
+		rw_enter(&vd->vd_lock, RW_WRITER);
+		bdev = vd->vd_bdev;
+		vd->vd_bdev = NULL;
+
+		if (bdev) {
+			if (v->vdev_expanding && bdev != bdev->bd_contains) {
+				bdevname(bdev->bd_contains, disk_name + 5);
+				reread_part = B_TRUE;
+			}
+
+			blkdev_put(bdev, mode | FMODE_EXCL);
+		}
+
+		if (reread_part) {
+			bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL,
+			    zfs_vdev_holder);
+			if (!IS_ERR(bdev)) {
+				int error = vdev_bdev_reread_part(bdev);
+				blkdev_put(bdev, mode | FMODE_EXCL);
+				if (error == 0) {
+					timeout = MSEC2NSEC(
+					    zfs_vdev_open_timeout_ms * 2);
+				}
+			}
+		}
+	} else {
+		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
+		rw_enter(&vd->vd_lock, RW_WRITER);
+	}
+
+	/*
+	 * Devices are always opened by the path provided at configuration
+	 * time.  This means that if the provided path is a udev by-id path
+	 * then drives may be re-cabled without an issue.  If the provided
+	 * path is a udev by-path path, then the physical location information
+	 * will be preserved.  This can be critical for more complicated
+	 * configurations where drives are located in specific physical
+	 * locations to maximize the systems tolerance to component failure.
+	 *
+	 * Alternatively, you can provide your own udev rule to flexibly map
+	 * the drives as you see fit.  It is not advised that you use the
+	 * /dev/[hd]d devices which may be reordered due to probing order.
+	 * Devices in the wrong locations will be detected by the higher
+	 * level vdev validation.
+	 *
+	 * The specified paths may be briefly removed and recreated in
+	 * response to udev events.  This should be exceptionally unlikely
+	 * because the zpool command makes every effort to verify these paths
+	 * have already settled prior to reaching this point.  Therefore,
+	 * a ENOENT failure at this point is highly likely to be transient
+	 * and it is reasonable to sleep and retry before giving up.  In
+	 * practice delays have been observed to be on the order of 100ms.
+	 */
+	hrtime_t start = gethrtime();
+	bdev = ERR_PTR(-ENXIO);
+	while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
+		bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL,
+		    zfs_vdev_holder);
+		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
+			schedule_timeout(MSEC_TO_TICK(10));
+		} else if (IS_ERR(bdev)) {
+			break;
+		}
+	}
+
+	if (IS_ERR(bdev)) {
+		int error = -PTR_ERR(bdev);
+		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
+		    (u_longlong_t)(gethrtime() - start),
+		    (u_longlong_t)timeout);
+		vd->vd_bdev = NULL;
+		v->vdev_tsd = vd;
+		rw_exit(&vd->vd_lock);
+		return (SET_ERROR(error));
+	} else {
+		vd->vd_bdev = bdev;
+		v->vdev_tsd = vd;
+		rw_exit(&vd->vd_lock);
+	}
+
+	struct request_queue *q = bdev_get_queue(vd->vd_bdev);
+
+	/*  Determine the physical block size */
+	int physical_block_size = bdev_physical_block_size(vd->vd_bdev);
+
+	/*  Determine the logical block size */
+	int logical_block_size = bdev_logical_block_size(vd->vd_bdev);
+
+	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
+	v->vdev_nowritecache = B_FALSE;
+
+	/* Set when device reports it supports TRIM. */
+	v->vdev_has_trim = !!blk_queue_discard(q);
+
+	/* Set when device reports it supports secure TRIM. */
+	v->vdev_has_securetrim = !!blk_queue_discard_secure(q);
+
+	/* Inform the ZIO pipeline that we are non-rotational */
+	v->vdev_nonrot = blk_queue_nonrot(q);
+
+	/* Physical volume size in bytes for the partition */
+	*psize = bdev_capacity(vd->vd_bdev);
+
+	/* Physical volume size in bytes including possible expansion space */
+	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
+
+	/* Based on the minimum sector size set the block size */
+	*physical_ashift = highbit64(MAX(physical_block_size,
+	    SPA_MINBLOCKSIZE)) - 1;
+
+	*logical_ashift = highbit64(MAX(logical_block_size,
+	    SPA_MINBLOCKSIZE)) - 1;
+
+	return (0);
+}
+
+static void
+vdev_disk_close(vdev_t *v)
+{
+	vdev_disk_t *vd = v->vdev_tsd;
+
+	if (v->vdev_reopening || vd == NULL)
+		return;
+
+	if (vd->vd_bdev != NULL) {
+		blkdev_put(vd->vd_bdev,
+		    vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL);
+	}
+
+	rw_destroy(&vd->vd_lock);
+	kmem_free(vd, sizeof (vdev_disk_t));
+	v->vdev_tsd = NULL;
+}
+
+static dio_request_t *
+vdev_disk_dio_alloc(int bio_count)
+{
+	dio_request_t *dr;
+	int i;
+
+	dr = kmem_zalloc(sizeof (dio_request_t) +
+	    sizeof (struct bio *) * bio_count, KM_SLEEP);
+	if (dr) {
+		atomic_set(&dr->dr_ref, 0);
+		dr->dr_bio_count = bio_count;
+		dr->dr_error = 0;
+
+		for (i = 0; i < dr->dr_bio_count; i++)
+			dr->dr_bio[i] = NULL;
+	}
+
+	return (dr);
+}
+
+static void
+vdev_disk_dio_free(dio_request_t *dr)
+{
+	int i;
+
+	for (i = 0; i < dr->dr_bio_count; i++)
+		if (dr->dr_bio[i])
+			bio_put(dr->dr_bio[i]);
+
+	kmem_free(dr, sizeof (dio_request_t) +
+	    sizeof (struct bio *) * dr->dr_bio_count);
+}
+
+static void
+vdev_disk_dio_get(dio_request_t *dr)
+{
+	atomic_inc(&dr->dr_ref);
+}
+
+static int
+vdev_disk_dio_put(dio_request_t *dr)
+{
+	int rc = atomic_dec_return(&dr->dr_ref);
+
+	/*
+	 * Free the dio_request when the last reference is dropped and
+	 * ensure zio_interpret is called only once with the correct zio
+	 */
+	if (rc == 0) {
+		zio_t *zio = dr->dr_zio;
+		int error = dr->dr_error;
+
+		vdev_disk_dio_free(dr);
+
+		if (zio) {
+			zio->io_error = error;
+			ASSERT3S(zio->io_error, >=, 0);
+			if (zio->io_error)
+				vdev_disk_error(zio);
+
+			zio_delay_interrupt(zio);
+		}
+	}
+
+	return (rc);
+}
+
+BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
+{
+	dio_request_t *dr = bio->bi_private;
+	int rc;
+
+	if (dr->dr_error == 0) {
+#ifdef HAVE_1ARG_BIO_END_IO_T
+		dr->dr_error = BIO_END_IO_ERROR(bio);
+#else
+		if (error)
+			dr->dr_error = -(error);
+		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+			dr->dr_error = EIO;
+#endif
+	}
+
+	/* Drop reference acquired by __vdev_disk_physio */
+	rc = vdev_disk_dio_put(dr);
+}
+
+static inline void
+vdev_submit_bio_impl(struct bio *bio)
+{
+#ifdef HAVE_1ARG_SUBMIT_BIO
+	submit_bio(bio);
+#else
+	submit_bio(0, bio);
+#endif
+}
+
+#ifdef HAVE_BIO_SET_DEV
+#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
+/*
+ * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
+ * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
+ * As a side effect the function was converted to GPL-only.  Define our
+ * own version when needed which uses rcu_read_lock_sched().
+ */
+#if defined(HAVE_BLKG_TRYGET_GPL_ONLY)
+static inline bool
+vdev_blkg_tryget(struct blkcg_gq *blkg)
+{
+	struct percpu_ref *ref = &blkg->refcnt;
+	unsigned long __percpu *count;
+	bool rc;
+
+	rcu_read_lock_sched();
+
+	if (__ref_is_percpu(ref, &count)) {
+		this_cpu_inc(*count);
+		rc = true;
+	} else {
+		rc = atomic_long_inc_not_zero(&ref->count);
+	}
+
+	rcu_read_unlock_sched();
+
+	return (rc);
+}
+#elif defined(HAVE_BLKG_TRYGET)
+#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
+#endif
+/*
+ * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
+ * GPL-only bio_associate_blkg() symbol thus inadvertently converting
+ * the entire macro.  Provide a minimal version which always assigns the
+ * request queue's root_blkg to the bio.
+ */
+static inline void
+vdev_bio_associate_blkg(struct bio *bio)
+{
+	struct request_queue *q = bio->bi_disk->queue;
+
+	ASSERT3P(q, !=, NULL);
+	ASSERT3P(bio->bi_blkg, ==, NULL);
+
+	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
+		bio->bi_blkg = q->root_blkg;
+}
+#define	bio_associate_blkg vdev_bio_associate_blkg
+#endif
+#else
+/*
+ * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
+ */
+static inline void
+bio_set_dev(struct bio *bio, struct block_device *bdev)
+{
+	bio->bi_bdev = bdev;
+}
+#endif /* HAVE_BIO_SET_DEV */
+
+static inline void
+vdev_submit_bio(struct bio *bio)
+{
+	struct bio_list *bio_list = current->bio_list;
+	current->bio_list = NULL;
+	vdev_submit_bio_impl(bio);
+	current->bio_list = bio_list;
+}
+
+static int
+__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
+    size_t io_size, uint64_t io_offset, int rw, int flags)
+{
+	dio_request_t *dr;
+	uint64_t abd_offset;
+	uint64_t bio_offset;
+	int bio_size, bio_count = 16;
+	int i = 0, error = 0;
+	struct blk_plug plug;
+
+	/*
+	 * Accessing outside the block device is never allowed.
+	 */
+	if (io_offset + io_size > bdev->bd_inode->i_size) {
+		vdev_dbgmsg(zio->io_vd,
+		    "Illegal access %llu size %llu, device size %llu",
+		    io_offset, io_size, i_size_read(bdev->bd_inode));
+		return (SET_ERROR(EIO));
+	}
+
+retry:
+	dr = vdev_disk_dio_alloc(bio_count);
+	if (dr == NULL)
+		return (SET_ERROR(ENOMEM));
+
+	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
+		bio_set_flags_failfast(bdev, &flags);
+
+	dr->dr_zio = zio;
+
+	/*
+	 * When the IO size exceeds the maximum bio size for the request
+	 * queue we are forced to break the IO in multiple bio's and wait
+	 * for them all to complete.  Ideally, all pool users will set
+	 * their volume block size to match the maximum request size and
+	 * the common case will be one bio per vdev IO request.
+	 */
+
+	abd_offset = 0;
+	bio_offset = io_offset;
+	bio_size   = io_size;
+	for (i = 0; i <= dr->dr_bio_count; i++) {
+
+		/* Finished constructing bio's for given buffer */
+		if (bio_size <= 0)
+			break;
+
+		/*
+		 * By default only 'bio_count' bio's per dio are allowed.
+		 * However, if we find ourselves in a situation where more
+		 * are needed we allocate a larger dio and warn the user.
+		 */
+		if (dr->dr_bio_count == i) {
+			vdev_disk_dio_free(dr);
+			bio_count *= 2;
+			goto retry;
+		}
+
+		/* bio_alloc() with __GFP_WAIT never returns NULL */
+		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
+		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
+		    BIO_MAX_PAGES));
+		if (unlikely(dr->dr_bio[i] == NULL)) {
+			vdev_disk_dio_free(dr);
+			return (SET_ERROR(ENOMEM));
+		}
+
+		/* Matching put called by vdev_disk_physio_completion */
+		vdev_disk_dio_get(dr);
+
+		bio_set_dev(dr->dr_bio[i], bdev);
+		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
+		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
+		dr->dr_bio[i]->bi_private = dr;
+		bio_set_op_attrs(dr->dr_bio[i], rw, flags);
+
+		/* Remaining size is returned to become the new size */
+		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
+		    bio_size, abd_offset);
+
+		/* Advance in buffer and construct another bio if needed */
+		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
+		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
+	}
+
+	/* Extra reference to protect dio_request during vdev_submit_bio */
+	vdev_disk_dio_get(dr);
+
+	if (dr->dr_bio_count > 1)
+		blk_start_plug(&plug);
+
+	/* Submit all bio's associated with this dio */
+	for (i = 0; i < dr->dr_bio_count; i++)
+		if (dr->dr_bio[i])
+			vdev_submit_bio(dr->dr_bio[i]);
+
+	if (dr->dr_bio_count > 1)
+		blk_finish_plug(&plug);
+
+	(void) vdev_disk_dio_put(dr);
+
+	return (error);
+}
+
+BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
+{
+	zio_t *zio = bio->bi_private;
+#ifdef HAVE_1ARG_BIO_END_IO_T
+	zio->io_error = BIO_END_IO_ERROR(bio);
+#else
+	zio->io_error = -error;
+#endif
+
+	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
+		zio->io_vd->vdev_nowritecache = B_TRUE;
+
+	bio_put(bio);
+	ASSERT3S(zio->io_error, >=, 0);
+	if (zio->io_error)
+		vdev_disk_error(zio);
+	zio_interrupt(zio);
+}
+
+static int
+vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
+{
+	struct request_queue *q;
+	struct bio *bio;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return (SET_ERROR(ENXIO));
+
+	bio = bio_alloc(GFP_NOIO, 0);
+	/* bio_alloc() with __GFP_WAIT never returns NULL */
+	if (unlikely(bio == NULL))
+		return (SET_ERROR(ENOMEM));
+
+	bio->bi_end_io = vdev_disk_io_flush_completion;
+	bio->bi_private = zio;
+	bio_set_dev(bio, bdev);
+	bio_set_flush(bio);
+	vdev_submit_bio(bio);
+	invalidate_bdev(bdev);
+
+	return (0);
+}
+
+static void
+vdev_disk_io_start(zio_t *zio)
+{
+	vdev_t *v = zio->io_vd;
+	vdev_disk_t *vd = v->vdev_tsd;
+	unsigned long trim_flags = 0;
+	int rw, error;
+
+	/*
+	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+	 * Nothing to be done here but return failure.
+	 */
+	if (vd == NULL) {
+		zio->io_error = ENXIO;
+		zio_interrupt(zio);
+		return;
+	}
+
+	rw_enter(&vd->vd_lock, RW_READER);
+
+	/*
+	 * If the vdev is closed, it's likely due to a failed reopen and is
+	 * in the UNAVAIL state.  Nothing to be done here but return failure.
+	 */
+	if (vd->vd_bdev == NULL) {
+		rw_exit(&vd->vd_lock);
+		zio->io_error = ENXIO;
+		zio_interrupt(zio);
+		return;
+	}
+
+	switch (zio->io_type) {
+	case ZIO_TYPE_IOCTL:
+
+		if (!vdev_readable(v)) {
+			rw_exit(&vd->vd_lock);
+			zio->io_error = SET_ERROR(ENXIO);
+			zio_interrupt(zio);
+			return;
+		}
+
+		switch (zio->io_cmd) {
+		case DKIOCFLUSHWRITECACHE:
+
+			if (zfs_nocacheflush)
+				break;
+
+			if (v->vdev_nowritecache) {
+				zio->io_error = SET_ERROR(ENOTSUP);
+				break;
+			}
+
+			error = vdev_disk_io_flush(vd->vd_bdev, zio);
+			if (error == 0) {
+				rw_exit(&vd->vd_lock);
+				return;
+			}
+
+			zio->io_error = error;
+
+			break;
+
+		default:
+			zio->io_error = SET_ERROR(ENOTSUP);
+		}
+
+		rw_exit(&vd->vd_lock);
+		zio_execute(zio);
+		return;
+	case ZIO_TYPE_WRITE:
+		rw = WRITE;
+		break;
+
+	case ZIO_TYPE_READ:
+		rw = READ;
+		break;
+
+	case ZIO_TYPE_TRIM:
+#if defined(BLKDEV_DISCARD_SECURE)
+		if (zio->io_trim_flags & ZIO_TRIM_SECURE)
+			trim_flags |= BLKDEV_DISCARD_SECURE;
+#endif
+		zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
+		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
+		    trim_flags);
+
+		rw_exit(&vd->vd_lock);
+		zio_interrupt(zio);
+		return;
+
+	default:
+		rw_exit(&vd->vd_lock);
+		zio->io_error = SET_ERROR(ENOTSUP);
+		zio_interrupt(zio);
+		return;
+	}
+
+	zio->io_target_timestamp = zio_handle_io_delay(zio);
+	error = __vdev_disk_physio(vd->vd_bdev, zio,
+	    zio->io_size, zio->io_offset, rw, 0);
+	rw_exit(&vd->vd_lock);
+
+	if (error) {
+		zio->io_error = error;
+		zio_interrupt(zio);
+		return;
+	}
+}
+
+static void
+vdev_disk_io_done(zio_t *zio)
+{
+	/*
+	 * If the device returned EIO, we revalidate the media.  If it is
+	 * determined the media has changed this triggers the asynchronous
+	 * removal of the device from the configuration.
+	 */
+	if (zio->io_error == EIO) {
+		vdev_t *v = zio->io_vd;
+		vdev_disk_t *vd = v->vdev_tsd;
+
+		if (check_disk_change(vd->vd_bdev)) {
+			invalidate_bdev(vd->vd_bdev);
+			v->vdev_remove_wanted = B_TRUE;
+			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+		}
+	}
+}
+
+static void
+vdev_disk_hold(vdev_t *vd)
+{
+	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+	/* We must have a pathname, and it must be absolute. */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
+		return;
+
+	/*
+	 * Only prefetch path and devid info if the device has
+	 * never been opened.
+	 */
+	if (vd->vdev_tsd != NULL)
+		return;
+
+}
+
+static void
+vdev_disk_rele(vdev_t *vd)
+{
+	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+	/* XXX: Implement me as a vnode rele for the device */
+}
+
+vdev_ops_t vdev_disk_ops = {
+	.vdev_op_open = vdev_disk_open,
+	.vdev_op_close = vdev_disk_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_disk_io_start,
+	.vdev_op_io_done = vdev_disk_io_done,
+	.vdev_op_state_change = NULL,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = vdev_disk_hold,
+	.vdev_op_rele = vdev_disk_rele,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
+	.vdev_op_leaf = B_TRUE			/* leaf vdev */
+};
+
+/*
+ * The zfs_vdev_scheduler module option has been deprecated. Setting this
+ * value no longer has any effect.  It has not yet been entirely removed
+ * to allow the module to be loaded if this option is specified in the
+ * /etc/modprobe.d/zfs.conf file.  The following warning will be logged.
+ */
+static int
+param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
+{
+	int error = param_set_charp(val, kp);
+	if (error == 0) {
+		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
+		    "is not supported.\n");
+	}
+
+	return (error);
+}
+
+char *zfs_vdev_scheduler = "unused";
+module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
+    param_get_charp, &zfs_vdev_scheduler, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
+
+int
+param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
+{
+	uint64_t val;
+	int error;
+
+	error = kstrtoull(buf, 0, &val);
+	if (error < 0)
+		return (SET_ERROR(error));
+
+	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
+		return (SET_ERROR(-EINVAL));
+
+	error = param_set_ulong(buf, kp);
+	if (error < 0)
+		return (SET_ERROR(error));
+
+	return (0);
+}
+
+int
+param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
+{
+	uint64_t val;
+	int error;
+
+	error = kstrtoull(buf, 0, &val);
+	if (error < 0)
+		return (SET_ERROR(error));
+
+	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
+		return (SET_ERROR(-EINVAL));
+
+	error = param_set_ulong(buf, kp);
+	if (error < 0)
+		return (SET_ERROR(error));
+
+	return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c
new file mode 100644
index 000000000000..a4e71ca40788
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c
@@ -0,0 +1,348 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/abd.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/zfs_file.h>
+#ifdef _KERNEL
+#include <linux/falloc.h>
+#endif
+/*
+ * Virtual device vector for files.
+ */
+
+static taskq_t *vdev_file_taskq;
+
+static void
+vdev_file_hold(vdev_t *vd)
+{
+	ASSERT(vd->vdev_path != NULL);
+}
+
+static void
+vdev_file_rele(vdev_t *vd)
+{
+	ASSERT(vd->vdev_path != NULL);
+}
+
+static mode_t
+vdev_file_open_mode(spa_mode_t spa_mode)
+{
+	mode_t mode = 0;
+
+	if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) {
+		mode = O_RDWR;
+	} else if (spa_mode & SPA_MODE_READ) {
+		mode = O_RDONLY;
+	} else if (spa_mode & SPA_MODE_WRITE) {
+		mode = O_WRONLY;
+	}
+
+	return (mode | O_LARGEFILE);
+}
+
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+	vdev_file_t *vf;
+	zfs_file_t *fp;
+	zfs_file_attr_t zfa;
+	int error;
+
+	/*
+	 * Rotational optimizations only make sense on block devices.
+	 */
+	vd->vdev_nonrot = B_TRUE;
+
+	/*
+	 * Allow TRIM on file based vdevs.  This may not always be supported,
+	 * since it depends on your kernel version and underlying filesystem
+	 * type but it is always safe to attempt.
+	 */
+	vd->vdev_has_trim = B_TRUE;
+
+	/*
+	 * Disable secure TRIM on file based vdevs.  There is no way to
+	 * request this behavior from the underlying filesystem.
+	 */
+	vd->vdev_has_securetrim = B_FALSE;
+
+	/*
+	 * We must have a pathname, and it must be absolute.
+	 */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Reopen the device if it's not currently open.  Otherwise,
+	 * just update the physical size of the device.
+	 */
+	if (vd->vdev_tsd != NULL) {
+		ASSERT(vd->vdev_reopening);
+		vf = vd->vdev_tsd;
+		goto skip_open;
+	}
+
+	vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+
+	/*
+	 * We always open the files from the root of the global zone, even if
+	 * we're in a local zone.  If the user has gotten to this point, the
+	 * administrator has already decided that the pool should be available
+	 * to local zone users, so the underlying devices should be as well.
+	 */
+	ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+
+	error = zfs_file_open(vd->vdev_path,
+	    vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp);
+	if (error) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+
+	vf->vf_file = fp;
+
+#ifdef _KERNEL
+	/*
+	 * Make sure it's a regular file.
+	 */
+	if (zfs_file_getattr(fp, &zfa)) {
+		return (SET_ERROR(ENODEV));
+	}
+	if (!S_ISREG(zfa.zfa_mode)) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (SET_ERROR(ENODEV));
+	}
+#endif
+
+skip_open:
+
+	error =  zfs_file_getattr(vf->vf_file, &zfa);
+	if (error) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+
+	*max_psize = *psize = zfa.zfa_size;
+	*logical_ashift = SPA_MINBLOCKSHIFT;
+	*physical_ashift = SPA_MINBLOCKSHIFT;
+
+	return (0);
+}
+
+static void
+vdev_file_close(vdev_t *vd)
+{
+	vdev_file_t *vf = vd->vdev_tsd;
+
+	if (vd->vdev_reopening || vf == NULL)
+		return;
+
+	if (vf->vf_file != NULL) {
+		(void) zfs_file_close(vf->vf_file);
+	}
+
+	vd->vdev_delayed_close = B_FALSE;
+	kmem_free(vf, sizeof (vdev_file_t));
+	vd->vdev_tsd = NULL;
+}
+
+static void
+vdev_file_io_strategy(void *arg)
+{
+	zio_t *zio = (zio_t *)arg;
+	vdev_t *vd = zio->io_vd;
+	vdev_file_t *vf = vd->vdev_tsd;
+	ssize_t resid;
+	void *buf;
+	loff_t off;
+	ssize_t size;
+	int err;
+
+	off = zio->io_offset;
+	size = zio->io_size;
+	resid = 0;
+
+	if (zio->io_type == ZIO_TYPE_READ) {
+		buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+		err = zfs_file_pread(vf->vf_file, buf, size, off, &resid);
+		abd_return_buf_copy(zio->io_abd, buf, size);
+	} else {
+		buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+		err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
+		abd_return_buf(zio->io_abd, buf, size);
+	}
+	zio->io_error = err;
+	if (resid != 0 && zio->io_error == 0)
+		zio->io_error = SET_ERROR(ENOSPC);
+
+	zio_delay_interrupt(zio);
+}
+
+static void
+vdev_file_io_fsync(void *arg)
+{
+	zio_t *zio = (zio_t *)arg;
+	vdev_file_t *vf = zio->io_vd->vdev_tsd;
+
+	zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC);
+
+	zio_interrupt(zio);
+}
+
+static void
+vdev_file_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_file_t *vf = vd->vdev_tsd;
+
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
+		/* XXPOLICY */
+		if (!vdev_readable(vd)) {
+			zio->io_error = SET_ERROR(ENXIO);
+			zio_interrupt(zio);
+			return;
+		}
+
+		switch (zio->io_cmd) {
+		case DKIOCFLUSHWRITECACHE:
+
+			if (zfs_nocacheflush)
+				break;
+
+			/*
+			 * We cannot safely call vfs_fsync() when PF_FSTRANS
+			 * is set in the current context.  Filesystems like
+			 * XFS include sanity checks to verify it is not
+			 * already set, see xfs_vm_writepage().  Therefore
+			 * the sync must be dispatched to a different context.
+			 */
+			if (__spl_pf_fstrans_check()) {
+				VERIFY3U(taskq_dispatch(vdev_file_taskq,
+				    vdev_file_io_fsync, zio, TQ_SLEEP), !=,
+				    TASKQID_INVALID);
+				return;
+			}
+
+			zio->io_error = zfs_file_fsync(vf->vf_file,
+			    O_SYNC | O_DSYNC);
+			break;
+		default:
+			zio->io_error = SET_ERROR(ENOTSUP);
+		}
+
+		zio_execute(zio);
+		return;
+	} else if (zio->io_type == ZIO_TYPE_TRIM) {
+		int mode = 0;
+
+		ASSERT3U(zio->io_size, !=, 0);
+#ifdef __linux__
+		mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+#endif
+		zio->io_error = zfs_file_fallocate(vf->vf_file,
+		    mode, zio->io_offset, zio->io_size);
+		zio_execute(zio);
+		return;
+	}
+
+	zio->io_target_timestamp = zio_handle_io_delay(zio);
+
+	VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
+	    TQ_SLEEP), !=, TASKQID_INVALID);
+}
+
+/* ARGSUSED */
+static void
+vdev_file_io_done(zio_t *zio)
+{
+}
+
+vdev_ops_t vdev_file_ops = {
+	.vdev_op_open = vdev_file_open,
+	.vdev_op_close = vdev_file_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_file_io_start,
+	.vdev_op_io_done = vdev_file_io_done,
+	.vdev_op_state_change = NULL,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = vdev_file_hold,
+	.vdev_op_rele = vdev_file_rele,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_type = VDEV_TYPE_FILE,		/* name of this vdev type */
+	.vdev_op_leaf = B_TRUE			/* leaf vdev */
+};
+
+void
+vdev_file_init(void)
+{
+	vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16),
+	    minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC);
+
+	VERIFY(vdev_file_taskq);
+}
+
+void
+vdev_file_fini(void)
+{
+	taskq_destroy(vdev_file_taskq);
+}
+
+/*
+ * From userland we access disks just like files.
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+	.vdev_op_open = vdev_file_open,
+	.vdev_op_close = vdev_file_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_io_start = vdev_file_io_start,
+	.vdev_op_io_done = vdev_file_io_done,
+	.vdev_op_state_change = NULL,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = vdev_file_hold,
+	.vdev_op_rele = vdev_file_rele,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
+	.vdev_op_leaf = B_TRUE			/* leaf vdev */
+};
+
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
new file mode 100644
index 000000000000..8d79878c0458
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
@@ -0,0 +1,2932 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
+ */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/sid.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/fs/zfs.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/trace_acl.h>
+#include <sys/zpl.h>
+
+#define	ALLOW	ACE_ACCESS_ALLOWED_ACE_TYPE
+#define	DENY	ACE_ACCESS_DENIED_ACE_TYPE
+#define	MAX_ACE_TYPE	ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE
+#define	MIN_ACE_TYPE	ALLOW
+
+#define	OWNING_GROUP		(ACE_GROUP|ACE_IDENTIFIER_GROUP)
+#define	EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
+    ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
+#define	EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
+    ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define	OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
+    ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+
+#define	ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
+    ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
+    ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
+    ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
+
+#define	WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
+#define	WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
+    ACE_DELETE|ACE_DELETE_CHILD)
+#define	WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
+
+#define	OGE_CLEAR	(ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+    ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define	OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+    ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define	ALL_INHERIT	(ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
+    ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE)
+
+#define	RESTRICTED_CLEAR	(ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define	V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\
+    ZFS_ACL_PROTECTED)
+
+#define	ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
+    ZFS_ACL_OBJ_ACE)
+
+#define	ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH)
+
+#define	IDMAP_WK_CREATOR_OWNER_UID	2147483648U
+
+static uint16_t
+zfs_ace_v0_get_type(void *acep)
+{
+	return (((zfs_oldace_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_v0_get_flags(void *acep)
+{
+	return (((zfs_oldace_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_v0_get_mask(void *acep)
+{
+	return (((zfs_oldace_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_v0_get_who(void *acep)
+{
+	return (((zfs_oldace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_v0_set_type(void *acep, uint16_t type)
+{
+	((zfs_oldace_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_v0_set_flags(void *acep, uint16_t flags)
+{
+	((zfs_oldace_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_v0_set_mask(void *acep, uint32_t mask)
+{
+	((zfs_oldace_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_v0_set_who(void *acep, uint64_t who)
+{
+	((zfs_oldace_t *)acep)->z_fuid = who;
+}
+
+/*ARGSUSED*/
+static size_t
+zfs_ace_v0_size(void *acep)
+{
+	return (sizeof (zfs_oldace_t));
+}
+
+static size_t
+zfs_ace_v0_abstract_size(void)
+{
+	return (sizeof (zfs_oldace_t));
+}
+
+static int
+zfs_ace_v0_mask_off(void)
+{
+	return (offsetof(zfs_oldace_t, z_access_mask));
+}
+
+/*ARGSUSED*/
+static int
+zfs_ace_v0_data(void *acep, void **datap)
+{
+	*datap = NULL;
+	return (0);
+}
+
+static acl_ops_t zfs_acl_v0_ops = {
+	.ace_mask_get = zfs_ace_v0_get_mask,
+	.ace_mask_set = zfs_ace_v0_set_mask,
+	.ace_flags_get = zfs_ace_v0_get_flags,
+	.ace_flags_set = zfs_ace_v0_set_flags,
+	.ace_type_get = zfs_ace_v0_get_type,
+	.ace_type_set = zfs_ace_v0_set_type,
+	.ace_who_get = zfs_ace_v0_get_who,
+	.ace_who_set = zfs_ace_v0_set_who,
+	.ace_size = zfs_ace_v0_size,
+	.ace_abstract_size = zfs_ace_v0_abstract_size,
+	.ace_mask_off = zfs_ace_v0_mask_off,
+	.ace_data = zfs_ace_v0_data
+};
+
+static uint16_t
+zfs_ace_fuid_get_type(void *acep)
+{
+	return (((zfs_ace_hdr_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_fuid_get_flags(void *acep)
+{
+	return (((zfs_ace_hdr_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_fuid_get_mask(void *acep)
+{
+	return (((zfs_ace_hdr_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_fuid_get_who(void *args)
+{
+	uint16_t entry_type;
+	zfs_ace_t *acep = args;
+
+	entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+	if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+	    entry_type == ACE_EVERYONE)
+		return (-1);
+	return (((zfs_ace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_fuid_set_type(void *acep, uint16_t type)
+{
+	((zfs_ace_hdr_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_fuid_set_flags(void *acep, uint16_t flags)
+{
+	((zfs_ace_hdr_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_fuid_set_mask(void *acep, uint32_t mask)
+{
+	((zfs_ace_hdr_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_fuid_set_who(void *arg, uint64_t who)
+{
+	zfs_ace_t *acep = arg;
+
+	uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+	if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+	    entry_type == ACE_EVERYONE)
+		return;
+	acep->z_fuid = who;
+}
+
+static size_t
+zfs_ace_fuid_size(void *acep)
+{
+	zfs_ace_hdr_t *zacep = acep;
+	uint16_t entry_type;
+
+	switch (zacep->z_type) {
+	case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+	case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+		return (sizeof (zfs_object_ace_t));
+	case ALLOW:
+	case DENY:
+		entry_type =
+		    (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
+		if (entry_type == ACE_OWNER ||
+		    entry_type == OWNING_GROUP ||
+		    entry_type == ACE_EVERYONE)
+			return (sizeof (zfs_ace_hdr_t));
+		/*FALLTHROUGH*/
+	default:
+		return (sizeof (zfs_ace_t));
+	}
+}
+
+static size_t
+zfs_ace_fuid_abstract_size(void)
+{
+	return (sizeof (zfs_ace_hdr_t));
+}
+
+static int
+zfs_ace_fuid_mask_off(void)
+{
+	return (offsetof(zfs_ace_hdr_t, z_access_mask));
+}
+
+static int
+zfs_ace_fuid_data(void *acep, void **datap)
+{
+	zfs_ace_t *zacep = acep;
+	zfs_object_ace_t *zobjp;
+
+	switch (zacep->z_hdr.z_type) {
+	case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+	case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+		zobjp = acep;
+		*datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
+		return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
+	default:
+		*datap = NULL;
+		return (0);
+	}
+}
+
+static acl_ops_t zfs_acl_fuid_ops = {
+	.ace_mask_get = zfs_ace_fuid_get_mask,
+	.ace_mask_set = zfs_ace_fuid_set_mask,
+	.ace_flags_get = zfs_ace_fuid_get_flags,
+	.ace_flags_set = zfs_ace_fuid_set_flags,
+	.ace_type_get = zfs_ace_fuid_get_type,
+	.ace_type_set = zfs_ace_fuid_set_type,
+	.ace_who_get = zfs_ace_fuid_get_who,
+	.ace_who_set = zfs_ace_fuid_set_who,
+	.ace_size = zfs_ace_fuid_size,
+	.ace_abstract_size = zfs_ace_fuid_abstract_size,
+	.ace_mask_off = zfs_ace_fuid_mask_off,
+	.ace_data = zfs_ace_fuid_data
+};
+
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL version in order to determine if the file use to have
+ * an external ACL and what version of ACL previously existed on the
+ * file.  Would really be nice to not need this, sigh.
+ */
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+	zfs_acl_phys_t acl_phys;
+	int error;
+
+	if (zp->z_is_sa)
+		return (0);
+
+	/*
+	 * Need to deal with a potential
+	 * race where zfs_sa_upgrade could cause
+	 * z_isa_sa to change.
+	 *
+	 * If the lookup fails then the state of z_is_sa should have
+	 * changed.
+	 */
+
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(ZTOZSB(zp)),
+	    &acl_phys, sizeof (acl_phys))) == 0)
+		return (acl_phys.z_acl_extern_obj);
+	else {
+		/*
+		 * after upgrade the SA_ZPL_ZNODE_ACL should have been
+		 * removed
+		 */
+		VERIFY(zp->z_is_sa && error == ENOENT);
+		return (0);
+	}
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+    zfs_acl_phys_t *aclphys)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	uint64_t acl_count;
+	int size;
+	int error;
+
+	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+	if (zp->z_is_sa) {
+		if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+		    &size)) != 0)
+			return (error);
+		*aclsize = size;
+		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+		    &acl_count, sizeof (acl_count))) != 0)
+			return (error);
+		*aclcount = acl_count;
+	} else {
+		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+		    aclphys, sizeof (*aclphys))) != 0)
+			return (error);
+
+		if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+			*aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+			*aclcount = aclphys->z_acl_size;
+		} else {
+			*aclsize = aclphys->z_acl_size;
+			*aclcount = aclphys->z_acl_count;
+		}
+	}
+	return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+	zfs_acl_phys_t acl_phys;
+
+	if (zp->z_is_sa)
+		return (ZFS_ACL_VERSION_FUID);
+	else {
+		int error;
+
+		/*
+		 * Need to deal with a potential
+		 * race where zfs_sa_upgrade could cause
+		 * z_isa_sa to change.
+		 *
+		 * If the lookup fails then the state of z_is_sa should have
+		 * changed.
+		 */
+		if ((error = sa_lookup(zp->z_sa_hdl,
+		    SA_ZPL_ZNODE_ACL(ZTOZSB(zp)),
+		    &acl_phys, sizeof (acl_phys))) == 0)
+			return (acl_phys.z_acl_version);
+		else {
+			/*
+			 * After upgrade SA_ZPL_ZNODE_ACL should have
+			 * been removed.
+			 */
+			VERIFY(zp->z_is_sa && error == ENOENT);
+			return (ZFS_ACL_VERSION_FUID);
+		}
+	}
+}
+
+static int
+zfs_acl_version(int version)
+{
+	if (version < ZPL_VERSION_FUID)
+		return (ZFS_ACL_VERSION_INITIAL);
+	else
+		return (ZFS_ACL_VERSION_FUID);
+}
+
+static int
+zfs_acl_version_zp(znode_t *zp)
+{
+	return (zfs_acl_version(ZTOZSB(zp)->z_version));
+}
+
+zfs_acl_t *
+zfs_acl_alloc(int vers)
+{
+	zfs_acl_t *aclp;
+
+	aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
+	list_create(&aclp->z_acl, sizeof (zfs_acl_node_t),
+	    offsetof(zfs_acl_node_t, z_next));
+	aclp->z_version = vers;
+	if (vers == ZFS_ACL_VERSION_FUID)
+		aclp->z_ops = &zfs_acl_fuid_ops;
+	else
+		aclp->z_ops = &zfs_acl_v0_ops;
+	return (aclp);
+}
+
+zfs_acl_node_t *
+zfs_acl_node_alloc(size_t bytes)
+{
+	zfs_acl_node_t *aclnode;
+
+	aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP);
+	if (bytes) {
+		aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP);
+		aclnode->z_allocdata = aclnode->z_acldata;
+		aclnode->z_allocsize = bytes;
+		aclnode->z_size = bytes;
+	}
+
+	return (aclnode);
+}
+
+static void
+zfs_acl_node_free(zfs_acl_node_t *aclnode)
+{
+	if (aclnode->z_allocsize)
+		kmem_free(aclnode->z_allocdata, aclnode->z_allocsize);
+	kmem_free(aclnode, sizeof (zfs_acl_node_t));
+}
+
+static void
+zfs_acl_release_nodes(zfs_acl_t *aclp)
+{
+	zfs_acl_node_t *aclnode;
+
+	while ((aclnode = list_head(&aclp->z_acl))) {
+		list_remove(&aclp->z_acl, aclnode);
+		zfs_acl_node_free(aclnode);
+	}
+	aclp->z_acl_count = 0;
+	aclp->z_acl_bytes = 0;
+}
+
+void
+zfs_acl_free(zfs_acl_t *aclp)
+{
+	zfs_acl_release_nodes(aclp);
+	list_destroy(&aclp->z_acl);
+	kmem_free(aclp, sizeof (zfs_acl_t));
+}
+
+static boolean_t
+zfs_acl_valid_ace_type(uint_t type, uint_t flags)
+{
+	uint16_t entry_type;
+
+	switch (type) {
+	case ALLOW:
+	case DENY:
+	case ACE_SYSTEM_AUDIT_ACE_TYPE:
+	case ACE_SYSTEM_ALARM_ACE_TYPE:
+		entry_type = flags & ACE_TYPE_FLAGS;
+		return (entry_type == ACE_OWNER ||
+		    entry_type == OWNING_GROUP ||
+		    entry_type == ACE_EVERYONE || entry_type == 0 ||
+		    entry_type == ACE_IDENTIFIER_GROUP);
+	default:
+		if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE)
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+static boolean_t
+zfs_ace_valid(umode_t obj_mode, zfs_acl_t *aclp, uint16_t type, uint16_t iflags)
+{
+	/*
+	 * first check type of entry
+	 */
+
+	if (!zfs_acl_valid_ace_type(type, iflags))
+		return (B_FALSE);
+
+	switch (type) {
+	case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+	case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+	case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+		if (aclp->z_version < ZFS_ACL_VERSION_FUID)
+			return (B_FALSE);
+		aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+	}
+
+	/*
+	 * next check inheritance level flags
+	 */
+
+	if (S_ISDIR(obj_mode) &&
+	    (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+		aclp->z_hints |= ZFS_INHERIT_ACE;
+
+	if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+		if ((iflags & (ACE_FILE_INHERIT_ACE|
+		    ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+			return (B_FALSE);
+		}
+	}
+
+	return (B_TRUE);
+}
+
+static void *
+zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
+    uint32_t *access_mask, uint16_t *iflags, uint16_t *type)
+{
+	zfs_acl_node_t *aclnode;
+
+	ASSERT(aclp);
+
+	if (start == NULL) {
+		aclnode = list_head(&aclp->z_acl);
+		if (aclnode == NULL)
+			return (NULL);
+
+		aclp->z_next_ace = aclnode->z_acldata;
+		aclp->z_curr_node = aclnode;
+		aclnode->z_ace_idx = 0;
+	}
+
+	aclnode = aclp->z_curr_node;
+
+	if (aclnode == NULL)
+		return (NULL);
+
+	if (aclnode->z_ace_idx >= aclnode->z_ace_count) {
+		aclnode = list_next(&aclp->z_acl, aclnode);
+		if (aclnode == NULL)
+			return (NULL);
+		else {
+			aclp->z_curr_node = aclnode;
+			aclnode->z_ace_idx = 0;
+			aclp->z_next_ace = aclnode->z_acldata;
+		}
+	}
+
+	if (aclnode->z_ace_idx < aclnode->z_ace_count) {
+		void *acep = aclp->z_next_ace;
+		size_t ace_size;
+
+		/*
+		 * Make sure we don't overstep our bounds
+		 */
+		ace_size = aclp->z_ops->ace_size(acep);
+
+		if (((caddr_t)acep + ace_size) >
+		    ((caddr_t)aclnode->z_acldata + aclnode->z_size)) {
+			return (NULL);
+		}
+
+		*iflags = aclp->z_ops->ace_flags_get(acep);
+		*type = aclp->z_ops->ace_type_get(acep);
+		*access_mask = aclp->z_ops->ace_mask_get(acep);
+		*who = aclp->z_ops->ace_who_get(acep);
+		aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
+		aclnode->z_ace_idx++;
+
+		return ((void *)acep);
+	}
+	return (NULL);
+}
+
+/*ARGSUSED*/
+static uint64_t
+zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt,
+    uint16_t *flags, uint16_t *type, uint32_t *mask)
+{
+	zfs_acl_t *aclp = datap;
+	zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie;
+	uint64_t who;
+
+	acep = zfs_acl_next_ace(aclp, acep, &who, mask,
+	    flags, type);
+	return ((uint64_t)(uintptr_t)acep);
+}
+
+/*
+ * Copy ACE to internal ZFS format.
+ * While processing the ACL each ACE will be validated for correctness.
+ * ACE FUIDs will be created later.
+ */
+static int
+zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *aclp,
+    void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
+    zfs_fuid_info_t **fuidp, cred_t *cr)
+{
+	int i;
+	uint16_t entry_type;
+	zfs_ace_t *aceptr = z_acl;
+	ace_t *acep = datap;
+	zfs_object_ace_t *zobjacep;
+	ace_object_t *aceobjp;
+
+	for (i = 0; i != aclcnt; i++) {
+		aceptr->z_hdr.z_access_mask = acep->a_access_mask;
+		aceptr->z_hdr.z_flags = acep->a_flags;
+		aceptr->z_hdr.z_type = acep->a_type;
+		entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
+		if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
+		    entry_type != ACE_EVERYONE) {
+			aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
+			    cr, (entry_type == 0) ?
+			    ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
+		}
+
+		/*
+		 * Make sure ACE is valid
+		 */
+		if (zfs_ace_valid(obj_mode, aclp, aceptr->z_hdr.z_type,
+		    aceptr->z_hdr.z_flags) != B_TRUE)
+			return (SET_ERROR(EINVAL));
+
+		switch (acep->a_type) {
+		case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+		case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+		case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+		case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+			zobjacep = (zfs_object_ace_t *)aceptr;
+			aceobjp = (ace_object_t *)acep;
+
+			bcopy(aceobjp->a_obj_type, zobjacep->z_object_type,
+			    sizeof (aceobjp->a_obj_type));
+			bcopy(aceobjp->a_inherit_obj_type,
+			    zobjacep->z_inherit_type,
+			    sizeof (aceobjp->a_inherit_obj_type));
+			acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t));
+			break;
+		default:
+			acep = (ace_t *)((caddr_t)acep + sizeof (ace_t));
+		}
+
+		aceptr = (zfs_ace_t *)((caddr_t)aceptr +
+		    aclp->z_ops->ace_size(aceptr));
+	}
+
+	*size = (caddr_t)aceptr - (caddr_t)z_acl;
+
+	return (0);
+}
+
+/*
+ * Copy ZFS ACEs to fixed size ace_t layout
+ */
+static void
+zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr,
+    void *datap, int filter)
+{
+	uint64_t who;
+	uint32_t access_mask;
+	uint16_t iflags, type;
+	zfs_ace_hdr_t *zacep = NULL;
+	ace_t *acep = datap;
+	ace_object_t *objacep;
+	zfs_object_ace_t *zobjacep;
+	size_t ace_size;
+	uint16_t entry_type;
+
+	while ((zacep = zfs_acl_next_ace(aclp, zacep,
+	    &who, &access_mask, &iflags, &type))) {
+
+		switch (type) {
+		case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+		case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+		case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+		case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+			if (filter) {
+				continue;
+			}
+			zobjacep = (zfs_object_ace_t *)zacep;
+			objacep = (ace_object_t *)acep;
+			bcopy(zobjacep->z_object_type,
+			    objacep->a_obj_type,
+			    sizeof (zobjacep->z_object_type));
+			bcopy(zobjacep->z_inherit_type,
+			    objacep->a_inherit_obj_type,
+			    sizeof (zobjacep->z_inherit_type));
+			ace_size = sizeof (ace_object_t);
+			break;
+		default:
+			ace_size = sizeof (ace_t);
+			break;
+		}
+
+		entry_type = (iflags & ACE_TYPE_FLAGS);
+		if ((entry_type != ACE_OWNER &&
+		    entry_type != OWNING_GROUP &&
+		    entry_type != ACE_EVERYONE)) {
+			acep->a_who = zfs_fuid_map_id(zfsvfs, who,
+			    cr, (entry_type & ACE_IDENTIFIER_GROUP) ?
+			    ZFS_ACE_GROUP : ZFS_ACE_USER);
+		} else {
+			acep->a_who = (uid_t)(int64_t)who;
+		}
+		acep->a_access_mask = access_mask;
+		acep->a_flags = iflags;
+		acep->a_type = type;
+		acep = (ace_t *)((caddr_t)acep + ace_size);
+	}
+}
+
+static int
+zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep,
+    zfs_oldace_t *z_acl, int aclcnt, size_t *size)
+{
+	int i;
+	zfs_oldace_t *aceptr = z_acl;
+
+	for (i = 0; i != aclcnt; i++, aceptr++) {
+		aceptr->z_access_mask = acep[i].a_access_mask;
+		aceptr->z_type = acep[i].a_type;
+		aceptr->z_flags = acep[i].a_flags;
+		aceptr->z_fuid = acep[i].a_who;
+		/*
+		 * Make sure ACE is valid
+		 */
+		if (zfs_ace_valid(obj_mode, aclp, aceptr->z_type,
+		    aceptr->z_flags) != B_TRUE)
+			return (SET_ERROR(EINVAL));
+	}
+	*size = (caddr_t)aceptr - (caddr_t)z_acl;
+	return (0);
+}
+
+/*
+ * convert old ACL format to new
+ */
+void
+zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
+{
+	zfs_oldace_t *oldaclp;
+	int i;
+	uint16_t type, iflags;
+	uint32_t access_mask;
+	uint64_t who;
+	void *cookie = NULL;
+	zfs_acl_node_t *newaclnode;
+
+	ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL);
+	/*
+	 * First create the ACE in a contiguous piece of memory
+	 * for zfs_copy_ace_2_fuid().
+	 *
+	 * We only convert an ACL once, so this won't happen
+	 * every time.
+	 */
+	oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count,
+	    KM_SLEEP);
+	i = 0;
+	while ((cookie = zfs_acl_next_ace(aclp, cookie, &who,
+	    &access_mask, &iflags, &type))) {
+		oldaclp[i].z_flags = iflags;
+		oldaclp[i].z_type = type;
+		oldaclp[i].z_fuid = who;
+		oldaclp[i++].z_access_mask = access_mask;
+	}
+
+	newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
+	    sizeof (zfs_object_ace_t));
+	aclp->z_ops = &zfs_acl_fuid_ops;
+	VERIFY(zfs_copy_ace_2_fuid(ZTOZSB(zp), ZTOI(zp)->i_mode,
+	    aclp, oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
+	    &newaclnode->z_size, NULL, cr) == 0);
+	newaclnode->z_ace_count = aclp->z_acl_count;
+	aclp->z_version = ZFS_ACL_VERSION;
+	kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
+
+	/*
+	 * Release all previous ACL nodes
+	 */
+
+	zfs_acl_release_nodes(aclp);
+
+	list_insert_head(&aclp->z_acl, newaclnode);
+
+	aclp->z_acl_bytes = newaclnode->z_size;
+	aclp->z_acl_count = newaclnode->z_ace_count;
+
+}
+
+/*
+ * Convert unix access mask to v4 access mask
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+	uint32_t new_mask = 0;
+
+	if (access_mask & S_IXOTH)
+		new_mask |= ACE_EXECUTE;
+	if (access_mask & S_IWOTH)
+		new_mask |= ACE_WRITE_DATA;
+	if (access_mask & S_IROTH)
+		new_mask |= ACE_READ_DATA;
+	return (new_mask);
+}
+
+static void
+zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
+    uint16_t access_type, uint64_t fuid, uint16_t entry_type)
+{
+	uint16_t type = entry_type & ACE_TYPE_FLAGS;
+
+	aclp->z_ops->ace_mask_set(acep, access_mask);
+	aclp->z_ops->ace_type_set(acep, access_type);
+	aclp->z_ops->ace_flags_set(acep, entry_type);
+	if ((type != ACE_OWNER && type != OWNING_GROUP &&
+	    type != ACE_EVERYONE))
+		aclp->z_ops->ace_who_set(acep, fuid);
+}
+
+/*
+ * Determine mode of file based on ACL.
+ */
+uint64_t
+zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
+    uint64_t *pflags, uint64_t fuid, uint64_t fgid)
+{
+	int		entry_type;
+	mode_t		mode;
+	mode_t		seen = 0;
+	zfs_ace_hdr_t 	*acep = NULL;
+	uint64_t	who;
+	uint16_t	iflags, type;
+	uint32_t	access_mask;
+	boolean_t	an_exec_denied = B_FALSE;
+
+	mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+
+	while ((acep = zfs_acl_next_ace(aclp, acep, &who,
+	    &access_mask, &iflags, &type))) {
+
+		if (!zfs_acl_valid_ace_type(type, iflags))
+			continue;
+
+		entry_type = (iflags & ACE_TYPE_FLAGS);
+
+		/*
+		 * Skip over any inherit_only ACEs
+		 */
+		if (iflags & ACE_INHERIT_ONLY_ACE)
+			continue;
+
+		if (entry_type == ACE_OWNER || (entry_type == 0 &&
+		    who == fuid)) {
+			if ((access_mask & ACE_READ_DATA) &&
+			    (!(seen & S_IRUSR))) {
+				seen |= S_IRUSR;
+				if (type == ALLOW) {
+					mode |= S_IRUSR;
+				}
+			}
+			if ((access_mask & ACE_WRITE_DATA) &&
+			    (!(seen & S_IWUSR))) {
+				seen |= S_IWUSR;
+				if (type == ALLOW) {
+					mode |= S_IWUSR;
+				}
+			}
+			if ((access_mask & ACE_EXECUTE) &&
+			    (!(seen & S_IXUSR))) {
+				seen |= S_IXUSR;
+				if (type == ALLOW) {
+					mode |= S_IXUSR;
+				}
+			}
+		} else if (entry_type == OWNING_GROUP ||
+		    (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
+			if ((access_mask & ACE_READ_DATA) &&
+			    (!(seen & S_IRGRP))) {
+				seen |= S_IRGRP;
+				if (type == ALLOW) {
+					mode |= S_IRGRP;
+				}
+			}
+			if ((access_mask & ACE_WRITE_DATA) &&
+			    (!(seen & S_IWGRP))) {
+				seen |= S_IWGRP;
+				if (type == ALLOW) {
+					mode |= S_IWGRP;
+				}
+			}
+			if ((access_mask & ACE_EXECUTE) &&
+			    (!(seen & S_IXGRP))) {
+				seen |= S_IXGRP;
+				if (type == ALLOW) {
+					mode |= S_IXGRP;
+				}
+			}
+		} else if (entry_type == ACE_EVERYONE) {
+			if ((access_mask & ACE_READ_DATA)) {
+				if (!(seen & S_IRUSR)) {
+					seen |= S_IRUSR;
+					if (type == ALLOW) {
+						mode |= S_IRUSR;
+					}
+				}
+				if (!(seen & S_IRGRP)) {
+					seen |= S_IRGRP;
+					if (type == ALLOW) {
+						mode |= S_IRGRP;
+					}
+				}
+				if (!(seen & S_IROTH)) {
+					seen |= S_IROTH;
+					if (type == ALLOW) {
+						mode |= S_IROTH;
+					}
+				}
+			}
+			if ((access_mask & ACE_WRITE_DATA)) {
+				if (!(seen & S_IWUSR)) {
+					seen |= S_IWUSR;
+					if (type == ALLOW) {
+						mode |= S_IWUSR;
+					}
+				}
+				if (!(seen & S_IWGRP)) {
+					seen |= S_IWGRP;
+					if (type == ALLOW) {
+						mode |= S_IWGRP;
+					}
+				}
+				if (!(seen & S_IWOTH)) {
+					seen |= S_IWOTH;
+					if (type == ALLOW) {
+						mode |= S_IWOTH;
+					}
+				}
+			}
+			if ((access_mask & ACE_EXECUTE)) {
+				if (!(seen & S_IXUSR)) {
+					seen |= S_IXUSR;
+					if (type == ALLOW) {
+						mode |= S_IXUSR;
+					}
+				}
+				if (!(seen & S_IXGRP)) {
+					seen |= S_IXGRP;
+					if (type == ALLOW) {
+						mode |= S_IXGRP;
+					}
+				}
+				if (!(seen & S_IXOTH)) {
+					seen |= S_IXOTH;
+					if (type == ALLOW) {
+						mode |= S_IXOTH;
+					}
+				}
+			}
+		} else {
+			/*
+			 * Only care if this IDENTIFIER_GROUP or
+			 * USER ACE denies execute access to someone,
+			 * mode is not affected
+			 */
+			if ((access_mask & ACE_EXECUTE) && type == DENY)
+				an_exec_denied = B_TRUE;
+		}
+	}
+
+	/*
+	 * Failure to allow is effectively a deny, so execute permission
+	 * is denied if it was never mentioned or if we explicitly
+	 * weren't allowed it.
+	 */
+	if (!an_exec_denied &&
+	    ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS ||
+	    (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS))
+		an_exec_denied = B_TRUE;
+
+	if (an_exec_denied)
+		*pflags &= ~ZFS_NO_EXECS_DENIED;
+	else
+		*pflags |= ZFS_NO_EXECS_DENIED;
+
+	return (mode);
+}
+
+/*
+ * Read an external acl object.  If the intent is to modify, always
+ * create a new acl and leave any cached acl in place.
+ */
+int
+zfs_acl_node_read(struct znode *zp, boolean_t have_lock, zfs_acl_t **aclpp,
+    boolean_t will_modify)
+{
+	zfs_acl_t	*aclp;
+	int		aclsize = 0;
+	int		acl_count = 0;
+	zfs_acl_node_t	*aclnode;
+	zfs_acl_phys_t	znode_acl;
+	int		version;
+	int		error;
+	boolean_t	drop_lock = B_FALSE;
+
+	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+	if (zp->z_acl_cached && !will_modify) {
+		*aclpp = zp->z_acl_cached;
+		return (0);
+	}
+
+	/*
+	 * close race where znode could be upgrade while trying to
+	 * read the znode attributes.
+	 *
+	 * But this could only happen if the file isn't already an SA
+	 * znode
+	 */
+	if (!zp->z_is_sa && !have_lock) {
+		mutex_enter(&zp->z_lock);
+		drop_lock = B_TRUE;
+	}
+	version = zfs_znode_acl_version(zp);
+
+	if ((error = zfs_acl_znode_info(zp, &aclsize,
+	    &acl_count, &znode_acl)) != 0) {
+		goto done;
+	}
+
+	aclp = zfs_acl_alloc(version);
+
+	aclp->z_acl_count = acl_count;
+	aclp->z_acl_bytes = aclsize;
+
+	aclnode = zfs_acl_node_alloc(aclsize);
+	aclnode->z_ace_count = aclp->z_acl_count;
+	aclnode->z_size = aclsize;
+
+	if (!zp->z_is_sa) {
+		if (znode_acl.z_acl_extern_obj) {
+			error = dmu_read(ZTOZSB(zp)->z_os,
+			    znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+			    aclnode->z_acldata, DMU_READ_PREFETCH);
+		} else {
+			bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+			    aclnode->z_size);
+		}
+	} else {
+		error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(ZTOZSB(zp)),
+		    aclnode->z_acldata, aclnode->z_size);
+	}
+
+	if (error != 0) {
+		zfs_acl_free(aclp);
+		zfs_acl_node_free(aclnode);
+		/* convert checksum errors into IO errors */
+		if (error == ECKSUM)
+			error = SET_ERROR(EIO);
+		goto done;
+	}
+
+	list_insert_head(&aclp->z_acl, aclnode);
+
+	*aclpp = aclp;
+	if (!will_modify)
+		zp->z_acl_cached = aclp;
+done:
+	if (drop_lock)
+		mutex_exit(&zp->z_lock);
+	return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+    boolean_t start, void *userdata)
+{
+	zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+	if (start) {
+		cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+	} else {
+		cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+		    cb->cb_acl_node);
+	}
+	*dataptr = cb->cb_acl_node->z_acldata;
+	*length = cb->cb_acl_node->z_size;
+}
+
+int
+zfs_acl_chown_setattr(znode_t *zp)
+{
+	int error;
+	zfs_acl_t *aclp;
+
+	if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIXACL)
+		return (0);
+
+	ASSERT(MUTEX_HELD(&zp->z_lock));
+	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+	error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE);
+	if (error == 0 && aclp->z_acl_count > 0)
+		zp->z_mode = ZTOI(zp)->i_mode =
+		    zfs_mode_compute(zp->z_mode, aclp,
+		    &zp->z_pflags, KUID_TO_SUID(ZTOI(zp)->i_uid),
+		    KGID_TO_SGID(ZTOI(zp)->i_gid));
+
+	/*
+	 * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL
+	 * nor a DACL_ACES SA in which case ENOENT is returned from
+	 * zfs_acl_node_read() when the SA can't be located.
+	 * Allow chown/chgrp to succeed in these cases rather than
+	 * returning an error that makes no sense in the context of
+	 * the caller.
+	 */
+	if (error == ENOENT)
+		return (0);
+
+	return (error);
+}
+
+typedef struct trivial_acl {
+	uint32_t	allow0;		/* allow mask for bits only in owner */
+	uint32_t	deny1;		/* deny mask for bits not in owner */
+	uint32_t	deny2;		/* deny mask for bits not in group */
+	uint32_t	owner;		/* allow mask matching mode */
+	uint32_t	group;		/* allow mask matching mode */
+	uint32_t	everyone;	/* allow mask matching mode */
+} trivial_acl_t;
+
+static void
+acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks)
+{
+	uint32_t read_mask = ACE_READ_DATA;
+	uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA;
+	uint32_t execute_mask = ACE_EXECUTE;
+
+	if (isdir)
+		write_mask |= ACE_DELETE_CHILD;
+
+	masks->deny1 = 0;
+
+	if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
+		masks->deny1 |= read_mask;
+	if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
+		masks->deny1 |= write_mask;
+	if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
+		masks->deny1 |= execute_mask;
+
+	masks->deny2 = 0;
+	if (!(mode & S_IRGRP) && (mode & S_IROTH))
+		masks->deny2 |= read_mask;
+	if (!(mode & S_IWGRP) && (mode & S_IWOTH))
+		masks->deny2 |= write_mask;
+	if (!(mode & S_IXGRP) && (mode & S_IXOTH))
+		masks->deny2 |= execute_mask;
+
+	masks->allow0 = 0;
+	if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH)))
+		masks->allow0 |= read_mask;
+	if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH)))
+		masks->allow0 |= write_mask;
+	if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH)))
+		masks->allow0 |= execute_mask;
+
+	masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL|
+	    ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES|
+	    ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE;
+	if (mode & S_IRUSR)
+		masks->owner |= read_mask;
+	if (mode & S_IWUSR)
+		masks->owner |= write_mask;
+	if (mode & S_IXUSR)
+		masks->owner |= execute_mask;
+
+	masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+	    ACE_SYNCHRONIZE;
+	if (mode & S_IRGRP)
+		masks->group |= read_mask;
+	if (mode & S_IWGRP)
+		masks->group |= write_mask;
+	if (mode & S_IXGRP)
+		masks->group |= execute_mask;
+
+	masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+	    ACE_SYNCHRONIZE;
+	if (mode & S_IROTH)
+		masks->everyone |= read_mask;
+	if (mode & S_IWOTH)
+		masks->everyone |= write_mask;
+	if (mode & S_IXOTH)
+		masks->everyone |= execute_mask;
+}
+
+/*
+ * ace_trivial:
+ * determine whether an ace_t acl is trivial
+ *
+ * Trivialness implies that the acl is composed of only
+ * owner, group, everyone entries.  ACL can't
+ * have read_acl denied, and write_owner/write_acl/write_attributes
+ * can only be owner@ entry.
+ */
+static int
+ace_trivial_common(void *acep, int aclcnt,
+    uint64_t (*walk)(void *, uint64_t, int aclcnt,
+    uint16_t *, uint16_t *, uint32_t *))
+{
+	uint16_t flags;
+	uint32_t mask;
+	uint16_t type;
+	uint64_t cookie = 0;
+
+	while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) {
+		switch (flags & ACE_TYPE_FLAGS) {
+		case ACE_OWNER:
+		case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+		case ACE_EVERYONE:
+			break;
+		default:
+			return (1);
+		}
+
+		if (flags & (ACE_FILE_INHERIT_ACE|
+		    ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
+		    ACE_INHERIT_ONLY_ACE))
+			return (1);
+
+		/*
+		 * Special check for some special bits
+		 *
+		 * Don't allow anybody to deny reading basic
+		 * attributes or a files ACL.
+		 */
+		if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+		    (type == ACE_ACCESS_DENIED_ACE_TYPE))
+			return (1);
+
+		/*
+		 * Delete permission is never set by default
+		 */
+		if (mask & ACE_DELETE)
+			return (1);
+
+		/*
+		 * Child delete permission should be accompanied by write
+		 */
+		if ((mask & ACE_DELETE_CHILD) && !(mask & ACE_WRITE_DATA))
+			return (1);
+
+		/*
+		 * only allow owner@ to have
+		 * write_acl/write_owner/write_attributes/write_xattr/
+		 */
+		if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+		    (!(flags & ACE_OWNER) && (mask &
+		    (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES|
+		    ACE_WRITE_NAMED_ATTRS))))
+			return (1);
+
+	}
+
+	return (0);
+}
+
+/*
+ * common code for setting ACLs.
+ *
+ * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
+ * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
+ * already checked the acl and knows whether to inherit.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
+{
+	int			error;
+	zfsvfs_t		*zfsvfs = ZTOZSB(zp);
+	dmu_object_type_t	otype;
+	zfs_acl_locator_cb_t	locate = { 0 };
+	uint64_t		mode;
+	sa_bulk_attr_t		bulk[5];
+	uint64_t		ctime[2];
+	int			count = 0;
+	zfs_acl_phys_t		acl_phys;
+
+	mode = zp->z_mode;
+
+	mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
+	    KUID_TO_SUID(ZTOI(zp)->i_uid), KGID_TO_SGID(ZTOI(zp)->i_gid));
+
+	zp->z_mode = ZTOI(zp)->i_mode = mode;
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+	    &mode, sizeof (mode));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+	    &ctime, sizeof (ctime));
+
+	if (zp->z_acl_cached) {
+		zfs_acl_free(zp->z_acl_cached);
+		zp->z_acl_cached = NULL;
+	}
+
+	/*
+	 * Upgrade needed?
+	 */
+	if (!zfsvfs->z_use_fuids) {
+		otype = DMU_OT_OLDACL;
+	} else {
+		if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
+		    (zfsvfs->z_version >= ZPL_VERSION_FUID))
+			zfs_acl_xform(zp, aclp, cr);
+		ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
+		otype = DMU_OT_ACL;
+	}
+
+	/*
+	 * Arrgh, we have to handle old on disk format
+	 * as well as newer (preferred) SA format.
+	 */
+
+	if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
+		locate.cb_aclp = aclp;
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
+		    zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
+		    NULL, &aclp->z_acl_count, sizeof (uint64_t));
+	} else { /* Painful legacy way */
+		zfs_acl_node_t *aclnode;
+		uint64_t off = 0;
+		uint64_t aoid;
+
+		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+		    &acl_phys, sizeof (acl_phys))) != 0)
+			return (error);
+
+		aoid = acl_phys.z_acl_extern_obj;
+
+		if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+			/*
+			 * If ACL was previously external and we are now
+			 * converting to new ACL format then release old
+			 * ACL object and create a new one.
+			 */
+			if (aoid &&
+			    aclp->z_version != acl_phys.z_acl_version) {
+				error = dmu_object_free(zfsvfs->z_os, aoid, tx);
+				if (error)
+					return (error);
+				aoid = 0;
+			}
+			if (aoid == 0) {
+				aoid = dmu_object_alloc(zfsvfs->z_os,
+				    otype, aclp->z_acl_bytes,
+				    otype == DMU_OT_ACL ?
+				    DMU_OT_SYSACL : DMU_OT_NONE,
+				    otype == DMU_OT_ACL ?
+				    DN_OLD_MAX_BONUSLEN : 0, tx);
+			} else {
+				(void) dmu_object_set_blocksize(zfsvfs->z_os,
+				    aoid, aclp->z_acl_bytes, 0, tx);
+			}
+			acl_phys.z_acl_extern_obj = aoid;
+			for (aclnode = list_head(&aclp->z_acl); aclnode;
+			    aclnode = list_next(&aclp->z_acl, aclnode)) {
+				if (aclnode->z_ace_count == 0)
+					continue;
+				dmu_write(zfsvfs->z_os, aoid, off,
+				    aclnode->z_size, aclnode->z_acldata, tx);
+				off += aclnode->z_size;
+			}
+		} else {
+			void *start = acl_phys.z_ace_data;
+			/*
+			 * Migrating back embedded?
+			 */
+			if (acl_phys.z_acl_extern_obj) {
+				error = dmu_object_free(zfsvfs->z_os,
+				    acl_phys.z_acl_extern_obj, tx);
+				if (error)
+					return (error);
+				acl_phys.z_acl_extern_obj = 0;
+			}
+
+			for (aclnode = list_head(&aclp->z_acl); aclnode;
+			    aclnode = list_next(&aclp->z_acl, aclnode)) {
+				if (aclnode->z_ace_count == 0)
+					continue;
+				bcopy(aclnode->z_acldata, start,
+				    aclnode->z_size);
+				start = (caddr_t)start + aclnode->z_size;
+			}
+		}
+		/*
+		 * If Old version then swap count/bytes to match old
+		 * layout of znode_acl_phys_t.
+		 */
+		if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+			acl_phys.z_acl_size = aclp->z_acl_count;
+			acl_phys.z_acl_count = aclp->z_acl_bytes;
+		} else {
+			acl_phys.z_acl_size = aclp->z_acl_bytes;
+			acl_phys.z_acl_count = aclp->z_acl_count;
+		}
+		acl_phys.z_acl_version = aclp->z_version;
+
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+		    &acl_phys, sizeof (acl_phys));
+	}
+
+	/*
+	 * Replace ACL wide bits, but first clear them.
+	 */
+	zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
+
+	zp->z_pflags |= aclp->z_hints;
+
+	if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
+		zp->z_pflags |= ZFS_ACL_TRIVIAL;
+
+	zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime);
+	return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
+}
+
+static void
+zfs_acl_chmod(boolean_t isdir, uint64_t mode, boolean_t split, boolean_t trim,
+    zfs_acl_t *aclp)
+{
+	void		*acep = NULL;
+	uint64_t	who;
+	int		new_count, new_bytes;
+	int		ace_size;
+	int		entry_type;
+	uint16_t	iflags, type;
+	uint32_t	access_mask;
+	zfs_acl_node_t	*newnode;
+	size_t		abstract_size = aclp->z_ops->ace_abstract_size();
+	void		*zacep;
+	trivial_acl_t	masks;
+
+	new_count = new_bytes = 0;
+
+	acl_trivial_access_masks((mode_t)mode, isdir, &masks);
+
+	newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
+
+	zacep = newnode->z_acldata;
+	if (masks.allow0) {
+		zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER);
+		zacep = (void *)((uintptr_t)zacep + abstract_size);
+		new_count++;
+		new_bytes += abstract_size;
+	}
+	if (masks.deny1) {
+		zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER);
+		zacep = (void *)((uintptr_t)zacep + abstract_size);
+		new_count++;
+		new_bytes += abstract_size;
+	}
+	if (masks.deny2) {
+		zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP);
+		zacep = (void *)((uintptr_t)zacep + abstract_size);
+		new_count++;
+		new_bytes += abstract_size;
+	}
+
+	while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+	    &iflags, &type))) {
+		entry_type = (iflags & ACE_TYPE_FLAGS);
+		/*
+		 * ACEs used to represent the file mode may be divided
+		 * into an equivalent pair of inherit-only and regular
+		 * ACEs, if they are inheritable.
+		 * Skip regular ACEs, which are replaced by the new mode.
+		 */
+		if (split && (entry_type == ACE_OWNER ||
+		    entry_type == OWNING_GROUP ||
+		    entry_type == ACE_EVERYONE)) {
+			if (!isdir || !(iflags &
+			    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+				continue;
+			/*
+			 * We preserve owner@, group@, or @everyone
+			 * permissions, if they are inheritable, by
+			 * copying them to inherit_only ACEs. This
+			 * prevents inheritable permissions from being
+			 * altered along with the file mode.
+			 */
+			iflags |= ACE_INHERIT_ONLY_ACE;
+		}
+
+		/*
+		 * If this ACL has any inheritable ACEs, mark that in
+		 * the hints (which are later masked into the pflags)
+		 * so create knows to do inheritance.
+		 */
+		if (isdir && (iflags &
+		    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+			aclp->z_hints |= ZFS_INHERIT_ACE;
+
+		if ((type != ALLOW && type != DENY) ||
+		    (iflags & ACE_INHERIT_ONLY_ACE)) {
+			switch (type) {
+			case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+			case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+			case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+			case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+				aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+				break;
+			}
+		} else {
+			/*
+			 * Limit permissions to be no greater than
+			 * group permissions.
+			 * The "aclinherit" and "aclmode" properties
+			 * affect policy for create and chmod(2),
+			 * respectively.
+			 */
+			if ((type == ALLOW) && trim)
+				access_mask &= masks.group;
+		}
+		zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
+		ace_size = aclp->z_ops->ace_size(acep);
+		zacep = (void *)((uintptr_t)zacep + ace_size);
+		new_count++;
+		new_bytes += ace_size;
+	}
+	zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER);
+	zacep = (void *)((uintptr_t)zacep + abstract_size);
+	zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP);
+	zacep = (void *)((uintptr_t)zacep + abstract_size);
+	zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE);
+
+	new_count += 3;
+	new_bytes += abstract_size * 3;
+	zfs_acl_release_nodes(aclp);
+	aclp->z_acl_count = new_count;
+	aclp->z_acl_bytes = new_bytes;
+	newnode->z_ace_count = new_count;
+	newnode->z_size = new_bytes;
+	list_insert_tail(&aclp->z_acl, newnode);
+}
+
+int
+zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
+{
+	int error = 0;
+
+	mutex_enter(&zp->z_acl_lock);
+	mutex_enter(&zp->z_lock);
+	if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_DISCARD)
+		*aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+	else
+		error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE);
+
+	if (error == 0) {
+		(*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
+		zfs_acl_chmod(S_ISDIR(ZTOI(zp)->i_mode), mode, B_TRUE,
+		    (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
+	}
+	mutex_exit(&zp->z_lock);
+	mutex_exit(&zp->z_acl_lock);
+
+	return (error);
+}
+
+/*
+ * Should ACE be inherited?
+ */
+static int
+zfs_ace_can_use(umode_t obj_mode, uint16_t acep_flags)
+{
+	int	iflags = (acep_flags & 0xf);
+
+	if (S_ISDIR(obj_mode) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
+		return (1);
+	else if (iflags & ACE_FILE_INHERIT_ACE)
+		return (!(S_ISDIR(obj_mode) &&
+		    (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
+	return (0);
+}
+
+/*
+ * inherit inheritable ACEs from parent
+ */
+static zfs_acl_t *
+zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t va_mode, zfs_acl_t *paclp,
+    uint64_t mode, boolean_t *need_chmod)
+{
+	void		*pacep = NULL;
+	void		*acep;
+	zfs_acl_node_t  *aclnode;
+	zfs_acl_t	*aclp = NULL;
+	uint64_t	who;
+	uint32_t	access_mask;
+	uint16_t	iflags, newflags, type;
+	size_t		ace_size;
+	void		*data1, *data2;
+	size_t		data1sz, data2sz;
+	uint_t		aclinherit;
+	boolean_t	isdir = S_ISDIR(va_mode);
+	boolean_t	isreg = S_ISREG(va_mode);
+
+	*need_chmod = B_TRUE;
+
+	aclp = zfs_acl_alloc(paclp->z_version);
+	aclinherit = zfsvfs->z_acl_inherit;
+	if (aclinherit == ZFS_ACL_DISCARD || S_ISLNK(va_mode))
+		return (aclp);
+
+	while ((pacep = zfs_acl_next_ace(paclp, pacep, &who,
+	    &access_mask, &iflags, &type))) {
+
+		/*
+		 * don't inherit bogus ACEs
+		 */
+		if (!zfs_acl_valid_ace_type(type, iflags))
+			continue;
+
+		/*
+		 * Check if ACE is inheritable by this vnode
+		 */
+		if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) ||
+		    !zfs_ace_can_use(va_mode, iflags))
+			continue;
+
+		/*
+		 * If owner@, group@, or everyone@ inheritable
+		 * then zfs_acl_chmod() isn't needed.
+		 */
+		if ((aclinherit == ZFS_ACL_PASSTHROUGH ||
+		    aclinherit == ZFS_ACL_PASSTHROUGH_X) &&
+		    ((iflags & (ACE_OWNER|ACE_EVERYONE)) ||
+		    ((iflags & OWNING_GROUP) == OWNING_GROUP)) &&
+		    (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE))))
+			*need_chmod = B_FALSE;
+
+		/*
+		 * Strip inherited execute permission from file if
+		 * not in mode
+		 */
+		if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW &&
+		    !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) {
+			access_mask &= ~ACE_EXECUTE;
+		}
+
+		/*
+		 * Strip write_acl and write_owner from permissions
+		 * when inheriting an ACE
+		 */
+		if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) {
+			access_mask &= ~RESTRICTED_CLEAR;
+		}
+
+		ace_size = aclp->z_ops->ace_size(pacep);
+		aclnode = zfs_acl_node_alloc(ace_size);
+		list_insert_tail(&aclp->z_acl, aclnode);
+		acep = aclnode->z_acldata;
+
+		zfs_set_ace(aclp, acep, access_mask, type,
+		    who, iflags|ACE_INHERITED_ACE);
+
+		/*
+		 * Copy special opaque data if any
+		 */
+		if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) {
+			VERIFY((data2sz = aclp->z_ops->ace_data(acep,
+			    &data2)) == data1sz);
+			bcopy(data1, data2, data2sz);
+		}
+
+		aclp->z_acl_count++;
+		aclnode->z_ace_count++;
+		aclp->z_acl_bytes += aclnode->z_size;
+		newflags = aclp->z_ops->ace_flags_get(acep);
+
+		/*
+		 * If ACE is not to be inherited further, or if the vnode is
+		 * not a directory, remove all inheritance flags
+		 */
+		if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) {
+			newflags &= ~ALL_INHERIT;
+			aclp->z_ops->ace_flags_set(acep,
+			    newflags|ACE_INHERITED_ACE);
+			continue;
+		}
+
+		/*
+		 * This directory has an inheritable ACE
+		 */
+		aclp->z_hints |= ZFS_INHERIT_ACE;
+
+		/*
+		 * If only FILE_INHERIT is set then turn on
+		 * inherit_only
+		 */
+		if ((iflags & (ACE_FILE_INHERIT_ACE |
+		    ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
+			newflags |= ACE_INHERIT_ONLY_ACE;
+			aclp->z_ops->ace_flags_set(acep,
+			    newflags|ACE_INHERITED_ACE);
+		} else {
+			newflags &= ~ACE_INHERIT_ONLY_ACE;
+			aclp->z_ops->ace_flags_set(acep,
+			    newflags|ACE_INHERITED_ACE);
+		}
+	}
+	if (zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
+	    aclp->z_acl_count != 0) {
+		*need_chmod = B_FALSE;
+	}
+
+	return (aclp);
+}
+
+/*
+ * Create file system object initial permissions
+ * including inheritable ACEs.
+ * Also, create FUIDs for owner and group.
+ */
+int
+zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
+    vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
+{
+	int		error;
+	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
+	zfs_acl_t	*paclp;
+	gid_t		gid = vap->va_gid;
+	boolean_t	need_chmod = B_TRUE;
+	boolean_t	trim = B_FALSE;
+	boolean_t	inherited = B_FALSE;
+
+	bzero(acl_ids, sizeof (zfs_acl_ids_t));
+	acl_ids->z_mode = vap->va_mode;
+
+	if (vsecp)
+		if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_mode, vsecp,
+		    cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
+			return (error);
+
+	acl_ids->z_fuid = vap->va_uid;
+	acl_ids->z_fgid = vap->va_gid;
+#ifdef HAVE_KSID
+	/*
+	 * Determine uid and gid.
+	 */
+	if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
+	    ((flag & IS_XATTR) && (S_ISDIR(vap->va_mode)))) {
+		acl_ids->z_fuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid,
+		    cr, ZFS_OWNER, &acl_ids->z_fuidp);
+		acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+		    cr, ZFS_GROUP, &acl_ids->z_fuidp);
+		gid = vap->va_gid;
+	} else {
+		acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
+		    cr, &acl_ids->z_fuidp);
+		acl_ids->z_fgid = 0;
+		if (vap->va_mask & AT_GID)  {
+			acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+			    (uint64_t)vap->va_gid,
+			    cr, ZFS_GROUP, &acl_ids->z_fuidp);
+			gid = vap->va_gid;
+			if (acl_ids->z_fgid != KGID_TO_SGID(ZTOI(dzp)->i_gid) &&
+			    !groupmember(vap->va_gid, cr) &&
+			    secpolicy_vnode_create_gid(cr) != 0)
+				acl_ids->z_fgid = 0;
+		}
+		if (acl_ids->z_fgid == 0) {
+			if (dzp->z_mode & S_ISGID) {
+				char		*domain;
+				uint32_t	rid;
+
+				acl_ids->z_fgid = KGID_TO_SGID(
+				    ZTOI(dzp)->i_gid);
+				gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
+				    cr, ZFS_GROUP);
+
+				if (zfsvfs->z_use_fuids &&
+				    IS_EPHEMERAL(acl_ids->z_fgid)) {
+					domain = zfs_fuid_idx_domain(
+					    &zfsvfs->z_fuid_idx,
+					    FUID_INDEX(acl_ids->z_fgid));
+					rid = FUID_RID(acl_ids->z_fgid);
+					zfs_fuid_node_add(&acl_ids->z_fuidp,
+					    domain, rid,
+					    FUID_INDEX(acl_ids->z_fgid),
+					    acl_ids->z_fgid, ZFS_GROUP);
+				}
+			} else {
+				acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
+				    ZFS_GROUP, cr, &acl_ids->z_fuidp);
+				gid = crgetgid(cr);
+			}
+		}
+	}
+#endif /* HAVE_KSID */
+
+	/*
+	 * If we're creating a directory, and the parent directory has the
+	 * set-GID bit set, set in on the new directory.
+	 * Otherwise, if the user is neither privileged nor a member of the
+	 * file's new group, clear the file's set-GID bit.
+	 */
+
+	if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
+	    (S_ISDIR(vap->va_mode))) {
+		acl_ids->z_mode |= S_ISGID;
+	} else {
+		if ((acl_ids->z_mode & S_ISGID) &&
+		    secpolicy_vnode_setids_setgids(cr, gid) != 0)
+			acl_ids->z_mode &= ~S_ISGID;
+	}
+
+	if (acl_ids->z_aclp == NULL) {
+		mutex_enter(&dzp->z_acl_lock);
+		mutex_enter(&dzp->z_lock);
+		if (!(flag & IS_ROOT_NODE) &&
+		    (dzp->z_pflags & ZFS_INHERIT_ACE) &&
+		    !(dzp->z_pflags & ZFS_XATTR)) {
+			VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
+			    &paclp, B_FALSE));
+			acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
+			    vap->va_mode, paclp, acl_ids->z_mode, &need_chmod);
+			inherited = B_TRUE;
+		} else {
+			acl_ids->z_aclp =
+			    zfs_acl_alloc(zfs_acl_version_zp(dzp));
+			acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+		}
+		mutex_exit(&dzp->z_lock);
+		mutex_exit(&dzp->z_acl_lock);
+
+		if (need_chmod) {
+			if (S_ISDIR(vap->va_mode))
+				acl_ids->z_aclp->z_hints |=
+				    ZFS_ACL_AUTO_INHERIT;
+
+			if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK &&
+			    zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH &&
+			    zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X)
+				trim = B_TRUE;
+			zfs_acl_chmod(vap->va_mode, acl_ids->z_mode, B_FALSE,
+			    trim, acl_ids->z_aclp);
+		}
+	}
+
+	if (inherited || vsecp) {
+		acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
+		    acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
+		    acl_ids->z_fuid, acl_ids->z_fgid);
+		if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
+			acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+	}
+
+	return (0);
+}
+
+/*
+ * Free ACL and fuid_infop, but not the acl_ids structure
+ */
+void
+zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
+{
+	if (acl_ids->z_aclp)
+		zfs_acl_free(acl_ids->z_aclp);
+	if (acl_ids->z_fuidp)
+		zfs_fuid_info_free(acl_ids->z_fuidp);
+	acl_ids->z_aclp = NULL;
+	acl_ids->z_fuidp = NULL;
+}
+
+boolean_t
+zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid)
+{
+	return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) ||
+	    zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) ||
+	    (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID &&
+	    zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid)));
+}
+
+/*
+ * Retrieve a file's ACL
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+	zfs_acl_t	*aclp;
+	ulong_t		mask;
+	int		error;
+	int 		count = 0;
+	int		largeace = 0;
+
+	mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
+	    VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
+
+	if (mask == 0)
+		return (SET_ERROR(ENOSYS));
+
+	if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)))
+		return (error);
+
+	mutex_enter(&zp->z_acl_lock);
+
+	error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+	if (error != 0) {
+		mutex_exit(&zp->z_acl_lock);
+		return (error);
+	}
+
+	/*
+	 * Scan ACL to determine number of ACEs
+	 */
+	if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
+		void *zacep = NULL;
+		uint64_t who;
+		uint32_t access_mask;
+		uint16_t type, iflags;
+
+		while ((zacep = zfs_acl_next_ace(aclp, zacep,
+		    &who, &access_mask, &iflags, &type))) {
+			switch (type) {
+			case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+			case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+			case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+			case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+				largeace++;
+				continue;
+			default:
+				count++;
+			}
+		}
+		vsecp->vsa_aclcnt = count;
+	} else
+		count = (int)aclp->z_acl_count;
+
+	if (mask & VSA_ACECNT) {
+		vsecp->vsa_aclcnt = count;
+	}
+
+	if (mask & VSA_ACE) {
+		size_t aclsz;
+
+		aclsz = count * sizeof (ace_t) +
+		    sizeof (ace_object_t) * largeace;
+
+		vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP);
+		vsecp->vsa_aclentsz = aclsz;
+
+		if (aclp->z_version == ZFS_ACL_VERSION_FUID)
+			zfs_copy_fuid_2_ace(ZTOZSB(zp), aclp, cr,
+			    vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
+		else {
+			zfs_acl_node_t *aclnode;
+			void *start = vsecp->vsa_aclentp;
+
+			for (aclnode = list_head(&aclp->z_acl); aclnode;
+			    aclnode = list_next(&aclp->z_acl, aclnode)) {
+				bcopy(aclnode->z_acldata, start,
+				    aclnode->z_size);
+				start = (caddr_t)start + aclnode->z_size;
+			}
+			ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp ==
+			    aclp->z_acl_bytes);
+		}
+	}
+	if (mask & VSA_ACE_ACLFLAGS) {
+		vsecp->vsa_aclflags = 0;
+		if (zp->z_pflags & ZFS_ACL_DEFAULTED)
+			vsecp->vsa_aclflags |= ACL_DEFAULTED;
+		if (zp->z_pflags & ZFS_ACL_PROTECTED)
+			vsecp->vsa_aclflags |= ACL_PROTECTED;
+		if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
+			vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
+	}
+
+	mutex_exit(&zp->z_acl_lock);
+
+	return (0);
+}
+
+int
+zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_mode,
+    vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
+{
+	zfs_acl_t *aclp;
+	zfs_acl_node_t *aclnode;
+	int aclcnt = vsecp->vsa_aclcnt;
+	int error;
+
+	if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0)
+		return (SET_ERROR(EINVAL));
+
+	aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
+
+	aclp->z_hints = 0;
+	aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t));
+	if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+		if ((error = zfs_copy_ace_2_oldace(obj_mode, aclp,
+		    (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata,
+		    aclcnt, &aclnode->z_size)) != 0) {
+			zfs_acl_free(aclp);
+			zfs_acl_node_free(aclnode);
+			return (error);
+		}
+	} else {
+		if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_mode, aclp,
+		    vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
+		    &aclnode->z_size, fuidp, cr)) != 0) {
+			zfs_acl_free(aclp);
+			zfs_acl_node_free(aclnode);
+			return (error);
+		}
+	}
+	aclp->z_acl_bytes = aclnode->z_size;
+	aclnode->z_ace_count = aclcnt;
+	aclp->z_acl_count = aclcnt;
+	list_insert_head(&aclp->z_acl, aclnode);
+
+	/*
+	 * If flags are being set then add them to z_hints
+	 */
+	if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) {
+		if (vsecp->vsa_aclflags & ACL_PROTECTED)
+			aclp->z_hints |= ZFS_ACL_PROTECTED;
+		if (vsecp->vsa_aclflags & ACL_DEFAULTED)
+			aclp->z_hints |= ZFS_ACL_DEFAULTED;
+		if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
+			aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
+	}
+
+	*zaclp = aclp;
+
+	return (0);
+}
+
+/*
+ * Set a file's ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
+	zilog_t		*zilog = zfsvfs->z_log;
+	ulong_t		mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+	dmu_tx_t	*tx;
+	int		error;
+	zfs_acl_t	*aclp;
+	zfs_fuid_info_t	*fuidp = NULL;
+	boolean_t	fuid_dirtied;
+	uint64_t	acl_obj;
+
+	if (mask == 0)
+		return (SET_ERROR(ENOSYS));
+
+	if (zp->z_pflags & ZFS_IMMUTABLE)
+		return (SET_ERROR(EPERM));
+
+	if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)))
+		return (error);
+
+	error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp,
+	    &aclp);
+	if (error)
+		return (error);
+
+	/*
+	 * If ACL wide flags aren't being set then preserve any
+	 * existing flags.
+	 */
+	if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
+		aclp->z_hints |=
+		    (zp->z_pflags & V4_ACL_WIDE_FLAGS);
+	}
+top:
+	mutex_enter(&zp->z_acl_lock);
+	mutex_enter(&zp->z_lock);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+
+	/*
+	 * If old version and ACL won't fit in bonus and we aren't
+	 * upgrading then take out necessary DMU holds
+	 */
+
+	if ((acl_obj = zfs_external_acl(zp)) != 0) {
+		if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+		    zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+			dmu_tx_hold_free(tx, acl_obj, 0,
+			    DMU_OBJECT_END);
+			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+			    aclp->z_acl_bytes);
+		} else {
+			dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
+		}
+	} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
+	}
+
+	zfs_sa_upgrade_txholds(tx, zp);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	if (error) {
+		mutex_exit(&zp->z_acl_lock);
+		mutex_exit(&zp->z_lock);
+
+		if (error == ERESTART) {
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
+		dmu_tx_abort(tx);
+		zfs_acl_free(aclp);
+		return (error);
+	}
+
+	error = zfs_aclset_common(zp, aclp, cr, tx);
+	ASSERT(error == 0);
+	ASSERT(zp->z_acl_cached == NULL);
+	zp->z_acl_cached = aclp;
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
+
+	if (fuidp)
+		zfs_fuid_info_free(fuidp);
+	dmu_tx_commit(tx);
+
+	mutex_exit(&zp->z_lock);
+	mutex_exit(&zp->z_acl_lock);
+
+	return (error);
+}
+
+/*
+ * Check accesses of interest (AoI) against attributes of the dataset
+ * such as read-only.  Returns zero if no AoI conflict with dataset
+ * attributes, otherwise an appropriate errno is returned.
+ */
+static int
+zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
+{
+	if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) &&
+	    (!Z_ISDEV(ZTOI(zp)->i_mode) ||
+	    (Z_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) {
+		return (SET_ERROR(EROFS));
+	}
+
+	/*
+	 * Only check for READONLY on non-directories.
+	 */
+	if ((v4_mode & WRITE_MASK_DATA) &&
+	    ((!S_ISDIR(ZTOI(zp)->i_mode) &&
+	    (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
+	    (S_ISDIR(ZTOI(zp)->i_mode) &&
+	    (zp->z_pflags & ZFS_IMMUTABLE)))) {
+		return (SET_ERROR(EPERM));
+	}
+
+	if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
+	    (zp->z_pflags & ZFS_NOUNLINK)) {
+		return (SET_ERROR(EPERM));
+	}
+
+	if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
+	    (zp->z_pflags & ZFS_AV_QUARANTINED))) {
+		return (SET_ERROR(EACCES));
+	}
+
+	return (0);
+}
+
+/*
+ * The primary usage of this function is to loop through all of the
+ * ACEs in the znode, determining what accesses of interest (AoI) to
+ * the caller are allowed or denied.  The AoI are expressed as bits in
+ * the working_mode parameter.  As each ACE is processed, bits covered
+ * by that ACE are removed from the working_mode.  This removal
+ * facilitates two things.  The first is that when the working mode is
+ * empty (= 0), we know we've looked at all the AoI. The second is
+ * that the ACE interpretation rules don't allow a later ACE to undo
+ * something granted or denied by an earlier ACE.  Removing the
+ * discovered access or denial enforces this rule.  At the end of
+ * processing the ACEs, all AoI that were found to be denied are
+ * placed into the working_mode, giving the caller a mask of denied
+ * accesses.  Returns:
+ *	0		if all AoI granted
+ *	EACCES 		if the denied mask is non-zero
+ *	other error	if abnormal failure (e.g., IO error)
+ *
+ * A secondary usage of the function is to determine if any of the
+ * AoI are granted.  If an ACE grants any access in
+ * the working_mode, we immediately short circuit out of the function.
+ * This mode is chosen by setting anyaccess to B_TRUE.  The
+ * working_mode is not a denied access mask upon exit if the function
+ * is used in this manner.
+ */
+static int
+zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
+    boolean_t anyaccess, cred_t *cr)
+{
+	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
+	zfs_acl_t	*aclp;
+	int		error;
+	uid_t		uid = crgetuid(cr);
+	uint64_t	who;
+	uint16_t	type, iflags;
+	uint16_t	entry_type;
+	uint32_t	access_mask;
+	uint32_t	deny_mask = 0;
+	zfs_ace_hdr_t	*acep = NULL;
+	boolean_t	checkit;
+	uid_t		gowner;
+	uid_t		fowner;
+
+	zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
+
+	mutex_enter(&zp->z_acl_lock);
+
+	error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+	if (error != 0) {
+		mutex_exit(&zp->z_acl_lock);
+		return (error);
+	}
+
+	ASSERT(zp->z_acl_cached);
+
+	while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+	    &iflags, &type))) {
+		uint32_t mask_matched;
+
+		if (!zfs_acl_valid_ace_type(type, iflags))
+			continue;
+
+		if (S_ISDIR(ZTOI(zp)->i_mode) &&
+		    (iflags & ACE_INHERIT_ONLY_ACE))
+			continue;
+
+		/* Skip ACE if it does not affect any AoI */
+		mask_matched = (access_mask & *working_mode);
+		if (!mask_matched)
+			continue;
+
+		entry_type = (iflags & ACE_TYPE_FLAGS);
+
+		checkit = B_FALSE;
+
+		switch (entry_type) {
+		case ACE_OWNER:
+			if (uid == fowner)
+				checkit = B_TRUE;
+			break;
+		case OWNING_GROUP:
+			who = gowner;
+			/*FALLTHROUGH*/
+		case ACE_IDENTIFIER_GROUP:
+			checkit = zfs_groupmember(zfsvfs, who, cr);
+			break;
+		case ACE_EVERYONE:
+			checkit = B_TRUE;
+			break;
+
+		/* USER Entry */
+		default:
+			if (entry_type == 0) {
+				uid_t newid;
+
+				newid = zfs_fuid_map_id(zfsvfs, who, cr,
+				    ZFS_ACE_USER);
+				if (newid != IDMAP_WK_CREATOR_OWNER_UID &&
+				    uid == newid)
+					checkit = B_TRUE;
+				break;
+			} else {
+				mutex_exit(&zp->z_acl_lock);
+				return (SET_ERROR(EIO));
+			}
+		}
+
+		if (checkit) {
+			if (type == DENY) {
+				DTRACE_PROBE3(zfs__ace__denies,
+				    znode_t *, zp,
+				    zfs_ace_hdr_t *, acep,
+				    uint32_t, mask_matched);
+				deny_mask |= mask_matched;
+			} else {
+				DTRACE_PROBE3(zfs__ace__allows,
+				    znode_t *, zp,
+				    zfs_ace_hdr_t *, acep,
+				    uint32_t, mask_matched);
+				if (anyaccess) {
+					mutex_exit(&zp->z_acl_lock);
+					return (0);
+				}
+			}
+			*working_mode &= ~mask_matched;
+		}
+
+		/* Are we done? */
+		if (*working_mode == 0)
+			break;
+	}
+
+	mutex_exit(&zp->z_acl_lock);
+
+	/* Put the found 'denies' back on the working mode */
+	if (deny_mask) {
+		*working_mode |= deny_mask;
+		return (SET_ERROR(EACCES));
+	} else if (*working_mode) {
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Return true if any access whatsoever granted, we don't actually
+ * care what access is granted.
+ */
+boolean_t
+zfs_has_access(znode_t *zp, cred_t *cr)
+{
+	uint32_t have = ACE_ALL_PERMS;
+
+	if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
+		uid_t owner;
+
+		owner = zfs_fuid_map_id(ZTOZSB(zp),
+		    KUID_TO_SUID(ZTOI(zp)->i_uid), cr, ZFS_OWNER);
+		return (secpolicy_vnode_any_access(cr, ZTOI(zp), owner) == 0);
+	}
+	return (B_TRUE);
+}
+
+static int
+zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
+    boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	int err;
+
+	*working_mode = v4_mode;
+	*check_privs = B_TRUE;
+
+	/*
+	 * Short circuit empty requests
+	 */
+	if (v4_mode == 0 || zfsvfs->z_replay) {
+		*working_mode = 0;
+		return (0);
+	}
+
+	if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) {
+		*check_privs = B_FALSE;
+		return (err);
+	}
+
+	/*
+	 * The caller requested that the ACL check be skipped.  This
+	 * would only happen if the caller checked VOP_ACCESS() with a
+	 * 32 bit ACE mask and already had the appropriate permissions.
+	 */
+	if (skipaclchk) {
+		*working_mode = 0;
+		return (0);
+	}
+
+	return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
+}
+
+static int
+zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
+    cred_t *cr)
+{
+	if (*working_mode != ACE_WRITE_DATA)
+		return (SET_ERROR(EACCES));
+
+	return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
+	    check_privs, B_FALSE, cr));
+}
+
+int
+zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
+{
+	boolean_t owner = B_FALSE;
+	boolean_t groupmbr = B_FALSE;
+	boolean_t is_attr;
+	uid_t uid = crgetuid(cr);
+	int error;
+
+	if (zdp->z_pflags & ZFS_AV_QUARANTINED)
+		return (SET_ERROR(EACCES));
+
+	is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
+	    (S_ISDIR(ZTOI(zdp)->i_mode)));
+	if (is_attr)
+		goto slow;
+
+
+	mutex_enter(&zdp->z_acl_lock);
+
+	if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) {
+		mutex_exit(&zdp->z_acl_lock);
+		return (0);
+	}
+
+	if (KUID_TO_SUID(ZTOI(zdp)->i_uid) != 0 ||
+	    KGID_TO_SGID(ZTOI(zdp)->i_gid) != 0) {
+		mutex_exit(&zdp->z_acl_lock);
+		goto slow;
+	}
+
+	if (uid == KUID_TO_SUID(ZTOI(zdp)->i_uid)) {
+		owner = B_TRUE;
+		if (zdp->z_mode & S_IXUSR) {
+			mutex_exit(&zdp->z_acl_lock);
+			return (0);
+		} else {
+			mutex_exit(&zdp->z_acl_lock);
+			goto slow;
+		}
+	}
+	if (groupmember(KGID_TO_SGID(ZTOI(zdp)->i_gid), cr)) {
+		groupmbr = B_TRUE;
+		if (zdp->z_mode & S_IXGRP) {
+			mutex_exit(&zdp->z_acl_lock);
+			return (0);
+		} else {
+			mutex_exit(&zdp->z_acl_lock);
+			goto slow;
+		}
+	}
+	if (!owner && !groupmbr) {
+		if (zdp->z_mode & S_IXOTH) {
+			mutex_exit(&zdp->z_acl_lock);
+			return (0);
+		}
+	}
+
+	mutex_exit(&zdp->z_acl_lock);
+
+slow:
+	DTRACE_PROBE(zfs__fastpath__execute__access__miss);
+	ZFS_ENTER(ZTOZSB(zdp));
+	error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
+	ZFS_EXIT(ZTOZSB(zdp));
+	return (error);
+}
+
+/*
+ * Determine whether Access should be granted/denied.
+ *
+ * The least priv subsystem is always consulted as a basic privilege
+ * can define any form of access.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
+{
+	uint32_t	working_mode;
+	int		error;
+	int		is_attr;
+	boolean_t 	check_privs;
+	znode_t		*xzp;
+	znode_t 	*check_zp = zp;
+	mode_t		needed_bits;
+	uid_t		owner;
+
+	is_attr = ((zp->z_pflags & ZFS_XATTR) && S_ISDIR(ZTOI(zp)->i_mode));
+
+	/*
+	 * If attribute then validate against base file
+	 */
+	if (is_attr) {
+		if ((error = zfs_zget(ZTOZSB(zp),
+		    zp->z_xattr_parent, &xzp)) != 0) {
+			return (error);
+		}
+
+		check_zp = xzp;
+
+		/*
+		 * fixup mode to map to xattr perms
+		 */
+
+		if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+			mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+			mode |= ACE_WRITE_NAMED_ATTRS;
+		}
+
+		if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+			mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+			mode |= ACE_READ_NAMED_ATTRS;
+		}
+	}
+
+	owner = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid),
+	    cr, ZFS_OWNER);
+	/*
+	 * Map the bits required to the standard inode flags
+	 * S_IRUSR|S_IWUSR|S_IXUSR in the needed_bits.  Map the bits
+	 * mapped by working_mode (currently missing) in missing_bits.
+	 * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode),
+	 * needed_bits.
+	 */
+	needed_bits = 0;
+
+	working_mode = mode;
+	if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+	    owner == crgetuid(cr))
+		working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+	if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+	    ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+		needed_bits |= S_IRUSR;
+	if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+	    ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+		needed_bits |= S_IWUSR;
+	if (working_mode & ACE_EXECUTE)
+		needed_bits |= S_IXUSR;
+
+	if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
+	    &check_privs, skipaclchk, cr)) == 0) {
+		if (is_attr)
+			zrele(xzp);
+		return (secpolicy_vnode_access2(cr, ZTOI(zp), owner,
+		    needed_bits, needed_bits));
+	}
+
+	if (error && !check_privs) {
+		if (is_attr)
+			zrele(xzp);
+		return (error);
+	}
+
+	if (error && (flags & V_APPEND)) {
+		error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
+	}
+
+	if (error && check_privs) {
+		mode_t		checkmode = 0;
+
+		/*
+		 * First check for implicit owner permission on
+		 * read_acl/read_attributes
+		 */
+
+		error = 0;
+		ASSERT(working_mode != 0);
+
+		if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
+		    owner == crgetuid(cr)))
+			working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+		if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+		    ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+			checkmode |= S_IRUSR;
+		if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+		    ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+			checkmode |= S_IWUSR;
+		if (working_mode & ACE_EXECUTE)
+			checkmode |= S_IXUSR;
+
+		error = secpolicy_vnode_access2(cr, ZTOI(check_zp), owner,
+		    needed_bits & ~checkmode, needed_bits);
+
+		if (error == 0 && (working_mode & ACE_WRITE_OWNER))
+			error = secpolicy_vnode_chown(cr, owner);
+		if (error == 0 && (working_mode & ACE_WRITE_ACL))
+			error = secpolicy_vnode_setdac(cr, owner);
+
+		if (error == 0 && (working_mode &
+		    (ACE_DELETE|ACE_DELETE_CHILD)))
+			error = secpolicy_vnode_remove(cr);
+
+		if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
+			error = secpolicy_vnode_chown(cr, owner);
+		}
+		if (error == 0) {
+			/*
+			 * See if any bits other than those already checked
+			 * for are still present.  If so then return EACCES
+			 */
+			if (working_mode & ~(ZFS_CHECKED_MASKS)) {
+				error = SET_ERROR(EACCES);
+			}
+		}
+	} else if (error == 0) {
+		error = secpolicy_vnode_access2(cr, ZTOI(zp), owner,
+		    needed_bits, needed_bits);
+	}
+
+	if (is_attr)
+		zrele(xzp);
+
+	return (error);
+}
+
+/*
+ * Translate traditional unix S_IRUSR/S_IWUSR/S_IXUSR mode into
+ * native ACL format and call zfs_zaccess()
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
+{
+	return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
+}
+
+/*
+ * Access function for secpolicy_vnode_setattr
+ */
+int
+zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
+{
+	int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+	return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
+}
+
+/* See zfs_zaccess_delete() */
+int zfs_write_implies_delete_child = 1;
+
+/*
+ * Determine whether delete access should be granted.
+ *
+ * The following chart outlines how we handle delete permissions which is
+ * how recent versions of windows (Windows 2008) handles it.  The efficiency
+ * comes from not having to check the parent ACL where the object itself grants
+ * delete:
+ *
+ *      -------------------------------------------------------
+ *      |   Parent Dir  |      Target Object Permissions      |
+ *      |  permissions  |                                     |
+ *      -------------------------------------------------------
+ *      |               | ACL Allows | ACL Denies| Delete     |
+ *      |               |  Delete    |  Delete   | unspecified|
+ *      -------------------------------------------------------
+ *      | ACL Allows    | Permit     | Deny *    | Permit     |
+ *      | DELETE_CHILD  |            |           |            |
+ *      -------------------------------------------------------
+ *      | ACL Denies    | Permit     | Deny      | Deny       |
+ *      | DELETE_CHILD  |            |           |            |
+ *      -------------------------------------------------------
+ *      | ACL specifies |            |           |            |
+ *      | only allow    | Permit     | Deny *    | Permit     |
+ *      | write and     |            |           |            |
+ *      | execute       |            |           |            |
+ *      -------------------------------------------------------
+ *      | ACL denies    |            |           |            |
+ *      | write and     | Permit     | Deny      | Deny       |
+ *      | execute       |            |           |            |
+ *      -------------------------------------------------------
+ *         ^
+ *         |
+ *         Re. execute permission on the directory:  if that's missing,
+ *	   the vnode lookup of the target will fail before we get here.
+ *
+ * Re [*] in the table above:  NFSv4 would normally Permit delete for
+ * these two cells of the matrix.
+ * See acl.h for notes on which ACE_... flags should be checked for which
+ * operations.  Specifically, the NFSv4 committee recommendation is in
+ * conflict with the Windows interpretation of DENY ACEs, where DENY ACEs
+ * should take precedence ahead of ALLOW ACEs.
+ *
+ * This implementation always consults the target object's ACL first.
+ * If a DENY ACE is present on the target object that specifies ACE_DELETE,
+ * delete access is denied.  If an ALLOW ACE with ACE_DELETE is present on
+ * the target object, access is allowed.  If and only if no entries with
+ * ACE_DELETE are present in the object's ACL, check the container's ACL
+ * for entries with ACE_DELETE_CHILD.
+ *
+ * A summary of the logic implemented from the table above is as follows:
+ *
+ * First check for DENY ACEs that apply.
+ * If either target or container has a deny, EACCES.
+ *
+ * Delete access can then be summarized as follows:
+ * 1: The object to be deleted grants ACE_DELETE, or
+ * 2: The containing directory grants ACE_DELETE_CHILD.
+ * In a Windows system, that would be the end of the story.
+ * In this system, (2) has some complications...
+ * 2a: "sticky" bit on a directory adds restrictions, and
+ * 2b: existing ACEs from previous versions of ZFS may
+ * not carry ACE_DELETE_CHILD where they should, so we
+ * also allow delete when ACE_WRITE_DATA is granted.
+ *
+ * Note: 2b is technically a work-around for a prior bug,
+ * which hopefully can go away some day.  For those who
+ * no longer need the work around, and for testing, this
+ * work-around is made conditional via the tunable:
+ * zfs_write_implies_delete_child
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+	uint32_t wanted_dirperms;
+	uint32_t dzp_working_mode = 0;
+	uint32_t zp_working_mode = 0;
+	int dzp_error, zp_error;
+	boolean_t dzpcheck_privs;
+	boolean_t zpcheck_privs;
+
+	if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+		return (SET_ERROR(EPERM));
+
+	/*
+	 * Case 1:
+	 * If target object grants ACE_DELETE then we are done.  This is
+	 * indicated by a return value of 0.  For this case we don't worry
+	 * about the sticky bit because sticky only applies to the parent
+	 * directory and this is the child access result.
+	 *
+	 * If we encounter a DENY ACE here, we're also done (EACCES).
+	 * Note that if we hit a DENY ACE here (on the target) it should
+	 * take precedence over a DENY ACE on the container, so that when
+	 * we have more complete auditing support we will be able to
+	 * report an access failure against the specific target.
+	 * (This is part of why we're checking the target first.)
+	 */
+	zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
+	    &zpcheck_privs, B_FALSE, cr);
+	if (zp_error == EACCES) {
+		/* We hit a DENY ACE. */
+		if (!zpcheck_privs)
+			return (SET_ERROR(zp_error));
+		return (secpolicy_vnode_remove(cr));
+
+	}
+	if (zp_error == 0)
+		return (0);
+
+	/*
+	 * Case 2:
+	 * If the containing directory grants ACE_DELETE_CHILD,
+	 * or we're in backward compatibility mode and the
+	 * containing directory has ACE_WRITE_DATA, allow.
+	 * Case 2b is handled with wanted_dirperms.
+	 */
+	wanted_dirperms = ACE_DELETE_CHILD;
+	if (zfs_write_implies_delete_child)
+		wanted_dirperms |= ACE_WRITE_DATA;
+	dzp_error = zfs_zaccess_common(dzp, wanted_dirperms,
+	    &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
+	if (dzp_error == EACCES) {
+		/* We hit a DENY ACE. */
+		if (!dzpcheck_privs)
+			return (SET_ERROR(dzp_error));
+		return (secpolicy_vnode_remove(cr));
+	}
+
+	/*
+	 * Cases 2a, 2b (continued)
+	 *
+	 * Note: dzp_working_mode now contains any permissions
+	 * that were NOT granted.  Therefore, if any of the
+	 * wanted_dirperms WERE granted, we will have:
+	 *   dzp_working_mode != wanted_dirperms
+	 * We're really asking if ANY of those permissions
+	 * were granted, and if so, grant delete access.
+	 */
+	if (dzp_working_mode != wanted_dirperms)
+		dzp_error = 0;
+
+	/*
+	 * dzp_error is 0 if the container granted us permissions to "modify".
+	 * If we do not have permission via one or more ACEs, our current
+	 * privileges may still permit us to modify the container.
+	 *
+	 * dzpcheck_privs is false when i.e. the FS is read-only.
+	 * Otherwise, do privilege checks for the container.
+	 */
+	if (dzp_error != 0 && dzpcheck_privs) {
+		uid_t owner;
+
+		/*
+		 * The secpolicy call needs the requested access and
+		 * the current access mode of the container, but it
+		 * only knows about Unix-style modes (VEXEC, VWRITE),
+		 * so this must condense the fine-grained ACE bits into
+		 * Unix modes.
+		 *
+		 * The VEXEC flag is easy, because we know that has
+		 * always been checked before we get here (during the
+		 * lookup of the target vnode).  The container has not
+		 * granted us permissions to "modify", so we do not set
+		 * the VWRITE flag in the current access mode.
+		 */
+		owner = zfs_fuid_map_id(ZTOZSB(dzp),
+		    KUID_TO_SUID(ZTOI(dzp)->i_uid), cr, ZFS_OWNER);
+		dzp_error = secpolicy_vnode_access2(cr, ZTOI(dzp),
+		    owner, S_IXUSR, S_IWUSR|S_IXUSR);
+	}
+	if (dzp_error != 0) {
+		/*
+		 * Note: We may have dzp_error = -1 here (from
+		 * zfs_zacess_common).  Don't return that.
+		 */
+		return (SET_ERROR(EACCES));
+	}
+
+
+	/*
+	 * At this point, we know that the directory permissions allow
+	 * us to modify, but we still need to check for the additional
+	 * restrictions that apply when the "sticky bit" is set.
+	 *
+	 * Yes, zfs_sticky_remove_access() also checks this bit, but
+	 * checking it here and skipping the call below is nice when
+	 * you're watching all of this with dtrace.
+	 */
+	if ((dzp->z_mode & S_ISVTX) == 0)
+		return (0);
+
+	/*
+	 * zfs_sticky_remove_access will succeed if:
+	 * 1. The sticky bit is absent.
+	 * 2. We pass the sticky bit restrictions.
+	 * 3. We have privileges that always allow file removal.
+	 */
+	return (zfs_sticky_remove_access(dzp, zp, cr));
+}
+
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+    znode_t *tzp, cred_t *cr)
+{
+	int add_perm;
+	int error;
+
+	if (szp->z_pflags & ZFS_AV_QUARANTINED)
+		return (SET_ERROR(EACCES));
+
+	add_perm = S_ISDIR(ZTOI(szp)->i_mode) ?
+	    ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+	/*
+	 * Rename permissions are combination of delete permission +
+	 * add file/subdir permission.
+	 */
+
+	/*
+	 * first make sure we do the delete portion.
+	 *
+	 * If that succeeds then check for add_file/add_subdir permissions
+	 */
+
+	if ((error = zfs_zaccess_delete(sdzp, szp, cr)))
+		return (error);
+
+	/*
+	 * If we have a tzp, see if we can delete it?
+	 */
+	if (tzp) {
+		if ((error = zfs_zaccess_delete(tdzp, tzp, cr)))
+			return (error);
+	}
+
+	/*
+	 * Now check for add permissions
+	 */
+	error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
+
+	return (error);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
new file mode 100644
index 000000000000..c2748ce450b4
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
@@ -0,0 +1,1241 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ *
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ *   Rohan Puri <rohan.puri15@gmail.com>
+ *   Brian Behlendorf <behlendorf1@llnl.gov>
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright (c) 2018 George Melikov. All Rights Reserved.
+ * Copyright (c) 2019 Datto, Inc. All rights reserved.
+ */
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' and 'shares' directory, but this may
+ * expand in the future.  The elements are built dynamically, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab.  We have three
+ * types of objects:
+ *
+ *	ctldir ------> snapshotdir -------> snapshot
+ *                                             |
+ *                                             |
+ *                                             V
+ *                                         mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding inode.
+ *
+ * All mounts are handled automatically by an user mode helper which invokes
+ * the mount procedure.  Unmounts are handled by allowing the mount
+ * point to expire so the kernel may automatically unmount it.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
+ * zfsvfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
+ * (ie: snapshots) are complete ZFS filesystems and have their own unique
+ * zfsvfs_t.  However, the fsid reported by these mounts will be the same
+ * as that used by the parent zfsvfs_t to make NFS happy.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_deleg.h>
+#include <sys/zpl.h>
+#include <sys/mntent.h>
+#include "zfs_namecheck.h"
+
+/*
+ * Two AVL trees are maintained which contain all currently automounted
+ * snapshots.  Every automounted snapshots maps to a single zfs_snapentry_t
+ * entry which MUST:
+ *
+ *   - be attached to both trees, and
+ *   - be unique, no duplicate entries are allowed.
+ *
+ * The zfs_snapshots_by_name tree is indexed by the full dataset name
+ * while the zfs_snapshots_by_objsetid tree is indexed by the unique
+ * objsetid.  This allows for fast lookups either by name or objsetid.
+ */
+static avl_tree_t zfs_snapshots_by_name;
+static avl_tree_t zfs_snapshots_by_objsetid;
+static krwlock_t zfs_snapshot_lock;
+
+/*
+ * Control Directory Tunables (.zfs)
+ */
+int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
+int zfs_admin_snapshot = 0;
+
+typedef struct {
+	char		*se_name;	/* full snapshot name */
+	char		*se_path;	/* full mount path */
+	spa_t		*se_spa;	/* pool spa */
+	uint64_t	se_objsetid;	/* snapshot objset id */
+	struct dentry   *se_root_dentry; /* snapshot root dentry */
+	taskqid_t	se_taskqid;	/* scheduled unmount taskqid */
+	avl_node_t	se_node_name;	/* zfs_snapshots_by_name link */
+	avl_node_t	se_node_objsetid; /* zfs_snapshots_by_objsetid link */
+	zfs_refcount_t	se_refcount;	/* reference count */
+} zfs_snapentry_t;
+
+static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
+
+/*
+ * Allocate a new zfs_snapentry_t being careful to make a copy of the
+ * the snapshot name and provided mount point.  No reference is taken.
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_alloc(char *full_name, char *full_path, spa_t *spa,
+    uint64_t objsetid, struct dentry *root_dentry)
+{
+	zfs_snapentry_t *se;
+
+	se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+
+	se->se_name = kmem_strdup(full_name);
+	se->se_path = kmem_strdup(full_path);
+	se->se_spa = spa;
+	se->se_objsetid = objsetid;
+	se->se_root_dentry = root_dentry;
+	se->se_taskqid = TASKQID_INVALID;
+
+	zfs_refcount_create(&se->se_refcount);
+
+	return (se);
+}
+
+/*
+ * Free a zfs_snapentry_t the caller must ensure there are no active
+ * references.
+ */
+static void
+zfsctl_snapshot_free(zfs_snapentry_t *se)
+{
+	zfs_refcount_destroy(&se->se_refcount);
+	kmem_strfree(se->se_name);
+	kmem_strfree(se->se_path);
+
+	kmem_free(se, sizeof (zfs_snapentry_t));
+}
+
+/*
+ * Hold a reference on the zfs_snapentry_t.
+ */
+static void
+zfsctl_snapshot_hold(zfs_snapentry_t *se)
+{
+	zfs_refcount_add(&se->se_refcount, NULL);
+}
+
+/*
+ * Release a reference on the zfs_snapentry_t.  When the number of
+ * references drops to zero the structure will be freed.
+ */
+static void
+zfsctl_snapshot_rele(zfs_snapentry_t *se)
+{
+	if (zfs_refcount_remove(&se->se_refcount, NULL) == 0)
+		zfsctl_snapshot_free(se);
+}
+
+/*
+ * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees.  While the zfs_snapentry_t is part
+ * of the trees a reference is held.
+ */
+static void
+zfsctl_snapshot_add(zfs_snapentry_t *se)
+{
+	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+	zfsctl_snapshot_hold(se);
+	avl_add(&zfs_snapshots_by_name, se);
+	avl_add(&zfs_snapshots_by_objsetid, se);
+}
+
+/*
+ * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees.  Upon removal a reference is dropped,
+ * this can result in the structure being freed if that was the last
+ * remaining reference.
+ */
+static void
+zfsctl_snapshot_remove(zfs_snapentry_t *se)
+{
+	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+	avl_remove(&zfs_snapshots_by_name, se);
+	avl_remove(&zfs_snapshots_by_objsetid, se);
+	zfsctl_snapshot_rele(se);
+}
+
+/*
+ * Snapshot name comparison function for the zfs_snapshots_by_name.
+ */
+static int
+snapentry_compare_by_name(const void *a, const void *b)
+{
+	const zfs_snapentry_t *se_a = a;
+	const zfs_snapentry_t *se_b = b;
+	int ret;
+
+	ret = strcmp(se_a->se_name, se_b->se_name);
+
+	if (ret < 0)
+		return (-1);
+	else if (ret > 0)
+		return (1);
+	else
+		return (0);
+}
+
+/*
+ * Snapshot name comparison function for the zfs_snapshots_by_objsetid.
+ */
+static int
+snapentry_compare_by_objsetid(const void *a, const void *b)
+{
+	const zfs_snapentry_t *se_a = a;
+	const zfs_snapentry_t *se_b = b;
+
+	if (se_a->se_spa != se_b->se_spa)
+		return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1);
+
+	if (se_a->se_objsetid < se_b->se_objsetid)
+		return (-1);
+	else if (se_a->se_objsetid > se_b->se_objsetid)
+		return (1);
+	else
+		return (0);
+}
+
+/*
+ * Find a zfs_snapentry_t in zfs_snapshots_by_name.  If the snapname
+ * is found a pointer to the zfs_snapentry_t is returned and a reference
+ * taken on the structure.  The caller is responsible for dropping the
+ * reference with zfsctl_snapshot_rele().  If the snapname is not found
+ * NULL will be returned.
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_name(char *snapname)
+{
+	zfs_snapentry_t *se, search;
+
+	ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
+
+	search.se_name = snapname;
+	se = avl_find(&zfs_snapshots_by_name, &search, NULL);
+	if (se)
+		zfsctl_snapshot_hold(se);
+
+	return (se);
+}
+
+/*
+ * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id
+ * rather than the snapname.  In all other respects it behaves the same
+ * as zfsctl_snapshot_find_by_name().
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid)
+{
+	zfs_snapentry_t *se, search;
+
+	ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
+
+	search.se_spa = spa;
+	search.se_objsetid = objsetid;
+	se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL);
+	if (se)
+		zfsctl_snapshot_hold(se);
+
+	return (se);
+}
+
+/*
+ * Rename a zfs_snapentry_t in the zfs_snapshots_by_name.  The structure is
+ * removed, renamed, and added back to the new correct location in the tree.
+ */
+static int
+zfsctl_snapshot_rename(char *old_snapname, char *new_snapname)
+{
+	zfs_snapentry_t *se;
+
+	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+
+	se = zfsctl_snapshot_find_by_name(old_snapname);
+	if (se == NULL)
+		return (SET_ERROR(ENOENT));
+
+	zfsctl_snapshot_remove(se);
+	kmem_strfree(se->se_name);
+	se->se_name = kmem_strdup(new_snapname);
+	zfsctl_snapshot_add(se);
+	zfsctl_snapshot_rele(se);
+
+	return (0);
+}
+
+/*
+ * Delayed task responsible for unmounting an expired automounted snapshot.
+ */
+static void
+snapentry_expire(void *data)
+{
+	zfs_snapentry_t *se = (zfs_snapentry_t *)data;
+	spa_t *spa = se->se_spa;
+	uint64_t objsetid = se->se_objsetid;
+
+	if (zfs_expire_snapshot <= 0) {
+		zfsctl_snapshot_rele(se);
+		return;
+	}
+
+	se->se_taskqid = TASKQID_INVALID;
+	(void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
+	zfsctl_snapshot_rele(se);
+
+	/*
+	 * Reschedule the unmount if the zfs_snapentry_t wasn't removed.
+	 * This can occur when the snapshot is busy.
+	 */
+	rw_enter(&zfs_snapshot_lock, RW_READER);
+	if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
+		zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+		zfsctl_snapshot_rele(se);
+	}
+	rw_exit(&zfs_snapshot_lock);
+}
+
+/*
+ * Cancel an automatic unmount of a snapname.  This callback is responsible
+ * for dropping the reference on the zfs_snapentry_t which was taken when
+ * during dispatch.
+ */
+static void
+zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
+{
+	if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) {
+		se->se_taskqid = TASKQID_INVALID;
+		zfsctl_snapshot_rele(se);
+	}
+}
+
+/*
+ * Dispatch the unmount task for delayed handling with a hold protecting it.
+ */
+static void
+zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
+{
+	ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID);
+
+	if (delay <= 0)
+		return;
+
+	zfsctl_snapshot_hold(se);
+	se->se_taskqid = taskq_dispatch_delay(system_delay_taskq,
+	    snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
+}
+
+/*
+ * Schedule an automatic unmount of objset id to occur in delay seconds from
+ * now.  Any previous delayed unmount will be cancelled in favor of the
+ * updated deadline.  A reference is taken by zfsctl_snapshot_find_by_name()
+ * and held until the outstanding task is handled or cancelled.
+ */
+int
+zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
+{
+	zfs_snapentry_t *se;
+	int error = ENOENT;
+
+	rw_enter(&zfs_snapshot_lock, RW_READER);
+	if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
+		zfsctl_snapshot_unmount_cancel(se);
+		zfsctl_snapshot_unmount_delay_impl(se, delay);
+		zfsctl_snapshot_rele(se);
+		error = 0;
+	}
+	rw_exit(&zfs_snapshot_lock);
+
+	return (error);
+}
+
+/*
+ * Check if snapname is currently mounted.  Returned non-zero when mounted
+ * and zero when unmounted.
+ */
+static boolean_t
+zfsctl_snapshot_ismounted(char *snapname)
+{
+	zfs_snapentry_t *se;
+	boolean_t ismounted = B_FALSE;
+
+	rw_enter(&zfs_snapshot_lock, RW_READER);
+	if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
+		zfsctl_snapshot_rele(se);
+		ismounted = B_TRUE;
+	}
+	rw_exit(&zfs_snapshot_lock);
+
+	return (ismounted);
+}
+
+/*
+ * Check if the given inode is a part of the virtual .zfs directory.
+ */
+boolean_t
+zfsctl_is_node(struct inode *ip)
+{
+	return (ITOZ(ip)->z_is_ctldir);
+}
+
+/*
+ * Check if the given inode is a .zfs/snapshots/snapname directory.
+ */
+boolean_t
+zfsctl_is_snapdir(struct inode *ip)
+{
+	return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS));
+}
+
+/*
+ * Allocate a new inode with the passed id and ops.
+ */
+static struct inode *
+zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
+    const struct file_operations *fops, const struct inode_operations *ops)
+{
+	inode_timespec_t now;
+	struct inode *ip;
+	znode_t *zp;
+
+	ip = new_inode(zfsvfs->z_sb);
+	if (ip == NULL)
+		return (NULL);
+
+	now = current_time(ip);
+	zp = ITOZ(ip);
+	ASSERT3P(zp->z_dirlocks, ==, NULL);
+	ASSERT3P(zp->z_acl_cached, ==, NULL);
+	ASSERT3P(zp->z_xattr_cached, ==, NULL);
+	zp->z_id = id;
+	zp->z_unlinked = B_FALSE;
+	zp->z_atime_dirty = B_FALSE;
+	zp->z_zn_prefetch = B_FALSE;
+	zp->z_moved = B_FALSE;
+	zp->z_is_sa = B_FALSE;
+	zp->z_is_mapped = B_FALSE;
+	zp->z_is_ctldir = B_TRUE;
+	zp->z_is_stale = B_FALSE;
+	zp->z_sa_hdl = NULL;
+	zp->z_blksz = 0;
+	zp->z_seq = 0;
+	zp->z_mapcnt = 0;
+	zp->z_size = 0;
+	zp->z_pflags = 0;
+	zp->z_mode = 0;
+	zp->z_sync_cnt = 0;
+	ip->i_generation = 0;
+	ip->i_ino = id;
+	ip->i_mode = (S_IFDIR | S_IRWXUGO);
+	ip->i_uid = SUID_TO_KUID(0);
+	ip->i_gid = SGID_TO_KGID(0);
+	ip->i_blkbits = SPA_MINBLOCKSHIFT;
+	ip->i_atime = now;
+	ip->i_mtime = now;
+	ip->i_ctime = now;
+	ip->i_fop = fops;
+	ip->i_op = ops;
+#if defined(IOP_XATTR)
+	ip->i_opflags &= ~IOP_XATTR;
+#endif
+
+	if (insert_inode_locked(ip)) {
+		unlock_new_inode(ip);
+		iput(ip);
+		return (NULL);
+	}
+
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	list_insert_tail(&zfsvfs->z_all_znodes, zp);
+	zfsvfs->z_nr_znodes++;
+	membar_producer();
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	unlock_new_inode(ip);
+
+	return (ip);
+}
+
+/*
+ * Lookup the inode with given id, it will be allocated if needed.
+ */
+static struct inode *
+zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
+    const struct file_operations *fops, const struct inode_operations *ops)
+{
+	struct inode *ip = NULL;
+
+	while (ip == NULL) {
+		ip = ilookup(zfsvfs->z_sb, (unsigned long)id);
+		if (ip)
+			break;
+
+		/* May fail due to concurrent zfsctl_inode_alloc() */
+		ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops);
+	}
+
+	return (ip);
+}
+
+/*
+ * Create the '.zfs' directory.  This directory is cached as part of the VFS
+ * structure.  This results in a hold on the zfsvfs_t.  The code in zfs_umount()
+ * therefore checks against a vfs_count of 2 instead of 1.  This reference
+ * is removed when the ctldir is destroyed in the unmount.  All other entities
+ * under the '.zfs' directory are created dynamically as needed.
+ *
+ * Because the dynamically created '.zfs' directory entries assume the use
+ * of 64-bit inode numbers this support must be disabled on 32-bit systems.
+ */
+int
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+	ASSERT(zfsvfs->z_ctldir == NULL);
+
+	zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
+	    &zpl_fops_root, &zpl_ops_root);
+	if (zfsvfs->z_ctldir == NULL)
+		return (SET_ERROR(ENOENT));
+
+	return (0);
+}
+
+/*
+ * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name.
+ * Only called when the filesystem is unmounted.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+	if (zfsvfs->z_issnap) {
+		zfs_snapentry_t *se;
+		spa_t *spa = zfsvfs->z_os->os_spa;
+		uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
+
+		rw_enter(&zfs_snapshot_lock, RW_WRITER);
+		se = zfsctl_snapshot_find_by_objsetid(spa, objsetid);
+		if (se != NULL)
+			zfsctl_snapshot_remove(se);
+		rw_exit(&zfs_snapshot_lock);
+		if (se != NULL) {
+			zfsctl_snapshot_unmount_cancel(se);
+			zfsctl_snapshot_rele(se);
+		}
+	} else if (zfsvfs->z_ctldir) {
+		iput(zfsvfs->z_ctldir);
+		zfsvfs->z_ctldir = NULL;
+	}
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the vnode and return it.
+ */
+struct inode *
+zfsctl_root(znode_t *zp)
+{
+	ASSERT(zfs_has_ctldir(zp));
+	igrab(ZTOZSB(zp)->z_ctldir);
+	return (ZTOZSB(zp)->z_ctldir);
+}
+
+/*
+ * Generate a long fid to indicate a snapdir. We encode whether snapdir is
+ * already mounted in gen field. We do this because nfsd lookup will not
+ * trigger automount. Next time the nfsd does fh_to_dentry, we will notice
+ * this and do automount and return ESTALE to force nfsd revalidate and follow
+ * mount.
+ */
+static int
+zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp)
+{
+	zfid_short_t *zfid = (zfid_short_t *)fidp;
+	zfid_long_t *zlfid = (zfid_long_t *)fidp;
+	uint32_t gen = 0;
+	uint64_t object;
+	uint64_t objsetid;
+	int i;
+	struct dentry *dentry;
+
+	if (fidp->fid_len < LONG_FID_LEN) {
+		fidp->fid_len = LONG_FID_LEN;
+		return (SET_ERROR(ENOSPC));
+	}
+
+	object = ip->i_ino;
+	objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino;
+	zfid->zf_len = LONG_FID_LEN;
+
+	dentry = d_obtain_alias(igrab(ip));
+	if (!IS_ERR(dentry)) {
+		gen = !!d_mountpoint(dentry);
+		dput(dentry);
+	}
+
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+	for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+		zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+	for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+		zlfid->zf_setgen[i] = 0;
+
+	return (0);
+}
+
+/*
+ * Generate an appropriate fid for an entry in the .zfs directory.
+ */
+int
+zfsctl_fid(struct inode *ip, fid_t *fidp)
+{
+	znode_t		*zp = ITOZ(ip);
+	zfsvfs_t	*zfsvfs = ITOZSB(ip);
+	uint64_t	object = zp->z_id;
+	zfid_short_t	*zfid;
+	int		i;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (zfsctl_is_snapdir(ip)) {
+		ZFS_EXIT(zfsvfs);
+		return (zfsctl_snapdir_fid(ip, fidp));
+	}
+
+	if (fidp->fid_len < SHORT_FID_LEN) {
+		fidp->fid_len = SHORT_FID_LEN;
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENOSPC));
+	}
+
+	zfid = (zfid_short_t *)fidp;
+
+	zfid->zf_len = SHORT_FID_LEN;
+
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	/* .zfs znodes always have a generation number of 0 */
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = 0;
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Construct a full dataset name in full_name: "pool/dataset@snap_name"
+ */
+static int
+zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len,
+    char *full_name)
+{
+	objset_t *os = zfsvfs->z_os;
+
+	if (zfs_component_namecheck(snap_name, NULL, NULL) != 0)
+		return (SET_ERROR(EILSEQ));
+
+	dmu_objset_name(os, full_name);
+	if ((strlen(full_name) + 1 + strlen(snap_name)) >= len)
+		return (SET_ERROR(ENAMETOOLONG));
+
+	(void) strcat(full_name, "@");
+	(void) strcat(full_name, snap_name);
+
+	return (0);
+}
+
+/*
+ * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/"
+ */
+static int
+zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid,
+    int path_len, char *full_path)
+{
+	objset_t *os = zfsvfs->z_os;
+	fstrans_cookie_t cookie;
+	char *snapname;
+	boolean_t case_conflict;
+	uint64_t id, pos = 0;
+	int error = 0;
+
+	if (zfsvfs->z_vfs->vfs_mntpoint == NULL)
+		return (SET_ERROR(ENOENT));
+
+	cookie = spl_fstrans_mark();
+	snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+	while (error == 0) {
+		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+		error = dmu_snapshot_list_next(zfsvfs->z_os,
+		    ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos,
+		    &case_conflict);
+		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+		if (error)
+			goto out;
+
+		if (id == objsetid)
+			break;
+	}
+
+	snprintf(full_path, path_len, "%s/.zfs/snapshot/%s",
+	    zfsvfs->z_vfs->vfs_mntpoint, snapname);
+out:
+	kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
+	spl_fstrans_unmark(cookie);
+
+	return (error);
+}
+
+/*
+ * Special case the handling of "..".
+ */
+int
+zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(dip);
+	int error = 0;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (strcmp(name, "..") == 0) {
+		*ipp = dip->i_sb->s_root->d_inode;
+	} else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) {
+		*ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR,
+		    &zpl_fops_snapdir, &zpl_ops_snapdir);
+	} else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) {
+		*ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES,
+		    &zpl_fops_shares, &zpl_ops_shares);
+	} else {
+		*ipp = NULL;
+	}
+
+	if (*ipp == NULL)
+		error = SET_ERROR(ENOENT);
+
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory.  Try to open the
+ * snapshot if it exist, creating the pseudo filesystem inode as necessary.
+ */
+int
+zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(dip);
+	uint64_t id;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id);
+	if (error) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	*ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id,
+	    &simple_dir_operations, &simple_dir_inode_operations);
+	if (*ipp == NULL)
+		error = SET_ERROR(ENOENT);
+
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+/*
+ * Renaming a directory under '.zfs/snapshot' will automatically trigger
+ * a rename of the snapshot to the new given name.  The rename is confined
+ * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere.
+ */
+int
+zfsctl_snapdir_rename(struct inode *sdip, char *snm,
+    struct inode *tdip, char *tnm, cred_t *cr, int flags)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(sdip);
+	char *to, *from, *real, *fsname;
+	int error;
+
+	if (!zfs_admin_snapshot)
+		return (SET_ERROR(EACCES));
+
+	ZFS_ENTER(zfsvfs);
+
+	to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+	from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+	real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+	fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+		error = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
+		    ZFS_MAX_DATASET_NAME_LEN, NULL);
+		if (error == 0) {
+			snm = real;
+		} else if (error != ENOTSUP) {
+			goto out;
+		}
+	}
+
+	dmu_objset_name(zfsvfs->z_os, fsname);
+
+	error = zfsctl_snapshot_name(ITOZSB(sdip), snm,
+	    ZFS_MAX_DATASET_NAME_LEN, from);
+	if (error == 0)
+		error = zfsctl_snapshot_name(ITOZSB(tdip), tnm,
+		    ZFS_MAX_DATASET_NAME_LEN, to);
+	if (error == 0)
+		error = zfs_secpolicy_rename_perms(from, to, cr);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * Cannot move snapshots out of the snapdir.
+	 */
+	if (sdip != tdip) {
+		error = SET_ERROR(EINVAL);
+		goto out;
+	}
+
+	/*
+	 * No-op when names are identical.
+	 */
+	if (strcmp(snm, tnm) == 0) {
+		error = 0;
+		goto out;
+	}
+
+	rw_enter(&zfs_snapshot_lock, RW_WRITER);
+
+	error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
+	if (error == 0)
+		(void) zfsctl_snapshot_rename(snm, tnm);
+
+	rw_exit(&zfs_snapshot_lock);
+out:
+	kmem_free(from, ZFS_MAX_DATASET_NAME_LEN);
+	kmem_free(to, ZFS_MAX_DATASET_NAME_LEN);
+	kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
+	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
+
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+/*
+ * Removing a directory under '.zfs/snapshot' will automatically trigger
+ * the removal of the snapshot with the given name.
+ */
+int
+zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(dip);
+	char *snapname, *real;
+	int error;
+
+	if (!zfs_admin_snapshot)
+		return (SET_ERROR(EACCES));
+
+	ZFS_ENTER(zfsvfs);
+
+	snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+	real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+		error = dmu_snapshot_realname(zfsvfs->z_os, name, real,
+		    ZFS_MAX_DATASET_NAME_LEN, NULL);
+		if (error == 0) {
+			name = real;
+		} else if (error != ENOTSUP) {
+			goto out;
+		}
+	}
+
+	error = zfsctl_snapshot_name(ITOZSB(dip), name,
+	    ZFS_MAX_DATASET_NAME_LEN, snapname);
+	if (error == 0)
+		error = zfs_secpolicy_destroy_perms(snapname, cr);
+	if (error != 0)
+		goto out;
+
+	error = zfsctl_snapshot_unmount(snapname, MNT_FORCE);
+	if ((error == 0) || (error == ENOENT))
+		error = dsl_destroy_snapshot(snapname, B_FALSE);
+out:
+	kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
+	kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
+
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+/*
+ * Creating a directory under '.zfs/snapshot' will automatically trigger
+ * the creation of a new snapshot with the given name.
+ */
+int
+zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
+    struct inode **ipp, cred_t *cr, int flags)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(dip);
+	char *dsname;
+	int error;
+
+	if (!zfs_admin_snapshot)
+		return (SET_ERROR(EACCES));
+
+	dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+	if (zfs_component_namecheck(dirname, NULL, NULL) != 0) {
+		error = SET_ERROR(EILSEQ);
+		goto out;
+	}
+
+	dmu_objset_name(zfsvfs->z_os, dsname);
+
+	error = zfs_secpolicy_snapshot_perms(dsname, cr);
+	if (error != 0)
+		goto out;
+
+	if (error == 0) {
+		error = dmu_objset_snapshot_one(dsname, dirname);
+		if (error != 0)
+			goto out;
+
+		error = zfsctl_snapdir_lookup(dip, dirname, ipp,
+		    0, cr, NULL, NULL);
+	}
+out:
+	kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
+
+	return (error);
+}
+
+/*
+ * Attempt to unmount a snapshot by making a call to user space.
+ * There is no assurance that this can or will succeed, is just a
+ * best effort.  In the case where it does fail, perhaps because
+ * it's in use, the unmount will fail harmlessly.
+ */
+int
+zfsctl_snapshot_unmount(char *snapname, int flags)
+{
+	char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL,
+	    NULL };
+	char *envp[] = { NULL };
+	zfs_snapentry_t *se;
+	int error;
+
+	rw_enter(&zfs_snapshot_lock, RW_READER);
+	if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) {
+		rw_exit(&zfs_snapshot_lock);
+		return (SET_ERROR(ENOENT));
+	}
+	rw_exit(&zfs_snapshot_lock);
+
+	if (flags & MNT_FORCE)
+		argv[4] = "-fn";
+	argv[5] = se->se_path;
+	dprintf("unmount; path=%s\n", se->se_path);
+	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	zfsctl_snapshot_rele(se);
+
+
+	/*
+	 * The umount system utility will return 256 on error.  We must
+	 * assume this error is because the file system is busy so it is
+	 * converted to the more sensible EBUSY.
+	 */
+	if (error)
+		error = SET_ERROR(EBUSY);
+
+	return (error);
+}
+
+int
+zfsctl_snapshot_mount(struct path *path, int flags)
+{
+	struct dentry *dentry = path->dentry;
+	struct inode *ip = dentry->d_inode;
+	zfsvfs_t *zfsvfs;
+	zfsvfs_t *snap_zfsvfs;
+	zfs_snapentry_t *se;
+	char *full_name, *full_path;
+	char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL,
+	    NULL };
+	char *envp[] = { NULL };
+	int error;
+	struct path spath;
+
+	if (ip == NULL)
+		return (SET_ERROR(EISDIR));
+
+	zfsvfs = ITOZSB(ip);
+	ZFS_ENTER(zfsvfs);
+
+	full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+	full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+	error = zfsctl_snapshot_name(zfsvfs, dname(dentry),
+	    ZFS_MAX_DATASET_NAME_LEN, full_name);
+	if (error)
+		goto error;
+
+	/*
+	 * Construct a mount point path from sb of the ctldir inode and dirent
+	 * name, instead of from d_path(), so that chroot'd process doesn't fail
+	 * on mount.zfs(8).
+	 */
+	snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s",
+	    zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "",
+	    dname(dentry));
+
+	/*
+	 * Multiple concurrent automounts of a snapshot are never allowed.
+	 * The snapshot may be manually mounted as many times as desired.
+	 */
+	if (zfsctl_snapshot_ismounted(full_name)) {
+		error = 0;
+		goto error;
+	}
+
+	/*
+	 * Attempt to mount the snapshot from user space.  Normally this
+	 * would be done using the vfs_kern_mount() function, however that
+	 * function is marked GPL-only and cannot be used.  On error we
+	 * careful to log the real error to the console and return EISDIR
+	 * to safely abort the automount.  This should be very rare.
+	 *
+	 * If the user mode helper happens to return EBUSY, a concurrent
+	 * mount is already in progress in which case the error is ignored.
+	 * Take note that if the program was executed successfully the return
+	 * value from call_usermodehelper() will be (exitcode << 8 + signal).
+	 */
+	dprintf("mount; name=%s path=%s\n", full_name, full_path);
+	argv[5] = full_name;
+	argv[6] = full_path;
+	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	if (error) {
+		if (!(error & MOUNT_BUSY << 8)) {
+			zfs_dbgmsg("Unable to automount %s error=%d",
+			    full_path, error);
+			error = SET_ERROR(EISDIR);
+		} else {
+			/*
+			 * EBUSY, this could mean a concurrent mount, or the
+			 * snapshot has already been mounted at completely
+			 * different place. We return 0 so VFS will retry. For
+			 * the latter case the VFS will retry several times
+			 * and return ELOOP, which is probably not a very good
+			 * behavior.
+			 */
+			error = 0;
+		}
+		goto error;
+	}
+
+	/*
+	 * Follow down in to the mounted snapshot and set MNT_SHRINKABLE
+	 * to identify this as an automounted filesystem.
+	 */
+	spath = *path;
+	path_get(&spath);
+	if (follow_down_one(&spath)) {
+		snap_zfsvfs = ITOZSB(spath.dentry->d_inode);
+		snap_zfsvfs->z_parent = zfsvfs;
+		dentry = spath.dentry;
+		spath.mnt->mnt_flags |= MNT_SHRINKABLE;
+
+		rw_enter(&zfs_snapshot_lock, RW_WRITER);
+		se = zfsctl_snapshot_alloc(full_name, full_path,
+		    snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os),
+		    dentry);
+		zfsctl_snapshot_add(se);
+		zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+		rw_exit(&zfs_snapshot_lock);
+	}
+	path_put(&spath);
+error:
+	kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN);
+	kmem_free(full_path, MAXPATHLEN);
+
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+/*
+ * Get the snapdir inode from fid
+ */
+int
+zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen,
+    struct inode **ipp)
+{
+	int error;
+	struct path path;
+	char *mnt;
+	struct dentry *dentry;
+
+	mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+	error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid,
+	    MAXPATHLEN, mnt);
+	if (error)
+		goto out;
+
+	/* Trigger automount */
+	error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
+	if (error)
+		goto out;
+
+	path_put(&path);
+	/*
+	 * Get the snapdir inode. Note, we don't want to use the above
+	 * path because it contains the root of the snapshot rather
+	 * than the snapdir.
+	 */
+	*ipp = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid);
+	if (*ipp == NULL) {
+		error = SET_ERROR(ENOENT);
+		goto out;
+	}
+
+	/* check gen, see zfsctl_snapdir_fid */
+	dentry = d_obtain_alias(igrab(*ipp));
+	if (gen != (!IS_ERR(dentry) && d_mountpoint(dentry))) {
+		iput(*ipp);
+		*ipp = NULL;
+		error = SET_ERROR(ENOENT);
+	}
+	if (!IS_ERR(dentry))
+		dput(dentry);
+out:
+	kmem_free(mnt, MAXPATHLEN);
+	return (error);
+}
+
+int
+zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
+    int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(dip);
+	znode_t *zp;
+	znode_t *dzp;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (zfsvfs->z_shares_dir == 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
+		error = zfs_lookup(dzp, name, &zp, 0, cr, NULL, NULL);
+		zrele(dzp);
+	}
+
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+/*
+ * Initialize the various pieces we'll need to create and manipulate .zfs
+ * directories.  Currently this is unused but available.
+ */
+void
+zfsctl_init(void)
+{
+	avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name,
+	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
+	    se_node_name));
+	avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid,
+	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
+	    se_node_objsetid));
+	rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL);
+}
+
+/*
+ * Cleanup the various pieces we needed for .zfs directories.  In particular
+ * ensure the expiry timer is canceled safely.
+ */
+void
+zfsctl_fini(void)
+{
+	avl_destroy(&zfs_snapshots_by_name);
+	avl_destroy(&zfs_snapshots_by_objsetid);
+	rw_destroy(&zfs_snapshot_lock);
+}
+
+module_param(zfs_admin_snapshot, int, 0644);
+MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot");
+
+module_param(zfs_expire_snapshot, int, 0644);
+MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot");
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c
new file mode 100644
index 000000000000..d98463f1b7f7
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c
@@ -0,0 +1,254 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/trace_zfs.h>
+
+typedef struct zfs_dbgmsg {
+	procfs_list_node_t	zdm_node;
+	uint64_t		zdm_timestamp;
+	int			zdm_size;
+	char			zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+procfs_list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size = 0;
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
+
+/*
+ * Internal ZFS debug messages are enabled by default.
+ *
+ * # Print debug messages
+ * cat /proc/spl/kstat/zfs/dbgmsg
+ *
+ * # Disable the kernel debug message log.
+ * echo 0 > /sys/module/zfs/parameters/zfs_dbgmsg_enable
+ *
+ * # Clear the kernel debug message log.
+ * echo 0 >/proc/spl/kstat/zfs/dbgmsg
+ */
+int zfs_dbgmsg_enable = 1;
+
+static int
+zfs_dbgmsg_show_header(struct seq_file *f)
+{
+	seq_printf(f, "%-12s %-8s\n", "timestamp", "message");
+	return (0);
+}
+
+static int
+zfs_dbgmsg_show(struct seq_file *f, void *p)
+{
+	zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)p;
+	seq_printf(f, "%-12llu %-s\n",
+	    (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg);
+	return (0);
+}
+
+static void
+zfs_dbgmsg_purge(int max_size)
+{
+	while (zfs_dbgmsg_size > max_size) {
+		zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs.pl_list);
+		if (zdm == NULL)
+			return;
+
+		int size = zdm->zdm_size;
+		kmem_free(zdm, size);
+		zfs_dbgmsg_size -= size;
+	}
+}
+
+static int
+zfs_dbgmsg_clear(procfs_list_t *procfs_list)
+{
+	mutex_enter(&zfs_dbgmsgs.pl_lock);
+	zfs_dbgmsg_purge(0);
+	mutex_exit(&zfs_dbgmsgs.pl_lock);
+	return (0);
+}
+
+void
+zfs_dbgmsg_init(void)
+{
+	procfs_list_install("zfs",
+	    "dbgmsg",
+	    0600,
+	    &zfs_dbgmsgs,
+	    zfs_dbgmsg_show,
+	    zfs_dbgmsg_show_header,
+	    zfs_dbgmsg_clear,
+	    offsetof(zfs_dbgmsg_t, zdm_node));
+}
+
+void
+zfs_dbgmsg_fini(void)
+{
+	procfs_list_uninstall(&zfs_dbgmsgs);
+	zfs_dbgmsg_purge(0);
+
+	/*
+	 * TODO - decide how to make this permanent
+	 */
+#ifdef _KERNEL
+	procfs_list_destroy(&zfs_dbgmsgs);
+#endif
+}
+
+void
+__set_error(const char *file, const char *func, int line, int err)
+{
+	/*
+	 * To enable this:
+	 *
+	 * $ echo 512 >/sys/module/zfs/parameters/zfs_flags
+	 */
+	if (zfs_flags & ZFS_DEBUG_SET_ERROR)
+		__dprintf(B_FALSE, file, func, line, "error %lu", err);
+}
+
+void
+__zfs_dbgmsg(char *buf)
+{
+	int size = sizeof (zfs_dbgmsg_t) + strlen(buf);
+	zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP);
+	zdm->zdm_size = size;
+	zdm->zdm_timestamp = gethrestime_sec();
+	strcpy(zdm->zdm_msg, buf);
+
+	mutex_enter(&zfs_dbgmsgs.pl_lock);
+	procfs_list_add(&zfs_dbgmsgs, zdm);
+	zfs_dbgmsg_size += size;
+	zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
+	mutex_exit(&zfs_dbgmsgs.pl_lock);
+}
+
+#ifdef _KERNEL
+
+void
+__dprintf(boolean_t dprint, const char *file, const char *func,
+    int line, const char *fmt, ...)
+{
+	const char *newfile;
+	va_list adx;
+	size_t size;
+	char *buf;
+	char *nl;
+	int i;
+	char *prefix = (dprint) ? "dprintf: " : "";
+
+	size = 1024;
+	buf = kmem_alloc(size, KM_SLEEP);
+
+	/*
+	 * Get rid of annoying prefix to filename.
+	 */
+	newfile = strrchr(file, '/');
+	if (newfile != NULL) {
+		newfile = newfile + 1; /* Get rid of leading / */
+	} else {
+		newfile = file;
+	}
+
+	i = snprintf(buf, size, "%s%s:%d:%s(): ", prefix, newfile, line, func);
+
+	if (i < size) {
+		va_start(adx, fmt);
+		(void) vsnprintf(buf + i, size - i, fmt, adx);
+		va_end(adx);
+	}
+
+	/*
+	 * Get rid of trailing newline for dprintf logs.
+	 */
+	if (dprint && buf[0] != '\0') {
+		nl = &buf[strlen(buf) - 1];
+		if (*nl == '\n')
+			*nl = '\0';
+	}
+
+	/*
+	 * To get this data enable the zfs__dprintf trace point as shown:
+	 *
+	 * # Enable zfs__dprintf tracepoint, clear the tracepoint ring buffer
+	 * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable
+	 * $ echo 0 > /sys/kernel/debug/tracing/trace
+	 *
+	 * # Dump the ring buffer.
+	 * $ cat /sys/kernel/debug/tracing/trace
+	 */
+	DTRACE_PROBE1(zfs__dprintf, char *, buf);
+
+	/*
+	 * To get this data:
+	 *
+	 * $ cat /proc/spl/kstat/zfs/dbgmsg
+	 *
+	 * To clear the buffer:
+	 * $ echo 0 > /proc/spl/kstat/zfs/dbgmsg
+	 */
+	__zfs_dbgmsg(buf);
+
+	kmem_free(buf, size);
+}
+
+#else
+
+void
+zfs_dbgmsg_print(const char *tag)
+{
+	ssize_t ret __attribute__((unused));
+
+	/*
+	 * We use write() in this function instead of printf()
+	 * so it is safe to call from a signal handler.
+	 */
+	ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11);
+	ret = write(STDOUT_FILENO, tag, strlen(tag));
+	ret = write(STDOUT_FILENO, ") START:\n", 9);
+
+	mutex_enter(&zfs_dbgmsgs.pl_lock);
+	for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs.pl_list); zdm != NULL;
+	    zdm = list_next(&zfs_dbgmsgs.pl_list, zdm)) {
+		ret = write(STDOUT_FILENO, zdm->zdm_msg,
+		    strlen(zdm->zdm_msg));
+		ret = write(STDOUT_FILENO, "\n", 1);
+	}
+
+	ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11);
+	ret = write(STDOUT_FILENO, tag, strlen(tag));
+	ret = write(STDOUT_FILENO, ") END\n", 6);
+
+	mutex_exit(&zfs_dbgmsgs.pl_lock);
+}
+#endif /* _KERNEL */
+
+#ifdef _KERNEL
+module_param(zfs_dbgmsg_enable, int, 0644);
+MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log");
+
+module_param(zfs_dbgmsg_maxsize, int, 0644);
+MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size");
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
new file mode 100644
index 000000000000..383657208df3
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
@@ -0,0 +1,1224 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/random.h>
+#include <sys/policy.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_vnops.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+
+/*
+ * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups
+ * of names after deciding which is the appropriate lookup interface.
+ */
+static int
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, matchtype_t mt,
+    boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
+{
+	boolean_t conflict = B_FALSE;
+	int error;
+
+	if (zfsvfs->z_norm) {
+		size_t bufsz = 0;
+		char *buf = NULL;
+
+		if (rpnp) {
+			buf = rpnp->pn_buf;
+			bufsz = rpnp->pn_bufsize;
+		}
+
+		/*
+		 * In the non-mixed case we only expect there would ever
+		 * be one match, but we need to use the normalizing lookup.
+		 */
+		error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
+		    zoid, mt, buf, bufsz, &conflict);
+	} else {
+		error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
+	}
+
+	/*
+	 * Allow multiple entries provided the first entry is
+	 * the object id.  Non-zpl consumers may safely make
+	 * use of the additional space.
+	 *
+	 * XXX: This should be a feature flag for compatibility
+	 */
+	if (error == EOVERFLOW)
+		error = 0;
+
+	if (zfsvfs->z_norm && !error && deflags)
+		*deflags = conflict ? ED_CASE_CONFLICT : 0;
+
+	*zoid = ZFS_DIRENT_OBJ(*zoid);
+
+	return (error);
+}
+
+/*
+ * Lock a directory entry.  A dirlock on <dzp, name> protects that name
+ * in dzp's directory zap object.  As long as you hold a dirlock, you can
+ * assume two things: (1) dzp cannot be reaped, and (2) no other thread
+ * can change the zap entry for (i.e. link or unlink) this name.
+ *
+ * Input arguments:
+ *	dzp	- znode for directory
+ *	name	- name of entry to lock
+ *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
+ *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
+ *		  ZSHARED: allow concurrent access with other ZSHARED callers.
+ *		  ZXATTR: we want dzp's xattr directory
+ *		  ZCILOOK: On a mixed sensitivity file system,
+ *			   this lookup should be case-insensitive.
+ *		  ZCIEXACT: On a purely case-insensitive file system,
+ *			    this lookup should be case-sensitive.
+ *		  ZRENAMING: we are locking for renaming, force narrow locks
+ *		  ZHAVELOCK: Don't grab the z_name_lock for this call. The
+ *			     current thread already holds it.
+ *
+ * Output arguments:
+ *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
+ *	dlpp	- pointer to the dirlock for this entry (NULL on error)
+ *      direntflags - (case-insensitive lookup only)
+ *		flags if multiple case-sensitive matches exist in directory
+ *      realpnp     - (case-insensitive lookup only)
+ *		actual name matched within the directory
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ * NOTE: For case-insensitive file systems we take wide locks (see below),
+ *	 but return znode pointers to a single match.
+ */
+int
+zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
+    int flag, int *direntflags, pathname_t *realpnp)
+{
+	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
+	zfs_dirlock_t	*dl;
+	boolean_t	update;
+	matchtype_t	mt = 0;
+	uint64_t	zoid;
+	int		error = 0;
+	int		cmpflags;
+
+	*zpp = NULL;
+	*dlpp = NULL;
+
+	/*
+	 * Verify that we are not trying to lock '.', '..', or '.zfs'
+	 */
+	if ((name[0] == '.' &&
+	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
+	    (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
+		return (SET_ERROR(EEXIST));
+
+	/*
+	 * Case sensitivity and normalization preferences are set when
+	 * the file system is created.  These are stored in the
+	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
+	 * affect what vnodes can be cached in the DNLC, how we
+	 * perform zap lookups, and the "width" of our dirlocks.
+	 *
+	 * A normal dirlock locks a single name.  Note that with
+	 * normalization a name can be composed multiple ways, but
+	 * when normalized, these names all compare equal.  A wide
+	 * dirlock locks multiple names.  We need these when the file
+	 * system is supporting mixed-mode access.  It is sometimes
+	 * necessary to lock all case permutations of file name at
+	 * once so that simultaneous case-insensitive/case-sensitive
+	 * behaves as rationally as possible.
+	 */
+
+	/*
+	 * When matching we may need to normalize & change case according to
+	 * FS settings.
+	 *
+	 * Note that a normalized match is necessary for a case insensitive
+	 * filesystem when the lookup request is not exact because normalization
+	 * can fold case independent of normalizing code point sequences.
+	 *
+	 * See the table above zfs_dropname().
+	 */
+	if (zfsvfs->z_norm != 0) {
+		mt = MT_NORMALIZE;
+
+		/*
+		 * Determine if the match needs to honor the case specified in
+		 * lookup, and if so keep track of that so that during
+		 * normalization we don't fold case.
+		 */
+		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE &&
+		    (flag & ZCIEXACT)) ||
+		    (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) {
+			mt |= MT_MATCH_CASE;
+		}
+	}
+
+	/*
+	 * Only look in or update the DNLC if we are looking for the
+	 * name on a file system that does not require normalization
+	 * or case folding.  We can also look there if we happen to be
+	 * on a non-normalizing, mixed sensitivity file system IF we
+	 * are looking for the exact name.
+	 *
+	 * Maybe can add TO-UPPERed version of name to dnlc in ci-only
+	 * case for performance improvement?
+	 */
+	update = !zfsvfs->z_norm ||
+	    (zfsvfs->z_case == ZFS_CASE_MIXED &&
+	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
+
+	/*
+	 * ZRENAMING indicates we are in a situation where we should
+	 * take narrow locks regardless of the file system's
+	 * preferences for normalizing and case folding.  This will
+	 * prevent us deadlocking trying to grab the same wide lock
+	 * twice if the two names happen to be case-insensitive
+	 * matches.
+	 */
+	if (flag & ZRENAMING)
+		cmpflags = 0;
+	else
+		cmpflags = zfsvfs->z_norm;
+
+	/*
+	 * Wait until there are no locks on this name.
+	 *
+	 * Don't grab the lock if it is already held. However, cannot
+	 * have both ZSHARED and ZHAVELOCK together.
+	 */
+	ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
+	if (!(flag & ZHAVELOCK))
+		rw_enter(&dzp->z_name_lock, RW_READER);
+
+	mutex_enter(&dzp->z_lock);
+	for (;;) {
+		if (dzp->z_unlinked && !(flag & ZXATTR)) {
+			mutex_exit(&dzp->z_lock);
+			if (!(flag & ZHAVELOCK))
+				rw_exit(&dzp->z_name_lock);
+			return (SET_ERROR(ENOENT));
+		}
+		for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
+			if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
+			    U8_UNICODE_LATEST, &error) == 0) || error != 0)
+				break;
+		}
+		if (error != 0) {
+			mutex_exit(&dzp->z_lock);
+			if (!(flag & ZHAVELOCK))
+				rw_exit(&dzp->z_name_lock);
+			return (SET_ERROR(ENOENT));
+		}
+		if (dl == NULL)	{
+			/*
+			 * Allocate a new dirlock and add it to the list.
+			 */
+			dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
+			cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
+			dl->dl_name = name;
+			dl->dl_sharecnt = 0;
+			dl->dl_namelock = 0;
+			dl->dl_namesize = 0;
+			dl->dl_dzp = dzp;
+			dl->dl_next = dzp->z_dirlocks;
+			dzp->z_dirlocks = dl;
+			break;
+		}
+		if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
+			break;
+		cv_wait(&dl->dl_cv, &dzp->z_lock);
+	}
+
+	/*
+	 * If the z_name_lock was NOT held for this dirlock record it.
+	 */
+	if (flag & ZHAVELOCK)
+		dl->dl_namelock = 1;
+
+	if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
+		/*
+		 * We're the second shared reference to dl.  Make a copy of
+		 * dl_name in case the first thread goes away before we do.
+		 * Note that we initialize the new name before storing its
+		 * pointer into dl_name, because the first thread may load
+		 * dl->dl_name at any time.  It'll either see the old value,
+		 * which belongs to it, or the new shared copy; either is OK.
+		 */
+		dl->dl_namesize = strlen(dl->dl_name) + 1;
+		name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
+		bcopy(dl->dl_name, name, dl->dl_namesize);
+		dl->dl_name = name;
+	}
+
+	mutex_exit(&dzp->z_lock);
+
+	/*
+	 * We have a dirlock on the name.  (Note that it is the dirlock,
+	 * not the dzp's z_lock, that protects the name in the zap object.)
+	 * See if there's an object by this name; if so, put a hold on it.
+	 */
+	if (flag & ZXATTR) {
+		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+		    sizeof (zoid));
+		if (error == 0)
+			error = (zoid == 0 ? SET_ERROR(ENOENT) : 0);
+	} else {
+		error = zfs_match_find(zfsvfs, dzp, name, mt,
+		    update, direntflags, realpnp, &zoid);
+	}
+	if (error) {
+		if (error != ENOENT || (flag & ZEXISTS)) {
+			zfs_dirent_unlock(dl);
+			return (error);
+		}
+	} else {
+		if (flag & ZNEW) {
+			zfs_dirent_unlock(dl);
+			return (SET_ERROR(EEXIST));
+		}
+		error = zfs_zget(zfsvfs, zoid, zpp);
+		if (error) {
+			zfs_dirent_unlock(dl);
+			return (error);
+		}
+	}
+
+	*dlpp = dl;
+
+	return (0);
+}
+
+/*
+ * Unlock this directory entry and wake anyone who was waiting for it.
+ */
+void
+zfs_dirent_unlock(zfs_dirlock_t *dl)
+{
+	znode_t *dzp = dl->dl_dzp;
+	zfs_dirlock_t **prev_dl, *cur_dl;
+
+	mutex_enter(&dzp->z_lock);
+
+	if (!dl->dl_namelock)
+		rw_exit(&dzp->z_name_lock);
+
+	if (dl->dl_sharecnt > 1) {
+		dl->dl_sharecnt--;
+		mutex_exit(&dzp->z_lock);
+		return;
+	}
+	prev_dl = &dzp->z_dirlocks;
+	while ((cur_dl = *prev_dl) != dl)
+		prev_dl = &cur_dl->dl_next;
+	*prev_dl = dl->dl_next;
+	cv_broadcast(&dl->dl_cv);
+	mutex_exit(&dzp->z_lock);
+
+	if (dl->dl_namesize != 0)
+		kmem_free(dl->dl_name, dl->dl_namesize);
+	cv_destroy(&dl->dl_cv);
+	kmem_free(dl, sizeof (*dl));
+}
+
+/*
+ * Look up an entry in a directory.
+ *
+ * NOTE: '.' and '..' are handled as special cases because
+ *	no directory entries are actually stored for them.  If this is
+ *	the root of a filesystem, then '.zfs' is also treated as a
+ *	special pseudo-directory.
+ */
+int
+zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags,
+    int *deflg, pathname_t *rpnp)
+{
+	zfs_dirlock_t *dl;
+	znode_t *zp;
+	struct inode *ip;
+	int error = 0;
+	uint64_t parent;
+
+	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+		*zpp = dzp;
+		zhold(*zpp);
+	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+		zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+
+		/*
+		 * If we are a snapshot mounted under .zfs, return
+		 * the inode pointer for the snapshot directory.
+		 */
+		if ((error = sa_lookup(dzp->z_sa_hdl,
+		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+			return (error);
+
+		if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
+			error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
+			    "snapshot", &ip, 0, kcred, NULL, NULL);
+			*zpp = ITOZ(ip);
+			return (error);
+		}
+		rw_enter(&dzp->z_parent_lock, RW_READER);
+		error = zfs_zget(zfsvfs, parent, &zp);
+		if (error == 0)
+			*zpp = zp;
+		rw_exit(&dzp->z_parent_lock);
+	} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
+		ip = zfsctl_root(dzp);
+		*zpp = ITOZ(ip);
+	} else {
+		int zf;
+
+		zf = ZEXISTS | ZSHARED;
+		if (flags & FIGNORECASE)
+			zf |= ZCILOOK;
+
+		error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
+		if (error == 0) {
+			*zpp = zp;
+			zfs_dirent_unlock(dl);
+			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+		}
+		rpnp = NULL;
+	}
+
+	if ((flags & FIGNORECASE) && rpnp && !error)
+		(void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
+
+	return (error);
+}
+
+/*
+ * unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating.  We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem).  So on the small
+ * chance that the nlink list is using a fat zap (ie. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error.  On a
+ * nondebug system, this will result in the space being leaked.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+
+	ASSERT(zp->z_unlinked);
+	ASSERT(ZTOI(zp)->i_nlink == 0);
+
+	VERIFY3U(0, ==,
+	    zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+	dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+static void
+zfs_unlinked_drain_task(void *arg)
+{
+	zfsvfs_t *zfsvfs = arg;
+	zap_cursor_t	zc;
+	zap_attribute_t zap;
+	dmu_object_info_t doi;
+	znode_t		*zp;
+	int		error;
+
+	ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);
+
+	/*
+	 * Iterate over the contents of the unlinked set.
+	 */
+	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+	    zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel;
+	    zap_cursor_advance(&zc)) {
+
+		/*
+		 * See what kind of object we have in list
+		 */
+
+		error = dmu_object_info(zfsvfs->z_os,
+		    zap.za_first_integer, &doi);
+		if (error != 0)
+			continue;
+
+		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+		/*
+		 * We need to re-mark these list entries for deletion,
+		 * so we pull them back into core and set zp->z_unlinked.
+		 */
+		error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+		/*
+		 * We may pick up znodes that are already marked for deletion.
+		 * This could happen during the purge of an extended attribute
+		 * directory.  All we need to do is skip over them, since they
+		 * are already in the system marked z_unlinked.
+		 */
+		if (error != 0)
+			continue;
+
+		zp->z_unlinked = B_TRUE;
+
+		/*
+		 * zrele() decrements the znode's ref count and may cause
+		 * it to be synchronously freed. We interrupt freeing
+		 * of this znode by checking the return value of
+		 * dmu_objset_zfs_unmounting() in dmu_free_long_range()
+		 * when an unmount is requested.
+		 */
+		zrele(zp);
+		ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+	}
+	zap_cursor_fini(&zc);
+
+	zfsvfs->z_draining = B_FALSE;
+	zfsvfs->z_drain_task = TASKQID_INVALID;
+}
+
+/*
+ * Sets z_draining then tries to dispatch async unlinked drain.
+ * If that fails executes synchronous unlinked drain.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+	ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+	ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);
+
+	zfsvfs->z_draining = B_TRUE;
+	zfsvfs->z_drain_cancel = B_FALSE;
+
+	zfsvfs->z_drain_task = taskq_dispatch(
+	    dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)),
+	    zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);
+	if (zfsvfs->z_drain_task == TASKQID_INVALID) {
+		zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
+		zfs_unlinked_drain_task(zfsvfs);
+	}
+}
+
+/*
+ * Wait for the unlinked drain taskq task to stop. This will interrupt the
+ * unlinked set processing if it is in progress.
+ */
+void
+zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
+{
+	ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+
+	if (zfsvfs->z_draining) {
+		zfsvfs->z_drain_cancel = B_TRUE;
+		taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
+		    dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
+		zfsvfs->z_drain_task = TASKQID_INVALID;
+		zfsvfs->z_draining = B_FALSE;
+	}
+}
+
+/*
+ * Delete the entire contents of a directory.  Return a count
+ * of the number of entries that could not be deleted. If we encounter
+ * an error, return a count of at least one so that the directory stays
+ * in the unlinked set.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ *	so there is no need to lock its entries before deletion.
+ *	Also, it assumes the directory contents is *only* regular
+ *	files.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+	zap_cursor_t	zc;
+	zap_attribute_t	zap;
+	znode_t		*xzp;
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
+	zfs_dirlock_t	dl;
+	int skipped = 0;
+	int error;
+
+	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+	    (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+	    zap_cursor_advance(&zc)) {
+		error = zfs_zget(zfsvfs,
+		    ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
+		if (error) {
+			skipped += 1;
+			continue;
+		}
+
+		ASSERT(S_ISREG(ZTOI(xzp)->i_mode) ||
+		    S_ISLNK(ZTOI(xzp)->i_mode));
+
+		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
+		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+		/* Is this really needed ? */
+		zfs_sa_upgrade_txholds(tx, xzp);
+		dmu_tx_mark_netfree(tx);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			zfs_zrele_async(xzp);
+			skipped += 1;
+			continue;
+		}
+		bzero(&dl, sizeof (dl));
+		dl.dl_dzp = dzp;
+		dl.dl_name = zap.za_name;
+
+		error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+		if (error)
+			skipped += 1;
+		dmu_tx_commit(tx);
+
+		zfs_zrele_async(xzp);
+	}
+	zap_cursor_fini(&zc);
+	if (error != ENOENT)
+		skipped += 1;
+	return (skipped);
+}
+
+void
+zfs_rmnode(znode_t *zp)
+{
+	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
+	objset_t	*os = zfsvfs->z_os;
+	znode_t		*xzp = NULL;
+	dmu_tx_t	*tx;
+	uint64_t	acl_obj;
+	uint64_t	xattr_obj;
+	uint64_t	links;
+	int		error;
+
+	ASSERT(ZTOI(zp)->i_nlink == 0);
+	ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0);
+
+	/*
+	 * If this is an attribute directory, purge its contents.
+	 */
+	if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
+		if (zfs_purgedir(zp) != 0) {
+			/*
+			 * Not enough space to delete some xattrs.
+			 * Leave it in the unlinked set.
+			 */
+			zfs_znode_dmu_fini(zp);
+
+			return;
+		}
+	}
+
+	/*
+	 * Free up all the data in the file.  We don't do this for directories
+	 * because we need truncate and remove to be in the same tx, like in
+	 * zfs_znode_delete(). Otherwise, if we crash here we'll end up with
+	 * an inconsistent truncated zap object in the delete queue.  Note a
+	 * truncated file is harmless since it only contains user data.
+	 */
+	if (S_ISREG(ZTOI(zp)->i_mode)) {
+		error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
+		if (error) {
+			/*
+			 * Not enough space or we were interrupted by unmount.
+			 * Leave the file in the unlinked set.
+			 */
+			zfs_znode_dmu_fini(zp);
+			return;
+		}
+	}
+
+	/*
+	 * If the file has extended attributes, we're going to unlink
+	 * the xattr dir.
+	 */
+	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+	    &xattr_obj, sizeof (xattr_obj));
+	if (error == 0 && xattr_obj) {
+		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+		ASSERT(error == 0);
+	}
+
+	acl_obj = zfs_external_acl(zp);
+
+	/*
+	 * Set up the final transaction.
+	 */
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+	if (xzp) {
+		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+	}
+	if (acl_obj)
+		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+	zfs_sa_upgrade_txholds(tx, zp);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		/*
+		 * Not enough space to delete the file.  Leave it in the
+		 * unlinked set, leaking it until the fs is remounted (at
+		 * which point we'll call zfs_unlinked_drain() to process it).
+		 */
+		dmu_tx_abort(tx);
+		zfs_znode_dmu_fini(zp);
+		goto out;
+	}
+
+	if (xzp) {
+		ASSERT(error == 0);
+		mutex_enter(&xzp->z_lock);
+		xzp->z_unlinked = B_TRUE;	/* mark xzp for deletion */
+		clear_nlink(ZTOI(xzp));		/* no more links to it */
+		links = 0;
+		VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+		    &links, sizeof (links), tx));
+		mutex_exit(&xzp->z_lock);
+		zfs_unlinked_add(xzp, tx);
+	}
+
+	mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+	/*
+	 * Remove this znode from the unlinked set.  If a has rollback has
+	 * occurred while a file is open and unlinked.  Then when the file
+	 * is closed post rollback it will not exist in the rolled back
+	 * version of the unlinked object.
+	 */
+	error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+	    zp->z_id, tx);
+	VERIFY(error == 0 || error == ENOENT);
+
+	uint64_t count;
+	if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
+		cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
+	}
+
+	mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+	dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
+
+	zfs_znode_delete(zp, tx);
+
+	dmu_tx_commit(tx);
+out:
+	if (xzp)
+		zfs_zrele_async(xzp);
+}
+
+static uint64_t
+zfs_dirent(znode_t *zp, uint64_t mode)
+{
+	uint64_t de = zp->z_id;
+
+	if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE)
+		de |= IFTODT(mode) << 60;
+	return (de);
+}
+
+/*
+ * Link zp into dl.  Can fail in the following cases :
+ * - if zp has been unlinked.
+ * - if the number of entries with the same hash (aka. colliding entries)
+ *    exceed the capacity of a leaf-block of fatzap and splitting of the
+ *    leaf-block does not help.
+ */
+int
+zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+{
+	znode_t *dzp = dl->dl_dzp;
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	uint64_t value;
+	int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
+	sa_bulk_attr_t bulk[5];
+	uint64_t mtime[2], ctime[2];
+	uint64_t links;
+	int count = 0;
+	int error;
+
+	mutex_enter(&zp->z_lock);
+
+	if (!(flag & ZRENAMING)) {
+		if (zp->z_unlinked) {	/* no new links to unlinked zp */
+			ASSERT(!(flag & (ZNEW | ZEXISTS)));
+			mutex_exit(&zp->z_lock);
+			return (SET_ERROR(ENOENT));
+		}
+		if (!(flag & ZNEW)) {
+			/*
+			 * ZNEW nodes come from zfs_mknode() where the link
+			 * count has already been initialised
+			 */
+			inc_nlink(ZTOI(zp));
+			links = ZTOI(zp)->i_nlink;
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+			    NULL, &links, sizeof (links));
+		}
+	}
+
+	value = zfs_dirent(zp, zp->z_mode);
+	error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1,
+	    &value, tx);
+
+	/*
+	 * zap_add could fail to add the entry if it exceeds the capacity of the
+	 * leaf-block and zap_leaf_split() failed to help.
+	 * The caller of this routine is responsible for failing the transaction
+	 * which will rollback the SA updates done above.
+	 */
+	if (error != 0) {
+		if (!(flag & ZRENAMING) && !(flag & ZNEW))
+			drop_nlink(ZTOI(zp));
+		mutex_exit(&zp->z_lock);
+		return (error);
+	}
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+	    &dzp->z_id, sizeof (dzp->z_id));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+
+	if (!(flag & ZNEW)) {
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    ctime, sizeof (ctime));
+		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+		    ctime);
+	}
+	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+	ASSERT(error == 0);
+
+	mutex_exit(&zp->z_lock);
+
+	mutex_enter(&dzp->z_lock);
+	dzp->z_size++;
+	if (zp_is_dir)
+		inc_nlink(ZTOI(dzp));
+	links = ZTOI(dzp)->i_nlink;
+	count = 0;
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &dzp->z_size, sizeof (dzp->z_size));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+	    &links, sizeof (links));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+	    mtime, sizeof (mtime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+	    ctime, sizeof (ctime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &dzp->z_pflags, sizeof (dzp->z_pflags));
+	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+	ASSERT(error == 0);
+	mutex_exit(&dzp->z_lock);
+
+	return (0);
+}
+
+/*
+ * The match type in the code for this function should conform to:
+ *
+ * ------------------------------------------------------------------------
+ * fs type  | z_norm      | lookup type | match type
+ * ---------|-------------|-------------|----------------------------------
+ * CS !norm | 0           |           0 | 0 (exact)
+ * CS  norm | formX       |           0 | MT_NORMALIZE
+ * CI !norm | upper       |   !ZCIEXACT | MT_NORMALIZE
+ * CI !norm | upper       |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CI  norm | upper|formX |   !ZCIEXACT | MT_NORMALIZE
+ * CI  norm | upper|formX |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper       |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper       |     ZCILOOK | MT_NORMALIZE
+ * CM  norm | upper|formX |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM  norm | upper|formX |     ZCILOOK | MT_NORMALIZE
+ *
+ * Abbreviations:
+ *    CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
+ *    upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
+ *    formX = unicode normalization form set on fs creation
+ */
+static int
+zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
+    int flag)
+{
+	int error;
+
+	if (ZTOZSB(zp)->z_norm) {
+		matchtype_t mt = MT_NORMALIZE;
+
+		if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE &&
+		    (flag & ZCIEXACT)) ||
+		    (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED &&
+		    !(flag & ZCILOOK))) {
+			mt |= MT_MATCH_CASE;
+		}
+
+		error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id,
+		    dl->dl_name, mt, tx);
+	} else {
+		error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name,
+		    tx);
+	}
+
+	return (error);
+}
+
+/*
+ * Unlink zp from dl, and mark zp for deletion if this was the last link. Can
+ * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY).
+ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
+ * If it's non-NULL, we use it to indicate whether the znode needs deletion,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
+    boolean_t *unlinkedp)
+{
+	znode_t *dzp = dl->dl_dzp;
+	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+	int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
+	boolean_t unlinked = B_FALSE;
+	sa_bulk_attr_t bulk[5];
+	uint64_t mtime[2], ctime[2];
+	uint64_t links;
+	int count = 0;
+	int error;
+
+	if (!(flag & ZRENAMING)) {
+		mutex_enter(&zp->z_lock);
+
+		if (zp_is_dir && !zfs_dirempty(zp)) {
+			mutex_exit(&zp->z_lock);
+			return (SET_ERROR(ENOTEMPTY));
+		}
+
+		/*
+		 * If we get here, we are going to try to remove the object.
+		 * First try removing the name from the directory; if that
+		 * fails, return the error.
+		 */
+		error = zfs_dropname(dl, zp, dzp, tx, flag);
+		if (error != 0) {
+			mutex_exit(&zp->z_lock);
+			return (error);
+		}
+
+		if (ZTOI(zp)->i_nlink <= zp_is_dir) {
+			zfs_panic_recover("zfs: link count on %lu is %u, "
+			    "should be at least %u", zp->z_id,
+			    (int)ZTOI(zp)->i_nlink, zp_is_dir + 1);
+			set_nlink(ZTOI(zp), zp_is_dir + 1);
+		}
+		drop_nlink(ZTOI(zp));
+		if (ZTOI(zp)->i_nlink == zp_is_dir) {
+			zp->z_unlinked = B_TRUE;
+			clear_nlink(ZTOI(zp));
+			unlinked = B_TRUE;
+		} else {
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+			    NULL, &ctime, sizeof (ctime));
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+			    NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+			zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+			    ctime);
+		}
+		links = ZTOI(zp)->i_nlink;
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+		    NULL, &links, sizeof (links));
+		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		count = 0;
+		ASSERT(error == 0);
+		mutex_exit(&zp->z_lock);
+	} else {
+		error = zfs_dropname(dl, zp, dzp, tx, flag);
+		if (error != 0)
+			return (error);
+	}
+
+	mutex_enter(&dzp->z_lock);
+	dzp->z_size--;		/* one dirent removed */
+	if (zp_is_dir)
+		drop_nlink(ZTOI(dzp));	/* ".." link from zp */
+	links = ZTOI(dzp)->i_nlink;
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+	    NULL, &links, sizeof (links));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+	    NULL, &dzp->z_size, sizeof (dzp->z_size));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+	    NULL, ctime, sizeof (ctime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+	    NULL, mtime, sizeof (mtime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+	ASSERT(error == 0);
+	mutex_exit(&dzp->z_lock);
+
+	if (unlinkedp != NULL)
+		*unlinkedp = unlinked;
+	else if (unlinked)
+		zfs_unlinked_add(zp, tx);
+
+	return (0);
+}
+
+/*
+ * Indicate whether the directory is empty.  Works with or without z_lock
+ * held, but can only be consider a hint in the latter case.  Returns true
+ * if only "." and ".." remain and there's no work in progress.
+ *
+ * The internal ZAP size, rather than zp->z_size, needs to be checked since
+ * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+	uint64_t count;
+	int error;
+
+	if (dzp->z_dirlocks != NULL)
+		return (B_FALSE);
+
+	error = zap_count(zfsvfs->z_os, dzp->z_id, &count);
+	if (error != 0 || count != 0)
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	znode_t *xzp;
+	dmu_tx_t *tx;
+	int error;
+	zfs_acl_ids_t acl_ids;
+	boolean_t fuid_dirtied;
+#ifdef ZFS_DEBUG
+	uint64_t parent;
+#endif
+
+	*xzpp = NULL;
+
+	if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)))
+		return (error);
+
+	if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
+	    &acl_ids)) != 0)
+		return (error);
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) {
+		zfs_acl_ids_free(&acl_ids);
+		return (SET_ERROR(EDQUOT));
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		return (error);
+	}
+	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+#ifdef ZFS_DEBUG
+	error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (parent));
+	ASSERT(error == 0 && parent == zp->z_id);
+#endif
+
+	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+	    sizeof (xzp->z_id), tx));
+
+	if (!zp->z_unlinked)
+		(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
+		    xzp, "", NULL, acl_ids.z_fuidp, vap);
+
+	zfs_acl_ids_free(&acl_ids);
+	dmu_tx_commit(tx);
+
+	*xzpp = xzp;
+
+	return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ *	IN:	zp	- znode to obtain attribute directory from
+ *		cr	- credentials of caller
+ *		flags	- flags from the VOP_LOOKUP call
+ *
+ *	OUT:	xipp	- pointer to extended attribute znode
+ *
+ *	RETURN:	0 on success
+ *		error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)
+{
+	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
+	znode_t		*xzp;
+	zfs_dirlock_t	*dl;
+	vattr_t		va;
+	int		error;
+top:
+	error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
+	if (error)
+		return (error);
+
+	if (xzp != NULL) {
+		*xzpp = xzp;
+		zfs_dirent_unlock(dl);
+		return (0);
+	}
+
+	if (!(flags & CREATE_XATTR_DIR)) {
+		zfs_dirent_unlock(dl);
+		return (SET_ERROR(ENOENT));
+	}
+
+	if (zfs_is_readonly(zfsvfs)) {
+		zfs_dirent_unlock(dl);
+		return (SET_ERROR(EROFS));
+	}
+
+	/*
+	 * The ability to 'create' files in an attribute
+	 * directory comes from the write_xattr permission on the base file.
+	 *
+	 * The ability to 'search' an attribute directory requires
+	 * read_xattr permission on the base file.
+	 *
+	 * Once in a directory the ability to read/write attributes
+	 * is controlled by the permissions on the attribute file.
+	 */
+	va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID;
+	va.va_mode = S_IFDIR | S_ISVTX | 0777;
+	zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
+
+	va.va_dentry = NULL;
+	error = zfs_make_xattrdir(zp, &va, xzpp, cr);
+	zfs_dirent_unlock(dl);
+
+	if (error == ERESTART) {
+		/* NB: we already did dmu_tx_wait() if necessary */
+		goto top;
+	}
+
+	return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ *	you own the directory,
+ *	you own the entry,
+ *	you have write access to the entry,
+ *	or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+	uid_t		uid;
+	uid_t		downer;
+	uid_t		fowner;
+	zfsvfs_t	*zfsvfs = ZTOZSB(zdp);
+
+	if (zfsvfs->z_replay)
+		return (0);
+
+	if ((zdp->z_mode & S_ISVTX) == 0)
+		return (0);
+
+	downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid),
+	    cr, ZFS_OWNER);
+	fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid),
+	    cr, ZFS_OWNER);
+
+	if ((uid = crgetuid(cr)) == downer || uid == fowner ||
+	    zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)
+		return (0);
+	else
+		return (secpolicy_vnode_remove(cr));
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
new file mode 100644
index 000000000000..99c6ffc95940
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
@@ -0,0 +1,440 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_file.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <linux/falloc.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#ifdef HAVE_FDTABLE_HEADER
+#include <linux/fdtable.h>
+#endif
+
+/*
+ * Open file
+ *
+ * path - fully qualified path to file
+ * flags - file attributes O_READ / O_WRITE / O_EXCL
+ * fpp - pointer to return file pointer
+ *
+ * Returns 0 on success underlying error on failure.
+ */
+int
+zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
+{
+	struct file *filp;
+	int saved_umask;
+
+	if (!(flags & O_CREAT) && (flags & O_WRONLY))
+		flags |= O_EXCL;
+
+	if (flags & O_CREAT)
+		saved_umask = xchg(&current->fs->umask, 0);
+
+	filp = filp_open(path, flags, mode);
+
+	if (flags & O_CREAT)
+		(void) xchg(&current->fs->umask, saved_umask);
+
+	if (IS_ERR(filp))
+		return (-PTR_ERR(filp));
+
+	*fpp = filp;
+	return (0);
+}
+
+void
+zfs_file_close(zfs_file_t *fp)
+{
+	filp_close(fp, 0);
+}
+
+static ssize_t
+zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *off)
+{
+#if defined(HAVE_KERNEL_WRITE_PPOS)
+	return (kernel_write(fp, buf, count, off));
+#else
+	mm_segment_t saved_fs;
+	ssize_t rc;
+
+	saved_fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	rc = vfs_write(fp, (__force const char __user __user *)buf, count, off);
+
+	set_fs(saved_fs);
+
+	return (rc);
+#endif
+}
+
+/*
+ * Stateful write - use os internal file pointer to determine where to
+ * write and update on successful completion.
+ *
+ * fp -  pointer to file (pipe, socket, etc) to write to
+ * buf - buffer to write
+ * count - # of bytes to write
+ * resid -  pointer to count of unwritten bytes  (if short write)
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
+{
+	loff_t off = fp->f_pos;
+	ssize_t rc;
+
+	rc = zfs_file_write_impl(fp, buf, count, &off);
+	if (rc < 0)
+		return (-rc);
+
+	fp->f_pos = off;
+
+	if (resid) {
+		*resid = count - rc;
+	} else if (rc != count) {
+		return (EIO);
+	}
+
+	return (0);
+}
+
+/*
+ * Stateless write - os internal file pointer is not updated.
+ *
+ * fp -  pointer to file (pipe, socket, etc) to write to
+ * buf - buffer to write
+ * count - # of bytes to write
+ * off - file offset to write to (only valid for seekable types)
+ * resid -  pointer to count of unwritten bytes
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
+    ssize_t *resid)
+{
+	ssize_t rc;
+
+	rc  = zfs_file_write_impl(fp, buf, count, &off);
+	if (rc < 0)
+		return (-rc);
+
+	if (resid) {
+		*resid = count - rc;
+	} else if (rc != count) {
+		return (EIO);
+	}
+
+	return (0);
+}
+
+static ssize_t
+zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *off)
+{
+#if defined(HAVE_KERNEL_READ_PPOS)
+	return (kernel_read(fp, buf, count, off));
+#else
+	mm_segment_t saved_fs;
+	ssize_t rc;
+
+	saved_fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	rc = vfs_read(fp, (void __user *)buf, count, off);
+	set_fs(saved_fs);
+
+	return (rc);
+#endif
+}
+
+/*
+ * Stateful read - use os internal file pointer to determine where to
+ * read and update on successful completion.
+ *
+ * fp -  pointer to file (pipe, socket, etc) to read from
+ * buf - buffer to write
+ * count - # of bytes to read
+ * resid -  pointer to count of unread bytes (if short read)
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
+{
+	loff_t off = fp->f_pos;
+	ssize_t rc;
+
+	rc = zfs_file_read_impl(fp, buf, count, &off);
+	if (rc < 0)
+		return (-rc);
+
+	fp->f_pos = off;
+
+	if (resid) {
+		*resid = count - rc;
+	} else if (rc != count) {
+		return (EIO);
+	}
+
+	return (0);
+}
+
+/*
+ * Stateless read - os internal file pointer is not updated.
+ *
+ * fp -  pointer to file (pipe, socket, etc) to read from
+ * buf - buffer to write
+ * count - # of bytes to write
+ * off - file offset to read from (only valid for seekable types)
+ * resid -  pointer to count of unwritten bytes (if short write)
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
+    ssize_t *resid)
+{
+	ssize_t rc;
+
+	rc = zfs_file_read_impl(fp, buf, count, &off);
+	if (rc < 0)
+		return (-rc);
+
+	if (resid) {
+		*resid = count - rc;
+	} else if (rc != count) {
+		return (EIO);
+	}
+
+	return (0);
+}
+
+/*
+ * lseek - set / get file pointer
+ *
+ * fp -  pointer to file (pipe, socket, etc) to read from
+ * offp - value to seek to, returns current value plus passed offset
+ * whence - see man pages for standard lseek whence values
+ *
+ * Returns 0 on success errno on failure (ESPIPE for non seekable types)
+ */
+int
+zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
+{
+	loff_t rc;
+
+	if (*offp < 0 || *offp > MAXOFFSET_T)
+		return (EINVAL);
+
+	rc = vfs_llseek(fp, *offp, whence);
+	if (rc < 0)
+		return (-rc);
+
+	*offp = rc;
+
+	return (0);
+}
+
+/*
+ * Get file attributes
+ *
+ * filp - file pointer
+ * zfattr - pointer to file attr structure
+ *
+ * Currently only used for fetching size and file mode.
+ *
+ * Returns 0 on success or error code of underlying getattr call on failure.
+ */
+int
+zfs_file_getattr(zfs_file_t *filp, zfs_file_attr_t *zfattr)
+{
+	struct kstat stat;
+	int rc;
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+	rc = vfs_getattr(&filp->f_path, &stat, STATX_BASIC_STATS,
+	    AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+	rc = vfs_getattr(&filp->f_path, &stat);
+#else
+	rc = vfs_getattr(filp->f_path.mnt, filp->f_dentry, &stat);
+#endif
+	if (rc)
+		return (-rc);
+
+	zfattr->zfa_size = stat.size;
+	zfattr->zfa_mode = stat.mode;
+
+	return (0);
+}
+
+/*
+ * Sync file to disk
+ *
+ * filp - file pointer
+ * flags - O_SYNC and or O_DSYNC
+ *
+ * Returns 0 on success or error code of underlying sync call on failure.
+ */
+int
+zfs_file_fsync(zfs_file_t *filp, int flags)
+{
+	int datasync = 0;
+	int error;
+	int fstrans;
+
+	if (flags & O_DSYNC)
+		datasync = 1;
+
+	/*
+	 * May enter XFS which generates a warning when PF_FSTRANS is set.
+	 * To avoid this the flag is cleared over vfs_sync() and then reset.
+	 */
+	fstrans = __spl_pf_fstrans_check();
+	if (fstrans)
+		current->flags &= ~(__SPL_PF_FSTRANS);
+
+	error = -vfs_fsync(filp, datasync);
+
+	if (fstrans)
+		current->flags |= __SPL_PF_FSTRANS;
+
+	return (error);
+}
+
+/*
+ * fallocate - allocate or free space on disk
+ *
+ * fp - file pointer
+ * mode (non-standard options for hole punching etc)
+ * offset - offset to start allocating or freeing from
+ * len - length to free / allocate
+ *
+ * OPTIONAL
+ */
+int
+zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len)
+{
+	/*
+	 * May enter XFS which generates a warning when PF_FSTRANS is set.
+	 * To avoid this the flag is cleared over vfs_sync() and then reset.
+	 */
+	int fstrans = __spl_pf_fstrans_check();
+	if (fstrans)
+		current->flags &= ~(__SPL_PF_FSTRANS);
+
+	/*
+	 * When supported by the underlying file system preferentially
+	 * use the fallocate() callback to preallocate the space.
+	 */
+	int error = EOPNOTSUPP;
+	if (fp->f_op->fallocate)
+		error = fp->f_op->fallocate(fp, mode, offset, len);
+
+	if (fstrans)
+		current->flags |= __SPL_PF_FSTRANS;
+
+	return (error);
+}
+
+/*
+ * Request current file pointer offset
+ *
+ * fp - pointer to file
+ *
+ * Returns current file offset.
+ */
+loff_t
+zfs_file_off(zfs_file_t *fp)
+{
+	return (fp->f_pos);
+}
+
+/*
+ * Request file pointer private data
+ *
+ * fp - pointer to file
+ *
+ * Returns pointer to file private data.
+ */
+void *
+zfs_file_private(zfs_file_t *fp)
+{
+	return (fp->private_data);
+}
+
+/*
+ * unlink file
+ *
+ * path - fully qualified file path
+ *
+ * Returns 0 on success.
+ *
+ * OPTIONAL
+ */
+int
+zfs_file_unlink(const char *path)
+{
+	return (EOPNOTSUPP);
+}
+
+/*
+ * Get reference to file pointer
+ *
+ * fd - input file descriptor
+ * fpp - pointer to file pointer
+ *
+ * Returns 0 on success EBADF on failure.
+ */
+int
+zfs_file_get(int fd, zfs_file_t **fpp)
+{
+	zfs_file_t *fp;
+
+	fp = fget(fd);
+	if (fp == NULL)
+		return (EBADF);
+
+	*fpp = fp;
+
+	return (0);
+}
+
+/*
+ * Drop reference to file pointer
+ *
+ * fd - input file descriptor
+ */
+void
+zfs_file_put(int fd)
+{
+	struct file *fp;
+
+	if ((fp = fget(fd)) != NULL) {
+		fput(fp);
+		fput(fp);
+	}
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
new file mode 100644
index 000000000000..b88e0497d000
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
@@ -0,0 +1,329 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright 2011 Martin Matuska
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Portions Copyright 2012 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright 2017 RackTop Systems.
+ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ * Copyright (c) 2019 Datto Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/nvpair.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/fm/util.h>
+#include <sys/dsl_crypt.h>
+
+#include <sys/zfs_ioctl_impl.h>
+
+#include <sys/zfs_sysfs.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+
+boolean_t
+zfs_vfs_held(zfsvfs_t *zfsvfs)
+{
+	return (zfsvfs->z_sb != NULL);
+}
+
+int
+zfs_vfs_ref(zfsvfs_t **zfvp)
+{
+	if (*zfvp == NULL || (*zfvp)->z_sb == NULL ||
+	    !atomic_inc_not_zero(&((*zfvp)->z_sb->s_active))) {
+		return (SET_ERROR(ESRCH));
+	}
+	return (0);
+}
+
+void
+zfs_vfs_rele(zfsvfs_t *zfsvfs)
+{
+	deactivate_super(zfsvfs->z_sb);
+}
+
+static int
+zfsdev_state_init(struct file *filp)
+{
+	zfsdev_state_t *zs, *zsprev = NULL;
+	minor_t minor;
+	boolean_t newzs = B_FALSE;
+
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+	minor = zfsdev_minor_alloc();
+	if (minor == 0)
+		return (SET_ERROR(ENXIO));
+
+	for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+		if (zs->zs_minor == -1)
+			break;
+		zsprev = zs;
+	}
+
+	if (!zs) {
+		zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP);
+		newzs = B_TRUE;
+	}
+
+	filp->private_data = zs;
+
+	zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit);
+	zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent);
+
+	/*
+	 * In order to provide for lock-free concurrent read access
+	 * to the minor list in zfsdev_get_state_impl(), new entries
+	 * must be completely written before linking them into the
+	 * list whereas existing entries are already linked; the last
+	 * operation must be updating zs_minor (from -1 to the new
+	 * value).
+	 */
+	if (newzs) {
+		zs->zs_minor = minor;
+		smp_wmb();
+		zsprev->zs_next = zs;
+	} else {
+		smp_wmb();
+		zs->zs_minor = minor;
+	}
+
+	return (0);
+}
+
+static int
+zfsdev_state_destroy(struct file *filp)
+{
+	zfsdev_state_t *zs;
+
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+	ASSERT(filp->private_data != NULL);
+
+	zs = filp->private_data;
+	zs->zs_minor = -1;
+	zfs_onexit_destroy(zs->zs_onexit);
+	zfs_zevent_destroy(zs->zs_zevent);
+	zs->zs_onexit = NULL;
+	zs->zs_zevent = NULL;
+
+	return (0);
+}
+
+static int
+zfsdev_open(struct inode *ino, struct file *filp)
+{
+	int error;
+
+	mutex_enter(&zfsdev_state_lock);
+	error = zfsdev_state_init(filp);
+	mutex_exit(&zfsdev_state_lock);
+
+	return (-error);
+}
+
+static int
+zfsdev_release(struct inode *ino, struct file *filp)
+{
+	int error;
+
+	mutex_enter(&zfsdev_state_lock);
+	error = zfsdev_state_destroy(filp);
+	mutex_exit(&zfsdev_state_lock);
+
+	return (-error);
+}
+
+static long
+zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
+{
+	uint_t vecnum;
+	zfs_cmd_t *zc;
+	int error, rc;
+
+	vecnum = cmd - ZFS_IOC_FIRST;
+
+	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+
+	if (ddi_copyin((void *)(uintptr_t)arg, zc, sizeof (zfs_cmd_t), 0)) {
+		error = -SET_ERROR(EFAULT);
+		goto out;
+	}
+	error = -zfsdev_ioctl_common(vecnum, zc, 0);
+	rc = ddi_copyout(zc, (void *)(uintptr_t)arg, sizeof (zfs_cmd_t), 0);
+	if (error == 0 && rc != 0)
+		error = -SET_ERROR(EFAULT);
+out:
+	kmem_free(zc, sizeof (zfs_cmd_t));
+	return (error);
+
+}
+
+uint64_t
+zfs_max_nvlist_src_size_os(void)
+{
+	if (zfs_max_nvlist_src_size != 0)
+		return (zfs_max_nvlist_src_size);
+
+	return (KMALLOC_MAX_SIZE);
+}
+
+void
+zfs_ioctl_init_os(void)
+{
+}
+
+#ifdef CONFIG_COMPAT
+static long
+zfsdev_compat_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
+{
+	return (zfsdev_ioctl(filp, cmd, arg));
+}
+#else
+#define	zfsdev_compat_ioctl	NULL
+#endif
+
+static const struct file_operations zfsdev_fops = {
+	.open		= zfsdev_open,
+	.release	= zfsdev_release,
+	.unlocked_ioctl	= zfsdev_ioctl,
+	.compat_ioctl	= zfsdev_compat_ioctl,
+	.owner		= THIS_MODULE,
+};
+
+static struct miscdevice zfs_misc = {
+	.minor		= ZFS_DEVICE_MINOR,
+	.name		= ZFS_DRIVER,
+	.fops		= &zfsdev_fops,
+};
+
+MODULE_ALIAS_MISCDEV(ZFS_DEVICE_MINOR);
+MODULE_ALIAS("devname:zfs");
+
+int
+zfsdev_attach(void)
+{
+	int error;
+
+	error = misc_register(&zfs_misc);
+	if (error == -EBUSY) {
+		/*
+		 * Fallback to dynamic minor allocation in the event of a
+		 * collision with a reserved minor in linux/miscdevice.h.
+		 * In this case the kernel modules must be manually loaded.
+		 */
+		printk(KERN_INFO "ZFS: misc_register() with static minor %d "
+		    "failed %d, retrying with MISC_DYNAMIC_MINOR\n",
+		    ZFS_DEVICE_MINOR, error);
+
+		zfs_misc.minor = MISC_DYNAMIC_MINOR;
+		error = misc_register(&zfs_misc);
+	}
+
+	if (error)
+		printk(KERN_INFO "ZFS: misc_register() failed %d\n", error);
+
+	return (error);
+}
+
+void
+zfsdev_detach(void)
+{
+	misc_deregister(&zfs_misc);
+}
+
+#ifdef ZFS_DEBUG
+#define	ZFS_DEBUG_STR	" (DEBUG mode)"
+#else
+#define	ZFS_DEBUG_STR	""
+#endif
+
+static int __init
+_init(void)
+{
+	int error;
+
+	if ((error = zfs_kmod_init()) != 0) {
+		printk(KERN_NOTICE "ZFS: Failed to Load ZFS Filesystem v%s-%s%s"
+		    ", rc = %d\n", ZFS_META_VERSION, ZFS_META_RELEASE,
+		    ZFS_DEBUG_STR, error);
+
+		return (-error);
+	}
+
+	zfs_sysfs_init();
+
+	printk(KERN_NOTICE "ZFS: Loaded module v%s-%s%s, "
+	    "ZFS pool version %s, ZFS filesystem version %s\n",
+	    ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR,
+	    SPA_VERSION_STRING, ZPL_VERSION_STRING);
+#ifndef CONFIG_FS_POSIX_ACL
+	printk(KERN_NOTICE "ZFS: Posix ACLs disabled by kernel\n");
+#endif /* CONFIG_FS_POSIX_ACL */
+
+	return (0);
+}
+
+static void __exit
+_fini(void)
+{
+	zfs_sysfs_fini();
+	zfs_kmod_fini();
+
+	printk(KERN_NOTICE "ZFS: Unloaded module v%s-%s%s\n",
+	    ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR);
+}
+
+#if defined(_KERNEL)
+module_init(_init);
+module_exit(_fini);
+#endif
+
+ZFS_MODULE_DESCRIPTION("ZFS");
+ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
+ZFS_MODULE_LICENSE(ZFS_META_LICENSE);
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
new file mode 100644
index 000000000000..fb7c68987360
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
@@ -0,0 +1,662 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_sysfs.h>
+#include <sys/kmem.h>
+#include <sys/fs/zfs.h>
+#include <linux/kobject.h>
+
+#include "zfs_prop.h"
+
+#if !defined(_KERNEL)
+#error kernel builds only
+#endif
+
+/*
+ * ZFS Module sysfs support
+ *
+ * This extends our sysfs '/sys/module/zfs' entry to include feature
+ * and property attributes. The primary consumer of this information
+ * is user processes, like the zfs CLI, that need to know what the
+ * current loaded ZFS module supports. The libzfs binary will consult
+ * this information when instantiating the zfs|zpool property tables
+ * and the pool features table.
+ *
+ * The added top-level directories are:
+ * /sys/module/zfs
+ *		├── features.kernel
+ *		├── features.pool
+ *		├── properties.dataset
+ *		└── properties.pool
+ *
+ * The local interface for the zfs kobjects includes:
+ *	zfs_kobj_init()
+ *	zfs_kobj_add()
+ *	zfs_kobj_release()
+ *	zfs_kobj_add_attr()
+ *	zfs_kobj_fini()
+ */
+
+/*
+ * A zfs_mod_kobj_t represents a zfs kobject under '/sys/module/zfs'
+ */
+struct zfs_mod_kobj;
+typedef struct zfs_mod_kobj zfs_mod_kobj_t;
+
+struct zfs_mod_kobj {
+	struct kobject		zko_kobj;
+	struct kobj_type	zko_kobj_type;
+	struct sysfs_ops	zko_sysfs_ops;
+	size_t			zko_attr_count;
+	struct attribute	*zko_attr_list;		/* allocated */
+	struct attribute	**zko_default_attrs;	/* allocated */
+	size_t			zko_child_count;
+	zfs_mod_kobj_t		*zko_children;		/* allocated */
+};
+
+#define	ATTR_TABLE_SIZE(cnt)	(sizeof (struct attribute) * (cnt))
+/* Note +1 for NULL terminator slot */
+#define	DEFAULT_ATTR_SIZE(cnt)	(sizeof (struct attribute *) * (cnt + 1))
+#define	CHILD_TABLE_SIZE(cnt)	(sizeof (zfs_mod_kobj_t) * (cnt))
+
+/*
+ * These are the top-level kobjects under '/sys/module/zfs/'
+ */
+static zfs_mod_kobj_t kernel_features_kobj;
+static zfs_mod_kobj_t pool_features_kobj;
+static zfs_mod_kobj_t dataset_props_kobj;
+static zfs_mod_kobj_t pool_props_kobj;
+
+/*
+ * The show function is used to provide the content
+ * of an attribute into a PAGE_SIZE buffer.
+ */
+typedef ssize_t	(*sysfs_show_func)(struct kobject *, struct attribute *,
+    char *);
+
+static void
+zfs_kobj_fini(zfs_mod_kobj_t *zkobj)
+{
+	/* finalize any child kobjects */
+	if (zkobj->zko_child_count != 0) {
+		ASSERT(zkobj->zko_children);
+		for (int i = 0; i < zkobj->zko_child_count; i++)
+			zfs_kobj_fini(&zkobj->zko_children[i]);
+	}
+
+	/* kobject_put() will call zfs_kobj_release() to release memory */
+	kobject_del(&zkobj->zko_kobj);
+	kobject_put(&zkobj->zko_kobj);
+}
+
+static void
+zfs_kobj_release(struct kobject *kobj)
+{
+	zfs_mod_kobj_t *zkobj = container_of(kobj, zfs_mod_kobj_t, zko_kobj);
+
+	if (zkobj->zko_attr_list != NULL) {
+		ASSERT3S(zkobj->zko_attr_count, !=, 0);
+		kmem_free(zkobj->zko_attr_list,
+		    ATTR_TABLE_SIZE(zkobj->zko_attr_count));
+		zkobj->zko_attr_list = NULL;
+	}
+
+	if (zkobj->zko_default_attrs != NULL) {
+		kmem_free(zkobj->zko_default_attrs,
+		    DEFAULT_ATTR_SIZE(zkobj->zko_attr_count));
+		zkobj->zko_default_attrs = NULL;
+	}
+
+	if (zkobj->zko_child_count != 0) {
+		ASSERT(zkobj->zko_children);
+
+		kmem_free(zkobj->zko_children,
+		    CHILD_TABLE_SIZE(zkobj->zko_child_count));
+		zkobj->zko_child_count = 0;
+		zkobj->zko_children = NULL;
+	}
+
+	zkobj->zko_attr_count = 0;
+}
+
+#ifndef sysfs_attr_init
+#define	sysfs_attr_init(attr) do {} while (0)
+#endif
+
+static void
+zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name)
+{
+	VERIFY3U(attr_num, <, zkobj->zko_attr_count);
+	ASSERT(zkobj->zko_attr_list);
+	ASSERT(zkobj->zko_default_attrs);
+
+	zkobj->zko_attr_list[attr_num].name = attr_name;
+	zkobj->zko_attr_list[attr_num].mode = 0444;
+	zkobj->zko_default_attrs[attr_num] = &zkobj->zko_attr_list[attr_num];
+	sysfs_attr_init(&zkobj->zko_attr_list[attr_num]);
+}
+
+static int
+zfs_kobj_init(zfs_mod_kobj_t *zkobj, int attr_cnt, int child_cnt,
+    sysfs_show_func show_func)
+{
+	/*
+	 * Initialize object's attributes. Count can be zero.
+	 */
+	if (attr_cnt > 0) {
+		zkobj->zko_attr_list = kmem_zalloc(ATTR_TABLE_SIZE(attr_cnt),
+		    KM_SLEEP);
+		if (zkobj->zko_attr_list == NULL)
+			return (ENOMEM);
+	}
+	/* this will always have at least one slot for NULL termination */
+	zkobj->zko_default_attrs = kmem_zalloc(DEFAULT_ATTR_SIZE(attr_cnt),
+	    KM_SLEEP);
+	if (zkobj->zko_default_attrs == NULL) {
+		if (zkobj->zko_attr_list != NULL) {
+			kmem_free(zkobj->zko_attr_list,
+			    ATTR_TABLE_SIZE(attr_cnt));
+		}
+		return (ENOMEM);
+	}
+	zkobj->zko_attr_count = attr_cnt;
+	zkobj->zko_kobj_type.default_attrs = zkobj->zko_default_attrs;
+
+	if (child_cnt > 0) {
+		zkobj->zko_children = kmem_zalloc(CHILD_TABLE_SIZE(child_cnt),
+		    KM_SLEEP);
+		if (zkobj->zko_children == NULL) {
+			if (zkobj->zko_default_attrs != NULL) {
+				kmem_free(zkobj->zko_default_attrs,
+				    DEFAULT_ATTR_SIZE(attr_cnt));
+			}
+			if (zkobj->zko_attr_list != NULL) {
+				kmem_free(zkobj->zko_attr_list,
+				    ATTR_TABLE_SIZE(attr_cnt));
+			}
+			return (ENOMEM);
+		}
+		zkobj->zko_child_count = child_cnt;
+	}
+
+	zkobj->zko_sysfs_ops.show = show_func;
+	zkobj->zko_kobj_type.sysfs_ops = &zkobj->zko_sysfs_ops;
+	zkobj->zko_kobj_type.release = zfs_kobj_release;
+
+	return (0);
+}
+
+static int
+zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name)
+{
+	/* zko_default_attrs must be NULL terminated */
+	ASSERT(zkobj->zko_default_attrs != NULL);
+	ASSERT(zkobj->zko_default_attrs[zkobj->zko_attr_count] == NULL);
+
+	kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type);
+	return (kobject_add(&zkobj->zko_kobj, parent, name));
+}
+
+/*
+ * Each zfs property has these common attributes
+ */
+static const char *zprop_attrs[]  = {
+	"type",
+	"readonly",
+	"setonce",
+	"visible",
+	"values",
+	"default",
+	"datasets"	/* zfs properties only */
+};
+
+#define	ZFS_PROP_ATTR_COUNT	ARRAY_SIZE(zprop_attrs)
+#define	ZPOOL_PROP_ATTR_COUNT	(ZFS_PROP_ATTR_COUNT - 1)
+
+static const char *zprop_types[]  = {
+	"number",
+	"string",
+	"index",
+};
+
+typedef struct zfs_type_map {
+	zfs_type_t	ztm_type;
+	const char	*ztm_name;
+} zfs_type_map_t;
+
+static zfs_type_map_t type_map[] = {
+	{ZFS_TYPE_FILESYSTEM,	"filesystem"},
+	{ZFS_TYPE_SNAPSHOT,	"snapshot"},
+	{ZFS_TYPE_VOLUME,	"volume"},
+	{ZFS_TYPE_BOOKMARK,	"bookmark"}
+};
+
+/*
+ * Show the content for a zfs property attribute
+ */
+static ssize_t
+zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property,
+    char *buf, size_t buflen)
+{
+	const char *show_str;
+	char number[32];
+
+	/* For dataset properties list the dataset types that apply */
+	if (strcmp(attr_name, "datasets") == 0 &&
+	    property->pd_types != ZFS_TYPE_POOL) {
+		int len = 0;
+
+		for (int i = 0; i < ARRAY_SIZE(type_map); i++) {
+			if (type_map[i].ztm_type & property->pd_types)  {
+				len += snprintf(buf + len, buflen - len, "%s ",
+				    type_map[i].ztm_name);
+			}
+		}
+		len += snprintf(buf + len, buflen - len, "\n");
+		return (len);
+	}
+
+	if (strcmp(attr_name, "type") == 0) {
+		show_str = zprop_types[property->pd_proptype];
+	} else if (strcmp(attr_name, "readonly") == 0) {
+		show_str = property->pd_attr == PROP_READONLY ? "1" : "0";
+	} else if (strcmp(attr_name, "setonce") == 0) {
+		show_str = property->pd_attr == PROP_ONETIME ? "1" : "0";
+	} else if (strcmp(attr_name, "visible") == 0) {
+		show_str = property->pd_visible ? "1" : "0";
+	} else if (strcmp(attr_name, "values") == 0) {
+		show_str = property->pd_values ? property->pd_values : "";
+	} else if (strcmp(attr_name, "default") == 0) {
+		switch (property->pd_proptype) {
+		case PROP_TYPE_NUMBER:
+			(void) snprintf(number, sizeof (number), "%llu",
+			    (u_longlong_t)property->pd_numdefault);
+			show_str = number;
+			break;
+		case PROP_TYPE_STRING:
+			show_str = property->pd_strdefault ?
+			    property->pd_strdefault : "";
+			break;
+		case PROP_TYPE_INDEX:
+			if (zprop_index_to_string(property->pd_propnum,
+			    property->pd_numdefault, &show_str,
+			    property->pd_types) != 0) {
+				show_str = "";
+			}
+			break;
+		default:
+			return (0);
+		}
+	} else {
+		return (0);
+	}
+
+	return (snprintf(buf, buflen, "%s\n", show_str));
+}
+
+static ssize_t
+dataset_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	zfs_prop_t prop = zfs_name_to_prop(kobject_name(kobj));
+	zprop_desc_t *prop_tbl = zfs_prop_get_table();
+	ssize_t len;
+
+	ASSERT3U(prop, <, ZFS_NUM_PROPS);
+
+	len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE);
+
+	return (len);
+}
+
+static ssize_t
+pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	zpool_prop_t prop = zpool_name_to_prop(kobject_name(kobj));
+	zprop_desc_t *prop_tbl = zpool_prop_get_table();
+	ssize_t len;
+
+	ASSERT3U(prop, <, ZPOOL_NUM_PROPS);
+
+	len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE);
+
+	return (len);
+}
+
+/*
+ * ZFS kernel feature attributes for '/sys/module/zfs/features.kernel'
+ *
+ * This list is intended for kernel features that don't have a pool feature
+ * association or that extend existing user kernel interfaces.
+ *
+ * A user process can easily check if the running zfs kernel module
+ * supports the new feature.
+ */
+static const char *zfs_kernel_features[] = {
+	/* --> Add new kernel features here */
+	"com.delphix:vdev_initialize",
+	"org.zfsonlinux:vdev_trim",
+	"org.openzfs:l2arc_persistent",
+};
+
+#define	KERNEL_FEATURE_COUNT	ARRAY_SIZE(zfs_kernel_features)
+
+static ssize_t
+kernel_feature_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	if (strcmp(attr->name, "supported") == 0)
+		return (snprintf(buf, PAGE_SIZE, "yes\n"));
+	return (0);
+}
+
+static void
+kernel_feature_to_kobj(zfs_mod_kobj_t *parent, int slot, const char *name)
+{
+	zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[slot];
+
+	ASSERT3U(slot, <, KERNEL_FEATURE_COUNT);
+	ASSERT(name);
+
+	int err = zfs_kobj_init(zfs_kobj, 1, 0, kernel_feature_show);
+	if (err)
+		return;
+
+	zfs_kobj_add_attr(zfs_kobj, 0, "supported");
+
+	err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
+	if (err)
+		zfs_kobj_release(&zfs_kobj->zko_kobj);
+}
+
+static int
+zfs_kernel_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent)
+{
+	/*
+	 * Create a parent kobject to host kernel features.
+	 *
+	 * '/sys/module/zfs/features.kernel'
+	 */
+	int err = zfs_kobj_init(zfs_kobj, 0, KERNEL_FEATURE_COUNT,
+	    kernel_feature_show);
+	if (err)
+		return (err);
+	err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_KERNEL_FEATURES);
+	if (err) {
+		zfs_kobj_release(&zfs_kobj->zko_kobj);
+		return (err);
+	}
+
+	/*
+	 * Now create a kobject for each feature.
+	 *
+	 * '/sys/module/zfs/features.kernel/<feature>'
+	 */
+	for (int f = 0; f < KERNEL_FEATURE_COUNT; f++)
+		kernel_feature_to_kobj(zfs_kobj, f, zfs_kernel_features[f]);
+
+	return (0);
+}
+
+/*
+ * Each pool feature has these common attributes
+ */
+static const char *pool_feature_attrs[]  = {
+	"description",
+	"guid",
+	"uname",
+	"readonly_compatible",
+	"required_for_mos",
+	"activate_on_enable",
+	"per_dataset"
+};
+
+#define	ZPOOL_FEATURE_ATTR_COUNT	ARRAY_SIZE(pool_feature_attrs)
+
+/*
+ * Show the content for the given zfs pool feature attribute
+ */
+static ssize_t
+pool_feature_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	spa_feature_t fid;
+
+	if (zfeature_lookup_guid(kobject_name(kobj), &fid) != 0)
+		return (0);
+
+	ASSERT3U(fid, <, SPA_FEATURES);
+
+	zfeature_flags_t flags = spa_feature_table[fid].fi_flags;
+	const char *show_str = NULL;
+
+	if (strcmp(attr->name, "description") == 0) {
+		show_str = spa_feature_table[fid].fi_desc;
+	} else if (strcmp(attr->name, "guid") == 0) {
+		show_str = spa_feature_table[fid].fi_guid;
+	} else if (strcmp(attr->name, "uname") == 0) {
+		show_str = spa_feature_table[fid].fi_uname;
+	} else if (strcmp(attr->name, "readonly_compatible") == 0) {
+		show_str = flags & ZFEATURE_FLAG_READONLY_COMPAT ? "1" : "0";
+	} else if (strcmp(attr->name, "required_for_mos") == 0) {
+		show_str = flags & ZFEATURE_FLAG_MOS ? "1" : "0";
+	} else if (strcmp(attr->name, "activate_on_enable") == 0) {
+		show_str = flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE ? "1" : "0";
+	} else if (strcmp(attr->name, "per_dataset") == 0) {
+		show_str = flags & ZFEATURE_FLAG_PER_DATASET ? "1" : "0";
+	}
+	if (show_str == NULL)
+		return (0);
+
+	return (snprintf(buf, PAGE_SIZE, "%s\n", show_str));
+}
+
+static void
+pool_feature_to_kobj(zfs_mod_kobj_t *parent, spa_feature_t fid,
+    const char *name)
+{
+	zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[fid];
+
+	ASSERT3U(fid, <, SPA_FEATURES);
+	ASSERT(name);
+
+	int err = zfs_kobj_init(zfs_kobj, ZPOOL_FEATURE_ATTR_COUNT, 0,
+	    pool_feature_show);
+	if (err)
+		return;
+
+	for (int i = 0; i < ZPOOL_FEATURE_ATTR_COUNT; i++)
+		zfs_kobj_add_attr(zfs_kobj, i, pool_feature_attrs[i]);
+
+	err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
+	if (err)
+		zfs_kobj_release(&zfs_kobj->zko_kobj);
+}
+
+static int
+zfs_pool_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent)
+{
+	/*
+	 * Create a parent kobject to host pool features.
+	 *
+	 * '/sys/module/zfs/features.pool'
+	 */
+	int err = zfs_kobj_init(zfs_kobj, 0, SPA_FEATURES, pool_feature_show);
+	if (err)
+		return (err);
+	err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_POOL_FEATURES);
+	if (err) {
+		zfs_kobj_release(&zfs_kobj->zko_kobj);
+		return (err);
+	}
+
+	/*
+	 * Now create a kobject for each feature.
+	 *
+	 * '/sys/module/zfs/features.pool/<feature>'
+	 */
+	for (spa_feature_t i = 0; i < SPA_FEATURES; i++)
+		pool_feature_to_kobj(zfs_kobj, i, spa_feature_table[i].fi_guid);
+
+	return (0);
+}
+
+typedef struct prop_to_kobj_arg {
+	zprop_desc_t	*p2k_table;
+	zfs_mod_kobj_t	*p2k_parent;
+	sysfs_show_func	p2k_show_func;
+	int		p2k_attr_count;
+} prop_to_kobj_arg_t;
+
+static int
+zprop_to_kobj(int prop, void *args)
+{
+	prop_to_kobj_arg_t *data = args;
+	zfs_mod_kobj_t *parent = data->p2k_parent;
+	zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[prop];
+	const char *name = data->p2k_table[prop].pd_name;
+	int err;
+
+	ASSERT(name);
+
+	err = zfs_kobj_init(zfs_kobj, data->p2k_attr_count, 0,
+	    data->p2k_show_func);
+	if (err)
+		return (ZPROP_CONT);
+
+	for (int i = 0; i < data->p2k_attr_count; i++)
+		zfs_kobj_add_attr(zfs_kobj, i, zprop_attrs[i]);
+
+	err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
+	if (err)
+		zfs_kobj_release(&zfs_kobj->zko_kobj);
+
+	return (ZPROP_CONT);
+}
+
+static int
+zfs_sysfs_properties_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent,
+    zfs_type_t type)
+{
+	prop_to_kobj_arg_t context;
+	const char *name;
+	int err;
+
+	/*
+	 * Create a parent kobject to host properties.
+	 *
+	 * '/sys/module/zfs/properties.<type>'
+	 */
+	if (type == ZFS_TYPE_POOL) {
+		name = ZFS_SYSFS_POOL_PROPERTIES;
+		context.p2k_table = zpool_prop_get_table();
+		context.p2k_attr_count = ZPOOL_PROP_ATTR_COUNT;
+		context.p2k_parent = zfs_kobj;
+		context.p2k_show_func = pool_property_show;
+		err = zfs_kobj_init(zfs_kobj, 0, ZPOOL_NUM_PROPS,
+		    pool_property_show);
+	} else {
+		name = ZFS_SYSFS_DATASET_PROPERTIES;
+		context.p2k_table = zfs_prop_get_table();
+		context.p2k_attr_count = ZFS_PROP_ATTR_COUNT;
+		context.p2k_parent = zfs_kobj;
+		context.p2k_show_func = dataset_property_show;
+		err = zfs_kobj_init(zfs_kobj, 0, ZFS_NUM_PROPS,
+		    dataset_property_show);
+	}
+
+	if (err)
+		return (err);
+
+	err = zfs_kobj_add(zfs_kobj, parent, name);
+	if (err) {
+		zfs_kobj_release(&zfs_kobj->zko_kobj);
+		return (err);
+	}
+
+	/*
+	 * Create a kobject for each property.
+	 *
+	 * '/sys/module/zfs/properties.<type>/<property>'
+	 */
+	(void) zprop_iter_common(zprop_to_kobj, &context, B_TRUE,
+	    B_FALSE, type);
+
+	return (err);
+}
+
+void
+zfs_sysfs_init(void)
+{
+	struct kobject *parent;
+#if defined(CONFIG_ZFS) && !defined(CONFIG_ZFS_MODULE)
+	parent = kobject_create_and_add("zfs", fs_kobj);
+#else
+	parent = &(((struct module *)(THIS_MODULE))->mkobj).kobj;
+#endif
+	int err;
+
+	if (parent == NULL)
+		return;
+
+	err = zfs_kernel_features_init(&kernel_features_kobj, parent);
+	if (err)
+		return;
+
+	err = zfs_pool_features_init(&pool_features_kobj, parent);
+	if (err) {
+		zfs_kobj_fini(&kernel_features_kobj);
+		return;
+	}
+
+	err = zfs_sysfs_properties_init(&pool_props_kobj, parent,
+	    ZFS_TYPE_POOL);
+	if (err) {
+		zfs_kobj_fini(&kernel_features_kobj);
+		zfs_kobj_fini(&pool_features_kobj);
+		return;
+	}
+
+	err = zfs_sysfs_properties_init(&dataset_props_kobj, parent,
+	    ZFS_TYPE_FILESYSTEM);
+	if (err) {
+		zfs_kobj_fini(&kernel_features_kobj);
+		zfs_kobj_fini(&pool_features_kobj);
+		zfs_kobj_fini(&pool_props_kobj);
+		return;
+	}
+}
+
+void
+zfs_sysfs_fini(void)
+{
+	/*
+	 * Remove top-level kobjects; each will remove any children kobjects
+	 */
+	zfs_kobj_fini(&kernel_features_kobj);
+	zfs_kobj_fini(&pool_features_kobj);
+	zfs_kobj_fini(&dataset_props_kobj);
+	zfs_kobj_fini(&pool_props_kobj);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
new file mode 100644
index 000000000000..db831bf54704
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -0,0 +1,2165 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/mntent.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/policy.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_quota.h>
+#include <sys/sunddi.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/spa_boot.h>
+#include <sys/objlist.h>
+#include <sys/zpl.h>
+#include <linux/vfs_compat.h>
+#include "zfs_comutil.h"
+
+enum {
+	TOKEN_RO,
+	TOKEN_RW,
+	TOKEN_SETUID,
+	TOKEN_NOSETUID,
+	TOKEN_EXEC,
+	TOKEN_NOEXEC,
+	TOKEN_DEVICES,
+	TOKEN_NODEVICES,
+	TOKEN_DIRXATTR,
+	TOKEN_SAXATTR,
+	TOKEN_XATTR,
+	TOKEN_NOXATTR,
+	TOKEN_ATIME,
+	TOKEN_NOATIME,
+	TOKEN_RELATIME,
+	TOKEN_NORELATIME,
+	TOKEN_NBMAND,
+	TOKEN_NONBMAND,
+	TOKEN_MNTPOINT,
+	TOKEN_LAST,
+};
+
+static const match_table_t zpl_tokens = {
+	{ TOKEN_RO,		MNTOPT_RO },
+	{ TOKEN_RW,		MNTOPT_RW },
+	{ TOKEN_SETUID,		MNTOPT_SETUID },
+	{ TOKEN_NOSETUID,	MNTOPT_NOSETUID },
+	{ TOKEN_EXEC,		MNTOPT_EXEC },
+	{ TOKEN_NOEXEC,		MNTOPT_NOEXEC },
+	{ TOKEN_DEVICES,	MNTOPT_DEVICES },
+	{ TOKEN_NODEVICES,	MNTOPT_NODEVICES },
+	{ TOKEN_DIRXATTR,	MNTOPT_DIRXATTR },
+	{ TOKEN_SAXATTR,	MNTOPT_SAXATTR },
+	{ TOKEN_XATTR,		MNTOPT_XATTR },
+	{ TOKEN_NOXATTR,	MNTOPT_NOXATTR },
+	{ TOKEN_ATIME,		MNTOPT_ATIME },
+	{ TOKEN_NOATIME,	MNTOPT_NOATIME },
+	{ TOKEN_RELATIME,	MNTOPT_RELATIME },
+	{ TOKEN_NORELATIME,	MNTOPT_NORELATIME },
+	{ TOKEN_NBMAND,		MNTOPT_NBMAND },
+	{ TOKEN_NONBMAND,	MNTOPT_NONBMAND },
+	{ TOKEN_MNTPOINT,	MNTOPT_MNTPOINT "=%s" },
+	{ TOKEN_LAST,		NULL },
+};
+
+static void
+zfsvfs_vfs_free(vfs_t *vfsp)
+{
+	if (vfsp != NULL) {
+		if (vfsp->vfs_mntpoint != NULL)
+			kmem_strfree(vfsp->vfs_mntpoint);
+
+		kmem_free(vfsp, sizeof (vfs_t));
+	}
+}
+
+static int
+zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp)
+{
+	switch (token) {
+	case TOKEN_RO:
+		vfsp->vfs_readonly = B_TRUE;
+		vfsp->vfs_do_readonly = B_TRUE;
+		break;
+	case TOKEN_RW:
+		vfsp->vfs_readonly = B_FALSE;
+		vfsp->vfs_do_readonly = B_TRUE;
+		break;
+	case TOKEN_SETUID:
+		vfsp->vfs_setuid = B_TRUE;
+		vfsp->vfs_do_setuid = B_TRUE;
+		break;
+	case TOKEN_NOSETUID:
+		vfsp->vfs_setuid = B_FALSE;
+		vfsp->vfs_do_setuid = B_TRUE;
+		break;
+	case TOKEN_EXEC:
+		vfsp->vfs_exec = B_TRUE;
+		vfsp->vfs_do_exec = B_TRUE;
+		break;
+	case TOKEN_NOEXEC:
+		vfsp->vfs_exec = B_FALSE;
+		vfsp->vfs_do_exec = B_TRUE;
+		break;
+	case TOKEN_DEVICES:
+		vfsp->vfs_devices = B_TRUE;
+		vfsp->vfs_do_devices = B_TRUE;
+		break;
+	case TOKEN_NODEVICES:
+		vfsp->vfs_devices = B_FALSE;
+		vfsp->vfs_do_devices = B_TRUE;
+		break;
+	case TOKEN_DIRXATTR:
+		vfsp->vfs_xattr = ZFS_XATTR_DIR;
+		vfsp->vfs_do_xattr = B_TRUE;
+		break;
+	case TOKEN_SAXATTR:
+		vfsp->vfs_xattr = ZFS_XATTR_SA;
+		vfsp->vfs_do_xattr = B_TRUE;
+		break;
+	case TOKEN_XATTR:
+		vfsp->vfs_xattr = ZFS_XATTR_DIR;
+		vfsp->vfs_do_xattr = B_TRUE;
+		break;
+	case TOKEN_NOXATTR:
+		vfsp->vfs_xattr = ZFS_XATTR_OFF;
+		vfsp->vfs_do_xattr = B_TRUE;
+		break;
+	case TOKEN_ATIME:
+		vfsp->vfs_atime = B_TRUE;
+		vfsp->vfs_do_atime = B_TRUE;
+		break;
+	case TOKEN_NOATIME:
+		vfsp->vfs_atime = B_FALSE;
+		vfsp->vfs_do_atime = B_TRUE;
+		break;
+	case TOKEN_RELATIME:
+		vfsp->vfs_relatime = B_TRUE;
+		vfsp->vfs_do_relatime = B_TRUE;
+		break;
+	case TOKEN_NORELATIME:
+		vfsp->vfs_relatime = B_FALSE;
+		vfsp->vfs_do_relatime = B_TRUE;
+		break;
+	case TOKEN_NBMAND:
+		vfsp->vfs_nbmand = B_TRUE;
+		vfsp->vfs_do_nbmand = B_TRUE;
+		break;
+	case TOKEN_NONBMAND:
+		vfsp->vfs_nbmand = B_FALSE;
+		vfsp->vfs_do_nbmand = B_TRUE;
+		break;
+	case TOKEN_MNTPOINT:
+		vfsp->vfs_mntpoint = match_strdup(&args[0]);
+		if (vfsp->vfs_mntpoint == NULL)
+			return (SET_ERROR(ENOMEM));
+
+		break;
+	default:
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * Parse the raw mntopts and return a vfs_t describing the options.
+ */
+static int
+zfsvfs_parse_options(char *mntopts, vfs_t **vfsp)
+{
+	vfs_t *tmp_vfsp;
+	int error;
+
+	tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP);
+
+	if (mntopts != NULL) {
+		substring_t args[MAX_OPT_ARGS];
+		char *tmp_mntopts, *p, *t;
+		int token;
+
+		tmp_mntopts = t = kmem_strdup(mntopts);
+		if (tmp_mntopts == NULL)
+			return (SET_ERROR(ENOMEM));
+
+		while ((p = strsep(&t, ",")) != NULL) {
+			if (!*p)
+				continue;
+
+			args[0].to = args[0].from = NULL;
+			token = match_token(p, zpl_tokens, args);
+			error = zfsvfs_parse_option(p, token, args, tmp_vfsp);
+			if (error) {
+				kmem_strfree(tmp_mntopts);
+				zfsvfs_vfs_free(tmp_vfsp);
+				return (error);
+			}
+		}
+
+		kmem_strfree(tmp_mntopts);
+	}
+
+	*vfsp = tmp_vfsp;
+
+	return (0);
+}
+
+boolean_t
+zfs_is_readonly(zfsvfs_t *zfsvfs)
+{
+	return (!!(zfsvfs->z_sb->s_flags & SB_RDONLY));
+}
+
+/*ARGSUSED*/
+int
+zfs_sync(struct super_block *sb, int wait, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = sb->s_fs_info;
+
+	/*
+	 * Semantically, the only requirement is that the sync be initiated.
+	 * The DMU syncs out txgs frequently, so there's nothing to do.
+	 */
+	if (!wait)
+		return (0);
+
+	if (zfsvfs != NULL) {
+		/*
+		 * Sync a specific filesystem.
+		 */
+		dsl_pool_t *dp;
+
+		ZFS_ENTER(zfsvfs);
+		dp = dmu_objset_pool(zfsvfs->z_os);
+
+		/*
+		 * If the system is shutting down, then skip any
+		 * filesystems which may exist on a suspended pool.
+		 */
+		if (spa_suspended(dp->dp_spa)) {
+			ZFS_EXIT(zfsvfs);
+			return (0);
+		}
+
+		if (zfsvfs->z_log != NULL)
+			zil_commit(zfsvfs->z_log, 0);
+
+		ZFS_EXIT(zfsvfs);
+	} else {
+		/*
+		 * Sync all ZFS filesystems.  This is what happens when you
+		 * run sync(1M).  Unlike other filesystems, ZFS honors the
+		 * request by waiting for all pools to commit all dirty data.
+		 */
+		spa_sync_allpools();
+	}
+
+	return (0);
+}
+
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+	struct super_block *sb = zfsvfs->z_sb;
+
+	if (sb == NULL)
+		return;
+	/*
+	 * Update SB_NOATIME bit in VFS super block.  Since atime update is
+	 * determined by atime_needs_update(), atime_needs_update() needs to
+	 * return false if atime is turned off, and not unconditionally return
+	 * false if atime is turned on.
+	 */
+	if (newval)
+		sb->s_flags &= ~SB_NOATIME;
+	else
+		sb->s_flags |= SB_NOATIME;
+}
+
+static void
+relatime_changed_cb(void *arg, uint64_t newval)
+{
+	((zfsvfs_t *)arg)->z_relatime = newval;
+}
+
+static void
+xattr_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == ZFS_XATTR_OFF) {
+		zfsvfs->z_flags &= ~ZSB_XATTR;
+	} else {
+		zfsvfs->z_flags |= ZSB_XATTR;
+
+		if (newval == ZFS_XATTR_SA)
+			zfsvfs->z_xattr_sa = B_TRUE;
+		else
+			zfsvfs->z_xattr_sa = B_FALSE;
+	}
+}
+
+static void
+acltype_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	switch (newval) {
+	case ZFS_ACLTYPE_OFF:
+		zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF;
+		zfsvfs->z_sb->s_flags &= ~SB_POSIXACL;
+		break;
+	case ZFS_ACLTYPE_POSIXACL:
+#ifdef CONFIG_FS_POSIX_ACL
+		zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIXACL;
+		zfsvfs->z_sb->s_flags |= SB_POSIXACL;
+#else
+		zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF;
+		zfsvfs->z_sb->s_flags &= ~SB_POSIXACL;
+#endif /* CONFIG_FS_POSIX_ACL */
+		break;
+	default:
+		break;
+	}
+}
+
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+	ASSERT(ISP2(newval));
+
+	zfsvfs->z_max_blksz = newval;
+}
+
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+	struct super_block *sb = zfsvfs->z_sb;
+
+	if (sb == NULL)
+		return;
+
+	if (newval)
+		sb->s_flags |= SB_RDONLY;
+	else
+		sb->s_flags &= ~SB_RDONLY;
+}
+
+static void
+devices_changed_cb(void *arg, uint64_t newval)
+{
+}
+
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+}
+
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+}
+
+static void
+nbmand_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+	struct super_block *sb = zfsvfs->z_sb;
+
+	if (sb == NULL)
+		return;
+
+	if (newval == TRUE)
+		sb->s_flags |= SB_MANDLOCK;
+	else
+		sb->s_flags &= ~SB_MANDLOCK;
+}
+
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+	((zfsvfs_t *)arg)->z_show_ctldir = newval;
+}
+
+static void
+vscan_changed_cb(void *arg, uint64_t newval)
+{
+	((zfsvfs_t *)arg)->z_vscan = newval;
+}
+
+static void
+acl_mode_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_acl_mode = newval;
+}
+
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+	((zfsvfs_t *)arg)->z_acl_inherit = newval;
+}
+
+static int
+zfs_register_callbacks(vfs_t *vfsp)
+{
+	struct dsl_dataset *ds = NULL;
+	objset_t *os = NULL;
+	zfsvfs_t *zfsvfs = NULL;
+	int error = 0;
+
+	ASSERT(vfsp);
+	zfsvfs = vfsp->vfs_data;
+	ASSERT(zfsvfs);
+	os = zfsvfs->z_os;
+
+	/*
+	 * The act of registering our callbacks will destroy any mount
+	 * options we may have.  In order to enable temporary overrides
+	 * of mount options, we stash away the current values and
+	 * restore them after we register the callbacks.
+	 */
+	if (zfs_is_readonly(zfsvfs) || !spa_writeable(dmu_objset_spa(os))) {
+		vfsp->vfs_do_readonly = B_TRUE;
+		vfsp->vfs_readonly = B_TRUE;
+	}
+
+	/*
+	 * Register property callbacks.
+	 *
+	 * It would probably be fine to just check for i/o error from
+	 * the first prop_register(), but I guess I like to go
+	 * overboard...
+	 */
+	ds = dmu_objset_ds(os);
+	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+	error = dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
+	    zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
+	error = error ? error : dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zfsvfs);
+	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+	if (error)
+		goto unregister;
+
+	/*
+	 * Invoke our callbacks to restore temporary mount options.
+	 */
+	if (vfsp->vfs_do_readonly)
+		readonly_changed_cb(zfsvfs, vfsp->vfs_readonly);
+	if (vfsp->vfs_do_setuid)
+		setuid_changed_cb(zfsvfs, vfsp->vfs_setuid);
+	if (vfsp->vfs_do_exec)
+		exec_changed_cb(zfsvfs, vfsp->vfs_exec);
+	if (vfsp->vfs_do_devices)
+		devices_changed_cb(zfsvfs, vfsp->vfs_devices);
+	if (vfsp->vfs_do_xattr)
+		xattr_changed_cb(zfsvfs, vfsp->vfs_xattr);
+	if (vfsp->vfs_do_atime)
+		atime_changed_cb(zfsvfs, vfsp->vfs_atime);
+	if (vfsp->vfs_do_relatime)
+		relatime_changed_cb(zfsvfs, vfsp->vfs_relatime);
+	if (vfsp->vfs_do_nbmand)
+		nbmand_changed_cb(zfsvfs, vfsp->vfs_nbmand);
+
+	return (0);
+
+unregister:
+	dsl_prop_unregister_all(ds, zfsvfs);
+	return (error);
+}
+
+/*
+ * Takes a dataset, a property, a value and that value's setpoint as
+ * found in the ZAP. Checks if the property has been changed in the vfs.
+ * If so, val and setpoint will be overwritten with updated content.
+ * Otherwise, they are left unchanged.
+ */
+int
+zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
+    char *setpoint)
+{
+	int error;
+	zfsvfs_t *zfvp;
+	vfs_t *vfsp;
+	objset_t *os;
+	uint64_t tmp = *val;
+
+	error = dmu_objset_from_ds(ds, &os);
+	if (error != 0)
+		return (error);
+
+	if (dmu_objset_type(os) != DMU_OST_ZFS)
+		return (EINVAL);
+
+	mutex_enter(&os->os_user_ptr_lock);
+	zfvp = dmu_objset_get_user(os);
+	mutex_exit(&os->os_user_ptr_lock);
+	if (zfvp == NULL)
+		return (ESRCH);
+
+	vfsp = zfvp->z_vfs;
+
+	switch (zfs_prop) {
+	case ZFS_PROP_ATIME:
+		if (vfsp->vfs_do_atime)
+			tmp = vfsp->vfs_atime;
+		break;
+	case ZFS_PROP_RELATIME:
+		if (vfsp->vfs_do_relatime)
+			tmp = vfsp->vfs_relatime;
+		break;
+	case ZFS_PROP_DEVICES:
+		if (vfsp->vfs_do_devices)
+			tmp = vfsp->vfs_devices;
+		break;
+	case ZFS_PROP_EXEC:
+		if (vfsp->vfs_do_exec)
+			tmp = vfsp->vfs_exec;
+		break;
+	case ZFS_PROP_SETUID:
+		if (vfsp->vfs_do_setuid)
+			tmp = vfsp->vfs_setuid;
+		break;
+	case ZFS_PROP_READONLY:
+		if (vfsp->vfs_do_readonly)
+			tmp = vfsp->vfs_readonly;
+		break;
+	case ZFS_PROP_XATTR:
+		if (vfsp->vfs_do_xattr)
+			tmp = vfsp->vfs_xattr;
+		break;
+	case ZFS_PROP_NBMAND:
+		if (vfsp->vfs_do_nbmand)
+			tmp = vfsp->vfs_nbmand;
+		break;
+	default:
+		return (ENOENT);
+	}
+
+	if (tmp != *val) {
+		(void) strcpy(setpoint, "temporary");
+		*val = tmp;
+	}
+	return (0);
+}
+
+/*
+ * Associate this zfsvfs with the given objset, which must be owned.
+ * This will cache a bunch of on-disk state from the objset in the
+ * zfsvfs.
+ */
+static int
+zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
+{
+	int error;
+	uint64_t val;
+
+	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
+	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
+	zfsvfs->z_os = os;
+
+	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
+	if (error != 0)
+		return (error);
+	if (zfsvfs->z_version >
+	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+		(void) printk("Can't mount a version %lld file system "
+		    "on a version %lld pool\n. Pool must be upgraded to mount "
+		    "this file system.\n", (u_longlong_t)zfsvfs->z_version,
+		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
+		return (SET_ERROR(ENOTSUP));
+	}
+	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_norm = (int)val;
+
+	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_utf8 = (val != 0);
+
+	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_case = (uint_t)val;
+
+	if ((error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val)) != 0)
+		return (error);
+	zfsvfs->z_acl_type = (uint_t)val;
+
+	/*
+	 * Fold case on file systems that are always or sometimes case
+	 * insensitive.
+	 */
+	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+	    zfsvfs->z_case == ZFS_CASE_MIXED)
+		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+	uint64_t sa_obj = 0;
+	if (zfsvfs->z_use_sa) {
+		/* should either have both of these objects or none */
+		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+		    &sa_obj);
+		if (error != 0)
+			return (error);
+
+		error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
+		if ((error == 0) && (val == ZFS_XATTR_SA))
+			zfsvfs->z_xattr_sa = B_TRUE;
+	}
+
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+	    &zfsvfs->z_root);
+	if (error != 0)
+		return (error);
+	ASSERT(zfsvfs->z_root != 0);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+	    &zfsvfs->z_unlinkedobj);
+	if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ,
+	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
+	    8, 1, &zfsvfs->z_userquota_obj);
+	if (error == ENOENT)
+		zfsvfs->z_userquota_obj = 0;
+	else if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ,
+	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
+	    8, 1, &zfsvfs->z_groupquota_obj);
+	if (error == ENOENT)
+		zfsvfs->z_groupquota_obj = 0;
+	else if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ,
+	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
+	    8, 1, &zfsvfs->z_projectquota_obj);
+	if (error == ENOENT)
+		zfsvfs->z_projectquota_obj = 0;
+	else if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ,
+	    zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
+	    8, 1, &zfsvfs->z_userobjquota_obj);
+	if (error == ENOENT)
+		zfsvfs->z_userobjquota_obj = 0;
+	else if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ,
+	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
+	    8, 1, &zfsvfs->z_groupobjquota_obj);
+	if (error == ENOENT)
+		zfsvfs->z_groupobjquota_obj = 0;
+	else if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ,
+	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
+	    8, 1, &zfsvfs->z_projectobjquota_obj);
+	if (error == ENOENT)
+		zfsvfs->z_projectobjquota_obj = 0;
+	else if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
+	    &zfsvfs->z_fuid_obj);
+	if (error == ENOENT)
+		zfsvfs->z_fuid_obj = 0;
+	else if (error != 0)
+		return (error);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
+	    &zfsvfs->z_shares_dir);
+	if (error == ENOENT)
+		zfsvfs->z_shares_dir = 0;
+	else if (error != 0)
+		return (error);
+
+	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+	    &zfsvfs->z_attr_table);
+	if (error != 0)
+		return (error);
+
+	if (zfsvfs->z_version >= ZPL_VERSION_SA)
+		sa_register_update_callback(os, zfs_sa_upgrade);
+
+	return (0);
+}
+
+int
+zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
+{
+	objset_t *os;
+	zfsvfs_t *zfsvfs;
+	int error;
+	boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
+
+	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+	error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os);
+	if (error != 0) {
+		kmem_free(zfsvfs, sizeof (zfsvfs_t));
+		return (error);
+	}
+
+	error = zfsvfs_create_impl(zfvp, zfsvfs, os);
+	if (error != 0) {
+		dmu_objset_disown(os, B_TRUE, zfsvfs);
+	}
+	return (error);
+}
+
+
+/*
+ * Note: zfsvfs is assumed to be malloc'd, and will be freed by this function
+ * on a failure.  Do not pass in a statically allocated zfsvfs.
+ */
+int
+zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
+{
+	int error;
+
+	zfsvfs->z_vfs = NULL;
+	zfsvfs->z_sb = NULL;
+	zfsvfs->z_parent = zfsvfs;
+
+	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+	    offsetof(znode_t, z_link_node));
+	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
+	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
+	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
+
+	int size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1),
+	    ZFS_OBJ_MTX_MAX);
+	zfsvfs->z_hold_size = size;
+	zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
+	    KM_SLEEP);
+	zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
+	for (int i = 0; i != size; i++) {
+		avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
+		    sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
+		mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
+	}
+
+	error = zfsvfs_init(zfsvfs, os);
+	if (error != 0) {
+		*zfvp = NULL;
+		zfsvfs_free(zfsvfs);
+		return (error);
+	}
+
+	zfsvfs->z_drain_task = TASKQID_INVALID;
+	zfsvfs->z_draining = B_FALSE;
+	zfsvfs->z_drain_cancel = B_TRUE;
+
+	*zfvp = zfsvfs;
+	return (0);
+}
+
+static int
+zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
+{
+	int error;
+	boolean_t readonly = zfs_is_readonly(zfsvfs);
+
+	error = zfs_register_callbacks(zfsvfs->z_vfs);
+	if (error)
+		return (error);
+
+	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+
+	/*
+	 * If we are not mounting (ie: online recv), then we don't
+	 * have to worry about replaying the log as we blocked all
+	 * operations out since we closed the ZIL.
+	 */
+	if (mounting) {
+		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+		dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
+
+		/*
+		 * During replay we remove the read only flag to
+		 * allow replays to succeed.
+		 */
+		if (readonly != 0) {
+			readonly_changed_cb(zfsvfs, B_FALSE);
+		} else {
+			zap_stats_t zs;
+			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+			    &zs) == 0) {
+				dataset_kstats_update_nunlinks_kstat(
+				    &zfsvfs->z_kstat, zs.zs_num_entries);
+				dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+				    "num_entries in unlinked set: %llu",
+				    zs.zs_num_entries);
+			}
+			zfs_unlinked_drain(zfsvfs);
+			dsl_dir_t *dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
+			dd->dd_activity_cancelled = B_FALSE;
+		}
+
+		/*
+		 * Parse and replay the intent log.
+		 *
+		 * Because of ziltest, this must be done after
+		 * zfs_unlinked_drain().  (Further note: ziltest
+		 * doesn't use readonly mounts, where
+		 * zfs_unlinked_drain() isn't called.)  This is because
+		 * ziltest causes spa_sync() to think it's committed,
+		 * but actually it is not, so the intent log contains
+		 * many txg's worth of changes.
+		 *
+		 * In particular, if object N is in the unlinked set in
+		 * the last txg to actually sync, then it could be
+		 * actually freed in a later txg and then reallocated
+		 * in a yet later txg.  This would write a "create
+		 * object N" record to the intent log.  Normally, this
+		 * would be fine because the spa_sync() would have
+		 * written out the fact that object N is free, before
+		 * we could write the "create object N" intent log
+		 * record.
+		 *
+		 * But when we are in ziltest mode, we advance the "open
+		 * txg" without actually spa_sync()-ing the changes to
+		 * disk.  So we would see that object N is still
+		 * allocated and in the unlinked set, and there is an
+		 * intent log record saying to allocate it.
+		 */
+		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+			if (zil_replay_disable) {
+				zil_destroy(zfsvfs->z_log, B_FALSE);
+			} else {
+				zfsvfs->z_replay = B_TRUE;
+				zil_replay(zfsvfs->z_os, zfsvfs,
+				    zfs_replay_vector);
+				zfsvfs->z_replay = B_FALSE;
+			}
+		}
+
+		/* restore readonly bit */
+		if (readonly != 0)
+			readonly_changed_cb(zfsvfs, B_TRUE);
+	}
+
+	/*
+	 * Set the objset user_ptr to track its zfsvfs.
+	 */
+	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+
+	return (0);
+}
+
+void
+zfsvfs_free(zfsvfs_t *zfsvfs)
+{
+	int i, size = zfsvfs->z_hold_size;
+
+	zfs_fuid_destroy(zfsvfs);
+
+	mutex_destroy(&zfsvfs->z_znodes_lock);
+	mutex_destroy(&zfsvfs->z_lock);
+	list_destroy(&zfsvfs->z_all_znodes);
+	rrm_destroy(&zfsvfs->z_teardown_lock);
+	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
+	rw_destroy(&zfsvfs->z_fuid_lock);
+	for (i = 0; i != size; i++) {
+		avl_destroy(&zfsvfs->z_hold_trees[i]);
+		mutex_destroy(&zfsvfs->z_hold_locks[i]);
+	}
+	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
+	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
+	zfsvfs_vfs_free(zfsvfs->z_vfs);
+	dataset_kstats_destroy(&zfsvfs->z_kstat);
+	kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+
+static void
+zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
+{
+	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+}
+
+static void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+	objset_t *os = zfsvfs->z_os;
+
+	if (!dmu_objset_is_snapshot(os))
+		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
+}
+
+#ifdef HAVE_MLSLABEL
+/*
+ * Check that the hex label string is appropriate for the dataset being
+ * mounted into the global_zone proper.
+ *
+ * Return an error if the hex label string is not default or
+ * admin_low/admin_high.  For admin_low labels, the corresponding
+ * dataset must be readonly.
+ */
+int
+zfs_check_global_label(const char *dsname, const char *hexsl)
+{
+	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+		return (0);
+	if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
+		return (0);
+	if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
+		/* must be readonly */
+		uint64_t rdonly;
+
+		if (dsl_prop_get_integer(dsname,
+		    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
+			return (SET_ERROR(EACCES));
+		return (rdonly ? 0 : SET_ERROR(EACCES));
+	}
+	return (SET_ERROR(EACCES));
+}
+#endif /* HAVE_MLSLABEL */
+
+static int
+zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp,
+    uint32_t bshift)
+{
+	char buf[20 + DMU_OBJACCT_PREFIX_LEN];
+	uint64_t offset = DMU_OBJACCT_PREFIX_LEN;
+	uint64_t quota;
+	uint64_t used;
+	int err;
+
+	strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1);
+	err = zfs_id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset,
+	    sizeof (buf) - offset, B_FALSE);
+	if (err)
+		return (err);
+
+	if (zfsvfs->z_projectquota_obj == 0)
+		goto objs;
+
+	err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj,
+	    buf + offset, 8, 1, &quota);
+	if (err == ENOENT)
+		goto objs;
+	else if (err)
+		return (err);
+
+	err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
+	    buf + offset, 8, 1, &used);
+	if (unlikely(err == ENOENT)) {
+		uint32_t blksize;
+		u_longlong_t nblocks;
+
+		/*
+		 * Quota accounting is async, so it is possible race case.
+		 * There is at least one object with the given project ID.
+		 */
+		sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+		if (unlikely(zp->z_blksz == 0))
+			blksize = zfsvfs->z_max_blksz;
+
+		used = blksize * nblocks;
+	} else if (err) {
+		return (err);
+	}
+
+	statp->f_blocks = quota >> bshift;
+	statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0;
+	statp->f_bavail = statp->f_bfree;
+
+objs:
+	if (zfsvfs->z_projectobjquota_obj == 0)
+		return (0);
+
+	err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj,
+	    buf + offset, 8, 1, &quota);
+	if (err == ENOENT)
+		return (0);
+	else if (err)
+		return (err);
+
+	err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
+	    buf, 8, 1, &used);
+	if (unlikely(err == ENOENT)) {
+		/*
+		 * Quota accounting is async, so it is possible race case.
+		 * There is at least one object with the given project ID.
+		 */
+		used = 1;
+	} else if (err) {
+		return (err);
+	}
+
+	statp->f_files = quota;
+	statp->f_ffree = (quota > used) ? (quota - used) : 0;
+
+	return (0);
+}
+
+int
+zfs_statvfs(struct inode *ip, struct kstatfs *statp)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	uint64_t refdbytes, availbytes, usedobjs, availobjs;
+	int err = 0;
+
+	ZFS_ENTER(zfsvfs);
+
+	dmu_objset_space(zfsvfs->z_os,
+	    &refdbytes, &availbytes, &usedobjs, &availobjs);
+
+	uint64_t fsid = dmu_objset_fsid_guid(zfsvfs->z_os);
+	/*
+	 * The underlying storage pool actually uses multiple block
+	 * size.  Under Solaris frsize (fragment size) is reported as
+	 * the smallest block size we support, and bsize (block size)
+	 * as the filesystem's maximum block size.  Unfortunately,
+	 * under Linux the fragment size and block size are often used
+	 * interchangeably.  Thus we are forced to report both of them
+	 * as the filesystem's maximum block size.
+	 */
+	statp->f_frsize = zfsvfs->z_max_blksz;
+	statp->f_bsize = zfsvfs->z_max_blksz;
+	uint32_t bshift = fls(statp->f_bsize) - 1;
+
+	/*
+	 * The following report "total" blocks of various kinds in
+	 * the file system, but reported in terms of f_bsize - the
+	 * "preferred" size.
+	 */
+
+	/* Round up so we never have a filesystem using 0 blocks. */
+	refdbytes = P2ROUNDUP(refdbytes, statp->f_bsize);
+	statp->f_blocks = (refdbytes + availbytes) >> bshift;
+	statp->f_bfree = availbytes >> bshift;
+	statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+	/*
+	 * statvfs() should really be called statufs(), because it assumes
+	 * static metadata.  ZFS doesn't preallocate files, so the best
+	 * we can do is report the max that could possibly fit in f_files,
+	 * and that minus the number actually used in f_ffree.
+	 * For f_ffree, report the smaller of the number of objects available
+	 * and the number of blocks (each object will take at least a block).
+	 */
+	statp->f_ffree = MIN(availobjs, availbytes >> DNODE_SHIFT);
+	statp->f_files = statp->f_ffree + usedobjs;
+	statp->f_fsid.val[0] = (uint32_t)fsid;
+	statp->f_fsid.val[1] = (uint32_t)(fsid >> 32);
+	statp->f_type = ZFS_SUPER_MAGIC;
+	statp->f_namelen = MAXNAMELEN - 1;
+
+	/*
+	 * We have all of 40 characters to stuff a string here.
+	 * Is there anything useful we could/should provide?
+	 */
+	bzero(statp->f_spare, sizeof (statp->f_spare));
+
+	if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+	    dmu_objset_projectquota_present(zfsvfs->z_os)) {
+		znode_t *zp = ITOZ(ip);
+
+		if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid &&
+		    zpl_is_valid_projid(zp->z_projid))
+			err = zfs_statfs_project(zfsvfs, zp, statp, bshift);
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
+static int
+zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
+{
+	znode_t *rootzp;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+	if (error == 0)
+		*ipp = ZTOI(rootzp);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Linux kernels older than 3.1 do not support a per-filesystem shrinker.
+ * To accommodate this we must improvise and manually walk the list of znodes
+ * attempting to prune dentries in order to be able to drop the inodes.
+ *
+ * To avoid scanning the same znodes multiple times they are always rotated
+ * to the end of the z_all_znodes list.  New znodes are inserted at the
+ * end of the list so we're always scanning the oldest znodes first.
+ */
+static int
+zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
+{
+	znode_t **zp_array, *zp;
+	int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
+	int objects = 0;
+	int i = 0, j = 0;
+
+	zp_array = kmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
+
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
+
+		if ((i++ > nr_to_scan) || (j >= max_array))
+			break;
+
+		ASSERT(list_link_active(&zp->z_link_node));
+		list_remove(&zfsvfs->z_all_znodes, zp);
+		list_insert_tail(&zfsvfs->z_all_znodes, zp);
+
+		/* Skip active znodes and .zfs entries */
+		if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
+			continue;
+
+		if (igrab(ZTOI(zp)) == NULL)
+			continue;
+
+		zp_array[j] = zp;
+		j++;
+	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	for (i = 0; i < j; i++) {
+		zp = zp_array[i];
+
+		ASSERT3P(zp, !=, NULL);
+		d_prune_aliases(ZTOI(zp));
+
+		if (atomic_read(&ZTOI(zp)->i_count) == 1)
+			objects++;
+
+		zrele(zp);
+	}
+
+	kmem_free(zp_array, max_array * sizeof (znode_t *));
+
+	return (objects);
+}
+
+/*
+ * The ARC has requested that the filesystem drop entries from the dentry
+ * and inode caches.  This can occur when the ARC needs to free meta data
+ * blocks but can't because they are all pinned by entries in these caches.
+ */
+int
+zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
+{
+	zfsvfs_t *zfsvfs = sb->s_fs_info;
+	int error = 0;
+	struct shrinker *shrinker = &sb->s_shrink;
+	struct shrink_control sc = {
+		.nr_to_scan = nr_to_scan,
+		.gfp_mask = GFP_KERNEL,
+	};
+
+	ZFS_ENTER(zfsvfs);
+
+#if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \
+	defined(SHRINK_CONTROL_HAS_NID) && \
+	defined(SHRINKER_NUMA_AWARE)
+	if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) {
+		*objects = 0;
+		for_each_online_node(sc.nid) {
+			*objects += (*shrinker->scan_objects)(shrinker, &sc);
+		}
+	} else {
+			*objects = (*shrinker->scan_objects)(shrinker, &sc);
+	}
+
+#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
+	*objects = (*shrinker->scan_objects)(shrinker, &sc);
+#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK)
+	*objects = (*shrinker->shrink)(shrinker, &sc);
+#elif defined(HAVE_D_PRUNE_ALIASES)
+#define	D_PRUNE_ALIASES_IS_DEFAULT
+	*objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+#else
+#error "No available dentry and inode cache pruning mechanism."
+#endif
+
+#if defined(HAVE_D_PRUNE_ALIASES) && !defined(D_PRUNE_ALIASES_IS_DEFAULT)
+#undef	D_PRUNE_ALIASES_IS_DEFAULT
+	/*
+	 * Fall back to zfs_prune_aliases if the kernel's per-superblock
+	 * shrinker couldn't free anything, possibly due to the inodes being
+	 * allocated in a different memcg.
+	 */
+	if (*objects == 0)
+		*objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+#endif
+
+	ZFS_EXIT(zfsvfs);
+
+	dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+	    "pruning, nr_to_scan=%lu objects=%d error=%d\n",
+	    nr_to_scan, *objects, error);
+
+	return (error);
+}
+
+/*
+ * Teardown the zfsvfs_t.
+ *
+ * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
+ * and 'z_teardown_inactive_lock' held.
+ */
+static int
+zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
+{
+	znode_t	*zp;
+
+	zfs_unlinked_drain_stop_wait(zfsvfs);
+
+	/*
+	 * If someone has not already unmounted this file system,
+	 * drain the zrele_taskq to ensure all active references to the
+	 * zfsvfs_t have been handled only then can it be safely destroyed.
+	 */
+	if (zfsvfs->z_os) {
+		/*
+		 * If we're unmounting we have to wait for the list to
+		 * drain completely.
+		 *
+		 * If we're not unmounting there's no guarantee the list
+		 * will drain completely, but iputs run from the taskq
+		 * may add the parents of dir-based xattrs to the taskq
+		 * so we want to wait for these.
+		 *
+		 * We can safely read z_nr_znodes without locking because the
+		 * VFS has already blocked operations which add to the
+		 * z_all_znodes list and thus increment z_nr_znodes.
+		 */
+		int round = 0;
+		while (zfsvfs->z_nr_znodes > 0) {
+			taskq_wait_outstanding(dsl_pool_zrele_taskq(
+			    dmu_objset_pool(zfsvfs->z_os)), 0);
+			if (++round > 1 && !unmounting)
+				break;
+		}
+	}
+
+	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+
+	if (!unmounting) {
+		/*
+		 * We purge the parent filesystem's super block as the
+		 * parent filesystem and all of its snapshots have their
+		 * inode's super block set to the parent's filesystem's
+		 * super block.  Note,  'z_parent' is self referential
+		 * for non-snapshots.
+		 */
+		shrink_dcache_sb(zfsvfs->z_parent->z_sb);
+	}
+
+	/*
+	 * Close the zil. NB: Can't close the zil while zfs_inactive
+	 * threads are blocked as zil_close can call zfs_inactive.
+	 */
+	if (zfsvfs->z_log) {
+		zil_close(zfsvfs->z_log);
+		zfsvfs->z_log = NULL;
+	}
+
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
+
+	/*
+	 * If we are not unmounting (ie: online recv) and someone already
+	 * unmounted this file system while we were doing the switcheroo,
+	 * or a reopen of z_os failed then just bail out now.
+	 */
+	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+		return (SET_ERROR(EIO));
+	}
+
+	/*
+	 * At this point there are no VFS ops active, and any new VFS ops
+	 * will fail with EIO since we have z_teardown_lock for writer (only
+	 * relevant for forced unmount).
+	 *
+	 * Release all holds on dbufs. We also grab an extra reference to all
+	 * the remaining inodes so that the kernel does not attempt to free
+	 * any inodes of a suspended fs. This can cause deadlocks since the
+	 * zfs_resume_fs() process may involve starting threads, which might
+	 * attempt to free unreferenced inodes to free up memory for the new
+	 * thread.
+	 */
+	if (!unmounting) {
+		mutex_enter(&zfsvfs->z_znodes_lock);
+		for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
+		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+			if (zp->z_sa_hdl)
+				zfs_znode_dmu_fini(zp);
+			if (igrab(ZTOI(zp)) != NULL)
+				zp->z_suspended = B_TRUE;
+
+		}
+		mutex_exit(&zfsvfs->z_znodes_lock);
+	}
+
+	/*
+	 * If we are unmounting, set the unmounted flag and let new VFS ops
+	 * unblock.  zfs_inactive will have the unmounted behavior, and all
+	 * other VFS ops will fail with EIO.
+	 */
+	if (unmounting) {
+		zfsvfs->z_unmounted = B_TRUE;
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+	}
+
+	/*
+	 * z_os will be NULL if there was an error in attempting to reopen
+	 * zfsvfs, so just return as the properties had already been
+	 *
+	 * unregistered and cached data had been evicted before.
+	 */
+	if (zfsvfs->z_os == NULL)
+		return (0);
+
+	/*
+	 * Unregister properties.
+	 */
+	zfs_unregister_callbacks(zfsvfs);
+
+	/*
+	 * Evict cached data. We must write out any dirty data before
+	 * disowning the dataset.
+	 */
+	objset_t *os = zfsvfs->z_os;
+	boolean_t os_dirty = B_FALSE;
+	for (int t = 0; t < TXG_SIZE; t++) {
+		if (dmu_objset_is_dirty(os, t)) {
+			os_dirty = B_TRUE;
+			break;
+		}
+	}
+	if (!zfs_is_readonly(zfsvfs) && os_dirty) {
+		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+	}
+	dmu_objset_evict_dbufs(zfsvfs->z_os);
+	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
+	dsl_dir_cancel_waiters(dd);
+
+	return (0);
+}
+
+#if defined(HAVE_SUPER_SETUP_BDI_NAME)
+atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0);
+#endif
+
+int
+zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
+{
+	const char *osname = zm->mnt_osname;
+	struct inode *root_inode;
+	uint64_t recordsize;
+	int error = 0;
+	zfsvfs_t *zfsvfs = NULL;
+	vfs_t *vfs = NULL;
+
+	ASSERT(zm);
+	ASSERT(osname);
+
+	error = zfsvfs_parse_options(zm->mnt_data, &vfs);
+	if (error)
+		return (error);
+
+	error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs);
+	if (error) {
+		zfsvfs_vfs_free(vfs);
+		goto out;
+	}
+
+	if ((error = dsl_prop_get_integer(osname, "recordsize",
+	    &recordsize, NULL))) {
+		zfsvfs_vfs_free(vfs);
+		goto out;
+	}
+
+	vfs->vfs_data = zfsvfs;
+	zfsvfs->z_vfs = vfs;
+	zfsvfs->z_sb = sb;
+	sb->s_fs_info = zfsvfs;
+	sb->s_magic = ZFS_SUPER_MAGIC;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_time_gran = 1;
+	sb->s_blocksize = recordsize;
+	sb->s_blocksize_bits = ilog2(recordsize);
+
+	error = -zpl_bdi_setup(sb, "zfs");
+	if (error)
+		goto out;
+
+	sb->s_bdi->ra_pages = 0;
+
+	/* Set callback operations for the file system. */
+	sb->s_op = &zpl_super_operations;
+	sb->s_xattr = zpl_xattr_handlers;
+	sb->s_export_op = &zpl_export_operations;
+	sb->s_d_op = &zpl_dentry_operations;
+
+	/* Set features for file system. */
+	zfs_set_fuid_feature(zfsvfs);
+
+	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
+		uint64_t pval;
+
+		atime_changed_cb(zfsvfs, B_FALSE);
+		readonly_changed_cb(zfsvfs, B_TRUE);
+		if ((error = dsl_prop_get_integer(osname,
+		    "xattr", &pval, NULL)))
+			goto out;
+		xattr_changed_cb(zfsvfs, pval);
+		if ((error = dsl_prop_get_integer(osname,
+		    "acltype", &pval, NULL)))
+			goto out;
+		acltype_changed_cb(zfsvfs, pval);
+		zfsvfs->z_issnap = B_TRUE;
+		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
+		zfsvfs->z_snap_defer_time = jiffies;
+
+		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+	} else {
+		if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
+			goto out;
+	}
+
+	/* Allocate a root inode for the filesystem. */
+	error = zfs_root(zfsvfs, &root_inode);
+	if (error) {
+		(void) zfs_umount(sb);
+		goto out;
+	}
+
+	/* Allocate a root dentry for the filesystem */
+	sb->s_root = d_make_root(root_inode);
+	if (sb->s_root == NULL) {
+		(void) zfs_umount(sb);
+		error = SET_ERROR(ENOMEM);
+		goto out;
+	}
+
+	if (!zfsvfs->z_issnap)
+		zfsctl_create(zfsvfs);
+
+	zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb);
+out:
+	if (error) {
+		if (zfsvfs != NULL) {
+			dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
+			zfsvfs_free(zfsvfs);
+		}
+		/*
+		 * make sure we don't have dangling sb->s_fs_info which
+		 * zfs_preumount will use.
+		 */
+		sb->s_fs_info = NULL;
+	}
+
+	return (error);
+}
+
+/*
+ * Called when an unmount is requested and certain sanity checks have
+ * already passed.  At this point no dentries or inodes have been reclaimed
+ * from their respective caches.  We drop the extra reference on the .zfs
+ * control directory to allow everything to be reclaimed.  All snapshots
+ * must already have been unmounted to reach this point.
+ */
+void
+zfs_preumount(struct super_block *sb)
+{
+	zfsvfs_t *zfsvfs = sb->s_fs_info;
+
+	/* zfsvfs is NULL when zfs_domount fails during mount */
+	if (zfsvfs) {
+		zfs_unlinked_drain_stop_wait(zfsvfs);
+		zfsctl_destroy(sb->s_fs_info);
+		/*
+		 * Wait for zrele_async before entering evict_inodes in
+		 * generic_shutdown_super. The reason we must finish before
+		 * evict_inodes is when lazytime is on, or when zfs_purgedir
+		 * calls zfs_zget, zrele would bump i_count from 0 to 1. This
+		 * would race with the i_count check in evict_inodes. This means
+		 * it could destroy the inode while we are still using it.
+		 *
+		 * We wait for two passes. xattr directories in the first pass
+		 * may add xattr entries in zfs_purgedir, so in the second pass
+		 * we wait for them. We don't use taskq_wait here because it is
+		 * a pool wide taskq. Other mounted filesystems can constantly
+		 * do zrele_async and there's no guarantee when taskq will be
+		 * empty.
+		 */
+		taskq_wait_outstanding(dsl_pool_zrele_taskq(
+		    dmu_objset_pool(zfsvfs->z_os)), 0);
+		taskq_wait_outstanding(dsl_pool_zrele_taskq(
+		    dmu_objset_pool(zfsvfs->z_os)), 0);
+	}
+}
+
+/*
+ * Called once all other unmount released tear down has occurred.
+ * It is our responsibility to release any remaining infrastructure.
+ */
+/*ARGSUSED*/
+int
+zfs_umount(struct super_block *sb)
+{
+	zfsvfs_t *zfsvfs = sb->s_fs_info;
+	objset_t *os;
+
+	if (zfsvfs->z_arc_prune != NULL)
+		arc_remove_prune_callback(zfsvfs->z_arc_prune);
+	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+	os = zfsvfs->z_os;
+	zpl_bdi_destroy(sb);
+
+	/*
+	 * z_os will be NULL if there was an error in
+	 * attempting to reopen zfsvfs.
+	 */
+	if (os != NULL) {
+		/*
+		 * Unset the objset user_ptr.
+		 */
+		mutex_enter(&os->os_user_ptr_lock);
+		dmu_objset_set_user(os, NULL);
+		mutex_exit(&os->os_user_ptr_lock);
+
+		/*
+		 * Finally release the objset
+		 */
+		dmu_objset_disown(os, B_TRUE, zfsvfs);
+	}
+
+	zfsvfs_free(zfsvfs);
+	return (0);
+}
+
+int
+zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm)
+{
+	zfsvfs_t *zfsvfs = sb->s_fs_info;
+	vfs_t *vfsp;
+	boolean_t issnap = dmu_objset_is_snapshot(zfsvfs->z_os);
+	int error;
+
+	if ((issnap || !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) &&
+	    !(*flags & SB_RDONLY)) {
+		*flags |= SB_RDONLY;
+		return (EROFS);
+	}
+
+	error = zfsvfs_parse_options(zm->mnt_data, &vfsp);
+	if (error)
+		return (error);
+
+	if (!zfs_is_readonly(zfsvfs) && (*flags & SB_RDONLY))
+		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+
+	zfs_unregister_callbacks(zfsvfs);
+	zfsvfs_vfs_free(zfsvfs->z_vfs);
+
+	vfsp->vfs_data = zfsvfs;
+	zfsvfs->z_vfs = vfsp;
+	if (!issnap)
+		(void) zfs_register_callbacks(vfsp);
+
+	return (error);
+}
+
+int
+zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
+{
+	zfsvfs_t	*zfsvfs = sb->s_fs_info;
+	znode_t		*zp;
+	uint64_t	object = 0;
+	uint64_t	fid_gen = 0;
+	uint64_t	gen_mask;
+	uint64_t	zp_gen;
+	int		i, err;
+
+	*ipp = NULL;
+
+	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+		zfid_short_t	*zfid = (zfid_short_t *)fidp;
+
+		for (i = 0; i < sizeof (zfid->zf_object); i++)
+			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+
+		for (i = 0; i < sizeof (zfid->zf_gen); i++)
+			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+	} else {
+		return (SET_ERROR(EINVAL));
+	}
+
+	/* LONG_FID_LEN means snapdirs */
+	if (fidp->fid_len == LONG_FID_LEN) {
+		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
+		uint64_t	objsetid = 0;
+		uint64_t	setgen = 0;
+
+		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+
+		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+		if (objsetid != ZFSCTL_INO_SNAPDIRS - object) {
+			dprintf("snapdir fid: objsetid (%llu) != "
+			    "ZFSCTL_INO_SNAPDIRS (%llu) - object (%llu)\n",
+			    objsetid, ZFSCTL_INO_SNAPDIRS, object);
+
+			return (SET_ERROR(EINVAL));
+		}
+
+		if (fid_gen > 1 || setgen != 0) {
+			dprintf("snapdir fid: fid_gen (%llu) and setgen "
+			    "(%llu)\n", fid_gen, setgen);
+			return (SET_ERROR(EINVAL));
+		}
+
+		return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp));
+	}
+
+	ZFS_ENTER(zfsvfs);
+	/* A zero fid_gen means we are in the .zfs control directories */
+	if (fid_gen == 0 &&
+	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
+		*ipp = zfsvfs->z_ctldir;
+		ASSERT(*ipp != NULL);
+		if (object == ZFSCTL_INO_SNAPDIR) {
+			VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp,
+			    0, kcred, NULL, NULL) == 0);
+		} else {
+			igrab(*ipp);
+		}
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	gen_mask = -1ULL >> (64 - 8 * i);
+
+	dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask);
+	if ((err = zfs_zget(zfsvfs, object, &zp))) {
+		ZFS_EXIT(zfsvfs);
+		return (err);
+	}
+
+	/* Don't export xattr stuff */
+	if (zp->z_pflags & ZFS_XATTR) {
+		zrele(zp);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENOENT));
+	}
+
+	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+	    sizeof (uint64_t));
+	zp_gen = zp_gen & gen_mask;
+	if (zp_gen == 0)
+		zp_gen = 1;
+	if ((fid_gen == 0) && (zfsvfs->z_root == object))
+		fid_gen = zp_gen;
+	if (zp->z_unlinked || zp_gen != fid_gen) {
+		dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen,
+		    fid_gen);
+		zrele(zp);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENOENT));
+	}
+
+	*ipp = ZTOI(zp);
+	if (*ipp)
+		zfs_inode_update(ITOZ(*ipp));
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Block out VFS ops and close zfsvfs_t
+ *
+ * Note, if successful, then we return with the 'z_teardown_lock' and
+ * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
+ */
+int
+zfs_suspend_fs(zfsvfs_t *zfsvfs)
+{
+	int error;
+
+	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+		return (error);
+
+	return (0);
+}
+
+/*
+ * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
+ * is an invariant across any of the operations that can be performed while the
+ * filesystem was suspended.  Whether it succeeded or failed, the preconditions
+ * are the same: the relevant objset and associated dataset are owned by
+ * zfsvfs, held, and long held on entry.
+ */
+int
+zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+	int err, err2;
+	znode_t *zp;
+
+	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
+	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+	/*
+	 * We already own this, so just update the objset_t, as the one we
+	 * had before may have been evicted.
+	 */
+	objset_t *os;
+	VERIFY3P(ds->ds_owner, ==, zfsvfs);
+	VERIFY(dsl_dataset_long_held(ds));
+	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+	dsl_pool_config_enter(dp, FTAG);
+	VERIFY0(dmu_objset_from_ds(ds, &os));
+	dsl_pool_config_exit(dp, FTAG);
+
+	err = zfsvfs_init(zfsvfs, os);
+	if (err != 0)
+		goto bail;
+
+	ds->ds_dir->dd_activity_cancelled = B_FALSE;
+	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+
+	zfs_set_fuid_feature(zfsvfs);
+	zfsvfs->z_rollback_time = jiffies;
+
+	/*
+	 * Attempt to re-establish all the active inodes with their
+	 * dbufs.  If a zfs_rezget() fails, then we unhash the inode
+	 * and mark it stale.  This prevents a collision if a new
+	 * inode/object is created which must use the same inode
+	 * number.  The stale inode will be be released when the
+	 * VFS prunes the dentry holding the remaining references
+	 * on the stale inode.
+	 */
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+		err2 = zfs_rezget(zp);
+		if (err2) {
+			remove_inode_hash(ZTOI(zp));
+			zp->z_is_stale = B_TRUE;
+		}
+
+		/* see comment in zfs_suspend_fs() */
+		if (zp->z_suspended) {
+			zfs_zrele_async(zp);
+			zp->z_suspended = B_FALSE;
+		}
+	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) {
+		/*
+		 * zfs_suspend_fs() could have interrupted freeing
+		 * of dnodes. We need to restart this freeing so
+		 * that we don't "leak" the space.
+		 */
+		zfs_unlinked_drain(zfsvfs);
+	}
+
+	/*
+	 * Most of the time zfs_suspend_fs is used for changing the contents
+	 * of the underlying dataset. ZFS rollback and receive operations
+	 * might create files for which negative dentries are present in
+	 * the cache. Since walking the dcache would require a lot of GPL-only
+	 * code duplication, it's much easier on these rather rare occasions
+	 * just to flush the whole dcache for the given dataset/filesystem.
+	 */
+	shrink_dcache_sb(zfsvfs->z_sb);
+
+bail:
+	if (err != 0)
+		zfsvfs->z_unmounted = B_TRUE;
+
+	/* release the VFS ops */
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+
+	if (err != 0) {
+		/*
+		 * Since we couldn't setup the sa framework, try to force
+		 * unmount this file system.
+		 */
+		if (zfsvfs->z_os)
+			(void) zfs_umount(zfsvfs->z_sb);
+	}
+	return (err);
+}
+
+/*
+ * Release VOPs and unmount a suspended filesystem.
+ */
+int
+zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
+	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+	/*
+	 * We already own this, so just hold and rele it to update the
+	 * objset_t, as the one we had before may have been evicted.
+	 */
+	objset_t *os;
+	VERIFY3P(ds->ds_owner, ==, zfsvfs);
+	VERIFY(dsl_dataset_long_held(ds));
+	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+	dsl_pool_config_enter(dp, FTAG);
+	VERIFY0(dmu_objset_from_ds(ds, &os));
+	dsl_pool_config_exit(dp, FTAG);
+	zfsvfs->z_os = os;
+
+	/* release the VOPs */
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+
+	/*
+	 * Try to force unmount this file system.
+	 */
+	(void) zfs_umount(zfsvfs->z_sb);
+	zfsvfs->z_unmounted = B_TRUE;
+	return (0);
+}
+
+/*
+ * Automounted snapshots rely on periodic revalidation
+ * to defer snapshots from being automatically unmounted.
+ */
+
+inline void
+zfs_exit_fs(zfsvfs_t *zfsvfs)
+{
+	if (!zfsvfs->z_issnap)
+		return;
+
+	if (time_after(jiffies, zfsvfs->z_snap_defer_time +
+	    MAX(zfs_expire_snapshot * HZ / 2, HZ))) {
+		zfsvfs->z_snap_defer_time = jiffies;
+		zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa,
+		    dmu_objset_id(zfsvfs->z_os),
+		    zfs_expire_snapshot);
+	}
+}
+
+int
+zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
+{
+	int error;
+	objset_t *os = zfsvfs->z_os;
+	dmu_tx_t *tx;
+
+	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
+		return (SET_ERROR(EINVAL));
+
+	if (newvers < zfsvfs->z_version)
+		return (SET_ERROR(EINVAL));
+
+	if (zfs_spa_version_map(newvers) >
+	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
+		return (SET_ERROR(ENOTSUP));
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+		    ZFS_SA_ATTRS);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+	}
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		return (error);
+	}
+
+	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+	    8, 1, &newvers, tx);
+
+	if (error) {
+		dmu_tx_commit(tx);
+		return (error);
+	}
+
+	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+		uint64_t sa_obj;
+
+		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+		    SPA_VERSION_SA);
+		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+		    DMU_OT_NONE, 0, tx);
+
+		error = zap_add(os, MASTER_NODE_OBJ,
+		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+		ASSERT0(error);
+
+		VERIFY(0 == sa_set_sa_object(os, sa_obj));
+		sa_register_update_callback(os, zfs_sa_upgrade);
+	}
+
+	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
+	    "from %llu to %llu", zfsvfs->z_version, newvers);
+
+	dmu_tx_commit(tx);
+
+	zfsvfs->z_version = newvers;
+	os->os_version = newvers;
+
+	zfs_set_fuid_feature(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * Read a property stored within the master node.
+ */
+int
+zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
+{
+	uint64_t *cached_copy = NULL;
+
+	/*
+	 * Figure out where in the objset_t the cached copy would live, if it
+	 * is available for the requested property.
+	 */
+	if (os != NULL) {
+		switch (prop) {
+		case ZFS_PROP_VERSION:
+			cached_copy = &os->os_version;
+			break;
+		case ZFS_PROP_NORMALIZE:
+			cached_copy = &os->os_normalization;
+			break;
+		case ZFS_PROP_UTF8ONLY:
+			cached_copy = &os->os_utf8only;
+			break;
+		case ZFS_PROP_CASE:
+			cached_copy = &os->os_casesensitivity;
+			break;
+		default:
+			break;
+		}
+	}
+	if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+		*value = *cached_copy;
+		return (0);
+	}
+
+	/*
+	 * If the property wasn't cached, look up the file system's value for
+	 * the property. For the version property, we look up a slightly
+	 * different string.
+	 */
+	const char *pname;
+	int error = ENOENT;
+	if (prop == ZFS_PROP_VERSION)
+		pname = ZPL_VERSION_STR;
+	else
+		pname = zfs_prop_to_name(prop);
+
+	if (os != NULL) {
+		ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
+	}
+
+	if (error == ENOENT) {
+		/* No value set, use the default value */
+		switch (prop) {
+		case ZFS_PROP_VERSION:
+			*value = ZPL_VERSION;
+			break;
+		case ZFS_PROP_NORMALIZE:
+		case ZFS_PROP_UTF8ONLY:
+			*value = 0;
+			break;
+		case ZFS_PROP_CASE:
+			*value = ZFS_CASE_SENSITIVE;
+			break;
+		case ZFS_PROP_ACLTYPE:
+			*value = ZFS_ACLTYPE_OFF;
+			break;
+		default:
+			return (error);
+		}
+		error = 0;
+	}
+
+	/*
+	 * If one of the methods for getting the property value above worked,
+	 * copy it into the objset_t's cache.
+	 */
+	if (error == 0 && cached_copy != NULL) {
+		*cached_copy = *value;
+	}
+
+	return (error);
+}
+
+/*
+ * Return true if the corresponding vfs's unmounted flag is set.
+ * Otherwise return false.
+ * If this function returns true we know VFS unmount has been initiated.
+ */
+boolean_t
+zfs_get_vfs_flag_unmounted(objset_t *os)
+{
+	zfsvfs_t *zfvp;
+	boolean_t unmounted = B_FALSE;
+
+	ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
+
+	mutex_enter(&os->os_user_ptr_lock);
+	zfvp = dmu_objset_get_user(os);
+	if (zfvp != NULL && zfvp->z_unmounted)
+		unmounted = B_TRUE;
+	mutex_exit(&os->os_user_ptr_lock);
+
+	return (unmounted);
+}
+
+void
+zfs_init(void)
+{
+	zfsctl_init();
+	zfs_znode_init();
+	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
+	register_filesystem(&zpl_fs_type);
+}
+
+void
+zfs_fini(void)
+{
+	/*
+	 * we don't use outstanding because zpl_posix_acl_free might add more.
+	 */
+	taskq_wait(system_delay_taskq);
+	taskq_wait(system_taskq);
+	unregister_filesystem(&zpl_fs_type);
+	zfs_znode_fini();
+	zfsctl_fini();
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_suspend_fs);
+EXPORT_SYMBOL(zfs_resume_fs);
+EXPORT_SYMBOL(zfs_set_version);
+EXPORT_SYMBOL(zfsvfs_create);
+EXPORT_SYMBOL(zfsvfs_free);
+EXPORT_SYMBOL(zfs_is_readonly);
+EXPORT_SYMBOL(zfs_domount);
+EXPORT_SYMBOL(zfs_preumount);
+EXPORT_SYMBOL(zfs_umount);
+EXPORT_SYMBOL(zfs_remount);
+EXPORT_SYMBOL(zfs_statvfs);
+EXPORT_SYMBOL(zfs_vget);
+EXPORT_SYMBOL(zfs_prune);
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c
new file mode 100644
index 000000000000..2d104a5001ec
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c
@@ -0,0 +1,5031 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/uio.h>
+#include <sys/vmsystm.h>
+#include <sys/atomic.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/policy.h>
+#include <sys/sunddi.h>
+#include <sys/sid.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_rlock.h>
+#include <sys/cred.h>
+#include <sys/zpl.h>
+#include <sys/zil.h>
+#include <sys/sa_impl.h>
+
+/*
+ * Programming rules.
+ *
+ * Each vnode op performs some logical unit of work.  To do this, the ZPL must
+ * properly lock its in-core state, create a DMU transaction, do the work,
+ * record this work in the intent log (ZIL), commit the DMU transaction,
+ * and wait for the intent log to commit if it is a synchronous operation.
+ * Moreover, the vnode ops must work in both normal and log replay context.
+ * The ordering of events is important to avoid deadlocks and references
+ * to freed memory.  The example below illustrates the following Big Rules:
+ *
+ *  (1) A check must be made in each zfs thread for a mounted file system.
+ *	This is done avoiding races using ZFS_ENTER(zfsvfs).
+ *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
+ *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
+ *      can return EIO from the calling function.
+ *
+ *  (2)	zrele() should always be the last thing except for zil_commit()
+ *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
+ *	First, if it's the last reference, the vnode/znode
+ *	can be freed, so the zp may point to freed memory.  Second, the last
+ *	reference will call zfs_zinactive(), which may induce a lot of work --
+ *	pushing cached pages (which acquires range locks) and syncing out
+ *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
+ *	which could deadlock the system if you were already holding one.
+ *	If you must call zrele() within a tx then use zfs_zrele_async().
+ *
+ *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
+ *	as they can span dmu_tx_assign() calls.
+ *
+ *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
+ *      dmu_tx_assign().  This is critical because we don't want to block
+ *      while holding locks.
+ *
+ *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
+ *	reduces lock contention and CPU usage when we must wait (note that if
+ *	throughput is constrained by the storage, nearly every transaction
+ *	must wait).
+ *
+ *      Note, in particular, that if a lock is sometimes acquired before
+ *      the tx assigns, and sometimes after (e.g. z_lock), then failing
+ *      to use a non-blocking assign can deadlock the system.  The scenario:
+ *
+ *	Thread A has grabbed a lock before calling dmu_tx_assign().
+ *	Thread B is in an already-assigned tx, and blocks for this lock.
+ *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ *	forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
+ *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
+ *	to indicate that this operation has already called dmu_tx_wait().
+ *	This will ensure that we don't retry forever, waiting a short bit
+ *	each time.
+ *
+ *  (5)	If the operation succeeded, generate the intent log entry for it
+ *	before dropping locks.  This ensures that the ordering of events
+ *	in the intent log matches the order in which they actually occurred.
+ *	During ZIL replay the zfs_log_* functions will update the sequence
+ *	number to indicate the zil transaction has replayed.
+ *
+ *  (6)	At the end of each vnode op, the DMU tx must always commit,
+ *	regardless of whether there were any errors.
+ *
+ *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
+ *	to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ *	ZFS_ENTER(zfsvfs);		// exit if unmounted
+ * top:
+ *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may igrab())
+ *	rw_enter(...);			// grab any other locks you need
+ *	tx = dmu_tx_create(...);	// get DMU tx
+ *	dmu_tx_hold_*();		// hold each object you might modify
+ *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ *	if (error) {
+ *		rw_exit(...);		// drop locks
+ *		zfs_dirent_unlock(dl);	// unlock directory entry
+ *		zrele(...);		// release held znodes
+ *		if (error == ERESTART) {
+ *			waited = B_TRUE;
+ *			dmu_tx_wait(tx);
+ *			dmu_tx_abort(tx);
+ *			goto top;
+ *		}
+ *		dmu_tx_abort(tx);	// abort DMU tx
+ *		ZFS_EXIT(zfsvfs);	// finished in zfs
+ *		return (error);		// really out of space
+ *	}
+ *	error = do_real_work();		// do whatever this VOP does
+ *	if (error == 0)
+ *		zfs_log_*(...);		// on success, make ZIL entry
+ *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
+ *	rw_exit(...);			// drop locks
+ *	zfs_dirent_unlock(dl);		// unlock directory entry
+ *	zrele(...);			// release held znodes
+ *	zil_commit(zilog, foid);	// synchronous when necessary
+ *	ZFS_EXIT(zfsvfs);		// finished in zfs
+ *	return (error);			// done, report error
+ */
+
+/*
+ * Virus scanning is unsupported.  It would be possible to add a hook
+ * here to performance the required virus scan.  This could be done
+ * entirely in the kernel or potentially as an update to invoke a
+ * scanning utility.
+ */
+static int
+zfs_vscan(struct inode *ip, cred_t *cr, int async)
+{
+	return (0);
+}
+
+/* ARGSUSED */
+int
+zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
+{
+	znode_t	*zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	/* Honor ZFS_APPENDONLY file attribute */
+	if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
+	    ((flag & O_APPEND) == 0)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	/* Virus scan eligible files on open */
+	if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
+	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
+		if (zfs_vscan(ip, cr, 0) != 0) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EACCES));
+		}
+	}
+
+	/* Keep a count of the synchronous opens in the znode */
+	if (flag & O_SYNC)
+		atomic_inc_32(&zp->z_sync_cnt);
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/* ARGSUSED */
+int
+zfs_close(struct inode *ip, int flag, cred_t *cr)
+{
+	znode_t	*zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	/* Decrement the synchronous opens in the znode */
+	if (flag & O_SYNC)
+		atomic_dec_32(&zp->z_sync_cnt);
+
+	if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
+	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
+		VERIFY(zfs_vscan(ip, cr, 1) == 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+#if defined(SEEK_HOLE) && defined(SEEK_DATA)
+/*
+ * Lseek support for finding holes (cmd == SEEK_HOLE) and
+ * data (cmd == SEEK_DATA). "off" is an in/out parameter.
+ */
+static int
+zfs_holey_common(struct inode *ip, int cmd, loff_t *off)
+{
+	znode_t	*zp = ITOZ(ip);
+	uint64_t noff = (uint64_t)*off; /* new offset */
+	uint64_t file_sz;
+	int error;
+	boolean_t hole;
+
+	file_sz = zp->z_size;
+	if (noff >= file_sz)  {
+		return (SET_ERROR(ENXIO));
+	}
+
+	if (cmd == SEEK_HOLE)
+		hole = B_TRUE;
+	else
+		hole = B_FALSE;
+
+	error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
+
+	if (error == ESRCH)
+		return (SET_ERROR(ENXIO));
+
+	/* file was dirty, so fall back to using generic logic */
+	if (error == EBUSY) {
+		if (hole)
+			*off = file_sz;
+
+		return (0);
+	}
+
+	/*
+	 * We could find a hole that begins after the logical end-of-file,
+	 * because dmu_offset_next() only works on whole blocks.  If the
+	 * EOF falls mid-block, then indicate that the "virtual hole"
+	 * at the end of the file begins at the logical EOF, rather than
+	 * at the end of the last block.
+	 */
+	if (noff > file_sz) {
+		ASSERT(hole);
+		noff = file_sz;
+	}
+
+	if (noff < *off)
+		return (error);
+	*off = noff;
+	return (error);
+}
+
+int
+zfs_holey(struct inode *ip, int cmd, loff_t *off)
+{
+	znode_t	*zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	error = zfs_holey_common(ip, cmd, off);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+#endif /* SEEK_HOLE && SEEK_DATA */
+
+#if defined(_KERNEL)
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages.  What this means:
+ *
+ * On Write:	If we find a memory mapped page, we write to *both*
+ *		the page and the dmu buffer.
+ */
+static void
+update_pages(struct inode *ip, int64_t start, int len,
+    objset_t *os, uint64_t oid)
+{
+	struct address_space *mp = ip->i_mapping;
+	struct page *pp;
+	uint64_t nbytes;
+	int64_t	off;
+	void *pb;
+
+	off = start & (PAGE_SIZE-1);
+	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
+		nbytes = MIN(PAGE_SIZE - off, len);
+
+		pp = find_lock_page(mp, start >> PAGE_SHIFT);
+		if (pp) {
+			if (mapping_writably_mapped(mp))
+				flush_dcache_page(pp);
+
+			pb = kmap(pp);
+			(void) dmu_read(os, oid, start+off, nbytes, pb+off,
+			    DMU_READ_PREFETCH);
+			kunmap(pp);
+
+			if (mapping_writably_mapped(mp))
+				flush_dcache_page(pp);
+
+			mark_page_accessed(pp);
+			SetPageUptodate(pp);
+			ClearPageError(pp);
+			unlock_page(pp);
+			put_page(pp);
+		}
+
+		len -= nbytes;
+		off = 0;
+	}
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages.  What this means:
+ *
+ * On Read:	We "read" preferentially from memory mapped pages,
+ *		else we default from the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ *	 the file is memory mapped.
+ */
+static int
+mappedread(struct inode *ip, int nbytes, uio_t *uio)
+{
+	struct address_space *mp = ip->i_mapping;
+	struct page *pp;
+	znode_t *zp = ITOZ(ip);
+	int64_t	start, off;
+	uint64_t bytes;
+	int len = nbytes;
+	int error = 0;
+	void *pb;
+
+	start = uio->uio_loffset;
+	off = start & (PAGE_SIZE-1);
+	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
+		bytes = MIN(PAGE_SIZE - off, len);
+
+		pp = find_lock_page(mp, start >> PAGE_SHIFT);
+		if (pp) {
+			ASSERT(PageUptodate(pp));
+			unlock_page(pp);
+
+			pb = kmap(pp);
+			error = uiomove(pb + off, bytes, UIO_READ, uio);
+			kunmap(pp);
+
+			if (mapping_writably_mapped(mp))
+				flush_dcache_page(pp);
+
+			mark_page_accessed(pp);
+			put_page(pp);
+		} else {
+			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, bytes);
+		}
+
+		len -= bytes;
+		off = 0;
+		if (error)
+			break;
+	}
+	return (error);
+}
+#endif /* _KERNEL */
+
+unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */
+unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
+
+/*
+ * Read bytes from specified file into supplied buffer.
+ *
+ *	IN:	ip	- inode of file to be read from.
+ *		uio	- structure supplying read location, range info,
+ *			  and return buffer.
+ *		ioflag	- O_SYNC flags; used to provide FRSYNC semantics.
+ *			  O_DIRECT flag; used to bypass page cache.
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	uio	- updated offset and range, buffer filled.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Side Effects:
+ *	inode - atime updated if byte count > 0
+ */
+/* ARGSUSED */
+int
+zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
+{
+	int error = 0;
+	boolean_t frsync = B_FALSE;
+
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EACCES));
+	}
+
+	/*
+	 * Validate file offset
+	 */
+	if (uio->uio_loffset < (offset_t)0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Fasttrack empty reads
+	 */
+	if (uio->uio_resid == 0) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+#ifdef FRSYNC
+	/*
+	 * If we're in FRSYNC mode, sync out this znode before reading it.
+	 * Only do this for non-snapshots.
+	 *
+	 * Some platforms do not support FRSYNC and instead map it
+	 * to O_SYNC, which results in unnecessary calls to zil_commit. We
+	 * only honor FRSYNC requests on platforms which support it.
+	 */
+	frsync = !!(ioflag & FRSYNC);
+#endif
+	if (zfsvfs->z_log &&
+	    (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
+		zil_commit(zfsvfs->z_log, zp->z_id);
+
+	/*
+	 * Lock the range against changes.
+	 */
+	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
+	    uio->uio_loffset, uio->uio_resid, RL_READER);
+
+	/*
+	 * If we are reading past end-of-file we can skip
+	 * to the end; but we might still need to set atime.
+	 */
+	if (uio->uio_loffset >= zp->z_size) {
+		error = 0;
+		goto out;
+	}
+
+	ASSERT(uio->uio_loffset < zp->z_size);
+	ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+	ssize_t start_resid = n;
+
+#ifdef HAVE_UIO_ZEROCOPY
+	xuio_t *xuio = NULL;
+	if ((uio->uio_extflg == UIO_XUIO) &&
+	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
+		int nblk;
+		int blksz = zp->z_blksz;
+		uint64_t offset = uio->uio_loffset;
+
+		xuio = (xuio_t *)uio;
+		if ((ISP2(blksz))) {
+			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
+			    blksz)) / blksz;
+		} else {
+			ASSERT(offset + n <= blksz);
+			nblk = 1;
+		}
+		(void) dmu_xuio_init(xuio, nblk);
+
+		if (vn_has_cached_data(ip)) {
+			/*
+			 * For simplicity, we always allocate a full buffer
+			 * even if we only expect to read a portion of a block.
+			 */
+			while (--nblk >= 0) {
+				(void) dmu_xuio_add(xuio,
+				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+				    blksz), 0, blksz);
+			}
+		}
+	}
+#endif /* HAVE_UIO_ZEROCOPY */
+
+	while (n > 0) {
+		ssize_t nbytes = MIN(n, zfs_read_chunk_size -
+		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
+
+		if (zp->z_is_mapped && !(ioflag & O_DIRECT)) {
+			error = mappedread(ip, nbytes, uio);
+		} else {
+			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, nbytes);
+		}
+
+		if (error) {
+			/* convert checksum errors into IO errors */
+			if (error == ECKSUM)
+				error = SET_ERROR(EIO);
+			break;
+		}
+
+		n -= nbytes;
+	}
+
+	int64_t nread = start_resid - n;
+	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
+	task_io_account_read(nread);
+out:
+	zfs_rangelock_exit(lr);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Write the bytes to a file.
+ *
+ *	IN:	ip	- inode of file to be written to.
+ *		uio	- structure supplying write location, range info,
+ *			  and data buffer.
+ *		ioflag	- O_APPEND flag set if in append mode.
+ *			  O_DIRECT flag; used to bypass page cache.
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	uio	- updated offset and range.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	ip - ctime|mtime updated if byte count > 0
+ */
+
+/* ARGSUSED */
+int
+zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
+{
+	int error = 0;
+	ssize_t start_resid = uio->uio_resid;
+
+	/*
+	 * Fasttrack empty write
+	 */
+	ssize_t n = start_resid;
+	if (n == 0)
+		return (0);
+
+	rlim64_t limit = uio->uio_limit;
+	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
+		limit = MAXOFFSET_T;
+
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	sa_bulk_attr_t bulk[4];
+	int count = 0;
+	uint64_t mtime[2], ctime[2];
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &zp->z_size, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, 8);
+
+	/*
+	 * Callers might not be able to detect properly that we are read-only,
+	 * so check it explicitly here.
+	 */
+	if (zfs_is_readonly(zfsvfs)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EROFS));
+	}
+
+	/*
+	 * If immutable or not appending then return EPERM
+	 */
+	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
+	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
+	    (uio->uio_loffset < zp->z_size))) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	/*
+	 * Validate file offset
+	 */
+	offset_t woff = ioflag & O_APPEND ? zp->z_size : uio->uio_loffset;
+	if (woff < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	int max_blksz = zfsvfs->z_max_blksz;
+	xuio_t *xuio = NULL;
+
+	/*
+	 * Pre-fault the pages to ensure slow (eg NFS) pages
+	 * don't hold up txg.
+	 * Skip this if uio contains loaned arc_buf.
+	 */
+#ifdef HAVE_UIO_ZEROCOPY
+	if ((uio->uio_extflg == UIO_XUIO) &&
+	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
+		xuio = (xuio_t *)uio;
+	else
+#endif
+		if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EFAULT));
+		}
+
+	/*
+	 * If in append mode, set the io offset pointer to eof.
+	 */
+	zfs_locked_range_t *lr;
+	if (ioflag & O_APPEND) {
+		/*
+		 * Obtain an appending range lock to guarantee file append
+		 * semantics.  We reset the write offset once we have the lock.
+		 */
+		lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
+		woff = lr->lr_offset;
+		if (lr->lr_length == UINT64_MAX) {
+			/*
+			 * We overlocked the file because this write will cause
+			 * the file block size to increase.
+			 * Note that zp_size cannot change with this lock held.
+			 */
+			woff = zp->z_size;
+		}
+		uio->uio_loffset = woff;
+	} else {
+		/*
+		 * Note that if the file block size will change as a result of
+		 * this write, then this range lock will lock the entire file
+		 * so that we can re-write the block safely.
+		 */
+		lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
+	}
+
+	if (woff >= limit) {
+		zfs_rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EFBIG));
+	}
+
+	if ((woff + n) > limit || woff > (limit - n))
+		n = limit - woff;
+
+	/* Will this write extend the file length? */
+	int write_eof = (woff + n > zp->z_size);
+
+	uint64_t end_size = MAX(zp->z_size, woff + n);
+	zilog_t *zilog = zfsvfs->z_log;
+#ifdef HAVE_UIO_ZEROCOPY
+	int i_iov = 0;
+	const iovec_t *iovp = uio->uio_iov;
+	int iovcnt __maybe_unused = uio->uio_iovcnt;
+#endif
+
+
+	/*
+	 * Write the file in reasonable size chunks.  Each chunk is written
+	 * in a separate transaction; this keeps the intent log records small
+	 * and allows us to do more fine-grained space accounting.
+	 */
+	while (n > 0) {
+		woff = uio->uio_loffset;
+
+		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
+		    KUID_TO_SUID(ip->i_uid)) ||
+		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+		    KGID_TO_SGID(ip->i_gid)) ||
+		    (zp->z_projid != ZFS_DEFAULT_PROJID &&
+		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+		    zp->z_projid))) {
+			error = SET_ERROR(EDQUOT);
+			break;
+		}
+
+		arc_buf_t *abuf = NULL;
+		const iovec_t *aiov = NULL;
+		if (xuio) {
+#ifdef HAVE_UIO_ZEROCOPY
+			ASSERT(i_iov < iovcnt);
+			ASSERT3U(uio->uio_segflg, !=, UIO_BVEC);
+			aiov = &iovp[i_iov];
+			abuf = dmu_xuio_arcbuf(xuio, i_iov);
+			dmu_xuio_clear(xuio, i_iov);
+			ASSERT((aiov->iov_base == abuf->b_data) ||
+			    ((char *)aiov->iov_base - (char *)abuf->b_data +
+			    aiov->iov_len == arc_buf_size(abuf)));
+			i_iov++;
+#endif
+		} else if (n >= max_blksz && woff >= zp->z_size &&
+		    P2PHASE(woff, max_blksz) == 0 &&
+		    zp->z_blksz == max_blksz) {
+			/*
+			 * This write covers a full block.  "Borrow" a buffer
+			 * from the dmu so that we can fill it before we enter
+			 * a transaction.  This avoids the possibility of
+			 * holding up the transaction if the data copy hangs
+			 * up on a pagefault (e.g., from an NFS server mapping).
+			 */
+			size_t cbytes;
+
+			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+			    max_blksz);
+			ASSERT(abuf != NULL);
+			ASSERT(arc_buf_size(abuf) == max_blksz);
+			if ((error = uiocopy(abuf->b_data, max_blksz,
+			    UIO_WRITE, uio, &cbytes))) {
+				dmu_return_arcbuf(abuf);
+				break;
+			}
+			ASSERT(cbytes == max_blksz);
+		}
+
+		/*
+		 * Start a transaction.
+		 */
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+		dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+		DB_DNODE_ENTER(db);
+		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
+		    MIN(n, max_blksz));
+		DB_DNODE_EXIT(db);
+		zfs_sa_upgrade_txholds(tx, zp);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			if (abuf != NULL)
+				dmu_return_arcbuf(abuf);
+			break;
+		}
+
+		/*
+		 * If rangelock_enter() over-locked we grow the blocksize
+		 * and then reduce the lock range.  This will only happen
+		 * on the first iteration since rangelock_reduce() will
+		 * shrink down lr_length to the appropriate size.
+		 */
+		if (lr->lr_length == UINT64_MAX) {
+			uint64_t new_blksz;
+
+			if (zp->z_blksz > max_blksz) {
+				/*
+				 * File's blocksize is already larger than the
+				 * "recordsize" property.  Only let it grow to
+				 * the next power of 2.
+				 */
+				ASSERT(!ISP2(zp->z_blksz));
+				new_blksz = MIN(end_size,
+				    1 << highbit64(zp->z_blksz));
+			} else {
+				new_blksz = MIN(end_size, max_blksz);
+			}
+			zfs_grow_blocksize(zp, new_blksz, tx);
+			zfs_rangelock_reduce(lr, woff, n);
+		}
+
+		/*
+		 * XXX - should we really limit each write to z_max_blksz?
+		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+		 */
+		ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+
+		ssize_t tx_bytes;
+		if (abuf == NULL) {
+			tx_bytes = uio->uio_resid;
+			uio->uio_fault_disable = B_TRUE;
+			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, nbytes, tx);
+			uio->uio_fault_disable = B_FALSE;
+			if (error == EFAULT) {
+				dmu_tx_commit(tx);
+				/*
+				 * Account for partial writes before
+				 * continuing the loop.
+				 * Update needs to occur before the next
+				 * uio_prefaultpages, or prefaultpages may
+				 * error, and we may break the loop early.
+				 */
+				if (tx_bytes != uio->uio_resid)
+					n -= tx_bytes - uio->uio_resid;
+				if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+					break;
+				}
+				continue;
+			} else if (error != 0) {
+				dmu_tx_commit(tx);
+				break;
+			}
+			tx_bytes -= uio->uio_resid;
+		} else {
+			tx_bytes = nbytes;
+			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
+			/*
+			 * If this is not a full block write, but we are
+			 * extending the file past EOF and this data starts
+			 * block-aligned, use assign_arcbuf().  Otherwise,
+			 * write via dmu_write().
+			 */
+			if (tx_bytes < max_blksz && (!write_eof ||
+			    aiov->iov_base != abuf->b_data)) {
+				ASSERT(xuio);
+				dmu_write(zfsvfs->z_os, zp->z_id, woff,
+				    /* cppcheck-suppress nullPointer */
+				    aiov->iov_len, aiov->iov_base, tx);
+				dmu_return_arcbuf(abuf);
+				xuio_stat_wbuf_copied();
+			} else {
+				ASSERT(xuio || tx_bytes == max_blksz);
+				error = dmu_assign_arcbuf_by_dbuf(
+				    sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
+				if (error != 0) {
+					dmu_return_arcbuf(abuf);
+					dmu_tx_commit(tx);
+					break;
+				}
+			}
+			ASSERT(tx_bytes <= uio->uio_resid);
+			uioskip(uio, tx_bytes);
+		}
+		if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) {
+			update_pages(ip, woff,
+			    tx_bytes, zfsvfs->z_os, zp->z_id);
+		}
+
+		/*
+		 * If we made no progress, we're done.  If we made even
+		 * partial progress, update the znode and ZIL accordingly.
+		 */
+		if (tx_bytes == 0) {
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+			    (void *)&zp->z_size, sizeof (uint64_t), tx);
+			dmu_tx_commit(tx);
+			ASSERT(error != 0);
+			break;
+		}
+
+		/*
+		 * Clear Set-UID/Set-GID bits on successful write if not
+		 * privileged and at least one of the execute bits is set.
+		 *
+		 * It would be nice to do this after all writes have
+		 * been done, but that would still expose the ISUID/ISGID
+		 * to another app after the partial write is committed.
+		 *
+		 * Note: we don't call zfs_fuid_map_id() here because
+		 * user 0 is not an ephemeral uid.
+		 */
+		mutex_enter(&zp->z_acl_lock);
+		uint32_t uid = KUID_TO_SUID(ip->i_uid);
+		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
+		    (S_IXUSR >> 6))) != 0 &&
+		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
+		    secpolicy_vnode_setid_retain(cr,
+		    ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
+			uint64_t newmode;
+			zp->z_mode &= ~(S_ISUID | S_ISGID);
+			ip->i_mode = newmode = zp->z_mode;
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+			    (void *)&newmode, sizeof (uint64_t), tx);
+		}
+		mutex_exit(&zp->z_acl_lock);
+
+		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+
+		/*
+		 * Update the file size (zp_size) if it has changed;
+		 * account for possible concurrent updates.
+		 */
+		while ((end_size = zp->z_size) < uio->uio_loffset) {
+			(void) atomic_cas_64(&zp->z_size, end_size,
+			    uio->uio_loffset);
+			ASSERT(error == 0);
+		}
+		/*
+		 * If we are replaying and eof is non zero then force
+		 * the file size to the specified eof. Note, there's no
+		 * concurrency during replay.
+		 */
+		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+			zp->z_size = zfsvfs->z_replay_eof;
+
+		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
+		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
+		    NULL, NULL);
+		dmu_tx_commit(tx);
+
+		if (error != 0)
+			break;
+		ASSERT(tx_bytes == nbytes);
+		n -= nbytes;
+
+		if (!xuio && n > 0) {
+			if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+				error = EFAULT;
+				break;
+			}
+		}
+	}
+
+	zfs_inode_update(zp);
+	zfs_rangelock_exit(lr);
+
+	/*
+	 * If we're in replay mode, or we made no progress, return error.
+	 * Otherwise, it's at least a partial write, so it's successful.
+	 */
+	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (ioflag & (O_SYNC | O_DSYNC) ||
+	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, zp->z_id);
+
+	int64_t nwritten = start_resid - uio->uio_resid;
+	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
+	task_io_account_write(nwritten);
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Write the bytes to a file.
+ *
+ *	IN:	zp	- znode of file to be written to
+ *		data	- bytes to write
+ *		len	- number of bytes to write
+ *		pos	- offset to start writing at
+ *
+ *	OUT:	resid	- remaining bytes to write
+ *
+ *	RETURN:	0 if success
+ *		positive error code if failure
+ *
+ * Timestamps:
+ *	zp - ctime|mtime updated if byte count > 0
+ */
+int
+zfs_write_simple(znode_t *zp, const void *data, size_t len,
+    loff_t pos, size_t *resid)
+{
+	ssize_t written;
+	int error = 0;
+
+	written = zpl_write_common(ZTOI(zp), data, len, &pos,
+	    UIO_SYSSPACE, 0, kcred);
+	if (written < 0) {
+		error = -written;
+	} else if (resid == NULL) {
+		if (written < len)
+			error = SET_ERROR(EIO); /* short write */
+	} else {
+		*resid = len - written;
+	}
+	return (error);
+}
+
+/*
+ * Drop a reference on the passed inode asynchronously. This ensures
+ * that the caller will never drop the last reference on an inode in
+ * the current context. Doing so while holding open a tx could result
+ * in a deadlock if iput_final() re-enters the filesystem code.
+ */
+void
+zfs_zrele_async(znode_t *zp)
+{
+	struct inode *ip = ZTOI(zp);
+	objset_t *os = ITOZSB(ip)->z_os;
+
+	ASSERT(atomic_read(&ip->i_count) > 0);
+	ASSERT(os != NULL);
+
+	if (atomic_read(&ip->i_count) == 1)
+		VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
+		    (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID);
+	else
+		zrele(zp);
+}
+
+/* ARGSUSED */
+static void
+zfs_get_done(zgd_t *zgd, int error)
+{
+	znode_t *zp = zgd->zgd_private;
+
+	if (zgd->zgd_db)
+		dmu_buf_rele(zgd->zgd_db, zgd);
+
+	zfs_rangelock_exit(zgd->zgd_lr);
+
+	/*
+	 * Release the vnode asynchronously as we currently have the
+	 * txg stopped from syncing.
+	 */
+	zfs_zrele_async(zp);
+
+	kmem_free(zgd, sizeof (zgd_t));
+}
+
+#ifdef ZFS_DEBUG
+static int zil_fault_io = 0;
+#endif
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+	zfsvfs_t *zfsvfs = arg;
+	objset_t *os = zfsvfs->z_os;
+	znode_t *zp;
+	uint64_t object = lr->lr_foid;
+	uint64_t offset = lr->lr_offset;
+	uint64_t size = lr->lr_length;
+	dmu_buf_t *db;
+	zgd_t *zgd;
+	int error = 0;
+
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3P(zio, !=, NULL);
+	ASSERT3U(size, !=, 0);
+
+	/*
+	 * Nothing to do if the file has been removed
+	 */
+	if (zfs_zget(zfsvfs, object, &zp) != 0)
+		return (SET_ERROR(ENOENT));
+	if (zp->z_unlinked) {
+		/*
+		 * Release the vnode asynchronously as we currently have the
+		 * txg stopped from syncing.
+		 */
+		zfs_zrele_async(zp);
+		return (SET_ERROR(ENOENT));
+	}
+
+	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+	zgd->zgd_lwb = lwb;
+	zgd->zgd_private = zp;
+
+	/*
+	 * Write records come in two flavors: immediate and indirect.
+	 * For small writes it's cheaper to store the data with the
+	 * log record (immediate); for large writes it's cheaper to
+	 * sync the data and get a pointer to it (indirect) so that
+	 * we don't have to write the data twice.
+	 */
+	if (buf != NULL) { /* immediate write */
+		zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+		    offset, size, RL_READER);
+		/* test for truncation needs to be done while range locked */
+		if (offset >= zp->z_size) {
+			error = SET_ERROR(ENOENT);
+		} else {
+			error = dmu_read(os, object, offset, size, buf,
+			    DMU_READ_NO_PREFETCH);
+		}
+		ASSERT(error == 0 || error == ENOENT);
+	} else { /* indirect write */
+		/*
+		 * Have to lock the whole block to ensure when it's
+		 * written out and its checksum is being calculated
+		 * that no one can change the data. We need to re-check
+		 * blocksize after we get the lock in case it's changed!
+		 */
+		for (;;) {
+			uint64_t blkoff;
+			size = zp->z_blksz;
+			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+			offset -= blkoff;
+			zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+			    offset, size, RL_READER);
+			if (zp->z_blksz == size)
+				break;
+			offset += blkoff;
+			zfs_rangelock_exit(zgd->zgd_lr);
+		}
+		/* test for truncation needs to be done while range locked */
+		if (lr->lr_offset >= zp->z_size)
+			error = SET_ERROR(ENOENT);
+#ifdef ZFS_DEBUG
+		if (zil_fault_io) {
+			error = SET_ERROR(EIO);
+			zil_fault_io = 0;
+		}
+#endif
+		if (error == 0)
+			error = dmu_buf_hold(os, object, offset, zgd, &db,
+			    DMU_READ_NO_PREFETCH);
+
+		if (error == 0) {
+			blkptr_t *bp = &lr->lr_blkptr;
+
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
+
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    zfs_get_done, zgd);
+			ASSERT(error || lr->lr_length <= size);
+
+			/*
+			 * On success, we need to wait for the write I/O
+			 * initiated by dmu_sync() to complete before we can
+			 * release this dbuf.  We will finish everything up
+			 * in the zfs_get_done() callback.
+			 */
+			if (error == 0)
+				return (0);
+
+			if (error == EALREADY) {
+				lr->lr_common.lrc_txtype = TX_WRITE2;
+				/*
+				 * TX_WRITE2 relies on the data previously
+				 * written by the TX_WRITE that caused
+				 * EALREADY.  We zero out the BP because
+				 * it is the old, currently-on-disk BP.
+				 */
+				zgd->zgd_bp = NULL;
+				BP_ZERO(bp);
+				error = 0;
+			}
+		}
+	}
+
+	zfs_get_done(zgd, error);
+
+	return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_access(struct inode *ip, int mode, int flag, cred_t *cr)
+{
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (flag & V_ACE_MASK)
+		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
+	else
+		error = zfs_zaccess_rwx(zp, mode, flag, cr);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held inode reference for it.
+ *
+ *	IN:	zdp	- znode of directory to search.
+ *		nm	- name of entry to lookup.
+ *		flags	- LOOKUP_XATTR set if looking for an attribute.
+ *		cr	- credentials of caller.
+ *		direntflags - directory lookup flags
+ *		realpnp - returned pathname.
+ *
+ *	OUT:	zpp	- znode of located entry, NULL if not found.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	NA
+ */
+/* ARGSUSED */
+int
+zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags,
+    cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zdp);
+	int error = 0;
+
+	/*
+	 * Fast path lookup, however we must skip DNLC lookup
+	 * for case folding or normalizing lookups because the
+	 * DNLC code only stores the passed in name.  This means
+	 * creating 'a' and removing 'A' on a case insensitive
+	 * file system would work, but DNLC still thinks 'a'
+	 * exists and won't let you create it again on the next
+	 * pass through fast path.
+	 */
+	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
+
+		if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
+			return (SET_ERROR(ENOTDIR));
+		} else if (zdp->z_sa_hdl == NULL) {
+			return (SET_ERROR(EIO));
+		}
+
+		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
+			error = zfs_fastaccesschk_execute(zdp, cr);
+			if (!error) {
+				*zpp = zdp;
+				zhold(*zpp);
+				return (0);
+			}
+			return (error);
+		}
+	}
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zdp);
+
+	*zpp = NULL;
+
+	if (flags & LOOKUP_XATTR) {
+		/*
+		 * We don't allow recursive attributes..
+		 * Maybe someday we will.
+		 */
+		if (zdp->z_pflags & ZFS_XATTR) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EINVAL));
+		}
+
+		if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+
+		/*
+		 * Do we have permission to get into attribute directory?
+		 */
+
+		if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
+		    B_FALSE, cr))) {
+			zrele(*zpp);
+			*zpp = NULL;
+		}
+
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENOTDIR));
+	}
+
+	/*
+	 * Check accessibility of directory.
+	 */
+
+	if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
+	if ((error == 0) && (*zpp))
+		zfs_inode_update(*zpp);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory.  If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error.  Return the ip of the created or trunc'd file.
+ *
+ *	IN:	dzp	- znode of directory to put new file entry in.
+ *		name	- name of new file entry.
+ *		vap	- attributes of new file.
+ *		excl	- flag indicating exclusive or non-exclusive mode.
+ *		mode	- mode to open file with.
+ *		cr	- credentials of caller.
+ *		flag	- file flag.
+ *		vsecp	- ACL to be set
+ *
+ *	OUT:	zpp	- znode of created or trunc'd entry.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dzp - ctime|mtime updated if new entry created
+ *	 zp - ctime|mtime always, atime if new
+ */
+
+/* ARGSUSED */
+int
+zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
+    int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+	znode_t		*zp;
+	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
+	zilog_t		*zilog;
+	objset_t	*os;
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	int		error;
+	uid_t		uid;
+	gid_t		gid;
+	zfs_acl_ids_t   acl_ids;
+	boolean_t	fuid_dirtied;
+	boolean_t	have_acl = B_FALSE;
+	boolean_t	waited = B_FALSE;
+
+	/*
+	 * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure file system is at proper version
+	 */
+
+	gid = crgetgid(cr);
+	uid = crgetuid(cr);
+
+	if (zfsvfs->z_use_fuids == B_FALSE &&
+	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+		return (SET_ERROR(EINVAL));
+
+	if (name == NULL)
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	os = zfsvfs->z_os;
+	zilog = zfsvfs->z_log;
+
+	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	if (vap->va_mask & ATTR_XVATTR) {
+		if ((error = secpolicy_xvattr((xvattr_t *)vap,
+		    crgetuid(cr), cr, vap->va_mode)) != 0) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+top:
+	*zpp = NULL;
+	if (*name == '\0') {
+		/*
+		 * Null component name refers to the directory itself.
+		 */
+		zhold(dzp);
+		zp = dzp;
+		dl = NULL;
+		error = 0;
+	} else {
+		/* possible igrab(zp) */
+		int zflg = 0;
+
+		if (flag & FIGNORECASE)
+			zflg |= ZCILOOK;
+
+		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+		    NULL, NULL);
+		if (error) {
+			if (have_acl)
+				zfs_acl_ids_free(&acl_ids);
+			if (strcmp(name, "..") == 0)
+				error = SET_ERROR(EISDIR);
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	if (zp == NULL) {
+		uint64_t txtype;
+		uint64_t projid = ZFS_DEFAULT_PROJID;
+
+		/*
+		 * Create a new file object and update the directory
+		 * to reference it.
+		 */
+		if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+			if (have_acl)
+				zfs_acl_ids_free(&acl_ids);
+			goto out;
+		}
+
+		/*
+		 * We only support the creation of regular files in
+		 * extended attribute directories.
+		 */
+
+		if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
+			if (have_acl)
+				zfs_acl_ids_free(&acl_ids);
+			error = SET_ERROR(EINVAL);
+			goto out;
+		}
+
+		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
+		    cr, vsecp, &acl_ids)) != 0)
+			goto out;
+		have_acl = B_TRUE;
+
+		if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+			projid = zfs_inherit_projid(dzp);
+		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+			zfs_acl_ids_free(&acl_ids);
+			error = SET_ERROR(EDQUOT);
+			goto out;
+		}
+
+		tx = dmu_tx_create(os);
+
+		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+		    ZFS_SA_BASE_ATTR_SIZE);
+
+		fuid_dirtied = zfsvfs->z_fuid_dirty;
+		if (fuid_dirtied)
+			zfs_fuid_txhold(zfsvfs, tx);
+		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+		if (!zfsvfs->z_use_sa &&
+		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+			    0, acl_ids.z_aclp->z_acl_bytes);
+		}
+
+		error = dmu_tx_assign(tx,
+		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+		if (error) {
+			zfs_dirent_unlock(dl);
+			if (error == ERESTART) {
+				waited = B_TRUE;
+				dmu_tx_wait(tx);
+				dmu_tx_abort(tx);
+				goto top;
+			}
+			zfs_acl_ids_free(&acl_ids);
+			dmu_tx_abort(tx);
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+		error = zfs_link_create(dl, zp, tx, ZNEW);
+		if (error != 0) {
+			/*
+			 * Since, we failed to add the directory entry for it,
+			 * delete the newly created dnode.
+			 */
+			zfs_znode_delete(zp, tx);
+			remove_inode_hash(ZTOI(zp));
+			zfs_acl_ids_free(&acl_ids);
+			dmu_tx_commit(tx);
+			goto out;
+		}
+
+		if (fuid_dirtied)
+			zfs_fuid_sync(zfsvfs, tx);
+
+		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+		if (flag & FIGNORECASE)
+			txtype |= TX_CI;
+		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+		    vsecp, acl_ids.z_fuidp, vap);
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_commit(tx);
+	} else {
+		int aflags = (flag & O_APPEND) ? V_APPEND : 0;
+
+		if (have_acl)
+			zfs_acl_ids_free(&acl_ids);
+		have_acl = B_FALSE;
+
+		/*
+		 * A directory entry already exists for this name.
+		 */
+		/*
+		 * Can't truncate an existing file if in exclusive mode.
+		 */
+		if (excl) {
+			error = SET_ERROR(EEXIST);
+			goto out;
+		}
+		/*
+		 * Can't open a directory for writing.
+		 */
+		if (S_ISDIR(ZTOI(zp)->i_mode)) {
+			error = SET_ERROR(EISDIR);
+			goto out;
+		}
+		/*
+		 * Verify requested access to file.
+		 */
+		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
+			goto out;
+		}
+
+		mutex_enter(&dzp->z_lock);
+		dzp->z_seq++;
+		mutex_exit(&dzp->z_lock);
+
+		/*
+		 * Truncate regular files if requested.
+		 */
+		if (S_ISREG(ZTOI(zp)->i_mode) &&
+		    (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
+			/* we can't hold any locks when calling zfs_freesp() */
+			if (dl) {
+				zfs_dirent_unlock(dl);
+				dl = NULL;
+			}
+			error = zfs_freesp(zp, 0, 0, mode, TRUE);
+		}
+	}
+out:
+
+	if (dl)
+		zfs_dirent_unlock(dl);
+
+	if (error) {
+		if (zp)
+			zrele(zp);
+	} else {
+		zfs_inode_update(dzp);
+		zfs_inode_update(zp);
+		*zpp = zp;
+	}
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/* ARGSUSED */
+int
+zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
+    int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+	znode_t		*zp = NULL, *dzp = ITOZ(dip);
+	zfsvfs_t	*zfsvfs = ITOZSB(dip);
+	objset_t	*os;
+	dmu_tx_t	*tx;
+	int		error;
+	uid_t		uid;
+	gid_t		gid;
+	zfs_acl_ids_t   acl_ids;
+	uint64_t	projid = ZFS_DEFAULT_PROJID;
+	boolean_t	fuid_dirtied;
+	boolean_t	have_acl = B_FALSE;
+	boolean_t	waited = B_FALSE;
+
+	/*
+	 * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure file system is at proper version
+	 */
+
+	gid = crgetgid(cr);
+	uid = crgetuid(cr);
+
+	if (zfsvfs->z_use_fuids == B_FALSE &&
+	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	os = zfsvfs->z_os;
+
+	if (vap->va_mask & ATTR_XVATTR) {
+		if ((error = secpolicy_xvattr((xvattr_t *)vap,
+		    crgetuid(cr), cr, vap->va_mode)) != 0) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+top:
+	*ipp = NULL;
+
+	/*
+	 * Create a new file object and update the directory
+	 * to reference it.
+	 */
+	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		if (have_acl)
+			zfs_acl_ids_free(&acl_ids);
+		goto out;
+	}
+
+	if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
+	    cr, vsecp, &acl_ids)) != 0)
+		goto out;
+	have_acl = B_TRUE;
+
+	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+		projid = zfs_inherit_projid(dzp);
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+		zfs_acl_ids_free(&acl_ids);
+		error = SET_ERROR(EDQUOT);
+		goto out;
+	}
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	if (!zfsvfs->z_use_sa &&
+	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+		    0, acl_ids.z_aclp->z_acl_bytes);
+	}
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+	if (error) {
+		if (error == ERESTART) {
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	/* Add to unlinked set */
+	zp->z_unlinked = B_TRUE;
+	zfs_unlinked_add(zp, tx);
+	zfs_acl_ids_free(&acl_ids);
+	dmu_tx_commit(tx);
+out:
+
+	if (error) {
+		if (zp)
+			zrele(zp);
+	} else {
+		zfs_inode_update(dzp);
+		zfs_inode_update(zp);
+		*ipp = ZTOI(zp);
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ *	IN:	dzp	- znode of directory to remove entry from.
+ *		name	- name of entry to remove.
+ *		cr	- credentials of caller.
+ *		flags	- case flags.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	dzp - ctime|mtime
+ *	 ip - ctime (if nlink > 0)
+ */
+
+uint64_t null_xattr = 0;
+
+/*ARGSUSED*/
+int
+zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
+{
+	znode_t		*zp;
+	znode_t		*xzp;
+	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
+	zilog_t		*zilog;
+	uint64_t	acl_obj, xattr_obj;
+	uint64_t	xattr_obj_unlinked = 0;
+	uint64_t	obj = 0;
+	uint64_t	links;
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	boolean_t	may_delete_now, delete_now = FALSE;
+	boolean_t	unlinked, toobig = FALSE;
+	uint64_t	txtype;
+	pathname_t	*realnmp = NULL;
+	pathname_t	realnm;
+	int		error;
+	int		zflg = ZEXISTS;
+	boolean_t	waited = B_FALSE;
+
+	if (name == NULL)
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	zilog = zfsvfs->z_log;
+
+	if (flags & FIGNORECASE) {
+		zflg |= ZCILOOK;
+		pn_alloc(&realnm);
+		realnmp = &realnm;
+	}
+
+top:
+	xattr_obj = 0;
+	xzp = NULL;
+	/*
+	 * Attempt to lock directory; fail if entry doesn't exist.
+	 */
+	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+	    NULL, realnmp))) {
+		if (realnmp)
+			pn_free(realnmp);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+		goto out;
+	}
+
+	/*
+	 * Need to use rmdir for removing directories.
+	 */
+	if (S_ISDIR(ZTOI(zp)->i_mode)) {
+		error = SET_ERROR(EPERM);
+		goto out;
+	}
+
+	mutex_enter(&zp->z_lock);
+	may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
+	    !(zp->z_is_mapped);
+	mutex_exit(&zp->z_lock);
+
+	/*
+	 * We may delete the znode now, or we may put it in the unlinked set;
+	 * it depends on whether we're the last link, and on whether there are
+	 * other holds on the inode.  So we dmu_tx_hold() the right things to
+	 * allow for either case.
+	 */
+	obj = zp->z_id;
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	zfs_sa_upgrade_txholds(tx, dzp);
+	if (may_delete_now) {
+		toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
+		/* if the file is too big, only hold_free a token amount */
+		dmu_tx_hold_free(tx, zp->z_id, 0,
+		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
+	}
+
+	/* are there any extended attributes? */
+	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+	    &xattr_obj, sizeof (xattr_obj));
+	if (error == 0 && xattr_obj) {
+		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+		ASSERT0(error);
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+	}
+
+	mutex_enter(&zp->z_lock);
+	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
+		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+	mutex_exit(&zp->z_lock);
+
+	/* charge as an update -- would be nice not to charge at all */
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+	/*
+	 * Mark this transaction as typically resulting in a net free of space
+	 */
+	dmu_tx_mark_netfree(tx);
+
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+	if (error) {
+		zfs_dirent_unlock(dl);
+		if (error == ERESTART) {
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			zrele(zp);
+			if (xzp)
+				zrele(xzp);
+			goto top;
+		}
+		if (realnmp)
+			pn_free(realnmp);
+		dmu_tx_abort(tx);
+		zrele(zp);
+		if (xzp)
+			zrele(xzp);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Remove the directory entry.
+	 */
+	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
+
+	if (error) {
+		dmu_tx_commit(tx);
+		goto out;
+	}
+
+	if (unlinked) {
+		/*
+		 * Hold z_lock so that we can make sure that the ACL obj
+		 * hasn't changed.  Could have been deleted due to
+		 * zfs_sa_upgrade().
+		 */
+		mutex_enter(&zp->z_lock);
+		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
+		delete_now = may_delete_now && !toobig &&
+		    atomic_read(&ZTOI(zp)->i_count) == 1 &&
+		    !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked &&
+		    zfs_external_acl(zp) == acl_obj;
+	}
+
+	if (delete_now) {
+		if (xattr_obj_unlinked) {
+			ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
+			mutex_enter(&xzp->z_lock);
+			xzp->z_unlinked = B_TRUE;
+			clear_nlink(ZTOI(xzp));
+			links = 0;
+			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+			    &links, sizeof (links), tx);
+			ASSERT3U(error,  ==,  0);
+			mutex_exit(&xzp->z_lock);
+			zfs_unlinked_add(xzp, tx);
+
+			if (zp->z_is_sa)
+				error = sa_remove(zp->z_sa_hdl,
+				    SA_ZPL_XATTR(zfsvfs), tx);
+			else
+				error = sa_update(zp->z_sa_hdl,
+				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
+				    sizeof (uint64_t), tx);
+			ASSERT0(error);
+		}
+		/*
+		 * Add to the unlinked set because a new reference could be
+		 * taken concurrently resulting in a deferred destruction.
+		 */
+		zfs_unlinked_add(zp, tx);
+		mutex_exit(&zp->z_lock);
+	} else if (unlinked) {
+		mutex_exit(&zp->z_lock);
+		zfs_unlinked_add(zp, tx);
+	}
+
+	txtype = TX_REMOVE;
+	if (flags & FIGNORECASE)
+		txtype |= TX_CI;
+	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
+
+	dmu_tx_commit(tx);
+out:
+	if (realnmp)
+		pn_free(realnmp);
+
+	zfs_dirent_unlock(dl);
+	zfs_inode_update(dzp);
+	zfs_inode_update(zp);
+
+	if (delete_now)
+		zrele(zp);
+	else
+		zfs_zrele_async(zp);
+
+	if (xzp) {
+		zfs_inode_update(xzp);
+		zfs_zrele_async(xzp);
+	}
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Create a new directory and insert it into dzp using the name
+ * provided.  Return a pointer to the inserted directory.
+ *
+ *	IN:	dzp	- znode of directory to add subdir to.
+ *		dirname	- name of new directory.
+ *		vap	- attributes of new directory.
+ *		cr	- credentials of caller.
+ *		flags	- case flags.
+ *		vsecp	- ACL to be set
+ *
+ *	OUT:	zpp	- znode of created directory.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	dzp - ctime|mtime updated
+ *	zpp - ctime|mtime|atime updated
+ */
+/*ARGSUSED*/
+int
+zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
+    cred_t *cr, int flags, vsecattr_t *vsecp)
+{
+	znode_t		*zp;
+	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
+	zilog_t		*zilog;
+	zfs_dirlock_t	*dl;
+	uint64_t	txtype;
+	dmu_tx_t	*tx;
+	int		error;
+	int		zf = ZNEW;
+	uid_t		uid;
+	gid_t		gid = crgetgid(cr);
+	zfs_acl_ids_t   acl_ids;
+	boolean_t	fuid_dirtied;
+	boolean_t	waited = B_FALSE;
+
+	ASSERT(S_ISDIR(vap->va_mode));
+
+	/*
+	 * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure file system is at proper version
+	 */
+
+	uid = crgetuid(cr);
+	if (zfsvfs->z_use_fuids == B_FALSE &&
+	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+		return (SET_ERROR(EINVAL));
+
+	if (dirname == NULL)
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	zilog = zfsvfs->z_log;
+
+	if (dzp->z_pflags & ZFS_XATTR) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(dirname,
+	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+	if (flags & FIGNORECASE)
+		zf |= ZCILOOK;
+
+	if (vap->va_mask & ATTR_XVATTR) {
+		if ((error = secpolicy_xvattr((xvattr_t *)vap,
+		    crgetuid(cr), cr, vap->va_mode)) != 0) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
+	    vsecp, &acl_ids)) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	/*
+	 * First make sure the new directory doesn't exist.
+	 *
+	 * Existence is checked first to make sure we don't return
+	 * EACCES instead of EEXIST which can cause some applications
+	 * to fail.
+	 */
+top:
+	*zpp = NULL;
+
+	if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
+	    NULL, NULL))) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
+		zfs_acl_ids_free(&acl_ids);
+		zfs_dirent_unlock(dl);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
+		zfs_acl_ids_free(&acl_ids);
+		zfs_dirent_unlock(dl);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EDQUOT));
+	}
+
+	/*
+	 * Add a new entry to the directory.
+	 */
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+		    acl_ids.z_aclp->z_acl_bytes);
+	}
+
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
+
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+	if (error) {
+		zfs_dirent_unlock(dl);
+		if (error == ERESTART) {
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Create new node.
+	 */
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+	/*
+	 * Now put new name in parent dir.
+	 */
+	error = zfs_link_create(dl, zp, tx, ZNEW);
+	if (error != 0) {
+		zfs_znode_delete(zp, tx);
+		remove_inode_hash(ZTOI(zp));
+		goto out;
+	}
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	*zpp = zp;
+
+	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
+	if (flags & FIGNORECASE)
+		txtype |= TX_CI;
+	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
+	    acl_ids.z_fuidp, vap);
+
+out:
+	zfs_acl_ids_free(&acl_ids);
+
+	dmu_tx_commit(tx);
+
+	zfs_dirent_unlock(dl);
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	if (error != 0) {
+		zrele(zp);
+	} else {
+		zfs_inode_update(dzp);
+		zfs_inode_update(zp);
+	}
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Remove a directory subdir entry.  If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ *	IN:	dzp	- znode of directory to remove from.
+ *		name	- name of directory to be removed.
+ *		cwd	- inode of current working directory.
+ *		cr	- credentials of caller.
+ *		flags	- case flags
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dzp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
+    int flags)
+{
+	znode_t		*zp;
+	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
+	zilog_t		*zilog;
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	int		error;
+	int		zflg = ZEXISTS;
+	boolean_t	waited = B_FALSE;
+
+	if (name == NULL)
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	zilog = zfsvfs->z_log;
+
+	if (flags & FIGNORECASE)
+		zflg |= ZCILOOK;
+top:
+	zp = NULL;
+
+	/*
+	 * Attempt to lock directory; fail if entry doesn't exist.
+	 */
+	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+	    NULL, NULL))) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+		goto out;
+	}
+
+	if (!S_ISDIR(ZTOI(zp)->i_mode)) {
+		error = SET_ERROR(ENOTDIR);
+		goto out;
+	}
+
+	if (zp == cwd) {
+		error = SET_ERROR(EINVAL);
+		goto out;
+	}
+
+	/*
+	 * Grab a lock on the directory to make sure that no one is
+	 * trying to add (or lookup) entries while we are removing it.
+	 */
+	rw_enter(&zp->z_name_lock, RW_WRITER);
+
+	/*
+	 * Grab a lock on the parent pointer to make sure we play well
+	 * with the treewalk and directory rename code.
+	 */
+	rw_enter(&zp->z_parent_lock, RW_WRITER);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+	zfs_sa_upgrade_txholds(tx, zp);
+	zfs_sa_upgrade_txholds(tx, dzp);
+	dmu_tx_mark_netfree(tx);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+	if (error) {
+		rw_exit(&zp->z_parent_lock);
+		rw_exit(&zp->z_name_lock);
+		zfs_dirent_unlock(dl);
+		if (error == ERESTART) {
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			zrele(zp);
+			goto top;
+		}
+		dmu_tx_abort(tx);
+		zrele(zp);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
+
+	if (error == 0) {
+		uint64_t txtype = TX_RMDIR;
+		if (flags & FIGNORECASE)
+			txtype |= TX_CI;
+		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
+		    B_FALSE);
+	}
+
+	dmu_tx_commit(tx);
+
+	rw_exit(&zp->z_parent_lock);
+	rw_exit(&zp->z_name_lock);
+out:
+	zfs_dirent_unlock(dl);
+
+	zfs_inode_update(dzp);
+	zfs_inode_update(zp);
+	zrele(zp);
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Read directory entries from the given directory cursor position and emit
+ * name and position for each entry.
+ *
+ *	IN:	ip	- inode of directory to read.
+ *		ctx	- directory entry context.
+ *		cr	- credentials of caller.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	ip - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap is always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+int
+zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
+{
+	znode_t		*zp = ITOZ(ip);
+	zfsvfs_t	*zfsvfs = ITOZSB(ip);
+	objset_t	*os;
+	zap_cursor_t	zc;
+	zap_attribute_t	zap;
+	int		error;
+	uint8_t		prefetch;
+	uint8_t		type;
+	int		done = 0;
+	uint64_t	parent;
+	uint64_t	offset; /* must be unsigned; checks for < 1 */
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (parent))) != 0)
+		goto out;
+
+	/*
+	 * Quit if directory has been removed (posix)
+	 */
+	if (zp->z_unlinked)
+		goto out;
+
+	error = 0;
+	os = zfsvfs->z_os;
+	offset = ctx->pos;
+	prefetch = zp->z_zn_prefetch;
+
+	/*
+	 * Initialize the iterator cursor.
+	 */
+	if (offset <= 3) {
+		/*
+		 * Start iteration from the beginning of the directory.
+		 */
+		zap_cursor_init(&zc, os, zp->z_id);
+	} else {
+		/*
+		 * The offset is a serialized cursor.
+		 */
+		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
+	}
+
+	/*
+	 * Transform to file-system independent format
+	 */
+	while (!done) {
+		uint64_t objnum;
+		/*
+		 * Special case `.', `..', and `.zfs'.
+		 */
+		if (offset == 0) {
+			(void) strcpy(zap.za_name, ".");
+			zap.za_normalization_conflict = 0;
+			objnum = zp->z_id;
+			type = DT_DIR;
+		} else if (offset == 1) {
+			(void) strcpy(zap.za_name, "..");
+			zap.za_normalization_conflict = 0;
+			objnum = parent;
+			type = DT_DIR;
+		} else if (offset == 2 && zfs_show_ctldir(zp)) {
+			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+			zap.za_normalization_conflict = 0;
+			objnum = ZFSCTL_INO_ROOT;
+			type = DT_DIR;
+		} else {
+			/*
+			 * Grab next entry.
+			 */
+			if ((error = zap_cursor_retrieve(&zc, &zap))) {
+				if (error == ENOENT)
+					break;
+				else
+					goto update;
+			}
+
+			/*
+			 * Allow multiple entries provided the first entry is
+			 * the object id.  Non-zpl consumers may safely make
+			 * use of the additional space.
+			 *
+			 * XXX: This should be a feature flag for compatibility
+			 */
+			if (zap.za_integer_length != 8 ||
+			    zap.za_num_integers == 0) {
+				cmn_err(CE_WARN, "zap_readdir: bad directory "
+				    "entry, obj = %lld, offset = %lld, "
+				    "length = %d, num = %lld\n",
+				    (u_longlong_t)zp->z_id,
+				    (u_longlong_t)offset,
+				    zap.za_integer_length,
+				    (u_longlong_t)zap.za_num_integers);
+				error = SET_ERROR(ENXIO);
+				goto update;
+			}
+
+			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
+			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+		}
+
+		done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
+		    objnum, type);
+		if (done)
+			break;
+
+		/* Prefetch znode */
+		if (prefetch) {
+			dmu_prefetch(os, objnum, 0, 0, 0,
+			    ZIO_PRIORITY_SYNC_READ);
+		}
+
+		/*
+		 * Move to the next entry, fill in the previous offset.
+		 */
+		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+			zap_cursor_advance(&zc);
+			offset = zap_cursor_serialize(&zc);
+		} else {
+			offset += 1;
+		}
+		ctx->pos = offset;
+	}
+	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
+
+update:
+	zap_cursor_fini(&zc);
+	if (error == ENOENT)
+		error = 0;
+out:
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+ulong_t zfs_fsync_sync_cnt = 4;
+
+int
+zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+
+	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
+
+	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+		ZFS_ENTER(zfsvfs);
+		ZFS_VERIFY_ZP(zp);
+		zil_commit(zfsvfs->z_log, zp->z_id);
+		ZFS_EXIT(zfsvfs);
+	}
+	tsd_set(zfs_fsyncer_key, NULL);
+
+	return (0);
+}
+
+/*
+ * Get the basic file attributes and place them in the provided kstat
+ * structure.  The inode is assumed to be the authoritative source
+ * for most of the attributes.  However, the znode currently has the
+ * authoritative atime, blksize, and block count.
+ *
+ *	IN:	ip	- inode of file.
+ *
+ *	OUT:	sp	- kstat values.
+ *
+ *	RETURN:	0 (always succeeds)
+ */
+/* ARGSUSED */
+int
+zfs_getattr_fast(struct inode *ip, struct kstat *sp)
+{
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	uint32_t blksize;
+	u_longlong_t nblocks;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	mutex_enter(&zp->z_lock);
+
+	generic_fillattr(ip, sp);
+	/*
+	 * +1 link count for root inode with visible '.zfs' directory.
+	 */
+	if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
+		if (sp->nlink < ZFS_LINK_MAX)
+			sp->nlink++;
+
+	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+	sp->blksize = blksize;
+	sp->blocks = nblocks;
+
+	if (unlikely(zp->z_blksz == 0)) {
+		/*
+		 * Block size hasn't been set; suggest maximal I/O transfers.
+		 */
+		sp->blksize = zfsvfs->z_max_blksz;
+	}
+
+	mutex_exit(&zp->z_lock);
+
+	/*
+	 * Required to prevent NFS client from detecting different inode
+	 * numbers of snapshot root dentry before and after snapshot mount.
+	 */
+	if (zfsvfs->z_issnap) {
+		if (ip->i_sb->s_root->d_inode == ip)
+			sp->ino = ZFSCTL_INO_SNAPDIRS -
+			    dmu_objset_id(zfsvfs->z_os);
+	}
+
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * For the operation of changing file's user/group/project, we need to
+ * handle not only the main object that is assigned to the file directly,
+ * but also the ones that are used by the file via hidden xattr directory.
+ *
+ * Because the xattr directory may contains many EA entries, as to it may
+ * be impossible to change all of them via the transaction of changing the
+ * main object's user/group/project attributes. Then we have to change them
+ * via other multiple independent transactions one by one. It may be not good
+ * solution, but we have no better idea yet.
+ */
+static int
+zfs_setattr_dir(znode_t *dzp)
+{
+	struct inode	*dxip = ZTOI(dzp);
+	struct inode	*xip = NULL;
+	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
+	objset_t	*os = zfsvfs->z_os;
+	zap_cursor_t	zc;
+	zap_attribute_t	zap;
+	zfs_dirlock_t	*dl;
+	znode_t		*zp;
+	dmu_tx_t	*tx = NULL;
+	uint64_t	uid, gid;
+	sa_bulk_attr_t	bulk[4];
+	int		count;
+	int		err;
+
+	zap_cursor_init(&zc, os, dzp->z_id);
+	while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
+		count = 0;
+		if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
+			err = ENXIO;
+			break;
+		}
+
+		err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
+		    ZEXISTS, NULL, NULL);
+		if (err == ENOENT)
+			goto next;
+		if (err)
+			break;
+
+		xip = ZTOI(zp);
+		if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
+		    KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
+		    zp->z_projid == dzp->z_projid)
+			goto next;
+
+		tx = dmu_tx_create(os);
+		if (!(zp->z_pflags & ZFS_PROJID))
+			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+		else
+			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+
+		err = dmu_tx_assign(tx, TXG_WAIT);
+		if (err)
+			break;
+
+		mutex_enter(&dzp->z_lock);
+
+		if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
+			xip->i_uid = dxip->i_uid;
+			uid = zfs_uid_read(dxip);
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+			    &uid, sizeof (uid));
+		}
+
+		if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
+			xip->i_gid = dxip->i_gid;
+			gid = zfs_gid_read(dxip);
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+			    &gid, sizeof (gid));
+		}
+
+		if (zp->z_projid != dzp->z_projid) {
+			if (!(zp->z_pflags & ZFS_PROJID)) {
+				zp->z_pflags |= ZFS_PROJID;
+				SA_ADD_BULK_ATTR(bulk, count,
+				    SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
+				    sizeof (zp->z_pflags));
+			}
+
+			zp->z_projid = dzp->z_projid;
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
+			    NULL, &zp->z_projid, sizeof (zp->z_projid));
+		}
+
+		mutex_exit(&dzp->z_lock);
+
+		if (likely(count > 0)) {
+			err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+			dmu_tx_commit(tx);
+		} else {
+			dmu_tx_abort(tx);
+		}
+		tx = NULL;
+		if (err != 0 && err != ENOENT)
+			break;
+
+next:
+		if (zp) {
+			zrele(zp);
+			zp = NULL;
+			zfs_dirent_unlock(dl);
+		}
+		zap_cursor_advance(&zc);
+	}
+
+	if (tx)
+		dmu_tx_abort(tx);
+	if (zp) {
+		zrele(zp);
+		zfs_dirent_unlock(dl);
+	}
+	zap_cursor_fini(&zc);
+
+	return (err == ENOENT ? 0 : err);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ *	IN:	zp	- znode of file to be modified.
+ *		vap	- new attribute values.
+ *			  If ATTR_XVATTR set, then optional attrs are being set
+ *		flags	- ATTR_UTIME set if non-default time values provided.
+ *			- ATTR_NOACLCHECK (CIFS context only).
+ *		cr	- credentials of caller.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	ip - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+int
+zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
+{
+	struct inode	*ip;
+	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
+	objset_t	*os = zfsvfs->z_os;
+	zilog_t		*zilog;
+	dmu_tx_t	*tx;
+	vattr_t		oldva;
+	xvattr_t	*tmpxvattr;
+	uint_t		mask = vap->va_mask;
+	uint_t		saved_mask = 0;
+	int		trim_mask = 0;
+	uint64_t	new_mode;
+	uint64_t	new_kuid = 0, new_kgid = 0, new_uid, new_gid;
+	uint64_t	xattr_obj;
+	uint64_t	mtime[2], ctime[2], atime[2];
+	uint64_t	projid = ZFS_INVALID_PROJID;
+	znode_t		*attrzp;
+	int		need_policy = FALSE;
+	int		err, err2 = 0;
+	zfs_fuid_info_t *fuidp = NULL;
+	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
+	xoptattr_t	*xoap;
+	zfs_acl_t	*aclp;
+	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+	boolean_t	fuid_dirtied = B_FALSE;
+	boolean_t	handle_eadir = B_FALSE;
+	sa_bulk_attr_t	*bulk, *xattr_bulk;
+	int		count = 0, xattr_count = 0, bulks = 8;
+
+	if (mask == 0)
+		return (0);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+	ip = ZTOI(zp);
+
+	/*
+	 * If this is a xvattr_t, then get a pointer to the structure of
+	 * optional attributes.  If this is NULL, then we have a vattr_t.
+	 */
+	xoap = xva_getxoptattr(xvap);
+	if (xoap != NULL && (mask & ATTR_XVATTR)) {
+		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+			if (!dmu_objset_projectquota_enabled(os) ||
+			    (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
+				ZFS_EXIT(zfsvfs);
+				return (SET_ERROR(ENOTSUP));
+			}
+
+			projid = xoap->xoa_projid;
+			if (unlikely(projid == ZFS_INVALID_PROJID)) {
+				ZFS_EXIT(zfsvfs);
+				return (SET_ERROR(EINVAL));
+			}
+
+			if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
+				projid = ZFS_INVALID_PROJID;
+			else
+				need_policy = TRUE;
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
+		    (xoap->xoa_projinherit !=
+		    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
+		    (!dmu_objset_projectquota_enabled(os) ||
+		    (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(ENOTSUP));
+		}
+	}
+
+	zilog = zfsvfs->z_log;
+
+	/*
+	 * Make sure that if we have ephemeral uid/gid or xvattr specified
+	 * that file system is at proper version level
+	 */
+
+	if (zfsvfs->z_use_fuids == B_FALSE &&
+	    (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
+	    ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
+	    (mask & ATTR_XVATTR))) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EISDIR));
+	}
+
+	if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
+	xva_init(tmpxvattr);
+
+	bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
+	xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
+
+	/*
+	 * Immutable files can only alter immutable bit and atime
+	 */
+	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
+	    ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
+	    ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
+		err = SET_ERROR(EPERM);
+		goto out3;
+	}
+
+	if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
+		err = SET_ERROR(EPERM);
+		goto out3;
+	}
+
+	/*
+	 * Verify timestamps doesn't overflow 32 bits.
+	 * ZFS can handle large timestamps, but 32bit syscalls can't
+	 * handle times greater than 2039.  This check should be removed
+	 * once large timestamps are fully supported.
+	 */
+	if (mask & (ATTR_ATIME | ATTR_MTIME)) {
+		if (((mask & ATTR_ATIME) &&
+		    TIMESPEC_OVERFLOW(&vap->va_atime)) ||
+		    ((mask & ATTR_MTIME) &&
+		    TIMESPEC_OVERFLOW(&vap->va_mtime))) {
+			err = SET_ERROR(EOVERFLOW);
+			goto out3;
+		}
+	}
+
+top:
+	attrzp = NULL;
+	aclp = NULL;
+
+	/* Can this be moved to before the top label? */
+	if (zfs_is_readonly(zfsvfs)) {
+		err = SET_ERROR(EROFS);
+		goto out3;
+	}
+
+	/*
+	 * First validate permissions
+	 */
+
+	if (mask & ATTR_SIZE) {
+		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
+		if (err)
+			goto out3;
+
+		/*
+		 * XXX - Note, we are not providing any open
+		 * mode flags here (like FNDELAY), so we may
+		 * block if there are locks present... this
+		 * should be addressed in openat().
+		 */
+		/* XXX - would it be OK to generate a log record here? */
+		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
+		if (err)
+			goto out3;
+	}
+
+	if (mask & (ATTR_ATIME|ATTR_MTIME) ||
+	    ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
+	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
+	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
+	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
+	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
+	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
+	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
+		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
+		    skipaclchk, cr);
+	}
+
+	if (mask & (ATTR_UID|ATTR_GID)) {
+		int	idmask = (mask & (ATTR_UID|ATTR_GID));
+		int	take_owner;
+		int	take_group;
+
+		/*
+		 * NOTE: even if a new mode is being set,
+		 * we may clear S_ISUID/S_ISGID bits.
+		 */
+
+		if (!(mask & ATTR_MODE))
+			vap->va_mode = zp->z_mode;
+
+		/*
+		 * Take ownership or chgrp to group we are a member of
+		 */
+
+		take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
+		take_group = (mask & ATTR_GID) &&
+		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
+
+		/*
+		 * If both ATTR_UID and ATTR_GID are set then take_owner and
+		 * take_group must both be set in order to allow taking
+		 * ownership.
+		 *
+		 * Otherwise, send the check through secpolicy_vnode_setattr()
+		 *
+		 */
+
+		if (((idmask == (ATTR_UID|ATTR_GID)) &&
+		    take_owner && take_group) ||
+		    ((idmask == ATTR_UID) && take_owner) ||
+		    ((idmask == ATTR_GID) && take_group)) {
+			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
+			    skipaclchk, cr) == 0) {
+				/*
+				 * Remove setuid/setgid for non-privileged users
+				 */
+				(void) secpolicy_setid_clear(vap, cr);
+				trim_mask = (mask & (ATTR_UID|ATTR_GID));
+			} else {
+				need_policy =  TRUE;
+			}
+		} else {
+			need_policy =  TRUE;
+		}
+	}
+
+	mutex_enter(&zp->z_lock);
+	oldva.va_mode = zp->z_mode;
+	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
+	if (mask & ATTR_XVATTR) {
+		/*
+		 * Update xvattr mask to include only those attributes
+		 * that are actually changing.
+		 *
+		 * the bits will be restored prior to actually setting
+		 * the attributes so the caller thinks they were set.
+		 */
+		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+			if (xoap->xoa_appendonly !=
+			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
+				XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+			if (xoap->xoa_projinherit !=
+			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
+				XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+			if (xoap->xoa_nounlink !=
+			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
+				XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+			if (xoap->xoa_immutable !=
+			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
+				XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+			if (xoap->xoa_nodump !=
+			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_NODUMP);
+				XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+			if (xoap->xoa_av_modified !=
+			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
+				XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+			if ((!S_ISREG(ip->i_mode) &&
+			    xoap->xoa_av_quarantined) ||
+			    xoap->xoa_av_quarantined !=
+			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
+				XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+			mutex_exit(&zp->z_lock);
+			err = SET_ERROR(EPERM);
+			goto out3;
+		}
+
+		if (need_policy == FALSE &&
+		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
+		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
+			need_policy = TRUE;
+		}
+	}
+
+	mutex_exit(&zp->z_lock);
+
+	if (mask & ATTR_MODE) {
+		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
+			err = secpolicy_setid_setsticky_clear(ip, vap,
+			    &oldva, cr);
+			if (err)
+				goto out3;
+
+			trim_mask |= ATTR_MODE;
+		} else {
+			need_policy = TRUE;
+		}
+	}
+
+	if (need_policy) {
+		/*
+		 * If trim_mask is set then take ownership
+		 * has been granted or write_acl is present and user
+		 * has the ability to modify mode.  In that case remove
+		 * UID|GID and or MODE from mask so that
+		 * secpolicy_vnode_setattr() doesn't revoke it.
+		 */
+
+		if (trim_mask) {
+			saved_mask = vap->va_mask;
+			vap->va_mask &= ~trim_mask;
+		}
+		err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
+		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
+		if (err)
+			goto out3;
+
+		if (trim_mask)
+			vap->va_mask |= saved_mask;
+	}
+
+	/*
+	 * secpolicy_vnode_setattr, or take ownership may have
+	 * changed va_mask
+	 */
+	mask = vap->va_mask;
+
+	if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
+		handle_eadir = B_TRUE;
+		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+		    &xattr_obj, sizeof (xattr_obj));
+
+		if (err == 0 && xattr_obj) {
+			err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
+			if (err)
+				goto out2;
+		}
+		if (mask & ATTR_UID) {
+			new_kuid = zfs_fuid_create(zfsvfs,
+			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+			if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
+			    zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
+			    new_kuid)) {
+				if (attrzp)
+					zrele(attrzp);
+				err = SET_ERROR(EDQUOT);
+				goto out2;
+			}
+		}
+
+		if (mask & ATTR_GID) {
+			new_kgid = zfs_fuid_create(zfsvfs,
+			    (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
+			if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
+			    zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+			    new_kgid)) {
+				if (attrzp)
+					zrele(attrzp);
+				err = SET_ERROR(EDQUOT);
+				goto out2;
+			}
+		}
+
+		if (projid != ZFS_INVALID_PROJID &&
+		    zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
+			if (attrzp)
+				zrele(attrzp);
+			err = EDQUOT;
+			goto out2;
+		}
+	}
+	tx = dmu_tx_create(os);
+
+	if (mask & ATTR_MODE) {
+		uint64_t pmode = zp->z_mode;
+		uint64_t acl_obj;
+		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+		if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
+		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
+			err = EPERM;
+			goto out;
+		}
+
+		if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
+			goto out;
+
+		mutex_enter(&zp->z_lock);
+		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
+			/*
+			 * Are we upgrading ACL from old V0 format
+			 * to V1 format?
+			 */
+			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+			    zfs_znode_acl_version(zp) ==
+			    ZFS_ACL_VERSION_INITIAL) {
+				dmu_tx_hold_free(tx, acl_obj, 0,
+				    DMU_OBJECT_END);
+				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+				    0, aclp->z_acl_bytes);
+			} else {
+				dmu_tx_hold_write(tx, acl_obj, 0,
+				    aclp->z_acl_bytes);
+			}
+		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+			    0, aclp->z_acl_bytes);
+		}
+		mutex_exit(&zp->z_lock);
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+	} else {
+		if (((mask & ATTR_XVATTR) &&
+		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
+		    (projid != ZFS_INVALID_PROJID &&
+		    !(zp->z_pflags & ZFS_PROJID)))
+			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+		else
+			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	}
+
+	if (attrzp) {
+		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
+	}
+
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+
+	zfs_sa_upgrade_txholds(tx, zp);
+
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err)
+		goto out;
+
+	count = 0;
+	/*
+	 * Set each attribute requested.
+	 * We group settings according to the locks they need to acquire.
+	 *
+	 * Note: you cannot set ctime directly, although it will be
+	 * updated as a side-effect of calling this function.
+	 */
+
+	if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
+		/*
+		 * For the existed object that is upgraded from old system,
+		 * its on-disk layout has no slot for the project ID attribute.
+		 * But quota accounting logic needs to access related slots by
+		 * offset directly. So we need to adjust old objects' layout
+		 * to make the project ID to some unified and fixed offset.
+		 */
+		if (attrzp)
+			err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
+		if (err == 0)
+			err = sa_add_projid(zp->z_sa_hdl, tx, projid);
+
+		if (unlikely(err == EEXIST))
+			err = 0;
+		else if (err != 0)
+			goto out;
+		else
+			projid = ZFS_INVALID_PROJID;
+	}
+
+	if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+		mutex_enter(&zp->z_acl_lock);
+	mutex_enter(&zp->z_lock);
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+
+	if (attrzp) {
+		if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+			mutex_enter(&attrzp->z_acl_lock);
+		mutex_enter(&attrzp->z_lock);
+		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+		    sizeof (attrzp->z_pflags));
+		if (projid != ZFS_INVALID_PROJID) {
+			attrzp->z_projid = projid;
+			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+			    SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
+			    sizeof (attrzp->z_projid));
+		}
+	}
+
+	if (mask & (ATTR_UID|ATTR_GID)) {
+
+		if (mask & ATTR_UID) {
+			ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
+			new_uid = zfs_uid_read(ZTOI(zp));
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+			    &new_uid, sizeof (new_uid));
+			if (attrzp) {
+				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+				    sizeof (new_uid));
+				ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
+			}
+		}
+
+		if (mask & ATTR_GID) {
+			ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
+			new_gid = zfs_gid_read(ZTOI(zp));
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+			    NULL, &new_gid, sizeof (new_gid));
+			if (attrzp) {
+				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+				    sizeof (new_gid));
+				ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
+			}
+		}
+		if (!(mask & ATTR_MODE)) {
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+			    NULL, &new_mode, sizeof (new_mode));
+			new_mode = zp->z_mode;
+		}
+		err = zfs_acl_chown_setattr(zp);
+		ASSERT(err == 0);
+		if (attrzp) {
+			err = zfs_acl_chown_setattr(attrzp);
+			ASSERT(err == 0);
+		}
+	}
+
+	if (mask & ATTR_MODE) {
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+		    &new_mode, sizeof (new_mode));
+		zp->z_mode = ZTOI(zp)->i_mode = new_mode;
+		ASSERT3P(aclp, !=, NULL);
+		err = zfs_aclset_common(zp, aclp, cr, tx);
+		ASSERT0(err);
+		if (zp->z_acl_cached)
+			zfs_acl_free(zp->z_acl_cached);
+		zp->z_acl_cached = aclp;
+		aclp = NULL;
+	}
+
+	if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
+		zp->z_atime_dirty = B_FALSE;
+		ZFS_TIME_ENCODE(&ip->i_atime, atime);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+		    &atime, sizeof (atime));
+	}
+
+	if (mask & (ATTR_MTIME | ATTR_SIZE)) {
+		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+		ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
+		    vap->va_mtime, ZTOI(zp));
+
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+		    mtime, sizeof (mtime));
+	}
+
+	if (mask & (ATTR_CTIME | ATTR_SIZE)) {
+		ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
+		ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime,
+		    ZTOI(zp));
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    ctime, sizeof (ctime));
+	}
+
+	if (projid != ZFS_INVALID_PROJID) {
+		zp->z_projid = projid;
+		SA_ADD_BULK_ATTR(bulk, count,
+		    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
+		    sizeof (zp->z_projid));
+	}
+
+	if (attrzp && mask) {
+		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+		    SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
+		    sizeof (ctime));
+	}
+
+	/*
+	 * Do this after setting timestamps to prevent timestamp
+	 * update from toggling bit
+	 */
+
+	if (xoap && (mask & ATTR_XVATTR)) {
+
+		/*
+		 * restore trimmed off masks
+		 * so that return masks can be set for caller.
+		 */
+
+		if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
+			XVA_SET_REQ(xvap, XAT_APPENDONLY);
+		}
+		if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
+			XVA_SET_REQ(xvap, XAT_NOUNLINK);
+		}
+		if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
+			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
+		}
+		if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
+			XVA_SET_REQ(xvap, XAT_NODUMP);
+		}
+		if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
+			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
+		}
+		if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
+			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
+		}
+		if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
+			XVA_SET_REQ(xvap, XAT_PROJINHERIT);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+			ASSERT(S_ISREG(ip->i_mode));
+
+		zfs_xvattr_set(zp, xvap, tx);
+	}
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	if (mask != 0)
+		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
+
+	mutex_exit(&zp->z_lock);
+	if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+		mutex_exit(&zp->z_acl_lock);
+
+	if (attrzp) {
+		if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+			mutex_exit(&attrzp->z_acl_lock);
+		mutex_exit(&attrzp->z_lock);
+	}
+out:
+	if (err == 0 && xattr_count > 0) {
+		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
+		    xattr_count, tx);
+		ASSERT(err2 == 0);
+	}
+
+	if (aclp)
+		zfs_acl_free(aclp);
+
+	if (fuidp) {
+		zfs_fuid_info_free(fuidp);
+		fuidp = NULL;
+	}
+
+	if (err) {
+		dmu_tx_abort(tx);
+		if (attrzp)
+			zrele(attrzp);
+		if (err == ERESTART)
+			goto top;
+	} else {
+		if (count > 0)
+			err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		dmu_tx_commit(tx);
+		if (attrzp) {
+			if (err2 == 0 && handle_eadir)
+				err2 = zfs_setattr_dir(attrzp);
+			zrele(attrzp);
+		}
+		zfs_inode_update(zp);
+	}
+
+out2:
+	if (os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+out3:
+	kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
+	kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
+	kmem_free(tmpxvattr, sizeof (xvattr_t));
+	ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
+typedef struct zfs_zlock {
+	krwlock_t	*zl_rwlock;	/* lock we acquired */
+	znode_t		*zl_znode;	/* znode we held */
+	struct zfs_zlock *zl_next;	/* next in list */
+} zfs_zlock_t;
+
+/*
+ * Drop locks and release vnodes that were held by zfs_rename_lock().
+ */
+static void
+zfs_rename_unlock(zfs_zlock_t **zlpp)
+{
+	zfs_zlock_t *zl;
+
+	while ((zl = *zlpp) != NULL) {
+		if (zl->zl_znode != NULL)
+			zfs_zrele_async(zl->zl_znode);
+		rw_exit(zl->zl_rwlock);
+		*zlpp = zl->zl_next;
+		kmem_free(zl, sizeof (*zl));
+	}
+}
+
+/*
+ * Search back through the directory tree, using the ".." entries.
+ * Lock each directory in the chain to prevent concurrent renames.
+ * Fail any attempt to move a directory into one of its own descendants.
+ * XXX - z_parent_lock can overlap with map or grow locks
+ */
+static int
+zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
+{
+	zfs_zlock_t	*zl;
+	znode_t		*zp = tdzp;
+	uint64_t	rootid = ZTOZSB(zp)->z_root;
+	uint64_t	oidp = zp->z_id;
+	krwlock_t	*rwlp = &szp->z_parent_lock;
+	krw_t		rw = RW_WRITER;
+
+	/*
+	 * First pass write-locks szp and compares to zp->z_id.
+	 * Later passes read-lock zp and compare to zp->z_parent.
+	 */
+	do {
+		if (!rw_tryenter(rwlp, rw)) {
+			/*
+			 * Another thread is renaming in this path.
+			 * Note that if we are a WRITER, we don't have any
+			 * parent_locks held yet.
+			 */
+			if (rw == RW_READER && zp->z_id > szp->z_id) {
+				/*
+				 * Drop our locks and restart
+				 */
+				zfs_rename_unlock(&zl);
+				*zlpp = NULL;
+				zp = tdzp;
+				oidp = zp->z_id;
+				rwlp = &szp->z_parent_lock;
+				rw = RW_WRITER;
+				continue;
+			} else {
+				/*
+				 * Wait for other thread to drop its locks
+				 */
+				rw_enter(rwlp, rw);
+			}
+		}
+
+		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
+		zl->zl_rwlock = rwlp;
+		zl->zl_znode = NULL;
+		zl->zl_next = *zlpp;
+		*zlpp = zl;
+
+		if (oidp == szp->z_id)		/* We're a descendant of szp */
+			return (SET_ERROR(EINVAL));
+
+		if (oidp == rootid)		/* We've hit the top */
+			return (0);
+
+		if (rw == RW_READER) {		/* i.e. not the first pass */
+			int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
+			if (error)
+				return (error);
+			zl->zl_znode = zp;
+		}
+		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
+		    &oidp, sizeof (oidp));
+		rwlp = &zp->z_parent_lock;
+		rw = RW_READER;
+
+	} while (zp->z_id != sdzp->z_id);
+
+	return (0);
+}
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory.  Change the entry name as indicated.
+ *
+ *	IN:	sdzp	- Source directory containing the "old entry".
+ *		snm	- Old entry name.
+ *		tdzp	- Target directory to contain the "new entry".
+ *		tnm	- New entry name.
+ *		cr	- credentials of caller.
+ *		flags	- case flags
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	sdzp,tdzp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
+    cred_t *cr, int flags)
+{
+	znode_t		*szp, *tzp;
+	zfsvfs_t	*zfsvfs = ZTOZSB(sdzp);
+	zilog_t		*zilog;
+	zfs_dirlock_t	*sdl, *tdl;
+	dmu_tx_t	*tx;
+	zfs_zlock_t	*zl;
+	int		cmp, serr, terr;
+	int		error = 0;
+	int		zflg = 0;
+	boolean_t	waited = B_FALSE;
+
+	if (snm == NULL || tnm == NULL)
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(sdzp);
+	zilog = zfsvfs->z_log;
+
+	ZFS_VERIFY_ZP(tdzp);
+
+	/*
+	 * We check i_sb because snapshots and the ctldir must have different
+	 * super blocks.
+	 */
+	if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
+	    zfsctl_is_node(ZTOI(tdzp))) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EXDEV));
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(tnm,
+	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	if (flags & FIGNORECASE)
+		zflg |= ZCILOOK;
+
+top:
+	szp = NULL;
+	tzp = NULL;
+	zl = NULL;
+
+	/*
+	 * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into/outof an attribute directory.
+	 * See the comment in zfs_link() for why this is considered bad.
+	 */
+	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Lock source and target directory entries.  To prevent deadlock,
+	 * a lock ordering must be defined.  We lock the directory with
+	 * the smallest object id first, or if it's a tie, the one with
+	 * the lexically first name.
+	 */
+	if (sdzp->z_id < tdzp->z_id) {
+		cmp = -1;
+	} else if (sdzp->z_id > tdzp->z_id) {
+		cmp = 1;
+	} else {
+		/*
+		 * First compare the two name arguments without
+		 * considering any case folding.
+		 */
+		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
+
+		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
+		ASSERT(error == 0 || !zfsvfs->z_utf8);
+		if (cmp == 0) {
+			/*
+			 * POSIX: "If the old argument and the new argument
+			 * both refer to links to the same existing file,
+			 * the rename() function shall return successfully
+			 * and perform no other action."
+			 */
+			ZFS_EXIT(zfsvfs);
+			return (0);
+		}
+		/*
+		 * If the file system is case-folding, then we may
+		 * have some more checking to do.  A case-folding file
+		 * system is either supporting mixed case sensitivity
+		 * access or is completely case-insensitive.  Note
+		 * that the file system is always case preserving.
+		 *
+		 * In mixed sensitivity mode case sensitive behavior
+		 * is the default.  FIGNORECASE must be used to
+		 * explicitly request case insensitive behavior.
+		 *
+		 * If the source and target names provided differ only
+		 * by case (e.g., a request to rename 'tim' to 'Tim'),
+		 * we will treat this as a special case in the
+		 * case-insensitive mode: as long as the source name
+		 * is an exact match, we will allow this to proceed as
+		 * a name-change request.
+		 */
+		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
+		    flags & FIGNORECASE)) &&
+		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
+		    &error) == 0) {
+			/*
+			 * case preserving rename request, require exact
+			 * name matches
+			 */
+			zflg |= ZCIEXACT;
+			zflg &= ~ZCILOOK;
+		}
+	}
+
+	/*
+	 * If the source and destination directories are the same, we should
+	 * grab the z_name_lock of that directory only once.
+	 */
+	if (sdzp == tdzp) {
+		zflg |= ZHAVELOCK;
+		rw_enter(&sdzp->z_name_lock, RW_READER);
+	}
+
+	if (cmp < 0) {
+		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
+		    ZEXISTS | zflg, NULL, NULL);
+		terr = zfs_dirent_lock(&tdl,
+		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
+	} else {
+		terr = zfs_dirent_lock(&tdl,
+		    tdzp, tnm, &tzp, zflg, NULL, NULL);
+		serr = zfs_dirent_lock(&sdl,
+		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
+		    NULL, NULL);
+	}
+
+	if (serr) {
+		/*
+		 * Source entry invalid or not there.
+		 */
+		if (!terr) {
+			zfs_dirent_unlock(tdl);
+			if (tzp)
+				zrele(tzp);
+		}
+
+		if (sdzp == tdzp)
+			rw_exit(&sdzp->z_name_lock);
+
+		if (strcmp(snm, "..") == 0)
+			serr = EINVAL;
+		ZFS_EXIT(zfsvfs);
+		return (serr);
+	}
+	if (terr) {
+		zfs_dirent_unlock(sdl);
+		zrele(szp);
+
+		if (sdzp == tdzp)
+			rw_exit(&sdzp->z_name_lock);
+
+		if (strcmp(tnm, "..") == 0)
+			terr = EINVAL;
+		ZFS_EXIT(zfsvfs);
+		return (terr);
+	}
+
+	/*
+	 * If we are using project inheritance, means if the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
+	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
+	 * such case, we only allow renames into our tree when the project
+	 * IDs are the same.
+	 */
+	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+	    tdzp->z_projid != szp->z_projid) {
+		error = SET_ERROR(EXDEV);
+		goto out;
+	}
+
+	/*
+	 * Must have write access at the source to remove the old entry
+	 * and write access at the target to create the new entry.
+	 * Note that if target and source are the same, this can be
+	 * done in a single check.
+	 */
+
+	if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
+		goto out;
+
+	if (S_ISDIR(ZTOI(szp)->i_mode)) {
+		/*
+		 * Check to make sure rename is valid.
+		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+		 */
+		if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
+			goto out;
+	}
+
+	/*
+	 * Does target exist?
+	 */
+	if (tzp) {
+		/*
+		 * Source and target must be the same type.
+		 */
+		if (S_ISDIR(ZTOI(szp)->i_mode)) {
+			if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
+				error = SET_ERROR(ENOTDIR);
+				goto out;
+			}
+		} else {
+			if (S_ISDIR(ZTOI(tzp)->i_mode)) {
+				error = SET_ERROR(EISDIR);
+				goto out;
+			}
+		}
+		/*
+		 * POSIX dictates that when the source and target
+		 * entries refer to the same file object, rename
+		 * must do nothing and exit without error.
+		 */
+		if (szp->z_id == tzp->z_id) {
+			error = 0;
+			goto out;
+		}
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+	if (sdzp != tdzp) {
+		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, tdzp);
+	}
+	if (tzp) {
+		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, tzp);
+	}
+
+	zfs_sa_upgrade_txholds(tx, szp);
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+	if (error) {
+		if (zl != NULL)
+			zfs_rename_unlock(&zl);
+		zfs_dirent_unlock(sdl);
+		zfs_dirent_unlock(tdl);
+
+		if (sdzp == tdzp)
+			rw_exit(&sdzp->z_name_lock);
+
+		if (error == ERESTART) {
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			zrele(szp);
+			if (tzp)
+				zrele(tzp);
+			goto top;
+		}
+		dmu_tx_abort(tx);
+		zrele(szp);
+		if (tzp)
+			zrele(tzp);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (tzp)	/* Attempt to remove the existing target */
+		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+
+	if (error == 0) {
+		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+		if (error == 0) {
+			szp->z_pflags |= ZFS_AV_MODIFIED;
+			if (tdzp->z_pflags & ZFS_PROJINHERIT)
+				szp->z_pflags |= ZFS_PROJINHERIT;
+
+			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+			ASSERT0(error);
+
+			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+			if (error == 0) {
+				zfs_log_rename(zilog, tx, TX_RENAME |
+				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
+				    sdl->dl_name, tdzp, tdl->dl_name, szp);
+			} else {
+				/*
+				 * At this point, we have successfully created
+				 * the target name, but have failed to remove
+				 * the source name.  Since the create was done
+				 * with the ZRENAMING flag, there are
+				 * complications; for one, the link count is
+				 * wrong.  The easiest way to deal with this
+				 * is to remove the newly created target, and
+				 * return the original error.  This must
+				 * succeed; fortunately, it is very unlikely to
+				 * fail, since we just created it.
+				 */
+				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
+				    ZRENAMING, NULL), ==, 0);
+			}
+		} else {
+			/*
+			 * If we had removed the existing target, subsequent
+			 * call to zfs_link_create() to add back the same entry
+			 * but, the new dnode (szp) should not fail.
+			 */
+			ASSERT(tzp == NULL);
+		}
+	}
+
+	dmu_tx_commit(tx);
+out:
+	if (zl != NULL)
+		zfs_rename_unlock(&zl);
+
+	zfs_dirent_unlock(sdl);
+	zfs_dirent_unlock(tdl);
+
+	zfs_inode_update(sdzp);
+	if (sdzp == tdzp)
+		rw_exit(&sdzp->z_name_lock);
+
+	if (sdzp != tdzp)
+		zfs_inode_update(tdzp);
+
+	zfs_inode_update(szp);
+	zrele(szp);
+	if (tzp) {
+		zfs_inode_update(tzp);
+		zrele(tzp);
+	}
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ *	IN:	dzp	- Directory to contain new symbolic link.
+ *		name	- Name of directory entry in dip.
+ *		vap	- Attributes of new entry.
+ *		link	- Name for new symlink entry.
+ *		cr	- credentials of caller.
+ *		flags	- case flags
+ *
+ *	OUT:	zpp	- Znode for new symbolic link.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dip - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
+    znode_t **zpp, cred_t *cr, int flags)
+{
+	znode_t		*zp;
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
+	zilog_t		*zilog;
+	uint64_t	len = strlen(link);
+	int		error;
+	int		zflg = ZNEW;
+	zfs_acl_ids_t	acl_ids;
+	boolean_t	fuid_dirtied;
+	uint64_t	txtype = TX_SYMLINK;
+	boolean_t	waited = B_FALSE;
+
+	ASSERT(S_ISLNK(vap->va_mode));
+
+	if (name == NULL)
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	zilog = zfsvfs->z_log;
+
+	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+	if (flags & FIGNORECASE)
+		zflg |= ZCILOOK;
+
+	if (len > MAXPATHLEN) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENAMETOOLONG));
+	}
+
+	if ((error = zfs_acl_ids_create(dzp, 0,
+	    vap, cr, NULL, &acl_ids)) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+top:
+	*zpp = NULL;
+
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		zfs_acl_ids_free(&acl_ids);
+		zfs_dirent_unlock(dl);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
+		zfs_acl_ids_free(&acl_ids);
+		zfs_dirent_unlock(dl);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EDQUOT));
+	}
+	tx = dmu_tx_create(zfsvfs->z_os);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE + len);
+	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+		    acl_ids.z_aclp->z_acl_bytes);
+	}
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+	if (error) {
+		zfs_dirent_unlock(dl);
+		if (error == ERESTART) {
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Create a new object for the symlink.
+	 * for version 4 ZPL datsets the symlink will be an SA attribute
+	 */
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	mutex_enter(&zp->z_lock);
+	if (zp->z_is_sa)
+		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+		    link, len, tx);
+	else
+		zfs_sa_symlink(zp, link, len, tx);
+	mutex_exit(&zp->z_lock);
+
+	zp->z_size = len;
+	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+	    &zp->z_size, sizeof (zp->z_size), tx);
+	/*
+	 * Insert the new object into the directory.
+	 */
+	error = zfs_link_create(dl, zp, tx, ZNEW);
+	if (error != 0) {
+		zfs_znode_delete(zp, tx);
+		remove_inode_hash(ZTOI(zp));
+	} else {
+		if (flags & FIGNORECASE)
+			txtype |= TX_CI;
+		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+
+		zfs_inode_update(dzp);
+		zfs_inode_update(zp);
+	}
+
+	zfs_acl_ids_free(&acl_ids);
+
+	dmu_tx_commit(tx);
+
+	zfs_dirent_unlock(dl);
+
+	if (error == 0) {
+		*zpp = zp;
+
+		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+			zil_commit(zilog, 0);
+	} else {
+		zrele(zp);
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by ip.
+ *
+ *	IN:	ip	- inode of symbolic link
+ *		uio	- structure to contain the link path.
+ *		cr	- credentials of caller.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	ip - atime updated
+ */
+/* ARGSUSED */
+int
+zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr)
+{
+	znode_t		*zp = ITOZ(ip);
+	zfsvfs_t	*zfsvfs = ITOZSB(ip);
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	mutex_enter(&zp->z_lock);
+	if (zp->z_is_sa)
+		error = sa_lookup_uio(zp->z_sa_hdl,
+		    SA_ZPL_SYMLINK(zfsvfs), uio);
+	else
+		error = zfs_sa_readlink(zp, uio);
+	mutex_exit(&zp->z_lock);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Insert a new entry into directory tdzp referencing szp.
+ *
+ *	IN:	tdzp	- Directory to contain new entry.
+ *		szp	- znode of new entry.
+ *		name	- name of new entry.
+ *		cr	- credentials of caller.
+ *		flags	- case flags.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	tdzp - ctime|mtime updated
+ *	 szp - ctime updated
+ */
+/* ARGSUSED */
+int
+zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
+    int flags)
+{
+	struct inode *sip = ZTOI(szp);
+	znode_t		*tzp;
+	zfsvfs_t	*zfsvfs = ZTOZSB(tdzp);
+	zilog_t		*zilog;
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	int		error;
+	int		zf = ZNEW;
+	uint64_t	parent;
+	uid_t		owner;
+	boolean_t	waited = B_FALSE;
+	boolean_t	is_tmpfile = 0;
+	uint64_t	txg;
+#ifdef HAVE_TMPFILE
+	is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
+#endif
+	ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
+
+	if (name == NULL)
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(tdzp);
+	zilog = zfsvfs->z_log;
+
+	/*
+	 * POSIX dictates that we return EPERM here.
+	 * Better choices include ENOTSUP or EISDIR.
+	 */
+	if (S_ISDIR(sip->i_mode)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	ZFS_VERIFY_ZP(szp);
+
+	/*
+	 * If we are using project inheritance, means if the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
+	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
+	 * such case, we only allow hard link creation in our tree when the
+	 * project IDs are the same.
+	 */
+	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+	    tdzp->z_projid != szp->z_projid) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EXDEV));
+	}
+
+	/*
+	 * We check i_sb because snapshots and the ctldir must have different
+	 * super blocks.
+	 */
+	if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EXDEV));
+	}
+
+	/* Prevent links to .zfs/shares files */
+
+	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (uint64_t))) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	if (parent == zfsvfs->z_shares_dir) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(name,
+	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+	if (flags & FIGNORECASE)
+		zf |= ZCILOOK;
+
+	/*
+	 * We do not support links between attributes and non-attributes
+	 * because of the potential security risk of creating links
+	 * into "normal" file space in order to circumvent restrictions
+	 * imposed in attribute space.
+	 */
+	if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
+	    cr, ZFS_OWNER);
+	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+top:
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
+	if (error) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
+	if (is_tmpfile)
+		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+	zfs_sa_upgrade_txholds(tx, szp);
+	zfs_sa_upgrade_txholds(tx, tdzp);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+	if (error) {
+		zfs_dirent_unlock(dl);
+		if (error == ERESTART) {
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
+		dmu_tx_abort(tx);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	/* unmark z_unlinked so zfs_link_create will not reject */
+	if (is_tmpfile)
+		szp->z_unlinked = B_FALSE;
+	error = zfs_link_create(dl, szp, tx, 0);
+
+	if (error == 0) {
+		uint64_t txtype = TX_LINK;
+		/*
+		 * tmpfile is created to be in z_unlinkedobj, so remove it.
+		 * Also, we don't log in ZIL, because all previous file
+		 * operation on the tmpfile are ignored by ZIL. Instead we
+		 * always wait for txg to sync to make sure all previous
+		 * operation are sync safe.
+		 */
+		if (is_tmpfile) {
+			VERIFY(zap_remove_int(zfsvfs->z_os,
+			    zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
+		} else {
+			if (flags & FIGNORECASE)
+				txtype |= TX_CI;
+			zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
+		}
+	} else if (is_tmpfile) {
+		/* restore z_unlinked since when linking failed */
+		szp->z_unlinked = B_TRUE;
+	}
+	txg = dmu_tx_get_txg(tx);
+	dmu_tx_commit(tx);
+
+	zfs_dirent_unlock(dl);
+
+	if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
+		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
+
+	zfs_inode_update(tdzp);
+	zfs_inode_update(szp);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+static void
+zfs_putpage_commit_cb(void *arg)
+{
+	struct page *pp = arg;
+
+	ClearPageError(pp);
+	end_page_writeback(pp);
+}
+
+/*
+ * Push a page out to disk, once the page is on stable storage the
+ * registered commit callback will be run as notification of completion.
+ *
+ *	IN:	ip	- page mapped for inode.
+ *		pp	- page to push (page is locked)
+ *		wbc	- writeback control data
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	ip - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
+{
+	znode_t		*zp = ITOZ(ip);
+	zfsvfs_t	*zfsvfs = ITOZSB(ip);
+	loff_t		offset;
+	loff_t		pgoff;
+	unsigned int	pglen;
+	dmu_tx_t	*tx;
+	caddr_t		va;
+	int		err = 0;
+	uint64_t	mtime[2], ctime[2];
+	sa_bulk_attr_t	bulk[3];
+	int		cnt = 0;
+	struct address_space *mapping;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	ASSERT(PageLocked(pp));
+
+	pgoff = page_offset(pp);	/* Page byte-offset in file */
+	offset = i_size_read(ip);	/* File length in bytes */
+	pglen = MIN(PAGE_SIZE,		/* Page length in bytes */
+	    P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
+
+	/* Page is beyond end of file */
+	if (pgoff >= offset) {
+		unlock_page(pp);
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* Truncate page length to end of file */
+	if (pgoff + pglen > offset)
+		pglen = offset - pgoff;
+
+#if 0
+	/*
+	 * FIXME: Allow mmap writes past its quota.  The correct fix
+	 * is to register a page_mkwrite() handler to count the page
+	 * against its quota when it is about to be dirtied.
+	 */
+	if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
+	    KUID_TO_SUID(ip->i_uid)) ||
+	    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+	    KGID_TO_SGID(ip->i_gid)) ||
+	    (zp->z_projid != ZFS_DEFAULT_PROJID &&
+	    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+	    zp->z_projid))) {
+		err = EDQUOT;
+	}
+#endif
+
+	/*
+	 * The ordering here is critical and must adhere to the following
+	 * rules in order to avoid deadlocking in either zfs_read() or
+	 * zfs_free_range() due to a lock inversion.
+	 *
+	 * 1) The page must be unlocked prior to acquiring the range lock.
+	 *    This is critical because zfs_read() calls find_lock_page()
+	 *    which may block on the page lock while holding the range lock.
+	 *
+	 * 2) Before setting or clearing write back on a page the range lock
+	 *    must be held in order to prevent a lock inversion with the
+	 *    zfs_free_range() function.
+	 *
+	 * This presents a problem because upon entering this function the
+	 * page lock is already held.  To safely acquire the range lock the
+	 * page lock must be dropped.  This creates a window where another
+	 * process could truncate, invalidate, dirty, or write out the page.
+	 *
+	 * Therefore, after successfully reacquiring the range and page locks
+	 * the current page state is checked.  In the common case everything
+	 * will be as is expected and it can be written out.  However, if
+	 * the page state has changed it must be handled accordingly.
+	 */
+	mapping = pp->mapping;
+	redirty_page_for_writepage(wbc, pp);
+	unlock_page(pp);
+
+	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
+	    pgoff, pglen, RL_WRITER);
+	lock_page(pp);
+
+	/* Page mapping changed or it was no longer dirty, we're done */
+	if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
+		unlock_page(pp);
+		zfs_rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* Another process started write block if required */
+	if (PageWriteback(pp)) {
+		unlock_page(pp);
+		zfs_rangelock_exit(lr);
+
+		if (wbc->sync_mode != WB_SYNC_NONE) {
+			if (PageWriteback(pp))
+				wait_on_page_bit(pp, PG_writeback);
+		}
+
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* Clear the dirty flag the required locks are held */
+	if (!clear_page_dirty_for_io(pp)) {
+		unlock_page(pp);
+		zfs_rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/*
+	 * Counterpart for redirty_page_for_writepage() above.  This page
+	 * was in fact not skipped and should not be counted as if it were.
+	 */
+	wbc->pages_skipped--;
+	set_page_writeback(pp);
+	unlock_page(pp);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+
+	err = dmu_tx_assign(tx, TXG_NOWAIT);
+	if (err != 0) {
+		if (err == ERESTART)
+			dmu_tx_wait(tx);
+
+		dmu_tx_abort(tx);
+		__set_page_dirty_nobuffers(pp);
+		ClearPageError(pp);
+		end_page_writeback(pp);
+		zfs_rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (err);
+	}
+
+	va = kmap(pp);
+	ASSERT3U(pglen, <=, PAGE_SIZE);
+	dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
+	kunmap(pp);
+
+	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, 8);
+
+	/* Preserve the mtime and ctime provided by the inode */
+	ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
+	ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
+	zp->z_atime_dirty = B_FALSE;
+	zp->z_seq++;
+
+	err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
+
+	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
+	    zfs_putpage_commit_cb, pp);
+	dmu_tx_commit(tx);
+
+	zfs_rangelock_exit(lr);
+
+	if (wbc->sync_mode != WB_SYNC_NONE) {
+		/*
+		 * Note that this is rarely called under writepages(), because
+		 * writepages() normally handles the entire commit for
+		 * performance reasons.
+		 */
+		zil_commit(zfsvfs->z_log, zp->z_id);
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
+/*
+ * Update the system attributes when the inode has been dirtied.  For the
+ * moment we only update the mode, atime, mtime, and ctime.
+ */
+int
+zfs_dirty_inode(struct inode *ip, int flags)
+{
+	znode_t		*zp = ITOZ(ip);
+	zfsvfs_t	*zfsvfs = ITOZSB(ip);
+	dmu_tx_t	*tx;
+	uint64_t	mode, atime[2], mtime[2], ctime[2];
+	sa_bulk_attr_t	bulk[4];
+	int		error = 0;
+	int		cnt = 0;
+
+	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
+		return (0);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+#ifdef I_DIRTY_TIME
+	/*
+	 * This is the lazytime semantic introduced in Linux 4.0
+	 * This flag will only be called from update_time when lazytime is set.
+	 * (Note, I_DIRTY_SYNC will also set if not lazytime)
+	 * Fortunately mtime and ctime are managed within ZFS itself, so we
+	 * only need to dirty atime.
+	 */
+	if (flags == I_DIRTY_TIME) {
+		zp->z_atime_dirty = B_TRUE;
+		goto out;
+	}
+#endif
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		goto out;
+	}
+
+	mutex_enter(&zp->z_lock);
+	zp->z_atime_dirty = B_FALSE;
+
+	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
+	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+
+	/* Preserve the mode, mtime and ctime provided by the inode */
+	ZFS_TIME_ENCODE(&ip->i_atime, atime);
+	ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
+	ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
+	mode = ip->i_mode;
+
+	zp->z_mode = mode;
+
+	error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
+	mutex_exit(&zp->z_lock);
+
+	dmu_tx_commit(tx);
+out:
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_inactive(struct inode *ip)
+{
+	znode_t	*zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	uint64_t atime[2];
+	int error;
+	int need_unlock = 0;
+
+	/* Only read lock if we haven't already write locked, e.g. rollback */
+	if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
+		need_unlock = 1;
+		rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+	}
+	if (zp->z_sa_hdl == NULL) {
+		if (need_unlock)
+			rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		return;
+	}
+
+	if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, zp);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+		} else {
+			ZFS_TIME_ENCODE(&ip->i_atime, atime);
+			mutex_enter(&zp->z_lock);
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+			    (void *)&atime, sizeof (atime), tx);
+			zp->z_atime_dirty = B_FALSE;
+			mutex_exit(&zp->z_lock);
+			dmu_tx_commit(tx);
+		}
+	}
+
+	zfs_zinactive(zp);
+	if (need_unlock)
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+}
+
+/*
+ * Fill pages with data from the disk.
+ */
+static int
+zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
+{
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	objset_t *os;
+	struct page *cur_pp;
+	u_offset_t io_off, total;
+	size_t io_len;
+	loff_t i_size;
+	unsigned page_idx;
+	int err;
+
+	os = zfsvfs->z_os;
+	io_len = nr_pages << PAGE_SHIFT;
+	i_size = i_size_read(ip);
+	io_off = page_offset(pl[0]);
+
+	if (io_off + io_len > i_size)
+		io_len = i_size - io_off;
+
+	/*
+	 * Iterate over list of pages and read each page individually.
+	 */
+	page_idx = 0;
+	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
+		caddr_t va;
+
+		cur_pp = pl[page_idx++];
+		va = kmap(cur_pp);
+		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
+		    DMU_READ_PREFETCH);
+		kunmap(cur_pp);
+		if (err) {
+			/* convert checksum errors into IO errors */
+			if (err == ECKSUM)
+				err = SET_ERROR(EIO);
+			return (err);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Uses zfs_fillpage to read data from the file and fill the pages.
+ *
+ *	IN:	ip	 - inode of file to get data from.
+ *		pl	 - list of pages to read
+ *		nr_pages - number of pages to read
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	vp - atime updated
+ */
+/* ARGSUSED */
+int
+zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
+{
+	znode_t	 *zp  = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	int	 err;
+
+	if (pl == NULL)
+		return (0);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	err = zfs_fillpage(ip, pl, nr_pages);
+
+	ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
+/*
+ * Check ZFS specific permissions to memory map a section of a file.
+ *
+ *	IN:	ip	- inode of the file to mmap
+ *		off	- file offset
+ *		addrp	- start address in memory region
+ *		len	- length of memory region
+ *		vm_flags- address flags
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ */
+/*ARGSUSED*/
+int
+zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
+    unsigned long vm_flags)
+{
+	znode_t  *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if ((vm_flags & VM_WRITE) && (zp->z_pflags &
+	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if ((vm_flags & (VM_READ | VM_EXEC)) &&
+	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EACCES));
+	}
+
+	if (off < 0 || len > MAXOFFSET_T - off) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENXIO));
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Free or allocate space in a file.  Currently, this function only
+ * supports the `F_FREESP' command.  However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		cmd	- action to take (only F_FREESP supported).
+ *		bfp	- section of file to free/alloc.
+ *		flag	- current file open mode flags.
+ *		offset	- current file offset.
+ *		cr	- credentials of caller.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	zp - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
+    offset_t offset, cred_t *cr)
+{
+	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
+	uint64_t	off, len;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (cmd != F_FREESP) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Callers might not be able to detect properly that we are read-only,
+	 * so check it explicitly here.
+	 */
+	if (zfs_is_readonly(zfsvfs)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EROFS));
+	}
+
+	if (bfp->l_len < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Permissions aren't checked on Solaris because on this OS
+	 * zfs_space() can only be called with an opened file handle.
+	 * On Linux we can get here through truncate_range() which
+	 * operates directly on inodes, so we need to check access rights.
+	 */
+	if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	off = bfp->l_start;
+	len = bfp->l_len; /* 0 means from off to end of file */
+
+	error = zfs_freesp(zp, off, len, flag, TRUE);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_fid(struct inode *ip, fid_t *fidp)
+{
+	znode_t		*zp = ITOZ(ip);
+	zfsvfs_t	*zfsvfs = ITOZSB(ip);
+	uint32_t	gen;
+	uint64_t	gen64;
+	uint64_t	object = zp->z_id;
+	zfid_short_t	*zfid;
+	int		size, i, error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+	    &gen64, sizeof (uint64_t))) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	gen = (uint32_t)gen64;
+
+	size = SHORT_FID_LEN;
+
+	zfid = (zfid_short_t *)fidp;
+
+	zfid->zf_len = size;
+
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	/* Must have a non-zero generation number to distinguish from .zfs */
+	if (gen == 0)
+		gen = 1;
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*ARGSUSED*/
+int
+zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	int error;
+	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	int error;
+	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+	zilog_t	*zilog = zfsvfs->z_log;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+#ifdef HAVE_UIO_ZEROCOPY
+/*
+ * The smallest read we may consider to loan out an arcbuf.
+ * This must be a power of 2.
+ */
+int zcr_blksz_min = (1 << 10);	/* 1K */
+/*
+ * If set to less than the file block size, allow loaning out of an
+ * arcbuf for a partial block read.  This must be a power of 2.
+ */
+int zcr_blksz_max = (1 << 17);	/* 128K */
+
+/*ARGSUSED*/
+static int
+zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr)
+{
+	znode_t	*zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	int max_blksz = zfsvfs->z_max_blksz;
+	uio_t *uio = &xuio->xu_uio;
+	ssize_t size = uio->uio_resid;
+	offset_t offset = uio->uio_loffset;
+	int blksz;
+	int fullblk, i;
+	arc_buf_t *abuf;
+	ssize_t maxsize;
+	int preamble, postamble;
+
+	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+	switch (ioflag) {
+	case UIO_WRITE:
+		/*
+		 * Loan out an arc_buf for write if write size is bigger than
+		 * max_blksz, and the file's block size is also max_blksz.
+		 */
+		blksz = max_blksz;
+		if (size < blksz || zp->z_blksz != blksz) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EINVAL));
+		}
+		/*
+		 * Caller requests buffers for write before knowing where the
+		 * write offset might be (e.g. NFS TCP write).
+		 */
+		if (offset == -1) {
+			preamble = 0;
+		} else {
+			preamble = P2PHASE(offset, blksz);
+			if (preamble) {
+				preamble = blksz - preamble;
+				size -= preamble;
+			}
+		}
+
+		postamble = P2PHASE(size, blksz);
+		size -= postamble;
+
+		fullblk = size / blksz;
+		(void) dmu_xuio_init(xuio,
+		    (preamble != 0) + fullblk + (postamble != 0));
+
+		/*
+		 * Have to fix iov base/len for partial buffers.  They
+		 * currently represent full arc_buf's.
+		 */
+		if (preamble) {
+			/* data begins in the middle of the arc_buf */
+			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+			    blksz);
+			ASSERT(abuf);
+			(void) dmu_xuio_add(xuio, abuf,
+			    blksz - preamble, preamble);
+		}
+
+		for (i = 0; i < fullblk; i++) {
+			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+			    blksz);
+			ASSERT(abuf);
+			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
+		}
+
+		if (postamble) {
+			/* data ends in the middle of the arc_buf */
+			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+			    blksz);
+			ASSERT(abuf);
+			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
+		}
+		break;
+	case UIO_READ:
+		/*
+		 * Loan out an arc_buf for read if the read size is larger than
+		 * the current file block size.  Block alignment is not
+		 * considered.  Partial arc_buf will be loaned out for read.
+		 */
+		blksz = zp->z_blksz;
+		if (blksz < zcr_blksz_min)
+			blksz = zcr_blksz_min;
+		if (blksz > zcr_blksz_max)
+			blksz = zcr_blksz_max;
+		/* avoid potential complexity of dealing with it */
+		if (blksz > max_blksz) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EINVAL));
+		}
+
+		maxsize = zp->z_size - uio->uio_loffset;
+		if (size > maxsize)
+			size = maxsize;
+
+		if (size < blksz) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EINVAL));
+		}
+		break;
+	default:
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	uio->uio_extflg = UIO_XUIO;
+	XUIO_XUZC_RW(xuio) = ioflag;
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr)
+{
+	int i;
+	arc_buf_t *abuf;
+	int ioflag = XUIO_XUZC_RW(xuio);
+
+	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
+
+	i = dmu_xuio_cnt(xuio);
+	while (i-- > 0) {
+		abuf = dmu_xuio_arcbuf(xuio, i);
+		/*
+		 * if abuf == NULL, it must be a write buffer
+		 * that has been returned in zfs_write().
+		 */
+		if (abuf)
+			dmu_return_arcbuf(abuf);
+		ASSERT(abuf || ioflag == UIO_WRITE);
+	}
+
+	dmu_xuio_fini(xuio);
+	return (0);
+}
+#endif /* HAVE_UIO_ZEROCOPY */
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_open);
+EXPORT_SYMBOL(zfs_close);
+EXPORT_SYMBOL(zfs_read);
+EXPORT_SYMBOL(zfs_write);
+EXPORT_SYMBOL(zfs_access);
+EXPORT_SYMBOL(zfs_lookup);
+EXPORT_SYMBOL(zfs_create);
+EXPORT_SYMBOL(zfs_tmpfile);
+EXPORT_SYMBOL(zfs_remove);
+EXPORT_SYMBOL(zfs_mkdir);
+EXPORT_SYMBOL(zfs_rmdir);
+EXPORT_SYMBOL(zfs_readdir);
+EXPORT_SYMBOL(zfs_fsync);
+EXPORT_SYMBOL(zfs_getattr_fast);
+EXPORT_SYMBOL(zfs_setattr);
+EXPORT_SYMBOL(zfs_rename);
+EXPORT_SYMBOL(zfs_symlink);
+EXPORT_SYMBOL(zfs_readlink);
+EXPORT_SYMBOL(zfs_link);
+EXPORT_SYMBOL(zfs_inactive);
+EXPORT_SYMBOL(zfs_space);
+EXPORT_SYMBOL(zfs_fid);
+EXPORT_SYMBOL(zfs_getsecattr);
+EXPORT_SYMBOL(zfs_setsecattr);
+EXPORT_SYMBOL(zfs_getpage);
+EXPORT_SYMBOL(zfs_putpage);
+EXPORT_SYMBOL(zfs_dirty_inode);
+EXPORT_SYMBOL(zfs_map);
+
+/* BEGIN CSTYLED */
+module_param(zfs_delete_blocks, ulong, 0644);
+MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
+module_param(zfs_read_chunk_size, ulong, 0644);
+MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk");
+/* END CSTYLED */
+
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
new file mode 100644
index 000000000000..a542c662cb15
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
@@ -0,0 +1,2249 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/mntent.h>
+#include <sys/u8_textprep.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/atomic.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_rlock.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zpl.h>
+#endif /* _KERNEL */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfs_refcount.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+/*
+ * Functions needed for userland (ie: libzpool) are not put under
+ * #ifdef_KERNEL; the rest of the functions have dependencies
+ * (such as VFS logic) that will not compile easily in userland.
+ */
+#ifdef _KERNEL
+
+static kmem_cache_t *znode_cache = NULL;
+static kmem_cache_t *znode_hold_cache = NULL;
+unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
+
+/*
+ * This is used by the test suite so that it can delay znodes from being
+ * freed in order to inspect the unlinked set.
+ */
+int zfs_unlink_suspend_progress = 0;
+
+/*
+ * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
+ * z_rangelock. It will modify the offset and length of the lock to reflect
+ * znode-specific information, and convert RL_APPEND to RL_WRITER.  This is
+ * called with the rangelock_t's rl_lock held, which avoids races.
+ */
+static void
+zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
+{
+	znode_t *zp = arg;
+
+	/*
+	 * If in append mode, convert to writer and lock starting at the
+	 * current end of file.
+	 */
+	if (new->lr_type == RL_APPEND) {
+		new->lr_offset = zp->z_size;
+		new->lr_type = RL_WRITER;
+	}
+
+	/*
+	 * If we need to grow the block size then lock the whole file range.
+	 */
+	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+	    zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
+		new->lr_offset = 0;
+		new->lr_length = UINT64_MAX;
+	}
+}
+
+/*ARGSUSED*/
+static int
+zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
+{
+	znode_t *zp = buf;
+
+	inode_init_once(ZTOI(zp));
+	list_link_init(&zp->z_link_node);
+
+	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
+	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
+	rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
+	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+	rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
+
+	zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
+
+	zp->z_dirlocks = NULL;
+	zp->z_acl_cached = NULL;
+	zp->z_xattr_cached = NULL;
+	zp->z_xattr_parent = 0;
+	zp->z_moved = B_FALSE;
+	return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *arg)
+{
+	znode_t *zp = buf;
+
+	ASSERT(!list_link_active(&zp->z_link_node));
+	mutex_destroy(&zp->z_lock);
+	rw_destroy(&zp->z_parent_lock);
+	rw_destroy(&zp->z_name_lock);
+	mutex_destroy(&zp->z_acl_lock);
+	rw_destroy(&zp->z_xattr_lock);
+	zfs_rangelock_fini(&zp->z_rangelock);
+
+	ASSERT(zp->z_dirlocks == NULL);
+	ASSERT(zp->z_acl_cached == NULL);
+	ASSERT(zp->z_xattr_cached == NULL);
+}
+
+static int
+zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
+{
+	znode_hold_t *zh = buf;
+
+	mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
+	zfs_refcount_create(&zh->zh_refcount);
+	zh->zh_obj = ZFS_NO_OBJECT;
+
+	return (0);
+}
+
+static void
+zfs_znode_hold_cache_destructor(void *buf, void *arg)
+{
+	znode_hold_t *zh = buf;
+
+	mutex_destroy(&zh->zh_lock);
+	zfs_refcount_destroy(&zh->zh_refcount);
+}
+
+void
+zfs_znode_init(void)
+{
+	/*
+	 * Initialize zcache.  The KMC_SLAB hint is used in order that it be
+	 * backed by kmalloc() when on the Linux slab in order that any
+	 * wait_on_bit() operations on the related inode operate properly.
+	 */
+	ASSERT(znode_cache == NULL);
+	znode_cache = kmem_cache_create("zfs_znode_cache",
+	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
+	    zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
+
+	ASSERT(znode_hold_cache == NULL);
+	znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
+	    sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
+	    zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+zfs_znode_fini(void)
+{
+	/*
+	 * Cleanup zcache
+	 */
+	if (znode_cache)
+		kmem_cache_destroy(znode_cache);
+	znode_cache = NULL;
+
+	if (znode_hold_cache)
+		kmem_cache_destroy(znode_hold_cache);
+	znode_hold_cache = NULL;
+}
+
+/*
+ * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
+ * serialize access to a znode and its SA buffer while the object is being
+ * created or destroyed.  This kind of locking would normally reside in the
+ * znode itself but in this case that's impossible because the znode and SA
+ * buffer may not yet exist.  Therefore the locking is handled externally
+ * with an array of mutexs and AVLs trees which contain per-object locks.
+ *
+ * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
+ * in to the correct AVL tree and finally the per-object lock is held.  In
+ * zfs_znode_hold_exit() the process is reversed.  The per-object lock is
+ * released, removed from the AVL tree and destroyed if there are no waiters.
+ *
+ * This scheme has two important properties:
+ *
+ * 1) No memory allocations are performed while holding one of the z_hold_locks.
+ *    This ensures evict(), which can be called from direct memory reclaim, will
+ *    never block waiting on a z_hold_locks which just happens to have hashed
+ *    to the same index.
+ *
+ * 2) All locks used to serialize access to an object are per-object and never
+ *    shared.  This minimizes lock contention without creating a large number
+ *    of dedicated locks.
+ *
+ * On the downside it does require znode_lock_t structures to be frequently
+ * allocated and freed.  However, because these are backed by a kmem cache
+ * and very short lived this cost is minimal.
+ */
+int
+zfs_znode_hold_compare(const void *a, const void *b)
+{
+	const znode_hold_t *zh_a = (const znode_hold_t *)a;
+	const znode_hold_t *zh_b = (const znode_hold_t *)b;
+
+	return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
+}
+
+static boolean_t __maybe_unused
+zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
+{
+	znode_hold_t *zh, search;
+	int i = ZFS_OBJ_HASH(zfsvfs, obj);
+	boolean_t held;
+
+	search.zh_obj = obj;
+
+	mutex_enter(&zfsvfs->z_hold_locks[i]);
+	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
+	held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
+	mutex_exit(&zfsvfs->z_hold_locks[i]);
+
+	return (held);
+}
+
+static znode_hold_t *
+zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
+{
+	znode_hold_t *zh, *zh_new, search;
+	int i = ZFS_OBJ_HASH(zfsvfs, obj);
+	boolean_t found = B_FALSE;
+
+	zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
+	zh_new->zh_obj = obj;
+	search.zh_obj = obj;
+
+	mutex_enter(&zfsvfs->z_hold_locks[i]);
+	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
+	if (likely(zh == NULL)) {
+		zh = zh_new;
+		avl_add(&zfsvfs->z_hold_trees[i], zh);
+	} else {
+		ASSERT3U(zh->zh_obj, ==, obj);
+		found = B_TRUE;
+	}
+	zfs_refcount_add(&zh->zh_refcount, NULL);
+	mutex_exit(&zfsvfs->z_hold_locks[i]);
+
+	if (found == B_TRUE)
+		kmem_cache_free(znode_hold_cache, zh_new);
+
+	ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
+	ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
+	mutex_enter(&zh->zh_lock);
+
+	return (zh);
+}
+
+static void
+zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
+{
+	int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
+	boolean_t remove = B_FALSE;
+
+	ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
+	ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
+	mutex_exit(&zh->zh_lock);
+
+	mutex_enter(&zfsvfs->z_hold_locks[i]);
+	if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) {
+		avl_remove(&zfsvfs->z_hold_trees[i], zh);
+		remove = B_TRUE;
+	}
+	mutex_exit(&zfsvfs->z_hold_locks[i]);
+
+	if (remove == B_TRUE)
+		kmem_cache_free(znode_hold_cache, zh);
+}
+
+dev_t
+zfs_cmpldev(uint64_t dev)
+{
+	return (dev);
+}
+
+static void
+zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
+    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
+{
+	ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
+
+	mutex_enter(&zp->z_lock);
+
+	ASSERT(zp->z_sa_hdl == NULL);
+	ASSERT(zp->z_acl_cached == NULL);
+	if (sa_hdl == NULL) {
+		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+		    SA_HDL_SHARED, &zp->z_sa_hdl));
+	} else {
+		zp->z_sa_hdl = sa_hdl;
+		sa_set_userp(sa_hdl, zp);
+	}
+
+	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
+
+	mutex_exit(&zp->z_lock);
+}
+
+void
+zfs_znode_dmu_fini(znode_t *zp)
+{
+	ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked ||
+	    RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
+
+	sa_handle_destroy(zp->z_sa_hdl);
+	zp->z_sa_hdl = NULL;
+}
+
+/*
+ * Called by new_inode() to allocate a new inode.
+ */
+int
+zfs_inode_alloc(struct super_block *sb, struct inode **ip)
+{
+	znode_t *zp;
+
+	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+	*ip = ZTOI(zp);
+
+	return (0);
+}
+
+/*
+ * Called in multiple places when an inode should be destroyed.
+ */
+void
+zfs_inode_destroy(struct inode *ip)
+{
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	if (list_link_active(&zp->z_link_node)) {
+		list_remove(&zfsvfs->z_all_znodes, zp);
+		zfsvfs->z_nr_znodes--;
+	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	if (zp->z_acl_cached) {
+		zfs_acl_free(zp->z_acl_cached);
+		zp->z_acl_cached = NULL;
+	}
+
+	if (zp->z_xattr_cached) {
+		nvlist_free(zp->z_xattr_cached);
+		zp->z_xattr_cached = NULL;
+	}
+
+	kmem_cache_free(znode_cache, zp);
+}
+
+static void
+zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
+{
+	uint64_t rdev = 0;
+
+	switch (ip->i_mode & S_IFMT) {
+	case S_IFREG:
+		ip->i_op = &zpl_inode_operations;
+		ip->i_fop = &zpl_file_operations;
+		ip->i_mapping->a_ops = &zpl_address_space_operations;
+		break;
+
+	case S_IFDIR:
+		ip->i_op = &zpl_dir_inode_operations;
+		ip->i_fop = &zpl_dir_file_operations;
+		ITOZ(ip)->z_zn_prefetch = B_TRUE;
+		break;
+
+	case S_IFLNK:
+		ip->i_op = &zpl_symlink_inode_operations;
+		break;
+
+	/*
+	 * rdev is only stored in a SA only for device files.
+	 */
+	case S_IFCHR:
+	case S_IFBLK:
+		(void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
+		    sizeof (rdev));
+		/*FALLTHROUGH*/
+	case S_IFIFO:
+	case S_IFSOCK:
+		init_special_inode(ip, ip->i_mode, rdev);
+		ip->i_op = &zpl_special_inode_operations;
+		break;
+
+	default:
+		zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
+		    (u_longlong_t)ip->i_ino, ip->i_mode);
+
+		/* Assume the inode is a file and attempt to continue */
+		ip->i_mode = S_IFREG | 0644;
+		ip->i_op = &zpl_inode_operations;
+		ip->i_fop = &zpl_file_operations;
+		ip->i_mapping->a_ops = &zpl_address_space_operations;
+		break;
+	}
+}
+
+static void
+zfs_set_inode_flags(znode_t *zp, struct inode *ip)
+{
+	/*
+	 * Linux and Solaris have different sets of file attributes, so we
+	 * restrict this conversion to the intersection of the two.
+	 */
+#ifdef HAVE_INODE_SET_FLAGS
+	unsigned int flags = 0;
+	if (zp->z_pflags & ZFS_IMMUTABLE)
+		flags |= S_IMMUTABLE;
+	if (zp->z_pflags & ZFS_APPENDONLY)
+		flags |= S_APPEND;
+
+	inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
+#else
+	if (zp->z_pflags & ZFS_IMMUTABLE)
+		ip->i_flags |= S_IMMUTABLE;
+	else
+		ip->i_flags &= ~S_IMMUTABLE;
+
+	if (zp->z_pflags & ZFS_APPENDONLY)
+		ip->i_flags |= S_APPEND;
+	else
+		ip->i_flags &= ~S_APPEND;
+#endif
+}
+
+/*
+ * Update the embedded inode given the znode.  We should work toward
+ * eliminating this function as soon as possible by removing values
+ * which are duplicated between the znode and inode.  If the generic
+ * inode has the correct field it should be used, and the ZFS code
+ * updated to access the inode.  This can be done incrementally.
+ */
+void
+zfs_inode_update(znode_t *zp)
+{
+	zfsvfs_t	*zfsvfs;
+	struct inode	*ip;
+	uint32_t	blksize;
+	u_longlong_t	i_blocks;
+
+	ASSERT(zp != NULL);
+	zfsvfs = ZTOZSB(zp);
+	ip = ZTOI(zp);
+
+	/* Skip .zfs control nodes which do not exist on disk. */
+	if (zfsctl_is_node(ip))
+		return;
+
+	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
+
+	spin_lock(&ip->i_lock);
+	ip->i_blocks = i_blocks;
+	i_size_write(ip, zp->z_size);
+	spin_unlock(&ip->i_lock);
+}
+
+
+/*
+ * Construct a znode+inode and initialize.
+ *
+ * This does not do a call to dmu_set_user() that is
+ * up to the caller to do, in case you don't want to
+ * return the znode
+ */
+static znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
+    dmu_object_type_t obj_type, sa_handle_t *hdl)
+{
+	znode_t	*zp;
+	struct inode *ip;
+	uint64_t mode;
+	uint64_t parent;
+	uint64_t tmp_gen;
+	uint64_t links;
+	uint64_t z_uid, z_gid;
+	uint64_t atime[2], mtime[2], ctime[2];
+	uint64_t projid = ZFS_DEFAULT_PROJID;
+	sa_bulk_attr_t bulk[11];
+	int count = 0;
+
+	ASSERT(zfsvfs != NULL);
+
+	ip = new_inode(zfsvfs->z_sb);
+	if (ip == NULL)
+		return (NULL);
+
+	zp = ITOZ(ip);
+	ASSERT(zp->z_dirlocks == NULL);
+	ASSERT3P(zp->z_acl_cached, ==, NULL);
+	ASSERT3P(zp->z_xattr_cached, ==, NULL);
+	zp->z_unlinked = B_FALSE;
+	zp->z_atime_dirty = B_FALSE;
+	zp->z_moved = B_FALSE;
+	zp->z_is_mapped = B_FALSE;
+	zp->z_is_ctldir = B_FALSE;
+	zp->z_is_stale = B_FALSE;
+	zp->z_suspended = B_FALSE;
+	zp->z_sa_hdl = NULL;
+	zp->z_mapcnt = 0;
+	zp->z_id = db->db_object;
+	zp->z_blksz = blksz;
+	zp->z_seq = 0x7A4653;
+	zp->z_sync_cnt = 0;
+
+	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &zp->z_size, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+	    &parent, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+
+	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
+	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+	    (zp->z_pflags & ZFS_PROJID) &&
+	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
+		if (hdl == NULL)
+			sa_handle_destroy(zp->z_sa_hdl);
+		zp->z_sa_hdl = NULL;
+		goto error;
+	}
+
+	zp->z_projid = projid;
+	zp->z_mode = ip->i_mode = mode;
+	ip->i_generation = (uint32_t)tmp_gen;
+	ip->i_blkbits = SPA_MINBLOCKSHIFT;
+	set_nlink(ip, (uint32_t)links);
+	zfs_uid_write(ip, z_uid);
+	zfs_gid_write(ip, z_gid);
+	zfs_set_inode_flags(zp, ip);
+
+	/* Cache the xattr parent id */
+	if (zp->z_pflags & ZFS_XATTR)
+		zp->z_xattr_parent = parent;
+
+	ZFS_TIME_DECODE(&ip->i_atime, atime);
+	ZFS_TIME_DECODE(&ip->i_mtime, mtime);
+	ZFS_TIME_DECODE(&ip->i_ctime, ctime);
+
+	ip->i_ino = zp->z_id;
+	zfs_inode_update(zp);
+	zfs_inode_set_ops(zfsvfs, ip);
+
+	/*
+	 * The only way insert_inode_locked() can fail is if the ip->i_ino
+	 * number is already hashed for this super block.  This can never
+	 * happen because the inode numbers map 1:1 with the object numbers.
+	 *
+	 * The one exception is rolling back a mounted file system, but in
+	 * this case all the active inode are unhashed during the rollback.
+	 */
+	VERIFY3S(insert_inode_locked(ip), ==, 0);
+
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	list_insert_tail(&zfsvfs->z_all_znodes, zp);
+	zfsvfs->z_nr_znodes++;
+	membar_producer();
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	unlock_new_inode(ip);
+	return (zp);
+
+error:
+	iput(ip);
+	return (NULL);
+}
+
+/*
+ * Safely mark an inode dirty.  Inodes which are part of a read-only
+ * file system or snapshot may not be dirtied.
+ */
+void
+zfs_mark_inode_dirty(struct inode *ip)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
+		return;
+
+	mark_inode_dirty(ip);
+}
+
+static uint64_t empty_xattr;
+static uint64_t pad[4];
+static zfs_acl_phys_t acl_phys;
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ *	IN:	dzp	- parent directory for new znode
+ *		vap	- file attributes for new znode
+ *		tx	- dmu transaction id for zap operations
+ *		cr	- credentials of caller
+ *		flag	- flags:
+ *			  IS_ROOT_NODE	- new object will be root
+ *			  IS_TMPFILE	- new object is of O_TMPFILE
+ *			  IS_XATTR	- new object is an attribute
+ *		acl_ids	- ACL related attributes
+ *
+ *	OUT:	zpp	- allocated znode (set to dzp if IS_ROOT_NODE)
+ *
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
+    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
+{
+	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
+	uint64_t	mode, size, links, parent, pflags;
+	uint64_t	projid = ZFS_DEFAULT_PROJID;
+	uint64_t	rdev = 0;
+	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
+	dmu_buf_t	*db;
+	inode_timespec_t now;
+	uint64_t	gen, obj;
+	int		bonuslen;
+	int		dnodesize;
+	sa_handle_t	*sa_hdl;
+	dmu_object_type_t obj_type;
+	sa_bulk_attr_t	*sa_attrs;
+	int		cnt = 0;
+	zfs_acl_locator_cb_t locate = { 0 };
+	znode_hold_t	*zh;
+
+	if (zfsvfs->z_replay) {
+		obj = vap->va_nodeid;
+		now = vap->va_ctime;		/* see zfs_replay_create() */
+		gen = vap->va_nblocks;		/* ditto */
+		dnodesize = vap->va_fsid;	/* ditto */
+	} else {
+		obj = 0;
+		gethrestime(&now);
+		gen = dmu_tx_get_txg(tx);
+		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
+	}
+
+	if (dnodesize == 0)
+		dnodesize = DNODE_MIN_SIZE;
+
+	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+
+	bonuslen = (obj_type == DMU_OT_SA) ?
+	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
+
+	/*
+	 * Create a new DMU object.
+	 */
+	/*
+	 * There's currently no mechanism for pre-reading the blocks that will
+	 * be needed to allocate a new object, so we accept the small chance
+	 * that there will be an i/o error and we will fail one of the
+	 * assertions below.
+	 */
+	if (S_ISDIR(vap->va_mode)) {
+		if (zfsvfs->z_replay) {
+			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
+			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+			    obj_type, bonuslen, dnodesize, tx));
+		} else {
+			obj = zap_create_norm_dnsize(zfsvfs->z_os,
+			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+			    obj_type, bonuslen, dnodesize, tx);
+		}
+	} else {
+		if (zfsvfs->z_replay) {
+			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
+			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
+			    obj_type, bonuslen, dnodesize, tx));
+		} else {
+			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
+			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
+			    obj_type, bonuslen, dnodesize, tx);
+		}
+	}
+
+	zh = zfs_znode_hold_enter(zfsvfs, obj);
+	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
+
+	/*
+	 * If this is the root, fix up the half-initialized parent pointer
+	 * to reference the just-allocated physical data area.
+	 */
+	if (flag & IS_ROOT_NODE) {
+		dzp->z_id = obj;
+	}
+
+	/*
+	 * If parent is an xattr, so am I.
+	 */
+	if (dzp->z_pflags & ZFS_XATTR) {
+		flag |= IS_XATTR;
+	}
+
+	if (zfsvfs->z_use_fuids)
+		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+	else
+		pflags = 0;
+
+	if (S_ISDIR(vap->va_mode)) {
+		size = 2;		/* contents ("." and "..") */
+		links = 2;
+	} else {
+		size = 0;
+		links = (flag & IS_TMPFILE) ? 0 : 1;
+	}
+
+	if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
+		rdev = vap->va_rdev;
+
+	parent = dzp->z_id;
+	mode = acl_ids->z_mode;
+	if (flag & IS_XATTR)
+		pflags |= ZFS_XATTR;
+
+	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
+		/*
+		 * With ZFS_PROJID flag, we can easily know whether there is
+		 * project ID stored on disk or not. See zfs_space_delta_cb().
+		 */
+		if (obj_type != DMU_OT_ZNODE &&
+		    dmu_objset_projectquota_enabled(zfsvfs->z_os))
+			pflags |= ZFS_PROJID;
+
+		/*
+		 * Inherit project ID from parent if required.
+		 */
+		projid = zfs_inherit_projid(dzp);
+		if (dzp->z_pflags & ZFS_PROJINHERIT)
+			pflags |= ZFS_PROJINHERIT;
+	}
+
+	/*
+	 * No execs denied will be determined when zfs_mode_compute() is called.
+	 */
+	pflags |= acl_ids->z_aclp->z_hints &
+	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
+	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
+
+	ZFS_TIME_ENCODE(&now, crtime);
+	ZFS_TIME_ENCODE(&now, ctime);
+
+	if (vap->va_mask & ATTR_ATIME) {
+		ZFS_TIME_ENCODE(&vap->va_atime, atime);
+	} else {
+		ZFS_TIME_ENCODE(&now, atime);
+	}
+
+	if (vap->va_mask & ATTR_MTIME) {
+		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+	} else {
+		ZFS_TIME_ENCODE(&now, mtime);
+	}
+
+	/* Now add in all of the "SA" attributes */
+	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+	    &sa_hdl));
+
+	/*
+	 * Setup the array of attributes to be replaced/set on the new file
+	 *
+	 * order for  DMU_OT_ZNODE is critical since it needs to be constructed
+	 * in the old znode_phys_t format.  Don't change this ordering
+	 */
+	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+
+	if (obj_type == DMU_OT_ZNODE) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+		    NULL, &atime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+		    NULL, &mtime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+		    NULL, &ctime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+		    NULL, &crtime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+		    NULL, &gen, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+		    NULL, &mode, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+		    NULL, &size, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+		    NULL, &parent, 8);
+	} else {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+		    NULL, &mode, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+		    NULL, &size, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+		    NULL, &gen, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
+		    NULL, &acl_ids->z_fuid, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
+		    NULL, &acl_ids->z_fgid, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+		    NULL, &parent, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+		    NULL, &pflags, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+		    NULL, &atime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+		    NULL, &mtime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+		    NULL, &ctime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+		    NULL, &crtime, 16);
+	}
+
+	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+
+	if (obj_type == DMU_OT_ZNODE) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
+		    &empty_xattr, 8);
+	} else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+	    pflags & ZFS_PROJID) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
+		    NULL, &projid, 8);
+	}
+	if (obj_type == DMU_OT_ZNODE ||
+	    (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
+		    NULL, &rdev, 8);
+	}
+	if (obj_type == DMU_OT_ZNODE) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+		    NULL, &pflags, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+		    &acl_ids->z_fuid, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+		    &acl_ids->z_fgid, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
+		    sizeof (uint64_t) * 4);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+		    &acl_phys, sizeof (zfs_acl_phys_t));
+	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+		    &acl_ids->z_aclp->z_acl_count, 8);
+		locate.cb_aclp = acl_ids->z_aclp;
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
+		    zfs_acl_data_locator, &locate,
+		    acl_ids->z_aclp->z_acl_bytes);
+		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
+		    acl_ids->z_fuid, acl_ids->z_fgid);
+	}
+
+	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+
+	if (!(flag & IS_ROOT_NODE)) {
+		/*
+		 * The call to zfs_znode_alloc() may fail if memory is low
+		 * via the call path: alloc_inode() -> inode_init_always() ->
+		 * security_inode_alloc() -> inode_alloc_security().  Since
+		 * the existing code is written such that zfs_mknode() can
+		 * not fail retry until sufficient memory has been reclaimed.
+		 */
+		do {
+			*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+		} while (*zpp == NULL);
+
+		VERIFY(*zpp != NULL);
+		VERIFY(dzp != NULL);
+	} else {
+		/*
+		 * If we are creating the root node, the "parent" we
+		 * passed in is the znode for the root.
+		 */
+		*zpp = dzp;
+
+		(*zpp)->z_sa_hdl = sa_hdl;
+	}
+
+	(*zpp)->z_pflags = pflags;
+	(*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
+	(*zpp)->z_dnodesize = dnodesize;
+	(*zpp)->z_projid = projid;
+
+	if (obj_type == DMU_OT_ZNODE ||
+	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+	}
+	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
+	zfs_znode_hold_exit(zfsvfs, zh);
+}
+
+/*
+ * Update in-core attributes.  It is assumed the caller will be doing an
+ * sa_bulk_update to push the changes out.
+ */
+void
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+	xoptattr_t *xoap;
+	boolean_t update_inode = B_FALSE;
+
+	xoap = xva_getxoptattr(xvap);
+	ASSERT(xoap);
+
+	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
+		uint64_t times[2];
+		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
+		    &times, sizeof (times), tx);
+		XVA_SET_RTN(xvap, XAT_CREATETIME);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_READONLY);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_HIDDEN);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_SYSTEM);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_ARCHIVE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+
+		update_inode = B_TRUE;
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_NOUNLINK);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_APPENDONLY);
+
+		update_inode = B_TRUE;
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_NODUMP);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_OPAQUE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
+		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+		zfs_sa_set_scanstamp(zp, xvap, tx);
+		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_REPARSE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_OFFLINE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_SPARSE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+		ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_PROJINHERIT);
+	}
+
+	if (update_inode)
+		zfs_set_inode_flags(zp, ZTOI(zp));
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+	dmu_object_info_t doi;
+	dmu_buf_t	*db;
+	znode_t		*zp;
+	znode_hold_t	*zh;
+	int err;
+	sa_handle_t	*hdl;
+
+	*zpp = NULL;
+
+again:
+	zh = zfs_znode_hold_enter(zfsvfs, obj_num);
+
+	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+	if (err) {
+		zfs_znode_hold_exit(zfsvfs, zh);
+		return (err);
+	}
+
+	dmu_object_info_from_db(db, &doi);
+	if (doi.doi_bonus_type != DMU_OT_SA &&
+	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
+	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
+	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+		sa_buf_rele(db, NULL);
+		zfs_znode_hold_exit(zfsvfs, zh);
+		return (SET_ERROR(EINVAL));
+	}
+
+	hdl = dmu_buf_get_user(db);
+	if (hdl != NULL) {
+		zp = sa_get_userdata(hdl);
+
+
+		/*
+		 * Since "SA" does immediate eviction we
+		 * should never find a sa handle that doesn't
+		 * know about the znode.
+		 */
+
+		ASSERT3P(zp, !=, NULL);
+
+		mutex_enter(&zp->z_lock);
+		ASSERT3U(zp->z_id, ==, obj_num);
+		/*
+		 * If zp->z_unlinked is set, the znode is already marked
+		 * for deletion and should not be discovered. Check this
+		 * after checking igrab() due to fsetxattr() & O_TMPFILE.
+		 *
+		 * If igrab() returns NULL the VFS has independently
+		 * determined the inode should be evicted and has
+		 * called iput_final() to start the eviction process.
+		 * The SA handle is still valid but because the VFS
+		 * requires that the eviction succeed we must drop
+		 * our locks and references to allow the eviction to
+		 * complete.  The zfs_zget() may then be retried.
+		 *
+		 * This unlikely case could be optimized by registering
+		 * a sops->drop_inode() callback.  The callback would
+		 * need to detect the active SA hold thereby informing
+		 * the VFS that this inode should not be evicted.
+		 */
+		if (igrab(ZTOI(zp)) == NULL) {
+			if (zp->z_unlinked)
+				err = SET_ERROR(ENOENT);
+			else
+				err = SET_ERROR(EAGAIN);
+		} else {
+			*zpp = zp;
+			err = 0;
+		}
+
+		mutex_exit(&zp->z_lock);
+		sa_buf_rele(db, NULL);
+		zfs_znode_hold_exit(zfsvfs, zh);
+
+		if (err == EAGAIN) {
+			/* inode might need this to finish evict */
+			cond_resched();
+			goto again;
+		}
+		return (err);
+	}
+
+	/*
+	 * Not found create new znode/vnode but only if file exists.
+	 *
+	 * There is a small window where zfs_vget() could
+	 * find this object while a file create is still in
+	 * progress.  This is checked for in zfs_znode_alloc()
+	 *
+	 * if zfs_znode_alloc() fails it will drop the hold on the
+	 * bonus buffer.
+	 */
+	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
+	    doi.doi_bonus_type, NULL);
+	if (zp == NULL) {
+		err = SET_ERROR(ENOENT);
+	} else {
+		*zpp = zp;
+	}
+	zfs_znode_hold_exit(zfsvfs, zh);
+	return (err);
+}
+
+int
+zfs_rezget(znode_t *zp)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	dmu_object_info_t doi;
+	dmu_buf_t *db;
+	uint64_t obj_num = zp->z_id;
+	uint64_t mode;
+	uint64_t links;
+	sa_bulk_attr_t bulk[10];
+	int err;
+	int count = 0;
+	uint64_t gen;
+	uint64_t z_uid, z_gid;
+	uint64_t atime[2], mtime[2], ctime[2];
+	uint64_t projid = ZFS_DEFAULT_PROJID;
+	znode_hold_t *zh;
+
+	/*
+	 * skip ctldir, otherwise they will always get invalidated. This will
+	 * cause funny behaviour for the mounted snapdirs. Especially for
+	 * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent
+	 * anyone automount it again as long as someone is still using the
+	 * detached mount.
+	 */
+	if (zp->z_is_ctldir)
+		return (0);
+
+	zh = zfs_znode_hold_enter(zfsvfs, obj_num);
+
+	mutex_enter(&zp->z_acl_lock);
+	if (zp->z_acl_cached) {
+		zfs_acl_free(zp->z_acl_cached);
+		zp->z_acl_cached = NULL;
+	}
+	mutex_exit(&zp->z_acl_lock);
+
+	rw_enter(&zp->z_xattr_lock, RW_WRITER);
+	if (zp->z_xattr_cached) {
+		nvlist_free(zp->z_xattr_cached);
+		zp->z_xattr_cached = NULL;
+	}
+	rw_exit(&zp->z_xattr_lock);
+
+	ASSERT(zp->z_sa_hdl == NULL);
+	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+	if (err) {
+		zfs_znode_hold_exit(zfsvfs, zh);
+		return (err);
+	}
+
+	dmu_object_info_from_db(db, &doi);
+	if (doi.doi_bonus_type != DMU_OT_SA &&
+	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
+	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
+	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+		sa_buf_rele(db, NULL);
+		zfs_znode_hold_exit(zfsvfs, zh);
+		return (SET_ERROR(EINVAL));
+	}
+
+	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
+
+	/* reload cached values */
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+	    &gen, sizeof (gen));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &zp->z_size, sizeof (zp->z_size));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+	    &links, sizeof (links));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+	    &z_uid, sizeof (z_uid));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+	    &z_gid, sizeof (z_gid));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+	    &mode, sizeof (mode));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+	    &atime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+	    &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+	    &ctime, 16);
+
+	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
+		zfs_znode_dmu_fini(zp);
+		zfs_znode_hold_exit(zfsvfs, zh);
+		return (SET_ERROR(EIO));
+	}
+
+	if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
+		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
+		    &projid, 8);
+		if (err != 0 && err != ENOENT) {
+			zfs_znode_dmu_fini(zp);
+			zfs_znode_hold_exit(zfsvfs, zh);
+			return (SET_ERROR(err));
+		}
+	}
+
+	zp->z_projid = projid;
+	zp->z_mode = ZTOI(zp)->i_mode = mode;
+	zfs_uid_write(ZTOI(zp), z_uid);
+	zfs_gid_write(ZTOI(zp), z_gid);
+
+	ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
+	ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
+	ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
+
+	if ((uint32_t)gen != ZTOI(zp)->i_generation) {
+		zfs_znode_dmu_fini(zp);
+		zfs_znode_hold_exit(zfsvfs, zh);
+		return (SET_ERROR(EIO));
+	}
+
+	set_nlink(ZTOI(zp), (uint32_t)links);
+	zfs_set_inode_flags(zp, ZTOI(zp));
+
+	zp->z_blksz = doi.doi_data_block_size;
+	zp->z_atime_dirty = B_FALSE;
+	zfs_inode_update(zp);
+
+	/*
+	 * If the file has zero links, then it has been unlinked on the send
+	 * side and it must be in the received unlinked set.
+	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
+	 * stale data and to prevent automatic removal of the file in
+	 * zfs_zinactive().  The file will be removed either when it is removed
+	 * on the send side and the next incremental stream is received or
+	 * when the unlinked set gets processed.
+	 */
+	zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
+	if (zp->z_unlinked)
+		zfs_znode_dmu_fini(zp);
+
+	zfs_znode_hold_exit(zfsvfs, zh);
+
+	return (0);
+}
+
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	objset_t *os = zfsvfs->z_os;
+	uint64_t obj = zp->z_id;
+	uint64_t acl_obj = zfs_external_acl(zp);
+	znode_hold_t *zh;
+
+	zh = zfs_znode_hold_enter(zfsvfs, obj);
+	if (acl_obj) {
+		VERIFY(!zp->z_is_sa);
+		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+	}
+	VERIFY(0 == dmu_object_free(os, obj, tx));
+	zfs_znode_dmu_fini(zp);
+	zfs_znode_hold_exit(zfsvfs, zh);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	uint64_t z_id = zp->z_id;
+	znode_hold_t *zh;
+
+	ASSERT(zp->z_sa_hdl);
+
+	/*
+	 * Don't allow a zfs_zget() while were trying to release this znode.
+	 */
+	zh = zfs_znode_hold_enter(zfsvfs, z_id);
+
+	mutex_enter(&zp->z_lock);
+
+	/*
+	 * If this was the last reference to a file with no links, remove
+	 * the file from the file system unless the file system is mounted
+	 * read-only.  That can happen, for example, if the file system was
+	 * originally read-write, the file was opened, then unlinked and
+	 * the file system was made read-only before the file was finally
+	 * closed.  The file will remain in the unlinked set.
+	 */
+	if (zp->z_unlinked) {
+		ASSERT(!zfsvfs->z_issnap);
+		if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
+			mutex_exit(&zp->z_lock);
+			zfs_znode_hold_exit(zfsvfs, zh);
+			zfs_rmnode(zp);
+			return;
+		}
+	}
+
+	mutex_exit(&zp->z_lock);
+	zfs_znode_dmu_fini(zp);
+
+	zfs_znode_hold_exit(zfsvfs, zh);
+}
+
+#if defined(HAVE_INODE_TIMESPEC64_TIMES)
+#define	zfs_compare_timespec timespec64_compare
+#else
+#define	zfs_compare_timespec timespec_compare
+#endif
+
+/*
+ * Determine whether the znode's atime must be updated.  The logic mostly
+ * duplicates the Linux kernel's relatime_need_update() functionality.
+ * This function is only called if the underlying filesystem actually has
+ * atime updates enabled.
+ */
+boolean_t
+zfs_relatime_need_update(const struct inode *ip)
+{
+	inode_timespec_t now;
+
+	gethrestime(&now);
+	/*
+	 * In relatime mode, only update the atime if the previous atime
+	 * is earlier than either the ctime or mtime or if at least a day
+	 * has passed since the last update of atime.
+	 */
+	if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0)
+		return (B_TRUE);
+
+	if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0)
+		return (B_TRUE);
+
+	if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60)
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+/*
+ * Prepare to update znode time stamps.
+ *
+ *	IN:	zp	- znode requiring timestamp update
+ *		flag	- ATTR_MTIME, ATTR_CTIME flags
+ *
+ *	OUT:	zp	- z_seq
+ *		mtime	- new mtime
+ *		ctime	- new ctime
+ *
+ *	Note: We don't update atime here, because we rely on Linux VFS to do
+ *	atime updating.
+ */
+void
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+    uint64_t ctime[2])
+{
+	inode_timespec_t now;
+
+	gethrestime(&now);
+
+	zp->z_seq++;
+
+	if (flag & ATTR_MTIME) {
+		ZFS_TIME_ENCODE(&now, mtime);
+		ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
+		if (ZTOZSB(zp)->z_use_fuids) {
+			zp->z_pflags |= (ZFS_ARCHIVE |
+			    ZFS_AV_MODIFIED);
+		}
+	}
+
+	if (flag & ATTR_CTIME) {
+		ZFS_TIME_ENCODE(&now, ctime);
+		ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime);
+		if (ZTOZSB(zp)->z_use_fuids)
+			zp->z_pflags |= ZFS_ARCHIVE;
+	}
+}
+
+/*
+ * Grow the block size for a file.
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		size	- requested block size
+ *		tx	- open transaction.
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+void
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+	int		error;
+	u_longlong_t	dummy;
+
+	if (size <= zp->z_blksz)
+		return;
+	/*
+	 * If the file size is already greater than the current blocksize,
+	 * we will not grow.  If there is more than one block in a file,
+	 * the blocksize cannot change.
+	 */
+	if (zp->z_blksz && zp->z_size > zp->z_blksz)
+		return;
+
+	error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
+	    size, 0, tx);
+
+	if (error == ENOTSUP)
+		return;
+	ASSERT0(error);
+
+	/* What blocksize did we actually get? */
+	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
+}
+
+/*
+ * Increase the file length
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		end	- new end-of-file
+ *
+ *	RETURN:	0 on success, error code on failure
+ */
+static int
+zfs_extend(znode_t *zp, uint64_t end)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	dmu_tx_t *tx;
+	zfs_locked_range_t *lr;
+	uint64_t newblksz;
+	int error;
+
+	/*
+	 * We will change zp_size, lock the whole file.
+	 */
+	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+	/*
+	 * Nothing to do if file already at desired length.
+	 */
+	if (end <= zp->z_size) {
+		zfs_rangelock_exit(lr);
+		return (0);
+	}
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	if (end > zp->z_blksz &&
+	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
+		/*
+		 * We are growing the file past the current block size.
+		 */
+		if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
+			/*
+			 * File's blocksize is already larger than the
+			 * "recordsize" property.  Only let it grow to
+			 * the next power of 2.
+			 */
+			ASSERT(!ISP2(zp->z_blksz));
+			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
+		} else {
+			newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
+		}
+		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
+	} else {
+		newblksz = 0;
+	}
+
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		zfs_rangelock_exit(lr);
+		return (error);
+	}
+
+	if (newblksz)
+		zfs_grow_blocksize(zp, newblksz, tx);
+
+	zp->z_size = end;
+
+	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
+	    &zp->z_size, sizeof (zp->z_size), tx));
+
+	zfs_rangelock_exit(lr);
+
+	dmu_tx_commit(tx);
+
+	return (0);
+}
+
+/*
+ * zfs_zero_partial_page - Modeled after update_pages() but
+ * with different arguments and semantics for use by zfs_freesp().
+ *
+ * Zeroes a piece of a single page cache entry for zp at offset
+ * start and length len.
+ *
+ * Caller must acquire a range lock on the file for the region
+ * being zeroed in order that the ARC and page cache stay in sync.
+ */
+static void
+zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
+{
+	struct address_space *mp = ZTOI(zp)->i_mapping;
+	struct page *pp;
+	int64_t	off;
+	void *pb;
+
+	ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));
+
+	off = start & (PAGE_SIZE - 1);
+	start &= PAGE_MASK;
+
+	pp = find_lock_page(mp, start >> PAGE_SHIFT);
+	if (pp) {
+		if (mapping_writably_mapped(mp))
+			flush_dcache_page(pp);
+
+		pb = kmap(pp);
+		bzero(pb + off, len);
+		kunmap(pp);
+
+		if (mapping_writably_mapped(mp))
+			flush_dcache_page(pp);
+
+		mark_page_accessed(pp);
+		SetPageUptodate(pp);
+		ClearPageError(pp);
+		unlock_page(pp);
+		put_page(pp);
+	}
+}
+
+/*
+ * Free space in a file.
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		off	- start of section to free.
+ *		len	- length of section to free.
+ *
+ *	RETURN:	0 on success, error code on failure
+ */
+static int
+zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	zfs_locked_range_t *lr;
+	int error;
+
+	/*
+	 * Lock the range being freed.
+	 */
+	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
+
+	/*
+	 * Nothing to do if file already at desired length.
+	 */
+	if (off >= zp->z_size) {
+		zfs_rangelock_exit(lr);
+		return (0);
+	}
+
+	if (off + len > zp->z_size)
+		len = zp->z_size - off;
+
+	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
+
+	/*
+	 * Zero partial page cache entries.  This must be done under a
+	 * range lock in order to keep the ARC and page cache in sync.
+	 */
+	if (zp->z_is_mapped) {
+		loff_t first_page, last_page, page_len;
+		loff_t first_page_offset, last_page_offset;
+
+		/* first possible full page in hole */
+		first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		/* last page of hole */
+		last_page = (off + len) >> PAGE_SHIFT;
+
+		/* offset of first_page */
+		first_page_offset = first_page << PAGE_SHIFT;
+		/* offset of last_page */
+		last_page_offset = last_page << PAGE_SHIFT;
+
+		/* truncate whole pages */
+		if (last_page_offset > first_page_offset) {
+			truncate_inode_pages_range(ZTOI(zp)->i_mapping,
+			    first_page_offset, last_page_offset - 1);
+		}
+
+		/* truncate sub-page ranges */
+		if (first_page > last_page) {
+			/* entire punched area within a single page */
+			zfs_zero_partial_page(zp, off, len);
+		} else {
+			/* beginning of punched area at the end of a page */
+			page_len  = first_page_offset - off;
+			if (page_len > 0)
+				zfs_zero_partial_page(zp, off, page_len);
+
+			/* end of punched area at the beginning of a page */
+			page_len = off + len - last_page_offset;
+			if (page_len > 0)
+				zfs_zero_partial_page(zp, last_page_offset,
+				    page_len);
+		}
+	}
+	zfs_rangelock_exit(lr);
+
+	return (error);
+}
+
+/*
+ * Truncate a file
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		end	- new end-of-file.
+ *
+ *	RETURN:	0 on success, error code on failure
+ */
+static int
+zfs_trunc(znode_t *zp, uint64_t end)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	dmu_tx_t *tx;
+	zfs_locked_range_t *lr;
+	int error;
+	sa_bulk_attr_t bulk[2];
+	int count = 0;
+
+	/*
+	 * We will change zp_size, lock the whole file.
+	 */
+	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+	/*
+	 * Nothing to do if file already at desired length.
+	 */
+	if (end >= zp->z_size) {
+		zfs_rangelock_exit(lr);
+		return (0);
+	}
+
+	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
+	    DMU_OBJECT_END);
+	if (error) {
+		zfs_rangelock_exit(lr);
+		return (error);
+	}
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	dmu_tx_mark_netfree(tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		zfs_rangelock_exit(lr);
+		return (error);
+	}
+
+	zp->z_size = end;
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+	    NULL, &zp->z_size, sizeof (zp->z_size));
+
+	if (end == 0) {
+		zp->z_pflags &= ~ZFS_SPARSE;
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+		    NULL, &zp->z_pflags, 8);
+	}
+	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
+
+	dmu_tx_commit(tx);
+	zfs_rangelock_exit(lr);
+
+	return (0);
+}
+
+/*
+ * Free space in a file
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		off	- start of range
+ *		len	- end of range (0 => EOF)
+ *		flag	- current file open mode flags.
+ *		log	- TRUE if this action should be logged
+ *
+ *	RETURN:	0 on success, error code on failure
+ */
+int
+zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+{
+	dmu_tx_t *tx;
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	zilog_t *zilog = zfsvfs->z_log;
+	uint64_t mode;
+	uint64_t mtime[2], ctime[2];
+	sa_bulk_attr_t bulk[3];
+	int count = 0;
+	int error;
+
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
+	    sizeof (mode))) != 0)
+		return (error);
+
+	if (off > zp->z_size) {
+		error =  zfs_extend(zp, off+len);
+		if (error == 0 && log)
+			goto log;
+		goto out;
+	}
+
+	if (len == 0) {
+		error = zfs_trunc(zp, off);
+	} else {
+		if ((error = zfs_free_range(zp, off, len)) == 0 &&
+		    off + len > zp->z_size)
+			error = zfs_extend(zp, off+len);
+	}
+	if (error || !log)
+		goto out;
+log:
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		goto out;
+	}
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+	    NULL, &zp->z_pflags, 8);
+	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+	ASSERT(error == 0);
+
+	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+
+	dmu_tx_commit(tx);
+
+	zfs_inode_update(zp);
+	error = 0;
+
+out:
+	/*
+	 * Truncate the page cache - for file truncate operations, use
+	 * the purpose-built API for truncations.  For punching operations,
+	 * the truncation is handled under a range lock in zfs_free_range.
+	 */
+	if (len == 0)
+		truncate_setsize(ZTOI(zp), off);
+	return (error);
+}
+
+void
+zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
+{
+	struct super_block *sb;
+	zfsvfs_t	*zfsvfs;
+	uint64_t	moid, obj, sa_obj, version;
+	uint64_t	sense = ZFS_CASE_SENSITIVE;
+	uint64_t	norm = 0;
+	nvpair_t	*elem;
+	int		size;
+	int		error;
+	int		i;
+	znode_t		*rootzp = NULL;
+	vattr_t		vattr;
+	znode_t		*zp;
+	zfs_acl_ids_t	acl_ids;
+
+	/*
+	 * First attempt to create master node.
+	 */
+	/*
+	 * In an empty objset, there are no blocks to read and thus
+	 * there can be no i/o errors (which we assert below).
+	 */
+	moid = MASTER_NODE_OBJ;
+	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+	    DMU_OT_NONE, 0, tx);
+	ASSERT(error == 0);
+
+	/*
+	 * Set starting attributes.
+	 */
+	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
+	elem = NULL;
+	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
+		/* For the moment we expect all zpl props to be uint64_ts */
+		uint64_t val;
+		char *name;
+
+		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
+		VERIFY(nvpair_value_uint64(elem, &val) == 0);
+		name = nvpair_name(elem);
+		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
+			if (val < version)
+				version = val;
+		} else {
+			error = zap_update(os, moid, name, 8, 1, &val, tx);
+		}
+		ASSERT(error == 0);
+		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
+			norm = val;
+		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
+			sense = val;
+	}
+	ASSERT(version != 0);
+	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
+
+	/*
+	 * Create zap object used for SA attribute registration
+	 */
+
+	if (version >= ZPL_VERSION_SA) {
+		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+		    DMU_OT_NONE, 0, tx);
+		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+		ASSERT(error == 0);
+	} else {
+		sa_obj = 0;
+	}
+	/*
+	 * Create a delete queue.
+	 */
+	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+
+	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
+	ASSERT(error == 0);
+
+	/*
+	 * Create root znode.  Create minimal znode/inode/zfsvfs/sb
+	 * to allow zfs_mknode to work.
+	 */
+	vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
+	vattr.va_mode = S_IFDIR|0755;
+	vattr.va_uid = crgetuid(cr);
+	vattr.va_gid = crgetgid(cr);
+
+	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+	rootzp->z_unlinked = B_FALSE;
+	rootzp->z_atime_dirty = B_FALSE;
+	rootzp->z_moved = B_FALSE;
+	rootzp->z_is_sa = USE_SA(version, os);
+	rootzp->z_pflags = 0;
+
+	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+	zfsvfs->z_os = os;
+	zfsvfs->z_parent = zfsvfs;
+	zfsvfs->z_version = version;
+	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
+	zfsvfs->z_use_sa = USE_SA(version, os);
+	zfsvfs->z_norm = norm;
+
+	sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
+	sb->s_fs_info = zfsvfs;
+
+	ZTOI(rootzp)->i_sb = sb;
+
+	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+	    &zfsvfs->z_attr_table);
+
+	ASSERT(error == 0);
+
+	/*
+	 * Fold case on file systems that are always or sometimes case
+	 * insensitive.
+	 */
+	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
+		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+	    offsetof(znode_t, z_link_node));
+
+	size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
+	zfsvfs->z_hold_size = size;
+	zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
+	    KM_SLEEP);
+	zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
+	for (i = 0; i != size; i++) {
+		avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
+		    sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
+		mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
+	}
+
+	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+	    cr, NULL, &acl_ids));
+	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
+	ASSERT3P(zp, ==, rootzp);
+	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
+	ASSERT(error == 0);
+	zfs_acl_ids_free(&acl_ids);
+
+	atomic_set(&ZTOI(rootzp)->i_count, 0);
+	sa_handle_destroy(rootzp->z_sa_hdl);
+	kmem_cache_free(znode_cache, rootzp);
+
+	for (i = 0; i != size; i++) {
+		avl_destroy(&zfsvfs->z_hold_trees[i]);
+		mutex_destroy(&zfsvfs->z_hold_locks[i]);
+	}
+
+	mutex_destroy(&zfsvfs->z_znodes_lock);
+
+	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
+	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
+	kmem_free(sb, sizeof (struct super_block));
+	kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+#endif /* _KERNEL */
+
+static int
+zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
+{
+	uint64_t sa_obj = 0;
+	int error;
+
+	error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+	if (error != 0 && error != ENOENT)
+		return (error);
+
+	error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
+	return (error);
+}
+
+static int
+zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
+    dmu_buf_t **db, void *tag)
+{
+	dmu_object_info_t doi;
+	int error;
+
+	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
+		return (error);
+
+	dmu_object_info_from_db(*db, &doi);
+	if ((doi.doi_bonus_type != DMU_OT_SA &&
+	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
+	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
+	    doi.doi_bonus_size < sizeof (znode_phys_t))) {
+		sa_buf_rele(*db, tag);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
+	if (error != 0) {
+		sa_buf_rele(*db, tag);
+		return (error);
+	}
+
+	return (0);
+}
+
+static void
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
+{
+	sa_handle_destroy(hdl);
+	sa_buf_rele(db, tag);
+}
+
+/*
+ * Given an object number, return its parent object number and whether
+ * or not the object is an extended attribute directory.
+ */
+static int
+zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
+    uint64_t *pobjp, int *is_xattrdir)
+{
+	uint64_t parent;
+	uint64_t pflags;
+	uint64_t mode;
+	uint64_t parent_mode;
+	sa_bulk_attr_t bulk[3];
+	sa_handle_t *sa_hdl;
+	dmu_buf_t *sa_db;
+	int count = 0;
+	int error;
+
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+	    &parent, sizeof (parent));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+	    &pflags, sizeof (pflags));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+	    &mode, sizeof (mode));
+
+	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
+		return (error);
+
+	/*
+	 * When a link is removed its parent pointer is not changed and will
+	 * be invalid.  There are two cases where a link is removed but the
+	 * file stays around, when it goes to the delete queue and when there
+	 * are additional links.
+	 */
+	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
+	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+	if (error != 0)
+		return (error);
+
+	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+
+	/*
+	 * Extended attributes can be applied to files, directories, etc.
+	 * Otherwise the parent must be a directory.
+	 */
+	if (!*is_xattrdir && !S_ISDIR(parent_mode))
+		return (SET_ERROR(EINVAL));
+
+	*pobjp = parent;
+
+	return (0);
+}
+
+/*
+ * Given an object number, return some zpl level statistics
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+    zfs_stat_t *sb)
+{
+	sa_bulk_attr_t bulk[4];
+	int count = 0;
+
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+	    &sb->zs_mode, sizeof (sb->zs_mode));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+	    &sb->zs_gen, sizeof (sb->zs_gen));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+	    &sb->zs_links, sizeof (sb->zs_links));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+	    &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+	return (sa_bulk_lookup(hdl, bulk, count));
+}
+
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+    sa_attr_type_t *sa_table, char *buf, int len)
+{
+	sa_handle_t *sa_hdl;
+	sa_handle_t *prevhdl = NULL;
+	dmu_buf_t *prevdb = NULL;
+	dmu_buf_t *sa_db = NULL;
+	char *path = buf + len - 1;
+	int error;
+
+	*path = '\0';
+	sa_hdl = hdl;
+
+	uint64_t deleteq_obj;
+	VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
+	    ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+	error = zap_lookup_int(osp, deleteq_obj, obj);
+	if (error == 0) {
+		return (ESTALE);
+	} else if (error != ENOENT) {
+		return (error);
+	}
+	error = 0;
+
+	for (;;) {
+		uint64_t pobj = 0;
+		char component[MAXNAMELEN + 2];
+		size_t complen;
+		int is_xattrdir = 0;
+
+		if (prevdb)
+			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
+
+		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
+		    &is_xattrdir)) != 0)
+			break;
+
+		if (pobj == obj) {
+			if (path[0] != '/')
+				*--path = '/';
+			break;
+		}
+
+		component[0] = '/';
+		if (is_xattrdir) {
+			(void) sprintf(component + 1, "<xattrdir>");
+		} else {
+			error = zap_value_search(osp, pobj, obj,
+			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
+			if (error != 0)
+				break;
+		}
+
+		complen = strlen(component);
+		path -= complen;
+		ASSERT(path >= buf);
+		bcopy(component, path, complen);
+		obj = pobj;
+
+		if (sa_hdl != hdl) {
+			prevhdl = sa_hdl;
+			prevdb = sa_db;
+		}
+		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
+		if (error != 0) {
+			sa_hdl = prevhdl;
+			sa_db = prevdb;
+			break;
+		}
+	}
+
+	if (sa_hdl != NULL && sa_hdl != hdl) {
+		ASSERT(sa_db != NULL);
+		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+	}
+
+	if (error == 0)
+		(void) memmove(buf, path, buf + len - path);
+
+	return (error);
+}
+
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+	sa_attr_type_t *sa_table;
+	sa_handle_t *hdl;
+	dmu_buf_t *db;
+	int error;
+
+	error = zfs_sa_setup(osp, &sa_table);
+	if (error != 0)
+		return (error);
+
+	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+	zfs_release_sa_handle(hdl, db, FTAG);
+	return (error);
+}
+
+int
+zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+    char *buf, int len)
+{
+	char *path = buf + len - 1;
+	sa_attr_type_t *sa_table;
+	sa_handle_t *hdl;
+	dmu_buf_t *db;
+	int error;
+
+	*path = '\0';
+
+	error = zfs_sa_setup(osp, &sa_table);
+	if (error != 0)
+		return (error);
+
+	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
+	if (error != 0) {
+		zfs_release_sa_handle(hdl, db, FTAG);
+		return (error);
+	}
+
+	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+	zfs_release_sa_handle(hdl, db, FTAG);
+	return (error);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_create_fs);
+EXPORT_SYMBOL(zfs_obj_to_path);
+
+/* CSTYLED */
+module_param(zfs_object_mutex_size, uint, 0644);
+MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
+module_param(zfs_unlink_suspend_progress, int, 0644);
+MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
+"(debug - leaks space into the unlinked set)");
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c
new file mode 100644
index 000000000000..96dabe55a138
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c
@@ -0,0 +1,2036 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/zio_crypt.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+#include <sys/sha2.h>
+#include <sys/hkdf.h>
+#include <sys/qat.h>
+
+/*
+ * This file is responsible for handling all of the details of generating
+ * encryption parameters and performing encryption and authentication.
+ *
+ * BLOCK ENCRYPTION PARAMETERS:
+ * Encryption /Authentication Algorithm Suite (crypt):
+ * The encryption algorithm, mode, and key length we are going to use. We
+ * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit
+ * keys. All authentication is currently done with SHA512-HMAC.
+ *
+ * Plaintext:
+ * The unencrypted data that we want to encrypt.
+ *
+ * Initialization Vector (IV):
+ * An initialization vector for the encryption algorithms. This is used to
+ * "tweak" the encryption algorithms so that two blocks of the same data are
+ * encrypted into different ciphertext outputs, thus obfuscating block patterns.
+ * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
+ * never reused with the same encryption key. This value is stored unencrypted
+ * and must simply be provided to the decryption function. We use a 96 bit IV
+ * (as recommended by NIST) for all block encryption. For non-dedup blocks we
+ * derive the IV randomly. The first 64 bits of the IV are stored in the second
+ * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
+ * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
+ * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
+ * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
+ * level 0 blocks is the number of allocated dnodes in that block. The on-disk
+ * format supports at most 2^15 slots per L0 dnode block, because the maximum
+ * block size is 16MB (2^24). In either case, for level 0 blocks this number
+ * will still be smaller than UINT32_MAX so it is safe to store the IV in the
+ * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
+ * for the dnode code.
+ *
+ * Master key:
+ * This is the most important secret data of an encrypted dataset. It is used
+ * along with the salt to generate that actual encryption keys via HKDF. We
+ * do not use the master key to directly encrypt any data because there are
+ * theoretical limits on how much data can actually be safely encrypted with
+ * any encryption mode. The master key is stored encrypted on disk with the
+ * user's wrapping key. Its length is determined by the encryption algorithm.
+ * For details on how this is stored see the block comment in dsl_crypt.c
+ *
+ * Salt:
+ * Used as an input to the HKDF function, along with the master key. We use a
+ * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
+ * can be used for encrypting many blocks, so we cache the current salt and the
+ * associated derived key in zio_crypt_t so we do not need to derive it again
+ * needlessly.
+ *
+ * Encryption Key:
+ * A secret binary key, generated from an HKDF function used to encrypt and
+ * decrypt data.
+ *
+ * Message Authentication Code (MAC)
+ * The MAC is an output of authenticated encryption modes such as AES-GCM and
+ * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
+ * data on disk and return garbage to the application. Effectively, it is a
+ * checksum that can not be reproduced by an attacker. We store the MAC in the
+ * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
+ * regular checksum of the ciphertext which can be used for scrubbing.
+ *
+ * OBJECT AUTHENTICATION:
+ * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because
+ * they contain some info that always needs to be readable. To prevent this
+ * data from being altered, we authenticate this data using SHA512-HMAC. This
+ * will produce a MAC (similar to the one produced via encryption) which can
+ * be used to verify the object was not modified. HMACs do not require key
+ * rotation or IVs, so we can keep up to the full 3 copies of authenticated
+ * data.
+ *
+ * ZIL ENCRYPTION:
+ * ZIL blocks have their bp written to disk ahead of the associated data, so we
+ * cannot store the MAC there as we normally do. For these blocks the MAC is
+ * stored in the embedded checksum within the zil_chain_t header. The salt and
+ * IV are generated for the block on bp allocation instead of at encryption
+ * time. In addition, ZIL blocks have some pieces that must be left in plaintext
+ * for claiming even though all of the sensitive user data still needs to be
+ * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
+ * pieces of the block need to be encrypted. All data that is not encrypted is
+ * authenticated using the AAD mechanisms that the supported encryption modes
+ * provide for. In order to preserve the semantics of the ZIL for encrypted
+ * datasets, the ZIL is not protected at the objset level as described below.
+ *
+ * DNODE ENCRYPTION:
+ * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
+ * in plaintext for scrubbing and claiming, but the bonus buffers might contain
+ * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
+ * which which pieces of the block need to be encrypted. For more details about
+ * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
+ *
+ * OBJECT SET AUTHENTICATION:
+ * Up to this point, everything we have encrypted and authenticated has been
+ * at level 0 (or -2 for the ZIL). If we did not do any further work the
+ * on-disk format would be susceptible to attacks that deleted or rearranged
+ * the order of level 0 blocks. Ideally, the cleanest solution would be to
+ * maintain a tree of authentication MACs going up the bp tree. However, this
+ * presents a problem for raw sends. Send files do not send information about
+ * indirect blocks so there would be no convenient way to transfer the MACs and
+ * they cannot be recalculated on the receive side without the master key which
+ * would defeat one of the purposes of raw sends in the first place. Instead,
+ * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
+ * from the level below. We also include some portable fields from blk_prop such
+ * as the lsize and compression algorithm to prevent the data from being
+ * misinterpreted.
+ *
+ * At the objset level, we maintain 2 separate 256 bit MACs in the
+ * objset_phys_t. The first one is "portable" and is the logical root of the
+ * MAC tree maintained in the metadnode's bps. The second, is "local" and is
+ * used as the root MAC for the user accounting objects, which are also not
+ * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
+ * of the send file. The useraccounting code ensures that the useraccounting
+ * info is not present upon a receive, so the local MAC can simply be cleared
+ * out at that time. For more info about objset_phys_t authentication, see
+ * zio_crypt_do_objset_hmacs().
+ *
+ * CONSIDERATIONS FOR DEDUP:
+ * In order for dedup to work, blocks that we want to dedup with one another
+ * need to use the same IV and encryption key, so that they will have the same
+ * ciphertext. Normally, one should never reuse an IV with the same encryption
+ * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
+ * blocks. In this case, however, since we are using the same plaintext as
+ * well all that we end up with is a duplicate of the original ciphertext we
+ * already had. As a result, an attacker with read access to the raw disk will
+ * be able to tell which blocks are the same but this information is given away
+ * by dedup anyway. In order to get the same IVs and encryption keys for
+ * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC
+ * here so that a reproducible checksum of the plaintext is never available to
+ * the attacker. The HMAC key is kept alongside the master key, encrypted on
+ * disk. The first 64 bits of the HMAC are used in place of the random salt, and
+ * the next 96 bits are used as the IV. As a result of this mechanism, dedup
+ * will only work within a clone family since encrypted dedup requires use of
+ * the same master and HMAC keys.
+ */
+
+/*
+ * After encrypting many blocks with the same key we may start to run up
+ * against the theoretical limits of how much data can securely be encrypted
+ * with a single key using the supported encryption modes. The most obvious
+ * limitation is that our risk of generating 2 equivalent 96 bit IVs increases
+ * the more IVs we generate (which both GCM and CCM modes strictly forbid).
+ * This risk actually grows surprisingly quickly over time according to the
+ * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have
+ * generated n IVs with a cryptographically secure RNG, the approximate
+ * probability p(n) of a collision is given as:
+ *
+ * p(n) ~= e^(-n*(n-1)/(2*(2^96)))
+ *
+ * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html]
+ *
+ * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion
+ * we must not write more than 398,065,730 blocks with the same encryption key.
+ * Therefore, we rotate our keys after 400,000,000 blocks have been written by
+ * generating a new random 64 bit salt for our HKDF encryption key generation
+ * function.
+ */
+#define	ZFS_KEY_MAX_SALT_USES_DEFAULT	400000000
+#define	ZFS_CURRENT_MAX_SALT_USES	\
+	(MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT))
+unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT;
+
+typedef struct blkptr_auth_buf {
+	uint64_t bab_prop;			/* blk_prop - portable mask */
+	uint8_t bab_mac[ZIO_DATA_MAC_LEN];	/* MAC from blk_cksum */
+	uint64_t bab_pad;			/* reserved for future use */
+} blkptr_auth_buf_t;
+
+zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = {
+	{"",			ZC_TYPE_NONE,	0,	"inherit"},
+	{"",			ZC_TYPE_NONE,	0,	"on"},
+	{"",			ZC_TYPE_NONE,	0,	"off"},
+	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	16,	"aes-128-ccm"},
+	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	24,	"aes-192-ccm"},
+	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	32,	"aes-256-ccm"},
+	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	16,	"aes-128-gcm"},
+	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	24,	"aes-192-gcm"},
+	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	32,	"aes-256-gcm"}
+};
+
+void
+zio_crypt_key_destroy(zio_crypt_key_t *key)
+{
+	rw_destroy(&key->zk_salt_lock);
+
+	/* free crypto templates */
+	crypto_destroy_ctx_template(key->zk_current_tmpl);
+	crypto_destroy_ctx_template(key->zk_hmac_tmpl);
+
+	/* zero out sensitive data */
+	bzero(key, sizeof (zio_crypt_key_t));
+}
+
+int
+zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
+{
+	int ret;
+	crypto_mechanism_t mech;
+	uint_t keydata_len;
+
+	ASSERT(key != NULL);
+	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+
+	keydata_len = zio_crypt_table[crypt].ci_keylen;
+	bzero(key, sizeof (zio_crypt_key_t));
+
+	/* fill keydata buffers and salt with random data */
+	ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
+	if (ret != 0)
+		goto error;
+
+	ret = random_get_bytes(key->zk_master_keydata, keydata_len);
+	if (ret != 0)
+		goto error;
+
+	ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
+	if (ret != 0)
+		goto error;
+
+	ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+	if (ret != 0)
+		goto error;
+
+	/* derive the current key from the master key */
+	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+	    key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+	    keydata_len);
+	if (ret != 0)
+		goto error;
+
+	/* initialize keys for the ICP */
+	key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+	key->zk_current_key.ck_data = key->zk_current_keydata;
+	key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+	key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+	key->zk_hmac_key.ck_data = &key->zk_hmac_key;
+	key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+	/*
+	 * Initialize the crypto templates. It's ok if this fails because
+	 * this is just an optimization.
+	 */
+	mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
+	ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+	    &key->zk_current_tmpl, KM_SLEEP);
+	if (ret != CRYPTO_SUCCESS)
+		key->zk_current_tmpl = NULL;
+
+	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+	ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
+	    &key->zk_hmac_tmpl, KM_SLEEP);
+	if (ret != CRYPTO_SUCCESS)
+		key->zk_hmac_tmpl = NULL;
+
+	key->zk_crypt = crypt;
+	key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
+	key->zk_salt_count = 0;
+	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+	return (0);
+
+error:
+	zio_crypt_key_destroy(key);
+	return (ret);
+}
+
+static int
+zio_crypt_key_change_salt(zio_crypt_key_t *key)
+{
+	int ret = 0;
+	uint8_t salt[ZIO_DATA_SALT_LEN];
+	crypto_mechanism_t mech;
+	uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen;
+
+	/* generate a new salt */
+	ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN);
+	if (ret != 0)
+		goto error;
+
+	rw_enter(&key->zk_salt_lock, RW_WRITER);
+
+	/* someone beat us to the salt rotation, just unlock and return */
+	if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES)
+		goto out_unlock;
+
+	/* derive the current key from the master key and the new salt */
+	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+	    salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len);
+	if (ret != 0)
+		goto out_unlock;
+
+	/* assign the salt and reset the usage count */
+	bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
+	key->zk_salt_count = 0;
+
+	/* destroy the old context template and create the new one */
+	crypto_destroy_ctx_template(key->zk_current_tmpl);
+	ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+	    &key->zk_current_tmpl, KM_SLEEP);
+	if (ret != CRYPTO_SUCCESS)
+		key->zk_current_tmpl = NULL;
+
+	rw_exit(&key->zk_salt_lock);
+
+	return (0);
+
+out_unlock:
+	rw_exit(&key->zk_salt_lock);
+error:
+	return (ret);
+}
+
+/* See comment above zfs_key_max_salt_uses definition for details */
+int
+zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
+{
+	int ret;
+	boolean_t salt_change;
+
+	rw_enter(&key->zk_salt_lock, RW_READER);
+
+	bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN);
+	salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >=
+	    ZFS_CURRENT_MAX_SALT_USES);
+
+	rw_exit(&key->zk_salt_lock);
+
+	if (salt_change) {
+		ret = zio_crypt_key_change_salt(key);
+		if (ret != 0)
+			goto error;
+	}
+
+	return (0);
+
+error:
+	return (ret);
+}
+
+/*
+ * This function handles all encryption and decryption in zfs. When
+ * encrypting it expects puio to reference the plaintext and cuio to
+ * reference the ciphertext. cuio must have enough space for the
+ * ciphertext + room for a MAC. datalen should be the length of the
+ * plaintext / ciphertext alone.
+ */
+static int
+zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key,
+    crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen,
+    uio_t *puio, uio_t *cuio, uint8_t *authbuf, uint_t auth_len)
+{
+	int ret;
+	crypto_data_t plaindata, cipherdata;
+	CK_AES_CCM_PARAMS ccmp;
+	CK_AES_GCM_PARAMS gcmp;
+	crypto_mechanism_t mech;
+	zio_crypt_info_t crypt_info;
+	uint_t plain_full_len, maclen;
+
+	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+	ASSERT3U(key->ck_format, ==, CRYPTO_KEY_RAW);
+
+	/* lookup the encryption info */
+	crypt_info = zio_crypt_table[crypt];
+
+	/* the mac will always be the last iovec_t in the cipher uio */
+	maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len;
+
+	ASSERT(maclen <= ZIO_DATA_MAC_LEN);
+
+	/* setup encryption mechanism (same as crypt) */
+	mech.cm_type = crypto_mech2id(crypt_info.ci_mechname);
+
+	/*
+	 * Strangely, the ICP requires that plain_full_len must include
+	 * the MAC length when decrypting, even though the UIO does not
+	 * need to have the extra space allocated.
+	 */
+	if (encrypt) {
+		plain_full_len = datalen;
+	} else {
+		plain_full_len = datalen + maclen;
+	}
+
+	/*
+	 * setup encryption params (currently only AES CCM and AES GCM
+	 * are supported)
+	 */
+	if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) {
+		ccmp.ulNonceSize = ZIO_DATA_IV_LEN;
+		ccmp.ulAuthDataSize = auth_len;
+		ccmp.authData = authbuf;
+		ccmp.ulMACSize = maclen;
+		ccmp.nonce = ivbuf;
+		ccmp.ulDataSize = plain_full_len;
+
+		mech.cm_param = (char *)(&ccmp);
+		mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS);
+	} else {
+		gcmp.ulIvLen = ZIO_DATA_IV_LEN;
+		gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN);
+		gcmp.ulAADLen = auth_len;
+		gcmp.pAAD = authbuf;
+		gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen);
+		gcmp.pIv = ivbuf;
+
+		mech.cm_param = (char *)(&gcmp);
+		mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS);
+	}
+
+	/* populate the cipher and plain data structs. */
+	plaindata.cd_format = CRYPTO_DATA_UIO;
+	plaindata.cd_offset = 0;
+	plaindata.cd_uio = puio;
+	plaindata.cd_miscdata = NULL;
+	plaindata.cd_length = plain_full_len;
+
+	cipherdata.cd_format = CRYPTO_DATA_UIO;
+	cipherdata.cd_offset = 0;
+	cipherdata.cd_uio = cuio;
+	cipherdata.cd_miscdata = NULL;
+	cipherdata.cd_length = datalen + maclen;
+
+	/* perform the actual encryption */
+	if (encrypt) {
+		ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata,
+		    NULL);
+		if (ret != CRYPTO_SUCCESS) {
+			ret = SET_ERROR(EIO);
+			goto error;
+		}
+	} else {
+		ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata,
+		    NULL);
+		if (ret != CRYPTO_SUCCESS) {
+			ASSERT3U(ret, ==, CRYPTO_INVALID_MAC);
+			ret = SET_ERROR(ECKSUM);
+			goto error;
+		}
+	}
+
+	return (0);
+
+error:
+	return (ret);
+}
+
+int
+zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
+    uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
+{
+	int ret;
+	uio_t puio, cuio;
+	uint64_t aad[3];
+	iovec_t plain_iovecs[2], cipher_iovecs[3];
+	uint64_t crypt = key->zk_crypt;
+	uint_t enc_len, keydata_len, aad_len;
+
+	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+	ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+	keydata_len = zio_crypt_table[crypt].ci_keylen;
+
+	/* generate iv for wrapping the master and hmac key */
+	ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN);
+	if (ret != 0)
+		goto error;
+
+	/* initialize uio_ts */
+	plain_iovecs[0].iov_base = key->zk_master_keydata;
+	plain_iovecs[0].iov_len = keydata_len;
+	plain_iovecs[1].iov_base = key->zk_hmac_keydata;
+	plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+
+	cipher_iovecs[0].iov_base = keydata_out;
+	cipher_iovecs[0].iov_len = keydata_len;
+	cipher_iovecs[1].iov_base = hmac_keydata_out;
+	cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+	cipher_iovecs[2].iov_base = mac;
+	cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
+
+	/*
+	 * Although we don't support writing to the old format, we do
+	 * support rewrapping the key so that the user can move and
+	 * quarantine datasets on the old format.
+	 */
+	if (key->zk_version == 0) {
+		aad_len = sizeof (uint64_t);
+		aad[0] = LE_64(key->zk_guid);
+	} else {
+		ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+		aad_len = sizeof (uint64_t) * 3;
+		aad[0] = LE_64(key->zk_guid);
+		aad[1] = LE_64(crypt);
+		aad[2] = LE_64(key->zk_version);
+	}
+
+	enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN;
+	puio.uio_iov = plain_iovecs;
+	puio.uio_iovcnt = 2;
+	puio.uio_segflg = UIO_SYSSPACE;
+	cuio.uio_iov = cipher_iovecs;
+	cuio.uio_iovcnt = 3;
+	cuio.uio_segflg = UIO_SYSSPACE;
+
+	/* encrypt the keys and store the resulting ciphertext and mac */
+	ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len,
+	    &puio, &cuio, (uint8_t *)aad, aad_len);
+	if (ret != 0)
+		goto error;
+
+	return (0);
+
+error:
+	return (ret);
+}
+
+int
+zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
+    uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
+    uint8_t *mac, zio_crypt_key_t *key)
+{
+	crypto_mechanism_t mech;
+	uio_t puio, cuio;
+	uint64_t aad[3];
+	iovec_t plain_iovecs[2], cipher_iovecs[3];
+	uint_t enc_len, keydata_len, aad_len;
+	int ret;
+
+	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+	ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+	keydata_len = zio_crypt_table[crypt].ci_keylen;
+
+	/* initialize uio_ts */
+	plain_iovecs[0].iov_base = key->zk_master_keydata;
+	plain_iovecs[0].iov_len = keydata_len;
+	plain_iovecs[1].iov_base = key->zk_hmac_keydata;
+	plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+
+	cipher_iovecs[0].iov_base = keydata;
+	cipher_iovecs[0].iov_len = keydata_len;
+	cipher_iovecs[1].iov_base = hmac_keydata;
+	cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+	cipher_iovecs[2].iov_base = mac;
+	cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
+
+	if (version == 0) {
+		aad_len = sizeof (uint64_t);
+		aad[0] = LE_64(guid);
+	} else {
+		ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+		aad_len = sizeof (uint64_t) * 3;
+		aad[0] = LE_64(guid);
+		aad[1] = LE_64(crypt);
+		aad[2] = LE_64(version);
+	}
+
+	enc_len = keydata_len + SHA512_HMAC_KEYLEN;
+	puio.uio_iov = plain_iovecs;
+	puio.uio_segflg = UIO_SYSSPACE;
+	puio.uio_iovcnt = 2;
+	cuio.uio_iov = cipher_iovecs;
+	cuio.uio_iovcnt = 3;
+	cuio.uio_segflg = UIO_SYSSPACE;
+
+	/* decrypt the keys and store the result in the output buffers */
+	ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len,
+	    &puio, &cuio, (uint8_t *)aad, aad_len);
+	if (ret != 0)
+		goto error;
+
+	/* generate a fresh salt */
+	ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+	if (ret != 0)
+		goto error;
+
+	/* derive the current key from the master key */
+	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+	    key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+	    keydata_len);
+	if (ret != 0)
+		goto error;
+
+	/* initialize keys for ICP */
+	key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+	key->zk_current_key.ck_data = key->zk_current_keydata;
+	key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+	key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+	key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+	key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+	/*
+	 * Initialize the crypto templates. It's ok if this fails because
+	 * this is just an optimization.
+	 */
+	mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
+	ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+	    &key->zk_current_tmpl, KM_SLEEP);
+	if (ret != CRYPTO_SUCCESS)
+		key->zk_current_tmpl = NULL;
+
+	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+	ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
+	    &key->zk_hmac_tmpl, KM_SLEEP);
+	if (ret != CRYPTO_SUCCESS)
+		key->zk_hmac_tmpl = NULL;
+
+	key->zk_crypt = crypt;
+	key->zk_version = version;
+	key->zk_guid = guid;
+	key->zk_salt_count = 0;
+
+	return (0);
+
+error:
+	zio_crypt_key_destroy(key);
+	return (ret);
+}
+
+int
+zio_crypt_generate_iv(uint8_t *ivbuf)
+{
+	int ret;
+
+	/* randomly generate the IV */
+	ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN);
+	if (ret != 0)
+		goto error;
+
+	return (0);
+
+error:
+	bzero(ivbuf, ZIO_DATA_IV_LEN);
+	return (ret);
+}
+
+int
+zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen,
+    uint8_t *digestbuf, uint_t digestlen)
+{
+	int ret;
+	crypto_mechanism_t mech;
+	crypto_data_t in_data, digest_data;
+	uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH];
+
+	ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH);
+
+	/* initialize sha512-hmac mechanism and crypto data */
+	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+	mech.cm_param = NULL;
+	mech.cm_param_len = 0;
+
+	/* initialize the crypto data */
+	in_data.cd_format = CRYPTO_DATA_RAW;
+	in_data.cd_offset = 0;
+	in_data.cd_length = datalen;
+	in_data.cd_raw.iov_base = (char *)data;
+	in_data.cd_raw.iov_len = in_data.cd_length;
+
+	digest_data.cd_format = CRYPTO_DATA_RAW;
+	digest_data.cd_offset = 0;
+	digest_data.cd_length = SHA512_DIGEST_LENGTH;
+	digest_data.cd_raw.iov_base = (char *)raw_digestbuf;
+	digest_data.cd_raw.iov_len = digest_data.cd_length;
+
+	/* generate the hmac */
+	ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl,
+	    &digest_data, NULL);
+	if (ret != CRYPTO_SUCCESS) {
+		ret = SET_ERROR(EIO);
+		goto error;
+	}
+
+	bcopy(raw_digestbuf, digestbuf, digestlen);
+
+	return (0);
+
+error:
+	bzero(digestbuf, digestlen);
+	return (ret);
+}
+
+int
+zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data,
+    uint_t datalen, uint8_t *ivbuf, uint8_t *salt)
+{
+	int ret;
+	uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+	ret = zio_crypt_do_hmac(key, data, datalen,
+	    digestbuf, SHA512_DIGEST_LENGTH);
+	if (ret != 0)
+		return (ret);
+
+	bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN);
+	bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN);
+
+	return (0);
+}
+
+/*
+ * The following functions are used to encode and decode encryption parameters
+ * into blkptr_t and zil_header_t. The ICP wants to use these parameters as
+ * byte strings, which normally means that these strings would not need to deal
+ * with byteswapping at all. However, both blkptr_t and zil_header_t may be
+ * byteswapped by lower layers and so we must "undo" that byteswap here upon
+ * decoding and encoding in a non-native byteorder. These functions require
+ * that the byteorder bit is correct before being called.
+ */
+void
+zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+	uint64_t val64;
+	uint32_t val32;
+
+	ASSERT(BP_IS_ENCRYPTED(bp));
+
+	if (!BP_SHOULD_BYTESWAP(bp)) {
+		bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t));
+		bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t));
+		bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+		BP_SET_IV2(bp, val32);
+	} else {
+		bcopy(salt, &val64, sizeof (uint64_t));
+		bp->blk_dva[2].dva_word[0] = BSWAP_64(val64);
+
+		bcopy(iv, &val64, sizeof (uint64_t));
+		bp->blk_dva[2].dva_word[1] = BSWAP_64(val64);
+
+		bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+		BP_SET_IV2(bp, BSWAP_32(val32));
+	}
+}
+
+void
+zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+	uint64_t val64;
+	uint32_t val32;
+
+	ASSERT(BP_IS_PROTECTED(bp));
+
+	/* for convenience, so callers don't need to check */
+	if (BP_IS_AUTHENTICATED(bp)) {
+		bzero(salt, ZIO_DATA_SALT_LEN);
+		bzero(iv, ZIO_DATA_IV_LEN);
+		return;
+	}
+
+	if (!BP_SHOULD_BYTESWAP(bp)) {
+		bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t));
+		bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t));
+
+		val32 = (uint32_t)BP_GET_IV2(bp);
+		bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+	} else {
+		val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]);
+		bcopy(&val64, salt, sizeof (uint64_t));
+
+		val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]);
+		bcopy(&val64, iv, sizeof (uint64_t));
+
+		val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp));
+		bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+	}
+}
+
+void
+zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac)
+{
+	uint64_t val64;
+
+	ASSERT(BP_USES_CRYPT(bp));
+	ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET);
+
+	if (!BP_SHOULD_BYTESWAP(bp)) {
+		bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t));
+		bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3],
+		    sizeof (uint64_t));
+	} else {
+		bcopy(mac, &val64, sizeof (uint64_t));
+		bp->blk_cksum.zc_word[2] = BSWAP_64(val64);
+
+		bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t));
+		bp->blk_cksum.zc_word[3] = BSWAP_64(val64);
+	}
+}
+
+void
+zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac)
+{
+	uint64_t val64;
+
+	ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp));
+
+	/* for convenience, so callers don't need to check */
+	if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+		bzero(mac, ZIO_DATA_MAC_LEN);
+		return;
+	}
+
+	if (!BP_SHOULD_BYTESWAP(bp)) {
+		bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t));
+		bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t),
+		    sizeof (uint64_t));
+	} else {
+		val64 = BSWAP_64(bp->blk_cksum.zc_word[2]);
+		bcopy(&val64, mac, sizeof (uint64_t));
+
+		val64 = BSWAP_64(bp->blk_cksum.zc_word[3]);
+		bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t));
+	}
+}
+
+void
+zio_crypt_encode_mac_zil(void *data, uint8_t *mac)
+{
+	zil_chain_t *zilc = data;
+
+	bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t));
+	bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3],
+	    sizeof (uint64_t));
+}
+
+void
+zio_crypt_decode_mac_zil(const void *data, uint8_t *mac)
+{
+	/*
+	 * The ZIL MAC is embedded in the block it protects, which will
+	 * not have been byteswapped by the time this function has been called.
+	 * As a result, we don't need to worry about byteswapping the MAC.
+	 */
+	const zil_chain_t *zilc = data;
+
+	bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t));
+	bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t),
+	    sizeof (uint64_t));
+}
+
+/*
+ * This routine takes a block of dnodes (src_abd) and copies only the bonus
+ * buffers to the same offsets in the dst buffer. datalen should be the size
+ * of both the src_abd and the dst buffer (not just the length of the bonus
+ * buffers).
+ */
+void
+zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen)
+{
+	uint_t i, max_dnp = datalen >> DNODE_SHIFT;
+	uint8_t *src;
+	dnode_phys_t *dnp, *sdnp, *ddnp;
+
+	src = abd_borrow_buf_copy(src_abd, datalen);
+
+	sdnp = (dnode_phys_t *)src;
+	ddnp = (dnode_phys_t *)dst;
+
+	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+		dnp = &sdnp[i];
+		if (dnp->dn_type != DMU_OT_NONE &&
+		    DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+		    dnp->dn_bonuslen != 0) {
+			bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]),
+			    DN_MAX_BONUS_LEN(dnp));
+		}
+	}
+
+	abd_return_buf(src_abd, src, datalen);
+}
+
+/*
+ * This function decides what fields from blk_prop are included in
+ * the on-disk various MAC algorithms.
+ */
+static void
+zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version)
+{
+	/*
+	 * Version 0 did not properly zero out all non-portable fields
+	 * as it should have done. We maintain this code so that we can
+	 * do read-only imports of pools on this version.
+	 */
+	if (version == 0) {
+		BP_SET_DEDUP(bp, 0);
+		BP_SET_CHECKSUM(bp, 0);
+		BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
+		return;
+	}
+
+	ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+
+	/*
+	 * The hole_birth feature might set these fields even if this bp
+	 * is a hole. We zero them out here to guarantee that raw sends
+	 * will function with or without the feature.
+	 */
+	if (BP_IS_HOLE(bp)) {
+		bp->blk_prop = 0ULL;
+		return;
+	}
+
+	/*
+	 * At L0 we want to verify these fields to ensure that data blocks
+	 * can not be reinterpreted. For instance, we do not want an attacker
+	 * to trick us into returning raw lz4 compressed data to the user
+	 * by modifying the compression bits. At higher levels, we cannot
+	 * enforce this policy since raw sends do not convey any information
+	 * about indirect blocks, so these values might be different on the
+	 * receive side. Fortunately, this does not open any new attack
+	 * vectors, since any alterations that can be made to a higher level
+	 * bp must still verify the correct order of the layer below it.
+	 */
+	if (BP_GET_LEVEL(bp) != 0) {
+		BP_SET_BYTEORDER(bp, 0);
+		BP_SET_COMPRESS(bp, 0);
+
+		/*
+		 * psize cannot be set to zero or it will trigger
+		 * asserts, but the value doesn't really matter as
+		 * long as it is constant.
+		 */
+		BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
+	}
+
+	BP_SET_DEDUP(bp, 0);
+	BP_SET_CHECKSUM(bp, 0);
+}
+
+static void
+zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp,
+    blkptr_auth_buf_t *bab, uint_t *bab_len)
+{
+	blkptr_t tmpbp = *bp;
+
+	if (should_bswap)
+		byteswap_uint64_array(&tmpbp, sizeof (blkptr_t));
+
+	ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp));
+	ASSERT0(BP_IS_EMBEDDED(&tmpbp));
+
+	zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac);
+
+	/*
+	 * We always MAC blk_prop in LE to ensure portability. This
+	 * must be done after decoding the mac, since the endianness
+	 * will get zero'd out here.
+	 */
+	zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version);
+	bab->bab_prop = LE_64(tmpbp.blk_prop);
+	bab->bab_pad = 0ULL;
+
+	/* version 0 did not include the padding */
+	*bab_len = sizeof (blkptr_auth_buf_t);
+	if (version == 0)
+		*bab_len -= sizeof (uint64_t);
+}
+
+static int
+zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version,
+    boolean_t should_bswap, blkptr_t *bp)
+{
+	int ret;
+	uint_t bab_len;
+	blkptr_auth_buf_t bab;
+	crypto_data_t cd;
+
+	zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+	cd.cd_format = CRYPTO_DATA_RAW;
+	cd.cd_offset = 0;
+	cd.cd_length = bab_len;
+	cd.cd_raw.iov_base = (char *)&bab;
+	cd.cd_raw.iov_len = cd.cd_length;
+
+	ret = crypto_mac_update(ctx, &cd, NULL);
+	if (ret != CRYPTO_SUCCESS) {
+		ret = SET_ERROR(EIO);
+		goto error;
+	}
+
+	return (0);
+
+error:
+	return (ret);
+}
+
+static void
+zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version,
+    boolean_t should_bswap, blkptr_t *bp)
+{
+	uint_t bab_len;
+	blkptr_auth_buf_t bab;
+
+	zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+	SHA2Update(ctx, &bab, bab_len);
+}
+
+static void
+zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version,
+    boolean_t should_bswap, blkptr_t *bp)
+{
+	uint_t bab_len;
+	blkptr_auth_buf_t bab;
+
+	zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+	bcopy(&bab, *aadp, bab_len);
+	*aadp += bab_len;
+	*aad_len += bab_len;
+}
+
+static int
+zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
+    boolean_t should_bswap, dnode_phys_t *dnp)
+{
+	int ret, i;
+	dnode_phys_t *adnp;
+	boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+	crypto_data_t cd;
+	uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)];
+
+	cd.cd_format = CRYPTO_DATA_RAW;
+	cd.cd_offset = 0;
+
+	/* authenticate the core dnode (masking out non-portable bits) */
+	bcopy(dnp, tmp_dncore, sizeof (tmp_dncore));
+	adnp = (dnode_phys_t *)tmp_dncore;
+	if (le_bswap) {
+		adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec);
+		adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen);
+		adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid);
+		adnp->dn_used = BSWAP_64(adnp->dn_used);
+	}
+	adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+	adnp->dn_used = 0;
+
+	cd.cd_length = sizeof (tmp_dncore);
+	cd.cd_raw.iov_base = (char *)adnp;
+	cd.cd_raw.iov_len = cd.cd_length;
+
+	ret = crypto_mac_update(ctx, &cd, NULL);
+	if (ret != CRYPTO_SUCCESS) {
+		ret = SET_ERROR(EIO);
+		goto error;
+	}
+
+	for (i = 0; i < dnp->dn_nblkptr; i++) {
+		ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+		    should_bswap, &dnp->dn_blkptr[i]);
+		if (ret != 0)
+			goto error;
+	}
+
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+		    should_bswap, DN_SPILL_BLKPTR(dnp));
+		if (ret != 0)
+			goto error;
+	}
+
+	return (0);
+
+error:
+	return (ret);
+}
+
+/*
+ * objset_phys_t blocks introduce a number of exceptions to the normal
+ * authentication process. objset_phys_t's contain 2 separate HMACS for
+ * protecting the integrity of their data. The portable_mac protects the
+ * metadnode. This MAC can be sent with a raw send and protects against
+ * reordering of data within the metadnode. The local_mac protects the user
+ * accounting objects which are not sent from one system to another.
+ *
+ * In addition, objset blocks are the only blocks that can be modified and
+ * written to disk without the key loaded under certain circumstances. During
+ * zil_claim() we need to be able to update the zil_header_t to complete
+ * claiming log blocks and during raw receives we need to write out the
+ * portable_mac from the send file. Both of these actions are possible
+ * because these fields are not protected by either MAC so neither one will
+ * need to modify the MACs without the key. However, when the modified blocks
+ * are written out they will be byteswapped into the host machine's native
+ * endianness which will modify fields protected by the MAC. As a result, MAC
+ * calculation for objset blocks works slightly differently from other block
+ * types. Where other block types MAC the data in whatever endianness is
+ * written to disk, objset blocks always MAC little endian version of their
+ * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP()
+ * and le_bswap indicates whether a byteswap is needed to get this block
+ * into little endian format.
+ */
+int
+zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
+    boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac)
+{
+	int ret;
+	crypto_mechanism_t mech;
+	crypto_context_t ctx;
+	crypto_data_t cd;
+	objset_phys_t *osp = data;
+	uint64_t intval;
+	boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+	uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH];
+	uint8_t raw_local_mac[SHA512_DIGEST_LENGTH];
+
+	/* initialize HMAC mechanism */
+	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+	mech.cm_param = NULL;
+	mech.cm_param_len = 0;
+
+	cd.cd_format = CRYPTO_DATA_RAW;
+	cd.cd_offset = 0;
+
+	/* calculate the portable MAC from the portable fields and metadnode */
+	ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL);
+	if (ret != CRYPTO_SUCCESS) {
+		ret = SET_ERROR(EIO);
+		goto error;
+	}
+
+	/* add in the os_type */
+	intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type);
+	cd.cd_length = sizeof (uint64_t);
+	cd.cd_raw.iov_base = (char *)&intval;
+	cd.cd_raw.iov_len = cd.cd_length;
+
+	ret = crypto_mac_update(ctx, &cd, NULL);
+	if (ret != CRYPTO_SUCCESS) {
+		ret = SET_ERROR(EIO);
+		goto error;
+	}
+
+	/* add in the portable os_flags */
+	intval = osp->os_flags;
+	if (should_bswap)
+		intval = BSWAP_64(intval);
+	intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+	if (!ZFS_HOST_BYTEORDER)
+		intval = BSWAP_64(intval);
+
+	cd.cd_length = sizeof (uint64_t);
+	cd.cd_raw.iov_base = (char *)&intval;
+	cd.cd_raw.iov_len = cd.cd_length;
+
+	ret = crypto_mac_update(ctx, &cd, NULL);
+	if (ret != CRYPTO_SUCCESS) {
+		ret = SET_ERROR(EIO);
+		goto error;
+	}
+
+	/* add in fields from the metadnode */
+	ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+	    should_bswap, &osp->os_meta_dnode);
+	if (ret)
+		goto error;
+
+	/* store the final digest in a temporary buffer and copy what we need */
+	cd.cd_length = SHA512_DIGEST_LENGTH;
+	cd.cd_raw.iov_base = (char *)raw_portable_mac;
+	cd.cd_raw.iov_len = cd.cd_length;
+
+	ret = crypto_mac_final(ctx, &cd, NULL);
+	if (ret != CRYPTO_SUCCESS) {
+		ret = SET_ERROR(EIO);
+		goto error;
+	}
+
+	bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN);
+
+	/*
+	 * The local MAC protects the user, group and project accounting.
+	 * If these objects are not present, the local MAC is zeroed out.
+	 */
+	if ((datalen >= OBJSET_PHYS_SIZE_V3 &&
+	    osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+	    osp->os_groupused_dnode.dn_type == DMU_OT_NONE &&
+	    osp->os_projectused_dnode.dn_type == DMU_OT_NONE) ||
+	    (datalen >= OBJSET_PHYS_SIZE_V2 &&
+	    osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+	    osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
+	    (datalen <= OBJSET_PHYS_SIZE_V1)) {
+		bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+		return (0);
+	}
+
+	/* calculate the local MAC from the userused and groupused dnodes */
+	ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL);
+	if (ret != CRYPTO_SUCCESS) {
+		ret = SET_ERROR(EIO);
+		goto error;
+	}
+
+	/* add in the non-portable os_flags */
+	intval = osp->os_flags;
+	if (should_bswap)
+		intval = BSWAP_64(intval);
+	intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+	if (!ZFS_HOST_BYTEORDER)
+		intval = BSWAP_64(intval);
+
+	cd.cd_length = sizeof (uint64_t);
+	cd.cd_raw.iov_base = (char *)&intval;
+	cd.cd_raw.iov_len = cd.cd_length;
+
+	ret = crypto_mac_update(ctx, &cd, NULL);
+	if (ret != CRYPTO_SUCCESS) {
+		ret = SET_ERROR(EIO);
+		goto error;
+	}
+
+	/* add in fields from the user accounting dnodes */
+	if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) {
+		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+		    should_bswap, &osp->os_userused_dnode);
+		if (ret)
+			goto error;
+	}
+
+	if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) {
+		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+		    should_bswap, &osp->os_groupused_dnode);
+		if (ret)
+			goto error;
+	}
+
+	if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE &&
+	    datalen >= OBJSET_PHYS_SIZE_V3) {
+		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+		    should_bswap, &osp->os_projectused_dnode);
+		if (ret)
+			goto error;
+	}
+
+	/* store the final digest in a temporary buffer and copy what we need */
+	cd.cd_length = SHA512_DIGEST_LENGTH;
+	cd.cd_raw.iov_base = (char *)raw_local_mac;
+	cd.cd_raw.iov_len = cd.cd_length;
+
+	ret = crypto_mac_final(ctx, &cd, NULL);
+	if (ret != CRYPTO_SUCCESS) {
+		ret = SET_ERROR(EIO);
+		goto error;
+	}
+
+	bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN);
+
+	return (0);
+
+error:
+	bzero(portable_mac, ZIO_OBJSET_MAC_LEN);
+	bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+	return (ret);
+}
+
+static void
+zio_crypt_destroy_uio(uio_t *uio)
+{
+	if (uio->uio_iov)
+		kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t));
+}
+
+/*
+ * This function parses an uncompressed indirect block and returns a checksum
+ * of all the portable fields from all of the contained bps. The portable
+ * fields are the MAC and all of the fields from blk_prop except for the dedup,
+ * checksum, and psize bits. For an explanation of the purpose of this, see
+ * the comment block on object set authentication.
+ */
+static int
+zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf,
+    uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum)
+{
+	blkptr_t *bp;
+	int i, epb = datalen >> SPA_BLKPTRSHIFT;
+	SHA2_CTX ctx;
+	uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+	/* checksum all of the MACs from the layer below */
+	SHA2Init(SHA512, &ctx);
+	for (i = 0, bp = buf; i < epb; i++, bp++) {
+		zio_crypt_bp_do_indrect_checksum_updates(&ctx, version,
+		    byteswap, bp);
+	}
+	SHA2Final(digestbuf, &ctx);
+
+	if (generate) {
+		bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN);
+		return (0);
+	}
+
+	if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0)
+		return (SET_ERROR(ECKSUM));
+
+	return (0);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
+    uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+	int ret;
+
+	/*
+	 * Unfortunately, callers of this function will not always have
+	 * easy access to the on-disk format version. This info is
+	 * normally found in the DSL Crypto Key, but the checksum-of-MACs
+	 * is expected to be verifiable even when the key isn't loaded.
+	 * Here, instead of doing a ZAP lookup for the version for each
+	 * zio, we simply try both existing formats.
+	 */
+	ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
+	    datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum);
+	if (ret == ECKSUM) {
+		ASSERT(!generate);
+		ret = zio_crypt_do_indirect_mac_checksum_impl(generate,
+		    buf, datalen, 0, byteswap, cksum);
+	}
+
+	return (ret);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
+    uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+	int ret;
+	void *buf;
+
+	buf = abd_borrow_buf_copy(abd, datalen);
+	ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen,
+	    byteswap, cksum);
+	abd_return_buf(abd, buf, datalen);
+
+	return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting ZIL blocks.
+ * We do not check for the older ZIL chain because the encryption feature
+ * was not available before the newer ZIL chain was introduced. The goal
+ * here is to encrypt everything except the blkptr_t of a lr_write_t and
+ * the zil_chain_t header. Everything that is not encrypted is authenticated.
+ */
+static int
+zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
+    uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio,
+    uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
+    boolean_t *no_crypt)
+{
+	int ret;
+	uint64_t txtype, lr_len;
+	uint_t nr_src, nr_dst, crypt_len;
+	uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+	iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+	uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
+	zil_chain_t *zilc;
+	lr_t *lr;
+	uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+	/* cipherbuf always needs an extra iovec for the MAC */
+	if (encrypt) {
+		src = plainbuf;
+		dst = cipherbuf;
+		nr_src = 0;
+		nr_dst = 1;
+	} else {
+		src = cipherbuf;
+		dst = plainbuf;
+		nr_src = 1;
+		nr_dst = 0;
+	}
+
+	/* find the start and end record of the log block */
+	zilc = (zil_chain_t *)src;
+	slrp = src + sizeof (zil_chain_t);
+	aadp = aadbuf;
+	blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+
+	/* calculate the number of encrypted iovecs we will need */
+	for (; slrp < blkend; slrp += lr_len) {
+		lr = (lr_t *)slrp;
+
+		if (!byteswap) {
+			txtype = lr->lrc_txtype;
+			lr_len = lr->lrc_reclen;
+		} else {
+			txtype = BSWAP_64(lr->lrc_txtype);
+			lr_len = BSWAP_64(lr->lrc_reclen);
+		}
+
+		nr_iovecs++;
+		if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
+			nr_iovecs++;
+	}
+
+	nr_src += nr_iovecs;
+	nr_dst += nr_iovecs;
+
+	/* allocate the iovec arrays */
+	if (nr_src != 0) {
+		src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+		if (src_iovecs == NULL) {
+			ret = SET_ERROR(ENOMEM);
+			goto error;
+		}
+	}
+
+	if (nr_dst != 0) {
+		dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+		if (dst_iovecs == NULL) {
+			ret = SET_ERROR(ENOMEM);
+			goto error;
+		}
+	}
+
+	/*
+	 * Copy the plain zil header over and authenticate everything except
+	 * the checksum that will store our MAC. If we are writing the data
+	 * the embedded checksum will not have been calculated yet, so we don't
+	 * authenticate that.
+	 */
+	bcopy(src, dst, sizeof (zil_chain_t));
+	bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t));
+	aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+	aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+
+	/* loop over records again, filling in iovecs */
+	nr_iovecs = 0;
+	slrp = src + sizeof (zil_chain_t);
+	dlrp = dst + sizeof (zil_chain_t);
+
+	for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
+		lr = (lr_t *)slrp;
+
+		if (!byteswap) {
+			txtype = lr->lrc_txtype;
+			lr_len = lr->lrc_reclen;
+		} else {
+			txtype = BSWAP_64(lr->lrc_txtype);
+			lr_len = BSWAP_64(lr->lrc_reclen);
+		}
+
+		/* copy the common lr_t */
+		bcopy(slrp, dlrp, sizeof (lr_t));
+		bcopy(slrp, aadp, sizeof (lr_t));
+		aadp += sizeof (lr_t);
+		aad_len += sizeof (lr_t);
+
+		ASSERT3P(src_iovecs, !=, NULL);
+		ASSERT3P(dst_iovecs, !=, NULL);
+
+		/*
+		 * If this is a TX_WRITE record we want to encrypt everything
+		 * except the bp if exists. If the bp does exist we want to
+		 * authenticate it.
+		 */
+		if (txtype == TX_WRITE) {
+			crypt_len = sizeof (lr_write_t) -
+			    sizeof (lr_t) - sizeof (blkptr_t);
+			src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
+			src_iovecs[nr_iovecs].iov_len = crypt_len;
+			dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
+			dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+			/* copy the bp now since it will not be encrypted */
+			bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+			    dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+			    sizeof (blkptr_t));
+			bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+			    aadp, sizeof (blkptr_t));
+			aadp += sizeof (blkptr_t);
+			aad_len += sizeof (blkptr_t);
+			nr_iovecs++;
+			total_len += crypt_len;
+
+			if (lr_len != sizeof (lr_write_t)) {
+				crypt_len = lr_len - sizeof (lr_write_t);
+				src_iovecs[nr_iovecs].iov_base =
+				    slrp + sizeof (lr_write_t);
+				src_iovecs[nr_iovecs].iov_len = crypt_len;
+				dst_iovecs[nr_iovecs].iov_base =
+				    dlrp + sizeof (lr_write_t);
+				dst_iovecs[nr_iovecs].iov_len = crypt_len;
+				nr_iovecs++;
+				total_len += crypt_len;
+			}
+		} else {
+			crypt_len = lr_len - sizeof (lr_t);
+			src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
+			src_iovecs[nr_iovecs].iov_len = crypt_len;
+			dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
+			dst_iovecs[nr_iovecs].iov_len = crypt_len;
+			nr_iovecs++;
+			total_len += crypt_len;
+		}
+	}
+
+	*no_crypt = (nr_iovecs == 0);
+	*enc_len = total_len;
+	*authbuf = aadbuf;
+	*auth_len = aad_len;
+
+	if (encrypt) {
+		puio->uio_iov = src_iovecs;
+		puio->uio_iovcnt = nr_src;
+		cuio->uio_iov = dst_iovecs;
+		cuio->uio_iovcnt = nr_dst;
+	} else {
+		puio->uio_iov = dst_iovecs;
+		puio->uio_iovcnt = nr_dst;
+		cuio->uio_iov = src_iovecs;
+		cuio->uio_iovcnt = nr_src;
+	}
+
+	return (0);
+
+error:
+	zio_buf_free(aadbuf, datalen);
+	if (src_iovecs != NULL)
+		kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+	if (dst_iovecs != NULL)
+		kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+	*enc_len = 0;
+	*authbuf = NULL;
+	*auth_len = 0;
+	*no_crypt = B_FALSE;
+	puio->uio_iov = NULL;
+	puio->uio_iovcnt = 0;
+	cuio->uio_iov = NULL;
+	cuio->uio_iovcnt = 0;
+	return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting dnode blocks.
+ */
+static int
+zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
+    uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+    uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf,
+    uint_t *auth_len, boolean_t *no_crypt)
+{
+	int ret;
+	uint_t nr_src, nr_dst, crypt_len;
+	uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+	uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
+	iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+	uint8_t *src, *dst, *aadp;
+	dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
+	uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+	if (encrypt) {
+		src = plainbuf;
+		dst = cipherbuf;
+		nr_src = 0;
+		nr_dst = 1;
+	} else {
+		src = cipherbuf;
+		dst = plainbuf;
+		nr_src = 1;
+		nr_dst = 0;
+	}
+
+	sdnp = (dnode_phys_t *)src;
+	ddnp = (dnode_phys_t *)dst;
+	aadp = aadbuf;
+
+	/*
+	 * Count the number of iovecs we will need to do the encryption by
+	 * counting the number of bonus buffers that need to be encrypted.
+	 */
+	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+		/*
+		 * This block may still be byteswapped. However, all of the
+		 * values we use are either uint8_t's (for which byteswapping
+		 * is a noop) or a * != 0 check, which will work regardless
+		 * of whether or not we byteswap.
+		 */
+		if (sdnp[i].dn_type != DMU_OT_NONE &&
+		    DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
+		    sdnp[i].dn_bonuslen != 0) {
+			nr_iovecs++;
+		}
+	}
+
+	nr_src += nr_iovecs;
+	nr_dst += nr_iovecs;
+
+	if (nr_src != 0) {
+		src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+		if (src_iovecs == NULL) {
+			ret = SET_ERROR(ENOMEM);
+			goto error;
+		}
+	}
+
+	if (nr_dst != 0) {
+		dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+		if (dst_iovecs == NULL) {
+			ret = SET_ERROR(ENOMEM);
+			goto error;
+		}
+	}
+
+	nr_iovecs = 0;
+
+	/*
+	 * Iterate through the dnodes again, this time filling in the uios
+	 * we allocated earlier. We also concatenate any data we want to
+	 * authenticate onto aadbuf.
+	 */
+	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+		dnp = &sdnp[i];
+
+		/* copy over the core fields and blkptrs (kept as plaintext) */
+		bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
+
+		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+			bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]),
+			    sizeof (blkptr_t));
+		}
+
+		/*
+		 * Handle authenticated data. We authenticate everything in
+		 * the dnode that can be brought over when we do a raw send.
+		 * This includes all of the core fields as well as the MACs
+		 * stored in the bp checksums and all of the portable bits
+		 * from blk_prop. We include the dnode padding here in case it
+		 * ever gets used in the future. Some dn_flags and dn_used are
+		 * not portable so we mask those out values out of the
+		 * authenticated data.
+		 */
+		crypt_len = offsetof(dnode_phys_t, dn_blkptr);
+		bcopy(dnp, aadp, crypt_len);
+		adnp = (dnode_phys_t *)aadp;
+		adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+		adnp->dn_used = 0;
+		aadp += crypt_len;
+		aad_len += crypt_len;
+
+		for (j = 0; j < dnp->dn_nblkptr; j++) {
+			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+			    version, byteswap, &dnp->dn_blkptr[j]);
+		}
+
+		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+			    version, byteswap, DN_SPILL_BLKPTR(dnp));
+		}
+
+		/*
+		 * If this bonus buffer needs to be encrypted, we prepare an
+		 * iovec_t. The encryption / decryption functions will fill
+		 * this in for us with the encrypted or decrypted data.
+		 * Otherwise we add the bonus buffer to the authenticated
+		 * data buffer and copy it over to the destination. The
+		 * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
+		 * we can guarantee alignment with the AES block size
+		 * (128 bits).
+		 */
+		crypt_len = DN_MAX_BONUS_LEN(dnp);
+		if (dnp->dn_type != DMU_OT_NONE &&
+		    DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+		    dnp->dn_bonuslen != 0) {
+			ASSERT3U(nr_iovecs, <, nr_src);
+			ASSERT3U(nr_iovecs, <, nr_dst);
+			ASSERT3P(src_iovecs, !=, NULL);
+			ASSERT3P(dst_iovecs, !=, NULL);
+			src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp);
+			src_iovecs[nr_iovecs].iov_len = crypt_len;
+			dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]);
+			dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+			nr_iovecs++;
+			total_len += crypt_len;
+		} else {
+			bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len);
+			bcopy(DN_BONUS(dnp), aadp, crypt_len);
+			aadp += crypt_len;
+			aad_len += crypt_len;
+		}
+	}
+
+	*no_crypt = (nr_iovecs == 0);
+	*enc_len = total_len;
+	*authbuf = aadbuf;
+	*auth_len = aad_len;
+
+	if (encrypt) {
+		puio->uio_iov = src_iovecs;
+		puio->uio_iovcnt = nr_src;
+		cuio->uio_iov = dst_iovecs;
+		cuio->uio_iovcnt = nr_dst;
+	} else {
+		puio->uio_iov = dst_iovecs;
+		puio->uio_iovcnt = nr_dst;
+		cuio->uio_iov = src_iovecs;
+		cuio->uio_iovcnt = nr_src;
+	}
+
+	return (0);
+
+error:
+	zio_buf_free(aadbuf, datalen);
+	if (src_iovecs != NULL)
+		kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+	if (dst_iovecs != NULL)
+		kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+	*enc_len = 0;
+	*authbuf = NULL;
+	*auth_len = 0;
+	*no_crypt = B_FALSE;
+	puio->uio_iov = NULL;
+	puio->uio_iovcnt = 0;
+	cuio->uio_iov = NULL;
+	cuio->uio_iovcnt = 0;
+	return (ret);
+}
+
+static int
+zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
+    uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *cuio,
+    uint_t *enc_len)
+{
+	int ret;
+	uint_t nr_plain = 1, nr_cipher = 2;
+	iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL;
+
+	/* allocate the iovecs for the plain and cipher data */
+	plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t),
+	    KM_SLEEP);
+	if (!plain_iovecs) {
+		ret = SET_ERROR(ENOMEM);
+		goto error;
+	}
+
+	cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t),
+	    KM_SLEEP);
+	if (!cipher_iovecs) {
+		ret = SET_ERROR(ENOMEM);
+		goto error;
+	}
+
+	plain_iovecs[0].iov_base = plainbuf;
+	plain_iovecs[0].iov_len = datalen;
+	cipher_iovecs[0].iov_base = cipherbuf;
+	cipher_iovecs[0].iov_len = datalen;
+
+	*enc_len = datalen;
+	puio->uio_iov = plain_iovecs;
+	puio->uio_iovcnt = nr_plain;
+	cuio->uio_iov = cipher_iovecs;
+	cuio->uio_iovcnt = nr_cipher;
+
+	return (0);
+
+error:
+	if (plain_iovecs != NULL)
+		kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t));
+	if (cipher_iovecs != NULL)
+		kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+	*enc_len = 0;
+	puio->uio_iov = NULL;
+	puio->uio_iovcnt = 0;
+	cuio->uio_iov = NULL;
+	cuio->uio_iovcnt = 0;
+	return (ret);
+}
+
+/*
+ * This function builds up the plaintext (puio) and ciphertext (cuio) uios so
+ * that they can be used for encryption and decryption by zio_do_crypt_uio().
+ * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks
+ * requiring special handling to parse out pieces that are to be encrypted. The
+ * authbuf is used by these special cases to store additional authenticated
+ * data (AAD) for the encryption modes.
+ */
+static int
+zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
+    uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+    uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf,
+    uint_t *auth_len, boolean_t *no_crypt)
+{
+	int ret;
+	iovec_t *mac_iov;
+
+	ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
+
+	/* route to handler */
+	switch (ot) {
+	case DMU_OT_INTENT_LOG:
+		ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
+		    datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
+		    no_crypt);
+		break;
+	case DMU_OT_DNODE:
+		ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
+		    cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
+		    auth_len, no_crypt);
+		break;
+	default:
+		ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
+		    datalen, puio, cuio, enc_len);
+		*authbuf = NULL;
+		*auth_len = 0;
+		*no_crypt = B_FALSE;
+		break;
+	}
+
+	if (ret != 0)
+		goto error;
+
+	/* populate the uios */
+	puio->uio_segflg = UIO_SYSSPACE;
+	cuio->uio_segflg = UIO_SYSSPACE;
+
+	mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]);
+	mac_iov->iov_base = mac;
+	mac_iov->iov_len = ZIO_DATA_MAC_LEN;
+
+	return (0);
+
+error:
+	return (ret);
+}
+
+/*
+ * Primary encryption / decryption entrypoint for zio data.
+ */
+int
+zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
+    dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
+    uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
+    boolean_t *no_crypt)
+{
+	int ret;
+	boolean_t locked = B_FALSE;
+	uint64_t crypt = key->zk_crypt;
+	uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
+	uint_t enc_len, auth_len;
+	uio_t puio, cuio;
+	uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
+	crypto_key_t tmp_ckey, *ckey = NULL;
+	crypto_ctx_template_t tmpl;
+	uint8_t *authbuf = NULL;
+
+	/*
+	 * If the needed key is the current one, just use it. Otherwise we
+	 * need to generate a temporary one from the given salt + master key.
+	 * If we are encrypting, we must return a copy of the current salt
+	 * so that it can be stored in the blkptr_t.
+	 */
+	rw_enter(&key->zk_salt_lock, RW_READER);
+	locked = B_TRUE;
+
+	if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
+		ckey = &key->zk_current_key;
+		tmpl = key->zk_current_tmpl;
+	} else {
+		rw_exit(&key->zk_salt_lock);
+		locked = B_FALSE;
+
+		ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+		    salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
+		if (ret != 0)
+			goto error;
+
+		tmp_ckey.ck_format = CRYPTO_KEY_RAW;
+		tmp_ckey.ck_data = enc_keydata;
+		tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+		ckey = &tmp_ckey;
+		tmpl = NULL;
+	}
+
+	/*
+	 * Attempt to use QAT acceleration if we can. We currently don't
+	 * do this for metadnode and ZIL blocks, since they have a much
+	 * more involved buffer layout and the qat_crypt() function only
+	 * works in-place.
+	 */
+	if (qat_crypt_use_accel(datalen) &&
+	    ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) {
+		uint8_t *srcbuf, *dstbuf;
+
+		if (encrypt) {
+			srcbuf = plainbuf;
+			dstbuf = cipherbuf;
+		} else {
+			srcbuf = cipherbuf;
+			dstbuf = plainbuf;
+		}
+
+		ret = qat_crypt((encrypt) ? QAT_ENCRYPT : QAT_DECRYPT, srcbuf,
+		    dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen);
+		if (ret == CPA_STATUS_SUCCESS) {
+			if (locked) {
+				rw_exit(&key->zk_salt_lock);
+				locked = B_FALSE;
+			}
+
+			return (0);
+		}
+		/* If the hardware implementation fails fall back to software */
+	}
+
+	bzero(&puio, sizeof (uio_t));
+	bzero(&cuio, sizeof (uio_t));
+
+	/* create uios for encryption */
+	ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
+	    cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
+	    &authbuf, &auth_len, no_crypt);
+	if (ret != 0)
+		goto error;
+
+	/* perform the encryption / decryption in software */
+	ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len,
+	    &puio, &cuio, authbuf, auth_len);
+	if (ret != 0)
+		goto error;
+
+	if (locked) {
+		rw_exit(&key->zk_salt_lock);
+		locked = B_FALSE;
+	}
+
+	if (authbuf != NULL)
+		zio_buf_free(authbuf, datalen);
+	if (ckey == &tmp_ckey)
+		bzero(enc_keydata, keydata_len);
+	zio_crypt_destroy_uio(&puio);
+	zio_crypt_destroy_uio(&cuio);
+
+	return (0);
+
+error:
+	if (locked)
+		rw_exit(&key->zk_salt_lock);
+	if (authbuf != NULL)
+		zio_buf_free(authbuf, datalen);
+	if (ckey == &tmp_ckey)
+		bzero(enc_keydata, keydata_len);
+	zio_crypt_destroy_uio(&puio);
+	zio_crypt_destroy_uio(&cuio);
+
+	return (ret);
+}
+
+/*
+ * Simple wrapper around zio_do_crypt_data() to work with abd's instead of
+ * linear buffers.
+ */
+int
+zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
+    boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
+    uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
+{
+	int ret;
+	void *ptmp, *ctmp;
+
+	if (encrypt) {
+		ptmp = abd_borrow_buf_copy(pabd, datalen);
+		ctmp = abd_borrow_buf(cabd, datalen);
+	} else {
+		ptmp = abd_borrow_buf(pabd, datalen);
+		ctmp = abd_borrow_buf_copy(cabd, datalen);
+	}
+
+	ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
+	    datalen, ptmp, ctmp, no_crypt);
+	if (ret != 0)
+		goto error;
+
+	if (encrypt) {
+		abd_return_buf(pabd, ptmp, datalen);
+		abd_return_buf_copy(cabd, ctmp, datalen);
+	} else {
+		abd_return_buf_copy(pabd, ptmp, datalen);
+		abd_return_buf(cabd, ctmp, datalen);
+	}
+
+	return (0);
+
+error:
+	if (encrypt) {
+		abd_return_buf(pabd, ptmp, datalen);
+		abd_return_buf_copy(cabd, ctmp, datalen);
+	} else {
+		abd_return_buf_copy(pabd, ptmp, datalen);
+		abd_return_buf(cabd, ctmp, datalen);
+	}
+
+	return (ret);
+}
+
+#if defined(_KERNEL)
+/* BEGIN CSTYLED */
+module_param(zfs_key_max_salt_uses, ulong, 0644);
+MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
+	"can be used for generating encryption keys before it is rotated");
+/* END CSTYLED */
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
new file mode 100644
index 000000000000..fa4500f6f8d1
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
@@ -0,0 +1,551 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ *   Rohan Puri <rohan.puri15@gmail.com>
+ *   Brian Behlendorf <behlendorf1@llnl.gov>
+ */
+
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+/*
+ * Common open routine.  Disallow any write access.
+ */
+/* ARGSUSED */
+static int
+zpl_common_open(struct inode *ip, struct file *filp)
+{
+	if (filp->f_mode & FMODE_WRITE)
+		return (-EACCES);
+
+	return (generic_file_open(ip, filp));
+}
+
+/*
+ * Get root directory contents.
+ */
+static int
+zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+	int error = 0;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (!zpl_dir_emit_dots(filp, ctx))
+		goto out;
+
+	if (ctx->pos == 2) {
+		if (!zpl_dir_emit(ctx, ZFS_SNAPDIR_NAME,
+		    strlen(ZFS_SNAPDIR_NAME), ZFSCTL_INO_SNAPDIR, DT_DIR))
+			goto out;
+
+		ctx->pos++;
+	}
+
+	if (ctx->pos == 3) {
+		if (!zpl_dir_emit(ctx, ZFS_SHAREDIR_NAME,
+		    strlen(ZFS_SHAREDIR_NAME), ZFSCTL_INO_SHARES, DT_DIR))
+			goto out;
+
+		ctx->pos++;
+	}
+out:
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	zpl_dir_context_t ctx =
+	    ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+	int error;
+
+	error = zpl_root_iterate(filp, &ctx);
+	filp->f_pos = ctx.pos;
+
+	return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+/*
+ * Get root directory attributes.
+ */
+/* ARGSUSED */
+static int
+zpl_root_getattr_impl(const struct path *path, struct kstat *stat,
+    u32 request_mask, unsigned int query_flags)
+{
+	struct inode *ip = path->dentry->d_inode;
+
+	generic_fillattr(ip, stat);
+	stat->atime = current_time(ip);
+
+	return (0);
+}
+ZPL_GETATTR_WRAPPER(zpl_root_getattr);
+
+static struct dentry *
+zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags)
+{
+	cred_t *cr = CRED();
+	struct inode *ip;
+	int error;
+
+	crhold(cr);
+	error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL);
+	ASSERT3S(error, <=, 0);
+	crfree(cr);
+
+	if (error) {
+		if (error == -ENOENT)
+			return (d_splice_alias(NULL, dentry));
+		else
+			return (ERR_PTR(error));
+	}
+
+	return (d_splice_alias(ip, dentry));
+}
+
+/*
+ * The '.zfs' control directory file and inode operations.
+ */
+const struct file_operations zpl_fops_root = {
+	.open		= zpl_common_open,
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+	.iterate_shared	= zpl_root_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+	.iterate	= zpl_root_iterate,
+#else
+	.readdir	= zpl_root_readdir,
+#endif
+};
+
+const struct inode_operations zpl_ops_root = {
+	.lookup		= zpl_root_lookup,
+	.getattr	= zpl_root_getattr,
+};
+
+static struct vfsmount *
+zpl_snapdir_automount(struct path *path)
+{
+	int error;
+
+	error = -zfsctl_snapshot_mount(path, 0);
+	if (error)
+		return (ERR_PTR(error));
+
+	/*
+	 * Rather than returning the new vfsmount for the snapshot we must
+	 * return NULL to indicate a mount collision.  This is done because
+	 * the user space mount calls do_add_mount() which adds the vfsmount
+	 * to the name space.  If we returned the new mount here it would be
+	 * added again to the vfsmount list resulting in list corruption.
+	 */
+	return (NULL);
+}
+
+/*
+ * Negative dentries must always be revalidated so newly created snapshots
+ * can be detected and automounted.  Normal dentries should be kept because
+ * as of the 3.18 kernel revaliding the mountpoint dentry will result in
+ * the snapshot being immediately unmounted.
+ */
+static int
+#ifdef HAVE_D_REVALIDATE_NAMEIDATA
+zpl_snapdir_revalidate(struct dentry *dentry, struct nameidata *i)
+#else
+zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
+#endif
+{
+	return (!!dentry->d_inode);
+}
+
+dentry_operations_t zpl_dops_snapdirs = {
+/*
+ * Auto mounting of snapshots is only supported for 2.6.37 and
+ * newer kernels.  Prior to this kernel the ops->follow_link()
+ * callback was used as a hack to trigger the mount.  The
+ * resulting vfsmount was then explicitly grafted in to the
+ * name space.  While it might be possible to add compatibility
+ * code to accomplish this it would require considerable care.
+ */
+	.d_automount	= zpl_snapdir_automount,
+	.d_revalidate	= zpl_snapdir_revalidate,
+};
+
+static struct dentry *
+zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
+    unsigned int flags)
+{
+	fstrans_cookie_t cookie;
+	cred_t *cr = CRED();
+	struct inode *ip = NULL;
+	int error;
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	error = -zfsctl_snapdir_lookup(dip, dname(dentry), &ip,
+	    0, cr, NULL, NULL);
+	ASSERT3S(error, <=, 0);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+
+	if (error && error != -ENOENT)
+		return (ERR_PTR(error));
+
+	ASSERT(error == 0 || ip == NULL);
+	d_clear_d_op(dentry);
+	d_set_d_op(dentry, &zpl_dops_snapdirs);
+	dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+
+	return (d_splice_alias(ip, dentry));
+}
+
+static int
+zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+	fstrans_cookie_t cookie;
+	char snapname[MAXNAMELEN];
+	boolean_t case_conflict;
+	uint64_t id, pos;
+	int error = 0;
+
+	ZFS_ENTER(zfsvfs);
+	cookie = spl_fstrans_mark();
+
+	if (!zpl_dir_emit_dots(filp, ctx))
+		goto out;
+
+	pos = ctx->pos;
+	while (error == 0) {
+		dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN,
+		    snapname, &id, &pos, &case_conflict);
+		dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		if (error)
+			goto out;
+
+		if (!zpl_dir_emit(ctx, snapname, strlen(snapname),
+		    ZFSCTL_INO_SHARES - id, DT_DIR))
+			goto out;
+
+		ctx->pos = pos;
+	}
+out:
+	spl_fstrans_unmark(cookie);
+	ZFS_EXIT(zfsvfs);
+
+	if (error == -ENOENT)
+		return (0);
+
+	return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	zpl_dir_context_t ctx =
+	    ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+	int error;
+
+	error = zpl_snapdir_iterate(filp, &ctx);
+	filp->f_pos = ctx.pos;
+
+	return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+static int
+zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry,
+    struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+{
+	cred_t *cr = CRED();
+	int error;
+
+	/* We probably don't want to support renameat2(2) in ctldir */
+	if (flags)
+		return (-EINVAL);
+
+	crhold(cr);
+	error = -zfsctl_snapdir_rename(sdip, dname(sdentry),
+	    tdip, dname(tdentry), cr, 0);
+	ASSERT3S(error, <=, 0);
+	crfree(cr);
+
+	return (error);
+}
+
+#ifndef HAVE_RENAME_WANTS_FLAGS
+static int
+zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry,
+    struct inode *tdip, struct dentry *tdentry)
+{
+	return (zpl_snapdir_rename2(sdip, sdentry, tdip, tdentry, 0));
+}
+#endif
+
+static int
+zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry)
+{
+	cred_t *cr = CRED();
+	int error;
+
+	crhold(cr);
+	error = -zfsctl_snapdir_remove(dip, dname(dentry), cr, 0);
+	ASSERT3S(error, <=, 0);
+	crfree(cr);
+
+	return (error);
+}
+
+static int
+zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
+{
+	cred_t *cr = CRED();
+	vattr_t *vap;
+	struct inode *ip;
+	int error;
+
+	crhold(cr);
+	vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+	zpl_vap_init(vap, dip, mode | S_IFDIR, cr);
+
+	error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0);
+	if (error == 0) {
+		d_clear_d_op(dentry);
+		d_set_d_op(dentry, &zpl_dops_snapdirs);
+		d_instantiate(dentry, ip);
+	}
+
+	kmem_free(vap, sizeof (vattr_t));
+	ASSERT3S(error, <=, 0);
+	crfree(cr);
+
+	return (error);
+}
+
+/*
+ * Get snapshot directory attributes.
+ */
+/* ARGSUSED */
+static int
+zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat,
+    u32 request_mask, unsigned int query_flags)
+{
+	struct inode *ip = path->dentry->d_inode;
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+	ZFS_ENTER(zfsvfs);
+	generic_fillattr(ip, stat);
+
+	stat->nlink = stat->size = 2;
+	stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+	stat->atime = current_time(ip);
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+ZPL_GETATTR_WRAPPER(zpl_snapdir_getattr);
+
+/*
+ * The '.zfs/snapshot' directory file operations.  These mainly control
+ * generating the list of available snapshots when doing an 'ls' in the
+ * directory.  See zpl_snapdir_readdir().
+ */
+const struct file_operations zpl_fops_snapdir = {
+	.open		= zpl_common_open,
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+	.iterate_shared	= zpl_snapdir_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+	.iterate	= zpl_snapdir_iterate,
+#else
+	.readdir	= zpl_snapdir_readdir,
+#endif
+
+};
+
+/*
+ * The '.zfs/snapshot' directory inode operations.  These mainly control
+ * creating an inode for a snapshot directory and initializing the needed
+ * infrastructure to automount the snapshot.  See zpl_snapdir_lookup().
+ */
+const struct inode_operations zpl_ops_snapdir = {
+	.lookup		= zpl_snapdir_lookup,
+	.getattr	= zpl_snapdir_getattr,
+#ifdef HAVE_RENAME_WANTS_FLAGS
+	.rename		= zpl_snapdir_rename2,
+#else
+	.rename		= zpl_snapdir_rename,
+#endif
+	.rmdir		= zpl_snapdir_rmdir,
+	.mkdir		= zpl_snapdir_mkdir,
+};
+
+static struct dentry *
+zpl_shares_lookup(struct inode *dip, struct dentry *dentry,
+    unsigned int flags)
+{
+	fstrans_cookie_t cookie;
+	cred_t *cr = CRED();
+	struct inode *ip = NULL;
+	int error;
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	error = -zfsctl_shares_lookup(dip, dname(dentry), &ip,
+	    0, cr, NULL, NULL);
+	ASSERT3S(error, <=, 0);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+
+	if (error) {
+		if (error == -ENOENT)
+			return (d_splice_alias(NULL, dentry));
+		else
+			return (ERR_PTR(error));
+	}
+
+	return (d_splice_alias(ip, dentry));
+}
+
+static int
+zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+	fstrans_cookie_t cookie;
+	cred_t *cr = CRED();
+	zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+	znode_t *dzp;
+	int error = 0;
+
+	ZFS_ENTER(zfsvfs);
+	cookie = spl_fstrans_mark();
+
+	if (zfsvfs->z_shares_dir == 0) {
+		zpl_dir_emit_dots(filp, ctx);
+		goto out;
+	}
+
+	error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp);
+	if (error)
+		goto out;
+
+	crhold(cr);
+	error = -zfs_readdir(ZTOI(dzp), ctx, cr);
+	crfree(cr);
+
+	iput(ZTOI(dzp));
+out:
+	spl_fstrans_unmark(cookie);
+	ZFS_EXIT(zfsvfs);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	zpl_dir_context_t ctx =
+	    ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+	int error;
+
+	error = zpl_shares_iterate(filp, &ctx);
+	filp->f_pos = ctx.pos;
+
+	return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+/* ARGSUSED */
+static int
+zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
+    u32 request_mask, unsigned int query_flags)
+{
+	struct inode *ip = path->dentry->d_inode;
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	znode_t *dzp;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (zfsvfs->z_shares_dir == 0) {
+		generic_fillattr(path->dentry->d_inode, stat);
+		stat->nlink = stat->size = 2;
+		stat->atime = current_time(ip);
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp);
+	if (error == 0) {
+		error = -zfs_getattr_fast(ZTOI(dzp), stat);
+		iput(ZTOI(dzp));
+	}
+
+	ZFS_EXIT(zfsvfs);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+ZPL_GETATTR_WRAPPER(zpl_shares_getattr);
+
+/*
+ * The '.zfs/shares' directory file operations.
+ */
+const struct file_operations zpl_fops_shares = {
+	.open		= zpl_common_open,
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+	.iterate_shared	= zpl_shares_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+	.iterate	= zpl_shares_iterate,
+#else
+	.readdir	= zpl_shares_readdir,
+#endif
+
+};
+
+/*
+ * The '.zfs/shares' directory inode operations.
+ */
+const struct inode_operations zpl_ops_shares = {
+	.lookup		= zpl_shares_lookup,
+	.getattr	= zpl_shares_getattr,
+};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c
new file mode 100644
index 000000000000..eaf048c38db1
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c
@@ -0,0 +1,154 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Gunnar Beutner
+ * Copyright (c) 2012 Cyril Plisko. All rights reserved.
+ */
+
+
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+
+static int
+#ifdef HAVE_ENCODE_FH_WITH_INODE
+zpl_encode_fh(struct inode *ip, __u32 *fh, int *max_len, struct inode *parent)
+{
+#else
+zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable)
+{
+	/* CSTYLED */
+	struct inode *ip = dentry->d_inode;
+#endif /* HAVE_ENCODE_FH_WITH_INODE */
+	fstrans_cookie_t cookie;
+	fid_t *fid = (fid_t *)fh;
+	int len_bytes, rc;
+
+	len_bytes = *max_len * sizeof (__u32);
+
+	if (len_bytes < offsetof(fid_t, fid_data))
+		return (255);
+
+	fid->fid_len = len_bytes - offsetof(fid_t, fid_data);
+	cookie = spl_fstrans_mark();
+
+	if (zfsctl_is_node(ip))
+		rc = zfsctl_fid(ip, fid);
+	else
+		rc = zfs_fid(ip, fid);
+
+	spl_fstrans_unmark(cookie);
+	len_bytes = offsetof(fid_t, fid_data) + fid->fid_len;
+	*max_len = roundup(len_bytes, sizeof (__u32)) / sizeof (__u32);
+
+	return (rc == 0 ? FILEID_INO32_GEN : 255);
+}
+
+static struct dentry *
+zpl_fh_to_dentry(struct super_block *sb, struct fid *fh,
+    int fh_len, int fh_type)
+{
+	fid_t *fid = (fid_t *)fh;
+	fstrans_cookie_t cookie;
+	struct inode *ip;
+	int len_bytes, rc;
+
+	len_bytes = fh_len * sizeof (__u32);
+
+	if (fh_type != FILEID_INO32_GEN ||
+	    len_bytes < offsetof(fid_t, fid_data) ||
+	    len_bytes < offsetof(fid_t, fid_data) + fid->fid_len)
+		return (ERR_PTR(-EINVAL));
+
+	cookie = spl_fstrans_mark();
+	rc = zfs_vget(sb, &ip, fid);
+	spl_fstrans_unmark(cookie);
+
+	if (rc) {
+		/*
+		 * If we see ENOENT it might mean that an NFSv4 * client
+		 * is using a cached inode value in a file handle and
+		 * that the sought after file has had its inode changed
+		 * by a third party.  So change the error to ESTALE
+		 * which will trigger a full lookup by the client and
+		 * will find the new filename/inode pair if it still
+		 * exists.
+		 */
+		if (rc == ENOENT)
+			rc = ESTALE;
+
+		return (ERR_PTR(-rc));
+	}
+
+	ASSERT((ip != NULL) && !IS_ERR(ip));
+
+	return (d_obtain_alias(ip));
+}
+
+static struct dentry *
+zpl_get_parent(struct dentry *child)
+{
+	cred_t *cr = CRED();
+	fstrans_cookie_t cookie;
+	znode_t *zp;
+	int error;
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	error = -zfs_lookup(ITOZ(child->d_inode), "..", &zp, 0, cr, NULL, NULL);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	if (error)
+		return (ERR_PTR(error));
+
+	return (d_obtain_alias(ZTOI(zp)));
+}
+
+static int
+zpl_commit_metadata(struct inode *inode)
+{
+	cred_t *cr = CRED();
+	fstrans_cookie_t cookie;
+	int error;
+
+	if (zfsctl_is_node(inode))
+		return (0);
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	error = -zfs_fsync(ITOZ(inode), 0, cr);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+const struct export_operations zpl_export_operations = {
+	.encode_fh		= zpl_encode_fh,
+	.fh_to_dentry		= zpl_fh_to_dentry,
+	.get_parent		= zpl_get_parent,
+	.commit_metadata	= zpl_commit_metadata,
+};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
new file mode 100644
index 000000000000..51e189a87272
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
@@ -0,0 +1,1079 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+#endif
+#include <sys/file.h>
+#include <sys/dmu_objset.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_project.h>
+
+/*
+ * When using fallocate(2) to preallocate space, inflate the requested
+ * capacity check by 10% to account for the required metadata blocks.
+ */
+unsigned int zfs_fallocate_reserve_percent = 110;
+
+static int
+zpl_open(struct inode *ip, struct file *filp)
+{
+	cred_t *cr = CRED();
+	int error;
+	fstrans_cookie_t cookie;
+
+	error = generic_file_open(ip, filp);
+	if (error)
+		return (error);
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+static int
+zpl_release(struct inode *ip, struct file *filp)
+{
+	cred_t *cr = CRED();
+	int error;
+	fstrans_cookie_t cookie;
+
+	cookie = spl_fstrans_mark();
+	if (ITOZ(ip)->z_atime_dirty)
+		zfs_mark_inode_dirty(ip);
+
+	crhold(cr);
+	error = -zfs_close(ip, filp->f_flags, cr);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+static int
+zpl_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+	cred_t *cr = CRED();
+	int error;
+	fstrans_cookie_t cookie;
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	error = -zfs_readdir(file_inode(filp), ctx, cr);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	zpl_dir_context_t ctx =
+	    ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+	int error;
+
+	error = zpl_iterate(filp, &ctx);
+	filp->f_pos = ctx.pos;
+
+	return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+#if defined(HAVE_FSYNC_WITHOUT_DENTRY)
+/*
+ * Linux 2.6.35 - 3.0 API,
+ * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed
+ * redundant.  The dentry is still accessible via filp->f_path.dentry,
+ * and we are guaranteed that filp will never be NULL.
+ */
+static int
+zpl_fsync(struct file *filp, int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	cred_t *cr = CRED();
+	int error;
+	fstrans_cookie_t cookie;
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	error = -zfs_fsync(ITOZ(inode), datasync, cr);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+#ifdef HAVE_FILE_AIO_FSYNC
+static int
+zpl_aio_fsync(struct kiocb *kiocb, int datasync)
+{
+	return (zpl_fsync(kiocb->ki_filp, datasync));
+}
+#endif
+
+#elif defined(HAVE_FSYNC_RANGE)
+/*
+ * Linux 3.1 - 3.x API,
+ * As of 3.1 the responsibility to call filemap_write_and_wait_range() has
+ * been pushed down in to the .fsync() vfs hook.  Additionally, the i_mutex
+ * lock is no longer held by the caller, for zfs we don't require the lock
+ * to be held so we don't acquire it.
+ */
+static int
+zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	cred_t *cr = CRED();
+	int error;
+	fstrans_cookie_t cookie;
+
+	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (error)
+		return (error);
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	error = -zfs_fsync(ITOZ(inode), datasync, cr);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+#ifdef HAVE_FILE_AIO_FSYNC
+static int
+zpl_aio_fsync(struct kiocb *kiocb, int datasync)
+{
+	return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync));
+}
+#endif
+
+#else
+#error "Unsupported fops->fsync() implementation"
+#endif
+
+static inline int
+zfs_io_flags(struct kiocb *kiocb)
+{
+	int flags = 0;
+
+#if defined(IOCB_DSYNC)
+	if (kiocb->ki_flags & IOCB_DSYNC)
+		flags |= O_DSYNC;
+#endif
+#if defined(IOCB_SYNC)
+	if (kiocb->ki_flags & IOCB_SYNC)
+		flags |= O_SYNC;
+#endif
+#if defined(IOCB_APPEND)
+	if (kiocb->ki_flags & IOCB_APPEND)
+		flags |= O_APPEND;
+#endif
+#if defined(IOCB_DIRECT)
+	if (kiocb->ki_flags & IOCB_DIRECT)
+		flags |= O_DIRECT;
+#endif
+	return (flags);
+}
+
+static ssize_t
+zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
+    unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
+    cred_t *cr, size_t skip)
+{
+	ssize_t read;
+	uio_t uio = { { 0 }, 0 };
+	int error;
+	fstrans_cookie_t cookie;
+
+	uio.uio_iov = iovp;
+	uio.uio_iovcnt = nr_segs;
+	uio.uio_loffset = *ppos;
+	uio.uio_segflg = segment;
+	uio.uio_limit = MAXOFFSET_T;
+	uio.uio_resid = count;
+	uio.uio_skip = skip;
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_read(ip, &uio, flags, cr);
+	spl_fstrans_unmark(cookie);
+	if (error < 0)
+		return (error);
+
+	read = count - uio.uio_resid;
+	*ppos += read;
+
+	return (read);
+}
+
+inline ssize_t
+zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
+    uio_seg_t segment, int flags, cred_t *cr)
+{
+	struct iovec iov;
+
+	iov.iov_base = (void *)buf;
+	iov.iov_len = len;
+
+	return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment,
+	    flags, cr, 0));
+}
+
+static ssize_t
+zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp,
+    unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
+{
+	cred_t *cr = CRED();
+	struct file *filp = kiocb->ki_filp;
+	struct inode *ip = filp->f_mapping->host;
+	zfsvfs_t *zfsvfs = ZTOZSB(ITOZ(ip));
+	ssize_t read;
+	unsigned int f_flags = filp->f_flags;
+
+	f_flags |= zfs_io_flags(kiocb);
+	crhold(cr);
+	read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count,
+	    nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip);
+	crfree(cr);
+
+	/*
+	 * If relatime is enabled, call file_accessed() only if
+	 * zfs_relatime_need_update() is true.  This is needed since datasets
+	 * with inherited "relatime" property aren't necessarily mounted with
+	 * MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is what
+	 * relatime test in VFS by relatime_need_update() is based on.
+	 */
+	if (!IS_NOATIME(ip) && zfsvfs->z_relatime) {
+		if (zfs_relatime_need_update(ip))
+			file_accessed(filp);
+	} else {
+		file_accessed(filp);
+	}
+
+	return (read);
+}
+
+#if defined(HAVE_VFS_RW_ITERATE)
+static ssize_t
+zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
+{
+	ssize_t ret;
+	uio_seg_t seg = UIO_USERSPACE;
+	if (to->type & ITER_KVEC)
+		seg = UIO_SYSSPACE;
+	if (to->type & ITER_BVEC)
+		seg = UIO_BVEC;
+	ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs,
+	    iov_iter_count(to), seg, to->iov_offset);
+	if (ret > 0)
+		iov_iter_advance(to, ret);
+	return (ret);
+}
+#else
+static ssize_t
+zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp,
+    unsigned long nr_segs, loff_t pos)
+{
+	ssize_t ret;
+	size_t count;
+
+	ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_WRITE);
+	if (ret)
+		return (ret);
+
+	return (zpl_iter_read_common(kiocb, iovp, nr_segs, count,
+	    UIO_USERSPACE, 0));
+}
+#endif /* HAVE_VFS_RW_ITERATE */
+
+static ssize_t
+zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
+    unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
+    cred_t *cr, size_t skip)
+{
+	ssize_t wrote;
+	uio_t uio = { { 0 }, 0 };
+	int error;
+	fstrans_cookie_t cookie;
+
+	if (flags & O_APPEND)
+		*ppos = i_size_read(ip);
+
+	uio.uio_iov = iovp;
+	uio.uio_iovcnt = nr_segs;
+	uio.uio_loffset = *ppos;
+	uio.uio_segflg = segment;
+	uio.uio_limit = MAXOFFSET_T;
+	uio.uio_resid = count;
+	uio.uio_skip = skip;
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_write(ip, &uio, flags, cr);
+	spl_fstrans_unmark(cookie);
+	if (error < 0)
+		return (error);
+
+	wrote = count - uio.uio_resid;
+	*ppos += wrote;
+
+	return (wrote);
+}
+
+inline ssize_t
+zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
+    uio_seg_t segment, int flags, cred_t *cr)
+{
+	struct iovec iov;
+
+	iov.iov_base = (void *)buf;
+	iov.iov_len = len;
+
+	return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment,
+	    flags, cr, 0));
+}
+
+static ssize_t
+zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp,
+    unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
+{
+	cred_t *cr = CRED();
+	struct file *filp = kiocb->ki_filp;
+	ssize_t wrote;
+	unsigned int f_flags = filp->f_flags;
+
+	f_flags |= zfs_io_flags(kiocb);
+	crhold(cr);
+	wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count,
+	    nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip);
+	crfree(cr);
+
+	return (wrote);
+}
+
+#if defined(HAVE_VFS_RW_ITERATE)
+static ssize_t
+zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
+{
+	size_t count;
+	ssize_t ret;
+	uio_seg_t seg = UIO_USERSPACE;
+
+#ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB
+	struct file *file = kiocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *ip = mapping->host;
+	int isblk = S_ISBLK(ip->i_mode);
+
+	count = iov_iter_count(from);
+	ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk);
+	if (ret)
+		return (ret);
+#else
+	/*
+	 * XXX - ideally this check should be in the same lock region with
+	 * write operations, so that there's no TOCTTOU race when doing
+	 * append and someone else grow the file.
+	 */
+	ret = generic_write_checks(kiocb, from);
+	if (ret <= 0)
+		return (ret);
+	count = ret;
+#endif
+
+	if (from->type & ITER_KVEC)
+		seg = UIO_SYSSPACE;
+	if (from->type & ITER_BVEC)
+		seg = UIO_BVEC;
+
+	ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs,
+	    count, seg, from->iov_offset);
+	if (ret > 0)
+		iov_iter_advance(from, ret);
+
+	return (ret);
+}
+#else
+static ssize_t
+zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp,
+    unsigned long nr_segs, loff_t pos)
+{
+	struct file *file = kiocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *ip = mapping->host;
+	int isblk = S_ISBLK(ip->i_mode);
+	size_t count;
+	ssize_t ret;
+
+	ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_READ);
+	if (ret)
+		return (ret);
+
+	ret = generic_write_checks(file, &pos, &count, isblk);
+	if (ret)
+		return (ret);
+
+	return (zpl_iter_write_common(kiocb, iovp, nr_segs, count,
+	    UIO_USERSPACE, 0));
+}
+#endif /* HAVE_VFS_RW_ITERATE */
+
+#if defined(HAVE_VFS_RW_ITERATE)
+static ssize_t
+zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter)
+{
+	if (rw == WRITE)
+		return (zpl_iter_write(kiocb, iter));
+	else
+		return (zpl_iter_read(kiocb, iter));
+}
+#if defined(HAVE_VFS_DIRECT_IO_ITER)
+static ssize_t
+zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
+{
+	return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
+}
+#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
+static ssize_t
+zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
+{
+	ASSERT3S(pos, ==, kiocb->ki_pos);
+	return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
+}
+#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
+static ssize_t
+zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
+{
+	ASSERT3S(pos, ==, kiocb->ki_pos);
+	return (zpl_direct_IO_impl(rw, kiocb, iter));
+}
+#else
+#error "Unknown direct IO interface"
+#endif
+
+#else
+
+#if defined(HAVE_VFS_DIRECT_IO_IOVEC)
+static ssize_t
+zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iovp,
+    loff_t pos, unsigned long nr_segs)
+{
+	if (rw == WRITE)
+		return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
+	else
+		return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
+}
+#else
+#error "Unknown direct IO interface"
+#endif
+
+#endif /* HAVE_VFS_RW_ITERATE */
+
+static loff_t
+zpl_llseek(struct file *filp, loff_t offset, int whence)
+{
+#if defined(SEEK_HOLE) && defined(SEEK_DATA)
+	fstrans_cookie_t cookie;
+
+	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
+		struct inode *ip = filp->f_mapping->host;
+		loff_t maxbytes = ip->i_sb->s_maxbytes;
+		loff_t error;
+
+		spl_inode_lock_shared(ip);
+		cookie = spl_fstrans_mark();
+		error = -zfs_holey(ip, whence, &offset);
+		spl_fstrans_unmark(cookie);
+		if (error == 0)
+			error = lseek_execute(filp, ip, offset, maxbytes);
+		spl_inode_unlock_shared(ip);
+
+		return (error);
+	}
+#endif /* SEEK_HOLE && SEEK_DATA */
+
+	return (generic_file_llseek(filp, offset, whence));
+}
+
+/*
+ * It's worth taking a moment to describe how mmap is implemented
+ * for zfs because it differs considerably from other Linux filesystems.
+ * However, this issue is handled the same way under OpenSolaris.
+ *
+ * The issue is that by design zfs bypasses the Linux page cache and
+ * leaves all caching up to the ARC.  This has been shown to work
+ * well for the common read(2)/write(2) case.  However, mmap(2)
+ * is problem because it relies on being tightly integrated with the
+ * page cache.  To handle this we cache mmap'ed files twice, once in
+ * the ARC and a second time in the page cache.  The code is careful
+ * to keep both copies synchronized.
+ *
+ * When a file with an mmap'ed region is written to using write(2)
+ * both the data in the ARC and existing pages in the page cache
+ * are updated.  For a read(2) data will be read first from the page
+ * cache then the ARC if needed.  Neither a write(2) or read(2) will
+ * will ever result in new pages being added to the page cache.
+ *
+ * New pages are added to the page cache only via .readpage() which
+ * is called when the vfs needs to read a page off disk to back the
+ * virtual memory region.  These pages may be modified without
+ * notifying the ARC and will be written out periodically via
+ * .writepage().  This will occur due to either a sync or the usual
+ * page aging behavior.  Note because a read(2) of a mmap'ed file
+ * will always check the page cache first even when the ARC is out
+ * of date correct data will still be returned.
+ *
+ * While this implementation ensures correct behavior it does have
+ * have some drawbacks.  The most obvious of which is that it
+ * increases the required memory footprint when access mmap'ed
+ * files.  It also adds additional complexity to the code keeping
+ * both caches synchronized.
+ *
+ * Longer term it may be possible to cleanly resolve this wart by
+ * mapping page cache pages directly on to the ARC buffers.  The
+ * Linux address space operations are flexible enough to allow
+ * selection of which pages back a particular index.  The trick
+ * would be working out the details of which subsystem is in
+ * charge, the ARC, the page cache, or both.  It may also prove
+ * helpful to move the ARC buffers to a scatter-gather lists
+ * rather than a vmalloc'ed region.
+ */
+static int
+zpl_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct inode *ip = filp->f_mapping->host;
+	znode_t *zp = ITOZ(ip);
+	int error;
+	fstrans_cookie_t cookie;
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
+	    (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
+	spl_fstrans_unmark(cookie);
+	if (error)
+		return (error);
+
+	error = generic_file_mmap(filp, vma);
+	if (error)
+		return (error);
+
+	mutex_enter(&zp->z_lock);
+	zp->z_is_mapped = B_TRUE;
+	mutex_exit(&zp->z_lock);
+
+	return (error);
+}
+
+/*
+ * Populate a page with data for the Linux page cache.  This function is
+ * only used to support mmap(2).  There will be an identical copy of the
+ * data in the ARC which is kept up to date via .write() and .writepage().
+ *
+ * Current this function relies on zpl_read_common() and the O_DIRECT
+ * flag to read in a page.  This works but the more correct way is to
+ * update zfs_fillpage() to be Linux friendly and use that interface.
+ */
+static int
+zpl_readpage(struct file *filp, struct page *pp)
+{
+	struct inode *ip;
+	struct page *pl[1];
+	int error = 0;
+	fstrans_cookie_t cookie;
+
+	ASSERT(PageLocked(pp));
+	ip = pp->mapping->host;
+	pl[0] = pp;
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_getpage(ip, pl, 1);
+	spl_fstrans_unmark(cookie);
+
+	if (error) {
+		SetPageError(pp);
+		ClearPageUptodate(pp);
+	} else {
+		ClearPageError(pp);
+		SetPageUptodate(pp);
+		flush_dcache_page(pp);
+	}
+
+	unlock_page(pp);
+	return (error);
+}
+
+/*
+ * Populate a set of pages with data for the Linux page cache.  This
+ * function will only be called for read ahead and never for demand
+ * paging.  For simplicity, the code relies on read_cache_pages() to
+ * correctly lock each page for IO and call zpl_readpage().
+ */
+static int
+zpl_readpages(struct file *filp, struct address_space *mapping,
+    struct list_head *pages, unsigned nr_pages)
+{
+	return (read_cache_pages(mapping, pages,
+	    (filler_t *)zpl_readpage, filp));
+}
+
+static int
+zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
+{
+	struct address_space *mapping = data;
+	fstrans_cookie_t cookie;
+
+	ASSERT(PageLocked(pp));
+	ASSERT(!PageWriteback(pp));
+
+	cookie = spl_fstrans_mark();
+	(void) zfs_putpage(mapping->host, pp, wbc);
+	spl_fstrans_unmark(cookie);
+
+	return (0);
+}
+
+static int
+zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	znode_t		*zp = ITOZ(mapping->host);
+	zfsvfs_t	*zfsvfs = ITOZSB(mapping->host);
+	enum writeback_sync_modes sync_mode;
+	int result;
+
+	ZFS_ENTER(zfsvfs);
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		wbc->sync_mode = WB_SYNC_ALL;
+	ZFS_EXIT(zfsvfs);
+	sync_mode = wbc->sync_mode;
+
+	/*
+	 * We don't want to run write_cache_pages() in SYNC mode here, because
+	 * that would make putpage() wait for a single page to be committed to
+	 * disk every single time, resulting in atrocious performance. Instead
+	 * we run it once in non-SYNC mode so that the ZIL gets all the data,
+	 * and then we commit it all in one go.
+	 */
+	wbc->sync_mode = WB_SYNC_NONE;
+	result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+	if (sync_mode != wbc->sync_mode) {
+		ZFS_ENTER(zfsvfs);
+		ZFS_VERIFY_ZP(zp);
+		if (zfsvfs->z_log != NULL)
+			zil_commit(zfsvfs->z_log, zp->z_id);
+		ZFS_EXIT(zfsvfs);
+
+		/*
+		 * We need to call write_cache_pages() again (we can't just
+		 * return after the commit) because the previous call in
+		 * non-SYNC mode does not guarantee that we got all the dirty
+		 * pages (see the implementation of write_cache_pages() for
+		 * details). That being said, this is a no-op in most cases.
+		 */
+		wbc->sync_mode = sync_mode;
+		result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+	}
+	return (result);
+}
+
+/*
+ * Write out dirty pages to the ARC, this function is only required to
+ * support mmap(2).  Mapped pages may be dirtied by memory operations
+ * which never call .write().  These dirty pages are kept in sync with
+ * the ARC buffers via this hook.
+ */
+static int
+zpl_writepage(struct page *pp, struct writeback_control *wbc)
+{
+	if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		wbc->sync_mode = WB_SYNC_ALL;
+
+	return (zpl_putpage(pp, wbc, pp->mapping));
+}
+
+/*
+ * The flag combination which matches the behavior of zfs_space() is
+ * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE.  The FALLOC_FL_PUNCH_HOLE
+ * flag was introduced in the 2.6.38 kernel.
+ *
+ * The original mode=0 (allocate space) behavior can be reasonably emulated
+ * by checking if enough space exists and creating a sparse file, as real
+ * persistent space reservation is not possible due to COW, snapshots, etc.
+ */
+static long
+zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
+{
+	cred_t *cr = CRED();
+	loff_t olen;
+	fstrans_cookie_t cookie;
+	int error = 0;
+
+	if ((mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) != 0)
+		return (-EOPNOTSUPP);
+
+	if (offset < 0 || len <= 0)
+		return (-EINVAL);
+
+	spl_inode_lock(ip);
+	olen = i_size_read(ip);
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		flock64_t bf;
+
+		if (offset > olen)
+			goto out_unmark;
+
+		if (offset + len > olen)
+			len = olen - offset;
+		bf.l_type = F_WRLCK;
+		bf.l_whence = SEEK_SET;
+		bf.l_start = offset;
+		bf.l_len = len;
+		bf.l_pid = 0;
+
+		error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
+	} else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
+		unsigned int percent = zfs_fallocate_reserve_percent;
+		struct kstatfs statfs;
+
+		/* Legacy mode, disable fallocate compatibility. */
+		if (percent == 0) {
+			error = -EOPNOTSUPP;
+			goto out_unmark;
+		}
+
+		/*
+		 * Use zfs_statvfs() instead of dmu_objset_space() since it
+		 * also checks project quota limits, which are relevant here.
+		 */
+		error = zfs_statvfs(ip, &statfs);
+		if (error)
+			goto out_unmark;
+
+		/*
+		 * Shrink available space a bit to account for overhead/races.
+		 * We know the product previously fit into availbytes from
+		 * dmu_objset_space(), so the smaller product will also fit.
+		 */
+		if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
+			error = -ENOSPC;
+			goto out_unmark;
+		}
+		if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
+			error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
+	}
+out_unmark:
+	spl_fstrans_unmark(cookie);
+	spl_inode_unlock(ip);
+
+	crfree(cr);
+
+	return (error);
+}
+
+static long
+zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
+{
+	return zpl_fallocate_common(file_inode(filp),
+	    mode, offset, len);
+}
+
+#define	ZFS_FL_USER_VISIBLE	(FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
+#define	ZFS_FL_USER_MODIFIABLE	(FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)
+
+static uint32_t
+__zpl_ioctl_getflags(struct inode *ip)
+{
+	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
+	uint32_t ioctl_flags = 0;
+
+	if (zfs_flags & ZFS_IMMUTABLE)
+		ioctl_flags |= FS_IMMUTABLE_FL;
+
+	if (zfs_flags & ZFS_APPENDONLY)
+		ioctl_flags |= FS_APPEND_FL;
+
+	if (zfs_flags & ZFS_NODUMP)
+		ioctl_flags |= FS_NODUMP_FL;
+
+	if (zfs_flags & ZFS_PROJINHERIT)
+		ioctl_flags |= ZFS_PROJINHERIT_FL;
+
+	return (ioctl_flags & ZFS_FL_USER_VISIBLE);
+}
+
+/*
+ * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
+ * attributes common to both Linux and Solaris are mapped.
+ */
+static int
+zpl_ioctl_getflags(struct file *filp, void __user *arg)
+{
+	uint32_t flags;
+	int err;
+
+	flags = __zpl_ioctl_getflags(file_inode(filp));
+	err = copy_to_user(arg, &flags, sizeof (flags));
+
+	return (err);
+}
+
+/*
+ * fchange() is a helper macro to detect if we have been asked to change a
+ * flag. This is ugly, but the requirement that we do this is a consequence of
+ * how the Linux file attribute interface was designed. Another consequence is
+ * that concurrent modification of files suffers from a TOCTOU race. Neither
+ * are things we can fix without modifying the kernel-userland interface, which
+ * is outside of our jurisdiction.
+ */
+
+#define	fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))
+
+static int
+__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
+{
+	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
+	xoptattr_t *xoap;
+
+	if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
+	    ZFS_PROJINHERIT_FL))
+		return (-EOPNOTSUPP);
+
+	if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
+		return (-EACCES);
+
+	if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
+	    fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		return (-EACCES);
+
+	if (!inode_owner_or_capable(ip))
+		return (-EACCES);
+
+	xva_init(xva);
+	xoap = xva_getxoptattr(xva);
+
+	XVA_SET_REQ(xva, XAT_IMMUTABLE);
+	if (ioctl_flags & FS_IMMUTABLE_FL)
+		xoap->xoa_immutable = B_TRUE;
+
+	XVA_SET_REQ(xva, XAT_APPENDONLY);
+	if (ioctl_flags & FS_APPEND_FL)
+		xoap->xoa_appendonly = B_TRUE;
+
+	XVA_SET_REQ(xva, XAT_NODUMP);
+	if (ioctl_flags & FS_NODUMP_FL)
+		xoap->xoa_nodump = B_TRUE;
+
+	XVA_SET_REQ(xva, XAT_PROJINHERIT);
+	if (ioctl_flags & ZFS_PROJINHERIT_FL)
+		xoap->xoa_projinherit = B_TRUE;
+
+	return (0);
+}
+
+static int
+zpl_ioctl_setflags(struct file *filp, void __user *arg)
+{
+	struct inode *ip = file_inode(filp);
+	uint32_t flags;
+	cred_t *cr = CRED();
+	xvattr_t xva;
+	int err;
+	fstrans_cookie_t cookie;
+
+	if (copy_from_user(&flags, arg, sizeof (flags)))
+		return (-EFAULT);
+
+	err = __zpl_ioctl_setflags(ip, flags, &xva);
+	if (err)
+		return (err);
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+
+	return (err);
+}
+
+static int
+zpl_ioctl_getxattr(struct file *filp, void __user *arg)
+{
+	zfsxattr_t fsx = { 0 };
+	struct inode *ip = file_inode(filp);
+	int err;
+
+	fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
+	fsx.fsx_projid = ITOZ(ip)->z_projid;
+	err = copy_to_user(arg, &fsx, sizeof (fsx));
+
+	return (err);
+}
+
+static int
+zpl_ioctl_setxattr(struct file *filp, void __user *arg)
+{
+	struct inode *ip = file_inode(filp);
+	zfsxattr_t fsx;
+	cred_t *cr = CRED();
+	xvattr_t xva;
+	xoptattr_t *xoap;
+	int err;
+	fstrans_cookie_t cookie;
+
+	if (copy_from_user(&fsx, arg, sizeof (fsx)))
+		return (-EFAULT);
+
+	if (!zpl_is_valid_projid(fsx.fsx_projid))
+		return (-EINVAL);
+
+	err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
+	if (err)
+		return (err);
+
+	xoap = xva_getxoptattr(&xva);
+	XVA_SET_REQ(&xva, XAT_PROJID);
+	xoap->xoa_projid = fsx.fsx_projid;
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+
+	return (err);
+}
+
+static long
+zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		return (zpl_ioctl_getflags(filp, (void *)arg));
+	case FS_IOC_SETFLAGS:
+		return (zpl_ioctl_setflags(filp, (void *)arg));
+	case ZFS_IOC_FSGETXATTR:
+		return (zpl_ioctl_getxattr(filp, (void *)arg));
+	case ZFS_IOC_FSSETXATTR:
+		return (zpl_ioctl_setxattr(filp, (void *)arg));
+	default:
+		return (-ENOTTY);
+	}
+}
+
+#ifdef CONFIG_COMPAT
+static long
+zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case FS_IOC32_GETFLAGS:
+		cmd = FS_IOC_GETFLAGS;
+		break;
+	case FS_IOC32_SETFLAGS:
+		cmd = FS_IOC_SETFLAGS;
+		break;
+	default:
+		return (-ENOTTY);
+	}
+	return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
+}
+#endif /* CONFIG_COMPAT */
+
+
+const struct address_space_operations zpl_address_space_operations = {
+	.readpages	= zpl_readpages,
+	.readpage	= zpl_readpage,
+	.writepage	= zpl_writepage,
+	.writepages	= zpl_writepages,
+	.direct_IO	= zpl_direct_IO,
+};
+
+const struct file_operations zpl_file_operations = {
+	.open		= zpl_open,
+	.release	= zpl_release,
+	.llseek		= zpl_llseek,
+#ifdef HAVE_VFS_RW_ITERATE
+#ifdef HAVE_NEW_SYNC_READ
+	.read		= new_sync_read,
+	.write		= new_sync_write,
+#endif
+	.read_iter	= zpl_iter_read,
+	.write_iter	= zpl_iter_write,
+#else
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.aio_read	= zpl_aio_read,
+	.aio_write	= zpl_aio_write,
+#endif
+	.mmap		= zpl_mmap,
+	.fsync		= zpl_fsync,
+#ifdef HAVE_FILE_AIO_FSYNC
+	.aio_fsync	= zpl_aio_fsync,
+#endif
+	.fallocate	= zpl_fallocate,
+	.unlocked_ioctl	= zpl_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= zpl_compat_ioctl,
+#endif
+};
+
+const struct file_operations zpl_dir_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+#if defined(HAVE_VFS_ITERATE_SHARED)
+	.iterate_shared	= zpl_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+	.iterate	= zpl_iterate,
+#else
+	.readdir	= zpl_readdir,
+#endif
+	.fsync		= zpl_fsync,
+	.unlocked_ioctl = zpl_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = zpl_compat_ioctl,
+#endif
+};
+
+/* BEGIN CSTYLED */
+module_param(zfs_fallocate_reserve_percent, uint, 0644);
+MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
+    "Percentage of length to use for the available capacity check");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
new file mode 100644
index 000000000000..f3b97a22074c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
@@ -0,0 +1,747 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
+#include <sys/dmu_objset.h>
+#include <sys/vfs.h>
+#include <sys/zpl.h>
+#include <sys/file.h>
+
+
+static struct dentry *
+zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	cred_t *cr = CRED();
+	struct inode *ip;
+	znode_t *zp;
+	int error;
+	fstrans_cookie_t cookie;
+	pathname_t *ppn = NULL;
+	pathname_t pn;
+	int zfs_flags = 0;
+	zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+
+	if (dlen(dentry) >= ZAP_MAXNAMELEN)
+		return (ERR_PTR(-ENAMETOOLONG));
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+
+	/* If we are a case insensitive fs, we need the real name */
+	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+		zfs_flags = FIGNORECASE;
+		pn_alloc(&pn);
+		ppn = &pn;
+	}
+
+	error = -zfs_lookup(ITOZ(dir), dname(dentry), &zp,
+	    zfs_flags, cr, NULL, ppn);
+	spl_fstrans_unmark(cookie);
+	ASSERT3S(error, <=, 0);
+	crfree(cr);
+
+	spin_lock(&dentry->d_lock);
+	dentry->d_time = jiffies;
+	spin_unlock(&dentry->d_lock);
+
+	if (error) {
+		/*
+		 * If we have a case sensitive fs, we do not want to
+		 * insert negative entries, so return NULL for ENOENT.
+		 * Fall through if the error is not ENOENT. Also free memory.
+		 */
+		if (ppn) {
+			pn_free(ppn);
+			if (error == -ENOENT)
+				return (NULL);
+		}
+
+		if (error == -ENOENT)
+			return (d_splice_alias(NULL, dentry));
+		else
+			return (ERR_PTR(error));
+	}
+	ip = ZTOI(zp);
+
+	/*
+	 * If we are case insensitive, call the correct function
+	 * to install the name.
+	 */
+	if (ppn) {
+		struct dentry *new_dentry;
+		struct qstr ci_name;
+
+		if (strcmp(dname(dentry), pn.pn_buf) == 0) {
+			new_dentry = d_splice_alias(ip,  dentry);
+		} else {
+			ci_name.name = pn.pn_buf;
+			ci_name.len = strlen(pn.pn_buf);
+			new_dentry = d_add_ci(dentry, ip, &ci_name);
+		}
+		pn_free(ppn);
+		return (new_dentry);
+	} else {
+		return (d_splice_alias(ip, dentry));
+	}
+}
+
+void
+zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr)
+{
+	vap->va_mask = ATTR_MODE;
+	vap->va_mode = mode;
+	vap->va_uid = crgetfsuid(cr);
+
+	if (dir && dir->i_mode & S_ISGID) {
+		vap->va_gid = KGID_TO_SGID(dir->i_gid);
+		if (S_ISDIR(mode))
+			vap->va_mode |= S_ISGID;
+	} else {
+		vap->va_gid = crgetfsgid(cr);
+	}
+}
+
+static int
+zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag)
+{
+	cred_t *cr = CRED();
+	znode_t *zp;
+	vattr_t *vap;
+	int error;
+	fstrans_cookie_t cookie;
+
+	crhold(cr);
+	vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+	zpl_vap_init(vap, dir, mode, cr);
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0,
+	    mode, &zp, cr, 0, NULL);
+	if (error == 0) {
+		d_instantiate(dentry, ZTOI(zp));
+
+		error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
+		if (error == 0)
+			error = zpl_init_acl(ZTOI(zp), dir);
+
+		if (error)
+			(void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0);
+	}
+
+	spl_fstrans_unmark(cookie);
+	kmem_free(vap, sizeof (vattr_t));
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+static int
+zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+    dev_t rdev)
+{
+	cred_t *cr = CRED();
+	znode_t *zp;
+	vattr_t *vap;
+	int error;
+	fstrans_cookie_t cookie;
+
+	/*
+	 * We currently expect Linux to supply rdev=0 for all sockets
+	 * and fifos, but we want to know if this behavior ever changes.
+	 */
+	if (S_ISSOCK(mode) || S_ISFIFO(mode))
+		ASSERT(rdev == 0);
+
+	crhold(cr);
+	vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+	zpl_vap_init(vap, dir, mode, cr);
+	vap->va_rdev = rdev;
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0,
+	    mode, &zp, cr, 0, NULL);
+	if (error == 0) {
+		d_instantiate(dentry, ZTOI(zp));
+
+		error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
+		if (error == 0)
+			error = zpl_init_acl(ZTOI(zp), dir);
+
+		if (error)
+			(void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0);
+	}
+
+	spl_fstrans_unmark(cookie);
+	kmem_free(vap, sizeof (vattr_t));
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+#ifdef HAVE_TMPFILE
+static int
+zpl_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	cred_t *cr = CRED();
+	struct inode *ip;
+	vattr_t *vap;
+	int error;
+	fstrans_cookie_t cookie;
+
+	crhold(cr);
+	vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+	/*
+	 * The VFS does not apply the umask, therefore it is applied here
+	 * when POSIX ACLs are not enabled.
+	 */
+	if (!IS_POSIXACL(dir))
+		mode &= ~current_umask();
+	zpl_vap_init(vap, dir, mode, cr);
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL);
+	if (error == 0) {
+		/* d_tmpfile will do drop_nlink, so we should set it first */
+		set_nlink(ip, 1);
+		d_tmpfile(dentry, ip);
+
+		error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+		if (error == 0)
+			error = zpl_init_acl(ip, dir);
+		/*
+		 * don't need to handle error here, file is already in
+		 * unlinked set.
+		 */
+	}
+
+	spl_fstrans_unmark(cookie);
+	kmem_free(vap, sizeof (vattr_t));
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+#endif
+
+static int
+zpl_unlink(struct inode *dir, struct dentry *dentry)
+{
+	cred_t *cr = CRED();
+	int error;
+	fstrans_cookie_t cookie;
+	zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	error = -zfs_remove(ITOZ(dir), dname(dentry), cr, 0);
+
+	/*
+	 * For a CI FS we must invalidate the dentry to prevent the
+	 * creation of negative entries.
+	 */
+	if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE)
+		d_invalidate(dentry);
+
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+static int
+zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	cred_t *cr = CRED();
+	vattr_t *vap;
+	znode_t *zp;
+	int error;
+	fstrans_cookie_t cookie;
+
+	crhold(cr);
+	vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+	zpl_vap_init(vap, dir, mode | S_IFDIR, cr);
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_mkdir(ITOZ(dir), dname(dentry), vap, &zp, cr, 0, NULL);
+	if (error == 0) {
+		d_instantiate(dentry, ZTOI(zp));
+
+		error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
+		if (error == 0)
+			error = zpl_init_acl(ZTOI(zp), dir);
+
+		if (error)
+			(void) zfs_rmdir(ITOZ(dir), dname(dentry), NULL, cr, 0);
+	}
+
+	spl_fstrans_unmark(cookie);
+	kmem_free(vap, sizeof (vattr_t));
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+static int
+zpl_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	cred_t *cr = CRED();
+	int error;
+	fstrans_cookie_t cookie;
+	zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	error = -zfs_rmdir(ITOZ(dir), dname(dentry), NULL, cr, 0);
+
+	/*
+	 * For a CI FS we must invalidate the dentry to prevent the
+	 * creation of negative entries.
+	 */
+	if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE)
+		d_invalidate(dentry);
+
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+static int
+zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
+    unsigned int query_flags)
+{
+	int error;
+	fstrans_cookie_t cookie;
+
+	cookie = spl_fstrans_mark();
+
+	/*
+	 * XXX request_mask and query_flags currently ignored.
+	 */
+
+	error = -zfs_getattr_fast(path->dentry->d_inode, stat);
+	spl_fstrans_unmark(cookie);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+ZPL_GETATTR_WRAPPER(zpl_getattr);
+
+static int
+zpl_setattr(struct dentry *dentry, struct iattr *ia)
+{
+	struct inode *ip = dentry->d_inode;
+	cred_t *cr = CRED();
+	vattr_t *vap;
+	int error;
+	fstrans_cookie_t cookie;
+
+	error = setattr_prepare(dentry, ia);
+	if (error)
+		return (error);
+
+	crhold(cr);
+	vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+	vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK;
+	vap->va_mode = ia->ia_mode;
+	vap->va_uid = KUID_TO_SUID(ia->ia_uid);
+	vap->va_gid = KGID_TO_SGID(ia->ia_gid);
+	vap->va_size = ia->ia_size;
+	vap->va_atime = ia->ia_atime;
+	vap->va_mtime = ia->ia_mtime;
+	vap->va_ctime = ia->ia_ctime;
+
+	if (vap->va_mask & ATTR_ATIME)
+		ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip);
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_setattr(ITOZ(ip), vap, 0, cr);
+	if (!error && (ia->ia_valid & ATTR_MODE))
+		error = zpl_chmod_acl(ip);
+
+	spl_fstrans_unmark(cookie);
+	kmem_free(vap, sizeof (vattr_t));
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+static int
+zpl_rename2(struct inode *sdip, struct dentry *sdentry,
+    struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+{
+	cred_t *cr = CRED();
+	int error;
+	fstrans_cookie_t cookie;
+
+	/* We don't have renameat2(2) support */
+	if (flags)
+		return (-EINVAL);
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip),
+	    dname(tdentry), cr, 0);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+#ifndef HAVE_RENAME_WANTS_FLAGS
+static int
+zpl_rename(struct inode *sdip, struct dentry *sdentry,
+    struct inode *tdip, struct dentry *tdentry)
+{
+	return (zpl_rename2(sdip, sdentry, tdip, tdentry, 0));
+}
+#endif
+
+static int
+zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name)
+{
+	cred_t *cr = CRED();
+	vattr_t *vap;
+	znode_t *zp;
+	int error;
+	fstrans_cookie_t cookie;
+
+	crhold(cr);
+	vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+	zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr);
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_symlink(ITOZ(dir), dname(dentry), vap,
+	    (char *)name, &zp, cr, 0);
+	if (error == 0) {
+		d_instantiate(dentry, ZTOI(zp));
+
+		error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
+		if (error)
+			(void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0);
+	}
+
+	spl_fstrans_unmark(cookie);
+	kmem_free(vap, sizeof (vattr_t));
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+#if defined(HAVE_PUT_LINK_COOKIE)
+static void
+zpl_put_link(struct inode *unused, void *cookie)
+{
+	kmem_free(cookie, MAXPATHLEN);
+}
+#elif defined(HAVE_PUT_LINK_NAMEIDATA)
+static void
+zpl_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
+{
+	const char *link = nd_get_link(nd);
+
+	if (!IS_ERR(link))
+		kmem_free(link, MAXPATHLEN);
+}
+#elif defined(HAVE_PUT_LINK_DELAYED)
+static void
+zpl_put_link(void *ptr)
+{
+	kmem_free(ptr, MAXPATHLEN);
+}
+#endif
+
+static int
+zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link)
+{
+	fstrans_cookie_t cookie;
+	cred_t *cr = CRED();
+	struct iovec iov;
+	uio_t uio = { { 0 }, 0 };
+	int error;
+
+	crhold(cr);
+	*link = NULL;
+	iov.iov_len = MAXPATHLEN;
+	iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+	uio.uio_iov = &iov;
+	uio.uio_iovcnt = 1;
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_resid = (MAXPATHLEN - 1);
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_readlink(ip, &uio, cr);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+
+	if (error)
+		kmem_free(iov.iov_base, MAXPATHLEN);
+	else
+		*link = iov.iov_base;
+
+	return (error);
+}
+
+#if defined(HAVE_GET_LINK_DELAYED)
+static const char *
+zpl_get_link(struct dentry *dentry, struct inode *inode,
+    struct delayed_call *done)
+{
+	char *link = NULL;
+	int error;
+
+	if (!dentry)
+		return (ERR_PTR(-ECHILD));
+
+	error = zpl_get_link_common(dentry, inode, &link);
+	if (error)
+		return (ERR_PTR(error));
+
+	set_delayed_call(done, zpl_put_link, link);
+
+	return (link);
+}
+#elif defined(HAVE_GET_LINK_COOKIE)
+static const char *
+zpl_get_link(struct dentry *dentry, struct inode *inode, void **cookie)
+{
+	char *link = NULL;
+	int error;
+
+	if (!dentry)
+		return (ERR_PTR(-ECHILD));
+
+	error = zpl_get_link_common(dentry, inode, &link);
+	if (error)
+		return (ERR_PTR(error));
+
+	return (*cookie = link);
+}
+#elif defined(HAVE_FOLLOW_LINK_COOKIE)
+static const char *
+zpl_follow_link(struct dentry *dentry, void **cookie)
+{
+	char *link = NULL;
+	int error;
+
+	error = zpl_get_link_common(dentry, dentry->d_inode, &link);
+	if (error)
+		return (ERR_PTR(error));
+
+	return (*cookie = link);
+}
+#elif defined(HAVE_FOLLOW_LINK_NAMEIDATA)
+static void *
+zpl_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	char *link = NULL;
+	int error;
+
+	error = zpl_get_link_common(dentry, dentry->d_inode, &link);
+	if (error)
+		nd_set_link(nd, ERR_PTR(error));
+	else
+		nd_set_link(nd, link);
+
+	return (NULL);
+}
+#endif
+
+static int
+zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+	cred_t *cr = CRED();
+	struct inode *ip = old_dentry->d_inode;
+	int error;
+	fstrans_cookie_t cookie;
+
+	if (ip->i_nlink >= ZFS_LINK_MAX)
+		return (-EMLINK);
+
+	crhold(cr);
+	ip->i_ctime = current_time(ip);
+	igrab(ip); /* Use ihold() if available */
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_link(ITOZ(dir), ITOZ(ip), dname(dentry), cr, 0);
+	if (error) {
+		iput(ip);
+		goto out;
+	}
+
+	d_instantiate(dentry, ip);
+out:
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+static int
+#ifdef HAVE_D_REVALIDATE_NAMEIDATA
+zpl_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned int flags = (nd ? nd->flags : 0);
+#else
+zpl_revalidate(struct dentry *dentry, unsigned int flags)
+{
+#endif /* HAVE_D_REVALIDATE_NAMEIDATA */
+	/* CSTYLED */
+	zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+	int error;
+
+	if (flags & LOOKUP_RCU)
+		return (-ECHILD);
+
+	/*
+	 * After a rollback negative dentries created before the rollback
+	 * time must be invalidated.  Otherwise they can obscure files which
+	 * are only present in the rolled back dataset.
+	 */
+	if (dentry->d_inode == NULL) {
+		spin_lock(&dentry->d_lock);
+		error = time_before(dentry->d_time, zfsvfs->z_rollback_time);
+		spin_unlock(&dentry->d_lock);
+
+		if (error)
+			return (0);
+	}
+
+	/*
+	 * The dentry may reference a stale inode if a mounted file system
+	 * was rolled back to a point in time where the object didn't exist.
+	 */
+	if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale)
+		return (0);
+
+	return (1);
+}
+
+const struct inode_operations zpl_inode_operations = {
+	.setattr	= zpl_setattr,
+	.getattr	= zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.listxattr	= zpl_xattr_list,
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+	.set_acl	= zpl_set_acl,
+#endif /* HAVE_SET_ACL */
+	.get_acl	= zpl_get_acl,
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+const struct inode_operations zpl_dir_inode_operations = {
+	.create		= zpl_create,
+	.lookup		= zpl_lookup,
+	.link		= zpl_link,
+	.unlink		= zpl_unlink,
+	.symlink	= zpl_symlink,
+	.mkdir		= zpl_mkdir,
+	.rmdir		= zpl_rmdir,
+	.mknod		= zpl_mknod,
+#ifdef HAVE_RENAME_WANTS_FLAGS
+	.rename		= zpl_rename2,
+#else
+	.rename		= zpl_rename,
+#endif
+#ifdef HAVE_TMPFILE
+	.tmpfile	= zpl_tmpfile,
+#endif
+	.setattr	= zpl_setattr,
+	.getattr	= zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.listxattr	= zpl_xattr_list,
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+	.set_acl	= zpl_set_acl,
+#endif /* HAVE_SET_ACL */
+	.get_acl	= zpl_get_acl,
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+const struct inode_operations zpl_symlink_inode_operations = {
+#ifdef HAVE_GENERIC_READLINK
+	.readlink	= generic_readlink,
+#endif
+#if defined(HAVE_GET_LINK_DELAYED) || defined(HAVE_GET_LINK_COOKIE)
+	.get_link	= zpl_get_link,
+#elif defined(HAVE_FOLLOW_LINK_COOKIE) || defined(HAVE_FOLLOW_LINK_NAMEIDATA)
+	.follow_link	= zpl_follow_link,
+#endif
+#if defined(HAVE_PUT_LINK_COOKIE) || defined(HAVE_PUT_LINK_NAMEIDATA)
+	.put_link	= zpl_put_link,
+#endif
+	.setattr	= zpl_setattr,
+	.getattr	= zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.listxattr	= zpl_xattr_list,
+};
+
+const struct inode_operations zpl_special_inode_operations = {
+	.setattr	= zpl_setattr,
+	.getattr	= zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.listxattr	= zpl_xattr_list,
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+	.set_acl	= zpl_set_acl,
+#endif /* HAVE_SET_ACL */
+	.get_acl	= zpl_get_acl,
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+dentry_operations_t zpl_dentry_operations = {
+	.d_revalidate	= zpl_revalidate,
+};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
new file mode 100644
index 000000000000..75adff51782e
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -0,0 +1,326 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ */
+
+
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+
+static struct inode *
+zpl_inode_alloc(struct super_block *sb)
+{
+	struct inode *ip;
+
+	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
+	inode_set_iversion(ip, 1);
+
+	return (ip);
+}
+
+static void
+zpl_inode_destroy(struct inode *ip)
+{
+	ASSERT(atomic_read(&ip->i_count) == 0);
+	zfs_inode_destroy(ip);
+}
+
+/*
+ * Called from __mark_inode_dirty() to reflect that something in the
+ * inode has changed.  We use it to ensure the znode system attributes
+ * are always strictly update to date with respect to the inode.
+ */
+#ifdef HAVE_DIRTY_INODE_WITH_FLAGS
+static void
+zpl_dirty_inode(struct inode *ip, int flags)
+{
+	fstrans_cookie_t cookie;
+
+	cookie = spl_fstrans_mark();
+	zfs_dirty_inode(ip, flags);
+	spl_fstrans_unmark(cookie);
+}
+#else
+static void
+zpl_dirty_inode(struct inode *ip)
+{
+	fstrans_cookie_t cookie;
+
+	cookie = spl_fstrans_mark();
+	zfs_dirty_inode(ip, 0);
+	spl_fstrans_unmark(cookie);
+}
+#endif /* HAVE_DIRTY_INODE_WITH_FLAGS */
+
+/*
+ * When ->drop_inode() is called its return value indicates if the
+ * inode should be evicted from the inode cache.  If the inode is
+ * unhashed and has no links the default policy is to evict it
+ * immediately.
+ *
+ * The ->evict_inode() callback must minimally truncate the inode pages,
+ * and call clear_inode().  For 2.6.35 and later kernels this will
+ * simply update the inode state, with the sync occurring before the
+ * truncate in evict().  For earlier kernels clear_inode() maps to
+ * end_writeback() which is responsible for completing all outstanding
+ * write back.  In either case, once this is done it is safe to cleanup
+ * any remaining inode specific data via zfs_inactive().
+ * remaining filesystem specific data.
+ */
+static void
+zpl_evict_inode(struct inode *ip)
+{
+	fstrans_cookie_t cookie;
+
+	cookie = spl_fstrans_mark();
+	truncate_setsize(ip, 0);
+	clear_inode(ip);
+	zfs_inactive(ip);
+	spl_fstrans_unmark(cookie);
+}
+
+static void
+zpl_put_super(struct super_block *sb)
+{
+	fstrans_cookie_t cookie;
+	int error;
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_umount(sb);
+	spl_fstrans_unmark(cookie);
+	ASSERT3S(error, <=, 0);
+}
+
+static int
+zpl_sync_fs(struct super_block *sb, int wait)
+{
+	fstrans_cookie_t cookie;
+	cred_t *cr = CRED();
+	int error;
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	error = -zfs_sync(sb, wait, cr);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+static int
+zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
+{
+	fstrans_cookie_t cookie;
+	int error;
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_statvfs(dentry->d_inode, statp);
+	spl_fstrans_unmark(cookie);
+	ASSERT3S(error, <=, 0);
+
+	/*
+	 * If required by a 32-bit system call, dynamically scale the
+	 * block size up to 16MiB and decrease the block counts.  This
+	 * allows for a maximum size of 64EiB to be reported.  The file
+	 * counts must be artificially capped at 2^32-1.
+	 */
+	if (unlikely(zpl_is_32bit_api())) {
+		while (statp->f_blocks > UINT32_MAX &&
+		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
+			statp->f_frsize <<= 1;
+			statp->f_bsize <<= 1;
+
+			statp->f_blocks >>= 1;
+			statp->f_bfree >>= 1;
+			statp->f_bavail >>= 1;
+		}
+
+		uint64_t usedobjs = statp->f_files - statp->f_ffree;
+		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
+		statp->f_files = statp->f_ffree + usedobjs;
+	}
+
+	return (error);
+}
+
+static int
+zpl_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
+	fstrans_cookie_t cookie;
+	int error;
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_remount(sb, flags, &zm);
+	spl_fstrans_unmark(cookie);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+static int
+__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
+{
+	seq_printf(seq, ",%s",
+	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");
+
+#ifdef CONFIG_FS_POSIX_ACL
+	switch (zfsvfs->z_acl_type) {
+	case ZFS_ACLTYPE_POSIXACL:
+		seq_puts(seq, ",posixacl");
+		break;
+	default:
+		seq_puts(seq, ",noacl");
+		break;
+	}
+#endif /* CONFIG_FS_POSIX_ACL */
+
+	return (0);
+}
+
+static int
+zpl_show_options(struct seq_file *seq, struct dentry *root)
+{
+	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
+}
+
+static int
+zpl_fill_super(struct super_block *sb, void *data, int silent)
+{
+	zfs_mnt_t *zm = (zfs_mnt_t *)data;
+	fstrans_cookie_t cookie;
+	int error;
+
+	cookie = spl_fstrans_mark();
+	error = -zfs_domount(sb, zm, silent);
+	spl_fstrans_unmark(cookie);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+static int
+zpl_test_super(struct super_block *s, void *data)
+{
+	zfsvfs_t *zfsvfs = s->s_fs_info;
+	objset_t *os = data;
+
+	if (zfsvfs == NULL)
+		return (0);
+
+	return (os == zfsvfs->z_os);
+}
+
+static struct super_block *
+zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
+{
+	struct super_block *s;
+	objset_t *os;
+	int err;
+
+	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
+	if (err)
+		return (ERR_PTR(-err));
+
+	/*
+	 * The dsl pool lock must be released prior to calling sget().
+	 * It is possible sget() may block on the lock in grab_super()
+	 * while deactivate_super() holds that same lock and waits for
+	 * a txg sync.  If the dsl_pool lock is held over sget()
+	 * this can prevent the pool sync and cause a deadlock.
+	 */
+	dsl_pool_rele(dmu_objset_pool(os), FTAG);
+	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);
+	dsl_dataset_rele(dmu_objset_ds(os), FTAG);
+
+	if (IS_ERR(s))
+		return (ERR_CAST(s));
+
+	if (s->s_root == NULL) {
+		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
+		if (err) {
+			deactivate_locked_super(s);
+			return (ERR_PTR(err));
+		}
+		s->s_flags |= SB_ACTIVE;
+	} else if ((flags ^ s->s_flags) & SB_RDONLY) {
+		deactivate_locked_super(s);
+		return (ERR_PTR(-EBUSY));
+	}
+
+	return (s);
+}
+
+static struct dentry *
+zpl_mount(struct file_system_type *fs_type, int flags,
+    const char *osname, void *data)
+{
+	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };
+
+	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
+	if (IS_ERR(sb))
+		return (ERR_CAST(sb));
+
+	return (dget(sb->s_root));
+}
+
+static void
+zpl_kill_sb(struct super_block *sb)
+{
+	zfs_preumount(sb);
+	kill_anon_super(sb);
+}
+
+void
+zpl_prune_sb(int64_t nr_to_scan, void *arg)
+{
+	struct super_block *sb = (struct super_block *)arg;
+	int objects = 0;
+
+	(void) -zfs_prune(sb, nr_to_scan, &objects);
+}
+
+const struct super_operations zpl_super_operations = {
+	.alloc_inode		= zpl_inode_alloc,
+	.destroy_inode		= zpl_inode_destroy,
+	.dirty_inode		= zpl_dirty_inode,
+	.write_inode		= NULL,
+	.evict_inode		= zpl_evict_inode,
+	.put_super		= zpl_put_super,
+	.sync_fs		= zpl_sync_fs,
+	.statfs			= zpl_statfs,
+	.remount_fs		= zpl_remount_fs,
+	.show_options		= zpl_show_options,
+	.show_stats		= NULL,
+};
+
+struct file_system_type zpl_fs_type = {
+	.owner			= THIS_MODULE,
+	.name			= ZFS_DRIVER,
+	.mount			= zpl_mount,
+	.kill_sb		= zpl_kill_sb,
+};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
new file mode 100644
index 000000000000..fa3c036405b0
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
@@ -0,0 +1,1480 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ *
+ * Extended attributes (xattr) on Solaris are implemented as files
+ * which exist in a hidden xattr directory.  These extended attributes
+ * can be accessed using the attropen() system call which opens
+ * the extended attribute.  It can then be manipulated just like
+ * a standard file descriptor.  This has a couple advantages such
+ * as practically no size limit on the file, and the extended
+ * attributes permissions may differ from those of the parent file.
+ * This interface is really quite clever, but it's also completely
+ * different than what is supported on Linux.  It also comes with a
+ * steep performance penalty when accessing small xattrs because they
+ * are not stored with the parent file.
+ *
+ * Under Linux extended attributes are manipulated by the system
+ * calls getxattr(2), setxattr(2), and listxattr(2).  They consider
+ * extended attributes to be name/value pairs where the name is a
+ * NULL terminated string.  The name must also include one of the
+ * following namespace prefixes:
+ *
+ *   user     - No restrictions and is available to user applications.
+ *   trusted  - Restricted to kernel and root (CAP_SYS_ADMIN) use.
+ *   system   - Used for access control lists (system.nfs4_acl, etc).
+ *   security - Used by SELinux to store a files security context.
+ *
+ * The value under Linux to limited to 65536 bytes of binary data.
+ * In practice, individual xattrs tend to be much smaller than this
+ * and are typically less than 100 bytes.  A good example of this
+ * are the security.selinux xattrs which are less than 100 bytes and
+ * exist for every file when xattr labeling is enabled.
+ *
+ * The Linux xattr implementation has been written to take advantage of
+ * this typical usage.  When the dataset property 'xattr=sa' is set,
+ * then xattrs will be preferentially stored as System Attributes (SA).
+ * This allows tiny xattrs (~100 bytes) to be stored with the dnode and
+ * up to 64k of xattrs to be stored in the spill block.  If additional
+ * xattr space is required, which is unlikely under Linux, they will
+ * be stored using the traditional directory approach.
+ *
+ * This optimization results in roughly a 3x performance improvement
+ * when accessing xattrs because it avoids the need to perform a seek
+ * for every xattr value.  When multiple xattrs are stored per-file
+ * the performance improvements are even greater because all of the
+ * xattrs stored in the spill block will be cached.
+ *
+ * However, by default SA based xattrs are disabled in the Linux port
+ * to maximize compatibility with other implementations.  If you do
+ * enable SA based xattrs then they will not be visible on platforms
+ * which do not support this feature.
+ *
+ * NOTE: One additional consequence of the xattr directory implementation
+ * is that when an extended attribute is manipulated an inode is created.
+ * This inode will exist in the Linux inode cache but there will be no
+ * associated entry in the dentry cache which references it.  This is
+ * safe but it may result in some confusion.  Enabling SA based xattrs
+ * largely avoids the issue except in the overflow case.
+ */
+
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zap.h>
+#include <sys/vfs.h>
+#include <sys/zpl.h>
+
+typedef struct xattr_filldir {
+	size_t size;
+	size_t offset;
+	char *buf;
+	struct dentry *dentry;
+} xattr_filldir_t;
+
+static const struct xattr_handler *zpl_xattr_handler(const char *);
+
+static int
+zpl_xattr_permission(xattr_filldir_t *xf, const char *name, int name_len)
+{
+	static const struct xattr_handler *handler;
+	struct dentry *d = xf->dentry;
+
+	handler = zpl_xattr_handler(name);
+	if (!handler)
+		return (0);
+
+	if (handler->list) {
+#if defined(HAVE_XATTR_LIST_SIMPLE)
+		if (!handler->list(d))
+			return (0);
+#elif defined(HAVE_XATTR_LIST_DENTRY)
+		if (!handler->list(d, NULL, 0, name, name_len, 0))
+			return (0);
+#elif defined(HAVE_XATTR_LIST_HANDLER)
+		if (!handler->list(handler, d, NULL, 0, name, name_len))
+			return (0);
+#endif
+	}
+
+	return (1);
+}
+
+/*
+ * Determine is a given xattr name should be visible and if so copy it
+ * in to the provided buffer (xf->buf).
+ */
+static int
+zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len)
+{
+	/* Check permissions using the per-namespace list xattr handler. */
+	if (!zpl_xattr_permission(xf, name, name_len))
+		return (0);
+
+	/* When xf->buf is NULL only calculate the required size. */
+	if (xf->buf) {
+		if (xf->offset + name_len + 1 > xf->size)
+			return (-ERANGE);
+
+		memcpy(xf->buf + xf->offset, name, name_len);
+		xf->buf[xf->offset + name_len] = '\0';
+	}
+
+	xf->offset += (name_len + 1);
+
+	return (0);
+}
+
+/*
+ * Read as many directory entry names as will fit in to the provided buffer,
+ * or when no buffer is provided calculate the required buffer size.
+ */
+static int
+zpl_xattr_readdir(struct inode *dxip, xattr_filldir_t *xf)
+{
+	zap_cursor_t zc;
+	zap_attribute_t	zap;
+	int error;
+
+	zap_cursor_init(&zc, ITOZSB(dxip)->z_os, ITOZ(dxip)->z_id);
+
+	while ((error = -zap_cursor_retrieve(&zc, &zap)) == 0) {
+
+		if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
+			error = -ENXIO;
+			break;
+		}
+
+		error = zpl_xattr_filldir(xf, zap.za_name, strlen(zap.za_name));
+		if (error)
+			break;
+
+		zap_cursor_advance(&zc);
+	}
+
+	zap_cursor_fini(&zc);
+
+	if (error == -ENOENT)
+		error = 0;
+
+	return (error);
+}
+
+static ssize_t
+zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr)
+{
+	struct inode *ip = xf->dentry->d_inode;
+	struct inode *dxip = NULL;
+	znode_t *dxzp;
+	int error;
+
+	/* Lookup the xattr directory */
+	error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR,
+	    cr, NULL, NULL);
+	if (error) {
+		if (error == -ENOENT)
+			error = 0;
+
+		return (error);
+	}
+
+	dxip = ZTOI(dxzp);
+	error = zpl_xattr_readdir(dxip, xf);
+	iput(dxip);
+
+	return (error);
+}
+
+static ssize_t
+zpl_xattr_list_sa(xattr_filldir_t *xf)
+{
+	znode_t *zp = ITOZ(xf->dentry->d_inode);
+	nvpair_t *nvp = NULL;
+	int error = 0;
+
+	mutex_enter(&zp->z_lock);
+	if (zp->z_xattr_cached == NULL)
+		error = -zfs_sa_get_xattr(zp);
+	mutex_exit(&zp->z_lock);
+
+	if (error)
+		return (error);
+
+	ASSERT(zp->z_xattr_cached);
+
+	while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) {
+		ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY);
+
+		error = zpl_xattr_filldir(xf, nvpair_name(nvp),
+		    strlen(nvpair_name(nvp)));
+		if (error)
+			return (error);
+	}
+
+	return (0);
+}
+
+ssize_t
+zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	znode_t *zp = ITOZ(dentry->d_inode);
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	xattr_filldir_t xf = { buffer_size, 0, buffer, dentry };
+	cred_t *cr = CRED();
+	fstrans_cookie_t cookie;
+	int error = 0;
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	ZPL_ENTER(zfsvfs);
+	ZPL_VERIFY_ZP(zp);
+	rw_enter(&zp->z_xattr_lock, RW_READER);
+
+	if (zfsvfs->z_use_sa && zp->z_is_sa) {
+		error = zpl_xattr_list_sa(&xf);
+		if (error)
+			goto out;
+	}
+
+	error = zpl_xattr_list_dir(&xf, cr);
+	if (error)
+		goto out;
+
+	error = xf.offset;
+out:
+
+	rw_exit(&zp->z_xattr_lock);
+	ZPL_EXIT(zfsvfs);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+
+	return (error);
+}
+
+static int
+zpl_xattr_get_dir(struct inode *ip, const char *name, void *value,
+    size_t size, cred_t *cr)
+{
+	struct inode *xip = NULL;
+	znode_t *dxzp = NULL;
+	znode_t *xzp = NULL;
+	loff_t pos = 0;
+	int error;
+
+	/* Lookup the xattr directory */
+	error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR,
+	    cr, NULL, NULL);
+	if (error)
+		goto out;
+
+	/* Lookup a specific xattr name in the directory */
+	error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL);
+	if (error)
+		goto out;
+
+	xip = ZTOI(xzp);
+	if (!size) {
+		error = i_size_read(xip);
+		goto out;
+	}
+
+	if (size < i_size_read(xip)) {
+		error = -ERANGE;
+		goto out;
+	}
+
+	error = zpl_read_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr);
+out:
+	if (xzp)
+		zrele(xzp);
+
+	if (dxzp)
+		zrele(dxzp);
+
+	return (error);
+}
+
+static int
+zpl_xattr_get_sa(struct inode *ip, const char *name, void *value, size_t size)
+{
+	znode_t *zp = ITOZ(ip);
+	uchar_t *nv_value;
+	uint_t nv_size;
+	int error = 0;
+
+	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+
+	mutex_enter(&zp->z_lock);
+	if (zp->z_xattr_cached == NULL)
+		error = -zfs_sa_get_xattr(zp);
+	mutex_exit(&zp->z_lock);
+
+	if (error)
+		return (error);
+
+	ASSERT(zp->z_xattr_cached);
+	error = -nvlist_lookup_byte_array(zp->z_xattr_cached, name,
+	    &nv_value, &nv_size);
+	if (error)
+		return (error);
+
+	if (size == 0 || value == NULL)
+		return (nv_size);
+
+	if (size < nv_size)
+		return (-ERANGE);
+
+	memcpy(value, nv_value, nv_size);
+
+	return (nv_size);
+}
+
+static int
+__zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size,
+    cred_t *cr)
+{
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	int error;
+
+	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+
+	if (zfsvfs->z_use_sa && zp->z_is_sa) {
+		error = zpl_xattr_get_sa(ip, name, value, size);
+		if (error != -ENOENT)
+			goto out;
+	}
+
+	error = zpl_xattr_get_dir(ip, name, value, size, cr);
+out:
+	if (error == -ENOENT)
+		error = -ENODATA;
+
+	return (error);
+}
+
+#define	XATTR_NOENT	0x0
+#define	XATTR_IN_SA	0x1
+#define	XATTR_IN_DIR	0x2
+/* check where the xattr resides */
+static int
+__zpl_xattr_where(struct inode *ip, const char *name, int *where, cred_t *cr)
+{
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	int error;
+
+	ASSERT(where);
+	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+
+	*where = XATTR_NOENT;
+	if (zfsvfs->z_use_sa && zp->z_is_sa) {
+		error = zpl_xattr_get_sa(ip, name, NULL, 0);
+		if (error >= 0)
+			*where |= XATTR_IN_SA;
+		else if (error != -ENOENT)
+			return (error);
+	}
+
+	error = zpl_xattr_get_dir(ip, name, NULL, 0, cr);
+	if (error >= 0)
+		*where |= XATTR_IN_DIR;
+	else if (error != -ENOENT)
+		return (error);
+
+	if (*where == (XATTR_IN_SA|XATTR_IN_DIR))
+		cmn_err(CE_WARN, "ZFS: inode %p has xattr \"%s\""
+		    " in both SA and dir", ip, name);
+	if (*where == XATTR_NOENT)
+		error = -ENODATA;
+	else
+		error = 0;
+	return (error);
+}
+
+static int
+zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size)
+{
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	cred_t *cr = CRED();
+	fstrans_cookie_t cookie;
+	int error;
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	ZPL_ENTER(zfsvfs);
+	ZPL_VERIFY_ZP(zp);
+	rw_enter(&zp->z_xattr_lock, RW_READER);
+	error = __zpl_xattr_get(ip, name, value, size, cr);
+	rw_exit(&zp->z_xattr_lock);
+	ZPL_EXIT(zfsvfs);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+
+	return (error);
+}
+
+static int
+zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value,
+    size_t size, int flags, cred_t *cr)
+{
+	znode_t *dxzp = NULL;
+	znode_t *xzp = NULL;
+	vattr_t *vap = NULL;
+	ssize_t wrote;
+	int lookup_flags, error;
+	const int xattr_mode = S_IFREG | 0644;
+	loff_t pos = 0;
+
+	/*
+	 * Lookup the xattr directory.  When we're adding an entry pass
+	 * CREATE_XATTR_DIR to ensure the xattr directory is created.
+	 * When removing an entry this flag is not passed to avoid
+	 * unnecessarily creating a new xattr directory.
+	 */
+	lookup_flags = LOOKUP_XATTR;
+	if (value != NULL)
+		lookup_flags |= CREATE_XATTR_DIR;
+
+	error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, lookup_flags,
+	    cr, NULL, NULL);
+	if (error)
+		goto out;
+
+	/* Lookup a specific xattr name in the directory */
+	error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL);
+	if (error && (error != -ENOENT))
+		goto out;
+
+	error = 0;
+
+	/* Remove a specific name xattr when value is set to NULL. */
+	if (value == NULL) {
+		if (xzp)
+			error = -zfs_remove(dxzp, (char *)name, cr, 0);
+
+		goto out;
+	}
+
+	/* Lookup failed create a new xattr. */
+	if (xzp == NULL) {
+		vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+		vap->va_mode = xattr_mode;
+		vap->va_mask = ATTR_MODE;
+		vap->va_uid = crgetfsuid(cr);
+		vap->va_gid = crgetfsgid(cr);
+
+		error = -zfs_create(dxzp, (char *)name, vap, 0, 0644, &xzp,
+		    cr, 0, NULL);
+		if (error)
+			goto out;
+	}
+
+	ASSERT(xzp != NULL);
+
+	error = -zfs_freesp(xzp, 0, 0, xattr_mode, TRUE);
+	if (error)
+		goto out;
+
+	wrote = zpl_write_common(ZTOI(xzp), value, size, &pos,
+	    UIO_SYSSPACE, 0, cr);
+	if (wrote < 0)
+		error = wrote;
+
+out:
+
+	if (error == 0) {
+		ip->i_ctime = current_time(ip);
+		zfs_mark_inode_dirty(ip);
+	}
+
+	if (vap)
+		kmem_free(vap, sizeof (vattr_t));
+
+	if (xzp)
+		zrele(xzp);
+
+	if (dxzp)
+		zrele(dxzp);
+
+	if (error == -ENOENT)
+		error = -ENODATA;
+
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+static int
+zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value,
+    size_t size, int flags, cred_t *cr)
+{
+	znode_t *zp = ITOZ(ip);
+	nvlist_t *nvl;
+	size_t sa_size;
+	int error = 0;
+
+	mutex_enter(&zp->z_lock);
+	if (zp->z_xattr_cached == NULL)
+		error = -zfs_sa_get_xattr(zp);
+	mutex_exit(&zp->z_lock);
+
+	if (error)
+		return (error);
+
+	ASSERT(zp->z_xattr_cached);
+	nvl = zp->z_xattr_cached;
+
+	if (value == NULL) {
+		error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY);
+		if (error == -ENOENT)
+			error = zpl_xattr_set_dir(ip, name, NULL, 0, flags, cr);
+	} else {
+		/* Limited to 32k to keep nvpair memory allocations small */
+		if (size > DXATTR_MAX_ENTRY_SIZE)
+			return (-EFBIG);
+
+		/* Prevent the DXATTR SA from consuming the entire SA region */
+		error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
+		if (error)
+			return (error);
+
+		if (sa_size > DXATTR_MAX_SA_SIZE)
+			return (-EFBIG);
+
+		error = -nvlist_add_byte_array(nvl, name,
+		    (uchar_t *)value, size);
+	}
+
+	/*
+	 * Update the SA for additions, modifications, and removals. On
+	 * error drop the inconsistent cached version of the nvlist, it
+	 * will be reconstructed from the ARC when next accessed.
+	 */
+	if (error == 0)
+		error = -zfs_sa_set_xattr(zp);
+
+	if (error) {
+		nvlist_free(nvl);
+		zp->z_xattr_cached = NULL;
+	}
+
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+static int
+zpl_xattr_set(struct inode *ip, const char *name, const void *value,
+    size_t size, int flags)
+{
+	znode_t *zp = ITOZ(ip);
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	cred_t *cr = CRED();
+	fstrans_cookie_t cookie;
+	int where;
+	int error;
+
+	crhold(cr);
+	cookie = spl_fstrans_mark();
+	ZPL_ENTER(zfsvfs);
+	ZPL_VERIFY_ZP(zp);
+	rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER);
+
+	/*
+	 * Before setting the xattr check to see if it already exists.
+	 * This is done to ensure the following optional flags are honored.
+	 *
+	 *   XATTR_CREATE: fail if xattr already exists
+	 *   XATTR_REPLACE: fail if xattr does not exist
+	 *
+	 * We also want to know if it resides in sa or dir, so we can make
+	 * sure we don't end up with duplicate in both places.
+	 */
+	error = __zpl_xattr_where(ip, name, &where, cr);
+	if (error < 0) {
+		if (error != -ENODATA)
+			goto out;
+		if (flags & XATTR_REPLACE)
+			goto out;
+
+		/* The xattr to be removed already doesn't exist */
+		error = 0;
+		if (value == NULL)
+			goto out;
+	} else {
+		error = -EEXIST;
+		if (flags & XATTR_CREATE)
+			goto out;
+	}
+
+	/* Preferentially store the xattr as a SA for better performance */
+	if (zfsvfs->z_use_sa && zp->z_is_sa &&
+	    (zfsvfs->z_xattr_sa || (value == NULL && where & XATTR_IN_SA))) {
+		error = zpl_xattr_set_sa(ip, name, value, size, flags, cr);
+		if (error == 0) {
+			/*
+			 * Successfully put into SA, we need to clear the one
+			 * in dir.
+			 */
+			if (where & XATTR_IN_DIR)
+				zpl_xattr_set_dir(ip, name, NULL, 0, 0, cr);
+			goto out;
+		}
+	}
+
+	error = zpl_xattr_set_dir(ip, name, value, size, flags, cr);
+	/*
+	 * Successfully put into dir, we need to clear the one in SA.
+	 */
+	if (error == 0 && (where & XATTR_IN_SA))
+		zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr);
+out:
+	rw_exit(&ITOZ(ip)->z_xattr_lock);
+	ZPL_EXIT(zfsvfs);
+	spl_fstrans_unmark(cookie);
+	crfree(cr);
+	ASSERT3S(error, <=, 0);
+
+	return (error);
+}
+
+/*
+ * Extended user attributes
+ *
+ * "Extended user attributes may be assigned to files and directories for
+ * storing arbitrary additional information such as the mime type,
+ * character set or encoding of a file.  The access permissions for user
+ * attributes are defined by the file permission bits: read permission
+ * is required to retrieve the attribute value, and writer permission is
+ * required to change it.
+ *
+ * The file permission bits of regular files and directories are
+ * interpreted differently from the file permission bits of special
+ * files and symbolic links.  For regular files and directories the file
+ * permission bits define access to the file's contents, while for
+ * device special files they define access to the device described by
+ * the special file.  The file permissions of symbolic links are not
+ * used in access checks.  These differences would allow users to
+ * consume filesystem resources in a way not controllable by disk quotas
+ * for group or world writable special files and directories.
+ *
+ * For this reason, extended user attributes are allowed only for
+ * regular files and directories, and access to extended user attributes
+ * is restricted to the owner and to users with appropriate capabilities
+ * for directories with the sticky bit set (see the chmod(1) manual page
+ * for an explanation of the sticky bit)." - xattr(7)
+ *
+ * ZFS allows extended user attributes to be disabled administratively
+ * by setting the 'xattr=off' property on the dataset.
+ */
+static int
+__zpl_xattr_user_list(struct inode *ip, char *list, size_t list_size,
+    const char *name, size_t name_len)
+{
+	return (ITOZSB(ip)->z_flags & ZSB_XATTR);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_user_list);
+
+static int
+__zpl_xattr_user_get(struct inode *ip, const char *name,
+    void *value, size_t size)
+{
+	char *xattr_name;
+	int error;
+	/* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+	if (strcmp(name, "") == 0)
+		return (-EINVAL);
+#endif
+	if (!(ITOZSB(ip)->z_flags & ZSB_XATTR))
+		return (-EOPNOTSUPP);
+
+	xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
+	error = zpl_xattr_get(ip, xattr_name, value, size);
+	kmem_strfree(xattr_name);
+
+	return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get);
+
+static int
+__zpl_xattr_user_set(struct inode *ip, const char *name,
+    const void *value, size_t size, int flags)
+{
+	char *xattr_name;
+	int error;
+	/* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+	if (strcmp(name, "") == 0)
+		return (-EINVAL);
+#endif
+	if (!(ITOZSB(ip)->z_flags & ZSB_XATTR))
+		return (-EOPNOTSUPP);
+
+	xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
+	error = zpl_xattr_set(ip, xattr_name, value, size, flags);
+	kmem_strfree(xattr_name);
+
+	return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set);
+
+xattr_handler_t zpl_xattr_user_handler =
+{
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= zpl_xattr_user_list,
+	.get	= zpl_xattr_user_get,
+	.set	= zpl_xattr_user_set,
+};
+
+/*
+ * Trusted extended attributes
+ *
+ * "Trusted extended attributes are visible and accessible only to
+ * processes that have the CAP_SYS_ADMIN capability.  Attributes in this
+ * class are used to implement mechanisms in user space (i.e., outside
+ * the kernel) which keep information in extended attributes to which
+ * ordinary processes should not have access." - xattr(7)
+ */
+static int
+__zpl_xattr_trusted_list(struct inode *ip, char *list, size_t list_size,
+    const char *name, size_t name_len)
+{
+	return (capable(CAP_SYS_ADMIN));
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_trusted_list);
+
+static int
+__zpl_xattr_trusted_get(struct inode *ip, const char *name,
+    void *value, size_t size)
+{
+	char *xattr_name;
+	int error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return (-EACCES);
+	/* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+	if (strcmp(name, "") == 0)
+		return (-EINVAL);
+#endif
+	xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
+	error = zpl_xattr_get(ip, xattr_name, value, size);
+	kmem_strfree(xattr_name);
+
+	return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get);
+
+static int
+__zpl_xattr_trusted_set(struct inode *ip, const char *name,
+    const void *value, size_t size, int flags)
+{
+	char *xattr_name;
+	int error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return (-EACCES);
+	/* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+	if (strcmp(name, "") == 0)
+		return (-EINVAL);
+#endif
+	xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
+	error = zpl_xattr_set(ip, xattr_name, value, size, flags);
+	kmem_strfree(xattr_name);
+
+	return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set);
+
+xattr_handler_t zpl_xattr_trusted_handler =
+{
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= zpl_xattr_trusted_list,
+	.get	= zpl_xattr_trusted_get,
+	.set	= zpl_xattr_trusted_set,
+};
+
+/*
+ * Extended security attributes
+ *
+ * "The security attribute namespace is used by kernel security modules,
+ * such as Security Enhanced Linux, and also to implement file
+ * capabilities (see capabilities(7)).  Read and write access
+ * permissions to security attributes depend on the policy implemented
+ * for each security attribute by the security module.  When no security
+ * module is loaded, all processes have read access to extended security
+ * attributes, and write access is limited to processes that have the
+ * CAP_SYS_ADMIN capability." - xattr(7)
+ */
+static int
+__zpl_xattr_security_list(struct inode *ip, char *list, size_t list_size,
+    const char *name, size_t name_len)
+{
+	return (1);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_security_list);
+
+static int
+__zpl_xattr_security_get(struct inode *ip, const char *name,
+    void *value, size_t size)
+{
+	char *xattr_name;
+	int error;
+	/* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+	if (strcmp(name, "") == 0)
+		return (-EINVAL);
+#endif
+	xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
+	error = zpl_xattr_get(ip, xattr_name, value, size);
+	kmem_strfree(xattr_name);
+
+	return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get);
+
+static int
+__zpl_xattr_security_set(struct inode *ip, const char *name,
+    const void *value, size_t size, int flags)
+{
+	char *xattr_name;
+	int error;
+	/* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+	if (strcmp(name, "") == 0)
+		return (-EINVAL);
+#endif
+	xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
+	error = zpl_xattr_set(ip, xattr_name, value, size, flags);
+	kmem_strfree(xattr_name);
+
+	return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set);
+
+static int
+zpl_xattr_security_init_impl(struct inode *ip, const struct xattr *xattrs,
+    void *fs_info)
+{
+	const struct xattr *xattr;
+	int error = 0;
+
+	for (xattr = xattrs; xattr->name != NULL; xattr++) {
+		error = __zpl_xattr_security_set(ip,
+		    xattr->name, xattr->value, xattr->value_len, 0);
+
+		if (error < 0)
+			break;
+	}
+
+	return (error);
+}
+
+int
+zpl_xattr_security_init(struct inode *ip, struct inode *dip,
+    const struct qstr *qstr)
+{
+	return security_inode_init_security(ip, dip, qstr,
+	    &zpl_xattr_security_init_impl, NULL);
+}
+
+/*
+ * Security xattr namespace handlers.
+ */
+xattr_handler_t zpl_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.list	= zpl_xattr_security_list,
+	.get	= zpl_xattr_security_get,
+	.set	= zpl_xattr_security_set,
+};
+
+/*
+ * Extended system attributes
+ *
+ * "Extended system attributes are used by the kernel to store system
+ * objects such as Access Control Lists.  Read and write access permissions
+ * to system attributes depend on the policy implemented for each system
+ * attribute implemented by filesystems in the kernel." - xattr(7)
+ */
+#ifdef CONFIG_FS_POSIX_ACL
+#ifndef HAVE_SET_ACL
+static
+#endif
+int
+zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type)
+{
+	char *name, *value = NULL;
+	int error = 0;
+	size_t size = 0;
+
+	if (S_ISLNK(ip->i_mode))
+		return (-EOPNOTSUPP);
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = XATTR_NAME_POSIX_ACL_ACCESS;
+		if (acl) {
+			umode_t mode = ip->i_mode;
+			error = posix_acl_equiv_mode(acl, &mode);
+			if (error < 0) {
+				return (error);
+			} else {
+				/*
+				 * The mode bits will have been set by
+				 * ->zfs_setattr()->zfs_acl_chmod_setattr()
+				 * using the ZFS ACL conversion.  If they
+				 * differ from the Posix ACL conversion dirty
+				 * the inode to write the Posix mode bits.
+				 */
+				if (ip->i_mode != mode) {
+					ip->i_mode = mode;
+					ip->i_ctime = current_time(ip);
+					zfs_mark_inode_dirty(ip);
+				}
+
+				if (error == 0)
+					acl = NULL;
+			}
+		}
+		break;
+
+	case ACL_TYPE_DEFAULT:
+		name = XATTR_NAME_POSIX_ACL_DEFAULT;
+		if (!S_ISDIR(ip->i_mode))
+			return (acl ? -EACCES : 0);
+		break;
+
+	default:
+		return (-EINVAL);
+	}
+
+	if (acl) {
+		size = posix_acl_xattr_size(acl->a_count);
+		value = kmem_alloc(size, KM_SLEEP);
+
+		error = zpl_acl_to_xattr(acl, value, size);
+		if (error < 0) {
+			kmem_free(value, size);
+			return (error);
+		}
+	}
+
+	error = zpl_xattr_set(ip, name, value, size, 0);
+	if (value)
+		kmem_free(value, size);
+
+	if (!error) {
+		if (acl)
+			zpl_set_cached_acl(ip, type, acl);
+		else
+			zpl_forget_cached_acl(ip, type);
+	}
+
+	return (error);
+}
+
+struct posix_acl *
+zpl_get_acl(struct inode *ip, int type)
+{
+	struct posix_acl *acl;
+	void *value = NULL;
+	char *name;
+	int size;
+
+	/*
+	 * As of Linux 3.14, the kernel get_acl will check this for us.
+	 * Also as of Linux 4.7, comparing against ACL_NOT_CACHED is wrong
+	 * as the kernel get_acl will set it to temporary sentinel value.
+	 */
+#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE
+	acl = get_cached_acl(ip, type);
+	if (acl != ACL_NOT_CACHED)
+		return (acl);
+#endif
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = XATTR_NAME_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = XATTR_NAME_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		return (ERR_PTR(-EINVAL));
+	}
+
+	size = zpl_xattr_get(ip, name, NULL, 0);
+	if (size > 0) {
+		value = kmem_alloc(size, KM_SLEEP);
+		size = zpl_xattr_get(ip, name, value, size);
+	}
+
+	if (size > 0) {
+		acl = zpl_acl_from_xattr(value, size);
+	} else if (size == -ENODATA || size == -ENOSYS) {
+		acl = NULL;
+	} else {
+		acl = ERR_PTR(-EIO);
+	}
+
+	if (size > 0)
+		kmem_free(value, size);
+
+	/* As of Linux 4.7, the kernel get_acl will set this for us */
+#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE
+	if (!IS_ERR(acl))
+		zpl_set_cached_acl(ip, type, acl);
+#endif
+
+	return (acl);
+}
+
+int
+zpl_init_acl(struct inode *ip, struct inode *dir)
+{
+	struct posix_acl *acl = NULL;
+	int error = 0;
+
+	if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+		return (0);
+
+	if (!S_ISLNK(ip->i_mode)) {
+		acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl))
+			return (PTR_ERR(acl));
+		if (!acl) {
+			ip->i_mode &= ~current_umask();
+			ip->i_ctime = current_time(ip);
+			zfs_mark_inode_dirty(ip);
+			return (0);
+		}
+	}
+
+	if (acl) {
+		umode_t mode;
+
+		if (S_ISDIR(ip->i_mode)) {
+			error = zpl_set_acl(ip, acl, ACL_TYPE_DEFAULT);
+			if (error)
+				goto out;
+		}
+
+		mode = ip->i_mode;
+		error = __posix_acl_create(&acl, GFP_KERNEL, &mode);
+		if (error >= 0) {
+			ip->i_mode = mode;
+			zfs_mark_inode_dirty(ip);
+			if (error > 0)
+				error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS);
+		}
+	}
+out:
+	zpl_posix_acl_release(acl);
+
+	return (error);
+}
+
+int
+zpl_chmod_acl(struct inode *ip)
+{
+	struct posix_acl *acl;
+	int error;
+
+	if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+		return (0);
+
+	if (S_ISLNK(ip->i_mode))
+		return (-EOPNOTSUPP);
+
+	acl = zpl_get_acl(ip, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return (PTR_ERR(acl));
+
+	error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode);
+	if (!error)
+		error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS);
+
+	zpl_posix_acl_release(acl);
+
+	return (error);
+}
+
+static int
+__zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size,
+    const char *name, size_t name_len)
+{
+	char *xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
+	size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_ACCESS);
+
+	if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+		return (0);
+
+	if (list && xattr_size <= list_size)
+		memcpy(list, xattr_name, xattr_size);
+
+	return (xattr_size);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_access);
+
+static int
+__zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size,
+    const char *name, size_t name_len)
+{
+	char *xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
+	size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_DEFAULT);
+
+	if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+		return (0);
+
+	if (list && xattr_size <= list_size)
+		memcpy(list, xattr_name, xattr_size);
+
+	return (xattr_size);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_default);
+
+static int
+__zpl_xattr_acl_get_access(struct inode *ip, const char *name,
+    void *buffer, size_t size)
+{
+	struct posix_acl *acl;
+	int type = ACL_TYPE_ACCESS;
+	int error;
+	/* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+	if (strcmp(name, "") != 0)
+		return (-EINVAL);
+#endif
+	if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+		return (-EOPNOTSUPP);
+
+	acl = zpl_get_acl(ip, type);
+	if (IS_ERR(acl))
+		return (PTR_ERR(acl));
+	if (acl == NULL)
+		return (-ENODATA);
+
+	error = zpl_acl_to_xattr(acl, buffer, size);
+	zpl_posix_acl_release(acl);
+
+	return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_access);
+
+static int
+__zpl_xattr_acl_get_default(struct inode *ip, const char *name,
+    void *buffer, size_t size)
+{
+	struct posix_acl *acl;
+	int type = ACL_TYPE_DEFAULT;
+	int error;
+	/* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+	if (strcmp(name, "") != 0)
+		return (-EINVAL);
+#endif
+	if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+		return (-EOPNOTSUPP);
+
+	acl = zpl_get_acl(ip, type);
+	if (IS_ERR(acl))
+		return (PTR_ERR(acl));
+	if (acl == NULL)
+		return (-ENODATA);
+
+	error = zpl_acl_to_xattr(acl, buffer, size);
+	zpl_posix_acl_release(acl);
+
+	return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default);
+
+static int
+__zpl_xattr_acl_set_access(struct inode *ip, const char *name,
+    const void *value, size_t size, int flags)
+{
+	struct posix_acl *acl;
+	int type = ACL_TYPE_ACCESS;
+	int error = 0;
+	/* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+	if (strcmp(name, "") != 0)
+		return (-EINVAL);
+#endif
+	if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+		return (-EOPNOTSUPP);
+
+	if (!inode_owner_or_capable(ip))
+		return (-EPERM);
+
+	if (value) {
+		acl = zpl_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return (PTR_ERR(acl));
+		else if (acl) {
+			error = zpl_posix_acl_valid(ip, acl);
+			if (error) {
+				zpl_posix_acl_release(acl);
+				return (error);
+			}
+		}
+	} else {
+		acl = NULL;
+	}
+
+	error = zpl_set_acl(ip, acl, type);
+	zpl_posix_acl_release(acl);
+
+	return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access);
+
+static int
+__zpl_xattr_acl_set_default(struct inode *ip, const char *name,
+    const void *value, size_t size, int flags)
+{
+	struct posix_acl *acl;
+	int type = ACL_TYPE_DEFAULT;
+	int error = 0;
+	/* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+	if (strcmp(name, "") != 0)
+		return (-EINVAL);
+#endif
+	if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+		return (-EOPNOTSUPP);
+
+	if (!inode_owner_or_capable(ip))
+		return (-EPERM);
+
+	if (value) {
+		acl = zpl_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return (PTR_ERR(acl));
+		else if (acl) {
+			error = zpl_posix_acl_valid(ip, acl);
+			if (error) {
+				zpl_posix_acl_release(acl);
+				return (error);
+			}
+		}
+	} else {
+		acl = NULL;
+	}
+
+	error = zpl_set_acl(ip, acl, type);
+	zpl_posix_acl_release(acl);
+
+	return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_default);
+
+/*
+ * ACL access xattr namespace handlers.
+ *
+ * Use .name instead of .prefix when available. xattr_resolve_name will match
+ * whole name and reject anything that has .name only as prefix.
+ */
+xattr_handler_t zpl_xattr_acl_access_handler =
+{
+#ifdef HAVE_XATTR_HANDLER_NAME
+	.name	= XATTR_NAME_POSIX_ACL_ACCESS,
+#else
+	.prefix	= XATTR_NAME_POSIX_ACL_ACCESS,
+#endif
+	.list	= zpl_xattr_acl_list_access,
+	.get	= zpl_xattr_acl_get_access,
+	.set	= zpl_xattr_acl_set_access,
+#if defined(HAVE_XATTR_LIST_SIMPLE) || \
+    defined(HAVE_XATTR_LIST_DENTRY) || \
+    defined(HAVE_XATTR_LIST_HANDLER)
+	.flags	= ACL_TYPE_ACCESS,
+#endif
+};
+
+/*
+ * ACL default xattr namespace handlers.
+ *
+ * Use .name instead of .prefix when available. xattr_resolve_name will match
+ * whole name and reject anything that has .name only as prefix.
+ */
+xattr_handler_t zpl_xattr_acl_default_handler =
+{
+#ifdef HAVE_XATTR_HANDLER_NAME
+	.name	= XATTR_NAME_POSIX_ACL_DEFAULT,
+#else
+	.prefix	= XATTR_NAME_POSIX_ACL_DEFAULT,
+#endif
+	.list	= zpl_xattr_acl_list_default,
+	.get	= zpl_xattr_acl_get_default,
+	.set	= zpl_xattr_acl_set_default,
+#if defined(HAVE_XATTR_LIST_SIMPLE) || \
+    defined(HAVE_XATTR_LIST_DENTRY) || \
+    defined(HAVE_XATTR_LIST_HANDLER)
+	.flags	= ACL_TYPE_DEFAULT,
+#endif
+};
+
+#endif /* CONFIG_FS_POSIX_ACL */
+
+xattr_handler_t *zpl_xattr_handlers[] = {
+	&zpl_xattr_security_handler,
+	&zpl_xattr_trusted_handler,
+	&zpl_xattr_user_handler,
+#ifdef CONFIG_FS_POSIX_ACL
+	&zpl_xattr_acl_access_handler,
+	&zpl_xattr_acl_default_handler,
+#endif /* CONFIG_FS_POSIX_ACL */
+	NULL
+};
+
+static const struct xattr_handler *
+zpl_xattr_handler(const char *name)
+{
+	if (strncmp(name, XATTR_USER_PREFIX,
+	    XATTR_USER_PREFIX_LEN) == 0)
+		return (&zpl_xattr_user_handler);
+
+	if (strncmp(name, XATTR_TRUSTED_PREFIX,
+	    XATTR_TRUSTED_PREFIX_LEN) == 0)
+		return (&zpl_xattr_trusted_handler);
+
+	if (strncmp(name, XATTR_SECURITY_PREFIX,
+	    XATTR_SECURITY_PREFIX_LEN) == 0)
+		return (&zpl_xattr_security_handler);
+
+#ifdef CONFIG_FS_POSIX_ACL
+	if (strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
+	    sizeof (XATTR_NAME_POSIX_ACL_ACCESS)) == 0)
+		return (&zpl_xattr_acl_access_handler);
+
+	if (strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
+	    sizeof (XATTR_NAME_POSIX_ACL_DEFAULT)) == 0)
+		return (&zpl_xattr_acl_default_handler);
+#endif /* CONFIG_FS_POSIX_ACL */
+
+	return (NULL);
+}
+
+#if !defined(HAVE_POSIX_ACL_RELEASE) || defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY)
+struct acl_rel_struct {
+	struct acl_rel_struct *next;
+	struct posix_acl *acl;
+	clock_t time;
+};
+
+#define	ACL_REL_GRACE	(60*HZ)
+#define	ACL_REL_WINDOW	(1*HZ)
+#define	ACL_REL_SCHED	(ACL_REL_GRACE+ACL_REL_WINDOW)
+
+/*
+ * Lockless multi-producer single-consumer fifo list.
+ * Nodes are added to tail and removed from head. Tail pointer is our
+ * synchronization point. It always points to the next pointer of the last
+ * node, or head if list is empty.
+ */
+static struct acl_rel_struct *acl_rel_head = NULL;
+static struct acl_rel_struct **acl_rel_tail = &acl_rel_head;
+
+static void
+zpl_posix_acl_free(void *arg)
+{
+	struct acl_rel_struct *freelist = NULL;
+	struct acl_rel_struct *a;
+	clock_t new_time;
+	boolean_t refire = B_FALSE;
+
+	ASSERT3P(acl_rel_head, !=, NULL);
+	while (acl_rel_head) {
+		a = acl_rel_head;
+		if (ddi_get_lbolt() - a->time >= ACL_REL_GRACE) {
+			/*
+			 * If a is the last node we need to reset tail, but we
+			 * need to use cmpxchg to make sure it is still the
+			 * last node.
+			 */
+			if (acl_rel_tail == &a->next) {
+				acl_rel_head = NULL;
+				if (cmpxchg(&acl_rel_tail, &a->next,
+				    &acl_rel_head) == &a->next) {
+					ASSERT3P(a->next, ==, NULL);
+					a->next = freelist;
+					freelist = a;
+					break;
+				}
+			}
+			/*
+			 * a is not last node, make sure next pointer is set
+			 * by the adder and advance the head.
+			 */
+			while (READ_ONCE(a->next) == NULL)
+				cpu_relax();
+			acl_rel_head = a->next;
+			a->next = freelist;
+			freelist = a;
+		} else {
+			/*
+			 * a is still in grace period. We are responsible to
+			 * reschedule the free task, since adder will only do
+			 * so if list is empty.
+			 */
+			new_time = a->time + ACL_REL_SCHED;
+			refire = B_TRUE;
+			break;
+		}
+	}
+
+	if (refire)
+		taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free,
+		    NULL, TQ_SLEEP, new_time);
+
+	while (freelist) {
+		a = freelist;
+		freelist = a->next;
+		kfree(a->acl);
+		kmem_free(a, sizeof (struct acl_rel_struct));
+	}
+}
+
+void
+zpl_posix_acl_release_impl(struct posix_acl *acl)
+{
+	struct acl_rel_struct *a, **prev;
+
+	a = kmem_alloc(sizeof (struct acl_rel_struct), KM_SLEEP);
+	a->next = NULL;
+	a->acl = acl;
+	a->time = ddi_get_lbolt();
+	/* atomically points tail to us and get the previous tail */
+	prev = xchg(&acl_rel_tail, &a->next);
+	ASSERT3P(*prev, ==, NULL);
+	*prev = a;
+	/* if it was empty before, schedule the free task */
+	if (prev == &acl_rel_head)
+		taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free,
+		    NULL, TQ_SLEEP, ddi_get_lbolt() + ACL_REL_SCHED);
+}
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
new file mode 100644
index 000000000000..218e1101edf8
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -0,0 +1,1125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/dataset_kstats.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zil_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/zio.h>
+#include <sys/zfs_rlock.h>
+#include <sys/spa_impl.h>
+#include <sys/zvol.h>
+#include <sys/zvol_impl.h>
+
+#include <linux/blkdev_compat.h>
+#include <linux/task_io_accounting_ops.h>
+
+unsigned int zvol_major = ZVOL_MAJOR;
+unsigned int zvol_request_sync = 0;
+unsigned int zvol_prefetch_bytes = (128 * 1024);
+unsigned long zvol_max_discard_blocks = 16384;
+unsigned int zvol_threads = 32;
+
+struct zvol_state_os {
+	struct gendisk		*zvo_disk;	/* generic disk */
+	struct request_queue	*zvo_queue;	/* request queue */
+	dev_t			zvo_dev;	/* device id */
+};
+
+taskq_t *zvol_taskq;
+static struct ida zvol_ida;
+
+typedef struct zv_request {
+	zvol_state_t	*zv;
+	struct bio	*bio;
+	taskq_ent_t	ent;
+} zv_request_t;
+
+/*
+ * Given a path, return TRUE if path is a ZVOL.
+ */
+static boolean_t
+zvol_is_zvol_impl(const char *device)
+{
+	struct block_device *bdev;
+	unsigned int major;
+
+	bdev = vdev_lookup_bdev(device);
+	if (IS_ERR(bdev))
+		return (B_FALSE);
+
+	major = MAJOR(bdev->bd_dev);
+	bdput(bdev);
+
+	if (major == zvol_major)
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+static void
+uio_from_bio(uio_t *uio, struct bio *bio)
+{
+	uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
+	uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
+	uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;
+	uio->uio_segflg = UIO_BVEC;
+	uio->uio_limit = MAXOFFSET_T;
+	uio->uio_resid = BIO_BI_SIZE(bio);
+	uio->uio_skip = BIO_BI_SKIP(bio);
+}
+
+static void
+zvol_write(void *arg)
+{
+	int error = 0;
+
+	zv_request_t *zvr = arg;
+	struct bio *bio = zvr->bio;
+	uio_t uio = { { 0 }, 0 };
+	uio_from_bio(&uio, bio);
+
+	zvol_state_t *zv = zvr->zv;
+	ASSERT(zv && zv->zv_open_count > 0);
+	ASSERT(zv->zv_zilog != NULL);
+
+	/* bio marked as FLUSH need to flush before write */
+	if (bio_is_flush(bio))
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+	/* Some requests are just for flush and nothing else. */
+	if (uio.uio_resid == 0) {
+		rw_exit(&zv->zv_suspend_lock);
+		BIO_END_IO(bio, 0);
+		kmem_free(zvr, sizeof (zv_request_t));
+		return;
+	}
+
+	ssize_t start_resid = uio.uio_resid;
+	unsigned long start_jif = jiffies;
+	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
+	    bio_sectors(bio), &zv->zv_zso->zvo_disk->part0);
+
+	boolean_t sync =
+	    bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+
+	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
+	    uio.uio_loffset, uio.uio_resid, RL_WRITER);
+
+	uint64_t volsize = zv->zv_volsize;
+	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
+		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
+		uint64_t off = uio.uio_loffset;
+		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+
+		if (bytes > volsize - off)	/* don't write past the end */
+			bytes = volsize - off;
+
+		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
+
+		/* This will only fail for ENOSPC */
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			break;
+		}
+		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+		if (error == 0) {
+			zvol_log_write(zv, tx, off, bytes, sync);
+		}
+		dmu_tx_commit(tx);
+
+		if (error)
+			break;
+	}
+	zfs_rangelock_exit(lr);
+
+	int64_t nwritten = start_resid - uio.uio_resid;
+	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
+	task_io_account_write(nwritten);
+
+	if (sync)
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+	rw_exit(&zv->zv_suspend_lock);
+	blk_generic_end_io_acct(zv->zv_zso->zvo_queue,
+	    WRITE, &zv->zv_zso->zvo_disk->part0, start_jif);
+	BIO_END_IO(bio, -error);
+	kmem_free(zvr, sizeof (zv_request_t));
+}
+
+static void
+zvol_discard(void *arg)
+{
+	zv_request_t *zvr = arg;
+	struct bio *bio = zvr->bio;
+	zvol_state_t *zv = zvr->zv;
+	uint64_t start = BIO_BI_SECTOR(bio) << 9;
+	uint64_t size = BIO_BI_SIZE(bio);
+	uint64_t end = start + size;
+	boolean_t sync;
+	int error = 0;
+	dmu_tx_t *tx;
+	unsigned long start_jif;
+
+	ASSERT(zv && zv->zv_open_count > 0);
+	ASSERT(zv->zv_zilog != NULL);
+
+	start_jif = jiffies;
+	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
+	    bio_sectors(bio), &zv->zv_zso->zvo_disk->part0);
+
+	sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+
+	if (end > zv->zv_volsize) {
+		error = SET_ERROR(EIO);
+		goto unlock;
+	}
+
+	/*
+	 * Align the request to volume block boundaries when a secure erase is
+	 * not required.  This will prevent dnode_free_range() from zeroing out
+	 * the unaligned parts which is slow (read-modify-write) and useless
+	 * since we are not freeing any space by doing so.
+	 */
+	if (!bio_is_secure_erase(bio)) {
+		start = P2ROUNDUP(start, zv->zv_volblocksize);
+		end = P2ALIGN(end, zv->zv_volblocksize);
+		size = end - start;
+	}
+
+	if (start >= end)
+		goto unlock;
+
+	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
+	    start, size, RL_WRITER);
+
+	tx = dmu_tx_create(zv->zv_objset);
+	dmu_tx_mark_netfree(tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error != 0) {
+		dmu_tx_abort(tx);
+	} else {
+		zvol_log_truncate(zv, tx, start, size, B_TRUE);
+		dmu_tx_commit(tx);
+		error = dmu_free_long_range(zv->zv_objset,
+		    ZVOL_OBJ, start, size);
+	}
+	zfs_rangelock_exit(lr);
+
+	if (error == 0 && sync)
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+unlock:
+	rw_exit(&zv->zv_suspend_lock);
+	blk_generic_end_io_acct(zv->zv_zso->zvo_queue, WRITE,
+	    &zv->zv_zso->zvo_disk->part0, start_jif);
+	BIO_END_IO(bio, -error);
+	kmem_free(zvr, sizeof (zv_request_t));
+}
+
+static void
+zvol_read(void *arg)
+{
+	int error = 0;
+
+	zv_request_t *zvr = arg;
+	struct bio *bio = zvr->bio;
+	uio_t uio = { { 0 }, 0 };
+	uio_from_bio(&uio, bio);
+
+	zvol_state_t *zv = zvr->zv;
+	ASSERT(zv && zv->zv_open_count > 0);
+
+	ssize_t start_resid = uio.uio_resid;
+	unsigned long start_jif = jiffies;
+	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, READ, bio_sectors(bio),
+	    &zv->zv_zso->zvo_disk->part0);
+
+	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
+	    uio.uio_loffset, uio.uio_resid, RL_READER);
+
+	uint64_t volsize = zv->zv_volsize;
+	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
+		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
+
+		/* don't read past the end */
+		if (bytes > volsize - uio.uio_loffset)
+			bytes = volsize - uio.uio_loffset;
+
+		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
+		if (error) {
+			/* convert checksum errors into IO errors */
+			if (error == ECKSUM)
+				error = SET_ERROR(EIO);
+			break;
+		}
+	}
+	zfs_rangelock_exit(lr);
+
+	int64_t nread = start_resid - uio.uio_resid;
+	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
+	task_io_account_read(nread);
+
+	rw_exit(&zv->zv_suspend_lock);
+	blk_generic_end_io_acct(zv->zv_zso->zvo_queue, READ,
+	    &zv->zv_zso->zvo_disk->part0, start_jif);
+	BIO_END_IO(bio, -error);
+	kmem_free(zvr, sizeof (zv_request_t));
+}
+
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+static blk_qc_t
+zvol_submit_bio(struct bio *bio)
+#else
+static MAKE_REQUEST_FN_RET
+zvol_request(struct request_queue *q, struct bio *bio)
+#endif
+{
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+	struct request_queue *q = bio->bi_disk->queue;
+#endif
+	zvol_state_t *zv = q->queuedata;
+	fstrans_cookie_t cookie = spl_fstrans_mark();
+	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+	uint64_t size = BIO_BI_SIZE(bio);
+	int rw = bio_data_dir(bio);
+	zv_request_t *zvr;
+
+	if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
+		printk(KERN_INFO
+		    "%s: bad access: offset=%llu, size=%lu\n",
+		    zv->zv_zso->zvo_disk->disk_name,
+		    (long long unsigned)offset,
+		    (long unsigned)size);
+
+		BIO_END_IO(bio, -SET_ERROR(EIO));
+		goto out;
+	}
+
+	if (rw == WRITE) {
+		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
+			BIO_END_IO(bio, -SET_ERROR(EROFS));
+			goto out;
+		}
+
+		/*
+		 * Prevents the zvol from being suspended, or the ZIL being
+		 * concurrently opened.  Will be released after the i/o
+		 * completes.
+		 */
+		rw_enter(&zv->zv_suspend_lock, RW_READER);
+
+		/*
+		 * Open a ZIL if this is the first time we have written to this
+		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
+		 * than zv_state_lock so that we don't need to acquire an
+		 * additional lock in this path.
+		 */
+		if (zv->zv_zilog == NULL) {
+			rw_exit(&zv->zv_suspend_lock);
+			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+			if (zv->zv_zilog == NULL) {
+				zv->zv_zilog = zil_open(zv->zv_objset,
+				    zvol_get_data);
+				zv->zv_flags |= ZVOL_WRITTEN_TO;
+			}
+			rw_downgrade(&zv->zv_suspend_lock);
+		}
+
+		zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
+		zvr->zv = zv;
+		zvr->bio = bio;
+		taskq_init_ent(&zvr->ent);
+
+		/*
+		 * We don't want this thread to be blocked waiting for i/o to
+		 * complete, so we instead wait from a taskq callback. The
+		 * i/o may be a ZIL write (via zil_commit()), or a read of an
+		 * indirect block, or a read of a data block (if this is a
+		 * partial-block write).  We will indicate that the i/o is
+		 * complete by calling BIO_END_IO() from the taskq callback.
+		 *
+		 * This design allows the calling thread to continue and
+		 * initiate more concurrent operations by calling
+		 * zvol_request() again. There are typically only a small
+		 * number of threads available to call zvol_request() (e.g.
+		 * one per iSCSI target), so keeping the latency of
+		 * zvol_request() low is important for performance.
+		 *
+		 * The zvol_request_sync module parameter allows this
+		 * behavior to be altered, for performance evaluation
+		 * purposes.  If the callback blocks, setting
+		 * zvol_request_sync=1 will result in much worse performance.
+		 *
+		 * We can have up to zvol_threads concurrent i/o's being
+		 * processed for all zvols on the system.  This is typically
+		 * a vast improvement over the zvol_request_sync=1 behavior
+		 * of one i/o at a time per zvol.  However, an even better
+		 * design would be for zvol_request() to initiate the zio
+		 * directly, and then be notified by the zio_done callback,
+		 * which would call BIO_END_IO().  Unfortunately, the DMU/ZIL
+		 * interfaces lack this functionality (they block waiting for
+		 * the i/o to complete).
+		 */
+		if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
+			if (zvol_request_sync) {
+				zvol_discard(zvr);
+			} else {
+				taskq_dispatch_ent(zvol_taskq,
+				    zvol_discard, zvr, 0, &zvr->ent);
+			}
+		} else {
+			if (zvol_request_sync) {
+				zvol_write(zvr);
+			} else {
+				taskq_dispatch_ent(zvol_taskq,
+				    zvol_write, zvr, 0, &zvr->ent);
+			}
+		}
+	} else {
+		/*
+		 * The SCST driver, and possibly others, may issue READ I/Os
+		 * with a length of zero bytes.  These empty I/Os contain no
+		 * data and require no additional handling.
+		 */
+		if (size == 0) {
+			BIO_END_IO(bio, 0);
+			goto out;
+		}
+
+		zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
+		zvr->zv = zv;
+		zvr->bio = bio;
+		taskq_init_ent(&zvr->ent);
+
+		rw_enter(&zv->zv_suspend_lock, RW_READER);
+
+		/* See comment in WRITE case above. */
+		if (zvol_request_sync) {
+			zvol_read(zvr);
+		} else {
+			taskq_dispatch_ent(zvol_taskq,
+			    zvol_read, zvr, 0, &zvr->ent);
+		}
+	}
+
+out:
+	spl_fstrans_unmark(cookie);
+#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
+	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
+	return (BLK_QC_T_NONE);
+#endif
+}
+
+static int
+zvol_open(struct block_device *bdev, fmode_t flag)
+{
+	zvol_state_t *zv;
+	int error = 0;
+	boolean_t drop_suspend = B_TRUE;
+
+	rw_enter(&zvol_state_lock, RW_READER);
+	/*
+	 * Obtain a copy of private_data under the zvol_state_lock to make
+	 * sure that either the result of zvol free code path setting
+	 * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
+	 * is not called on this zv because of the positive zv_open_count.
+	 */
+	zv = bdev->bd_disk->private_data;
+	if (zv == NULL) {
+		rw_exit(&zvol_state_lock);
+		return (SET_ERROR(-ENXIO));
+	}
+
+	mutex_enter(&zv->zv_state_lock);
+	/*
+	 * make sure zvol is not suspended during first open
+	 * (hold zv_suspend_lock) and respect proper lock acquisition
+	 * ordering - zv_suspend_lock before zv_state_lock
+	 */
+	if (zv->zv_open_count == 0) {
+		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+			mutex_exit(&zv->zv_state_lock);
+			rw_enter(&zv->zv_suspend_lock, RW_READER);
+			mutex_enter(&zv->zv_state_lock);
+			/* check to see if zv_suspend_lock is needed */
+			if (zv->zv_open_count != 0) {
+				rw_exit(&zv->zv_suspend_lock);
+				drop_suspend = B_FALSE;
+			}
+		}
+	} else {
+		drop_suspend = B_FALSE;
+	}
+	rw_exit(&zvol_state_lock);
+
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+	ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock));
+
+	if (zv->zv_open_count == 0) {
+		error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
+		if (error)
+			goto out_mutex;
+	}
+
+	if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+		error = -EROFS;
+		goto out_open_count;
+	}
+
+	zv->zv_open_count++;
+
+	mutex_exit(&zv->zv_state_lock);
+	if (drop_suspend)
+		rw_exit(&zv->zv_suspend_lock);
+
+	check_disk_change(bdev);
+
+	return (0);
+
+out_open_count:
+	if (zv->zv_open_count == 0)
+		zvol_last_close(zv);
+
+out_mutex:
+	mutex_exit(&zv->zv_state_lock);
+	if (drop_suspend)
+		rw_exit(&zv->zv_suspend_lock);
+	if (error == -EINTR) {
+		error = -ERESTARTSYS;
+		schedule();
+	}
+	return (SET_ERROR(error));
+}
+
+static void
+zvol_release(struct gendisk *disk, fmode_t mode)
+{
+	zvol_state_t *zv;
+	boolean_t drop_suspend = B_TRUE;
+
+	rw_enter(&zvol_state_lock, RW_READER);
+	zv = disk->private_data;
+
+	mutex_enter(&zv->zv_state_lock);
+	ASSERT(zv->zv_open_count > 0);
+	/*
+	 * make sure zvol is not suspended during last close
+	 * (hold zv_suspend_lock) and respect proper lock acquisition
+	 * ordering - zv_suspend_lock before zv_state_lock
+	 */
+	if (zv->zv_open_count == 1) {
+		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+			mutex_exit(&zv->zv_state_lock);
+			rw_enter(&zv->zv_suspend_lock, RW_READER);
+			mutex_enter(&zv->zv_state_lock);
+			/* check to see if zv_suspend_lock is needed */
+			if (zv->zv_open_count != 1) {
+				rw_exit(&zv->zv_suspend_lock);
+				drop_suspend = B_FALSE;
+			}
+		}
+	} else {
+		drop_suspend = B_FALSE;
+	}
+	rw_exit(&zvol_state_lock);
+
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+	ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock));
+
+	zv->zv_open_count--;
+	if (zv->zv_open_count == 0)
+		zvol_last_close(zv);
+
+	mutex_exit(&zv->zv_state_lock);
+
+	if (drop_suspend)
+		rw_exit(&zv->zv_suspend_lock);
+}
+
+static int
+zvol_ioctl(struct block_device *bdev, fmode_t mode,
+    unsigned int cmd, unsigned long arg)
+{
+	zvol_state_t *zv = bdev->bd_disk->private_data;
+	int error = 0;
+
+	ASSERT3U(zv->zv_open_count, >, 0);
+
+	switch (cmd) {
+	case BLKFLSBUF:
+		fsync_bdev(bdev);
+		invalidate_bdev(bdev);
+		rw_enter(&zv->zv_suspend_lock, RW_READER);
+
+		if (!(zv->zv_flags & ZVOL_RDONLY))
+			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+
+		rw_exit(&zv->zv_suspend_lock);
+		break;
+
+	case BLKZNAME:
+		mutex_enter(&zv->zv_state_lock);
+		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
+		mutex_exit(&zv->zv_state_lock);
+		break;
+
+	default:
+		error = -ENOTTY;
+		break;
+	}
+
+	return (SET_ERROR(error));
+}
+
+#ifdef CONFIG_COMPAT
+static int
+zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
+    unsigned cmd, unsigned long arg)
+{
+	return (zvol_ioctl(bdev, mode, cmd, arg));
+}
+#else
+#define	zvol_compat_ioctl	NULL
+#endif
+
+static unsigned int
+zvol_check_events(struct gendisk *disk, unsigned int clearing)
+{
+	unsigned int mask = 0;
+
+	rw_enter(&zvol_state_lock, RW_READER);
+
+	zvol_state_t *zv = disk->private_data;
+	if (zv != NULL) {
+		mutex_enter(&zv->zv_state_lock);
+		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
+		zv->zv_changed = 0;
+		mutex_exit(&zv->zv_state_lock);
+	}
+
+	rw_exit(&zvol_state_lock);
+
+	return (mask);
+}
+
+static int
+zvol_revalidate_disk(struct gendisk *disk)
+{
+	rw_enter(&zvol_state_lock, RW_READER);
+
+	zvol_state_t *zv = disk->private_data;
+	if (zv != NULL) {
+		mutex_enter(&zv->zv_state_lock);
+		set_capacity(zv->zv_zso->zvo_disk,
+		    zv->zv_volsize >> SECTOR_BITS);
+		mutex_exit(&zv->zv_state_lock);
+	}
+
+	rw_exit(&zvol_state_lock);
+
+	return (0);
+}
+
+static int
+zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
+{
+
+	revalidate_disk(zv->zv_zso->zvo_disk);
+	return (0);
+}
+
+static void
+zvol_clear_private(zvol_state_t *zv)
+{
+	/*
+	 * Cleared while holding zvol_state_lock as a writer
+	 * which will prevent zvol_open() from opening it.
+	 */
+	zv->zv_zso->zvo_disk->private_data = NULL;
+}
+
+/*
+ * Provide a simple virtual geometry for legacy compatibility.  For devices
+ * smaller than 1 MiB a small head and sector count is used to allow very
+ * tiny devices.  For devices over 1 Mib a standard head and sector count
+ * is used to keep the cylinders count reasonable.
+ */
+static int
+zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	zvol_state_t *zv = bdev->bd_disk->private_data;
+	sector_t sectors;
+
+	ASSERT3U(zv->zv_open_count, >, 0);
+
+	sectors = get_capacity(zv->zv_zso->zvo_disk);
+
+	if (sectors > 2048) {
+		geo->heads = 16;
+		geo->sectors = 63;
+	} else {
+		geo->heads = 2;
+		geo->sectors = 4;
+	}
+
+	geo->start = 0;
+	geo->cylinders = sectors / (geo->heads * geo->sectors);
+
+	return (0);
+}
+
+/*
+ * Find a zvol_state_t given the full major+minor dev_t. If found,
+ * return with zv_state_lock taken, otherwise, return (NULL) without
+ * taking zv_state_lock.
+ */
+static zvol_state_t *
+zvol_find_by_dev(dev_t dev)
+{
+	zvol_state_t *zv;
+
+	rw_enter(&zvol_state_lock, RW_READER);
+	for (zv = list_head(&zvol_state_list); zv != NULL;
+	    zv = list_next(&zvol_state_list, zv)) {
+		mutex_enter(&zv->zv_state_lock);
+		if (zv->zv_zso->zvo_dev == dev) {
+			rw_exit(&zvol_state_lock);
+			return (zv);
+		}
+		mutex_exit(&zv->zv_state_lock);
+	}
+	rw_exit(&zvol_state_lock);
+
+	return (NULL);
+}
+
+static struct kobject *
+zvol_probe(dev_t dev, int *part, void *arg)
+{
+	zvol_state_t *zv;
+	struct kobject *kobj;
+
+	zv = zvol_find_by_dev(dev);
+	kobj = zv ? get_disk_and_module(zv->zv_zso->zvo_disk) : NULL;
+	ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock));
+	if (zv)
+		mutex_exit(&zv->zv_state_lock);
+
+	return (kobj);
+}
+
+static struct block_device_operations zvol_ops = {
+	.open			= zvol_open,
+	.release		= zvol_release,
+	.ioctl			= zvol_ioctl,
+	.compat_ioctl		= zvol_compat_ioctl,
+	.check_events		= zvol_check_events,
+	.revalidate_disk	= zvol_revalidate_disk,
+	.getgeo			= zvol_getgeo,
+	.owner			= THIS_MODULE,
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+    .submit_bio		= zvol_submit_bio,
+#endif
+};
+
+/*
+ * Allocate memory for a new zvol_state_t and setup the required
+ * request queue and generic disk structures for the block device.
+ */
+static zvol_state_t *
+zvol_alloc(dev_t dev, const char *name)
+{
+	zvol_state_t *zv;
+	struct zvol_state_os *zso;
+	uint64_t volmode;
+
+	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
+		return (NULL);
+
+	if (volmode == ZFS_VOLMODE_DEFAULT)
+		volmode = zvol_volmode;
+
+	if (volmode == ZFS_VOLMODE_NONE)
+		return (NULL);
+
+	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
+	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
+	zv->zv_zso = zso;
+
+	list_link_init(&zv->zv_next);
+	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
+#else
+	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
+#endif
+	if (zso->zvo_queue == NULL)
+		goto out_kmem;
+
+	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);
+
+	/* Limit read-ahead to a single page to prevent over-prefetching. */
+	blk_queue_set_read_ahead(zso->zvo_queue, 1);
+
+	/* Disable write merging in favor of the ZIO pipeline. */
+	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
+
+	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+	if (zso->zvo_disk == NULL)
+		goto out_queue;
+
+	zso->zvo_queue->queuedata = zv;
+	zso->zvo_dev = dev;
+	zv->zv_open_count = 0;
+	strlcpy(zv->zv_name, name, MAXNAMELEN);
+
+	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
+	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+
+	zso->zvo_disk->major = zvol_major;
+	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;
+
+	if (volmode == ZFS_VOLMODE_DEV) {
+		/*
+		 * ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set
+		 * gendisk->minors = 1 as noted in include/linux/genhd.h.
+		 * Also disable extended partition numbers (GENHD_FL_EXT_DEVT)
+		 * and suppresses partition scanning (GENHD_FL_NO_PART_SCAN)
+		 * setting gendisk->flags accordingly.
+		 */
+		zso->zvo_disk->minors = 1;
+#if defined(GENHD_FL_EXT_DEVT)
+		zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
+#endif
+#if defined(GENHD_FL_NO_PART_SCAN)
+		zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN;
+#endif
+	}
+	zso->zvo_disk->first_minor = (dev & MINORMASK);
+	zso->zvo_disk->fops = &zvol_ops;
+	zso->zvo_disk->private_data = zv;
+	zso->zvo_disk->queue = zso->zvo_queue;
+	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
+	    ZVOL_DEV_NAME, (dev & MINORMASK));
+
+	return (zv);
+
+out_queue:
+	blk_cleanup_queue(zso->zvo_queue);
+out_kmem:
+	kmem_free(zso, sizeof (struct zvol_state_os));
+	kmem_free(zv, sizeof (zvol_state_t));
+	return (NULL);
+}
+
+/*
+ * Cleanup then free a zvol_state_t which was created by zvol_alloc().
+ * At this time, the structure is not opened by anyone, is taken off
+ * the zvol_state_list, and has its private data set to NULL.
+ * The zvol_state_lock is dropped.
+ *
+ * This function may take many milliseconds to complete (e.g. we've seen
+ * it take over 256ms), due to the calls to "blk_cleanup_queue" and
+ * "del_gendisk". Thus, consumers need to be careful to account for this
+ * latency when calling this function.
+ */
+static void
+zvol_free(zvol_state_t *zv)
+{
+
+	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+	ASSERT(zv->zv_open_count == 0);
+	ASSERT(zv->zv_zso->zvo_disk->private_data == NULL);
+
+	rw_destroy(&zv->zv_suspend_lock);
+	zfs_rangelock_fini(&zv->zv_rangelock);
+
+	del_gendisk(zv->zv_zso->zvo_disk);
+	blk_cleanup_queue(zv->zv_zso->zvo_queue);
+	put_disk(zv->zv_zso->zvo_disk);
+
+	ida_simple_remove(&zvol_ida,
+	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
+
+	mutex_destroy(&zv->zv_state_lock);
+	dataset_kstats_destroy(&zv->zv_kstat);
+
+	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+	kmem_free(zv, sizeof (zvol_state_t));
+}
+
+/*
+ * Create a block device minor node and setup the linkage between it
+ * and the specified volume.  Once this function returns the block
+ * device is live and ready for use.
+ */
+static int
+zvol_os_create_minor(const char *name)
+{
+	zvol_state_t *zv;
+	objset_t *os;
+	dmu_object_info_t *doi;
+	uint64_t volsize;
+	uint64_t len;
+	unsigned minor = 0;
+	int error = 0;
+	int idx;
+	uint64_t hash = zvol_name_hash(name);
+
+	if (zvol_inhibit_dev)
+		return (0);
+
+	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
+	if (idx < 0)
+		return (SET_ERROR(-idx));
+	minor = idx << ZVOL_MINOR_BITS;
+
+	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
+	if (zv) {
+		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+		mutex_exit(&zv->zv_state_lock);
+		ida_simple_remove(&zvol_ida, idx);
+		return (SET_ERROR(EEXIST));
+	}
+
+	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+
+	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
+	if (error)
+		goto out_doi;
+
+	error = dmu_object_info(os, ZVOL_OBJ, doi);
+	if (error)
+		goto out_dmu_objset_disown;
+
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+	if (error)
+		goto out_dmu_objset_disown;
+
+	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
+	if (zv == NULL) {
+		error = SET_ERROR(EAGAIN);
+		goto out_dmu_objset_disown;
+	}
+	zv->zv_hash = hash;
+
+	if (dmu_objset_is_snapshot(os))
+		zv->zv_flags |= ZVOL_RDONLY;
+
+	zv->zv_volblocksize = doi->doi_data_block_size;
+	zv->zv_volsize = volsize;
+	zv->zv_objset = os;
+
+	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
+
+	blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
+	    (DMU_MAX_ACCESS / 4) >> 9);
+	blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
+	blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
+	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
+	    zv->zv_volblocksize);
+	blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
+	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
+	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
+	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
+	    zv->zv_volblocksize);
+	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
+#ifdef QUEUE_FLAG_NONROT
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
+#endif
+#ifdef QUEUE_FLAG_ADD_RANDOM
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
+#endif
+	/* This flag was introduced in kernel version 4.12. */
+#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
+	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
+#endif
+
+	if (spa_writeable(dmu_objset_spa(os))) {
+		if (zil_replay_disable)
+			zil_destroy(dmu_objset_zil(os), B_FALSE);
+		else
+			zil_replay(os, zv, zvol_replay_vector);
+	}
+	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
+
+	/*
+	 * When udev detects the addition of the device it will immediately
+	 * invoke blkid(8) to determine the type of content on the device.
+	 * Prefetching the blocks commonly scanned by blkid(8) will speed
+	 * up this process.
+	 */
+	len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
+	if (len > 0) {
+		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
+		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
+		    ZIO_PRIORITY_SYNC_READ);
+	}
+
+	zv->zv_objset = NULL;
+out_dmu_objset_disown:
+	dmu_objset_disown(os, B_TRUE, FTAG);
+out_doi:
+	kmem_free(doi, sizeof (dmu_object_info_t));
+
+	/*
+	 * Keep in mind that once add_disk() is called, the zvol is
+	 * announced to the world, and zvol_open()/zvol_release() can
+	 * be called at any time. Incidentally, add_disk() itself calls
+	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
+	 * directly as well.
+	 */
+	if (error == 0) {
+		rw_enter(&zvol_state_lock, RW_WRITER);
+		zvol_insert(zv);
+		rw_exit(&zvol_state_lock);
+		add_disk(zv->zv_zso->zvo_disk);
+	} else {
+		ida_simple_remove(&zvol_ida, idx);
+	}
+
+	return (error);
+}
+
+static void
+zvol_rename_minor(zvol_state_t *zv, const char *newname)
+{
+	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
+
+	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
+
+	/* move to new hashtable entry  */
+	zv->zv_hash = zvol_name_hash(zv->zv_name);
+	hlist_del(&zv->zv_hlink);
+	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
+
+	/*
+	 * The block device's read-only state is briefly changed causing
+	 * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
+	 * the name change and fixes the symlinks.  This does not change
+	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
+	 * changes.  This would normally be done using kobject_uevent() but
+	 * that is a GPL-only symbol which is why we need this workaround.
+	 */
+	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
+	set_disk_ro(zv->zv_zso->zvo_disk, readonly);
+}
+
+static void
+zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
+{
+
+	set_disk_ro(zv->zv_zso->zvo_disk, flags);
+}
+
+static void
+zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
+{
+
+	set_capacity(zv->zv_zso->zvo_disk, capacity);
+}
+
+const static zvol_platform_ops_t zvol_linux_ops = {
+	.zv_free = zvol_free,
+	.zv_rename_minor = zvol_rename_minor,
+	.zv_create_minor = zvol_os_create_minor,
+	.zv_update_volsize = zvol_update_volsize,
+	.zv_clear_private = zvol_clear_private,
+	.zv_is_zvol = zvol_is_zvol_impl,
+	.zv_set_disk_ro = zvol_set_disk_ro_impl,
+	.zv_set_capacity = zvol_set_capacity_impl,
+};
+
+int
+zvol_init(void)
+{
+	int error;
+	int threads = MIN(MAX(zvol_threads, 1), 1024);
+
+	error = register_blkdev(zvol_major, ZVOL_DRIVER);
+	if (error) {
+		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
+		return (error);
+	}
+	zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
+	    threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+	if (zvol_taskq == NULL) {
+		unregister_blkdev(zvol_major, ZVOL_DRIVER);
+		return (-ENOMEM);
+	}
+	zvol_init_impl();
+	blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
+	    THIS_MODULE, zvol_probe, NULL, NULL);
+
+	ida_init(&zvol_ida);
+	zvol_register_ops(&zvol_linux_ops);
+	return (0);
+}
+
+void
+zvol_fini(void)
+{
+	zvol_fini_impl();
+	blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
+	unregister_blkdev(zvol_major, ZVOL_DRIVER);
+	taskq_destroy(zvol_taskq);
+	ida_destroy(&zvol_ida);
+}
+
+/* BEGIN CSTYLED */
+module_param(zvol_inhibit_dev, uint, 0644);
+MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
+
+module_param(zvol_major, uint, 0444);
+MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
+
+module_param(zvol_threads, uint, 0444);
+MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
+
+module_param(zvol_request_sync, uint, 0644);
+MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
+
+module_param(zvol_max_discard_blocks, ulong, 0444);
+MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
+
+module_param(zvol_prefetch_bytes, uint, 0644);
+MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
+
+module_param(zvol_volmode, uint, 0644);
+MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
+/* END CSTYLED */