Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs')
110 files changed, 27710 insertions, 9431 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c index dd2aa82304ab..d9eb88a40202 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c @@ -20,7 +20,7 @@ */ /* Portions Copyright 2007 Shivakumar GN */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +35,7 @@ #include <sys/mutex.h> #include <sys/sysmacros.h> #include <sys/systm.h> +#include <sys/sunddi.h> #include <sys/uio.h> #include <sys/vfs.h> #include <sys/vnode.h> @@ -60,7 +61,7 @@ * * These routines are designed to play a support role for existing * pseudo-filesystems (such as procfs). They simplify common tasks, - * without enforcing the filesystem to hand over management to GFS. The + * without forcing the filesystem to hand over management to GFS. The * routines covered are: * * gfs_readdir_init() @@ -116,6 +117,42 @@ */ /* + * gfs_get_parent_ino: used to obtain a parent inode number and the + * inode number of the given vnode in preparation for calling gfs_readdir_init. + */ +int +gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct, + ino64_t *pino, ino64_t *ino) +{ + vnode_t *parent; + gfs_dir_t *dp = dvp->v_data; + int error; + + *ino = dp->gfsd_file.gfs_ino; + parent = dp->gfsd_file.gfs_parent; + + if (parent == NULL) { + *pino = *ino; /* root of filesystem */ + } else if (dvp->v_flag & V_XATTRDIR) { +#ifdef TODO + vattr_t va; + + va.va_mask = AT_NODEID; + error = VOP_GETATTR(parent, &va, 0, cr, ct); + if (error) + return (error); + *pino = va.va_nodeid; +#else + panic("%s:%u: not implemented", __func__, __LINE__); +#endif + } else { + *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino; + } + + return (0); +} + +/* * gfs_readdir_init: initiate a generic readdir * st - a pointer to an uninitialized gfs_readdir_state_t structure * name_max - the directory's maximum file name length @@ -123,6 +160,7 @@ * uiop - the uiop passed to readdir * parent - the parent directory's inode * self - this directory's inode + * flags - flags from VOP_READDIR * * Returns 0 or a non-zero errno. 
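/*
 * [Editor's sketch, not part of this changeset.]  Intended call sequence
 * for the new gfs_get_parent_ino() together with the extended
 * gfs_readdir_init(); "myfs_readdir" and MYFS_NAME_MAX are hypothetical.
 */
static int
myfs_readdir(vnode_t *dvp, uio_t *uiop, cred_t *cr, int *eofp, int flags)
{
	gfs_readdir_state_t gstate;
	ino64_t pino, ino;
	int error;

	/* Resolve this directory's inode number and its parent's. */
	error = gfs_get_parent_ino(dvp, cr, NULL, &pino, &ino);
	if (error != 0)
		return (error);

	/*
	 * The VOP_READDIR flags are now passed through so the scratch
	 * entry can be sized as a dirent64_t or an edirent_t.
	 */
	error = gfs_readdir_init(&gstate, MYFS_NAME_MAX, 1, uiop,
	    pino, ino, flags);
	if (error != 0)
		return (error);

	/* ... gfs_readdir_pred()/gfs_readdir_emit() loop ... */
	return (gfs_readdir_fini(&gstate, error, eofp, 1));
}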
* @@ -153,8 +191,10 @@ */ int gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, - uio_t *uiop, ino64_t parent, ino64_t self) + uio_t *uiop, ino64_t parent, ino64_t self, int flags) { + size_t dirent_size; + if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 || (uiop->uio_loffset % ureclen) != 0) return (EINVAL); @@ -162,9 +202,14 @@ gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, st->grd_ureclen = ureclen; st->grd_oresid = uiop->uio_resid; st->grd_namlen = name_max; - st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP); + if (flags & V_RDDIR_ENTFLAGS) + dirent_size = EDIRENT_RECLEN(st->grd_namlen); + else + dirent_size = DIRENT64_RECLEN(st->grd_namlen); + st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP); st->grd_parent = parent; st->grd_self = self; + st->grd_flags = flags; return (0); } @@ -172,8 +217,8 @@ gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, /* * gfs_readdir_emit_int: internal routine to emit directory entry * - * st - the current readdir state, which must have d_ino and d_name - * set + * st - the current readdir state, which must have d_ino/ed_ino + * and d_name/ed_name set * uiop - caller-supplied uio pointer * next - the offset of the next entry */ @@ -182,9 +227,18 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, int *ncookies, u_long **cookies) { int reclen, namlen; + dirent64_t *dp; + edirent_t *edp; - namlen = strlen(st->grd_dirent->d_name); - reclen = DIRENT64_RECLEN(namlen); + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edp = st->grd_dirent; + namlen = strlen(edp->ed_name); + reclen = EDIRENT_RECLEN(namlen); + } else { + dp = st->grd_dirent; + namlen = strlen(dp->d_name); + reclen = DIRENT64_RECLEN(namlen); + } if (reclen > uiop->uio_resid) { /* @@ -195,10 +249,15 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, return (-1); } - /* XXX: This can change in the future. */ - st->grd_dirent->d_type = DT_DIR; - st->grd_dirent->d_reclen = (ushort_t)reclen; - st->grd_dirent->d_namlen = namlen; + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edp->ed_off = next; + edp->ed_reclen = (ushort_t)reclen; + } else { + /* XXX: This can change in the future. */ + dp->d_reclen = (ushort_t)reclen; + dp->d_type = DT_DIR; + dp->d_namlen = namlen; + } if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop)) return (EFAULT); @@ -219,6 +278,7 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, * voff - the virtual offset (obtained from gfs_readdir_pred) * ino - the entry's inode * name - the entry's name + * eflags - value for ed_eflags (if processing edirent_t) * * Returns a 0 on success, a non-zero errno on failure, or -1 if the * readdir loop should terminate. 
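/*
 * [Editor's sketch, not part of this changeset.]  The flag-dependent
 * record sizing above recurs in gfs_readdir_init(),
 * gfs_readdir_emit_int(), and gfs_readdir_fini(); the rule they share,
 * written as a hypothetical helper:
 */
static size_t
gfs_dirent_size(int flags, int namlen)
{
	/* Extended entries carry ed_eflags and use their own layout. */
	if (flags & V_RDDIR_ENTFLAGS)
		return (EDIRENT_RECLEN(namlen));
	return (DIRENT64_RECLEN(namlen));
}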
A non-zero result (either errno or @@ -227,12 +287,22 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, */ int gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, - ino64_t ino, const char *name, int *ncookies, u_long **cookies) + ino64_t ino, const char *name, int eflags, int *ncookies, u_long **cookies) { offset_t off = (voff + 2) * st->grd_ureclen; - st->grd_dirent->d_ino = ino; - (void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen); + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edirent_t *edp = st->grd_dirent; + + edp->ed_ino = ino; + (void) strncpy(edp->ed_name, name, st->grd_namlen); + edp->ed_eflags = eflags; + } else { + dirent64_t *dp = st->grd_dirent; + + dp->d_ino = ino; + (void) strncpy(dp->d_name, name, st->grd_namlen); + } /* * Inter-entry offsets are invalid, so we assume a record size of @@ -266,11 +336,11 @@ top: voff = off - 2; if (off == 0) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self, - ".", ncookies, cookies)) == 0) + ".", 0, ncookies, cookies)) == 0) goto top; } else if (off == 1) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent, - "..", ncookies, cookies)) == 0) + "..", 0, ncookies, cookies)) == 0) goto top; } else { *voffp = voff; @@ -292,7 +362,13 @@ top: int gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof) { - kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen)); + size_t dirent_size; + + if (st->grd_flags & V_RDDIR_ENTFLAGS) + dirent_size = EDIRENT_RECLEN(st->grd_namlen); + else + dirent_size = DIRENT64_RECLEN(st->grd_namlen); + kmem_free(st->grd_dirent, dirent_size); if (error > 0) return (error); if (eofp) @@ -485,7 +561,7 @@ gfs_file_inactive(vnode_t *vp) gfs_dir_t *dp = NULL; void *data; - if (fp->gfs_parent == NULL) + if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR)) goto found; dp = fp->gfs_parent->v_data; @@ -511,6 +587,8 @@ gfs_file_inactive(vnode_t *vp) ge = NULL; found: + if (vp->v_flag & V_XATTRDIR) + VI_LOCK(fp->gfs_parent); VI_LOCK(vp); ASSERT(vp->v_count < 2); /* @@ -535,7 +613,8 @@ found: * Free vnode and release parent */ if (fp->gfs_parent) { - gfs_dir_unlock(dp); + if (dp) + gfs_dir_unlock(dp); VI_LOCK(fp->gfs_parent); fp->gfs_parent->v_usecount--; VI_UNLOCK(fp->gfs_parent); @@ -543,6 +622,8 @@ found: ASSERT(vp->v_vfsp != NULL); VFS_RELE(vp->v_vfsp); } + if (vp->v_flag & V_XATTRDIR) + VI_UNLOCK(fp->gfs_parent); return (data); } @@ -570,55 +651,119 @@ gfs_dir_inactive(vnode_t *vp) } /* - * gfs_dir_lookup() + * gfs_dir_lookup_dynamic() * - * Looks up the given name in the directory and returns the corresponding vnode, - * if found. + * This routine looks up the provided name amongst the dynamic entries + * in the gfs directory and returns the corresponding vnode, if found. * - * First, we search statically defined entries, if any. If a match is found, - * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the - * existing vnode. Otherwise, we call the static entry's callback routine, - * caching the result if necessary. + * The gfs directory is expected to be locked by the caller prior to + * calling this function. The directory will be unlocked during the + * execution of this function, but will be locked upon return from the + * function. This function returns 0 on success, non-zero on error. * - * If no static entry is found, we invoke the lookup callback, if any. 
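/*
 * [Editor's sketch, not part of this changeset.]  The canonical consumer
 * loop for the pred/emit/fini helpers above; compare gfs_dir_readdir()
 * later in this patch.  myfs_nentries(), myfs_entry_ino() and
 * myfs_entry_name() are hypothetical.
 */
static int
myfs_emit_all(gfs_readdir_state_t *st, uio_t *uiop, int *eofp,
    int *ncookies, u_long **cookies)
{
	offset_t voff;
	int error, eof = 0;

	while ((error = gfs_readdir_pred(st, uiop, &voff,
	    ncookies, cookies)) == 0 && !eof) {
		if (voff >= myfs_nentries()) {
			eof = 1;
			break;
		}
		/* eflags is 0 here: plain entries, no case conflicts. */
		error = gfs_readdir_emit(st, uiop, voff,
		    myfs_entry_ino(voff), myfs_entry_name(voff), 0,
		    ncookies, cookies);
		if (error != 0)
			break;
	}
	/* fini() maps -1 ("out of room") to success and frees the state. */
	return (gfs_readdir_fini(st, error, eofp, eof));
}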
The - * arguments to this callback are: + * The dynamic lookups are performed by invoking the lookup + * callback, which is passed to this function as the first argument. + * The arguments to the callback are: * - * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp); + * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr, + * int flags, int *deflgs, pathname_t *rpnp); * * pvp - parent vnode * nm - name of entry * vpp - pointer to resulting vnode + * cr - pointer to cred + * flags - flags value from lookup request + * ignored here; currently only used to request + * insensitive lookups + * direntflgs - output parameter, directory entry flags + * ignored here; currently only used to indicate a lookup + * has more than one possible match when case is not considered + * realpnp - output parameter, real pathname + * ignored here; when lookup was performed case-insensitively, + * this field contains the "real" name of the file. * * Returns 0 on success, non-zero on error. */ -int -gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) +static int +gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp, + const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags, + int *direntflags, pathname_t *realpnp) { - int i; - gfs_dirent_t *ge; - vnode_t *vp; - gfs_dir_t *dp = dvp->v_data; - int ret = 0; - - ASSERT(dvp->v_type == VDIR); + gfs_file_t *fp; + ino64_t ino; + int ret; - if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) - return (0); + ASSERT(GFS_DIR_LOCKED(dp)); + /* + * Drop the directory lock, as the lookup routine + * will need to allocate memory, or otherwise deadlock on this + * directory. + */ + gfs_dir_unlock(dp); + ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp); gfs_dir_lock(dp); /* + * The callback for extended attributes returns a vnode + * with v_data from an underlying fs. + */ + if (ret == 0 && !IS_XATTRDIR(dvp)) { + fp = (gfs_file_t *)((*vpp)->v_data); + fp->gfs_index = -1; + fp->gfs_ino = ino; + } + + return (ret); +} + +/* + * gfs_dir_lookup_static() + * + * This routine looks up the provided name amongst the static entries + * in the gfs directory and returns the corresponding vnode, if found. + * The first argument to the function is a pointer to the comparison + * function this function should use to decide if names are a match. + * + * If a match is found, and GFS_CACHE_VNODE is set and the vnode + * exists, we simply return the existing vnode. Otherwise, we call + * the static entry's callback routine, caching the result if + * necessary. If the idx pointer argument is non-NULL, we use it to + * return the index of the matching static entry. + * + * The gfs directory is expected to be locked by the caller prior to calling + * this function. The directory may be unlocked during the execution of + * this function, but will be locked upon return from the function. + * + * This function returns 0 if a match is found, ENOENT if not. + */ +static int +gfs_dir_lookup_static(int (*compare)(const char *, const char *), + gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx, + vnode_t **vpp, pathname_t *rpnp) +{ + gfs_dirent_t *ge; + vnode_t *vp = NULL; + int i; + + ASSERT(GFS_DIR_LOCKED(dp)); + + /* * Search static entries. 
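/*
 * [Editor's sketch, not part of this changeset.]  A hypothetical
 * dynamic-lookup callback matching the invocation above; note the call
 * site also passes an ino64_t * for the new entry's inode number, which
 * the prototype in the block comment elides.
 */
static int
myfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, ino64_t *inop,
    cred_t *cr, int flags, int *direntflags, pathname_t *realpnp)
{
	if (strcmp(nm, "status") != 0)
		return (ENOENT);

	/* KM_SLEEP allocation is safe: the directory lock was dropped. */
	*vpp = myfs_make_status_node(pvp);	/* hypothetical ctor */
	*inop = MYFS_STATUS_INO;		/* hypothetical inode no. */
	return (0);
}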
*/ for (i = 0; i < dp->gfsd_nstatic; i++) { ge = &dp->gfsd_static[i]; - if (strcmp(ge->gfse_name, nm) == 0) { + if (compare(ge->gfse_name, nm) == 0) { + if (rpnp) + (void) strlcpy(rpnp->pn_buf, ge->gfse_name, + rpnp->pn_bufsize); + if (ge->gfse_vnode) { ASSERT(ge->gfse_flags & GFS_CACHE_VNODE); vp = ge->gfse_vnode; VN_HOLD(vp); - goto out; + break; } /* @@ -626,8 +771,8 @@ gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) * need to do KM_SLEEP allocations. If we return from * the constructor only to find that a parallel * operation has completed, and GFS_CACHE_VNODE is set - * for this entry, we discard the result in favor of the - * cached vnode. + * for this entry, we discard the result in favor of + * the cached vnode. */ gfs_dir_unlock(dp); vp = ge->gfse_ctor(dvp); @@ -660,49 +805,94 @@ gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) gfs_dir_lock(dp); } } - - goto out; + break; } } - /* - * See if there is a dynamic constructor. - */ - if (dp->gfsd_lookup) { - ino64_t ino; - gfs_file_t *fp; + if (vp == NULL) + return (ENOENT); + else if (idx) + *idx = i; + *vpp = vp; + return (0); +} - /* - * Once again, drop the directory lock, as the lookup routine - * will need to allocate memory, or otherwise deadlock on this - * directory. - */ - gfs_dir_unlock(dp); - ret = dp->gfsd_lookup(dvp, nm, &vp, &ino); - gfs_dir_lock(dp); - if (ret != 0) - goto out; +/* + * gfs_dir_lookup() + * + * Looks up the given name in the directory and returns the corresponding + * vnode, if found. + * + * First, we search statically defined entries, if any, with a call to + * gfs_dir_lookup_static(). If no static entry is found, and we have + * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic(). + * + * This function returns 0 on success, non-zero on error. + */ +int +gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr, + int flags, int *direntflags, pathname_t *realpnp) +{ + gfs_dir_t *dp = dvp->v_data; + boolean_t casecheck; + vnode_t *dynvp = NULL; + vnode_t *vp = NULL; + int (*compare)(const char *, const char *); + int error, idx; - fp = (gfs_file_t *)vp->v_data; - fp->gfs_index = -1; - fp->gfs_ino = ino; - } else { - /* - * No static entry found, and there is no lookup callback, so - * return ENOENT. - */ - ret = ENOENT; + ASSERT(dvp->v_type == VDIR); + + if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) + return (0); + + casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL; + if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) || + (flags & FIGNORECASE)) + compare = strcasecmp; + else + compare = strcmp; + + gfs_dir_lock(dp); + + error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp); + + if (vp && casecheck) { + gfs_dirent_t *ge; + int i; + + for (i = idx + 1; i < dp->gfsd_nstatic; i++) { + ge = &dp->gfsd_static[i]; + + if (strcasecmp(ge->gfse_name, nm) == 0) { + *direntflags |= ED_CASE_CONFLICT; + goto out; + } + } + } + + if ((error || casecheck) && dp->gfsd_lookup) + error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp, + &dynvp, cr, flags, direntflags, vp ? 
NULL : realpnp); + + if (vp && dynvp) { + /* static and dynamic entries are case-insensitive conflict */ + ASSERT(casecheck); + *direntflags |= ED_CASE_CONFLICT; + VN_RELE(dynvp); + } else if (vp == NULL) { + vp = dynvp; + } else if (error == ENOENT) { + error = 0; + } else if (error) { + VN_RELE(vp); + vp = NULL; } out: gfs_dir_unlock(dp); - if (ret == 0) - *vpp = vp; - else - *vpp = NULL; - - return (ret); + *vpp = vp; + return (error); } /* @@ -731,13 +921,15 @@ out: * This is significantly more complex, thanks to the particulars of * VOP_READDIR(). * - * int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, - * offset_t *off, offset_t *nextoff, void *data) + * int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp, + * offset_t *off, offset_t *nextoff, void *data, int flags) * * vp - directory vnode * dp - directory entry, sized according to maxlen given to * gfs_dir_create(). callback must fill in d_name and - * d_ino. + * d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags + * (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS + * is set in 'flags'. * eofp - callback must set to 1 when EOF has been reached * off - on entry, the last offset read from the directory. Callback * must set to the offset of the current entry, typically left @@ -745,12 +937,13 @@ out: * nextoff - callback must set to offset of next entry. Typically * (off + 1) * data - caller-supplied data + * flags - VOP_READDIR flags * * Return 0 on success, or error on failure. */ int gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, - u_long **cookies, void *data) + u_long **cookies, void *data, cred_t *cr, int flags) { gfs_readdir_state_t gstate; int error, eof = 0; @@ -758,16 +951,12 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, offset_t off, next; gfs_dir_t *dp = dvp->v_data; - ino = dp->gfsd_file.gfs_ino; - - if (dp->gfsd_file.gfs_parent == NULL) - pino = ino; /* root of filesystem */ - else - pino = ((gfs_file_t *) - (dp->gfsd_file.gfs_parent->v_data))->gfs_ino; + error = gfs_get_parent_ino(dvp, cr, NULL, &pino, &ino); + if (error) + return (error); if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop, - pino, ino)) != 0) + pino, ino, flags)) != 0) return (error); while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies, @@ -777,8 +966,8 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, ino = dp->gfsd_inode(dvp, off); if ((error = gfs_readdir_emit(&gstate, uiop, - off, ino, dp->gfsd_static[off].gfse_name, ncookies, - cookies)) != 0) + off, ino, dp->gfsd_static[off].gfse_name, 0, + ncookies, cookies)) != 0) break; } else if (dp->gfsd_readdir) { @@ -786,7 +975,7 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, if ((error = dp->gfsd_readdir(dvp, gstate.grd_dirent, &eof, &off, &next, - data)) != 0 || eof) + data, flags)) != 0 || eof) break; off += dp->gfsd_nstatic + 2; @@ -808,6 +997,21 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, } /* + * gfs_vop_lookup: VOP_LOOKUP() entry point + * + * For use directly in vnode ops table. Given a GFS directory, calls + * gfs_dir_lookup() as necessary. + */ +/* ARGSUSED */ +int +gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp)); +} + +/* * gfs_vop_readdir: VOP_READDIR() entry point * * For use directly in vnode ops table. 
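/*
 * [Editor's sketch, not part of this changeset.]  A hypothetical
 * gfsd_readdir callback honoring the contract documented above: fill a
 * dirent64_t normally, or an edirent_t when V_RDDIR_ENTFLAGS is set.
 * The myfs_* helpers and MYFS_NAME_MAX are assumptions.
 */
static int
myfs_readdir_cb(vnode_t *vp, void *dp, int *eofp, offset_t *offp,
    offset_t *nextp, void *data, int flags)
{
	const char *nm;
	ino64_t ino;

	if (*offp >= myfs_nentries()) {
		*eofp = 1;
		return (0);
	}
	nm = myfs_entry_name(*offp);
	ino = myfs_entry_ino(*offp);

	if (flags & V_RDDIR_ENTFLAGS) {
		edirent_t *edp = dp;

		edp->ed_ino = ino;
		(void) strlcpy(edp->ed_name, nm, MYFS_NAME_MAX);
		edp->ed_eflags = 0;
	} else {
		dirent64_t *dep = dp;

		dep->d_ino = ino;
		(void) strlcpy(dep->d_name, nm, MYFS_NAME_MAX);
	}
	*nextp = *offp + 1;	/* "typically (off + 1)", per the contract */
	return (0);
}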
Given a GFS directory, calls @@ -827,6 +1031,7 @@ gfs_vop_readdir(ap) { vnode_t *vp = ap->a_vp; uio_t *uiop = ap->a_uio; + cred_t *cr = ap->a_cred; int *eofp = ap->a_eofflag; int ncookies = 0; u_long *cookies = NULL; @@ -842,7 +1047,8 @@ gfs_vop_readdir(ap) *ap->a_ncookies = ncookies; } - error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL); + error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL, + cr, 0); if (error == 0) { /* Subtract unused cookies */ @@ -882,6 +1088,9 @@ gfs_vop_inactive(ap) if (data != NULL) kmem_free(data, fp->gfs_size); + + VI_LOCK(vp); vp->v_data = NULL; + VI_UNLOCK(vp); return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c new file mode 100644 index 000000000000..00a10aae8ec9 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/vnode.h> + +/* Extensible attribute (xva) routines. */ + +/* + * Zero out the structure, set the size of the requested/returned bitmaps, + * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer + * to the returned attributes array. + */ +void +xva_init(xvattr_t *xvap) +{ + bzero(xvap, sizeof (xvattr_t)); + xvap->xva_mapsize = XVA_MAPSIZE; + xvap->xva_magic = XVA_MAGIC; + xvap->xva_vattr.va_mask = AT_XVATTR; + xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; +} + +/* + * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t + * structure. Otherwise, returns NULL. + */ +xoptattr_t * +xva_getxoptattr(xvattr_t *xvap) +{ + xoptattr_t *xoap = NULL; + if (xvap->xva_vattr.va_mask & AT_XVATTR) + xoap = &xvap->xva_xoptattrs; + return (xoap); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 420f802f360d..7ca528033c4f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * DVA-based Adjustable Replacement Cache * @@ -47,13 +45,13 @@ * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we - * implement a "cache throttle" that slowes the flow of new data - * into the cache until we can make space avaiable. + * implement a "cache throttle" that slows the flow of new data + * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with - * high use, but also tries to react to memory preasure from the + * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * @@ -75,7 +73,7 @@ * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, - * or 2) via one of the ARC lists. The arc_read() inerface + * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal arc algorithms for * adjusting the cache use method 2. We therefor provide two * types of locks: 1) the hash table lock array, and 2) the @@ -109,6 +107,14 @@ * * Note that the majority of the performance stats are manipulated * with atomic operations. + * + * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: + * + * - L2ARC buflist creation + * - L2ARC buflist eviction + * - L2ARC write completion, which walks L2ARC buflists + * - ARC header destruction, as it removes from L2ARC buflists + * - ARC header release, as it removes from L2ARC buflists */ #include <sys/spa.h> @@ -117,6 +123,7 @@ #include <sys/zfs_context.h> #include <sys/arc.h> #include <sys/refcount.h> +#include <sys/vdev.h> #ifdef _KERNEL #include <sys/dnlc.h> #endif @@ -128,6 +135,10 @@ static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ static uint8_t arc_thread_exit; +extern int zfs_write_limit_shift; +extern uint64_t zfs_write_limit_max; +extern kmutex_t zfs_write_limit_lock; + #define ARC_REDUCE_DNLC_PERCENT 3 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; @@ -148,28 +159,45 @@ static int arc_min_prefetch_lifespan; static int arc_dead; /* + * The arc has filled available memory and has now warmed up. + */ +static boolean_t arc_warm; + +/* * These tunables are for performance analysis. 
*/ -u_long zfs_arc_max; -u_long zfs_arc_min; -TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max); -TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min); +uint64_t zfs_arc_max; +uint64_t zfs_arc_min; +uint64_t zfs_arc_meta_limit = 0; +int zfs_mdcomp_disable = 0; + +TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); +TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); +TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); +TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); SYSCTL_DECL(_vfs_zfs); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, "Maximum ARC size"); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, "Minimum ARC size"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, + &zfs_mdcomp_disable, 0, "Disable metadata compression"); /* - * Note that buffers can be on one of 5 states: + * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recentely used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache - * When there are no active references to the buffer, they - * are linked onto one of the lists in arc. These are the - * only buffers that can be evicted or deleted. + * ARC_l2c_only - exists in L2ARC but not other states + * When there are no active references to the buffer, they are + * are linked onto a list in one of these arc states. These are + * the only buffers that can be evicted or deleted. Within each + * state there are multiple lists, one for meta-data and one for + * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, + * etc.) is tracked separately so that it can be managed more + * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies @@ -177,21 +205,30 @@ SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will aquire a DVA * as they are written and migrate onto the arc_mru list. + * + * The ARC_l2c_only state is for buffers that are in the second + * level ARC but no longer in any of the ARC_m* lists. The second + * level ARC itself may also contain buffers that are in any of + * the ARC_m* states - meaning that a buffer can exist in two + * places. The reason for the ARC_l2c_only state is to keep the + * buffer header in the hash table, so that reads that hit the + * second level ARC benefit from these fast lookups. 
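/*
 * [Editor's sketch, not part of this changeset.]  With the split into
 * per-type lists (see arc_state_t just below), evictable space becomes a
 * per-type quantity; e.g. the total evictable metadata across the two
 * resident states:
 */
static uint64_t
arc_evictable_meta(void)
{
	/* arcs_lsize[] counts only unreferenced (evictable) bytes. */
	return (arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
	    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
}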
*/ typedef struct arc_state { - list_t arcs_list; /* linked list of evictable buffer in state */ - uint64_t arcs_lsize; /* total size of buffers in the linked list */ - uint64_t arcs_size; /* total size of all buffers in this state */ + list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ + uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ + uint64_t arcs_size; /* total amount of data in this state */ kmutex_t arcs_mtx; } arc_state_t; -/* The 5 states: */ +/* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; static arc_state_t ARC_mru_ghost; static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; +static arc_state_t ARC_l2c_only; typedef struct arc_stats { kstat_named_t arcstat_hits; @@ -222,6 +259,24 @@ typedef struct arc_stats { kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; + kstat_named_t arcstat_hdr_size; + kstat_named_t arcstat_l2_hits; + kstat_named_t arcstat_l2_misses; + kstat_named_t arcstat_l2_feeds; + kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_writes_sent; + kstat_named_t arcstat_l2_writes_done; + kstat_named_t arcstat_l2_writes_error; + kstat_named_t arcstat_l2_writes_hdr_miss; + kstat_named_t arcstat_l2_evict_lock_retry; + kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_abort_lowmem; + kstat_named_t arcstat_l2_cksum_bad; + kstat_named_t arcstat_l2_io_error; + kstat_named_t arcstat_l2_size; + kstat_named_t arcstat_l2_hdr_size; + kstat_named_t arcstat_memory_throttle_count; } arc_stats_t; static arc_stats_t arc_stats = { @@ -252,7 +307,25 @@ static arc_stats_t arc_stats = { { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, - { "size", KSTAT_DATA_UINT64 } + { "size", KSTAT_DATA_UINT64 }, + { "hdr_size", KSTAT_DATA_UINT64 }, + { "l2_hits", KSTAT_DATA_UINT64 }, + { "l2_misses", KSTAT_DATA_UINT64 }, + { "l2_feeds", KSTAT_DATA_UINT64 }, + { "l2_rw_clash", KSTAT_DATA_UINT64 }, + { "l2_writes_sent", KSTAT_DATA_UINT64 }, + { "l2_writes_done", KSTAT_DATA_UINT64 }, + { "l2_writes_error", KSTAT_DATA_UINT64 }, + { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, + { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, + { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_free_on_write", KSTAT_DATA_UINT64 }, + { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, + { "l2_cksum_bad", KSTAT_DATA_UINT64 }, + { "l2_io_error", KSTAT_DATA_UINT64 }, + { "l2_size", KSTAT_DATA_UINT64 }, + { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "memory_throttle_count", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -299,6 +372,7 @@ static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; static arc_state_t *arc_mfu_ghost; +static arc_state_t *arc_l2c_only; /* * There are several ARC variables that are critical to export as kstats -- @@ -316,13 +390,21 @@ static arc_state_t *arc_mfu_ghost; static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; +static uint64_t arc_meta_used; +static uint64_t arc_meta_limit; +static uint64_t arc_meta_max = 0; +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN, + &arc_meta_used, 0, "ARC metadata used"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN, + &arc_meta_limit, 0, "ARC metadata limit"); + +typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; arc_done_func_t *acb_done; - 
arc_byteswap_func_t *acb_byteswap; arc_buf_t *acb_buf; zio_t *acb_zio_dummy; arc_callback_t *acb_next; @@ -368,6 +450,9 @@ struct arc_buf_hdr { /* self protecting */ refcount_t b_refcnt; + + l2arc_buf_hdr_t *b_l2hdr; + list_node_t b_l2node; }; static arc_buf_t *arc_eviction_list; @@ -375,9 +460,12 @@ static kmutex_t arc_eviction_mtx; static arc_buf_hdr_t arc_eviction_hdr; static void arc_get_data_buf(arc_buf_t *buf); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); +static int arc_evict_needed(arc_buf_contents_t type); +static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); #define GHOST_STATE(state) \ - ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) + ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ + (state) == arc_l2c_only) /* * Private ARC flags. These flags are private ARC only flags that will show up @@ -393,12 +481,31 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ +#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ +#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ +#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ +#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ +#define ARC_STORED (1 << 19) /* has been store()d to */ #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) +#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) +#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) +#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ + (hdr)->b_l2hdr != NULL) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) + +/* + * Other sizes + */ + +#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) /* * Hash table routines @@ -431,8 +538,90 @@ static buf_hash_table_t buf_hash_table; uint64_t zfs_crc64_table[256]; +/* + * Level 2 ARC + */ + +#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 4 /* num of writes */ +#define L2ARC_FEED_SECS 1 /* caching interval */ + +#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) +#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) + +/* + * L2ARC Performance Tunables + */ +uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ +uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ + +/* + * L2ARC Internals + */ +typedef struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; /* next write location */ + uint64_t l2ad_write; /* desired write size, bytes */ + uint64_t l2ad_boost; /* warmup write boost, bytes */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr 
on device */ + uint64_t l2ad_evict; /* last addr eviction reached */ + boolean_t l2ad_first; /* first sweep through */ + list_t *l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list node */ +} l2arc_dev_t; + +static list_t L2ARC_dev_list; /* device list */ +static list_t *l2arc_dev_list; /* device list pointer */ +static kmutex_t l2arc_dev_mtx; /* device list mutex */ +static l2arc_dev_t *l2arc_dev_last; /* last device used */ +static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ +static list_t L2ARC_free_on_write; /* free after write buf list */ +static list_t *l2arc_free_on_write; /* free after write list ptr */ +static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ +static uint64_t l2arc_ndev; /* number of devices */ + +typedef struct l2arc_read_callback { + arc_buf_t *l2rcb_buf; /* read buffer */ + spa_t *l2rcb_spa; /* spa */ + blkptr_t l2rcb_bp; /* original blkptr */ + zbookmark_t l2rcb_zb; /* original bookmark */ + int l2rcb_flags; /* original flags */ +} l2arc_read_callback_t; + +typedef struct l2arc_write_callback { + l2arc_dev_t *l2wcb_dev; /* device info */ + arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ +} l2arc_write_callback_t; + +struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + daddr_t b_daddr; /* disk address, offset byte */ +}; + +typedef struct l2arc_data_free { + /* protected by l2arc_free_on_write_mtx */ + void *l2df_data; + size_t l2df_size; + void (*l2df_func)(void *, size_t); + list_node_t l2df_list_node; +} l2arc_data_free_t; + +static kmutex_t l2arc_feed_thr_lock; +static kcondvar_t l2arc_feed_thr_cv; +static uint8_t l2arc_thread_exit; + +static void l2arc_read_done(zio_t *zio); +static void l2arc_hdr_stat_add(void); +static void l2arc_hdr_stat_remove(void); + static uint64_t -buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) +buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) { uintptr_t spav = (uintptr_t)spa; uint8_t *vdva = (uint8_t *)dva; @@ -460,7 +649,7 @@ buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) ((buf)->b_birth == birth) && ((buf)->b_spa == spa) static arc_buf_hdr_t * -buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) +buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); @@ -579,6 +768,20 @@ hdr_cons(void *vbuf, void *unused, int kmflag) bzero(buf, sizeof (arc_buf_hdr_t)); refcount_create(&buf->b_refcnt); cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + + ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); + return (0); +} + +/* ARGSUSED */ +static int +buf_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_t)); + rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); return (0); } @@ -594,6 +797,18 @@ hdr_dest(void *vbuf, void *unused) refcount_destroy(&buf->b_refcnt); cv_destroy(&buf->b_cv); + mutex_destroy(&buf->b_freeze_lock); + + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); +} + +/* ARGSUSED */ +static void +buf_dest(void *vbuf, void *unused) +{ + arc_buf_t *buf = vbuf; + + rw_destroy(&buf->b_lock); } /* @@ -639,7 +854,7 @@ retry: hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); + 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; 
i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) @@ -673,10 +888,24 @@ arc_cksum_verify(arc_buf_t *buf) mutex_exit(&buf->b_hdr->b_freeze_lock); } +static int +arc_cksum_equal(arc_buf_t *buf) +{ + zio_cksum_t zc; + int equal; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); + mutex_exit(&buf->b_hdr->b_freeze_lock); + + return (equal); +} + static void -arc_cksum_compute(arc_buf_t *buf) +arc_cksum_compute(arc_buf_t *buf, boolean_t force) { - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) return; mutex_enter(&buf->b_hdr->b_freeze_lock); @@ -693,14 +922,14 @@ arc_cksum_compute(arc_buf_t *buf) void arc_buf_thaw(arc_buf_t *buf) { - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (buf->b_hdr->b_state != arc_anon) + panic("modifying non-anon buffer!"); + if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + panic("modifying buffer while i/o in progress!"); + arc_cksum_verify(buf); + } - if (buf->b_hdr->b_state != arc_anon) - panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) - panic("modifying buffer while i/o in progress!"); - arc_cksum_verify(buf); mutex_enter(&buf->b_hdr->b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); @@ -717,7 +946,7 @@ arc_buf_freeze(arc_buf_t *buf) ASSERT(buf->b_hdr->b_freeze_cksum != NULL || buf->b_hdr->b_state == arc_anon); - arc_cksum_compute(buf); + arc_cksum_compute(buf, B_FALSE); } static void @@ -728,21 +957,23 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) if ((refcount_add(&ab->b_refcnt, tag) == 1) && (ab->b_state != arc_anon)) { uint64_t delta = ab->b_size * ab->b_datacnt; + list_t *list = &ab->b_state->arcs_list[ab->b_type]; + uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); mutex_enter(&ab->b_state->arcs_mtx); ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&ab->b_state->arcs_list, ab); + list_remove(list, ab); if (GHOST_STATE(ab->b_state)) { ASSERT3U(ab->b_datacnt, ==, 0); ASSERT3P(ab->b_buf, ==, NULL); delta = ab->b_size; } ASSERT(delta > 0); - ASSERT3U(ab->b_state->arcs_lsize, >=, delta); - atomic_add_64(&ab->b_state->arcs_lsize, -delta); + ASSERT3U(*size, >=, delta); + atomic_add_64(size, -delta); mutex_exit(&ab->b_state->arcs_mtx); - /* remove the prefetch flag is we get a reference */ + /* remove the prefetch flag if we get a reference */ if (ab->b_flags & ARC_PREFETCH) ab->b_flags &= ~ARC_PREFETCH; } @@ -759,13 +990,14 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && (state != arc_anon)) { + uint64_t *size = &state->arcs_lsize[ab->b_type]; + ASSERT(!MUTEX_HELD(&state->arcs_mtx)); mutex_enter(&state->arcs_mtx); ASSERT(!list_link_active(&ab->b_arc_node)); - list_insert_head(&state->arcs_list, ab); + list_insert_head(&state->arcs_list[ab->b_type], ab); ASSERT(ab->b_datacnt > 0); - atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt); - ASSERT3U(state->arcs_size, >=, state->arcs_lsize); + atomic_add_64(size, ab->b_size * ab->b_datacnt); mutex_exit(&state->arcs_mtx); } return (cnt); @@ -796,12 +1028,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) if (refcnt == 0) { if (old_state != arc_anon) { int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); + 
uint64_t *size = &old_state->arcs_lsize[ab->b_type]; if (use_mutex) mutex_enter(&old_state->arcs_mtx); ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&old_state->arcs_list, ab); + list_remove(&old_state->arcs_list[ab->b_type], ab); /* * If prefetching out of the ghost cache, @@ -812,19 +1045,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT(ab->b_buf == NULL); from_delta = ab->b_size; } - ASSERT3U(old_state->arcs_lsize, >=, from_delta); - atomic_add_64(&old_state->arcs_lsize, -from_delta); + ASSERT3U(*size, >=, from_delta); + atomic_add_64(size, -from_delta); if (use_mutex) mutex_exit(&old_state->arcs_mtx); } if (new_state != arc_anon) { int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); + uint64_t *size = &new_state->arcs_lsize[ab->b_type]; if (use_mutex) mutex_enter(&new_state->arcs_mtx); - list_insert_head(&new_state->arcs_list, ab); + list_insert_head(&new_state->arcs_list[ab->b_type], ab); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { @@ -832,9 +1066,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT(ab->b_buf == NULL); to_delta = ab->b_size; } - atomic_add_64(&new_state->arcs_lsize, to_delta); - ASSERT3U(new_state->arcs_size + to_delta, >=, - new_state->arcs_lsize); + atomic_add_64(size, to_delta); if (use_mutex) mutex_exit(&new_state->arcs_mtx); @@ -842,7 +1074,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon && old_state != arc_anon) { + if (new_state == arc_anon) { buf_hash_remove(ab); } @@ -854,6 +1086,47 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) atomic_add_64(&old_state->arcs_size, -from_delta); } ab->b_state = new_state; + + /* adjust l2arc hdr stats */ + if (new_state == arc_l2c_only) + l2arc_hdr_stat_add(); + else if (old_state == arc_l2c_only) + l2arc_hdr_stat_remove(); +} + +void +arc_space_consume(uint64_t space) +{ + atomic_add_64(&arc_meta_used, space); + atomic_add_64(&arc_size, space); +} + +void +arc_space_return(uint64_t space) +{ + ASSERT(arc_meta_used >= space); + if (arc_meta_max < arc_meta_used) + arc_meta_max = arc_meta_used; + atomic_add_64(&arc_meta_used, -space); + ASSERT(arc_size >= space); + atomic_add_64(&arc_size, -space); +} + +void * +arc_data_buf_alloc(uint64_t size) +{ + if (arc_evict_needed(ARC_BUFC_DATA)) + cv_signal(&arc_reclaim_thr_cv); + atomic_add_64(&arc_size, size); + return (zio_data_buf_alloc(size)); +} + +void +arc_data_buf_free(void *buf, uint64_t size) +{ + zio_data_buf_free(buf, size); + ASSERT(arc_size >= size); + atomic_add_64(&arc_size, -size); } arc_buf_t * @@ -863,15 +1136,14 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) arc_buf_t *buf; ASSERT3U(size, >, 0); - hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); + hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; hdr->b_spa = spa; hdr->b_state = arc_anon; hdr->b_arc_access = 0; - mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; @@ -894,7 +1166,7 @@ arc_buf_clone(arc_buf_t *from) arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = 
NULL; @@ -914,28 +1186,21 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) kmutex_t *hash_lock; /* - * Check to see if this buffer is currently being evicted via - * arc_do_user_evicts(). + * Check to see if this buffer is evicted. Callers + * must verify b_data != NULL to know if the add_ref + * was successful. */ - mutex_enter(&arc_eviction_mtx); - hdr = buf->b_hdr; - if (hdr == NULL) { - mutex_exit(&arc_eviction_mtx); + rw_enter(&buf->b_lock, RW_READER); + if (buf->b_data == NULL) { + rw_exit(&buf->b_lock); return; } + hdr = buf->b_hdr; + ASSERT(hdr != NULL); hash_lock = HDR_LOCK(hdr); - mutex_exit(&arc_eviction_mtx); - mutex_enter(hash_lock); - if (buf->b_data == NULL) { - /* - * This buffer is evicted. - */ - mutex_exit(hash_lock); - return; - } + rw_exit(&buf->b_lock); - ASSERT(buf->b_hdr == hdr); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); add_reference(hdr, hash_lock, tag); arc_access(hdr, hash_lock); @@ -946,6 +1211,29 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) data, metadata, hits); } +/* + * Free the arc data buffer. If it is an l2arc write in progress, + * the buffer is placed on l2arc_free_on_write to be freed later. + */ +static void +arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), + void *data, size_t size) +{ + if (HDR_L2_WRITING(hdr)) { + l2arc_data_free_t *df; + df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df->l2df_data = data; + df->l2df_size = size; + df->l2df_func = free_func; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + free_func(data, size); + } +} + static void arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) { @@ -960,18 +1248,24 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) arc_cksum_verify(buf); if (!recycle) { if (type == ARC_BUFC_METADATA) { - zio_buf_free(buf->b_data, size); + arc_buf_data_free(buf->b_hdr, zio_buf_free, + buf->b_data, size); + arc_space_return(size); } else { ASSERT(type == ARC_BUFC_DATA); - zio_data_buf_free(buf->b_data, size); + arc_buf_data_free(buf->b_hdr, + zio_data_buf_free, buf->b_data, size); + atomic_add_64(&arc_size, -size); } - atomic_add_64(&arc_size, -size); } if (list_link_active(&buf->b_hdr->b_arc_node)) { + uint64_t *cnt = &state->arcs_lsize[type]; + ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); ASSERT(state != arc_anon); - ASSERT3U(state->arcs_lsize, >=, size); - atomic_add_64(&state->arcs_lsize, -size); + + ASSERT3U(*cnt, >=, size); + atomic_add_64(cnt, -size); } ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); @@ -1002,6 +1296,35 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ASSERT(refcount_is_zero(&hdr->b_refcnt)); ASSERT3P(hdr->b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!(hdr->b_flags & ARC_STORED)); + + if (hdr->b_l2hdr != NULL) { + if (!MUTEX_HELD(&l2arc_buflist_mtx)) { + /* + * To prevent arc_free() and l2arc_evict() from + * attempting to free the same buffer at the same time, + * a FREE_IN_PROGRESS flag is given to arc_free() to + * give it priority. l2arc_evict() can't destroy this + * header while we are waiting on l2arc_buflist_mtx. + * + * The hdr may be removed from l2ad_buflist before we + * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
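/*
 * [Editor's sketch, not part of this changeset.]  The consumer side of
 * the free-on-write list built by arc_buf_data_free() above is outside
 * this excerpt; after an L2ARC write completes, the drain would look
 * roughly like this (hypothetical name):
 */
static void
l2arc_drain_free_on_write(void)
{
	l2arc_data_free_t *df, *df_prev;

	mutex_enter(&l2arc_free_on_write_mtx);
	for (df = list_tail(l2arc_free_on_write); df; df = df_prev) {
		df_prev = list_prev(l2arc_free_on_write, df);
		/* Safe now: the in-flight L2ARC write has finished. */
		list_remove(l2arc_free_on_write, df);
		df->l2df_func(df->l2df_data, df->l2df_size);
		kmem_free(df, sizeof (l2arc_data_free_t));
	}
	mutex_exit(&l2arc_free_on_write_mtx);
}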
+ */ + mutex_enter(&l2arc_buflist_mtx); + if (hdr->b_l2hdr != NULL) { + list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, + hdr); + } + mutex_exit(&l2arc_buflist_mtx); + } else { + list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr); + } + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t)); + if (hdr->b_state == arc_l2c_only) + l2arc_hdr_stat_remove(); + hdr->b_l2hdr = NULL; + } if (!BUF_EMPTY(hdr)) { ASSERT(!HDR_IN_HASH_TABLE(hdr)); @@ -1014,12 +1337,14 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) if (buf->b_efunc) { mutex_enter(&arc_eviction_mtx); + rw_enter(&buf->b_lock, RW_WRITER); ASSERT(buf->b_hdr != NULL); arc_buf_destroy(hdr->b_buf, FALSE, FALSE); hdr->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; + rw_exit(&buf->b_lock); mutex_exit(&arc_eviction_mtx); } else { arc_buf_destroy(hdr->b_buf, FALSE, TRUE); @@ -1029,7 +1354,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } - mutex_destroy(&hdr->b_freeze_lock); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -1124,14 +1448,19 @@ arc_buf_size(arc_buf_t *buf) * - return the data block from this buffer rather than freeing it. * This flag is used by callers that are trying to make space for a * new buffer in a full arc cache. + * + * This function makes a "best effort". It skips over any buffers + * it can't get a hash_lock on, and so may not catch all candidates. + * It may also return without evicting as much space as requested. */ static void * -arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, +arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { arc_state_t *evicted_state; uint64_t bytes_evicted = 0, skipped = 0, missed = 0; arc_buf_hdr_t *ab, *ab_prev = NULL; + list_t *list = &state->arcs_list[type]; kmutex_t *hash_lock; boolean_t have_lock; void *stolen = NULL; @@ -1143,10 +1472,11 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); - for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { - ab_prev = list_prev(&state->arcs_list, ab); + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(ab) || + (spa && ab->b_spa != spa) || (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) { skipped++; @@ -1163,10 +1493,15 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, ASSERT(ab->b_datacnt > 0); while (ab->b_buf) { arc_buf_t *buf = ab->b_buf; + if (!rw_tryenter(&buf->b_lock, RW_WRITER)) { + missed += 1; + break; + } if (buf->b_data) { bytes_evicted += ab->b_size; if (recycle && ab->b_type == type && - ab->b_size == bytes) { + ab->b_size == bytes && + !HDR_L2_WRITING(ab)) { stolen = buf->b_data; recycle = FALSE; } @@ -1180,16 +1515,20 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&arc_eviction_mtx); + rw_exit(&buf->b_lock); } else { + rw_exit(&buf->b_lock); arc_buf_destroy(buf, buf->b_data == stolen, TRUE); } } - ASSERT(ab->b_datacnt == 0); - arc_change_state(evicted_state, ab, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(ab)); - ab->b_flags = ARC_IN_HASH_TABLE; - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + if (ab->b_datacnt == 0) { + 
arc_change_state(evicted_state, ab, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(ab)); + ab->b_flags |= ARC_IN_HASH_TABLE; + ab->b_flags &= ~ARC_BUF_AVAILABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + } if (!have_lock) mutex_exit(hash_lock); if (bytes >= 0 && bytes_evicted >= bytes) @@ -1212,6 +1551,27 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, if (missed) ARCSTAT_INCR(arcstat_mutex_miss, missed); + /* + * We have just evicted some date into the ghost state, make + * sure we also adjust the ghost state size if necessary. + */ + if (arc_no_grow && + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { + int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + + arc_mru_ghost->arcs_size - arc_c; + + if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { + int64_t todelete = + MIN(arc_mru_ghost->arcs_lsize[type], mru_over); + arc_evict_ghost(arc_mru_ghost, NULL, todelete); + } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { + int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], + arc_mru_ghost->arcs_size + + arc_mfu_ghost->arcs_size - arc_c); + arc_evict_ghost(arc_mfu_ghost, NULL, todelete); + } + } + return (stolen); } @@ -1220,9 +1580,10 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, * bytes. Destroy the buffers that are removed. */ static void -arc_evict_ghost(arc_state_t *state, int64_t bytes) +arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; + list_t *list = &state->arcs_list[ARC_BUFC_DATA]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; @@ -1230,17 +1591,30 @@ arc_evict_ghost(arc_state_t *state, int64_t bytes) ASSERT(GHOST_STATE(state)); top: mutex_enter(&state->arcs_mtx); - for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { - ab_prev = list_prev(&state->arcs_list, ab); + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); + if (spa && ab->b_spa != spa) + continue; hash_lock = HDR_LOCK(ab); if (mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(ab)); ASSERT(ab->b_buf == NULL); - arc_change_state(arc_anon, ab, hash_lock); - mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_deleted); bytes_deleted += ab->b_size; - arc_hdr_destroy(ab); + + if (ab->b_l2hdr != NULL) { + /* + * This buffer is cached on the 2nd Level ARC; + * don't destroy the header. 
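/*
 * [Editor's sketch, not part of this changeset.]  Why the header
 * survives in ARC_l2c_only rather than being destroyed: it stays in the
 * hash table, so a later read can still discover the L2ARC copy with a
 * cheap lookup, roughly:
 */
static boolean_t
arc_hdr_in_l2(spa_t *spa, const dva_t *dva, uint64_t birth)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	boolean_t inl2;

	hdr = buf_hash_find(spa, dva, birth, &hash_lock);
	if (hdr == NULL)
		return (B_FALSE);
	inl2 = (hdr->b_l2hdr != NULL);	/* device and offset live here */
	mutex_exit(hash_lock);
	return (inl2);
}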
+ */ + arc_change_state(arc_l2c_only, ab, hash_lock); + mutex_exit(hash_lock); + } else { + arc_change_state(arc_anon, ab, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(ab); + } + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); if (bytes >= 0 && bytes_deleted >= bytes) break; @@ -1256,6 +1630,12 @@ top: } mutex_exit(&state->arcs_mtx); + if (list == &state->arcs_list[ARC_BUFC_DATA] && + (bytes < 0 || bytes_deleted < bytes)) { + list = &state->arcs_list[ARC_BUFC_METADATA]; + goto top; + } + if (bufs_skipped) { ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); ASSERT(bytes >= 0); @@ -1271,38 +1651,58 @@ arc_adjust(void) { int64_t top_sz, mru_over, arc_over, todelete; - top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used; + + if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { + int64_t toevict = + MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p); + (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA); + top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + } - if (top_sz > arc_p && arc_mru->arcs_lsize > 0) { - int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p); - (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF); + if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t toevict = + MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); + (void) arc_evict(arc_mru, NULL, toevict, FALSE, + ARC_BUFC_METADATA); top_sz = arc_anon->arcs_size + arc_mru->arcs_size; } mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; if (mru_over > 0) { - if (arc_mru_ghost->arcs_lsize > 0) { - todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over); - arc_evict_ghost(arc_mru_ghost, todelete); + if (arc_mru_ghost->arcs_size > 0) { + todelete = MIN(arc_mru_ghost->arcs_size, mru_over); + arc_evict_ghost(arc_mru_ghost, NULL, todelete); } } if ((arc_over = arc_size - arc_c) > 0) { int64_t tbl_over; - if (arc_mfu->arcs_lsize > 0) { - int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over); - (void) arc_evict(arc_mfu, toevict, FALSE, - ARC_BUFC_UNDEF); + if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { + int64_t toevict = + MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); + (void) arc_evict(arc_mfu, NULL, toevict, FALSE, + ARC_BUFC_DATA); + arc_over = arc_size - arc_c; + } + + if (arc_over > 0 && + arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t toevict = + MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], + arc_over); + (void) arc_evict(arc_mfu, NULL, toevict, FALSE, + ARC_BUFC_METADATA); } - tbl_over = arc_size + arc_mru_ghost->arcs_lsize + - arc_mfu_ghost->arcs_lsize - arc_c*2; + tbl_over = arc_size + arc_mru_ghost->arcs_size + + arc_mfu_ghost->arcs_size - arc_c * 2; - if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) { - todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over); - arc_evict_ghost(arc_mfu_ghost, todelete); + if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) { + todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); + arc_evict_ghost(arc_mfu_ghost, NULL, todelete); } } } @@ -1314,7 +1714,9 @@ arc_do_user_evicts(void) while (arc_eviction_list != NULL) { arc_buf_t *buf = arc_eviction_list; arc_eviction_list = buf->b_next; + rw_enter(&buf->b_lock, RW_WRITER); buf->b_hdr = NULL; + rw_exit(&buf->b_lock); mutex_exit(&arc_eviction_mtx); if (buf->b_efunc != NULL) @@ -1329,24 +1731,40 @@ arc_do_user_evicts(void) } /* - * Flush all *evictable* data from the cache. + * Flush all *evictable* data from the cache for the given spa. * NOTE: this will not touch "active" (i.e. 
referenced) data. */ void -arc_flush(void) +arc_flush(spa_t *spa) { - while (list_head(&arc_mru->arcs_list)) - (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF); - while (list_head(&arc_mfu->arcs_list)) - (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF); + while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { + (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); + if (spa) + break; + } + while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { + (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); + if (spa) + break; + } + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { + (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); + if (spa) + break; + } + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { + (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); + if (spa) + break; + } - arc_evict_ghost(arc_mru_ghost, -1); - arc_evict_ghost(arc_mfu_ghost, -1); + arc_evict_ghost(arc_mru_ghost, spa, -1); + arc_evict_ghost(arc_mfu_ghost, spa, -1); mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); mutex_exit(&arc_reclaim_thr_lock); - ASSERT(arc_eviction_list == NULL); + ASSERT(spa || arc_eviction_list == NULL); } int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ @@ -1380,7 +1798,7 @@ arc_shrink(void) arc_adjust(); } -static int zfs_needfree = 0; +static int needfree = 0; static int arc_reclaim_needed(void) @@ -1391,13 +1809,28 @@ arc_reclaim_needed(void) #ifdef _KERNEL - if (zfs_needfree) + if (needfree) return (1); #if 0 /* + * take 'desfree' extra pages, so we reclaim sooner, rather than later + */ + extra = desfree; + + /* + * check that we're out of range of the pageout scanner. It starts to + * schedule paging if freemem is less than lotsfree and needfree. + * lotsfree is the high-water mark for pageout, and needfree is the + * number of needed free pages. We add extra pages here to make sure + * the scanner doesn't start up while we're freeing memory. + */ + if (freemem < lotsfree + needfree + extra) + return (1); + + /* * check to make sure that swapfs has enough space so that anon - * reservations can still succeeed. anon_resvmem() checks that the + * reservations can still succeed. anon_resvmem() checks that the * availrmem is greater than swapfs_minfree, and the number of reserved * swap pages. We also add a bit of extra here just to prevent * circumstances from getting really dire. @@ -1405,23 +1838,6 @@ arc_reclaim_needed(void) if (availrmem < swapfs_minfree + swapfs_reserve + extra) return (1); - /* - * If zio data pages are being allocated out of a separate heap segment, - * then check that the size of available vmem for this area remains - * above 1/4th free. This needs to be done when the size of the - * non-default segment is smaller than physical memory, so we could - * conceivably run out of VA in that segment before running out of - * physical memory. - */ - if (zio_arena != NULL) { - size_t arc_ziosize = - btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC)); - - if ((physmem > arc_ziosize) && - (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2)) - return (1); - } - #if defined(__i386) /* * If we're on an i386 platform, it's possible that we'll exhaust the @@ -1431,7 +1847,7 @@ arc_reclaim_needed(void) * can have in the system. However, this is generally fixed at 25 pages * which is so low that it's useless. In this comparison, we seek to * calculate the total heap-size, and reclaim if more than 3/4ths of the - * heap is allocated. 
(Or, in the caclulation, if less than 1/4th is + * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ if (btop(vmem_size(heap_arena, VMEM_FREE)) < @@ -1462,12 +1878,13 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) #endif #ifdef _KERNEL - /* - * First purge some DNLC entries, in case the DNLC is using - * up too much memory. - */ - dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); - + if (arc_meta_used >= arc_meta_limit) { + /* + * We are exceeding our meta-data cache limit. + * Purge some DNLC entries to release holds on meta-data. + */ + dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); + } #if defined(__i386) /* * Reclaim unused memory from all kmem caches. @@ -1477,7 +1894,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) #endif /* - * An agressive reclamation will shrink the cache size as well as + * An aggressive reclamation will shrink the cache size as well as * reap free buffers from the arc kmem caches. */ if (strat == ARC_RECLAIM_AGGR) @@ -1526,11 +1943,10 @@ arc_reclaim_thread(void *dummy __unused) /* reset the growth delay for every reclaim */ growtime = LBOLT + (arc_grow_retry * hz); - ASSERT(growtime > 0); - if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) { + if (needfree && last_reclaim == ARC_RECLAIM_CONS) { /* - * If zfs_needfree is TRUE our vm_lowmem hook + * If needfree is TRUE our vm_lowmem hook * was called and in that case we must free some * memory, so switch to aggressive mode. */ @@ -1538,11 +1954,13 @@ arc_reclaim_thread(void *dummy __unused) last_reclaim = ARC_RECLAIM_AGGR; } arc_kmem_reap_now(last_reclaim); - } else if ((growtime > 0) && ((growtime - LBOLT) <= 0)) { + arc_warm = B_TRUE; + + } else if (arc_no_grow && LBOLT >= growtime) { arc_no_grow = FALSE; } - if (zfs_needfree || + if (needfree || (2 * arc_c < arc_size + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)) arc_adjust(); @@ -1551,9 +1969,9 @@ arc_reclaim_thread(void *dummy __unused) arc_do_user_evicts(); if (arc_reclaim_needed()) { - zfs_needfree = 0; + needfree = 0; #ifdef _KERNEL - wakeup(&zfs_needfree); + wakeup(&needfree); #endif } @@ -1580,6 +1998,9 @@ arc_adapt(int bytes, arc_state_t *state) { int mult; + if (state == arc_l2c_only) + return; + ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: @@ -1634,8 +2055,25 @@ arc_adapt(int bytes, arc_state_t *state) * prior to insert. */ static int -arc_evict_needed() +arc_evict_needed(arc_buf_contents_t type) { + if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) + return (1); + +#if 0 +#ifdef _KERNEL + /* + * If zio data pages are being allocated out of a separate heap segment, + * then enforce that the size of available vmem for this area remains + * above about 1/32nd free. + */ + if (type == ARC_BUFC_DATA && zio_arena != NULL && + vmem_size(zio_arena, VMEM_FREE) < + (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) + return (1); +#endif +#endif + if (arc_reclaim_needed()) return (1); @@ -1678,14 +2116,15 @@ arc_get_data_buf(arc_buf_t *buf) * We have not yet reached cache maximum size, * just allocate a new buffer. 
*/ - if (!arc_evict_needed()) { + if (!arc_evict_needed(type)) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); + arc_space_consume(size); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + atomic_add_64(&arc_size, size); } - atomic_add_64(&arc_size, size); goto out; } @@ -1700,20 +2139,23 @@ arc_get_data_buf(arc_buf_t *buf) if (state == arc_mru || state == arc_anon) { uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; - state = (arc_p > mru_used) ? arc_mfu : arc_mru; + state = (arc_mfu->arcs_lsize[type] > 0 && + arc_p > mru_used) ? arc_mfu : arc_mru; } else { /* MFU cases */ uint64_t mfu_space = arc_c - arc_p; - state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; + state = (arc_mru->arcs_lsize[type] > 0 && + mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; } - if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { + if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); + arc_space_consume(size); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + atomic_add_64(&arc_size, size); } - atomic_add_64(&arc_size, size); ARCSTAT_BUMP(arcstat_recycle_miss); } ASSERT(buf->b_data != NULL); @@ -1728,7 +2170,7 @@ out: atomic_add_64(&hdr->b_state->arcs_size, size); if (list_link_active(&hdr->b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_refcnt)); - atomic_add_64(&hdr->b_state->arcs_lsize, size); + atomic_add_64(&hdr->b_state->arcs_lsize[type], size); } /* * If we are growing the cache, and we are adding anonymous @@ -1773,10 +2215,6 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) if ((buf->b_flags & ARC_PREFETCH) != 0) { if (refcount_count(&buf->b_refcnt) == 0) { ASSERT(list_link_active(&buf->b_arc_node)); - mutex_enter(&arc_mru->arcs_mtx); - list_remove(&arc_mru->arcs_list, buf); - list_insert_head(&arc_mru->arcs_list, buf); - mutex_exit(&arc_mru->arcs_mtx); } else { buf->b_flags &= ~ARC_PREFETCH; ARCSTAT_BUMP(arcstat_mru_hits); @@ -1836,10 +2274,6 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) if ((buf->b_flags & ARC_PREFETCH) != 0) { ASSERT(refcount_count(&buf->b_refcnt) == 0); ASSERT(list_link_active(&buf->b_arc_node)); - mutex_enter(&arc_mfu->arcs_mtx); - list_remove(&arc_mfu->arcs_list, buf); - list_insert_head(&arc_mfu->arcs_list, buf); - mutex_exit(&arc_mfu->arcs_mtx); } ARCSTAT_BUMP(arcstat_mfu_hits); buf->b_arc_access = LBOLT; @@ -1865,6 +2299,14 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) arc_change_state(new_state, buf, hash_lock); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + } else if (buf->b_state == arc_l2c_only) { + /* + * This buffer is on the 2nd Level ARC. 
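+		 * A renewed access means it is useful again, so it is
+		 * promoted back into the MFU state below.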
+		 */
+
+		buf->b_arc_access = LBOLT;
+		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+		arc_change_state(arc_mfu, buf, hash_lock);
 	} else {
 		ASSERT(!"invalid arc state");
 	}
@@ -1879,7 +2321,7 @@ arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
 	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
 }
 
-/* a generic arc_done_func_t which you can use */
+/* a generic arc_done_func_t */
 void
 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
@@ -1917,15 +2359,24 @@ arc_read_done(zio_t *zio)
 	    &hash_lock);
 
 	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
-	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
+	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+	    (found == hdr && HDR_L2_READING(hdr)));
+
+	hdr->b_flags &= ~ARC_L2_EVICTED;
+	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
+		hdr->b_flags &= ~ARC_L2CACHE;
 
 	/* byteswap if necessary */
 	callback_list = hdr->b_acb;
 	ASSERT(callback_list != NULL);
-	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
-		callback_list->acb_byteswap(buf->b_data, hdr->b_size);
+	if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
+		    byteswap_uint64_array :
+		    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
+		func(buf->b_data, hdr->b_size);
+	}
 
-	arc_cksum_compute(buf);
+	arc_cksum_compute(buf, B_FALSE);
 
 	/* create copies of the data buffer for the callers */
 	abuf = buf;
@@ -1952,9 +2403,6 @@ arc_read_done(zio_t *zio)
 		if (HDR_IN_HASH_TABLE(hdr))
 			buf_hash_remove(hdr);
 		freeable = refcount_is_zero(&hdr->b_refcnt);
-		/* convert checksum errors into IO errors */
-		if (zio->io_error == ECKSUM)
-			zio->io_error = EIO;
 	}
 
 	/*
@@ -2020,16 +2468,40 @@ arc_read_done(zio_t *zio)
  *
  * arc_read_done() will invoke all the requested "done" functions
  * for readers of this block.
+ *
+ * Normal callers should use arc_read and pass the arc buffer and offset
+ * for the bp.  But if you know you don't need locking, you can use
+ * arc_read_nolock.
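+ *
+ * A minimal caller sketch (variable names hypothetical, shown for
+ * illustration only):
+ *
+ *	uint32_t aflags = ARC_WAIT;
+ *	arc_buf_t *abuf = NULL;
+ *	(void) arc_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf,
+ *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);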
*/ int -arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, - arc_done_func_t *done, void *private, int priority, int flags, - uint32_t *arc_flags, zbookmark_t *zb) +arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + int err; + arc_buf_hdr_t *hdr = pbuf->b_hdr; + + ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); + ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); + rw_enter(&pbuf->b_lock, RW_READER); + + err = arc_read_nolock(pio, spa, bp, done, private, priority, + zio_flags, arc_flags, zb); + + ASSERT3P(hdr, ==, pbuf->b_hdr); + rw_exit(&pbuf->b_lock); + return (err); +} + +int +arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr; arc_buf_t *buf; kmutex_t *hash_lock; - zio_t *rzio; + zio_t *rzio; top: hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); @@ -2053,10 +2525,9 @@ top: KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - acb->acb_byteswap = swap; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, - spa, NULL, NULL, flags); + spa, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); acb->acb_next = hdr->b_acb; @@ -2093,6 +2564,8 @@ top: } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), @@ -2104,6 +2577,8 @@ top: } else { uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; + vdev_t *vd = NULL; + daddr_t addr; if (hdr == NULL) { /* this block is not in the cache */ @@ -2130,6 +2605,8 @@ top: private); hdr->b_flags |= ARC_PREFETCH; } + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; if (BP_GET_LEVEL(bp) > 0) hdr->b_flags |= ARC_INDIRECT; } else { @@ -2144,7 +2621,9 @@ top: hdr->b_flags |= ARC_PREFETCH; else add_reference(hdr, hash_lock, private); - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; @@ -2160,7 +2639,6 @@ top: acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - acb->acb_byteswap = swap; ASSERT(hdr->b_acb == NULL); hdr->b_acb = acb; @@ -2176,6 +2654,18 @@ top: if (GHOST_STATE(hdr->b_state)) arc_access(hdr, hash_lock); + + if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && + (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { + addr = hdr->b_l2hdr->b_daddr; + /* + * Lock out device removal. + */ + if (vdev_is_dead(vd) || + !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) + vd = NULL; + } + mutex_exit(hash_lock); ASSERT3U(hdr->b_size, ==, size); @@ -2186,8 +2676,65 @@ top: demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, misses); + if (vd != NULL) { + /* + * Read from the L2ARC if the following are true: + * 1. The L2ARC vdev was previously cached. + * 2. This buffer still has L2ARC metadata. + * 3. This buffer isn't currently writing to the L2ARC. + * 4. The L2ARC entry wasn't evicted, which may + * also have invalidated the vdev. 
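+			 *
+			 * Note that the physical read below is issued
+			 * with ZIO_CHECKSUM_OFF; the buffer's integrity
+			 * is instead verified against the ARC's own
+			 * freeze checksum in l2arc_read_done().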
+ */ + if (hdr->b_l2hdr != NULL && + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { + l2arc_read_callback_t *cb; + + DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_hits); + + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), + KM_SLEEP); + cb->l2rcb_buf = buf; + cb->l2rcb_spa = spa; + cb->l2rcb_bp = *bp; + cb->l2rcb_zb = *zb; + cb->l2rcb_flags = zio_flags; + + /* + * l2arc read. The SCL_L2ARC lock will be + * released by l2arc_read_done(). + */ + rzio = zio_read_phys(pio, vd, addr, size, + buf->b_data, ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, zio_flags | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, + zio_t *, rzio); + + if (*arc_flags & ARC_NOWAIT) { + zio_nowait(rzio); + return (0); + } + + ASSERT(*arc_flags & ARC_WAIT); + if (zio_wait(rzio) == 0) + return (0); + + /* l2arc read error; goto zio_read() */ + } else { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + if (HDR_L2_WRITING(hdr)) + ARCSTAT_BUMP(arcstat_l2_rw_clash); + spa_config_exit(spa, SCL_L2ARC, vd); + } + } + rzio = zio_read(pio, spa, bp, buf->b_data, size, - arc_read_done, buf, priority, flags, zb); + arc_read_done, buf, priority, zio_flags, zb); if (*arc_flags & ARC_WAIT) return (zio_wait(rzio)); @@ -2254,45 +2801,28 @@ arc_buf_evict(arc_buf_t *buf) kmutex_t *hash_lock; arc_buf_t **bufp; - mutex_enter(&arc_eviction_mtx); + rw_enter(&buf->b_lock, RW_WRITER); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(). */ ASSERT(buf->b_data == NULL); - mutex_exit(&arc_eviction_mtx); + rw_exit(&buf->b_lock); return (0); - } - hash_lock = HDR_LOCK(hdr); - mutex_exit(&arc_eviction_mtx); - - mutex_enter(hash_lock); - - if (buf->b_data == NULL) { + } else if (buf->b_data == NULL) { + arc_buf_t copy = *buf; /* structure assignment */ /* - * We are on the eviction list. + * We are on the eviction list; process this buffer now + * but let arc_do_user_evicts() do the reaping. */ - mutex_exit(hash_lock); - mutex_enter(&arc_eviction_mtx); - if (buf->b_hdr == NULL) { - /* - * We are already in arc_do_user_evicts(). - */ - mutex_exit(&arc_eviction_mtx); - return (0); - } else { - arc_buf_t copy = *buf; /* structure assignment */ - /* - * Process this buffer now - * but let arc_do_user_evicts() do the reaping. - */ - buf->b_efunc = NULL; - mutex_exit(&arc_eviction_mtx); - VERIFY(copy.b_efunc(©) == 0); - return (1); - } + buf->b_efunc = NULL; + rw_exit(&buf->b_lock); + VERIFY(copy.b_efunc(©) == 0); + return (1); } + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); ASSERT(buf->b_hdr == hdr); ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); @@ -2323,12 +2853,14 @@ arc_buf_evict(arc_buf_t *buf) arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags = ARC_IN_HASH_TABLE; + hdr->b_flags |= ARC_IN_HASH_TABLE; + hdr->b_flags &= ~ARC_BUF_AVAILABLE; mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&old_state->arcs_mtx); } mutex_exit(hash_lock); + rw_exit(&buf->b_lock); VERIFY(buf->b_efunc(buf) == 0); buf->b_efunc = NULL; @@ -2342,16 +2874,22 @@ arc_buf_evict(arc_buf_t *buf) * Release this buffer from the cache. This must be done * after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make - * make a new hdr for the buffer. + * a new hdr for the buffer. 
*/ void arc_release(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock = HDR_LOCK(hdr); + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + l2arc_buf_hdr_t *l2hdr; + uint64_t buf_size; + + rw_enter(&buf->b_lock, RW_WRITER); + hdr = buf->b_hdr; /* this buffer is not on any list */ ASSERT(refcount_count(&hdr->b_refcnt) > 0); + ASSERT(!(hdr->b_flags & ARC_STORED)); if (hdr->b_state == arc_anon) { /* this buffer is already released */ @@ -2359,22 +2897,32 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(BUF_EMPTY(hdr)); ASSERT(buf->b_efunc == NULL); arc_buf_thaw(buf); + rw_exit(&buf->b_lock); return; } + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + l2hdr = hdr->b_l2hdr; + if (l2hdr) { + mutex_enter(&l2arc_buflist_mtx); + hdr->b_l2hdr = NULL; + buf_size = hdr->b_size; + } + /* * Do we have more than one buf? */ - if (hdr->b_buf != buf || buf->b_next != NULL) { + if (hdr->b_datacnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; spa_t *spa = hdr->b_spa; arc_buf_contents_t type = hdr->b_type; + uint32_t flags = hdr->b_flags; - ASSERT(hdr->b_datacnt > 1); + ASSERT(hdr->b_buf != buf || buf->b_next != NULL); /* * Pull the data off of this buf and attach it to * a new anonymous buf. @@ -2389,37 +2937,39 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); if (refcount_is_zero(&hdr->b_refcnt)) { - ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size); - atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size); + uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; + ASSERT3U(*size, >=, hdr->b_size); + atomic_add_64(size, -hdr->b_size); } hdr->b_datacnt -= 1; arc_cksum_verify(buf); mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); + nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); nhdr->b_size = blksz; nhdr->b_spa = spa; nhdr->b_type = type; nhdr->b_buf = buf; nhdr->b_state = arc_anon; nhdr->b_arc_access = 0; - nhdr->b_flags = 0; + nhdr->b_flags = flags & ARC_L2_WRITING; + nhdr->b_l2hdr = NULL; nhdr->b_datacnt = 1; nhdr->b_freeze_cksum = NULL; - mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); (void) refcount_add(&nhdr->b_refcnt, tag); buf->b_hdr = nhdr; + rw_exit(&buf->b_lock); atomic_add_64(&arc_anon->arcs_size, blksz); - - hdr = nhdr; } else { + rw_exit(&buf->b_lock); ASSERT(refcount_count(&hdr->b_refcnt) == 1); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; mutex_exit(hash_lock); + bzero(&hdr->b_dva, sizeof (dva_t)); hdr->b_birth = 0; hdr->b_cksum0 = 0; @@ -2427,25 +2977,47 @@ arc_release(arc_buf_t *buf, void *tag) } buf->b_efunc = NULL; buf->b_private = NULL; + + if (l2hdr) { + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -buf_size); + mutex_exit(&l2arc_buflist_mtx); + } } int arc_released(arc_buf_t *buf) { - return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + int released; + + rw_enter(&buf->b_lock, RW_READER); + released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + rw_exit(&buf->b_lock); + return (released); } int arc_has_callback(arc_buf_t *buf) { - return (buf->b_efunc != NULL); + int callback; + + rw_enter(&buf->b_lock, RW_READER); + callback = (buf->b_efunc != NULL); + rw_exit(&buf->b_lock); + return (callback); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { - return 
(refcount_count(&buf->b_hdr->b_refcnt)); + int referenced; + + rw_enter(&buf->b_lock, RW_READER); + referenced = (refcount_count(&buf->b_hdr->b_refcnt)); + rw_exit(&buf->b_lock); + return (referenced); } #endif @@ -2454,12 +3026,27 @@ arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; - if (callback->awcb_ready) { - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); - callback->awcb_ready(zio, buf, callback->awcb_private); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); + callback->awcb_ready(zio, buf, callback->awcb_private); + + /* + * If the IO is already in progress, then this is a re-write + * attempt, so we need to thaw and re-compute the cksum. + * It is the responsibility of the callback to handle the + * accounting for any re-write attempt. + */ + if (HDR_IO_IN_PROGRESS(hdr)) { + mutex_enter(&hdr->b_freeze_lock); + if (hdr->b_freeze_cksum != NULL) { + kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_freeze_cksum = NULL; + } + mutex_exit(&hdr->b_freeze_lock); } - arc_cksum_compute(buf); + arc_cksum_compute(buf, B_FALSE); + hdr->b_flags |= ARC_IO_IN_PROGRESS; } static void @@ -2471,9 +3058,6 @@ arc_write_done(zio_t *zio) hdr->b_acb = NULL; - /* this buffer is on no lists and is not in the hash table */ - ASSERT3P(hdr->b_state, ==, arc_anon); - hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = zio->io_bp->blk_birth; hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; @@ -2496,6 +3080,7 @@ arc_write_done(zio_t *zio) * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ + ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE); ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), BP_IDENTITY(zio->io_bp))); ASSERT3U(zio->io_bp_orig.blk_birth, ==, @@ -2509,7 +3094,9 @@ arc_write_done(zio_t *zio) ASSERT3P(exists, ==, NULL); } hdr->b_flags &= ~ARC_IO_IN_PROGRESS; - arc_access(hdr, hash_lock); + /* if it's not anon, we are doing a scrub */ + if (hdr->b_state == arc_anon) + arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else if (callback->awcb_done == NULL) { int destroy_hdr; @@ -2526,6 +3113,7 @@ arc_write_done(zio_t *zio) } else { hdr->b_flags &= ~ARC_IO_IN_PROGRESS; } + hdr->b_flags &= ~ARC_STORED; if (callback->awcb_done) { ASSERT(!refcount_is_zero(&hdr->b_refcnt)); @@ -2535,31 +3123,74 @@ arc_write_done(zio_t *zio) kmem_free(callback, sizeof (arc_write_callback_t)); } +static void +write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp) +{ + boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata); + + /* Determine checksum setting */ + if (ismd) { + /* + * Metadata always gets checksummed. If the data + * checksum is multi-bit correctable, and it's not a + * ZBT-style checksum, then it's suitable for metadata + * as well. Otherwise, the metadata checksum defaults + * to fletcher4. + */ + if (zio_checksum_table[wp->wp_oschecksum].ci_correctable && + !zio_checksum_table[wp->wp_oschecksum].ci_zbt) + zp->zp_checksum = wp->wp_oschecksum; + else + zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4; + } else { + zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum, + wp->wp_oschecksum); + } + + /* Determine compression setting */ + if (ismd) { + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + zp->zp_compress = zfs_mdcomp_disable ? 
ZIO_COMPRESS_EMPTY : + ZIO_COMPRESS_LZJB; + } else { + zp->zp_compress = zio_compress_select(wp->wp_dncompress, + wp->wp_oscompress); + } + + zp->zp_type = wp->wp_type; + zp->zp_level = wp->wp_level; + zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa)); +} + zio_t * -arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, - uint64_t txg, blkptr_t *bp, arc_buf_t *buf, +arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, + boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb) + int zio_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; - zio_t *zio; + zio_t *zio; + zio_prop_t zp; - /* this is a private buffer - no locking required */ - ASSERT3P(hdr->b_state, ==, arc_anon); - ASSERT(BUF_EMPTY(hdr)); + ASSERT(ready != NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); ASSERT(hdr->b_acb == 0); + if (l2arc) + hdr->b_flags |= ARC_L2CACHE; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; - hdr->b_flags |= ARC_IO_IN_PROGRESS; - zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, - buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, - priority, flags, zb); + + write_policy(spa, wp, &zp); + zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp, + arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); return (zio); } @@ -2584,7 +3215,9 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, * nonzero, it should match what we have in the cache. */ ASSERT(bp->blk_cksum.zc_word[0] == 0 || - ab->b_cksum0 == bp->blk_cksum.zc_word[0]); + bp->blk_cksum.zc_word[0] == ab->b_cksum0 || + bp->blk_fill == BLK_FILL_ALREADY_FREED); + if (ab->b_state != arc_anon) arc_change_state(arc_anon, ab, hash_lock); if (HDR_IO_IN_PROGRESS(ab)) { @@ -2604,6 +3237,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ab->b_buf->b_private = NULL; mutex_exit(hash_lock); } else if (refcount_is_zero(&ab->b_refcnt)) { + ab->b_flags |= ARC_FREE_IN_PROGRESS; mutex_exit(hash_lock); arc_hdr_destroy(ab); ARCSTAT_BUMP(arcstat_deleted); @@ -2624,7 +3258,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, } } - zio = zio_free(pio, spa, txg, bp, done, private); + zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED); if (arc_flags & ARC_WAIT) return (zio_wait(zio)); @@ -2635,16 +3269,75 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, return (0); } +static int +arc_memory_throttle(uint64_t reserve, uint64_t txg) +{ +#ifdef _KERNEL + uint64_t inflight_data = arc_anon->arcs_size; + uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count); + static uint64_t page_load = 0; + static uint64_t last_txg = 0; + +#if 0 +#if defined(__i386) + available_memory = + MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); +#endif +#endif + if (available_memory >= zfs_write_limit_max) + return (0); + + if (txg > last_txg) { + last_txg = txg; + page_load = 0; + } + /* + * If we are in pageout, we know that memory is already tight, + * the arc is already going to be evicting, so we just want to + * continue to let page writes occur as quickly as possible. 
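+	 *
+	 * (Each pageout write below charges a deflated reserve / 8
+	 * against page_load; once page_load exceeds a quarter of
+	 * available memory, ERESTART is returned until a new txg
+	 * resets the count.)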
+	 */
+	if (curproc == pageproc) {
+		if (page_load > available_memory / 4)
+			return (ERESTART);
+		/* Note: reserve is inflated, so we deflate */
+		page_load += reserve / 8;
+		return (0);
+	} else if (page_load > 0 && arc_reclaim_needed()) {
+		/* memory is low, delay before restarting */
+		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+		return (EAGAIN);
+	}
+	page_load = 0;
+
+	if (arc_size > arc_c_min) {
+		uint64_t evictable_memory =
+		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
+		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
+		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
+		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
+	}
+
+	if (inflight_data > available_memory / 4) {
+		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+		return (ERESTART);
+	}
+#endif
+	return (0);
+}
+
 void
-arc_tempreserve_clear(uint64_t tempreserve)
+arc_tempreserve_clear(uint64_t reserve)
 {
-	atomic_add_64(&arc_tempreserve, -tempreserve);
+	atomic_add_64(&arc_tempreserve, -reserve);
 	ASSERT((int64_t)arc_tempreserve >= 0);
 }
 
 int
-arc_tempreserve_space(uint64_t tempreserve)
+arc_tempreserve_space(uint64_t reserve, uint64_t txg)
 {
+	int error;
+
 #ifdef ZFS_DEBUG
 	/*
 	 * Once in a while, fail for no reason.  Everything should cope.
@@ -2654,31 +3347,37 @@ arc_tempreserve_space(uint64_t tempreserve)
 		return (ERESTART);
 	}
 #endif
-	if (tempreserve > arc_c/4 && !arc_no_grow)
-		arc_c = MIN(arc_c_max, tempreserve * 4);
-	if (tempreserve > arc_c)
+	if (reserve > arc_c/4 && !arc_no_grow)
+		arc_c = MIN(arc_c_max, reserve * 4);
+	if (reserve > arc_c)
 		return (ENOMEM);
 
 	/*
+	 * Writes will almost always require additional memory allocations
+	 * in order to compress/encrypt/etc the data.  We therefore need to
+	 * make sure that there is sufficient available memory for this.
+	 */
+	if (error = arc_memory_throttle(reserve, txg))
+		return (error);
+
+	/*
 	 * Throttle writes when the amount of dirty data in the cache
 	 * gets too large.  We try to keep the cache less than half full
 	 * of dirty blocks so that our sync times don't grow too large.
 	 * Note: if two requests come in concurrently, we might let them
 	 * both succeed, when one of them should fail.  Not a huge deal.
-	 *
-	 * XXX The limit should be adjusted dynamically to keep the time
-	 * to sync a dataset fixed (around 1-5 seconds?).
 	 */
-
-	if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
-	    arc_tempreserve + arc_anon->arcs_size > arc_c / 4) {
-		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
-		    "tempreserve=%lluK arc_c=%lluK\n",
-		    arc_tempreserve>>10, arc_anon->arcs_lsize>>10,
-		    tempreserve>>10, arc_c>>10);
+	if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
+	    arc_anon->arcs_size > arc_c / 4) {
+		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
+		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
+		    arc_tempreserve>>10,
+		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
+		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
+		    reserve>>10, arc_c>>10);
 		return (ERESTART);
 	}
-	atomic_add_64(&arc_tempreserve, tempreserve);
+	atomic_add_64(&arc_tempreserve, reserve);
 	return (0);
 }
 
@@ -2692,10 +3391,10 @@ arc_lowmem(void *arg __unused, int howto __unused)
 
 	/* Serialize access via arc_lowmem_lock.
*/ mutex_enter(&arc_lowmem_lock); - zfs_needfree = 1; + needfree = 1; cv_signal(&arc_reclaim_thr_cv); - while (zfs_needfree) - tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5); + while (needfree) + tsleep(&needfree, 0, "zfs:lowmem", hz / 5); mutex_exit(&arc_lowmem_lock); } #endif @@ -2743,6 +3442,16 @@ arc_init(void) arc_c = arc_c_max; arc_p = (arc_c >> 1); + /* limit meta-data to 1/4 of the arc capacity */ + arc_meta_limit = arc_c_max / 4; + + /* Allow the tunable to override if it is reasonable */ + if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) + arc_meta_limit = zfs_arc_meta_limit; + + if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) + arc_c_min = arc_meta_limit / 2; + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -2757,6 +3466,7 @@ arc_init(void) arc_mru_ghost = &ARC_mru_ghost; arc_mfu = &ARC_mfu; arc_mfu_ghost = &ARC_mfu_ghost; + arc_l2c_only = &ARC_l2c_only; arc_size = 0; mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -2764,15 +3474,28 @@ arc_init(void) mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - - list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); + mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + + list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); buf_init(); @@ -2798,6 +3521,13 @@ arc_init(void) #endif arc_dead = FALSE; + arc_warm = B_FALSE; + + if (zfs_write_limit_max == 0) + zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; + else + zfs_write_limit_shift = 0; + mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); #ifdef _KERNEL /* Warn about ZFS memory and address space requirements. 
*/ @@ -2808,9 +3538,9 @@ arc_init(void) if (kmem_size() < 512 * (1 << 20)) { printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " "expect unstable behavior.\n"); - printf(" Consider tuning vm.kmem_size and " + printf(" Consider tuning vm.kmem_size and " "vm.kmem_size_max\n"); - printf(" in /boot/loader.conf.\n"); + printf(" in /boot/loader.conf.\n"); } #endif } @@ -2818,6 +3548,7 @@ arc_init(void) void arc_fini(void) { + mutex_enter(&arc_reclaim_thr_lock); arc_thread_exit = 1; cv_signal(&arc_reclaim_thr_cv); @@ -2825,7 +3556,7 @@ arc_fini(void) cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); mutex_exit(&arc_reclaim_thr_lock); - arc_flush(); + arc_flush(NULL); arc_dead = TRUE; @@ -2838,10 +3569,14 @@ arc_fini(void) mutex_destroy(&arc_reclaim_thr_lock); cv_destroy(&arc_reclaim_thr_cv); - list_destroy(&arc_mru->arcs_list); - list_destroy(&arc_mru_ghost->arcs_list); - list_destroy(&arc_mfu->arcs_list); - list_destroy(&arc_mfu_ghost->arcs_list); + list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); mutex_destroy(&arc_anon->arcs_mtx); mutex_destroy(&arc_mru->arcs_mtx); @@ -2849,6 +3584,8 @@ arc_fini(void) mutex_destroy(&arc_mfu->arcs_mtx); mutex_destroy(&arc_mfu_ghost->arcs_mtx); + mutex_destroy(&zfs_write_limit_lock); + buf_fini(); mutex_destroy(&arc_lowmem_lock); @@ -2857,3 +3594,985 @@ arc_fini(void) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); #endif } + +/* + * Level 2 ARC + * + * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. + * It uses dedicated storage devices to hold cached data, which are populated + * using large infrequent writes. The main role of this cache is to boost + * the performance of random read workloads. The intended L2ARC devices + * include short-stroked disks, solid state disks, and other media with + * substantially faster read latency than disk. + * + * +-----------------------+ + * | ARC | + * +-----------------------+ + * | ^ ^ + * | | | + * l2arc_feed_thread() arc_read() + * | | | + * | l2arc read | + * V | | + * +---------------+ | + * | L2ARC | | + * +---------------+ | + * | ^ | + * l2arc_write() | | + * | | | + * V | | + * +-------+ +-------+ + * | vdev | | vdev | + * | cache | | cache | + * +-------+ +-------+ + * +=========+ .-----. + * : L2ARC : |-_____-| + * : devices : | Disks | + * +=========+ `-_____-' + * + * Read requests are satisfied from the following sources, in order: + * + * 1) ARC + * 2) vdev cache of L2ARC devices + * 3) L2ARC devices + * 4) vdev cache of disks + * 5) disks + * + * Some L2ARC device types exhibit extremely slow write performance. + * To accommodate for this there are some significant differences between + * the L2ARC and traditional cache design: + * + * 1. There is no eviction path from the ARC to the L2ARC. Evictions from + * the ARC behave as usual, freeing buffers and placing headers on ghost + * lists. The ARC does not send buffers to the L2ARC during eviction as + * this would add inflated write latencies for all ARC memory pressure. + * + * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 
+ * It does this by periodically scanning buffers from the eviction-end of + * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are + * not already there. It scans until a headroom of buffers is satisfied, + * which itself is a buffer for ARC eviction. The thread that does this is + * l2arc_feed_thread(), illustrated below; example sizes are included to + * provide a better sense of ratio than this diagram: + * + * head --> tail + * +---------------------+----------+ + * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC + * +---------------------+----------+ | o L2ARC eligible + * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer + * +---------------------+----------+ | + * 15.9 Gbytes ^ 32 Mbytes | + * headroom | + * l2arc_feed_thread() + * | + * l2arc write hand <--[oooo]--' + * | 8 Mbyte + * | write max + * V + * +==============================+ + * L2ARC dev |####|#|###|###| |####| ... | + * +==============================+ + * 32 Gbytes + * + * 3. If an ARC buffer is copied to the L2ARC but then hit instead of + * evicted, then the L2ARC has cached a buffer much sooner than it probably + * needed to, potentially wasting L2ARC device bandwidth and storage. It is + * safe to say that this is an uncommon case, since buffers at the end of + * the ARC lists have moved there due to inactivity. + * + * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, + * then the L2ARC simply misses copying some buffers. This serves as a + * pressure valve to prevent heavy read workloads from both stalling the ARC + * with waits and clogging the L2ARC with writes. This also helps prevent + * the potential for the L2ARC to churn if it attempts to cache content too + * quickly, such as during backups of the entire pool. + * + * 5. After system boot and before the ARC has filled main memory, there are + * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru + * lists can remain mostly static. Instead of searching from tail of these + * lists as pictured, the l2arc_feed_thread() will search from the list heads + * for eligible buffers, greatly increasing its chance of finding them. + * + * The L2ARC device write speed is also boosted during this time so that + * the L2ARC warms up faster. Since there have been no ARC evictions yet, + * there are no L2ARC reads, and no fear of degrading read performance + * through increased writes. + * + * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that + * the vdev queue can aggregate them into larger and fewer writes. Each + * device is written to in a rotor fashion, sweeping writes through + * available space then repeating. + * + * 7. The L2ARC does not store dirty content. It never needs to flush + * write buffers back to disk based storage. + * + * 8. If an ARC buffer is written (and dirtied) which also exists in the + * L2ARC, the now stale L2ARC buffer is immediately dropped. + * + * The performance of the L2ARC can be tweaked by a number of tunables, which + * may be necessary for different workloads: + * + * l2arc_write_max max write bytes per interval + * l2arc_write_boost extra write bytes during device warmup + * l2arc_noprefetch skip caching prefetched buffers + * l2arc_headroom number of max device writes to precache + * l2arc_feed_secs seconds between L2ARC writing + * + * Tunables may be removed or added as future performance improvements are + * integrated, and also may become zpool properties. 
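+ *
+ * As a rough worked example (assuming the shipped defaults of an
+ * 8 Mbyte l2arc_write_max, an 8 Mbyte l2arc_write_boost and a one
+ * second l2arc_feed_secs), each cache device would be fed at up to
+ * 16 Mbytes/sec while arc_warm is false, and at up to 8 Mbytes/sec
+ * thereafter.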
+ */ + +static void +l2arc_hdr_stat_add(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); +} + +static void +l2arc_hdr_stat_remove(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); + ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); +} + +/* + * Cycle through L2ARC devices. This is how L2ARC load balances. + * If a device is returned, this also returns holding the spa config lock. + */ +static l2arc_dev_t * +l2arc_dev_get_next(void) +{ + l2arc_dev_t *first, *next = NULL; + + /* + * Lock out the removal of spas (spa_namespace_lock), then removal + * of cache devices (l2arc_dev_mtx). Once a device has been selected, + * both locks will be dropped and a spa config lock held instead. + */ + mutex_enter(&spa_namespace_lock); + mutex_enter(&l2arc_dev_mtx); + + /* if there are no vdevs, there is nothing to do */ + if (l2arc_ndev == 0) + goto out; + + first = NULL; + next = l2arc_dev_last; + do { + /* loop around the list looking for a non-faulted vdev */ + if (next == NULL) { + next = list_head(l2arc_dev_list); + } else { + next = list_next(l2arc_dev_list, next); + if (next == NULL) + next = list_head(l2arc_dev_list); + } + + /* if we have come back to the start, bail out */ + if (first == NULL) + first = next; + else if (next == first) + break; + + } while (vdev_is_dead(next->l2ad_vdev)); + + /* if we were unable to find any usable vdevs, return NULL */ + if (vdev_is_dead(next->l2ad_vdev)) + next = NULL; + + l2arc_dev_last = next; + +out: + mutex_exit(&l2arc_dev_mtx); + + /* + * Grab the config lock to prevent the 'next' device from being + * removed while we are writing to it. + */ + if (next != NULL) + spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); + mutex_exit(&spa_namespace_lock); + + return (next); +} + +/* + * Free buffers that were tagged for destruction. + */ +static void +l2arc_do_free_on_write() +{ + list_t *buflist; + l2arc_data_free_t *df, *df_prev; + + mutex_enter(&l2arc_free_on_write_mtx); + buflist = l2arc_free_on_write; + + for (df = list_tail(buflist); df; df = df_prev) { + df_prev = list_prev(buflist, df); + ASSERT(df->l2df_data != NULL); + ASSERT(df->l2df_func != NULL); + df->l2df_func(df->l2df_data, df->l2df_size); + list_remove(buflist, df); + kmem_free(df, sizeof (l2arc_data_free_t)); + } + + mutex_exit(&l2arc_free_on_write_mtx); +} + +/* + * A write to a cache device has completed. Update all headers to allow + * reads from these buffers to begin. + */ +static void +l2arc_write_done(zio_t *zio) +{ + l2arc_write_callback_t *cb; + l2arc_dev_t *dev; + list_t *buflist; + arc_buf_hdr_t *head, *ab, *ab_prev; + l2arc_buf_hdr_t *abl2; + kmutex_t *hash_lock; + + cb = zio->io_private; + ASSERT(cb != NULL); + dev = cb->l2wcb_dev; + ASSERT(dev != NULL); + head = cb->l2wcb_head; + ASSERT(head != NULL); + buflist = dev->l2ad_buflist; + ASSERT(buflist != NULL); + DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, + l2arc_write_callback_t *, cb); + + if (zio->io_error != 0) + ARCSTAT_BUMP(arcstat_l2_writes_error); + + mutex_enter(&l2arc_buflist_mtx); + + /* + * All writes completed, or an error was hit. + */ + for (ab = list_prev(buflist, head); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * This buffer misses out. It may be in a stage + * of eviction. Its ARC_L2_WRITING flag will be + * left set, denying reads to this buffer. 
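+			 * (l2arc_evict() clears such a leftover
+			 * ARC_L2_WRITING flag when this region of the
+			 * device is eventually reclaimed.)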
+ */ + ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); + continue; + } + + if (zio->io_error != 0) { + /* + * Error - drop L2ARC entry. + */ + list_remove(buflist, ab); + abl2 = ab->b_l2hdr; + ab->b_l2hdr = NULL; + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + } + + /* + * Allow ARC to begin reads to this L2ARC entry. + */ + ab->b_flags &= ~ARC_L2_WRITING; + + mutex_exit(hash_lock); + } + + atomic_inc_64(&l2arc_writes_done); + list_remove(buflist, head); + kmem_cache_free(hdr_cache, head); + mutex_exit(&l2arc_buflist_mtx); + + l2arc_do_free_on_write(); + + kmem_free(cb, sizeof (l2arc_write_callback_t)); +} + +/* + * A read to a cache device completed. Validate buffer contents before + * handing over to the regular ARC routines. + */ +static void +l2arc_read_done(zio_t *zio) +{ + l2arc_read_callback_t *cb; + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + kmutex_t *hash_lock; + int equal; + + ASSERT(zio->io_vd != NULL); + ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); + + spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); + + cb = zio->io_private; + ASSERT(cb != NULL); + buf = cb->l2rcb_buf; + ASSERT(buf != NULL); + hdr = buf->b_hdr; + ASSERT(hdr != NULL); + + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + /* + * Check this survived the L2ARC journey. + */ + equal = arc_cksum_equal(buf); + if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + mutex_exit(hash_lock); + zio->io_private = buf; + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + arc_read_done(zio); + } else { + mutex_exit(hash_lock); + /* + * Buffer didn't survive caching. Increment stats and + * reissue to the original storage device. + */ + if (zio->io_error != 0) { + ARCSTAT_BUMP(arcstat_l2_io_error); + } else { + zio->io_error = EIO; + } + if (!equal) + ARCSTAT_BUMP(arcstat_l2_cksum_bad); + + /* + * If there's no waiter, issue an async i/o to the primary + * storage now. If there *is* a waiter, the caller must + * issue the i/o in a context where it's OK to block. + */ + if (zio->io_waiter == NULL) + zio_nowait(zio_read(zio->io_parent, + cb->l2rcb_spa, &cb->l2rcb_bp, + buf->b_data, zio->io_size, arc_read_done, buf, + zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + } + + kmem_free(cb, sizeof (l2arc_read_callback_t)); +} + +/* + * This is the list priority from which the L2ARC will search for pages to + * cache. This is used within loops (0..3) to cycle through lists in the + * desired order. This order can have a significant effect on cache + * performance. + * + * Currently the metadata lists are hit first, MFU then MRU, followed by + * the data lists. This function returns a locked list, and also returns + * the lock pointer. + */ +static list_t * +l2arc_list_locked(int list_num, kmutex_t **lock) +{ + list_t *list; + + ASSERT(list_num >= 0 && list_num <= 3); + + switch (list_num) { + case 0: + list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; + *lock = &arc_mfu->arcs_mtx; + break; + case 1: + list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; + *lock = &arc_mru->arcs_mtx; + break; + case 2: + list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; + *lock = &arc_mfu->arcs_mtx; + break; + case 3: + list = &arc_mru->arcs_list[ARC_BUFC_DATA]; + *lock = &arc_mru->arcs_mtx; + break; + } + + ASSERT(!(MUTEX_HELD(*lock))); + mutex_enter(*lock); + return (list); +} + +/* + * Evict buffers from the device write hand to the distance specified in + * bytes. This distance may span populated buffers, it may span nothing. 
+ * This is clearing a region on the L2ARC device ready for writing. + * If the 'all' boolean is set, every buffer is evicted. + */ +static void +l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) +{ + list_t *buflist; + l2arc_buf_hdr_t *abl2; + arc_buf_hdr_t *ab, *ab_prev; + kmutex_t *hash_lock; + uint64_t taddr; + + buflist = dev->l2ad_buflist; + + if (buflist == NULL) + return; + + if (!all && dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. + */ + return; + } + + if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { + /* + * When nearing the end of the device, evict to the end + * before the device write hand jumps to the start. + */ + taddr = dev->l2ad_end; + } else { + taddr = dev->l2ad_hand + distance; + } + DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, + uint64_t, taddr, boolean_t, all); + +top: + mutex_enter(&l2arc_buflist_mtx); + for (ab = list_tail(buflist); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * Missed the hash lock. Retry. + */ + ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); + mutex_exit(&l2arc_buflist_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; + } + + if (HDR_L2_WRITE_HEAD(ab)) { + /* + * We hit a write head node. Leave it for + * l2arc_write_done(). + */ + list_remove(buflist, ab); + mutex_exit(hash_lock); + continue; + } + + if (!all && ab->b_l2hdr != NULL && + (ab->b_l2hdr->b_daddr > taddr || + ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { + /* + * We've evicted to the target address, + * or the end of the device. + */ + mutex_exit(hash_lock); + break; + } + + if (HDR_FREE_IN_PROGRESS(ab)) { + /* + * Already on the path to destruction. + */ + mutex_exit(hash_lock); + continue; + } + + if (ab->b_state == arc_l2c_only) { + ASSERT(!HDR_L2_READING(ab)); + /* + * This doesn't exist in the ARC. Destroy. + * arc_hdr_destroy() will call list_remove() + * and decrement arcstat_l2_size. + */ + arc_change_state(arc_anon, ab, hash_lock); + arc_hdr_destroy(ab); + } else { + /* + * Invalidate issued or about to be issued + * reads, since we may be about to write + * over this location. + */ + if (HDR_L2_READING(ab)) { + ARCSTAT_BUMP(arcstat_l2_evict_reading); + ab->b_flags |= ARC_L2_EVICTED; + } + + /* + * Tell ARC this no longer exists in L2ARC. + */ + if (ab->b_l2hdr != NULL) { + abl2 = ab->b_l2hdr; + ab->b_l2hdr = NULL; + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + } + list_remove(buflist, ab); + + /* + * This may have been leftover after a + * failed write. + */ + ab->b_flags &= ~ARC_L2_WRITING; + } + mutex_exit(hash_lock); + } + mutex_exit(&l2arc_buflist_mtx); + + spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict)); + dev->l2ad_evict = taddr; +} + +/* + * Find and write ARC buffers to the L2ARC device. + * + * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid + * for reading until they have completed writing. 
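+ *
+ * As an illustration of the headroom logic (numbers hypothetical):
+ * with a target_sz of 8 Mbytes and l2arc_headroom of 2, each ARC
+ * list is scanned until either 16 Mbytes of buffers have been
+ * passed over or the 8 Mbyte write target has been filled,
+ * whichever comes first.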
+ */ +static void +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) +{ + arc_buf_hdr_t *ab, *ab_prev, *head; + l2arc_buf_hdr_t *hdrl2; + list_t *list; + uint64_t passed_sz, write_sz, buf_sz, headroom; + void *buf_data; + kmutex_t *hash_lock, *list_lock; + boolean_t have_lock, full; + l2arc_write_callback_t *cb; + zio_t *pio, *wzio; + int try; + + ASSERT(dev->l2ad_vdev != NULL); + + pio = NULL; + write_sz = 0; + full = B_FALSE; + head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + head->b_flags |= ARC_L2_WRITE_HEAD; + + /* + * Copy buffers for L2ARC writing. + */ + mutex_enter(&l2arc_buflist_mtx); + for (try = 0; try <= 3; try++) { + list = l2arc_list_locked(try, &list_lock); + passed_sz = 0; + + /* + * L2ARC fast warmup. + * + * Until the ARC is warm and starts to evict, read from the + * head of the ARC lists rather than the tail. + */ + headroom = target_sz * l2arc_headroom; + if (arc_warm == B_FALSE) + ab = list_head(list); + else + ab = list_tail(list); + + for (; ab; ab = ab_prev) { + if (arc_warm == B_FALSE) + ab_prev = list_next(list, ab); + else + ab_prev = list_prev(list, ab); + + hash_lock = HDR_LOCK(ab); + have_lock = MUTEX_HELD(hash_lock); + if (!have_lock && !mutex_tryenter(hash_lock)) { + /* + * Skip this buffer rather than waiting. + */ + continue; + } + + passed_sz += ab->b_size; + if (passed_sz > headroom) { + /* + * Searched too far. + */ + mutex_exit(hash_lock); + break; + } + + if (ab->b_spa != spa) { + mutex_exit(hash_lock); + continue; + } + + if (ab->b_l2hdr != NULL) { + /* + * Already in L2ARC. + */ + mutex_exit(hash_lock); + continue; + } + + if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) { + mutex_exit(hash_lock); + continue; + } + + if ((write_sz + ab->b_size) > target_sz) { + full = B_TRUE; + mutex_exit(hash_lock); + break; + } + + if (ab->b_buf == NULL) { + DTRACE_PROBE1(l2arc__buf__null, void *, ab); + mutex_exit(hash_lock); + continue; + } + + if (pio == NULL) { + /* + * Insert a dummy header on the buflist so + * l2arc_write_done() can find where the + * write buffers begin without searching. + */ + list_insert_head(dev->l2ad_buflist, head); + + cb = kmem_alloc( + sizeof (l2arc_write_callback_t), KM_SLEEP); + cb->l2wcb_dev = dev; + cb->l2wcb_head = head; + pio = zio_root(spa, l2arc_write_done, cb, + ZIO_FLAG_CANFAIL); + } + + /* + * Create and add a new L2ARC header. + */ + hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); + hdrl2->b_dev = dev; + hdrl2->b_daddr = dev->l2ad_hand; + + ab->b_flags |= ARC_L2_WRITING; + ab->b_l2hdr = hdrl2; + list_insert_head(dev->l2ad_buflist, ab); + buf_data = ab->b_buf->b_data; + buf_sz = ab->b_size; + + /* + * Compute and store the buffer cksum before + * writing. On debug the cksum is verified first. + */ + arc_cksum_verify(ab->b_buf); + arc_cksum_compute(ab->b_buf, B_TRUE); + + mutex_exit(hash_lock); + + wzio = zio_write_phys(pio, dev->l2ad_vdev, + dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, + NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); + + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + (void) zio_nowait(wzio); + + /* + * Keep the clock hand suitably device-aligned. 
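+			 * vdev_psize_to_asize() rounds the buffer size
+			 * up to the device's allocation size, so
+			 * l2ad_hand always advances to an aligned
+			 * offset for the next write.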
+ */ + buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); + + write_sz += buf_sz; + dev->l2ad_hand += buf_sz; + } + + mutex_exit(list_lock); + + if (full == B_TRUE) + break; + } + mutex_exit(&l2arc_buflist_mtx); + + if (pio == NULL) { + ASSERT3U(write_sz, ==, 0); + kmem_cache_free(hdr_cache, head); + return; + } + + ASSERT3U(write_sz, <=, target_sz); + ARCSTAT_BUMP(arcstat_l2_writes_sent); + ARCSTAT_INCR(arcstat_l2_size, write_sz); + spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz); + + /* + * Bump device hand to the device start if it is approaching the end. + * l2arc_evict() will already have evicted ahead for this case. + */ + if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { + spa_l2cache_space_update(dev->l2ad_vdev, 0, + dev->l2ad_end - dev->l2ad_hand); + dev->l2ad_hand = dev->l2ad_start; + dev->l2ad_evict = dev->l2ad_start; + dev->l2ad_first = B_FALSE; + } + + (void) zio_wait(pio); +} + +/* + * This thread feeds the L2ARC at regular intervals. This is the beating + * heart of the L2ARC. + */ +static void +l2arc_feed_thread(void *dummy __unused) +{ + callb_cpr_t cpr; + l2arc_dev_t *dev; + spa_t *spa; + uint64_t size; + + CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&l2arc_feed_thr_lock); + + while (l2arc_thread_exit == 0) { + /* + * Pause for l2arc_feed_secs seconds between writes. + */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, + hz * l2arc_feed_secs); + CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); + + /* + * Quick check for L2ARC devices. + */ + mutex_enter(&l2arc_dev_mtx); + if (l2arc_ndev == 0) { + mutex_exit(&l2arc_dev_mtx); + continue; + } + mutex_exit(&l2arc_dev_mtx); + + /* + * This selects the next l2arc device to write to, and in + * doing so the next spa to feed from: dev->l2ad_spa. This + * will return NULL if there are now no l2arc devices or if + * they are all faulted. + * + * If a device is returned, its spa's config lock is also + * held to prevent device removal. l2arc_dev_get_next() + * will grab and release l2arc_dev_mtx. + */ + if ((dev = l2arc_dev_get_next()) == NULL) + continue; + + spa = dev->l2ad_spa; + ASSERT(spa != NULL); + + /* + * Avoid contributing to memory pressure. + */ + if (arc_reclaim_needed()) { + ARCSTAT_BUMP(arcstat_l2_abort_lowmem); + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + ARCSTAT_BUMP(arcstat_l2_feeds); + + size = dev->l2ad_write; + if (arc_warm == B_FALSE) + size += dev->l2ad_boost; + + /* + * Evict L2ARC buffers that will be overwritten. + */ + l2arc_evict(dev, size, B_FALSE); + + /* + * Write ARC buffers. + */ + l2arc_write_buffers(spa, dev, size); + spa_config_exit(spa, SCL_L2ARC, dev); + } + + l2arc_thread_exit = 0; + cv_broadcast(&l2arc_feed_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ + thread_exit(); +} + +boolean_t +l2arc_vdev_present(vdev_t *vd) +{ + l2arc_dev_t *dev; + + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev != NULL; + dev = list_next(l2arc_dev_list, dev)) { + if (dev->l2ad_vdev == vd) + break; + } + mutex_exit(&l2arc_dev_mtx); + + return (dev != NULL); +} + +/* + * Add a vdev for use by the L2ARC. By this point the spa has already + * validated the vdev and opened it. + */ +void +l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) +{ + l2arc_dev_t *adddev; + + ASSERT(!l2arc_vdev_present(vd)); + + /* + * Create a new l2arc device entry. 
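+	 * The per-device write size and boost are seeded from the
+	 * global tunables, so a device picks up whatever values are
+	 * current at the time it is added.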
+ */ + adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); + adddev->l2ad_spa = spa; + adddev->l2ad_vdev = vd; + adddev->l2ad_write = l2arc_write_max; + adddev->l2ad_boost = l2arc_write_boost; + adddev->l2ad_start = start; + adddev->l2ad_end = end; + adddev->l2ad_hand = adddev->l2ad_start; + adddev->l2ad_evict = adddev->l2ad_start; + adddev->l2ad_first = B_TRUE; + ASSERT3U(adddev->l2ad_write, >, 0); + + /* + * This is a list of all ARC buffers that are still valid on the + * device. + */ + adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); + list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l2node)); + + spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0); + + /* + * Add device to global list + */ + mutex_enter(&l2arc_dev_mtx); + list_insert_head(l2arc_dev_list, adddev); + atomic_inc_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); +} + +/* + * Remove a vdev from the L2ARC. + */ +void +l2arc_remove_vdev(vdev_t *vd) +{ + l2arc_dev_t *dev, *nextdev, *remdev = NULL; + + /* + * Find the device by vdev + */ + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { + nextdev = list_next(l2arc_dev_list, dev); + if (vd == dev->l2ad_vdev) { + remdev = dev; + break; + } + } + ASSERT(remdev != NULL); + + /* + * Remove device from global list + */ + list_remove(l2arc_dev_list, remdev); + l2arc_dev_last = NULL; /* may have been invalidated */ + atomic_dec_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); + + /* + * Clear all buflists and ARC references. L2ARC device flush. + */ + l2arc_evict(remdev, 0, B_TRUE); + list_destroy(remdev->l2ad_buflist); + kmem_free(remdev->l2ad_buflist, sizeof (list_t)); + kmem_free(remdev, sizeof (l2arc_dev_t)); +} + +void +l2arc_init(void) +{ + l2arc_thread_exit = 0; + l2arc_ndev = 0; + l2arc_writes_sent = 0; + l2arc_writes_done = 0; + + mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); + + l2arc_dev_list = &L2ARC_dev_list; + l2arc_free_on_write = &L2ARC_free_on_write; + list_create(l2arc_dev_list, sizeof (l2arc_dev_t), + offsetof(l2arc_dev_t, l2ad_node)); + list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), + offsetof(l2arc_data_free_t, l2df_list_node)); +} + +void +l2arc_fini(void) +{ + /* + * This is called from dmu_fini(), which is called from spa_fini(); + * Because of this, we can assume that all l2arc devices have + * already been removed when the pools themselves were removed. 
+ */ + + l2arc_do_free_on_write(); + + mutex_destroy(&l2arc_feed_thr_lock); + cv_destroy(&l2arc_feed_thr_cv); + mutex_destroy(&l2arc_dev_mtx); + mutex_destroy(&l2arc_buflist_mtx); + mutex_destroy(&l2arc_free_on_write_mtx); + + list_destroy(l2arc_dev_list); + list_destroy(l2arc_free_on_write); +} + +void +l2arc_start(void) +{ + if (!(spa_mode & FWRITE)) + return; + + (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); +} + +void +l2arc_stop(void) +{ + if (!(spa_mode & FWRITE)) + return; + + mutex_enter(&l2arc_feed_thr_lock); + cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ + l2arc_thread_exit = 1; + while (l2arc_thread_exit != 0) + cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); + mutex_exit(&l2arc_feed_thr_lock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c index 4442b1f28ac8..93b7741d77be 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/bplist.h> #include <sys/zfs_context.h> @@ -47,7 +45,7 @@ bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx) { int size; - size = spa_version(dmu_objset_spa(mos)) < ZFS_VERSION_BPLIST_ACCOUNT ? + size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ? BPLIST_SIZE_V0 : sizeof (bplist_phys_t); return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize, @@ -181,7 +179,7 @@ bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp) } int -bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx) +bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx) { uint64_t blk, off; blkptr_t *bparray; @@ -229,7 +227,7 @@ bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx) * Deferred entry; will be written later by bplist_sync(). */ void -bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp) +bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp) { bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP); @@ -278,9 +276,7 @@ bplist_vacate(bplist_t *bpl, dmu_tx_t *tx) int bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { - uint64_t itor = 0, comp = 0, uncomp = 0; int err; - blkptr_t bp; mutex_enter(&bpl->bpl_lock); @@ -298,6 +294,9 @@ bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) mutex_exit(&bpl->bpl_lock); if (!bpl->bpl_havecomp) { + uint64_t itor = 0, comp = 0, uncomp = 0; + blkptr_t bp; + while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { comp += BP_GET_PSIZE(&bp); uncomp += BP_GET_UCSIZE(&bp); @@ -310,3 +309,41 @@ bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) return (err); } + +/* + * Return (in *dasizep) the amount of space on the deadlist which is: + * mintxg < blk_birth <= maxtxg + */ +int +bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg, + uint64_t *dasizep) +{ + uint64_t size = 0; + uint64_t itor = 0; + blkptr_t bp; + int err; + + /* + * As an optimization, if they want the whole txg range, just + * get bpl_bytes rather than iterating over the bps. 
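The iteration below charges a block to the range exactly when mintxg < blk_birth <= maxtxg. The predicate in isolation, as a sketch (the helper name is assumed):

static boolean_t
birth_in_range(uint64_t birth, uint64_t mintxg, uint64_t maxtxg)
{
	/* open at the low bound, closed at the high bound */
	return (birth > mintxg && birth <= maxtxg);
}

Since every allocated block is born at or after TXG_INITIAL, a mintxg below TXG_INITIAL together with maxtxg == UINT64_MAX matches everything, which is why the fast path can return bpl_bytes without iterating.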
+ */ + if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) { + mutex_enter(&bpl->bpl_lock); + err = bplist_hold(bpl); + if (err == 0) + *dasizep = bpl->bpl_phys->bpl_bytes; + mutex_exit(&bpl->bpl_lock); + return (err); + } + + while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { + if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) { + size += + bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp); + } + } + if (err == ENOENT) + err = 0; + *dasizep = size; + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 94c63081478a..2494c1e7f9d1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/dmu.h> #include <sys/dmu_impl.h> @@ -39,17 +37,10 @@ static void dbuf_destroy(dmu_buf_impl_t *db); static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, - int compress, dmu_tx_t *tx); +static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static arc_done_func_t dbuf_write_ready; static arc_done_func_t dbuf_write_done; -int zfs_mdcomp_disable = 0; -SYSCTL_DECL(_vfs_zfs); -TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); -SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, - &zfs_mdcomp_disable, 0, "Disable metadata compression"); - /* * Global data structures and functions for the dbuf cache. */ @@ -311,7 +302,7 @@ dbuf_verify(dmu_buf_impl_t *db) } if (db->db_blkid == DB_BONUS_BLKID) { ASSERT(dn != NULL); - ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); @@ -460,45 +451,45 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { - blkptr_t *bp; + dnode_t *dn = db->db_dnode; zbookmark_t zb; uint32_t aflags = ARC_NOWAIT; + arc_buf_t *pbuf; ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ - ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); if (db->db_blkid == DB_BONUS_BLKID) { - ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); + int bonuslen = dn->dn_bonuslen; + + ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); - if (db->db.db_size < DN_MAX_BONUSLEN) + arc_space_consume(DN_MAX_BONUSLEN); + if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); - bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data, - db->db.db_size); + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, + bonuslen); dbuf_update_data(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return; } - if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) - bp = NULL; - else - bp = db->db_blkptr; - - if (bp == NULL) - dprintf_dbuf(db, "blkptr: %s\n", "NULL"); - else - dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); - - if (bp == NULL || BP_IS_HOLE(bp)) { + /* + * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() + * processes the delete record and clears the bp while we are waiting + * for the dn_mtx (resulting in a "no" from block_freed). + */ + if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || + (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || + BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - ASSERT(bp == NULL || BP_IS_HOLE(bp)); - dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; @@ -510,6 +501,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) db->db_state = DB_READ; mutex_exit(&db->db_mtx); + if (DBUF_IS_L2CACHEABLE(db)) + aflags |= ARC_L2CACHE; + zb.zb_objset = db->db_objset->os_dsl_dataset ? db->db_objset->os_dsl_dataset->ds_object : 0; zb.zb_object = db->db.db_object; @@ -518,10 +512,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) dbuf_add_ref(db, NULL); /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ - ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES); - (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, - db->db_level > 0 ? byteswap_uint64_array : - dmu_ot[db->db_dnode->dn_type].ot_byteswap, + + if (db->db_parent) + pbuf = db->db_parent->db_buf; + else + pbuf = db->db_objset->os_phys_buf; + + (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? 
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); @@ -546,7 +543,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL; + (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL && + DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { @@ -661,6 +659,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) if (db->db_blkid == DB_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; @@ -690,7 +689,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) /* free this block */ if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { /* XXX can get silent EIO here */ - (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, + (void) dsl_free(NULL, + spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; @@ -705,22 +705,50 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) arc_release(dr->dt.dl.dr_data, db); } +/* + * Evict (if it's unreferenced) or clear (if it's referenced) any level-0 + * data blocks in the free range, so that any future readers will find + * empty blocks. Also, if we happen across any level-1 dbufs in the + * range that have not already been marked dirty, mark them dirty so + * they stay in memory. + */ void -dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) { dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t first_l1 = start >> epbs; + uint64_t last_l1 = end >> epbs; - dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks); + if (end > dn->dn_maxblkid) { + end = dn->dn_maxblkid; + last_l1 = end >> epbs; + } + dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); mutex_enter(&dn->dn_dbufs_mtx); for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DB_BONUS_BLKID); + + if (db->db_level == 1 && + db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { + mutex_enter(&db->db_mtx); + if (db->db_last_dirty && + db->db_last_dirty->dr_txg < txg) { + dbuf_add_ref(db, FTAG); + mutex_exit(&db->db_mtx); + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } else { + mutex_exit(&db->db_mtx); + } + } + if (db->db_level != 0) continue; dprintf_dbuf(db, "found buf %s\n", ""); - if (db->db_blkid < blkid || - db->db_blkid >= blkid+nblks) + if (db->db_blkid < start || db->db_blkid > end) continue; /* found a level 0 buffer in the range */ @@ -783,31 +811,28 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) } static int -dbuf_new_block(dmu_buf_impl_t *db) +dbuf_block_freeable(dmu_buf_impl_t *db) { dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; uint64_t birth_txg = 0; - /* Don't count meta-objects */ - if (ds == NULL) - return (FALSE); - /* * We don't need any locking to protect db_blkptr: * If it's syncing, then db_last_dirty will be set * so we'll ignore db_blkptr.
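A worked instance of the epbs arithmetic in dbuf_free_range() above, with assumed example values: SPA_BLKPTRSHIFT is 7 (a 128-byte blkptr_t), so an indirect block of 16K (dn_indblkshift = 14) gives epbs = 14 - 7 = 7, and each level-1 block covers 1 << 7 = 128 level-0 blocks. A free range over L0 blkids 0..127 therefore maps to first_l1 = last_l1 = 0, and only that single level-1 dbuf needs to be dirtied.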
*/ ASSERT(MUTEX_HELD(&db->db_mtx)); - /* If we have been dirtied since the last snapshot, its not new */ if (db->db_last_dirty) birth_txg = db->db_last_dirty->dr_txg; else if (db->db_blkptr) birth_txg = db->db_blkptr->blk_birth; + /* If we don't exist or are in a snapshot, we can't be freed */ if (birth_txg) - return (!dsl_dataset_block_freeable(ds, birth_txg)); + return (ds == NULL || + dsl_dataset_block_freeable(ds, birth_txg)); else - return (TRUE); + return (FALSE); } void @@ -865,6 +890,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) objset_impl_t *os = dn->dn_objset; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; + boolean_t do_free_accounting = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(tx->tx_txg != 0); @@ -922,20 +948,20 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) drp = &db->db_last_dirty; ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); - while (*drp && (*drp)->dr_txg > tx->tx_txg) - drp = &(*drp)->dr_next; - if (*drp && (*drp)->dr_txg == tx->tx_txg) { + while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) + drp = &dr->dr_next; + if (dr && dr->dr_txg == tx->tx_txg) { if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ - dbuf_unoverride(*drp); + dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT) arc_buf_thaw(db->db_buf); } mutex_exit(&db->db_mtx); - return (*drp); + return (dr); } /* @@ -966,6 +992,18 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + if (db->db_blkid != DB_BONUS_BLKID) { + /* + * Update the accounting. + * Note: we delay "free accounting" until after we drop + * the db_mtx. This keeps us from grabbing other locks + * (and possibly deadlocking) in bp_get_dasize() while + * also holding the db_mtx. + */ + dnode_willuse_space(dn, db->db.db_size, tx); + do_free_accounting = dbuf_block_freeable(db); + } + /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this @@ -1015,25 +1053,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_freed_in_flight = FALSE; } - if (db->db_blkid != DB_BONUS_BLKID) { - /* - * Update the accounting. - */ - if (!dbuf_new_block(db) && db->db_blkptr) { - /* - * This is only a guess -- if the dbuf is dirty - * in a previous txg, we don't know how much - * space it will use on disk yet. We should - * really have the struct_rwlock to access - * db_blkptr, but since this is just a guess, - * it's OK if we get an odd answer. - */ - dnode_willuse_space(dn, - -bp_get_dasize(os->os_spa, db->db_blkptr), tx); - } - dnode_willuse_space(dn, db->db.db_size, tx); - } - /* * This buffer is now part of this txg */ @@ -1050,11 +1069,19 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); return (dr); - } - - if (db->db_level == 0) { - dnode_new_blkid(dn, db->db_blkid, tx); - ASSERT(dn->dn_maxblkid >= db->db_blkid); + } else if (do_free_accounting) { + blkptr_t *bp = db->db_blkptr; + int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? + bp_get_dasize(os->os_spa, bp) : db->db.db_size; + /* + * This is only a guess -- if the dbuf is dirty + * in a previous txg, we don't know how much + * space it will use on disk yet. We should + * really have the struct_rwlock to access + * db_blkptr, but since this is just a guess, + * it's OK if we get an odd answer. 
+ */ + dnode_willuse_space(dn, -willfree, tx); } if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { @@ -1062,6 +1089,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) drop_struct_lock = TRUE; } + if (db->db_level == 0) { + dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); + ASSERT(dn->dn_maxblkid >= db->db_blkid); + } + if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; @@ -1115,7 +1147,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn = db->db_dnode; uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t *dr; + dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); ASSERT(db->db_blkid != DB_BONUS_BLKID); @@ -1125,7 +1157,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * If this buffer is not dirty, we're done. */ - for (dr = db->db_last_dirty; dr; dr = dr->dr_next) + for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; if (dr == NULL || dr->dr_txg < txg) { @@ -1155,14 +1187,14 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* XXX would be nice to fix up dn_towrite_space[] */ - db->db_last_dirty = dr->dr_next; + *drp = dr->dr_next; if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_level+1 == dn->dn_nlevels) { - ASSERT3P(db->db_parent, ==, dn->dn_dbuf); + ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); @@ -1178,8 +1210,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } else { ASSERT(db->db_buf != NULL); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -1204,7 +1236,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - int rf = DB_RF_MUST_SUCCEED; + int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); @@ -1282,8 +1314,10 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DB_BONUS_BLKID) + if (db->db_blkid == DB_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN); + } db->db.db_data = NULL; db->db_state = DB_UNCACHED; } @@ -1297,6 +1331,7 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { list_remove(&dn->dn_dbufs, db); dnode_rele(dn, db); + db->db_dnode = NULL; } if (db->db_buf) @@ -1397,10 +1432,13 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, if (blkid == DB_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); - db->db.db_size = dn->dn_bonuslen; + db->db.db_size = DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DB_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ + arc_space_consume(sizeof (dmu_buf_impl_t)); return (db); } else { int blocksize = @@ -1427,6 +1465,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, list_insert_head(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); + arc_space_consume(sizeof (dmu_buf_impl_t)); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); @@ -1469,31 +1508,33 @@ 
dbuf_destroy(dmu_buf_impl_t *db) ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_blkid != DB_BONUS_BLKID) { - dnode_t *dn = db->db_dnode; - /* * If this dbuf is still on the dn_dbufs list, * remove it from that list. */ - if (list_link_active(&db->db_link)) { + if (db->db_dnode) { + dnode_t *dn = db->db_dnode; + mutex_enter(&dn->dn_dbufs_mtx); list_remove(&dn->dn_dbufs, db); mutex_exit(&dn->dn_dbufs_mtx); dnode_rele(dn, db); + db->db_dnode = NULL; } dbuf_hash_remove(db); } db->db_parent = NULL; - db->db_dnode = NULL; db->db_buf = NULL; + ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t)); } void @@ -1525,6 +1566,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { if (bp && !BP_IS_HOLE(bp)) { + arc_buf_t *pbuf; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; zb.zb_objset = dn->dn_objset->os_dsl_dataset ? @@ -1533,9 +1575,13 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) zb.zb_level = 0; zb.zb_blkid = blkid; - (void) arc_read(NULL, dn->dn_objset->os_spa, bp, - dmu_ot[dn->dn_type].ot_byteswap, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + if (db) + pbuf = db->db_buf; + else + pbuf = dn->dn_objset->os_phys_buf; + + (void) arc_read(NULL, dn->dn_objset->os_spa, + bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); } @@ -1652,16 +1698,13 @@ dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) return (err ? NULL : db); } -dmu_buf_impl_t * +void dbuf_create_bonus(dnode_t *dn) { - dmu_buf_impl_t *db = dn->dn_bonus; - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); - db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); - return (db); + dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); } #pragma weak dmu_buf_add_ref = dbuf_add_ref @@ -1716,7 +1759,10 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag) dbuf_evict(db); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); - mutex_exit(&db->db_mtx); + if (!DBUF_IS_CACHEABLE(db)) + dbuf_clear(db); + else + mutex_exit(&db->db_mtx); } } else { mutex_exit(&db->db_mtx); @@ -1852,15 +1898,8 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) db->db_data_pending = dr; - arc_release(db->db_buf, db); mutex_exit(&db->db_mtx); - - /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. - */ - dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4, - zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx); + dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); @@ -1878,7 +1917,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dnode_t *dn = db->db_dnode; objset_impl_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; - int checksum, compress; int blksz; ASSERT(dmu_tx_is_syncing(tx)); @@ -1909,23 +1947,21 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ if (db->db_blkid == DB_BONUS_BLKID) { dbuf_dirty_record_t **drp; - /* - * Use dn_phys->dn_bonuslen since db.db_size is the length - * of the bonus buffer in the open transaction rather than - * the syncing transaction. 
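The bonus sizing introduced in dbuf_create() above, db_size = DN_MAX_BONUSLEN - (dn_nblkptr - 1) * sizeof (blkptr_t), worked through with the classic on-disk layout (assumed here: a 512-byte dnode with a 64-byte core and 128-byte block pointers, so DN_MAX_BONUSLEN = 512 - 64 - 128 = 320): a dnode with dn_nblkptr = 1 exposes the full 320-byte bonus buffer, while dn_nblkptr = 3 leaves 320 - 2 * 128 = 64 bytes. Because dn_bonuslen may now be smaller than this allocated space, the code can only assert db_size >= dn_bonuslen rather than equality.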
- */ + ASSERT(*datap != NULL); ASSERT3U(db->db_level, ==, 0); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); - if (*datap != db->db.db_data) + if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN); + } db->db_data_pending = NULL; drp = &db->db_last_dirty; while (*drp != dr) drp = &(*drp)->dr_next; - ASSERT((*drp)->dr_next == NULL); - *drp = NULL; + ASSERT(dr->dr_next == NULL); + *drp = dr->dr_next; if (dr->dr_dbuf->db_level != 0) { list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); @@ -1939,6 +1975,14 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } /* + * This function may have dropped the db_mtx lock allowing a dmu_sync + * operation to sneak in. As a result, we need to ensure that we + * don't check the dr_override_state until we have returned from + * dbuf_check_blkptr. + */ + dbuf_check_blkptr(dn, db); + + /* * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ @@ -1948,8 +1992,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } - dbuf_check_blkptr(dn, db); - /* * If this dbuf has already been written out via an immediate write, * just complete the write by copying over the new block pointer and @@ -1963,6 +2005,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) zio_fake.io_bp = db->db_blkptr; zio_fake.io_bp_orig = *db->db_blkptr; zio_fake.io_txg = txg; + zio_fake.io_flags = 0; *db->db_blkptr = dr->dt.dl.dr_overridden_by; dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; @@ -1970,8 +2013,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dr->dr_zio = &zio_fake; mutex_exit(&db->db_mtx); + ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp), + BP_IDENTITY(&zio_fake.io_bp_orig)) || + BP_IS_HOLE(zio_fake.io_bp)); + if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg)) - dsl_dataset_block_kill(os->os_dsl_dataset, + (void) dsl_dataset_block_kill(os->os_dsl_dataset, &zio_fake.io_bp_orig, dn->dn_zio, tx); dbuf_write_ready(&zio_fake, db->db_buf, db); @@ -1997,14 +2044,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) *datap = arc_buf_alloc(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } - } else { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - arc_release(db->db_buf, db); } ASSERT(*datap != NULL); @@ -2012,22 +2051,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) mutex_exit(&db->db_mtx); - /* - * Allow dnode settings to override objset settings, - * except for metadata checksums.
- */ - if (dmu_ot[dn->dn_type].ot_metadata) { - checksum = os->os_md_checksum; - compress = zio_compress_select(dn->dn_compress, - os->os_md_compress); - } else { - checksum = zio_checksum_select(dn->dn_checksum, - os->os_checksum); - compress = zio_compress_select(dn->dn_compress, - os->os_compress); - } - - dbuf_write(dr, *datap, checksum, compress, tx); + dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) @@ -2063,8 +2087,7 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx) } static void -dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, - int compress, dmu_tx_t *tx) +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = db->db_dnode; @@ -2072,8 +2095,23 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_t zb; + writeprops_t wp = { 0 }; zio_t *zio; - int zio_flags; + + if (!BP_IS_HOLE(db->db_blkptr) && + (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. + */ + arc_release(data, db); + } else { + ASSERT(arc_released(data)); + /* XXX why do we need to thaw here? */ + arc_buf_thaw(data); + } if (parent != dn->dn_dbuf) { ASSERT(parent && parent->db_data_pending); @@ -2096,17 +2134,22 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, zb.zb_level = db->db_level; zb.zb_blkid = db->db_blkid; - zio_flags = ZIO_FLAG_MUSTSUCCEED; - if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0) - zio_flags |= ZIO_FLAG_METADATA; + wp.wp_type = dn->dn_type; + wp.wp_level = db->db_level; + wp.wp_copies = os->os_copies; + wp.wp_dncompress = dn->dn_compress; + wp.wp_oscompress = os->os_compress; + wp.wp_dnchecksum = dn->dn_checksum; + wp.wp_oschecksum = os->os_checksum; + if (BP_IS_OLDER(db->db_blkptr, txg)) - dsl_dataset_block_kill( + (void) dsl_dataset_block_kill( os->os_dsl_dataset, db->db_blkptr, zio, tx); - dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress, - dmu_get_replication_level(os, &zb, dn->dn_type), txg, - db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb); + dr->dr_zio = arc_write(zio, os->os_spa, &wp, + DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr, + data, dbuf_write_ready, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } /* ARGSUSED */ @@ -2116,27 +2159,33 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) dmu_buf_impl_t *db = vdb; dnode_t *dn = db->db_dnode; objset_impl_t *os = dn->dn_objset; + blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint64_t fill = 0; int old_size, new_size, i; + ASSERT(db->db_blkptr == bp); + dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", ""); old_size = bp_get_dasize(os->os_spa, bp_orig); - new_size = bp_get_dasize(os->os_spa, zio->io_bp); + new_size = bp_get_dasize(os->os_spa, bp); - dnode_diduse_space(dn, new_size-old_size); + dnode_diduse_space(dn, new_size - old_size); - if (BP_IS_HOLE(zio->io_bp)) { + if (BP_IS_HOLE(bp)) { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; if (bp_orig->blk_birth == tx->tx_txg) - dsl_dataset_block_kill(ds, bp_orig, NULL, tx); - ASSERT3U(db->db_blkptr->blk_fill, ==, 0); + (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); + 
ASSERT3U(bp->blk_fill, ==, 0); return; } + ASSERT(BP_GET_TYPE(bp) == dn->dn_type); + ASSERT(BP_GET_LEVEL(bp) == db->db_level); + mutex_enter(&db->db_mtx); if (db->db_level == 0) { @@ -2156,32 +2205,31 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) fill = 1; } } else { - blkptr_t *bp = db->db.db_data; + blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); - for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) { - if (BP_IS_HOLE(bp)) + for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { + if (BP_IS_HOLE(ibp)) continue; - ASSERT3U(BP_GET_LSIZE(bp), ==, + ASSERT3U(BP_GET_LSIZE(ibp), ==, db->db_level == 1 ? dn->dn_datablksz : (1<<dn->dn_phys->dn_indblkshift)); - fill += bp->blk_fill; + fill += ibp->blk_fill; } } - db->db_blkptr->blk_fill = fill; - BP_SET_TYPE(db->db_blkptr, dn->dn_type); - BP_SET_LEVEL(db->db_blkptr, db->db_level); + bp->blk_fill = fill; mutex_exit(&db->db_mtx); - /* We must do this after we've set the bp's type and level */ - if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) { + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + } else { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; if (bp_orig->blk_birth == tx->tx_txg) - dsl_dataset_block_kill(ds, bp_orig, NULL, tx); - dsl_dataset_block_born(ds, zio->io_bp, tx); + (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); + dsl_dataset_block_born(ds, bp, tx); } } @@ -2198,13 +2246,12 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) mutex_enter(&db->db_mtx); drp = &db->db_last_dirty; - while (*drp != db->db_data_pending) - drp = &(*drp)->dr_next; - ASSERT(!list_link_active(&(*drp)->dr_dirty_node)); - ASSERT((*drp)->dr_txg == txg); - ASSERT((*drp)->dr_next == NULL); - dr = *drp; - *drp = NULL; + while ((dr = *drp) != db->db_data_pending) + drp = &dr->dr_next; + ASSERT(!list_link_active(&dr->dr_dirty_node)); + ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_next == NULL); + *drp = dr->dr_next; if (db->db_level == 0) { ASSERT(db->db_blkid != DB_BONUS_BLKID); @@ -2230,8 +2277,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) >> (db->db_level * epbs), >=, db->db_blkid); arc_set_callback(db->db_buf, dbuf_do_evict, db); } - list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index d3be6b4ff22e..377efb9d105e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dmu_impl.h> #include <sys/dmu_tx.h> @@ -42,6 +40,7 @@ #include <sys/zfs_ioctl.h> #include <sys/zap.h> #include <sys/zio_checksum.h> +#include <sys/zfs_znode.h> const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint8_array, TRUE, "unallocated" }, @@ -62,7 +61,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { zap_byteswap, TRUE, "DSL props" }, { byteswap_uint64_array, TRUE, "DSL dataset" }, { zfs_znode_byteswap, TRUE, "ZFS znode" }, - { zfs_acl_byteswap, TRUE, "ZFS ACL" }, + { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, { byteswap_uint8_array, FALSE, "ZFS plain file" }, { zap_byteswap, TRUE, "ZFS directory" }, { zap_byteswap, TRUE, "ZFS master node" }, @@ -75,7 +74,14 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { zap_byteswap, TRUE, "persistent error log" }, { byteswap_uint8_array, TRUE, "SPA history" }, { byteswap_uint64_array, TRUE, "SPA history offsets" }, - { zap_byteswap, TRUE, "Pool properties" }, + { zap_byteswap, TRUE, "Pool properties" }, + { zap_byteswap, TRUE, "DSL permissions" }, + { zfs_acl_byteswap, TRUE, "ZFS ACL" }, + { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, + { byteswap_uint8_array, TRUE, "FUID table" }, + { byteswap_uint64_array, TRUE, "FUID table size" }, + { zap_byteswap, TRUE, "DSL dataset next clones"}, + { zap_byteswap, TRUE, "scrub work queue" }, }; int @@ -115,6 +121,19 @@ dmu_bonus_max(void) return (DN_MAX_BONUSLEN); } +int +dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx) +{ + dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + + if (dn->dn_bonus != (dmu_buf_impl_t *)db) + return (EINVAL); + if (newsize < 0 || newsize > db->db_size) + return (EINVAL); + dnode_setbonuslen(dn, newsize, tx); + return (0); +} + /* * returns ENOENT, EIO, or 0. 
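A hedged usage sketch for the new dmu_set_bonus() (error handling elided; the object number, the newsize value, and an assigned tx are assumed to exist):

	dmu_buf_t *db;

	VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	/* newsize must lie in [0, db->db_size], else EINVAL */
	VERIFY(0 == dmu_set_bonus(db, newsize, tx));
	dmu_buf_rele(db, FTAG);

Note that dmu_set_bonus() only adjusts the advertised length via dnode_setbonuslen(); the backing buffer stays at its allocated size.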
*/ @@ -122,27 +141,27 @@ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) { dnode_t *dn; - int err, count; dmu_buf_impl_t *db; + int error; - err = dnode_hold(os->os, object, FTAG, &dn); - if (err) - return (err); + error = dnode_hold(os->os, object, FTAG, &dn); + if (error) + return (error); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus == NULL) - dn->dn_bonus = dbuf_create_bonus(dn); + dbuf_create_bonus(dn); } db = dn->dn_bonus; rw_exit(&dn->dn_struct_rwlock); - mutex_enter(&db->db_mtx); - count = refcount_add(&db->db_holds, tag); - mutex_exit(&db->db_mtx); - if (count == 1) - dnode_add_ref(dn, db); + + /* as long as the bonus buf is held, the dnode will be held */ + if (refcount_add(&db->db_holds, tag) == 1) + VERIFY(dnode_add_ref(dn, db)); + dnode_rele(dn, FTAG); VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); @@ -161,11 +180,13 @@ static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { + dsl_pool_t *dp = NULL; dmu_buf_t **dbp; uint64_t blkid, nblks, i; uint32_t flags; int err; zio_t *zio; + hrtime_t start; ASSERT(length <= DMU_MAX_ACCESS); @@ -192,7 +213,11 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE); + if (dn->dn_objset->os_dsl_dataset) + dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; + if (dp && dsl_pool_sync_context(dp)) + start = gethrtime(); + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); @@ -214,6 +239,9 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, /* wait for async i/o */ err = zio_wait(zio); + /* track read overhead when we are in sync context */ + if (dp && dsl_pool_sync_context(dp)) + dp->dp_read_overhead += gethrtime() - start; if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); @@ -343,6 +371,155 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) dnode_rele(dn, FTAG); } +static int +get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit) +{ + uint64_t len = *offset - limit; + uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT; + uint64_t subchunk = + dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); + + ASSERT(limit <= *offset); + + if (len <= chunk_len) { + *offset = limit; + return (0); + } + + ASSERT(ISP2(subchunk)); + + while (*offset > limit) { + uint64_t initial_offset = P2ROUNDUP(*offset, subchunk); + uint64_t delta; + int err; + + /* skip over allocated data */ + err = dnode_next_offset(dn, + DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0); + if (err == ESRCH) + *offset = limit; + else if (err) + return (err); + + ASSERT3U(*offset, <=, initial_offset); + *offset = P2ALIGN(*offset, subchunk); + delta = initial_offset - *offset; + if (delta >= chunk_len) { + *offset += delta - chunk_len; + return (0); + } + chunk_len -= delta; + + /* skip over unallocated data */ + err = dnode_next_offset(dn, + DNODE_FIND_BACKWARDS, offset, 1, 1, 0); + if (err == ESRCH) + *offset = limit; + else if (err) + return (err); + + if (*offset < limit) + *offset = limit; + ASSERT3U(*offset, <, initial_offset); + } + return (0); +} + +static int +dmu_free_long_range_impl(objset_t *os, dnode_t 
*dn, uint64_t offset, + uint64_t length, boolean_t free_dnode) +{ + dmu_tx_t *tx; + uint64_t object_size, start, end, len; + boolean_t trunc = (length == DMU_OBJECT_END); + int align, err; + + align = 1 << dn->dn_datablkshift; + ASSERT(align > 0); + object_size = align == 1 ? dn->dn_datablksz : + (dn->dn_maxblkid + 1) << dn->dn_datablkshift; + + if (trunc || (end = offset + length) > object_size) + end = object_size; + if (end <= offset) + return (0); + length = end - offset; + + while (length) { + start = end; + err = get_next_chunk(dn, &start, offset); + if (err) + return (err); + len = trunc ? DMU_OBJECT_END : end - start; + + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, dn->dn_object, start, len); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + dnode_free_range(dn, start, trunc ? -1 : len, tx); + + if (start == 0 && free_dnode) { + ASSERT(trunc); + dnode_free(dn, tx); + } + + length -= end - start; + + dmu_tx_commit(tx); + end = start; + } + return (0); +} + +int +dmu_free_long_range(objset_t *os, uint64_t object, + uint64_t offset, uint64_t length) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err != 0) + return (err); + err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_free_object(objset_t *os, uint64_t object) +{ + dnode_t *dn; + dmu_tx_t *tx; + int err; + + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err != 0) + return (err); + if (dn->dn_nlevels == 1) { + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, object); + dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err == 0) { + dnode_free_range(dn, 0, DMU_OBJECT_END, tx); + dnode_free(dn, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + } else { + err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); + } + dnode_rele(dn, FTAG); + return (err); +} + int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) @@ -384,7 +561,6 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); - int err; /* * NB: we could do this block-at-a-time, but it's nice @@ -393,7 +569,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp); if (err) - return (err); + break; for (i = 0; i < numbufs; i++) { int tocpy; @@ -414,7 +590,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } dnode_rele(dn, FTAG); - return (0); + return (err); } void @@ -590,9 +766,9 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, for (copied = 0; copied < tocpy; copied += PAGESIZE) { ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); thiscpy = MIN(PAGESIZE, tocpy - copied); - va = ppmapin(pp, PROT_READ, (caddr_t)-1); + va = zfs_map_page(pp, S_READ); bcopy(va, (char *)db->db_data + bufoff, thiscpy); - ppmapout(va); + zfs_unmap_page(pp, va); pp = pp->p_next; bufoff += PAGESIZE; } @@ -620,6 +796,22 @@ typedef struct { /* ARGSUSED */ static void +dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) +{ + blkptr_t *bp = zio->io_bp; + + if (!BP_IS_HOLE(bp)) { + dmu_sync_arg_t *in = varg; + dbuf_dirty_record_t *dr = in->dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + ASSERT(BP_GET_TYPE(bp) == 
db->db_dnode->dn_type); + ASSERT(BP_GET_LEVEL(bp) == 0); + bp->blk_fill = 1; + } +} + +/* ARGSUSED */ +static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *in = varg; @@ -627,12 +819,6 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) dmu_buf_impl_t *db = dr->dr_dbuf; dmu_sync_cb_t *done = in->done; - if (!BP_IS_HOLE(zio->io_bp)) { - zio->io_bp->blk_fill = 1; - BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type); - BP_SET_LEVEL(zio->io_bp, 0); - } - mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ @@ -679,14 +865,13 @@ dmu_sync(zio_t *pio, dmu_buf_t *db_fake, dbuf_dirty_record_t *dr; dmu_sync_arg_t *in; zbookmark_t zb; + writeprops_t wp = { 0 }; zio_t *zio; - int zio_flags; int err; ASSERT(BP_IS_HOLE(bp)); ASSERT(txg != 0); - dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); @@ -791,15 +976,20 @@ dmu_sync(zio_t *pio, dmu_buf_t *db_fake, zb.zb_object = db->db.db_object; zb.zb_level = db->db_level; zb.zb_blkid = db->db_blkid; - zio_flags = ZIO_FLAG_MUSTSUCCEED; - if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0) - zio_flags |= ZIO_FLAG_METADATA; - zio = arc_write(pio, os->os_spa, - zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum), - zio_compress_select(db->db_dnode->dn_compress, os->os_compress), - dmu_get_replication_level(os, &zb, db->db_dnode->dn_type), - txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in, - ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb); + + wp.wp_type = db->db_dnode->dn_type; + wp.wp_level = db->db_level; + wp.wp_copies = os->os_copies; + wp.wp_dnchecksum = db->db_dnode->dn_checksum; + wp.wp_oschecksum = os->os_checksum; + wp.wp_dncompress = db->db_dnode->dn_compress; + wp.wp_oscompress = os->os_compress; + + ASSERT(BP_IS_HOLE(bp)); + + zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db), + txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); if (pio) { zio_nowait(zio); @@ -855,21 +1045,6 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, } int -dmu_get_replication_level(objset_impl_t *os, - zbookmark_t *zb, dmu_object_type_t ot) -{ - int ncopies = os->os_copies; - - /* If it's the mos, it should have max copies set. */ - ASSERT(zb->zb_objset != 0 || - ncopies == spa_max_replication(os->os_spa)); - - if (dmu_ot[ot].ot_metadata || zb->zb_level != 0) - ncopies++; - return (MIN(ncopies, spa_max_replication(os->os_spa))); -} - -int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; @@ -894,7 +1069,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) return (err); } - err = dnode_next_offset(dn, hole, off, 1, 1, 0); + err = dnode_next_offset(dn, (hole ? 
DNODE_FIND_HOLE : 0), off, 1, 1, 0); dnode_rele(dn, FTAG); return (err); @@ -1018,6 +1193,7 @@ dmu_init(void) dbuf_init(); dnode_init(); arc_init(); + l2arc_init(); } void @@ -1026,4 +1202,5 @@ dmu_fini(void) arc_fini(); dnode_fini(); dbuf_fini(); + l2arc_fini(); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c index 93168cc8901f..1b9247d66e65 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -54,7 +54,8 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, if (P2PHASE(object, L2_dnode_count) == 0) { uint64_t offset = restarted ? object << DNODE_SHIFT : 0; int error = dnode_next_offset(osi->os_meta_dnode, - B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0); + DNODE_FIND_HOLE, + &offset, 2, DNODES_PER_BLOCK >> 2, 0); restarted = B_TRUE; if (error == 0) object = offset >> DNODE_SHIFT; @@ -139,6 +140,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) return (err); ASSERT(dn->dn_type != DMU_OT_NONE); + dnode_free_range(dn, 0, DMU_OBJECT_END, tx); dnode_free(dn, tx); dnode_rele(dn, FTAG); @@ -152,7 +154,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) int error; error = dnode_next_offset(os->os->os_meta_dnode, - hole, &offset, 0, DNODES_PER_BLOCK, txg); + (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); *objectp = offset >> DNODE_SHIFT; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index 378fe8c15bc0..7981e06825c4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - +#include <sys/cred.h> #include <sys/zfs_context.h> #include <sys/dmu_objset.h> #include <sys/dsl_dir.h> @@ -32,6 +31,7 @@ #include <sys/dsl_prop.h> #include <sys/dsl_pool.h> #include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> #include <sys/dnode.h> #include <sys/dbuf.h> #include <sys/zvol.h> @@ -40,7 +40,7 @@ #include <sys/zap.h> #include <sys/zil.h> #include <sys/dmu_impl.h> - +#include <sys/zfs_ioctl.h> spa_t * dmu_objset_spa(objset_t *os) @@ -131,6 +131,34 @@ copies_changed_cb(void *arg, uint64_t newval) osi->os_copies = newval; } +static void +primary_cache_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || + newval == ZFS_CACHE_METADATA); + + osi->os_primary_cache = newval; +} + +static void +secondary_cache_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance and range checking should have been done by now. 
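Both cache callbacks accept exactly three values. An illustrative reading of the policy they store, as an editor's sketch with an assumed helper (in the patch itself the decision is encoded by the DBUF_IS_CACHEABLE and DBUF_IS_L2CACHEABLE macros):

static boolean_t
cache_level_allows(uint64_t prop, boolean_t is_metadata)
{
	/* ZFS_CACHE_ALL admits everything; _METADATA, only metadata */
	return (prop == ZFS_CACHE_ALL ||
	    (prop == ZFS_CACHE_METADATA && is_metadata));
}

ZFS_CACHE_NONE makes the expression false for every buffer, keeping the dataset's blocks out of the corresponding cache.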
+ */ + ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || + newval == ZFS_CACHE_METADATA); + + osi->os_secondary_cache = newval; +} + void dmu_objset_byteswap(void *buf, size_t size) { @@ -146,8 +174,10 @@ int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, objset_impl_t **osip) { - objset_impl_t *winner, *osi; - int i, err, checksum; + objset_impl_t *osi; + int i, err; + + ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP); osi->os.os = osi; @@ -161,18 +191,26 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zb.zb_object = 0; zb.zb_level = -1; zb.zb_blkid = 0; + if (DMU_OS_IS_L2CACHEABLE(osi)) + aflags |= ARC_L2CACHE; dprintf_bp(osi->os_rootbp, "reading %s", ""); - err = arc_read(NULL, spa, osi->os_rootbp, - dmu_ot[DMU_OT_OBJSET].ot_byteswap, + /* + * NB: when bprewrite scrub can change the bp, + * and this is called from dmu_objset_open_ds_os, the bp + * could change, and we'll need a lock. + */ + err = arc_read_nolock(NULL, spa, osi->os_rootbp, arc_getbuf_func, &osi->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err) { kmem_free(osi, sizeof (objset_impl_t)); + /* convert checksum errors into IO errors */ + if (err == ECKSUM) + err = EIO; return (err); } osi->os_phys = osi->os_phys_buf->b_data; - arc_release(osi->os_phys_buf, &osi->os_phys_buf); } else { osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t), &osi->os_phys_buf, ARC_BUFC_METADATA); @@ -183,18 +221,26 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* * Note: the changed_cb will be called once before the register * func returns, thus changing the checksum/compression from the - * default (fletcher2/off). Snapshots don't need to know, and - * registering would complicate clone promotion. + * default (fletcher2/off). Snapshots don't need to know about + * checksum/compression/copies. */ - if (ds && ds->ds_phys->ds_num_children == 0) { - err = dsl_prop_register(ds, "checksum", - checksum_changed_cb, osi); - if (err == 0) - err = dsl_prop_register(ds, "compression", - compression_changed_cb, osi); + if (ds) { + err = dsl_prop_register(ds, "primarycache", + primary_cache_changed_cb, osi); if (err == 0) - err = dsl_prop_register(ds, "copies", - copies_changed_cb, osi); + err = dsl_prop_register(ds, "secondarycache", + secondary_cache_changed_cb, osi); + if (!dsl_dataset_is_snapshot(ds)) { + if (err == 0) + err = dsl_prop_register(ds, "checksum", + checksum_changed_cb, osi); + if (err == 0) + err = dsl_prop_register(ds, "compression", + compression_changed_cb, osi); + if (err == 0) + err = dsl_prop_register(ds, "copies", + copies_changed_cb, osi); + } if (err) { VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); @@ -206,24 +252,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4; osi->os_compress = ZIO_COMPRESS_LZJB; osi->os_copies = spa_max_replication(spa); + osi->os_primary_cache = ZFS_CACHE_ALL; + osi->os_secondary_cache = ZFS_CACHE_ALL; } - osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header); - - /* - * Metadata always gets compressed and checksummed. - * If the data checksum is multi-bit correctable, and it's not - * a ZBT-style checksum, then it's suitable for metadata as well. - * Otherwise, the metadata checksum defaults to fletcher4. 
- */ - checksum = osi->os_checksum; - - if (zio_checksum_table[checksum].ci_correctable && - !zio_checksum_table[checksum].ci_zbt) - osi->os_md_checksum = checksum; - else - osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4; - osi->os_md_compress = ZIO_COMPRESS_LZJB; + osi->os_zil_header = osi->os_phys->os_zil_header; + osi->os_zil = zil_alloc(&osi->os, &osi->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t), @@ -238,70 +272,118 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); osi->os_meta_dnode = dnode_special_open(osi, &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); - if (ds != NULL) { - winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict); - if (winner) { - dmu_objset_evict(ds, osi); - osi = winner; - } + /* + * We should be the only thread trying to do this because we + * have ds_opening_lock + */ + if (ds) { + VERIFY(NULL == dsl_dataset_set_user_ptr(ds, osi, + dmu_objset_evict)); } *osip = osi; return (0); } -/* called from zpl */ -int -dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp) +static int +dmu_objset_open_ds_os(dsl_dataset_t *ds, objset_t *os, dmu_objset_type_t type) { - dsl_dataset_t *ds; - int err; - objset_t *os; objset_impl_t *osi; - os = kmem_alloc(sizeof (objset_t), KM_SLEEP); - err = dsl_dataset_open(name, mode, os, &ds); - if (err) { - kmem_free(os, sizeof (objset_t)); - return (err); - } - + mutex_enter(&ds->ds_opening_lock); osi = dsl_dataset_get_user_ptr(ds); if (osi == NULL) { + int err; + err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, &ds->ds_phys->ds_bp, &osi); if (err) { - dsl_dataset_close(ds, mode, os); - kmem_free(os, sizeof (objset_t)); + mutex_exit(&ds->ds_opening_lock); return (err); } } + mutex_exit(&ds->ds_opening_lock); os->os = osi; - os->os_mode = mode; + os->os_mode = DS_MODE_NOHOLD; - if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) { - dmu_objset_close(os); + if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) return (EINVAL); - } - *osp = os; return (0); } +int +dmu_objset_open_ds(dsl_dataset_t *ds, dmu_objset_type_t type, objset_t **osp) +{ + objset_t *os; + int err; + + os = kmem_alloc(sizeof (objset_t), KM_SLEEP); + err = dmu_objset_open_ds_os(ds, os, type); + if (err) + kmem_free(os, sizeof (objset_t)); + else + *osp = os; + return (err); +} + +/* called from zpl */ +int +dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, + objset_t **osp) +{ + objset_t *os; + dsl_dataset_t *ds; + int err; + + ASSERT(DS_MODE_TYPE(mode) == DS_MODE_USER || + DS_MODE_TYPE(mode) == DS_MODE_OWNER); + + os = kmem_alloc(sizeof (objset_t), KM_SLEEP); + if (DS_MODE_TYPE(mode) == DS_MODE_USER) + err = dsl_dataset_hold(name, os, &ds); + else + err = dsl_dataset_own(name, mode, os, &ds); + if (err) { + kmem_free(os, sizeof (objset_t)); + return (err); + } + + err = dmu_objset_open_ds_os(ds, os, type); + if (err) { + if (DS_MODE_TYPE(mode) == DS_MODE_USER) + dsl_dataset_rele(ds, os); + else + dsl_dataset_disown(ds, os); + kmem_free(os, sizeof (objset_t)); + } else { + os->os_mode = mode; + *osp = os; + } + return (err); +} + void dmu_objset_close(objset_t *os) { - dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os); + ASSERT(DS_MODE_TYPE(os->os_mode) == DS_MODE_USER || + DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER || + 
DS_MODE_TYPE(os->os_mode) == DS_MODE_NOHOLD); + + if (DS_MODE_TYPE(os->os_mode) == DS_MODE_USER) + dsl_dataset_rele(os->os->os_dsl_dataset, os); + else if (DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER) + dsl_dataset_disown(os->os->os_dsl_dataset, os); kmem_free(os, sizeof (objset_t)); } int -dmu_objset_evict_dbufs(objset_t *os, int try) +dmu_objset_evict_dbufs(objset_t *os) { objset_impl_t *osi = os->os; dnode_t *dn; @@ -319,34 +401,25 @@ dmu_objset_evict_dbufs(objset_t *os, int try) * skip. */ for (dn = list_head(&osi->os_dnodes); - dn && refcount_is_zero(&dn->dn_holds); + dn && !dnode_add_ref(dn, FTAG); dn = list_next(&osi->os_dnodes, dn)) continue; - if (dn) - dnode_add_ref(dn, FTAG); while (dn) { dnode_t *next_dn = dn; do { next_dn = list_next(&osi->os_dnodes, next_dn); - } while (next_dn && refcount_is_zero(&next_dn->dn_holds)); - if (next_dn) - dnode_add_ref(next_dn, FTAG); + } while (next_dn && !dnode_add_ref(next_dn, FTAG)); mutex_exit(&osi->os_lock); - if (dnode_evict_dbufs(dn, try)) { - dnode_rele(dn, FTAG); - if (next_dn) - dnode_rele(next_dn, FTAG); - return (1); - } + dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); mutex_enter(&osi->os_lock); dn = next_dn; } mutex_exit(&osi->os_lock); - return (0); + return (list_head(&osi->os_dnodes) != osi->os_meta_dnode); } void @@ -361,13 +434,19 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg) ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL); } - if (ds && ds->ds_phys->ds_num_children == 0) { - VERIFY(0 == dsl_prop_unregister(ds, "checksum", - checksum_changed_cb, osi)); - VERIFY(0 == dsl_prop_unregister(ds, "compression", - compression_changed_cb, osi)); - VERIFY(0 == dsl_prop_unregister(ds, "copies", - copies_changed_cb, osi)); + if (ds) { + if (!dsl_dataset_is_snapshot(ds)) { + VERIFY(0 == dsl_prop_unregister(ds, "checksum", + checksum_changed_cb, osi)); + VERIFY(0 == dsl_prop_unregister(ds, "compression", + compression_changed_cb, osi)); + VERIFY(0 == dsl_prop_unregister(ds, "copies", + copies_changed_cb, osi)); + } + VERIFY(0 == dsl_prop_unregister(ds, "primarycache", + primary_cache_changed_cb, osi)); + VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", + secondary_cache_changed_cb, osi)); } /* @@ -375,7 +454,7 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg) * nothing can be added to the list at this point. 
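dmu_objset_close() above dispatches on how the objset was opened; the underlying dataset references pair up as follows (a sketch; the DS_MODE_OWNER flag usage is inferred from the calls visible in this patch):

	dsl_dataset_t *ds;

	/* lightweight reference */
	VERIFY(0 == dsl_dataset_hold(name, FTAG, &ds));
	dsl_dataset_rele(ds, FTAG);

	/* exclusive use */
	VERIFY(0 == dsl_dataset_own(name, DS_MODE_OWNER, FTAG, &ds));
	dsl_dataset_disown(ds, FTAG);

Releasing with the wrong routine (rele after own, or disown after hold) would be a bug; the close path's checks on os_mode guard against exactly that.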
*/ os.os = osi; - (void) dmu_objset_evict_dbufs(&os, 0); + (void) dmu_objset_evict_dbufs(&os); ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); @@ -387,6 +466,7 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg) VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); mutex_destroy(&osi->os_lock); mutex_destroy(&osi->os_obj_lock); + mutex_destroy(&osi->os_user_ptr_lock); kmem_free(osi, sizeof (objset_impl_t)); } @@ -399,7 +479,11 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); + if (ds) + mutex_enter(&ds->ds_opening_lock); VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi)); + if (ds) + mutex_exit(&ds->ds_opening_lock); mdn = osi->os_meta_dnode; dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, @@ -443,14 +527,15 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, } struct oscarg { - void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx); + void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *userarg; dsl_dataset_t *clone_parent; const char *lastname; dmu_objset_type_t type; + uint64_t flags; }; -/* ARGSUSED */ +/*ARGSUSED*/ static int dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -478,11 +563,12 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) if (oa->clone_parent->ds_phys->ds_num_children == 0) return (EINVAL); } + return (0); } static void -dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct oscarg *oa = arg2; @@ -493,10 +579,9 @@ dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); dsobj = dsl_dataset_create_sync(dd, oa->lastname, - oa->clone_parent, tx); + oa->clone_parent, oa->flags, cr, tx); - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL, - DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds)); + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, FTAG, &ds)); bp = dsl_dataset_get_blkptr(ds); if (BP_IS_HOLE(bp)) { objset_impl_t *osi; @@ -506,15 +591,19 @@ dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) ds, bp, oa->type, tx); if (oa->userfunc) - oa->userfunc(&osi->os, oa->userarg, tx); + oa->userfunc(&osi->os, oa->userarg, cr, tx); } - dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG); + + spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa, + tx, cr, "dataset = %llu", dsobj); + + dsl_dataset_rele(ds, FTAG); } int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, - void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg) + objset_t *clone_parent, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { dsl_dir_t *pdd; const char *tail; @@ -536,6 +625,8 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, oa.userarg = arg; oa.lastname = tail; oa.type = type; + oa.flags = flags; + if (clone_parent != NULL) { /* * You can't clone to a different type. @@ -564,33 +655,47 @@ dmu_objset_destroy(const char *name) * It would be nicer to do this in dsl_dataset_destroy_sync(), * but the replay log objset is modified in open context. 
 	 */
-	error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
+	error = dmu_objset_open(name, DMU_OST_ANY,
+	    DS_MODE_OWNER|DS_MODE_READONLY|DS_MODE_INCONSISTENT, &os);
 	if (error == 0) {
+		dsl_dataset_t *ds = os->os->os_dsl_dataset;
 		zil_destroy(dmu_objset_zil(os), B_FALSE);
-		dmu_objset_close(os);
+
+		error = dsl_dataset_destroy(ds, os);
+		/*
+		 * dsl_dataset_destroy() closes the ds.
+		 */
+		kmem_free(os, sizeof (objset_t));
 	}
-	return (dsl_dataset_destroy(name));
+	return (error);
 }
 
+/*
+ * This will close the objset.
+ */
 int
-dmu_objset_rollback(const char *name)
+dmu_objset_rollback(objset_t *os)
 {
 	int err;
-	objset_t *os;
+	dsl_dataset_t *ds;
 
-	err = dmu_objset_open(name, DMU_OST_ANY,
-	    DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
-	if (err == 0) {
-		err = zil_suspend(dmu_objset_zil(os));
-		if (err == 0)
-			zil_resume(dmu_objset_zil(os));
-		if (err == 0) {
-			/* XXX uncache everything? */
-			err = dsl_dataset_rollback(os->os->os_dsl_dataset);
-		}
+	ds = os->os->os_dsl_dataset;
+
+	if (!dsl_dataset_tryown(ds, TRUE, os)) {
 		dmu_objset_close(os);
+		return (EBUSY);
 	}
+
+	err = dsl_dataset_rollback(ds, os->os->os_phys->os_type);
+
+	/*
+	 * NB: we close the objset manually because the rollback
+	 * has implicitly called dmu_objset_evict(), thus freeing
+	 * the objset_impl_t.
+	 */
+	dsl_dataset_disown(ds, os);
+	kmem_free(os, sizeof (objset_t));
 	return (err);
 }
 
@@ -598,6 +703,13 @@ struct snaparg {
 	dsl_sync_task_group_t *dstg;
 	char *snapname;
 	char failed[MAXPATHLEN];
+	boolean_t checkperms;
+	list_t objsets;
+};
+
+struct osnode {
+	list_node_t node;
+	objset_t *os;
 };
 
 static int
@@ -605,20 +717,25 @@ dmu_objset_snapshot_one(char *name, void *arg)
 {
 	struct snaparg *sn = arg;
 	objset_t *os;
-	dmu_objset_stats_t stat;
 	int err;
 
 	(void) strcpy(sn->failed, name);
 
-	err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_STANDARD, &os);
+	/*
+	 * Check permissions only when requested. This only applies when
+	 * doing a recursive snapshot. The permission checks for the starting
+	 * dataset have already been performed in zfs_secpolicy_snapshot().
+	 */
+	if (sn->checkperms == B_TRUE &&
+	    (err = zfs_secpolicy_snapshot_perms(name, CRED())))
+		return (err);
+
+	err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_USER, &os);
 	if (err != 0)
 		return (err);
 
-	/*
-	 * If the objset is in an inconsistent state, return busy.
- */ - dmu_objset_fast_stat(os, &stat); - if (stat.dds_inconsistent) { + /* If the objset is in an inconsistent state, return busy */ + if (os->os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { dmu_objset_close(os); return (EBUSY); } @@ -630,8 +747,13 @@ dmu_objset_snapshot_one(char *name, void *arg) */ err = zil_suspend(dmu_objset_zil(os)); if (err == 0) { + struct osnode *osn; dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check, - dsl_dataset_snapshot_sync, os, sn->snapname, 3); + dsl_dataset_snapshot_sync, os->os->os_dsl_dataset, + sn->snapname, 3); + osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP); + osn->os = os; + list_insert_tail(&sn->objsets, osn); } else { dmu_objset_close(os); } @@ -643,31 +765,28 @@ int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) { dsl_sync_task_t *dst; + struct osnode *osn; struct snaparg sn = { 0 }; - char *cp; spa_t *spa; int err; (void) strcpy(sn.failed, fsname); - cp = strchr(fsname, '/'); - if (cp) { - *cp = '\0'; - err = spa_open(fsname, &spa, FTAG); - *cp = '/'; - } else { - err = spa_open(fsname, &spa, FTAG); - } + err = spa_open(fsname, &spa, FTAG); if (err) return (err); sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); sn.snapname = snapname; + list_create(&sn.objsets, sizeof (struct osnode), + offsetof(struct osnode, node)); if (recursive) { + sn.checkperms = B_TRUE; err = dmu_objset_find(fsname, dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN); } else { + sn.checkperms = B_FALSE; err = dmu_objset_snapshot_one(fsname, &sn); } @@ -678,13 +797,20 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) for (dst = list_head(&sn.dstg->dstg_tasks); dst; dst = list_next(&sn.dstg->dstg_tasks, dst)) { - objset_t *os = dst->dst_arg1; + dsl_dataset_t *ds = dst->dst_arg1; if (dst->dst_err) - dmu_objset_name(os, sn.failed); - zil_resume(dmu_objset_zil(os)); - dmu_objset_close(os); + dsl_dataset_name(ds, sn.failed); } + out: + while (osn = list_head(&sn.objsets)) { + list_remove(&sn.objsets, osn); + zil_resume(dmu_objset_zil(osn->os)); + dmu_objset_close(osn->os); + kmem_free(osn, sizeof (struct osnode)); + } + list_destroy(&sn.objsets); + if (err) (void) strcpy(fsname, sn.failed); dsl_sync_task_group_destroy(sn.dstg); @@ -717,39 +843,30 @@ dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) static void ready(zio_t *zio, arc_buf_t *abuf, void *arg) { + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; objset_impl_t *os = arg; - blkptr_t *bp = os->os_rootbp; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; - int i; + + ASSERT(bp == os->os_rootbp); + ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET); + ASSERT(BP_GET_LEVEL(bp) == 0); /* * Update rootbp fill count. 
*/ bp->blk_fill = 1; /* count the meta-dnode */ - for (i = 0; i < dnp->dn_nblkptr; i++) + for (int i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += dnp->dn_blkptr[i].blk_fill; -} -/* ARGSUSED */ -static void -killer(zio_t *zio, arc_buf_t *abuf, void *arg) -{ - objset_impl_t *os = arg; - - ASSERT3U(zio->io_error, ==, 0); - - BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET); - BP_SET_LEVEL(zio->io_bp, 0); - - if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), - BP_IDENTITY(&zio->io_bp_orig))) { + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + } else { if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) - dsl_dataset_block_kill(os->os_dsl_dataset, - &zio->io_bp_orig, NULL, os->os_synctx); - dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp, - os->os_synctx); + (void) dsl_dataset_block_kill(os->os_dsl_dataset, + &zio->io_bp_orig, zio, os->os_synctx); + dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx); } - arc_release(os->os_phys_buf, &os->os_phys_buf); } /* called from dsl */ @@ -758,10 +875,10 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; zbookmark_t zb; + writeprops_t wp = { 0 }; zio_t *zio; list_t *list; dbuf_dirty_record_t *dr; - int zio_flags; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); @@ -783,19 +900,24 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) */ zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; zb.zb_object = 0; - zb.zb_level = -1; + zb.zb_level = -1; /* for block ordering; it's level 0 on disk */ zb.zb_blkid = 0; - zio_flags = ZIO_FLAG_MUSTSUCCEED; - if (dmu_ot[DMU_OT_OBJSET].ot_metadata || zb.zb_level != 0) - zio_flags |= ZIO_FLAG_METADATA; - if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) - dsl_dataset_block_kill(os->os_dsl_dataset, + + wp.wp_type = DMU_OT_OBJSET; + wp.wp_level = 0; /* on-disk BP level; see above */ + wp.wp_copies = os->os_copies; + wp.wp_oschecksum = os->os_checksum; + wp.wp_oscompress = os->os_compress; + + if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) { + (void) dsl_dataset_block_kill(os->os_dsl_dataset, os->os_rootbp, pio, tx); - zio = arc_write(pio, os->os_spa, os->os_md_checksum, - os->os_md_compress, - dmu_get_replication_level(os, &zb, DMU_OT_OBJSET), - tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os, - ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb); + } + + arc_release(os->os_phys_buf, &os->os_phys_buf); + zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os), + tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* * Sync meta-dnode - the parent IO for the sync is the root block @@ -819,6 +941,7 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) * Free intent log blocks up to this tx. 
*/ zil_sync(os->os_zil, tx); + os->os_phys->os_zil_header = os->os_zil_header; zio_nowait(zio); } @@ -867,8 +990,23 @@ dmu_objset_is_snapshot(objset_t *os) } int +dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, + boolean_t *conflict) +{ + dsl_dataset_t *ds = os->os->os_dsl_dataset; + uint64_t ignored; + + if (ds->ds_phys->ds_snapnames_zapobj == 0) + return (ENOENT); + + return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, + real, maxlen, conflict)); +} + +int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - uint64_t *idp, uint64_t *offp) + uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) { dsl_dataset_t *ds = os->os->os_dsl_dataset; zap_cursor_t cursor; @@ -894,6 +1032,8 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name, (void) strcpy(name, attr.za_name); if (idp) *idp = attr.za_first_integer; + if (case_conflict) + *case_conflict = attr.za_normalization_conflict; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); @@ -938,48 +1078,80 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, return (0); } +struct findarg { + int (*func)(char *, void *); + void *arg; +}; + +/* ARGSUSED */ +static int +findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + struct findarg *fa = arg; + return (fa->func((char *)dsname, fa->arg)); +} + /* * Find all objsets under name, and for each, call 'func(child_name, arg)'. + * Perhaps change all callers to use dmu_objset_find_spa()? */ int dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) { + struct findarg fa; + fa.func = func; + fa.arg = arg; + return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags)); +} + +/* + * Find all objsets under name, call func on each + */ +int +dmu_objset_find_spa(spa_t *spa, const char *name, + int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags) +{ dsl_dir_t *dd; - objset_t *os; - uint64_t snapobj; + dsl_pool_t *dp; + dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; char *child; - int do_self, err; + uint64_t thisobj; + int err; - err = dsl_dir_open(name, FTAG, &dd, NULL); + if (name == NULL) + name = spa_name(spa); + err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL); if (err) return (err); - /* NB: the $MOS dir doesn't have a head dataset */ - do_self = (dd->dd_phys->dd_head_dataset_obj != 0); + /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ + if (dd->dd_myname[0] == '$') { + dsl_dir_close(dd, FTAG); + return (0); + } + + thisobj = dd->dd_phys->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + dp = dd->dd_pool; /* * Iterate over all children. */ if (flags & DS_FIND_CHILDREN) { - for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, + for (zap_cursor_init(&zc, dp->dp_meta_objset, dd->dd_phys->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT(attr->za_integer_length == sizeof (uint64_t)); ASSERT(attr->za_num_integers == 1); - /* - * No separating '/' because parent's name ends in /. 
- */ child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - /* XXX could probably just use name here */ - dsl_dir_name(dd, child); + (void) strcpy(child, name); (void) strcat(child, "/"); (void) strcat(child, attr->za_name); - err = dmu_objset_find(child, func, arg, flags); + err = dmu_objset_find_spa(spa, child, func, arg, flags); kmem_free(child, MAXPATHLEN); if (err) break; @@ -996,30 +1168,36 @@ dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) /* * Iterate over all snapshots. */ - if ((flags & DS_FIND_SNAPSHOTS) && - dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) { - - snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj; - dmu_objset_close(os); - - for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj); - zap_cursor_retrieve(&zc, attr) == 0; - (void) zap_cursor_advance(&zc)) { - ASSERT(attr->za_integer_length == sizeof (uint64_t)); - ASSERT(attr->za_num_integers == 1); + if (flags & DS_FIND_SNAPSHOTS) { + if (!dsl_pool_sync_context(dp)) + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + if (!dsl_pool_sync_context(dp)) + rw_exit(&dp->dp_config_rwlock); - child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - /* XXX could probably just use name here */ - dsl_dir_name(dd, child); - (void) strcat(child, "@"); - (void) strcat(child, attr->za_name); - err = func(child, arg); - kmem_free(child, MAXPATHLEN); - if (err) - break; + if (err == 0) { + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + dsl_dataset_rele(ds, FTAG); + + for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT(attr->za_integer_length == + sizeof (uint64_t)); + ASSERT(attr->za_num_integers == 1); + + child = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) strcpy(child, name); + (void) strcat(child, "@"); + (void) strcat(child, attr->za_name); + err = func(spa, attr->za_first_integer, + child, arg); + kmem_free(child, MAXPATHLEN); + if (err) + break; + } + zap_cursor_fini(&zc); } - zap_cursor_fini(&zc); } dsl_dir_close(dd, FTAG); @@ -1031,7 +1209,20 @@ dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) /* * Apply to self if appropriate. */ - if (do_self) - err = func(name, arg); + err = func(spa, thisobj, name, arg); return (err); } + +void +dmu_objset_set_user(objset_t *os, void *user_ptr) +{ + ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); + os->os->os_user_ptr = user_ptr; +} + +void * +dmu_objset_get_user(objset_t *os) +{ + ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); + return (os->os->os_user_ptr); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index 3e55dc301620..1294581a7133 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -41,10 +41,13 @@ #include <sys/zap.h> #include <sys/zio_checksum.h> +static char *dmu_recv_tag = "dmu_recv_tag"; + struct backuparg { dmu_replay_record_t *drr; kthread_t *td; struct file *fp; + offset_t *off; objset_t *os; zio_cksum_t zc; int err; @@ -77,6 +80,7 @@ dump_bytes(struct backuparg *ba, void *buf, int len) fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); ba->err = EOPNOTSUPP; #endif + *ba->off += len; return (ba->err); } @@ -179,7 +183,7 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) void *data = bc->bc_data; int err = 0; - if (SIGPENDING(curthread)) + if (issig(JUSTLOOKING) && issig(FORREAL)) return (EINTR); ASSERT(data || bp == NULL); @@ -215,10 +219,9 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) zb.zb_object = object; zb.zb_level = level; zb.zb_blkid = blkid; - (void) arc_read(NULL, spa, bp, - dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED, - &aflags, &zb); + (void) arc_read_nolock(NULL, spa, bp, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); if (abuf) { err = dump_data(ba, type, object, blkid * blksz, @@ -236,13 +239,15 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) } int -dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp) +dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, + struct file *fp, offset_t *off) { dsl_dataset_t *ds = tosnap->os->os_dsl_dataset; dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL; dmu_replay_record_t *drr; struct backuparg ba; int err; + uint64_t fromtxg = 0; /* tosnap must be a snapshot */ if (ds->ds_phys->ds_next_snap_obj == 0) @@ -250,26 +255,55 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp) /* fromsnap must be an earlier snapshot from the same fs as tosnap */ if (fromds && (ds->ds_dir != fromds->ds_dir || - fromds->ds_phys->ds_creation_txg >= - ds->ds_phys->ds_creation_txg)) + fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) return (EXDEV); + if (fromorigin) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (fromsnap) + return (EINVAL); + + if (dsl_dir_is_clone(ds->ds_dir)) { + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); + rw_exit(&dp->dp_config_rwlock); + if (err) + return (err); + } else { + fromorigin = B_FALSE; + } + } + + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION; + drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION; drr->drr_u.drr_begin.drr_creation_time = ds->ds_phys->ds_creation_time; drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type; + if (fromorigin) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; + if (fromds) drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); + if (fromds) + fromtxg = fromds->ds_phys->ds_creation_txg; + if (fromorigin) + dsl_dataset_rele(fromds, FTAG); + ba.drr = drr; ba.td = curthread; ba.fp = fp; ba.os = tosnap; + ba.off = off; ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { @@ -277,8 +311,7 @@ dmu_sendbackup(objset_t *tosnap, 
objset_t *fromsnap, struct file *fp) return (ba.err); } - err = traverse_dsl_dataset(ds, - fromds ? fromds->ds_phys->ds_creation_txg : 0, + err = traverse_dsl_dataset(ds, fromtxg, ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK, backup_cb, &ba); @@ -303,164 +336,384 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp) return (0); } -struct restorearg { - int err; - int byteswap; - kthread_t *td; - struct file *fp; - char *buf; - uint64_t voff; - int buflen; /* number of valid bytes in buf */ - int bufoff; /* next offset to read */ - int bufsize; /* amount of memory allocated for buf */ - zio_cksum_t zc; +struct recvbeginsyncarg { + const char *tofs; + const char *tosnap; + dsl_dataset_t *origin; + uint64_t fromguid; + dmu_objset_type_t type; + void *tag; + boolean_t force; + uint64_t dsflags; + char clonelastname[MAXNAMELEN]; + dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ }; +static dsl_dataset_t * +recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type, + cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds; + + /* This should always work, since we just created it */ + /* XXX - create should return an owned ds */ + VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, + DS_MODE_INCONSISTENT, dmu_recv_tag, &ds)); + + if (type != DMU_OST_NONE) { + (void) dmu_objset_create_impl(dp->dp_spa, + ds, &ds->ds_phys->ds_bp, type, tx); + } + + spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC, + dp->dp_spa, tx, cr, "dataset = %lld", dsobj); + + return (ds); +} + /* ARGSUSED */ static int -replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) +recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - struct drr_begin *drrb = arg2; - const char *snapname; - int err; + dsl_dir_t *dd = arg1; + struct recvbeginsyncarg *rbsa = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t val; + int err; - /* must already be a snapshot of this fs */ - if (ds->ds_phys->ds_prev_snap_obj == 0) - return (ENODEV); + err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, + strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); - /* most recent snapshot must match fromguid */ - if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) - return (ENODEV); - /* must not have any changes since most recent snapshot */ - if (ds->ds_phys->ds_bp.blk_birth > - ds->ds_prev->ds_phys->ds_creation_txg) - return (ETXTBSY); + if (err != ENOENT) + return (err ? err : EEXIST); - /* new snapshot name must not exist */ - snapname = strrchr(drrb->drr_toname, '@'); - if (snapname == NULL) - return (EEXIST); + if (rbsa->origin) { + /* make sure it's a snap in the same pool */ + if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) + return (EXDEV); + if (rbsa->origin->ds_phys->ds_num_children == 0) + return (EINVAL); + if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + } - snapname++; - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) + return (0); +} + +static void +recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct recvbeginsyncarg *rbsa = arg2; + uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; + uint64_t dsobj; + + dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, + rbsa->origin, flags, cr, tx); + + rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, + rbsa->origin ? 
DMU_OST_NONE : rbsa->type, cr, tx); +} + +static int +recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + int err; + + /* must be a head ds */ + if (ds->ds_phys->ds_next_snap_obj != 0) + return (EINVAL); + + /* must not be a clone ds */ + if (dsl_dir_is_clone(ds->ds_dir)) + return (EINVAL); + + err = dsl_dataset_destroy_check(ds, rbsa->tag, tx); + if (err) return (err); + if (rbsa->origin) { + /* make sure it's a snap in the same pool */ + if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool) + return (EXDEV); + if (rbsa->origin->ds_phys->ds_num_children == 0) + return (EINVAL); + if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + } + return (0); } -/* ARGSUSED */ static void -replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx) +recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + struct recvbeginsyncarg *rbsa = arg2; + dsl_dir_t *dd = ds->ds_dir; + uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; + uint64_t dsobj; + + /* + * NB: caller must provide an extra hold on the dsl_dir_t, so it + * won't go away when dsl_dataset_destroy_sync() closes the + * dataset. + */ + dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx); + + dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx); + + rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, + rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx); } /* ARGSUSED */ static int -replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx) +recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - struct drr_begin *drrb = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - char *cp; - uint64_t val; + dsl_dataset_t *ds = arg1; + struct recvbeginsyncarg *rbsa = arg2; int err; + uint64_t val; - cp = strchr(drrb->drr_toname, '@'); - *cp = '\0'; - err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, - strrchr(drrb->drr_toname, '/') + 1, - sizeof (uint64_t), 1, &val); - *cp = '@'; + /* must not have any changes since most recent snapshot */ + if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) + return (ETXTBSY); + + /* must already be a snapshot of this fs */ + if (ds->ds_phys->ds_prev_snap_obj == 0) + return (ENODEV); + + /* most recent snapshot must match fromguid */ + if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + /* temporary clone name must not exist */ + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_dir->dd_phys->dd_child_dir_zapobj, + rbsa->clonelastname, 8, 1, &val); + if (err == 0) + return (EEXIST); if (err != ENOENT) - return (err ? 
err : EEXIST); + return (err); + /* new snapshot name must not exist */ + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); return (0); } +/* ARGSUSED */ static void -replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx) +recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - struct drr_begin *drrb = arg2; - char *cp; - dsl_dataset_t *ds; + dsl_dataset_t *ohds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + dsl_pool_t *dp = ohds->ds_dir->dd_pool; + dsl_dataset_t *ods, *cds; + uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; uint64_t dsobj; - cp = strchr(drrb->drr_toname, '@'); - *cp = '\0'; - dsobj = dsl_dataset_create_sync(dd, strrchr(drrb->drr_toname, '/') + 1, - NULL, tx); - *cp = '@'; + /* create the temporary clone */ + VERIFY(0 == dsl_dataset_hold_obj(dp, ohds->ds_phys->ds_prev_snap_obj, + FTAG, &ods)); + dsobj = dsl_dataset_create_sync(ohds->ds_dir, + rbsa->clonelastname, ods, flags, cr, tx); + dsl_dataset_rele(ods, FTAG); + + /* open the temporary clone */ + VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, + DS_MODE_INCONSISTENT, dmu_recv_tag, &cds)); - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL, - DS_MODE_EXCLUSIVE, FTAG, &ds)); + /* copy the refquota from the target fs to the clone */ + if (ohds->ds_quota > 0) + dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx); - (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds), - ds, &ds->ds_phys->ds_bp, drrb->drr_type, tx); + rbsa->ds = cds; + + spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, + dp->dp_spa, tx, cr, "dataset = %lld", dsobj); +} + +/* ARGSUSED */ +static void +recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld", + ds->ds_object); } -static int -replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx) +/* + * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() + * succeeds; otherwise we will leak the holds on the datasets. + */ +int +dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, + boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc) { - objset_t *os = arg1; - struct drr_begin *drrb = arg2; - char *snapname; + int err = 0; + boolean_t byteswap; + struct recvbeginsyncarg rbsa; + uint64_t version; + int flags; + dsl_dataset_t *ds; - /* XXX verify that drr_toname is in dd */ + if (drrb->drr_magic == DMU_BACKUP_MAGIC) + byteswap = FALSE; + else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + byteswap = TRUE; + else + return (EINVAL); - snapname = strchr(drrb->drr_toname, '@'); - if (snapname == NULL) + rbsa.tofs = tofs; + rbsa.tosnap = tosnap; + rbsa.origin = origin ? 
origin->os->os_dsl_dataset : NULL; + rbsa.fromguid = drrb->drr_fromguid; + rbsa.type = drrb->drr_type; + rbsa.tag = FTAG; + rbsa.dsflags = 0; + version = drrb->drr_version; + flags = drrb->drr_flags; + + if (byteswap) { + rbsa.type = BSWAP_32(rbsa.type); + rbsa.fromguid = BSWAP_64(rbsa.fromguid); + version = BSWAP_64(version); + flags = BSWAP_32(flags); + } + + if (version != DMU_BACKUP_STREAM_VERSION || + rbsa.type >= DMU_OST_NUMTYPES || + ((flags & DRR_FLAG_CLONE) && origin == NULL)) return (EINVAL); - snapname++; - return (dsl_dataset_snapshot_check(os, snapname, tx)); -} + if (flags & DRR_FLAG_CI_DATA) + rbsa.dsflags = DS_FLAG_CI_DATASET; -static void -replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - objset_t *os = arg1; - struct drr_begin *drrb = arg2; - char *snapname; - dsl_dataset_t *ds, *hds; + bzero(drc, sizeof (dmu_recv_cookie_t)); + drc->drc_drrb = drrb; + drc->drc_tosnap = tosnap; + drc->drc_force = force; - snapname = strchr(drrb->drr_toname, '@') + 1; + /* + * Process the begin in syncing context. + */ + if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) { + /* offline incremental receive */ + err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds); + if (err) + return (err); - dsl_dataset_snapshot_sync(os, snapname, tx); + /* + * Only do the rollback if the most recent snapshot + * matches the incremental source + */ + if (force) { + if (ds->ds_prev == NULL || + ds->ds_prev->ds_phys->ds_guid != + rbsa.fromguid) { + dsl_dataset_disown(ds, dmu_recv_tag); + return (ENODEV); + } + (void) dsl_dataset_rollback(ds, DMU_OST_NONE); + } + rbsa.force = B_FALSE; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_incremental_check, + recv_offline_incremental_sync, ds, &rbsa, 1); + if (err) { + dsl_dataset_disown(ds, dmu_recv_tag); + return (err); + } + drc->drc_logical_ds = drc->drc_real_ds = ds; + } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) { + /* online incremental receive */ - /* set snapshot's creation time and guid */ - hds = os->os->os_dsl_dataset; - VERIFY(0 == dsl_dataset_open_obj(hds->ds_dir->dd_pool, - hds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT, - FTAG, &ds)); + /* tmp clone name is: tofs/%tosnap" */ + (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), + "%%%s", tosnap); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_creation_time = drrb->drr_creation_time; - ds->ds_phys->ds_guid = drrb->drr_toguid; - ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + /* open the dataset we are logically receiving into */ + err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); + if (err) + return (err); - dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG); + rbsa.force = force; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_incremental_check, + recv_online_incremental_sync, ds, &rbsa, 5); + if (err) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (err); + } + drc->drc_logical_ds = ds; + drc->drc_real_ds = rbsa.ds; + } else { + /* create new fs -- full backup or clone */ + dsl_dir_t *dd = NULL; + const char *tail; + + err = dsl_dir_open(tofs, FTAG, &dd, &tail); + if (err) + return (err); + if (tail == NULL) { + if (!force) { + dsl_dir_close(dd, FTAG); + return (EEXIST); + } + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + err = dsl_dataset_own_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, + DS_MODE_INCONSISTENT, FTAG, &ds); + rw_exit(&dd->dd_pool->dp_config_rwlock); + if (err) { + dsl_dir_close(dd, FTAG); + return (err); + } + + dsl_dataset_make_exclusive(ds, FTAG); + err = 
dsl_sync_task_do(dd->dd_pool, + recv_full_existing_check, + recv_full_existing_sync, ds, &rbsa, 5); + dsl_dataset_disown(ds, FTAG); + } else { + err = dsl_sync_task_do(dd->dd_pool, recv_full_check, + recv_full_sync, dd, &rbsa, 5); + } + dsl_dir_close(dd, FTAG); + if (err) + return (err); + drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; + drc->drc_newfs = B_TRUE; + } - dmu_buf_will_dirty(hds->ds_dbuf, tx); - hds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + return (0); } +struct restorearg { + int err; + int byteswap; + kthread_t *td; + struct file *fp; + char *buf; + uint64_t voff; + int bufsize; /* amount of memory allocated for buf */ + zio_cksum_t cksum; +}; + static int restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, int *resid) { @@ -491,37 +744,31 @@ static void * restore_read(struct restorearg *ra, int len) { void *rv; + int done = 0; /* some things will require 8-byte alignment, so everything must */ ASSERT3U(len % 8, ==, 0); - while (ra->buflen - ra->bufoff < len) { + while (done < len) { int resid; - int leftover = ra->buflen - ra->bufoff; - (void) memmove(ra->buf, ra->buf + ra->bufoff, leftover); + ra->err = restore_bytes(ra, (caddr_t)ra->buf + done, + len - done, ra->voff, &resid); - ra->err = restore_bytes(ra, (caddr_t)ra->buf + leftover, - ra->bufsize - leftover, ra->voff, &resid); - - ra->voff += ra->bufsize - leftover - resid; - ra->buflen = ra->bufsize - resid; - ra->bufoff = 0; - if (resid == ra->bufsize - leftover) + if (resid == len - done) ra->err = EINVAL; + ra->voff += len - done - resid; + done = len - resid; if (ra->err) return (NULL); - /* Could compute checksum here? */ } - ASSERT3U(ra->bufoff % 8, ==, 0); - ASSERT3U(ra->buflen - ra->bufoff, >=, len); - rv = ra->buf + ra->bufoff; - ra->bufoff += len; + ASSERT3U(done, ==, len); + rv = ra->buf; if (ra->byteswap) - fletcher_4_incremental_byteswap(rv, len, &ra->zc); + fletcher_4_incremental_byteswap(rv, len, &ra->cksum); else - fletcher_4_incremental_native(rv, len, &ra->zc); + fletcher_4_incremental_native(rv, len, &ra->cksum); return (rv); } @@ -531,12 +778,14 @@ backup_byteswap(dmu_replay_record_t *drr) #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) drr->drr_type = BSWAP_32(drr->drr_type); + drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); DO64(drr_begin.drr_version); DO64(drr_begin.drr_creation_time); DO32(drr_begin.drr_type); + DO32(drr_begin.drr_flags); DO64(drr_begin.drr_toguid); DO64(drr_begin.drr_fromguid); break; @@ -643,13 +892,13 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, ==, drro->drr_bonuslen); - data = restore_read(ra, P2ROUNDUP(db->db_size, 8)); + ASSERT3U(db->db_size, >=, drro->drr_bonuslen); + data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); if (data == NULL) { dmu_tx_commit(tx); return (ra->err); } - bcopy(data, db->db_data, db->db_size); + bcopy(data, db->db_data, drro->drr_bonuslen); if (ra->byteswap) { dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, drro->drr_bonuslen); @@ -673,23 +922,14 @@ restore_freeobjects(struct restorearg *ra, objset_t *os, for (obj = drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs; (void) dmu_object_next(os, &obj, FALSE, 0)) { - dmu_tx_t *tx; int err; if (dmu_object_info(os, obj, NULL) != 0) continue; - tx = dmu_tx_create(os); - 
dmu_tx_hold_bonus(tx, obj); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); + err = dmu_free_object(os, obj); + if (err) return (err); - } - err = dmu_object_free(os, obj, tx); - dmu_tx_commit(tx); - if (err && err != ENOENT) - return (EINVAL); } return (0); } @@ -735,7 +975,6 @@ static int restore_free(struct restorearg *ra, objset_t *os, struct drr_free *drrf) { - dmu_tx_t *tx; int err; if (drrf->drr_length != -1ULL && @@ -745,66 +984,65 @@ restore_free(struct restorearg *ra, objset_t *os, if (dmu_object_info(os, drrf->drr_object, NULL) != 0) return (EINVAL); - tx = dmu_tx_create(os); - - dmu_tx_hold_free(tx, drrf->drr_object, + err = dmu_free_long_range(os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - err = dmu_free_range(os, drrf->drr_object, - drrf->drr_offset, drrf->drr_length, tx); - dmu_tx_commit(tx); return (err); } +void +dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc) +{ + if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) { + /* + * online incremental or new fs: destroy the fs (which + * may be a clone) that we created + */ + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + if (drc->drc_real_ds != drc->drc_logical_ds) + dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); + } else { + /* + * offline incremental: rollback to most recent snapshot. + */ + (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE); + dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag); + } +} + +/* + * NB: callers *must* call dmu_recv_end() if this succeeds. + */ int -dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, - boolean_t force, struct file *fp, uint64_t voffset) +dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp) { kthread_t *td = curthread; - struct restorearg ra; + struct restorearg ra = { 0 }; dmu_replay_record_t *drr; - char *cp; - objset_t *os = NULL; - zio_cksum_t pzc; - - bzero(&ra, sizeof (ra)); - ra.td = td; - ra.fp = fp; - ra.voff = voffset; - ra.bufsize = 1<<20; - ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); + objset_t *os; + zio_cksum_t pcksum; - if (drrb->drr_magic == DMU_BACKUP_MAGIC) { - ra.byteswap = FALSE; - } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { + if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) ra.byteswap = TRUE; - } else { - ra.err = EINVAL; - goto out; - } - /* - * NB: this assumes that struct drr_begin will be the largest in - * dmu_replay_record_t's drr_u, and thus we don't need to pad it - * with zeros to make it the same length as we wrote out. 
- */ - ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN; - ((dmu_replay_record_t *)ra.buf)->drr_pad = 0; - ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb; - if (ra.byteswap) { - fletcher_4_incremental_byteswap(ra.buf, - sizeof (dmu_replay_record_t), &ra.zc); - } else { - fletcher_4_incremental_native(ra.buf, - sizeof (dmu_replay_record_t), &ra.zc); + { + /* compute checksum of drr_begin record */ + dmu_replay_record_t *drr; + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); + + drr->drr_type = DRR_BEGIN; + drr->drr_u.drr_begin = *drc->drc_drrb; + if (ra.byteswap) { + fletcher_4_incremental_byteswap(drr, + sizeof (dmu_replay_record_t), &ra.cksum); + } else { + fletcher_4_incremental_native(drr, + sizeof (dmu_replay_record_t), &ra.cksum); + } + kmem_free(drr, sizeof (dmu_replay_record_t)); } - (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */ if (ra.byteswap) { + struct drr_begin *drrb = drc->drc_drrb; drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_version = BSWAP_64(drrb->drr_version); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); @@ -813,94 +1051,30 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } - ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - - if (drrb->drr_version != DMU_BACKUP_VERSION || - drrb->drr_type >= DMU_OST_NUMTYPES || - strchr(drrb->drr_toname, '@') == NULL) { - ra.err = EINVAL; - goto out; - } - - /* - * Process the begin in syncing context. - */ - if (drrb->drr_fromguid) { - /* incremental backup */ - dsl_dataset_t *ds = NULL; - - cp = strchr(tosnap, '@'); - *cp = '\0'; - ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds); - *cp = '@'; - if (ra.err) - goto out; - - /* - * Only do the rollback if the most recent snapshot - * matches the incremental source - */ - if (force) { - if (ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_guid != - drrb->drr_fromguid) { - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - kmem_free(ra.buf, ra.bufsize); - return (ENODEV); - } - (void) dsl_dataset_rollback(ds); - } - ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool, - replay_incremental_check, replay_incremental_sync, - ds, drrb, 1); - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - } else { - /* full backup */ - dsl_dir_t *dd = NULL; - const char *tail; - - /* can't restore full backup into topmost fs, for now */ - if (strrchr(drrb->drr_toname, '/') == NULL) { - ra.err = EINVAL; - goto out; - } - - cp = strchr(tosnap, '@'); - *cp = '\0'; - ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail); - *cp = '@'; - if (ra.err) - goto out; - if (tail == NULL) { - ra.err = EEXIST; - goto out; - } + ra.td = td; + ra.fp = fp; + ra.voff = *voffp; + ra.bufsize = 1<<20; + ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); - ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check, - replay_full_sync, dd, drrb, 5); - dsl_dir_close(dd, FTAG); - } - if (ra.err) - goto out; + /* these were verified in dmu_recv_begin */ + ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION); + ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); /* * Open the objset we are modifying. */ + VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0); - cp = strchr(tosnap, '@'); - *cp = '\0'; - ra.err = dmu_objset_open(tosnap, DMU_OST_ANY, - DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os); - *cp = '@'; - ASSERT3U(ra.err, ==, 0); + ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); /* * Read records and process them. 
*/ - pzc = ra.zc; + pcksum = ra.cksum; while (ra.err == 0 && NULL != (drr = restore_read(&ra, sizeof (*drr)))) { - if (SIGPENDING(td)) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { ra.err = EINTR; goto out; } @@ -947,63 +1121,116 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, * value, because the stored checksum is of * everything before the DRR_END record. */ - if (drre.drr_checksum.zc_word[0] != 0 && - !ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pzc)) { + if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) ra.err = ECKSUM; - goto out; - } - - ra.err = dsl_sync_task_do(dmu_objset_ds(os)-> - ds_dir->dd_pool, replay_end_check, replay_end_sync, - os, drrb, 3); goto out; } default: ra.err = EINVAL; goto out; } - pzc = ra.zc; + pcksum = ra.cksum; } + ASSERT(ra.err != 0); out: - if (os) - dmu_objset_close(os); + dmu_objset_close(os); - /* - * Make sure we don't rollback/destroy unless we actually - * processed the begin properly. 'os' will only be set if this - * is the case. - */ - if (ra.err && os && tosnap && strchr(tosnap, '@')) { + if (ra.err != 0) { /* * rollback or destroy what we created, so we don't * leave it in the restoring state. */ - dsl_dataset_t *ds; - int err; - - cp = strchr(tosnap, '@'); - *cp = '\0'; - err = dsl_dataset_open(tosnap, - DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, - FTAG, &ds); - if (err == 0) { - txg_wait_synced(ds->ds_dir->dd_pool, 0); - if (drrb->drr_fromguid) { - /* incremental: rollback to most recent snap */ - (void) dsl_dataset_rollback(ds); - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - } else { - /* full: destroy whole fs */ - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - (void) dsl_dataset_destroy(tosnap); - } - } - *cp = '@'; + txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); + dmu_recv_abort_cleanup(drc); } kmem_free(ra.buf, ra.bufsize); - if (sizep) - *sizep = ra.voff; + *voffp = ra.voff; return (ra.err); } + +struct recvendsyncarg { + char *tosnap; + uint64_t creation_time; + uint64_t toguid; +}; + +static int +recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvendsyncarg *resa = arg2; + + return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); +} + +static void +recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvendsyncarg *resa = arg2; + + dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx); + + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; + ds->ds_prev->ds_phys->ds_guid = resa->toguid; + ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; +} + +int +dmu_recv_end(dmu_recv_cookie_t *drc) +{ + struct recvendsyncarg resa; + dsl_dataset_t *ds = drc->drc_logical_ds; + int err; + + /* + * XXX hack; seems the ds is still dirty and + * dsl_pool_zil_clean() expects it to have a ds_user_ptr + * (and zil), but clone_swap() can close it. 
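+	 * The txg_wait_synced() below presumably lets the pending
+	 * zil_clean finish before the swap can close the objset.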
+ */ + txg_wait_synced(ds->ds_dir->dd_pool, 0); + + if (ds != drc->drc_real_ds) { + /* we are doing an online recv */ + if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { + err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, + drc->drc_force); + if (err) + dsl_dataset_disown(ds, dmu_recv_tag); + } else { + err = EBUSY; + dsl_dataset_rele(ds, dmu_recv_tag); + } + /* dsl_dataset_destroy() will disown the ds */ + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + if (err) + return (err); + } + + resa.creation_time = drc->drc_drrb->drr_creation_time; + resa.toguid = drc->drc_drrb->drr_toguid; + resa.tosnap = drc->drc_tosnap; + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_end_check, recv_end_sync, ds, &resa, 3); + if (err) { + if (drc->drc_newfs) { + ASSERT(ds == drc->drc_real_ds); + (void) dsl_dataset_destroy(ds, dmu_recv_tag); + return (err); + } else { + (void) dsl_dataset_rollback(ds, DMU_OST_NONE); + } + } + + /* release the hold from dmu_recv_begin */ + dsl_dataset_disown(ds, dmu_recv_tag); + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index 3d2bc3e47678..43bf82e7a682 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +35,7 @@ #include <sys/spa.h> #include <sys/zio.h> #include <sys/dmu_impl.h> +#include <sys/zvol.h> #define BP_SPAN_SHIFT(level, width) ((level) * (width)) @@ -261,6 +262,16 @@ advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance) return (EAGAIN); } +/* + * The traverse_callback function will call the function specified in th_func. + * In the event of an error the callee, specified by th_func, must return + * one of the following errors: + * + * EINTR - Indicates that the callee wants the traversal to + * abort immediately. + * ERESTART - The callee has acknowledged the error and would + * like to continue. + */ static int traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc) { @@ -603,7 +614,10 @@ traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) th->th_locked = 0; } - rc = traverse_read(th, bc, &dsp->ds_bp, dn); + if (BP_IS_HOLE(&dsp->ds_bp)) + rc = ERESTART; + else + rc = traverse_read(th, bc, &dsp->ds_bp, dn); if (rc != 0) { if (rc == ERESTART) @@ -722,6 +736,24 @@ traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance, } int +traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg) +{ + spa_t *spa = dmu_objset_spa(os); + traverse_handle_t *th; + int err; + + th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL); + + traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ); + + while ((err = traverse_more(th)) == EAGAIN) + continue; + + traverse_fini(th); + return (err); +} + +int traverse_more(traverse_handle_t *th) { zseg_t *zseg = list_head(&th->th_seglist); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index 13fd8d4d9dce..000c3ce64eb5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dmu_impl.h> #include <sys/dbuf.h> @@ -157,7 +155,7 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) rw_exit(&dn->dn_struct_rwlock); if (db == NULL) return (EIO); - err = dbuf_read(db, zio, DB_RF_CANFAIL); + err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); dbuf_rele(db, FTAG); return (err); } @@ -294,6 +292,8 @@ dmu_tx_count_dnode(dmu_tx_hold_t *txh) txh->txh_space_tooverwrite += space; } else { txh->txh_space_towrite += space; + if (dn && dn->dn_dbuf->db_blkptr) + txh->txh_space_tounref += space; } } @@ -318,39 +318,25 @@ dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) static void dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { - uint64_t blkid, nblks; - uint64_t space = 0; + uint64_t blkid, nblks, lastblk; + uint64_t space = 0, unref = 0, skipped = 0; dnode_t *dn = txh->txh_dnode; dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; spa_t *spa = txh->txh_tx->tx_pool->dp_spa; - int dirty; + int epbs; - /* - * We don't need to use any locking to check for dirtyness - * because it's OK if we get stale data -- the dnode may become - * dirty immediately after our check anyway. This is just a - * means to avoid the expensive count when we aren't sure we - * need it. We need to be able to deal with a dirty dnode. - */ - dirty = list_link_active(&dn->dn_dirty_link[0]) | - list_link_active(&dn->dn_dirty_link[1]) | - list_link_active(&dn->dn_dirty_link[2]) | - list_link_active(&dn->dn_dirty_link[3]); - if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0) + if (dn->dn_nlevels == 0) return; /* - * the struct_rwlock protects us against dn_phys->dn_nlevels + * The struct_rwlock protects us against dn_nlevels * changing, in case (against all odds) we manage to dirty & * sync out the changes after we check for being dirty. - * also, dbuf_hold_impl() wants us to have the struct_rwlock. - * - * It's fine to use dn_datablkshift rather than the dn_phys - * equivalent because if it is changing, maxblkid==0 and we will - * bail. + * Also, dbuf_hold_level() wants us to have the struct_rwlock. 
*/ rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_phys->dn_maxblkid == 0) { + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + if (dn->dn_maxblkid == 0) { if (off == 0 && len >= dn->dn_datablksz) { blkid = 0; nblks = 1; @@ -360,78 +346,120 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) } } else { blkid = off >> dn->dn_datablkshift; - nblks = (off + len) >> dn->dn_datablkshift; + nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; - if (blkid >= dn->dn_phys->dn_maxblkid) { + if (blkid >= dn->dn_maxblkid) { rw_exit(&dn->dn_struct_rwlock); return; } - if (blkid + nblks > dn->dn_phys->dn_maxblkid) - nblks = dn->dn_phys->dn_maxblkid - blkid; + if (blkid + nblks > dn->dn_maxblkid) + nblks = dn->dn_maxblkid - blkid; - /* don't bother after 128,000 blocks */ - nblks = MIN(nblks, 128*1024); } - - if (dn->dn_phys->dn_nlevels == 1) { + if (dn->dn_nlevels == 1) { int i; for (i = 0; i < nblks; i++) { blkptr_t *bp = dn->dn_phys->dn_blkptr; - ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr); + ASSERT3U(blkid + i, <, dn->dn_nblkptr); bp += blkid + i; if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { dprintf_bp(bp, "can free old%s", ""); space += bp_get_dasize(spa, bp); } + unref += BP_GET_ASIZE(bp); } nblks = 0; } + /* + * Add in memory requirements of higher-level indirects. + * This assumes a worst-possible scenario for dn_nlevels. + */ + { + uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs); + int level = (dn->dn_nlevels > 1) ? 2 : 1; + + while (level++ < DN_MAX_LEVELS) { + txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift; + blkcnt = 1 + (blkcnt >> epbs); + } + ASSERT(blkcnt <= dn->dn_nblkptr); + } + + lastblk = blkid + nblks - 1; while (nblks) { dmu_buf_impl_t *dbuf; - int err, epbs, blkoff, tochk; - - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - blkoff = P2PHASE(blkid, 1<<epbs); - tochk = MIN((1<<epbs) - blkoff, nblks); - - err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf); - if (err == 0) { - int i; - blkptr_t *bp; - - err = dbuf_read(dbuf, NULL, - DB_RF_HAVESTRUCT | DB_RF_CANFAIL); - if (err != 0) { - txh->txh_tx->tx_err = err; - dbuf_rele(dbuf, FTAG); - break; - } + uint64_t ibyte, new_blkid; + int epb = 1 << epbs; + int err, i, blkoff, tochk; + blkptr_t *bp; + + ibyte = blkid << dn->dn_datablkshift; + err = dnode_next_offset(dn, + DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); + new_blkid = ibyte >> dn->dn_datablkshift; + if (err == ESRCH) { + skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; + break; + } + if (err) { + txh->txh_tx->tx_err = err; + break; + } + if (new_blkid > lastblk) { + skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; + break; + } - bp = dbuf->db.db_data; - bp += blkoff; + if (new_blkid > blkid) { + ASSERT((new_blkid >> epbs) > (blkid >> epbs)); + skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1; + nblks -= new_blkid - blkid; + blkid = new_blkid; + } + blkoff = P2PHASE(blkid, epb); + tochk = MIN(epb - blkoff, nblks); - for (i = 0; i < tochk; i++) { - if (dsl_dataset_block_freeable(ds, - bp[i].blk_birth)) { - dprintf_bp(&bp[i], - "can free old%s", ""); - space += bp_get_dasize(spa, &bp[i]); - } - } + dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); + + txh->txh_memory_tohold += dbuf->db.db_size; + if (txh->txh_memory_tohold > DMU_MAX_ACCESS) { + txh->txh_tx->tx_err = E2BIG; dbuf_rele(dbuf, FTAG); + break; } - if (err && err != ENOENT) { + err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); + if (err != 0) { txh->txh_tx->tx_err = err; + dbuf_rele(dbuf, FTAG); break; } + bp = dbuf->db.db_data; + bp += 
blkoff; + + for (i = 0; i < tochk; i++) { + if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) { + dprintf_bp(&bp[i], "can free old%s", ""); + space += bp_get_dasize(spa, &bp[i]); + } + unref += BP_GET_ASIZE(bp); + } + dbuf_rele(dbuf, FTAG); + blkid += tochk; nblks -= tochk; } rw_exit(&dn->dn_struct_rwlock); + /* account for new level 1 indirect blocks that might show up */ + if (skipped > 0) { + txh->txh_fudge += skipped << dn->dn_indblkshift; + skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); + txh->txh_memory_tohold += skipped << dn->dn_indblkshift; + } txh->txh_space_tofree += space; + txh->txh_space_tounref += unref; } void @@ -466,7 +494,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) /* * For i/o error checking, read the first and last level-0 * blocks, and all the level-1 blocks. The above count_write's - * will take care of the level-0 blocks. + * have already taken care of the level-0 blocks. */ if (dn->dn_nlevels > 1) { shift = dn->dn_datablkshift + dn->dn_indblkshift - @@ -478,7 +506,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) NULL, NULL, ZIO_FLAG_CANFAIL); for (i = start; i <= end; i++) { uint64_t ibyte = i << shift; - err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0); + err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; if (err == ESRCH) break; @@ -550,10 +578,13 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) * the size will change between now and the dbuf dirty call. */ if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_phys->dn_blkptr[0].blk_birth)) + dn->dn_phys->dn_blkptr[0].blk_birth)) { txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; - else + } else { txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + txh->txh_space_tounref += + BP_GET_ASIZE(dn->dn_phys->dn_blkptr); + } return; } @@ -575,7 +606,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks */ dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz, - (3 + add ? 3 : 0) << dn->dn_datablkshift); + (3 + (add ? 3 : 0)) << dn->dn_datablkshift); /* * If the modified blocks are scattered to the four winds, @@ -698,12 +729,13 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) match_offset = TRUE; break; case THT_FREE: - if (blkid == beginblk && - (txh->txh_arg1 != 0 || - dn->dn_maxblkid == 0)) - match_offset = TRUE; - if (blkid == endblk && - txh->txh_arg2 != DMU_OBJECT_END) + /* + * We will dirty all the level 1 blocks in + * the free range and perhaps the first and + * last level 0 block. + */ + if (blkid >= beginblk && (blkid <= endblk || + txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; case THT_BONUS: @@ -733,12 +765,32 @@ static int dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) { dmu_tx_hold_t *txh; - uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite; + spa_t *spa = tx->tx_pool->dp_spa; + uint64_t memory, asize, fsize, usize; + uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; ASSERT3U(tx->tx_txg, ==, 0); + if (tx->tx_err) return (tx->tx_err); + if (spa_suspended(spa)) { + /* + * If the user has indicated a blocking failure mode + * then return ERESTART which will block in dmu_tx_wait(). + * Otherwise, return EIO so that an error can get + * propagated back to the VOP calls. + * + * Note that we always honor the txg_how flag regardless + * of the failuremode setting. 
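+		 * (So a TXG_WAIT caller still gets ERESTART, and blocks,
+		 * even when failmode=continue.)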
+ */ + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && + txg_how != TXG_WAIT) + return (EIO); + + return (ERESTART); + } + tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); tx->tx_needassign_txh = NULL; @@ -748,7 +800,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) * dmu_tx_unassign() logic. */ - towrite = tofree = tooverwrite = 0; + towrite = tofree = tooverwrite = tounref = tohold = fudge = 0; for (txh = list_head(&tx->tx_holds); txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; @@ -768,6 +820,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) towrite += txh->txh_space_towrite; tofree += txh->txh_space_tofree; tooverwrite += txh->txh_space_tooverwrite; + tounref += txh->txh_space_tounref; + tohold += txh->txh_memory_tohold; + fudge += txh->txh_fudge; } /* @@ -788,22 +843,31 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) tooverwrite = tofree = 0; } - /* - * Convert logical size to worst-case allocated size. - */ + /* needed allocation: worst-case estimate of write space */ + asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite); + /* freed space estimate: worst-case overwrite + free estimate */ fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; - lsize = towrite + tooverwrite; - asize = spa_get_asize(tx->tx_pool->dp_spa, lsize); + /* convert unrefd space to worst-case estimate */ + usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); + /* calculate memory footprint estimate */ + memory = towrite + tooverwrite + tohold; #ifdef ZFS_DEBUG - tx->tx_space_towrite = asize; + /* + * Add in 'tohold' to account for our dirty holds on this memory + * XXX - the "fudge" factor is to account for skipped blocks that + * we missed because dnode_next_offset() misses in-core-only blocks. + */ + tx->tx_space_towrite = asize + + spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge); tx->tx_space_tofree = tofree; tx->tx_space_tooverwrite = tooverwrite; + tx->tx_space_tounref = tounref; #endif if (tx->tx_dir && asize != 0) { - int err = dsl_dir_tempreserve_space(tx->tx_dir, - lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx); + int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, + asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); if (err) return (err); } @@ -885,10 +949,18 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) void dmu_tx_wait(dmu_tx_t *tx) { + spa_t *spa = tx->tx_pool->dp_spa; + ASSERT(tx->tx_txg == 0); - ASSERT(tx->tx_lasttried_txg != 0); - if (tx->tx_needassign_txh) { + /* + * It's possible that the pool has become active after this thread + * has tried to obtain a tx. If that's the case then its + * tx_lasttried_txg would not have been assigned. 
+ */ + if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { + txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1); + } else if (tx->tx_needassign_txh) { dnode_t *dn = tx->tx_needassign_txh->txh_dnode; mutex_enter(&dn->dn_mtx); @@ -948,6 +1020,7 @@ dmu_tx_commit(dmu_tx_t *tx) if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); + list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", tx->tx_space_towrite, refcount_count(&tx->tx_space_written), @@ -975,6 +1048,7 @@ dmu_tx_abort(dmu_tx_t *tx) if (dn != NULL) dnode_rele(dn, tx); } + list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG refcount_destroy_many(&tx->tx_space_written, refcount_count(&tx->tx_space_written)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c index b25cc898c37d..8dba38176527 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c @@ -38,10 +38,6 @@ */ int zfs_prefetch_disable = 0; -SYSCTL_DECL(_vfs_zfs); -TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable); -SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN, - &zfs_prefetch_disable, 0, "Disable prefetch"); /* max # of streams per zfetch */ uint32_t zfetch_max_streams = 8; @@ -52,6 +48,25 @@ uint32_t zfetch_block_cap = 256; /* number of bytes in an array_read at which we stop prefetching (1Mb) */ uint64_t zfetch_array_rd_sz = 1024 * 1024; +SYSCTL_DECL(_vfs_zfs); +TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable); +SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN, + &zfs_prefetch_disable, 0, "Disable prefetch"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH"); +TUNABLE_INT("vfs.zfs.zfetch.max_streams", &zfetch_max_streams); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RDTUN, + &zfetch_max_streams, 0, "Max # of streams per zfetch"); +TUNABLE_INT("vfs.zfs.zfetch.min_sec_reap", &zfetch_min_sec_reap); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RDTUN, + &zfetch_min_sec_reap, 0, "Min time before stream reclaim"); +TUNABLE_INT("vfs.zfs.zfetch.block_cap", &zfetch_block_cap); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, block_cap, CTLFLAG_RDTUN, + &zfetch_block_cap, 0, "Max number of blocks to fetch at a time"); +TUNABLE_QUAD("vfs.zfs.zfetch.array_rd_sz", &zfetch_array_rd_sz); +SYSCTL_QUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RDTUN, + &zfetch_array_rd_sz, 0, + "Number of bytes in an array_read at which we stop prefetching"); + /* forward decls for static routines */ static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index ca502857b1fa..5adbc3c0ff5d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/dbuf.h> #include <sys/dnode.h> @@ -242,6 +240,23 @@ free_range_compar(const void *node1, const void *node2) else return (0); } +void +dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + + dnode_setdirty(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + ASSERT3U(newsize, <=, DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t)); + dn->dn_bonuslen = newsize; + if (newsize == 0) + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; + else + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; + rw_exit(&dn->dn_struct_rwlock); +} + static void dnode_setdblksz(dnode_t *dn, int size) { @@ -285,6 +300,7 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, list_insert_head(&os->os_dnodes, dn); mutex_exit(&os->os_lock); + arc_space_consume(sizeof (dnode_t)); return (dn); } @@ -319,6 +335,7 @@ dnode_destroy(dnode_t *dn) dn->dn_bonus = NULL; } kmem_cache_free(dnode_cache, dn); + arc_space_return(sizeof (dnode_t)); } void @@ -362,6 +379,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, for (i = 0; i < TXG_SIZE; i++) { ASSERT3U(dn->dn_next_nlevels[i], ==, 0); ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); + ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); ASSERT3U(dn->dn_next_blksz[i], ==, 0); ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); @@ -389,6 +407,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dnode_setdirty(dn, tx); dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; } @@ -396,7 +415,7 @@ void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - int i; + int i, old_nblkptr; dmu_buf_impl_t *db = NULL; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); @@ -413,7 +432,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ASSERT(!list_link_active(&dn->dn_dirty_link[i])); /* clean up any unreferenced dbufs */ - (void) dnode_evict_dbufs(dn, 0); + dnode_evict_dbufs(dn); ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); /* @@ -436,38 +455,18 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, } dnode_setdblksz(dn, blocksize); dnode_setdirty(dn, tx); + dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; rw_exit(&dn->dn_struct_rwlock); - if (db) { + if (db) dbuf_rele(db, FTAG); - db = NULL; - } /* change type */ dn->dn_type = ot; - if (dn->dn_bonuslen != bonuslen) { - /* change bonus size */ - if (bonuslen == 0) - bonuslen = 1; /* XXX */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (dn->dn_bonus == NULL) - dn->dn_bonus = dbuf_create_bonus(dn); - db = dn->dn_bonus; - rw_exit(&dn->dn_struct_rwlock); - if (refcount_add(&db->db_holds, FTAG) == 1) - dnode_add_ref(dn, db); - VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); - mutex_enter(&db->db_mtx); - ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); - ASSERT(db->db.db_data != NULL); - db->db.db_size = bonuslen; - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); - } - /* change bonus size and type */ mutex_enter(&dn->dn_mtx); + old_nblkptr = dn->dn_nblkptr; dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) 
>> SPA_BLKPTRSHIFT); @@ -475,12 +474,15 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dn->dn_compress = ZIO_COMPRESS_INHERIT; ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - /* - * NB: we have to do the dbuf_rele after we've changed the - * dn_bonuslen, for the sake of dbuf_verify(). - */ - if (db) - dbuf_rele(db, FTAG); + /* XXX - for now, we can't make nblkptr smaller */ + ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr); + + /* fix up the bonus db_size if dn_nblkptr has changed */ + if (dn->dn_bonus && dn->dn_bonuslen != old_nblkptr) { + dn->dn_bonus->db.db_size = + DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); + } dn->dn_allocated_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); @@ -559,6 +561,12 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, dmu_buf_impl_t *db; dnode_t **children_dnodes; + /* + * If you are holding the spa config lock as writer, you shouldn't + * be asking the DMU to do *anything*. + */ + ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0); + if (object == 0 || object >= DN_MAX_OBJECT) return (EINVAL); @@ -602,9 +610,10 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, } if ((dn = children_dnodes[idx]) == NULL) { + dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx; dnode_t *winner; - dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx, - db, object); + + dn = dnode_create(os, dnp, db, object); winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn); if (winner != NULL) { dnode_destroy(dn); @@ -644,11 +653,22 @@ dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp) return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); } -void +/* + * Can only add a reference if there is already at least one + * reference on the dnode. Returns FALSE if unable to add a + * new reference. + */ +boolean_t dnode_add_ref(dnode_t *dn, void *tag) { - ASSERT(refcount_count(&dn->dn_holds) > 0); - (void) refcount_add(&dn->dn_holds, tag); + mutex_enter(&dn->dn_mtx); + if (refcount_is_zero(&dn->dn_holds)) { + mutex_exit(&dn->dn_mtx); + return (FALSE); + } + VERIFY(1 < refcount_add(&dn->dn_holds, tag)); + mutex_exit(&dn->dn_mtx); + return (TRUE); } void @@ -656,7 +676,9 @@ dnode_rele(dnode_t *dn, void *tag) { uint64_t refs; + mutex_enter(&dn->dn_mtx); refs = refcount_remove(&dn->dn_holds, tag); + mutex_exit(&dn->dn_mtx); /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ if (refs == 0 && dn->dn_dbuf) dbuf_rele(dn->dn_dbuf, dn); @@ -692,6 +714,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); + ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0); ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0); dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", @@ -714,7 +737,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) * dnode will hang around after we finish processing its * children. 
*/ - dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg); + VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); (void) dbuf_dirty(dn->dn_dbuf, tx); @@ -762,7 +785,7 @@ int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { dmu_buf_impl_t *db, *db_next; - int have_db0 = FALSE; + int err; if (size == 0) size = SPA_MINBLOCKSIZE; @@ -787,9 +810,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); - if (db->db_blkid == 0) { - have_db0 = TRUE; - } else if (db->db_blkid != DB_BONUS_BLKID) { + if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); goto fail; } @@ -799,12 +820,12 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) if (ibs && dn->dn_nlevels != 1) goto fail; - db = NULL; - if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) { - /* obtain the old block */ - db = dbuf_hold(dn, 0, FTAG); + /* resize the old block */ + err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); + if (err == 0) dbuf_new_size(db, size, tx); - } + else if (err != ENOENT) + goto fail; dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); @@ -813,7 +834,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) dn->dn_indblkshift = ibs; dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; } - + /* rele after we have fixed the blocksize in the dnode */ if (db) dbuf_rele(db, FTAG); @@ -825,19 +846,32 @@ fail: return (ENOTSUP); } +/* read-holding callers must not rely on the lock being continuously held */ void -dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) +dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) { uint64_t txgoff = tx->tx_txg & TXG_MASK; - int drop_struct_lock = FALSE; int epbs, new_nlevels; uint64_t sz; ASSERT(blkid != DB_BONUS_BLKID); - if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - drop_struct_lock = TRUE; + ASSERT(have_read ? + RW_READ_HELD(&dn->dn_struct_rwlock) : + RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + /* + * if we have a read-lock, check to see if we need to do any work + * before upgrading to a write-lock. 
+ */ + if (have_read) { + if (blkid <= dn->dn_maxblkid) + return; + + if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + } } if (blkid <= dn->dn_maxblkid) @@ -889,8 +923,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) } out: - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); + if (have_read) + rw_downgrade(&dn->dn_struct_rwlock); } void @@ -951,15 +985,15 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { dmu_buf_impl_t *db; uint64_t blkoff, blkid, nblks; - int blksz, head; + int blksz, blkshift, head, tail; int trunc = FALSE; + int epbs; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); blksz = dn->dn_datablksz; + blkshift = dn->dn_datablkshift; + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - /* If the range is past the end of the file, this is a no-op */ - if (off >= blksz * (dn->dn_maxblkid+1)) - goto out; if (len == -1ULL) { len = UINT64_MAX - off; trunc = TRUE; @@ -971,11 +1005,18 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (ISP2(blksz)) { head = P2NPHASE(off, blksz); blkoff = P2PHASE(off, blksz); + if ((off >> blkshift) > dn->dn_maxblkid) + goto out; } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { - /* Freeing the whole block; don't do any head. */ - head = 0; + /* Freeing the whole block; fast-track this request */ + blkid = 0; + nblks = 1; + goto done; + } else if (off >= blksz) { + /* Freeing past end-of-data */ + goto out; } else { /* Freeing part of the block. */ head = blksz - off; @@ -1008,88 +1049,95 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) } /* If the range was less than one block, we're done */ - if (len == 0 || off >= blksz * (dn->dn_maxblkid+1)) + if (len == 0) goto out; - if (!ISP2(blksz)) { - /* - * They are freeing the whole block of a - * non-power-of-two blocksize file. Skip all the messy - * math. 
- */ - ASSERT3U(off, ==, 0); - ASSERT3U(len, >=, blksz); - blkid = 0; - nblks = 1; - } else { - int tail; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - int blkshift = dn->dn_datablkshift; - - /* If the remaining range is past end of file, we're done */ - if (off > dn->dn_maxblkid << blkshift) - goto out; + /* If the remaining range is past end of file, we're done */ + if ((off >> blkshift) > dn->dn_maxblkid) + goto out; - if (off + len == UINT64_MAX) - tail = 0; - else - tail = P2PHASE(len, blksz); - - ASSERT3U(P2PHASE(off, blksz), ==, 0); - /* zero out any partial block data at the end of the range */ - if (tail) { - if (len < tail) - tail = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), - TRUE, FTAG, &db) == 0) { - /* don't dirty if not on disk and not dirty */ - if (db->db_last_dirty || - (db->db_blkptr && - !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); - dbuf_will_dirty(db, tx); - rw_enter(&dn->dn_struct_rwlock, - RW_WRITER); - bzero(db->db.db_data, tail); - } - dbuf_rele(db, FTAG); + ASSERT(ISP2(blksz)); + if (trunc) + tail = 0; + else + tail = P2PHASE(len, blksz); + + ASSERT3U(P2PHASE(off, blksz), ==, 0); + /* zero out any partial block data at the end of the range */ + if (tail) { + if (len < tail) + tail = len; + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), + TRUE, FTAG, &db) == 0) { + /* don't dirty if not on disk and not dirty */ + if (db->db_last_dirty || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { + rw_exit(&dn->dn_struct_rwlock); + dbuf_will_dirty(db, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + bzero(db->db.db_data, tail); } - len -= tail; + dbuf_rele(db, FTAG); } - /* If the range did not include a full block, we are done */ - if (len == 0) - goto out; + len -= tail; + } - /* dirty the left indirects */ - if (dn->dn_nlevels > 1 && off != 0) { - db = dbuf_hold_level(dn, 1, - (off - head) >> (blkshift + epbs), FTAG); + /* If the range did not include a full block, we are done */ + if (len == 0) + goto out; + + ASSERT(IS_P2ALIGNED(off, blksz)); + ASSERT(trunc || IS_P2ALIGNED(len, blksz)); + blkid = off >> blkshift; + nblks = len >> blkshift; + if (trunc) + nblks += 1; + + /* + * Read in and mark all the level-1 indirects dirty, + * so that they will stay in memory until syncing phase. + * Always dirty the first and last indirect to make sure + * we dirty all the partial indirects. + */ + if (dn->dn_nlevels > 1) { + uint64_t i, first, last; + int shift = epbs + dn->dn_datablkshift; + + first = blkid >> epbs; + if (db = dbuf_hold_level(dn, 1, first, FTAG)) { dbuf_will_dirty(db, tx); dbuf_rele(db, FTAG); } - - /* dirty the right indirects */ - if (dn->dn_nlevels > 1 && !trunc) { - db = dbuf_hold_level(dn, 1, - (off + len + tail - 1) >> (blkshift + epbs), FTAG); + if (trunc) + last = dn->dn_maxblkid >> epbs; + else + last = (blkid + nblks - 1) >> epbs; + if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) { dbuf_will_dirty(db, tx); dbuf_rele(db, FTAG); } - - /* - * Finally, add this range to the dnode range list, we - * will finish up this free operation in the syncing phase. - */ - ASSERT(IS_P2ALIGNED(off, 1<<blkshift)); - ASSERT(off + len == UINT64_MAX || - IS_P2ALIGNED(len, 1<<blkshift)); - blkid = off >> blkshift; - nblks = len >> blkshift; - - if (trunc) - dn->dn_maxblkid = (blkid ? 
blkid - 1 : 0); + for (i = first + 1; i < last; i++) { + uint64_t ibyte = i << shift; + int err; + + err = dnode_next_offset(dn, + DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0); + i = ibyte >> shift; + if (err == ESRCH || i >= last) + break; + ASSERT(err == 0); + db = dbuf_hold_level(dn, 1, i, FTAG); + if (db) { + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } + } } - +done: + /* + * Add this range to the dnode range list. + * We will finish up this free operation in the syncing phase. + */ mutex_enter(&dn->dn_mtx); dnode_clear_range(dn, blkid, nblks, tx); { @@ -1109,9 +1157,12 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) } mutex_exit(&dn->dn_mtx); - dbuf_free_range(dn, blkid, nblks, tx); + dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); out: + if (trunc && dn->dn_maxblkid >= (off >> blkshift)) + dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0); + rw_exit(&dn->dn_struct_rwlock); } @@ -1179,7 +1230,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta) ASSERT3U(space, >=, -delta); /* no underflow */ } space += delta; - if (spa_version(dn->dn_objset->os_spa) < ZFS_VERSION_DNODE_BYTES) { + if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0); dn->dn_phys->dn_used = space >> DEV_BSHIFT; @@ -1211,7 +1262,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) } static int -dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, +dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, int lvl, uint64_t blkfill, uint64_t txg) { dmu_buf_impl_t *db = NULL; @@ -1219,11 +1270,16 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t epb = 1ULL << epbs; uint64_t minfill, maxfill; - int i, error, span; + boolean_t hole; + int i, inc, error, span; dprintf("probing object %llu offset %llx level %d of %u\n", dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); + hole = flags & DNODE_FIND_HOLE; + inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; + ASSERT(txg == 0 || !hole); + if (lvl == dn->dn_phys->dn_nlevels) { error = 0; epb = dn->dn_phys->dn_nblkptr; @@ -1232,9 +1288,18 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); if (error) { - if (error == ENOENT) - return (hole ? 0 : ESRCH); - return (error); + if (error != ENOENT) + return (error); + if (hole) + return (0); + /* + * This can only happen when we are searching up + * the block tree for data. We don't really need to + * adjust the offset, as we will just end up looking + * at the pointer to this block in its parent, and it's + * going to be unallocated, so we will skip over it. + */ + return (ESRCH); } error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); if (error) { @@ -1246,13 +1311,18 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, if (db && txg && (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) { + /* + * This can only happen when we are searching up the tree + * and these conditions mean that we need to keep climbing. 
+ */ error = ESRCH; } else if (lvl == 0) { dnode_phys_t *dnp = data; span = DNODE_SHIFT; ASSERT(dn->dn_type == DMU_OT_DNODE); - for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) { + for (i = (*offset >> span) & (blkfill - 1); + i >= 0 && i < blkfill; i += inc) { boolean_t newcontents = B_TRUE; if (txg) { int j; @@ -1264,9 +1334,9 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, } if (!dnp[i].dn_type == hole && newcontents) break; - *offset += 1ULL << span; + *offset += (1ULL << span) * inc; } - if (i == blkfill) + if (i < 0 || i == blkfill) error = ESRCH; } else { blkptr_t *bp = data; @@ -1280,14 +1350,17 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, minfill++; for (i = (*offset >> span) & ((1ULL << epbs) - 1); - i < epb; i++) { + i >= 0 && i < epb; i += inc) { if (bp[i].blk_fill >= minfill && bp[i].blk_fill <= maxfill && - bp[i].blk_birth > txg) + (hole || bp[i].blk_birth > txg)) break; - *offset += 1ULL << span; + if (inc < 0 && *offset < (1ULL << span)) + *offset = 0; + else + *offset += (1ULL << span) * inc; } - if (i >= epb) + if (i < 0 || i == epb) error = ESRCH; } @@ -1306,64 +1379,66 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, * * Examples: * - * dnode_next_offset(dn, hole, offset, 1, 1, 0); - * Finds the next hole/data in a file. + * dnode_next_offset(dn, flags, offset, 1, 1, 0); + * Finds the next/previous hole/data in a file. * Used in dmu_offset_next(). * - * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg); + * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); * Finds the next free/allocated dnode in an objset's meta-dnode. * Only finds objects that have new contents since txg (ie. * bonus buffer changes and content removal are ignored). * Used in dmu_object_next(). * - * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0); + * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); * Finds the next L2 meta-dnode bp that's at most 1/4 full. * Used in dmu_object_alloc(). */ int -dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset, +dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { + uint64_t initial_offset = *offset; int lvl, maxlvl; int error = 0; - uint64_t initial_offset = *offset; - rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (!(flags & DNODE_FIND_HAVELOCK)) + rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_phys->dn_nlevels == 0) { - rw_exit(&dn->dn_struct_rwlock); - return (ESRCH); + error = ESRCH; + goto out; } if (dn->dn_datablkshift == 0) { if (*offset < dn->dn_datablksz) { - if (hole) + if (flags & DNODE_FIND_HOLE) *offset = dn->dn_datablksz; } else { error = ESRCH; } - rw_exit(&dn->dn_struct_rwlock); - return (error); + goto out; } maxlvl = dn->dn_phys->dn_nlevels; for (lvl = minlvl; lvl <= maxlvl; lvl++) { error = dnode_next_offset_level(dn, - hole, offset, lvl, blkfill, txg); + flags, offset, lvl, blkfill, txg); if (error != ESRCH) break; } - while (--lvl >= minlvl && error == 0) { + while (error == 0 && --lvl >= minlvl) { error = dnode_next_offset_level(dn, - hole, offset, lvl, blkfill, txg); + flags, offset, lvl, blkfill, txg); } - rw_exit(&dn->dn_struct_rwlock); - - if (error == 0 && initial_offset > *offset) + if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? 
+ initial_offset < *offset : initial_offset > *offset)) error = ESRCH; +out: + if (!(flags & DNODE_FIND_HAVELOCK)) + rw_exit(&dn->dn_struct_rwlock); return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 9e8c7adbda01..a46d4e70abc8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -55,9 +55,8 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) ASSERT(db != NULL); dn->dn_phys->dn_nlevels = new_level; - dprintf("os=%p obj=%llu, increase to %d\n", - dn->dn_objset, dn->dn_object, - dn->dn_phys->dn_nlevels); + dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, + dn->dn_object, dn->dn_phys->dn_nlevels); /* check for existing blkptrs in the dnode */ for (i = 0; i < nblkptr; i++) @@ -110,25 +109,26 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) rw_exit(&dn->dn_struct_rwlock); } -static void +static int free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) { - objset_impl_t *os = dn->dn_objset; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint64_t bytesfreed = 0; - int i; + int i, blocks_freed = 0; - dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num); + dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); for (i = 0; i < num; i++, bp++) { if (BP_IS_HOLE(bp)) continue; - bytesfreed += bp_get_dasize(os->os_spa, bp); + bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); - dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx); bzero(bp, sizeof (blkptr_t)); + blocks_freed += 1; } dnode_diduse_space(dn, -bytesfreed); + return (blocks_freed); } #ifdef ZFS_DEBUG @@ -160,7 +160,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(db->db_dnode, db->db_level-1, - (db->db_blkid << epbs) + i, TRUE, FTAG, &child); + (db->db_blkid << epbs) + i, TRUE, FTAG, &child); rw_exit(&db->db_dnode->dn_struct_rwlock); if (err == ENOENT) continue; @@ -178,7 +178,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) if (buf[j] != 0) { panic("freed data not zero: " "child=%p i=%d off=%d num=%d\n", - child, i, off, num); + (void *)child, i, off, num); } } } @@ -195,7 +195,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) if (buf[j] != 0) { panic("freed data not zero: " "child=%p i=%d off=%d num=%d\n", - child, i, off, num); + (void *)child, i, off, num); } } } @@ -206,6 +206,8 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) } #endif +#define ALL -1 + static int free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, dmu_tx_t *tx) @@ -216,8 +218,18 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, uint64_t start, end, dbstart, dbend, i; int epbs, shift, err; int all = TRUE; + int blocks_freed = 0; + + /* + * There is a small possibility that this block will not be cached: + * 1 - if level > 1 and there are no children with level <= 1 + * 2 - if we didn't get a dirty hold (because this block had just + * finished being written -- and so had no holds), and then this + * block 
got evicted before we got here. + */ + if (db->db_state != DB_CACHED) + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); arc_release(db->db_buf, db); bp = (blkptr_t *)db->db.db_data; @@ -241,10 +253,10 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, if (db->db_level == 1) { FREE_VERIFY(db, start, end, tx); - free_blocks(dn, bp, end-start+1, tx); + blocks_freed = free_blocks(dn, bp, end-start+1, tx); arc_buf_freeze(db->db_buf); - ASSERT(all || db->db_last_dirty); - return (all); + ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + return (all ? ALL : blocks_freed); } for (i = start; i <= end; i++, bp++) { @@ -255,9 +267,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, ASSERT3U(err, ==, 0); rw_exit(&dn->dn_struct_rwlock); - if (free_children(subdb, blkid, nblks, trunc, tx)) { + if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) { ASSERT3P(subdb->db_blkptr, ==, bp); - free_blocks(dn, bp, 1, tx); + blocks_freed += free_blocks(dn, bp, 1, tx); } else { all = FALSE; } @@ -274,8 +286,8 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, ASSERT3U(bp->blk_birth, ==, 0); } #endif - ASSERT(all || db->db_last_dirty); - return (all); + ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + return (all ? ALL : blocks_freed); } /* @@ -305,15 +317,14 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) return; } ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); - free_blocks(dn, bp + blkid, nblks, tx); + (void) free_blocks(dn, bp + blkid, nblks, tx); if (trunc) { uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, FALSE, &off, - 1, 1, 0) != 0); + dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); } return; } @@ -331,9 +342,9 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) ASSERT3U(err, ==, 0); rw_exit(&dn->dn_struct_rwlock); - if (free_children(db, blkid, nblks, trunc, tx)) { + if (free_children(db, blkid, nblks, trunc, tx) == ALL) { ASSERT3P(db->db_blkptr, ==, bp); - free_blocks(dn, bp, 1, tx); + (void) free_blocks(dn, bp, 1, tx); } dbuf_rele(db, FTAG); } @@ -343,15 +354,15 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0); + dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); } } /* * Try to kick all the dnode's dbufs out of the cache... 
*/ -int -dnode_evict_dbufs(dnode_t *dn, int try) +void +dnode_evict_dbufs(dnode_t *dn) { int progress; int pass = 0; @@ -367,6 +378,7 @@ dnode_evict_dbufs(dnode_t *dn, int try) for (; db != ▮ db = list_head(&dn->dn_dbufs)) { list_remove(&dn->dn_dbufs, db); list_insert_tail(&dn->dn_dbufs, db); + ASSERT3P(db->db_dnode, ==, dn); mutex_enter(&db->db_mtx); if (db->db_state == DB_EVICTING) { @@ -375,7 +387,6 @@ dnode_evict_dbufs(dnode_t *dn, int try) mutex_exit(&db->db_mtx); } else if (refcount_is_zero(&db->db_holds)) { progress = TRUE; - ASSERT(!arc_released(db->db_buf)); dbuf_clear(db); /* exits db_mtx for us */ } else { mutex_exit(&db->db_mtx); @@ -397,21 +408,6 @@ dnode_evict_dbufs(dnode_t *dn, int try) ASSERT(pass < 100); /* sanity check */ } while (progress); - /* - * This function works fine even if it can't evict everything. - * If were only asked to try to evict everything then - * return an error if we can't. Otherwise panic as the caller - * expects total eviction. - */ - if (list_head(&dn->dn_dbufs) != NULL) { - if (try) { - return (1); - } else { - panic("dangling dbufs (dn=%p, dbuf=%p)\n", - dn, list_head(&dn->dn_dbufs)); - } - } - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); @@ -419,7 +415,6 @@ dnode_evict_dbufs(dnode_t *dn, int try) dn->dn_bonus = NULL; } rw_exit(&dn->dn_struct_rwlock); - return (0); } static void @@ -460,8 +455,15 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); + /* + * Our contents should have been freed in dnode_sync() by the + * free range record inserted by the caller of dnode_free(). + */ + ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0); + ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr)); + dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); - (void) dnode_evict_dbufs(dn, 0); + dnode_evict_dbufs(dn); ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); /* @@ -479,10 +481,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dn->dn_next_indblkshift[txgoff] = 0; dn->dn_next_blksz[txgoff] = 0; - /* free up all the blocks in the file. 
*/ - dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx); - ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0); - /* ASSERT(blkptrs are zero); */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ASSERT(dn->dn_type != DMU_OT_NONE); @@ -496,6 +494,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dn->dn_type = DMU_OT_NONE; dn->dn_maxblkid = 0; dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; mutex_exit(&dn->dn_mtx); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); @@ -558,7 +557,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], SPA_MINBLOCKSIZE) == 0); ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || - list_head(list) != NULL || + dn->dn_maxblkid == 0 || list_head(list) != NULL || dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == dnp->dn_datablkszsec); dnp->dn_datablkszsec = @@ -566,6 +565,15 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dn->dn_next_blksz[txgoff] = 0; } + if (dn->dn_next_bonuslen[txgoff]) { + if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN) + dnp->dn_bonuslen = 0; + else + dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; + ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN); + dn->dn_next_bonuslen[txgoff] = 0; + } + if (dn->dn_next_indblkshift[txgoff]) { ASSERT(dnp->dn_nlevels == 1); dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; @@ -583,20 +591,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); /* process all the "freed" ranges in the file */ - if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) { - for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL; - rp = AVL_PREV(&dn->dn_ranges[txgoff], rp)) - dnode_sync_free_range(dn, - rp->fr_blkid, rp->fr_nblks, tx); + while (rp = avl_last(&dn->dn_ranges[txgoff])) { + dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx); + /* grab the mutex so we don't race with dnode_block_freed() */ + mutex_enter(&dn->dn_mtx); + avl_remove(&dn->dn_ranges[txgoff], rp); + mutex_exit(&dn->dn_mtx); + kmem_free(rp, sizeof (free_range_t)); } - mutex_enter(&dn->dn_mtx); - for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) { - free_range_t *last = rp; - rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp); - avl_remove(&dn->dn_ranges[txgoff], last); - kmem_free(last, sizeof (free_range_t)); - } - mutex_exit(&dn->dn_mtx); if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) { dnode_sync_free(dn, tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index 7d4689f3352a..20d8ec85cc91 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu_objset.h> #include <sys/dsl_dataset.h> #include <sys/dsl_dir.h> @@ -38,35 +36,44 @@ #include <sys/unique.h> #include <sys/zfs_context.h> #include <sys/zfs_ioctl.h> +#include <sys/spa.h> +#include <sys/zfs_znode.h> +#include <sys/sunddi.h> + +static char *dsl_reaper = "the grim reaper"; static dsl_checkfunc_t dsl_dataset_destroy_begin_check; static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; static dsl_checkfunc_t dsl_dataset_rollback_check; static dsl_syncfunc_t dsl_dataset_rollback_sync; -static dsl_checkfunc_t dsl_dataset_destroy_check; -static dsl_syncfunc_t dsl_dataset_destroy_sync; +static dsl_syncfunc_t dsl_dataset_set_reservation_sync; #define DS_REF_MAX (1ULL << 62) #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE +#define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) + + /* - * We use weighted reference counts to express the various forms of exclusion - * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open - * is DS_REF_MAX, and a PRIMARY open is little more than half of an EXCLUSIVE. - * This makes the exclusion logic simple: the total refcnt for all opens cannot - * exceed DS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their - * weight (DS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume - * just over half of the refcnt space, so there can't be more than one, but it - * can peacefully coexist with any number of STANDARD opens. + * Figure out how much of this delta should be propagated to the dsl_dir + * layer. If there's a refreservation, that space has already been + * partially accounted for in our ancestors. */ -static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = { - 0, /* DS_MODE_NONE - invalid */ - 1, /* DS_MODE_STANDARD - unlimited number */ - (DS_REF_MAX >> 1) + 1, /* DS_MODE_PRIMARY - only one of these */ - DS_REF_MAX /* DS_MODE_EXCLUSIVE - no other opens */ -}; +static int64_t +parent_delta(dsl_dataset_t *ds, int64_t delta) +{ + uint64_t old_bytes, new_bytes; + if (ds->ds_reserved == 0) + return (delta); + + old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); + new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved); + + ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); + return (new_bytes - old_bytes); +} void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) @@ -74,6 +81,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); + int64_t delta; dprintf_bp(bp, "born, ds=%p\n", ds); @@ -89,23 +97,28 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) * dsl_dir. 
*/ ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, + dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, used, compressed, uncompressed, tx); dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); return; } dmu_buf_will_dirty(ds->ds_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); + delta = parent_delta(ds, used); ds->ds_phys->ds_used_bytes += used; ds->ds_phys->ds_compressed_bytes += compressed; ds->ds_phys->ds_uncompressed_bytes += uncompressed; ds->ds_phys->ds_unique_bytes += used; mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, - used, compressed, uncompressed, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, + compressed, uncompressed, tx); + dsl_dir_transfer_space(ds->ds_dir, used - delta, + DD_USED_REFRSRV, DD_USED_HEAD, tx); + mutex_exit(&ds->ds_dir->dd_lock); } -void +int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, dmu_tx_t *tx) { @@ -113,10 +126,11 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); + ASSERT(pio != NULL); ASSERT(dmu_tx_is_syncing(tx)); /* No block pointer => nothing to free */ if (BP_IS_HOLE(bp)) - return; + return (0); ASSERT(used > 0); if (ds == NULL) { @@ -125,51 +139,59 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, * Account for the meta-objset space in its placeholder * dataset. */ - err = arc_free(pio, tx->tx_pool->dp_spa, - tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT); + err = dsl_free(pio, tx->tx_pool, + tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); ASSERT(err == 0); - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, + dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, -used, -compressed, -uncompressed, tx); dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); - return; + return (used); } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); + ASSERT(!dsl_dataset_is_snapshot(ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { int err; + int64_t delta; dprintf_bp(bp, "freeing: %s", ""); - err = arc_free(pio, tx->tx_pool->dp_spa, - tx->tx_txg, bp, NULL, NULL, pio ? 
ARC_NOWAIT: ARC_WAIT); + err = dsl_free(pio, tx->tx_pool, + tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); ASSERT(err == 0); + mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); - /* XXX unique_bytes is not accurate for head datasets */ - /* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */ + ASSERT(ds->ds_phys->ds_unique_bytes >= used || + !DS_UNIQUE_IS_ACCURATE(ds)); + delta = parent_delta(ds, -used); ds->ds_phys->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, - -used, -compressed, -uncompressed, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + delta, -compressed, -uncompressed, tx); + dsl_dir_transfer_space(ds->ds_dir, -used - delta, + DD_USED_REFRSRV, DD_USED_HEAD, tx); + mutex_exit(&ds->ds_dir->dd_lock); } else { dprintf_bp(bp, "putting on dead list: %s", ""); VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); + ASSERT3U(ds->ds_prev->ds_object, ==, + ds->ds_phys->ds_prev_snap_obj); + ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ - if (ds->ds_phys->ds_prev_snap_obj != 0) { - ASSERT3U(ds->ds_prev->ds_object, ==, - ds->ds_phys->ds_prev_snap_obj); - ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); - if (ds->ds_prev->ds_phys->ds_next_snap_obj == - ds->ds_object && bp->blk_birth > - ds->ds_prev->ds_phys->ds_prev_snap_txg) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - mutex_enter(&ds->ds_prev->ds_lock); - ds->ds_prev->ds_phys->ds_unique_bytes += - used; - mutex_exit(&ds->ds_prev->ds_lock); - } + if (ds->ds_prev->ds_phys->ds_next_snap_obj == + ds->ds_object && bp->blk_birth > + ds->ds_prev->ds_phys->ds_prev_snap_txg) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + mutex_enter(&ds->ds_prev->ds_lock); + ds->ds_prev->ds_phys->ds_unique_bytes += used; + mutex_exit(&ds->ds_prev->ds_lock); + } + if (bp->blk_birth > ds->ds_origin_txg) { + dsl_dir_transfer_space(ds->ds_dir, used, + DD_USED_HEAD, DD_USED_SNAP, tx); } } mutex_enter(&ds->ds_lock); @@ -180,6 +202,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); ds->ds_phys->ds_uncompressed_bytes -= uncompressed; mutex_exit(&ds->ds_lock); + + return (used); } uint64_t @@ -216,32 +240,38 @@ static void dsl_dataset_evict(dmu_buf_t *db, void *dsv) { dsl_dataset_t *ds = dsv; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - /* open_refcount == DS_REF_MAX when deleting */ - ASSERT(ds->ds_open_refcount == 0 || - ds->ds_open_refcount == DS_REF_MAX); + ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); dprintf_ds(ds, "evicting %s\n", ""); - unique_remove(ds->ds_phys->ds_fsid_guid); + unique_remove(ds->ds_fsid_guid); if (ds->ds_user_ptr != NULL) ds->ds_user_evict_func(ds, ds->ds_user_ptr); if (ds->ds_prev) { - dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds); + dsl_dataset_drop_ref(ds->ds_prev, ds); ds->ds_prev = NULL; } bplist_close(&ds->ds_deadlist); - dsl_dir_close(ds->ds_dir, ds); + if (ds->ds_dir) + dsl_dir_close(ds->ds_dir, ds); - if (list_link_active(&ds->ds_synced_link)) - list_remove(&dp->dp_synced_objsets, ds); + ASSERT(!list_link_active(&ds->ds_synced_link)); + if (mutex_owned(&ds->ds_lock)) + mutex_exit(&ds->ds_lock); mutex_destroy(&ds->ds_lock); + if (mutex_owned(&ds->ds_opening_lock)) + mutex_exit(&ds->ds_opening_lock); + mutex_destroy(&ds->ds_opening_lock); + if (mutex_owned(&ds->ds_deadlist.bpl_lock)) + mutex_exit(&ds->ds_deadlist.bpl_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); + rw_destroy(&ds->ds_rwlock); + 
cv_destroy(&ds->ds_exclusive_cv); kmem_free(ds, sizeof (dsl_dataset_t)); } @@ -266,16 +296,54 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds) return (err); headphys = headdbuf->db_data; err = zap_value_search(dp->dp_meta_objset, - headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname); + headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); dmu_buf_rele(headdbuf, FTAG); return (err); } -int -dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, - int mode, void *tag, dsl_dataset_t **dsp) +static int +dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + matchtype_t mt; + int err; + + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_FIRST; + else + mt = MT_EXACT; + + err = zap_lookup_norm(mos, snapobj, name, 8, 1, + value, mt, NULL, 0, NULL); + if (err == ENOTSUP && mt == MT_FIRST) + err = zap_lookup(mos, snapobj, name, 8, 1, value); + return (err); +} + +static int +dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + matchtype_t mt; + int err; + + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_FIRST; + else + mt = MT_EXACT; + + err = zap_remove_norm(mos, snapobj, name, mt, tx); + if (err == ENOTSUP && mt == MT_FIRST) + err = zap_remove(mos, snapobj, name, tx); + return (err); +} + +static int +dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, + dsl_dataset_t **dsp) { - uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)]; objset_t *mos = dp->dp_meta_objset; dmu_buf_t *dbuf; dsl_dataset_t *ds; @@ -297,8 +365,11 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, ds->ds_phys = dbuf->db_data; mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&ds->ds_rwlock, 0, 0, 0); + cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); err = bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); @@ -312,42 +383,65 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, * just opened it. 
*/ mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); + rw_destroy(&ds->ds_rwlock); + cv_destroy(&ds->ds_exclusive_cv); kmem_free(ds, sizeof (dsl_dataset_t)); dmu_buf_rele(dbuf, tag); return (err); } - if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) { + if (!dsl_dataset_is_snapshot(ds)) { ds->ds_snapname[0] = '\0'; if (ds->ds_phys->ds_prev_snap_obj) { - err = dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_NONE, ds, &ds->ds_prev); + err = dsl_dataset_get_ref(dp, + ds->ds_phys->ds_prev_snap_obj, + ds, &ds->ds_prev); } - } else { - if (snapname) { -#ifdef ZFS_DEBUG - dsl_dataset_phys_t *headphys; - dmu_buf_t *headdbuf; - err = dmu_bonus_hold(mos, - ds->ds_dir->dd_phys->dd_head_dataset_obj, - FTAG, &headdbuf); + + if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) { + dsl_dataset_t *origin; + + err = dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, + FTAG, &origin); if (err == 0) { - headphys = headdbuf->db_data; - uint64_t foundobj; - err = zap_lookup(dp->dp_meta_objset, - headphys->ds_snapnames_zapobj, - snapname, sizeof (foundobj), 1, - &foundobj); - ASSERT3U(foundobj, ==, dsobj); - dmu_buf_rele(headdbuf, FTAG); + ds->ds_origin_txg = + origin->ds_phys->ds_creation_txg; + dsl_dataset_rele(origin, FTAG); } -#endif - (void) strcat(ds->ds_snapname, snapname); - } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { - err = dsl_dataset_get_snapname(ds); } + } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { + err = dsl_dataset_get_snapname(ds); + } + + if (err == 0 && !dsl_dataset_is_snapshot(ds)) { + /* + * In sync context, we're called with either no lock + * or with the write lock. If we're not syncing, + * we're always called with the read lock held. + */ + boolean_t need_lock = + !RW_WRITE_HELD(&dp->dp_config_rwlock) && + dsl_pool_sync_context(dp); + + if (need_lock) + rw_enter(&dp->dp_config_rwlock, RW_READER); + + err = dsl_prop_get_ds(ds, + "refreservation", sizeof (uint64_t), 1, + &ds->ds_reserved, NULL); + if (err == 0) { + err = dsl_prop_get_ds(ds, + "refquota", sizeof (uint64_t), 1, + &ds->ds_quota, NULL); + } + + if (need_lock) + rw_exit(&dp->dp_config_rwlock); + } else { + ds->ds_reserved = ds->ds_quota = 0; } if (err == 0) { @@ -356,13 +450,14 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, } if (err || winner) { bplist_close(&ds->ds_deadlist); - if (ds->ds_prev) { - dsl_dataset_close(ds->ds_prev, - DS_MODE_NONE, ds); - } + if (ds->ds_prev) + dsl_dataset_drop_ref(ds->ds_prev, ds); dsl_dir_close(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); + rw_destroy(&ds->ds_rwlock); + cv_destroy(&ds->ds_exclusive_cv); kmem_free(ds, sizeof (dsl_dataset_t)); if (err) { dmu_buf_rele(dbuf, tag); @@ -370,101 +465,175 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, } ds = winner; } else { - uint64_t new = + ds->ds_fsid_guid = unique_insert(ds->ds_phys->ds_fsid_guid); - if (new != ds->ds_phys->ds_fsid_guid) { - /* XXX it won't necessarily be synced... 
*/ - ds->ds_phys->ds_fsid_guid = new; - } } } ASSERT3P(ds->ds_dbuf, ==, dbuf); ASSERT3P(ds->ds_phys, ==, dbuf->db_data); - + ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || + spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || + dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); mutex_enter(&ds->ds_lock); - if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY && - (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) && - !DS_MODE_IS_INCONSISTENT(mode)) || - (ds->ds_open_refcount + weight > DS_REF_MAX)) { + if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) { mutex_exit(&ds->ds_lock); - dsl_dataset_close(ds, DS_MODE_NONE, tag); - return (EBUSY); + dmu_buf_rele(ds->ds_dbuf, tag); + return (ENOENT); } - ds->ds_open_refcount += weight; mutex_exit(&ds->ds_lock); - *dsp = ds; return (0); } +static int +dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + /* + * In syncing context we don't want the rwlock: there + * may be an existing writer waiting for sync phase to + * finish. We don't need to worry about such writers, since + * sync phase is single-threaded, so the writer can't be + * doing anything while we are active. + */ + if (dsl_pool_sync_context(dp)) { + ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); + return (0); + } + + /* + * Normal users will hold the ds_rwlock as a READER until they + * are finished (i.e., call dsl_dataset_rele()). "Owners" will + * drop their READER lock after they set the ds_owner field. + * + * If the dataset is being destroyed, the destroy thread will + * obtain a WRITER lock for exclusive access after it's done its + * open-context work and then change the ds_owner to + * dsl_reaper once destruction is assured. So threads + * may block here temporarily, until the "destructibility" of + * the dataset is determined. 
+ */ + ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); + mutex_enter(&ds->ds_lock); + while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { + rw_exit(&dp->dp_config_rwlock); + cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); + if (DSL_DATASET_IS_DESTROYED(ds)) { + mutex_exit(&ds->ds_lock); + dsl_dataset_drop_ref(ds, tag); + rw_enter(&dp->dp_config_rwlock, RW_READER); + return (ENOENT); + } + rw_enter(&dp->dp_config_rwlock, RW_READER); + } + mutex_exit(&ds->ds_lock); + return (0); +} + +int +dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, + dsl_dataset_t **dsp) +{ + int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); + + if (err) + return (err); + return (dsl_dataset_hold_ref(*dsp, tag)); +} + int -dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, - void *tag, dsl_dataset_t **dsp) +dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner, + dsl_dataset_t **dsp) +{ + int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp); + + ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER); + + if (err) + return (err); + if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { + dsl_dataset_rele(*dsp, owner); + return (EBUSY); + } + return (0); +} + +int +dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) { dsl_dir_t *dd; dsl_pool_t *dp; - const char *tail; + const char *snapname; uint64_t obj; - dsl_dataset_t *ds = NULL; int err = 0; - err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail); + err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); if (err) return (err); dp = dd->dd_pool; obj = dd->dd_phys->dd_head_dataset_obj; rw_enter(&dp->dp_config_rwlock, RW_READER); - if (obj == 0) { - /* A dataset with no associated objset */ + if (obj) + err = dsl_dataset_get_ref(dp, obj, tag, dsp); + else err = ENOENT; + if (err) goto out; - } - if (tail != NULL) { - objset_t *mos = dp->dp_meta_objset; + err = dsl_dataset_hold_ref(*dsp, tag); - err = dsl_dataset_open_obj(dp, obj, NULL, - DS_MODE_NONE, tag, &ds); - if (err) - goto out; - obj = ds->ds_phys->ds_snapnames_zapobj; - dsl_dataset_close(ds, DS_MODE_NONE, tag); - ds = NULL; + /* we may be looking for a snapshot */ + if (err == 0 && snapname != NULL) { + dsl_dataset_t *ds = NULL; - if (tail[0] != '@') { + if (*snapname++ != '@') { + dsl_dataset_rele(*dsp, tag); err = ENOENT; goto out; } - tail++; - /* Look for a snapshot */ - if (!DS_MODE_IS_READONLY(mode)) { - err = EROFS; - goto out; + dprintf("looking for snapshot '%s'\n", snapname); + err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); + if (err == 0) + err = dsl_dataset_get_ref(dp, obj, tag, &ds); + dsl_dataset_rele(*dsp, tag); + + ASSERT3U((err == 0), ==, (ds != NULL)); + + if (ds) { + mutex_enter(&ds->ds_lock); + if (ds->ds_snapname[0] == 0) + (void) strlcpy(ds->ds_snapname, snapname, + sizeof (ds->ds_snapname)); + mutex_exit(&ds->ds_lock); + err = dsl_dataset_hold_ref(ds, tag); + *dsp = err ? 
NULL : ds; } - dprintf("looking for snapshot '%s'\n", tail); - err = zap_lookup(mos, obj, tail, 8, 1, &obj); - if (err) - goto out; } - err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds); - out: rw_exit(&dp->dp_config_rwlock); dsl_dir_close(dd, FTAG); - - ASSERT3U((err == 0), ==, (ds != NULL)); - /* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */ - - *dsp = ds; return (err); } int -dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp) +dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp) { - return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp)); + int err = dsl_dataset_hold(name, owner, dsp); + if (err) + return (err); + if ((*dsp)->ds_phys->ds_num_children > 0 && + !DS_MODE_IS_READONLY(flags)) { + dsl_dataset_rele(*dsp, owner); + return (EROFS); + } + if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { + dsl_dataset_rele(*dsp, owner); + return (EBUSY); + } + return (0); } void @@ -477,11 +646,11 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name) VERIFY(0 == dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { (void) strcat(name, "@"); + /* + * We use a "recursive" mutex so that we + * can call dprintf_ds() with ds_lock held. + */ if (!MUTEX_HELD(&ds->ds_lock)) { - /* - * We use a "recursive" mutex so that we - * can call dprintf_ds() with ds_lock held. - */ mutex_enter(&ds->ds_lock); (void) strcat(name, ds->ds_snapname); mutex_exit(&ds->ds_lock); @@ -505,7 +674,6 @@ dsl_dataset_namelen(dsl_dataset_t *ds) if (ds->ds_snapname[0]) { ++result; /* adding one for the @-sign */ if (!MUTEX_HELD(&ds->ds_lock)) { - /* see dsl_datset_name */ mutex_enter(&ds->ds_lock); result += strlen(ds->ds_snapname); mutex_exit(&ds->ds_lock); @@ -519,119 +687,160 @@ dsl_dataset_namelen(dsl_dataset_t *ds) } void -dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag) +dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) { - uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)]; + dmu_buf_rele(ds->ds_dbuf, tag); +} + +void +dsl_dataset_rele(dsl_dataset_t *ds, void *tag) +{ + if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { + rw_exit(&ds->ds_rwlock); + } + dsl_dataset_drop_ref(ds, tag); +} + +void +dsl_dataset_disown(dsl_dataset_t *ds, void *owner) +{ + ASSERT((ds->ds_owner == owner && ds->ds_dbuf) || + (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); + mutex_enter(&ds->ds_lock); - ASSERT3U(ds->ds_open_refcount, >=, weight); - ds->ds_open_refcount -= weight; - dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n", - mode, ds->ds_open_refcount); + ds->ds_owner = NULL; + if (RW_WRITE_HELD(&ds->ds_rwlock)) { + rw_exit(&ds->ds_rwlock); + cv_broadcast(&ds->ds_exclusive_cv); + } mutex_exit(&ds->ds_lock); + if (ds->ds_dbuf) + dsl_dataset_drop_ref(ds, owner); + else + dsl_dataset_evict(ds->ds_dbuf, ds); +} - dmu_buf_rele(ds->ds_dbuf, tag); +boolean_t +dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner) +{ + boolean_t gotit = FALSE; + + mutex_enter(&ds->ds_lock); + if (ds->ds_owner == NULL && + (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { + ds->ds_owner = owner; + if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) + rw_exit(&ds->ds_rwlock); + gotit = TRUE; + } + mutex_exit(&ds->ds_lock); + return (gotit); } void -dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx) +dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) { - objset_t *mos = dp->dp_meta_objset; + ASSERT3P(owner, ==, ds->ds_owner); + if (!RW_WRITE_HELD(&ds->ds_rwlock)) + rw_enter(&ds->ds_rwlock, RW_WRITER); +} 
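The functions above complete the new reference model that replaces the weighted-refcount open modes: shared access takes a hold (dsl_dataset_hold/dsl_dataset_rele), while exclusive use registers an owner (dsl_dataset_own/dsl_dataset_disown), with dsl_dataset_tryown() failing rather than blocking when another owner exists. As a reading aid, here is a minimal sketch of the intended calling pattern against the signatures added in this change; the wrapper function itself and the flags value 0 are illustrative assumptions, not code from this commit:

/*
 * Hypothetical caller of the hold/own API introduced above.
 * Holds give shared, read-mostly access; owning is required for
 * operations that must exclude other owners (e.g. destroy, rollback).
 */
static int
example_ds_usage(const char *name)
{
	dsl_dataset_t *ds;
	int err;

	/* Shared access: take a hold, inspect, release. */
	err = dsl_dataset_hold(name, FTAG, &ds);
	if (err)
		return (err);
	/* ... read ds->ds_phys fields under the hold ... */
	dsl_dataset_rele(ds, FTAG);

	/*
	 * Exclusive access: become the owner. dsl_dataset_tryown(),
	 * called from dsl_dataset_own(), returns EBUSY if the dataset
	 * already has an owner. Passing 0 (no DS_MODE_* bits set) is
	 * an assumption made for this sketch.
	 */
	err = dsl_dataset_own(name, 0, FTAG, &ds);
	if (err)
		return (err);
	/* ... operations requiring ownership ... */
	dsl_dataset_disown(ds, FTAG);
	return (0);
}

Note that an owner still holds the dataset (the owner tag doubles as the hold tag), which is why dsl_dataset_disown() drops the underlying reference itself rather than requiring a separate dsl_dataset_rele().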
+ +uint64_t +dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, + uint64_t flags, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dd->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; - dsl_dataset_t *ds; uint64_t dsobj; - dsl_dir_t *dd; + objset_t *mos = dp->dp_meta_objset; - dsl_dir_create_root(mos, ddobjp, tx); - VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd)); + if (origin == NULL) + origin = dp->dp_origin_snap; + + ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); + ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; + bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = dd->dd_object; + dsphys->ds_flags = flags; dsphys->ds_fsid_guid = unique_create(); - unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); dsphys->ds_snapnames_zapobj = - zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); + zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, + DMU_OT_NONE, 0, tx); dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = tx->tx_txg; + dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; dsphys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + + if (origin) { + dsphys->ds_prev_snap_obj = origin->ds_object; + dsphys->ds_prev_snap_txg = + origin->ds_phys->ds_creation_txg; + dsphys->ds_used_bytes = + origin->ds_phys->ds_used_bytes; + dsphys->ds_compressed_bytes = + origin->ds_phys->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = + origin->ds_phys->ds_uncompressed_bytes; + dsphys->ds_bp = origin->ds_phys->ds_bp; + dsphys->ds_flags |= origin->ds_phys->ds_flags; + + dmu_buf_will_dirty(origin->ds_dbuf, tx); + origin->ds_phys->ds_num_children++; + + if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { + if (origin->ds_phys->ds_next_clones_obj == 0) { + origin->ds_phys->ds_next_clones_obj = + zap_create(mos, + DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY(0 == zap_add_int(mos, + origin->ds_phys->ds_next_clones_obj, + dsobj, tx)); + } + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dd->dd_phys->dd_origin_obj = origin->ds_object; + } + + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_head_dataset_obj = dsobj; - dsl_dir_close(dd, FTAG); - VERIFY(0 == - dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds)); - (void) dmu_objset_create_impl(dp->dp_spa, ds, - &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + return (dsobj); } uint64_t -dsl_dataset_create_sync(dsl_dir_t *pdd, - const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx) +dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, + dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) { dsl_pool_t *dp = pdd->dd_pool; - dmu_buf_t *dbuf; - dsl_dataset_phys_t *dsphys; uint64_t dsobj, ddobj; - objset_t *mos = dp->dp_meta_objset; dsl_dir_t *dd; - ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp); - ASSERT(clone_parent == NULL || - clone_parent->ds_phys->ds_num_children > 0); ASSERT(lastname[0] != '@'); - 
ASSERT(dmu_tx_is_syncing(tx)); - ddobj = dsl_dir_create_sync(pdd, lastname, tx); + ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); - dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, - DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; - dsphys->ds_dir_obj = dd->dd_object; - dsphys->ds_fsid_guid = unique_create(); - unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ - (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, - sizeof (dsphys->ds_guid)); - dsphys->ds_snapnames_zapobj = - zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); - dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = tx->tx_txg; - dsphys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - if (clone_parent) { - dsphys->ds_prev_snap_obj = clone_parent->ds_object; - dsphys->ds_prev_snap_txg = - clone_parent->ds_phys->ds_creation_txg; - dsphys->ds_used_bytes = - clone_parent->ds_phys->ds_used_bytes; - dsphys->ds_compressed_bytes = - clone_parent->ds_phys->ds_compressed_bytes; - dsphys->ds_uncompressed_bytes = - clone_parent->ds_phys->ds_uncompressed_bytes; - dsphys->ds_bp = clone_parent->ds_phys->ds_bp; + dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); - dmu_buf_will_dirty(clone_parent->ds_dbuf, tx); - clone_parent->ds_phys->ds_num_children++; + dsl_deleg_set_create_perms(dd, tx, cr); - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object; - } - dmu_buf_rele(dbuf, FTAG); - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_head_dataset_obj = dsobj; dsl_dir_close(dd, FTAG); return (dsobj); @@ -653,21 +862,24 @@ dsl_snapshot_destroy_one(char *name, void *arg) (void) strcat(name, "@"); (void) strcat(name, da->snapname); - err = dsl_dataset_open(name, - DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, + err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT, da->dstg, &ds); cp = strchr(name, '@'); *cp = '\0'; - if (err == ENOENT) - return (0); - if (err) { + if (err == 0) { + dsl_dataset_make_exclusive(ds, da->dstg); + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } + dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, ds, da->dstg, 0); + } else if (err == ENOENT) { + err = 0; + } else { (void) strcpy(da->failed, name); - return (err); } - - dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, da->dstg, 0); - return (0); + return (err); } /* @@ -681,16 +893,8 @@ dsl_snapshots_destroy(char *fsname, char *snapname) struct destroyarg da; dsl_sync_task_t *dst; spa_t *spa; - char *cp; - cp = strchr(fsname, '/'); - if (cp) { - *cp = '\0'; - err = spa_open(fsname, &spa, FTAG); - *cp = '/'; - } else { - err = spa_open(fsname, &spa, FTAG); - } + err = spa_open(fsname, &spa, FTAG); if (err) return (err); da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); @@ -706,17 +910,14 @@ dsl_snapshots_destroy(char *fsname, char *snapname) for (dst = list_head(&da.dstg->dstg_tasks); dst; dst = list_next(&da.dstg->dstg_tasks, dst)) { dsl_dataset_t *ds = dst->dst_arg1; + /* + * Return the file system name that triggered the error + */ if (dst->dst_err) { dsl_dataset_name(ds, fsname); - cp = strchr(fsname, '@'); - *cp = '\0'; + *strchr(fsname, '@') = '\0'; } - /* - * If it was successful, 
destroy_sync would have - * closed the ds - */ - if (err) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, da.dstg); + dsl_dataset_disown(ds, da.dstg); } dsl_sync_task_group_destroy(da.dstg); @@ -724,36 +925,33 @@ dsl_snapshots_destroy(char *fsname, char *snapname) return (err); } +/* + * ds must be opened as OWNER. On return (whether successful or not), + * ds will be closed and caller can no longer dereference it. + */ int -dsl_dataset_destroy(const char *name) +dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) { int err; dsl_sync_task_group_t *dstg; objset_t *os; - dsl_dataset_t *ds; dsl_dir_t *dd; uint64_t obj; - if (strchr(name, '@')) { + if (dsl_dataset_is_snapshot(ds)) { /* Destroying a snapshot is simpler */ - err = dsl_dataset_open(name, - DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, - FTAG, &ds); - if (err) - return (err); + dsl_dataset_make_exclusive(ds, tag); + + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, - ds, FTAG, 0); - if (err) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - return (err); + ds, tag, 0); + goto out; } - err = dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os); - if (err) - return (err); - ds = os->os->os_dsl_dataset; dd = ds->ds_dir; /* @@ -762,10 +960,12 @@ dsl_dataset_destroy(const char *name) */ err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, dsl_dataset_destroy_begin_sync, ds, NULL, 0); - if (err) { - dmu_objset_close(os); - return (err); - } + if (err) + goto out; + + err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os); + if (err) + goto out; /* * remove the objects in open context, so that we won't @@ -773,66 +973,73 @@ dsl_dataset_destroy(const char *name) */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, ds->ds_phys->ds_prev_snap_txg)) { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END); - dmu_tx_hold_bonus(tx, obj); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - /* - * Perhaps there is not enough disk - * space. Just deal with it from - * dsl_dataset_destroy_sync(). - */ - dmu_tx_abort(tx); - continue; - } - VERIFY(0 == dmu_object_free(os, obj, tx)); - dmu_tx_commit(tx); + /* + * Ignore errors, if there is not enough disk space + * we will deal with it in dsl_dataset_destroy_sync(). + */ + (void) dmu_free_object(os, obj); } - /* Make sure it's not dirty before we finish destroying it. */ - txg_wait_synced(dd->dd_pool, 0); dmu_objset_close(os); if (err != ESRCH) - return (err); + goto out; + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); + rw_exit(&dd->dd_pool->dp_config_rwlock); - err = dsl_dataset_open(name, - DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, - FTAG, &ds); if (err) - return (err); + goto out; - err = dsl_dir_open(name, FTAG, &dd, NULL); - if (err) { - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - return (err); + if (ds->ds_user_ptr) { + /* + * We need to sync out all in-flight IO before we try + * to evict (the dataset evict func is trying to clear + * the cached entries for this dataset in the ARC). + */ + txg_wait_synced(dd->dd_pool, 0); } /* * Blow away the dsl_dir + head dataset. 
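 * (By this point dsl_dataset_destroy_begin_sync() has marked the dataset inconsistent on disk and the dmu_free_object() loop above has reclaimed its objects in open context, so the sync tasks below only need to remove the dsl_dir and the head dataset themselves.)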
*/ + dsl_dataset_make_exclusive(ds, tag); + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); dsl_sync_task_create(dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, FTAG, 0); + dsl_dataset_destroy_sync, ds, tag, 0); dsl_sync_task_create(dstg, dsl_dir_destroy_check, dsl_dir_destroy_sync, dd, FTAG, 0); err = dsl_sync_task_group_wait(dstg); dsl_sync_task_group_destroy(dstg); - /* if it is successful, *destroy_sync will close the ds+dd */ - if (err) { - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + /* if it is successful, dsl_dir_destroy_sync will close the dd */ + if (err) dsl_dir_close(dd, FTAG); - } +out: + dsl_dataset_disown(ds, tag); return (err); } int -dsl_dataset_rollback(dsl_dataset_t *ds) +dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost) { - ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX); - return (dsl_sync_task_do(ds->ds_dir->dd_pool, + int err; + + ASSERT(ds->ds_owner); + + dsl_dataset_make_exclusive(ds, ds->ds_owner); + err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_rollback_check, dsl_dataset_rollback_sync, - ds, NULL, 0)); + ds, &ost, 0); + /* drop exclusive access */ + mutex_enter(&ds->ds_lock); + rw_exit(&ds->ds_rwlock); + cv_broadcast(&ds->ds_exclusive_cv); + mutex_exit(&ds->ds_lock); + return (err); } void * @@ -904,14 +1111,56 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) } } +/* + * The unique space in the head dataset can be calculated by subtracting + * the space used in the most recent snapshot, that is still being used + * in this file system, from the space currently in use. To figure out + * the space in the most recent snapshot still in use, we need to take + * the total space used in the snapshot and subtract out the space that + * has been freed up since the snapshot was taken. + */ +static void +dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) +{ + uint64_t mrs_used; + uint64_t dlused, dlcomp, dluncomp; + + ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj); + + if (ds->ds_phys->ds_prev_snap_obj != 0) + mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; + else + mrs_used = 0; + + VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp, + &dluncomp)); + + ASSERT3U(dlused, <=, mrs_used); + ds->ds_phys->ds_unique_bytes = + ds->ds_phys->ds_used_bytes - (mrs_used - dlused); + + if (!DS_UNIQUE_IS_ACCURATE(ds) && + spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; +} + +static uint64_t +dsl_dataset_unique(dsl_dataset_t *ds) +{ + if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds)) + dsl_dataset_recalc_head_uniq(ds); + + return (ds->ds_phys->ds_unique_bytes); +} + struct killarg { - uint64_t *usedp; - uint64_t *compressedp; - uint64_t *uncompressedp; + dsl_dataset_t *ds; zio_t *zio; dmu_tx_t *tx; }; +/* ARGSUSED */ static int kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg) { @@ -920,16 +1169,9 @@ kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg) ASSERT3U(bc->bc_errno, ==, 0); - /* - * Since this callback is not called concurrently, no lock is - * needed on the accounting values. - */ - *ka->usedp += bp_get_dasize(spa, bp); - *ka->compressedp += BP_GET_PSIZE(bp); - *ka->uncompressedp += BP_GET_UCSIZE(bp); - /* XXX check for EIO? 
*/ - (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL, - ARC_NOWAIT); + ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); + (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx); + return (0); } @@ -938,14 +1180,12 @@ static int dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + dmu_objset_type_t *ost = arg2; /* - * There must be a previous snapshot. I suppose we could roll - * it back to being empty (and re-initialize the upper (ZPL) - * layer). But for now there's no way to do this via the user - * interface. + * We can only roll back to emptiness if it is a ZPL objset. */ - if (ds->ds_phys->ds_prev_snap_txg == 0) + if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0) return (EINVAL); /* @@ -966,13 +1206,44 @@ dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + dmu_objset_type_t *ost = arg2; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; dmu_buf_will_dirty(ds->ds_dbuf, tx); + /* + * Before the roll back, destroy the zil. + */ + if (ds->ds_user_ptr != NULL) { + zil_rollback_destroy( + ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx); + + /* + * We need to make sure that the objset_impl_t is reopened after + * we do the rollback, otherwise it will have the wrong + * objset_phys_t. Normally this would happen when this + * dataset-open is closed, thus causing the + * dataset to be immediately evicted. But when doing "zfs recv + * -F", we reopen the objset before that, so that there is no + * window where the dataset is closed and inconsistent. + */ + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } + + /* Transfer space that was freed since last snap back to the head. */ + { + uint64_t used; + + VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist, + ds->ds_origin_txg, UINT64_MAX, &used)); + dsl_dir_transfer_space(ds->ds_dir, used, + DD_USED_SNAP, DD_USED_HEAD, tx); + } + + /* Zero out the deadlist. 
*/ bplist_close(&ds->ds_deadlist); bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); @@ -984,39 +1255,65 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx) { /* Free blkptrs that we gave birth to */ zio_t *zio; - uint64_t used = 0, compressed = 0, uncompressed = 0; struct killarg ka; zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - ka.usedp = &used; - ka.compressedp = &compressed; - ka.uncompressedp = &uncompressed; + ka.ds = ds; ka.zio = zio; ka.tx = tx; (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, ADVANCE_POST, kill_blkptr, &ka); (void) zio_wait(zio); - - dsl_dir_diduse_space(ds->ds_dir, - -used, -compressed, -uncompressed, tx); } - /* Change our contents to that of the prev snapshot */ - ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj); - ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; - ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes; - ds->ds_phys->ds_compressed_bytes = - ds->ds_prev->ds_phys->ds_compressed_bytes; - ds->ds_phys->ds_uncompressed_bytes = - ds->ds_prev->ds_phys->ds_uncompressed_bytes; - ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags; - ds->ds_phys->ds_unique_bytes = 0; + ASSERT(!(ds->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) || + ds->ds_phys->ds_unique_bytes == 0); + + if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) { + /* Change our contents to that of the prev snapshot */ + + ASSERT3U(ds->ds_prev->ds_object, ==, + ds->ds_phys->ds_prev_snap_obj); + ASSERT3U(ds->ds_phys->ds_used_bytes, <=, + ds->ds_prev->ds_phys->ds_used_bytes); + + ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; + ds->ds_phys->ds_used_bytes = + ds->ds_prev->ds_phys->ds_used_bytes; + ds->ds_phys->ds_compressed_bytes = + ds->ds_prev->ds_phys->ds_compressed_bytes; + ds->ds_phys->ds_uncompressed_bytes = + ds->ds_prev->ds_phys->ds_uncompressed_bytes; + ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags; + + if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ds->ds_prev->ds_phys->ds_unique_bytes = 0; + } + } else { + objset_impl_t *osi; + + ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0); + ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0); + ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0); - if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ds->ds_prev->ds_phys->ds_unique_bytes = 0; + bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t)); + ds->ds_phys->ds_flags = 0; + ds->ds_phys->ds_unique_bytes = 0; + if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + + osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds, + &ds->ds_phys->ds_bp, *ost, tx); +#ifdef _KERNEL + zfs_create_fs(&osi->os, kcred, NULL, tx); +#endif } + + spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa, + tx, cr, "dataset = %llu", ds->ds_object); } /* ARGSUSED */ @@ -1024,6 +1321,9 @@ static int dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t count; + int err; /* * Can't delete a head dataset if there are snapshots of it. 
@@ -1034,26 +1334,44 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) return (EINVAL); + /* + * This is really a dsl_dir thing, but check it here so that + * we'll be less likely to leave this dataset inconsistent & + * nearly destroyed. + */ + err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); + if (err) + return (err); + if (count != 0) + return (EEXIST); + return (0); } /* ARGSUSED */ static void -dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + dsl_pool_t *dp = ds->ds_dir->dd_pool; /* Mark it as inconsistent on-disk, in case we crash */ dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + + spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, + cr, "dataset = %llu", ds->ds_object); } /* ARGSUSED */ -static int +int dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + /* we have an owner hold, so no one else can destroy us */ + ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); + /* Can't delete a branch point. */ if (ds->ds_phys->ds_num_children > 1) return (EEXIST); @@ -1078,11 +1396,50 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); } +struct refsarg { + kmutex_t lock; + boolean_t gone; + kcondvar_t cv; +}; + +/* ARGSUSED */ +static void +dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) +{ + struct refsarg *arg = argv; + + mutex_enter(&arg->lock); + arg->gone = TRUE; + cv_signal(&arg->cv); + mutex_exit(&arg->lock); +} + static void -dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) +dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) +{ + struct refsarg arg; + + mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); + arg.gone = FALSE; + (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, + dsl_dataset_refs_gone); + dmu_buf_rele(ds->ds_dbuf, tag); + mutex_enter(&arg.lock); + while (!arg.gone) + cv_wait(&arg.cv, &arg.lock); + ASSERT(arg.gone); + mutex_exit(&arg.lock); + ds->ds_dbuf = NULL; + ds->ds_phys = NULL; + mutex_destroy(&arg.lock); + cv_destroy(&arg.cv); +} + +void +dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t used = 0, compressed = 0, uncompressed = 0; zio_t *zio; int err; int after_branch_point = FALSE; @@ -1091,29 +1448,53 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) dsl_dataset_t *ds_prev = NULL; uint64_t obj; - ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX); + ASSERT(ds->ds_owner); ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); ASSERT(ds->ds_prev == NULL || ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + /* signal any waiters that this dataset is going away */ + mutex_enter(&ds->ds_lock); + ds->ds_owner = dsl_reaper; + cv_broadcast(&ds->ds_exclusive_cv); + mutex_exit(&ds->ds_lock); + + /* Remove our reservation */ + if (ds->ds_reserved != 0) { + uint64_t val = 0; + dsl_dataset_set_reservation_sync(ds, &val, cr, tx); + ASSERT3U(ds->ds_reserved, ==, 0); + } + ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); + dsl_pool_ds_destroyed(ds, tx); + obj = ds->ds_object; if (ds->ds_phys->ds_prev_snap_obj != 0) { if (ds->ds_prev) { ds_prev = ds->ds_prev; } else { - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, - 
DS_MODE_NONE, FTAG, &ds_prev)); + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); } after_branch_point = (ds_prev->ds_phys->ds_next_snap_obj != obj); dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); if (after_branch_point && + ds_prev->ds_phys->ds_next_clones_obj != 0) { + VERIFY(0 == zap_remove_int(mos, + ds_prev->ds_phys->ds_next_clones_obj, obj, tx)); + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(0 == zap_add_int(mos, + ds_prev->ds_phys->ds_next_clones_obj, + ds->ds_phys->ds_next_snap_obj, tx)); + } + } + if (after_branch_point && ds->ds_phys->ds_next_snap_obj == 0) { /* This clone is toast. */ ASSERT(ds_prev->ds_phys->ds_num_children > 1); @@ -1130,14 +1511,15 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) blkptr_t bp; dsl_dataset_t *ds_next; uint64_t itor = 0; + uint64_t old_unique; + int64_t used = 0, compressed = 0, uncompressed = 0; - spa_scrub_restart(dp->dp_spa, tx->tx_txg); - - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_next_snap_obj, NULL, - DS_MODE_NONE, FTAG, &ds_next)); + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); + old_unique = dsl_dataset_unique(ds_next); + dmu_buf_will_dirty(ds_next->ds_dbuf, tx); ds_next->ds_phys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; @@ -1154,8 +1536,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) * * XXX we're doing this long task with the config lock held */ - while (bplist_iterate(&ds_next->ds_deadlist, &itor, - &bp) == 0) { + while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) { if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) { VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, &bp, tx)); @@ -1170,16 +1551,23 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) compressed += BP_GET_PSIZE(&bp); uncompressed += BP_GET_UCSIZE(&bp); /* XXX check return value? 
*/ - (void) arc_free(zio, dp->dp_spa, tx->tx_txg, + (void) dsl_free(zio, dp, tx->tx_txg, &bp, NULL, NULL, ARC_NOWAIT); } } + ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); + + /* change snapused */ + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -used, -compressed, -uncompressed, tx); + /* free next's deadlist */ bplist_close(&ds_next->ds_deadlist); bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx); /* set next's deadlist to our deadlist */ + bplist_close(&ds->ds_deadlist); ds_next->ds_phys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos, @@ -1200,51 +1588,50 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) * config lock held */ dsl_dataset_t *ds_after_next; + uint64_t space; - VERIFY(0 == dsl_dataset_open_obj(dp, - ds_next->ds_phys->ds_next_snap_obj, NULL, - DS_MODE_NONE, FTAG, &ds_after_next)); - itor = 0; - while (bplist_iterate(&ds_after_next->ds_deadlist, - &itor, &bp) == 0) { - if (bp.blk_birth > - ds->ds_phys->ds_prev_snap_txg && - bp.blk_birth <= - ds->ds_phys->ds_creation_txg) { - ds_next->ds_phys->ds_unique_bytes += - bp_get_dasize(dp->dp_spa, &bp); - } - } + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds_next->ds_phys->ds_next_snap_obj, + FTAG, &ds_after_next)); + + VERIFY(0 == + bplist_space_birthrange(&ds_after_next->ds_deadlist, + ds->ds_phys->ds_prev_snap_txg, + ds->ds_phys->ds_creation_txg, &space)); + ds_next->ds_phys->ds_unique_bytes += space; - dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG); + dsl_dataset_rele(ds_after_next, FTAG); ASSERT3P(ds_next->ds_prev, ==, NULL); } else { - /* - * It would be nice to update the head dataset's - * unique. To do so we would have to traverse - * it for blocks born after ds_prev, which is - * pretty expensive just to maintain something - * for debugging purposes. - */ ASSERT3P(ds_next->ds_prev, ==, ds); - dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE, - ds_next); + dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); + ds_next->ds_prev = NULL; if (ds_prev) { - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_NONE, ds_next, &ds_next->ds_prev)); - } else { - ds_next->ds_prev = NULL; + VERIFY(0 == dsl_dataset_get_ref(dp, + ds->ds_phys->ds_prev_snap_obj, + ds_next, &ds_next->ds_prev)); } - } - dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG); - /* - * NB: unique_bytes is not accurate for head objsets - * because we don't update it when we delete the most - * recent snapshot -- see above comment. - */ - ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); + dsl_dataset_recalc_head_uniq(ds_next); + + /* + * Reduce the amount of our unconsumed refreservation + * being charged to our parent by the amount of + * new unique data we have gained. + */ + if (old_unique < ds_next->ds_reserved) { + int64_t mrsdelta; + uint64_t new_unique = + ds_next->ds_phys->ds_unique_bytes; + + ASSERT(old_unique <= new_unique); + mrsdelta = MIN(new_unique - old_unique, + ds_next->ds_reserved - old_unique); + dsl_dir_diduse_space(ds->ds_dir, + DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); + } + } + dsl_dataset_rele(ds_next, FTAG); } else { /* * There's no next snapshot, so this is a head dataset. @@ -1263,76 +1650,106 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) * Free everything that we point to (that's born after * the previous snapshot, if we are a clone) * - * XXX we're doing this long task with the config lock held + * NB: this should be very quick, because we already + * freed all the objects in open context. 
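+	 * (That is the dmu_free_object() loop in dsl_dataset_destroy(); by now kill_blkptr() only sees blocks born after ds_prev_snap_txg.)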
*/ - ka.usedp = &used; - ka.compressedp = &compressed; - ka.uncompressedp = &uncompressed; + ka.ds = ds; ka.zio = zio; ka.tx = tx; err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, ADVANCE_POST, kill_blkptr, &ka); ASSERT3U(err, ==, 0); + ASSERT(spa_version(dp->dp_spa) < SPA_VERSION_UNIQUE_ACCURATE || + ds->ds_phys->ds_unique_bytes == 0); } err = zio_wait(zio); ASSERT3U(err, ==, 0); - dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx); - - if (ds->ds_phys->ds_snapnames_zapobj) { - err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); - ASSERT(err == 0); - } - if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { - /* Erase the link in the dataset */ + /* Erase the link in the dir */ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; - /* - * dsl_dir_sync_destroy() called us, they'll destroy - * the dataset. - */ + ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); + err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); + ASSERT(err == 0); } else { /* remove from snapshot namespace */ dsl_dataset_t *ds_head; - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL, - DS_MODE_NONE, FTAG, &ds_head)); + ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); VERIFY(0 == dsl_dataset_get_snapname(ds)); #ifdef ZFS_DEBUG { uint64_t val; + + err = dsl_dataset_snap_lookup(ds_head, + ds->ds_snapname, &val); - err = zap_lookup(mos, - ds_head->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, 8, 1, &val); ASSERT3U(err, ==, 0); ASSERT3U(val, ==, obj); } #endif - err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, tx); + err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); ASSERT(err == 0); - dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG); + dsl_dataset_rele(ds_head, FTAG); } if (ds_prev && ds->ds_prev != ds_prev) - dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG); - - spa_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag); + dsl_dataset_rele(ds_prev, FTAG); + + spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx, + cr, "dataset = %llu", ds->ds_object); + + if (ds->ds_phys->ds_next_clones_obj != 0) { + uint64_t count; + ASSERT(0 == zap_count(mos, + ds->ds_phys->ds_next_clones_obj, &count) && count == 0); + VERIFY(0 == dmu_object_free(mos, + ds->ds_phys->ds_next_clones_obj, tx)); + } + if (ds->ds_phys->ds_props_obj != 0) + VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); + dsl_dir_close(ds->ds_dir, ds); + ds->ds_dir = NULL; + dsl_dataset_drain_refs(ds, tag); VERIFY(0 == dmu_object_free(mos, obj, tx)); +} +static int +dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t asize; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + /* + * If there's an fs-only reservation, any blocks that might become + * owned by the snapshot dataset must be accommodated by space + * outside of the reservation. + */ + asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE)) + return (ENOSPC); + + /* + * Propagate any reserved space for this snapshot to other + * snapshot checks in this sync group. 
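+	 * For example, if two datasets with refreservations are snapshotted in the same sync group, the first check records its asize via dsl_dir_willuse_space(), so the second check's dsl_dir_space_available() call already accounts for it.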
+ */ + if (asize > 0) + dsl_dir_willuse_space(ds->ds_dir, asize, tx); + + return (0); } /* ARGSUSED */ int dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) { - objset_t *os = arg1; - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = arg1; const char *snapname = arg2; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; int err; uint64_t value; @@ -1346,8 +1763,7 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) /* * Check for a conflicting snapshot name. */ - err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj, - snapname, 8, 1, &value); + err = dsl_dataset_snap_lookup(ds, snapname, &value); if (err == 0) return (EEXIST); if (err != ENOENT) @@ -1360,34 +1776,44 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) return (ENAMETOOLONG); + err = dsl_dataset_snapshot_reserve_space(ds, tx); + if (err) + return (err); + ds->ds_trysnap_txg = tx->tx_txg; return (0); } void -dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { - objset_t *os = arg1; - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = arg1; const char *snapname = arg2; dsl_pool_t *dp = ds->ds_dir->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; - uint64_t dsobj; + uint64_t dsobj, crtxg; objset_t *mos = dp->dp_meta_objset; int err; - spa_scrub_restart(dp->dp_spa, tx->tx_txg); ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); + /* + * The origin's ds_creation_txg has to be < TXG_INITIAL + */ + if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) + crtxg = 1; + else + crtxg = tx->tx_txg; + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; + bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = ds->ds_dir->dd_object; dsphys->ds_fsid_guid = unique_create(); - unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; @@ -1395,7 +1821,7 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) dsphys->ds_next_snap_obj = ds->ds_object; dsphys->ds_num_children = 1; dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = tx->tx_txg; + dsphys->ds_creation_txg = crtxg; dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; @@ -1406,6 +1832,8 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); if (ds->ds_prev) { + uint64_t next_clones_obj = + ds->ds_prev->ds_phys->ds_next_clones_obj; ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object || ds->ds_prev->ds_phys->ds_num_children > 1); @@ -1414,15 +1842,33 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, ds->ds_prev->ds_phys->ds_creation_txg); ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; + } else if (next_clones_obj != 0) { + VERIFY3U(0, ==, zap_remove_int(mos, + next_clones_obj, dsphys->ds_next_snap_obj, tx)); + VERIFY3U(0, ==, zap_add_int(mos, + next_clones_obj, dsobj, tx)); } } + /* + * If we have a reference-reservation on this dataset, we will + * 
need to increase the amount of refreservation being charged + * since our unique space is going to zero. + */ + if (ds->ds_reserved) { + int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, + add, 0, 0, tx); + } + bplist_close(&ds->ds_deadlist); dmu_buf_will_dirty(ds->ds_dbuf, tx); - ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg); + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); ds->ds_phys->ds_prev_snap_obj = dsobj; - ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg; + ds->ds_phys->ds_prev_snap_txg = crtxg; ds->ds_phys->ds_unique_bytes = 0; + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; ds->ds_phys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, @@ -1434,10 +1880,14 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT(err == 0); if (ds->ds_prev) - dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds); - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, snapname, - DS_MODE_NONE, ds, &ds->ds_prev)); + dsl_dataset_drop_ref(ds->ds_prev, ds); + VERIFY(0 == dsl_dataset_get_ref(dp, + ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); + + dsl_pool_ds_snapshotted(ds, tx); + + spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr, + "dataset = %llu", dsobj); } void @@ -1447,22 +1897,38 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) ASSERT(ds->ds_user_ptr != NULL); ASSERT(ds->ds_phys->ds_next_snap_obj == 0); + /* + * in case we had to change ds_fsid_guid when we opened it, + * sync it out now. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; + dsl_dir_dirty(ds->ds_dir, tx); dmu_objset_sync(ds->ds_user_ptr, zio, tx); - /* Unneeded? bplist_close(&ds->ds_deadlist); */ } void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { + uint64_t refd, avail, uobjs, aobjs; + dsl_dir_stats(ds->ds_dir, nv); + dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, ds->ds_phys->ds_creation_time); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, ds->ds_phys->ds_creation_txg); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, - ds->ds_phys->ds_used_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, + ds->ds_quota); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, + ds->ds_reserved); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, + ds->ds_phys->ds_guid); if (ds->ds_phys->ds_next_snap_obj) { /* @@ -1483,29 +1949,29 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) { stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; + stat->dds_guid = ds->ds_phys->ds_guid; if (ds->ds_phys->ds_next_snap_obj) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; } /* clone origin is really a dsl_dir thing... 
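 * (the origin is recorded in dd_phys->dd_origin_obj, which this change renames from dd_clone_parent_obj)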
*/ - if (ds->ds_dir->dd_phys->dd_clone_parent_obj) { + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + if (dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_t *ods; - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool, - ds->ds_dir->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_NONE, FTAG, &ods)); - dsl_dataset_name(ods, stat->dds_clone_of); - dsl_dataset_close(ods, DS_MODE_NONE, FTAG); - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, + ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); + dsl_dataset_name(ods, stat->dds_origin); + dsl_dataset_drop_ref(ods, FTAG); } + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds) { - return (ds->ds_phys->ds_fsid_guid); + return (ds->ds_fsid_guid); } void @@ -1515,10 +1981,37 @@ dsl_dataset_space(dsl_dataset_t *ds, { *refdbytesp = ds->ds_phys->ds_used_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); + if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) + *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; + if (ds->ds_quota != 0) { + /* + * Adjust available bytes according to refquota + */ + if (*refdbytesp < ds->ds_quota) + *availbytesp = MIN(*availbytesp, + ds->ds_quota - *refdbytesp); + else + *availbytesp = 0; + } *usedobjsp = ds->ds_phys->ds_bp.blk_fill; *availobjsp = DN_MAX_OBJECT - *usedobjsp; } +boolean_t +dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || + dsl_pool_sync_context(dp)); + if (ds->ds_prev == NULL) + return (B_FALSE); + if (ds->ds_phys->ds_bp.blk_birth > + ds->ds_prev->ds_phys->ds_creation_txg) + return (B_TRUE); + return (B_FALSE); +} + /* ARGSUSED */ static int dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) @@ -1526,20 +2019,18 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) dsl_dataset_t *ds = arg1; char *newsnapname = arg2; dsl_dir_t *dd = ds->ds_dir; - objset_t *mos = dd->dd_pool->dp_meta_objset; dsl_dataset_t *hds; uint64_t val; int err; - err = dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds); + err = dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); if (err) return (err); /* new name better not be in use */ - err = zap_lookup(mos, hds->ds_phys->ds_snapnames_zapobj, - newsnapname, 8, 1, &val); - dsl_dataset_close(hds, DS_MODE_NONE, FTAG); + err = dsl_dataset_snap_lookup(hds, newsnapname, &val); + dsl_dataset_rele(hds, FTAG); if (err == 0) err = EEXIST; @@ -1554,10 +2045,11 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, + cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - char *newsnapname = arg2; + const char *newsnapname = arg2; dsl_dir_t *dd = ds->ds_dir; objset_t *mos = dd->dd_pool->dp_meta_objset; dsl_dataset_t *hds; @@ -1565,12 +2057,11 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT(ds->ds_phys->ds_next_snap_obj != 0); - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds)); + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); VERIFY(0 == 
dsl_dataset_get_snapname(ds)); - err = zap_remove(mos, hds->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, tx); + err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); ASSERT3U(err, ==, 0); mutex_enter(&ds->ds_lock); (void) strcpy(ds->ds_snapname, newsnapname); @@ -1579,10 +2070,12 @@ ds->ds_snapname, 8, 1, &ds->ds_object, tx); ASSERT3U(err, ==, 0); - dsl_dataset_close(hds, DS_MODE_NONE, FTAG); + spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, + cr, "dataset = %llu", ds->ds_object); + dsl_dataset_rele(hds, FTAG); } -struct renamearg { +struct renamesnaparg { dsl_sync_task_group_t *dstg; char failed[MAXPATHLEN]; char *oldsnap; @@ -1592,7 +2085,7 @@ struct renamearg { static int dsl_snapshot_rename_one(char *name, void *arg) { - struct renamearg *ra = arg; + struct renamesnaparg *ra = arg; dsl_dataset_t *ds = NULL; char *cp; int err; @@ -1600,25 +2093,33 @@ dsl_snapshot_rename_one(char *name, void *arg) cp = name + strlen(name); *cp = '@'; (void) strcpy(cp + 1, ra->oldsnap); - err = dsl_dataset_open(name, DS_MODE_READONLY | DS_MODE_STANDARD, - ra->dstg, &ds); + + /* + * For recursive snapshot renames the parent won't be changing + * so we just pass name for both the to/from arguments. + */ + err = zfs_secpolicy_rename_perms(name, name, CRED()); if (err == ENOENT) { - *cp = '\0'; return (0); - } - if (err) { + } else if (err) { (void) strcpy(ra->failed, name); - *cp = '\0'; - dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg); return (err); } #ifdef _KERNEL - /* for all filesystems undergoing rename, we'll need to unmount it */ + /* + * For all filesystems undergoing rename, we'll need to unmount them. + */ (void) zfs_unmount_snap(name, NULL); #endif - + err = dsl_dataset_hold(name, ra->dstg, &ds); *cp = '\0'; + if (err == ENOENT) { + return (0); + } else if (err) { + (void) strcpy(ra->failed, name); + return (err); + } dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); @@ -1630,7 +2131,7 @@ static int dsl_recursive_rename(char *oldname, const char *newname) { int err; - struct renamearg *ra; + struct renamesnaparg *ra; dsl_sync_task_t *dst; spa_t *spa; char *cp, *fsname = spa_strdup(oldname); @@ -1640,19 +2141,12 @@ dsl_recursive_rename(char *oldname, const char *newname) cp = strchr(fsname, '@'); *cp = '\0'; - cp = strchr(fsname, '/'); - if (cp) { - *cp = '\0'; - err = spa_open(fsname, &spa, FTAG); - *cp = '/'; - } else { - err = spa_open(fsname, &spa, FTAG); - } + err = spa_open(fsname, &spa, FTAG); if (err) { kmem_free(fsname, len + 1); return (err); } - ra = kmem_alloc(sizeof (struct renamearg), KM_SLEEP); + ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ra->oldsnap = strchr(oldname, '@') + 1; @@ -1675,21 +2169,32 @@ dsl_recursive_rename(char *oldname, const char *newname) (void) strcat(ra->failed, "@"); (void) strcat(ra->failed, ra->newsnap); } - dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg); + dsl_dataset_rele(ds, ra->dstg); } - (void) strcpy(oldname, ra->failed); + if (err) + (void) strcpy(oldname, ra->failed); dsl_sync_task_group_destroy(ra->dstg); - kmem_free(ra, sizeof (struct renamearg)); + kmem_free(ra, sizeof (struct renamesnaparg)); spa_close(spa, FTAG); return (err); } +static int +dsl_valid_rename(char *oldname, void *arg) +{ + int delta = *(int *)arg; + + if (strlen(oldname) + delta >= MAXNAMELEN) + return (ENAMETOOLONG); + + return (0); +} + 
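A worked instance of the dsl_valid_rename() check just added: renaming tank/a to tank/alphabet grows every descendant name by delta = 7 bytes, so any descendant filesystem or snapshot whose name is already within 7 bytes of MAXNAMELEN makes the dmu_objset_find() walk in dsl_dataset_rename() below fail up front with ENAMETOOLONG, instead of leaving the tree half-renamed.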
#pragma weak dmu_objset_rename = dsl_dataset_rename int -dsl_dataset_rename(char *oldname, const char *newname, - boolean_t recursive) +dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) { dsl_dir_t *dd; dsl_dataset_t *ds; @@ -1700,7 +2205,15 @@ dsl_dataset_rename(char *oldname, const char *newname, if (err) return (err); if (tail == NULL) { - err = dsl_dir_rename(dd, newname); + int delta = strlen(newname) - strlen(oldname); + + /* if we're growing, validate child name lengths */ + if (delta > 0) + err = dmu_objset_find(oldname, dsl_valid_rename, + &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + + if (!err) + err = dsl_dir_rename(dd, newname); dsl_dir_close(dd, FTAG); return (err); } @@ -1723,8 +2236,7 @@ dsl_dataset_rename(char *oldname, const char *newname, if (recursive) { err = dsl_recursive_rename(oldname, newname); } else { - err = dsl_dataset_open(oldname, - DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds); + err = dsl_dataset_hold(oldname, FTAG, &ds); if (err) return (err); @@ -1732,278 +2244,640 @@ dsl_dataset_rename(char *oldname, const char *newname, dsl_dataset_snapshot_rename_check, dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); - dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG); + dsl_dataset_rele(ds, FTAG); } return (err); } +struct promotenode { + list_node_t link; + dsl_dataset_t *ds; +}; + struct promotearg { - uint64_t used, comp, uncomp, unique; - uint64_t newnext_obj, snapnames_obj; + list_t shared_snaps, origin_snaps, clone_snaps; + dsl_dataset_t *origin_origin, *origin_head; + uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; }; +static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); + +/* ARGSUSED */ static int dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; - dsl_dir_t *dd = hds->ds_dir; - dsl_pool_t *dp = hds->ds_dir->dd_pool; - dsl_dir_t *pdd = NULL; - dsl_dataset_t *ds = NULL; - dsl_dataset_t *pivot_ds = NULL; - dsl_dataset_t *newnext_ds = NULL; + struct promotenode *snap = list_head(&pa->shared_snaps); + dsl_dataset_t *origin_ds = snap->ds; int err; - char *name = NULL; - uint64_t itor = 0; - blkptr_t bp; - - bzero(pa, sizeof (*pa)); - /* Check that it is a clone */ - if (dd->dd_phys->dd_clone_parent_obj == 0) + /* Check that it is a real clone */ + if (!dsl_dir_is_clone(hds->ds_dir)) return (EINVAL); /* Since this is so expensive, don't do the preliminary check */ if (!dmu_tx_is_syncing(tx)) return (0); - if (err = dsl_dataset_open_obj(dp, - dd->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds)) - goto out; - pdd = pivot_ds->ds_dir; - - { - dsl_dataset_t *phds; - if (err = dsl_dataset_open_obj(dd->dd_pool, - pdd->dd_phys->dd_head_dataset_obj, - NULL, DS_MODE_NONE, FTAG, &phds)) - goto out; - pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj; - dsl_dataset_close(phds, DS_MODE_NONE, FTAG); - } - - if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) { - err = EXDEV; - goto out; - } - - /* find pivot point's new next ds */ - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object, - NULL, DS_MODE_NONE, FTAG, &newnext_ds)); - while (newnext_ds->ds_phys->ds_prev_snap_obj != pivot_ds->ds_object) { - dsl_dataset_t *prev; - - if (err = dsl_dataset_open_obj(dd->dd_pool, - newnext_ds->ds_phys->ds_prev_snap_obj, - NULL, DS_MODE_NONE, FTAG, &prev)) - goto out; - dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG); - newnext_ds = prev; - } - pa->newnext_obj = newnext_ds->ds_object; + if 
(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) + return (EXDEV); - /* compute pivot point's new unique space */ - while ((err = bplist_iterate(&newnext_ds->ds_deadlist, - &itor, &bp)) == 0) { - if (bp.blk_birth > pivot_ds->ds_phys->ds_prev_snap_txg) - pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp); - } - if (err != ENOENT) - goto out; + /* compute origin's new unique space */ + snap = list_tail(&pa->clone_snaps); + ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); + err = bplist_space_birthrange(&snap->ds->ds_deadlist, + origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique); + if (err) + return (err); - /* Walk the snapshots that we are moving */ - name = kmem_alloc(MAXPATHLEN, KM_SLEEP); - ds = pivot_ds; - /* CONSTCOND */ - while (TRUE) { + /* + * Walk the snapshots that we are moving + * + * Compute space to transfer. Consider the incremental changes + * to used for each snapshot: + * (my used) = (prev's used) + (blocks born) - (blocks killed) + * So each snapshot gave birth to: + * (blocks born) = (my used) - (prev's used) + (blocks killed) + * So a sequence would look like: + * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) + * Which simplifies to: + * uN + kN + kN-1 + ... + k1 + k0 + * Note however, if we stop before we reach the ORIGIN we get: + * uN + kN + kN-1 + ... + kM - uM-1 + */ + pa->used = origin_ds->ds_phys->ds_used_bytes; + pa->comp = origin_ds->ds_phys->ds_compressed_bytes; + pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; + for (snap = list_head(&pa->shared_snaps); snap; + snap = list_next(&pa->shared_snaps, snap)) { uint64_t val, dlused, dlcomp, dluncomp; - dsl_dataset_t *prev; + dsl_dataset_t *ds = snap->ds; /* Check that the snapshot name does not conflict */ - dsl_dataset_name(ds, name); - err = zap_lookup(dd->dd_pool->dp_meta_objset, - hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, - 8, 1, &val); - if (err != ENOENT) { - if (err == 0) - err = EEXIST; - goto out; - } - - /* - * compute space to transfer. Each snapshot gave birth to: - * (my used) - (prev's used) + (deadlist's used) - */ - pa->used += ds->ds_phys->ds_used_bytes; - pa->comp += ds->ds_phys->ds_compressed_bytes; - pa->uncomp += ds->ds_phys->ds_uncompressed_bytes; + VERIFY(0 == dsl_dataset_get_snapname(ds)); + err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); - /* If we reach the first snapshot, we're done. */ + /* The very first snapshot does not have a deadlist */ if (ds->ds_phys->ds_prev_snap_obj == 0) - break; + continue; if (err = bplist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp)) - goto out; - if (err = dsl_dataset_open_obj(dd->dd_pool, - ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE, - FTAG, &prev)) - goto out; - pa->used += dlused - prev->ds_phys->ds_used_bytes; - pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes; - pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes; + return (err); + pa->used += dlused; + pa->comp += dlcomp; + pa->uncomp += dluncomp; + } - /* - * We could be a clone of a clone. If we reach our - * parent's branch point, we're done. - */ - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { - dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG); - break; - } - if (ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - ds = prev; + /* + * If we are a clone of a clone then we never reached ORIGIN, + * so we need to subtract out the clone origin's used space. 
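+	 * Worked example of the truncated sum: with uN = 4G, deadlists kN + ... + kM = 3G, and a clone origin that referenced uM-1 = 1G, the loop above accumulates 4G + 3G = 7G and the subtraction below leaves 6G to transfer.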
+ */ + if (pa->origin_origin) { + pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; + pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; + pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; } /* Check that there is enough space here */ - err = dsl_dir_transfer_possible(pdd, dd, pa->used); + err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, + pa->used); + if (err) + return (err); -out: - if (ds && ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - if (pivot_ds) - dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG); - if (newnext_ds) - dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG); - if (name) - kmem_free(name, MAXPATHLEN); - return (err); + /* + * Compute the amounts of space that will be used by snapshots + * after the promotion (for both origin and clone). For each, + * it is the amount of space that will be on all of their + * deadlists (that was not born before their new origin). + */ + if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + uint64_t space; + + /* + * Note, typically this will not be a clone of a clone, + * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so + * these snaplist_space() -> bplist_space_birthrange() + * calls will be fast because they do not have to + * iterate over all bps. + */ + snap = list_head(&pa->origin_snaps); + err = snaplist_space(&pa->shared_snaps, + snap->ds->ds_origin_txg, &pa->cloneusedsnap); + if (err) + return (err); + + err = snaplist_space(&pa->clone_snaps, + snap->ds->ds_origin_txg, &space); + if (err) + return (err); + pa->cloneusedsnap += space; + } + if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + err = snaplist_space(&pa->origin_snaps, + origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); + if (err) + return (err); + } + + return (0); } static void -dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; + struct promotenode *snap = list_head(&pa->shared_snaps); + dsl_dataset_t *origin_ds = snap->ds; + dsl_dataset_t *origin_head; dsl_dir_t *dd = hds->ds_dir; dsl_pool_t *dp = hds->ds_dir->dd_pool; - dsl_dir_t *pdd = NULL; - dsl_dataset_t *ds, *pivot_ds; - char *name; + dsl_dir_t *odd = NULL; + uint64_t oldnext_obj; + int64_t delta; - ASSERT(dd->dd_phys->dd_clone_parent_obj != 0); ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); - VERIFY(0 == dsl_dataset_open_obj(dp, - dd->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds)); + snap = list_head(&pa->origin_snaps); + origin_head = snap->ds; + /* - * We need to explicitly open pdd, since pivot_ds's pdd will be + * We need to explicitly open odd, since origin_ds's dd will be * changing. 
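 * (each moved snapshot's ds_dir is closed and reopened against the clone's dir in the loop below)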
*/ - VERIFY(0 == dsl_dir_open_obj(dp, pivot_ds->ds_dir->dd_object, - NULL, FTAG, &pdd)); + VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, + NULL, FTAG, &odd)); + + /* change origin's next snap */ + dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); + oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; + snap = list_tail(&pa->clone_snaps); + ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); + origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; + + /* change the origin's next clone */ + if (origin_ds->ds_phys->ds_next_clones_obj) { + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + origin_ds->ds_phys->ds_next_clones_obj, + origin_ds->ds_phys->ds_next_snap_obj, tx)); + VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, + origin_ds->ds_phys->ds_next_clones_obj, + oldnext_obj, tx)); + } - /* move snapshots to this dir */ - name = kmem_alloc(MAXPATHLEN, KM_SLEEP); - ds = pivot_ds; - /* CONSTCOND */ - while (TRUE) { - dsl_dataset_t *prev; + /* change origin */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); + dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; + hds->ds_origin_txg = origin_head->ds_origin_txg; + dmu_buf_will_dirty(odd->dd_dbuf, tx); + odd->dd_phys->dd_origin_obj = origin_ds->ds_object; + origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg; + /* move snapshots to this dir */ + for (snap = list_head(&pa->shared_snaps); snap; + snap = list_next(&pa->shared_snaps, snap)) { + dsl_dataset_t *ds = snap->ds; + + /* unregister props as dsl_dir is changing */ + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } /* move snap name entry */ - dsl_dataset_name(ds, name); - VERIFY(0 == zap_remove(dp->dp_meta_objset, - pa->snapnames_obj, ds->ds_snapname, tx)); + VERIFY(0 == dsl_dataset_get_snapname(ds)); + VERIFY(0 == dsl_dataset_snap_remove(origin_head, + ds->ds_snapname, tx)); VERIFY(0 == zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); - /* change containing dsl_dir */ dmu_buf_will_dirty(ds->ds_dbuf, tx); - ASSERT3U(ds->ds_phys->ds_dir_obj, ==, pdd->dd_object); + ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); ds->ds_phys->ds_dir_obj = dd->dd_object; - ASSERT3P(ds->ds_dir, ==, pdd); + ASSERT3P(ds->ds_dir, ==, odd); dsl_dir_close(ds->ds_dir, ds); VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, NULL, ds, &ds->ds_dir)); ASSERT3U(dsl_prop_numcb(ds), ==, 0); + } - if (ds->ds_phys->ds_prev_snap_obj == 0) - break; + /* + * Change space accounting. + * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either + * both be valid, or both be 0 (resulting in delta == 0). This + * is true for each of {clone,origin} independently. 
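+	 * E.g. with pa->used = 7G, pa->cloneusedsnap = 3G, and 1G already charged to the clone's DD_USED_SNAP: delta = 2G goes to DD_USED_SNAP and the remaining 5G of pa->used to DD_USED_HEAD.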
+ */ + + delta = pa->cloneusedsnap - + dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, >=, 0); + ASSERT3U(pa->used, >=, delta); + dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(dd, DD_USED_HEAD, + pa->used - delta, pa->comp, pa->uncomp, tx); + + delta = pa->originusedsnap - + odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, <=, 0); + ASSERT3U(pa->used, >=, -delta); + dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(odd, DD_USED_HEAD, + -pa->used - delta, -pa->comp, -pa->uncomp, tx); + + origin_ds->ds_phys->ds_unique_bytes = pa->unique; + + /* log history record */ + spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, + cr, "dataset = %llu", hds->ds_object); + + dsl_dir_close(odd, FTAG); +} + +static char *snaplist_tag = "snaplist"; +/* + * Make a list of dsl_dataset_t's for the snapshots between first_obj + * (exclusive) and last_obj (inclusive). The list will be in reverse + * order (last_obj will be the list_head()). If first_obj == 0, do all + * snapshots back to this dataset's origin. + */ +static int +snaplist_make(dsl_pool_t *dp, boolean_t own, + uint64_t first_obj, uint64_t last_obj, list_t *l) +{ + uint64_t obj = last_obj; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); + + list_create(l, sizeof (struct promotenode), + offsetof(struct promotenode, link)); - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE, - FTAG, &prev)); + while (obj != first_obj) { + dsl_dataset_t *ds; + struct promotenode *snap; + int err; - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { - dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG); - break; + if (own) { + err = dsl_dataset_own_obj(dp, obj, + 0, snaplist_tag, &ds); + if (err == 0) + dsl_dataset_make_exclusive(ds, snaplist_tag); + } else { + err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); + } + if (err == ENOENT) { + /* lost race with snapshot destroy */ + struct promotenode *last = list_tail(l); + ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); + obj = last->ds->ds_phys->ds_prev_snap_obj; + continue; + } else if (err) { + return (err); } - if (ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - ds = prev; + + if (first_obj == 0) + first_obj = ds->ds_dir->dd_phys->dd_origin_obj; + + snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); + snap->ds = ds; + list_insert_tail(l, snap); + obj = ds->ds_phys->ds_prev_snap_obj; } - if (ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - /* change pivot point's next snap */ - dmu_buf_will_dirty(pivot_ds->ds_dbuf, tx); - pivot_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj; + return (0); +} - /* change clone_parent-age */ - dmu_buf_will_dirty(dd->dd_dbuf, tx); - ASSERT3U(dd->dd_phys->dd_clone_parent_obj, ==, pivot_ds->ds_object); - dd->dd_phys->dd_clone_parent_obj = pdd->dd_phys->dd_clone_parent_obj; - dmu_buf_will_dirty(pdd->dd_dbuf, tx); - pdd->dd_phys->dd_clone_parent_obj = pivot_ds->ds_object; +static int +snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) +{ + struct promotenode *snap; - /* change space accounting */ - dsl_dir_diduse_space(pdd, -pa->used, -pa->comp, -pa->uncomp, tx); - dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx); - pivot_ds->ds_phys->ds_unique_bytes = pa->unique; + *spacep = 0; + for (snap = list_head(l); snap; snap = list_next(l, snap)) { + uint64_t used; + int err = bplist_space_birthrange(&snap->ds->ds_deadlist, + mintxg, UINT64_MAX, &used); + if (err) + 
return (err); + *spacep += used; + } + return (0); +} - dsl_dir_close(pdd, FTAG); - dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG); - kmem_free(name, MAXPATHLEN); +static void +snaplist_destroy(list_t *l, boolean_t own) +{ + struct promotenode *snap; + + if (!list_link_active(&l->list_head)) + return; + + while ((snap = list_tail(l)) != NULL) { + list_remove(l, snap); + if (own) + dsl_dataset_disown(snap->ds, snaplist_tag); + else + dsl_dataset_rele(snap->ds, snaplist_tag); + kmem_free(snap, sizeof (struct promotenode)); + } + list_destroy(l); } +/* + * Promote a clone. Nomenclature note: + * "clone" or "cds": the original clone which is being promoted + * "origin" or "ods": the snapshot which is originally clone's origin + * "origin head" or "ohds": the dataset which is the head + * (filesystem/volume) for the origin + * "origin origin": the origin of the origin's filesystem (typically + * NULL, indicating that the clone is not a clone of a clone). + */ int dsl_dataset_promote(const char *name) { dsl_dataset_t *ds; - int err; + dsl_dir_t *dd; + dsl_pool_t *dp; dmu_object_info_t doi; - struct promotearg pa; + struct promotearg pa = { 0 }; + struct promotenode *snap; + int err; - err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds); + err = dsl_dataset_hold(name, FTAG, &ds); if (err) return (err); + dd = ds->ds_dir; + dp = dd->dd_pool; - err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset, + err = dmu_object_info(dp->dp_meta_objset, ds->ds_phys->ds_snapnames_zapobj, &doi); if (err) { - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + dsl_dataset_rele(ds, FTAG); return (err); } + if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } + + /* + * We are going to inherit all the snapshots taken before our + * origin (i.e., our new origin will be our parent's origin). + * Take ownership of them so that we can rename them into our + * namespace. + */ + rw_enter(&dp->dp_config_rwlock, RW_READER); + + err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, + &pa.shared_snaps); + if (err != 0) + goto out; + + err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); + if (err != 0) + goto out; + + snap = list_head(&pa.shared_snaps); + ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); + err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, + snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); + if (err != 0) + goto out; + + if (dsl_dir_is_clone(snap->ds->ds_dir)) { + err = dsl_dataset_own_obj(dp, + snap->ds->ds_dir->dd_phys->dd_origin_obj, + 0, FTAG, &pa.origin_origin); + if (err != 0) + goto out; + } + +out: + rw_exit(&dp->dp_config_rwlock); + /* * Add in 128x the snapnames zapobj size, since we will be moving * a bunch of snapnames to the promoted ds, and dirtying their * bonus buffers. 
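snaplist_make()/snaplist_destroy() above follow a strict hold/release protocol: every node appended to the list owns exactly one hold, so teardown can always release precisely what was taken, even for a partially built list. The skeleton of that pattern, with malloc/free standing in for dataset holds (error handling and the own-versus-hold distinction elided):

	#include <stdlib.h>

	typedef struct node {
		struct node *prev;	/* analogue of ds_prev_snap_obj */
		int id;
	} node_t;

	typedef struct item {
		struct item *next;
		node_t *held;		/* the "hold" this entry owns */
	} item_t;

	/* Walk back from 'last' until 'first' (exclusive), taking one
	 * hold per list entry; sketch assumes allocation succeeds. */
	static item_t *
	list_make(node_t *last, node_t *first)
	{
		item_t *head = NULL, *tail = NULL;
		for (node_t *n = last; n != first; n = n->prev) {
			item_t *it = malloc(sizeof (*it));
			it->held = n;	/* take the hold */
			it->next = NULL;
			if (tail == NULL)
				head = tail = it;
			else
				tail = tail->next = it;
		}
		return (head);	/* head is 'last', as in snaplist_make() */
	}

	static void
	list_destroy(item_t *head)
	{
		while (head != NULL) {
			item_t *next = head->next;
			/* releasing the hold would go here */
			free(head);
			head = next;
		}
	}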
*/ - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_promote_check, - dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + if (err == 0) { + err = dsl_sync_task_do(dp, dsl_dataset_promote_check, + dsl_dataset_promote_sync, ds, &pa, + 2 + 2 * doi.doi_physical_blks); + } + + snaplist_destroy(&pa.shared_snaps, B_TRUE); + snaplist_destroy(&pa.clone_snaps, B_FALSE); + snaplist_destroy(&pa.origin_snaps, B_FALSE); + if (pa.origin_origin) + dsl_dataset_disown(pa.origin_origin, FTAG); + dsl_dataset_rele(ds, FTAG); return (err); } +struct cloneswaparg { + dsl_dataset_t *cds; /* clone dataset */ + dsl_dataset_t *ohds; /* origin's head dataset */ + boolean_t force; + int64_t unused_refres_delta; /* change in unconsumed refreservation */ +}; + +/* ARGSUSED */ +static int +dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + struct cloneswaparg *csa = arg1; + + /* they should both be heads */ + if (dsl_dataset_is_snapshot(csa->cds) || + dsl_dataset_is_snapshot(csa->ohds)) + return (EINVAL); + + /* the branch point should be just before them */ + if (csa->cds->ds_prev != csa->ohds->ds_prev) + return (EINVAL); + + /* cds should be the clone */ + if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj != + csa->ohds->ds_object) + return (EINVAL); + + /* the clone should be a child of the origin */ + if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) + return (EINVAL); + + /* ohds shouldn't be modified unless 'force' */ + if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) + return (ETXTBSY); + + /* adjust amount of any unconsumed refreservation */ + csa->unused_refres_delta = + (int64_t)MIN(csa->ohds->ds_reserved, + csa->ohds->ds_phys->ds_unique_bytes) - + (int64_t)MIN(csa->ohds->ds_reserved, + csa->cds->ds_phys->ds_unique_bytes); + + if (csa->unused_refres_delta > 0 && + csa->unused_refres_delta > + dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + struct cloneswaparg *csa = arg1; + dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; + + ASSERT(csa->cds->ds_reserved == 0); + ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota); + + dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); + dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); + dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx); + + if (csa->cds->ds_user_ptr != NULL) { + csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr); + csa->cds->ds_user_ptr = NULL; + } + + if (csa->ohds->ds_user_ptr != NULL) { + csa->ohds->ds_user_evict_func(csa->ohds, + csa->ohds->ds_user_ptr); + csa->ohds->ds_user_ptr = NULL; + } + + /* reset origin's unique bytes */ + VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, + csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &csa->cds->ds_prev->ds_phys->ds_unique_bytes)); + + /* swap blkptrs */ + { + blkptr_t tmp; + tmp = csa->ohds->ds_phys->ds_bp; + csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; + csa->cds->ds_phys->ds_bp = tmp; + } + + /* set dd_*_bytes */ + { + int64_t dused, dcomp, duncomp; + uint64_t cdl_used, cdl_comp, cdl_uncomp; + uint64_t odl_used, odl_comp, odl_uncomp; + + ASSERT3U(csa->cds->ds_dir->dd_phys-> + dd_used_breakdown[DD_USED_SNAP], ==, 0); + + VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used, + &cdl_comp, &cdl_uncomp)); + VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used, + &odl_comp, &odl_uncomp)); + + dused = 
csa->cds->ds_phys->ds_used_bytes + cdl_used - + (csa->ohds->ds_phys->ds_used_bytes + odl_used); + dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - + (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); + duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + + cdl_uncomp - + (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); + + dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, + dused, dcomp, duncomp, tx); + dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, + -dused, -dcomp, -duncomp, tx); + + /* + * The difference in the space used by snapshots is the + * difference in snapshot space due to the head's + * deadlist (since that's the only thing that's + * changing that affects the snapused). + */ + VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, + csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used)); + VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist, + csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used)); + dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, + DD_USED_HEAD, DD_USED_SNAP, tx); + } + +#define SWITCH64(x, y) \ + { \ + uint64_t __tmp = (x); \ + (x) = (y); \ + (y) = __tmp; \ + } + + /* swap ds_*_bytes */ + SWITCH64(csa->ohds->ds_phys->ds_used_bytes, + csa->cds->ds_phys->ds_used_bytes); + SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, + csa->cds->ds_phys->ds_compressed_bytes); + SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, + csa->cds->ds_phys->ds_uncompressed_bytes); + SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, + csa->cds->ds_phys->ds_unique_bytes); + + /* apply any parent delta for change in unconsumed refreservation */ + dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, + csa->unused_refres_delta, 0, 0, tx); + + /* swap deadlists */ + bplist_close(&csa->cds->ds_deadlist); + bplist_close(&csa->ohds->ds_deadlist); + SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, + csa->cds->ds_phys->ds_deadlist_obj); + VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, + csa->cds->ds_phys->ds_deadlist_obj)); + VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, + csa->ohds->ds_phys->ds_deadlist_obj)); +} + +/* + * Swap 'clone' with its origin head file system. Used at the end + * of "online recv" to swizzle the file system to the new version. + */ +int +dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, + boolean_t force) +{ + struct cloneswaparg csa; + int error; + + ASSERT(clone->ds_owner); + ASSERT(origin_head->ds_owner); +retry: + /* Need exclusive access for the swap */ + rw_enter(&clone->ds_rwlock, RW_WRITER); + if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { + rw_exit(&clone->ds_rwlock); + rw_enter(&origin_head->ds_rwlock, RW_WRITER); + if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { + rw_exit(&origin_head->ds_rwlock); + goto retry; + } + } + csa.cds = clone; + csa.ohds = origin_head; + csa.force = force; + error = dsl_sync_task_do(clone->ds_dir->dd_pool, + dsl_dataset_clone_swap_check, + dsl_dataset_clone_swap_sync, &csa, NULL, 9); + return (error); +} + /* * Given a pool name and a dataset object number in that pool, * return the name of that dataset. 
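dsl_dataset_clone_swap() above must take both ds_rwlock's as writer with no natural lock ordering, so it uses a block/try/back-off loop to avoid deadlocking against a thread acquiring the same pair in the opposite order. The same pattern, sketched with POSIX rwlocks:

	#include <pthread.h>

	/*
	 * Acquire both rwlocks as writer without a global lock order:
	 * block on one, try the other, and on failure drop everything
	 * and retry with the roles swapped (mirrors the retry: loop in
	 * dsl_dataset_clone_swap()).
	 */
	static void
	lock_both(pthread_rwlock_t *a, pthread_rwlock_t *b)
	{
		for (;;) {
			pthread_rwlock_wrlock(a);
			if (pthread_rwlock_trywrlock(b) == 0)
				return;
			pthread_rwlock_unlock(a);
			pthread_rwlock_wrlock(b);
			if (pthread_rwlock_trywrlock(a) == 0)
				return;
			pthread_rwlock_unlock(b);
		}
	}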
@@ -2013,23 +2887,220 @@ dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) { spa_t *spa; dsl_pool_t *dp; - dsl_dataset_t *ds = NULL; + dsl_dataset_t *ds; int error; if ((error = spa_open(pname, &spa, FTAG)) != 0) return (error); dp = spa_get_dsl(spa); rw_enter(&dp->dp_config_rwlock, RW_READER); - if ((error = dsl_dataset_open_obj(dp, obj, - NULL, DS_MODE_NONE, FTAG, &ds)) != 0) { - rw_exit(&dp->dp_config_rwlock); - spa_close(spa, FTAG); - return (error); + if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { + dsl_dataset_name(ds, buf); + dsl_dataset_rele(ds, FTAG); } - dsl_dataset_name(ds, buf); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); rw_exit(&dp->dp_config_rwlock); spa_close(spa, FTAG); + return (error); +} + +int +dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) +{ + int error = 0; + + ASSERT3S(asize, >, 0); + + /* + * *ref_rsrv is the portion of asize that will come from any + * unconsumed refreservation space. + */ + *ref_rsrv = 0; + + mutex_enter(&ds->ds_lock); + /* + * Make a space adjustment for reserved bytes. + */ + if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { + ASSERT3U(*used, >=, + ds->ds_reserved - ds->ds_phys->ds_unique_bytes); + *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); + *ref_rsrv = + asize - MIN(asize, parent_delta(ds, asize + inflight)); + } + + if (!check_quota || ds->ds_quota == 0) { + mutex_exit(&ds->ds_lock); + return (0); + } + /* + * If they are requesting more space, and our current estimate + * is over quota, they get to try again unless the actual + * on-disk is over quota and there are no pending changes (which + * may free up space for us). + */ + if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { + if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) + error = ERESTART; + else + error = EDQUOT; + } + mutex_exit(&ds->ds_lock); + + return (error); +} + +/* ARGSUSED */ +static int +dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *quotap = arg2; + uint64_t new_quota = *quotap; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) + return (ENOTSUP); + + if (new_quota == 0) + return (0); + + if (new_quota < ds->ds_phys->ds_used_bytes || + new_quota < ds->ds_reserved) + return (ENOSPC); + return (0); } + +/* ARGSUSED */ +void +dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *quotap = arg2; + uint64_t new_quota = *quotap; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + ds->ds_quota = new_quota; + + dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx); + + spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa, + tx, cr, "%lld dataset = %llu ", + (longlong_t)new_quota, ds->ds_object); +} + +int +dsl_dataset_set_quota(const char *dsname, uint64_t quota) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + if (quota != ds->ds_quota) { + /* + * If someone removes a file, then tries to set the quota, we + * want to make sure the file freeing takes effect. 
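The heart of dsl_dataset_check_quota() above is the EDQUOT-versus-ERESTART decision: fail hard only when the on-disk figure alone exceeds the quota and nothing in flight could free space; otherwise ask the caller to retry after the pending txg syncs. That decision in isolation (ERESTART is a kernel-private errno, so a placeholder is defined for this sketch):

	#include <errno.h>
	#include <stdint.h>

	#ifndef ERESTART
	#define	ERESTART	(-1)	/* placeholder for the kernel errno */
	#endif

	/* Returns 0, ERESTART (retry after sync), or EDQUOT (hard stop). */
	static int
	check_quota(uint64_t used_on_disk, uint64_t inflight, uint64_t quota)
	{
		if (quota == 0 || used_on_disk + inflight < quota)
			return (0);
		/*
		 * Estimate is over quota; only fail hard if the on-disk
		 * number alone is over and no pending changes can help.
		 */
		if (inflight > 0 || used_on_disk < quota)
			return (ERESTART);
		return (EDQUOT);
	}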
+ */ + txg_wait_open(ds->ds_dir->dd_pool, 0); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, + ds, &quota, 0); + } + dsl_dataset_rele(ds, FTAG); + return (err); +} + +static int +dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *reservationp = arg2; + uint64_t new_reservation = *reservationp; + int64_t delta; + uint64_t unique; + + if (new_reservation > INT64_MAX) + return (EOVERFLOW); + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < + SPA_VERSION_REFRESERVATION) + return (ENOTSUP); + + if (dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + /* + * If we are doing the preliminary check in open context, the + * space estimates may be inaccurate. + */ + if (!dmu_tx_is_syncing(tx)) + return (0); + + mutex_enter(&ds->ds_lock); + unique = dsl_dataset_unique(ds); + delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved); + mutex_exit(&ds->ds_lock); + + if (delta > 0 && + delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); + if (delta > 0 && ds->ds_quota > 0 && + new_reservation > ds->ds_quota) + return (ENOSPC); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, + dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *reservationp = arg2; + uint64_t new_reservation = *reservationp; + uint64_t unique; + int64_t delta; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + mutex_enter(&ds->ds_dir->dd_lock); + mutex_enter(&ds->ds_lock); + unique = dsl_dataset_unique(ds); + delta = MAX(0, (int64_t)(new_reservation - unique)) - + MAX(0, (int64_t)(ds->ds_reserved - unique)); + ds->ds_reserved = new_reservation; + mutex_exit(&ds->ds_lock); + + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); + mutex_exit(&ds->ds_dir->dd_lock); + dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation", + new_reservation, cr, tx); + + spa_history_internal_log(LOG_DS_REFRESERV, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", + (longlong_t)new_reservation, ds->ds_object); +} + +int +dsl_dataset_set_reservation(const char *dsname, uint64_t reservation) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_set_reservation_check, + dsl_dataset_set_reservation_sync, ds, &reservation, 0); + dsl_dataset_rele(ds, FTAG); + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c new file mode 100644 index 000000000000..2ce16fe20e12 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c @@ -0,0 +1,735 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * DSL permissions are stored in a two level zap attribute + * mechanism. The first level identifies the "class" of + * entry. The class is identified by the first 2 letters of + * the attribute. The second letter "l" or "d" identifies whether + * it is a local or descendent permission. The first letter + * identifies the type of entry. + * + * ul$<id> identifies permissions granted locally for this userid. + * ud$<id> identifies permissions granted on descendent datasets for + * this userid. + * Ul$<id> identifies permission sets granted locally for this userid. + * Ud$<id> identifies permission sets granted on descendent datasets for + * this userid. + * gl$<id> identifies permissions granted locally for this groupid. + * gd$<id> identifies permissions granted on descendent datasets for + * this groupid. + * Gl$<id> identifies permission sets granted locally for this groupid. + * Gd$<id> identifies permission sets granted on descendent datasets for + * this groupid. + * el$ identifies permissions granted locally for everyone. + * ed$ identifies permissions granted on descendent datasets + * for everyone. + * El$ identifies permission sets granted locally for everyone. + * Ed$ identifies permission sets granted to descendent datasets for + * everyone. + * c-$ identifies permission to create at dataset creation time. + * C-$ identifies permission sets to grant locally at dataset creation + * time. + * s-$@<name> permissions defined in specified set @<name> + * S-$@<name> Sets defined in named set @<name> + * + * Each of the above entities points to another zap attribute that contains one + * attribute for each allowed permission, such as create, destroy,... + * All of the "upper" case class types will specify permission set names + * rather than permissions. + * + * Basically it looks something like this: + * ul$12 -> ZAP OBJ -> permissions... + * + * The ZAP OBJ is referred to as the jump object. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zio_checksum.h> /* for the default checksum value */ +#include <sys/zap.h> +#include <sys/fs/zfs.h> +#include <sys/cred.h> +#include <sys/sunddi.h> + +#include "zfs_deleg.h" + +/* + * Validate that user is allowed to delegate specified permissions. + * + * In order to delegate "create" you must have "create" + * and "allow". 
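Given the naming scheme above, a whokey is just the class letter, the scope letter, '$', and an id. A hypothetical encoder for the user/group cases (the real zfs_deleg_whokey() also covers the everyone, create, and named-set entries):

	#include <stdio.h>
	#include <stdint.h>

	/*
	 * Build a delegation whokey such as "ul$12" (user 12, local) or
	 * "gd$100" (group 100, descendent). Illustrative only: this
	 * mirrors the naming scheme documented above, not the exact
	 * zfs_deleg_whokey() implementation.
	 */
	static void
	make_whokey(char *buf, size_t len, char type, char scope, uint64_t id)
	{
		(void) snprintf(buf, len, "%c%c$%llu", type, scope,
		    (unsigned long long)id);
	}

	/* make_whokey(buf, sizeof (buf), 'u', 'l', 12) yields "ul$12" */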
+ */ +int +dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr) +{ + nvpair_t *whopair = NULL; + int error; + + if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) + return (error); + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + nvlist_t *perms; + nvpair_t *permpair = NULL; + + VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); + + while (permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + + if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0) + return (EPERM); + + if ((error = dsl_deleg_access(ddname, perm, cr)) != 0) + return (error); + } + } + return (0); +} + +/* + * Validate that user is allowed to unallow specified permissions. They + * must have the 'allow' permission, and even then can only unallow + * perms for their uid. + */ +int +dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) +{ + nvpair_t *whopair = NULL; + int error; + char idstr[32]; + + if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) + return (error); + + (void) snprintf(idstr, sizeof (idstr), "%lld", + (longlong_t)crgetuid(cr)); + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + zfs_deleg_who_type_t type = nvpair_name(whopair)[0]; + + if (type != ZFS_DELEG_USER && + type != ZFS_DELEG_USER_SETS) + return (EPERM); + + if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0) + return (EPERM); + } + return (0); +} + +static void +dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + nvlist_t *nvp = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + nvpair_t *whopair = NULL; + uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + + if (zapobj == 0) { + dmu_buf_will_dirty(dd->dd_dbuf, tx); + zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, + DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + } + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + const char *whokey = nvpair_name(whopair); + nvlist_t *perms; + nvpair_t *permpair = NULL; + uint64_t jumpobj; + + VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); + + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { + jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, + DMU_OT_NONE, 0, tx); + VERIFY(zap_update(mos, zapobj, + whokey, 8, 1, &jumpobj, tx) == 0); + } + + while (permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + uint64_t n = 0; + + VERIFY(zap_update(mos, jumpobj, + perm, 8, 1, &n, tx) == 0); + spa_history_internal_log(LOG_DS_PERM_UPDATE, + dd->dd_pool->dp_spa, tx, cr, + "%s %s dataset = %llu", whokey, perm, + dd->dd_phys->dd_head_dataset_obj); + } + } +} + +static void +dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + nvlist_t *nvp = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + nvpair_t *whopair = NULL; + uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + + if (zapobj == 0) + return; + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + const char *whokey = nvpair_name(whopair); + nvlist_t *perms; + nvpair_t *permpair = NULL; + uint64_t jumpobj; + + if (nvpair_value_nvlist(whopair, &perms) != 0) { + if (zap_lookup(mos, zapobj, whokey, 8, + 1, &jumpobj) == 0) { + (void) zap_remove(mos, zapobj, whokey, tx); + VERIFY(0 == zap_destroy(mos, jumpobj, tx)); + } + spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE, + dd->dd_pool->dp_spa, tx, cr, + "%s dataset = %llu", whokey, + dd->dd_phys->dd_head_dataset_obj); + continue; + } + + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) + continue; + + while 
(permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + uint64_t n = 0; + + (void) zap_remove(mos, jumpobj, perm, tx); + if (zap_count(mos, jumpobj, &n) == 0 && n == 0) { + (void) zap_remove(mos, zapobj, + whokey, tx); + VERIFY(0 == zap_destroy(mos, + jumpobj, tx)); + } + spa_history_internal_log(LOG_DS_PERM_REMOVE, + dd->dd_pool->dp_spa, tx, cr, + "%s %s dataset = %llu", whokey, perm, + dd->dd_phys->dd_head_dataset_obj); + } + } +} + +int +dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) +{ + dsl_dir_t *dd; + int error; + nvpair_t *whopair = NULL; + int blocks_modified = 0; + + error = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (error) + return (error); + + if (spa_version(dmu_objset_spa(dd->dd_pool->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) { + dsl_dir_close(dd, FTAG); + return (ENOTSUP); + } + + while (whopair = nvlist_next_nvpair(nvp, whopair)) + blocks_modified++; + + error = dsl_sync_task_do(dd->dd_pool, NULL, + unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, + dd, nvp, blocks_modified); + dsl_dir_close(dd, FTAG); + + return (error); +} + +/* + * Find all 'allow' permissions from a given point and then continue + * traversing up to the root. + * + * This function constructs an nvlist of nvlists. + * each setpoint is an nvlist composed of an nvlist of an nvlist + * of the individual * users/groups/everyone/create + * permissions. + * + * The nvlist will look like this. + * + * { source fsname -> { whokeys { permissions,...}, ...}} + * + * The fsname nvpairs will be arranged in a bottom up order. For example, + * if we have the following structure a/b/c then the nvpairs for the fsnames + * will be ordered a/b/c, a/b, a. + */ +int +dsl_deleg_get(const char *ddname, nvlist_t **nvp) +{ + dsl_dir_t *dd, *startdd; + dsl_pool_t *dp; + int error; + objset_t *mos; + + error = dsl_dir_open(ddname, FTAG, &startdd, NULL); + if (error) + return (error); + + dp = startdd->dd_pool; + mos = dp->dp_meta_objset; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + rw_enter(&dp->dp_config_rwlock, RW_READER); + for (dd = startdd; dd != NULL; dd = dd->dd_parent) { + zap_cursor_t basezc; + zap_attribute_t baseza; + nvlist_t *sp_nvp; + uint64_t n; + char source[MAXNAMELEN]; + + if (dd->dd_phys->dd_deleg_zapobj && + (zap_count(mos, dd->dd_phys->dd_deleg_zapobj, + &n) == 0) && n) { + VERIFY(nvlist_alloc(&sp_nvp, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + } else { + continue; + } + + for (zap_cursor_init(&basezc, mos, + dd->dd_phys->dd_deleg_zapobj); + zap_cursor_retrieve(&basezc, &baseza) == 0; + zap_cursor_advance(&basezc)) { + zap_cursor_t zc; + zap_attribute_t za; + nvlist_t *perms_nvp; + + ASSERT(baseza.za_integer_length == 8); + ASSERT(baseza.za_num_integers == 1); + + VERIFY(nvlist_alloc(&perms_nvp, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + for (zap_cursor_init(&zc, mos, baseza.za_first_integer); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + VERIFY(nvlist_add_boolean(perms_nvp, + za.za_name) == 0); + } + zap_cursor_fini(&zc); + VERIFY(nvlist_add_nvlist(sp_nvp, baseza.za_name, + perms_nvp) == 0); + nvlist_free(perms_nvp); + } + + zap_cursor_fini(&basezc); + + dsl_dir_name(dd, source); + VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0); + nvlist_free(sp_nvp); + } + rw_exit(&dp->dp_config_rwlock); + + dsl_dir_close(startdd, FTAG); + return (0); +} + +/* + * Routines for dsl_deleg_access() -- access checking. 
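For example, if tank/a/b and tank/a both carry allows, the list returned by dsl_deleg_get() pairs each fsname with its whokey-to-permissions map, bottom-up. Assembling that shape by hand with the userland flavor of the libnvpair calls used above (error checks elided; the dataset names, whokeys, and permissions are illustrative):

	#include <libnvpair.h>

	/*
	 * Builds: { "tank/a/b" -> { "ul$12" -> { create, mount } },
	 *           "tank/a"   -> { "ed$"   -> { snapshot } } }
	 */
	static nvlist_t *
	example_perms(void)
	{
		nvlist_t *top, *sp, *perms;

		(void) nvlist_alloc(&top, NV_UNIQUE_NAME, 0);

		(void) nvlist_alloc(&perms, NV_UNIQUE_NAME, 0);
		(void) nvlist_add_boolean(perms, "create");
		(void) nvlist_add_boolean(perms, "mount");
		(void) nvlist_alloc(&sp, NV_UNIQUE_NAME, 0);
		(void) nvlist_add_nvlist(sp, "ul$12", perms);
		(void) nvlist_add_nvlist(top, "tank/a/b", sp);
		nvlist_free(perms);
		nvlist_free(sp);

		(void) nvlist_alloc(&perms, NV_UNIQUE_NAME, 0);
		(void) nvlist_add_boolean(perms, "snapshot");
		(void) nvlist_alloc(&sp, NV_UNIQUE_NAME, 0);
		(void) nvlist_add_nvlist(sp, "ed$", perms);
		(void) nvlist_add_nvlist(top, "tank/a", sp);
		nvlist_free(perms);
		nvlist_free(sp);

		return (top);
	}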
+ */ +typedef struct perm_set { + avl_node_t p_node; + boolean_t p_matched; + char p_setname[ZFS_MAX_DELEG_NAME]; +} perm_set_t; + +static int +perm_set_compare(const void *arg1, const void *arg2) +{ + const perm_set_t *node1 = arg1; + const perm_set_t *node2 = arg2; + int val; + + val = strcmp(node1->p_setname, node2->p_setname); + if (val == 0) + return (0); + return (val > 0 ? 1 : -1); +} + +/* + * Determine whether a specified permission exists. + * + * First the base attribute has to be retrieved. i.e. ul$12 + * Once the base object has been retrieved the actual permission + * is lookup up in the zap object the base object points to. + * + * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if + * there is no perm in that jumpobj. + */ +static int +dsl_check_access(objset_t *mos, uint64_t zapobj, + char type, char checkflag, void *valp, const char *perm) +{ + int error; + uint64_t jumpobj, zero; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, type, checkflag, valp); + error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); + if (error == 0) { + error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero); + if (error == ENOENT) + error = EPERM; + } + return (error); +} + +/* + * check a specified user/group for a requested permission + */ +static int +dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm, + int checkflag, cred_t *cr) +{ + const gid_t *gids; + int ngids; + int i; + uint64_t id; + + /* check for user */ + id = crgetuid(cr); + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_USER, checkflag, &id, perm) == 0) + return (0); + + /* check for users primary group */ + id = crgetgid(cr); + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) + return (0); + + /* check for everyone entry */ + id = -1; + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0) + return (0); + + /* check each supplemental group user is a member of */ + ngids = crgetngroups(cr); + gids = crgetgroups(cr); + for (i = 0; i != ngids; i++) { + id = gids[i]; + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) + return (0); + } + + return (EPERM); +} + +/* + * Iterate over the sets specified in the specified zapobj + * and load them into the permsets avl tree. + */ +static int +dsl_load_sets(objset_t *mos, uint64_t zapobj, + char type, char checkflag, void *valp, avl_tree_t *avl) +{ + zap_cursor_t zc; + zap_attribute_t za; + perm_set_t *permnode; + avl_index_t idx; + uint64_t jumpobj; + int error; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, type, checkflag, valp); + + error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); + if (error != 0) + return (error); + + for (zap_cursor_init(&zc, mos, jumpobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP); + (void) strlcpy(permnode->p_setname, za.za_name, + sizeof (permnode->p_setname)); + permnode->p_matched = B_FALSE; + + if (avl_find(avl, permnode, &idx) == NULL) { + avl_insert(avl, permnode, idx); + } else { + kmem_free(permnode, sizeof (perm_set_t)); + } + } + zap_cursor_fini(&zc); + return (0); +} + +/* + * Load all permissions user based on cred belongs to. 
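dsl_deleg_access() below expands permission sets to a fixpoint, since a set may name other sets. The shape of that loop, with callbacks standing in for the zap lookups (the real code also distinguishes a set that is absent at this zapobj from one that lacks the permission):

	#include <stdbool.h>

	typedef struct set {
		const char *name;
		bool matched;	/* already checked, no need to revisit */
	} set_t;

	/*
	 * Expand sets until either one grants 'perm' or no pass adds new
	 * sets. grants(set, perm) answers "does this set directly list
	 * perm?"; expand(set, sets, nsets) appends any sets named inside
	 * 'set' and returns how many were new.
	 */
	static bool
	sets_grant(set_t *sets, int *nsets, const char *perm,
	    bool (*grants)(const char *, const char *),
	    int (*expand)(const char *, set_t *, int *))
	{
		bool expanded;
		do {
			expanded = false;
			for (int i = 0; i < *nsets; i++) {
				if (sets[i].matched)
					continue;
				if (grants(sets[i].name, perm))
					return (true);
				sets[i].matched = true;
				if (expand(sets[i].name, sets, nsets) > 0)
					expanded = true;
			}
		} while (expanded);	/* new sets force another pass */
		return (false);
	}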
+ */ +static void +dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, + char checkflag, cred_t *cr) +{ + const gid_t *gids; + int ngids, i; + uint64_t id; + + id = crgetuid(cr); + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_USER_SETS, checkflag, &id, avl); + + id = crgetgid(cr); + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); + + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl); + + ngids = crgetngroups(cr); + gids = crgetgroups(cr); + for (i = 0; i != ngids; i++) { + id = gids[i]; + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); + } +} + +/* + * Check if user has requested permission. + */ +int +dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) +{ + dsl_dataset_t *ds; + dsl_dir_t *dd; + dsl_pool_t *dp; + void *cookie; + int error; + char checkflag = ZFS_DELEG_LOCAL; + objset_t *mos; + avl_tree_t permsets; + perm_set_t *setnode; + + error = dsl_dataset_hold(dsname, FTAG, &ds); + if (error) + return (error); + + dp = ds->ds_dir->dd_pool; + mos = dp->dp_meta_objset; + + if (dsl_delegation_on(mos) == B_FALSE) { + dsl_dataset_rele(ds, FTAG); + return (ECANCELED); + } + + if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) { + dsl_dataset_rele(ds, FTAG); + return (EPERM); + } + + avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), + offsetof(perm_set_t, p_node)); + + rw_enter(&dp->dp_config_rwlock, RW_READER); + for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, + checkflag = ZFS_DELEG_DESCENDENT) { + uint64_t zapobj; + boolean_t expanded; + + /* + * If not in global zone then make sure + * the zoned property is set + */ + if (!INGLOBALZONE(curthread)) { + uint64_t zoned; + + if (dsl_prop_get_dd(dd, + zfs_prop_to_name(ZFS_PROP_ZONED), + 8, 1, &zoned, NULL) != 0) + break; + if (!zoned) + break; + } + zapobj = dd->dd_phys->dd_deleg_zapobj; + + if (zapobj == 0) + continue; + + dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr); +again: + expanded = B_FALSE; + for (setnode = avl_first(&permsets); setnode; + setnode = AVL_NEXT(&permsets, setnode)) { + if (setnode->p_matched == B_TRUE) + continue; + + /* See if this set directly grants this permission */ + error = dsl_check_access(mos, zapobj, + ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm); + if (error == 0) + goto success; + if (error == EPERM) + setnode->p_matched = B_TRUE; + + /* See if this set includes other sets */ + error = dsl_load_sets(mos, zapobj, + ZFS_DELEG_NAMED_SET_SETS, 0, + setnode->p_setname, &permsets); + if (error == 0) + setnode->p_matched = expanded = B_TRUE; + } + /* + * If we expanded any sets, that will define more sets, + * which we need to check. + */ + if (expanded) + goto again; + + error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr); + if (error == 0) + goto success; + } + error = EPERM; +success: + rw_exit(&dp->dp_config_rwlock); + dsl_dataset_rele(ds, FTAG); + + cookie = NULL; + while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) + kmem_free(setnode, sizeof (perm_set_t)); + + return (error); +} + +/* + * Other routines. + */ + +static void +copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj, + boolean_t dosets, uint64_t uid, dmu_tx_t *tx) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t jumpobj, pjumpobj; + uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + zap_cursor_t zc; + zap_attribute_t za; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, + dosets ? 
ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE, + ZFS_DELEG_LOCAL, NULL); + if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0) + return; + + if (zapobj == 0) { + dmu_buf_will_dirty(dd->dd_dbuf, tx); + zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, + DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + } + + zfs_deleg_whokey(whokey, + dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER, + ZFS_DELEG_LOCAL, &uid); + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) { + jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0); + } + + for (zap_cursor_init(&zc, mos, pjumpobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t zero = 0; + ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); + + VERIFY(zap_update(mos, jumpobj, za.za_name, + 8, 1, &zero, tx) == 0); + } + zap_cursor_fini(&zc); +} + +/* + * set all create time permission on new dataset. + */ +void +dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr) +{ + dsl_dir_t *dd; + uint64_t uid = crgetuid(cr); + + if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) + return; + + for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) { + uint64_t pzapobj = dd->dd_phys->dd_deleg_zapobj; + + if (pzapobj == 0) + continue; + + copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx); + copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx); + } +} + +int +dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + + if (zapobj == 0) + return (0); + + for (zap_cursor_init(&zc, mos, zapobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); + VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx)); + } + zap_cursor_fini(&zc); + VERIFY(0 == zap_destroy(mos, zapobj, tx)); + return (0); +} + +boolean_t +dsl_delegation_on(objset_t *os) +{ + return (os->os->os_spa->spa_delegation); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c index 5e563b632909..48d87f97f669 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c @@ -19,26 +19,28 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> +#include <sys/dmu_objset.h> #include <sys/dmu_tx.h> #include <sys/dsl_dataset.h> #include <sys/dsl_dir.h> #include <sys/dsl_prop.h> #include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> #include <sys/spa.h> #include <sys/zap.h> #include <sys/zio.h> #include <sys/arc.h> +#include <sys/sunddi.h> #include "zfs_namecheck.h" -static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd); -static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx); +static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); +static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, + cred_t *cr, dmu_tx_t *tx); /* ARGSUSED */ @@ -55,8 +57,6 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) ASSERT(dd->dd_space_towrite[t] == 0); } - ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes); - if (dd->dd_parent) dsl_dir_close(dd->dd_parent, dd); @@ -91,9 +91,9 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dmu_object_info_t doi; dmu_object_info_from_db(dbuf, &doi); ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); } #endif - /* XXX assert bonus buffer size is correct */ if (dd == NULL) { dsl_dir_t *winner; int err; @@ -103,7 +103,6 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_dbuf = dbuf; dd->dd_pool = dp; dd->dd_phys = dbuf->db_data; - dd->dd_used_bytes = dd->dd_phys->dd_used_bytes; mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), @@ -112,36 +111,25 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, if (dd->dd_phys->dd_parent_obj) { err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, NULL, dd, &dd->dd_parent); - if (err) { - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dmu_buf_rele(dbuf, tag); - return (err); - } + if (err) + goto errout; if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; err = zap_lookup(dp->dp_meta_objset, - dd->dd_parent->dd_phys-> - dd_child_dir_zapobj, + dd->dd_parent->dd_phys->dd_child_dir_zapobj, tail, sizeof (foundobj), 1, &foundobj); ASSERT(err || foundobj == ddobj); #endif (void) strcpy(dd->dd_myname, tail); } else { err = zap_value_search(dp->dp_meta_objset, - dd->dd_parent->dd_phys-> - dd_child_dir_zapobj, - ddobj, dd->dd_myname); - } - if (err) { - dsl_dir_close(dd->dd_parent, dd); - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dmu_buf_rele(dbuf, tag); - return (err); + dd->dd_parent->dd_phys->dd_child_dir_zapobj, + ddobj, 0, dd->dd_myname); } + if (err) + goto errout; } else { (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); } @@ -174,6 +162,15 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, ASSERT3P(dd->dd_dbuf, ==, dbuf); *ddp = dd; return (0); + +errout: + if (dd->dd_parent) + dsl_dir_close(dd->dd_parent, dd); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } void @@ -404,27 +401,37 @@ dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) } uint64_t -dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx) +dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, + dmu_tx_t *tx) { - objset_t *mos = pds->dd_pool->dp_meta_objset; + objset_t *mos = dp->dp_meta_objset; uint64_t ddobj; dsl_dir_phys_t *dsphys; dmu_buf_t *dbuf; ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); - VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, - name, sizeof (uint64_t), 1, 
&ddobj, tx)); + if (pds) { + VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, + name, sizeof (uint64_t), 1, &ddobj, tx)); + } else { + /* it's the root dir */ + VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); + } VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; dsphys->dd_creation_time = gethrestime_sec(); - dsphys->dd_parent_obj = pds->dd_object; + if (pds) + dsphys->dd_parent_obj = pds->dd_object; dsphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); dsphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); + if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) + dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); @@ -461,23 +468,27 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) } void -dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) +dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t val, obj; + dd_used_t t; ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); /* Remove our reservation. */ val = 0; - dsl_dir_set_reservation_sync(dd, &val, tx); - ASSERT3U(dd->dd_used_bytes, ==, 0); + dsl_dir_set_reservation_sync(dd, &val, cr, tx); + ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0); ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); + for (t = 0; t < DD_USED_NUM; t++) + ASSERT3U(dd->dd_phys->dd_used_breakdown[t], ==, 0); VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); + VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); VERIFY(0 == zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); @@ -486,65 +497,53 @@ dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) VERIFY(0 == dmu_object_free(mos, obj, tx)); } -void -dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx) +boolean_t +dsl_dir_is_clone(dsl_dir_t *dd) { - dsl_dir_phys_t *dsp; - dmu_buf_t *dbuf; - int error; - - *ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, - DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); - - error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, - sizeof (uint64_t), 1, ddobjp, tx); - ASSERT3U(error, ==, 0); - - VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - dsp = dbuf->db_data; - - dsp->dd_creation_time = gethrestime_sec(); - dsp->dd_props_zapobj = zap_create(mos, - DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); - dsp->dd_child_dir_zapobj = zap_create(mos, - DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); - - dmu_buf_rele(dbuf, FTAG); + return (dd->dd_phys->dd_origin_obj && + (dd->dd_pool->dp_origin_snap == NULL || + dd->dd_phys->dd_origin_obj != + dd->dd_pool->dp_origin_snap->ds_object)); } void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, - dsl_dir_space_available(dd, NULL, 0, TRUE)); - mutex_enter(&dd->dd_lock); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_used_bytes); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, - dd->dd_phys->dd_quota); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, + dd->dd_phys->dd_used_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, 
dd->dd_phys->dd_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, dd->dd_phys->dd_compressed_bytes == 0 ? 100 : (dd->dd_phys->dd_uncompressed_bytes * 100 / dd->dd_phys->dd_compressed_bytes)); + if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, + dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, + dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, + dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, + dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] + + dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]); + } mutex_exit(&dd->dd_lock); - if (dd->dd_phys->dd_clone_parent_obj) { + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + if (dsl_dir_is_clone(dd)) { dsl_dataset_t *ds; char buf[MAXNAMELEN]; - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_NONE, FTAG, &ds)); + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); - rw_exit(&dd->dd_pool->dp_config_rwlock); - + dsl_dataset_rele(ds, FTAG); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); } + rw_exit(&dd->dd_pool->dp_config_rwlock); } void @@ -580,7 +579,6 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; - dd->dd_phys->dd_used_bytes = dd->dd_used_bytes; mutex_exit(&dd->dd_lock); /* release the hold from dsl_dir_dirty */ @@ -588,15 +586,13 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) } static uint64_t -dsl_dir_estimated_space(dsl_dir_t *dd) +dsl_dir_space_towrite(dsl_dir_t *dd) { - int64_t space; + uint64_t space = 0; int i; ASSERT(MUTEX_HELD(&dd->dd_lock)); - space = dd->dd_phys->dd_used_bytes; - ASSERT(space >= 0); for (i = 0; i < TXG_SIZE; i++) { space += dd->dd_space_towrite[i&TXG_MASK]; ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); @@ -630,13 +626,9 @@ dsl_dir_space_available(dsl_dir_t *dd, mutex_enter(&dd->dd_lock); if (dd->dd_phys->dd_quota != 0) quota = dd->dd_phys->dd_quota; - if (ondiskonly) { - used = dd->dd_used_bytes; - } else { - used = dsl_dir_estimated_space(dd); - } - if (dd == ancestor) - used += delta; + used = dd->dd_phys->dd_used_bytes; + if (!ondiskonly) + used += dsl_dir_space_towrite(dd); if (dd->dd_parent == NULL) { uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); @@ -651,6 +643,14 @@ dsl_dir_space_available(dsl_dir_t *dd, parentspace += dd->dd_phys->dd_reserved - used; } + if (dd == ancestor) { + ASSERT(delta <= 0); + ASSERT(used >= -delta); + used += delta; + if (parentspace != UINT64_MAX) + parentspace -= delta; + } + if (used > quota) { /* over quota */ myspace = 0; @@ -678,50 +678,68 @@ dsl_dir_space_available(dsl_dir_t *dd, struct tempreserve { list_node_t tr_node; + dsl_pool_t *tr_dp; dsl_dir_t *tr_ds; uint64_t tr_size; }; -/* - * Reserve space in this dsl_dir, to be used in this tx's txg. - * After the space has been dirtied (and thus - * dsl_dir_willuse_space() has been called), the reservation should - * be canceled, using dsl_dir_tempreserve_clear(). 
- */ static int -dsl_dir_tempreserve_impl(dsl_dir_t *dd, - uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx) +dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, + boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, + dmu_tx_t *tx, boolean_t first) { uint64_t txg = tx->tx_txg; - uint64_t est_used, quota, parent_rsrv; - int edquot = EDQUOT; + uint64_t est_inflight, used_on_disk, quota, parent_rsrv; + struct tempreserve *tr; + int enospc = EDQUOT; int txgidx = txg & TXG_MASK; int i; - struct tempreserve *tr; + uint64_t ref_rsrv = 0; ASSERT3U(txg, !=, 0); - ASSERT3S(asize, >=, 0); + ASSERT3S(asize, >, 0); mutex_enter(&dd->dd_lock); + /* * Check against the dsl_dir's quota. We don't add in the delta * when checking for over-quota because they get one free hit. */ - est_used = dsl_dir_estimated_space(dd); + est_inflight = dsl_dir_space_towrite(dd); for (i = 0; i < TXG_SIZE; i++) - est_used += dd->dd_tempreserved[i]; + est_inflight += dd->dd_tempreserved[i]; + used_on_disk = dd->dd_phys->dd_used_bytes; - quota = UINT64_MAX; + /* + * On the first iteration, fetch the dataset's used-on-disk and + * refreservation values. Also, if checkrefquota is set, test if + * allocating this space would exceed the dataset's refquota. + */ + if (first && tx->tx_objset) { + int error; + dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset; + + error = dsl_dataset_check_quota(ds, checkrefquota, + asize, est_inflight, &used_on_disk, &ref_rsrv); + if (error) { + mutex_exit(&dd->dd_lock); + return (error); + } + } - if (dd->dd_phys->dd_quota) + /* + * If this transaction will result in a net free of space, + * we want to let it through. + */ + if (ignorequota || netfree || dd->dd_phys->dd_quota == 0) + quota = UINT64_MAX; + else quota = dd->dd_phys->dd_quota; /* - * If this transaction will result in a net free of space, we want - * to let it through, but we have to be careful: the space that it - * frees won't become available until *after* this txg syncs. - * Therefore, to ensure that it's possible to remove files from - * a full pool without inducing transient overcommits, we throttle + * Adjust the quota against the actual pool size at the root. + * To ensure that it's possible to remove files from a full + * pool without inducing transient overcommits, we throttle * netfree transactions against a quota that is slightly larger, * but still within the pool's allocation slop. In cases where * we're very close to full, this will allow a steady trickle of @@ -731,47 +749,45 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); if (poolsize < quota) { quota = poolsize; - edquot = ENOSPC; + enospc = ENOSPC; } - } else if (netfree) { - quota = UINT64_MAX; } /* * If they are requesting more space, and our current estimate - * is over quota. They get to try again unless the actual + * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes (which * may free up space for us). 
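A toy stand-in for the pool-size adjustment described above; the 1/64 slop fraction below is an assumption for illustration only, the real figure lives in dsl_pool_adjustedsize():

	#include <stdint.h>
	#include <stdbool.h>

	/*
	 * Reserve some slop off the top of the pool, but let net-free
	 * transactions use part of that slop so a full pool can still
	 * delete its way back to health.
	 */
	static uint64_t
	adjusted_size(uint64_t poolsize, bool netfree)
	{
		uint64_t slop = poolsize >> 6;	/* assumed 1/64 slop */
		return (poolsize - (netfree ? slop / 2 : slop));
	}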
*/ - if (asize > 0 && est_used > quota) { - if (dd->dd_space_towrite[txg & TXG_MASK] != 0 || - dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 || - dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 || - dd->dd_used_bytes < quota) - edquot = ERESTART; - dprintf_dd(dd, "failing: used=%lluK est_used = %lluK " + if (used_on_disk + est_inflight > quota) { + if (est_inflight > 0 || used_on_disk < quota) + enospc = ERESTART; + dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK err=%d\n", - dd->dd_used_bytes>>10, est_used>>10, - quota>>10, asize>>10, edquot); + used_on_disk>>10, est_inflight>>10, + quota>>10, asize>>10, enospc); mutex_exit(&dd->dd_lock); - return (edquot); + return (enospc); } /* We need to up our estimated delta before dropping dd_lock */ dd->dd_tempreserved[txgidx] += asize; - parent_rsrv = parent_delta(dd, est_used, asize); + parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, + asize - ref_rsrv); mutex_exit(&dd->dd_lock); - tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); + tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_ds = dd; tr->tr_size = asize; list_insert_tail(tr_list, tr); /* see if it's OK with our parent */ if (dd->dd_parent && parent_rsrv) { + boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0); + return (dsl_dir_tempreserve_impl(dd->dd_parent, - parent_rsrv, netfree, tr_list, tx)); + parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); } else { return (0); } @@ -779,42 +795,62 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, /* * Reserve space in this dsl_dir, to be used in this tx's txg. - * After the space has been dirtied (and thus - * dsl_dir_willuse_space() has been called), the reservation should - * be canceled, using dsl_dir_tempreserve_clear(). + * After the space has been dirtied (and dsl_dir_willuse_space() + * has been called), the reservation should be canceled, using + * dsl_dir_tempreserve_clear(). 
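The tr_list protocol used by the functions below is reservation-with-undo: record every reservation taken while walking up the parent chain, and on any failure unwind them all. A compact sketch of the idea (the real code also shrinks the amount via parent_delta() as it ascends):

	#include <stdlib.h>

	typedef struct dir {
		struct dir *parent;
		unsigned long long quota, reserved;	/* bytes */
	} dir_t;

	typedef struct resv {
		struct resv *next;
		dir_t *dir;
		unsigned long long size;
	} resv_t;

	/*
	 * Reserve 'size' bytes in 'd' and every ancestor, recording each
	 * reservation so it can be undone; on failure the caller runs
	 * clear(), exactly as dsl_dir_tempreserve_clear() is used.
	 */
	static int
	reserve(dir_t *d, unsigned long long size, resv_t **list)
	{
		for (; d != NULL; d = d->parent) {
			if (d->quota && d->reserved + size > d->quota)
				return (-1);
			resv_t *r = calloc(1, sizeof (*r));
			if (r == NULL)
				return (-1);
			d->reserved += size;
			r->dir = d;
			r->size = size;
			r->next = *list;
			*list = r;
		}
		return (0);
	}

	static void
	clear(resv_t **list)
	{
		while (*list != NULL) {
			resv_t *r = *list;
			*list = r->next;
			r->dir->reserved -= r->size;
			free(r);
		}
	}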
*/ int -dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, - uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx) +dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, + uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx) { - int err = 0; + int err; list_t *tr_list; + if (asize == 0) { + *tr_cookiep = NULL; + return (0); + } + tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); - ASSERT3S(asize, >=, 0); + ASSERT3S(asize, >, 0); ASSERT3S(fsize, >=, 0); - err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, - tr_list, tx); - + err = arc_tempreserve_space(lsize, tx->tx_txg); if (err == 0) { struct tempreserve *tr; - err = arc_tempreserve_space(lsize); - if (err == 0) { - tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); - tr->tr_ds = NULL; - tr->tr_size = lsize; - list_insert_tail(tr_list, tr); + tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_size = lsize; + list_insert_tail(tr_list, tr); + + err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx); + } else { + if (err == EAGAIN) { + txg_delay(dd->dd_pool, tx->tx_txg, 1); + err = ERESTART; } + dsl_pool_memory_pressure(dd->dd_pool); + } + + if (err == 0) { + struct tempreserve *tr; + + tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_dp = dd->dd_pool; + tr->tr_size = asize; + list_insert_tail(tr_list, tr); + + err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, + FALSE, asize > usize, tr_list, tx, TRUE); } if (err) dsl_dir_tempreserve_clear(tr_list, tx); else *tr_cookiep = tr_list; + return (err); } @@ -831,15 +867,20 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) ASSERT3U(tx->tx_txg, !=, 0); + if (tr_cookie == NULL) + return; + while (tr = list_head(tr_list)) { - if (tr->tr_ds == NULL) { - arc_tempreserve_clear(tr->tr_size); - } else { + if (tr->tr_dp) { + dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx); + } else if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, tr->tr_size); tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; mutex_exit(&tr->tr_ds->dd_lock); + } else { + arc_tempreserve_clear(tr->tr_size); } list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); @@ -848,13 +889,8 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) kmem_free(tr_list, sizeof (list_t)); } -/* - * Call in open context when we think we're going to write/free space, - * eg. when dirtying data. Be conservative (ie. OK to write less than - * this or free more than this, but don't write more or free less). - */ -void -dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) +static void +dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) { int64_t parent_space; uint64_t est_used; @@ -863,7 +899,7 @@ dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) if (space > 0) dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; - est_used = dsl_dir_estimated_space(dd); + est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes; parent_space = parent_delta(dd, est_used, space); mutex_exit(&dd->dd_lock); @@ -872,39 +908,96 @@ dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) /* XXX this is potentially expensive and unnecessary... 
*/ if (parent_space && dd->dd_parent) - dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); + dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx); +} + +/* + * Call in open context when we think we're going to write/free space, + * eg. when dirtying data. Be conservative (ie. OK to write less than + * this or free more than this, but don't write more or free less). + */ +void +dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) +{ + dsl_pool_willuse_space(dd->dd_pool, space, tx); + dsl_dir_willuse_space_impl(dd, space, tx); } /* call from syncing context when we actually write/free space for this dd */ void -dsl_dir_diduse_space(dsl_dir_t *dd, +dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) { int64_t accounted_delta; + boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(type < DD_USED_NUM); dsl_dir_dirty(dd, tx); - mutex_enter(&dd->dd_lock); - accounted_delta = parent_delta(dd, dd->dd_used_bytes, used); - ASSERT(used >= 0 || dd->dd_used_bytes >= -used); + if (needlock) + mutex_enter(&dd->dd_lock); + accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used); + ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used); ASSERT(compressed >= 0 || dd->dd_phys->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); - dd->dd_used_bytes += used; + dd->dd_phys->dd_used_bytes += used; dd->dd_phys->dd_uncompressed_bytes += uncompressed; dd->dd_phys->dd_compressed_bytes += compressed; - mutex_exit(&dd->dd_lock); + + if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + ASSERT(used > 0 || + dd->dd_phys->dd_used_breakdown[type] >= -used); + dd->dd_phys->dd_used_breakdown[type] += used; +#ifdef DEBUG + dd_used_t t; + uint64_t u = 0; + for (t = 0; t < DD_USED_NUM; t++) + u += dd->dd_phys->dd_used_breakdown[t]; + ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes); +#endif + } + if (needlock) + mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { - dsl_dir_diduse_space(dd->dd_parent, + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, accounted_delta, compressed, uncompressed, tx); + dsl_dir_transfer_space(dd->dd_parent, + used - accounted_delta, + DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); } } -/* ARGSUSED */ +void +dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) +{ + boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(oldtype < DD_USED_NUM); + ASSERT(newtype < DD_USED_NUM); + + if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN)) + return; + + dsl_dir_dirty(dd, tx); + if (needlock) + mutex_enter(&dd->dd_lock); + ASSERT(delta > 0 ? + dd->dd_phys->dd_used_breakdown[oldtype] >= delta : + dd->dd_phys->dd_used_breakdown[newtype] >= -delta); + ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta)); + dd->dd_phys->dd_used_breakdown[oldtype] -= delta; + dd->dd_phys->dd_used_breakdown[newtype] += delta; + if (needlock) + mutex_exit(&dd->dd_lock); +} + static int dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -921,22 +1014,22 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) /* * If we are doing the preliminary check in open context, and * there are pending changes, then don't fail it, since the - * pending changes could under-estimat the amount of space to be + * pending changes could under-estimate the amount of space to be * freed up. 
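
The pre-check spelled out just below reads more easily out of kernel dress. As a standalone sketch with invented names: in open context a nonzero towrite (pending frees or writes) gives the new quota the benefit of the doubt; otherwise the quota must cover both the reservation and used-plus-dirty space.

	/* Illustrative stand-in only, not the kernel interfaces. */
	#include <errno.h>
	#include <stdint.h>

	static int
	quota_precheck(uint64_t new_quota, uint64_t reserved, uint64_t used,
	    uint64_t towrite, int syncing)
	{
		if ((syncing || towrite == 0) &&
		    (new_quota < reserved || new_quota < used + towrite))
			return (ENOSPC);
		return (0);
	}
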
 */
-	towrite = dd->dd_space_towrite[0] + dd->dd_space_towrite[1] +
-	    dd->dd_space_towrite[2] + dd->dd_space_towrite[3];
+	towrite = dsl_dir_space_towrite(dd);
 	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
 	    (new_quota < dd->dd_phys->dd_reserved ||
-	    new_quota < dsl_dir_estimated_space(dd))) {
+	    new_quota < dd->dd_phys->dd_used_bytes + towrite)) {
 		err = ENOSPC;
 	}
 	mutex_exit(&dd->dd_lock);
 	return (err);
 }

+/* ARGSUSED */
 static void
-dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	uint64_t *quotap = arg2;
@@ -947,6 +1040,10 @@ dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 	mutex_enter(&dd->dd_lock);
 	dd->dd_phys->dd_quota = new_quota;
 	mutex_exit(&dd->dd_lock);
+
+	spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
+	    tx, cr, "%lld dataset = %llu ",
+	    (longlong_t)new_quota, dd->dd_phys->dd_head_dataset_obj);
 }

 int
@@ -958,20 +1055,22 @@ dsl_dir_set_quota(const char *ddname, uint64_t quota)
 	err = dsl_dir_open(ddname, FTAG, &dd, NULL);
 	if (err)
 		return (err);
-	/*
-	 * If someone removes a file, then tries to set the quota, we
-	 * want to make sure the file freeing takes effect.
-	 */
-	txg_wait_open(dd->dd_pool, 0);
-	err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
-	    dsl_dir_set_quota_sync, dd, &quota, 0);
+	if (quota != dd->dd_phys->dd_quota) {
+		/*
+		 * If someone removes a file, then tries to set the quota, we
+		 * want to make sure the file freeing takes effect.
+		 */
+		txg_wait_open(dd->dd_pool, 0);
+
+		err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
+		    dsl_dir_set_quota_sync, dd, &quota, 0);
+	}
 	dsl_dir_close(dd, FTAG);
 	return (err);
 }

-/* ARGSUSED */
-static int
+int
 dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
@@ -991,7 +1090,7 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 		return (0);

 	mutex_enter(&dd->dd_lock);
-	used = dd->dd_used_bytes;
+	used = dd->dd_phys->dd_used_bytes;
 	delta = MAX(used, new_reservation) -
 	    MAX(used, dd->dd_phys->dd_reserved);
 	mutex_exit(&dd->dd_lock);
@@ -1011,8 +1110,9 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 	return (0);
 }

+/* ARGSUSED */
 static void
-dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	uint64_t *reservationp = arg2;
@@ -1020,19 +1120,24 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 	uint64_t used;
 	int64_t delta;

+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
 	mutex_enter(&dd->dd_lock);
-	used = dd->dd_used_bytes;
+	used = dd->dd_phys->dd_used_bytes;
 	delta = MAX(used, new_reservation) -
 	    MAX(used, dd->dd_phys->dd_reserved);
-	mutex_exit(&dd->dd_lock);
-
-	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	dd->dd_phys->dd_reserved = new_reservation;

 	if (dd->dd_parent != NULL) {
 		/* Roll up this additional usage into our ancestors */
-		dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
+		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
+		    delta, 0, 0, tx);
 	}
+	mutex_exit(&dd->dd_lock);
+
+	spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
+	    tx, cr, "%lld dataset = %llu",
+	    (longlong_t)new_reservation, dd->dd_phys->dd_head_dataset_obj);
 }

 int
@@ -1074,7 +1179,7 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
 		return (delta);

 	mutex_enter(&dd->dd_lock);
-	delta = parent_delta(dd, dd->dd_used_bytes, delta);
+	delta = parent_delta(dd,
dd->dd_phys->dd_used_bytes, delta); mutex_exit(&dd->dd_lock); return (would_change(dd->dd_parent, delta, ancestor)); } @@ -1084,7 +1189,7 @@ struct renamearg { const char *mynewname; }; -/* ARGSUSED */ +/*ARGSUSED*/ static int dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -1110,7 +1215,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) if (ra->newparent != dd->dd_parent) { /* is there enough space? */ uint64_t myspace = - MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved); + MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved); /* no rename into our descendant */ if (closest_common_ancestor(dd, ra->newparent) == dd) @@ -1125,7 +1230,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct renamearg *ra = arg2; @@ -1136,15 +1241,24 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2); if (ra->newparent != dd->dd_parent) { - uint64_t myspace = - MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved); - - dsl_dir_diduse_space(dd->dd_parent, -myspace, + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, + -dd->dd_phys->dd_used_bytes, -dd->dd_phys->dd_compressed_bytes, -dd->dd_phys->dd_uncompressed_bytes, tx); - dsl_dir_diduse_space(ra->newparent, myspace, + dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD, + dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_compressed_bytes, dd->dd_phys->dd_uncompressed_bytes, tx); + + if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) { + uint64_t unused_rsrv = dd->dd_phys->dd_reserved - + dd->dd_phys->dd_used_bytes; + + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, + -unused_rsrv, 0, 0, tx); + dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV, + unused_rsrv, 0, 0, tx); + } } dmu_buf_will_dirty(dd->dd_dbuf, tx); @@ -1164,6 +1278,9 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, 8, 1, &dd->dd_object, tx); ASSERT3U(err, ==, 0); + + spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, + tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj); } int @@ -1189,7 +1306,6 @@ dsl_dir_rename(dsl_dir_t *dd, const char *newname) goto out; } - err = dsl_sync_task_do(dd->dd_pool, dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c index 00abf7ec2c6b..4585dc805fe5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dsl_pool.h> #include <sys/dsl_dataset.h> #include <sys/dsl_dir.h> @@ -36,20 +34,36 @@ #include <sys/zio.h> #include <sys/zfs_context.h> #include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/spa_impl.h> + +int zfs_no_write_throttle = 0; +int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ +int zfs_txg_synctime = 5; /* target secs to sync a txg */ + +uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ +uint64_t zfs_write_limit_max = 0; /* max data payload per txg */ +uint64_t zfs_write_limit_inflated = 0; +uint64_t zfs_write_limit_override = 0; +extern uint64_t zfs_write_limit_min; + +kmutex_t zfs_write_limit_lock; + +static pgcnt_t old_physmem = 0; static int -dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp) +dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) { uint64_t obj; int err; err = zap_lookup(dp->dp_meta_objset, dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, - MOS_DIR_NAME, sizeof (obj), 1, &obj); + name, sizeof (obj), 1, &obj); if (err) return (err); - return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp)); + return (dsl_dir_open_obj(dp, obj, name, dp, ddp)); } static dsl_pool_t * @@ -62,6 +76,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) dp->dp_spa = spa; dp->dp_meta_rootbp = *bp; rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); + dp->dp_write_limit = zfs_write_limit_min; txg_init(dp, txg); txg_list_create(&dp->dp_dirty_datasets, @@ -70,9 +85,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) offsetof(dsl_dir_t, dd_dirty_link)); txg_list_create(&dp->dp_sync_tasks, offsetof(dsl_sync_task_group_t, dstg_node)); - list_create(&dp->dp_synced_objsets, sizeof (dsl_dataset_t), + list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t), offsetof(dsl_dataset_t, ds_synced_link)); + mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL); + return (dp); } @@ -81,9 +99,11 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); + dsl_dir_t *dd; + dsl_dataset_t *ds; objset_impl_t *osi; - rw_enter(&dp->dp_config_rwlock, RW_READER); + rw_enter(&dp->dp_config_rwlock, RW_WRITER); err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi); if (err) goto out; @@ -100,10 +120,73 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; - err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir); + err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); if (err) goto out; + if (spa_version(spa) >= SPA_VERSION_ORIGIN) { + err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); + if (err) + goto out; + err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, + FTAG, &ds); + if (err) + goto out; + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + dp, &dp->dp_origin_snap); + if (err) + goto out; + dsl_dataset_rele(ds, FTAG); + dsl_dir_close(dd, dp); + } + + /* get scrub status */ + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, + &dp->dp_scrub_func); + if (err == 0) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, + &dp->dp_scrub_queue_obj); + if (err) + goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, + &dp->dp_scrub_min_txg); + if (err) + goto out; + err = 
zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, + &dp->dp_scrub_max_txg); + if (err) + goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + &dp->dp_scrub_bookmark); + if (err) + goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, + &spa->spa_scrub_errors); + if (err) + goto out; + if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { + /* + * A new-type scrub was in progress on an old + * pool. Restart from the beginning, since the + * old software may have changed the pool in the + * meantime. + */ + dsl_pool_scrub_restart(dp); + } + } else { + /* + * It's OK if there is no scrub in progress (and if + * there was an I/O error, ignore it). + */ + err = 0; + } + out: rw_exit(&dp->dp_config_rwlock); if (err) @@ -117,7 +200,15 @@ out: void dsl_pool_close(dsl_pool_t *dp) { - /* drop our reference from dsl_pool_open() */ + /* drop our references from dsl_pool_open() */ + + /* + * Since we held the origin_snap from "syncing" context (which + * includes pool-opening context), it actually only got a "ref" + * and not a hold, so just drop that here. + */ + if (dp->dp_origin_snap) + dsl_dataset_drop_ref(dp->dp_origin_snap, dp); if (dp->dp_mos_dir) dsl_dir_close(dp->dp_mos_dir, dp); if (dp->dp_root_dir) @@ -130,20 +221,27 @@ dsl_pool_close(dsl_pool_t *dp) txg_list_destroy(&dp->dp_dirty_datasets); txg_list_destroy(&dp->dp_dirty_dirs); txg_list_destroy(&dp->dp_sync_tasks); - list_destroy(&dp->dp_synced_objsets); + list_destroy(&dp->dp_synced_datasets); - arc_flush(); + arc_flush(dp->dp_spa); txg_fini(dp); rw_destroy(&dp->dp_config_rwlock); + mutex_destroy(&dp->dp_lock); + mutex_destroy(&dp->dp_scrub_cancel_lock); kmem_free(dp, sizeof (dsl_pool_t)); } dsl_pool_t * -dsl_pool_create(spa_t *spa, uint64_t txg) +dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); + objset_impl_t *osip; + dsl_dataset_t *ds; + uint64_t dsobj; + + /* create and open the MOS (meta-objset) */ dp->dp_meta_objset = &dmu_objset_create_impl(spa, NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os; @@ -153,13 +251,29 @@ dsl_pool_create(spa_t *spa, uint64_t txg) ASSERT3U(err, ==, 0); /* create and open the root dir */ - dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx); + dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir)); /* create and open the meta-objset dir */ - (void) dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx); - VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir)); + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); + VERIFY(0 == dsl_pool_open_special_dir(dp, + MOS_DIR_NAME, &dp->dp_mos_dir)); + + if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) + dsl_pool_create_origin(dp, tx); + + /* create the root dataset */ + dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); + + /* create the root objset */ + VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + osip = dmu_objset_create_impl(dp->dp_spa, ds, + dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); +#ifdef _KERNEL + zfs_create_fs(&osip->os, kcred, zplprops, tx); +#endif + dsl_dataset_rele(ds, FTAG); dmu_tx_commit(tx); @@ -175,26 +289,42 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dsl_dataset_t *ds; dsl_sync_task_group_t *dstg; 
objset_impl_t *mosi = dp->dp_meta_objset->os; + hrtime_t start, write_time; + uint64_t data_written; int err; tx = dmu_tx_create_assigned(dp, txg); + dp->dp_read_overhead = 0; zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { if (!list_link_active(&ds->ds_synced_link)) - list_insert_tail(&dp->dp_synced_objsets, ds); + list_insert_tail(&dp->dp_synced_datasets, ds); else dmu_buf_rele(ds->ds_dbuf, ds); dsl_dataset_sync(ds, zio, tx); } + DTRACE_PROBE(pool_sync__1setup); + + start = gethrtime(); err = zio_wait(zio); + write_time = gethrtime() - start; ASSERT(err == 0); + DTRACE_PROBE(pool_sync__2rootzio); while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) dsl_sync_task_group_sync(dstg, tx); + DTRACE_PROBE(pool_sync__3task); + + start = gethrtime(); while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) dsl_dir_sync(dd, tx); + write_time += gethrtime() - start; + + if (spa_sync_pass(dp->dp_spa) == 1) + dsl_pool_scrub_sync(dp, tx); + start = gethrtime(); if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL || list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) { zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); @@ -204,8 +334,51 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); } + write_time += gethrtime() - start; + DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time, + hrtime_t, dp->dp_read_overhead); + write_time -= dp->dp_read_overhead; dmu_tx_commit(tx); + + data_written = dp->dp_space_towrite[txg & TXG_MASK]; + dp->dp_space_towrite[txg & TXG_MASK] = 0; + ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); + + /* + * If the write limit max has not been explicitly set, set it + * to a fraction of available physical memory (default 1/8th). + * Note that we must inflate the limit because the spa + * inflates write sizes to account for data replication. + * Check this each sync phase to catch changing memory size. + */ + if (physmem != old_physmem && zfs_write_limit_shift) { + mutex_enter(&zfs_write_limit_lock); + old_physmem = physmem; + zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; + zfs_write_limit_inflated = MAX(zfs_write_limit_min, + spa_get_asize(dp->dp_spa, zfs_write_limit_max)); + mutex_exit(&zfs_write_limit_lock); + } + + /* + * Attempt to keep the sync time consistent by adjusting the + * amount of write traffic allowed into each transaction group. 
+ * Weight the throughput calculation towards the current value: + * thru = 3/4 old_thru + 1/4 new_thru + */ + ASSERT(zfs_write_limit_min > 0); + if (data_written > zfs_write_limit_min / 8 && write_time > 0) { + uint64_t throughput = (data_written * NANOSEC) / write_time; + if (dp->dp_throughput) + dp->dp_throughput = throughput / 4 + + 3 * dp->dp_throughput / 4; + else + dp->dp_throughput = throughput; + dp->dp_write_limit = MIN(zfs_write_limit_inflated, + MAX(zfs_write_limit_min, + dp->dp_throughput * zfs_txg_synctime)); + } } void @@ -213,8 +386,8 @@ dsl_pool_zil_clean(dsl_pool_t *dp) { dsl_dataset_t *ds; - while (ds = list_head(&dp->dp_synced_objsets)) { - list_remove(&dp->dp_synced_objsets, ds); + while (ds = list_head(&dp->dp_synced_datasets)) { + list_remove(&dp->dp_synced_datasets, ds); ASSERT(ds->ds_user_ptr != NULL); zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil); dmu_buf_rele(ds->ds_dbuf, ds); @@ -254,3 +427,187 @@ dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) return (space - resv); } + +int +dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) +{ + uint64_t reserved = 0; + uint64_t write_limit = (zfs_write_limit_override ? + zfs_write_limit_override : dp->dp_write_limit); + + if (zfs_no_write_throttle) { + atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], + space); + return (0); + } + + /* + * Check to see if we have exceeded the maximum allowed IO for + * this transaction group. We can do this without locks since + * a little slop here is ok. Note that we do the reserved check + * with only half the requested reserve: this is because the + * reserve requests are worst-case, and we really don't want to + * throttle based off of worst-case estimates. + */ + if (write_limit > 0) { + reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK] + + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2; + + if (reserved && reserved > write_limit) + return (ERESTART); + } + + atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space); + + /* + * If this transaction group is over 7/8ths capacity, delay + * the caller 1 clock tick. This will slow down the "fill" + * rate until the sync process can catch up with us. 
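
The weighting and clamp introduced above in dsl_pool_sync() can be shown as a standalone sketch (invented names; assumes write_ns > 0, which the caller guarantees by testing write_time before dividing):

	/* Illustrative stand-in only, not the kernel interfaces. */
	#include <stdint.h>

	#define	NANOSEC	1000000000ULL

	/*
	 * thru = 3/4 old_thru + 1/4 new_thru, then thru * synctime clamped
	 * into [limit_min, limit_inflated] becomes the next write limit.
	 */
	static uint64_t
	next_write_limit(uint64_t *thru, uint64_t bytes_written, uint64_t write_ns,
	    uint64_t limit_min, uint64_t limit_inflated, uint64_t synctime_sec)
	{
		uint64_t sample = bytes_written * NANOSEC / write_ns;
		uint64_t limit;

		*thru = (*thru == 0) ? sample : sample / 4 + 3 * (*thru) / 4;

		limit = *thru * synctime_sec;
		if (limit < limit_min)
			limit = limit_min;
		if (limit > limit_inflated)
			limit = limit_inflated;
		return (limit);
	}
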
+ */ + if (reserved && reserved > (write_limit - (write_limit >> 3))) + txg_delay(dp, tx->tx_txg, 1); + + return (0); +} + +void +dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) +{ + ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space); + atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space); +} + +void +dsl_pool_memory_pressure(dsl_pool_t *dp) +{ + uint64_t space_inuse = 0; + int i; + + if (dp->dp_write_limit == zfs_write_limit_min) + return; + + for (i = 0; i < TXG_SIZE; i++) { + space_inuse += dp->dp_space_towrite[i]; + space_inuse += dp->dp_tempreserved[i]; + } + dp->dp_write_limit = MAX(zfs_write_limit_min, + MIN(dp->dp_write_limit, space_inuse / 4)); +} + +void +dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) +{ + if (space > 0) { + mutex_enter(&dp->dp_lock); + dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space; + mutex_exit(&dp->dp_lock); + } +} + +/* ARGSUSED */ +static int +upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds, *prev = NULL; + int err; + dsl_pool_t *dp = spa_get_dsl(spa); + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err) + return (err); + + while (ds->ds_phys->ds_prev_snap_obj != 0) { + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) + break; + dsl_dataset_rele(ds, FTAG); + ds = prev; + prev = NULL; + } + + if (prev == NULL) { + prev = dp->dp_origin_snap; + + /* + * The $ORIGIN can't have any data, or the accounting + * will be wrong. + */ + ASSERT(prev->ds_phys->ds_bp.blk_birth == 0); + + /* The origin doesn't get attached to itself */ + if (ds->ds_object == prev->ds_object) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_prev_snap_obj = prev->ds_object; + ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg; + + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object; + + dmu_buf_will_dirty(prev->ds_dbuf, tx); + prev->ds_phys->ds_num_children++; + + if (ds->ds_phys->ds_next_snap_obj == 0) { + ASSERT(ds->ds_prev == NULL); + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); + } + } + + ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object); + ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); + + if (prev->ds_phys->ds_next_clones_obj == 0) { + prev->ds_phys->ds_next_clones_obj = + zap_create(dp->dp_meta_objset, + DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY(0 == zap_add_int(dp->dp_meta_objset, + prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); + + dsl_dataset_rele(ds, FTAG); + if (prev != dp->dp_origin_snap) + dsl_dataset_rele(prev, FTAG); + return (0); +} + +void +dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dp->dp_origin_snap != NULL); + + (void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, + tx, DS_FIND_CHILDREN); +} + +void +dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) +{ + uint64_t dsobj; + dsl_dataset_t *ds; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dp->dp_origin_snap == NULL); + + /* create the origin dir, ds, & snap-ds */ + rw_enter(&dp->dp_config_rwlock, RW_WRITER); + dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, + NULL, 0, kcred, tx); + VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, 
FTAG, &ds)); + dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx); + VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + dp, &dp->dp_origin_snap)); + dsl_dataset_rele(ds, FTAG); + rw_exit(&dp->dp_config_rwlock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c index 2fff66d06b1e..212acbbc5968 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -44,14 +44,20 @@ dodefault(const char *propname, int intsz, int numint, void *buf) { zfs_prop_t prop; - if ((prop = zfs_name_to_prop(propname)) == ZFS_PROP_INVAL || - zfs_prop_readonly(prop)) + /* + * The setonce properties are read-only, BUT they still + * have a default value that can be used as the initial + * value. + */ + if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL || + (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop))) return (ENOENT); - if (zfs_prop_get_type(prop) == prop_type_string) { + if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { if (intsz != 1) return (EOVERFLOW); - (void) strncpy(buf, zfs_prop_default_string(prop), numint); + (void) strncpy(buf, zfs_prop_default_string(prop), + numint); } else { if (intsz != 8 || numint < 1) return (EOVERFLOW); @@ -62,13 +68,16 @@ dodefault(const char *propname, int intsz, int numint, void *buf) return (0); } -static int -dsl_prop_get_impl(dsl_dir_t *dd, const char *propname, +int +dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, int intsz, int numint, void *buf, char *setpoint) { int err = ENOENT; + objset_t *mos = dd->dd_pool->dp_meta_objset; zfs_prop_t prop; + ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); + if (setpoint) setpoint[0] = '\0'; @@ -79,7 +88,6 @@ dsl_prop_get_impl(dsl_dir_t *dd, const char *propname, * ouside this loop. */ for (; dd != NULL; dd = dd->dd_parent) { - objset_t *mos = dd->dd_pool->dp_meta_objset; ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, intsz, numint, buf); @@ -92,8 +100,7 @@ dsl_prop_get_impl(dsl_dir_t *dd, const char *propname, /* * Break out of this loop for non-inheritable properties. */ - if (prop != ZFS_PROP_INVAL && - !zfs_prop_inheritable(prop)) + if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) break; } if (err == ENOENT) @@ -102,6 +109,26 @@ dsl_prop_get_impl(dsl_dir_t *dd, const char *propname, return (err); } +int +dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, + int intsz, int numint, void *buf, char *setpoint) +{ + ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock)); + + if (ds->ds_phys->ds_props_obj) { + int err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_props_obj, propname, intsz, numint, buf); + if (err != ENOENT) { + if (setpoint) + dsl_dataset_name(ds, setpoint); + return (err); + } + } + + return (dsl_prop_get_dd(ds->ds_dir, propname, + intsz, numint, buf, setpoint)); +} + /* * Register interest in the named property. 
We'll call the callback * once to notify it of the current property value, and again each time @@ -114,18 +141,20 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, dsl_prop_changed_cb_t *callback, void *cbarg) { dsl_dir_t *dd = ds->ds_dir; + dsl_pool_t *dp = dd->dd_pool; uint64_t value; dsl_prop_cb_record_t *cbr; int err; int need_rwlock; - need_rwlock = !RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock); + need_rwlock = !RW_WRITE_HELD(&dp->dp_config_rwlock); if (need_rwlock) - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_prop_get_impl(dd, propname, 8, 1, &value, NULL); + err = dsl_prop_get_ds(ds, propname, 8, 1, &value, NULL); if (err != 0) { - rw_exit(&dd->dd_pool->dp_config_rwlock); + if (need_rwlock) + rw_exit(&dp->dp_config_rwlock); return (err); } @@ -141,46 +170,30 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, cbr->cbr_func(cbr->cbr_arg, value); - VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object, + VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, NULL, cbr, &dd)); if (need_rwlock) - rw_exit(&dd->dd_pool->dp_config_rwlock); - /* Leave dataset open until this callback is unregistered */ + rw_exit(&dp->dp_config_rwlock); + /* Leave dir open until this callback is unregistered */ return (0); } int -dsl_prop_get_ds(dsl_dir_t *dd, const char *propname, - int intsz, int numints, void *buf, char *setpoint) -{ - int err; - - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - err = dsl_prop_get_impl(dd, propname, intsz, numints, buf, setpoint); - rw_exit(&dd->dd_pool->dp_config_rwlock); - - return (err); -} - -int -dsl_prop_get(const char *ddname, const char *propname, +dsl_prop_get(const char *dsname, const char *propname, int intsz, int numints, void *buf, char *setpoint) { - dsl_dir_t *dd; - const char *tail; + dsl_dataset_t *ds; int err; - err = dsl_dir_open(ddname, FTAG, &dd, &tail); + err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); - if (tail && tail[0] != '@') { - dsl_dir_close(dd, FTAG); - return (ENOENT); - } - err = dsl_prop_get_ds(dd, propname, intsz, numints, buf, setpoint); + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + err = dsl_prop_get_ds(ds, propname, intsz, numints, buf, setpoint); + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); - dsl_dir_close(dd, FTAG); + dsl_dataset_rele(ds, FTAG); return (err); } @@ -264,8 +277,9 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, dsl_prop_cb_record_t *cbr; objset_t *mos = dp->dp_meta_objset; zap_cursor_t zc; - zap_attribute_t za; + zap_attribute_t *za; int err; + uint64_t dummyval; ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); @@ -278,7 +292,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, * being inherited here or below; stop the recursion. 
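
dsl_prop_changed_notify(), reworked in this section, walks the directory tree downward from the point of change and stops wherever the property is set locally, since nothing beneath an override inherits the new value. A simplified user-space rendering of that recursion (invented types; the real walk iterates a ZAP of child directories and heap-allocates its cursor record to spare the kernel stack):

	/* Illustrative stand-in only, not the kernel interfaces. */
	#include <stddef.h>
	#include <string.h>

	struct dir {
		struct dir **child;
		int nchildren;
		const char **local;	/* property names set on this dir */
		int nlocal;
		void (*cb)(struct dir *, unsigned long long);
	};

	static int
	set_locally(const struct dir *d, const char *prop)
	{
		int i;

		for (i = 0; i < d->nlocal; i++)
			if (strcmp(d->local[i], prop) == 0)
				return (1);
		return (0);
	}

	/* Push a new inherited value down, stopping wherever it is overridden. */
	static void
	changed_notify(struct dir *d, const char *prop, unsigned long long val,
	    int first)
	{
		int i;

		if (!first && set_locally(d, prop))
			return;		/* set here: nothing below inherits it */
		if (d->cb != NULL)
			d->cb(d, val);
		for (i = 0; i < d->nchildren; i++)
			changed_notify(d->child[i], prop, val, 0);
	}
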
*/ err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, - 8, 1, &value); + 8, 1, &dummyval); if (err == 0) { dsl_dir_close(dd, FTAG); return; @@ -287,22 +301,34 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, } mutex_enter(&dd->dd_lock); - for (cbr = list_head(&dd->dd_prop_cbs); - cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { - if (strcmp(cbr->cbr_propname, propname) == 0) { - cbr->cbr_func(cbr->cbr_arg, value); - } + for (cbr = list_head(&dd->dd_prop_cbs); cbr; + cbr = list_next(&dd->dd_prop_cbs, cbr)) { + uint64_t propobj = cbr->cbr_ds->ds_phys->ds_props_obj; + + if (strcmp(cbr->cbr_propname, propname) != 0) + continue; + + /* + * If the property is set on this ds, then it is not + * inherited here; don't call the callback. + */ + if (propobj && 0 == zap_lookup(mos, propobj, propname, + 8, 1, &dummyval)) + continue; + + cbr->cbr_func(cbr->cbr_arg, value); } mutex_exit(&dd->dd_lock); + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); for (zap_cursor_init(&zc, mos, dd->dd_phys->dd_child_dir_zapobj); - zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { - /* XXX recursion could blow stack; esp. za! */ - dsl_prop_changed_notify(dp, za.za_first_integer, + dsl_prop_changed_notify(dp, za->za_first_integer, propname, value, FALSE); } + kmem_free(za, sizeof (zap_attribute_t)); zap_cursor_fini(&zc); dsl_dir_close(dd, FTAG); } @@ -316,22 +342,37 @@ struct prop_set_arg { static void -dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; + dsl_dataset_t *ds = arg1; struct prop_set_arg *psa = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t zapobj = dd->dd_phys->dd_props_zapobj; - uint64_t intval; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t zapobj, intval; int isint; + char valbuf[32]; + char *valstr; isint = (dodefault(psa->name, 8, 1, &intval) == 0); + if (dsl_dataset_is_snapshot(ds)) { + ASSERT(spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_SNAP_PROPS); + if (ds->ds_phys->ds_props_obj == 0) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_props_obj = + zap_create(mos, + DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); + } + zapobj = ds->ds_phys->ds_props_obj; + } else { + zapobj = ds->ds_dir->dd_phys->dd_props_zapobj; + } + if (psa->numints == 0) { int err = zap_remove(mos, zapobj, psa->name, tx); ASSERT(err == 0 || err == ENOENT); if (isint) { - VERIFY(0 == dsl_prop_get_impl(dd->dd_parent, + VERIFY(0 == dsl_prop_get_ds(ds, psa->name, 8, 1, &intval, NULL)); } } else { @@ -342,32 +383,63 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) } if (isint) { - dsl_prop_changed_notify(dd->dd_pool, - dd->dd_object, psa->name, intval, TRUE); + if (dsl_dataset_is_snapshot(ds)) { + dsl_prop_cb_record_t *cbr; + /* + * It's a snapshot; nothing can inherit this + * property, so just look for callbacks on this + * ds here. 
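
For the snapshot case just described, notification degenerates to scanning the registered callback records for an exact dataset-and-name match, as the loop below does. Roughly (invented types mirroring that loop):

	/* Illustrative stand-in only, not the kernel interfaces. */
	#include <string.h>

	struct cbr {
		struct cbr *next;
		const void *ds;		/* dataset this callback watches */
		const char *propname;
		void (*func)(void *, unsigned long long);
		void *arg;
	};

	/* Snapshots inherit nothing, so only exact (ds, name) matches fire. */
	static void
	notify_snapshot(struct cbr *list, const void *ds, const char *name,
	    unsigned long long val)
	{
		struct cbr *c;

		for (c = list; c != NULL; c = c->next)
			if (c->ds == ds && strcmp(c->propname, name) == 0)
				c->func(c->arg, val);
	}
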
+ */ + mutex_enter(&ds->ds_dir->dd_lock); + for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr; + cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) { + if (cbr->cbr_ds == ds && + strcmp(cbr->cbr_propname, psa->name) == 0) + cbr->cbr_func(cbr->cbr_arg, intval); + } + mutex_exit(&ds->ds_dir->dd_lock); + } else { + dsl_prop_changed_notify(ds->ds_dir->dd_pool, + ds->ds_dir->dd_object, psa->name, intval, TRUE); + } + } + if (isint) { + (void) snprintf(valbuf, sizeof (valbuf), + "%lld", (longlong_t)intval); + valstr = valbuf; + } else { + valstr = (char *)psa->buf; } + spa_history_internal_log((psa->numints == 0) ? LOG_DS_INHERIT : + LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr, + "%s=%s dataset = %llu", psa->name, valstr, ds->ds_object); } -int -dsl_prop_set_dd(dsl_dir_t *dd, const char *propname, - int intsz, int numints, const void *buf) +void +dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, + cred_t *cr, dmu_tx_t *tx) { - struct prop_set_arg psa; + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t zapobj = dd->dd_phys->dd_props_zapobj; - psa.name = propname; - psa.intsz = intsz; - psa.numints = numints; - psa.buf = buf; + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY(0 == zap_update(mos, zapobj, name, sizeof (val), 1, &val, tx)); + + dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE); - return (dsl_sync_task_do(dd->dd_pool, - NULL, dsl_prop_set_sync, dd, &psa, 2)); + spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr, + "%s=%llu dataset = %llu", name, (u_longlong_t)val, + dd->dd_phys->dd_head_dataset_obj); } int -dsl_prop_set(const char *ddname, const char *propname, +dsl_prop_set(const char *dsname, const char *propname, int intsz, int numints, const void *buf) { - dsl_dir_t *dd; + dsl_dataset_t *ds; int err; + struct prop_set_arg psa; /* * We must do these checks before we get to the syncfunc, since @@ -378,11 +450,24 @@ dsl_prop_set(const char *ddname, const char *propname, if (intsz * numints >= ZAP_MAXVALUELEN) return (E2BIG); - err = dsl_dir_open(ddname, FTAG, &dd, NULL); + err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); - err = dsl_prop_set_dd(dd, propname, intsz, numints, buf); - dsl_dir_close(dd, FTAG); + + if (dsl_dataset_is_snapshot(ds) && + spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) { + dsl_dataset_rele(ds, FTAG); + return (ENOTSUP); + } + + psa.name = propname; + psa.intsz = intsz; + psa.numints = numints; + psa.buf = buf; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + NULL, dsl_prop_set_sync, ds, &psa, 2); + + dsl_dataset_rele(ds, FTAG); return (err); } @@ -390,45 +475,55 @@ dsl_prop_set(const char *ddname, const char *propname, * Iterate over all properties for this dataset and return them in an nvlist. 
*/ int -dsl_prop_get_all(objset_t *os, nvlist_t **nvp) +dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local) { dsl_dataset_t *ds = os->os->os_dsl_dataset; dsl_dir_t *dd = ds->ds_dir; + boolean_t snapshot = dsl_dataset_is_snapshot(ds); int err = 0; - dsl_pool_t *dp; - objset_t *mos; - - if (dsl_dataset_is_snapshot(ds)) { - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - return (0); - } + dsl_pool_t *dp = dd->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t propobj = ds->ds_phys->ds_props_obj; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - dp = dd->dd_pool; - mos = dp->dp_meta_objset; + if (local && snapshot && !propobj) + return (0); rw_enter(&dp->dp_config_rwlock, RW_READER); - for (; dd != NULL; dd = dd->dd_parent) { + while (dd != NULL) { char setpoint[MAXNAMELEN]; zap_cursor_t zc; zap_attribute_t za; + dsl_dir_t *dd_next; + + if (propobj) { + dsl_dataset_name(ds, setpoint); + dd_next = dd; + } else { + dsl_dir_name(dd, setpoint); + propobj = dd->dd_phys->dd_props_zapobj; + dd_next = dd->dd_parent; + } - dsl_dir_name(dd, setpoint); - - for (zap_cursor_init(&zc, mos, dd->dd_phys->dd_props_zapobj); + for (zap_cursor_init(&zc, mos, propobj); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { nvlist_t *propval; - zfs_prop_t prop; - /* - * Skip non-inheritable properties. - */ - if ((prop = zfs_name_to_prop(za.za_name)) != - ZFS_PROP_INVAL && !zfs_prop_inheritable(prop) && - dd != ds->ds_dir) + zfs_prop_t prop = zfs_name_to_prop(za.za_name); + + /* Skip non-inheritable properties. */ + if (prop != ZPROP_INVAL && + !zfs_prop_inheritable(prop) && + (dd != ds->ds_dir || (snapshot && dd != dd_next))) continue; + /* Skip properties not valid for this type. */ + if (snapshot && prop != ZPROP_INVAL && + !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT)) + continue; + + /* Skip properties already defined */ if (nvlist_lookup_nvlist(*nvp, za.za_name, &propval) == 0) continue; @@ -441,28 +536,26 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp) */ char *tmp = kmem_alloc(za.za_num_integers, KM_SLEEP); - err = zap_lookup(mos, - dd->dd_phys->dd_props_zapobj, - za.za_name, 1, za.za_num_integers, - tmp); + err = zap_lookup(mos, propobj, + za.za_name, 1, za.za_num_integers, tmp); if (err != 0) { kmem_free(tmp, za.za_num_integers); break; } - VERIFY(nvlist_add_string(propval, - ZFS_PROP_VALUE, tmp) == 0); + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, + tmp) == 0); kmem_free(tmp, za.za_num_integers); } else { /* * Integer property */ ASSERT(za.za_integer_length == 8); - (void) nvlist_add_uint64(propval, - ZFS_PROP_VALUE, za.za_first_integer); + (void) nvlist_add_uint64(propval, ZPROP_VALUE, + za.za_first_integer); } - VERIFY(nvlist_add_string(propval, - ZFS_PROP_SOURCE, setpoint) == 0); + VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, + setpoint) == 0); VERIFY(nvlist_add_nvlist(*nvp, za.za_name, propval) == 0); nvlist_free(propval); @@ -472,6 +565,14 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp) if (err != ENOENT) break; err = 0; + /* + * If we are just after the props that have been set + * locally, then we are done after the first iteration. 
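
dsl_prop_get_all() resolves shadowing by collecting from the most specific source outward (snapshot-local properties, then the directory, then each ancestor) and skipping any name already seen, and with the local flag it stops after the first source. A schematic sketch, with invented containers in place of the ZAP cursors:

	/* Illustrative stand-in only, not the kernel interfaces. */
	#include <string.h>

	/* Has name already been collected, i.e. set closer to the dataset? */
	static int
	already_set(const char *found[], int nfound, const char *name)
	{
		int i;

		for (i = 0; i < nfound; i++)
			if (strcmp(found[i], name) == 0)
				return (1);
		return (0);
	}

	/* Nearest setting of a name wins; local_only stops after one source. */
	static int
	gather(const char **src[], const int nsrc[], int nsources,
	    int local_only, const char *found[], int nfound)
	{
		int s, i;

		for (s = 0; s < nsources; s++) {
			for (i = 0; i < nsrc[s]; i++)
				if (!already_set(found, nfound, src[s][i]))
					found[nfound++] = src[s][i];
			if (local_only)
				break;
		}
		return (nfound);
	}
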
+ */ + if (local) + break; + dd = dd_next; + propobj = 0; } rw_exit(&dp->dp_config_rwlock); @@ -484,7 +585,7 @@ dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(propval, ZFS_PROP_VALUE, value) == 0); + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); nvlist_free(propval); } @@ -495,7 +596,7 @@ dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(propval, ZFS_PROP_VALUE, value) == 0); + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); nvlist_free(propval); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c new file mode 100644 index 000000000000..5f675b787df7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c @@ -0,0 +1,929 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/dsl_pool.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_synctask.h> +#include <sys/dnode.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/arc.h> +#include <sys/zap.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/zil_impl.h> + +typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); + +static scrub_cb_t dsl_pool_scrub_clean_cb; +static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; + +int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */ +int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */ +boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ + +extern int zfs_txg_timeout; + +static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = { + NULL, + dsl_pool_scrub_clean_cb +}; + +#define SET_BOOKMARK(zb, objset, object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +/* ARGSUSED */ +static void +dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_pool_t *dp = arg1; + enum scrub_func *funcp = arg2; + dmu_object_type_t ot = 0; + boolean_t complete = B_FALSE; + + dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx); + + ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE); + ASSERT(*funcp > SCRUB_FUNC_NONE); + ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS); + + dp->dp_scrub_min_txg = 0; + dp->dp_scrub_max_txg = tx->tx_txg; + + if (*funcp == SCRUB_FUNC_CLEAN) { + vdev_t *rvd = dp->dp_spa->spa_root_vdev; + + /* rewrite all disk labels */ + vdev_config_dirty(rvd); + + if (vdev_resilver_needed(rvd, + &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) { + spa_event_notify(dp->dp_spa, NULL, + ESC_ZFS_RESILVER_START); + dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, + tx->tx_txg); + } + + /* zero out the scrub stats in all vdev_stat_t's */ + vdev_scrub_stat_update(rvd, + dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : + POOL_SCRUB_EVERYTHING, B_FALSE); + + dp->dp_spa->spa_scrub_started = B_TRUE; + } + + /* back to the generic stuff */ + + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) + ot = DMU_OT_ZAP_OTHER; + + dp->dp_scrub_func = *funcp; + dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, + ot ? 
ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); + bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + dp->dp_scrub_restart = B_FALSE; + dp->dp_spa->spa_scrub_errors = 0; + + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, + &dp->dp_scrub_func, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, + &dp->dp_scrub_queue_obj, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, + &dp->dp_scrub_min_txg, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, + &dp->dp_scrub_max_txg, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + &dp->dp_scrub_bookmark, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, + &dp->dp_spa->spa_scrub_errors, tx)); + + spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr, + "func=%u mintxg=%llu maxtxg=%llu", + *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg); +} + +int +dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func) +{ + return (dsl_sync_task_do(dp, NULL, + dsl_pool_scrub_setup_sync, dp, &func, 0)); +} + +/* ARGSUSED */ +static void +dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_pool_t *dp = arg1; + boolean_t *completep = arg2; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + mutex_enter(&dp->dp_scrub_cancel_lock); + + if (dp->dp_scrub_restart) { + dp->dp_scrub_restart = B_FALSE; + *completep = B_FALSE; + } + + /* XXX this is scrub-clean specific */ + mutex_enter(&dp->dp_spa->spa_scrub_lock); + while (dp->dp_spa->spa_scrub_inflight > 0) { + cv_wait(&dp->dp_spa->spa_scrub_io_cv, + &dp->dp_spa->spa_scrub_lock); + } + mutex_exit(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_started = B_FALSE; + dp->dp_spa->spa_scrub_active = B_FALSE; + + dp->dp_scrub_func = SCRUB_FUNC_NONE; + VERIFY(0 == dmu_object_free(dp->dp_meta_objset, + dp->dp_scrub_queue_obj, tx)); + dp->dp_scrub_queue_obj = 0; + bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_QUEUE, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MIN_TXG, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MAX_TXG, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_FUNC, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_ERRORS, tx)); + + spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, + "complete=%u", *completep); + + /* below is scrub-clean specific */ + vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE, + *completep); + /* + * If the scrub/resilver completed, update all DTLs to reflect this. + * Whether it succeeded or not, vacate all temporary scrub DTLs. + */ + vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, + *completep ? 
dp->dp_scrub_max_txg : 0, B_TRUE); + if (dp->dp_scrub_min_txg && *completep) + spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH); + spa_errlog_rotate(dp->dp_spa); + + /* + * We may have finished replacing a device. + * Let the async thread assess this and handle the detach. + */ + spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE); + + dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0; + mutex_exit(&dp->dp_scrub_cancel_lock); +} + +int +dsl_pool_scrub_cancel(dsl_pool_t *dp) +{ + boolean_t complete = B_FALSE; + + return (dsl_sync_task_do(dp, NULL, + dsl_pool_scrub_cancel_sync, dp, &complete, 3)); +} + +int +dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, + zio_done_func_t *done, void *private, uint32_t arc_flags) +{ + /* + * This function will be used by bp-rewrite wad to intercept frees. + */ + return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp, + done, private, arc_flags)); +} + +static boolean_t +bookmark_is_zero(const zbookmark_t *zb) +{ + return (zb->zb_objset == 0 && zb->zb_object == 0 && + zb->zb_level == 0 && zb->zb_blkid == 0); +} + +/* dnp is the dnode for zb1->zb_object */ +static boolean_t +bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, + const zbookmark_t *zb2) +{ + uint64_t zb1nextL0, zb2thisobj; + + ASSERT(zb1->zb_objset == zb2->zb_objset); + ASSERT(zb1->zb_object != -1ULL); + ASSERT(zb2->zb_level == 0); + + /* + * A bookmark in the deadlist is considered to be after + * everything else. + */ + if (zb2->zb_object == -1ULL) + return (B_TRUE); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) + return (B_FALSE); + + zb1nextL0 = (zb1->zb_blkid + 1) << + ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + + zb2thisobj = zb2->zb_object ? zb2->zb_object : + zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + + if (zb1->zb_object == 0) { + uint64_t nextobj = zb1nextL0 * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; + return (nextobj <= zb2thisobj); + } + + if (zb1->zb_object < zb2thisobj) + return (B_TRUE); + if (zb1->zb_object > zb2thisobj) + return (B_FALSE); + if (zb2->zb_object == 0) + return (B_FALSE); + return (zb1nextL0 <= zb2->zb_blkid); +} + +static boolean_t +scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) +{ + int elapsed_ticks; + int mintime; + + if (dp->dp_scrub_pausing) + return (B_TRUE); /* we're already pausing */ + + if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) + return (B_FALSE); /* we're resuming */ + + /* We only know how to resume from level-0 blocks. */ + if (zb->zb_level != 0) + return (B_FALSE); + + mintime = dp->dp_scrub_isresilver ? 
zfs_resilver_min_time : + zfs_scrub_min_time; + elapsed_ticks = lbolt64 - dp->dp_scrub_start_time; + if (elapsed_ticks > hz * zfs_txg_timeout || + (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) { + dprintf("pausing at %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); + dp->dp_scrub_pausing = B_TRUE; + dp->dp_scrub_bookmark = *zb; + return (B_TRUE); + } + return (B_FALSE); +} + +typedef struct zil_traverse_arg { + dsl_pool_t *zta_dp; + zil_header_t *zta_zh; +} zil_traverse_arg_t; + +/* ARGSUSED */ +static void +traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + zil_traverse_arg_t *zta = arg; + dsl_pool_t *dp = zta->zta_dp; + zil_header_t *zh = zta->zta_zh; + zbookmark_t zb; + + if (bp->blk_birth <= dp->dp_scrub_min_txg) + return; + + if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) + return; + + zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET]; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; + VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); +} + +/* ARGSUSED */ +static void +traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +{ + if (lrc->lrc_txtype == TX_WRITE) { + zil_traverse_arg_t *zta = arg; + dsl_pool_t *dp = zta->zta_dp; + zil_header_t *zh = zta->zta_zh; + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_t zb; + + if (bp->blk_birth <= dp->dp_scrub_min_txg) + return; + + if (claim_txg == 0 || bp->blk_birth < claim_txg) + return; + + zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET]; + zb.zb_object = lr->lr_foid; + zb.zb_level = BP_GET_LEVEL(bp); + zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); + VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); + } +} + +static void +traverse_zil(dsl_pool_t *dp, zil_header_t *zh) +{ + uint64_t claim_txg = zh->zh_claim_txg; + zil_traverse_arg_t zta = { dp, zh }; + zilog_t *zilog; + + /* + * We only want to visit blocks that have been claimed but not yet + * replayed (or, in read-only mode, blocks that *would* be claimed). + */ + if (claim_txg == 0 && (spa_mode & FWRITE)) + return; + + zilog = zil_alloc(dp->dp_meta_objset, zh); + + (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta, + claim_txg); + + zil_free(zilog); +} + +static void +scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, + arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) +{ + int err; + arc_buf_t *buf = NULL; + + if (bp->blk_birth == 0) + return; + + if (bp->blk_birth <= dp->dp_scrub_min_txg) + return; + + if (scrub_pause(dp, zb)) + return; + + if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { + /* + * If we already visited this bp & everything below (in + * a prior txg), don't bother doing it again. + */ + if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark)) + return; + + /* + * If we found the block we're trying to resume from, or + * we went past it to a different object, zero it out to + * indicate that it's OK to start checking for pausing + * again. 
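
The pause test assembled in scrub_pause() above boils down to two time thresholds: always yield after a full txg timeout, and yield after the per-pass minimum if the txg sync is already waiting. As a standalone predicate (invented names, mirroring the condition above):

	/* Illustrative stand-in only, not the kernel interfaces. */
	#include <stdint.h>

	static int
	scrub_should_pause(int64_t elapsed_ticks, int hz, int txg_timeout,
	    int min_time, int sync_waiting)
	{
		return (elapsed_ticks > (int64_t)hz * txg_timeout ||
		    (elapsed_ticks > (int64_t)hz * min_time && sync_waiting));
	}
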
+ */ + if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 || + zb->zb_object > dp->dp_scrub_bookmark.zb_object) { + dprintf("resuming at %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + bzero(&dp->dp_scrub_bookmark, sizeof (*zb)); + } + } + + if (BP_GET_LEVEL(bp) > 0) { + uint32_t flags = ARC_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + + err = arc_read(NULL, dp->dp_spa, bp, pbuf, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + mutex_enter(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_errors++; + mutex_exit(&dp->dp_spa->spa_scrub_lock); + return; + } + cbp = buf->b_data; + + for (i = 0; i < epb; i++, cbp++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + scrub_visitbp(dp, dnp, buf, cbp, &czb); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + uint32_t flags = ARC_WAIT; + dnode_phys_t *child_dnp; + int i, j; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + err = arc_read(NULL, dp->dp_spa, bp, pbuf, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + mutex_enter(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_errors++; + mutex_exit(&dp->dp_spa->spa_scrub_lock); + return; + } + child_dnp = buf->b_data; + + for (i = 0; i < epb; i++, child_dnp++) { + for (j = 0; j < child_dnp->dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, + zb->zb_blkid * epb + i, + child_dnp->dn_nlevels - 1, j); + scrub_visitbp(dp, child_dnp, buf, + &child_dnp->dn_blkptr[j], &czb); + } + } + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + uint32_t flags = ARC_WAIT; + objset_phys_t *osp; + int j; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + mutex_enter(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_errors++; + mutex_exit(&dp->dp_spa->spa_scrub_lock); + return; + } + + osp = buf->b_data; + + traverse_zil(dp, &osp->os_zil_header); + + for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, 0, + osp->os_meta_dnode.dn_nlevels - 1, j); + scrub_visitbp(dp, &osp->os_meta_dnode, buf, + &osp->os_meta_dnode.dn_blkptr[j], &czb); + } + } + + (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); + if (buf) + (void) arc_buf_remove_ref(buf, &buf); +} + +static void +scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) +{ + zbookmark_t zb; + + SET_BOOKMARK(&zb, ds ? 
ds->ds_object : 0, 0, -1, 0); + scrub_visitbp(dp, NULL, NULL, bp, &zb); +} + +void +dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { + SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0); + } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, tx) != 0) { + return; + } + + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_phys->ds_next_snap_obj, tx) == 0); + } + ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); +} + +void +dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); + + if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { + dp->dp_scrub_bookmark.zb_objset = + ds->ds_phys->ds_prev_snap_obj; + } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, tx) == 0) { + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_phys->ds_prev_snap_obj, tx) == 0); + } +} + +struct enqueue_clones_arg { + dmu_tx_t *tx; + uint64_t originobj; +}; + +/* ARGSUSED */ +static int +enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + struct enqueue_clones_arg *eca = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp; + + err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); + if (err) + return (err); + dp = ds->ds_dir->dd_pool; + + if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { + while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + ds = prev; + } + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, eca->tx) == 0); + } + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) +{ + dsl_dataset_t *ds; + uint64_t min_txg_save; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + + /* + * Iterate over the bps in this ds. + */ + min_txg_save = dp->dp_scrub_min_txg; + dp->dp_scrub_min_txg = + MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg); + scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp); + dp->dp_scrub_min_txg = min_txg_save; + + if (dp->dp_scrub_pausing) + goto out; + + /* + * Add descendent datasets to work queue. 
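
The work queue referred to below is a ZAP object holding dataset object numbers, which gives it set semantics: membership is unique and removal by key is cheap. A user-space stand-in for that behavior (invented structure; a duplicate enqueue is simply ignored here, where the kernel code VERIFYs it cannot happen):

	/* Illustrative stand-in only, not the kernel interfaces. */
	#include <stdint.h>
	#include <stdlib.h>

	struct dsqueue {
		uint64_t *obj;
		int n, cap;
	};

	static void
	enqueue(struct dsqueue *q, uint64_t dsobj)
	{
		int i;

		for (i = 0; i < q->n; i++)
			if (q->obj[i] == dsobj)	/* set semantics: no duplicates */
				return;
		if (q->n == q->cap) {
			q->cap = q->cap ? q->cap * 2 : 8;
			q->obj = realloc(q->obj, q->cap * sizeof (uint64_t));
		}
		q->obj[q->n++] = dsobj;
	}
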
+ */ + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_phys->ds_next_snap_obj, tx) == 0); + } + if (ds->ds_phys->ds_num_children > 1) { + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + struct enqueue_clones_arg eca; + eca.tx = tx; + eca.originobj = ds->ds_object; + + (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, + NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); + } else { + VERIFY(zap_join(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, + dp->dp_scrub_queue_obj, tx) == 0); + } + } + +out: + dsl_dataset_rele(ds, FTAG); +} + +/* ARGSUSED */ +static int +enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp; + + err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); + if (err) + return (err); + + dp = ds->ds_dir->dd_pool; + + while (ds->ds_phys->ds_prev_snap_obj != 0) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + /* + * If this is a clone, we don't need to worry about it for now. + */ + if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(prev, FTAG); + return (0); + } + dsl_dataset_rele(ds, FTAG); + ds = prev; + } + + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, tx) == 0); + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + boolean_t complete = B_TRUE; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + /* If the spa is not fully loaded, don't bother. */ + if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE) + return; + + if (dp->dp_scrub_restart) { + enum scrub_func func = dp->dp_scrub_func; + dp->dp_scrub_restart = B_FALSE; + dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); + } + + if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { + /* + * We must have resumed after rebooting; reset the vdev + * stats to know that we're doing a scrub (although it + * will think we're just starting now). + */ + vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, + dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : + POOL_SCRUB_EVERYTHING, B_FALSE); + } + + dp->dp_scrub_pausing = B_FALSE; + dp->dp_scrub_start_time = lbolt64; + dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); + dp->dp_spa->spa_scrub_active = B_TRUE; + + if (dp->dp_scrub_bookmark.zb_objset == 0) { + /* First do the MOS & ORIGIN */ + scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp); + if (dp->dp_scrub_pausing) + goto out; + + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, + NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); + } else { + scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); + } + ASSERT(!dp->dp_scrub_pausing); + } else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) { + /* + * If we were paused, continue from here. Note if the + * ds we were paused on was deleted, the zb_objset will + * be -1, so we will skip this and find a new objset + * below. + */ + scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx); + if (dp->dp_scrub_pausing) + goto out; + } + + /* + * In case we were paused right at the end of the ds, zero the + * bookmark so we don't think that we're still trying to resume. 
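scrub_visitds() and the dsl_pool_ds_destroyed()/dsl_pool_ds_snapshotted() hooks above all treat dp_scrub_queue_obj as a persistent set of dataset object numbers: successors and clones are enqueued with zap_add_int(), and entries for renamed or deleted datasets are dropped with zap_remove_int(). A toy in-memory model of that queue discipline (hypothetical names; no ZAP or transactions involved):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct qent {
	uint64_t obj;
	struct qent *next;
};

static struct qent *queue;

/* Model of zap_add_int(): put a dataset object number in the set. */
static void
queue_add(uint64_t obj)
{
	struct qent *e = malloc(sizeof (*e));

	if (e == NULL)
		abort();
	e->obj = obj;
	e->next = queue;
	queue = e;
}

/* Model of the zap_cursor loop: pull entries until the set is empty. */
static int
queue_take(uint64_t *objp)
{
	struct qent *e = queue;

	if (e == NULL)
		return (0);
	queue = e->next;
	*objp = e->obj;
	free(e);
	return (1);
}

int
main(void)
{
	uint64_t obj;

	queue_add(50);		/* e.g. a ds_next_snap_obj */
	queue_add(73);		/* e.g. a clone's object number */
	while (queue_take(&obj))
		printf("visit dataset %llu\n", (unsigned long long)obj);
	return (0);
}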
+ */ + bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + + /* keep pulling things out of the zap-object-as-queue */ + while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj), + zap_cursor_retrieve(&zc, &za) == 0) { + VERIFY(0 == zap_remove(dp->dp_meta_objset, + dp->dp_scrub_queue_obj, za.za_name, tx)); + scrub_visitds(dp, za.za_first_integer, tx); + if (dp->dp_scrub_pausing) + break; + zap_cursor_fini(&zc); + } + zap_cursor_fini(&zc); + if (dp->dp_scrub_pausing) + goto out; + + /* done. */ + + dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); + return; +out: + VERIFY(0 == zap_update(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + &dp->dp_scrub_bookmark, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, + &dp->dp_spa->spa_scrub_errors, tx)); + + /* XXX this is scrub-clean specific */ + mutex_enter(&dp->dp_spa->spa_scrub_lock); + while (dp->dp_spa->spa_scrub_inflight > 0) { + cv_wait(&dp->dp_spa->spa_scrub_io_cv, + &dp->dp_spa->spa_scrub_lock); + } + mutex_exit(&dp->dp_spa->spa_scrub_lock); +} + +void +dsl_pool_scrub_restart(dsl_pool_t *dp) +{ + mutex_enter(&dp->dp_scrub_cancel_lock); + dp->dp_scrub_restart = B_TRUE; + mutex_exit(&dp->dp_scrub_cancel_lock); +} + +/* + * scrub consumers + */ + +static void +dsl_pool_scrub_clean_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) + spa->spa_scrub_errors++; + mutex_exit(&spa->spa_scrub_lock); +} + +static int +dsl_pool_scrub_clean_cb(dsl_pool_t *dp, + const blkptr_t *bp, const zbookmark_t *zb) +{ + size_t size = BP_GET_LSIZE(bp); + int d; + spa_t *spa = dp->dp_spa; + boolean_t needs_io; + int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; + int zio_priority; + + if (dp->dp_scrub_isresilver == 0) { + /* It's a scrub */ + zio_flags |= ZIO_FLAG_SCRUB; + zio_priority = ZIO_PRIORITY_SCRUB; + needs_io = B_TRUE; + } else { + /* It's a resilver */ + zio_flags |= ZIO_FLAG_RESILVER; + zio_priority = ZIO_PRIORITY_RESILVER; + needs_io = B_FALSE; + } + + /* If it's an intent log block, failure is expected. */ + if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + for (d = 0; d < BP_GET_NDVAS(bp); d++) { + vdev_t *vd = vdev_lookup_top(spa, + DVA_GET_VDEV(&bp->blk_dva[d])); + + /* + * Keep track of how much data we've examined so that + * zpool(1M) status can make useful progress reports. + */ + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_scrub_examined += + DVA_GET_ASIZE(&bp->blk_dva[d]); + mutex_exit(&vd->vdev_stat_lock); + + /* if it's a resilver, this may not be in the target range */ + if (!needs_io) { + if (DVA_GET_GANG(&bp->blk_dva[d])) { + /* + * Gang members may be spread across multiple + * vdevs, so the best we can do is look at the + * pool-wide DTL. + * XXX -- it would be better to change our + * allocation policy to ensure that this can't + * happen. 
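In the out: path above, the in-core bookmark is persisted into the MOS pool directory as four 64-bit integers, which is why zap_update() is passed an integer size of sizeof (uint64_t) and a count of 4. A sketch of that flattening, assuming the bookmark really is four consecutive 64-bit words:

#include <stdint.h>
#include <string.h>
#include <assert.h>

struct bookmark {
	uint64_t objset, object, level, blkid;
};

int
main(void)
{
	struct bookmark in = { 21, 100, 0, 7 }, out;
	uint64_t words[4];	/* what zap_update() would store */

	/* Flatten: integer count 4, integer size sizeof (uint64_t). */
	memcpy(words, &in, sizeof (words));

	/* Reload on resume and check the round trip. */
	memcpy(&out, words, sizeof (out));
	assert(memcmp(&in, &out, sizeof (in)) == 0);
	return (0);
}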
+ */ + vd = spa->spa_root_vdev; + } + needs_io = vdev_dtl_contains(&vd->vdev_dtl_map, + bp->blk_birth, 1); + } + } + + if (needs_io && !zfs_no_scrub_io) { + void *data = zio_data_buf_alloc(size); + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(NULL, spa, bp, data, size, + dsl_pool_scrub_clean_done, NULL, zio_priority, + zio_flags, zb)); + } + + /* do not relocate this block */ + return (0); +} + +int +dsl_pool_scrub_clean(dsl_pool_t *dp) +{ + /* + * Purge all vdev caches. We do this here rather than in sync + * context because this requires a writer lock on the spa_config + * lock, which we can't do from sync context. The + * spa_scrub_reopen flag indicates that vdev_open() should not + * attempt to start another scrub. + */ + spa_config_enter(dp->dp_spa, SCL_ALL, FTAG, RW_WRITER); + dp->dp_spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(dp->dp_spa->spa_root_vdev); + dp->dp_spa->spa_scrub_reopen = B_FALSE; + spa_config_exit(dp->dp_spa, SCL_ALL, FTAG); + + return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c index 17deb569c4ab..21100225abf7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,6 +30,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_dir.h> #include <sys/dsl_synctask.h> +#include <sys/cred.h> #define DST_AVG_BLKSHIFT 14 @@ -49,6 +50,7 @@ dsl_sync_task_group_create(dsl_pool_t *dp) list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), offsetof(dsl_sync_task_t, dst_node)); dstg->dstg_pool = dp; + dstg->dstg_cr = CRED(); return (dstg); } @@ -123,6 +125,16 @@ top: } void +dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) +{ + uint64_t txg; + + dstg->dstg_nowaiter = B_TRUE; + txg = dmu_tx_get_txg(tx); + VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg)); +} + +void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg) { dsl_sync_task_t *dst; @@ -146,7 +158,7 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) * Check for sufficient space. 
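dsl_pool_scrub_clean_cb() bounds the number of scrub reads in flight with a counter under spa_scrub_lock: the issue path sleeps while spa_scrub_inflight is at spa_scrub_maxinflight, dsl_pool_scrub_clean_done() decrements and broadcasts, and the pause path drains by waiting for the counter to hit zero. A compact pthread model of the same throttle (all names hypothetical):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int inflight;
static const int maxinflight = 4;

/* Issue side: block while too many requests are outstanding. */
static void
issue_begin(void)
{
	pthread_mutex_lock(&lock);
	while (inflight >= maxinflight)
		pthread_cond_wait(&cv, &lock);
	inflight++;
	pthread_mutex_unlock(&lock);
}

/* Completion side: model of dsl_pool_scrub_clean_done(). */
static void
issue_done(void)
{
	pthread_mutex_lock(&lock);
	inflight--;
	pthread_cond_broadcast(&cv);
	pthread_mutex_unlock(&lock);
}

/* Pause path: drain everything before saving the bookmark. */
static void
drain(void)
{
	pthread_mutex_lock(&lock);
	while (inflight > 0)
		pthread_cond_wait(&cv, &lock);
	pthread_mutex_unlock(&lock);
}

static void *
worker(void *arg)
{
	(void) arg;
	issue_done();		/* pretend the read completed */
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	issue_begin();
	pthread_create(&t, NULL, worker, NULL);
	drain();		/* returns once the worker finishes */
	pthread_join(t, NULL);
	printf("drained, inflight=%d\n", inflight);
	return (0);
}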
*/ dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir, - dstg->dstg_space, dstg->dstg_space * 3, 0, &tr_cookie, tx); + dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx); /* don't bother trying again */ if (dstg->dstg_err == ERESTART) dstg->dstg_err = EAGAIN; @@ -171,12 +183,16 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) */ for (dst = list_head(&dstg->dstg_tasks); dst; dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx); + dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, + dstg->dstg_cr, tx); } } rw_exit(&dstg->dstg_pool->dp_config_rwlock); dsl_dir_tempreserve_clear(tr_cookie, tx); + + if (dstg->dstg_nowaiter) + dsl_sync_task_group_destroy(dstg); } int @@ -194,3 +210,16 @@ dsl_sync_task_do(dsl_pool_t *dp, dsl_sync_task_group_destroy(dstg); return (err); } + +void +dsl_sync_task_do_nowait(dsl_pool_t *dp, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx) +{ + dsl_sync_task_group_t *dstg; + + dstg = dsl_sync_task_group_create(dp); + dsl_sync_task_create(dstg, checkfunc, syncfunc, + arg1, arg2, blocks_modified); + dsl_sync_task_group_nowait(dstg, tx); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index 0dba134cef9b..22b56d617799 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa_impl.h> #include <sys/dmu.h> @@ -35,6 +33,7 @@ #include <sys/zio.h> uint64_t metaslab_aliquot = 512ULL << 10; +uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* * ========================================================================== @@ -341,7 +340,7 @@ metaslab_fini(metaslab_t *msp) int t; vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, - -msp->ms_smo.smo_alloc); + -msp->ms_smo.smo_alloc, B_TRUE); metaslab_group_remove(mg, msp); @@ -534,8 +533,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(smo, db->db_data, db->db_size); + ASSERT3U(db->db_size, >=, sizeof (*smo)); + bcopy(smo, db->db_data, sizeof (*smo)); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); @@ -569,10 +568,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) space_map_create(&msp->ms_freemap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); } - vdev_space_update(vd, sm->sm_size, 0); + vdev_space_update(vd, sm->sm_size, 0, B_TRUE); } - vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc); + vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE); ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); @@ -714,11 +713,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, * Allocate a block for the specified i/o. 
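dsl_sync_task_do_nowait() above queues a task group for the syncing context and returns without waiting; the dstg_nowaiter flag makes dsl_sync_task_group_sync() destroy the group itself after running it, since no caller remains to reap it. A userland model of that ownership hand-off (hypothetical types, not the DSL interfaces):

#include <stdlib.h>
#include <stdio.h>

typedef void task_func_t(void *);

struct task_group {
	task_func_t *func;
	void *arg;
	int nowaiter;		/* nobody will wait; sync side frees */
};

/* Model of the syncing context running a queued group. */
static void
group_sync(struct task_group *g)
{
	g->func(g->arg);
	if (g->nowaiter)	/* fire-and-forget: reap ourselves */
		free(g);
}

static void
hello(void *arg)
{
	printf("sync task ran: %s\n", (const char *)arg);
}

int
main(void)
{
	struct task_group *g = malloc(sizeof (*g));

	if (g == NULL)
		abort();
	g->func = hello;
	g->arg = "nowait";
	g->nowaiter = 1;	/* caller returns without joining */
	group_sync(g);		/* later, in "sync context" */
	return (0);
}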
*/ static int -metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, - dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid) +metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) { metaslab_group_t *mg, *rotor; - metaslab_class_t *mc; vdev_t *vd; int dshift = 3; int all_zero; @@ -728,7 +726,11 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, ASSERT(!DVA_IS_VALID(&dva[d])); - mc = spa_metaslab_class_select(spa); + /* + * For testing, make some blocks above a certain size be gang blocks. + */ + if (psize >= metaslab_gang_bang && (LBOLT & 3) == 0) + return (ENOSPC); /* * Start at the rotor and loop through all mgs until we find something. @@ -754,7 +756,7 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, */ if (hintdva) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); - if (hintdva_avoid) + if (flags & METASLAB_HINTBP_AVOID) mg = vd->vdev_mg->mg_next; else mg = vd->vdev_mg; @@ -764,12 +766,34 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, } else { mg = mc->mc_rotor; } - rotor = mg; + /* + * If the hint put us into the wrong class, just follow the rotor. + */ + if (mg->mg_class != mc) + mg = mc->mc_rotor; + + rotor = mg; top: all_zero = B_TRUE; do { vd = mg->mg_vd; + /* + * Don't allocate from faulted devices. + */ + if (!vdev_writeable(vd)) + goto next; + /* + * Avoid writing single-copy data to a failing vdev + */ + if ((vd->vdev_stat.vs_write_errors > 0 || + vd->vdev_state < VDEV_STATE_HEALTHY) && + d == 0 && dshift == 3) { + all_zero = B_FALSE; + goto next; + } + + ASSERT(mg->mg_class == mc); distance = vd->vdev_asize >> dshift; if (distance <= (1ULL << vd->vdev_ms_shift)) @@ -818,11 +842,12 @@ top: DVA_SET_VDEV(&dva[d], vd->vdev_id); DVA_SET_OFFSET(&dva[d], offset); - DVA_SET_GANG(&dva[d], 0); + DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); DVA_SET_ASIZE(&dva[d], asize); return (0); } +next: mc->mc_rotor = mg->mg_next; mc->mc_allocated = 0; } while ((mg = mg->mg_next) != rotor); @@ -879,38 +904,6 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); - - /* - * verify that this region is actually allocated in - * either a ms_allocmap or the ms_map - */ - if (msp->ms_map.sm_loaded) { - boolean_t allocd = B_FALSE; - int i; - - if (!space_map_contains(&msp->ms_map, offset, size)) { - allocd = B_TRUE; - } else { - for (i = 0; i < TXG_CONCURRENT_STATES; i++) { - space_map_t *sm = &msp->ms_allocmap - [(txg - i) & TXG_MASK]; - if (space_map_contains(sm, - offset, size)) { - allocd = B_TRUE; - break; - } - } - } - - if (!allocd) { - zfs_panic_recover("freeing free segment " - "(vdev=%llu offset=%llx size=%llx)", - (longlong_t)vdev, (longlong_t)offset, - (longlong_t)size); - } - } - - } mutex_exit(&msp->ms_lock); @@ -946,16 +939,18 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); - if (error) { + if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); } - if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) - vdev_dirty(vd, VDD_METASLAB, msp, txg); - space_map_claim(&msp->ms_map, offset, size); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + + if (spa_mode & FWRITE) { /* don't dirty if 
we're zdb(1M) */ + if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + vdev_dirty(vd, VDD_METASLAB, msp, txg); + space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + } mutex_exit(&msp->ms_lock); @@ -963,32 +958,45 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) } int -metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas, - uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid) +metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, + int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; - int d; int error = 0; + ASSERT(bp->blk_birth == 0); + + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); + + if (mc->mc_rotor == NULL) { /* no vdevs in this class */ + spa_config_exit(spa, SCL_ALLOC, FTAG); + return (ENOSPC); + } + ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); - for (d = 0; d < ndvas; d++) { - error = metaslab_alloc_dva(spa, psize, dva, d, hintdva, - txg, hintbp_avoid); + for (int d = 0; d < ndvas; d++) { + error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, + txg, flags); if (error) { for (d--; d >= 0; d--) { metaslab_free_dva(spa, &dva[d], txg, B_TRUE); bzero(&dva[d], sizeof (dva_t)); } + spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); } } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); + spa_config_exit(spa, SCL_ALLOC, FTAG); + + bp->blk_birth = txg; + return (0); } @@ -997,12 +1005,16 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); - int d; ASSERT(!BP_IS_HOLE(bp)); + ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg); + + spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); - for (d = 0; d < ndvas; d++) + for (int d = 0; d < ndvas; d++) metaslab_free_dva(spa, &dva[d], txg, now); + + spa_config_exit(spa, SCL_FREE, FTAG); } int @@ -1010,14 +1022,28 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); - int d, error; - int last_error = 0; + int error = 0; ASSERT(!BP_IS_HOLE(bp)); - for (d = 0; d < ndvas; d++) + if (txg != 0) { + /* + * First do a dry run to make sure all DVAs are claimable, + * so we don't have to unwind from partial failures below. + */ + if ((error = metaslab_claim(spa, bp, 0)) != 0) + return (error); + } + + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); + + for (int d = 0; d < ndvas; d++) if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) - last_error = error; + break; + + spa_config_exit(spa, SCL_ALLOC, FTAG); + + ASSERT(error == 0 || txg == 0); - return (last_error); + return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c index a2f4614fed87..5fe4e638055a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -61,11 +60,13 @@ refcount_fini(void) void refcount_create(refcount_t *rc) { + mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&rc->rc_list, sizeof (reference_t), offsetof(reference_t, ref_link)); list_create(&rc->rc_removed, sizeof (reference_t), offsetof(reference_t, ref_link)); - mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); + rc->rc_count = 0; + rc->rc_removed_count = 0; } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c new file mode 100644 index 000000000000..db3b70fc68b0 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c @@ -0,0 +1,249 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/refcount.h> +#include <sys/rrwlock.h> + +/* + * This file contains the implementation of a re-entrant read + * reader/writer lock (aka "rrwlock"). + * + * This is a normal reader/writer lock with the additional feature + * of allowing threads that have already obtained a read lock to + * take another read lock (a re-entrant read), even if there are + * waiting writers. + * + * Callers that do not already hold a read lock yield priority to + * waiting writers. + * + * The rrwlock_t lock does not allow re-entrant writers, nor does it + * allow a re-entrant mix of reads and writes (that is, it does not + * allow a caller that has already obtained a read lock to be able to + * then grab a write lock without first dropping all read locks, and + * vice versa). + * + * The rrwlock_t uses tsd (thread specific data) to keep a list of + * nodes (rrw_node_t), where each node keeps track of which specific + * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering + * should be rare, a thread that grabs multiple reads on the same rrwlock_t + * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the + * tsd list may represent different rrwlock_ts, which allows a thread + * to hold read locks on multiple distinct rrwlock_ts at the same time. + * + * Since using tsd incurs some overhead, the rrwlock_t only needs to + * keep tsd data when writers are waiting. If no writers are waiting, then + * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd + * is needed. 
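The tsd list described here is what lets a reader decide cheaply whether it already holds this particular lock: rrn_find() walks a per-thread list of nodes, one per held read. A minimal model of that membership test using a C thread-local list (illustrative only; the kernel keys the list off rrw_tsd_key with tsd_get()/tsd_set()):

#include <stdio.h>
#include <stdlib.h>

struct rrwlock { int unused; };	/* opaque stand-in for rrwlock_t */

struct node {
	struct rrwlock *lock;
	struct node *next;
};

/* One list head per thread, like the tsd slot keyed by rrw_tsd_key. */
static _Thread_local struct node *held;

/* Model of rrn_add(): record that this thread took a read on 'l'. */
static void
note_read_hold(struct rrwlock *l)
{
	struct node *n = malloc(sizeof (*n));

	if (n == NULL)
		abort();
	n->lock = l;
	n->next = held;
	held = n;
}

/* Model of rrn_find(): does this thread already hold this lock? */
static int
holds(struct rrwlock *l)
{
	for (struct node *n = held; n != NULL; n = n->next)
		if (n->lock == l)
			return (1);
	return (0);
}

int
main(void)
{
	static struct rrwlock a, b;

	note_read_hold(&a);
	printf("holds a: %d, holds b: %d\n", holds(&a), holds(&b));
	return (0);
}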
Once a writer attempts to grab the lock, readers then + * keep tsd data and bump the linked readers count (rr_linked_rcount). + * + * If there are waiting writers and there are anonymous readers, then a + * reader cannot tell whether its request is re-entrant. But since it may + * be, we allow the read to proceed (otherwise it could deadlock). Once + * waiting writers are active, readers no longer bump the anonymous count, + * so the anonymous readers will eventually flush themselves out. At that + * point, a reader can tell whether it holds a re-entrant read lock (it + * has a rrw_node_t entry for the lock) or not. If it does, we must let + * it proceed; if it does not, the reader blocks behind the waiting + * writers. Hence, we do not starve writers. + */ + +/* global key for TSD */ +uint_t rrw_tsd_key; + +typedef struct rrw_node { + struct rrw_node *rn_next; + rrwlock_t *rn_rrl; +} rrw_node_t; + +static rrw_node_t * +rrn_find(rrwlock_t *rrl) +{ + rrw_node_t *rn; + + if (refcount_count(&rrl->rr_linked_rcount) == 0) + return (NULL); + + for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { + if (rn->rn_rrl == rrl) + return (rn); + } + return (NULL); +} + +/* + * Add a node to the head of the singly linked list. + */ +static void +rrn_add(rrwlock_t *rrl) +{ + rrw_node_t *rn; + + rn = kmem_alloc(sizeof (*rn), KM_SLEEP); + rn->rn_rrl = rrl; + rn->rn_next = tsd_get(rrw_tsd_key); + VERIFY(tsd_set(rrw_tsd_key, rn) == 0); +} + +/* + * If a node is found for 'rrl', then remove the node from this + * thread's list and return TRUE; otherwise return FALSE. + */ +static boolean_t +rrn_find_and_remove(rrwlock_t *rrl) +{ + rrw_node_t *rn; + rrw_node_t *prev = NULL; + + if (refcount_count(&rrl->rr_linked_rcount) == 0) + return (B_FALSE); + + for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { + if (rn->rn_rrl == rrl) { + if (prev) + prev->rn_next = rn->rn_next; + else + VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); + kmem_free(rn, sizeof (*rn)); + return (B_TRUE); + } + prev = rn; + } + return (B_FALSE); +} + +void +rrw_init(rrwlock_t *rrl) +{ + mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); + rrl->rr_writer = NULL; + refcount_create(&rrl->rr_anon_rcount); + refcount_create(&rrl->rr_linked_rcount); + rrl->rr_writer_wanted = B_FALSE; +} + +void +rrw_destroy(rrwlock_t *rrl) +{ + mutex_destroy(&rrl->rr_lock); + cv_destroy(&rrl->rr_cv); + ASSERT(rrl->rr_writer == NULL); + refcount_destroy(&rrl->rr_anon_rcount); + refcount_destroy(&rrl->rr_linked_rcount); +} + +static void +rrw_enter_read(rrwlock_t *rrl, void *tag) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(rrl->rr_writer != curthread); + ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); + + while (rrl->rr_writer || (rrl->rr_writer_wanted && + refcount_is_zero(&rrl->rr_anon_rcount) && + rrn_find(rrl) == NULL)) + cv_wait(&rrl->rr_cv, &rrl->rr_lock); + + if (rrl->rr_writer_wanted) { + /* may or may not be a re-entrant enter */ + rrn_add(rrl); + (void) refcount_add(&rrl->rr_linked_rcount, tag); + } else { + (void) refcount_add(&rrl->rr_anon_rcount, tag); + } + ASSERT(rrl->rr_writer == NULL); + mutex_exit(&rrl->rr_lock); +} + +static void +rrw_enter_write(rrwlock_t *rrl) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(rrl->rr_writer != curthread); + + while (refcount_count(&rrl->rr_anon_rcount) > 0 || + refcount_count(&rrl->rr_linked_rcount) > 0 || + rrl->rr_writer != NULL) { + rrl->rr_writer_wanted = B_TRUE; + cv_wait(&rrl->rr_cv, &rrl->rr_lock); + } + 
rrl->rr_writer_wanted = B_FALSE; + rrl->rr_writer = curthread; + mutex_exit(&rrl->rr_lock); +} + +void +rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) +{ + if (rw == RW_READER) + rrw_enter_read(rrl, tag); + else + rrw_enter_write(rrl); +} + +void +rrw_exit(rrwlock_t *rrl, void *tag) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || + !refcount_is_zero(&rrl->rr_linked_rcount) || + rrl->rr_writer != NULL); + + if (rrl->rr_writer == NULL) { + if (rrn_find_and_remove(rrl)) { + if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0) + cv_broadcast(&rrl->rr_cv); + + } else { + if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0) + cv_broadcast(&rrl->rr_cv); + } + } else { + ASSERT(rrl->rr_writer == curthread); + ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && + refcount_is_zero(&rrl->rr_linked_rcount)); + rrl->rr_writer = NULL; + cv_broadcast(&rrl->rr_cv); + } + mutex_exit(&rrl->rr_lock); +} + +boolean_t +rrw_held(rrwlock_t *rrl, krw_t rw) +{ + boolean_t held; + + mutex_enter(&rrl->rr_lock); + if (rw == RW_WRITER) { + held = (rrl->rr_writer == curthread); + } else { + held = (!refcount_is_zero(&rrl->rr_anon_rcount) || + !refcount_is_zero(&rrl->rr_linked_rcount)); + } + mutex_exit(&rrl->rr_lock); + + return (held); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c index ce5c26131af5..ca7076cb6fd9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,20 +30,20 @@ #include <sys/zio_checksum.h> /* - * SHA-256 checksum, as specified in FIPS 180-2, available at: - * http://csrc.nist.gov/cryptval + * SHA-256 checksum, as specified in FIPS 180-3, available at: + * http://csrc.nist.gov/publications/PubsFIPS.html * * This is a very compact implementation of SHA-256. * It is designed to be simple and portable, not to be fast. */ /* - * The literal definitions according to FIPS180-2 would be: + * The literal definitions of Ch() and Maj() according to FIPS 180-3 are: * - * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) - * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) + * Ch(x, y, z) (x & y) ^ (~x & z) + * Maj(x, y, z) (x & y) ^ (x & z) ^ (y & z) * - * We use logical equivalents which require one less op. + * We use equivalent logical reductions here that require one less op. 
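The reduced forms of Ch() and Maj() are pure bitwise functions, so their equivalence to the FIPS definitions can be checked exhaustively over single bits. A small verification program (the macro names are local to this sketch):

#include <stdint.h>
#include <assert.h>
#include <stdio.h>

/* Literal FIPS 180-3 forms. */
#define CH_REF(x, y, z)  (((x) & (y)) ^ (~(x) & (z)))
#define MAJ_REF(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

/* Reduced forms used by the checksum code (one op fewer). */
#define CH(x, y, z)  ((z) ^ ((x) & ((y) ^ (z))))
#define MAJ(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y))))

int
main(void)
{
	/*
	 * The functions operate bitwise, so checking every (x, y, z)
	 * pattern in {0,1}^3 proves equivalence for all word widths.
	 */
	for (uint32_t x = 0; x <= 1; x++)
		for (uint32_t y = 0; y <= 1; y++)
			for (uint32_t z = 0; z <= 1; z++) {
				assert(CH(x, y, z) == CH_REF(x, y, z));
				assert(MAJ(x, y, z) == MAJ_REF(x, y, z));
			}
	printf("Ch/Maj reductions verified\n");
	return (0);
}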
*/ #define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) #define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y)))) @@ -105,20 +104,19 @@ zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp) uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; uint8_t pad[128]; - int padsize = size & 63; - int i; + int i, padsize; - for (i = 0; i < size - padsize; i += 64) + for (i = 0; i < (size & ~63ULL); i += 64) SHA256Transform(H, (uint8_t *)buf + i); - for (i = 0; i < padsize; i++) - pad[i] = ((uint8_t *)buf)[i]; + for (padsize = 0; i < size; i++) + pad[padsize++] = *((uint8_t *)buf + i); for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++) pad[padsize] = 0; - for (i = 0; i < 8; i++) - pad[padsize++] = (size << 3) >> (56 - 8 * i); + for (i = 56; i >= 0; i -= 8) + pad[padsize++] = (size << 3) >> i; for (i = 0; i < padsize; i += 64) SHA256Transform(H, pad + i); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 6a7c525ae991..163b21572247 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a @@ -56,16 +54,388 @@ #include <sys/dsl_prop.h> #include <sys/dsl_synctask.h> #include <sys/fs/zfs.h> +#include <sys/arc.h> #include <sys/callb.h> #include <sys/sunddi.h> +#include <sys/spa_boot.h> + +#include "zfs_prop.h" +#include "zfs_comutil.h" -int zio_taskq_threads = 0; -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); -TUNABLE_INT("vfs.zfs.zio.taskq_threads", &zio_taskq_threads); -SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_threads, CTLFLAG_RW, - &zio_taskq_threads, 0, "Number of ZIO threads per ZIO type"); +int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = { + /* ISSUE INTR */ + { 1, 1 }, /* ZIO_TYPE_NULL */ + { 1, 8 }, /* ZIO_TYPE_READ */ + { 8, 1 }, /* ZIO_TYPE_WRITE */ + { 1, 1 }, /* ZIO_TYPE_FREE */ + { 1, 1 }, /* ZIO_TYPE_CLAIM */ + { 1, 1 }, /* ZIO_TYPE_IOCTL */ +}; +static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); +static boolean_t spa_has_active_shared_spare(spa_t *spa); + +/* + * ========================================================================== + * SPA properties routines + * ========================================================================== + */ + +/* + * Add a (source=src, propname=propval) list to an nvlist. + */ +static void +spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, + uint64_t intval, zprop_source_t src) +{ + const char *propname = zpool_prop_to_name(prop); + nvlist_t *propval; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); + + if (strval != NULL) + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); + else + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); + + VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); + nvlist_free(propval); +} + +/* + * Get property values from the spa configuration. 
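The rewritten padding loop above copies the partial tail block, appends the 0x80 terminator, zero-fills until the length is 56 mod 64, and then emits the message length in bits as eight big-endian bytes ((size << 3) >> i for i = 56 down to 0). A standalone sketch of just that padding construction:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const char msg[] = "abc";	/* 3-byte example message */
	uint64_t size = sizeof (msg) - 1;
	uint8_t pad[128];
	int i, padsize;

	/* Copy the tail that didn't fill a 64-byte block. */
	for (padsize = 0, i = 0; i < (int)size; i++)
		pad[padsize++] = (uint8_t)msg[i];

	/* 0x80 terminator, then zero-fill to 56 (mod 64). */
	for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
		pad[padsize] = 0;

	/* Bit length as eight big-endian bytes. */
	for (i = 56; i >= 0; i -= 8)
		pad[padsize++] = (uint8_t)((size << 3) >> i);

	printf("padded length: %d bytes (one block)\n", padsize);	/* 64 */
	return (0);
}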
+ */ +static void +spa_prop_get_config(spa_t *spa, nvlist_t **nvp) +{ + uint64_t size = spa_get_space(spa); + uint64_t used = spa_get_alloc(spa); + uint64_t cap, version; + zprop_source_t src = ZPROP_SRC_NONE; + spa_config_dirent_t *dp; + + ASSERT(MUTEX_HELD(&spa->spa_props_lock)); + + /* + * readonly properties + */ + spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src); + + cap = (size == 0) ? 0 : (used * 100 / size); + spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, + spa->spa_root_vdev->vdev_state, src); + + /* + * settable properties that are not stored in the pool property object. + */ + version = spa_version(spa); + if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); + + if (spa->spa_root != NULL) + spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, + 0, ZPROP_SRC_LOCAL); + + if ((dp = list_head(&spa->spa_config_list)) != NULL) { + if (dp->scd_path == NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + "none", 0, ZPROP_SRC_LOCAL); + } else if (strcmp(dp->scd_path, spa_config_path) != 0) { + spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + dp->scd_path, 0, ZPROP_SRC_LOCAL); + } + } +} + +/* + * Get zpool property values. + */ +int +spa_prop_get(spa_t *spa, nvlist_t **nvp) +{ + zap_cursor_t zc; + zap_attribute_t za; + objset_t *mos = spa->spa_meta_objset; + int err; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + mutex_enter(&spa->spa_props_lock); + + /* + * Get properties from the spa config. + */ + spa_prop_get_config(spa, nvp); + + /* If no pool property object, no more prop to get. */ + if (spa->spa_pool_props_object == 0) { + mutex_exit(&spa->spa_props_lock); + return (0); + } + + /* + * Get properties from the MOS pool property object. 
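spa_prop_add_list() wraps each property in its own nested nvlist carrying a source tag and either a string or an integer value, keyed under the property name in the caller's outer list. A userland sketch of the same shape (assumes libnvpair, as shipped with illumos/FreeBSD, is available; the literal "source"/"value" keys are stand-ins for the kernel's ZPROP_SOURCE/ZPROP_VALUE):

#include <stdio.h>
#include <assert.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *nvl, *propval;

	assert(nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) == 0);
	assert(nvlist_alloc(&propval, NV_UNIQUE_NAME, 0) == 0);

	/* One (source, value) pair per property... */
	assert(nvlist_add_uint64(propval, "source", 0) == 0);
	assert(nvlist_add_uint64(propval, "value", 12345) == 0);

	/* ...nested under the property's name in the outer list. */
	assert(nvlist_add_nvlist(nvl, "size", propval) == 0);
	nvlist_free(propval);	/* the outer list holds its own copy */

	nvlist_print(stdout, nvl);
	nvlist_free(nvl);
	return (0);
}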
+ */ + for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t intval = 0; + char *strval = NULL; + zprop_source_t src = ZPROP_SRC_DEFAULT; + zpool_prop_t prop; + + if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) + continue; + + switch (za.za_integer_length) { + case 8: + /* integer property */ + if (za.za_first_integer != + zpool_prop_default_numeric(prop)) + src = ZPROP_SRC_LOCAL; + + if (prop == ZPOOL_PROP_BOOTFS) { + dsl_pool_t *dp; + dsl_dataset_t *ds = NULL; + + dp = spa_get_dsl(spa); + rw_enter(&dp->dp_config_rwlock, RW_READER); + if (err = dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &ds)) { + rw_exit(&dp->dp_config_rwlock); + break; + } + + strval = kmem_alloc( + MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, + KM_SLEEP); + dsl_dataset_name(ds, strval); + dsl_dataset_rele(ds, FTAG); + rw_exit(&dp->dp_config_rwlock); + } else { + strval = NULL; + intval = za.za_first_integer; + } + + spa_prop_add_list(*nvp, prop, strval, intval, src); + + if (strval != NULL) + kmem_free(strval, + MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); + + break; + + case 1: + /* string property */ + strval = kmem_alloc(za.za_num_integers, KM_SLEEP); + err = zap_lookup(mos, spa->spa_pool_props_object, + za.za_name, 1, za.za_num_integers, strval); + if (err) { + kmem_free(strval, za.za_num_integers); + break; + } + spa_prop_add_list(*nvp, prop, strval, 0, src); + kmem_free(strval, za.za_num_integers); + break; + + default: + break; + } + } + zap_cursor_fini(&zc); + mutex_exit(&spa->spa_props_lock); +out: + if (err && err != ENOENT) { + nvlist_free(*nvp); + *nvp = NULL; + return (err); + } + + return (0); +} + +/* + * Validate the given pool properties nvlist and modify the list + * for the property values to be set. 
+ */ +static int +spa_prop_validate(spa_t *spa, nvlist_t *props) +{ + nvpair_t *elem; + int error = 0, reset_bootfs = 0; + uint64_t objnum; + + elem = NULL; + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + zpool_prop_t prop; + char *propname, *strval; + uint64_t intval; + objset_t *os; + char *slash; + + propname = nvpair_name(elem); + + if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) + return (EINVAL); + + switch (prop) { + case ZPOOL_PROP_VERSION: + error = nvpair_value_uint64(elem, &intval); + if (!error && + (intval < spa_version(spa) || intval > SPA_VERSION)) + error = EINVAL; + break; + + case ZPOOL_PROP_DELEGATION: + case ZPOOL_PROP_AUTOREPLACE: + case ZPOOL_PROP_LISTSNAPS: + error = nvpair_value_uint64(elem, &intval); + if (!error && intval > 1) + error = EINVAL; + break; + + case ZPOOL_PROP_BOOTFS: + if (spa_version(spa) < SPA_VERSION_BOOTFS) { + error = ENOTSUP; + break; + } + + /* + * Make sure the vdev config is bootable + */ + if (!vdev_is_bootable(spa->spa_root_vdev)) { + error = ENOTSUP; + break; + } + + reset_bootfs = 1; + + error = nvpair_value_string(elem, &strval); + + if (!error) { + uint64_t compress; + + if (strval == NULL || strval[0] == '\0') { + objnum = zpool_prop_default_numeric( + ZPOOL_PROP_BOOTFS); + break; + } + + if (error = dmu_objset_open(strval, DMU_OST_ZFS, + DS_MODE_USER | DS_MODE_READONLY, &os)) + break; + + /* We don't support gzip bootable datasets */ + if ((error = dsl_prop_get_integer(strval, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), + &compress, NULL)) == 0 && + !BOOTFS_COMPRESS_VALID(compress)) { + error = ENOTSUP; + } else { + objnum = dmu_objset_id(os); + } + dmu_objset_close(os); + } + break; + + case ZPOOL_PROP_FAILUREMODE: + error = nvpair_value_uint64(elem, &intval); + if (!error && (intval < ZIO_FAILURE_MODE_WAIT || + intval > ZIO_FAILURE_MODE_PANIC)) + error = EINVAL; + + /* + * This is a special case which only occurs when + * the pool has completely failed. This allows + * the user to change the in-core failmode property + * without syncing it out to disk (I/Os might + * currently be blocked). We do this by returning + * EIO to the caller (spa_prop_set) to trick it + * into thinking we encountered a property validation + * error. + */ + if (!error && spa_suspended(spa)) { + spa->spa_failmode = intval; + error = EIO; + } + break; + + case ZPOOL_PROP_CACHEFILE: + if ((error = nvpair_value_string(elem, &strval)) != 0) + break; + + if (strval[0] == '\0') + break; + + if (strcmp(strval, "none") == 0) + break; + + if (strval[0] != '/') { + error = EINVAL; + break; + } + + slash = strrchr(strval, '/'); + ASSERT(slash != NULL); + + if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || + strcmp(slash, "/..") == 0) + error = EINVAL; + break; + } + + if (error) + break; + } + + if (!error && reset_bootfs) { + error = nvlist_remove(props, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); + + if (!error) { + error = nvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); + } + } + + return (error); +} + +int +spa_prop_set(spa_t *spa, nvlist_t *nvp) +{ + int error; + + if ((error = spa_prop_validate(spa, nvp)) != 0) + return (error); + + return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, + spa, nvp, 3)); +} + +/* + * If the bootfs property value is dsobj, clear it. 
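The ZPOOL_PROP_CACHEFILE case above accepts an empty string or "none" verbatim, and otherwise requires an absolute path whose final component is a real name (not empty, ".", or ".."). A sketch of that validation in isolation:

#include <string.h>
#include <stdio.h>

/*
 * Model of the cachefile checks: empty and "none" are allowed as-is;
 * anything else must be an absolute path with a usable last component.
 */
static int
cachefile_ok(const char *strval)
{
	const char *slash;

	if (strval[0] == '\0' || strcmp(strval, "none") == 0)
		return (1);
	if (strval[0] != '/')
		return (0);
	slash = strrchr(strval, '/');
	if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
	    strcmp(slash, "/..") == 0)
		return (0);
	return (1);
}

int
main(void)
{
	printf("%d %d %d %d\n",
	    cachefile_ok("none"),			/* 1 */
	    cachefile_ok("/etc/zfs/zpool.cache"),	/* 1 */
	    cachefile_ok("relative/path"),		/* 0 */
	    cachefile_ok("/tmp/.."));			/* 0 */
	return (0);
}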
+ */ +void +spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) +{ + if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { + VERIFY(zap_remove(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); + spa->spa_bootfs = 0; + } +} /* * ========================================================================== @@ -117,40 +487,26 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) static void spa_activate(spa_t *spa) { - int t; - int nthreads = zio_taskq_threads; - char name[32]; ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_normal_class = metaslab_class_create(); + spa->spa_log_class = metaslab_class_create(); - if (nthreads == 0) - nthreads = max_ncpus; - for (t = 0; t < ZIO_TYPES; t++) { - snprintf(name, sizeof(name), "spa_zio_issue %d", t); - spa->spa_zio_issue_taskq[t] = taskq_create(name, nthreads, - maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); - snprintf(name, sizeof(name), "spa_zio_intr %d", t); - spa->spa_zio_intr_taskq[t] = taskq_create(name, nthreads, - maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + spa->spa_zio_taskq[t][q] = taskq_create("spa_zio", + zio_taskq_threads[t][q], maxclsyspri, 50, + INT_MAX, TASKQ_PREPOPULATE); + } } - rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); - - mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); - - list_create(&spa->spa_dirty_list, sizeof (vdev_t), - offsetof(vdev_t, vdev_dirty_node)); + list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_config_dirty_node)); + list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_state_dirty_node)); txg_list_create(&spa->spa_vdev_txg_list, offsetof(struct vdev, vdev_txg_node)); @@ -169,8 +525,6 @@ spa_activate(spa_t *spa) static void spa_deactivate(spa_t *spa) { - int t; - ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); @@ -179,18 +533,22 @@ spa_deactivate(spa_t *spa) txg_list_destroy(&spa->spa_vdev_txg_list); - list_destroy(&spa->spa_dirty_list); + list_destroy(&spa->spa_config_dirty_list); + list_destroy(&spa->spa_state_dirty_list); - for (t = 0; t < ZIO_TYPES; t++) { - taskq_destroy(spa->spa_zio_issue_taskq[t]); - taskq_destroy(spa->spa_zio_intr_taskq[t]); - spa->spa_zio_issue_taskq[t] = NULL; - spa->spa_zio_intr_taskq[t] = NULL; + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + taskq_destroy(spa->spa_zio_taskq[t][q]); + spa->spa_zio_taskq[t][q] = NULL; + } } metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; + metaslab_class_destroy(spa->spa_log_class); + spa->spa_log_class = NULL; + /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. 
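The per-type, per-queue thread counts replace the single vfs.zfs.zio.taskq_threads knob: reads get most of their threads on the interrupt side, writes on the issue side. A sketch of how spa_activate()'s nested loop consumes such a table (thread counts copied from the diff; the enum names are stand-ins for the kernel's):

#include <stdio.h>

enum { Q_ISSUE, Q_INTR, Q_TYPES };
enum { T_NULL, T_READ, T_WRITE, T_FREE, T_CLAIM, T_IOCTL, T_TYPES };

static const int nthreads[T_TYPES][Q_TYPES] = {
	{ 1, 1 },	/* NULL */
	{ 1, 8 },	/* READ: interrupt-side heavy */
	{ 8, 1 },	/* WRITE: issue-side heavy */
	{ 1, 1 },	/* FREE */
	{ 1, 1 },	/* CLAIM */
	{ 1, 1 },	/* IOCTL */
};

int
main(void)
{
	/* Model of spa_activate(): one thread pool per (type, queue). */
	for (int t = 0; t < T_TYPES; t++)
		for (int q = 0; q < Q_TYPES; q++)
			printf("taskq[type=%d][%s] = %d threads\n",
			    t, q == Q_ISSUE ? "issue" : "intr",
			    nthreads[t][q]);
	return (0);
}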
@@ -200,16 +558,6 @@ spa_deactivate(spa_t *spa) avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); - rw_destroy(&spa->spa_traverse_lock); - mutex_destroy(&spa->spa_uberblock_lock); - mutex_destroy(&spa->spa_errlog_lock); - mutex_destroy(&spa->spa_errlist_lock); - mutex_destroy(&spa->spa_config_lock.scl_lock); - cv_destroy(&spa->spa_config_lock.scl_cv); - mutex_destroy(&spa->spa_sync_bplist.bpl_lock); - mutex_destroy(&spa->spa_history_lock); - mutex_destroy(&spa->spa_props_lock); - spa->spa_state = POOL_STATE_UNINITIALIZED; } @@ -233,8 +581,13 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, if ((*vdp)->vdev_ops->vdev_op_leaf) return (0); - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) { + error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children); + + if (error == ENOENT) + return (0); + + if (error) { vdev_free(*vdp); *vdp = NULL; return (EINVAL); @@ -263,6 +616,8 @@ spa_unload(spa_t *spa) { int i; + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + /* * Stop async tasks. */ @@ -277,10 +632,17 @@ spa_unload(spa_t *spa) } /* - * Wait for any outstanding prefetch I/O to complete. + * Wait for any outstanding async I/O to complete. + */ + mutex_enter(&spa->spa_async_root_lock); + while (spa->spa_async_root_count != 0) + cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock); + mutex_exit(&spa->spa_async_root_lock); + + /* + * Drop and purge level 2 cache */ - spa_config_enter(spa, RW_WRITER, FTAG); - spa_config_exit(spa, FTAG); + spa_l2cache_drop(spa); /* * Close the dsl pool. @@ -297,16 +659,31 @@ spa_unload(spa_t *spa) vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); - for (i = 0; i < spa->spa_nspares; i++) - vdev_free(spa->spa_spares[i]); - if (spa->spa_spares) { - kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); - spa->spa_spares = NULL; + for (i = 0; i < spa->spa_spares.sav_count; i++) + vdev_free(spa->spa_spares.sav_vdevs[i]); + if (spa->spa_spares.sav_vdevs) { + kmem_free(spa->spa_spares.sav_vdevs, + spa->spa_spares.sav_count * sizeof (void *)); + spa->spa_spares.sav_vdevs = NULL; + } + if (spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; } - if (spa->spa_sparelist) { - nvlist_free(spa->spa_sparelist); - spa->spa_sparelist = NULL; + spa->spa_spares.sav_count = 0; + + for (i = 0; i < spa->spa_l2cache.sav_count; i++) + vdev_free(spa->spa_l2cache.sav_vdevs[i]); + if (spa->spa_l2cache.sav_vdevs) { + kmem_free(spa->spa_l2cache.sav_vdevs, + spa->spa_l2cache.sav_count * sizeof (void *)); + spa->spa_l2cache.sav_vdevs = NULL; } + if (spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; + } + spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; } @@ -314,8 +691,8 @@ spa_unload(spa_t *spa) /* * Load (or re-load) the current list of vdevs describing the active spares for * this pool. When this is called, we have some form of basic information in - * 'spa_sparelist'. We parse this into vdevs, try to open them, and then - * re-generate a more complete list including status information. + * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and + * then re-generate a more complete list including status information. 
*/ static void spa_load_spares(spa_t *spa) @@ -325,31 +702,34 @@ spa_load_spares(spa_t *spa) int i; vdev_t *vd, *tvd; + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + /* * First, close and free any existing spare vdevs. */ - for (i = 0; i < spa->spa_nspares; i++) { - vd = spa->spa_spares[i]; + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; /* Undo the call to spa_activate() below */ - if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && - tvd->vdev_isspare) + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, + B_FALSE)) != NULL && tvd->vdev_isspare) spa_spare_remove(tvd); vdev_close(vd); vdev_free(vd); } - if (spa->spa_spares) - kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); + if (spa->spa_spares.sav_vdevs) + kmem_free(spa->spa_spares.sav_vdevs, + spa->spa_spares.sav_count * sizeof (void *)); - if (spa->spa_sparelist == NULL) + if (spa->spa_spares.sav_config == NULL) nspares = 0; else - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, + VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); - spa->spa_nspares = (int)nspares; - spa->spa_spares = NULL; + spa->spa_spares.sav_count = (int)nspares; + spa->spa_spares.sav_vdevs = NULL; if (nspares == 0) return; @@ -363,15 +743,17 @@ spa_load_spares(spa_t *spa) * validate each vdev on the spare list. If the vdev also exists in the * active configuration, then we also mark this vdev as an active spare. */ - spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); - for (i = 0; i < spa->spa_nspares; i++) { + spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), + KM_SLEEP); + for (i = 0; i < spa->spa_spares.sav_count; i++) { VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); - spa->spa_spares[i] = vd; + spa->spa_spares.sav_vdevs[i] = vd; - if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, + B_FALSE)) != NULL) { if (!tvd->vdev_isspare) spa_spare_add(tvd); @@ -392,29 +774,167 @@ spa_load_spares(spa_t *spa) spa_spare_activate(tvd); } + vd->vdev_top = vd; + if (vdev_open(vd) != 0) continue; - vd->vdev_top = vd; - (void) vdev_validate_spare(vd); + if (vdev_validate_aux(vd) == 0) + spa_spare_add(vd); } /* * Recompute the stashed list of spares, with status information * this time. 
*/ - VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, + VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); - spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); - for (i = 0; i < spa->spa_nspares; i++) - spares[i] = vdev_config_generate(spa, spa->spa_spares[i], - B_TRUE, B_TRUE); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - spares, spa->spa_nspares) == 0); - for (i = 0; i < spa->spa_nspares; i++) + spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), + KM_SLEEP); + for (i = 0; i < spa->spa_spares.sav_count; i++) + spares[i] = vdev_config_generate(spa, + spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); + for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); - kmem_free(spares, spa->spa_nspares * sizeof (void *)); + kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); +} + +/* + * Load (or re-load) the current list of vdevs describing the active l2cache for + * this pool. When this is called, we have some form of basic information in + * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and + * then re-generate a more complete list including status information. + * Devices which are already active have their details maintained, and are + * not re-opened. + */ +static void +spa_load_l2cache(spa_t *spa) +{ + nvlist_t **l2cache; + uint_t nl2cache; + int i, j, oldnvdevs; + uint64_t guid, size; + vdev_t *vd, **oldvdevs, **newvdevs; + spa_aux_vdev_t *sav = &spa->spa_l2cache; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if (sav->sav_config != NULL) { + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); + } else { + nl2cache = 0; + } + + oldvdevs = sav->sav_vdevs; + oldnvdevs = sav->sav_count; + sav->sav_vdevs = NULL; + sav->sav_count = 0; + + /* + * Process new nvlist of vdevs. + */ + for (i = 0; i < nl2cache; i++) { + VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, + &guid) == 0); + + newvdevs[i] = NULL; + for (j = 0; j < oldnvdevs; j++) { + vd = oldvdevs[j]; + if (vd != NULL && guid == vd->vdev_guid) { + /* + * Retain previous vdev for add/remove ops. + */ + newvdevs[i] = vd; + oldvdevs[j] = NULL; + break; + } + } + + if (newvdevs[i] == NULL) { + /* + * Create new vdev + */ + VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, + VDEV_ALLOC_L2CACHE) == 0); + ASSERT(vd != NULL); + newvdevs[i] = vd; + + /* + * Commit this vdev as an l2cache device, + * even if it fails to open. 
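spa_load_l2cache() merges the new config against the already-open devices by GUID, retaining matching vdevs across add/remove operations and purging whatever remains of the old array. A toy model of that merge (plain structs instead of vdev_t; the layout is illustrative):

#include <stdint.h>
#include <stdio.h>

struct vdev { uint64_t guid; };

int
main(void)
{
	struct vdev old0 = { 111 }, old1 = { 222 };
	struct vdev *oldv[] = { &old0, &old1 };
	uint64_t newguids[] = { 222, 333 };	/* 111 was removed */
	struct vdev *newv[2] = { NULL, NULL };

	/* Retain step: reuse an old vdev whose GUID matches. */
	for (int i = 0; i < 2; i++) {
		for (int j = 0; j < 2; j++) {
			if (oldv[j] != NULL && oldv[j]->guid == newguids[i]) {
				newv[i] = oldv[j];
				oldv[j] = NULL;
			}
		}
		if (newv[i] == NULL)
			printf("create new vdev for guid %llu\n",
			    (unsigned long long)newguids[i]);
	}

	/* Purge step: anything left in the old array was dropped. */
	for (int j = 0; j < 2; j++)
		if (oldv[j] != NULL)
			printf("purge dropped vdev guid %llu\n",
			    (unsigned long long)oldv[j]->guid);
	return (0);
}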
+ */ + spa_l2cache_add(vd); + + vd->vdev_top = vd; + vd->vdev_aux = sav; + + spa_l2cache_activate(vd); + + if (vdev_open(vd) != 0) + continue; + + (void) vdev_validate_aux(vd); + + if (!vdev_is_dead(vd)) { + size = vdev_get_rsize(vd); + l2arc_add_vdev(spa, vd, + VDEV_LABEL_START_SIZE, + size - VDEV_LABEL_START_SIZE); + } + } + } + + /* + * Purge vdevs that were dropped + */ + for (i = 0; i < oldnvdevs; i++) { + uint64_t pool; + + vd = oldvdevs[i]; + if (vd != NULL) { + if ((spa_mode & FWRITE) && + spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && + l2arc_vdev_present(vd)) { + l2arc_remove_vdev(vd); + } + (void) vdev_close(vd); + spa_l2cache_remove(vd); + } + } + + if (oldvdevs) + kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); + + if (sav->sav_config == NULL) + goto out; + + sav->sav_vdevs = newvdevs; + sav->sav_count = (int)nl2cache; + + /* + * Recompute the stashed list of l2cache devices, with status + * information this time. + */ + VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, + DATA_TYPE_NVLIST_ARRAY) == 0); + + l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); + for (i = 0; i < sav->sav_count; i++) + l2cache[i] = vdev_config_generate(spa, + sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); + VERIFY(nvlist_add_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); +out: + for (i = 0; i < sav->sav_count; i++) + nvlist_free(l2cache[i]); + if (sav->sav_count) + kmem_free(l2cache, sav->sav_count * sizeof (void *)); } static int @@ -440,6 +960,50 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) } /* + * Checks to see if the given vdev could not be opened, in which case we post a + * sysevent to notify the autoreplace code that the device has been removed. + */ +static void +spa_check_removed(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + spa_check_removed(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { + zfs_post_autoreplace(vd->vdev_spa, vd); + spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); + } +} + +/* + * Check for missing log devices + */ +int +spa_check_logs(spa_t *spa) +{ + switch (spa->spa_log_state) { + case SPA_LOG_MISSING: + /* need to recheck in case slog has been restored */ + case SPA_LOG_UNKNOWN: + if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, + DS_FIND_CHILDREN)) { + spa->spa_log_state = SPA_LOG_MISSING; + return (1); + } + break; + + case SPA_LOG_CLEAR: + (void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL, + DS_FIND_CHILDREN); + break; + } + spa->spa_log_state = SPA_LOG_GOOD; + return (0); +} + +/* * Load an existing storage pool, using the pool's builtin spa_config as a * source of configuration information. */ @@ -453,7 +1017,10 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) uint64_t config_cache_txg = spa->spa_config_txg; uint64_t pool_guid; uint64_t version; - zio_t *zio; + uint64_t autoreplace = 0; + char *ereport = FM_EREPORT_ZFS_POOL; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa->spa_load_state = state; @@ -468,7 +1035,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * it's not present treat it as the initial version. 
*/ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) - version = ZFS_VERSION_INITIAL; + version = SPA_VERSION_INITIAL; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); @@ -486,10 +1053,10 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. */ - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_ubsync.ub_version = version; error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) goto out; @@ -500,18 +1067,19 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) /* * Try to open all vdevs, loading each label in the process. */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_open(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) goto out; /* * Validate the labels for all leaf vdevs. We need to grab the config - * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD - * flag. + * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER. */ - spa_config_enter(spa, RW_READER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) goto out; @@ -524,12 +1092,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) /* * Find the best uberblock. */ - bzero(ub, sizeof (uberblock_t)); - - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); - vdev_uberblock_load(zio, rvd, ub); - error = zio_wait(zio); + vdev_uberblock_load(NULL, rvd, ub); /* * If we weren't able to find a single valid uberblock, return failure. @@ -544,7 +1107,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) /* * If the pool is newer than the code, we can't open it. */ - if (ub->ub_version > ZFS_VERSION) { + if (ub->ub_version > SPA_VERSION) { vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_VERSION_NEWER); error = ENOTSUP; @@ -596,12 +1159,8 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) goto out; } - /* - * hostid is set after the root file system is mounted, so - * ignore the check until it's done. - */ - if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, - &hostid) == 0 && root_mounted()) { + if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig, + ZPOOL_CONFIG_HOSTID, &hostid) == 0) { char *hostname; unsigned long myhostid = 0; @@ -609,12 +1168,13 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); - if ((unsigned long)hostid != myhostid) { + if (hostid != 0 && myhostid != 0 && + (unsigned long)hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " - "another system (host: %s hostid: 0x%lx). " + "another system (host: %s hostid: 0x%lx). " "See: http://www.sun.com/msg/ZFS-8000-EY", - spa->spa_name, hostname, + spa_name(spa), hostname, (unsigned long)hostid); error = EBADF; goto out; @@ -695,7 +1255,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * Load any hot spares for this pool. 
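The relaxed hostid test above refuses the pool only when both hostids are known and differ; a zero on either side (no hostid recorded in the label, or none configured locally) is treated as unknown and allowed. A sketch of the predicate (hw_serial is modeled here as the decimal string the kernel parses with ddi_strtoul()):

#include <stdio.h>
#include <stdlib.h>

/*
 * Returns nonzero if the pool may be loaded; a caller following the
 * kernel logic would fail with EBADF on a mismatch of two live hostids.
 */
static int
hostid_ok(unsigned long pool_hostid, const char *hw_serial)
{
	unsigned long myhostid = strtoul(hw_serial, NULL, 10);

	if (pool_hostid != 0 && myhostid != 0 && pool_hostid != myhostid)
		return (0);
	return (1);
}

int
main(void)
{
	printf("%d %d %d\n",
	    hostid_ok(0xdeadUL, "57005"),	/* 0xdead == 57005: ok */
	    hostid_ok(0, "57005"),		/* unknown in label: ok */
	    hostid_ok(0xbeefUL, "57005"));	/* mismatch: refuse */
	return (0);
}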
*/ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); + DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); if (error != 0 && error != ENOENT) { vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -703,20 +1263,59 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) goto out; } if (error == 0) { - ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); - if (load_nvlist(spa, spa->spa_spares_object, - &spa->spa_sparelist) != 0) { + ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); + if (load_nvlist(spa, spa->spa_spares.sav_object, + &spa->spa_spares.sav_config) != 0) { vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); error = EIO; goto out; } - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); } + /* + * Load any level 2 ARC devices for this pool. + */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_L2CACHE, sizeof (uint64_t), 1, + &spa->spa_l2cache.sav_object); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + if (error == 0) { + ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); + if (load_nvlist(spa, spa->spa_l2cache.sav_object, + &spa->spa_l2cache.sav_config) != 0) { + vdev_set_state(rvd, B_TRUE, + VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + } + + if (spa_check_logs(spa)) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LOG); + error = ENXIO; + ereport = FM_EREPORT_ZFS_LOG_REPLAY; + goto out; + } + + + spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); @@ -730,11 +1329,33 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) if (error == 0) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), sizeof (uint64_t), 1, &spa->spa_bootfs); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), + sizeof (uint64_t), 1, &autoreplace); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_DELEGATION), + sizeof (uint64_t), 1, &spa->spa_delegation); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), + sizeof (uint64_t), 1, &spa->spa_failmode); } /* + * If the 'autoreplace' property is set, then post a resource notifying + * the ZFS DE that it should not issue any faults for unopenable + * devices. We also iterate over the vdevs, and post a sysevent for any + * unopenable vdevs so that the normal autoreplace handler can take + * over. + */ + if (autoreplace && state != SPA_LOAD_TRYIMPORT) + spa_check_removed(spa->spa_root_vdev); + + /* * Load the vdev state for all toplevel vdevs. 
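/*
 * The property loads above share a default-then-override pattern: seed
 * the in-core field with the property default, then let an optional
 * ZAP entry overwrite it, deliberately ignoring a failed lookup.
 * Generic sketch; read_entry() is a hypothetical stand-in for the
 * zap_lookup() calls.
 */
static uint64_t
load_prop_or_default(int (*read_entry)(const char *, uint64_t *),
    const char *name, uint64_t def)
{
	uint64_t val = def;

	(void) read_entry(name, &val);	/* ENOENT keeps the default */
	return (val);
}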
*/ vdev_load(rvd); @@ -742,9 +1363,9 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) /* * Propagate the leaf DTLs we just loaded all the way up the tree. */ - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); /* * Check the state of the root vdev. If it can't be opened, it @@ -766,7 +1387,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), spa_first_txg(spa)); - (void) dmu_objset_find(spa->spa_name, + (void) dmu_objset_find(spa_name(spa), zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); @@ -800,8 +1421,9 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) error = 0; out: + spa->spa_minref = refcount_count(&spa->spa_refcount); if (error && error != EBADF) - zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); + zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); spa->spa_load_state = SPA_LOAD_NONE; spa->spa_ena = 0; @@ -814,7 +1436,7 @@ out: * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the - * POOL_STATE_UNITIALIZED state. + * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some @@ -825,7 +1447,6 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) { spa_t *spa; int error; - int loaded = B_FALSE; int locked = B_FALSE; *spapp = NULL; @@ -860,11 +1481,10 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ - zfs_post_ok(spa, NULL); spa_unload(spa); spa_deactivate(spa); + spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); - spa_config_sync(); if (locked) mutex_exit(&spa_namespace_lock); return (ENOENT); @@ -876,12 +1496,9 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ - if (config != NULL && spa->spa_root_vdev != NULL) { - spa_config_enter(spa, RW_READER, FTAG); + if (config != NULL && spa->spa_root_vdev != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - spa_config_exit(spa, FTAG); - } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = B_TRUE; @@ -890,30 +1507,19 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) *spapp = NULL; return (error); } else { - zfs_post_ok(spa, NULL); spa->spa_last_open_failed = B_FALSE; } - - loaded = B_TRUE; } spa_open_ref(spa, tag); + if (locked) mutex_exit(&spa_namespace_lock); *spapp = spa; - if (config != NULL) { - spa_config_enter(spa, RW_READER, FTAG); + if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - spa_config_exit(spa, FTAG); - } - - /* - * If we just loaded the pool, resilver anything that's out of date. - */ - if (loaded && (spa_mode & FWRITE)) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); return (0); } @@ -952,6 +1558,9 @@ spa_inject_delref(spa_t *spa) mutex_exit(&spa_namespace_lock); } +/* + * Add spares device information to the nvlist. 
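/*
 * Caller-side sketch for the open path above, assuming only the
 * spa_open()/spa_close() signatures used in this file; with_pool()
 * itself is hypothetical.  Every successful open must be balanced by a
 * close, or spa_refcount_zero() can never succeed at export time.
 */
static int
with_pool(const char *name, int (*fn)(spa_t *))
{
	spa_t *spa;
	int error;

	if ((error = spa_open(name, &spa, FTAG)) != 0)
		return (error);
	error = fn(spa);
	spa_close(spa, FTAG);
	return (error);
}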
+ */ static void spa_add_spares(spa_t *spa, nvlist_t *config) { @@ -963,12 +1572,12 @@ spa_add_spares(spa_t *spa, nvlist_t *config) uint_t vsc; uint64_t pool; - if (spa->spa_nspares == 0) + if (spa->spa_spares.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, + VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); if (nspares != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, @@ -984,7 +1593,8 @@ spa_add_spares(spa_t *spa, nvlist_t *config) for (i = 0; i < nspares; i++) { VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &guid) == 0); - if (spa_spare_exists(guid, &pool) && pool != 0ULL) { + if (spa_spare_exists(guid, &pool, NULL) && + pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( spares[i], ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); @@ -995,6 +1605,62 @@ spa_add_spares(spa_t *spa, nvlist_t *config) } } +/* + * Add l2cache device information to the nvlist, including vdev stats. + */ +static void +spa_add_l2cache(spa_t *spa, nvlist_t *config) +{ + nvlist_t **l2cache; + uint_t i, j, nl2cache; + nvlist_t *nvroot; + uint64_t guid; + vdev_t *vd; + vdev_stat_t *vs; + uint_t vsc; + + if (spa->spa_l2cache.sav_count == 0) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + VERIFY(nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + if (nl2cache != 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + + /* + * Update level 2 cache device stats. + */ + + for (i = 0; i < nl2cache; i++) { + VERIFY(nvlist_lookup_uint64(l2cache[i], + ZPOOL_CONFIG_GUID, &guid) == 0); + + vd = NULL; + for (j = 0; j < spa->spa_l2cache.sav_count; j++) { + if (guid == + spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { + vd = spa->spa_l2cache.sav_vdevs[j]; + break; + } + } + ASSERT(vd != NULL); + + VERIFY(nvlist_lookup_uint64_array(l2cache[i], + ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); + vdev_get_stats(vd, vs); + } + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { @@ -1008,7 +1674,12 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); + if (spa_suspended(spa)) + VERIFY(nvlist_add_uint64(*config, + ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); + spa_add_spares(spa, *config); + spa_add_l2cache(spa, *config); } /* @@ -1037,45 +1708,48 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) } /* - * Validate that the 'spares' array is well formed. We must have an array of - * nvlists, each which describes a valid leaf vdev. If this is an import (mode - * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long - * as they are well-formed. + * Validate that the auxiliary device array is well formed. We must have an + * array of nvlists, each of which describes a valid leaf vdev. If this is an + * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be + * specified, as long as they are well-formed.
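/*
 * In spa_add_l2cache() above, each config entry is paired with its
 * in-core vdev by guid.  Stripped of the nvlist plumbing, the lookup
 * is a linear scan (sketch; the helper name is hypothetical):
 */
static vdev_t *
aux_vdev_by_guid(spa_aux_vdev_t *sav, uint64_t guid)
{
	for (int j = 0; j < sav->sav_count; j++)
		if (sav->sav_vdevs[j]->vdev_guid == guid)
			return (sav->sav_vdevs[j]);
	return (NULL);		/* the caller ASSERTs this cannot happen */
}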
*/ static int -spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) +spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, + spa_aux_vdev_t *sav, const char *config, uint64_t version, + vdev_labeltype_t label) { - nvlist_t **spares; - uint_t i, nspares; + nvlist_t **dev; + uint_t i, ndev; vdev_t *vd; int error; + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + /* - * It's acceptable to have no spares specified. + * It's acceptable to have no devs specified. */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) != 0) + if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); - if (nspares == 0) + if (ndev == 0) return (EINVAL); /* - * Make sure the pool is formatted with a version that supports hot - * spares. + * Make sure the pool is formatted with a version that supports this + * device type. */ - if (spa_version(spa) < ZFS_VERSION_SPARES) + if (spa_version(spa) < version) return (ENOTSUP); /* - * Set the pending spare list so we correctly handle device in-use + * Set the pending device list so we correctly handle device in-use * checking. */ - spa->spa_pending_spares = spares; - spa->spa_pending_nspares = nspares; + sav->sav_pending = dev; + sav->sav_npending = ndev; - for (i = 0; i < nspares; i++) { - if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, + for (i = 0; i < ndev; i++) { + if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; @@ -1085,43 +1759,149 @@ spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) goto out; } + /* + * The L2ARC currently only supports disk devices in + * kernel context. For user-level testing, we allow it. + */ +#ifdef _KERNEL + if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && + strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { + error = ENOTBLK; + goto out; + } +#endif vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && - (error = vdev_label_init(vd, crtxg, - VDEV_LABEL_SPARE)) == 0) { - VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, + (error = vdev_label_init(vd, crtxg, label)) == 0) { + VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } vdev_free(vd); - if (error && mode != VDEV_ALLOC_SPARE) + if (error && + (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: - spa->spa_pending_spares = NULL; - spa->spa_pending_nspares = 0; + sav->sav_pending = NULL; + sav->sav_npending = 0; return (error); } +static int +spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) +{ + int error; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, + &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, + VDEV_LABEL_SPARE)) != 0) { + return (error); + } + + return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, + &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, + VDEV_LABEL_L2CACHE)); } + +static void +spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, + const char *config) +{ + int i; + + if (sav->sav_config != NULL) { + nvlist_t **olddevs; + uint_t oldndevs; + nvlist_t **newdevs; + + /* + * Generate new dev list by concatenating with the + * current dev list.
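/*
 * spa_validate_aux() above is spa_validate_aux_devs() applied once per
 * auxiliary device class.  The per-class parameters line up as in this
 * table; the table itself is illustrative only, though the constants
 * are the ones used in this file.
 */
static const struct aux_class {
	const char	*ac_config;	/* nvlist key in the vdev nvroot */
	uint64_t	ac_version;	/* minimum pool version required */
	vdev_labeltype_t ac_label;	/* label written by vdev_label_init() */
} aux_classes[] = {
	{ ZPOOL_CONFIG_SPARES,	SPA_VERSION_SPARES,	VDEV_LABEL_SPARE },
	{ ZPOOL_CONFIG_L2CACHE,	SPA_VERSION_L2CACHE,	VDEV_LABEL_L2CACHE },
};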
+ */ + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, + &olddevs, &oldndevs) == 0); + + newdevs = kmem_alloc(sizeof (void *) * + (ndevs + oldndevs), KM_SLEEP); + for (i = 0; i < oldndevs; i++) + VERIFY(nvlist_dup(olddevs[i], &newdevs[i], + KM_SLEEP) == 0); + for (i = 0; i < ndevs; i++) + VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], + KM_SLEEP) == 0); + + VERIFY(nvlist_remove(sav->sav_config, config, + DATA_TYPE_NVLIST_ARRAY) == 0); + + VERIFY(nvlist_add_nvlist_array(sav->sav_config, + config, newdevs, ndevs + oldndevs) == 0); + for (i = 0; i < oldndevs + ndevs; i++) + nvlist_free(newdevs[i]); + kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); + } else { + /* + * Generate a new dev list. + */ + VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, + devs, ndevs) == 0); + } +} + +/* + * Stop and drop level 2 ARC devices + */ +void +spa_l2cache_drop(spa_t *spa) +{ + vdev_t *vd; + int i; + spa_aux_vdev_t *sav = &spa->spa_l2cache; + + for (i = 0; i < sav->sav_count; i++) { + uint64_t pool; + + vd = sav->sav_vdevs[i]; + ASSERT(vd != NULL); + + if ((spa_mode & FWRITE) && + spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && + l2arc_vdev_present(vd)) { + l2arc_remove_vdev(vd); + } + if (vd->vdev_isl2cache) + spa_l2cache_remove(vd); + vdev_clear_stats(vd); + (void) vdev_close(vd); + } +} + /* * Pool Creation */ int -spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) +spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, + const char *history_str, nvlist_t *zplprops) { spa_t *spa; + char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int c, error = 0; uint64_t txg = TXG_INITIAL; - nvlist_t **spares; - uint_t nspares; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + uint64_t version; /* * If this pool already exists, return failure. @@ -1135,36 +1915,51 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) /* * Allocate a new spa_t structure. */ + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(pool, altroot); spa_activate(spa); spa->spa_uberblock.ub_txg = txg - 1; - spa->spa_uberblock.ub_version = ZFS_VERSION; + + if (props && (error = spa_prop_validate(spa, props))) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), + &version) != 0) + version = SPA_VERSION; + ASSERT(version <= SPA_VERSION); + spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; /* * Create the root vdev. 
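/*
 * Version selection at create time, as implemented above (sketch; the
 * helper name is hypothetical): honor a requested ZPOOL_PROP_VERSION
 * when present, otherwise create at the running code's SPA_VERSION,
 * and never accept a version newer than the code supports.
 */
static uint64_t
choose_create_version(nvlist_t *props)
{
	uint64_t version;

	if (props == NULL || nvlist_lookup_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0)
		version = SPA_VERSION;
	ASSERT(version <= SPA_VERSION);
	return (version);
}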
*/ - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); - if (error == 0 && rvd->vdev_children == 0) + if (error == 0 && !zfs_allocatable_devs(nvroot)) error = EINVAL; if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && - (error = spa_validate_spares(spa, nvroot, txg, + (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { for (c = 0; c < rvd->vdev_children; c++) vdev_init(rvd->vdev_child[c], txg); vdev_config_dirty(rvd); } - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); @@ -1179,17 +1974,32 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { - VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); - spa_config_exit(spa, FTAG); - spa->spa_sync_spares = B_TRUE; + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_spares.sav_sync = B_TRUE; + } + + /* + * Get the list of level 2 cache devices, if specified. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; } - spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); + spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; tx = dmu_tx_create_assigned(dp, txg); @@ -1198,7 +2008,7 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) * Create the pool config object. */ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_PACKED_NVLIST, 1 << 14, + DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, @@ -1207,12 +2017,14 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) cmn_err(CE_PANIC, "failed to add pool config"); } - /* Newly created pools are always deflated. */ - spa->spa_deflate = TRUE; - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { - cmn_err(CE_PANIC, "failed to add deflate"); + /* Newly created pools with the right version are always deflated. */ + if (version >= SPA_VERSION_RAIDZ_DEFLATE) { + spa->spa_deflate = TRUE; + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { + cmn_err(CE_PANIC, "failed to add deflate"); + } } /* @@ -1234,11 +2046,20 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) /* * Create the pool's history object. 
*/ - spa_history_create_obj(spa, tx); + if (version >= SPA_VERSION_ZPOOL_HISTORY) + spa_history_create_obj(spa, tx); + + /* + * Set pool properties. + */ + spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); + spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); + spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); + if (props) + spa_sync_props(spa, props, CRED(), tx); dmu_tx_commit(tx); - spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); @@ -1248,10 +2069,15 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) */ txg_wait_synced(spa->spa_dsl_pool, txg); - spa_config_sync(); + spa_config_sync(spa, B_FALSE, B_TRUE); + + if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) + (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); mutex_exit(&spa_namespace_lock); + spa->spa_minref = refcount_count(&spa->spa_refcount); + return (0); } @@ -1259,17 +2085,16 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) * Import the given pool into the system. We set up the necessary spa_t and * then call spa_load() to do the dirty work. */ -int -spa_import(const char *pool, nvlist_t *config, const char *altroot) +static int +spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, + boolean_t isroot, boolean_t allowfaulted) { spa_t *spa; - int error; + char *altroot = NULL; + int error, loaderr; nvlist_t *nvroot; - nvlist_t **spares; - uint_t nspares; - - if (!(spa_mode & FWRITE)) - return (EROFS); + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. @@ -1283,78 +2108,355 @@ spa_import(const char *pool, nvlist_t *config, const char *altroot) /* * Create and initialize the spa structure. */ + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(pool, altroot); spa_activate(spa); + if (allowfaulted) + spa->spa_import_faulted = B_TRUE; + spa->spa_is_root = isroot; + /* * Pass off the heavy lifting to spa_load(). - * Pass TRUE for mosconfig because the user-supplied config - * is actually the one to trust when doing an import. + * Pass TRUE for mosconfig (unless this is a root pool) because + * the user-supplied config is actually the one to trust when + * doing an import. */ - error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); + loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot); - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity anymore, * and conflicts with spa_has_spare(). 
*/ - if (spa->spa_sparelist) { - nvlist_free(spa->spa_sparelist); - spa->spa_sparelist = NULL; + if (!isroot && spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } + if (!isroot && spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; + spa_load_l2cache(spa); + } VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (error == 0) - error = spa_validate_spares(spa, nvroot, -1ULL, - VDEV_ALLOC_SPARE); - spa_config_exit(spa, FTAG); + error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); + if (error == 0) + error = spa_validate_aux(spa, nvroot, -1ULL, + VDEV_ALLOC_L2CACHE); + spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) { - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); + if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { + if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { + /* + * If we failed to load the pool, but 'allowfaulted' is + * set, then manually set the config as if the config + * passed in was specified in the cache file. + */ + error = 0; + spa->spa_import_faulted = B_FALSE; + if (spa->spa_config == NULL) + spa->spa_config = spa_config_generate(spa, + NULL, -1ULL, B_TRUE); + spa_unload(spa); + spa_deactivate(spa); + spa_config_sync(spa, B_FALSE, B_TRUE); + } else { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + } mutex_exit(&spa_namespace_lock); return (error); } /* - * Override any spares as specified by the user, as these may have - * correct device names/devids, etc. + * Override any spares and level 2 cache devices as specified by + * the user, as these may have correct device names/devids, etc. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { - if (spa->spa_sparelist) - VERIFY(nvlist_remove(spa->spa_sparelist, + if (spa->spa_spares.sav_config) + VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); else - VERIFY(nvlist_alloc(&spa->spa_sparelist, + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); - spa_config_exit(spa, FTAG); - spa->spa_sync_spares = B_TRUE; + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_spares.sav_sync = B_TRUE; + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + if (spa->spa_l2cache.sav_config) + VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; } + if (spa_mode & FWRITE) { + /* + * Update the config cache to include the newly-imported pool. 
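/*
 * The spare and l2cache overrides above share one shape: replace the
 * device array if a config already exists, otherwise allocate a fresh
 * config.  Condensed sketch; the helper is hypothetical, the calls are
 * stock libnvpair.
 */
static void
set_aux_array(nvlist_t **cfgp, const char *key, nvlist_t **devs,
    uint_t ndevs)
{
	if (*cfgp != NULL)
		VERIFY(nvlist_remove(*cfgp, key,
		    DATA_TYPE_NVLIST_ARRAY) == 0);
	else
		VERIFY(nvlist_alloc(cfgp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist_array(*cfgp, key, devs, ndevs) == 0);
}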
+ */ + spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot); + } + + spa->spa_import_faulted = B_FALSE; + mutex_exit(&spa_namespace_lock); + + return (0); +} + +#if defined(sun) +#ifdef _KERNEL +/* + * Build a "root" vdev for a top level vdev read in from a rootpool + * device label. + */ +static void +spa_build_rootpool_config(nvlist_t *config) +{ + nvlist_t *nvtop, *nvroot; + uint64_t pgid; + /* - * Update the config cache to include the newly-imported pool. + * Add this top-level vdev to the child array. */ - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) + == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) + == 0); - mutex_exit(&spa_namespace_lock); + /* + * Put this pool's top-level vdevs into a root vdev. + */ + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) + == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &nvtop, 1) == 0); /* - * Resilver anything that's out of date. + * Replace the existing vdev_tree with the new root vdev in + * this pool's configuration (remove the old, add the new). */ - if (spa_mode & FWRITE) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + nvlist_free(nvroot); +} + +/* + * Get the root pool information from the root disk, then import the root pool + * during the system boot up time. + */ +extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); + +int +spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, + uint64_t *besttxg) +{ + nvlist_t *config; + uint64_t txg; + int error; + + if (error = vdev_disk_read_rootlabel(devpath, devid, &config)) + return (error); + + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + if (bestconf != NULL) + *bestconf = config; + else + nvlist_free(config); + *besttxg = txg; return (0); } +boolean_t +spa_rootdev_validate(nvlist_t *nv) +{ + uint64_t ival; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) + return (B_FALSE); + + return (B_TRUE); +} + + +/* + * Given the boot device's physical path or devid, check if the device + * is in a valid state. If so, return the configuration from the vdev + * label. 
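/*
 * spa_get_rootconf() below boots from the mirror child whose label
 * carries the highest txg.  Stripped of the nvlist plumbing, the
 * selection is a running maximum (sketch; helper hypothetical):
 */
static int
best_label_txg(const uint64_t *txgs, int nchildren)
{
	int best = 0;

	for (int c = 1; c < nchildren; c++)
		if (txgs[c] > txgs[best])
			best = c;	/* only this child should boot */
	return (best);
}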
+ */ +int +spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) +{ + nvlist_t *conf = NULL; + uint64_t txg = 0; + nvlist_t *nvtop, **child; + char *type; + char *bootpath = NULL; + uint_t children, c; + char *tmp; + int error; + + if (devpath && ((tmp = strchr(devpath, ' ')) != NULL)) + *tmp = '\0'; + if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) { + cmn_err(CE_NOTE, "error reading device label"); + return (error); + } + if (txg == 0) { + cmn_err(CE_NOTE, "this device is detached"); + nvlist_free(conf); + return (EINVAL); + } + + VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0); + + if (strcmp(type, VDEV_TYPE_DISK) == 0) { + if (spa_rootdev_validate(nvtop)) { + goto out; + } else { + nvlist_free(conf); + return (EINVAL); + } + } + + ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0); + + VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0); + + /* + * Go through the vdevs in the mirror to see if the given device + * has the most recent txg. Only the device with the most + * recent txg has valid information and should be booted. + */ + for (c = 0; c < children; c++) { + char *cdevid, *cpath; + uint64_t tmptxg; + + if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, + &cpath) != 0) + return (EINVAL); + if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID, + &cdevid) != 0) + return (EINVAL); + if ((spa_check_rootconf(cpath, cdevid, NULL, + &tmptxg) == 0) && (tmptxg > txg)) { + txg = tmptxg; + VERIFY(nvlist_lookup_string(child[c], + ZPOOL_CONFIG_PATH, &bootpath) == 0); + } + } + + /* Does the best device match the one we've booted from? */ + if (bootpath) { + cmn_err(CE_NOTE, "try booting from '%s'", bootpath); + return (EINVAL); + } +out: + *bestconf = conf; + return (0); +} + +/* + * Import a root pool. + * + * For x86, devpath_list will consist of devid and/or physpath name of + * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). + * The GRUB "findroot" command will return the vdev we should boot. + * + * For Sparc, devpath_list consists of the physpath name of the booting + * device, no matter whether the rootpool is a single device pool or a + * mirrored pool. + * e.g. + * "/pci@1f,0/ide@d/disk@0,0:a" + */ +int +spa_import_rootpool(char *devpath, char *devid) +{ + nvlist_t *conf = NULL; + char *pname; + int error; + + /* + * Get the vdev pathname and configuration from the most + * recently updated vdev (highest txg). + */ + if (error = spa_get_rootconf(devpath, devid, &conf)) + goto msg_out; + + /* + * Add type "root" vdev to the config. + */ + spa_build_rootpool_config(conf); + + VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); + + /* + * We specify 'allowfaulted' for this to be treated like spa_open() + * instead of spa_import(). This prevents us from marking vdevs as + * persistently unavailable, and generates FMA ereports as if it were a + * pool open, not import. + */ + error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE); + if (error == EEXIST) + error = 0; + + nvlist_free(conf); + return (error); + +msg_out: + cmn_err(CE_NOTE, "\n" + " *************************************************** \n" + " * This device is not bootable! * \n" + " * It is either offlined or detached or faulted. * \n" + " * Please try to boot from a different device. * \n" + " *************************************************** "); + + return (error); +} +#endif +#endif + +/* + * Import a non-root pool into the system.
+ */ +int +spa_import(const char *pool, nvlist_t *config, nvlist_t *props) +{ + return (spa_import_common(pool, config, props, B_FALSE, B_FALSE)); +} + +int +spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props) +{ + return (spa_import_common(pool, config, props, B_FALSE, B_TRUE)); +} + + /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. @@ -1393,9 +2495,7 @@ spa_tryimport(nvlist_t *tryconfig) * If 'tryconfig' was at least parsable, return the current config. */ if (spa->spa_root_vdev != NULL) { - spa_config_enter(spa, RW_READER, FTAG); config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - spa_config_exit(spa, FTAG); VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, @@ -1404,9 +2504,42 @@ spa_tryimport(nvlist_t *tryconfig) spa->spa_uberblock.ub_timestamp) == 0); /* - * Add the list of hot spares. + * If the bootfs property exists on this pool then we + * copy it out so that external consumers can tell which + * pools are bootable. + */ + if (spa->spa_bootfs) { + char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + /* + * We have to play games with the name since the + * pool was opened as TRYIMPORT_NAME. + */ + if (dsl_dsobj_to_dsname(spa_name(spa), + spa->spa_bootfs, tmpname) == 0) { + char *cp; + char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + cp = strchr(tmpname, '/'); + if (cp == NULL) { + (void) strlcpy(dsname, tmpname, + MAXPATHLEN); + } else { + (void) snprintf(dsname, MAXPATHLEN, + "%s/%s", poolname, ++cp); + } + VERIFY(nvlist_add_string(config, + ZPOOL_CONFIG_BOOTFS, dsname) == 0); + kmem_free(dsname, MAXPATHLEN); + } + kmem_free(tmpname, MAXPATHLEN); + } + + /* + * Add the list of hot spares and level 2 cache devices. */ spa_add_spares(spa, config); + spa_add_l2cache(spa, config); } spa_unload(spa); @@ -1426,7 +2559,8 @@ spa_tryimport(nvlist_t *tryconfig) * configuration from the cache afterwards. */ static int -spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) +spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, + boolean_t force) { spa_t *spa; @@ -1461,7 +2595,6 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) * Objsets may be open only because they're dirty, so we * have to force it to sync before checking spa_refcnt. */ - spa_scrub_suspend(spa); txg_wait_synced(spa->spa_dsl_pool, 0); /* @@ -1472,14 +2605,23 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0 && new_state != POOL_STATE_UNINITIALIZED)) { - spa_scrub_resume(spa); spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (EBUSY); } - spa_scrub_resume(spa); - VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); + /* + * A pool cannot be exported if it has an active shared spare. + * This is to prevent other pools stealing the active spare + * from an exported pool. At the user's explicit request, such + * a pool can be forcibly exported. + */ + if (!force && new_state == POOL_STATE_EXPORTED && + spa_has_active_shared_spare(spa)) { + spa_async_resume(spa); + mutex_exit(&spa_namespace_lock); + return (EXDEV); + } /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out.
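/*
 * The export gate above, reduced to its two checks (sketch; the helper
 * is hypothetical and omits the spa_inject_ref test that the real code
 * folds into the refcount check):
 */
static int
export_allowed(spa_t *spa, int new_state, boolean_t force)
{
	if (!spa_refcount_zero(spa))
		return (EBUSY);		/* pool still has consumers */
	if (!force && new_state == POOL_STATE_EXPORTED &&
	    spa_has_active_shared_spare(spa))
		return (EXDEV);		/* spare is active in another pool */
	return (0);
}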
*/ if (new_state != POOL_STATE_UNINITIALIZED) { - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; spa->spa_final_txg = spa_last_synced_txg(spa) + 1; vdev_config_dirty(spa->spa_root_vdev); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); } } + spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); + if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); @@ -1504,8 +2648,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { + spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); - spa_config_sync(); } mutex_exit(&spa_namespace_lock); @@ -1518,16 +2662,16 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) int spa_destroy(char *pool) { - return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); + return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE)); } /* * Export a storage pool. */ int -spa_export(char *pool, nvlist_t **oldconfig) +spa_export(char *pool, nvlist_t **oldconfig, boolean_t force) { - return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); + return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force)); } /* @@ -1537,10 +2681,10 @@ spa_export(char *pool, nvlist_t **oldconfig) int spa_reset(char *pool) { - return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); + return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, + B_FALSE)); } - /* * ========================================================================== * Device manipulation @@ -1548,7 +2692,7 @@ spa_reset(char *pool) */ /* - * Add capacity to a storage pool. + * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) @@ -1557,8 +2701,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) int c, error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; - nvlist_t **spares; - uint_t i, nspares; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; txg = spa_vdev_enter(spa); @@ -1566,35 +2710,29 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, error)); - spa->spa_pending_vdev = vd; + spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) != 0) + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, + &nspares) != 0) nspares = 0; - if (vd->vdev_children == 0 && nspares == 0) { - spa->spa_pending_vdev = NULL; + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, + &nl2cache) != 0) + nl2cache = 0; + + if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); - } - if (vd->vdev_children != 0) { - if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { - spa->spa_pending_vdev = NULL; - return (spa_vdev_exit(spa, vd, txg, error)); - } - } + if (vd->vdev_children != 0 && + (error = vdev_create(vd, txg, B_FALSE)) != 0) + return (spa_vdev_exit(spa, vd, txg, error)); /* - * We must validate the spares after checking the children. Otherwise, - * vdev_inuse() will blindly overwrite the spare. + * We must validate the spares and l2cache devices after checking the + * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 
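/*
 * Every device-manipulation entry point below is bracketed by
 * spa_vdev_enter()/spa_vdev_exit(), which is what lets the rewritten
 * spa_vdev_add() drop its manual spa_pending_vdev cleanup.  Template
 * sketch of the idiom (the function itself is hypothetical):
 */
static int
vdev_op_template(spa_t *spa)
{
	uint64_t txg = spa_vdev_enter(spa);	/* config lock + open txg */
	int error = 0;

	/* ... mutate the vdev tree, setting error on failure ... */

	return (spa_vdev_exit(spa, NULL, txg, error));
}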
*/ - if ((error = spa_validate_spares(spa, nvroot, txg, - VDEV_ALLOC_ADD)) != 0) { - spa->spa_pending_vdev = NULL; + if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); - } - - spa->spa_pending_vdev = NULL; /* * Transfer each new top-level vdev from vd to rvd. @@ -1608,43 +2746,17 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } if (nspares != 0) { - if (spa->spa_sparelist != NULL) { - nvlist_t **oldspares; - uint_t oldnspares; - nvlist_t **newspares; - - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); - - newspares = kmem_alloc(sizeof (void *) * - (nspares + oldnspares), KM_SLEEP); - for (i = 0; i < oldnspares; i++) - VERIFY(nvlist_dup(oldspares[i], - &newspares[i], KM_SLEEP) == 0); - for (i = 0; i < nspares; i++) - VERIFY(nvlist_dup(spares[i], - &newspares[i + oldnspares], - KM_SLEEP) == 0); - - VERIFY(nvlist_remove(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); - - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, newspares, - nspares + oldnspares) == 0); - for (i = 0; i < oldnspares + nspares; i++) - nvlist_free(newspares[i]); - kmem_free(newspares, (oldnspares + nspares) * - sizeof (void *)); - } else { - VERIFY(nvlist_alloc(&spa->spa_sparelist, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - } - + spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, + ZPOOL_CONFIG_SPARES); spa_load_spares(spa); - spa->spa_sync_spares = B_TRUE; + spa->spa_spares.sav_sync = B_TRUE; + } + + if (nl2cache != 0) { + spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, + ZPOOL_CONFIG_L2CACHE); + spa_load_l2cache(spa); + spa->spa_l2cache.sav_sync = B_TRUE; } /* @@ -1676,7 +2788,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own - * mirror using the 'replacing' vdev, which is functionally idendical to + * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) @@ -1686,14 +2798,17 @@ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { uint64_t txg, open_txg; - int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; + dmu_tx_t *tx; + char *oldvdpath, *newvdpath; + int newvd_isspare; + int error; txg = spa_vdev_enter(spa); - oldvd = vdev_lookup_by_guid(rvd, guid); + oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -1704,7 +2819,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, - VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) + VDEV_ALLOC_ADD)) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; @@ -1715,6 +2833,12 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, 
newrootvd, txg, error)); + /* + * Spares can't replace logs + */ + if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root @@ -1828,6 +2952,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if (newvd->vdev_isspare) spa_spare_activate(newvd); + oldvdpath = spa_strdup(oldvd->vdev_path); + newvdpath = spa_strdup(newvd->vdev_path); + newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. @@ -1836,10 +2963,25 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + if (dmu_tx_assign(tx, TXG_WAIT) == 0) { + spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, + CRED(), "%s vdev=%s %s vdev=%s", + replacing && newvd_isspare ? "spare in" : + replacing ? "replace" : "attach", newvdpath, + replacing ? "for" : "to", oldvdpath); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + + spa_strfree(oldvdpath); + spa_strfree(newvdpath); + /* * Kick off a resilver to update newvd. */ - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); return (0); } @@ -1858,10 +3000,11 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid; + size_t len; txg = spa_vdev_enter(spa); - vd = vdev_lookup_by_guid(rvd, guid); + vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -1886,7 +3029,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) } ASSERT(pvd->vdev_ops != &vdev_spare_ops || - spa_version(spa) >= ZFS_VERSION_SPARES); + spa_version(spa) >= SPA_VERSION_SPARES); /* * Only mirror, replacing, and spare vdevs support detach. @@ -1925,13 +3068,26 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) break; } + if (c == pvd->vdev_children) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + /* - * If we are a replacing or spare vdev, then we can always detach the - * latter child, as that is how one cancels the operation. + * If we are detaching the second disk from a replacing vdev, then + * check to see if we changed the original vdev's path to have "/old" + * at the end in spa_vdev_attach(). If so, undo that change now. */ - if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && - c == pvd->vdev_children) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && + pvd->vdev_child[0]->vdev_path != NULL && + pvd->vdev_child[1]->vdev_path != NULL) { + ASSERT(pvd->vdev_child[1] == vd); + cvd = pvd->vdev_child[0]; + len = strlen(vd->vdev_path); + if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && + strcmp(cvd->vdev_path + len, "/old") == 0) { + spa_strfree(cvd->vdev_path); + cvd->vdev_path = spa_strdup(vd->vdev_path); + } + } /* * If we are detaching the original disk from a spare, then it implies @@ -1992,7 +3148,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) /* * Reevaluate the parent vdev state. 
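/*
 * The "/old" cleanup above is a plain suffix test: the remaining
 * child's path must equal the detached vdev's path plus "/old".
 * Sketch (hypothetical helper; plain string.h semantics):
 */
static int
path_is_old_twin(const char *cvd_path, const char *vd_path)
{
	size_t len = strlen(vd_path);

	return (strncmp(cvd_path, vd_path, len) == 0 &&
	    strcmp(cvd_path + len, "/old") == 0);
}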
*/ - vdev_propagate_state(cvd->vdev_parent); + vdev_propagate_state(cvd); /* * If the device we just detached was smaller than the others, it may be @@ -2015,6 +3171,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); + spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + error = spa_vdev_exit(spa, vd, txg, 0); /* @@ -2028,8 +3186,11 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) while ((spa = spa_next(spa)) != NULL) { if (spa->spa_state != POOL_STATE_ACTIVE) continue; - + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); } @@ -2037,100 +3198,125 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) return (error); } +static nvlist_t * +spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) +{ + for (int i = 0; i < count; i++) { + uint64_t guid; + + VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, + &guid) == 0); + + if (guid == target_guid) + return (nvpp[i]); + } + + return (NULL); +} + +static void +spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, + nvlist_t *dev_to_remove) +{ + nvlist_t **newdev = NULL; + + if (count > 1) + newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); + + for (int i = 0, j = 0; i < count; i++) { + if (dev[i] == dev_to_remove) + continue; + VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); + } + + VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); + VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); + + for (int i = 0; i < count - 1; i++) + nvlist_free(newdev[i]); + + if (count > 1) + kmem_free(newdev, (count - 1) * sizeof (void *)); +} + /* * Remove a device from the pool. Currently, this supports removing only hot - * spares. + * spares and level 2 ARC devices. */ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; - nvlist_t **spares, *nv, **newspares; - uint_t i, j, nspares; - int ret = 0; - - spa_config_enter(spa, RW_WRITER, FTAG); + nvlist_t **spares, **l2cache, *nv; + uint_t nspares, nl2cache; + uint64_t txg; + int error = 0; - vd = spa_lookup_by_guid(spa, guid); + txg = spa_vdev_enter(spa); - nv = NULL; - if (spa->spa_spares != NULL && - nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - for (i = 0; i < nspares; i++) { - uint64_t theguid; + vd = spa_lookup_by_guid(spa, guid, B_FALSE); - VERIFY(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &theguid) == 0); - if (theguid == guid) { - nv = spares[i]; - break; - } + if (spa->spa_spares.sav_vdevs != NULL && + nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && + (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { + /* + * Only remove the hot spare if it's not currently in use + * in this pool. + */ + if (vd == NULL || unspare) { + spa_vdev_remove_aux(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares, nv); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } else { + error = EBUSY; } - } - - /* - * We only support removing a hot spare, and only if it's not currently - * in use in this pool. 
- */ - if (nv == NULL && vd == NULL) { - ret = ENOENT; - goto out; - } - - if (nv == NULL && vd != NULL) { - ret = ENOTSUP; - goto out; - } - - if (!unspare && nv != NULL && vd != NULL) { - ret = EBUSY; - goto out; - } - - if (nspares == 1) { - newspares = NULL; + } else if (spa->spa_l2cache.sav_vdevs != NULL && + nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && + (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { + /* + * Cache devices can always be removed. + */ + spa_vdev_remove_aux(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); + spa_load_l2cache(spa); + spa->spa_l2cache.sav_sync = B_TRUE; + } else if (vd != NULL) { + /* + * Normal vdevs cannot be removed (yet). + */ + error = ENOTSUP; } else { - newspares = kmem_alloc((nspares - 1) * sizeof (void *), - KM_SLEEP); - for (i = 0, j = 0; i < nspares; i++) { - if (spares[i] != nv) - VERIFY(nvlist_dup(spares[i], - &newspares[j++], KM_SLEEP) == 0); - } + /* + * There is no vdev of any kind with the specified guid. + */ + error = ENOENT; } - VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - DATA_TYPE_NVLIST_ARRAY) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - newspares, nspares - 1) == 0); - for (i = 0; i < nspares - 1; i++) - nvlist_free(newspares[i]); - kmem_free(newspares, (nspares - 1) * sizeof (void *)); - spa_load_spares(spa); - spa->spa_sync_spares = B_TRUE; - -out: - spa_config_exit(spa, FTAG); - - return (ret); + return (spa_vdev_exit(spa, NULL, txg, error)); } /* - * Find any device that's done replacing, so we can detach it. + * Find any device that's done replacing, or a vdev marked 'unspare' that's + * currently spared, so we can detach it. */ static vdev_t * -spa_vdev_replace_done_hunt(vdev_t *vd) +spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; int c; for (c = 0; c < vd->vdev_children; c++) { - oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); + oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } + /* + * Check for a completed replacement. + */ if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { oldvd = vd->vdev_child[0]; newvd = vd->vdev_child[1]; @@ -2144,20 +3330,38 @@ spa_vdev_replace_done_hunt(vdev_t *vd) mutex_exit(&newvd->vdev_dtl_lock); } + /* + * Check for a completed resilver with the 'unspare' flag set.
+ */ + if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { + newvd = vd->vdev_child[0]; + oldvd = vd->vdev_child[1]; + + mutex_enter(&newvd->vdev_dtl_lock); + if (newvd->vdev_unspare && + newvd->vdev_dtl_map.sm_space == 0 && + newvd->vdev_dtl_scrub.sm_space == 0) { + newvd->vdev_unspare = 0; + mutex_exit(&newvd->vdev_dtl_lock); + return (oldvd); + } + mutex_exit(&newvd->vdev_dtl_lock); + } + return (NULL); } static void -spa_vdev_replace_done(spa_t *spa) +spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd; vdev_t *pvd; uint64_t guid; uint64_t pguid = 0; - spa_config_enter(spa, RW_READER, FTAG); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { + while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { guid = vd->vdev_guid; /* * If we have just finished replacing a hot spared device, then @@ -2171,15 +3375,15 @@ spa_vdev_replace_done(spa_t *spa) ASSERT(pvd->vdev_parent->vdev_children == 2); pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; } - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_CONFIG, FTAG); if (spa_vdev_detach(spa, guid, B_TRUE) != 0) return; if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) return; - spa_config_enter(spa, RW_READER, FTAG); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); } - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_CONFIG, FTAG); } /* @@ -2189,42 +3393,40 @@ spa_vdev_replace_done(spa_t *spa) int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) { - vdev_t *rvd, *vd; + vdev_t *vd; uint64_t txg; - rvd = spa->spa_root_vdev; - txg = spa_vdev_enter(spa); - if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) { /* - * Determine if this is a reference to a hot spare. In that - * case, update the path as stored in the spare list. + * Determine if this is a reference to a hot spare device. If + * it is, update the path manually as there is no associated + * vdev_t that can be synced to disk. 
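/*
 * Both "done" tests in spa_vdev_resilver_done_hunt() above come down
 * to the new child's dirty-time-log maps being empty, sampled under
 * vdev_dtl_lock.  Sketch (hypothetical helper):
 */
static boolean_t
resilver_complete(const space_map_t *dtl_map, const space_map_t *dtl_scrub)
{
	return (dtl_map->sm_space == 0 && dtl_scrub->sm_space == 0);
}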
*/ nvlist_t **spares; uint_t i, nspares; - if (spa->spa_sparelist != NULL) { - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + + if (spa->spa_spares.sav_config != NULL) { + VERIFY(nvlist_lookup_nvlist_array( + spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0); for (i = 0; i < nspares; i++) { uint64_t theguid; VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &theguid) == 0); - if (theguid == guid) - break; + if (theguid == guid) { + VERIFY(nvlist_add_string(spares[i], + ZPOOL_CONFIG_PATH, newpath) == 0); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + return (spa_vdev_exit(spa, NULL, txg, + 0)); + } } - - if (i == nspares) - return (spa_vdev_exit(spa, NULL, txg, ENOENT)); - - VERIFY(nvlist_add_string(spares[i], - ZPOOL_CONFIG_PATH, newpath) == 0); - spa_load_spares(spa); - spa->spa_sync_spares = B_TRUE; - return (spa_vdev_exit(spa, NULL, txg, 0)); - } else { - return (spa_vdev_exit(spa, NULL, txg, ENOENT)); } + + return (spa_vdev_exit(spa, NULL, txg, ENOENT)); } if (!vd->vdev_ops->vdev_op_leaf) @@ -2244,397 +3446,36 @@ spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) * ========================================================================== */ -static void -spa_scrub_io_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - - zio_data_buf_free(zio->io_data, zio->io_size); - - mutex_enter(&spa->spa_scrub_lock); - if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; - spa->spa_scrub_errors++; - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_errors++; - mutex_exit(&vd->vdev_stat_lock); - } - - if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) - cv_broadcast(&spa->spa_scrub_io_cv); - - ASSERT(spa->spa_scrub_inflight >= 0); - - mutex_exit(&spa->spa_scrub_lock); -} - -static void -spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, - zbookmark_t *zb) -{ - size_t size = BP_GET_LSIZE(bp); - void *data; - - mutex_enter(&spa->spa_scrub_lock); - /* - * Do not give too much work to vdev(s). - */ - while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - } - spa->spa_scrub_inflight++; - mutex_exit(&spa->spa_scrub_lock); - - data = zio_data_buf_alloc(size); - - if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) - flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ - - flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; - - zio_nowait(zio_read(NULL, spa, bp, data, size, - spa_scrub_io_done, NULL, priority, flags, zb)); -} - -/* ARGSUSED */ -static int -spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) -{ - blkptr_t *bp = &bc->bc_blkptr; - vdev_t *vd = spa->spa_root_vdev; - dva_t *dva = bp->blk_dva; - int needs_resilver = B_FALSE; - int d; - - if (bc->bc_errno) { - /* - * We can't scrub this block, but we can continue to scrub - * the rest of the pool. Note the error and move along. 
- */ - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_errors++; - mutex_exit(&spa->spa_scrub_lock); - - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_errors++; - mutex_exit(&vd->vdev_stat_lock); - - return (ERESTART); - } - - ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); - - for (d = 0; d < BP_GET_NDVAS(bp); d++) { - vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); - - ASSERT(vd != NULL); - - /* - * Keep track of how much data we've examined so that - * zpool(1M) status can make useful progress reports. - */ - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); - mutex_exit(&vd->vdev_stat_lock); - - if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { - if (DVA_GET_GANG(&dva[d])) { - /* - * Gang members may be spread across multiple - * vdevs, so the best we can do is look at the - * pool-wide DTL. - * XXX -- it would be better to change our - * allocation policy to ensure that this can't - * happen. - */ - vd = spa->spa_root_vdev; - } - if (vdev_dtl_contains(&vd->vdev_dtl_map, - bp->blk_birth, 1)) - needs_resilver = B_TRUE; - } - } - - if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) - spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, - ZIO_FLAG_SCRUB, &bc->bc_bookmark); - else if (needs_resilver) - spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, - ZIO_FLAG_RESILVER, &bc->bc_bookmark); - - return (0); -} - -static void -spa_scrub_thread(void *arg) -{ - spa_t *spa = arg; - callb_cpr_t cprinfo; - traverse_handle_t *th = spa->spa_scrub_th; - vdev_t *rvd = spa->spa_root_vdev; - pool_scrub_type_t scrub_type = spa->spa_scrub_type; - int error = 0; - boolean_t complete; - - CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); - - /* - * If we're restarting due to a snapshot create/delete, - * wait for that to complete. - */ - txg_wait_synced(spa_get_dsl(spa), 0); - - dprintf("start %s mintxg=%llu maxtxg=%llu\n", - scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", - spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); - - spa_config_enter(spa, RW_WRITER, FTAG); - vdev_reopen(rvd); /* purge all vdev caches */ - vdev_config_dirty(rvd); /* rewrite all disk labels */ - vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); - spa_config_exit(spa, FTAG); - - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_errors = 0; - spa->spa_scrub_active = 1; - ASSERT(spa->spa_scrub_inflight == 0); - - while (!spa->spa_scrub_stop) { - CALLB_CPR_SAFE_BEGIN(&cprinfo); - while (spa->spa_scrub_suspended) { - spa->spa_scrub_active = 0; - cv_broadcast(&spa->spa_scrub_cv); - cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); - spa->spa_scrub_active = 1; - } - CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); - - if (spa->spa_scrub_restart_txg != 0) - break; - - mutex_exit(&spa->spa_scrub_lock); - error = traverse_more(th); - mutex_enter(&spa->spa_scrub_lock); - if (error != EAGAIN) - break; - } - - while (spa->spa_scrub_inflight) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - - spa->spa_scrub_active = 0; - cv_broadcast(&spa->spa_scrub_cv); - - mutex_exit(&spa->spa_scrub_lock); - - spa_config_enter(spa, RW_WRITER, FTAG); - - mutex_enter(&spa->spa_scrub_lock); - - /* - * Note: we check spa_scrub_restart_txg under both spa_scrub_lock - * AND the spa config lock to synchronize with any config changes - * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 
- */ - if (spa->spa_scrub_restart_txg != 0) - error = ERESTART; - - if (spa->spa_scrub_stop) - error = EINTR; - - /* - * Even if there were uncorrectable errors, we consider the scrub - * completed. The downside is that if there is a transient error during - * a resilver, we won't resilver the data properly to the target. But - * if the damage is permanent (more likely) we will resilver forever, - * which isn't really acceptable. Since there is enough information for - * the user to know what has failed and why, this seems like a more - * tractable approach. - */ - complete = (error == 0); - - dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", - scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", - spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", - error, spa->spa_scrub_errors, spa->spa_scrub_stop); - - mutex_exit(&spa->spa_scrub_lock); - - /* - * If the scrub/resilver completed, update all DTLs to reflect this. - * Whether it succeeded or not, vacate all temporary scrub DTLs. - */ - vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, - complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); - vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); - spa_errlog_rotate(spa); - - spa_config_exit(spa, FTAG); - - mutex_enter(&spa->spa_scrub_lock); - - /* - * We may have finished replacing a device. - * Let the async thread assess this and handle the detach. - */ - spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); - - /* - * If we were told to restart, our final act is to start a new scrub. - */ - if (error == ERESTART) - spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? - SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); - - spa->spa_scrub_type = POOL_SCRUB_NONE; - spa->spa_scrub_active = 0; - spa->spa_scrub_thread = NULL; - cv_broadcast(&spa->spa_scrub_cv); - CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ - thread_exit(); -} - -void -spa_scrub_suspend(spa_t *spa) -{ - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_suspended++; - while (spa->spa_scrub_active) { - cv_broadcast(&spa->spa_scrub_cv); - cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); - } - while (spa->spa_scrub_inflight) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - mutex_exit(&spa->spa_scrub_lock); -} - -void -spa_scrub_resume(spa_t *spa) -{ - mutex_enter(&spa->spa_scrub_lock); - ASSERT(spa->spa_scrub_suspended != 0); - if (--spa->spa_scrub_suspended == 0) - cv_broadcast(&spa->spa_scrub_cv); - mutex_exit(&spa->spa_scrub_lock); -} - -void -spa_scrub_restart(spa_t *spa, uint64_t txg) -{ - /* - * Something happened (e.g. snapshot create/delete) that means - * we must restart any in-progress scrubs. The itinerary will - * fix this properly. - */ - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_restart_txg = txg; - mutex_exit(&spa->spa_scrub_lock); -} - int -spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) +spa_scrub(spa_t *spa, pool_scrub_type_t type) { - space_seg_t *ss; - uint64_t mintxg, maxtxg; - vdev_t *rvd = spa->spa_root_vdev; + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if ((uint_t)type >= POOL_SCRUB_TYPES) return (ENOTSUP); - mutex_enter(&spa->spa_scrub_lock); - /* - * If there's a scrub or resilver already in progress, stop it. + * If a resilver was requested, but there is no DTL on a + * writeable leaf device, we have nothing to do. */ - while (spa->spa_scrub_thread != NULL) { - /* - * Don't stop a resilver unless forced. 
- */ - if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { - mutex_exit(&spa->spa_scrub_lock); - return (EBUSY); - } - spa->spa_scrub_stop = 1; - cv_broadcast(&spa->spa_scrub_cv); - cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); - } - - /* - * Terminate the previous traverse. - */ - if (spa->spa_scrub_th != NULL) { - traverse_fini(spa->spa_scrub_th); - spa->spa_scrub_th = NULL; - } - - if (rvd == NULL) { - ASSERT(spa->spa_scrub_stop == 0); - ASSERT(spa->spa_scrub_type == type); - ASSERT(spa->spa_scrub_restart_txg == 0); - mutex_exit(&spa->spa_scrub_lock); + if (type == POOL_SCRUB_RESILVER && + !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } - mintxg = TXG_INITIAL - 1; - maxtxg = spa_last_synced_txg(spa) + 1; - - mutex_enter(&rvd->vdev_dtl_lock); + if (type == POOL_SCRUB_EVERYTHING && + spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && + spa->spa_dsl_pool->dp_scrub_isresilver) + return (EBUSY); - if (rvd->vdev_dtl_map.sm_space == 0) { - /* - * The pool-wide DTL is empty. - * If this is a resilver, there's nothing to do except - * check whether any in-progress replacements have completed. - */ - if (type == POOL_SCRUB_RESILVER) { - type = POOL_SCRUB_NONE; - spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); - } + if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { + return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); + } else if (type == POOL_SCRUB_NONE) { + return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); } else { - /* - * The pool-wide DTL is non-empty. - * If this is a normal scrub, upgrade to a resilver instead. - */ - if (type == POOL_SCRUB_EVERYTHING) - type = POOL_SCRUB_RESILVER; - } - - if (type == POOL_SCRUB_RESILVER) { - /* - * Determine the resilvering boundaries. - * - * Note: (mintxg, maxtxg) is an open interval, - * i.e. mintxg and maxtxg themselves are not included. - * - * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 - * so we don't claim to resilver a txg that's still changing. 
- */ - ss = avl_first(&rvd->vdev_dtl_map.sm_root); - mintxg = ss->ss_start - 1; - ss = avl_last(&rvd->vdev_dtl_map.sm_root); - maxtxg = MIN(ss->ss_end, maxtxg); - } - - mutex_exit(&rvd->vdev_dtl_lock); - - spa->spa_scrub_stop = 0; - spa->spa_scrub_type = type; - spa->spa_scrub_restart_txg = 0; - - if (type != POOL_SCRUB_NONE) { - spa->spa_scrub_mintxg = mintxg; - spa->spa_scrub_maxtxg = maxtxg; - spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, - ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, - ZIO_FLAG_CANFAIL); - traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); - spa->spa_scrub_thread = thread_create(NULL, 0, - spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); + return (EINVAL); } - - mutex_exit(&spa->spa_scrub_lock); - - return (0); } /* @@ -2644,23 +3485,29 @@ spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) */ static void -spa_async_reopen(spa_t *spa) +spa_async_remove(spa_t *spa, vdev_t *vd) { - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *tvd; - int c; + if (vd->vdev_remove_wanted) { + vd->vdev_remove_wanted = 0; + vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); + vdev_clear(spa, vd); + vdev_state_dirty(vd->vdev_top); + } - spa_config_enter(spa, RW_WRITER, FTAG); + for (int c = 0; c < vd->vdev_children; c++) + spa_async_remove(spa, vd->vdev_child[c]); +} - for (c = 0; c < rvd->vdev_children; c++) { - tvd = rvd->vdev_child[c]; - if (tvd->vdev_reopen_wanted) { - tvd->vdev_reopen_wanted = 0; - vdev_reopen(tvd); - } +static void +spa_async_probe(spa_t *spa, vdev_t *vd) +{ + if (vd->vdev_probe_wanted) { + vd->vdev_probe_wanted = 0; + vdev_reopen(vd); /* vdev_open() does the actual probe */ } - spa_config_exit(spa, FTAG); + for (int c = 0; c < vd->vdev_children; c++) + spa_async_probe(spa, vd->vdev_child[c]); } static void @@ -2686,28 +3533,38 @@ spa_async_thread(void *arg) } /* - * See if any devices need to be reopened. + * See if any devices need to be marked REMOVED. */ - if (tasks & SPA_ASYNC_REOPEN) - spa_async_reopen(spa); + if (tasks & SPA_ASYNC_REMOVE) { + spa_vdev_state_enter(spa); + spa_async_remove(spa, spa->spa_root_vdev); + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) + spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); + for (int i = 0; i < spa->spa_spares.sav_count; i++) + spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); + (void) spa_vdev_state_exit(spa, NULL, 0); + } /* - * If any devices are done replacing, detach them. + * See if any devices need to be probed. */ - if (tasks & SPA_ASYNC_REPLACE_DONE) - spa_vdev_replace_done(spa); + if (tasks & SPA_ASYNC_PROBE) { + spa_vdev_state_enter(spa); + spa_async_probe(spa, spa->spa_root_vdev); + (void) spa_vdev_state_exit(spa, NULL, 0); + } /* - * Kick off a scrub. + * If any devices are done replacing, detach them. */ - if (tasks & SPA_ASYNC_SCRUB) - VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); + if (tasks & SPA_ASYNC_RESILVER_DONE) + spa_vdev_resilver_done(spa); /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); /* * Let the world know that we're done. 
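Aside: the hunks above replace the old reopen-based recovery with a single task bitmask consumed by spa_async_thread(): producers set a bit via spa_async_request() and the thread services whatever bits it finds set (REMOVE, PROBE, RESILVER_DONE, RESILVER). A minimal sketch of the producer side — illustrative only, not part of the patch, and assuming the spa_async_tasks field and spa_async_lock used elsewhere in this file:

	/*
	 * Sketch: request deferred work by setting a task bit under
	 * spa_async_lock. Duplicate requests coalesce into one bit, which
	 * is why the consumers above re-walk the whole vdev tree per task
	 * instead of recording which vdev triggered the request.
	 */
	void
	example_async_request(spa_t *spa, int task)
	{
		mutex_enter(&spa->spa_async_lock);
		spa->spa_async_tasks |= task;
		mutex_exit(&spa->spa_async_lock);
	}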
@@ -2775,10 +3632,13 @@ spa_sync_deferred_frees(spa_t *spa, uint64_t txg) int error; uint8_t c = 1; - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - while (bplist_iterate(bpl, &itor, &blk) == 0) - zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); + while (bplist_iterate(bpl, &itor, &blk) == 0) { + ASSERT(blk.blk_birth < txg); + zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED)); + } error = zio_wait(zio); ASSERT3U(error, ==, 0); @@ -2798,19 +3658,27 @@ static void spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) { char *packed = NULL; + size_t bufsize; size_t nvsize = 0; dmu_buf_t *db; VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); - packed = kmem_alloc(nvsize, KM_SLEEP); + /* + * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration + * information. This avoids the dbuf_will_dirty() path and + * saves us a pre-read to get data we don't actually care about. + */ + bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); + packed = kmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); + bzero(packed + nvsize, bufsize - nvsize); - dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); + dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); - kmem_free(packed, nvsize); + kmem_free(packed, bufsize); VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); @@ -2819,50 +3687,49 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) } static void -spa_sync_spares(spa_t *spa, dmu_tx_t *tx) +spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, + const char *config, const char *entry) { nvlist_t *nvroot; - nvlist_t **spares; + nvlist_t **list; int i; - if (!spa->spa_sync_spares) + if (!sav->sav_sync) return; /* - * Update the MOS nvlist describing the list of available spares. - * spa_validate_spares() will have already made sure this nvlist is - * valid and the vdevs are labelled appropriately. + * Update the MOS nvlist describing the list of available devices. + * spa_validate_aux() will have already made sure this nvlist is + * valid and the vdevs are labeled appropriately. 
*/ - if (spa->spa_spares_object == 0) { - spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_PACKED_NVLIST, 1 << 14, - DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); + if (sav->sav_object == 0) { + sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, + sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, - sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); + DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, + &sav->sav_object, tx) == 0); } VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (spa->spa_nspares == 0) { - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - NULL, 0) == 0); + if (sav->sav_count == 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); } else { - spares = kmem_alloc(spa->spa_nspares * sizeof (void *), - KM_SLEEP); - for (i = 0; i < spa->spa_nspares; i++) - spares[i] = vdev_config_generate(spa, - spa->spa_spares[i], B_FALSE, B_TRUE); - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - spares, spa->spa_nspares) == 0); - for (i = 0; i < spa->spa_nspares; i++) - nvlist_free(spares[i]); - kmem_free(spares, spa->spa_nspares * sizeof (void *)); + list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); + for (i = 0; i < sav->sav_count; i++) + list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], + B_FALSE, B_FALSE, B_TRUE); + VERIFY(nvlist_add_nvlist_array(nvroot, config, list, + sav->sav_count) == 0); + for (i = 0; i < sav->sav_count; i++) + nvlist_free(list[i]); + kmem_free(list, sav->sav_count * sizeof (void *)); } - spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); + spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); - spa->spa_sync_spares = B_FALSE; + sav->sav_sync = B_FALSE; } static void @@ -2870,10 +3737,15 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; - if (list_is_empty(&spa->spa_dirty_list)) + if (list_is_empty(&spa->spa_config_dirty_list)) return; - config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + config = spa_config_generate(spa, spa->spa_root_vdev, + dmu_tx_get_txg(tx), B_FALSE); + + spa_config_exit(spa, SCL_STATE, FTAG); if (spa->spa_config_syncing) nvlist_free(spa->spa_config_syncing); @@ -2882,41 +3754,140 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } +/* + * Set zpool properties. + */ static void -spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) +spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { spa_t *spa = arg1; - nvlist_t *nvp = arg2; - nvpair_t *nvpair; objset_t *mos = spa->spa_meta_objset; - uint64_t zapobj; + nvlist_t *nvp = arg2; + nvpair_t *elem; + uint64_t intval; + char *strval; + zpool_prop_t prop; + const char *propname; + zprop_type_t proptype; + spa_config_dirent_t *dp; mutex_enter(&spa->spa_props_lock); - if (spa->spa_pool_props_object == 0) { - zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); - VERIFY(zapobj > 0); - spa->spa_pool_props_object = zapobj; + elem = NULL; + while ((elem = nvlist_next_nvpair(nvp, elem))) { + switch (prop = zpool_name_to_prop(nvpair_name(elem))) { + case ZPOOL_PROP_VERSION: + /* + * Only set version for non-zpool-creation cases + * (set/import). spa_create() needs special care + * for version setting. 
+ */ + if (tx->tx_txg != TXG_INITIAL) { + VERIFY(nvpair_value_uint64(elem, + &intval) == 0); + ASSERT(intval <= SPA_VERSION); + ASSERT(intval >= spa_version(spa)); + spa->spa_uberblock.ub_version = intval; + vdev_config_dirty(spa->spa_root_vdev); + } + break; - case ZPOOL_PROP_ALTROOT: + /* + * 'altroot' is a non-persistent property. It should + * have been set temporarily at creation or import time. + */ + ASSERT(spa->spa_root != NULL); + break; - nvpair = NULL; - while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { - switch (zpool_name_to_prop(nvpair_name(nvpair))) { - case ZFS_PROP_BOOTFS: - VERIFY(nvlist_lookup_uint64(nvp, - nvpair_name(nvpair), &spa->spa_bootfs) == 0); - VERIFY(zap_update(mos, - spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, - &spa->spa_bootfs, tx) == 0); + case ZPOOL_PROP_CACHEFILE: + /* + * 'cachefile' is a non-persistent property, but note + * an async request that the config cache needs to be + * updated. + */ + VERIFY(nvpair_value_string(elem, &strval) == 0); + + dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); + + if (strval[0] == '\0') + dp->scd_path = spa_strdup(spa_config_path); + else if (strcmp(strval, "none") == 0) + dp->scd_path = NULL; + else + dp->scd_path = spa_strdup(strval); + + list_insert_head(&spa->spa_config_list, dp); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); break; + default: + /* + * Set pool property values in the poolprops mos object. + */ + if (spa->spa_pool_props_object == 0) { + objset_t *mos = spa->spa_meta_objset; + + VERIFY((spa->spa_pool_props_object = + zap_create(mos, DMU_OT_POOL_PROPS, + DMU_OT_NONE, 0, tx)) > 0); + + VERIFY(zap_update(mos, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, + 8, 1, &spa->spa_pool_props_object, tx) + == 0); + } + + /* normalize the property name */ + propname = zpool_prop_to_name(prop); + proptype = zpool_prop_get_type(prop); + + if (nvpair_type(elem) == DATA_TYPE_STRING) { + ASSERT(proptype == PROP_TYPE_STRING); + VERIFY(nvpair_value_string(elem, &strval) == 0); + VERIFY(zap_update(mos, + spa->spa_pool_props_object, propname, + 1, strlen(strval) + 1, strval, tx) == 0); + + } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { + VERIFY(nvpair_value_uint64(elem, &intval) == 0); + + if (proptype == PROP_TYPE_INDEX) { + const char *unused; + VERIFY(zpool_prop_index_to_string( + prop, intval, &unused) == 0); + } + VERIFY(zap_update(mos, + spa->spa_pool_props_object, propname, + 8, 1, &intval, tx) == 0); + } else { + ASSERT(0); /* not allowed */ + } + + switch (prop) { + case ZPOOL_PROP_DELEGATION: + spa->spa_delegation = intval; + break; + case ZPOOL_PROP_BOOTFS: + spa->spa_bootfs = intval; + break; + case ZPOOL_PROP_FAILUREMODE: + spa->spa_failmode = intval; + break; + default: + break; + } + } + + /* log internal history if this is not a zpool create */ + if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && + tx->tx_txg != TXG_INITIAL) { + spa_history_internal_log(LOG_POOL_PROPSET, + spa, tx, cr, "%s %lld %s", + nvpair_name(elem), intval, spa_name(spa)); } } + + mutex_exit(&spa->spa_props_lock); } /* @@ -2933,25 +3904,37 @@ spa_sync(spa_t *spa, uint64_t txg) vdev_t *vd; dmu_tx_t *tx; int dirty_vdevs; + int error; /* * Lock out configuration changes. 
*/ - spa_config_enter(spa, RW_READER, FTAG); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; + /* + * If there are any pending vdev state changes, convert them + * into config changes that go out with this transaction group. + */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { + vdev_state_clean(vd); + vdev_config_dirty(vd); + } + spa_config_exit(spa, SCL_STATE, FTAG); + VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); tx = dmu_tx_create_assigned(dp, txg); /* - * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, + * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. */ - if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && - spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { + if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && + spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { int i; for (i = 0; i < rvd->vdev_children; i++) { @@ -2967,6 +3950,19 @@ spa_sync(spa_t *spa, uint64_t txg) } } + if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && + spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { + dsl_pool_create_origin(dp, tx); + + /* Keeping the origin open increases spa_minref */ + spa->spa_minref += 3; + } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && + spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { + dsl_pool_upgrade_clones(dp, tx); + } + /* * If anything has changed in this txg, push the deferred frees * from the previous txg. If not, leave them alone so that we @@ -2984,7 +3980,10 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_sync_pass++; spa_sync_config_object(spa, tx); - spa_sync_spares(spa, tx); + spa_sync_aux_dev(spa, &spa->spa_spares, tx, + ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); + spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, + ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); @@ -3005,35 +4004,52 @@ spa_sync(spa_t *spa, uint64_t txg) * Rewrite the vdev configuration (which includes the uberblock) * to commit the transaction group. * - * If there are any dirty vdevs, sync the uberblock to all vdevs. - * Otherwise, pick a random top-level vdev that's known to be - * visible in the config cache (see spa_vdev_add() for details). - * If the write fails, try the next vdev until we're tried them all. + * If there are no dirty vdevs, we sync the uberblock to a few + * random top-level vdevs that are known to be visible in the + * config cache (see spa_vdev_add() for a complete description). + * If there *are* dirty vdevs, sync the uberblock to all vdevs. */ - if (!list_is_empty(&spa->spa_dirty_list)) { - VERIFY(vdev_config_sync(rvd, txg) == 0); - } else { - int children = rvd->vdev_children; - int c0 = spa_get_random(children); - int c; - - for (c = 0; c < children; c++) { - vd = rvd->vdev_child[(c0 + c) % children]; - if (vd->vdev_ms_array == 0) - continue; - if (vdev_config_sync(vd, txg) == 0) - break; + for (;;) { + /* + * We hold SCL_STATE to prevent vdev open/close/etc. + * while we're attempting to write the vdev labels. 
+ */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + if (list_is_empty(&spa->spa_config_dirty_list)) { + vdev_t *svd[SPA_DVAS_PER_BP]; + int svdcount = 0; + int children = rvd->vdev_children; + int c0 = spa_get_random(children); + int c; + + for (c = 0; c < children; c++) { + vd = rvd->vdev_child[(c0 + c) % children]; + if (vd->vdev_ms_array == 0 || vd->vdev_islog) + continue; + svd[svdcount++] = vd; + if (svdcount == SPA_DVAS_PER_BP) + break; + } + error = vdev_config_sync(svd, svdcount, txg); + } else { + error = vdev_config_sync(rvd->vdev_child, + rvd->vdev_children, txg); } - if (c == children) - VERIFY(vdev_config_sync(rvd, txg) == 0); - } + spa_config_exit(spa, SCL_STATE, FTAG); + + if (error == 0) + break; + zio_suspend(spa, NULL); + zio_resume_wait(spa); + } dmu_tx_commit(tx); /* * Clear the dirty config list. */ - while ((vd = list_head(&spa->spa_dirty_list)) != NULL) + while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) vdev_config_clean(vd); /* @@ -3046,21 +4062,12 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_config_syncing = NULL; } - /* - * Make a stable copy of the fully synced uberblock. - * We use this as the root for pool traversals. - */ - spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ - - spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ - + spa->spa_traverse_wanted = B_TRUE; rw_enter(&spa->spa_traverse_lock, RW_WRITER); - spa->spa_traverse_wanted = 0; + spa->spa_traverse_wanted = B_FALSE; spa->spa_ubsync = spa->spa_uberblock; rw_exit(&spa->spa_traverse_lock); - spa_scrub_resume(spa); /* resume scrub with new ubsync */ - /* * Clean up the ZIL records for the synced txg. */ @@ -3081,7 +4088,7 @@ spa_sync(spa_t *spa, uint64_t txg) ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); ASSERT(bpl->bpl_queue == NULL); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_CONFIG, FTAG); /* * If any async tasks have been requested, kick them off. @@ -3100,7 +4107,7 @@ spa_sync_allpools(void) spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { - if (spa_state(spa) != POOL_STATE_ACTIVE) + if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); @@ -3139,7 +4146,6 @@ spa_evict_all(void) spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); - VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); @@ -3153,27 +4159,42 @@ spa_evict_all(void) } vdev_t * -spa_lookup_by_guid(spa_t *spa, uint64_t guid) +spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache) { - return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); + vdev_t *vd; + int i; + + if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) + return (vd); + + if (l2cache) { + for (i = 0; i < spa->spa_l2cache.sav_count; i++) { + vd = spa->spa_l2cache.sav_vdevs[i]; + if (vd->vdev_guid == guid) + return (vd); + } + } + + return (NULL); } void -spa_upgrade(spa_t *spa) +spa_upgrade(spa_t *spa, uint64_t version) { - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * This should only be called for a non-faulted pool, and since a * future version would result in an unopenable pool, this shouldn't be * possible. 
*/ - ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); + ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); + ASSERT(version >= spa->spa_uberblock.ub_version); - spa->spa_uberblock.ub_version = ZFS_VERSION; + spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); } @@ -3183,119 +4204,98 @@ spa_has_spare(spa_t *spa, uint64_t guid) { int i; uint64_t spareguid; + spa_aux_vdev_t *sav = &spa->spa_spares; - for (i = 0; i < spa->spa_nspares; i++) - if (spa->spa_spares[i]->vdev_guid == guid) + for (i = 0; i < sav->sav_count; i++) + if (sav->sav_vdevs[i]->vdev_guid == guid) return (B_TRUE); - for (i = 0; i < spa->spa_pending_nspares; i++) { - if (nvlist_lookup_uint64(spa->spa_pending_spares[i], - ZPOOL_CONFIG_GUID, &spareguid) == 0 && - spareguid == guid) + for (i = 0; i < sav->sav_npending; i++) { + if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, + &spareguid) == 0 && spareguid == guid) return (B_TRUE); } return (B_FALSE); } -int -spa_set_props(spa_t *spa, nvlist_t *nvp) -{ - return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, - spa, nvp, 3)); -} - -int -spa_get_props(spa_t *spa, nvlist_t **nvp) +/* + * Check if a pool has an active shared spare device. + * Note: the reference count of an active spare is 2: once as a spare and + * once as a replacement. + */ +static boolean_t +spa_has_active_shared_spare(spa_t *spa) { - zap_cursor_t zc; - zap_attribute_t za; - objset_t *mos = spa->spa_meta_objset; - zfs_source_t src; - zfs_prop_t prop; - nvlist_t *propval; - uint64_t value; - int err; - - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - mutex_enter(&spa->spa_props_lock); - /* If no props object, then just return empty nvlist */ - if (spa->spa_pool_props_object == 0) { - mutex_exit(&spa->spa_props_lock); - return (0); - } - - for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); - (err = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - - if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) - continue; - - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - switch (za.za_integer_length) { - case 8: - if (zfs_prop_default_numeric(prop) == - za.za_first_integer) - src = ZFS_SRC_DEFAULT; - else - src = ZFS_SRC_LOCAL; - value = za.za_first_integer; - - if (prop == ZFS_PROP_BOOTFS) { - dsl_pool_t *dp; - dsl_dataset_t *ds = NULL; - char strval[MAXPATHLEN]; - - dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); - if ((err = dsl_dataset_open_obj(dp, - za.za_first_integer, NULL, DS_MODE_NONE, - FTAG, &ds)) != 0) { - rw_exit(&dp->dp_config_rwlock); - break; - } - dsl_dataset_name(ds, strval); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); - rw_exit(&dp->dp_config_rwlock); + int i, refcnt; + uint64_t pool; + spa_aux_vdev_t *sav = &spa->spa_spares; - VERIFY(nvlist_add_uint64(propval, - ZFS_PROP_SOURCE, src) == 0); - VERIFY(nvlist_add_string(propval, - ZFS_PROP_VALUE, strval) == 0); - } else { - VERIFY(nvlist_add_uint64(propval, - ZFS_PROP_SOURCE, src) == 0); - VERIFY(nvlist_add_uint64(propval, - ZFS_PROP_VALUE, value) == 0); - } - VERIFY(nvlist_add_nvlist(*nvp, za.za_name, - propval) == 0); - break; - } - nvlist_free(propval); - } - zap_cursor_fini(&zc); - mutex_exit(&spa->spa_props_lock); - if (err && err != ENOENT) { - nvlist_free(*nvp); - return (err); + for (i = 0; i < sav->sav_count; i++) { + if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, + &refcnt) && pool != 0ULL && pool == 
spa_guid(spa) && + refcnt > 2) + return (B_TRUE); } - return (0); + return (B_FALSE); } /* - * If the bootfs property value is dsobj, clear it. + * Post a sysevent corresponding to the given event. The 'name' must be one of + * the event definitions in sys/sysevent/eventdefs.h. The payload will be + * filled in from the spa and (optionally) the vdev. This doesn't do anything + * in the userland libzpool, as we don't want consumers to misinterpret ztest + * or zdb as real changes. */ void -spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) +spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) { - if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { - VERIFY(zap_remove(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); - spa->spa_bootfs = 0; +#if 0 +#ifdef _KERNEL + sysevent_t *ev; + sysevent_attr_list_t *attr = NULL; + sysevent_value_t value; + sysevent_id_t eid; + + ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", + SE_SLEEP); + + value.value_type = SE_DATA_TYPE_STRING; + value.value.sv_string = spa_name(spa); + if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) + goto done; + + value.value_type = SE_DATA_TYPE_UINT64; + value.value.sv_uint64 = spa_guid(spa); + if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) + goto done; + + if (vd) { + value.value_type = SE_DATA_TYPE_UINT64; + value.value.sv_uint64 = vd->vdev_guid; + if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, + SE_SLEEP) != 0) + goto done; + + if (vd->vdev_path) { + value.value_type = SE_DATA_TYPE_STRING; + value.value.sv_string = vd->vdev_path; + if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, + &value, SE_SLEEP) != 0) + goto done; + } } + + if (sysevent_attach_attributes(ev, attr) != 0) + goto done; + attr = NULL; + + (void) log_sysevent(ev, SE_SLEEP, &eid); + +done: + if (attr) + sysevent_free_attr(attr); + sysevent_free(ev); +#endif +#endif } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c index 9e8bcf391158..1ffdb10dbfa5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/spa_impl.h> @@ -43,16 +41,18 @@ /* * Pool configuration repository. * - * The configuration for all pools, in addition to being stored on disk, is - * stored in /etc/zfs/zpool.cache as a packed nvlist. The kernel maintains - * this list as pools are created, destroyed, or modified. + * Pool configuration is stored as a packed nvlist on the filesystem. By + * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot + * (when the ZFS module is loaded). Pools can also have the 'cachefile' + * property set that allows them to be stored in an alternate location under + * the control of external software. * - * We have a single nvlist which holds all the configuration information. When - * the module loads, we read this information from the cache and populate the - * SPA namespace. This namespace is maintained independently in spa.c. 
- * Whenever the namespace is modified, or the configuration of a pool is - * changed, we call spa_config_sync(), which walks through all the active pools - * and writes the configuration to disk. + * For each cache file, we have a single nvlist which holds all the + * configuration information. When the module loads, we read this information + * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is + * maintained independently in spa.c. Whenever the namespace is modified, or + * the configuration of a pool is changed, we call spa_config_sync(), which + * walks through all the active pools and writes the configuration to disk. */ static uint64_t spa_config_generation = 1; @@ -61,7 +61,7 @@ static uint64_t spa_config_generation = 1; * This can be overridden in userland to preserve an alternate namespace for * userland pools when doing testing. */ -const char *spa_config_dir = ZPOOL_CACHE_DIR; +const char *spa_config_path = ZPOOL_CACHE; /* * Called when the module is first loaded, this routine loads the configuration @@ -75,17 +75,21 @@ spa_config_load(void) nvlist_t *nvlist, *child; nvpair_t *nvpair; spa_t *spa; - char pathname[128]; + char *pathname; struct _buf *file; uint64_t fsize; /* * Open the configuration file. */ - (void) snprintf(pathname, sizeof (pathname), "%s/%s", - spa_config_dir, ZPOOL_CACHE_FILE); + pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); file = kobj_open_file(pathname); + if (file == (struct _buf *)-1) { ZFS_LOG(1, "Cannot open %s.", pathname); + kmem_free(pathname, MAXPATHLEN); return; } + + kmem_free(pathname, MAXPATHLEN); @@ -148,47 +152,32 @@ out: kobj_close_file(file); } -/* - * Synchronize all pools to disk. This must be called with the namespace lock - * held. - */ -void -spa_config_sync(void) +static void +spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) { - spa_t *spa = NULL; - nvlist_t *config; size_t buflen; char *buf; vnode_t *vp; int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; - char pathname[128]; - char pathname2[128]; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0); + char *temp; /* - * Add all known pools to the configuration list, ignoring those with - * alternate root paths. + * If the nvlist is empty (NULL), then remove the old cachefile. */ - spa = NULL; - while ((spa = spa_next(spa)) != NULL) { - mutex_enter(&spa->spa_config_cache_lock); - if (spa->spa_config && spa->spa_name && spa->spa_root == NULL) - VERIFY(nvlist_add_nvlist(config, spa->spa_name, - spa->spa_config) == 0); - mutex_exit(&spa->spa_config_cache_lock); + if (nvl == NULL) { + (void) vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE); + return; } /* * Pack the configuration into a buffer. */ - VERIFY(nvlist_size(config, &buflen, NV_ENCODE_XDR) == 0); + VERIFY(nvlist_size(nvl, &buflen, NV_ENCODE_XDR) == 0); buf = kmem_alloc(buflen, KM_SLEEP); + temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR, + VERIFY(nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0); /* @@ -196,29 +185,92 @@ spa_config_sync(void) * 'write to temporary file, sync, move over original' to make sure we * always have a consistent view of the data. 
*/ - (void) snprintf(pathname, sizeof (pathname), "%s/%s", spa_config_dir, - ZPOOL_CACHE_TMP); + (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path); - if (vn_open(pathname, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) != 0) - goto out; + if (vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) == 0) { + if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, + 0, RLIM64_INFINITY, kcred, NULL) == 0 && + VOP_FSYNC(vp, FSYNC, kcred, NULL) == 0) { + (void) vn_rename(temp, dp->scd_path, UIO_SYSSPACE); + } + (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL); + VN_RELE(vp); + } + + (void) vn_remove(temp, UIO_SYSSPACE, RMFILE); + + kmem_free(buf, buflen); + kmem_free(temp, MAXPATHLEN); +} + +/* + * Synchronize pool configuration to disk. This must be called with the + * namespace lock held. + */ +void +spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) +{ + spa_config_dirent_t *dp, *tdp; + nvlist_t *nvl; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Iterate over all cachefiles for the pool, past or present. When the + * cachefile is changed, the new one is pushed onto this list, allowing + * us to update previous cachefiles that no longer contain this pool. + */ + for (dp = list_head(&target->spa_config_list); dp != NULL; + dp = list_next(&target->spa_config_list, dp)) { + spa_t *spa = NULL; + if (dp->scd_path == NULL) + continue; - if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, NULL) == 0 && - VOP_FSYNC(vp, FSYNC, kcred) == 0) { - (void) snprintf(pathname2, sizeof (pathname2), "%s/%s", - spa_config_dir, ZPOOL_CACHE_FILE); - (void) vn_rename(pathname, pathname2, UIO_SYSSPACE); + /* + * Iterate over all pools, adding any matching pools to 'nvl'. + */ + nvl = NULL; + while ((spa = spa_next(spa)) != NULL) { + if (spa == target && removing) + continue; + + mutex_enter(&spa->spa_props_lock); + tdp = list_head(&spa->spa_config_list); + if (spa->spa_config == NULL || + tdp->scd_path == NULL || + strcmp(tdp->scd_path, dp->scd_path) != 0) { + mutex_exit(&spa->spa_props_lock); + continue; + } + + if (nvl == NULL) + VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + + VERIFY(nvlist_add_nvlist(nvl, spa->spa_name, + spa->spa_config) == 0); + mutex_exit(&spa->spa_props_lock); + } + + spa_config_write(dp, nvl); + nvlist_free(nvl); } - (void) VOP_CLOSE(vp, oflags, 1, 0, kcred); - VN_RELE(vp); + /* + * Remove any config entries older than the current one. 
+ */ + dp = list_head(&target->spa_config_list); + while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { + list_remove(&target->spa_config_list, tdp); + if (tdp->scd_path != NULL) + spa_strfree(tdp->scd_path); + kmem_free(tdp, sizeof (spa_config_dirent_t)); + } -out: - (void) vn_remove(pathname, UIO_SYSSPACE, RMFILE); spa_config_generation++; - kmem_free(buf, buflen); - nvlist_free(config); + if (postsysevent) + spa_event_notify(target, NULL, ESC_ZFS_CONFIG_SYNC); } /* @@ -231,27 +283,25 @@ nvlist_t * spa_all_configs(uint64_t *generation) { nvlist_t *pools; - spa_t *spa; + spa_t *spa = NULL; if (*generation == spa_config_generation) return (NULL); VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0); - spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { - if (INGLOBALZONE(curproc) || + if (INGLOBALZONE(curthread) || zone_dataset_visible(spa_name(spa), NULL)) { - mutex_enter(&spa->spa_config_cache_lock); + mutex_enter(&spa->spa_props_lock); VERIFY(nvlist_add_nvlist(pools, spa_name(spa), spa->spa_config) == 0); - mutex_exit(&spa->spa_config_cache_lock); + mutex_exit(&spa->spa_props_lock); } } - mutex_exit(&spa_namespace_lock); - *generation = spa_config_generation; + mutex_exit(&spa_namespace_lock); return (pools); } @@ -259,11 +309,11 @@ spa_all_configs(uint64_t *generation) void spa_config_set(spa_t *spa, nvlist_t *config) { - mutex_enter(&spa->spa_config_cache_lock); + mutex_enter(&spa->spa_props_lock); if (spa->spa_config != NULL) nvlist_free(spa->spa_config); spa->spa_config = config; - mutex_exit(&spa->spa_config_cache_lock); + mutex_exit(&spa->spa_props_lock); } /* @@ -277,11 +327,16 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) nvlist_t *config, *nvroot; vdev_t *rvd = spa->spa_root_vdev; unsigned long hostid = 0; + boolean_t locked = B_FALSE; - ASSERT(spa_config_held(spa, RW_READER)); - - if (vd == NULL) + if (vd == NULL) { vd = rvd; + locked = B_TRUE; + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + } + + ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == + (SCL_CONFIG | SCL_STATE)); /* * If txg is -1, report the current value of spa->spa_config_txg. @@ -302,8 +357,10 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)) == 0); (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, - hostid) == 0); + if (hostid != 0) { + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, + hostid) == 0); + } VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname.nodename) == 0); @@ -315,30 +372,48 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) if (vd->vdev_isspare) VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, 1ULL) == 0); + if (vd->vdev_islog) + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, + 1ULL) == 0); vd = vd->vdev_top; /* label contains top config */ } - nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE); + nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); + if (locked) + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + return (config); } /* - * Update all disk labels, generate a fresh config based on the current - * in-core state, and sync the global config cache. 
+ * For a pool that's not currently a booting rootpool, update all disk labels, + * generate a fresh config based on the current in-core state, and sync the + * global config cache. */ void spa_config_update(spa_t *spa, int what) { + spa_config_update_common(spa, what, FALSE); +} + +/* + * Update all disk labels, generate a fresh config based on the current + * in-core state, and sync the global config cache (do not sync the config + * cache if this is a booting rootpool). + */ +void +spa_config_update_common(spa_t *spa, int what, boolean_t isroot) +{ vdev_t *rvd = spa->spa_root_vdev; uint64_t txg; int c; ASSERT(MUTEX_HELD(&spa_namespace_lock)); - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); txg = spa_last_synced_txg(spa) + 1; if (what == SPA_CONFIG_UPDATE_POOL) { vdev_config_dirty(rvd); @@ -358,7 +433,7 @@ spa_config_update(spa_t *spa, int what) } } } - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); /* * Wait for the mosconfig to be regenerated and synced. @@ -368,8 +443,9 @@ spa_config_update(spa_t *spa, int what) /* * Update the global config cache to reflect the new mosconfig. */ - spa_config_sync(); + if (!isroot) + spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); if (what == SPA_CONFIG_UPDATE_POOL) - spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); + spa_config_update_common(spa, SPA_CONFIG_UPDATE_VDEVS, isroot); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c index c52acaf30801..e5c395f63d2b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -298,10 +298,7 @@ void spa_errlog_rotate(spa_t *spa) { mutex_enter(&spa->spa_errlist_lock); - - ASSERT(!spa->spa_scrub_finished); spa->spa_scrub_finished = B_TRUE; - mutex_exit(&spa->spa_errlist_lock); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c index 66428013a784..8e20c4d32cd7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c @@ -20,15 +20,24 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" +#include <sys/spa.h> #include <sys/spa_impl.h> #include <sys/zap.h> #include <sys/dsl_synctask.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/utsname.h> +#include <sys/sunddi.h> +#ifdef _KERNEL +#include <sys/cmn_err.h> +#include <sys/zone.h> +#endif /* * Routines to manage the on-disk history log. @@ -59,16 +68,6 @@ * and permanently lost. 
*/ -typedef enum history_log_type { - LOG_CMD_CREATE, - LOG_CMD_NO_CREATE -} history_log_type_t; - -typedef struct history_arg { - const char *ha_history_str; - history_log_type_t ha_log_type; -} history_arg_t; - /* convert a logical offset to physical */ static uint64_t spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp) @@ -156,8 +155,9 @@ spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, /* see if we need to reset logical BOF */ while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - (shpp->sh_eof - shpp->sh_bof) <= len) { - if ((err = spa_history_advance_bof(spa, shpp)) != 0) + if ((err = spa_history_advance_bof(spa, shpp)) != 0) { return (err); + } } phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); @@ -175,11 +175,22 @@ spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, return (0); } +static char * +spa_history_zone() +{ +#ifdef _KERNEL + /* XXX: pr_host can be changed by default from within a jail! */ + if (jailed(curthread->td_ucred)) + return (curthread->td_ucred->cr_prison->pr_host); +#endif + return ("global"); +} + /* * Write out a history event. */ -void -spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) +static void +spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { spa_t *spa = arg1; history_arg_t *hap = arg2; @@ -193,9 +204,6 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) char *record_packed = NULL; int ret; - if (history_str == NULL) - return; - /* * If we have an older pool that doesn't have a command * history object, create it now. @@ -222,16 +230,39 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) } #endif - /* construct a nvlist of the current time and cmd string */ VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME, gethrestime_sec()) == 0); - VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, history_str) == 0); + VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, + (uint64_t)crgetuid(cr)) == 0); + if (hap->ha_zone[0] != '\0') + VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE, + hap->ha_zone) == 0); +#ifdef _KERNEL + VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_HOST, + utsname.nodename) == 0); +#endif + if (hap->ha_log_type == LOG_CMD_POOL_CREATE || + hap->ha_log_type == LOG_CMD_NORMAL) { + VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, + history_str) == 0); + } else { + VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT, + hap->ha_event) == 0); + VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TXG, + tx->tx_txg) == 0); + VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR, + history_str) == 0); + } + + VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0); + record_packed = kmem_alloc(reclen, KM_SLEEP); + VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen, NV_ENCODE_XDR, KM_SLEEP) == 0); mutex_enter(&spa->spa_history_lock); - if (hap->ha_log_type == LOG_CMD_CREATE) + if (hap->ha_log_type == LOG_CMD_POOL_CREATE) VERIFY(shpp->sh_eof == shpp->sh_pool_create_len); /* write out the packed length as little endian */ @@ -240,7 +271,7 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) if (!ret) ret = spa_history_write(spa, record_packed, reclen, shpp, tx); - if (!ret && hap->ha_log_type == LOG_CMD_CREATE) { + if (!ret && hap->ha_log_type == LOG_CMD_POOL_CREATE) { shpp->sh_pool_create_len += sizeof (le_len) + reclen; shpp->sh_bof = shpp->sh_pool_create_len; } @@ -249,18 +280,26 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) 
nvlist_free(nvrecord); kmem_free(record_packed, reclen); dmu_buf_rele(dbp, FTAG); + + if (hap->ha_log_type == LOG_INTERNAL) { + kmem_free((void*)hap->ha_history_str, HIS_MAX_RECORD_LEN); + kmem_free(hap, sizeof (history_arg_t)); + } } /* * Write out a history event. */ int -spa_history_log(spa_t *spa, const char *history_str, uint64_t pool_create) +spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what) { history_arg_t ha; + ASSERT(what != LOG_INTERNAL); + ha.ha_history_str = history_str; - ha.ha_log_type = pool_create ? LOG_CMD_CREATE : LOG_CMD_NO_CREATE; + ha.ha_log_type = what; + (void) strlcpy(ha.ha_zone, spa_history_zone(), sizeof (ha.ha_zone)); return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync, spa, &ha, 0)); } @@ -352,3 +391,39 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) dmu_buf_rele(dbp, FTAG); return (err); } + +void +spa_history_internal_log(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) +{ + history_arg_t *hap; + char *str; + va_list adx; + + /* + * If this is part of creating a pool, not everything is + * initialized yet, so don't bother logging the internal events. + */ + if (tx->tx_txg == TXG_INITIAL) + return; + + hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); + str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); + + va_start(adx, fmt); + (void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx); + va_end(adx); + + hap->ha_log_type = LOG_INTERNAL; + hap->ha_history_str = str; + hap->ha_event = event; + hap->ha_zone[0] = '\0'; + + if (dmu_tx_is_syncing(tx)) { + spa_history_log_sync(spa, hap, cr, tx); + } else { + dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, + spa_history_log_sync, spa, hap, 0, tx); + } + /* spa_history_log_sync() will free hap and str */ +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index 1e1f0ee93068..7a41d4ff5396 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa_impl.h> #include <sys/zio.h> @@ -44,6 +42,10 @@ #include <sys/dsl_dir.h> #include <sys/dsl_prop.h> #include <sys/fs/zfs.h> +#include <sys/metaslab_impl.h> +#include <sys/sunddi.h> +#include <sys/arc.h> +#include "zfs_prop.h" /* * SPA locking * @@ -72,25 +74,17 @@ * This reference count keeps track of any active users of the spa_t. The * spa_t cannot be destroyed or freed while this is non-zero. Internally, * the refcount is never really 'zero' - opening a pool implicitly keeps - * some references in the DMU. Internally we check against SPA_MINREF, but + * some references in the DMU. Internally we check against spa_minref, but * present the image of a zero/non-zero value to consumers. * - * spa_config_lock (per-spa crazy rwlock) + * spa_config_lock[] (per-spa array of rwlocks) * - * This SPA special is a recursive rwlock, capable of being acquired from - * asynchronous threads. 
It has protects the spa_t from config changes, - * and must be held in the following circumstances: + * This protects the spa_t from config changes, and must be held in + * the following circumstances: * * - RW_READER to perform I/O to the spa * - RW_WRITER to change the vdev config * - * spa_config_cache_lock (per-spa mutex) - * - * This mutex prevents the spa_config nvlist from being updated. No - * other locks are required to obtain this lock, although implicitly you - * must have the namespace lock or non-zero refcount to have any kind - * of spa_t pointer at all. - * * The locking order is fairly straightforward: * * spa_namespace_lock -> spa_refcount @@ -98,21 +92,20 @@ * The namespace lock must be acquired to increase the refcount from 0 * or to check if it is zero. * - * spa_refcount -> spa_config_lock + * spa_refcount -> spa_config_lock[] * * There must be at least one valid reference on the spa_t to acquire * the config lock. * - * spa_namespace_lock -> spa_config_lock + * spa_namespace_lock -> spa_config_lock[] * * The namespace lock must always be taken before the config lock. * * - * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and - * are globally visible. + * The spa_namespace_lock can be acquired directly and is globally visible. * - * The namespace is manipulated using the following functions, all which require - * the spa_namespace_lock to be held. + * The namespace is manipulated using the following functions, all of which + * require the spa_namespace_lock to be held. * * spa_lookup() Lookup a spa_t by name. * @@ -143,16 +136,70 @@ * zero. Must be called with spa_namespace_lock * held. * - * The spa_config_lock is manipulated using the following functions: + * The spa_config_lock[] is an array of rwlocks, ordered as follows: + * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. + * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). + * + * To read the configuration, it suffices to hold one of these locks as reader. + * To modify the configuration, you must hold all locks as writer. To modify + * vdev state without altering the vdev tree's topology (e.g. online/offline), + * you must hold SCL_STATE and SCL_ZIO as writer. + * + * We use these distinct config locks to avoid recursive lock entry. + * For example, spa_sync() (which holds SCL_CONFIG as reader) induces + * block allocations (SCL_ALLOC), which may require reading space maps + * from disk (dmu_read() -> zio_read() -> SCL_ZIO). + * + * The spa config locks cannot be normal rwlocks because we need the + * ability to hand off ownership. For example, SCL_ZIO is acquired + * by the issuing thread and later released by an interrupt thread. + * They do, however, obey the usual write-wanted semantics to prevent + * writer (i.e. system administrator) starvation. + * + * The lock acquisition rules are as follows: + * + * SCL_CONFIG + * Protects changes to the vdev tree topology, such as vdev + * add/remove/attach/detach. Protects the dirty config list + * (spa_config_dirty_list) and the set of spares and l2arc devices. + * + * SCL_STATE + * Protects changes to pool state and vdev state, such as vdev + * online/offline/fault/degrade/clear. Protects the dirty state list + * (spa_state_dirty_list) and global pool state (spa_state). * - * spa_config_enter() Acquire the config lock as RW_READER or - * RW_WRITER. At least one reference on the spa_t - * must exist. + * SCL_ALLOC + * Protects changes to metaslab groups and classes. 
+ * Held as reader by metaslab_alloc() and metaslab_claim(). * - * spa_config_exit() Release the config lock. + * SCL_ZIO + * Held by bp-level zios (those which have no io_vd upon entry) + * to prevent changes to the vdev tree. The bp-level zio implicitly + * protects all of its vdev child zios, which do not hold SCL_ZIO. * - * spa_config_held() Returns true if the config lock is currently - * held in the given state. + * SCL_FREE + * Protects changes to metaslab groups and classes. + * Held as reader by metaslab_free(). SCL_FREE is distinct from + * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free + * blocks in zio_done() while another i/o that holds either + * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. + * + * SCL_VDEV + * Held as reader to prevent changes to the vdev tree during trivial + * inquiries such as bp_get_dasize(). SCL_VDEV is distinct from the + * other locks, and lower than all of them, to ensure that it's safe + * to acquire regardless of caller context. + * + * In addition, the following rules apply: + * + * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. + * The lock ordering is SCL_CONFIG > spa_props_lock. + * + * (b) I/O operations on leaf vdevs. For any zio operation that takes + * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), + * or zio_write_phys() -- the caller must ensure that the config cannot + * change in the interim, and that the vdev cannot be reopened. + * SCL_STATE as reader suffices for both. * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * @@ -163,10 +210,12 @@ * to complete, sync the updated configs to the * cache, and release the namespace lock. * - * The spa_name() function also requires either the spa_namespace_lock - * or the spa_config_lock, as both are needed to do a rename. spa_rename() is - * also implemented within this file since is requires manipulation of the - * namespace. + * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). + * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual + * locking is, always, based on spa_namespace_lock and spa_config_lock[]. + * + * spa_rename() is also implemented within this file since it requires + * manipulation of the namespace. 
*/ static avl_tree_t spa_namespace_avl; @@ -177,12 +226,15 @@ int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; +static kmutex_t spa_l2cache_lock; +static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; int spa_mode; #ifdef ZFS_DEBUG -int zfs_flags = ~0; +/* Everything except dprintf is on by default in debug builds */ +int zfs_flags = ~ZFS_DEBUG_DPRINTF; #else int zfs_flags = 0; #endif @@ -198,7 +250,128 @@ TUNABLE_INT("vfs.zfs.recover", &zfs_recover); SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0, "Try to recover from otherwise-fatal errors."); -#define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */ + +/* + * ========================================================================== + * SPA config locking + * ========================================================================== + */ +static void +spa_config_lock_init(spa_t *spa) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&scl->scl_count); + scl->scl_writer = NULL; + scl->scl_write_wanted = 0; + } +} + +static void +spa_config_lock_destroy(spa_t *spa) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + mutex_destroy(&scl->scl_lock); + cv_destroy(&scl->scl_cv); + refcount_destroy(&scl->scl_count); + ASSERT(scl->scl_writer == NULL); + ASSERT(scl->scl_write_wanted == 0); + } +} + +int +spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + if (rw == RW_READER) { + if (scl->scl_writer || scl->scl_write_wanted) { + mutex_exit(&scl->scl_lock); + spa_config_exit(spa, locks ^ (1 << i), tag); + return (0); + } + } else { + ASSERT(scl->scl_writer != curthread); + if (!refcount_is_zero(&scl->scl_count)) { + mutex_exit(&scl->scl_lock); + spa_config_exit(spa, locks ^ (1 << i), tag); + return (0); + } + scl->scl_writer = curthread; + } + (void) refcount_add(&scl->scl_count, tag); + mutex_exit(&scl->scl_lock); + } + return (1); +} + +void +spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + if (rw == RW_READER) { + while (scl->scl_writer || scl->scl_write_wanted) { + cv_wait(&scl->scl_cv, &scl->scl_lock); + } + } else { + ASSERT(scl->scl_writer != curthread); + while (!refcount_is_zero(&scl->scl_count)) { + scl->scl_write_wanted++; + cv_wait(&scl->scl_cv, &scl->scl_lock); + scl->scl_write_wanted--; + } + scl->scl_writer = curthread; + } + (void) refcount_add(&scl->scl_count, tag); + mutex_exit(&scl->scl_lock); + } +} + +void +spa_config_exit(spa_t *spa, int locks, void *tag) +{ + for (int i = SCL_LOCKS - 1; i >= 0; i--) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + ASSERT(!refcount_is_zero(&scl->scl_count)); + if (refcount_remove(&scl->scl_count, tag) == 0) { + ASSERT(scl->scl_writer == NULL || + scl->scl_writer == curthread); + scl->scl_writer = NULL; /* OK in either case */ + cv_broadcast(&scl->scl_cv); + } + mutex_exit(&scl->scl_lock); + } +} + +int +spa_config_held(spa_t *spa, int locks, 
krw_t rw) +{ + int locks_held = 0; + + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) || + (rw == RW_WRITER && scl->scl_writer == curthread)) + locks_held |= 1 << i; + } + + return (locks_held); +} /* * ========================================================================== @@ -213,14 +386,30 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0, spa_t * spa_lookup(const char *name) { - spa_t search, *spa; + static spa_t search; /* spa_t is large; don't allocate on stack */ + spa_t *spa; avl_index_t where; + char c; + char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); - search.spa_name = (char *)name; + /* + * If it's a full dataset name, figure out the pool name and + * just use that. + */ + cp = strpbrk(name, "/@"); + if (cp) { + c = *cp; + *cp = '\0'; + } + + (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); spa = avl_find(&spa_namespace_avl, &search, &where); + if (cp) + *cp = c; + return (spa); } @@ -233,29 +422,40 @@ spa_t * spa_add(const char *name, const char *altroot) { spa_t *spa; + spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); - spa->spa_name = spa_strdup(name); - spa->spa_state = POOL_STATE_UNINITIALIZED; - spa->spa_freeze_txg = UINT64_MAX; - spa->spa_final_txg = UINT64_MAX; + rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); - mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_async_root_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&spa->spa_scrub_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_async_root_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); + + (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); + spa->spa_state = POOL_STATE_UNINITIALIZED; + spa->spa_freeze_txg = UINT64_MAX; + spa->spa_final_txg = UINT64_MAX; refcount_create(&spa->spa_refcount); - refcount_create(&spa->spa_config_lock.scl_count); + spa_config_lock_init(spa); avl_add(&spa_namespace_avl, spa); + mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); + /* * Set the alternate root, if there is one. 
*/ @@ -264,6 +464,16 @@ spa_add(const char *name, const char *altroot) spa_active_count++; } + /* + * Every pool starts with the default cachefile + */ + list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), + offsetof(spa_config_dirent_t, scd_link)); + + dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); + dp->scd_path = spa_strdup(spa_config_path); + list_insert_head(&spa->spa_config_list, dp); + return (spa); } @@ -275,9 +485,10 @@ spa_add(const char *name, const char *altroot) void spa_remove(spa_t *spa) { + spa_config_dirent_t *dp; + ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); - ASSERT(spa->spa_scrub_thread == NULL); avl_remove(&spa_namespace_avl, spa); cv_broadcast(&spa_namespace_cv); @@ -287,21 +498,37 @@ spa_remove(spa_t *spa) spa_active_count--; } - if (spa->spa_name) - spa_strfree(spa->spa_name); + while ((dp = list_head(&spa->spa_config_list)) != NULL) { + list_remove(&spa->spa_config_list, dp); + if (dp->scd_path != NULL) + spa_strfree(dp->scd_path); + kmem_free(dp, sizeof (spa_config_dirent_t)); + } + + list_destroy(&spa->spa_config_list); spa_config_set(spa, NULL); refcount_destroy(&spa->spa_refcount); - refcount_destroy(&spa->spa_config_lock.scl_count); + + spa_config_lock_destroy(spa); + + rw_destroy(&spa->spa_traverse_lock); cv_destroy(&spa->spa_async_cv); + cv_destroy(&spa->spa_async_root_cv); cv_destroy(&spa->spa_scrub_io_cv); - cv_destroy(&spa->spa_scrub_cv); + cv_destroy(&spa->spa_suspend_cv); - mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_async_lock); - mutex_destroy(&spa->spa_config_cache_lock); + mutex_destroy(&spa->spa_async_root_lock); + mutex_destroy(&spa->spa_scrub_lock); + mutex_destroy(&spa->spa_errlog_lock); + mutex_destroy(&spa->spa_errlist_lock); + mutex_destroy(&spa->spa_sync_bplist.bpl_lock); + mutex_destroy(&spa->spa_history_lock); + mutex_destroy(&spa->spa_props_lock); + mutex_destroy(&spa->spa_suspend_lock); kmem_free(spa, sizeof (spa_t)); } @@ -334,9 +561,8 @@ spa_next(spa_t *prev) void spa_open_ref(spa_t *spa, void *tag) { - ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF || + ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); - (void) refcount_add(&spa->spa_refcount, tag); } @@ -347,15 +573,14 @@ spa_open_ref(spa_t *spa, void *tag) void spa_close(spa_t *spa, void *tag) { - ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF || + ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); - (void) refcount_remove(&spa->spa_refcount, tag); } /* * Check to see if the spa refcount is zero. Must be called with - * spa_namespace_lock held. We really compare against SPA_MINREF, which is the + * spa_namespace_lock held. We really compare against spa_minref, which is the * number of references acquired when opening a pool */ boolean_t @@ -363,16 +588,119 @@ spa_refcount_zero(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); - return (refcount_count(&spa->spa_refcount) == SPA_MINREF); + return (refcount_count(&spa->spa_refcount) == spa->spa_minref); } /* * ========================================================================== - * SPA spare tracking + * SPA spare and l2cache tracking * ========================================================================== */ /* + * Hot spares and cache devices are tracked using the same code below, + * for 'auxiliary' devices. 
+ */ + +typedef struct spa_aux { + uint64_t aux_guid; + uint64_t aux_pool; + avl_node_t aux_avl; + int aux_count; +} spa_aux_t; + +static int +spa_aux_compare(const void *a, const void *b) +{ + const spa_aux_t *sa = a; + const spa_aux_t *sb = b; + + if (sa->aux_guid < sb->aux_guid) + return (-1); + else if (sa->aux_guid > sb->aux_guid) + return (1); + else + return (0); +} + +void +spa_aux_add(vdev_t *vd, avl_tree_t *avl) +{ + avl_index_t where; + spa_aux_t search; + spa_aux_t *aux; + + search.aux_guid = vd->vdev_guid; + if ((aux = avl_find(avl, &search, &where)) != NULL) { + aux->aux_count++; + } else { + aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); + aux->aux_guid = vd->vdev_guid; + aux->aux_count = 1; + avl_insert(avl, aux, where); + } +} + +void +spa_aux_remove(vdev_t *vd, avl_tree_t *avl) +{ + spa_aux_t search; + spa_aux_t *aux; + avl_index_t where; + + search.aux_guid = vd->vdev_guid; + aux = avl_find(avl, &search, &where); + + ASSERT(aux != NULL); + + if (--aux->aux_count == 0) { + avl_remove(avl, aux); + kmem_free(aux, sizeof (spa_aux_t)); + } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { + aux->aux_pool = 0ULL; + } +} + +boolean_t +spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) +{ + spa_aux_t search, *found; + + search.aux_guid = guid; + found = avl_find(avl, &search, NULL); + + if (pool) { + if (found) + *pool = found->aux_pool; + else + *pool = 0ULL; + } + + if (refcnt) { + if (found) + *refcnt = found->aux_count; + else + *refcnt = 0; + } + + return (found != NULL); +} + +void +spa_aux_activate(vdev_t *vd, avl_tree_t *avl) +{ + spa_aux_t search, *found; + avl_index_t where; + + search.aux_guid = vd->vdev_guid; + found = avl_find(avl, &search, &where); + ASSERT(found != NULL); + ASSERT(found->aux_pool == 0ULL); + + found->aux_pool = spa_guid(vd->vdev_spa); +} + +/* * Spares are tracked globally due to the following constraints: * * - A spare may be part of multiple pools. @@ -394,196 +722,110 @@ spa_refcount_zero(spa_t *spa) * be completely consistent with respect to other vdev configuration changes. 
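 *
 * Editor's illustration (not part of this change): because entries are
 * keyed on guid and reference counted, a device acting as a spare for two
 * pools appears once in the AVL tree with aux_count == 2. A hypothetical
 * caller, with the appropriate aux lock already held:
 *
 *	int refcnt;
 *
 *	spa_aux_add(vd, &spa_spare_avl);	(first pool: count -> 1)
 *	spa_aux_add(vd, &spa_spare_avl);	(second pool: count -> 2)
 *	(void) spa_aux_exists(vd->vdev_guid, NULL, &refcnt, &spa_spare_avl);
 *	ASSERT(refcnt == 2);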
*/ -typedef struct spa_spare { - uint64_t spare_guid; - uint64_t spare_pool; - avl_node_t spare_avl; - int spare_count; -} spa_spare_t; - static int spa_spare_compare(const void *a, const void *b) { - const spa_spare_t *sa = a; - const spa_spare_t *sb = b; - - if (sa->spare_guid < sb->spare_guid) - return (-1); - else if (sa->spare_guid > sb->spare_guid) - return (1); - else - return (0); + return (spa_aux_compare(a, b)); } void spa_spare_add(vdev_t *vd) { - avl_index_t where; - spa_spare_t search; - spa_spare_t *spare; - mutex_enter(&spa_spare_lock); ASSERT(!vd->vdev_isspare); - - search.spare_guid = vd->vdev_guid; - if ((spare = avl_find(&spa_spare_avl, &search, &where)) != NULL) { - spare->spare_count++; - } else { - spare = kmem_zalloc(sizeof (spa_spare_t), KM_SLEEP); - spare->spare_guid = vd->vdev_guid; - spare->spare_count = 1; - avl_insert(&spa_spare_avl, spare, where); - } + spa_aux_add(vd, &spa_spare_avl); vd->vdev_isspare = B_TRUE; - mutex_exit(&spa_spare_lock); } void spa_spare_remove(vdev_t *vd) { - spa_spare_t search; - spa_spare_t *spare; - avl_index_t where; - mutex_enter(&spa_spare_lock); - - search.spare_guid = vd->vdev_guid; - spare = avl_find(&spa_spare_avl, &search, &where); - ASSERT(vd->vdev_isspare); - ASSERT(spare != NULL); - - if (--spare->spare_count == 0) { - avl_remove(&spa_spare_avl, spare); - kmem_free(spare, sizeof (spa_spare_t)); - } else if (spare->spare_pool == spa_guid(vd->vdev_spa)) { - spare->spare_pool = 0ULL; - } - + spa_aux_remove(vd, &spa_spare_avl); vd->vdev_isspare = B_FALSE; mutex_exit(&spa_spare_lock); } boolean_t -spa_spare_exists(uint64_t guid, uint64_t *pool) +spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) { - spa_spare_t search, *found; - avl_index_t where; + boolean_t found; mutex_enter(&spa_spare_lock); - - search.spare_guid = guid; - found = avl_find(&spa_spare_avl, &search, &where); - - if (pool) { - if (found) - *pool = found->spare_pool; - else - *pool = 0ULL; - } - + found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); mutex_exit(&spa_spare_lock); - return (found != NULL); + return (found); } void spa_spare_activate(vdev_t *vd) { - spa_spare_t search, *found; - avl_index_t where; - mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); - - search.spare_guid = vd->vdev_guid; - found = avl_find(&spa_spare_avl, &search, &where); - ASSERT(found != NULL); - ASSERT(found->spare_pool == 0ULL); - - found->spare_pool = spa_guid(vd->vdev_spa); + spa_aux_activate(vd, &spa_spare_avl); mutex_exit(&spa_spare_lock); } /* - * ========================================================================== - * SPA config locking - * ========================================================================== + * Level 2 ARC devices are tracked globally for the same reasons as spares. + * Cache devices currently only support one pool per cache device, and so + * for these devices the aux reference count is currently unused beyond 1. */ -/* - * Acquire the config lock. The config lock is a special rwlock that allows for - * recursive enters. Because these enters come from the same thread as well as - * asynchronous threads working on behalf of the owner, we must unilaterally - * allow all reads access as long at least one reader is held (even if a write - * is requested). This has the side effect of write starvation, but write locks - * are extremely rare, and a solution to this problem would be significantly - * more complex (if even possible). 
- * - * We would like to assert that the namespace lock isn't held, but this is a - * valid use during create. - */ -void -spa_config_enter(spa_t *spa, krw_t rw, void *tag) +static int +spa_l2cache_compare(const void *a, const void *b) { - spa_config_lock_t *scl = &spa->spa_config_lock; - - mutex_enter(&scl->scl_lock); - - if (scl->scl_writer != curthread) { - if (rw == RW_READER) { - while (scl->scl_writer != NULL) - cv_wait(&scl->scl_cv, &scl->scl_lock); - } else { - while (scl->scl_writer != NULL || - !refcount_is_zero(&scl->scl_count)) - cv_wait(&scl->scl_cv, &scl->scl_lock); - scl->scl_writer = curthread; - } - } - - (void) refcount_add(&scl->scl_count, tag); + return (spa_aux_compare(a, b)); +} - mutex_exit(&scl->scl_lock); +void +spa_l2cache_add(vdev_t *vd) +{ + mutex_enter(&spa_l2cache_lock); + ASSERT(!vd->vdev_isl2cache); + spa_aux_add(vd, &spa_l2cache_avl); + vd->vdev_isl2cache = B_TRUE; + mutex_exit(&spa_l2cache_lock); } -/* - * Release the spa config lock, notifying any waiters in the process. - */ void -spa_config_exit(spa_t *spa, void *tag) +spa_l2cache_remove(vdev_t *vd) { - spa_config_lock_t *scl = &spa->spa_config_lock; + mutex_enter(&spa_l2cache_lock); + ASSERT(vd->vdev_isl2cache); + spa_aux_remove(vd, &spa_l2cache_avl); + vd->vdev_isl2cache = B_FALSE; + mutex_exit(&spa_l2cache_lock); +} - mutex_enter(&scl->scl_lock); +boolean_t +spa_l2cache_exists(uint64_t guid, uint64_t *pool) +{ + boolean_t found; - ASSERT(!refcount_is_zero(&scl->scl_count)); - if (refcount_remove(&scl->scl_count, tag) == 0) { - cv_broadcast(&scl->scl_cv); - scl->scl_writer = NULL; /* OK in either case */ - } + mutex_enter(&spa_l2cache_lock); + found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); + mutex_exit(&spa_l2cache_lock); - mutex_exit(&scl->scl_lock); + return (found); } -/* - * Returns true if the config lock is held in the given manner. - */ -boolean_t -spa_config_held(spa_t *spa, krw_t rw) +void +spa_l2cache_activate(vdev_t *vd) { - spa_config_lock_t *scl = &spa->spa_config_lock; - boolean_t held; - - mutex_enter(&scl->scl_lock); - if (rw == RW_WRITER) - held = (scl->scl_writer == curthread); - else - held = !refcount_is_zero(&scl->scl_count); - mutex_exit(&scl->scl_lock); + mutex_enter(&spa_l2cache_lock); + ASSERT(vd->vdev_isl2cache); + spa_aux_activate(vd, &spa_l2cache_avl); + mutex_exit(&spa_l2cache_lock); +} - return (held); +void +spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc) +{ + vdev_space_update(vd, space, alloc, B_FALSE); } /* @@ -600,14 +842,9 @@ spa_config_held(spa_t *spa, krw_t rw) uint64_t spa_vdev_enter(spa_t *spa) { - /* - * Suspend scrub activity while we mess with the config. - */ - spa_scrub_suspend(spa); - mutex_enter(&spa_namespace_lock); - spa_config_enter(spa, RW_WRITER, spa); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); return (spa_last_synced_txg(spa) + 1); } @@ -625,6 +862,8 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) ASSERT(txg > spa_last_synced_txg(spa)); + spa->spa_pending_vdev = NULL; + /* * Reassess the DTLs. */ @@ -633,17 +872,12 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) /* * If the config changed, notify the scrub thread that it must restart. */ - if (error == 0 && !list_is_empty(&spa->spa_dirty_list)) { + if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { + dsl_pool_scrub_restart(spa->spa_dsl_pool); config_changed = B_TRUE; - spa_scrub_restart(spa, txg); } - spa_config_exit(spa, spa); - - /* - * Allow scrubbing to resume. 
- */
-	spa_scrub_resume(spa);
+	spa_config_exit(spa, SCL_ALL, spa);

 	/*
 	 * Note: this txg_wait_synced() is important because it ensures
@@ -662,7 +896,7 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
 	 * If the config changed, update the config cache.
 	 */
 	if (config_changed)
-		spa_config_sync();
+		spa_config_sync(spa, B_FALSE, B_TRUE);

 	mutex_exit(&spa_namespace_lock);

@@ -670,6 +904,26 @@
 }

 /*
+ * Lock the given spa_t for the purpose of changing vdev state.
+ */
+void
+spa_vdev_state_enter(spa_t *spa)
+{
+	spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER);
+}
+
+int
+spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
+{
+	if (vd != NULL)
+		vdev_state_dirty(vd->vdev_top);
+
+	spa_config_exit(spa, SCL_STATE_ALL, spa);
+
+	return (error);
+}
+
+/*
  * ==========================================================================
  * Miscellaneous functions
  * ==========================================================================
@@ -696,11 +950,10 @@ spa_rename(const char *name, const char *newname)
 		return (err);
 	}

-	spa_config_enter(spa, RW_WRITER, FTAG);
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

 	avl_remove(&spa_namespace_avl, spa);
-	spa_strfree(spa->spa_name);
-	spa->spa_name = spa_strdup(newname);
+	(void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
 	avl_add(&spa_namespace_avl, spa);

 	/*
@@ -710,14 +963,14 @@
 	 */
 	vdev_config_dirty(spa->spa_root_vdev);

-	spa_config_exit(spa, FTAG);
+	spa_config_exit(spa, SCL_ALL, FTAG);

 	txg_wait_synced(spa->spa_dsl_pool, 0);

 	/*
 	 * Sync the updated config cache.
 	 */
-	spa_config_sync();
+	spa_config_sync(spa, B_FALSE, B_TRUE);

 	spa_close(spa, FTAG);

@@ -754,7 +1007,7 @@
 			break;

 		/*
-		 * Check any devices we may in the process of adding.
+		 * Check any devices we may be in the process of adding.
 		 */
 		if (spa->spa_pending_vdev) {
 			if (vdev_lookup_by_guid(spa->spa_pending_vdev,
@@ -848,12 +1101,12 @@ spa_freeze(spa_t *spa)
 {
 	uint64_t freeze_txg = 0;

-	spa_config_enter(spa, RW_WRITER, FTAG);
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	if (spa->spa_freeze_txg == UINT64_MAX) {
 		freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
 		spa->spa_freeze_txg = freeze_txg;
 	}
-	spa_config_exit(spa, FTAG);
+	spa_config_exit(spa, SCL_ALL, FTAG);
 	if (freeze_txg != 0)
 		txg_wait_synced(spa_get_dsl(spa), freeze_txg);
 }
@@ -880,7 +1133,7 @@ spa_traverse_rwlock(spa_t *spa)
 	return (&spa->spa_traverse_lock);
 }

-int
+boolean_t
 spa_traverse_wanted(spa_t *spa)
 {
 	return (spa->spa_traverse_wanted);
@@ -922,13 +1175,6 @@ spa_sync_pass(spa_t *spa)
 char *
 spa_name(spa_t *spa)
 {
-	/*
-	 * Accessing the name requires holding either the namespace lock or the
-	 * config lock, both of which are required to do a rename.
-	 */
-	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
-	    spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER));
-
 	return (spa->spa_name);
 }

@@ -972,16 +1218,6 @@ spa_freeze_txg(spa_t *spa)
 }

 /*
- * In the future, this may select among different metaslab classes
- * depending on the zdp. For now, there's no such distinction.
- */
-metaslab_class_t *
-spa_metaslab_class_select(spa_t *spa)
-{
-	return (spa->spa_normal_class);
-}
-
-/*
  * Return how much space is allocated in the pool (ie. sum of all asize)
  */
 uint64_t
@@ -1024,6 +1260,22 @@ spa_get_asize(spa_t *spa, uint64_t lsize)
 	return (lsize * 6);
 }
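Editor's aside (not part of this change): the spa_vdev_state_enter()/spa_vdev_state_exit() pair added above is conventionally used in an enter/modify/exit shape, in which every return path funnels through the exit function so the locks are always dropped and the top-level vdev is dirtied exactly once. A minimal sketch; the caller, its error policy, and the flag it sets are hypothetical:

static int
example_flag_vdev_offline(spa_t *spa, vdev_t *vd)
{
	/* Takes SCL_STATE_ALL as writer. */
	spa_vdev_state_enter(spa);

	if (vd == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	vd->vdev_offline = B_TRUE;

	/* Dirties vd->vdev_top, drops the locks, and returns 0. */
	return (spa_vdev_state_exit(spa, vd, 0));
}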
+/*
+ * Return the failure mode that has been set for this pool. The default
+ * behavior will be to block all I/Os when a complete failure occurs.
+ */
+uint8_t
+spa_get_failmode(spa_t *spa)
+{
+	return (spa->spa_failmode);
+}
+
+boolean_t
+spa_suspended(spa_t *spa)
+{
+	return (spa->spa_suspended);
+}
+
 uint64_t
 spa_version(spa_t *spa)
 {
@@ -1034,11 +1286,11 @@ int
 spa_max_replication(spa_t *spa)
 {
 	/*
-	 * As of ZFS_VERSION == ZFS_VERSION_DITTO_BLOCKS, we are able to
+	 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
 	 * handle BPs with more than one DVA allocated. Set our max
 	 * replication level accordingly.
 	 */
-	if (spa_version(spa) < ZFS_VERSION_DITTO_BLOCKS)
+	if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
 		return (1);
 	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
 }
@@ -1051,12 +1303,15 @@ bp_get_dasize(spa_t *spa, const blkptr_t *bp)
 {
 	if (!spa->spa_deflate)
 		return (BP_GET_ASIZE(bp));

+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
 		vdev_t *vd =
 		    vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
-		sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> SPA_MINBLOCKSHIFT) *
-		    vd->vdev_deflate_ratio;
+		if (vd)
+			sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
+			    SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
 	}
+	spa_config_exit(spa, SCL_VDEV, FTAG);
 	return (sz);
 }
@@ -1088,18 +1343,27 @@ spa_busy(void)
 }

 void
+spa_boot_init()
+{
+	spa_config_load();
+}
+
+void
 spa_init(int mode)
 {
 	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);

 	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
 	    offsetof(spa_t, spa_avl));

-	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
+	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
+	    offsetof(spa_aux_t, aux_avl));

-	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_spare_t),
-	    offsetof(spa_spare_t, spare_avl));
+	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
+	    offsetof(spa_aux_t, aux_avl));

 	spa_mode = mode;

@@ -1108,23 +1372,53 @@ spa_init(int mode)
 	zio_init();
 	dmu_init();
 	zil_init();
+	vdev_cache_stat_init();
+	zfs_prop_init();
+	zpool_prop_init();
 	spa_config_load();
+	l2arc_start();
 }

 void
 spa_fini(void)
 {
+	l2arc_stop();
+
 	spa_evict_all();
+	vdev_cache_stat_fini();

 	zil_fini();
 	dmu_fini();
 	zio_fini();
+	unique_fini();
 	refcount_fini();

 	avl_destroy(&spa_namespace_avl);
 	avl_destroy(&spa_spare_avl);
+	avl_destroy(&spa_l2cache_avl);
 	cv_destroy(&spa_namespace_cv);

 	mutex_destroy(&spa_namespace_lock);
 	mutex_destroy(&spa_spare_lock);
+	mutex_destroy(&spa_l2cache_lock);
+}
+
+/*
+ * Return whether this pool has slogs. No locking needed.
+ * It's not a problem if the wrong answer is returned as it's only for
+ * performance and not correctness.
+ */
+boolean_t
+spa_has_slogs(spa_t *spa)
+{
+	return (spa->spa_log_class->mc_rotor != NULL);
+}
+
+/*
+ * Return whether this pool is the root pool.
+ */
+boolean_t
+spa_is_root(spa_t *spa)
+{
+	return (spa->spa_is_root);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
index 23313a908ab4..8fdfa6200ea9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. */ @@ -300,6 +300,7 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, uint64_t *entry, *entry_map, *entry_map_end; uint64_t bufsize, size, offset, end, space; uint64_t mapstart = sm->sm_start; + int error = 0; ASSERT(MUTEX_HELD(sm->sm_lock)); @@ -337,9 +338,10 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, smo->smo_object, offset, size); mutex_exit(sm->sm_lock); - VERIFY3U(dmu_read(os, smo->smo_object, offset, size, - entry_map), ==, 0); + error = dmu_read(os, smo->smo_object, offset, size, entry_map); mutex_enter(sm->sm_lock); + if (error != 0) + break; entry_map_end = entry_map + (size / sizeof (uint64_t)); for (entry = entry_map; entry < entry_map_end; entry++) { @@ -354,20 +356,25 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, SM_RUN_DECODE(e) << sm->sm_shift); } } - VERIFY3U(sm->sm_space, ==, space); + + if (error == 0) { + VERIFY3U(sm->sm_space, ==, space); + + sm->sm_loaded = B_TRUE; + sm->sm_ops = ops; + if (ops != NULL) + ops->smop_load(sm); + } else { + space_map_vacate(sm, NULL, NULL); + } zio_buf_free(entry_map, bufsize); sm->sm_loading = B_FALSE; - sm->sm_loaded = B_TRUE; - sm->sm_ops = ops; cv_broadcast(&sm->sm_load_cv); - if (ops != NULL) - ops->smop_load(sm); - - return (0); + return (error); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h index f58ffc059f91..f3e00877a8e2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_ARC_H #define _SYS_ARC_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #ifdef __cplusplus @@ -35,11 +33,12 @@ extern "C" { #endif #include <sys/zio.h> +#include <sys/dmu.h> +#include <sys/spa.h> typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); -typedef void arc_byteswap_func_t(void *buf, size_t size); typedef int arc_evict_func_t(void *private); /* generic arc_done_func_t's which you can use */ @@ -49,15 +48,16 @@ arc_done_func_t arc_getbuf_func; struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; + krwlock_t b_lock; void *b_data; arc_evict_func_t *b_efunc; void *b_private; }; typedef enum arc_buf_contents { - ARC_BUFC_UNDEF, /* buffer contents undefined */ ARC_BUFC_DATA, /* buffer contains data */ - ARC_BUFC_METADATA /* buffer contains metadata */ + ARC_BUFC_METADATA, /* buffer contains metadata */ + ARC_BUFC_NUMTYPES } arc_buf_contents_t; /* * These are the flags we pass into calls to the arc @@ -66,7 +66,12 @@ typedef enum arc_buf_contents { #define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */ #define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ #define ARC_CACHED (1 << 4) /* I/O was already in cache */ +#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */ +void arc_space_consume(uint64_t space); +void arc_space_return(uint64_t space); +void *arc_data_buf_alloc(uint64_t space); +void arc_data_buf_free(void *buf, uint64_t space); arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type); void arc_buf_add_ref(arc_buf_t *buf, void *tag); @@ -81,13 +86,24 @@ void arc_buf_thaw(arc_buf_t *buf); int arc_referenced(arc_buf_t *buf); #endif -int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, +typedef struct writeprops { + dmu_object_type_t wp_type; + uint8_t wp_level; + uint8_t wp_copies; + uint8_t wp_dncompress, wp_oscompress; + uint8_t wp_dnchecksum, wp_oschecksum; +} writeprops_t; + +int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb); +int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int flags, - uint32_t *arc_flags, zbookmark_t *zb); -zio_t *arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, - int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + uint32_t *arc_flags, const zbookmark_t *zb); +zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, + boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb); + int zio_flags, const zbookmark_t *zb); int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_done_func_t *done, void *private, uint32_t arc_flags); int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); @@ -95,13 +111,25 @@ int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); int arc_buf_evict(arc_buf_t *buf); -void arc_flush(void); -void arc_tempreserve_clear(uint64_t tempreserve); -int arc_tempreserve_space(uint64_t tempreserve); +void arc_flush(spa_t *spa); +void arc_tempreserve_clear(uint64_t reserve); +int arc_tempreserve_space(uint64_t reserve, uint64_t txg); void arc_init(void); void arc_fini(void); +/* + * Level 2 ARC + */ + +void l2arc_add_vdev(spa_t *spa, vdev_t *vd, 
uint64_t start, uint64_t end); +void l2arc_remove_vdev(vdev_t *vd); +boolean_t l2arc_vdev_present(vdev_t *vd); +void l2arc_init(void); +void l2arc_fini(void); +void l2arc_start(void); +void l2arc_stop(void); + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h index b4c83765c873..cdb93a6c35a3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_BPLIST_H #define _SYS_BPLIST_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/spa.h> #include <sys/txg.h> @@ -75,12 +73,14 @@ extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); extern void bplist_close(bplist_t *bpl); extern boolean_t bplist_empty(bplist_t *bpl); extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp); -extern int bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx); -extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp); +extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx); +extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp); extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx); extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); extern int bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +extern int bplist_space_birthrange(bplist_t *bpl, + uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index d33657b9e67c..b27d89fe2162 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -239,7 +239,7 @@ typedef struct dbuf_hash_table { uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); -dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn); +void dbuf_create_bonus(struct dnode *dn); dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, @@ -271,7 +271,7 @@ void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, dmu_tx_t *tx); -void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks, +void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, struct dmu_tx *); void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); @@ -279,10 +279,21 @@ void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); void dbuf_init(void); void dbuf_fini(void); -#define DBUF_GET_BUFC_TYPE(db) \ - ((((db)->db_level > 0) || \ - (dmu_ot[(db)->db_dnode->dn_type].ot_metadata)) ? \ - ARC_BUFC_METADATA : ARC_BUFC_DATA); +#define DBUF_IS_METADATA(db) \ + ((db)->db_level > 0 || dmu_ot[(db)->db_dnode->dn_type].ot_metadata) + +#define DBUF_GET_BUFC_TYPE(db) \ + (DBUF_IS_METADATA(db) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) + +#define DBUF_IS_CACHEABLE(db) \ + ((db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ + (DBUF_IS_METADATA(db) && \ + ((db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) + +#define DBUF_IS_L2CACHEABLE(db) \ + ((db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ + (DBUF_IS_METADATA(db) && \ + ((db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) #ifdef ZFS_DEBUG diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index 8c2a1fdaa823..4535c6864074 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,6 +38,7 @@ #include <sys/types.h> #include <sys/param.h> +#include <sys/cred.h> #ifdef __cplusplus extern "C" { @@ -91,7 +92,7 @@ typedef enum dmu_object_type { DMU_OT_DSL_DATASET, /* UINT64 */ /* zpl: */ DMU_OT_ZNODE, /* ZNODE */ - DMU_OT_ACL, /* ACL */ + DMU_OT_OLDACL, /* Old ACL */ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ DMU_OT_MASTER_NODE, /* ZAP */ @@ -108,7 +109,13 @@ typedef enum dmu_object_type { DMU_OT_SPA_HISTORY, /* UINT8 */ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ DMU_OT_POOL_PROPS, /* ZAP */ - + DMU_OT_DSL_PERMS, /* ZAP */ + DMU_OT_ACL, /* ACL */ + DMU_OT_SYSACL, /* SYSACL */ + DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ + DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ + DMU_OT_NEXT_CLONES, /* ZAP */ + DMU_OT_SCRUB_QUEUE, /* ZAP */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -127,15 +134,15 @@ void byteswap_uint32_array(void *buf, size_t size); void byteswap_uint16_array(void *buf, size_t size); void byteswap_uint8_array(void *buf, size_t size); void zap_byteswap(void *buf, size_t size); +void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); -#define DS_MODE_NONE 0 /* invalid, to aid debugging */ -#define DS_MODE_STANDARD 1 /* normal access, no special needs */ -#define DS_MODE_PRIMARY 2 /* the "main" access, e.g. a mount */ -#define DS_MODE_EXCLUSIVE 3 /* exclusive access, e.g. to destroy */ -#define DS_MODE_LEVELS 4 -#define DS_MODE_LEVEL(x) ((x) & (DS_MODE_LEVELS - 1)) +#define DS_MODE_NOHOLD 0 /* internal use only */ +#define DS_MODE_USER 1 /* simple access, no special needs */ +#define DS_MODE_OWNER 2 /* the "main" access, e.g. a mount */ +#define DS_MODE_TYPE_MASK 0x3 +#define DS_MODE_TYPE(x) ((x) & DS_MODE_TYPE_MASK) #define DS_MODE_READONLY 0x8 #define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY) #define DS_MODE_INCONSISTENT 0x10 @@ -149,20 +156,23 @@ void zfs_znode_byteswap(void *buf, size_t size); * operation, including metadata. */ #define DMU_MAX_ACCESS (10<<20) /* 10MB */ +#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ /* * Public routines to create, destroy, open, and close objsets. 
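 *
 * Editor's illustration (not part of this change): a hypothetical
 * read-only open/close pairing using the DS_MODE_* flags above:
 *
 *	objset_t *os;
 *	int err;
 *
 *	err = dmu_objset_open("tank/fs", DMU_OST_ZFS,
 *	    DS_MODE_USER | DS_MODE_READONLY, &os);
 *	if (err == 0) {
 *		(... use os ...)
 *		dmu_objset_close(os);
 *	}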
*/ int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, objset_t **osp); +int dmu_objset_open_ds(struct dsl_dataset *ds, dmu_objset_type_t type, + objset_t **osp); void dmu_objset_close(objset_t *os); -int dmu_objset_evict_dbufs(objset_t *os, int try); +int dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, - void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg); + objset_t *clone_parent, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); int dmu_objset_destroy(const char *name); int dmu_snapshots_destroy(char *fsname, char *snapname); -int dmu_objset_rollback(const char *name); +int dmu_objset_rollback(objset_t *os); int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); int dmu_objset_rename(const char *name, const char *newname, boolean_t recursive); @@ -180,11 +190,6 @@ typedef struct dmu_buf { typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); /* - * Callback function to perform byte swapping on a block. - */ -typedef void dmu_byteswap_func_t(void *buf, size_t size); - -/* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ #define DMU_POOL_DIRECTORY_OBJECT 1 @@ -197,6 +202,20 @@ typedef void dmu_byteswap_func_t(void *buf, size_t size); #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" +#define DMU_POOL_L2CACHE "l2cache" + +/* 4x8 zbookmark_t */ +#define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark" +/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */ +#define DMU_POOL_SCRUB_QUEUE "scrub_queue" +/* 1x8 txg */ +#define DMU_POOL_SCRUB_MIN_TXG "scrub_min_txg" +/* 1x8 txg */ +#define DMU_POOL_SCRUB_MAX_TXG "scrub_max_txg" +/* 1x4 enum scrub_func */ +#define DMU_POOL_SCRUB_FUNC "scrub_func" +/* 1x8 count */ +#define DMU_POOL_SCRUB_ERRORS "scrub_errors" /* * Allocate an object from this objset. The range of object numbers @@ -298,6 +317,7 @@ int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb, */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); +int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); /* * Obtain the DMU buffer from the specified object which contains the @@ -417,6 +437,9 @@ void dmu_tx_commit(dmu_tx_t *tx); */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); +int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size); +int dmu_free_object(objset_t *os, uint64_t object); /* * Convenience functions. 
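Editor's aside (not part of this change): the DMU_POOL_* names defined earlier in this header are keys in the MOS directory object, and the size annotations above give each zap entry's shape. For instance, the scrub bookmark annotated "4x8" could be read back as four uint64s through the standard zap_lookup() interface; the wrapper below is hypothetical:

static int
example_read_scrub_bookmark(objset_t *mos, zbookmark_t *zb)
{
	/* 4x8: four 8-byte integers holding the zbookmark_t fields. */
	return (zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, zb));
}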
@@ -458,8 +481,10 @@ typedef struct dmu_object_info {
 	uint64_t doi_max_block_offset;
 } dmu_object_info_t;

+typedef void arc_byteswap_func_t(void *buf, size_t size);
+
 typedef struct dmu_object_type_info {
-	dmu_byteswap_func_t	*ot_byteswap;
+	arc_byteswap_func_t	*ot_byteswap;
 	boolean_t		ot_metadata;
 	char			*ot_name;
 } dmu_object_type_info_t;

@@ -482,10 +507,11 @@ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
 typedef struct dmu_objset_stats {
 	uint64_t dds_num_clones; /* number of clones of this */
 	uint64_t dds_creation_txg;
+	uint64_t dds_guid;
 	dmu_objset_type_t dds_type;
 	uint8_t dds_is_snapshot;
 	uint8_t dds_inconsistent;
-	char dds_clone_of[MAXNAMELEN];
+	char dds_origin[MAXNAMELEN];
 } dmu_objset_stats_t;

 /*
@@ -531,9 +557,13 @@ extern void dmu_objset_name(objset_t *os, char *buf);
 extern dmu_objset_type_t dmu_objset_type(objset_t *os);
 extern uint64_t dmu_objset_id(objset_t *os);
 extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
-    uint64_t *id, uint64_t *offp);
+    uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
+extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
+    int maxlen, boolean_t *conflict);
 extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp);
+extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
+extern void *dmu_objset_get_user(objset_t *os);

 /*
  * Return the txg number for the given assigned transaction.
@@ -544,7 +574,7 @@ uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
  * Synchronous write.
  * If a parent zio is provided this function initiates a write on the
  * provided buffer as a child of the parent zio.
- * In the absense of a parent zio, the write is completed synchronously.
+ * In the absence of a parent zio, the write is completed synchronously.
  * At write completion, blk is filled with the bp of the written block.
  * Note that while the data covered by this function will be on stable
  * storage when the write completes this new data does not become a
@@ -572,9 +602,30 @@ typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
 void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
     dmu_traverse_cb_t cb, void *arg);

-int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp);
-int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
-    boolean_t force, struct file *fp, uint64_t voffset);
+int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
+    struct file *fp, offset_t *off);
+
+typedef struct dmu_recv_cookie {
+	/*
+	 * This structure is opaque!
+	 *
+	 * If logical and real are different, we are recving the stream
+	 * into the "real" temporary clone, and then switching it with
+	 * the "logical" target.
+	 */
+	struct dsl_dataset *drc_logical_ds;
+	struct dsl_dataset *drc_real_ds;
+	struct drr_begin *drc_drrb;
+	char *drc_tosnap;
+	boolean_t drc_newfs;
+	boolean_t drc_force;
+} dmu_recv_cookie_t;
+
+int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *,
+    boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *);
+int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp);
+int dmu_recv_end(dmu_recv_cookie_t *drc);
+void dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc);
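Editor's aside (not part of this change): the prototypes above form a begin/stream/end sequence around the opaque cookie. A minimal consumer sketch; the wrapper and its error handling are hypothetical, derived only from the declarations above:

static int
example_recv(char *tofs, char *tosnap, struct drr_begin *drrb,
    struct file *fp, offset_t *offp)
{
	dmu_recv_cookie_t drc;
	int err;

	/* Set up the cookie (no force, no origin, offline receive). */
	err = dmu_recv_begin(tofs, tosnap, drrb, B_FALSE, NULL, B_FALSE, &drc);
	if (err != 0)
		return (err);

	/* Pull the stream; on failure, tear down the temporary clone. */
	err = dmu_recv_stream(&drc, fp, offp);
	if (err != 0) {
		dmu_recv_abort_cleanup(&drc);
		return (err);
	}

	/* Commit: swap the temporary clone with the logical target. */
	return (dmu_recv_end(&drc));
}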
 /* CRC64 table */
 #define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
index 807011e94ffc..96ce688e1551 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
@@ -19,15 +19,13 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

 #ifndef _SYS_DMU_IMPL_H
 #define	_SYS_DMU_IMPL_H

-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/txg_impl.h>
 #include <sys/zio.h>
 #include <sys/dnode.h>
@@ -51,7 +49,7 @@ extern "C" {
 * XXX try to improve evicting path?
 *
 * dp_config_rwlock > os_obj_lock > dn_struct_rwlock >
- *	dn_dbufs_mtx > hash_mutexes > db_mtx > leafs
+ *	dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs
 *
 * dp_config_rwlock
 *    must be held before: everything
@@ -177,7 +175,10 @@ extern "C" {
 *    dmu_tx_try_assign: dn_notxholds(cv)
 *    dmu_tx_unassign: none
 *
- * dd_lock (leaf)
+ * dd_lock
+ *    must be held before:
+ *      ds_lock
+ *      ancestors' dd_lock
 *    protects:
 *      dd_prop_cbs
 *      dd_sync_*
@@ -207,13 +208,14 @@ extern "C" {
 *    dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes)
 *    dnode_free: none (dn_dirtyblksz, os_*_dnodes)
 *
- * ds_lock (leaf)
+ * ds_lock
 *    protects:
 *      ds_user_ptr
 *      ds_user_evict_func
 *      ds_open_refcount
 *      ds_snapname
 *      ds_phys accounting
+ *      ds_reserved
 *    held from:
 *      dsl_dataset_*
 *
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
index 8293a3b4076a..15df29a17799 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
@@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
*/ @@ -69,12 +69,13 @@ typedef struct objset_impl { uint8_t os_checksum; /* can change, under dsl_dir's locks */ uint8_t os_compress; /* can change, under dsl_dir's locks */ uint8_t os_copies; /* can change, under dsl_dir's locks */ - uint8_t os_md_checksum; - uint8_t os_md_compress; + uint8_t os_primary_cache; /* can change, under dsl_dir's locks */ + uint8_t os_secondary_cache; /* can change, under dsl_dir's locks */ /* no lock needed: */ struct dmu_tx *os_synctx; /* XXX sketchy */ blkptr_t *os_rootbp; + zil_header_t os_zil_header; /* Protected by os_obj_lock */ kmutex_t os_obj_lock; @@ -86,19 +87,27 @@ typedef struct objset_impl { list_t os_free_dnodes[TXG_SIZE]; list_t os_dnodes; list_t os_downgraded_dbufs; + + /* stuff we store for the user */ + kmutex_t os_user_ptr_lock; + void *os_user_ptr; } objset_impl_t; #define DMU_META_DNODE_OBJECT 0 +#define DMU_OS_IS_L2CACHEABLE(os) \ + ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ + (os)->os_secondary_cache == ZFS_CACHE_METADATA) + /* called from zpl */ int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, objset_t **osp); void dmu_objset_close(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, - void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg); + objset_t *clone_parent, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); int dmu_objset_destroy(const char *name); -int dmu_objset_rollback(const char *name); +int dmu_objset_rollback(objset_t *os); int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); void dmu_objset_stats(objset_t *os, nvlist_t *nv); void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); @@ -107,8 +116,10 @@ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t dmu_objset_fsid_guid(objset_t *os); int dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags); +int dmu_objset_find_spa(spa_t *spa, const char *name, + int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); -int dmu_objset_evict_dbufs(objset_t *os, int try); +int dmu_objset_evict_dbufs(objset_t *os); /* called from dsl */ void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h index ea9fa6c1e36c..05e5ffdbff5d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -100,6 +100,7 @@ struct traverse_handle { int traverse_dsl_dataset(struct dsl_dataset *ds, uint64_t txg_start, int advance, blkptr_cb_t func, void *arg); +int traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg); traverse_handle_t *traverse_init(spa_t *spa, blkptr_cb_t *func, void *arg, int advance, int zio_flags); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h index 89f4799b57fe..6aaf35dc038f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -64,6 +64,7 @@ struct dmu_tx { uint64_t tx_space_towrite; uint64_t tx_space_tofree; uint64_t tx_space_tooverwrite; + uint64_t tx_space_tounref; refcount_t tx_space_written; refcount_t tx_space_freed; #endif @@ -86,6 +87,9 @@ typedef struct dmu_tx_hold { uint64_t txh_space_towrite; uint64_t txh_space_tofree; uint64_t txh_space_tooverwrite; + uint64_t txh_space_tounref; + uint64_t txh_memory_tohold; + uint64_t txh_fudge; #ifdef ZFS_DEBUG enum dmu_tx_hold_type txh_type; uint64_t txh_arg1; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h index 327e538cf809..c79ff48a60c5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DNODE_H #define _SYS_DNODE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/avl.h> #include <sys/spa.h> @@ -41,12 +39,19 @@ extern "C" { #endif /* - * Flags. + * dnode_hold() flags. */ #define DNODE_MUST_BE_ALLOCATED 1 #define DNODE_MUST_BE_FREE 2 /* + * dnode_next_offset() flags. + */ +#define DNODE_FIND_HOLE 1 +#define DNODE_FIND_BACKWARDS 2 +#define DNODE_FIND_HAVELOCK 4 + +/* * Fixed constants. 
*/ #define DNODE_SHIFT 9 /* 512 bytes */ @@ -64,6 +69,7 @@ extern "C" { #define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) #define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) #define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) +#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1) #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) @@ -156,6 +162,7 @@ typedef struct dnode { uint64_t dn_maxblkid; uint8_t dn_next_nlevels[TXG_SIZE]; uint8_t dn_next_indblkshift[TXG_SIZE]; + uint16_t dn_next_bonuslen[TXG_SIZE]; uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ /* protected by os_lock: */ @@ -197,11 +204,12 @@ dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp, uint64_t object); void dnode_special_close(dnode_t *dn); +void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); int dnode_hold(struct objset_impl *dd, uint64_t object, void *ref, dnode_t **dnp); int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, void *ref, dnode_t **dnp); -void dnode_add_ref(dnode_t *dn, void *ref); +boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); void dnode_sync(dnode_t *dn, dmu_tx_t *tx); @@ -220,13 +228,13 @@ void dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx); void dnode_diduse_space(dnode_t *dn, int64_t space); void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx); -void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx); +void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); void dnode_init(void); void dnode_fini(void); -int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl, - uint64_t blkfill, uint64_t txg); -int dnode_evict_dbufs(dnode_t *dn, int try); +int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, + int minlvl, uint64_t blkfill, uint64_t txg); +void dnode_evict_dbufs(dnode_t *dn); #ifdef ZFS_DEBUG diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h index 8cfc1dcc9840..8665aec2dda8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DSL_DATASET_H #define _SYS_DSL_DATASET_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/spa.h> #include <sys/txg.h> @@ -47,6 +45,8 @@ struct dsl_pool; typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); #define DS_FLAG_INCONSISTENT (1ULL<<0) +#define DS_IS_INCONSISTENT(ds) \ + ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) /* * NB: nopromote can not yet be set, but we want support for it in this * on-disk version, so that we don't need to upgrade for it later. It @@ -55,16 +55,29 @@ typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); */ #define DS_FLAG_NOPROMOTE (1ULL<<1) +/* + * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly + * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE, + * refquota/refreservations). 
+ */ +#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) + +/* + * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose + * name lookups should be performed case-insensitively. + */ +#define DS_FLAG_CI_DATASET (1ULL<<16) + typedef struct dsl_dataset_phys { - uint64_t ds_dir_obj; - uint64_t ds_prev_snap_obj; + uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */ + uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */ uint64_t ds_prev_snap_txg; - uint64_t ds_next_snap_obj; - uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */ + uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */ + uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */ uint64_t ds_num_children; /* clone/snap children; ==0 for head */ uint64_t ds_creation_time; /* seconds since 1970 */ uint64_t ds_creation_txg; - uint64_t ds_deadlist_obj; + uint64_t ds_deadlist_obj; /* DMU_OT_BPLIST */ uint64_t ds_used_bytes; uint64_t ds_compressed_bytes; uint64_t ds_uncompressed_bytes; @@ -76,9 +89,11 @@ typedef struct dsl_dataset_phys { */ uint64_t ds_fsid_guid; uint64_t ds_guid; - uint64_t ds_flags; + uint64_t ds_flags; /* DS_FLAG_* */ blkptr_t ds_bp; - uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */ + uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */ + uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */ + uint64_t ds_pad[6]; /* pad out to 320 bytes for good measure */ } dsl_dataset_phys_t; typedef struct dsl_dataset { @@ -87,9 +102,11 @@ typedef struct dsl_dataset { dsl_dataset_phys_t *ds_phys; dmu_buf_t *ds_dbuf; uint64_t ds_object; + uint64_t ds_fsid_guid; - /* only used in syncing context: */ - struct dsl_dataset *ds_prev; /* only valid for non-snapshots */ + /* only used in syncing context, only valid for non-snapshots: */ + struct dsl_dataset *ds_prev; + uint64_t ds_origin_txg; /* has internal locking: */ bplist_t ds_deadlist; @@ -105,11 +122,23 @@ typedef struct dsl_dataset { kmutex_t ds_lock; void *ds_user_ptr; dsl_dataset_evict_func_t *ds_user_evict_func; - uint64_t ds_open_refcount; + + /* + * ds_owner is protected by the ds_rwlock and the ds_lock + */ + krwlock_t ds_rwlock; + kcondvar_t ds_exclusive_cv; + void *ds_owner; /* no locking; only for making guesses */ uint64_t ds_trysnap_txg; + /* for objset_open() */ + kmutex_t ds_opening_lock; + + uint64_t ds_reserved; /* cached refreservation */ + uint64_t ds_quota; /* cached refquota */ + /* Protected by ds_lock; keep at end of struct for better locality */ char ds_snapname[MAXNAMELEN]; } dsl_dataset_t; @@ -117,23 +146,38 @@ typedef struct dsl_dataset { #define dsl_dataset_is_snapshot(ds) \ ((ds)->ds_phys->ds_num_children != 0) -int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, - void *tag, dsl_dataset_t **dsp); -int dsl_dataset_open(const char *name, int mode, void *tag, +#define DS_UNIQUE_IS_ACCURATE(ds) \ + (((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) + +int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp); +int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, + void *tag, dsl_dataset_t **); +int dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp); -int dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj, - const char *tail, int mode, void *tag, dsl_dataset_t **); +int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, + int flags, void *owner, dsl_dataset_t **); void dsl_dataset_name(dsl_dataset_t *ds, char *name); -void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag); -uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, - const 
char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx); -int dsl_dataset_destroy(const char *name); +void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); +void dsl_dataset_disown(dsl_dataset_t *ds, void *owner); +void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag); +boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, + void *owner); +void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner); +uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, + dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); +uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, + uint64_t flags, dmu_tx_t *tx); +int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag); int dsl_snapshots_destroy(char *fsname, char *snapname); +dsl_checkfunc_t dsl_dataset_destroy_check; +dsl_syncfunc_t dsl_dataset_destroy_sync; dsl_checkfunc_t dsl_dataset_snapshot_check; dsl_syncfunc_t dsl_dataset_snapshot_sync; -int dsl_dataset_rollback(dsl_dataset_t *ds); +int dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost); int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); int dsl_dataset_promote(const char *name); +int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, + boolean_t force); void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds, void *p, dsl_dataset_evict_func_t func); @@ -144,10 +188,12 @@ void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); +boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds); + void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); -void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, +int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, dmu_tx_t *tx); int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); @@ -160,11 +206,19 @@ void dsl_dataset_space(dsl_dataset_t *ds, uint64_t *usedobjsp, uint64_t *availobjsp); uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); -void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp, - dmu_tx_t *tx); - int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); +int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used, + uint64_t *ref_rsrv); +int dsl_dataset_set_quota(const char *dsname, uint64_t quota); +void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, + dmu_tx_t *tx); +int dsl_dataset_set_reservation(const char *dsname, uint64_t reservation); +void dsl_dataset_set_flags(dsl_dataset_t *ds, uint64_t flags); +int64_t dsl_dataset_new_refreservation(dsl_dataset_t *ds, uint64_t reservation, + dmu_tx_t *tx); + #ifdef ZFS_DEBUG #define dprintf_ds(ds, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h new file mode 100644 index 000000000000..a29e44e67d0c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_DELEG_H +#define _SYS_DSL_DELEG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_DELEG_PERM_NONE "" +#define ZFS_DELEG_PERM_CREATE "create" +#define ZFS_DELEG_PERM_DESTROY "destroy" +#define ZFS_DELEG_PERM_SNAPSHOT "snapshot" +#define ZFS_DELEG_PERM_ROLLBACK "rollback" +#define ZFS_DELEG_PERM_CLONE "clone" +#define ZFS_DELEG_PERM_PROMOTE "promote" +#define ZFS_DELEG_PERM_RENAME "rename" +#define ZFS_DELEG_PERM_MOUNT "mount" +#define ZFS_DELEG_PERM_SHARE "share" +#define ZFS_DELEG_PERM_SEND "send" +#define ZFS_DELEG_PERM_RECEIVE "receive" +#define ZFS_DELEG_PERM_ALLOW "allow" +#define ZFS_DELEG_PERM_USERPROP "userprop" +#define ZFS_DELEG_PERM_VSCAN "vscan" + +/* + * Note: the names of properties that are marked delegatable are also + * valid delegated permissions + */ + +int dsl_deleg_get(const char *ddname, nvlist_t **nvp); +int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset); +int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr); +void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr); +int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr); +int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr); +int dsl_deleg_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx); +boolean_t dsl_delegation_on(objset_t *os); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DELEG_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h index e0595d3c368b..86b9636ceaab 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_DSL_DIR_H #define _SYS_DSL_DIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dsl_pool.h> #include <sys/dsl_synctask.h> @@ -40,11 +38,22 @@ extern "C" { struct dsl_dataset; +typedef enum dd_used { + DD_USED_HEAD, + DD_USED_SNAP, + DD_USED_CHILD, + DD_USED_CHILD_RSRV, + DD_USED_REFRSRV, + DD_USED_NUM +} dd_used_t; + +#define DD_FLAG_USED_BREAKDOWN (1<<0) + typedef struct dsl_dir_phys { uint64_t dd_creation_time; /* not actually used */ uint64_t dd_head_dataset_obj; uint64_t dd_parent_obj; - uint64_t dd_clone_parent_obj; + uint64_t dd_origin_obj; uint64_t dd_child_dir_zapobj; /* * how much space our children are accounting for; for leaf @@ -58,7 +67,10 @@ typedef struct dsl_dir_phys { /* Administrative reservation setting */ uint64_t dd_reserved; uint64_t dd_props_zapobj; - uint64_t dd_pad[21]; /* pad out to 256 bytes for good measure */ + uint64_t dd_deleg_zapobj; /* dataset delegation permissions */ + uint64_t dd_flags; + uint64_t dd_used_breakdown[DD_USED_NUM]; + uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */ } dsl_dir_phys_t; struct dsl_dir { @@ -78,9 +90,6 @@ struct dsl_dir { kmutex_t dd_lock; list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ - /* Accounting */ - /* reflects any changes to dd_phys->dd_used_bytes made this syncing */ - int64_t dd_used_bytes; /* gross estimate of space used by in-flight tx's */ uint64_t dd_tempreserved[TXG_SIZE]; /* amount of space we expect to write; == amount of dirty data */ @@ -99,8 +108,8 @@ int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, void dsl_dir_name(dsl_dir_t *dd, char *buf); int dsl_dir_namelen(dsl_dir_t *dd); int dsl_dir_is_private(dsl_dir_t *dd); -uint64_t dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx); -void dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx); +uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, + const char *name, dmu_tx_t *tx); dsl_checkfunc_t dsl_dir_destroy_check; dsl_syncfunc_t dsl_dir_destroy_sync; void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); @@ -109,18 +118,26 @@ uint64_t dsl_dir_space_available(dsl_dir_t *dd, void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, - uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx); + uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep, + dmu_tx_t *tx); void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); -void dsl_dir_diduse_space(dsl_dir_t *dd, +void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); +void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); int dsl_dir_set_quota(const char *ddname, uint64_t quota); int dsl_dir_set_reservation(const char *ddname, uint64_t reservation); int dsl_dir_rename(dsl_dir_t *dd, const char *newname); int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); +int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx); +boolean_t dsl_dir_is_clone(dsl_dir_t *dd); +void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, + uint64_t reservation, cred_t *cr, dmu_tx_t *tx); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" +#define ORIGIN_DIR_NAME "$ORIGIN" #ifdef ZFS_DEBUG #define dprintf_dd(dd, fmt, ...) 
do { \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h index f7ec67a0e062..4dd88fe6fa55 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h @@ -19,19 +19,18 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DSL_POOL_H #define _SYS_DSL_POOL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/txg.h> #include <sys/txg_impl.h> #include <sys/zfs_context.h> +#include <sys/zio.h> #ifdef __cplusplus extern "C" { @@ -39,6 +38,16 @@ extern "C" { struct objset; struct dsl_dir; +struct dsl_dataset; +struct dsl_pool; +struct dmu_tx; + +enum scrub_func { + SCRUB_FUNC_NONE, + SCRUB_FUNC_CLEAN, + SCRUB_FUNC_NUMFUNCS +}; + typedef struct dsl_pool { /* Immutable */ @@ -46,11 +55,31 @@ typedef struct dsl_pool { struct objset *dp_meta_objset; struct dsl_dir *dp_root_dir; struct dsl_dir *dp_mos_dir; + struct dsl_dataset *dp_origin_snap; uint64_t dp_root_dir_obj; /* No lock needed - sync context only */ blkptr_t dp_meta_rootbp; - list_t dp_synced_objsets; + list_t dp_synced_datasets; + hrtime_t dp_read_overhead; + uint64_t dp_throughput; + uint64_t dp_write_limit; + + /* Uses dp_lock */ + kmutex_t dp_lock; + uint64_t dp_space_towrite[TXG_SIZE]; + uint64_t dp_tempreserved[TXG_SIZE]; + + enum scrub_func dp_scrub_func; + uint64_t dp_scrub_queue_obj; + uint64_t dp_scrub_min_txg; + uint64_t dp_scrub_max_txg; + zbookmark_t dp_scrub_bookmark; + boolean_t dp_scrub_pausing; + boolean_t dp_scrub_isresilver; + uint64_t dp_scrub_start_time; + kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */ + boolean_t dp_scrub_restart; /* Has its own locking */ tx_state_t dp_tx; @@ -69,11 +98,26 @@ typedef struct dsl_pool { int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); void dsl_pool_close(dsl_pool_t *dp); -dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg); +dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); void dsl_pool_zil_clean(dsl_pool_t *dp); int dsl_pool_sync_context(dsl_pool_t *dp); uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); +int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx); +void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); +void dsl_pool_memory_pressure(dsl_pool_t *dp); +void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); +int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, + zio_done_func_t *done, void *private, uint32_t arc_flags); +void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); + +int dsl_pool_scrub_cancel(dsl_pool_t *dp); +int dsl_pool_scrub_clean(dsl_pool_t *dp); +void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_scrub_restart(dsl_pool_t *dp); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h index d2debff8b8c0..d66caa86cff6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h 
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,6 +37,7 @@ extern "C" { #endif struct dsl_dataset; +struct dsl_dir; /* The callback func may not call into the DMU or DSL! */ typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval); @@ -59,12 +60,16 @@ int dsl_prop_get(const char *ddname, const char *propname, int intsz, int numints, void *buf, char *setpoint); int dsl_prop_get_integer(const char *ddname, const char *propname, uint64_t *valuep, char *setpoint); -int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); +int dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local); +int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, + int intsz, int numints, void *buf, char *setpoint); +int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, + int intsz, int numints, void *buf, char *setpoint); int dsl_prop_set(const char *ddname, const char *propname, int intsz, int numints, const void *buf); -int dsl_prop_set_dd(dsl_dir_t *dd, const char *propname, - int intsz, int numints, const void *buf); +void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, + cred_t *cr, dmu_tx_t *tx); void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); void dsl_prop_nvlist_add_string(nvlist_t *nv, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h index e695b182f74b..4995bfe5acca 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -38,7 +38,7 @@ extern "C" { struct dsl_pool; typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); -typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *); +typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *); typedef struct dsl_sync_task { list_node_t dst_node; @@ -53,9 +53,11 @@ typedef struct dsl_sync_task_group { txg_node_t dstg_node; list_t dstg_tasks; struct dsl_pool *dstg_pool; + cred_t *dstg_cr; uint64_t dstg_txg; int dstg_err; int dstg_space; + boolean_t dstg_nowaiter; } dsl_sync_task_group_t; dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp); @@ -63,12 +65,16 @@ void dsl_sync_task_create(dsl_sync_task_group_t *dstg, dsl_checkfunc_t *, dsl_syncfunc_t *, void *arg1, void *arg2, int blocks_modified); int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg); +void dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg); void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); int dsl_sync_task_do(struct dsl_pool *dp, dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, void *arg1, void *arg2, int blocks_modified); +void dsl_sync_task_do_nowait(struct dsl_pool *dp, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h index 095dd3ce2464..1c9d89e8fd69 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_METASLAB_H #define _SYS_METASLAB_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/space_map.h> #include <sys/txg.h> @@ -47,8 +45,12 @@ extern void metaslab_fini(metaslab_t *msp); extern void metaslab_sync(metaslab_t *msp, uint64_t txg); extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); -extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, - int ncopies, uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid); +#define METASLAB_HINTBP_FAVOR 0x0 +#define METASLAB_HINTBP_AVOID 0x1 +#define METASLAB_GANG_HEADER 0x2 + +extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags); extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now); extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h index c64c6627f783..e84b1bf65f99 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -28,6 +28,8 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#include <sys/cdefs.h> +#include <sys/types.h> #include_next <sys/refcount.h> #include <sys/list.h> #include <sys/zfs_context.h> @@ -59,7 +61,7 @@ typedef struct refcount { int64_t rc_removed_count; } refcount_t; -/* Note: refcount_t should be initialized to zero before use. */ +/* Note: refcount_t must be initialized with refcount_create() */ void refcount_create(refcount_t *rc); void refcount_destroy(refcount_t *rc); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h new file mode 100644 index 000000000000..760fc822db56 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h @@ -0,0 +1,79 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_RR_RW_LOCK_H +#define _SYS_RR_RW_LOCK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +/* + * A reader-writer lock implementation that allows re-entrant reads, but + * still gives writers priority on "new" reads. + * + * See rrwlock.c for more details about the implementation. + * + * Fields of the rrwlock_t structure: + * - rr_lock: protects modification and reading of rrwlock_t fields + * - rr_cv: cv for waking up readers or waiting writers + * - rr_writer: thread id of the current writer + * - rr_anon_rcount: number of active anonymous readers + * - rr_linked_rcount: total number of non-anonymous active readers + * - rr_writer_wanted: a writer wants the lock + */ +typedef struct rrwlock { + kmutex_t rr_lock; + kcondvar_t rr_cv; + kthread_t *rr_writer; + refcount_t rr_anon_rcount; + refcount_t rr_linked_rcount; + boolean_t rr_writer_wanted; +} rrwlock_t; + +/* + * 'tag' is used in reference count tracking. The + * 'tag' must be the same in an rrw_enter() as in its + * corresponding rrw_exit(). 
+ */ +void rrw_init(rrwlock_t *rrl); +void rrw_destroy(rrwlock_t *rrl); +void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag); +void rrw_exit(rrwlock_t *rrl, void *tag); +boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); + +#define RRW_READ_HELD(x) rrw_held(x, RW_READER) +#define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_RR_RW_LOCK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index f0eb2e171aad..99bcb915911e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SPA_H #define _SYS_SPA_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/avl.h> #include <sys/zfs_context.h> #include <sys/nvpair.h> @@ -47,6 +45,7 @@ typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; typedef struct zilog zilog_t; typedef struct traverse_handle traverse_handle_t; +typedef struct spa_aux_vdev spa_aux_vdev_t; struct dsl_pool; /* @@ -88,6 +87,11 @@ struct dsl_pool; #define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1) /* + * Size of block to hold the configuration data (a packed nvlist) + */ +#define SPA_CONFIG_BLOCKSIZE (1 << 14) + +/* * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. * The ASIZE encoding should be at least 64 times larger (6 more bits) * to support up to 4-way RAID-Z mirror mode with worst-case gang block @@ -258,7 +262,6 @@ typedef struct blkptr { ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ ((zc1).zc_word[3] - (zc2).zc_word[3]))) - #define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) #define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ @@ -291,6 +294,8 @@ typedef struct blkptr { ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } +#define BLK_FILL_ALREADY_FREED (-1ULL) + /* * Note: the byteorder is either 0 or -1, both of which are palindromes. * This simplifies the endianness handling a bit. 
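The rrwlock interface introduced above is easiest to follow with a usage sketch. The consumer below is purely illustrative (the lock variable and functions are hypothetical, not part of this change); it assumes a kernel/zfs_context build where the conventional ZFS FTAG macro is available to serve as the reference-count tag:

	#include <sys/rrwlock.h>

	static rrwlock_t example_lock;		/* hypothetical consumer lock */

	static void
	example_init(void)
	{
		rrw_init(&example_lock);
	}

	static void
	example_read_side(void)
	{
		/*
		 * Unlike a plain krwlock_t, a thread that already holds the
		 * lock as RW_READER may enter it again even while a writer
		 * is waiting; only "new" readers defer to a waiting writer.
		 */
		rrw_enter(&example_lock, RW_READER, FTAG);
		rrw_enter(&example_lock, RW_READER, FTAG);	/* re-entrant */
		ASSERT(RRW_READ_HELD(&example_lock));
		rrw_exit(&example_lock, FTAG);	/* each exit pairs its enter's tag */
		rrw_exit(&example_lock, FTAG);
	}

	static void
	example_write_side(void)
	{
		rrw_enter(&example_lock, RW_WRITER, FTAG);
		ASSERT(RRW_WRITE_HELD(&example_lock));
		/* ... exclusive section ... */
		rrw_exit(&example_lock, FTAG);
	}

	static void
	example_fini(void)
	{
		rrw_destroy(&example_lock);
	}
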
@@ -318,23 +323,30 @@ typedef struct blkptr { extern int spa_open(const char *pool, spa_t **, void *tag); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); -extern int spa_create(const char *pool, nvlist_t *config, const char *altroot); -extern int spa_import(const char *pool, nvlist_t *config, const char *altroot); +extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, + const char *history_str, nvlist_t *zplprops); +extern int spa_check_rootconf(char *devpath, char *devid, + nvlist_t **bestconf, uint64_t *besttxg); +extern boolean_t spa_rootdev_validate(nvlist_t *nv); +extern int spa_import_rootpool(char *devpath, char *devid); +extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props); +extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); -extern int spa_export(char *pool, nvlist_t **oldconfig); +extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force); extern int spa_reset(char *pool); extern void spa_async_request(spa_t *spa, int flag); +extern void spa_async_unrequest(spa_t *spa, int flag); extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); -#define SPA_ASYNC_REOPEN 0x01 -#define SPA_ASYNC_REPLACE_DONE 0x02 -#define SPA_ASYNC_SCRUB 0x04 -#define SPA_ASYNC_RESILVER 0x08 -#define SPA_ASYNC_CONFIG_UPDATE 0x10 +#define SPA_ASYNC_CONFIG_UPDATE 0x01 +#define SPA_ASYNC_REMOVE 0x02 +#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_RESILVER_DONE 0x08 +#define SPA_ASYNC_RESILVER 0x10 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); @@ -347,19 +359,27 @@ extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); extern void spa_spare_remove(vdev_t *vd); -extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool); +extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); extern void spa_spare_activate(vdev_t *vd); +/* L2ARC state (which is global across all pools) */ +extern void spa_l2cache_add(vdev_t *vd); +extern void spa_l2cache_remove(vdev_t *vd); +extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); +extern void spa_l2cache_activate(vdev_t *vd); +extern void spa_l2cache_drop(spa_t *spa); +extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc); + /* scrubbing */ -extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force); -extern void spa_scrub_suspend(spa_t *spa); -extern void spa_scrub_resume(spa_t *spa); -extern void spa_scrub_restart(spa_t *spa, uint64_t txg); +extern int spa_scrub(spa_t *spa, pool_scrub_type_t type); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); +/* spa namespace global mutex */ +extern kmutex_t spa_namespace_lock; + /* * SPA configuration functions in spa_config.c */ @@ -367,13 +387,14 @@ extern void spa_sync_allpools(void); #define SPA_CONFIG_UPDATE_POOL 0 #define SPA_CONFIG_UPDATE_VDEVS 1 -extern void spa_config_sync(void); +extern void spa_config_sync(spa_t *, boolean_t, boolean_t); extern void spa_config_load(void); extern nvlist_t *spa_all_configs(uint64_t *); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t 
*spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); +extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot); /* * Miscellaneous SPA routines in spa_misc.c @@ -390,18 +411,34 @@ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); -/* Pool configuration lock */ -extern void spa_config_enter(spa_t *spa, krw_t rw, void *tag); -extern void spa_config_exit(spa_t *spa, void *tag); -extern boolean_t spa_config_held(spa_t *spa, krw_t rw); +#define SCL_CONFIG 0x01 +#define SCL_STATE 0x02 +#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ +#define SCL_ALLOC 0x08 +#define SCL_ZIO 0x10 +#define SCL_FREE 0x20 +#define SCL_VDEV 0x40 +#define SCL_LOCKS 7 +#define SCL_ALL ((1 << SCL_LOCKS) - 1) +#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) + +/* Pool configuration locks */ +extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); +extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); +extern void spa_config_exit(spa_t *spa, int locks, void *tag); +extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); +/* Pool vdev state change lock */ +extern void spa_vdev_state_enter(spa_t *spa); +extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); + /* Accessor functions */ extern krwlock_t *spa_traverse_rwlock(spa_t *spa); -extern int spa_traverse_wanted(spa_t *spa); +extern boolean_t spa_traverse_wanted(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); @@ -414,8 +451,6 @@ extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern int spa_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); -struct metaslab_class; -extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa); extern uint64_t spa_get_alloc(spa_t *spa); extern uint64_t spa_get_space(spa_t *spa); extern uint64_t spa_get_dspace(spa_t *spa); @@ -423,6 +458,8 @@ extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_version(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_busy(void); +extern uint8_t spa_get_failmode(spa_t *spa); +extern boolean_t spa_suspended(spa_t *spa); /* Miscellaneous support routines */ extern int spa_rename(const char *oldname, const char *newname); @@ -432,18 +469,38 @@ extern void spa_strfree(char *); extern uint64_t spa_get_random(uint64_t range); extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); -extern void spa_upgrade(spa_t *spa); +extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); -extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid); +extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, + boolean_t l2cache); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp); +extern boolean_t spa_has_slogs(spa_t *spa); +extern boolean_t spa_is_root(spa_t *spa); /* history logging */ +typedef enum history_log_type { + LOG_CMD_POOL_CREATE, + LOG_CMD_NORMAL, + LOG_INTERNAL +} history_log_type_t; + +typedef struct history_arg { + const char *ha_history_str; 
+ history_log_type_t ha_log_type; + history_internal_events_t ha_event; + char ha_zone[MAXPATHLEN]; +} history_arg_t; + +extern char *spa_his_ievent_table[]; + extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf, - uint64_t pool_create); + history_log_type_t what); +void spa_history_internal_log(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); /* error handling */ struct zbookmark; @@ -451,7 +508,8 @@ struct zio; extern void spa_log_error(spa_t *spa, struct zio *zio); extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, struct zio *zio, uint64_t stateoroffset, uint64_t length); -extern void zfs_post_ok(spa_t *spa, vdev_t *vd); +extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); extern void spa_errlog_rotate(spa_t *spa); @@ -459,15 +517,22 @@ extern void spa_errlog_drain(spa_t *spa); extern void spa_errlog_sync(spa_t *spa, uint64_t txg); extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); +/* vdev cache */ +extern void vdev_cache_stat_init(void); +extern void vdev_cache_stat_fini(void); + /* Initialization and termination */ extern void spa_init(int flags); extern void spa_fini(void); +extern void spa_boot_init(); /* properties */ -extern int spa_set_props(spa_t *spa, nvlist_t *nvp); -extern int spa_get_props(spa_t *spa, nvlist_t **nvp); -extern void spa_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); -extern boolean_t spa_has_bootfs(spa_t *spa); +extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); +extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); +extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); + +/* asynchronous event notification */ +extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h new file mode 100644 index 000000000000..b56073b97516 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_SPA_BOOT_H +#define _SYS_SPA_BOOT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/nvpair.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern char *spa_get_bootprop(char *prop); +extern void spa_free_bootprop(char *prop); +extern int spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf_p); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_BOOT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index 8c57123ad4b8..ab41ba605c6a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SPA_IMPL_H #define _SYS_SPA_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/vdev.h> #include <sys/metaslab.h> @@ -43,13 +41,6 @@ extern "C" { #endif -typedef struct spa_config_lock { - kmutex_t scl_lock; - refcount_t scl_count; - kthread_t *scl_writer; - kcondvar_t scl_cv; -} spa_config_lock_t; - typedef struct spa_error_entry { zbookmark_t se_bookmark; char *se_name; @@ -64,31 +55,61 @@ typedef struct spa_history_phys { uint64_t sh_records_lost; /* num of records overwritten */ } spa_history_phys_t; -typedef struct spa_props { - nvlist_t *spa_props_nvp; - list_node_t spa_list_node; -} spa_props_t; +struct spa_aux_vdev { + uint64_t sav_object; /* MOS object for device list */ + nvlist_t *sav_config; /* cached device config */ + vdev_t **sav_vdevs; /* devices */ + int sav_count; /* number devices */ + boolean_t sav_sync; /* sync the device list */ + nvlist_t **sav_pending; /* pending device additions */ + uint_t sav_npending; /* # pending devices */ +}; + +typedef struct spa_config_lock { + kmutex_t scl_lock; + kthread_t *scl_writer; + int scl_write_wanted; + kcondvar_t scl_cv; + refcount_t scl_count; +} spa_config_lock_t; + +typedef struct spa_config_dirent { + list_node_t scd_link; + char *scd_path; +} spa_config_dirent_t; + +typedef enum spa_log_state { + SPA_LOG_UNKNOWN = 0, /* unknown log state */ + SPA_LOG_MISSING, /* missing log(s) */ + SPA_LOG_CLEAR, /* clear the log(s) */ + SPA_LOG_GOOD, /* log(s) are good */ +} spa_log_state_t; + +enum zio_taskq_type { + ZIO_TASKQ_ISSUE = 0, + ZIO_TASKQ_INTERRUPT, + ZIO_TASKQ_TYPES +}; struct spa { /* * Fields protected by spa_namespace_lock. 
*/ - char *spa_name; /* pool name */ + char spa_name[MAXNAMELEN]; /* pool name */ avl_node_t spa_avl; /* node in spa_namespace_avl */ nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ uint64_t spa_config_txg; /* txg of last config change */ - kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */ int spa_sync_pass; /* iterate-to-convergence */ int spa_state; /* pool state */ int spa_inject_ref; /* injection references */ uint8_t spa_traverse_wanted; /* traverse lock wanted */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ - taskq_t *spa_zio_issue_taskq[ZIO_TYPES]; - taskq_t *spa_zio_intr_taskq[ZIO_TYPES]; + taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; metaslab_class_t *spa_normal_class; /* normal data class */ + metaslab_class_t *spa_log_class; /* intent log data class */ uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ uint64_t spa_freeze_txg; /* freeze pool at this txg */ @@ -96,12 +117,10 @@ struct spa { txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ uint64_t spa_load_guid; /* initial guid for spa_load */ - list_t spa_dirty_list; /* vdevs with dirty labels */ - uint64_t spa_spares_object; /* MOS object for spare list */ - nvlist_t *spa_sparelist; /* cached spare config */ - vdev_t **spa_spares; /* available hot spares */ - int spa_nspares; /* number of hot spares */ - boolean_t spa_sync_spares; /* sync the spares list */ + list_t spa_config_dirty_list; /* vdevs with dirty config */ + list_t spa_state_dirty_list; /* vdevs with dirty state */ + spa_aux_vdev_t spa_spares; /* hot spares */ + spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_syncing_txg; /* txg currently syncing */ uint64_t spa_sync_bplist_obj; /* object for deferred frees */ @@ -110,28 +129,24 @@ struct spa { uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ - kthread_t *spa_scrub_thread; /* scrub/resilver thread */ - traverse_handle_t *spa_scrub_th; /* scrub traverse handle */ - uint64_t spa_scrub_restart_txg; /* need to restart */ - uint64_t spa_scrub_mintxg; /* min txg we'll scrub */ - uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */ uint64_t spa_scrub_errors; /* scrub I/O error count */ - int spa_scrub_suspended; /* tell scrubber to suspend */ - kcondvar_t spa_scrub_cv; /* scrub thread state change */ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ - uint8_t spa_scrub_stop; /* tell scrubber to stop */ uint8_t spa_scrub_active; /* active or suspended? 
*/ uint8_t spa_scrub_type; /* type of scrub we're doing */ uint8_t spa_scrub_finished; /* indicator to rotate logs */ + uint8_t spa_scrub_started; /* started since last boot */ + uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ kmutex_t spa_async_lock; /* protect async state */ kthread_t *spa_async_thread; /* thread doing async task */ int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ uint16_t spa_async_tasks; /* async task mask */ + kmutex_t spa_async_root_lock; /* protects async root count */ + uint64_t spa_async_root_count; /* number of async root zios */ + kcondvar_t spa_async_root_cv; /* notify when count == 0 */ char *spa_root; /* alternate root directory */ - kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */ uint64_t spa_ena; /* spa-wide ereport ENA */ boolean_t spa_last_open_failed; /* true if last open failed */ kmutex_t spa_errlog_lock; /* error log lock */ @@ -144,22 +159,37 @@ struct spa { uint64_t spa_history; /* history object */ kmutex_t spa_history_lock; /* history lock */ vdev_t *spa_pending_vdev; /* pending vdev additions */ - nvlist_t **spa_pending_spares; /* pending spare additions */ - uint_t spa_pending_nspares; /* # pending spares */ kmutex_t spa_props_lock; /* property lock */ uint64_t spa_pool_props_object; /* object for properties */ uint64_t spa_bootfs; /* default boot filesystem */ + uint64_t spa_failmode; /* failure mode for the pool */ + uint64_t spa_delegation; /* delegation on/off */ + list_t spa_config_list; /* previous cache file(s) */ + zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ + kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ + kcondvar_t spa_suspend_cv; /* notification of resume */ + uint8_t spa_suspended; /* pool is suspended */ + boolean_t spa_import_faulted; /* allow faulted vdevs */ + boolean_t spa_is_root; /* pool is root */ + int spa_minref; /* num refs when first opened */ + spa_log_state_t spa_log_state; /* log state */ /* - * spa_refcnt must be the last element because it changes size based on - * compilation options. In order for the MDB module to function - * correctly, the other fields must remain in the same location. + * spa_refcnt & spa_config_lock must be the last elements + * because refcount_t changes size based on compilation options. + * In order for the MDB module to function correctly, the other + * fields must remain in the same location. */ - spa_config_lock_t spa_config_lock; /* configuration changes */ + spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ refcount_t spa_refcount; /* number of opens */ }; -extern const char *spa_config_dir; -extern kmutex_t spa_namespace_lock; +extern const char *spa_config_path; + +#define BOOTFS_COMPRESS_VALID(compress) \ + ((compress) == ZIO_COMPRESS_LZJB || \ + ((compress) == ZIO_COMPRESS_ON && \ + ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \ + (compress) == ZIO_COMPRESS_OFF) #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h index dae129c2e5a4..23bdff211b4a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. 
+ * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -76,6 +75,14 @@ extern void txg_suspend(struct dsl_pool *dp); extern void txg_resume(struct dsl_pool *dp); /* + * Delay the caller by the specified number of ticks or until + * the txg closes (whichever comes first). This is intended + * to be used to throttle writers when the system nears its + * capacity. + */ +extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks); + +/* * Wait until the given transaction group has finished syncing. * Try to make this happen as soon as possible (eg. kick off any * necessary syncs immediately). If txg==0, wait for the currently open @@ -95,7 +102,10 @@ extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg); * Returns TRUE if we are "backed up" waiting for the syncing * transaction to complete; otherwise returns FALSE. */ -extern int txg_stalled(struct dsl_pool *dp); +extern boolean_t txg_stalled(struct dsl_pool *dp); + +/* returns TRUE if someone is waiting for the next txg to sync */ +extern boolean_t txg_sync_waiting(struct dsl_pool *dp); /* * Per-txg object lists. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h index 45a138afaac3..a58be84be5af 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -59,7 +58,7 @@ typedef struct tx_state { kcondvar_t tx_sync_done_cv; kcondvar_t tx_quiesce_more_cv; kcondvar_t tx_quiesce_done_cv; - kcondvar_t tx_timeout_exit_cv; + kcondvar_t tx_timeout_cv; kcondvar_t tx_exit_cv; /* wait for all threads to exit */ uint8_t tx_threads; /* number of threads */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h index ab0f2dcf8c1b..55a0dd5aec0d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -49,7 +49,7 @@ extern "C" { struct uberblock { uint64_t ub_magic; /* UBERBLOCK_MAGIC */ - uint64_t ub_version; /* ZFS_VERSION */ + uint64_t ub_version; /* SPA_VERSION */ uint64_t ub_txg; /* txg of last sync */ uint64_t ub_guid_sum; /* sum of all vdev guids */ uint64_t ub_timestamp; /* UTC time of last sync */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h index c8c177e3ca6c..2ef3093edf1c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,8 +38,12 @@ extern "C" { #define UNIQUE_BITS 56 void unique_init(void); +void unique_fini(void); -/* Return a new unique value. */ +/* + * Return a new unique value (which will not be uniquified against until + * it is unique_insert()-ed). + */ uint64_t unique_create(void); /* Return a unique value, which equals the one passed in if possible. */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h index 31208116256d..013389501e51 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_VDEV_H #define _SYS_VDEV_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/zio.h> #include <sys/dmu.h> @@ -40,35 +38,31 @@ extern "C" { extern boolean_t zfs_nocacheflush; -/* - * Fault injection modes. 
- */ -#define VDEV_FAULT_NONE 0 -#define VDEV_FAULT_RANDOM 1 -#define VDEV_FAULT_COUNT 2 - extern int vdev_open(vdev_t *); extern int vdev_validate(vdev_t *); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); extern void vdev_init(vdev_t *, uint64_t txg); extern void vdev_reopen(vdev_t *); -extern int vdev_validate_spare(vdev_t *); +extern int vdev_validate_aux(vdev_t *vd); +extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); +extern boolean_t vdev_is_bootable(vdev_t *vd); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size); extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done); - -extern const char *vdev_description(vdev_t *vd); +extern boolean_t vdev_resilver_needed(vdev_t *vd, + uint64_t *minp, uint64_t *maxp); extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); -extern void vdev_stat_update(zio_t *zio); +extern void vdev_clear_stats(vdev_t *vd); +extern void vdev_stat_update(zio_t *zio, uint64_t psize); extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete); extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec); @@ -77,24 +71,27 @@ extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux); extern void vdev_space_update(vdev_t *vd, int64_t space_delta, - int64_t alloc_delta); + int64_t alloc_delta, boolean_t update_root); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); -extern void vdev_io_start(zio_t *zio); -extern void vdev_io_done(zio_t *zio); - -extern int vdev_online(spa_t *spa, uint64_t guid); -extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp); +extern int vdev_fault(spa_t *spa, uint64_t guid); +extern int vdev_degrade(spa_t *spa, uint64_t guid); +extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, + vdev_state_t *); +extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); extern void vdev_clear(spa_t *spa, vdev_t *vd); -extern int vdev_error_inject(vdev_t *vd, zio_t *zio); -extern int vdev_is_dead(vdev_t *vd); +extern boolean_t vdev_is_dead(vdev_t *vd); +extern boolean_t vdev_readable(vdev_t *vd); +extern boolean_t vdev_writeable(vdev_t *vd); +extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); extern void vdev_cache_init(vdev_t *vd); extern void vdev_cache_fini(vdev_t *vd); extern int vdev_cache_read(zio_t *zio); extern void vdev_cache_write(zio_t *zio); +extern void vdev_cache_purge(vdev_t *vd); extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); @@ -103,16 +100,20 @@ extern void vdev_queue_io_done(zio_t *zio); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); -extern int vdev_config_sync(vdev_t *vd, uint64_t txg); +extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); + +extern void vdev_state_dirty(vdev_t *vd); +extern void vdev_state_clean(vdev_t *vd); extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, - boolean_t getstats, boolean_t isspare); + boolean_t getstats, boolean_t isspare, boolean_t isl2cache); /* * Label routines */ struct uberblock; extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); +extern int 
vdev_label_number(uint64_t psize, uint64_t offset); extern nvlist_t *vdev_label_read_config(vdev_t *vd); extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub); @@ -120,7 +121,8 @@ typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ VDEV_LABEL_REPLACE, /* replace an existing device */ VDEV_LABEL_SPARE, /* add a new hot spare */ - VDEV_LABEL_REMOVE /* remove an existing device */ + VDEV_LABEL_REMOVE, /* remove an existing device */ + VDEV_LABEL_L2CACHE /* add an L2ARC cache device */ } vdev_labeltype_t; extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h index 95536a77db9a..b748571ea0c3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,6 +30,8 @@ #include <sys/vdev.h> #ifdef _KERNEL +#include <sys/buf.h> +#include <sys/ddi.h> #include <sys/sunldi.h> #include <sys/sunddi.h> #endif @@ -45,6 +46,9 @@ typedef struct vdev_disk { ldi_handle_t vd_lh; } vdev_disk_t; +#ifdef _KERNEL +extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); +#endif #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index aba756713f9c..7e24edea7f38 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_VDEV_IMPL_H #define _SYS_VDEV_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/avl.h> #include <sys/dmu.h> #include <sys/metaslab.h> @@ -61,7 +59,7 @@ typedef struct vdev_cache_entry vdev_cache_entry_t; typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); -typedef void vdev_io_start_func_t(zio_t *zio); +typedef int vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); @@ -140,9 +138,12 @@ struct vdev { txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ - uint8_t vdev_reopen_wanted; /* async reopen wanted? */ - list_node_t vdev_dirty_node; /* config dirty list */ + boolean_t vdev_remove_wanted; /* async remove wanted? 
*/ + boolean_t vdev_probe_wanted; /* async probe wanted? */ + list_node_t vdev_config_dirty_node; /* config dirty list */ + list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ + uint64_t vdev_islog; /* is an intent log device */ /* * Leaf vdev state. @@ -151,22 +152,30 @@ struct vdev { space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ - uint64_t vdev_offline; /* device taken offline? */ + uint64_t vdev_offline; /* persistent offline state */ + uint64_t vdev_faulted; /* persistent faulted state */ + uint64_t vdev_degraded; /* persistent degraded state */ + uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ - uint64_t vdev_fault_arg; /* fault injection paramater */ - int vdev_fault_mask; /* zio types to fault */ - uint8_t vdev_fault_mode; /* fault injection mode */ - uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */ + char *vdev_physpath; /* vdev device path (if any) */ + uint64_t vdev_not_present; /* not present during import */ + uint64_t vdev_unspare; /* unspare when resilvering done */ + hrtime_t vdev_last_try; /* last reopen time */ + boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + boolean_t vdev_checkremove; /* temporary online test */ + boolean_t vdev_forcefault; /* force online fault */ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ uint8_t vdev_detached; /* device detached? */ - uint64_t vdev_isspare; /* was a hot spare */ + uint8_t vdev_cant_read; /* vdev is failing all reads */ + uint8_t vdev_cant_write; /* vdev is failing all writes */ + uint64_t vdev_isspare; /* was a hot spare */ + uint64_t vdev_isl2cache; /* was a l2cache device */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ vdev_cache_t vdev_cache; /* physical block cache */ - uint64_t vdev_not_present; /* not present during import */ - hrtime_t vdev_last_try; /* last reopen time */ - boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */ + zio_t *vdev_probe_zio; /* root of current probe */ /* * For DTrace to work in userland (libzpool) context, these fields must @@ -177,6 +186,7 @@ struct vdev { */ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ kmutex_t vdev_stat_lock; /* vdev_stat */ + kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ }; #define VDEV_SKIP_SIZE (8 << 10) @@ -239,6 +249,7 @@ typedef struct vdev_label { #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 #define VDEV_ALLOC_SPARE 2 +#define VDEV_ALLOC_L2CACHE 3 /* * Allocate or free a vdev @@ -275,8 +286,8 @@ extern vdev_ops_t vdev_raidz_ops; extern vdev_ops_t vdev_geom_ops; #else extern vdev_ops_t vdev_disk_ops; -extern vdev_ops_t vdev_file_ops; #endif +extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_spare_ops; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h index f89d9385ea38..f88cc068bd57 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,7 +31,7 @@ /* * ZAP - ZFS Attribute Processor * - * The ZAP is a module which sits on top of the DMU (Data Managemnt + * The ZAP is a module which sits on top of the DMU (Data Management * Unit) and implements a higher-level storage primitive using DMU * objects. Its primary consumer is the ZPL (ZFS Posix Layer). * @@ -91,10 +91,38 @@ extern "C" { #define ZAP_MAXVALUELEN 1024 /* + * The matchtype specifies which entry will be accessed. + * MT_EXACT: only find an exact match (non-normalized) + * MT_FIRST: find the "first" normalized (case and Unicode + * form) match; the designated "first" match will not change as long + * as the set of entries with this normalization doesn't change + * MT_BEST: if there is an exact match, find that, otherwise find the + * first normalized match + */ +typedef enum matchtype +{ + MT_EXACT, + MT_BEST, + MT_FIRST +} matchtype_t; + +/* * Create a new zapobj with no attributes and return its object number. + * MT_EXACT will cause the zap object to only support MT_EXACT lookups, + * otherwise any matchtype can be used for lookups. + * + * normflags specifies what normalization will be done. values are: + * 0: no normalization (legacy on-disk format, supports MT_EXACT matching + * only) + * U8_TEXTPREP_TOLOWER: case normalization will be performed. + * MT_FIRST/MT_BEST matching will find entries that match without + * regard to case (eg. looking for "foo" can find an entry "Foo"). + * Eventually, other flags will permit unicode normalization as well. */ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); /* * Create a new zapobj with no attributes from the given (unallocated) @@ -102,6 +130,9 @@ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, */ int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +int zap_create_claim_norm(objset_t *ds, uint64_t obj, + int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); /* * The zapobj passed in must be a valid ZAP object for all of the @@ -140,9 +171,20 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); * If the attribute is longer than the buffer, as many integers as will * fit will be transferred to 'buf'. If the entire attribute was not * transferred, the call will return EOVERFLOW. + * + * If rn_len is nonzero, realname will be set to the name of the found + * entry (which may be different from the requested name if matchtype is + * not MT_EXACT). + * + * If normalization_conflictp is not NULL, it will be set if there is + * another name with the same case/unicode normalized form. */ int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *normalization_conflictp); /* * Create an attribute with the given name and value. @@ -182,6 +224,8 @@ int zap_length(objset_t *ds, uint64_t zapobj, const char *name, * return ENOENT. 
*/ int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); +int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, + matchtype_t mt, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap @@ -191,11 +235,28 @@ int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); /* - * Returns (in name) the name of the entry whose value + * Returns (in name) the name of the entry whose (value & mask) * (za_first_integer) is value, or ENOENT if not found. The string - * pointed to by name must be at least 256 bytes long. + * pointed to by name must be at least 256 bytes long. If mask==0, the + * match must be exact (ie, same as mask=-1ULL). */ -int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name); +int zap_value_search(objset_t *os, uint64_t zapobj, + uint64_t value, uint64_t mask, char *name); + +/* + * Transfer all the entries from fromobj into intoobj. Only works on + * int_size=8 num_integers=1 values. Fails if there are any duplicated + * entries. + */ +int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); + +/* + * Manipulate entries where the name + value are the "same" (the name is + * a stringified version of the value). + */ +int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); +int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); +int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); struct zap; struct zap_leaf; @@ -211,6 +272,11 @@ typedef struct zap_cursor { typedef struct { int za_integer_length; + /* + * za_normalization_conflict will be set if there are additional + * entries with this normalized form (eg, "foo" and "Foo"). + */ + boolean_t za_normalization_conflict; uint64_t za_num_integers; uint64_t za_first_integer; /* no sign extension for <8byte ints */ char za_name[MAXNAMELEN]; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h index 4e43f4ae49a1..0dc02ab6b0ac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -59,7 +59,8 @@ typedef struct mzap_ent_phys { typedef struct mzap_phys { uint64_t mz_block_type; /* ZBT_MICRO */ uint64_t mz_salt; - uint64_t mz_pad[6]; + uint64_t mz_normflags; + uint64_t mz_pad[5]; mzap_ent_phys_t mz_chunk[1]; /* actually variable size depending on block size */ } mzap_phys_t; @@ -127,6 +128,7 @@ typedef struct zap_phys { uint64_t zap_num_leafs; /* number of leafs */ uint64_t zap_num_entries; /* number of entries */ uint64_t zap_salt; /* salt to stir into hash function */ + uint64_t zap_normflags; /* flags for u8_textprep_str() */ /* * This structure is followed by padding, and then the embedded * pointer table. 
The embedded pointer table takes up second @@ -142,7 +144,8 @@ typedef struct zap { uint64_t zap_object; struct dmu_buf *zap_dbuf; krwlock_t zap_rwlock; - int zap_ismicro; + boolean_t zap_ismicro; + int zap_normflags; uint64_t zap_salt; union { struct { @@ -165,34 +168,45 @@ typedef struct zap { } zap_u; } zap_t; +typedef struct zap_name { + zap_t *zn_zap; + const char *zn_name_orij; + uint64_t zn_hash; + matchtype_t zn_matchtype; + const char *zn_name_norm; + char zn_normbuf[ZAP_MAXNAMELEN]; +} zap_name_t; + #define zap_f zap_u.zap_fat #define zap_m zap_u.zap_micro -uint64_t zap_hash(zap_t *zap, const char *name); +boolean_t zap_match(zap_name_t *zn, const char *matchname); int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, int fatreader, zap_t **zapp); + krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); void zap_unlockdir(zap_t *zap); void zap_evict(dmu_buf_t *db, void *vmzap); +zap_name_t *zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt); +void zap_name_free(zap_name_t *zn); #define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) void fzap_byteswap(void *buf, size_t size); int fzap_count(zap_t *zap, uint64_t *count); -int fzap_lookup(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf); -int fzap_add(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, +int fzap_lookup(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, void *buf, + char *realname, int rn_len, boolean_t *normalization_conflictp); +int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); -int fzap_update(zap_t *zap, const char *name, +int fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); -int fzap_length(zap_t *zap, const char *name, +int fzap_length(zap_name_t *zn, uint64_t *integer_size, uint64_t *num_integers); -int fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx); +int fzap_remove(zap_name_t *zn, dmu_tx_t *tx); int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); void fzap_get_stats(zap_t *zap, zap_stats_t *zs); void zap_put_leaf(struct zap_leaf *l); -int fzap_add_cd(zap_t *zap, const char *name, +int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, dmu_tx_t *tx); void fzap_upgrade(zap_t *zap, dmu_tx_t *tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h index 147fb7212454..14144e059e54 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -92,6 +92,8 @@ typedef enum zap_chunk_type { ZAP_CHUNK_TYPE_MAX = 250 } zap_chunk_type_t; +#define ZLF_ENTRIES_CDSORTED (1<<0) + /* * TAKE NOTE: * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. 
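To make the new matchtype plumbing concrete, here is an editorial sketch (not part of the diff) of how a consumer would combine zap_create_norm() and zap_lookup_norm() from zap.h above. It assumes an open objset_t *os and an assigned dmu_tx_t *tx, and uses zap_add(), whose prototype is unchanged by this commit:

	/* Create a ZAP whose names are case-normalized, then add one entry. */
	uint64_t obj = zap_create_norm(os, U8_TEXTPREP_TOLOWER,
	    DMU_OT_DIRECTORY_CONTENTS, DMU_OT_NONE, 0, tx);
	uint64_t val = 42, out;
	char realname[MAXNAMELEN];
	boolean_t conflict;

	VERIFY(zap_add(os, obj, "Foo", 8, 1, &val, tx) == 0);

	/*
	 * MT_FIRST matches on the normalized form, so looking up "foo"
	 * finds "Foo"; realname receives the stored spelling, and
	 * conflict is set only if another entry (say "FOO") shares the
	 * same normalized form.
	 */
	VERIFY(zap_lookup_norm(os, obj, "foo", 8, 1, &out, MT_FIRST,
	    realname, sizeof (realname), &conflict) == 0);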
@@ -109,7 +111,8 @@ typedef struct zap_leaf_phys { /* above is accessible to zap, below is zap_leaf private */ uint16_t lh_freelist; /* chunk head of free list */ - uint8_t lh_pad2[12]; + uint8_t lh_flags; /* ZLF_* flags */ + uint8_t lh_pad2[11]; } l_hdr; /* 2 24-byte chunks */ /* @@ -148,7 +151,7 @@ typedef union zap_leaf_chunk { } zap_leaf_chunk_t; typedef struct zap_leaf { - krwlock_t l_rwlock; /* only used on head of chain */ + krwlock_t l_rwlock; uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */ int l_bs; /* block size shift */ dmu_buf_t *l_dbuf; @@ -174,7 +177,7 @@ typedef struct zap_entry_handle { * value must equal zap_hash(name). */ extern int zap_leaf_lookup(zap_leaf_t *l, - const char *name, uint64_t h, zap_entry_handle_t *zeh); + zap_name_t *zn, zap_entry_handle_t *zeh); /* * Return a handle to the entry with this hash+cd, or the entry with the @@ -219,12 +222,19 @@ extern int zap_entry_create(zap_leaf_t *l, zap_entry_handle_t *zeh); /* + * Return true if there are additional entries with the same normalized + * form. + */ +extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, + zap_name_t *zn, const char *name, zap_t *zap); + +/* * Other stuff. */ -extern void zap_leaf_init(zap_leaf_t *l); +extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort); extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); -extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl); +extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort); extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs); #ifdef __cplusplus diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h index 3250b760fb07..fe953184db44 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,6 +33,7 @@ #endif #include <sys/acl.h> #include <sys/dmu.h> +#include <sys/zfs_fuid.h> #ifdef __cplusplus extern "C" { @@ -40,33 +41,131 @@ extern "C" { struct znode_phys; -#define ACCESS_UNDETERMINED -1 - #define ACE_SLOT_CNT 6 +#define ZFS_ACL_VERSION_INITIAL 0ULL +#define ZFS_ACL_VERSION_FUID 1ULL +#define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID + +/* + * ZFS ACLs are stored in various forms. + * Files created with ACL version ZFS_ACL_VERSION_INITIAL + * will all be created with fixed length ACEs of type + * zfs_oldace_t. + * + * Files with ACL version ZFS_ACL_VERSION_FUID will be created + * with various sized ACEs. The abstraction entries will utilize + * zfs_ace_hdr_t, normal user/group entries will use zfs_ace_t + * and some specialized CIFS ACEs will use zfs_object_ace_t. + */ + +/* + * All ACEs have a common hdr. For + * owner@, group@, and everyone@ this is all + * that's needed. + */ +typedef struct zfs_ace_hdr { + uint16_t z_type; + uint16_t z_flags; + uint32_t z_access_mask; +} zfs_ace_hdr_t; + +typedef zfs_ace_hdr_t zfs_ace_abstract_t; + +/* + * Standard ACE + */ +typedef struct zfs_ace { + zfs_ace_hdr_t z_hdr; + uint64_t z_fuid; +} zfs_ace_t; + +/* + * The following type only applies to ACE_ACCESS_ALLOWED|DENIED_OBJECT_ACE_TYPE + * and will only be set/retrieved in a CIFS context.
+ */ -typedef struct zfs_znode_acl { +typedef struct zfs_object_ace { + zfs_ace_t z_ace; + uint8_t z_object_type[16]; /* object type */ + uint8_t z_inherit_type[16]; /* inherited object type */ +} zfs_object_ace_t; + +typedef struct zfs_oldace { + uint32_t z_fuid; /* "who" */ + uint32_t z_access_mask; /* access mask */ + uint16_t z_flags; /* flags, i.e inheritance */ + uint16_t z_type; /* type of entry allow/deny */ +} zfs_oldace_t; + +typedef struct zfs_acl_phys_v0 { + uint64_t z_acl_extern_obj; /* ext acl pieces */ + uint32_t z_acl_count; /* Number of ACEs */ + uint16_t z_acl_version; /* acl version */ + uint16_t z_acl_pad; /* pad */ + zfs_oldace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ +} zfs_acl_phys_v0_t; + +#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT) + +typedef struct zfs_acl_phys { uint64_t z_acl_extern_obj; /* ext acl pieces */ - uint32_t z_acl_count; /* Number of ACEs */ + uint32_t z_acl_size; /* Number of bytes in ACL */ uint16_t z_acl_version; /* acl version */ - uint16_t z_acl_pad; /* pad */ - ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ -} zfs_znode_acl_t; - -#define ACL_DATA_ALLOCED 0x1 + uint16_t z_acl_count; /* ace count */ + uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ +} zfs_acl_phys_t; + + + +typedef struct acl_ops { + uint32_t (*ace_mask_get) (void *acep); /* get access mask */ + void (*ace_mask_set) (void *acep, + uint32_t mask); /* set access mask */ + uint16_t (*ace_flags_get) (void *acep); /* get flags */ + void (*ace_flags_set) (void *acep, + uint16_t flags); /* set flags */ + uint16_t (*ace_type_get)(void *acep); /* get type */ + void (*ace_type_set)(void *acep, + uint16_t type); /* set type */ + uint64_t (*ace_who_get)(void *acep); /* get who/fuid */ + void (*ace_who_set)(void *acep, + uint64_t who); /* set who/fuid */ + size_t (*ace_size)(void *acep); /* how big is this ace */ + size_t (*ace_abstract_size)(void); /* sizeof abstract entry */ + int (*ace_mask_off)(void); /* off of access mask in ace */ + int (*ace_data)(void *acep, void **datap); + /* ptr to data if any */ +} acl_ops_t; /* - * Max ACL size is prepended deny for all entries + the - * canonical six tacked on * the end. + * A zfs_acl_t structure is composed of a list of zfs_acl_node_t's. + * Each node will have one or more ACEs associated with it. You will + * only have multiple nodes during a chmod operation. Normally only + * one node is required. */ -#define MAX_ACL_SIZE (MAX_ACL_ENTRIES * 2 + 6) +typedef struct zfs_acl_node { + list_node_t z_next; /* Next chunk of ACEs */ + void *z_acldata; /* pointer into actual ACE(s) */ + void *z_allocdata; /* pointer to kmem allocated memory */ + size_t z_allocsize; /* Size of blob in bytes */ + size_t z_size; /* length of ACL data */ + int z_ace_count; /* number of ACEs in this acl node */ + int z_ace_idx; /* ace iterator positioned on */ +} zfs_acl_node_t; typedef struct zfs_acl { - int z_slots; /* number of allocated slots for ACEs */ - int z_acl_count; - uint_t z_state; - ace_t *z_acl; + int z_acl_count; /* Number of ACEs */ + size_t z_acl_bytes; /* Number of bytes in ACL */ + uint_t z_version; /* version of ACL */ + void *z_next_ace; /* pointer to next ACE */ + int z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */ + zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ + list_t z_acl; /* chunks of ACE data */ + acl_ops_t z_ops; /* ACL operations */ + boolean_t z_has_fuids; /* FUIDs present in ACL? 
*/ } zfs_acl_t; +#define ACL_DATA_ALLOCED 0x1 #define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) /* @@ -80,31 +179,34 @@ typedef struct zfs_acl { #define ZFS_ACL_NOALLOW 1 #define ZFS_ACL_GROUPMASK 2 #define ZFS_ACL_PASSTHROUGH 3 -#define ZFS_ACL_SECURE 4 +#define ZFS_ACL_RESTRICTED 4 struct znode; +struct zfsvfs; +struct zfs_fuid_info; #ifdef _KERNEL void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *, - dmu_tx_t *, cred_t *); -#ifdef TODO -int zfs_getacl(struct znode *, vsecattr_t *, cred_t *); -#endif -int zfs_mode_update(struct znode *, uint64_t, dmu_tx_t *); + dmu_tx_t *, cred_t *, zfs_acl_t *, zfs_fuid_info_t **); #ifdef TODO -int zfs_setacl(struct znode *, vsecattr_t *, cred_t *); +int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); +int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); #endif void zfs_acl_rele(void *); -void zfs_ace_byteswap(ace_t *, int); -extern int zfs_zaccess(struct znode *, int, cred_t *); -extern int zfs_zaccess_rwx(struct znode *, mode_t, cred_t *); +void zfs_oldace_byteswap(ace_t *, int); +void zfs_ace_byteswap(void *, size_t, boolean_t); +extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); +extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); +extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); extern int zfs_acl_access(struct znode *, int, cred_t *); -int zfs_acl_chmod_setattr(struct znode *, uint64_t, dmu_tx_t *); +int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); int zfs_zaccess_rename(struct znode *, struct znode *, struct znode *, struct znode *, cred_t *cr); -int zfs_zaccess_v4_perm(struct znode *, int, cred_t *); void zfs_acl_free(zfs_acl_t *); +int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, zfs_acl_t **); +int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, + struct zfs_fuid_info **, dmu_tx_t *); #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h index 4deeb3c9bf75..76fdc0dce7a5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -45,6 +44,7 @@ extern "C" { #include <sys/cmn_err.h> #include <sys/kmem.h> #include <sys/taskq.h> +#include <sys/taskqueue.h> #include <sys/systm.h> #include <sys/conf.h> #include <sys/mutex.h> @@ -73,11 +73,19 @@ extern "C" { #include <sys/ktr.h> #include <sys/stack.h> #include <sys/lockf.h> +#include <sys/pathname.h> #include <sys/policy.h> +#include <sys/refstr.h> #include <sys/zone.h> #include <sys/eventhandler.h> +#include <sys/extattr.h> #include <sys/misc.h> +#include <sys/sig.h> +#include <sys/osd.h> #include <sys/zfs_debug.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/u8_textprep.h> +#include <sys/fm/util.h> #include <machine/stdarg.h> @@ -99,6 +107,14 @@ extern "C" { #define CPU_SEQID (curcpu) +#define tsd_create(keyp, destructor) do { \ + *(keyp) = osd_thread_register((destructor)); \ + KASSERT(*(keyp) > 0, ("cannot register OSD")); \ +} while (0) +#define tsd_destroy(keyp) osd_thread_deregister(*(keyp)) +#define tsd_get(key) osd_thread_get(curthread, (key)) +#define tsd_set(key, value) osd_thread_set(curthread, (key), (value)) + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h index a676533ac4a2..905e8dd2c0e3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
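A quick editorial sketch (not part of the diff) of the tsd_*() shims that zfs_context.h above maps onto FreeBSD's per-thread OSD; example_key is hypothetical, while zfs_fsyncer_key (declared later in zfs_vfsops.h) is the in-tree consumer of these macros:

	static uint_t example_key;	/* hypothetical TSD key */
	void *val;

	tsd_create(&example_key, NULL);		/* registers an OSD slot; NULL destructor */
	tsd_set(example_key, (void *)curthread);	/* value is private to curthread */
	val = tsd_get(example_key);		/* NULL on threads that never set it */
	tsd_destroy(&example_key);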
*/ @@ -57,7 +56,8 @@ int zfsctl_destroy_snapshot(const char *snapname, int force); int zfsctl_umount_snapshots(vfs_t *, int, cred_t *); int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, - int flags, vnode_t *rdir, cred_t *cr); + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp); int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h index f60d614953f3..ebb66e8ae4e9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h @@ -28,6 +28,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#include <sys/pathname.h> #include <sys/dmu.h> #include <sys/zfs_znode.h> @@ -41,6 +42,8 @@ extern "C" { #define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ #define ZXATTR 0x0008 /* we want the xattr dir */ #define ZRENAMING 0x0010 /* znode is being renamed */ +#define ZCILOOK 0x0020 /* case-insensitive lookup requested */ +#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ /* mknode flags */ #define IS_ROOT_NODE 0x01 /* create a root node */ @@ -48,15 +51,17 @@ extern "C" { #define IS_REPLAY 0x04 /* we are replaying intent log */ extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, - int); + int, int *, pathname_t *); extern void zfs_dirent_unlock(zfs_dirlock_t *); extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, boolean_t *); -extern int zfs_dirlook(znode_t *, char *, vnode_t **); -extern void zfs_mknode(znode_t *, vattr_t *, uint64_t *, - dmu_tx_t *, cred_t *, uint_t, znode_t **, int); +extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *, + pathname_t *); +extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, + uint_t, znode_t **, int, zfs_acl_t *, zfs_fuid_info_t **); extern void zfs_rmnode(znode_t *); +extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); extern boolean_t zfs_dirempty(znode_t *); extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h new file mode 100644 index 000000000000..8d73b41938df --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_FS_ZFS_FUID_H +#define _SYS_FS_ZFS_FUID_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#ifdef _KERNEL +#include <sys/kidmap.h> +#include <sys/dmu.h> +#include <sys/zfs_vfsops.h> +#endif +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_OWNER, + ZFS_GROUP, + ZFS_ACE_USER, + ZFS_ACE_GROUP +} zfs_fuid_type_t; + +/* + * Estimate space needed for one more fuid table entry. + * for now assume its current size + 1K + */ +#define FUID_SIZE_ESTIMATE(z) (z->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) + +#define FUID_INDEX(x) (x >> 32) +#define FUID_RID(x) (x & 0xffffffff) +#define FUID_ENCODE(idx, rid) ((idx << 32) | rid) +/* + * FUIDs cause problems for the intent log + * we need to replay the creation of the FUID, + * but we can't count on the idmapper to be around + * and during replay the FUID index may be different than + * before. Also, if an ACL has 100 ACEs and 12 different + * domains we don't want to log 100 domain strings, but rather + * just the unique 12. + */ + +/* + * The FUIDs in the log will index into + * domain string table and the bottom half will be the rid. + * Used for mapping ephemeral uid/gid during ACL setting to FUIDs + */ +typedef struct zfs_fuid { + list_node_t z_next; + uint64_t z_id; /* uid/gid being converted to fuid */ + uint64_t z_domidx; /* index in AVL domain table */ + uint64_t z_logfuid; /* index for domain in log */ +} zfs_fuid_t; + +/* list of unique domains */ +typedef struct zfs_fuid_domain { + list_node_t z_next; + uint64_t z_domidx; /* AVL tree idx */ + const char *z_domain; /* domain string */ +} zfs_fuid_domain_t; + +/* + * FUID information necessary for logging create, setattr, and setacl. + */ +typedef struct zfs_fuid_info { + list_t z_fuids; + list_t z_domains; + uint64_t z_fuid_owner; + uint64_t z_fuid_group; + char **z_domain_table; /* Used during replay */ + uint32_t z_fuid_cnt; /* How many fuids in z_fuids */ + uint32_t z_domain_cnt; /* How many domains */ + size_t z_domain_str_sz; /* len of domain strings z_domain list */ +} zfs_fuid_info_t; + +#ifdef _KERNEL +struct znode; +extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); +extern void zfs_fuid_destroy(zfsvfs_t *); +extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t, + dmu_tx_t *, cred_t *, zfs_fuid_info_t **); +extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t, + dmu_tx_t *, zfs_fuid_info_t **); +extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, uid_t *uid, + uid_t *gid); +extern zfs_fuid_info_t *zfs_fuid_info_alloc(void); +extern void zfs_fuid_info_free(); +extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *); +#endif + +char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); +uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *); +void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_FUID_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h index 61a0a9ebdc2e..05a21c846ee8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
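An editorial aside (not part of the diff): the FUID_ENCODE()/FUID_INDEX()/FUID_RID() macros in zfs_fuid.h above pack a domain-table index into the upper 32 bits of a 64-bit FUID and a Windows-style RID into the lower 32. A tiny round-trip sketch with assumed values:

	uint64_t idx = 3;			/* index into the on-disk domain table */
	uint64_t rid = 1106;			/* relative ID within that domain */
	uint64_t fuid = FUID_ENCODE(idx, rid);	/* yields 0x0000000300000452 */

	ASSERT(FUID_INDEX(fuid) == idx);	/* top 32 bits come back out */
	ASSERT(FUID_RID(fuid) == rid);		/* bottom 32 bits come back out */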
*/ @@ -31,6 +31,11 @@ #include <sys/cred.h> #include <sys/dmu.h> #include <sys/zio.h> +#include <sys/dsl_deleg.h> + +#ifdef _KERNEL +#include <sys/nvpair.h> +#endif /* _KERNEL */ #ifdef __cplusplus extern "C" { @@ -42,9 +47,13 @@ extern "C" { #define ZFS_SNAPDIR_HIDDEN 0 #define ZFS_SNAPDIR_VISIBLE 1 -#define DMU_BACKUP_VERSION (1ULL) +#define DMU_BACKUP_STREAM_VERSION (1ULL) +#define DMU_BACKUP_HEADER_VERSION (2ULL) #define DMU_BACKUP_MAGIC 0x2F5bacbacULL +#define DRR_FLAG_CLONE (1<<0) +#define DRR_FLAG_CI_DATA (1<<1) + /* * zfs ioctl command structure */ @@ -53,14 +62,14 @@ typedef struct dmu_replay_record { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, DRR_WRITE, DRR_FREE, DRR_END, } drr_type; - uint32_t drr_pad; + uint32_t drr_payloadlen; union { struct drr_begin { uint64_t drr_magic; uint64_t drr_version; uint64_t drr_creation_time; dmu_objset_type_t drr_type; - uint32_t drr_pad; + uint32_t drr_flags; uint64_t drr_toguid; uint64_t drr_fromguid; char drr_toname[MAXNAMELEN]; @@ -109,48 +118,71 @@ typedef struct zinject_record { uint32_t zi_error; uint64_t zi_type; uint32_t zi_freq; + uint32_t zi_pad; /* pad out to 64 bit alignment */ } zinject_record_t; #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 +typedef struct zfs_share { + uint64_t z_exportdata; + uint64_t z_sharedata; + uint64_t z_sharetype; /* 0 = share, 1 = unshare */ + uint64_t z_sharemax; /* max length of share string */ +} zfs_share_t; + +/* + * ZFS file systems may behave the usual, POSIX-compliant way, where + * name lookups are case-sensitive. They may also be set up so that + * all the name lookups are case-insensitive, or so that only some + * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. + */ +typedef enum zfs_case { + ZFS_CASE_SENSITIVE, + ZFS_CASE_INSENSITIVE, + ZFS_CASE_MIXED +} zfs_case_t; + typedef struct zfs_cmd { char zc_name[MAXPATHLEN]; - char zc_value[MAXPATHLEN * 2]; + char zc_value[MAXPATHLEN]; + char zc_string[MAXNAMELEN]; uint64_t zc_guid; - uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_nvlist_src; /* really (char *) */ uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst; /* really (char *) */ uint64_t zc_nvlist_dst_size; uint64_t zc_cookie; - uint64_t zc_cred; - uint64_t zc_dev; uint64_t zc_objset_type; - uint64_t zc_history; /* really (char *) */ - uint64_t zc_history_len; + uint64_t zc_perm_action; + uint64_t zc_history; /* really (char *) */ + uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; + zfs_share_t zc_share; uint64_t zc_jailid; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; zinject_record_t zc_inject_record; } zfs_cmd_t; -#ifdef _KERNEL -typedef struct zfs_create_data { - cred_t *zc_cred; - dev_t zc_dev; - nvlist_t *zc_props; -} zfs_create_data_t; -#endif - #define ZVOL_MAX_MINOR (1 << 16) #define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) #ifdef _KERNEL -extern int zfs_secpolicy_write(const char *dataset, cred_t *cr); +typedef struct zfs_creat { + nvlist_t *zct_zplprops; + nvlist_t *zct_props; +} zfs_creat_t; + +extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); +extern int zfs_secpolicy_rename_perms(const char *from, + const char *to, cred_t *cr); +extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int zfs_busy(void); extern int zfs_unmount_snap(char *, void *); diff --git 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h index aa82cc178091..8d53c02b77aa 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,6 +31,8 @@ #include <sys/list.h> #include <sys/vfs.h> #include <sys/zil.h> +#include <sys/rrwlock.h> +#include <sys/zfs_ioctl.h> #ifdef __cplusplus extern "C" { @@ -46,35 +48,50 @@ struct zfsvfs { uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */ + uint64_t z_fuid_obj; /* fuid table object number */ + uint64_t z_fuid_size; /* fuid table size */ + avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ + avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ + krwlock_t z_fuid_lock; /* fuid lock */ + boolean_t z_fuid_loaded; /* fuid tables are loaded */ + struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_mode; /* acl chmod/mode behavior */ uint_t z_acl_inherit; /* acl inheritance behavior */ + zfs_case_t z_case; /* case-sense */ + boolean_t z_utf8; /* utf8-only */ + int z_norm; /* normalization flags */ boolean_t z_atime; /* enable atimes mount option */ - boolean_t z_unmounted1; /* unmounted phase 1 */ - boolean_t z_unmounted2; /* unmounted phase 2 */ - uint32_t z_op_cnt; /* vnode/vfs operations ref count */ - krwlock_t z_um_lock; /* rw lock for umount phase 2 */ + boolean_t z_unmounted; /* unmounted */ + rrwlock_t z_teardown_lock; + krwlock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all vnodes in the fs */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ vnode_t *z_ctldir; /* .zfs directory pointer */ boolean_t z_show_ctldir; /* expose .zfs in the root dir */ boolean_t z_issnap; /* true if this is a snapshot */ + boolean_t z_vscan; /* virus scan on/off */ + boolean_t z_use_fuids; /* version allows fuids */ + kmutex_t z_online_recv_lock; /* recv in prog grabs as WRITER */ + uint64_t z_version; /* ZPL version */ #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ }; /* - * The total file ID size is limited to 12 bytes (including the length - * field) in the NFSv2 protocol. For historical reasons, this same limit - * is currently being imposed by the Solaris NFSv3 implementation... - * although the protocol actually permits a maximum of 64 bytes. It will - * not be possible to expand beyond 12 bytes without abandoning support - * of NFSv2 and making some changes to the Solaris NFSv3 implementation. + * Normal filesystems (those not under .zfs/snapshot) have a total + * file ID size limited to 12 bytes (including the length field) due to + * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical + * reasons, this same limit is being imposed by the Solaris NFSv3 implementation + * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It + * is not possible to expand beyond 12 bytes without abandoning support + * of NFSv2. 
* - * For the time being, we will partition up the available space as follows: + * For normal filesystems, we partition up the available space as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) + * * We reserve only 48 bits for the object number, as this is the limit * currently defined and imposed by the DMU. */ @@ -84,6 +101,22 @@ typedef struct zfid_short { uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_short_t; +/* + * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes + * (including the length field). This makes files under .zfs/snapshot + * accessible by NFSv3 and NFSv4, but not NFSv2. + * + * For files under .zfs/snapshot, we partition up the available space + * as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * 6 bytes objset id (48 bits) + * 4 bytes currently just zero (32 bits) + * + * We reserve only 48 bits for the object number and objset id, as these are + * the limits currently defined and imposed by the DMU. + */ typedef struct zfid_long { zfid_short_t z_fid; uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ @@ -93,6 +126,12 @@ typedef struct zfid_long { #define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) #define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) +extern uint_t zfs_fsyncer_key; +extern int zfs_super_owner; + +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode); + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h index 6b2923298df2..a0cf44064970 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h @@ -19,19 +19,18 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_FS_ZFS_ZNODE_H #define _SYS_FS_ZFS_ZNODE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef _KERNEL #include <sys/list.h> #include <sys/dmu.h> #include <sys/zfs_vfsops.h> +#include <sys/rrwlock.h> #endif #include <sys/zfs_acl.h> #include <sys/zil.h> @@ -41,34 +40,62 @@ extern "C" { #endif /* - * Define special zfs pflags + * Additional file level attributes, that are stored + * in the upper half of zp_flags */ -#define ZFS_XATTR 0x1 /* is an extended attribute */ -#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ -#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ +#define ZFS_READONLY 0x0000000100000000 +#define ZFS_HIDDEN 0x0000000200000000 +#define ZFS_SYSTEM 0x0000000400000000 +#define ZFS_ARCHIVE 0x0000000800000000 +#define ZFS_IMMUTABLE 0x0000001000000000 +#define ZFS_NOUNLINK 0x0000002000000000 +#define ZFS_APPENDONLY 0x0000004000000000 +#define ZFS_NODUMP 0x0000008000000000 +#define ZFS_OPAQUE 0x0000010000000000 +#define ZFS_AV_QUARANTINED 0x0000020000000000 +#define ZFS_AV_MODIFIED 0x0000040000000000 + +#define ZFS_ATTR_SET(zp, attr, value) \ +{ \ + if (value) \ + zp->z_phys->zp_flags |= attr; \ + else \ + zp->z_phys->zp_flags &= ~attr; \ +} -#define MASTER_NODE_OBJ 1 +/* + * Define special zfs pflags + */ +#define ZFS_XATTR 0x1 /* is an extended attribute */ +#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ +#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ +#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ +#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */ +#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ +#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ +#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ /* - * special attributes for master node. + * Is ID ephemeral? */ +#define IS_EPHEMERAL(x) (x > MAXUID) -#define ZFS_FSID "FSID" -#define ZFS_UNLINKED_SET "DELETE_QUEUE" -#define ZFS_ROOT_OBJ "ROOT" -#define ZPL_VERSION_OBJ "VERSION" -#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE" -#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS" +/* + * Should we use FUIDs? + */ +#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID &&\ + spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) -#define ZFS_FLAG_BLOCKPERPAGE 0x1 -#define ZFS_FLAG_NOGROWBLOCKS 0x2 +#define MASTER_NODE_OBJ 1 /* - * ZPL version - rev'd whenever an incompatible on-disk format change - * occurs. Independent of SPA/DMU/ZAP versioning. + * Special attributes for master node. */ - -#define ZPL_VERSION 1ULL +#define ZFS_FSID "FSID" +#define ZFS_UNLINKED_SET "DELETE_QUEUE" +#define ZFS_ROOT_OBJ "ROOT" +#define ZPL_VERSION_STR "VERSION" +#define ZFS_FUID_TABLES "FUID" #define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) @@ -83,14 +110,20 @@ extern "C" { #define ZFS_MAXNAMELEN (MAXNAMELEN - 1) /* + * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in + * the directory entries. + */ +#ifndef IFTODT +#define IFTODT(mode) (((mode) & S_IFMT) >> 12) +#endif + +/* * The directory entry has the type (currently unused on Solaris) in the * top 4 bits, and the object number in the low 48 bits. The "middle" * 12 bits are unused. */ #define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) #define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) -#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)type << 60) | obj) - /* * This is the persistent portion of the znode. 
It is stored @@ -112,8 +145,9 @@ typedef struct znode_phys { uint64_t zp_flags; /* 120 - persistent flags */ uint64_t zp_uid; /* 128 - file owner */ uint64_t zp_gid; /* 136 - owning group */ - uint64_t zp_pad[4]; /* 144 - future */ - zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */ + uint64_t zp_zap; /* 144 - extra attributes */ + uint64_t zp_pad[3]; /* 152 - future */ + zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ /* * Data may pad out any remaining bytes in the znode buffer, eg: * @@ -121,7 +155,9 @@ typedef struct znode_phys { * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| * |<---- znode (264) ---->|<---- data (56) ---->| * - * At present, we only use this space to store symbolic links. + * At present, we use this space for the following: + * - symbolic links + * - 32-byte anti-virus scanstamp (regular files only) */ } znode_phys_t; @@ -153,12 +189,12 @@ typedef struct znode { avl_tree_t z_range_avl; /* avl tree of file range locks */ uint8_t z_unlinked; /* file has been unlinked */ uint8_t z_atime_dirty; /* atime needs to be synced */ - uint8_t z_dbuf_held; /* Is z_dbuf already held? */ uint8_t z_zn_prefetch; /* Prefetch znodes? */ uint_t z_blksz; /* block size in bytes */ uint_t z_seq; /* modification sequence number */ uint64_t z_mapcnt; /* number of pages mapped to file */ uint64_t z_last_itx; /* last ZIL itx on this znode */ + uint64_t z_gen; /* generation (same as zp_gen) */ uint32_t z_sync_cnt; /* synchronous open count */ kmutex_t z_acl_lock; /* acl data lock */ list_node_t z_link_node; /* all znodes in fs link */ @@ -167,6 +203,8 @@ typedef struct znode { */ znode_phys_t *z_phys; /* pointer to persistent znode */ dmu_buf_t *z_dbuf; /* buffer containing the z_phys */ + /* FreeBSD-specific field. */ + struct task z_task; } znode_t; @@ -195,42 +233,51 @@ typedef struct znode { /* * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation. * ZFS_EXIT() must be called before exitting the vop. + * ZFS_VERIFY_ZP() verifies the znode is valid. 
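As an editorial illustration (not part of the diff), the entry/exit protocol documented above and defined just below typically looks like this in a vop; VTOZ() is the usual vnode-to-znode accessor and is an assumption here:

	static int
	zfs_example_vop(vnode_t *vp)
	{
		znode_t *zp = VTOZ(vp);		/* assumed accessor, not in this diff */
		zfsvfs_t *zfsvfs = zp->z_zfsvfs;

		ZFS_ENTER(zfsvfs);	/* rrw-enter z_teardown_lock; EIO if unmounted */
		ZFS_VERIFY_ZP(zp);	/* EIO if the znode lost its dbuf (forced unmount) */

		/* ... the real work against zp goes here ... */

		ZFS_EXIT(zfsvfs);	/* drop z_teardown_lock */
		return (0);
	}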
*/ #define ZFS_ENTER(zfsvfs) \ { \ - atomic_add_32(&(zfsvfs)->z_op_cnt, 1); \ - if ((zfsvfs)->z_unmounted1) { \ + rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG); \ + if ((zfsvfs)->z_unmounted) { \ ZFS_EXIT(zfsvfs); \ return (EIO); \ } \ } -#define ZFS_EXIT(zfsvfs) atomic_add_32(&(zfsvfs)->z_op_cnt, -1) + +#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG) + +#define ZFS_VERIFY_ZP(zp) \ + if ((zp)->z_dbuf == NULL) { \ + ZFS_EXIT((zp)->z_zfsvfs); \ + return (EIO); \ + } \ /* * Macros for dealing with dmu_buf_hold */ -#define ZFS_OBJ_HASH(obj_num) (obj_num & (ZFS_OBJ_MTX_SZ - 1)) -#define ZFS_OBJ_MUTEX(zp) \ - (&zp->z_zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(zp->z_id)]) +#define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1)) +#define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \ + (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) #define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ - mutex_enter(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]); - + mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) +#define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \ + mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) #define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ - mutex_exit(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) + mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) /* * Macros to encode/decode ZFS stored time values from/to struct timespec */ #define ZFS_TIME_ENCODE(tp, stmp) \ { \ - stmp[0] = (uint64_t)(tp)->tv_sec; \ - stmp[1] = (uint64_t)(tp)->tv_nsec; \ + (stmp)[0] = (uint64_t)(tp)->tv_sec; \ + (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ } #define ZFS_TIME_DECODE(tp, stmp) \ { \ - (tp)->tv_sec = (time_t)stmp[0]; \ - (tp)->tv_nsec = (long)stmp[1]; \ + (tp)->tv_sec = (time_t)(stmp)[0]; \ + (tp)->tv_nsec = (long)(stmp)[1]; \ } /* @@ -244,9 +291,10 @@ typedef struct znode { if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ zfs_time_stamper(zp, ACCESSED, NULL) -extern int zfs_init_fs(zfsvfs_t *, znode_t **, cred_t *); +extern int zfs_init_fs(zfsvfs_t *, znode_t **); extern void zfs_set_dataprop(objset_t *); -extern void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx); +extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *, + dmu_tx_t *tx); extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *); extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *); extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); @@ -254,33 +302,43 @@ extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); extern void zfs_znode_init(void); extern void zfs_znode_fini(void); extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); +extern int zfs_rezget(znode_t *); extern void zfs_zinactive(znode_t *); extern void zfs_znode_delete(znode_t *, dmu_tx_t *); extern void zfs_znode_free(znode_t *); extern void zfs_remove_op_tables(); extern int zfs_create_op_tables(); extern dev_t zfs_cmpldev(uint64_t); - -extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *dzp, znode_t *zp, char *name); -extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype, +extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); +extern int zfs_set_version(const char *name, uint64_t newvers); +extern int zfs_get_stats(objset_t *os, nvlist_t *nv); +extern void zfs_znode_dmu_fini(znode_t *); + +extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, vsecattr_t *, zfs_fuid_info_t *, + vattr_t *vap); +extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp, + vattr_t *vap); +extern void 
zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, char *name); -extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype, +extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name); -extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype, +extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name, char *link); -extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, +extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t len, int ioflag); extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, uint64_t off, uint64_t len); extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, vattr_t *vap, uint_t mask_applied); + znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); #ifndef ZFS_NO_ACL -extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, int aclcnt, ace_t *z_ace); +extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, + vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); #endif +extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap); +extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern zil_get_data_t zfs_get_data; extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h index 947ba9fa6076..4d02d14f7075 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZIL_H #define _SYS_ZIL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/spa.h> #include <sys/zio.h> @@ -88,22 +86,61 @@ typedef struct zil_trailer { #define ZIL_ZC_OBJSET 2 #define ZIL_ZC_SEQ 3 +typedef enum zil_create { + Z_FILE, + Z_DIR, + Z_XATTRDIR, +} zil_create_t; + +/* + * Size of the xvattr log section. + * It's composed of an lr_attr_t + the xvattr bitmap + 2 64-bit timestamps + * for create time and a single 64-bit integer for all of the attributes, + * and 4 64-bit integers (32 bytes) for the scanstamp. + */ + +#define ZIL_XVAT_SIZE(mapsize) \ + sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \ + (sizeof (uint64_t) * 7) + +/* + * Size of ACL in log. The ACE data is padded out to properly align + * on an 8-byte boundary.
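Working the ZIL_XVAT_SIZE() arithmetic above through, as an editorial note (not part of the diff): lr_attr_t is 8 bytes (two uint32_t), and the seven uint64_t's cover the two create-time words, one attribute word, and four scanstamp words. Hypothetical compile-time checks:

	CTASSERT(sizeof (lr_attr_t) == 2 * sizeof (uint32_t));	/* 8 bytes */
	CTASSERT(ZIL_XVAT_SIZE(1) == 8 + 0 + 56);	/* one bitmap word: 64 bytes */
	CTASSERT(ZIL_XVAT_SIZE(3) == 8 + 2 * 4 + 56);	/* three bitmap words: 72 bytes */
	/* ZIL_ACE_LENGTH() (defined just below) rounds up to 8 bytes: */
	CTASSERT(ZIL_ACE_LENGTH(17) == 24);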
+ */ + +#define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t))) + /* * Intent log transaction types and record structures */ -#define TX_CREATE 1 /* Create file */ -#define TX_MKDIR 2 /* Make directory */ -#define TX_MKXATTR 3 /* Make XATTR directory */ -#define TX_SYMLINK 4 /* Create symbolic link to a file */ -#define TX_REMOVE 5 /* Remove file */ -#define TX_RMDIR 6 /* Remove directory */ -#define TX_LINK 7 /* Create hard link to a file */ -#define TX_RENAME 8 /* Rename a file */ -#define TX_WRITE 9 /* File write */ -#define TX_TRUNCATE 10 /* Truncate a file */ -#define TX_SETATTR 11 /* Set file attributes */ -#define TX_ACL 12 /* Set acl */ -#define TX_MAX_TYPE 13 /* Max transaction type */ +#define TX_CREATE 1 /* Create file */ +#define TX_MKDIR 2 /* Make directory */ +#define TX_MKXATTR 3 /* Make XATTR directory */ +#define TX_SYMLINK 4 /* Create symbolic link to a file */ +#define TX_REMOVE 5 /* Remove file */ +#define TX_RMDIR 6 /* Remove directory */ +#define TX_LINK 7 /* Create hard link to a file */ +#define TX_RENAME 8 /* Rename a file */ +#define TX_WRITE 9 /* File write */ +#define TX_TRUNCATE 10 /* Truncate a file */ +#define TX_SETATTR 11 /* Set file attributes */ +#define TX_ACL_V0 12 /* Set old formatted ACL */ +#define TX_ACL 13 /* Set ACL */ +#define TX_CREATE_ACL 14 /* create with ACL */ +#define TX_CREATE_ATTR 15 /* create + attrs */ +#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */ +#define TX_MKDIR_ACL 17 /* mkdir with ACL */ +#define TX_MKDIR_ATTR 18 /* mkdir with attr */ +#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ +#define TX_MAX_TYPE 20 /* Max transaction type */ + +/* + * The transactions for mkdir, symlink, remove, rmdir, link, and rename + * may have the following bit set, indicating the original request + * specified case-insensitive handling of names. + */ +#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ /* * Format of log records. @@ -124,6 +161,23 @@ typedef struct { /* common log record header */ uint64_t lrc_seq; /* see comment above */ } lr_t; +/* + * Handle optional extended vattr attributes. + * + * Whenever new attributes are added, the version number + * will need to be updated, as will the code in + * zfs_log.c and zfs_replay.c. + */ +typedef struct { + uint32_t lr_attr_masksize; /* number of elements in array */ + uint32_t lr_attr_bitmap; /* First entry of array */ + /* remainder of array and any additional fields */ +} lr_attr_t; + +/* + * Log record for creates without optional ACL. + * This log record does support optional xvattr_t attributes. + */ typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_doid; /* object id of directory */ @@ -136,8 +190,42 @@ typedef struct { uint64_t lr_rdev; /* rdev of object to create */ /* name of object to create follows this */ /* for symlinks, link content follows name */ + /* for creates with xvattr data, the name follows the xvattr info */ } lr_create_t; +/* + * FUID ACL record will be an array of ACEs from the original ACL. + * If this array includes ephemeral IDs, the record will also include + * an array of log-specific FUIDs to replace the ephemeral IDs. + * Only one copy of each unique domain will be present, so the log-specific + * FUIDs will use an index into a compressed domain table. On replay this + * information will be used to construct real FUIDs (and bypass idmap, + * since it may not be available).
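An editorial sketch (not part of the diff) of how the TX_CI bit above is meant to be used: producers OR it into the txtype when logging, and replay strips it before indexing zfs_replay_vector[]. The name handling here is a simplified assumption:

	const char *name = "file.txt";	/* assumed entry name */
	itx_t *itx;
	uint64_t txtype;

	/* Log a remove whose name was matched case-insensitively. */
	itx = zil_itx_create(TX_REMOVE | TX_CI,
	    sizeof (lr_remove_t) + strlen(name) + 1);
	/* ... fill in the record and zil_itx_assign() it ... */

	/* Replay side: mask the flag off before using txtype as an index. */
	txtype = itx->itx_lr.lrc_txtype & ~TX_CI;
	ASSERT(txtype < TX_MAX_TYPE);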
+ */ + +/* + * Log record for creates with optional ACL + * This log record is also used for recording any FUID + * information needed for replaying the create. If the + * file doesn't have any actual ACEs then the lr_aclcnt + * would be zero. + */ +typedef struct { + lr_create_t lr_create; /* common create portion */ + uint64_t lr_aclcnt; /* number of ACEs in ACL */ + uint64_t lr_domcnt; /* number of unique domains */ + uint64_t lr_fuidcnt; /* number of real fuids */ + uint64_t lr_acl_bytes; /* number of bytes in ACL */ + uint64_t lr_acl_flags; /* ACL flags */ + /* lr_acl_bytes number of variable sized ace's follows */ + /* if create is also setting xvattr's, then acl data follows xvattr */ + /* if ACE FUIDs are needed then they will follow the xvattr_t */ + /* Following the FUIDs will be the domain table information. */ + /* The FUIDs for the owner and group will be in the lr_create */ + /* portion of the record. */ + /* name follows ACL data */ +} lr_acl_create_t; + typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_doid; /* obj id of directory */ @@ -185,6 +273,7 @@ typedef struct { uint64_t lr_size; /* size to set */ uint64_t lr_atime[2]; /* access time */ uint64_t lr_mtime[2]; /* modification time */ + /* optional attribute lr_attr_t may be here */ } lr_setattr_t; typedef struct { @@ -192,6 +281,17 @@ typedef struct { uint64_t lr_foid; /* obj id of file */ uint64_t lr_aclcnt; /* number of acl entries */ /* lr_aclcnt number of ace_t entries follow this */ +} lr_acl_v0_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* obj id of file */ + uint64_t lr_aclcnt; /* number of ACEs in ACL */ + uint64_t lr_domcnt; /* number of unique domains */ + uint64_t lr_fuidcnt; /* number of real fuids */ + uint64_t lr_acl_bytes; /* number of bytes in ACL */ + uint64_t lr_acl_flags; /* ACL flags */ + /* lr_acl_bytes number of variable sized ace's follows */ } lr_acl_t; /* @@ -213,6 +313,7 @@ typedef struct itx { void *itx_private; /* type-specific opaque data */ itx_wr_state_t itx_wr_state; /* write state */ uint8_t itx_sync; /* synchronous transaction */ + uint64_t itx_sod; /* record size on disk */ lr_t itx_lr; /* common part of log record */ /* followed by type-specific part of lr_xx_t and its immediate data */ } itx_t; @@ -234,6 +335,7 @@ typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, uint64_t txg); typedef int zil_replay_func_t(); +typedef void zil_replay_cleaner_t(); typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, @@ -249,15 +351,19 @@ extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); extern void zil_close(zilog_t *zilog); extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE]); + zil_replay_func_t *replay_func[TX_MAX_TYPE], + zil_replay_cleaner_t *replay_cleaner); extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); +extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); -extern itx_t *zil_itx_create(int txtype, size_t lrsize); +extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid); extern int zil_claim(char *osname, void *txarg); +extern int zil_check_log_chain(char *osname, void 
*txarg); +extern int zil_clear_log_chain(char *osname, void *txarg); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_clean(zilog_t *zilog); extern int zil_is_committed(zilog_t *zilog); @@ -265,7 +371,7 @@ extern int zil_is_committed(zilog_t *zilog); extern int zil_suspend(zilog_t *zilog); extern void zil_resume(zilog_t *zilog); -extern void zil_add_vdev(zilog_t *zilog, uint64_t vdev); +extern void zil_add_block(zilog_t *zilog, blkptr_t *bp); extern int zil_disable; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h index 3ecf4e4debf5..0fc800b96dea 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -51,15 +51,13 @@ typedef struct lwb { } lwb_t; /* - * Vdev flushing: We use a bit map of size ZIL_VDEV_BMAP bytes. - * Any vdev numbers beyond that use a linked list of zil_vdev_t structures. + * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs + * we've touched so we know which ones need a write cache flush at the end. */ - -#define ZIL_VDEV_BMSZ 16 /* 16 * 8 = 128 vdevs */ -typedef struct zil_vdev { - uint64_t vdev; /* device written */ - list_node_t vdev_seq_node; /* zilog->zl_vdev_list linkage */ -} zil_vdev_t; +typedef struct zil_vdev_node { + uint64_t zv_vdev; /* vdev to be flushed */ + avl_node_t zv_node; /* AVL tree linkage */ +} zil_vdev_node_t; /* * Stable storage intent log management structure. One per dataset. @@ -91,8 +89,8 @@ struct zilog { uint64_t zl_cur_used; /* current commit log size used */ uint64_t zl_prev_used; /* previous commit log size used */ list_t zl_lwb_list; /* in-flight log write list */ - list_t zl_vdev_list; /* list of [vdev, seq] pairs */ - uint8_t zl_vdev_bmap[ZIL_VDEV_BMSZ]; /* bitmap of vdevs */ + kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */ + avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */ taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */ avl_tree_t zl_dva_tree; /* track DVAs during log parse */ clock_t zl_replay_time; /* lbolt of when replay started */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index b026ae6450c6..6331567498b5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -20,20 +20,17 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZIO_H #define _ZIO_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/txg.h> #include <sys/avl.h> -#include <sys/dkio.h> #include <sys/fs/zfs.h> #include <sys/zio_impl.h> @@ -60,10 +57,6 @@ typedef struct zio_block_tail { (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ sizeof (uint64_t)) -#define ZIO_GET_IOSIZE(zio) \ - (BP_IS_GANG((zio)->io_bp) ? 
\ - SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp)) - typedef struct zio_gbh { blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; uint64_t zg_filler[SPA_GBH_FILLER]; @@ -107,6 +100,10 @@ enum zio_compress { #define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF +#define ZIO_FAILURE_MODE_WAIT 0 +#define ZIO_FAILURE_MODE_CONTINUE 1 +#define ZIO_FAILURE_MODE_PANIC 2 + #define ZIO_PRIORITY_NOW (zio_priority_table[0]) #define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) #define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) @@ -121,51 +118,70 @@ enum zio_compress { #define ZIO_FLAG_MUSTSUCCEED 0x00000 #define ZIO_FLAG_CANFAIL 0x00001 -#define ZIO_FLAG_FAILFAST 0x00002 -#define ZIO_FLAG_CONFIG_HELD 0x00004 -#define ZIO_FLAG_CONFIG_GRABBED 0x00008 +#define ZIO_FLAG_SPECULATIVE 0x00002 +#define ZIO_FLAG_CONFIG_WRITER 0x00004 +#define ZIO_FLAG_DONT_RETRY 0x00008 #define ZIO_FLAG_DONT_CACHE 0x00010 #define ZIO_FLAG_DONT_QUEUE 0x00020 -#define ZIO_FLAG_DONT_PROPAGATE 0x00040 -#define ZIO_FLAG_DONT_RETRY 0x00080 - -#define ZIO_FLAG_PHYSICAL 0x00100 -#define ZIO_FLAG_IO_BYPASS 0x00200 -#define ZIO_FLAG_IO_REPAIR 0x00400 -#define ZIO_FLAG_SPECULATIVE 0x00800 +#define ZIO_FLAG_DONT_AGGREGATE 0x00040 +#define ZIO_FLAG_DONT_PROPAGATE 0x00080 -#define ZIO_FLAG_RESILVER 0x01000 -#define ZIO_FLAG_SCRUB 0x02000 -#define ZIO_FLAG_SCRUB_THREAD 0x04000 -#define ZIO_FLAG_SUBBLOCK 0x08000 +#define ZIO_FLAG_IO_BYPASS 0x00100 +#define ZIO_FLAG_IO_REPAIR 0x00200 +#define ZIO_FLAG_IO_RETRY 0x00400 +#define ZIO_FLAG_IO_REWRITE 0x00800 -#define ZIO_FLAG_NOBOOKMARK 0x10000 -#define ZIO_FLAG_USER 0x20000 +#define ZIO_FLAG_PROBE 0x01000 +#define ZIO_FLAG_RESILVER 0x02000 +#define ZIO_FLAG_SCRUB 0x04000 +#define ZIO_FLAG_SCRUB_THREAD 0x08000 -#define ZIO_FLAG_METADATA 0x40000 +#define ZIO_FLAG_GANG_CHILD 0x10000 #define ZIO_FLAG_GANG_INHERIT \ (ZIO_FLAG_CANFAIL | \ - ZIO_FLAG_FAILFAST | \ - ZIO_FLAG_CONFIG_HELD | \ - ZIO_FLAG_DONT_RETRY | \ - ZIO_FLAG_IO_REPAIR | \ ZIO_FLAG_SPECULATIVE | \ + ZIO_FLAG_CONFIG_WRITER | \ + ZIO_FLAG_DONT_RETRY | \ + ZIO_FLAG_DONT_CACHE | \ + ZIO_FLAG_DONT_AGGREGATE | \ ZIO_FLAG_RESILVER | \ ZIO_FLAG_SCRUB | \ ZIO_FLAG_SCRUB_THREAD) #define ZIO_FLAG_VDEV_INHERIT \ (ZIO_FLAG_GANG_INHERIT | \ - ZIO_FLAG_DONT_CACHE | \ - ZIO_FLAG_PHYSICAL) + ZIO_FLAG_IO_REPAIR | \ + ZIO_FLAG_IO_RETRY | \ + ZIO_FLAG_PROBE) + +#define ZIO_PIPELINE_CONTINUE 0x100 +#define ZIO_PIPELINE_STOP 0x101 + +#define ZIO_GANG_CHILD_FLAGS(zio) \ + (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ + ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) + +enum zio_child { + ZIO_CHILD_VDEV = 0, + ZIO_CHILD_GANG, + ZIO_CHILD_LOGICAL, + ZIO_CHILD_TYPES +}; + +enum zio_wait_type { + ZIO_WAIT_READY = 0, + ZIO_WAIT_DONE, + ZIO_WAIT_TYPES +}; /* - * We'll take the EILSEQ (Illegal byte sequence) errno - * to indicate checksum errors. + * We'll take the EILSEQ and ENOMSG to indicate checksum errors and + * fragmentation. 
*/ #define ECKSUM EILSEQ +#define EFRAGS ENOMSG typedef struct zio zio_t; typedef void zio_done_func_t(zio_t *zio); @@ -200,23 +216,64 @@ typedef struct zbookmark { uint64_t zb_blkid; } zbookmark_t; +typedef struct zio_prop { + enum zio_checksum zp_checksum; + enum zio_compress zp_compress; + dmu_object_type_t zp_type; + uint8_t zp_level; + uint8_t zp_ndvas; +} zio_prop_t; + +typedef struct zio_gang_node { + zio_gbh_phys_t *gn_gbh; + struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; +} zio_gang_node_t; + +typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, + zio_gang_node_t *gn, void *data); + +typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); + +typedef struct zio_transform { + void *zt_orig_data; + uint64_t zt_orig_size; + uint64_t zt_bufsize; + zio_transform_func_t *zt_transform; + struct zio_transform *zt_next; +} zio_transform_t; + +typedef int zio_pipe_stage_t(zio_t *zio); + +/* + * The io_reexecute flags are distinct from io_flags because the child must + * be able to propagate them to the parent. The normal io_flags are local + * to the zio, not protected by any lock, and not modifiable by children; + * the reexecute flags are protected by io_lock, modifiable by children, + * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. + */ +#define ZIO_REEXECUTE_NOW 0x01 +#define ZIO_REEXECUTE_SUSPEND 0x02 + struct zio { /* Core information about this I/O */ - zio_t *io_parent; - zio_t *io_root; - spa_t *io_spa; zbookmark_t io_bookmark; - enum zio_checksum io_checksum; - enum zio_compress io_compress; - int io_ndvas; + zio_prop_t io_prop; + zio_type_t io_type; + enum zio_child io_child_type; + int io_cmd; + uint8_t io_priority; + uint8_t io_reexecute; + uint8_t io_async_root; uint64_t io_txg; + spa_t *io_spa; blkptr_t *io_bp; blkptr_t io_bp_copy; + zio_t *io_parent; zio_t *io_child; zio_t *io_sibling_prev; zio_t *io_sibling_next; - zio_transform_t *io_transform_stack; zio_t *io_logical; + zio_transform_t *io_transform_stack; /* Callback info */ zio_done_func_t *io_ready; @@ -231,9 +288,9 @@ struct zio { /* Stuff for the vdev stack */ vdev_t *io_vd; void *io_vsd; + zio_done_func_t *io_vsd_free; uint64_t io_offset; uint64_t io_deadline; - uint64_t io_timestamp; avl_node_t io_offset_node; avl_node_t io_deadline_node; avl_tree_t *io_vdev_tree; @@ -242,19 +299,17 @@ struct zio { /* Internal pipeline state */ int io_flags; - enum zio_type io_type; - enum zio_stage io_stage; - uint8_t io_stalled; - uint8_t io_priority; - struct dk_callback io_dk_callback; - int io_cmd; - int io_retries; - int io_error; - uint32_t io_numerrors; + zio_stage_t io_stage; uint32_t io_pipeline; - uint32_t io_async_stages; - uint64_t io_children_notready; - uint64_t io_children_notdone; + int io_orig_flags; + zio_stage_t io_orig_stage; + uint32_t io_orig_pipeline; + int io_error; + int io_child_error[ZIO_CHILD_TYPES]; + uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; + uint64_t *io_stall; + zio_gang_node_t *io_gang_tree; + void *io_executor; void *io_waiter; kmutex_t io_lock; kcondvar_t io_cv; @@ -269,76 +324,76 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags); -extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, +extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, zbookmark_t *zb); + int priority, int flags, const zbookmark_t *zb); -extern zio_t 
*zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, - int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb); +extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + void *data, uint64_t size, zio_prop_t *zp, + zio_done_func_t *ready, zio_done_func_t *done, void *private, + int priority, int flags, const zbookmark_t *zb); -extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum, - uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags, - zbookmark_t *zb); +extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + void *data, uint64_t size, zio_done_func_t *done, void *private, + int priority, int flags, zbookmark_t *zb); extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private); + zio_done_func_t *done, void *private, int flags); extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private); + zio_done_func_t *done, void *private, int flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, int priority, int flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags); + zio_done_func_t *done, void *private, int priority, int flags, + boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags); + zio_done_func_t *done, void *private, int priority, int flags, + boolean_t labels); extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t txg); extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg); +extern void zio_flush(zio_t *zio, vdev_t *vd); extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); +extern void zio_execute(zio_t *zio); +extern void zio_interrupt(zio_t *zio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); -/* - * Move an I/O to the next stage of the pipeline and execute that stage. - * There's no locking on io_stage because there's no legitimate way for - * multiple threads to be attempting to process the same I/O. - */ -extern void zio_next_stage(zio_t *zio); -extern void zio_next_stage_async(zio_t *zio); -extern void zio_wait_children_done(zio_t *zio); +extern void zio_resubmit_stage_async(void *); -/* - * Delegate I/O to a child vdev. 
- */ extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, int priority, int flags, zio_done_func_t *done, void *private); +extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, + void *data, uint64_t size, int type, int priority, + int flags, zio_done_func_t *done, void *private); + extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); extern void zio_checksum_verified(zio_t *zio); -extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp); +extern int zio_worst_error(int e1, int e2); extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); -boolean_t zio_should_retry(zio_t *zio); +extern void zio_suspend(spa_t *spa, zio_t *zio); +extern void zio_resume(spa_t *spa); +extern void zio_resume_wait(spa_t *spa); /* * Initial setup and teardown. @@ -358,6 +413,7 @@ extern int zio_inject_list_next(int *id, char *name, size_t buflen, extern int zio_clear_fault(int id); extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, int error); +extern int zio_handle_label_injection(zio_t *zio, int error); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h index bb7bd41e0bb3..da407399da06 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZIO_CHECKSUM_H #define _SYS_ZIO_CHECKSUM_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zio.h> #ifdef __cplusplus @@ -64,7 +62,7 @@ extern zio_checksum_t fletcher_4_incremental_byteswap; extern zio_checksum_t zio_checksum_SHA256; -extern void zio_checksum(uint_t checksum, zio_cksum_t *zcp, +extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); extern int zio_checksum_error(zio_t *zio); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h index d2ddbc34e922..e7503b733cc0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZIO_IMPL_H #define _ZIO_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/zio.h> @@ -38,162 +36,102 @@ extern "C" { /* * I/O Groups: pipeline stage definitions. 
*/ - typedef enum zio_stage { ZIO_STAGE_OPEN = 0, /* RWFCI */ - ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */ - ZIO_STAGE_WRITE_COMPRESS, /* -W--- */ - ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ + ZIO_STAGE_ISSUE_ASYNC, /* -W--- */ - ZIO_STAGE_GANG_PIPELINE, /* -WFC- */ + ZIO_STAGE_READ_BP_INIT, /* R---- */ + ZIO_STAGE_WRITE_BP_INIT, /* -W--- */ + + ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ - ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */ - ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */ - ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */ - ZIO_STAGE_CLAIM_GANG_MEMBERS, /* ---C- */ + ZIO_STAGE_GANG_ASSEMBLE, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE, /* RWFC- */ ZIO_STAGE_DVA_ALLOCATE, /* -W--- */ ZIO_STAGE_DVA_FREE, /* --F-- */ ZIO_STAGE_DVA_CLAIM, /* ---C- */ - ZIO_STAGE_GANG_CHECKSUM_GENERATE, /* -W--- */ - ZIO_STAGE_READY, /* RWFCI */ ZIO_STAGE_VDEV_IO_START, /* RW--I */ ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ - ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */ - ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */ - ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */ - ZIO_STAGE_READ_DECOMPRESS, /* R---- */ - ZIO_STAGE_DONE /* RWFCI */ + ZIO_STAGE_DONE, /* RWFCI */ + ZIO_STAGES } zio_stage_t; -/* - * The stages for which there's some performance value in going async. - * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well. - */ -#define ZIO_ASYNC_PIPELINE_STAGES \ - ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_VDEV_IO_DONE) | \ - (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ - (1U << ZIO_STAGE_READ_DECOMPRESS)) +#define ZIO_INTERLOCK_STAGES \ + ((1U << ZIO_STAGE_READY) | \ + (1U << ZIO_STAGE_DONE)) -#define ZIO_VDEV_IO_PIPELINE \ +#define ZIO_INTERLOCK_PIPELINE \ + ZIO_INTERLOCK_STAGES + +#define ZIO_VDEV_IO_STAGES \ ((1U << ZIO_STAGE_VDEV_IO_START) | \ (1U << ZIO_STAGE_VDEV_IO_DONE) | \ (1U << ZIO_STAGE_VDEV_IO_ASSESS)) -#define ZIO_READ_PHYS_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ +#define ZIO_VDEV_CHILD_PIPELINE \ + (ZIO_VDEV_IO_STAGES | \ (1U << ZIO_STAGE_DONE)) -#define ZIO_READ_PIPELINE \ - ZIO_READ_PHYS_PIPELINE +#define ZIO_READ_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + (1U << ZIO_STAGE_CHECKSUM_VERIFY)) -#define ZIO_WRITE_PHYS_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) +#define ZIO_READ_PHYS_PIPELINE \ + ZIO_READ_COMMON_STAGES -#define ZIO_WRITE_COMMON_PIPELINE \ - ZIO_WRITE_PHYS_PIPELINE +#define ZIO_READ_PIPELINE \ + (ZIO_READ_COMMON_STAGES | \ + (1U << ZIO_STAGE_READ_BP_INIT)) -#define ZIO_WRITE_PIPELINE \ - ((1U << ZIO_STAGE_WRITE_COMPRESS) | \ - ZIO_WRITE_COMMON_PIPELINE) +#define ZIO_WRITE_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + (1U << ZIO_STAGE_ISSUE_ASYNC) | \ + (1U << ZIO_STAGE_CHECKSUM_GENERATE)) -#define ZIO_GANG_STAGES \ - ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_READ_GANG_MEMBERS)) +#define ZIO_WRITE_PHYS_PIPELINE \ + ZIO_WRITE_COMMON_STAGES #define ZIO_REWRITE_PIPELINE \ - ((1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << 
ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \ - ZIO_WRITE_COMMON_PIPELINE) + (ZIO_WRITE_COMMON_STAGES | \ + (1U << ZIO_STAGE_WRITE_BP_INIT)) -#define ZIO_WRITE_ALLOCATE_PIPELINE \ - ((1U << ZIO_STAGE_DVA_ALLOCATE) | \ - ZIO_WRITE_COMMON_PIPELINE) +#define ZIO_WRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + (1U << ZIO_STAGE_WRITE_BP_INIT) | \ + (1U << ZIO_STAGE_DVA_ALLOCATE)) -#define ZIO_GANG_FREE_STAGES \ - ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_FREE_GANG_MEMBERS)) +#define ZIO_GANG_STAGES \ + ((1U << ZIO_STAGE_GANG_ASSEMBLE) | \ + (1U << ZIO_STAGE_GANG_ISSUE)) #define ZIO_FREE_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_DVA_FREE) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) + (ZIO_INTERLOCK_STAGES | \ + (1U << ZIO_STAGE_DVA_FREE)) #define ZIO_CLAIM_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_DVA_CLAIM) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) + (ZIO_INTERLOCK_STAGES | \ + (1U << ZIO_STAGE_DVA_CLAIM)) #define ZIO_IOCTL_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) - -#define ZIO_WAIT_FOR_CHILDREN_PIPELINE \ - ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) - -#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \ - ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) + (ZIO_INTERLOCK_STAGES | \ + (1U << ZIO_STAGE_VDEV_IO_START) | \ + (1U << ZIO_STAGE_VDEV_IO_ASSESS)) -#define ZIO_VDEV_CHILD_PIPELINE \ - (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \ - ZIO_VDEV_IO_PIPELINE) - -#define ZIO_ERROR_PIPELINE_MASK \ - ZIO_WAIT_FOR_CHILDREN_PIPELINE - -typedef struct zio_transform zio_transform_t; -struct zio_transform { - void *zt_data; - uint64_t zt_size; - uint64_t zt_bufsize; - zio_transform_t *zt_next; -}; +#define ZIO_CONFIG_LOCK_BLOCKING_STAGES \ + ((1U << ZIO_STAGE_VDEV_IO_START) | \ + (1U << ZIO_STAGE_DVA_ALLOCATE) | \ + (1U << ZIO_STAGE_DVA_CLAIM)) extern void zio_inject_init(void); extern void zio_inject_fini(void); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h index df85824d59bd..2a6452aa433c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -35,17 +35,21 @@ extern "C" { #endif +#define ZVOL_OBJ 1ULL +#define ZVOL_ZAP_OBJ 2ULL + #ifdef _KERNEL extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); extern int zvol_check_volblocksize(uint64_t volblocksize); extern int zvol_get_stats(objset_t *os, nvlist_t *nv); -extern void zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx); -extern int zvol_create_minor(const char *, dev_t); +extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); +extern int zvol_create_minor(const char *, major_t); extern int zvol_remove_minor(const char *); -extern int zvol_set_volsize(const char *, dev_t, uint64_t); +extern int zvol_set_volsize(const char *, major_t, uint64_t); extern int zvol_set_volblocksize(const char *, uint64_t); extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); +extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks); extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr); #ifndef __FreeBSD__ extern int zvol_strategy(buf_t *bp); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c index 844beb6864a5..040e4d70fc04 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/txg_impl.h> #include <sys/dmu_impl.h> @@ -37,9 +35,18 @@ static void txg_sync_thread(void *arg); static void txg_quiesce_thread(void *arg); -static void txg_timelimit_thread(void *arg); -int txg_time = 5; /* max 5 seconds worth of delta per txg */ +int zfs_txg_timeout = 30; /* max seconds worth of delta per txg */ +extern int zfs_txg_synctime; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS TXG"); +TUNABLE_INT("vfs.zfs.txg.timeout", &zfs_txg_timeout); +SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RDTUN, &zfs_txg_timeout, 0, + "Maximum seconds worth of delta per txg"); +TUNABLE_INT("vfs.zfs.txg.synctime", &zfs_txg_synctime); +SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, synctime, CTLFLAG_RDTUN, &zfs_txg_synctime, + 0, "Target seconds to sync a txg"); /* * Prepare the txg subsystem. 
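The zfs_txg_timeout/zfs_txg_synctime declarations above use the stock FreeBSD loader-tunable-plus-sysctl pairing. A minimal sketch of that pattern follows; the variable and OID names here (zfs_example_delay, vfs.zfs.example.delay) are hypothetical, only the shape mirrors the declarations above.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

/* Hypothetical tunable: the default applies unless overridden at boot. */
static int zfs_example_delay = 5;		/* seconds */

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, example, CTLFLAG_RW, 0, "ZFS example");
TUNABLE_INT("vfs.zfs.example.delay", &zfs_example_delay);
SYSCTL_INT(_vfs_zfs_example, OID_AUTO, delay, CTLFLAG_RDTUN,
	&zfs_example_delay, 0, "Example delay in seconds");

Because the real sysctls above are CTLFLAG_RDTUN, they are read-only at runtime (sysctl vfs.zfs.txg.timeout reports the value) and are set from loader.conf at boot.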
@@ -48,14 +55,19 @@ void txg_init(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; - int c, i; + int c; bzero(tx, sizeof (tx_state_t)); tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); + for (c = 0; c < max_ncpus; c++) { + int i; + mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); - for (i = 0; i < TXG_SIZE; i++) - cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL); + for (i = 0; i < TXG_SIZE; i++) { + cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, + NULL); + } } rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL); @@ -64,7 +76,6 @@ txg_init(dsl_pool_t *dp, uint64_t txg) cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_timeout_exit_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); tx->tx_open_txg = txg; @@ -77,12 +88,11 @@ void txg_fini(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; - int c, i; + int c; ASSERT(tx->tx_threads == 0); cv_destroy(&tx->tx_exit_cv); - cv_destroy(&tx->tx_timeout_exit_cv); cv_destroy(&tx->tx_quiesce_done_cv); cv_destroy(&tx->tx_quiesce_more_cv); cv_destroy(&tx->tx_sync_done_cv); @@ -91,9 +101,11 @@ txg_fini(dsl_pool_t *dp) mutex_destroy(&tx->tx_sync_lock); for (c = 0; c < max_ncpus; c++) { + int i; + + mutex_destroy(&tx->tx_cpu[c].tc_lock); for (i = 0; i < TXG_SIZE; i++) cv_destroy(&tx->tx_cpu[c].tc_cv[i]); - mutex_destroy(&tx->tx_cpu[c].tc_lock); } kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); @@ -115,15 +127,17 @@ txg_sync_start(dsl_pool_t *dp) ASSERT(tx->tx_threads == 0); - tx->tx_threads = 3; + tx->tx_threads = 2; tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, dp, 0, &p0, TS_RUN, minclsyspri); - tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread, - dp, 0, &p0, TS_RUN, minclsyspri); - - tx->tx_timelimit_thread = thread_create(NULL, 0, txg_timelimit_thread, + /* + * The sync thread can need a larger-than-default stack size on + * 32-bit x86. This is due in part to nested pools and + * scrub_visitbp() recursion. + */ + tx->tx_sync_thread = thread_create(NULL, 12<<10, txg_sync_thread, dp, 0, &p0, TS_RUN, minclsyspri); mutex_exit(&tx->tx_sync_lock); @@ -148,12 +162,12 @@ txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) } static void -txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, int secmax) +txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time) { CALLB_CPR_SAFE_BEGIN(cpr); - if (secmax) - (void) cv_timedwait(cv, &tx->tx_sync_lock, secmax * hz); + if (time) + (void) cv_timedwait(cv, &tx->tx_sync_lock, time); else cv_wait(cv, &tx->tx_sync_lock); @@ -172,22 +186,21 @@ txg_sync_stop(dsl_pool_t *dp) /* * Finish off any work in progress. */ - ASSERT(tx->tx_threads == 3); + ASSERT(tx->tx_threads == 2); txg_wait_synced(dp, 0); /* - * Wake all 3 sync threads (one per state) and wait for them to die. + * Wake all sync threads and wait for them to die. 
*/ mutex_enter(&tx->tx_sync_lock); - ASSERT(tx->tx_threads == 3); + ASSERT(tx->tx_threads == 2); tx->tx_exiting = 1; cv_broadcast(&tx->tx_quiesce_more_cv); cv_broadcast(&tx->tx_quiesce_done_cv); cv_broadcast(&tx->tx_sync_more_cv); - cv_broadcast(&tx->tx_timeout_exit_cv); while (tx->tx_threads != 0) cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); @@ -279,22 +292,29 @@ txg_sync_thread(void *arg) dsl_pool_t *dp = arg; tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; + uint64_t start, delta; txg_thread_enter(tx, &cpr); + start = delta = 0; for (;;) { + uint64_t timer, timeout = zfs_txg_timeout * hz; uint64_t txg; /* * We sync when there's someone waiting on us, or the - * quiesce thread has handed off a txg to us. + * quiesce thread has handed off a txg to us, or we have + * reached our timeout. */ - while (!tx->tx_exiting && + timer = (delta >= timeout ? 0 : timeout - delta); + while (!tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && tx->tx_quiesced_txg == 0) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); - txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, 0); + txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); + delta = LBOLT - start; + timer = (delta > timeout ? 0 : timeout - delta); } /* @@ -325,10 +345,13 @@ txg_sync_thread(void *arg) rw_exit(&tx->tx_suspend); dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, - tx->tx_sync_txg_waiting); + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); + + start = LBOLT; spa_sync(dp->dp_spa, txg); + delta = LBOLT - start; + mutex_enter(&tx->tx_sync_lock); rw_enter(&tx->tx_suspend, RW_WRITER); tx->tx_synced_txg = txg; @@ -383,13 +406,43 @@ txg_quiesce_thread(void *arg) } } +/* + * Delay this thread by 'ticks' if we are still in the open transaction + * group and there is already a waiting txg quiescing or quiesced. Abort + * the delay if this txg stalls or enters the quiescing state.
+ */ +void +txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) +{ + tx_state_t *tx = &dp->dp_tx; + int timeout = LBOLT + ticks; + + /* don't delay if this txg could transition to quiescing immediately */ + if (tx->tx_open_txg > txg || + tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) + return; + + mutex_enter(&tx->tx_sync_lock); + if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) { + mutex_exit(&tx->tx_sync_lock); + return; + } + + while (LBOLT < timeout && + tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) + (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, + timeout - LBOLT); + + mutex_exit(&tx->tx_sync_lock); +} + void txg_wait_synced(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; mutex_enter(&tx->tx_sync_lock); - ASSERT(tx->tx_threads == 3); + ASSERT(tx->tx_threads == 2); if (txg == 0) txg = tx->tx_open_txg; if (tx->tx_sync_txg_waiting < txg) @@ -412,7 +465,7 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg) tx_state_t *tx = &dp->dp_tx; mutex_enter(&tx->tx_sync_lock); - ASSERT(tx->tx_threads == 3); + ASSERT(tx->tx_threads == 2); if (txg == 0) txg = tx->tx_open_txg + 1; if (tx->tx_quiesce_txg_waiting < txg) @@ -426,37 +479,20 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg) mutex_exit(&tx->tx_sync_lock); } -static void -txg_timelimit_thread(void *arg) +boolean_t +txg_stalled(dsl_pool_t *dp) { - dsl_pool_t *dp = arg; tx_state_t *tx = &dp->dp_tx; - callb_cpr_t cpr; - - txg_thread_enter(tx, &cpr); - - while (!tx->tx_exiting) { - uint64_t txg = tx->tx_open_txg + 1; - - txg_thread_wait(tx, &cpr, &tx->tx_timeout_exit_cv, txg_time); - - if (tx->tx_quiesce_txg_waiting < txg) - tx->tx_quiesce_txg_waiting = txg; - - while (!tx->tx_exiting && tx->tx_open_txg < txg) { - dprintf("pushing out %llu\n", txg); - cv_broadcast(&tx->tx_quiesce_more_cv); - txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); - } - } - txg_thread_exit(tx, &cpr, &tx->tx_timelimit_thread); + return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); } -int -txg_stalled(dsl_pool_t *dp) +boolean_t +txg_sync_waiting(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; - return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); + + return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || + tx->tx_quiesced_txg != 0); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c index b52e729d6294..fbe7b619a29a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,8 +30,7 @@ #include <sys/unique.h> static avl_tree_t unique_avl; -static kmutex_t unique_mtx; /* Lock never initialized.
*/ -SX_SYSINIT(unique, &unique_mtx, "unique lock"); +static kmutex_t unique_mtx; typedef struct unique { avl_node_t un_link; @@ -58,12 +57,22 @@ unique_init(void) { avl_create(&unique_avl, unique_compare, sizeof (unique_t), offsetof(unique_t, un_link)); + mutex_init(&unique_mtx, NULL, MUTEX_DEFAULT, NULL); +} + +void +unique_fini(void) +{ + avl_destroy(&unique_avl); + mutex_destroy(&unique_mtx); } uint64_t unique_create(void) { - return (unique_insert(0)); + uint64_t value = unique_insert(0); + unique_remove(value); + return (value); } uint64_t diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index b966099f4640..7d0602c8ee36 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/fm/fs/zfs.h> #include <sys/spa.h> @@ -40,6 +38,7 @@ #include <sys/zio.h> #include <sys/zap.h> #include <sys/fs/zfs.h> +#include <sys/arc.h> SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); @@ -58,14 +57,18 @@ static vdev_ops_t *vdev_ops_table[] = { &vdev_geom_ops, #else &vdev_disk_ops, - &vdev_file_ops, #endif + &vdev_file_ops, &vdev_missing_ops, NULL }; -/* maximum scrub/resilver I/O queue */ -int zfs_scrub_limit = 70; +/* maximum scrub/resilver I/O queue per leaf vdev */ +int zfs_scrub_limit = 10; + +TUNABLE_INT("vfs.zfs.scrub_limit", &zfs_scrub_limit); +SYSCTL_INT(_vfs_zfs, OID_AUTO, scrub_limit, CTLFLAG_RDTUN, &zfs_scrub_limit, 0, + "Maximum scrub/resilver I/O queue"); /* * Given a vdev type, return the appropriate ops vector. 
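The truncated comment above introduces the ops-vector lookup over vdev_ops_table. The body itself lies outside this hunk, so the following is a hedged reconstruction, not the committed code; it relies only on the table being NULL-terminated, as its definition above shows.

/*
 * Scan the NULL-terminated vdev_ops_table for an entry whose
 * vdev_op_type string matches 'type'; return NULL if none does.
 */
static vdev_ops_t *
vdev_getops_sketch(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}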
@@ -143,8 +146,12 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; - if (vdev < rvd->vdev_children) + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + + if (vdev < rvd->vdev_children) { + ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); + } return (NULL); } @@ -173,7 +180,7 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) uint64_t id = cvd->vdev_id; vdev_t **newchild; - ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); + ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; @@ -256,7 +263,7 @@ vdev_compact_children(vdev_t *pvd) int oldc = pvd->vdev_children; int newc, c; - ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER)); + ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); for (c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) @@ -319,6 +326,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); txg_list_create(&vd->vdev_ms_list, @@ -326,44 +334,13 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) txg_list_create(&vd->vdev_dtl_list, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); + vdev_queue_init(vd); + vdev_cache_init(vd); return (vd); } /* - * Free a vdev_t that has been removed from service. - */ -static void -vdev_free_common(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - if (vd->vdev_path) - spa_strfree(vd->vdev_path); - if (vd->vdev_devid) - spa_strfree(vd->vdev_devid); - - if (vd->vdev_isspare) - spa_spare_remove(vd); - - txg_list_destroy(&vd->vdev_ms_list); - txg_list_destroy(&vd->vdev_dtl_list); - mutex_enter(&vd->vdev_dtl_lock); - space_map_unload(&vd->vdev_dtl_map); - space_map_destroy(&vd->vdev_dtl_map); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - space_map_destroy(&vd->vdev_dtl_scrub); - mutex_exit(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_stat_lock); - - if (vd == spa->spa_root_vdev) - spa->spa_root_vdev = NULL; - - kmem_free(vd, sizeof (vdev_t)); -} - -/* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. 
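The assertions in the hunks above show the reworked config lock: spa_config_held() now takes an SCL_* bitmask and an rw mode rather than a single reader/writer flag. A hypothetical reader-side caller, sketched on the assumption that spa_config_enter()/spa_config_exit() accept the same lock set:

/*
 * Hypothetical example (not part of the change): take only the vdev
 * topology lock for reading, verify it with spa_config_held(), and
 * release it on the way out.
 */
static uint64_t
example_count_children(spa_t *spa)
{
	uint64_t children;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) != 0);
	children = spa->spa_root_vdev->vdev_children;
	spa_config_exit(spa, SCL_VDEV, FTAG);

	return (children);
}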
@@ -374,10 +351,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, { vdev_ops_t *ops; char *type; - uint64_t guid = 0; + uint64_t guid = 0, islog, nparity; vdev_t *vd; - ASSERT(spa_config_held(spa, RW_WRITER)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (EINVAL); @@ -401,6 +378,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (EINVAL); + } else if (alloctype == VDEV_ALLOC_L2CACHE) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (EINVAL); } /* @@ -409,47 +389,61 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (EINVAL); - vd = vdev_alloc_common(spa, id, guid, ops); - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) - vd->vdev_path = spa_strdup(vd->vdev_path); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) - vd->vdev_devid = spa_strdup(vd->vdev_devid); + /* + * Determine whether we're a log vdev. + */ + islog = 0; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); + if (islog && spa_version(spa) < SPA_VERSION_SLOGS) + return (ENOTSUP); /* - * Set the nparity propery for RAID-Z vdevs. + * Set the nparity property for RAID-Z vdevs. */ + nparity = -1ULL; if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, - &vd->vdev_nparity) == 0) { + &nparity) == 0) { /* * Currently, we can only support 2 parity devices. */ - if (vd->vdev_nparity > 2) + if (nparity == 0 || nparity > 2) return (EINVAL); /* * Older versions can only support 1 parity device. */ - if (vd->vdev_nparity == 2 && - spa_version(spa) < ZFS_VERSION_RAID6) + if (nparity == 2 && + spa_version(spa) < SPA_VERSION_RAID6) return (ENOTSUP); - } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ - if (spa_version(spa) >= ZFS_VERSION_RAID6) + if (spa_version(spa) >= SPA_VERSION_RAID6) return (EINVAL); - /* * Otherwise, we default to 1 parity device for RAID-Z. */ - vd->vdev_nparity = 1; + nparity = 1; } } else { - vd->vdev_nparity = 0; + nparity = 0; } + ASSERT(nparity != -1ULL); + + vd = vdev_alloc_common(spa, id, guid, ops); + + vd->vdev_islog = islog; + vd->vdev_nparity = nparity; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) + vd->vdev_path = spa_strdup(vd->vdev_path); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) + vd->vdev_devid = spa_strdup(vd->vdev_devid); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, + &vd->vdev_physpath) == 0) + vd->vdev_physpath = spa_strdup(vd->vdev_physpath); /* * Set the whole_disk property. If it's not specified, leave the value @@ -463,8 +457,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &vd->vdev_not_present); + if (!spa->spa_import_faulted) + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &vd->vdev_not_present); /* * Get the alignment requirement. 
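The RAID-Z parity handling above reduces to a small rule set; restated here as a hypothetical stand-alone helper (example_parse_parity is illustration only, not part of the change):

/*
 * Restates the rules encoded in vdev_alloc() above: an explicit
 * nparity must be 1 or 2, nparity == 2 requires SPA_VERSION_RAID6,
 * pools at or past SPA_VERSION_RAID6 must state parity explicitly,
 * and older pools default to single parity.
 */
static int
example_parse_parity(nvlist_t *nv, uint64_t version, uint64_t *nparity)
{
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity) == 0) {
		if (*nparity == 0 || *nparity > 2)
			return (EINVAL);
		if (*nparity == 2 && version < SPA_VERSION_RAID6)
			return (ENOTSUP);
	} else {
		if (version >= SPA_VERSION_RAID6)
			return (EINVAL);
		*nparity = 1;
	}
	return (0);
}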
@@ -484,13 +479,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } /* - * If we're a leaf vdev, try to load the DTL object and offline state. + * If we're a leaf vdev, try to load the DTL object and other state. */ - if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) { - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, - &vd->vdev_dtl.smo_object); + if (vd->vdev_ops->vdev_op_leaf && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { + if (alloctype == VDEV_ALLOC_LOAD) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, + &vd->vdev_dtl.smo_object); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, + &vd->vdev_unspare); + } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); + + /* + * When importing a pool, we want to ignore the persistent fault + * state, as the diagnosis made on another system may not be + * valid in the current context. + */ + if (spa->spa_load_state == SPA_LOAD_OPEN) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, + &vd->vdev_faulted); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, + &vd->vdev_degraded); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, + &vd->vdev_removed); + } } /* @@ -507,6 +521,7 @@ void vdev_free(vdev_t *vd) { int c; + spa_t *spa = vd->vdev_spa; /* * vdev_free() implies closing the vdev first. This is simpler than @@ -514,7 +529,7 @@ vdev_free(vdev_t *vd) */ vdev_close(vd); - ASSERT(!list_link_active(&vd->vdev_dirty_node)); + ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); /* * Free all children. @@ -542,7 +557,40 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_parent == NULL); - vdev_free_common(vd); + /* + * Clean up vdev structure. + */ + vdev_queue_fini(vd); + vdev_cache_fini(vd); + + if (vd->vdev_path) + spa_strfree(vd->vdev_path); + if (vd->vdev_devid) + spa_strfree(vd->vdev_devid); + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + + if (vd->vdev_isspare) + spa_spare_remove(vd); + if (vd->vdev_isl2cache) + spa_l2cache_remove(vd); + + txg_list_destroy(&vd->vdev_ms_list); + txg_list_destroy(&vd->vdev_dtl_list); + mutex_enter(&vd->vdev_dtl_lock); + space_map_unload(&vd->vdev_dtl_map); + space_map_destroy(&vd->vdev_dtl_map); + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + space_map_destroy(&vd->vdev_dtl_scrub); + mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_stat_lock); + mutex_destroy(&vd->vdev_probe_lock); + + if (vd == spa->spa_root_vdev) + spa->spa_root_vdev = NULL; + + kmem_free(vd, sizeof (vdev_t)); } /* @@ -592,16 +640,21 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } - if (list_link_active(&svd->vdev_dirty_node)) { + if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } - tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted; - svd->vdev_reopen_wanted = 0; + if (list_link_active(&svd->vdev_state_dirty_node)) { + vdev_state_clean(svd); + vdev_state_dirty(tvd); + } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; + + tvd->vdev_islog = svd->vdev_islog; + svd->vdev_islog = 0; } static void @@ -628,7 +681,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; - ASSERT(spa_config_held(spa, RW_WRITER)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); @@ -657,7 +710,7 @@ vdev_remove_parent(vdev_t *cvd) vdev_t *mvd = 
cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; - ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); + ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || @@ -667,22 +720,16 @@ vdev_remove_parent(vdev_t *cvd) vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); - cvd->vdev_id = mvd->vdev_id; - vdev_add_child(pvd, cvd); /* - * If we created a new toplevel vdev, then we need to change the child's - * vdev GUID to match the old toplevel vdev. Otherwise, we could have - * detached an offline device, and when we go to import the pool we'll - * think we have two toplevel vdevs, instead of a different version of - * the same toplevel vdev. + * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. + * Otherwise, we could have detached an offline device, and when we + * go to import the pool we'll think we have two top-level vdevs, + * instead of a different version of the same top-level vdev. */ - if (cvd->vdev_top == cvd) { - pvd->vdev_guid_sum -= cvd->vdev_guid; - cvd->vdev_guid_sum -= cvd->vdev_guid; - cvd->vdev_guid = mvd->vdev_guid; - cvd->vdev_guid_sum += mvd->vdev_guid; - pvd->vdev_guid_sum += cvd->vdev_guid; - } + if (mvd->vdev_top == mvd) + cvd->vdev_guid = cvd->vdev_guid_sum = mvd->vdev_guid; + cvd->vdev_id = mvd->vdev_id; + vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) @@ -697,7 +744,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; - metaslab_class_t *mc = spa_metaslab_class_select(spa); + metaslab_class_t *mc; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; @@ -707,10 +754,13 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ return (0); - dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc); - ASSERT(oldc <= newc); + if (vd->vdev_islog) + mc = spa->spa_log_class; + else + mc = spa->spa_normal_class; + if (vd->vdev_mg == NULL) vd->vdev_mg = metaslab_group_create(mc, vd); @@ -737,8 +787,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) error = dmu_bonus_hold(mos, object, FTAG, &db); if (error) return (error); - ASSERT3U(db->db_size, ==, sizeof (smo)); - bcopy(db->db_data, &smo, db->db_size); + ASSERT3U(db->db_size, >=, sizeof (smo)); + bcopy(db->db_data, &smo, sizeof (smo)); ASSERT3U(smo.smo_object, ==, object); dmu_buf_rele(db, FTAG); } @@ -765,6 +815,112 @@ vdev_metaslab_fini(vdev_t *vd) } } +typedef struct vdev_probe_stats { + boolean_t vps_readable; + boolean_t vps_writeable; + int vps_flags; + zio_t *vps_root; + vdev_t *vps_vd; +} vdev_probe_stats_t; + +static void +vdev_probe_done(zio_t *zio) +{ + vdev_probe_stats_t *vps = zio->io_private; + vdev_t *vd = vps->vps_vd; + + if (zio->io_type == ZIO_TYPE_READ) { + ASSERT(zio->io_vd == vd); + if (zio->io_error == 0) + vps->vps_readable = 1; + if (zio->io_error == 0 && (spa_mode & FWRITE)) { + zio_nowait(zio_write_phys(vps->vps_root, vd, + zio->io_offset, zio->io_size, zio->io_data, + ZIO_CHECKSUM_OFF, vdev_probe_done, vps, + ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); + } else { + zio_buf_free(zio->io_data, zio->io_size); + } + } else if (zio->io_type == ZIO_TYPE_WRITE) { + ASSERT(zio->io_vd == vd); + if (zio->io_error == 0) + vps->vps_writeable = 1; + zio_buf_free(zio->io_data, zio->io_size); + } else if (zio->io_type == ZIO_TYPE_NULL) { + ASSERT(zio->io_vd == NULL); + 
ASSERT(zio == vps->vps_root); + + vd->vdev_cant_read |= !vps->vps_readable; + vd->vdev_cant_write |= !vps->vps_writeable; + + if (vdev_readable(vd) && + (vdev_writeable(vd) || !(spa_mode & FWRITE))) { + zio->io_error = 0; + } else { + ASSERT(zio->io_error != 0); + zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, + zio->io_spa, vd, NULL, 0, 0); + zio->io_error = ENXIO; + } + kmem_free(vps, sizeof (*vps)); + } +} + +/* + * Determine whether this device is accessible by reading and writing + * to several known locations: the pad regions of each vdev label + * but the first (which we leave alone in case it contains a VTOC). + */ +zio_t * +vdev_probe(vdev_t *vd, zio_t *pio) +{ + spa_t *spa = vd->vdev_spa; + vdev_probe_stats_t *vps; + zio_t *zio; + + vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + + vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_DONT_RETRY; + + if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { + /* + * vdev_cant_read and vdev_cant_write can only transition + * from TRUE to FALSE when we have the SCL_ZIO lock as writer; + * otherwise they can only transition from FALSE to TRUE. + * This ensures that any zio looking at these values can + * assume that failures persist for the life of the I/O. + * That's important because when a device has intermittent + * connectivity problems, we want to ensure that they're + * ascribed to the device (ENXIO) and not the zio (EIO). + * + * Since we hold SCL_ZIO as writer here, clear both values + * so the probe can reevaluate from first principles. + */ + vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + } + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + zio = zio_null(pio, spa, vdev_probe_done, vps, vps->vps_flags); + + vps->vps_root = zio; + vps->vps_vd = vd; + + for (int l = 1; l < VDEV_LABELS; l++) { + zio_nowait(zio_read_phys(zio, vd, + vdev_label_offset(vd->vdev_psize, l, + offsetof(vdev_label_t, vl_pad)), + VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE), + ZIO_CHECKSUM_OFF, vdev_probe_done, vps, + ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); + } + + return (zio); +} + /* * Prepare a virtual device for access. 
*/ @@ -781,20 +937,14 @@ vdev_open(vdev_t *vd) vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); - if (vd->vdev_fault_mode == VDEV_FAULT_COUNT) - vd->vdev_fault_arg >>= 1; - else - vd->vdev_fault_mode = VDEV_FAULT_NONE; - vd->vdev_stat.vs_aux = VDEV_AUX_NONE; - if (vd->vdev_ops->vdev_op_leaf) { - vdev_cache_init(vd); - vdev_queue_init(vd); - vd->vdev_cache_active = B_TRUE; - } - - if (vd->vdev_offline) { + if (!vd->vdev_removed && vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); + return (ENXIO); + } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (ENXIO); @@ -805,16 +955,25 @@ vdev_open(vdev_t *vd) if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, ENXIO); - dprintf("%s = %d, osize %llu, state = %d\n", - vdev_description(vd), error, osize, vd->vdev_state); - if (error) { + if (vd->vdev_removed && + vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) + vd->vdev_removed = B_FALSE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); return (error); } - vd->vdev_state = VDEV_STATE_HEALTHY; + vd->vdev_removed = B_FALSE; + + if (vd->vdev_degraded) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } else { + vd->vdev_state = VDEV_STATE_HEALTHY; + } for (c = 0; c < vd->vdev_children; c++) if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { @@ -883,6 +1042,17 @@ vdev_open(vdev_t *vd) } /* + * Ensure we can issue some IO before declaring the + * vdev open for business. + */ + if (vd->vdev_ops->vdev_op_leaf && + (error = zio_wait(vdev_probe(vd, NULL))) != 0) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_IO_FAILURE); + return (error); + } + + /* * If this is a top-level vdev, compute the raidz-deflation * ratio. Note, we hard-code in 128k (1<<17) because it is the * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE @@ -895,16 +1065,17 @@ vdev_open(vdev_t *vd) } /* - * This allows the ZFS DE to close cases appropriately. If a device - * goes away and later returns, we want to close the associated case. - * But it's not enough to simply post this only when a device goes from - * CANT_OPEN -> HEALTHY. If we reboot the system and the device is - * back, we also need to close the case (otherwise we will try to replay - * it). So we have to post this notifier every time. Since this only - * occurs during pool open or error recovery, this should not be an - * issue. + * If a leaf vdev has a DTL, and seems healthy, then kick off a + * resilver. But don't do this if we are doing a reopen for a + * scrub, since this would just restart the scrub we are already + * doing. */ - zfs_post_ok(vd->vdev_spa, vd); + if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) { + mutex_enter(&vd->vdev_dtl_lock); + if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) + spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER); + mutex_exit(&vd->vdev_dtl_lock); + } return (0); } @@ -912,8 +1083,7 @@ vdev_open(vdev_t *vd) /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't - * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen() - * won't succeed if the device has been changed underneath. + * inadvertently do repair I/Os to the wrong device. 
* * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if @@ -926,7 +1096,7 @@ vdev_validate(vdev_t *vd) spa_t *spa = vd->vdev_spa; int c; nvlist_t *label; - uint64_t guid; + uint64_t guid, top_guid; uint64_t state; for (c = 0; c < vd->vdev_children; c++) @@ -938,7 +1108,7 @@ vdev_validate(vdev_t *vd) * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ - if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) { + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { if ((label = vdev_label_read_config(vd)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, @@ -954,8 +1124,20 @@ vdev_validate(vdev_t *vd) return (0); } + /* + * If this vdev just became a top-level vdev because its + * sibling was detached, it will have adopted the parent's + * vdev guid -- but the label may or may not be on disk yet. + * Fortunately, either version of the label will have the + * same top guid, so if we're a top-level vdev, we can + * safely compare to that instead. + */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, - &guid) != 0 || guid != vd->vdev_guid) { + &guid) != 0 || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, + &top_guid) != 0 || + (vd->vdev_guid != guid && + (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); @@ -975,14 +1157,15 @@ vdev_validate(vdev_t *vd) if (spa->spa_load_state == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (EBADF); - } - /* - * If we were able to open and validate a vdev that was previously - * marked permanently unavailable, clear that state now. - */ - if (vd->vdev_not_present) - vd->vdev_not_present = 0; + /* + * If we were able to open and validate a vdev that was + * previously marked permanently unavailable, clear that state + * now. + */ + if (vd->vdev_not_present) + vd->vdev_not_present = 0; + } return (0); } @@ -995,11 +1178,7 @@ vdev_close(vdev_t *vd) { vd->vdev_ops->vdev_op_close(vd); - if (vd->vdev_cache_active) { - vdev_cache_fini(vd); - vdev_queue_fini(vd); - vd->vdev_cache_active = B_FALSE; - } + vdev_cache_purge(vd); /* * We record the previous state before we close it, so that if we are @@ -1020,7 +1199,7 @@ vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - ASSERT(spa_config_held(spa, RW_WRITER)); + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); vdev_close(vd); (void) vdev_open(vd); @@ -1029,22 +1208,24 @@ vdev_reopen(vdev_t *vd) * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). - * - * The downside to this is that if the user is simply experimenting by - * overwriting an entire disk, we'll fault the device rather than - * demonstrate self-healing capabilities. On the other hand, with - * proper FMA integration, the series of errors we'd see from the device - * would result in a faulted device anyway. Given that this doesn't - * model any real-world corruption, it's better to catch this here and - * correctly identify that the device has either changed beneath us, or - * is corrupted beyond recognition. 
*/ - (void) vdev_validate(vd); + if (vd->vdev_aux) { + (void) vdev_validate_aux(vd); + if (vdev_readable(vd) && vdev_writeable(vd) && + !l2arc_vdev_present(vd)) { + uint64_t size = vdev_get_rsize(vd); + l2arc_add_vdev(spa, vd, + VDEV_LABEL_START_SIZE, + size - VDEV_LABEL_START_SIZE); + } + } else { + (void) vdev_validate(vd); + } /* - * Reassess root vdev's health. + * Reassess parent vdev's health. */ - vdev_propagate_state(spa->spa_root_vdev); + vdev_propagate_state(vd); } int @@ -1150,22 +1331,27 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) spa_t *spa = vd->vdev_spa; int c; - ASSERT(spa_config_held(spa, RW_WRITER)); + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); - /* - * We're successfully scrubbed everything up to scrub_txg. - * Therefore, excise all old DTLs up to that point, then - * fold in the DTLs for everything we couldn't scrub. - */ - if (scrub_txg != 0) { + if (scrub_txg != 0 && + (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { + /* XXX should check scrub_done? */ + /* + * We completed a scrub up to scrub_txg. If we + * did it without rebooting, then the scrub dtl + * will be valid, so excise the old region and + * fold in the scrub dtl. Otherwise, leave the + * dtl as-is if there was an error. + */ space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); } if (scrub_done) space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); mutex_exit(&vd->vdev_dtl_lock); + if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; @@ -1212,8 +1398,8 @@ vdev_dtl_load(vdev_t *vd) if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) return (error); - ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(db->db_data, smo, db->db_size); + ASSERT3U(db->db_size, >=, sizeof (*smo)); + bcopy(db->db_data, smo, sizeof (*smo)); dmu_buf_rele(db, FTAG); mutex_enter(&vd->vdev_dtl_lock); @@ -1235,9 +1421,6 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) dmu_buf_t *db; dmu_tx_t *tx; - dprintf("%s in txg %llu pass %d\n", - vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached) { @@ -1247,8 +1430,6 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) smo->smo_object = 0; } dmu_tx_commit(tx); - dprintf("detach %s committed in txg %llu\n", - vdev_description(vd), txg); return; } @@ -1283,13 +1464,56 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(smo, db->db_data, db->db_size); + ASSERT3U(db->db_size, >=, sizeof (*smo)); + bcopy(smo, db->db_data, sizeof (*smo)); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); } +/* + * Determine if resilver is needed, and if so the txg range. 
+ */ +boolean_t +vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) +{ + boolean_t needed = B_FALSE; + uint64_t thismin = UINT64_MAX; + uint64_t thismax = 0; + + if (vd->vdev_children == 0) { + mutex_enter(&vd->vdev_dtl_lock); + if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) { + space_seg_t *ss; + + ss = avl_first(&vd->vdev_dtl_map.sm_root); + thismin = ss->ss_start - 1; + ss = avl_last(&vd->vdev_dtl_map.sm_root); + thismax = ss->ss_end; + needed = B_TRUE; + } + mutex_exit(&vd->vdev_dtl_lock); + } else { + int c; + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + uint64_t cmin, cmax; + + if (vdev_resilver_needed(cvd, &cmin, &cmax)) { + thismin = MIN(thismin, cmin); + thismax = MAX(thismax, cmax); + needed = B_TRUE; + } + } + } + + if (needed && minp) { + *minp = thismin; + *maxp = thismax; + } + return (needed); +} + void vdev_load(vdev_t *vd) { @@ -1319,19 +1543,22 @@ } /* - * This special case of vdev_spare() is used for hot spares. It's sole purpose - * it to set the vdev state for the associated vdev. To do this, we make sure - * that we can open the underlying device, then try to read the label, and make - * sure that the label is sane and that it hasn't been repurposed to another - * pool. + * The special vdev case is used for hot spares and l2cache devices. Its + * sole purpose is to set the vdev state for the associated vdev. To do this, + * we make sure that we can open the underlying device, then try to read the + * label, and make sure that the label is sane and that it hasn't been + * repurposed to another pool. */ int -vdev_validate_spare(vdev_t *vd) +vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; + if (!vdev_readable(vd)) + return (0); + if ((label = vdev_label_read_config(vd)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -1339,7 +1566,7 @@ } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || - version > ZFS_VERSION || + version > SPA_VERSION || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { @@ -1349,8 +1576,6 @@ return (-1); } - spa_spare_add(vd); - /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. @@ -1364,8 +1589,6 @@ vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; - dprintf("%s txg %llu\n", vdev_description(vd), txg); - while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) metaslab_sync_done(msp, txg); } @@ -1378,9 +1601,6 @@ vdev_sync(vdev_t *vd, uint64_t txg) metaslab_t *msp; dmu_tx_t *tx; - dprintf("%s txg %llu pass %d\n", - vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); - if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { ASSERT(vd == vd->vdev_top); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -1408,81 +1628,139 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize) return (vd->vdev_ops->vdev_op_asize(vd, psize)); } -void -vdev_io_start(zio_t *zio) +/* + * Mark the given vdev faulted. A faulted vdev behaves as if the device could + * not be opened, and no I/O is attempted.
+ */ +int +vdev_fault(spa_t *spa, uint64_t guid) { - zio->io_vd->vdev_ops->vdev_op_io_start(zio); -} + vdev_t *vd; -void -vdev_io_done(zio_t *zio) -{ - zio->io_vd->vdev_ops->vdev_op_io_done(zio); + spa_vdev_state_enter(spa); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + /* + * Faulted state takes precedence over degraded. + */ + vd->vdev_faulted = 1ULL; + vd->vdev_degraded = 0ULL; + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); + + /* + * If marking the vdev as faulted causes the top-level vdev to become + * unavailable, then back off and simply mark the vdev as degraded + * instead. + */ + if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { + vd->vdev_degraded = 1ULL; + vd->vdev_faulted = 0ULL; + + /* + * If we reopen the device and it's not dead, only then do we + * mark it degraded. + */ + vdev_reopen(vd); + + if (vdev_readable(vd)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } + } + + return (spa_vdev_state_exit(spa, vd, 0)); } -const char * -vdev_description(vdev_t *vd) +/* + * Mark the given vdev degraded. A degraded vdev is purely an indication to the + * user that something is wrong. The vdev continues to operate as normal as far + * as I/O is concerned. + */ +int +vdev_degrade(spa_t *spa, uint64_t guid) { - if (vd == NULL || vd->vdev_ops == NULL) - return ("<unknown>"); + vdev_t *vd; + + spa_vdev_state_enter(spa); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); - if (vd->vdev_path != NULL) - return (vd->vdev_path); + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + /* + * If the vdev is already faulted, then don't do anything. + */ + if (vd->vdev_faulted || vd->vdev_degraded) + return (spa_vdev_state_exit(spa, NULL, 0)); - if (vd->vdev_parent == NULL) - return (spa_name(vd->vdev_spa)); + vd->vdev_degraded = 1ULL; + if (!vdev_is_dead(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); - return (vd->vdev_ops->vdev_op_type); + return (spa_vdev_state_exit(spa, vd, 0)); } +/* + * Online the given vdev. If 'unspare' is set, it implies two things. First, + * any attached spare device should be detached when the device finishes + * resilvering. Second, the online should be treated like a 'test' online case, + * so no FMA events are generated if the device fails to open.
+ */ int -vdev_online(spa_t *spa, uint64_t guid) +vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { - vdev_t *rvd, *vd; - uint64_t txg; - - txg = spa_vdev_enter(spa); + vdev_t *vd; - rvd = spa->spa_root_vdev; + spa_vdev_state_enter(spa); - if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) - return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - dprintf("ONLINE: %s\n", vdev_description(vd)); + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; + vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); + vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); vdev_reopen(vd->vdev_top); + vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; - vdev_config_dirty(vd->vdev_top); + if (newstate) + *newstate = vd->vdev_state; + if ((flags & ZFS_ONLINE_UNSPARE) && + !vdev_is_dead(vd) && vd->vdev_parent && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; - (void) spa_vdev_exit(spa, NULL, txg, 0); + (void) spa_vdev_state_exit(spa, vd, 0); - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); return (0); } int -vdev_offline(spa_t *spa, uint64_t guid, int istmp) +vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { - vdev_t *rvd, *vd; - uint64_t txg; - - txg = spa_vdev_enter(spa); + vdev_t *vd; - rvd = spa->spa_root_vdev; + spa_vdev_state_enter(spa); - if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) - return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - dprintf("OFFLINE: %s\n", vdev_description(vd)); + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); /* * If the device isn't already offline, try to offline it. @@ -1496,7 +1774,7 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp) * as long as the remaining devices don't have any DTL holes. */ if (vd->vdev_top->vdev_dtl_map.sm_space != 0) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* * Offline this device and reopen its top-level vdev. 
@@ -1505,18 +1783,16 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp) */ vd->vdev_offline = B_TRUE; vdev_reopen(vd->vdev_top); - if (vdev_is_dead(vd->vdev_top)) { + if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { vd->vdev_offline = B_FALSE; vdev_reopen(vd->vdev_top); - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + return (spa_vdev_state_exit(spa, NULL, EBUSY)); } } - vd->vdev_tmpoffline = istmp; + vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); - vdev_config_dirty(vd->vdev_top); - - return (spa_vdev_exit(spa, NULL, txg, 0)); + return (spa_vdev_state_exit(spa, vd, 0)); } /* @@ -1527,56 +1803,78 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp) void vdev_clear(spa_t *spa, vdev_t *vd) { - int c; + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) - vd = spa->spa_root_vdev; + vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); + + /* + * If we're in the FAULTED state or have experienced failed I/O, then + * clear the persistent state and attempt to reopen the device. We + * also mark the vdev config dirty, so that the new faulted state is + * written out to disk. + */ + if (vd->vdev_faulted || vd->vdev_degraded || + !vdev_readable(vd) || !vdev_writeable(vd)) { + + vd->vdev_faulted = vd->vdev_degraded = 0; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + + vdev_reopen(vd); + + if (vd != rvd) + vdev_state_dirty(vd->vdev_top); + + if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) + spa_async_request(spa, SPA_ASYNC_RESILVER); + + spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); + } } -int +boolean_t vdev_is_dead(vdev_t *vd) { - return (vd->vdev_state <= VDEV_STATE_CANT_OPEN); + return (vd->vdev_state < VDEV_STATE_DEGRADED); } -int -vdev_error_inject(vdev_t *vd, zio_t *zio) +boolean_t +vdev_readable(vdev_t *vd) +{ + return (!vdev_is_dead(vd) && !vd->vdev_cant_read); +} + +boolean_t +vdev_writeable(vdev_t *vd) { - int error = 0; + return (!vdev_is_dead(vd) && !vd->vdev_cant_write); +} - if (vd->vdev_fault_mode == VDEV_FAULT_NONE) - return (0); +boolean_t +vdev_accessible(vdev_t *vd, zio_t *zio) +{ + ASSERT(zio->io_vd == vd); - if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0) - return (0); + if (vdev_is_dead(vd) || vd->vdev_remove_wanted) + return (B_FALSE); - switch (vd->vdev_fault_mode) { - case VDEV_FAULT_RANDOM: - if (spa_get_random(vd->vdev_fault_arg) == 0) - error = EIO; - break; - - case VDEV_FAULT_COUNT: - if ((int64_t)--vd->vdev_fault_arg <= 0) - vd->vdev_fault_mode = VDEV_FAULT_NONE; - error = EIO; - break; - } + if (zio->io_type == ZIO_TYPE_READ) + return (!vd->vdev_cant_read); - if (error != 0) { - dprintf("returning %d for type %d on %s state %d offset %llx\n", - error, zio->io_type, vdev_description(vd), - vd->vdev_state, zio->io_offset); - } + if (zio->io_type == ZIO_TYPE_WRITE) + return (!vd->vdev_cant_write); - return (error); + return (B_TRUE); } /* @@ -1586,10 +1884,10 @@ void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { vdev_t *rvd = vd->vdev_spa->spa_root_vdev; - int c, t; mutex_enter(&vd->vdev_stat_lock); bcopy(&vd->vdev_stat, vs, sizeof (*vs)); + vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_rsize(vd); @@ -1600,49 +1898,80 @@ vdev_get_stats(vdev_t *vd, 
vdev_stat_t *vs) * over all top-level vdevs (i.e. the direct children of the root). */ if (vd == rvd) { - for (c = 0; c < rvd->vdev_children; c++) { + for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *cvd = rvd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; mutex_enter(&vd->vdev_stat_lock); - for (t = 0; t < ZIO_TYPES; t++) { + for (int t = 0; t < ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } - vs->vs_read_errors += cvs->vs_read_errors; - vs->vs_write_errors += cvs->vs_write_errors; - vs->vs_checksum_errors += cvs->vs_checksum_errors; vs->vs_scrub_examined += cvs->vs_scrub_examined; - vs->vs_scrub_errors += cvs->vs_scrub_errors; mutex_exit(&vd->vdev_stat_lock); } } } void -vdev_stat_update(zio_t *zio) +vdev_clear_stats(vdev_t *vd) { - vdev_t *vd = zio->io_vd; + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_space = 0; + vd->vdev_stat.vs_dspace = 0; + vd->vdev_stat.vs_alloc = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +void +vdev_stat_update(zio_t *zio, uint64_t psize) +{ + vdev_t *rvd = zio->io_spa->spa_root_vdev; + vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; zio_type_t type = zio->io_type; int flags = zio->io_flags; + /* + * If this i/o is a gang leader, it didn't do any actual work. + */ + if (zio->io_gang_tree) + return; + if (zio->io_error == 0) { + /* + * If this is a root i/o, don't count it -- we've already + * counted the top-level vdevs, and vdev_get_stats() will + * aggregate them when asked. This reduces contention on + * the root vdev_stat_lock and implicitly handles blocks + * that compress away to holes, for which there is no i/o. + * (Holes never create vdev children, so all the counters + * remain zero, which is what we want.) + * + * Note: this only applies to successful i/o (io_error == 0) + * because unlike i/o counts, errors are not additive. + * When reading a ditto block, for example, failure of + * one top-level vdev does not imply a root-level error. 
+ */ + if (vd == rvd) + return; + + ASSERT(vd == zio->io_vd); if (!(flags & ZIO_FLAG_IO_BYPASS)) { mutex_enter(&vd->vdev_stat_lock); vs->vs_ops[type]++; - vs->vs_bytes[type] += zio->io_size; + vs->vs_bytes[type] += psize; mutex_exit(&vd->vdev_stat_lock); } - if ((flags & ZIO_FLAG_IO_REPAIR) && - zio->io_delegate_list == NULL) { + if (flags & ZIO_FLAG_IO_REPAIR) { + ASSERT(zio->io_delegate_list == NULL); mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_SCRUB_THREAD) - vs->vs_scrub_repaired += zio->io_size; + vs->vs_scrub_repaired += psize; else - vs->vs_self_healed += zio->io_size; + vs->vs_self_healed += psize; mutex_exit(&vd->vdev_stat_lock); } return; @@ -1651,22 +1980,18 @@ vdev_stat_update(zio_t *zio) if (flags & ZIO_FLAG_SPECULATIVE) return; - if (!vdev_is_dead(vd)) { - mutex_enter(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_READ) { - if (zio->io_error == ECKSUM) - vs->vs_checksum_errors++; - else - vs->vs_read_errors++; - } - if (type == ZIO_TYPE_WRITE) - vs->vs_write_errors++; - mutex_exit(&vd->vdev_stat_lock); + mutex_enter(&vd->vdev_stat_lock); + if (type == ZIO_TYPE_READ) { + if (zio->io_error == ECKSUM) + vs->vs_checksum_errors++; + else + vs->vs_read_errors++; } + if (type == ZIO_TYPE_WRITE) + vs->vs_write_errors++; + mutex_exit(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_WRITE) { - if (txg == 0 || vd->vdev_children != 0) - return; + if (type == ZIO_TYPE_WRITE && txg != 0 && vd->vdev_children == 0) { if (flags & ZIO_FLAG_SCRUB_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) @@ -1705,7 +2030,6 @@ vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) vs->vs_scrub_complete = 0; vs->vs_scrub_examined = 0; vs->vs_scrub_repaired = 0; - vs->vs_scrub_errors = 0; vs->vs_scrub_start = gethrestime_sec(); vs->vs_scrub_end = 0; } @@ -1717,33 +2041,48 @@ vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) * Update the in-core space usage stats for this vdev and the root vdev. */ void -vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta) +vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, + boolean_t update_root) { - ASSERT(vd == vd->vdev_top); int64_t dspace_delta = space_delta; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; - do { - if (vd->vdev_ms_count) { - /* - * If this is a top-level vdev, apply the - * inverse of its psize-to-asize (ie. RAID-Z) - * space-expansion factor. We must calculate - * this here and not at the root vdev because - * the root vdev's psize-to-asize is simply the - * max of its childrens', thus not accurate - * enough for us. - */ - ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); - dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * - vd->vdev_deflate_ratio; - } + ASSERT(vd == vd->vdev_top); + + /* + * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion + * factor. We must calculate this here and not at the root vdev + * because the root vdev's psize-to-asize is simply the max of its + * children's, thus not accurate enough for us.
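
(Editor's worked example of the deflation arithmetic performed just below; the ratio value is illustrative and assumes vdev_deflate_ratio is derived from vdev_psize_to_asize() at open time.)

    /*
     * With SPA_MINBLOCKSHIFT == 9 (512-byte units):
     *
     *     dspace_delta = (space_delta >> 9) * vdev_deflate_ratio
     *
     * e.g. on a 3-disk single-parity RAID-Z, asize is roughly 3/2 of
     * psize, so the ratio is about 341 (512 * 2/3); an allocated delta
     * of 384K then deflates to (393216 >> 9) * 341 = 261888 bytes,
     * i.e. roughly the 256K of data actually stored.
     */
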
+ */ + ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); + dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * + vd->vdev_deflate_ratio; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_space += space_delta; + vd->vdev_stat.vs_alloc += alloc_delta; + vd->vdev_stat.vs_dspace += dspace_delta; + mutex_exit(&vd->vdev_stat_lock); + + if (update_root) { + ASSERT(rvd == vd->vdev_parent); + ASSERT(vd->vdev_ms_count != 0); + + /* + * Don't count non-normal (e.g. intent log) space as part of + * the pool's capacity. + */ + if (vd->vdev_mg->mg_class != spa->spa_normal_class) + return; - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_space += space_delta; - vd->vdev_stat.vs_alloc += alloc_delta; - vd->vdev_stat.vs_dspace += dspace_delta; - mutex_exit(&vd->vdev_stat_lock); - } while ((vd = vd->vdev_parent) != NULL); + mutex_enter(&rvd->vdev_stat_lock); + rvd->vdev_stat.vs_space += space_delta; + rvd->vdev_stat.vs_alloc += alloc_delta; + rvd->vdev_stat.vs_dspace += dspace_delta; + mutex_exit(&rvd->vdev_stat_lock); + } } /* @@ -1759,13 +2098,53 @@ vdev_config_dirty(vdev_t *vd) int c; /* - * The dirty list is protected by the config lock. The caller must - * either hold the config lock as writer, or must be the sync thread - * (which holds the lock as reader). There's only one sync thread, + * If this is an aux vdev (as with l2cache devices), then we update the + * vdev config manually and set the sync flag. + */ + if (vd->vdev_aux != NULL) { + spa_aux_vdev_t *sav = vd->vdev_aux; + nvlist_t **aux; + uint_t naux; + + for (c = 0; c < sav->sav_count; c++) { + if (sav->sav_vdevs[c] == vd) + break; + } + + if (c == sav->sav_count) { + /* + * We're being removed. There's nothing more to do. + */ + ASSERT(sav->sav_sync == B_TRUE); + return; + } + + sav->sav_sync = B_TRUE; + + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0); + + ASSERT(c < naux); + + /* + * Setting the nvlist in the middle of the array is a little + * sketchy, but it will work. + */ + nvlist_free(aux[c]); + aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE); + + return; + } + + /* + * The dirty list is protected by the SCL_CONFIG lock. The caller + * must either hold SCL_CONFIG as writer, or must be the sync thread + * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ - ASSERT(spa_config_held(spa, RW_WRITER) || - dsl_pool_sync_context(spa_get_dsl(spa))); + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) @@ -1773,8 +2152,8 @@ vdev_config_dirty(vdev_t *vd) } else { ASSERT(vd == vd->vdev_top); - if (!list_link_active(&vd->vdev_dirty_node)) - list_insert_head(&spa->spa_dirty_list, vd); + if (!list_link_active(&vd->vdev_config_dirty_node)) + list_insert_head(&spa->spa_config_dirty_list, vd); } } @@ -1783,14 +2162,58 @@ vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - ASSERT(spa_config_held(spa, RW_WRITER) || - dsl_pool_sync_context(spa_get_dsl(spa))); + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_CONFIG, RW_READER))); + + ASSERT(list_link_active(&vd->vdev_config_dirty_node)); + list_remove(&spa->spa_config_dirty_list, vd); +} + +/* + * Mark a top-level vdev's state as dirty, so that the next pass of + * spa_sync() can convert this into vdev_config_dirty().
We distinguish + * the state changes from larger config changes because they require + * much less locking, and are often needed for administrative actions. + */ +void +vdev_state_dirty(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(vd == vd->vdev_top); + + /* + * The state list is protected by the SCL_STATE lock. The caller + * must either hold SCL_STATE as writer, or must be the sync thread + * (which holds SCL_STATE as reader). There's only one sync thread, + * so this is sufficient to ensure mutual exclusion. + */ + ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_STATE, RW_READER))); - ASSERT(list_link_active(&vd->vdev_dirty_node)); - list_remove(&spa->spa_dirty_list, vd); + if (!list_link_active(&vd->vdev_state_dirty_node)) + list_insert_head(&spa->spa_state_dirty_list, vd); } void +vdev_state_clean(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_STATE, RW_READER))); + + ASSERT(list_link_active(&vd->vdev_state_dirty_node)); + list_remove(&spa->spa_state_dirty_list, vd); +} + +/* + * Propagate vdev state up from children to parent. + */ +void vdev_propagate_state(vdev_t *vd) { vdev_t *rvd = vd->vdev_spa->spa_root_vdev; @@ -1799,28 +2222,45 @@ vdev_propagate_state(vdev_t *vd) int c; vdev_t *child; - for (c = 0; c < vd->vdev_children; c++) { - child = vd->vdev_child[c]; - if (child->vdev_state <= VDEV_STATE_CANT_OPEN) - faulted++; - else if (child->vdev_state == VDEV_STATE_DEGRADED) - degraded++; - - if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) - corrupted++; - } + if (vd->vdev_children > 0) { + for (c = 0; c < vd->vdev_children; c++) { + child = vd->vdev_child[c]; + + if (!vdev_readable(child) || + (!vdev_writeable(child) && (spa_mode & FWRITE))) { + /* + * Root special: if there is a top-level log + * device, treat the root vdev as if it were + * degraded. + */ + if (child->vdev_islog && vd == rvd) + degraded++; + else + faulted++; + } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { + degraded++; + } - vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) + corrupted++; + } - /* - * Root special: if there is a toplevel vdev that cannot be - * opened due to corrupted metadata, then propagate the root - * vdev's aux state as 'corrupt' rather than 'insufficient - * replicas'. - */ - if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) - vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); + vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + + /* + * Root special: if there is a top-level vdev that cannot be + * opened due to corrupted metadata, then propagate the root + * vdev's aux state as 'corrupt' rather than 'insufficient + * replicas'. 
+ */ + if (corrupted && vd == rvd && + rvd->vdev_state == VDEV_STATE_CANT_OPEN) + vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + } + + if (vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); } /* @@ -1835,6 +2275,7 @@ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; + spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { vd->vdev_stat.vs_aux = aux; @@ -1857,14 +2298,36 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); - if (state == VDEV_STATE_CANT_OPEN) { + if (vd->vdev_removed && + state == VDEV_STATE_CANT_OPEN && + (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { + /* + * If the previous state is set to VDEV_STATE_REMOVED, then this + * device was previously marked removed and someone attempted to + * reopen it. If this failed due to a nonexistent device, then + * keep the device in the REMOVED state. We also let this be if + * it is one of our special test online cases, which is only + * attempting to online the device and shouldn't generate an FMA + * fault. + */ + vd->vdev_state = VDEV_STATE_REMOVED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + } else if (state == VDEV_STATE_REMOVED) { + /* + * Indicate to the ZFS DE that this device has been removed, and + * any recent errors should be ignored. + */ + zfs_post_remove(spa, vd); + vd->vdev_removed = B_TRUE; + } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import, we mark it as * "not available", which signifies that it was never there to * begin with. Failure to open such a device is not considered * an error. */ - if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT && + if (spa->spa_load_state == SPA_LOAD_IMPORT && + !spa->spa_import_faulted && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; @@ -1874,9 +2337,18 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. + * + * If the 'checkremove' flag is set, then this is an attempt to + * online the device in response to an insertion event. If we + * hit this case, then we have detected an insertion event for a + * faulted or offline device that wasn't in the removed state. + * In this scenario, we don't post an ereport because we are + * about to replace the device, or attempt an online with + * vdev_forcefault, which will generate the fault for us. 
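
(Editor's paraphrase: the suppression rules above collapse into one predicate, mirrored by the test that follows; the helper is hypothetical and shown only for clarity.)

    static boolean_t
    example_should_post_ereport(spa_t *spa, vdev_t *vd, vdev_state_t state)
    {
        /*
         * Post only for a real state change (or a forced fault), on a
         * device that was actually present, outside a checkremove
         * probe, and never for the root vdev itself.
         */
        return ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
            !vd->vdev_not_present && !vd->vdev_checkremove &&
            vd != spa->spa_root_vdev);
    }
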
*/ - if (vd->vdev_prevstate != state && !vd->vdev_not_present && - vd != vd->vdev_spa->spa_root_vdev) { + if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && + !vd->vdev_not_present && !vd->vdev_checkremove && + vd != spa->spa_root_vdev) { const char *class; switch (aux) { @@ -1898,18 +2370,54 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; + case VDEV_AUX_IO_FAILURE: + class = FM_EREPORT_ZFS_IO_FAILURE; + break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } - zfs_ereport_post(class, vd->vdev_spa, - vd, NULL, save_state, 0); + zfs_ereport_post(class, spa, vd, NULL, save_state, 0); } + + /* Erase any notion of persistent removed state */ + vd->vdev_removed = B_FALSE; + } else { + vd->vdev_removed = B_FALSE; } - if (isopen) - return; + if (!isopen) + vdev_propagate_state(vd); +} - if (vd->vdev_parent != NULL) - vdev_propagate_state(vd->vdev_parent); +/* + * Check the vdev configuration to ensure that it's capable of supporting + * a root pool. Currently, we do not support RAID-Z or partial configuration. + * In addition, only a single top-level vdev is allowed and none of the leaves + * can be wholedisks. + */ +boolean_t +vdev_is_bootable(vdev_t *vd) +{ + int c; + + if (!vd->vdev_ops->vdev_op_leaf) { + char *vdev_type = vd->vdev_ops->vdev_op_type; + + if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && + vd->vdev_children > 1) { + return (B_FALSE); + } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { + return (B_FALSE); + } + } else if (vd->vdev_wholedisk == 1) { + return (B_FALSE); + } + + for (c = 0; c < vd->vdev_children; c++) { + if (!vdev_is_bootable(vd->vdev_child[c])) + return (B_FALSE); + } + return (B_TRUE); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c index 4e419b678eb4..aa8f6f0e5a0f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c @@ -19,16 +19,15 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> #include <sys/zio.h> +#include <sys/kstat.h> /* * Virtual device read-ahead caching. @@ -36,15 +35,16 @@ * This file implements a simple LRU read-ahead cache. When the DMU reads * a given block, it will often want other, nearby blocks soon thereafter. * We take advantage of this by reading a larger disk region and caching - * the result. In the best case, this can turn 256 back-to-back 512-byte - * reads into a single 128k read followed by 255 cache hits; this reduces + * the result. In the best case, this can turn 128 back-to-back 512-byte + * reads into a single 64k read followed by 127 cache hits; this reduces * latency dramatically. In the worst case, it can turn an isolated 512-byte - * read into a 128k read, which doesn't affect latency all that much but is + * read into a 64k read, which doesn't affect latency all that much but is * terribly wasteful of bandwidth. A more intelligent version of the cache * could keep track of access patterns and not do read-ahead unless it sees - * at least two temporally close I/Os to the same region. 
It could also - take advantage of semantic information about the I/O. And it could use - something faster than an AVL tree; that was chosen solely for convenience. + at least two temporally close I/Os to the same region. Currently, only + metadata I/O is inflated. A further enhancement could take advantage of + more semantic information about the I/O. And it could use something + faster than an AVL tree; that was chosen solely for convenience. * * There are five cache operations: allocate, fill, read, write, evict. * @@ -69,13 +69,15 @@ /* * All i/os smaller than zfs_vdev_cache_max will be turned into * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software - * track buffer. At most zfs_vdev_cache_size bytes will be kept in each + * track buffer). At most zfs_vdev_cache_size bytes will be kept in each * vdev's vdev_cache. */ -int zfs_vdev_cache_max = 1<<14; -int zfs_vdev_cache_size = 10ULL << 20; +int zfs_vdev_cache_max = 1<<14; /* 16KB */ +int zfs_vdev_cache_size = 10ULL << 20; /* 10MB */ int zfs_vdev_cache_bshift = 16; +#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */ + SYSCTL_DECL(_vfs_zfs_vdev); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); TUNABLE_INT("vfs.zfs.vdev.cache.max", &zfs_vdev_cache_max); @@ -84,8 +86,25 @@ SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN, TUNABLE_INT("vfs.zfs.vdev.cache.size", &zfs_vdev_cache_size); SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN, &zfs_vdev_cache_size, 0, "Size of VDEV cache"); +TUNABLE_INT("vfs.zfs.vdev.cache.bshift", &zfs_vdev_cache_bshift); +SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, bshift, CTLFLAG_RDTUN, + &zfs_vdev_cache_bshift, 0, "Turn too small requests into 1 << this value"); + +kstat_t *vdc_ksp = NULL; + +typedef struct vdc_stats { + kstat_named_t vdc_stat_delegations; + kstat_named_t vdc_stat_hits; + kstat_named_t vdc_stat_misses; +} vdc_stats_t; + +static vdc_stats_t vdc_stats = { + { "delegations", KSTAT_DATA_UINT64 }, + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 } +}; -#define VCBS (1 << zfs_vdev_cache_bshift) +#define VDCSTAT_BUMP(stat) atomic_add_64(&vdc_stats.stat.value.ui64, 1); static int vdev_cache_offset_compare(const void *a1, const void *a2) @@ -127,10 +146,6 @@ vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) ASSERT(ve->ve_fill_io == NULL); ASSERT(ve->ve_data != NULL); - dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n", - vc, ve->ve_offset, ve->ve_lastused, LBOLT - ve->ve_lastused, - ve->ve_hits, ve->ve_missed_update); - avl_remove(&vc->vc_lastused_tree, ve); avl_remove(&vc->vc_offset_tree, ve); zio_buf_free(ve->ve_data, VCBS); @@ -161,10 +176,8 @@ vdev_cache_allocate(zio_t *zio) if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > zfs_vdev_cache_size) { ve = avl_first(&vc->vc_lastused_tree); - if (ve->ve_fill_io != NULL) { - dprintf("can't evict in %p, still filling\n", vc); + if (ve->ve_fill_io != NULL) return (NULL); - } ASSERT(ve->ve_hits != 0); vdev_cache_evict(vc, ve); } @@ -239,7 +252,7 @@ vdev_cache_fill(zio_t *zio) zio->io_delegate_list = dio->io_delegate_next; dio->io_delegate_next = NULL; dio->io_error = zio->io_error; - zio_next_stage(dio); + zio_execute(dio); } } @@ -287,6 +300,7 @@ vdev_cache_read(zio_t *zio) fio->io_delegate_list = zio; zio_vdev_io_bypass(zio); mutex_exit(&vc->vc_lock); + VDCSTAT_BUMP(vdc_stat_delegations); return (0); } @@ -294,7 +308,8 @@ vdev_cache_read(zio_t *zio) zio_vdev_io_bypass(zio); mutex_exit(&vc->vc_lock); -
zio_next_stage(zio); + zio_execute(zio); + VDCSTAT_BUMP(vdc_stat_hits); return (0); } @@ -305,11 +320,9 @@ vdev_cache_read(zio_t *zio) return (ENOMEM); } - fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset, + fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK, - vdev_cache_fill, ve); + ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); ve->ve_fill_io = fio; fio->io_delegate_list = zio; @@ -317,6 +330,7 @@ vdev_cache_read(zio_t *zio) mutex_exit(&vc->vc_lock); zio_nowait(fio); + VDCSTAT_BUMP(vdc_stat_misses); return (0); } @@ -361,6 +375,18 @@ vdev_cache_write(zio_t *zio) } void +vdev_cache_purge(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve; + + mutex_enter(&vc->vc_lock); + while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) + vdev_cache_evict(vc, ve); + mutex_exit(&vc->vc_lock); +} + +void vdev_cache_init(vdev_t *vd) { vdev_cache_t *vc = &vd->vdev_cache; @@ -380,15 +406,32 @@ void vdev_cache_fini(vdev_t *vd) { vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve; - mutex_enter(&vc->vc_lock); - while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) - vdev_cache_evict(vc, ve); - mutex_exit(&vc->vc_lock); + vdev_cache_purge(vd); avl_destroy(&vc->vc_offset_tree); avl_destroy(&vc->vc_lastused_tree); mutex_destroy(&vc->vc_lock); } + +void +vdev_cache_stat_init(void) +{ + vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", + KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (vdc_ksp != NULL) { + vdc_ksp->ks_data = &vdc_stats; + kstat_install(vdc_ksp); + } +} + +void +vdev_cache_stat_fini(void) +{ + if (vdc_ksp != NULL) { + kstat_delete(vdc_ksp); + vdc_ksp = NULL; + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c index b965b1c5f09f..35d4e2a9200d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c @@ -19,19 +19,19 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/refcount.h> #include <sys/vdev_disk.h> #include <sys/vdev_impl.h> #include <sys/fs/zfs.h> #include <sys/zio.h> #include <sys/sunldi.h> +#include <sys/fm/fs/zfs.h> /* * Virtual device vector for disks. @@ -50,6 +50,8 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) vdev_disk_t *dvd; struct dk_minfo dkm; int error; + dev_t dev; + int otyp; /* * We must have a pathname, and it must be absolute. @@ -77,6 +79,11 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * 3. Otherwise, the device may have moved. Try opening the device * by the devid instead. * + * If the vdev is part of the root pool, we avoid opening it by path. + * We do this because there is no /dev path available early in boot, + * and if we try to open the device by path at a later point, we can + * deadlock when devfsadm attempts to open the underlying backing store + * file. 
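
(Editor's gloss of the fallback chain implemented in vdev_disk_open() around this comment; it summarizes the code shown here, adding no behavior.)

    /*
     * Open order, most reliable first:
     *   1. devid + minor name  -- survives device renumbering
     *   2. physical path       -- tried only if the devid open failed
     *   3. logical /dev path   -- last resort, and skipped for root
     *                             pools to avoid the devfsadm deadlock
     *                             described above
     * Higher-level label validation catches a wrong device afterwards.
     */
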
*/ if (vd->vdev_devid != NULL) { if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, @@ -88,7 +95,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) error = EINVAL; /* presume failure */ - if (vd->vdev_path != NULL) { + if (vd->vdev_path != NULL && !spa_is_root(vd->vdev_spa)) { ddi_devid_t devid; if (vd->vdev_wholedisk == -1ULL) { @@ -141,12 +148,60 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, spa_mode, kcred, &dvd->vd_lh, zfs_li); + /* + * If all else fails, then try opening by physical path (if available) + * or the logical path (if we failed due to the devid check). While not + * as reliable as the devid, this will give us something, and the higher + * level vdev validation will prevent us from opening the wrong device. + */ + if (error) { + if (vd->vdev_physpath != NULL && + (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != ENODEV) + error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode, + kcred, &dvd->vd_lh, zfs_li); + + /* + * Note that we don't support the legacy auto-wholedisk support + * as above. This hasn't been used in a very long time and we + * don't need to propagate its oddities to this edge condition. + */ + if (error && vd->vdev_path != NULL && + !spa_is_root(vd->vdev_spa)) + error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, + &dvd->vd_lh, zfs_li); + } + if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } /* + * Once a device is opened, verify that the physical device path (if + * available) is up to date. + */ + if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && + ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { + char *physpath, *minorname; + + physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); + minorname = NULL; + if (ddi_dev_pathname(dev, otyp, physpath) == 0 && + ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && + (vd->vdev_physpath == NULL || + strcmp(vd->vdev_physpath, physpath) != 0)) { + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + (void) strlcat(physpath, ":", MAXPATHLEN); + (void) strlcat(physpath, minorname, MAXPATHLEN); + vd->vdev_physpath = spa_strdup(physpath); + } + if (minorname) + kmem_free(minorname, strlen(minorname) + 1); + kmem_free(physpath, MAXPATHLEN); + } + + /* * Determine the actual size of the device. */ if (ldi_get_size(dvd->vd_lh, psize) != 0) { @@ -191,10 +246,6 @@ vdev_disk_close(vdev_t *vd) if (dvd == NULL) return; - dprintf("removing disk %s, devid %s\n", - vd->vdev_path ? vd->vdev_path : "<none>", - vd->vdev_devid ? vd->vdev_devid : "<none>"); - if (dvd->vd_minor != NULL) ddi_devid_str_free(dvd->vd_minor); @@ -208,18 +259,59 @@ vdev_disk_close(vdev_t *vd) vd->vdev_tsd = NULL; } +int +vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size, + uint64_t offset, int flags) +{ + buf_t *bp; + int error = 0; + + if (vd_lh == NULL) + return (EINVAL); + + ASSERT(flags & B_READ || flags & B_WRITE); + + bp = getrbuf(KM_SLEEP); + bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST; + bp->b_bcount = size; + bp->b_un.b_addr = (void *)data; + bp->b_lblkno = lbtodb(offset); + bp->b_bufsize = size; + + error = ldi_strategy(vd_lh, bp); + ASSERT(error == 0); + if ((error = biowait(bp)) == 0 && bp->b_resid != 0) + error = EIO; + freerbuf(bp); + + return (error); +} + static void vdev_disk_io_intr(buf_t *bp) { vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp; zio_t *zio = vdb->vdb_io; - if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0) + /* + * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. 
+ * Rather than teach the rest of the stack about other error + * possibilities (EFAULT, etc), we normalize the error value here. + */ + zio->io_error = (geterror(bp) != 0 ? EIO : 0); + + if (zio->io_error == 0 && bp->b_resid != 0) zio->io_error = EIO; kmem_free(vdb, sizeof (vdev_disk_buf_t)); - zio_next_stage_async(zio); + zio_interrupt(zio); +} + +static void +vdev_disk_ioctl_free(zio_t *zio) +{ + kmem_free(zio->io_vsd, sizeof (struct dk_callback)); } static void @@ -229,26 +321,24 @@ vdev_disk_ioctl_done(void *zio_arg, int error) zio->io_error = error; - zio_next_stage_async(zio); + zio_interrupt(zio); } -static void +static int vdev_disk_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_disk_t *dvd = vd->vdev_tsd; vdev_disk_buf_t *vdb; + struct dk_callback *dkc; buf_t *bp; - int flags, error; + int error; if (zio->io_type == ZIO_TYPE_IOCTL) { - zio_vdev_io_bypass(zio); - /* XXPOLICY */ - if (vdev_is_dead(vd)) { + if (!vdev_readable(vd)) { zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } switch (zio->io_cmd) { @@ -263,12 +353,15 @@ vdev_disk_io_start(zio_t *zio) break; } - zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done; - zio->io_dk_callback.dkc_cookie = zio; + zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); + zio->io_vsd_free = vdev_disk_ioctl_free; + + dkc->dkc_callback = vdev_disk_ioctl_done; + dkc->dkc_flag = FLUSH_VOLATILE; + dkc->dkc_cookie = zio; error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, - (uintptr_t)&zio->io_dk_callback, - FKIOCTL, kcred, NULL); + (uintptr_t)dkc, FKIOCTL, kcred, NULL); if (error == 0) { /* @@ -276,13 +369,16 @@ vdev_disk_io_start(zio_t *zio) * and will call vdev_disk_ioctl_done() * upon completion. */ - return; - } else if (error == ENOTSUP) { + return (ZIO_PIPELINE_STOP); + } + + if (error == ENOTSUP || error == ENOTTY) { /* - * If we get ENOTSUP, we know that no future - * attempts will ever succeed. In this case we - * set a persistent bit so that we don't bother - * with the ioctl in the future. + * If we get ENOTSUP or ENOTTY, we know that + * no future attempts will ever succeed. + * In this case we set a persistent bit so + * that we don't bother with the ioctl in the + * future. */ vd->vdev_nowritecache = B_TRUE; } @@ -294,61 +390,51 @@ vdev_disk_io_start(zio_t *zio) zio->io_error = ENOTSUP; } - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return; - - if ((zio = vdev_queue_io(zio)) == NULL) - return; - - flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); - flags |= B_BUSY | B_NOCACHE; - if (zio->io_flags & ZIO_FLAG_FAILFAST) - flags |= B_FAILFAST; - vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP); vdb->vdb_io = zio; bp = &vdb->vdb_buf; bioinit(bp); - bp->b_flags = flags; + bp->b_flags = B_BUSY | B_NOCACHE | + (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE) | + ((zio->io_flags & ZIO_FLAG_IO_RETRY) ? 0 : B_FAILFAST); bp->b_bcount = zio->io_size; bp->b_un.b_addr = zio->io_data; bp->b_lblkno = lbtodb(zio->io_offset); bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; - /* XXPOLICY */ - error = vdev_is_dead(vd) ? 
ENXIO : vdev_error_inject(vd, zio); - if (error) { - zio->io_error = error; - bioerror(bp, error); - bp->b_resid = bp->b_bcount; - bp->b_iodone(bp); - return; - } - - error = ldi_strategy(dvd->vd_lh, bp); /* ldi_strategy() will return non-zero only on programming errors */ - ASSERT(error == 0); + VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); + + return (ZIO_PIPELINE_STOP); } static void vdev_disk_io_done(zio_t *zio) { - vdev_queue_io_done(zio); - - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); - - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + vdev_t *vd = zio->io_vd; - zio_next_stage(zio); + /* + * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if + * the device has been removed. If this is the case, then we trigger an + * asynchronous removal of the device. Otherwise, probe the device and + * make sure it's still accessible. + */ + if (zio->io_error == EIO) { + vdev_disk_t *dvd = vd->vdev_tsd; + int state = DKIO_NONE; + + if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, + FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } + } } vdev_ops_t vdev_disk_ops = { @@ -361,3 +447,80 @@ vdev_ops_t vdev_disk_ops = { VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; + +/* + * Given the root disk device devid or pathname, read the label from + * the device, and construct a configuration nvlist. + */ +int +vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) +{ + ldi_handle_t vd_lh; + vdev_label_t *label; + uint64_t s, size; + int l; + ddi_devid_t tmpdevid; + int error = -1; + char *minor_name; + + /* + * Read the device label and build the nvlist. 
+ */ + if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, + &minor_name) == 0) { + error = ldi_open_by_devid(tmpdevid, minor_name, + spa_mode, kcred, &vd_lh, zfs_li); + ddi_devid_free(tmpdevid); + ddi_devid_str_free(minor_name); + } + + if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, + zfs_li))) + return (error); + + if (ldi_get_size(vd_lh, &s)) { + (void) ldi_close(vd_lh, FREAD, kcred); + return (EIO); + } + + size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); + label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); + + for (l = 0; l < VDEV_LABELS; l++) { + uint64_t offset, state, txg = 0; + + /* read vdev label */ + offset = vdev_label_offset(size, l, 0); + if (vdev_disk_physio(vd_lh, (caddr_t)label, + VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE + + VDEV_PHYS_SIZE, offset, B_READ) != 0) + continue; + + if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, + sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state >= POOL_STATE_DESTROYED) { + nvlist_free(*config); + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0) { + nvlist_free(*config); + *config = NULL; + continue; + } + + break; + } + + kmem_free(label, sizeof (vdev_label_t)); + (void) ldi_close(vd_lh, FREAD, kcred); + + return (error); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c index ab2d34c08256..673b633f595b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c @@ -19,18 +19,17 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_file.h> #include <sys/vdev_impl.h> #include <sys/zio.h> #include <sys/fs/zfs.h> +#include <sys/fm/fs/zfs.h> /* * Virtual device vector for files. @@ -61,8 +60,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * to local zone users, so the underlying devices should be as well. */ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); - error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX, - 0, &vp, 0, 0, rootdir); + error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, + spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; @@ -80,12 +79,13 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) return (ENODEV); } #endif - /* * Determine the physical size of the file. 
*/ vattr.va_mask = AT_SIZE; - error = VOP_GETATTR(vp, &vattr, 0); + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(vp, &vattr, kcred); + VOP_UNLOCK(vp, 0); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); @@ -101,71 +101,46 @@ static void vdev_file_close(vdev_t *vd) { vdev_file_t *vf = vd->vdev_tsd; + int vfslocked; if (vf == NULL) return; if (vf->vf_vnode != NULL) { - (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred); - (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred); + vfslocked = VFS_LOCK_GIANT(vf->vf_vnode->v_mount); + (void)vn_close(vf->vf_vnode, spa_mode, kcred, curthread); VN_RELE(vf->vf_vnode); + VFS_UNLOCK_GIANT(vfslocked); } kmem_free(vf, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; } -static void +static int vdev_file_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; ssize_t resid; - int error; if (zio->io_type == ZIO_TYPE_IOCTL) { - zio_vdev_io_bypass(zio); - /* XXPOLICY */ - if (vdev_is_dead(vd)) { + if (!vdev_readable(vd)) { zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, - kcred); - dprintf("fsync(%s) = %d\n", vdev_description(vd), - zio->io_error); + kcred, NULL); break; default: zio->io_error = ENOTSUP; } - zio_next_stage_async(zio); - return; - } - - /* - * In the kernel, don't bother double-caching, but in userland, - * we want to test the vdev_cache code. - */ -#ifndef _KERNEL - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return; -#endif - - if ((zio = vdev_queue_io(zio)) == NULL) - return; - - /* XXPOLICY */ - error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); - if (error) { - zio->io_error = error; - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? @@ -176,23 +151,15 @@ vdev_file_io_start(zio_t *zio) if (resid != 0 && zio->io_error == 0) zio->io_error = ENOSPC; - zio_next_stage_async(zio); + zio_interrupt(zio); + + return (ZIO_PIPELINE_STOP); } +/* ARGSUSED */ static void vdev_file_io_done(zio_t *zio) { - vdev_queue_io_done(zio); - -#ifndef _KERNEL - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); -#endif - - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); - - zio_next_stage(zio); } vdev_ops_t vdev_file_ops = { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index eebc911edc4b..f151f83ff82d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -96,13 +96,9 @@ vdev_geom_orphan(struct g_consumer *cp) g_wither_geom(gp, error); } vdev_geom_release(vd); - /* Both methods below work, but in a bit different way. 
*/ -#if 0 - vd->vdev_reopen_wanted = 1; -#else - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); -#endif + + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); } static struct g_consumer * @@ -229,7 +225,7 @@ vdev_geom_worker(void *arg) vd->vdev_nowritecache = B_TRUE; } g_destroy_bio(bp); - zio_next_stage_async(zio); + zio_interrupt(zio); } } @@ -249,6 +245,194 @@ vdev_geom_get_id(struct g_consumer *cp) return (id); } +static uint64_t +nvlist_get_guid(nvlist_t *list) +{ + nvpair_t *elem = NULL; + uint64_t value; + + while ((elem = nvlist_next_nvpair(list, elem)) != NULL) { + if (nvpair_type(elem) == DATA_TYPE_UINT64 && + strcmp(nvpair_name(elem), "guid") == 0) { + VERIFY(nvpair_value_uint64(elem, &value) == 0); + return (value); + } + } + return (0); +} + +static char * +nvlist_get_devid(nvlist_t *list, uint64_t guid) +{ + nvpair_t *elem = NULL; + int progress; + char *id; + + progress = 0; + id = NULL; + + while ((elem = nvlist_next_nvpair(list, elem)) != NULL) { + switch (nvpair_type(elem)) { + case DATA_TYPE_STRING: + { + char *value; + + VERIFY(nvpair_value_string(elem, &value) == 0); + if (strcmp(nvpair_name(elem), "type") == 0 && + strcmp(value, "disk") == 0) { + progress |= 0x01; + } else if (strcmp(nvpair_name(elem), "devid") == 0) { + progress |= 0x02; + id = value; + } + break; + } + case DATA_TYPE_UINT64: + { + uint64_t value; + + VERIFY(nvpair_value_uint64(elem, &value) == 0); + if (strcmp(nvpair_name(elem), "guid") == 0 && + value == guid) { + progress |= 0x04; + } + break; + } + case DATA_TYPE_NVLIST: + { + nvlist_t *value; + char *lid; + + VERIFY(nvpair_value_nvlist(elem, &value) == 0); + lid = nvlist_get_devid(value, guid); + if (lid != NULL) + return (lid); + break; + } + case DATA_TYPE_NVLIST_ARRAY: + { + nvlist_t **value; + u_int c, count; + char *lid; + + VERIFY(nvpair_value_nvlist_array(elem, &value, + &count) == 0); + + for (c = 0; c < count; c++) { + lid = nvlist_get_devid(value[c], guid); + if (lid != NULL) + return (lid); + } + break; + } + } + if (progress == 0x07) + break; + } + if (progress != 0x07) + id = NULL; + return (id); +} + +static int +vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size) +{ + struct bio *bp; + u_char *p; + off_t off; + int error; + + ASSERT((offset % cp->provider->sectorsize) == 0); + ASSERT((size % cp->provider->sectorsize) == 0); + + bp = g_alloc_bio(); + off = offset; + offset += size; + p = data; + error = 0; + + for (; off < offset; off += MAXPHYS, p += MAXPHYS, size -= MAXPHYS) { + bzero(bp, sizeof(*bp)); + bp->bio_cmd = cmd; + bp->bio_done = NULL; + bp->bio_offset = off; + bp->bio_length = MIN(size, MAXPHYS); + bp->bio_data = p; + g_io_request(bp, cp); + error = biowait(bp, "vdev_geom_io"); + if (error != 0) + break; + } + + g_destroy_bio(bp); + return (error); +} + +static char * +vdev_geom_read_id(struct g_consumer *cp) +{ + struct g_provider *pp; + vdev_label_t *label; + char *p, *buf; + size_t buflen; + uint64_t psize; + off_t offset, size; + char *id; + int error, l, len; + + g_topology_assert_not(); + + pp = cp->provider; + + psize = pp->mediasize; + psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t)); + + size = sizeof(*label) + pp->sectorsize - + ((sizeof(*label) - 1) % pp->sectorsize) - 1; + + id = NULL; + label = kmem_alloc(size, KM_SLEEP); + buflen = sizeof(label->vl_vdev_phys.vp_nvlist); + + for (l = 0; l < VDEV_LABELS && id == NULL; l++) { + nvlist_t *config = NULL; + uint64_t 
guid; + + offset = vdev_label_offset(psize, l, 0); + if ((offset % pp->sectorsize) != 0) + continue; + + error = vdev_geom_io(cp, BIO_READ, label, offset, size); + if (error != 0) + continue; + buf = label->vl_vdev_phys.vp_nvlist; + + if (nvlist_unpack(buf, buflen, &config, 0) != 0) + continue; + + guid = nvlist_get_guid(config); + if (guid == 0) { + nvlist_free(config); + continue; + } + id = nvlist_get_devid(config, guid); + if (id != NULL) { + char *tmp; + + tmp = kmem_zalloc(DISK_IDENT_SIZE, KM_SLEEP); + strlcpy(tmp, id, DISK_IDENT_SIZE); + id = tmp; + } + + nvlist_free(config); + } + + kmem_free(label, size); + if (id != NULL) + ZFS_LOG(1, "ID of %s: %s", pp->name, id); + return (id); +} + static void vdev_geom_free_id(char *id) { @@ -290,6 +474,7 @@ vdev_geom_attach_by_id_event(void *arg, int flags __unused) zgp->orphan = vdev_geom_taste_orphan; zcp = g_new_consumer(zgp); + /* First round tries to get provider's ID without reading metadata. */ LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; @@ -324,6 +509,41 @@ vdev_geom_attach_by_id_event(void *arg, int flags __unused) } } } + /* Second round looks for ID by reading ZFS metadata. */ + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) + continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + if (pp->flags & G_PF_WITHER) + continue; + g_attach(zcp, pp); + if (g_access(zcp, 1, 0, 0) != 0) { + g_detach(zcp); + continue; + } + g_topology_unlock(); + id = vdev_geom_read_id(zcp); + g_topology_lock(); + g_access(zcp, -1, 0, 0); + g_detach(zcp); + if (id == NULL || strcmp(id, ap->id) != 0) { + vdev_geom_free_id(id); + continue; + } + vdev_geom_free_id(id); + ap->cp = vdev_geom_attach(pp, ap->write); + if (ap->cp == NULL) { + printf("ZFS WARNING: Cannot open %s " + "for writing.\n", pp->name); + continue; + } + goto end; + } + } + } ap->cp = NULL; end: g_destroy_consumer(zcp); @@ -345,25 +565,13 @@ vdev_geom_attach_by_id(const char *id, int write) return (cp); } -static int -vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +static struct g_consumer * +vdev_geom_open_by_path_and_devid(vdev_t *vd) { - vdev_geom_ctx_t *ctx; struct g_provider *pp; struct g_consumer *cp; - char *id = NULL; - int owned; - - /* - * We must have a pathname, and it must be absolute.
- */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); - } + char *id; - if ((owned = mtx_owned(&Giant))) - mtx_unlock(&Giant); cp = NULL; g_topology_lock(); pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1); @@ -380,40 +588,101 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) ZFS_LOG(1, "ID mismatch for provider %s: " "[%s]!=[%s].", vd->vdev_path, vd->vdev_devid, id); - goto next; - } - ZFS_LOG(1, "ID match for provider %s.", vd->vdev_path); + } else + ZFS_LOG(1, "ID match for provider %s.", + vd->vdev_path); + vdev_geom_free_id(id); } } -next: g_topology_unlock(); - vdev_geom_free_id(id); - if (cp == NULL && vd->vdev_devid != NULL) { - ZFS_LOG(1, "Searching by ID [%s].", vd->vdev_devid); - cp = vdev_geom_attach_by_id(vd->vdev_devid, - !!(spa_mode & FWRITE)); - if (cp != NULL) { - size_t len = strlen(cp->provider->name) + 6; /* 6 == strlen("/dev/") + 1 */ - char *buf = kmem_alloc(len, KM_SLEEP); - - snprintf(buf, len, "/dev/%s", cp->provider->name); - spa_strfree(vd->vdev_path); - vd->vdev_path = buf; - - ZFS_LOG(1, "Attach by ID [%s] succeeded, provider %s.", - vd->vdev_devid, vd->vdev_path); + + return (cp); +} + +static struct g_consumer * +vdev_geom_open_by_devid(vdev_t *vd) +{ + struct g_consumer *cp; + char *buf; + size_t len; + + /* + * We can't search by devid if it's missing. + */ + if (vd->vdev_devid == NULL) + return (NULL); + + ZFS_LOG(1, "Searching by ID [%s].", vd->vdev_devid); + cp = vdev_geom_attach_by_id(vd->vdev_devid, !!(spa_mode & FWRITE)); + if (cp != NULL) { + len = strlen(cp->provider->name) + strlen("/dev/") + 1; + buf = kmem_alloc(len, KM_SLEEP); + + snprintf(buf, len, "/dev/%s", cp->provider->name); + spa_strfree(vd->vdev_path); + vd->vdev_path = buf; + + ZFS_LOG(1, "Attach by ID [%s] succeeded, provider %s.", + vd->vdev_devid, vd->vdev_path); + } else + ZFS_LOG(1, "Search by ID [%s] failed.", vd->vdev_devid); + + return (cp); +} + +static int +vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + vdev_geom_ctx_t *ctx; + struct g_provider *pp; + struct g_consumer *cp; + int owned; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + vd->vdev_tsd = NULL; + + if ((owned = mtx_owned(&Giant))) + mtx_unlock(&Giant); + cp = vdev_geom_open_by_path_and_devid(vd); + if (cp == NULL) { + /* + * The device at vd->vdev_path doesn't have the right devid. + * The disks might have merely moved around so try all other + * geom providers to find one with the right devid. + */ + cp = vdev_geom_open_by_devid(vd); + if (cp == NULL) { + ZFS_LOG(1, "Provider %s not found.", vd->vdev_path); + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + if (owned) + mtx_lock(&Giant); + return (EACCES); } } if (owned) mtx_lock(&Giant); - if (cp == NULL) { - ZFS_LOG(1, "Provider %s (id=[%s]) not found.", vd->vdev_path, - vd->vdev_devid); - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (EACCES); - } + + cp->private = vd; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP); + bioq_init(&ctx->gc_queue); + mtx_init(&ctx->gc_queue_mtx, "zfs:vdev:geom:queue", NULL, MTX_DEF); + ctx->gc_consumer = cp; + ctx->gc_state = 0; + + vd->vdev_tsd = ctx; pp = cp->provider; + kproc_create(vdev_geom_worker, ctx, NULL, 0, 0, "vdev:worker %s", + pp->name); + /* * Determine the actual size of the device. 
 */ @@ -430,19 +699,6 @@ next: */ vd->vdev_nowritecache = B_FALSE; - cp->private = vd; - - ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP); - bioq_init(&ctx->gc_queue); - mtx_init(&ctx->gc_queue_mtx, "zfs:vdev:geom:queue", NULL, MTX_DEF); - ctx->gc_consumer = cp; - ctx->gc_state = 0; - - vd->vdev_tsd = ctx; - - kproc_create(vdev_geom_worker, ctx, NULL, 0, 0, "vdev:worker %s", - pp->name); - return (0); } @@ -469,13 +725,16 @@ vdev_geom_io_intr(struct bio *bp) zio = bp->bio_caller1; ctx = zio->io_vd->vdev_tsd; + if ((zio->io_error = bp->bio_error) == 0 && bp->bio_resid != 0) + zio->io_error = EIO; + mtx_lock(&ctx->gc_queue_mtx); bioq_insert_tail(&ctx->gc_queue, bp); wakeup_one(&ctx->gc_queue); mtx_unlock(&ctx->gc_queue_mtx); } -static void +static int vdev_geom_io_start(zio_t *zio) { vdev_t *vd; @@ -492,18 +751,19 @@ vdev_geom_io_start(zio_t *zio) cp = ctx->gc_consumer; if (zio->io_type == ZIO_TYPE_IOCTL) { - zio_vdev_io_bypass(zio); - /* XXPOLICY */ - if (vdev_is_dead(vd)) { + if (!vdev_readable(vd)) { zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: + + if (zfs_nocacheflush) + break; + if (vd->vdev_nowritecache) { zio->io_error = ENOTSUP; break; } @@ -514,27 +774,13 @@ ... zio->io_error = ENOTSUP; } - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } - - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return; - - if ((zio = vdev_queue_io(zio)) == NULL) - return; - sendreq: - - error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); - if (error == 0 && cp == NULL) - error = ENXIO; - if (error) { - zio->io_error = error; - zio_next_stage_async(zio); - return; + if (cp == NULL) { + zio->io_error = ENXIO; + return (ZIO_PIPELINE_CONTINUE); } - bp = g_alloc_bio(); bp->bio_caller1 = zio; switch (zio->io_type) { @@ -555,20 +801,33 @@ sendreq: bp->bio_done = vdev_geom_io_intr; g_io_request(bp, cp); + + return (ZIO_PIPELINE_STOP); } static void vdev_geom_io_done(zio_t *zio) { - vdev_queue_io_done(zio); - - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); - - zio_next_stage(zio); + /* + * If the device returned ENXIO, then we should verify whether the + * GEOM provider has been removed. If this is the case, then we + * trigger an asynchronous removal of the device. + */ + if (zio->io_error == ENXIO) { + vdev_t *vd = zio->io_vd; + vdev_geom_ctx_t *ctx; + struct g_provider *pp = NULL; + + ctx = vd->vdev_tsd; + if (ctx != NULL && ctx->gc_consumer != NULL) + pp = ctx->gc_consumer->provider; + + if (pp == NULL || (pp->flags & G_PF_ORPHAN)) { + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } + } } vdev_ops_t vdev_geom_ops = { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c index 9d9f5556fa08..bf930466fbd6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms.
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Virtual Device Labels * --------------------- @@ -62,7 +60,7 @@ * or a device was added, we want to update all the labels such that we can deal * with fatal failure at any point. To this end, each disk has two labels which * are updated before and after the uberblock is synced. Assuming we have - * labels and an uberblock with the following transacation groups: + * labels and an uberblock with the following transaction groups: * * L1 UB L2 * +------+ +------+ +------+ @@ -153,34 +151,56 @@ uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset) { ASSERT(offset < sizeof (vdev_label_t)); + ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0); return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); } +/* + * Returns back the vdev label associated with the passed in offset. + */ +int +vdev_label_number(uint64_t psize, uint64_t offset) +{ + int l; + + if (offset >= psize - VDEV_LABEL_END_SIZE) { + offset -= psize - VDEV_LABEL_END_SIZE; + offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t); + } + l = offset / sizeof (vdev_label_t); + return (l < VDEV_LABELS ? l : -1); +} + static void vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private) + uint64_t size, zio_done_func_t *done, void *private, int flags) { - ASSERT(vd->vdev_children == 0); + ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == + SCL_STATE_ALL); + ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_read_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE)); + ZIO_PRIORITY_SYNC_READ, flags, B_TRUE)); } static void vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private) + uint64_t size, zio_done_func_t *done, void *private, int flags) { - ASSERT(vd->vdev_children == 0); + ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || + (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) == + (SCL_CONFIG | SCL_STATE) && + dsl_pool_sync_context(spa_get_dsl(zio->io_spa)))); + ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_write_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL)); + ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); } /* @@ -188,7 +208,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, */ nvlist_t * vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - boolean_t isspare) + boolean_t isspare, boolean_t isl2cache) { nvlist_t *nv = NULL; @@ -196,7 +216,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type) == 0); - if (!isspare) + if (!isspare && !isl2cache) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); @@ -209,6 +229,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid) == 0); + if (vd->vdev_physpath != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, + vd->vdev_physpath) == 0); + if (vd->vdev_nparity != 0) { 
ASSERT(strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_RAIDZ) == 0); @@ -219,7 +243,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, */ ASSERT(vd->vdev_nparity == 1 || (vd->vdev_nparity == 2 && - spa_version(spa) >= ZFS_VERSION_RAID6)); + spa_version(spa) >= SPA_VERSION_RAID6)); /* * Note that we'll add the nparity tag even on storage pools @@ -240,7 +264,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_isspare) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0); - if (!isspare && vd == vd->vdev_top) { + if (!isspare && !isl2cache && vd == vd->vdev_top) { VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, vd->vdev_ms_array) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, @@ -249,6 +273,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_ashift) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize) == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, + vd->vdev_islog) == 0); } if (vd->vdev_dtl.smo_object != 0) @@ -271,7 +297,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, for (c = 0; c < vd->vdev_children; c++) child[c] = vdev_config_generate(spa, vd->vdev_child[c], - getstats, isspare); + getstats, isspare, isl2cache); VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, vd->vdev_children) == 0); @@ -285,9 +311,18 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_offline && !vd->vdev_tmpoffline) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE) == 0); - else - (void) nvlist_remove(nv, ZPOOL_CONFIG_OFFLINE, - DATA_TYPE_UINT64); + if (vd->vdev_faulted) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, + B_TRUE) == 0); + if (vd->vdev_degraded) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, + B_TRUE) == 0); + if (vd->vdev_removed) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, + B_TRUE) == 0); + if (vd->vdev_unspare) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, + B_TRUE) == 0); } return (nv); @@ -300,23 +335,23 @@ vdev_label_read_config(vdev_t *vd) nvlist_t *config = NULL; vdev_phys_t *vp; zio_t *zio; - int l; + int flags = + ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; - ASSERT(spa_config_held(spa, RW_READER)); + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - if (vdev_is_dead(vd)) + if (!vdev_readable(vd)) return (NULL); vp = zio_buf_alloc(sizeof (vdev_phys_t)); - for (l = 0; l < VDEV_LABELS; l++) { + for (int l = 0; l < VDEV_LABELS; l++) { - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CONFIG_HELD); + zio = zio_root(spa, NULL, NULL, flags); vdev_label_read(zio, vd, l, vp, offsetof(vdev_label_t, vl_vdev_phys), - sizeof (vdev_phys_t), NULL, NULL); + sizeof (vdev_phys_t), NULL, NULL, flags); if (zio_wait(zio) == 0 && nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), @@ -340,7 +375,7 @@ vdev_label_read_config(vdev_t *vd) */ static boolean_t vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, - uint64_t *spare_guid) + uint64_t *spare_guid, uint64_t *l2cache_guid) { spa_t *spa = vd->vdev_spa; uint64_t state, pool_guid, device_guid, txg, spare_pool; @@ -349,6 +384,8 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, if (spare_guid) *spare_guid = 0ULL; + if (l2cache_guid) + *l2cache_guid = 0ULL; /* * Read the label, if any, and perform some basic sanity checks. 
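vdev_label_offset() and vdev_label_number() above are inverses of each other. A userland model, assuming the usual on-disk layout of four 256 KB labels, two at the front of the device (L0, L1) and two at the end (L2, L3); the 1 GB device size is illustrative:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define VDEV_LABELS             4
#define VDEV_LABEL_SIZE         (256ULL * 1024)    /* sizeof (vdev_label_t) */
#define VDEV_LABEL_END_SIZE     (2 * VDEV_LABEL_SIZE)

static uint64_t
label_offset(uint64_t psize, int l, uint64_t offset)
{
    return (offset + l * VDEV_LABEL_SIZE + (l < VDEV_LABELS / 2 ?
        0 : psize - VDEV_LABELS * VDEV_LABEL_SIZE));
}

static int
label_number(uint64_t psize, uint64_t offset)
{
    int l;

    if (offset >= psize - VDEV_LABEL_END_SIZE) {
        offset -= psize - VDEV_LABEL_END_SIZE;
        offset += (VDEV_LABELS / 2) * VDEV_LABEL_SIZE;
    }
    l = offset / VDEV_LABEL_SIZE;
    return (l < VDEV_LABELS ? l : -1);
}

int
main(void)
{
    uint64_t psize = 1ULL << 30;    /* 1 GB, a multiple of the label size */

    /* Each label's base offset round-trips to its own index. */
    for (int l = 0; l < VDEV_LABELS; l++)
        assert(label_number(psize, label_offset(psize, l, 0)) == l);
    printf("L2 starts at %llu\n",
        (unsigned long long)label_offset(psize, 2, 0));
    return (0);
}

The layout math only works when psize is a multiple of the label size, which is exactly what the new P2PHASE_TYPED assertion checks.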
@@ -367,7 +404,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, return (B_FALSE); } - if (state != POOL_STATE_SPARE && + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, @@ -383,9 +420,10 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, * be a part of. The only way this is allowed is if the device is a hot * spare (which we check for later on). */ - if (state != POOL_STATE_SPARE && + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && !spa_guid_exists(pool_guid, device_guid) && - !spa_spare_exists(device_guid, NULL)) + !spa_spare_exists(device_guid, NULL, NULL) && + !spa_l2cache_exists(device_guid, NULL)) return (B_FALSE); /* @@ -395,21 +433,23 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, * user has attempted to add the same vdev multiple times in the same * transaction. */ - if (state != POOL_STATE_SPARE && txg == 0 && vdtxg == crtxg) + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + txg == 0 && vdtxg == crtxg) return (B_TRUE); /* * Check to see if this is a spare device. We do an explicit check for * spa_has_spare() here because it may be on our pending list of spares - * to add. + * to add. We also check if it is an l2cache device. */ - if (spa_spare_exists(device_guid, &spare_pool) || + if (spa_spare_exists(device_guid, &spare_pool, NULL) || spa_has_spare(spa, device_guid)) { if (spare_guid) *spare_guid = device_guid; switch (reason) { case VDEV_LABEL_CREATE: + case VDEV_LABEL_L2CACHE: return (B_TRUE); case VDEV_LABEL_REPLACE: @@ -422,6 +462,12 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, } /* + * Check to see if this is an l2cache device. + */ + if (spa_l2cache_exists(device_guid, NULL)) + return (B_TRUE); + + /* * If the device is marked ACTIVE, then this device is in use by another * pool on the system. */ @@ -445,15 +491,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) vdev_boot_header_t *vb; uberblock_t *ub; zio_t *zio; - int l, c, n; char *buf; size_t buflen; int error; - uint64_t spare_guid; + uint64_t spare_guid, l2cache_guid; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; - ASSERT(spa_config_held(spa, RW_WRITER)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) if ((error = vdev_label_init(vd->vdev_child[c], crtxg, reason)) != 0) return (error); @@ -471,38 +517,56 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * Determine if the vdev is in use. */ if (reason != VDEV_LABEL_REMOVE && - vdev_inuse(vd, crtxg, reason, &spare_guid)) + vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) return (EBUSY); ASSERT(reason != VDEV_LABEL_REMOVE || - vdev_inuse(vd, crtxg, reason, NULL)); + vdev_inuse(vd, crtxg, reason, NULL, NULL)); /* - * If this is a request to add or replace a spare that is in use - * elsewhere on the system, then we must update the guid (which was - * initialized to a random value) to reflect the actual GUID (which is - * shared between multiple pools). + * If this is a request to add or replace a spare or l2cache device + * that is in use elsewhere on the system, then we must update the + * guid (which was initialized to a random value) to reflect the + * actual GUID (which is shared between multiple pools). 
*/ - if (reason != VDEV_LABEL_REMOVE && spare_guid != 0ULL) { - vdev_t *pvd = vd->vdev_parent; + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE && + spare_guid != 0ULL) { + uint64_t guid_delta = spare_guid - vd->vdev_guid; - for (; pvd != NULL; pvd = pvd->vdev_parent) { - pvd->vdev_guid_sum -= vd->vdev_guid; - pvd->vdev_guid_sum += spare_guid; - } + vd->vdev_guid += guid_delta; - vd->vdev_guid = vd->vdev_guid_sum = spare_guid; + for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += guid_delta; /* * If this is a replacement, then we want to fallthrough to the * rest of the code. If we're adding a spare, then it's already - * labelled appropriately and we can just return. + * labeled appropriately and we can just return. */ if (reason == VDEV_LABEL_SPARE) return (0); ASSERT(reason == VDEV_LABEL_REPLACE); } + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && + l2cache_guid != 0ULL) { + uint64_t guid_delta = l2cache_guid - vd->vdev_guid; + + vd->vdev_guid += guid_delta; + + for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += guid_delta; + + /* + * If this is a replacement, then we want to fallthrough to the + * rest of the code. If we're adding an l2cache, then it's + * already labeled appropriately and we can just return. + */ + if (reason == VDEV_LABEL_L2CACHE) + return (0); + ASSERT(reason == VDEV_LABEL_REPLACE); + } + /* * Initialize its label. */ @@ -532,6 +596,19 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) POOL_STATE_SPARE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); + } else if (reason == VDEV_LABEL_L2CACHE || + (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { + /* + * For level 2 ARC devices, add a special label. + */ + VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, + POOL_STATE_L2CACHE) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); } else { label = spa_config_generate(spa, vd, 0ULL, B_FALSE); @@ -576,23 +653,22 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Write everything in parallel. */ - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + zio = zio_root(spa, NULL, NULL, flags); - for (l = 0; l < VDEV_LABELS; l++) { + for (int l = 0; l < VDEV_LABELS; l++) { vdev_label_write(zio, vd, l, vp, offsetof(vdev_label_t, vl_vdev_phys), - sizeof (vdev_phys_t), NULL, NULL); + sizeof (vdev_phys_t), NULL, NULL, flags); vdev_label_write(zio, vd, l, vb, offsetof(vdev_label_t, vl_boot_header), - sizeof (vdev_boot_header_t), NULL, NULL); + sizeof (vdev_boot_header_t), NULL, NULL, flags); - for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_write(zio, vd, l, ub, VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), NULL, NULL); + VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags); } } @@ -605,14 +681,20 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * If this vdev hasn't been previously identified as a spare, then we - * mark it as such only if a) we are labelling it as a spare, or b) it - * exists as a spare elsewhere in the system. + * mark it as such only if a) we are labeling it as a spare, or b) it + * exists as a spare elsewhere in the system. Do the same for + * level 2 ARC devices. 
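The guid_delta hunks above replace the old subtract-then-add walk with a single delta applied to the leaf and to every ancestor's vdev_guid_sum. Unsigned wraparound makes the delta correct even when the adopted GUID is numerically smaller than the old one. A toy model with a two-level tree (all values made up):

#include <stdint.h>
#include <stdio.h>

struct vdev {
    uint64_t guid;
    uint64_t guid_sum;          /* self plus all descendants */
    struct vdev *parent;
};

static void
adopt_guid(struct vdev *vd, uint64_t new_guid)
{
    uint64_t delta = new_guid - vd->guid;   /* may wrap; that is fine */

    vd->guid += delta;
    for (struct vdev *pvd = vd; pvd != NULL; pvd = pvd->parent)
        pvd->guid_sum += delta;
}

int
main(void)
{
    struct vdev root = { 100, 100 + 7, NULL };
    struct vdev leaf = { 7, 7, &root };

    adopt_guid(&leaf, 5);       /* new GUID smaller than the old one */
    printf("leaf %llu root sum %llu\n",
        (unsigned long long)leaf.guid,
        (unsigned long long)root.guid_sum); /* 5 and 105 */
    return (0);
}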
*/ if (error == 0 && !vd->vdev_isspare && (reason == VDEV_LABEL_SPARE || - spa_spare_exists(vd->vdev_guid, NULL))) + spa_spare_exists(vd->vdev_guid, NULL, NULL))) spa_spare_add(vd); + if (error == 0 && !vd->vdev_isl2cache && + (reason == VDEV_LABEL_L2CACHE || + spa_l2cache_exists(vd->vdev_guid, NULL))) + spa_l2cache_add(vd); + return (error); } @@ -651,17 +733,17 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) static void vdev_uberblock_load_done(zio_t *zio) { + zio_t *rio = zio->io_private; uberblock_t *ub = zio->io_data; - uberblock_t *ubbest = zio->io_private; - spa_t *spa = zio->io_spa; + uberblock_t *ubbest = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd)); if (zio->io_error == 0 && uberblock_verify(ub) == 0) { - mutex_enter(&spa->spa_uberblock_lock); + mutex_enter(&rio->io_lock); if (vdev_uberblock_compare(ub, ubbest) > 0) *ubbest = *ub; - mutex_exit(&spa->spa_uberblock_lock); + mutex_exit(&rio->io_lock); } zio_buf_free(zio->io_data, zio->io_size); @@ -670,136 +752,169 @@ vdev_uberblock_load_done(zio_t *zio) void vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) { - int l, c, n; - - for (c = 0; c < vd->vdev_children; c++) - vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + int flags = + ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + + if (vd == rvd) { + ASSERT(zio == NULL); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + zio = zio_root(spa, NULL, ubbest, flags); + bzero(ubbest, sizeof (uberblock_t)); + } - if (!vd->vdev_ops->vdev_op_leaf) - return; + ASSERT(zio != NULL); - if (vdev_is_dead(vd)) - return; + for (int c = 0; c < vd->vdev_children; c++) + vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); - for (l = 0; l < VDEV_LABELS; l++) { - for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { - vdev_label_read(zio, vd, l, - zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), - VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), - vdev_uberblock_load_done, ubbest); + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + for (int l = 0; l < VDEV_LABELS; l++) { + for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + vdev_label_read(zio, vd, l, + zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd), + vdev_uberblock_load_done, zio, flags); + } } } + + if (vd == rvd) { + (void) zio_wait(zio); + spa_config_exit(spa, SCL_ALL, FTAG); + } } /* - * Write the uberblock to both labels of all leaves of the specified vdev. + * On success, increment root zio's count of good writes. * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). */ static void vdev_uberblock_sync_done(zio_t *zio) { - uint64_t *good_writes = zio->io_root->io_private; + uint64_t *good_writes = zio->io_private; if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) atomic_add_64(good_writes, 1); } +/* + * Write the uberblock to all labels of all leaves of the specified vdev. 
+ */ static void -vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, uint64_t txg) +vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) { - int l, c, n; + uberblock_t *ubbuf; + int n; - for (c = 0; c < vd->vdev_children; c++) - vdev_uberblock_sync(zio, ub, vd->vdev_child[c], txg); + for (int c = 0; c < vd->vdev_children; c++) + vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags); if (!vd->vdev_ops->vdev_op_leaf) return; - if (vdev_is_dead(vd)) + if (!vdev_writeable(vd)) return; - n = txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); + n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); - ASSERT(ub->ub_txg == txg); + ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); + bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); + *ubbuf = *ub; - for (l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ub, - VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), - vdev_uberblock_sync_done, NULL); + for (int l = 0; l < VDEV_LABELS; l++) + vdev_label_write(zio, vd, l, ubbuf, + VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), + vdev_uberblock_sync_done, zio->io_private, + flags | ZIO_FLAG_DONT_PROPAGATE); - dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg); + zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); } -static int -vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *vd, uint64_t txg) +int +vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { - uberblock_t *ubbuf; - size_t size = vd->vdev_top ? VDEV_UBERBLOCK_SIZE(vd) : SPA_MAXBLOCKSIZE; - uint64_t *good_writes; + spa_t *spa = svd[0]->vdev_spa; zio_t *zio; - int error; - - ubbuf = zio_buf_alloc(size); - bzero(ubbuf, size); - *ubbuf = *ub; + uint64_t good_writes = 0; - good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + zio = zio_root(spa, NULL, &good_writes, flags); - zio = zio_root(spa, NULL, good_writes, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + for (int v = 0; v < svdcount; v++) + vdev_uberblock_sync(zio, ub, svd[v], flags); - vdev_uberblock_sync(zio, ubbuf, vd, txg); - - error = zio_wait(zio); - - if (error && *good_writes != 0) { - dprintf("partial success: good_writes = %llu\n", *good_writes); - error = 0; - } + (void) zio_wait(zio); /* - * It's possible to have no good writes and no error if every vdev is in - * the CANT_OPEN state. + * Flush the uberblocks to disk. This ensures that the odd labels + * are no longer needed (because the new uberblocks and the even + * labels are safely on disk), so it is safe to overwrite them. */ - if (*good_writes == 0 && error == 0) - error = EIO; + zio = zio_root(spa, NULL, NULL, flags); - kmem_free(good_writes, sizeof (uint64_t)); - zio_buf_free(ubbuf, size); + for (int v = 0; v < svdcount; v++) + zio_flush(zio, svd[v]); - return (error); + (void) zio_wait(zio); + + return (good_writes >= 1 ? 0 : EIO); } /* - * Sync out an individual vdev. + * On success, increment the count of good writes for our top-level vdev. */ static void -vdev_sync_label_done(zio_t *zio) +vdev_label_sync_done(zio_t *zio) { - uint64_t *good_writes = zio->io_root->io_private; + uint64_t *good_writes = zio->io_private; if (zio->io_error == 0) atomic_add_64(good_writes, 1); } +/* + * If there weren't enough good writes, indicate failure to the parent. + */ +static void +vdev_label_sync_top_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_private; + + if (*good_writes == 0) + zio->io_error = EIO; + + kmem_free(good_writes, sizeof (uint64_t)); +} + +/* + * We ignore errors for log and cache devices, simply free the private data. 
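The slot selection above, n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1), relies on the uberblock count being a power of two, so consecutive txgs rotate through the ring and the oldest uberblock is always the one overwritten; on open, the best uberblock is simply the highest-ranked valid copy found anywhere in the ring. A sketch, assuming the 128-slot ring of a 512-byte-sector device (a 128 KB ring of 1 KB slots; the exact count depends on ashift):

#include <stdint.h>
#include <stdio.h>

#define UBERBLOCK_COUNT 128     /* must be a power of two */

static int
uberblock_slot(uint64_t txg)
{
    return (int)(txg & (UBERBLOCK_COUNT - 1));
}

int
main(void)
{
    for (uint64_t txg = 126; txg <= 130; txg++)
        printf("txg %llu -> slot %d\n",
            (unsigned long long)txg, uberblock_slot(txg));
    /* 126, 127, 0, 1, 2: the oldest uberblock is overwritten first. */
    return (0);
}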
+ */ +static void +vdev_label_sync_ignore_done(zio_t *zio) +{ + kmem_free(zio->io_private, sizeof (uint64_t)); +} + +/* + * Write all even or odd labels to all leaves of the specified vdev. + */ static void -vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg) +vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) { nvlist_t *label; vdev_phys_t *vp; char *buf; size_t buflen; - int c; - for (c = 0; c < vd->vdev_children; c++) - vdev_sync_label(zio, vd->vdev_child[c], l, txg); + for (int c = 0; c < vd->vdev_children; c++) + vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags); if (!vd->vdev_ops->vdev_op_leaf) return; - if (vdev_is_dead(vd)) + if (!vdev_writeable(vd)) return; /* @@ -813,107 +928,110 @@ vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg) buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); - if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) - vdev_label_write(zio, vd, l, vp, - offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), - vdev_sync_label_done, NULL); + if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) { + for (; l < VDEV_LABELS; l += 2) { + vdev_label_write(zio, vd, l, vp, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), + vdev_label_sync_done, zio->io_private, + flags | ZIO_FLAG_DONT_PROPAGATE); + } + } zio_buf_free(vp, sizeof (vdev_phys_t)); nvlist_free(label); - - dprintf("%s label %d txg %llu\n", vdev_description(vd), l, txg); } -static int -vdev_sync_labels(vdev_t *vd, int l, uint64_t txg) +int +vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) { - uint64_t *good_writes; + list_t *dl = &spa->spa_config_dirty_list; + vdev_t *vd; zio_t *zio; int error; - ASSERT(vd == vd->vdev_top); - - good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); - - zio = zio_root(vd->vdev_spa, NULL, good_writes, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); - /* - * Recursively kick off writes to all labels. + * Write the new labels to disk. */ - vdev_sync_label(zio, vd, l, txg); + zio = zio_root(spa, NULL, NULL, flags); + + for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { + uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t), + KM_SLEEP); + zio_t *vio = zio_null(zio, spa, + (vd->vdev_islog || vd->vdev_aux != NULL) ? + vdev_label_sync_ignore_done : vdev_label_sync_top_done, + good_writes, flags); + vdev_label_sync(vio, vd, l, txg, flags); + zio_nowait(vio); + } error = zio_wait(zio); - if (error && *good_writes != 0) { - dprintf("partial success: good_writes = %llu\n", *good_writes); - error = 0; - } + /* + * Flush the new labels to disk. + */ + zio = zio_root(spa, NULL, NULL, flags); - if (*good_writes == 0 && error == 0) - error = ENODEV; + for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) + zio_flush(zio, vd); - kmem_free(good_writes, sizeof (uint64_t)); + (void) zio_wait(zio); return (error); } /* - * Sync the entire vdev configuration. + * Sync the uberblock and any changes to the vdev configuration. * * The order of operations is carefully crafted to ensure that * if the system panics or loses power at any time, the state on disk * is still transactionally consistent. The in-line comments below * describe the failure semantics at each stage. * - * Moreover, it is designed to be idempotent: if spa_sync_labels() fails + * Moreover, vdev_config_sync() is designed to be idempotent: if it fails * at any time, you can just call it again, and it will resume its work. 
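The "l += 2" walk in vdev_label_sync() above is what makes the two-phase update possible: a pass with l = 0 touches only the even labels (L0, L2), a pass with l = 1 only the odd ones (L1, L3). A pseudo-driver for the full ordering described in the surrounding comments, with printf standing in for the real writes and cache flushes:

#include <stdio.h>

#define VDEV_LABELS 4

static void
write_labels(int first)     /* 0 = even (L0, L2), 1 = odd (L1, L3) */
{
    for (int l = first; l < VDEV_LABELS; l += 2)
        printf("write label L%d\n", l);
}

int
main(void)
{
    printf("flush txg data writes\n");
    write_labels(0);        /* die here: odd labels still valid */
    printf("flush\n");
    printf("write uberblocks\n");
    printf("flush\n");      /* odd labels now safely stale */
    write_labels(1);
    printf("flush\n");      /* config fully committed */
    return (0);
}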
*/ int -vdev_config_sync(vdev_t *uvd, uint64_t txg) +vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) { - spa_t *spa = uvd->vdev_spa; + spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; - vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; zio_t *zio; - int l, error; + int error; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; ASSERT(ub->ub_txg <= txg); /* - * If this isn't a resync due to I/O errors, and nothing changed - * in this transaction group, and the vdev configuration hasn't changed, + * If this isn't a resync due to I/O errors, + * and nothing changed in this transaction group, + * and the vdev configuration hasn't changed, * then there's nothing to do. */ - if (ub->ub_txg < txg && uberblock_update(ub, rvd, txg) == B_FALSE && - list_is_empty(&spa->spa_dirty_list)) { - dprintf("nothing to sync in %s in txg %llu\n", - spa_name(spa), txg); + if (ub->ub_txg < txg && + uberblock_update(ub, spa->spa_root_vdev, txg) == B_FALSE && + list_is_empty(&spa->spa_config_dirty_list)) return (0); - } if (txg > spa_freeze_txg(spa)) return (0); ASSERT(txg <= spa->spa_final_txg); - dprintf("syncing %s txg %llu\n", spa_name(spa), txg); - /* * Flush the write cache of every disk that's been written to * in this transaction group. This ensures that all blocks * written in this txg will be committed to stable storage * before any uberblock that references them. */ - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + zio = zio_root(spa, NULL, NULL, flags); + for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd; - vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) { - zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); - } + vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) + zio_flush(zio, vd); + (void) zio_wait(zio); /* @@ -921,34 +1039,15 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) * system dies in the middle of this process, that's OK: all of the * even labels that made it to disk will be newer than any uberblock, * and will therefore be considered invalid. The odd labels (L1, L3), - * which have not yet been touched, will still be valid. - */ - for (vd = list_head(&spa->spa_dirty_list); vd != NULL; - vd = list_next(&spa->spa_dirty_list, vd)) { - for (l = 0; l < VDEV_LABELS; l++) { - if (l & 1) - continue; - if ((error = vdev_sync_labels(vd, l, txg)) != 0) - return (error); - } - } - - /* - * Flush the new labels to disk. This ensures that all even-label - * updates are committed to stable storage before the uberblock update. + * which have not yet been touched, will still be valid. We flush + * the new labels to disk to ensure that all even-label updates + * are committed to stable storage before the uberblock update. */ - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); - for (vd = list_head(&spa->spa_dirty_list); vd != NULL; - vd = list_next(&spa->spa_dirty_list, vd)) { - zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); - } - (void) zio_wait(zio); + if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) + return (error); /* - * Sync the uberblocks to all vdevs in the tree specified by uvd. + * Sync the uberblocks to all vdevs in svd[]. 
* If the system dies in the middle of this step, there are two cases * to consider, and the on-disk state is consistent either way: * @@ -962,50 +1061,18 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) * been successfully committed) will be valid with respect * to the new uberblocks. */ - if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0) + if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) return (error); /* - * Flush the uberblocks to disk. This ensures that the odd labels - * are no longer needed (because the new uberblocks and the even - * labels are safely on disk), so it is safe to overwrite them. - */ - (void) zio_wait(zio_ioctl(NULL, spa, uvd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); - - /* * Sync out odd labels for every dirty vdev. If the system dies * in the middle of this process, the even labels and the new * uberblocks will suffice to open the pool. The next time * the pool is opened, the first thing we'll do -- before any * user data is modified -- is mark every vdev dirty so that - * all labels will be brought up to date. + * all labels will be brought up to date. We flush the new labels + * to disk to ensure that all odd-label updates are committed to + * stable storage before the next transaction group begins. */ - for (vd = list_head(&spa->spa_dirty_list); vd != NULL; - vd = list_next(&spa->spa_dirty_list, vd)) { - for (l = 0; l < VDEV_LABELS; l++) { - if ((l & 1) == 0) - continue; - if ((error = vdev_sync_labels(vd, l, txg)) != 0) - return (error); - } - } - - /* - * Flush the new labels to disk. This ensures that all odd-label - * updates are committed to stable storage before the next - * transaction group begins. - */ - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); - for (vd = list_head(&spa->spa_dirty_list); vd != NULL; - vd = list_next(&spa->spa_dirty_list, vd)) { - zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); - } - (void) zio_wait(zio); - - return (0); + return (vdev_label_sync_list(spa, 1, txg, flags)); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c index 73d1a83d9436..c4629ff45087 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
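The vdev_mirror.c hunks that follow stop freeing the mirror map by hand on every completion path; instead vdev_mirror_map_free() is registered as the zio's io_vsd_free destructor and the pipeline runs it exactly once. A minimal model of that ownership transfer (struct layout simplified):

#include <stdio.h>
#include <stdlib.h>

struct zio {
    void *io_vsd;                   /* vdev-specific data */
    void (*io_vsd_free)(struct zio *);
};

static void
map_free(struct zio *zio)
{
    printf("freeing mirror map\n");
    free(zio->io_vsd);
}

static void
zio_done(struct zio *zio)
{
    /* The pipeline, not each vdev done-path, runs the destructor. */
    if (zio->io_vsd != NULL && zio->io_vsd_free != NULL)
        zio->io_vsd_free(zio);
}

int
main(void)
{
    struct zio zio = { malloc(64), map_free };

    zio_done(&zio);
    return (0);
}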
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> @@ -39,8 +37,9 @@ typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; int mc_error; - short mc_tried; - short mc_skipped; + uint8_t mc_tried; + uint8_t mc_skipped; + uint8_t mc_speculative; } mirror_child_t; typedef struct mirror_map { @@ -53,6 +52,14 @@ typedef struct mirror_map { int vdev_mirror_shift = 21; +static void +vdev_mirror_map_free(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + + kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); +} + static mirror_map_t * vdev_mirror_map_alloc(zio_t *zio) { @@ -110,18 +117,10 @@ vdev_mirror_map_alloc(zio_t *zio) } zio->io_vsd = mm; + zio->io_vsd_free = vdev_mirror_map_free; return (mm); } -static void -vdev_mirror_map_free(zio_t *zio) -{ - mirror_map_t *mm = zio->io_vsd; - - kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); - zio->io_vsd = NULL; -} - static int vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { @@ -195,13 +194,6 @@ vdev_mirror_scrub_done(zio_t *zio) mc->mc_skipped = 0; } -static void -vdev_mirror_repair_done(zio_t *zio) -{ - ASSERT(zio->io_private == zio->io_parent); - vdev_mirror_map_free(zio->io_private); -} - /* * Try to find a child whose DTL doesn't contain the block we want to read. * If we can't, try the read on any vdev we haven't already tried. @@ -219,7 +211,7 @@ vdev_mirror_child_select(zio_t *zio) /* * Try to find a child whose DTL doesn't contain the block to read. * If a child is known to be completely inaccessible (indicated by - * vdev_is_dead() returning B_TRUE), don't even try. + * vdev_readable() returning B_FALSE), don't even try. */ for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { if (c >= mm->mm_children) @@ -227,7 +219,7 @@ vdev_mirror_child_select(zio_t *zio) mc = &mm->mm_child[c]; if (mc->mc_tried || mc->mc_skipped) continue; - if (vdev_is_dead(mc->mc_vd)) { + if (!vdev_readable(mc->mc_vd)) { mc->mc_error = ENXIO; mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; @@ -237,6 +229,7 @@ vdev_mirror_child_select(zio_t *zio) return (c); mc->mc_error = ESTALE; mc->mc_skipped = 1; + mc->mc_speculative = 1; } /* @@ -253,7 +246,7 @@ vdev_mirror_child_select(zio_t *zio) return (-1); } -static void +static int vdev_mirror_io_start(zio_t *zio) { mirror_map_t *mm; @@ -275,12 +268,10 @@ vdev_mirror_io_start(zio_t *zio) zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio_buf_alloc(zio->io_size), zio->io_size, - zio->io_type, zio->io_priority, - ZIO_FLAG_CANFAIL, + zio->io_type, zio->io_priority, 0, vdev_mirror_scrub_done, mc)); } - zio_wait_children_done(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } /* * For normal reads just pick one child. 
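A sketch of the child-selection loop above: start at the preferred child, skip anything already tried or skipped, treat unreadable children as ENXIO without issuing I/O, and mark DTL hits ESTALE and speculative. readable and dtl_contains are stand-in inputs for vdev_readable() and the DTL lookup:

#include <errno.h>
#include <stdio.h>

struct child {
    int tried, skipped, speculative, error;
    int readable;
    int dtl_contains;   /* block known-missing on this child? */
};

static int
child_select(struct child *mc, int children, int preferred)
{
    int c = preferred;

    for (int i = 0; i < children; i++, c++) {
        if (c >= children)
            c = 0;
        if (mc[c].tried || mc[c].skipped)
            continue;
        if (!mc[c].readable) {
            mc[c].error = ENXIO;
            mc[c].tried = 1;        /* don't even try */
            mc[c].skipped = 1;
            continue;
        }
        if (!mc[c].dtl_contains)
            return (c);
        mc[c].error = ESTALE;
        mc[c].skipped = 1;
        mc[c].speculative = 1;
    }
    return (-1);    /* caller falls back to any untried child */
}

int
main(void)
{
    struct child mc[3] = {
        { .readable = 0 },
        { .readable = 1, .dtl_contains = 1 },
        { .readable = 1, .dtl_contains = 0 },
    };

    printf("selected child %d\n", child_select(mc, 3, 0));  /* 2 */
    return (0);
}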
@@ -310,13 +301,27 @@ vdev_mirror_io_start(zio_t *zio) while (children--) { mc = &mm->mm_child[c]; zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, - zio->io_data, zio->io_size, zio->io_type, zio->io_priority, - ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc)); + mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_mirror_child_done, mc)); c++; } - zio_wait_children_done(zio); + return (ZIO_PIPELINE_CONTINUE); +} + +static int +vdev_mirror_worst_error(mirror_map_t *mm) +{ + int error[2] = { 0, 0 }; + + for (int c = 0; c < mm->mm_children; c++) { + mirror_child_t *mc = &mm->mm_child[c]; + int s = mc->mc_speculative; + error[s] = zio_worst_error(error[s], mc->mc_error); + } + + return (error[0] ? error[0] : error[1]); } static void @@ -328,41 +333,45 @@ vdev_mirror_io_done(zio_t *zio) int good_copies = 0; int unexpected_errors = 0; - zio->io_error = 0; - zio->io_numerrors = 0; - for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; - if (mc->mc_tried && mc->mc_error == 0) { - good_copies++; - continue; - } - - /* - * We preserve any EIOs because those may be worth retrying; - * whereas ECKSUM and ENXIO are more likely to be persistent. - */ if (mc->mc_error) { - if (zio->io_error != EIO) - zio->io_error = mc->mc_error; if (!mc->mc_skipped) unexpected_errors++; - zio->io_numerrors++; + } else if (mc->mc_tried) { + good_copies++; } } if (zio->io_type == ZIO_TYPE_WRITE) { /* * XXX -- for now, treat partial writes as success. - * XXX -- For a replacing vdev, we need to make sure the - * new child succeeds. + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate. */ /* XXPOLICY */ - if (good_copies != 0) - zio->io_error = 0; - vdev_mirror_map_free(zio); - zio_next_stage(zio); + if (good_copies != mm->mm_children) { + /* + * Always require at least one good copy. + * + * For ditto blocks (io_vd == NULL), require + * all copies to be good. + * + * XXX -- for replacing vdevs, there's no great answer. + * If the old device is really dead, we may not even + * be able to access it -- so we only want to + * require good writes to the new device. But if + * the new device turns out to be flaky, we want + * to be able to detach it -- which requires all + * writes to the old device to have succeeded. 
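vdev_mirror_worst_error() above keeps two running worst errors, one for real failures and one for speculative (ESTALE) skips, and only falls back to the speculative bucket when no real error exists. The sketch below assumes zio_worst_error() ranks errors by the { 0, ENXIO, ECKSUM, EIO } table in zio.c, with unlisted errors ranking worst:

#include <errno.h>
#include <stdio.h>

static int
worst_error(int e1, int e2)
{
    static const int rank[] = { 0, ENXIO, ECKSUM, EIO };
    int r1, r2;

    for (r1 = 0; r1 < 4; r1++)
        if (e1 == rank[r1])
            break;
    for (r2 = 0; r2 < 4; r2++)
        if (e2 == rank[r2])
            break;
    return (r1 > r2 ? e1 : e2);     /* unlisted errors rank worst */
}

struct child { int speculative, error; };

static int
mirror_worst_error(struct child *mc, int children)
{
    int error[2] = { 0, 0 };

    for (int c = 0; c < children; c++)
        error[mc[c].speculative] =
            worst_error(error[mc[c].speculative], mc[c].error);
    return (error[0] ? error[0] : error[1]);
}

int
main(void)
{
    struct child mc[2] = { { 1, ESTALE }, { 0, ENXIO } };

    /* The real ENXIO outranks the speculative ESTALE. */
    printf("%d\n", mirror_worst_error(mc, 2));
    return (0);
}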
+ */ + if (good_copies == 0 || zio->io_vd == NULL) + zio->io_error = vdev_mirror_worst_error(mm); + } return; } @@ -375,40 +384,27 @@ vdev_mirror_io_done(zio_t *zio) if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { ASSERT(c >= 0 && c < mm->mm_children); mc = &mm->mm_child[c]; - dprintf("retrying i/o (err=%d) on child %s\n", - zio->io_error, vdev_description(mc->mc_vd)); - zio->io_error = 0; zio_vdev_io_redone(zio); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, - ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL, + ZIO_TYPE_READ, zio->io_priority, 0, vdev_mirror_child_done, mc)); - zio_wait_children_done(zio); return; } /* XXPOLICY */ - if (good_copies) - zio->io_error = 0; - else + if (good_copies == 0) { + zio->io_error = vdev_mirror_worst_error(mm); ASSERT(zio->io_error != 0); + } if (good_copies && (spa_mode & FWRITE) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER) || ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { - zio_t *rio; - /* * Use the good data we have in hand to repair damaged children. - * - * We issue all repair I/Os as children of 'rio' to arrange - * that vdev_mirror_map_free(zio) will be invoked after all - * repairs complete, but before we advance to the next stage. */ - rio = zio_null(zio, zio->io_spa, - vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL); - for (c = 0; c < mm->mm_children; c++) { /* * Don't rewrite known good children. @@ -429,24 +425,13 @@ vdev_mirror_io_done(zio_t *zio) mc->mc_error = ESTALE; } - dprintf("resilvered %s @ 0x%llx error %d\n", - vdev_description(mc->mc_vd), mc->mc_offset, - mc->mc_error); - - zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd, - mc->mc_offset, zio->io_data, zio->io_size, + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, + zio->io_data, zio->io_size, ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE, NULL, NULL)); + ZIO_FLAG_IO_REPAIR, NULL, NULL)); } - - zio_nowait(rio); - zio_wait_children_done(zio); - return; } - - vdev_mirror_map_free(zio); - zio_next_stage(zio); } static void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c index b35f4a5bcd03..731f7d3dcec9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * The 'missing' vdev is a special vdev type used only during import. 
It * signifies a placeholder in the root vdev for some vdev that we know is @@ -63,18 +60,17 @@ vdev_missing_close(vdev_t *vd) } /* ARGSUSED */ -static void +static int vdev_missing_io_start(zio_t *zio) { zio->io_error = ENOTSUP; - zio_next_stage_async(zio); + return (ZIO_PIPELINE_CONTINUE); } /* ARGSUSED */ static void vdev_missing_io_done(zio_t *zio) { - zio_next_stage(zio); } vdev_ops_t vdev_missing_ops = { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c index 8ef524f71931..cd4d5aef241f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> @@ -55,6 +53,25 @@ int zfs_vdev_ramp_rate = 2; */ int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; +SYSCTL_DECL(_vfs_zfs_vdev); +TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RDTUN, + &zfs_vdev_max_pending, 0, "Maximum I/O requests pending on each device"); +TUNABLE_INT("vfs.zfs.vdev.min_pending", &zfs_vdev_min_pending); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RDTUN, + &zfs_vdev_min_pending, 0, + "Initial number of I/O requests pending to each device"); +TUNABLE_INT("vfs.zfs.vdev.time_shift", &zfs_vdev_time_shift); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, time_shift, CTLFLAG_RDTUN, + &zfs_vdev_time_shift, 0, "Used for calculating I/O request deadline"); +TUNABLE_INT("vfs.zfs.vdev.ramp_rate", &zfs_vdev_ramp_rate); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, ramp_rate, CTLFLAG_RDTUN, + &zfs_vdev_ramp_rate, 0, "Exponential I/O issue ramp-up rate"); +TUNABLE_INT("vfs.zfs.vdev.aggregation_limit", &zfs_vdev_aggregation_limit); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RDTUN, + &zfs_vdev_aggregation_limit, 0, + "I/O requests are aggregated up to this size"); + /* * Virtual device vector for disk I/O scheduling. 
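The new SYSCTL_INT knobs above are CTLFLAG_RDTUN: readable at runtime via sysctl(8) but settable only as boot-time tunables, which is why each one is paired with a TUNABLE_INT. For example, in /boot/loader.conf (values illustrative, not recommendations):

# CTLFLAG_RDTUN sysctls are set at boot, not at runtime
vfs.zfs.vdev.max_pending="35"
vfs.zfs.vdev.aggregation_limit="131072"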
*/ @@ -162,7 +179,7 @@ vdev_queue_agg_io_done(zio_t *aio) aio->io_delegate_list = dio->io_delegate_next; dio->io_delegate_next = NULL; dio->io_error = aio->io_error; - zio_next_stage(dio); + zio_execute(dio); } ASSERT3U(offset, ==, aio->io_size); @@ -172,11 +189,8 @@ vdev_queue_agg_io_done(zio_t *aio) #define IS_ADJACENT(io, nio) \ ((io)->io_offset + (io)->io_size == (nio)->io_offset) -typedef void zio_issue_func_t(zio_t *); - static zio_t * -vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, - zio_issue_func_t **funcp) +vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { zio_t *fio, *lio, *aio, *dio; avl_tree_t *tree; @@ -184,8 +198,6 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, ASSERT(MUTEX_HELD(&vq->vq_lock)); - *funcp = NULL; - if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || avl_numnodes(&vq->vq_deadline_tree) == 0) return (NULL); @@ -196,6 +208,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, size = fio->io_size; while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && + !((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && size + dio->io_size <= zfs_vdev_aggregation_limit) { dio->io_delegate_next = fio; fio = dio; @@ -203,6 +216,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, } while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && + !((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && size + dio->io_size <= zfs_vdev_aggregation_limit) { lio->io_delegate_next = dio; lio = dio; @@ -212,15 +226,12 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, if (fio != lio) { char *buf = zio_buf_alloc(size); uint64_t offset = 0; - int nagg = 0; ASSERT(size <= zfs_vdev_aggregation_limit); - aio = zio_vdev_child_io(fio, NULL, fio->io_vd, - fio->io_offset, buf, size, fio->io_type, - ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_NOBOOKMARK, + aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, + buf, size, fio->io_type, ZIO_PRIORITY_NOW, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_delegate_list = fio; @@ -233,19 +244,12 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, offset += dio->io_size; vdev_queue_io_remove(vq, dio); zio_vdev_io_bypass(dio); - nagg++; } ASSERT(offset == size); - dprintf("%5s T=%llu off=%8llx agg=%3d " - "old=%5llx new=%5llx\n", - zio_type_name[fio->io_type], - fio->io_deadline, fio->io_offset, nagg, fio->io_size, size); - avl_add(&vq->vq_pending_tree, aio); - *funcp = zio_nowait; return (aio); } @@ -254,8 +258,6 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, avl_add(&vq->vq_pending_tree, fio); - *funcp = zio_next_stage; - return (fio); } @@ -264,7 +266,6 @@ vdev_queue_io(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; - zio_issue_func_t *func; ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); @@ -280,42 +281,45 @@ vdev_queue_io(zio_t *zio) mutex_enter(&vq->vq_lock); - zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) + - zio->io_priority; + zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority; vdev_queue_io_add(vq, zio); - nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func); + nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending); mutex_exit(&vq->vq_lock); - if (nio == NULL || func != zio_nowait) - return (nio); + if (nio == NULL) + return (NULL); + + if (nio->io_done == 
vdev_queue_agg_io_done) { + zio_nowait(nio); + return (NULL); + } - func(nio); - return (NULL); + return (nio); } void vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; - zio_t *nio; - zio_issue_func_t *func; - int i; mutex_enter(&vq->vq_lock); avl_remove(&vq->vq_pending_tree, zio); - for (i = 0; i < zfs_vdev_ramp_rate; i++) { - nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func); + for (int i = 0; i < zfs_vdev_ramp_rate; i++) { + zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); if (nio == NULL) break; mutex_exit(&vq->vq_lock); - if (func == zio_next_stage) + if (nio->io_done == vdev_queue_agg_io_done) { + zio_nowait(nio); + } else { zio_vdev_io_reissue(nio); - func(nio); + zio_execute(nio); + } mutex_enter(&vq->vq_lock); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c index 0c866307653b..69e314468ee4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> @@ -194,6 +192,18 @@ vdev_raidz_exp2(uint_t a, int exp) return (vdev_raidz_pow2[exp]); } +static void +vdev_raidz_map_free(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + int c; + + for (c = 0; c < rm->rm_firstdatacol; c++) + zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + + kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); +} + static raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, uint64_t nparity) @@ -276,23 +286,11 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, } zio->io_vsd = rm; + zio->io_vsd_free = vdev_raidz_map_free; return (rm); } static void -vdev_raidz_map_free(zio_t *zio) -{ - raidz_map_t *rm = zio->io_vsd; - int c; - - for (c = 0; c < rm->rm_firstdatacol; c++) - zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); - - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); - zio->io_vsd = NULL; -} - -static void vdev_raidz_generate_parity_p(raidz_map_t *rm) { uint64_t *p, *src, pcount, ccount, i; @@ -632,14 +630,7 @@ vdev_raidz_child_done(zio_t *zio) rc->rc_skipped = 0; } -static void -vdev_raidz_repair_done(zio_t *zio) -{ - ASSERT(zio->io_private == zio->io_parent); - vdev_raidz_map_free(zio->io_private); -} - -static void +static int vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -669,11 +660,11 @@ vdev_raidz_io_start(zio_t *zio) cvd = vd->vdev_child[rc->rc_devidx]; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, - zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, + zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } - zio_wait_children_done(zio); - return; + + return (ZIO_PIPELINE_CONTINUE); } ASSERT(zio->io_type == ZIO_TYPE_READ); @@ -686,7 +677,7 @@ vdev_raidz_io_start(zio_t *zio) for (c = rm->rm_cols - 1; c >= 0; c--) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; - if (vdev_is_dead(cvd)) { + if (!vdev_readable(cvd)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else @@ -709,12 +700,12 @@ vdev_raidz_io_start(zio_t *zio) (zio->io_flags & ZIO_FLAG_SCRUB)) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, 
rc->rc_data, rc->rc_size, - zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, + zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } } - zio_wait_children_done(zio); + return (ZIO_PIPELINE_CONTINUE); } /* @@ -724,8 +715,6 @@ static void raidz_checksum_error(zio_t *zio, raidz_col_t *rc) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; - dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", - vdev_description(vd)); if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { mutex_enter(&vd->vdev_stat_lock); @@ -783,6 +772,17 @@ static uint64_t raidz_corrected_p; static uint64_t raidz_corrected_q; static uint64_t raidz_corrected_pq; +static int +vdev_raidz_worst_error(raidz_map_t *rm) +{ + int error = 0; + + for (int c = 0; c < rm->rm_cols; c++) + error = zio_worst_error(error, rm->rm_col[c].rc_error); + + return (error); +} + static void vdev_raidz_io_done(zio_t *zio) { @@ -794,26 +794,19 @@ vdev_raidz_io_done(zio_t *zio) int parity_errors = 0; int parity_untried = 0; int data_errors = 0; + int total_errors = 0; int n, c, c1; ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ - zio->io_error = 0; - zio->io_numerrors = 0; - ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; - /* - * We preserve any EIOs because those may be worth retrying; - * whereas ECKSUM and ENXIO are more likely to be persistent. - */ if (rc->rc_error) { - if (zio->io_error != EIO) - zio->io_error = rc->rc_error; + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ if (c < rm->rm_firstdatacol) parity_errors++; @@ -823,7 +816,7 @@ vdev_raidz_io_done(zio_t *zio) if (!rc->rc_skipped) unexpected_errors++; - zio->io_numerrors++; + total_errors++; } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { parity_untried++; } @@ -831,16 +824,19 @@ vdev_raidz_io_done(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE) { /* - * If this is not a failfast write, and we were able to - * write enough columns to reconstruct the data, good enough. + * XXX -- for now, treat partial writes as a success. + * (If we couldn't write enough columns to reconstruct + * the data, the I/O failed. Otherwise, good enough.) + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate. */ /* XXPOLICY */ - if (zio->io_numerrors <= rm->rm_firstdatacol && - !(zio->io_flags & ZIO_FLAG_FAILFAST)) - zio->io_error = 0; + if (total_errors > rm->rm_firstdatacol) + zio->io_error = vdev_raidz_worst_error(rm); - vdev_raidz_map_free(zio); - zio_next_stage(zio); return; } @@ -862,12 +858,10 @@ vdev_raidz_io_done(zio_t *zio) * has a valid checksum. Naturally, this case applies in the absence of * any errors. 
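The rewritten accounting above reduces the old io_numerrors bookkeeping to total_errors, compared against rm_firstdatacol, the number of parity columns. A compressed sketch of the resulting verdicts, folding the write-path check above and the ECKSUM inference in the hunk below into one function (EIO stands in for vdev_raidz_worst_error(), and the read and write paths are conflated for brevity):

#include <errno.h>
#include <stdio.h>

static int
raidz_verdict(int total_errors, int nparity, int reconstructed_ok)
{
    if (total_errors > nparity)
        return (EIO);       /* more bad columns than parity can cover */
    if (reconstructed_ok)
        return (0);         /* clean read, or reconstruction succeeded */
    /* Enough parity existed, yet no combination checksummed: at least
     * one column must have returned bad data silently. */
    return (ECKSUM);
}

int
main(void)
{
    printf("%d\n", raidz_verdict(1, 2, 1));     /* 0: recoverable */
    printf("%d\n", raidz_verdict(3, 2, 0));     /* EIO */
    printf("%d\n", raidz_verdict(2, 2, 0));     /* ECKSUM */
    return (0);
}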
*/ - if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) { + if (total_errors <= rm->rm_firstdatacol - parity_untried) { switch (data_errors) { case 0: if (zio_checksum_error(zio) == 0) { - zio->io_error = 0; - /* * If we read parity information (unnecessarily * as it happens since no reconstruction was @@ -919,7 +913,6 @@ vdev_raidz_io_done(zio_t *zio) } if (zio_checksum_error(zio) == 0) { - zio->io_error = 0; if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) atomic_inc_64(&raidz_corrected_p); else @@ -981,9 +974,7 @@ vdev_raidz_io_done(zio_t *zio) vdev_raidz_reconstruct_pq(rm, c1, c); if (zio_checksum_error(zio) == 0) { - zio->io_error = 0; atomic_inc_64(&raidz_corrected_pq); - goto done; } break; @@ -1009,7 +1000,6 @@ vdev_raidz_io_done(zio_t *zio) if (rm->rm_col[c].rc_tried) continue; - zio->io_error = 0; zio_vdev_io_redone(zio); do { rc = &rm->rm_col[c]; @@ -1018,11 +1008,10 @@ vdev_raidz_io_done(zio_t *zio) zio_nowait(zio_vdev_child_io(zio, NULL, vd->vdev_child[rc->rc_devidx], rc->rc_offset, rc->rc_data, rc->rc_size, - zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, + zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } while (++c < rm->rm_cols); - dprintf("rereading\n"); - zio_wait_children_done(zio); + return; } @@ -1034,8 +1023,15 @@ vdev_raidz_io_done(zio_t *zio) * in absent data. Before we attempt combinatorial reconstruction make * sure we have a chance of coming up with the right answer. */ - if (zio->io_numerrors >= rm->rm_firstdatacol) { - ASSERT(zio->io_error != 0); + if (total_errors >= rm->rm_firstdatacol) { + zio->io_error = vdev_raidz_worst_error(rm); + /* + * If there were exactly as many device errors as parity + * columns, yet we couldn't reconstruct the data, then at + * least one device must have returned bad data silently. + */ + if (total_errors == rm->rm_firstdatacol) + zio->io_error = zio_worst_error(zio->io_error, ECKSUM); goto done; } @@ -1053,7 +1049,6 @@ vdev_raidz_io_done(zio_t *zio) if (zio_checksum_error(zio) == 0) { zio_buf_free(orig, rc->rc_size); - zio->io_error = 0; atomic_inc_64(&raidz_corrected_p); /* @@ -1085,7 +1080,6 @@ vdev_raidz_io_done(zio_t *zio) if (zio_checksum_error(zio) == 0) { zio_buf_free(orig, rc->rc_size); - zio->io_error = 0; atomic_inc_64(&raidz_corrected_q); /* @@ -1127,7 +1121,6 @@ vdev_raidz_io_done(zio_t *zio) if (zio_checksum_error(zio) == 0) { zio_buf_free(orig, rc->rc_size); zio_buf_free(orig1, rc1->rc_size); - zio->io_error = 0; atomic_inc_64(&raidz_corrected_pq); /* @@ -1159,6 +1152,7 @@ vdev_raidz_io_done(zio_t *zio) * all children. */ zio->io_error = ECKSUM; + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; @@ -1173,18 +1167,9 @@ done: if (zio->io_error == 0 && (spa_mode & FWRITE) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { - zio_t *rio; - /* * Use the good data we have in hand to repair damaged children. - * - * We issue all repair I/Os as children of 'rio' to arrange - * that vdev_raidz_map_free(zio) will be invoked after all - * repairs complete, but before we advance to the next stage. 
*/ - rio = zio_null(zio, zio->io_spa, - vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL); - for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; @@ -1192,25 +1177,12 @@ done: if (rc->rc_error == 0) continue; - dprintf("%s resilvered %s @ 0x%llx error %d\n", - vdev_description(vd), - vdev_description(cvd), - zio->io_offset, rc->rc_error); - - zio_nowait(zio_vdev_child_io(rio, NULL, cvd, + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_CANFAIL, NULL, NULL)); + ZIO_FLAG_IO_REPAIR, NULL, NULL)); } - - zio_nowait(rio); - zio_wait_children_done(zio); - return; } - - vdev_raidz_map_free(zio); - zio_next_stage(zio); } static void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c index 0e8752c6ce83..88383f002b80 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> @@ -44,18 +42,17 @@ * probably fine. Adding bean counters during alloc/free can make this * future guesswork more accurate. */ -/*ARGSUSED*/ static int too_many_errors(vdev_t *vd, int numerrors) { + ASSERT3U(numerrors, <=, vd->vdev_children); return (numerrors > 0); } static int vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { - vdev_t *cvd; - int c, error; + int c; int lasterror = 0; int numerrors = 0; @@ -65,9 +62,11 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) } for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_t *cvd = vd->vdev_child[c]; + int error; - if ((error = vdev_open(cvd)) != 0) { + if ((error = vdev_open(cvd)) != 0 && + !cvd->vdev_islog) { lasterror = error; numerrors++; continue; @@ -97,13 +96,14 @@ vdev_root_close(vdev_t *vd) static void vdev_root_state_change(vdev_t *vd, int faulted, int degraded) { - if (too_many_errors(vd, faulted)) + if (too_many_errors(vd, faulted)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); - else if (degraded != 0) + } else if (degraded) { vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - else + } else { vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + } } vdev_ops_t vdev_root_ops = { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c index 4246ec0b0e6c..90fe3d094318 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -44,6 +44,7 @@ #include <sys/spa.h> #include <sys/dmu.h> #include <sys/zfs_context.h> +#include <sys/zfs_znode.h> #include <sys/zap.h> #include <sys/refcount.h> #include <sys/zap_impl.h> @@ -103,6 +104,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx) zp->zap_num_leafs = 1; zp->zap_num_entries = 0; zp->zap_salt = zap->zap_salt; + zp->zap_normflags = zap->zap_normflags; /* block 1 will be the first leaf */ for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++) @@ -119,7 +121,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx) l->l_dbuf = db; l->l_phys = db->db_data; - zap_leaf_init(l); + zap_leaf_init(l, zp->zap_normflags != 0); kmem_free(l, sizeof (zap_leaf_t)); dmu_buf_rele(db, FTAG); @@ -399,7 +401,7 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) ASSERT(winner == NULL); dmu_buf_will_dirty(l->l_dbuf, tx); - zap_leaf_init(l); + zap_leaf_init(l, zap->zap_normflags != 0); zap->zap_f.zap_phys->zap_num_leafs++; @@ -580,9 +582,10 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) } static int -zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx, - zap_leaf_t **lp) +zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp) { + zap_t *zap = zn->zn_zap; + uint64_t hash = zn->zn_hash; zap_leaf_t *nl; int prefix_diff, i, err; uint64_t sibling; @@ -602,7 +605,9 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx, zap_put_leaf(l); zap_unlockdir(zap); - err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap); + err = zap_lockdir(os, object, tx, RW_WRITER, + FALSE, FALSE, &zn->zn_zap); + zap = zn->zn_zap; if (err) return (err); ASSERT(!zap->zap_ismicro); @@ -643,7 +648,7 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx, } nl = zap_create_leaf(zap, tx); - zap_leaf_split(l, nl); + zap_leaf_split(l, nl, zap->zap_normflags != 0); /* set sibling pointers */ for (i = 0; i < (1ULL<<prefix_diff); i++) { @@ -664,8 +669,9 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx, } static void -zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx) +zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) { + zap_t *zap = zn->zn_zap; int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift && l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); @@ -685,7 +691,8 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx) zap_unlockdir(zap); err = zap_lockdir(os, zapobj, tx, - RW_WRITER, FALSE, &zap); + RW_WRITER, FALSE, FALSE, &zn->zn_zap); + zap = zn->zn_zap; if (err) return; } @@ -721,53 +728,58 @@ fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers) } /* - * Routines for maniplulating attributes. + * Routines for manipulating attributes. 
*/ int -fzap_lookup(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf) +fzap_lookup(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, void *buf, + char *realname, int rn_len, boolean_t *ncp) { zap_leaf_t *l; int err; - uint64_t hash; zap_entry_handle_t zeh; - err = fzap_checksize(name, integer_size, num_integers); + err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); if (err != 0) return (err); - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l); + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) return (err); - err = zap_leaf_lookup(l, name, hash, &zeh); - if (err == 0) + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { err = zap_entry_read(&zeh, integer_size, num_integers, buf); + (void) zap_entry_read_name(&zeh, rn_len, realname); + if (ncp) { + *ncp = zap_entry_normalization_conflict(&zeh, + zn, NULL, zn->zn_zap); + } + } zap_put_leaf(l); return (err); } int -fzap_add_cd(zap_t *zap, const char *name, +fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, dmu_tx_t *tx) { zap_leaf_t *l; - uint64_t hash; int err; zap_entry_handle_t zeh; + zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(!zap->zap_ismicro); - ASSERT(fzap_checksize(name, integer_size, num_integers) == 0); + ASSERT(fzap_checksize(zn->zn_name_orij, + integer_size, num_integers) == 0); - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); retry: - err = zap_leaf_lookup(l, name, hash, &zeh); + err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { err = EEXIST; goto out; @@ -775,63 +787,62 @@ retry: if (err != ENOENT) goto out; - err = zap_entry_create(l, name, hash, cd, + err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, cd, integer_size, num_integers, val, &zeh); if (err == 0) { zap_increment_num_entries(zap, 1, tx); } else if (err == EAGAIN) { - err = zap_expand_leaf(zap, l, hash, tx, &l); + err = zap_expand_leaf(zn, l, tx, &l); + zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } out: - zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); + if (zap != NULL) + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); return (err); } int -fzap_add(zap_t *zap, const char *name, +fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { - int err = fzap_checksize(name, integer_size, num_integers); + int err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); if (err != 0) return (err); - return (fzap_add_cd(zap, name, integer_size, num_integers, + return (fzap_add_cd(zn, integer_size, num_integers, val, ZAP_MAXCD, tx)); } int -fzap_update(zap_t *zap, const char *name, +fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_leaf_t *l; - uint64_t hash; int err, create; zap_entry_handle_t zeh; + zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - err = fzap_checksize(name, integer_size, num_integers); + err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); if (err != 0) return (err); - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); retry: - err = zap_leaf_lookup(l, name, hash, &zeh); + err = zap_leaf_lookup(l, zn, &zeh); create = (err 
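With the fzap entry points above rekeyed on zap_name_t, every caller follows the same lifecycle: build the name (which precomputes the hash over the normalized form), call the fzap routine, free the name. A minimal consumer sketch, assuming the zap_name_alloc()/zap_name_free() pair that appears later in this diff (zap_micro.c) and eliding the zap_lockdir() locking that real callers hold:

/*
 * Sketch, not part of the patch: driving fzap_lookup() by hand.
 * MT_EXACT asks for a byte-for-byte match; NULL/0 for the realname
 * arguments mirrors what the plain zap_lookup() wrapper passes.
 */
static int
example_fzap_lookup(zap_t *zap, const char *name, uint64_t *valp)
{
	zap_name_t *zn;
	int err;

	zn = zap_name_alloc(zap, name, MT_EXACT);
	if (zn == NULL)
		return (ENOTSUP);

	err = fzap_lookup(zn, 8, 1, valp, NULL, 0, NULL);
	zap_name_free(zn);
	return (err);
}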
== ENOENT); ASSERT(err == 0 || err == ENOENT); - /* XXX If this leaf is chained, split it if we can. */ - if (create) { - err = zap_entry_create(l, name, hash, ZAP_MAXCD, - integer_size, num_integers, val, &zeh); + err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, + ZAP_MAXCD, integer_size, num_integers, val, &zeh); if (err == 0) zap_increment_num_entries(zap, 1, tx); } else { @@ -839,29 +850,29 @@ retry: } if (err == EAGAIN) { - err = zap_expand_leaf(zap, l, hash, tx, &l); + err = zap_expand_leaf(zn, l, tx, &l); + zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } - zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); + if (zap != NULL) + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); return (err); } int -fzap_length(zap_t *zap, const char *name, +fzap_length(zap_name_t *zn, uint64_t *integer_size, uint64_t *num_integers) { zap_leaf_t *l; int err; - uint64_t hash; zap_entry_handle_t zeh; - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l); + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) return (err); - err = zap_leaf_lookup(l, name, hash, &zeh); + err = zap_leaf_lookup(l, zn, &zeh); if (err != 0) goto out; @@ -875,40 +886,44 @@ out: } int -fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx) +fzap_remove(zap_name_t *zn, dmu_tx_t *tx) { zap_leaf_t *l; - uint64_t hash; int err; zap_entry_handle_t zeh; - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); - err = zap_leaf_lookup(l, name, hash, &zeh); + err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { zap_entry_remove(&zeh); - zap_increment_num_entries(zap, -1, tx); + zap_increment_num_entries(zn->zn_zap, -1, tx); } zap_put_leaf(l); - dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n", - zap->zap_objset, zap->zap_object, name, err); return (err); } +/* + * Helper functions for consumers. 
+ */ + int -zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name) +zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, + char *name) { zap_cursor_t zc; zap_attribute_t *za; int err; + if (mask == 0) + mask = -1ULL; + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); for (zap_cursor_init(&zc, os, zapobj); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { - if (ZFS_DIRENT_OBJ(za->za_first_integer) == value) { + if ((za->za_first_integer & mask) == (value & mask)) { (void) strcpy(name, za->za_name); break; } @@ -918,6 +933,53 @@ zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name) return (err); } +int +zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err; + + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + if (za.za_integer_length != 8 || za.za_num_integers != 1) + return (EINVAL); + err = zap_add(os, intoobj, za.za_name, + 8, 1, &za.za_first_integer, tx); + if (err) + return (err); + } + zap_cursor_fini(&zc); + return (0); +} + +int +zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); + return (zap_add(os, obj, name, 8, 1, &value, tx)); +} + +int +zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); + return (zap_remove(os, obj, name, tx)); +} + +int +zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); + return (zap_lookup(os, obj, name, 8, 1, &value)); +} /* * Routines for iterating over the attributes. @@ -983,6 +1045,10 @@ again: err = zap_entry_read_name(&zeh, sizeof (za->za_name), za->za_name); ASSERT(err == 0); + + za->za_normalization_conflict = + zap_entry_normalization_conflict(&zeh, + NULL, za->za_name, zap); } rw_exit(&zc->zc_leaf->l_rwlock); return (err); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c index 5dff5145308a..da498b6bc9e3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,6 +38,8 @@ #include <sys/spa.h> #include <sys/dmu.h> +static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); + #define CHAIN_END 0xffff /* end of the chunk chain */ /* half the (current) minimum block size */ @@ -150,7 +152,7 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) } void -zap_leaf_init(zap_leaf_t *l) +zap_leaf_init(zap_leaf_t *l, boolean_t sort) { int i; @@ -165,6 +167,8 @@ zap_leaf_init(zap_leaf_t *l) l->l_phys->l_hdr.lh_block_type = ZBT_LEAF; l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC; l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); + if (sort) + l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; } /* @@ -327,19 +331,30 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, /* * Only to be used on 8-bit arrays. * array_len is actual len in bytes (not encoded le_value_length). - * buf is null-terminated. + * namenorm is null-terminated. 
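The new consumer helpers above (zap_join(), zap_add_int(), zap_remove_int(), zap_lookup_int()) key each entry on the hex rendering of a 64-bit value, turning a ZAP object into a simple set of numbers. A usage sketch, assuming setobj is an existing ZAP object and tx an assigned transaction:

/* Sketch: treating a ZAP object as a set of uint64_t members. */
uint64_t member = 1234;		/* stored under the name "4d2" */
int err;

err = zap_add_int(os, setobj, member, tx);	/* insert */
err = zap_lookup_int(os, setobj, member);	/* 0 if present, ENOENT if not */
err = zap_remove_int(os, setobj, member, tx);	/* remove it again */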
*/ -static int -zap_leaf_array_equal(zap_leaf_t *l, int chunk, - int array_len, const char *buf) +static boolean_t +zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_len) { int bseen = 0; + if (zn->zn_matchtype == MT_FIRST) { + char *thisname = kmem_alloc(array_len, KM_SLEEP); + boolean_t match; + + zap_leaf_array_read(l, chunk, 1, array_len, 1, + array_len, thisname); + match = zap_match(zn, thisname); + kmem_free(thisname, array_len); + return (match); + } + + /* Fast path for exact matching */ while (bseen < array_len) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - if (bcmp(la->la_array, buf + bseen, toread)) + if (bcmp(la->la_array, zn->zn_name_orij + bseen, toread)) break; chunk = la->la_next; bseen += toread; @@ -352,15 +367,15 @@ zap_leaf_array_equal(zap_leaf_t *l, int chunk, */ int -zap_leaf_lookup(zap_leaf_t *l, - const char *name, uint64_t h, zap_entry_handle_t *zeh) +zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) { uint16_t *chunkp; struct zap_leaf_entry *le; ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - for (chunkp = LEAF_HASH_ENTPTR(l, h); +again: + for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); *chunkp != CHAIN_END; chunkp = &le->le_next) { uint16_t chunk = *chunkp; le = ZAP_LEAF_ENTRY(l, chunk); @@ -368,11 +383,18 @@ zap_leaf_lookup(zap_leaf_t *l, ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - if (le->le_hash != h) + if (le->le_hash != zn->zn_hash) continue; - if (zap_leaf_array_equal(l, le->le_name_chunk, - le->le_name_length, name)) { + /* + * NB: the entry chain is always sorted by cd on + * normalized zap objects, so this will find the + * lowest-cd match for MT_FIRST. + */ + ASSERT(zn->zn_matchtype == MT_EXACT || + (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); + if (zap_leaf_array_match(l, zn, le->le_name_chunk, + le->le_name_length)) { zeh->zeh_num_integers = le->le_value_length; zeh->zeh_integer_size = le->le_int_size; zeh->zeh_cd = le->le_cd; @@ -383,6 +405,15 @@ zap_leaf_lookup(zap_leaf_t *l, } } + /* + * NB: we could of course do this in one pass, but that would be + * a pain. We'll see if MT_BEST is even used much. + */ + if (zn->zn_matchtype == MT_BEST) { + zn->zn_matchtype = MT_FIRST; + goto again; + } + return (ENOENT); } @@ -539,22 +570,41 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, return (E2BIG); if (cd == ZAP_MAXCD) { - for (cd = 0; cd < ZAP_MAXCD; cd++) { + /* find the lowest unused cd */ + if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { + cd = 0; + for (chunk = *LEAF_HASH_ENTPTR(l, h); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); - if (le->le_hash == h && - le->le_cd == cd) { + if (le->le_cd > cd) break; + if (le->le_hash == h) { + ASSERT3U(cd, ==, le->le_cd); + cd++; } } - /* If this cd is not in use, we are good. */ - if (chunk == CHAIN_END) - break; + } else { + /* old unsorted format; do it the O(n^2) way */ + for (cd = 0; cd < ZAP_MAXCD; cd++) { + for (chunk = *LEAF_HASH_ENTPTR(l, h); + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(l, chunk); + if (le->le_hash == h && + le->le_cd == cd) { + break; + } + } + /* If this cd is not in use, we are good. */ + if (chunk == CHAIN_END) + break; + } } - /* If we tried all the cd's, we lose. 
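The reworked zap_leaf_lookup() above is where the three match types get their operational meaning. A summary, as inferred from this hunk together with zap_match() later in zap_micro.c:

/*
 * Match-type behaviour (summary of the code above, not new logic):
 *
 *   MT_EXACT - compare the original, un-normalized name byte for
 *              byte (the fast path in zap_leaf_array_match()).
 *   MT_BEST  - first pass behaves like MT_EXACT; if nothing matches,
 *              zap_leaf_lookup() flips zn_matchtype to MT_FIRST and
 *              rescans (the "goto again").
 *   MT_FIRST - accept the first entry whose normalized form matches;
 *              on cd-sorted leaves that is the lowest-cd match.
 */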
*/ - if (cd == ZAP_MAXCD) - return (ENOSPC); + /* + * we would run out of space in a block before we could + * have ZAP_MAXCD entries + */ + ASSERT3U(cd, <, ZAP_MAXCD); } if (l->l_phys->l_hdr.lh_nfree < numchunks) @@ -574,9 +624,8 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, le->le_cd = cd; /* link it into the hash chain */ - chunkp = LEAF_HASH_ENTPTR(l, h); - le->le_next = *chunkp; - *chunkp = chunk; + /* XXX if we did the search above, we could just use that */ + chunkp = zap_leaf_rehash_entry(l, chunk); l->l_phys->l_hdr.lh_nentries++; @@ -591,16 +640,76 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, } /* + * Determine if there is another entry with the same normalized form. + * For performance purposes, either zn or name must be provided (the + * other can be NULL). Note, there usually won't be any hash + * conflicts, in which case we don't need the concatenated/normalized + * form of the name. But all callers have one of these on hand anyway, + * so might as well take advantage. A cleaner but slower interface + * would accept neither argument, and compute the normalized name as + * needed (using zap_name_alloc(zap_entry_read_name(zeh))). + */ +boolean_t +zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, + const char *name, zap_t *zap) +{ + uint64_t chunk; + struct zap_leaf_entry *le; + boolean_t allocdzn = B_FALSE; + + if (zap->zap_normflags == 0) + return (B_FALSE); + + for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk); + if (le->le_hash != zeh->zeh_hash) + continue; + if (le->le_cd == zeh->zeh_cd) + continue; + + if (zn == NULL) { + zn = zap_name_alloc(zap, name, MT_FIRST); + allocdzn = B_TRUE; + } + if (zap_leaf_array_match(zeh->zeh_leaf, zn, + le->le_name_chunk, le->le_name_length)) { + if (allocdzn) + zap_name_free(zn); + return (B_TRUE); + } + } + if (allocdzn) + zap_name_free(zn); + return (B_FALSE); +} + +/* * Routines for transferring entries between leafs. */ -static void +static uint16_t * zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); - uint16_t *ptr = LEAF_HASH_ENTPTR(l, le->le_hash); - le->le_next = *ptr; - *ptr = entry; + struct zap_leaf_entry *le2; + uint16_t *chunkp; + + /* + * keep the entry chain sorted by cd + * NB: this will not cause problems for unsorted leafs, though + * it is unnecessary there. + */ + for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash); + *chunkp != CHAIN_END; chunkp = &le2->le_next) { + le2 = ZAP_LEAF_ENTRY(l, *chunkp); + if (le2->le_cd > le->le_cd) + break; + } + + le->le_next = *chunkp; + *chunkp = entry; + return (chunkp); } static uint16_t @@ -644,7 +753,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ - zap_leaf_rehash_entry(nl, chunk); + (void) zap_leaf_rehash_entry(nl, chunk); nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); nle->le_value_chunk = @@ -660,7 +769,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) * Transfer the entries whose hash prefix ends in 1 to the new leaf. 
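The cd-sorted chains also pay off in zap_entry_create() above, where the old O(n^2) probe for a free collision differentiator becomes a single pass. A worked example with assumed state:

/*
 * Worked example (assumed): three entries share the same hash h and
 * sit in the chain in cd order with cds {0, 1, 3}.
 *
 *   cd = 0; entry with cd 0 matches h -> cd = 1
 *           entry with cd 1 matches h -> cd = 2
 *           entry with cd 3: le_cd > cd -> break
 *
 * Result: cd = 2, the lowest unused cd, found in one traversal
 * because zap_leaf_rehash_entry() keeps each chain sorted by cd.
 */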
*/ void -zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl) +zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { int i; int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len; @@ -674,6 +783,9 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl) /* break existing hash chains */ zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); + if (sort) + l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; + /* * Transfer entries whose hash bit 'bit' is set to nl; rehash * the remaining entries @@ -691,7 +803,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl) if (le->le_hash & (1ULL << bit)) zap_leaf_transfer_entry(l, i, nl); else - zap_leaf_rehash_entry(l, i); + (void) zap_leaf_rehash_entry(l, i); } } @@ -726,7 +838,7 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * - le->le_int_size); + le->le_int_size); n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_entries_using_n_chunks[n]++; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c index 9a882a5491e7..75b43a6f88da 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -34,9 +34,104 @@ #include <sys/zap_leaf.h> #include <sys/avl.h> +#ifdef _KERNEL +#include <sys/sunddi.h> +#endif + +static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx); + + +static uint64_t +zap_hash(zap_t *zap, const char *normname) +{ + const uint8_t *cp; + uint8_t c; + uint64_t crc = zap->zap_salt; + + /* NB: name must already be normalized, if necessary */ + + ASSERT(crc != 0); + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) { + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; + } + + /* + * Only use 28 bits, since we need 4 bits in the cookie for the + * collision differentiator. We MUST use the high bits, since + * those are the ones that we first pay attention to when + * chosing the bucket. + */ + crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); -static void mzap_upgrade(zap_t *zap, dmu_tx_t *tx); + return (crc); +} + +static int +zap_normalize(zap_t *zap, const char *name, char *namenorm) +{ + size_t inlen, outlen; + int err; + + inlen = strlen(name) + 1; + outlen = ZAP_MAXNAMELEN; + + err = 0; + (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, + zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST, + &err); + + return (err); +} + +boolean_t +zap_match(zap_name_t *zn, const char *matchname) +{ + if (zn->zn_matchtype == MT_FIRST) { + char norm[ZAP_MAXNAMELEN]; + if (zap_normalize(zn->zn_zap, matchname, norm) != 0) + return (B_FALSE); + + return (strcmp(zn->zn_name_norm, norm) == 0); + } else { + /* MT_BEST or MT_EXACT */ + return (strcmp(zn->zn_name_orij, matchname) == 0); + } +} + +void +zap_name_free(zap_name_t *zn) +{ + kmem_free(zn, sizeof (zap_name_t)); +} + +/* XXX combine this with zap_lockdir()? 
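Note that zap_hash() above now runs over the normalized name; that is what makes normalization-insensitive lookup workable, since all spellings that normalize alike land on the same hash chain and are told apart only by cd. An illustration, assuming zap_normflags requests case folding (the exact u8_textprep_str() behavior depends on the flags chosen at creation):

/*
 * Illustration (case folding assumed):
 *
 *   zap_normalize(zap, "README", buf) -> "readme" (say)
 *   zap_normalize(zap, "ReadMe", buf) -> "readme"
 *
 * Both spellings hash identically and share a chain; they may
 * coexist as distinct entries with different cds, which is exactly
 * the situation zap_entry_normalization_conflict() reports.
 */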
*/ +zap_name_t * +zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt) +{ + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + + zn->zn_zap = zap; + zn->zn_name_orij = name; + zn->zn_matchtype = mt; + if (zap->zap_normflags) { + if (zap_normalize(zap, name, zn->zn_normbuf) != 0) { + zap_name_free(zn); + return (NULL); + } + zn->zn_name_norm = zn->zn_normbuf; + } else { + if (mt != MT_EXACT) { + zap_name_free(zn); + return (NULL); + } + zn->zn_name_norm = zn->zn_name_orij; + } + + zn->zn_hash = zap_hash(zap, zn->zn_name_norm); + return (zn); +} static void mzap_byteswap(mzap_phys_t *buf, size_t size) @@ -44,6 +139,7 @@ mzap_byteswap(mzap_phys_t *buf, size_t size) int i, max; buf->mz_block_type = BSWAP_64(buf->mz_block_type); buf->mz_salt = BSWAP_64(buf->mz_salt); + buf->mz_normflags = BSWAP_64(buf->mz_normflags); max = (size / MZAP_ENT_LEN) - 1; for (i = 0; i < max; i++) { buf->mz_chunk[i].mze_value = @@ -93,7 +189,6 @@ mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ASSERT(mzep->mze_cd < ZAP_MAXCD); - ASSERT3U(zap_hash(zap, mzep->mze_name), ==, hash); mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; @@ -103,30 +198,34 @@ mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) } static mzap_ent_t * -mze_find(zap_t *zap, const char *name, uint64_t hash) +mze_find(zap_name_t *zn) { mzap_ent_t mze_tofind; mzap_ent_t *mze; avl_index_t idx; - avl_tree_t *avl = &zap->zap_m.zap_avl; + avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; - ASSERT(zap->zap_ismicro); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT3U(zap_hash(zap, name), ==, hash); + ASSERT(zn->zn_zap->zap_ismicro); + ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); - if (strlen(name) >= sizeof (mze_tofind.mze_phys.mze_name)) + if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name)) return (NULL); - mze_tofind.mze_hash = hash; + mze_tofind.mze_hash = zn->zn_hash; mze_tofind.mze_phys.mze_cd = 0; +again: mze = avl_find(avl, &mze_tofind, &idx); if (mze == NULL) mze = avl_nearest(avl, idx, AVL_AFTER); - for (; mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { - if (strcmp(name, mze->mze_phys.mze_name) == 0) + for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { + if (zap_match(zn, mze->mze_phys.mze_name)) return (mze); } + if (zn->zn_matchtype == MT_BEST) { + zn->zn_matchtype = MT_FIRST; + goto again; + } return (NULL); } @@ -193,7 +292,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap->zap_object = obj; zap->zap_dbuf = db; - if (((uint64_t *)db->db_data)[0] != ZBT_MICRO) { + if (*(uint64_t *)db->db_data != ZBT_MICRO) { mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, MUTEX_DEFAULT, 0); zap->zap_f.zap_block_shift = highbit(db->db_size) - 1; @@ -219,6 +318,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) if (zap->zap_ismicro) { zap->zap_salt = zap->zap_m.zap_phys->mz_salt; + zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; avl_create(&zap->zap_m.zap_avl, mze_compare, sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); @@ -227,13 +327,18 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; if (mze->mze_name[0]) { + zap_name_t *zn; + zap->zap_m.zap_num_entries++; - mze_insert(zap, i, - zap_hash(zap, mze->mze_name), mze); + zn = zap_name_alloc(zap, mze->mze_name, + MT_EXACT); + mze_insert(zap, i, 
zn->zn_hash, mze); + zap_name_free(zn); } } } else { zap->zap_salt = zap->zap_f.zap_phys->zap_salt; + zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags; ASSERT3U(sizeof (struct zap_leaf_header), ==, 2*ZAP_LEAF_CHUNKSIZE); @@ -260,7 +365,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, int fatreader, zap_t **zapp) + krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { zap_t *zap; dmu_buf_t *db; @@ -312,15 +417,14 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, ASSERT(!zap->zap_ismicro || zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); - if (zap->zap_ismicro && tx && + if (zap->zap_ismicro && tx && adding && zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; if (newsz > MZAP_MAX_BLKSZ) { dprintf("upgrading obj %llu: num_entries=%u\n", obj, zap->zap_m.zap_num_entries); - mzap_upgrade(zap, tx); *zapp = zap; - return (0); + return (mzap_upgrade(zapp, tx)); } err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); ASSERT3U(err, ==, 0); @@ -339,11 +443,12 @@ zap_unlockdir(zap_t *zap) dmu_buf_rele(zap->zap_dbuf, NULL); } -static void -mzap_upgrade(zap_t *zap, dmu_tx_t *tx) +static int +mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) { mzap_phys_t *mzp; int i, sz, nchunks, err; + zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -354,10 +459,14 @@ mzap_upgrade(zap_t *zap, dmu_tx_t *tx) err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx); - ASSERT(err == 0); + if (err) { + kmem_free(mzp, sz); + return (err); + } dprintf("upgrading obj=%llu with %u chunks\n", zap->zap_object, nchunks); + /* XXX destroy the avl later, so we can use the stored hash value */ mze_destroy(zap); fzap_upgrade(zap, tx); @@ -365,44 +474,25 @@ mzap_upgrade(zap_t *zap, dmu_tx_t *tx) for (i = 0; i < nchunks; i++) { int err; mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; + zap_name_t *zn; if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", mze->mze_name, mze->mze_value); - err = fzap_add_cd(zap, - mze->mze_name, 8, 1, &mze->mze_value, - mze->mze_cd, tx); - ASSERT3U(err, ==, 0); + zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); + err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx); + zap = zn->zn_zap; /* fzap_add_cd() may change zap */ + zap_name_free(zn); + if (err) + break; } kmem_free(mzp, sz); + *zapp = zap; + return (err); } -uint64_t -zap_hash(zap_t *zap, const char *name) -{ - const uint8_t *cp; - uint8_t c; - uint64_t crc = zap->zap_salt; - - ASSERT(crc != 0); - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; - - /* - * Only use 28 bits, since we need 4 bits in the cookie for the - * collision differentiator. We MUST use the high bits, since - * those are the onces that we first pay attention to when - * chosing the bucket. 
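The zap_lockdir() signature above gains an explicit adding flag alongside fatreader, and only callers that may insert pass adding = TRUE; that is now what authorizes growing a full microzap, or upgrading it to a fat ZAP, up front. The two calling patterns, taken from the wrappers later in this file:

/*
 * Calling patterns (from zap_add() and zap_lookup() below):
 *
 *   err = zap_lockdir(os, obj, tx, RW_WRITER, TRUE, TRUE, &zap);     insert path
 *   err = zap_lockdir(os, obj, NULL, RW_READER, TRUE, FALSE, &zap);  lookup path
 */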
- */ - crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); - - return (crc); -} - - static void -mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx) +mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx) { dmu_buf_t *db; mzap_phys_t *zp; @@ -421,7 +511,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx) zp = db->db_data; zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; - ASSERT(zp->mz_salt != 0); + zp->mz_normflags = normflags; dmu_buf_rele(db, FTAG); } @@ -429,12 +519,21 @@ int zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { + return (zap_create_claim_norm(os, obj, + 0, ot, bonustype, bonuslen, tx)); +} + +int +zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, + dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ int err; err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); if (err != 0) return (err); - mzap_create_impl(os, obj, tx); + mzap_create_impl(os, obj, normflags, tx); return (0); } @@ -442,9 +541,16 @@ uint64_t zap_create(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { + return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); +} + +uint64_t +zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); - mzap_create_impl(os, obj, tx); + mzap_create_impl(os, obj, normflags, tx); return (obj); } @@ -482,7 +588,7 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) zap_t *zap; int err; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); if (!zap->zap_ismicro) { @@ -495,36 +601,102 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) } /* - * Routines for maniplulating attributes. + * zn may be NULL; if not specified, it will be computed if needed. + * See also the comment above zap_entry_normalization_conflict(). + */ +static boolean_t +mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) +{ + mzap_ent_t *other; + int direction = AVL_BEFORE; + boolean_t allocdzn = B_FALSE; + + if (zap->zap_normflags == 0) + return (B_FALSE); + +again: + for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction); + other && other->mze_hash == mze->mze_hash; + other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { + + if (zn == NULL) { + zn = zap_name_alloc(zap, mze->mze_phys.mze_name, + MT_FIRST); + allocdzn = B_TRUE; + } + if (zap_match(zn, other->mze_phys.mze_name)) { + if (allocdzn) + zap_name_free(zn); + return (B_TRUE); + } + } + + if (direction == AVL_BEFORE) { + direction = AVL_AFTER; + goto again; + } + + if (allocdzn) + zap_name_free(zn); + return (B_FALSE); +} + +/* + * Routines for manipulating attributes. 
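The *_norm creation entry points above are how a normalization policy is attached to a ZAP at birth: normflags is written into the header (mz_normflags, and zap_normflags after an upgrade) and read back by mzap_open(). A usage sketch; the flag value is an assumption, any combination accepted by u8_textprep_str() would do:

/*
 * Sketch: create a ZAP whose names will be matched with case
 * folding.  U8_TEXTPREP_TOUPPER is assumed here for illustration.
 */
uint64_t obj;

obj = zap_create_norm(os, U8_TEXTPREP_TOUPPER,
    DMU_OT_DIRECTORY_CONTENTS, DMU_OT_NONE, 0, tx);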
*/ int zap_lookup(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { + return (zap_lookup_norm(os, zapobj, name, integer_size, + num_integers, buf, MT_EXACT, NULL, 0, NULL)); +} + +int +zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ zap_t *zap; int err; mzap_ent_t *mze; + zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); + zn = zap_name_alloc(zap, name, mt); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + if (!zap->zap_ismicro) { - err = fzap_lookup(zap, name, - integer_size, num_integers, buf); + err = fzap_lookup(zn, integer_size, num_integers, buf, + realname, rn_len, ncp); } else { - mze = mze_find(zap, name, zap_hash(zap, name)); + mze = mze_find(zn); if (mze == NULL) { err = ENOENT; } else { - if (num_integers < 1) + if (num_integers < 1) { err = EOVERFLOW; - else if (integer_size != 8) + } else if (integer_size != 8) { err = EINVAL; - else + } else { *(uint64_t *)buf = mze->mze_phys.mze_value; + (void) strlcpy(realname, + mze->mze_phys.mze_name, rn_len); + if (ncp) { + *ncp = mzap_normalization_conflict(zap, + zn, mze); + } + } } } + zap_name_free(zn); zap_unlockdir(zap); return (err); } @@ -536,14 +708,20 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, zap_t *zap; int err; mzap_ent_t *mze; + zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); + zn = zap_name_alloc(zap, name, MT_EXACT); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } if (!zap->zap_ismicro) { - err = fzap_length(zap, name, integer_size, num_integers); + err = fzap_length(zn, integer_size, num_integers); } else { - mze = mze_find(zap, name, zap_hash(zap, name)); + mze = mze_find(zn); if (mze == NULL) { err = ENOENT; } else { @@ -553,28 +731,31 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, *num_integers = 1; } } + zap_name_free(zn); zap_unlockdir(zap); return (err); } static void -mzap_addent(zap_t *zap, const char *name, uint64_t hash, uint64_t value) +mzap_addent(zap_name_t *zn, uint64_t value) { int i; + zap_t *zap = zn->zn_zap; int start = zap->zap_m.zap_alloc_next; uint32_t cd; - dprintf("obj=%llu %s=%llu\n", zap->zap_object, name, value); + dprintf("obj=%llu %s=%llu\n", zap->zap_object, + zn->zn_name_orij, value); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; - ASSERT(strcmp(name, mze->mze_name) != 0); + ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0); } #endif - cd = mze_find_unused_cd(zap, hash); + cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ ASSERT(cd != ZAP_MAXCD); @@ -584,13 +765,13 @@ again: if (mze->mze_name[0] == 0) { mze->mze_value = value; mze->mze_cd = cd; - (void) strcpy(mze->mze_name, name); + (void) strcpy(mze->mze_name, zn->zn_name_orij); zap->zap_m.zap_num_entries++; zap->zap_m.zap_alloc_next = i+1; if (zap->zap_m.zap_alloc_next == zap->zap_m.zap_num_chunks) zap->zap_m.zap_alloc_next = 0; - mze_insert(zap, i, hash, mze); + mze_insert(zap, i, zn->zn_hash, mze); return; } } @@ -610,29 +791,39 @@ zap_add(objset_t *os, 
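zap_lookup_norm() above is the consumer-facing entry point for all of this machinery; plain zap_lookup() is now just the MT_EXACT special case. A caller doing a normalization-insensitive lookup would use it roughly as follows (buffer size assumed):

/*
 * Sketch: normalization-insensitive lookup.  realname receives the
 * exact stored spelling; conflict reports whether another entry
 * shares the same normalized form.
 */
uint64_t value;
char realname[ZAP_MAXNAMELEN];
boolean_t conflict;
int err;

err = zap_lookup_norm(os, zapobj, "readme.txt", 8, 1, &value,
    MT_FIRST, realname, sizeof (realname), &conflict);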
uint64_t zapobj, const char *name, int err; mzap_ent_t *mze; const uint64_t *intval = val; - uint64_t hash; + zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); + zn = zap_name_alloc(zap, name, MT_EXACT); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } if (!zap->zap_ismicro) { - err = fzap_add(zap, name, integer_size, num_integers, val, tx); + err = fzap_add(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", zapobj, integer_size, num_integers, name); - mzap_upgrade(zap, tx); - err = fzap_add(zap, name, integer_size, num_integers, val, tx); + err = mzap_upgrade(&zn->zn_zap, tx); + if (err == 0) + err = fzap_add(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ } else { - hash = zap_hash(zap, name); - mze = mze_find(zap, name, hash); + mze = mze_find(zn); if (mze != NULL) { err = EEXIST; } else { - mzap_addent(zap, name, hash, *intval); + mzap_addent(zn, *intval); } } - zap_unlockdir(zap); + ASSERT(zap == zn->zn_zap); + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap); return (err); } @@ -643,68 +834,87 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, zap_t *zap; mzap_ent_t *mze; const uint64_t *intval = val; - uint64_t hash; + zap_name_t *zn; int err; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + zn = zap_name_alloc(zap, name, MT_EXACT); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } if (!zap->zap_ismicro) { - err = fzap_update(zap, name, - integer_size, num_integers, val, tx); + err = fzap_update(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", zapobj, integer_size, num_integers, name); - mzap_upgrade(zap, tx); - err = fzap_update(zap, name, - integer_size, num_integers, val, tx); + err = mzap_upgrade(&zn->zn_zap, tx); + if (err == 0) + err = fzap_update(zn, integer_size, num_integers, + val, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ } else { - hash = zap_hash(zap, name); - mze = mze_find(zap, name, hash); + mze = mze_find(zn); if (mze != NULL) { mze->mze_phys.mze_value = *intval; zap->zap_m.zap_phys->mz_chunk [mze->mze_chunkid].mze_value = *intval; } else { - mzap_addent(zap, name, hash, *intval); + mzap_addent(zn, *intval); } } - zap_unlockdir(zap); + ASSERT(zap == zn->zn_zap); + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ + zap_unlockdir(zap); return (err); } int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) { + return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx)); +} + +int +zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, + matchtype_t mt, dmu_tx_t *tx) +{ zap_t *zap; int err; mzap_ent_t *mze; + zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); if (err) return (err); + zn = zap_name_alloc(zap, name, mt); + if (zn == 
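zap_add() above also shows the contract that the zn->zn_zap reassignments enforce: any fzap_* call that can expand a leaf or upgrade the microzap may drop and retake the directory lock, so the caller must refresh its zap pointer afterwards and tolerate NULL. In outline:

/*
 * Contract illustrated by zap_add()/zap_update() above:
 *
 *   err = fzap_add(zn, ...);
 *   zap = zn->zn_zap;        refresh: relocking may have changed it
 *   ...
 *   if (zap != NULL)         NULL if reacquiring the lock failed
 *           zap_unlockdir(zap);
 */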
NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } if (!zap->zap_ismicro) { - err = fzap_remove(zap, name, tx); + err = fzap_remove(zn, tx); } else { - mze = mze_find(zap, name, zap_hash(zap, name)); + mze = mze_find(zn); if (mze == NULL) { - dprintf("fail: %s\n", name); err = ENOENT; } else { - dprintf("success: %s\n", name); zap->zap_m.zap_num_entries--; bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], sizeof (mzap_ent_phys_t)); mze_remove(zap, mze); } } + zap_name_free(zn); zap_unlockdir(zap); return (err); } - /* * Routines for iterating over the attributes. */ @@ -781,7 +991,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) if (zc->zc_zap == NULL) { err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, - RW_READER, TRUE, &zc->zc_zap); + RW_READER, TRUE, FALSE, &zc->zc_zap); if (err) return (err); } else { @@ -796,14 +1006,17 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) mze_tofind.mze_phys.mze_cd = zc->zc_cd; mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); - ASSERT(mze == NULL || 0 == bcmp(&mze->mze_phys, - &zc->zc_zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], - sizeof (mze->mze_phys))); if (mze == NULL) { mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, idx, AVL_AFTER); } if (mze) { + ASSERT(0 == bcmp(&mze->mze_phys, + &zc->zc_zap->zap_m.zap_phys->mz_chunk + [mze->mze_chunkid], sizeof (mze->mze_phys))); + + za->za_normalization_conflict = + mzap_normalization_conflict(zc->zc_zap, NULL, mze); za->za_integer_length = 8; za->za_num_integers = 1; za->za_first_integer = mze->mze_phys.mze_value; @@ -839,7 +1052,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) int err; zap_t *zap; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c index 33c2909316e3..ec7d29e64f70 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -43,15 +41,19 @@ #include <sys/fs/zfs.h> #include <sys/policy.h> #include <sys/zfs_znode.h> +#include <sys/zfs_fuid.h> #include <sys/zfs_acl.h> #include <sys/zfs_dir.h> #include <sys/zfs_vfsops.h> #include <sys/dmu.h> +#include <sys/dnode.h> #include <sys/zap.h> #include <acl/acl_common.h> #define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE #define DENY ACE_ACCESS_DENIED_ACE_TYPE +#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE +#define MIN_ACE_TYPE ALLOW #define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) #define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ @@ -60,8 +62,15 @@ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) #define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) -#define WRITE_MASK (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS| \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|ACE_WRITE_OWNER) +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) + +#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ + ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ + ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) + +#define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\ + ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) #define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) @@ -70,59 +79,656 @@ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) #define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ - ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE) + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) + +#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ + ZFS_ACL_PROTECTED) + +#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ + ZFS_ACL_OBJ_ACE) + +static uint16_t +zfs_ace_v0_get_type(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_v0_get_flags(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_v0_get_mask(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_v0_get_who(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_fuid); +} + +static void +zfs_ace_v0_set_type(void *acep, uint16_t type) +{ + ((zfs_oldace_t *)acep)->z_type = type; +} -#define SECURE_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) +static void +zfs_ace_v0_set_flags(void *acep, uint16_t flags) +{ + ((zfs_oldace_t *)acep)->z_flags = flags; +} -#define OGE_PAD 6 /* traditional owner/group/everyone ACES */ +static void +zfs_ace_v0_set_mask(void *acep, uint32_t mask) +{ + ((zfs_oldace_t *)acep)->z_access_mask = mask; +} -static int zfs_ace_can_use(znode_t *zp, ace_t *); +static void +zfs_ace_v0_set_who(void *acep, uint64_t who) +{ + ((zfs_oldace_t *)acep)->z_fuid = who; +} + +/*ARGSUSED*/ +static size_t +zfs_ace_v0_size(void *acep) +{ + return (sizeof (zfs_oldace_t)); +} + +static size_t +zfs_ace_v0_abstract_size(void) +{ + return (sizeof (zfs_oldace_t)); +} + +static int +zfs_ace_v0_mask_off(void) +{ + return (offsetof(zfs_oldace_t, z_access_mask)); +} + +/*ARGSUSED*/ +static int +zfs_ace_v0_data(void *acep, void **datap) +{ + *datap = NULL; + return (0); 
+} + +static acl_ops_t zfs_acl_v0_ops = { + zfs_ace_v0_get_mask, + zfs_ace_v0_set_mask, + zfs_ace_v0_get_flags, + zfs_ace_v0_set_flags, + zfs_ace_v0_get_type, + zfs_ace_v0_set_type, + zfs_ace_v0_get_who, + zfs_ace_v0_set_who, + zfs_ace_v0_size, + zfs_ace_v0_abstract_size, + zfs_ace_v0_mask_off, + zfs_ace_v0_data +}; + +static uint16_t +zfs_ace_fuid_get_type(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_fuid_get_flags(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_fuid_get_mask(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_fuid_get_who(void *args) +{ + uint16_t entry_type; + zfs_ace_t *acep = args; + + entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (-1); + return (((zfs_ace_t *)acep)->z_fuid); +} + +static void +zfs_ace_fuid_set_type(void *acep, uint16_t type) +{ + ((zfs_ace_hdr_t *)acep)->z_type = type; +} + +static void +zfs_ace_fuid_set_flags(void *acep, uint16_t flags) +{ + ((zfs_ace_hdr_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_fuid_set_mask(void *acep, uint32_t mask) +{ + ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_fuid_set_who(void *arg, uint64_t who) +{ + zfs_ace_t *acep = arg; + + uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return; + acep->z_fuid = who; +} + +static size_t +zfs_ace_fuid_size(void *acep) +{ + zfs_ace_hdr_t *zacep = acep; + uint16_t entry_type; + + switch (zacep->z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + return (sizeof (zfs_object_ace_t)); + case ALLOW: + case DENY: + entry_type = + (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); + if (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (sizeof (zfs_ace_hdr_t)); + /*FALLTHROUGH*/ + default: + return (sizeof (zfs_ace_t)); + } +} + +static size_t +zfs_ace_fuid_abstract_size(void) +{ + return (sizeof (zfs_ace_hdr_t)); +} + +static int +zfs_ace_fuid_mask_off(void) +{ + return (offsetof(zfs_ace_hdr_t, z_access_mask)); +} + +static int +zfs_ace_fuid_data(void *acep, void **datap) +{ + zfs_ace_t *zacep = acep; + zfs_object_ace_t *zobjp; + + switch (zacep->z_hdr.z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjp = acep; + *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); + return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); + default: + *datap = NULL; + return (0); + } +} + +static acl_ops_t zfs_acl_fuid_ops = { + zfs_ace_fuid_get_mask, + zfs_ace_fuid_set_mask, + zfs_ace_fuid_get_flags, + zfs_ace_fuid_set_flags, + zfs_ace_fuid_get_type, + zfs_ace_fuid_set_type, + zfs_ace_fuid_get_who, + zfs_ace_fuid_set_who, + zfs_ace_fuid_size, + zfs_ace_fuid_abstract_size, + zfs_ace_fuid_mask_off, + zfs_ace_fuid_data +}; + +static int +zfs_acl_version(int version) +{ + if (version < ZPL_VERSION_FUID) + return (ZFS_ACL_VERSION_INITIAL); + else + return (ZFS_ACL_VERSION_FUID); +} + +static int +zfs_acl_version_zp(znode_t *zp) +{ + return (zfs_acl_version(zp->z_zfsvfs->z_version)); +} static zfs_acl_t * -zfs_acl_alloc(int slots) 
+zfs_acl_alloc(int vers) { zfs_acl_t *aclp; aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); - if (slots != 0) { - aclp->z_acl = kmem_alloc(ZFS_ACL_SIZE(slots), KM_SLEEP); - aclp->z_acl_count = 0; - aclp->z_state = ACL_DATA_ALLOCED; - } else { - aclp->z_state = 0; - } - aclp->z_slots = slots; + list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), + offsetof(zfs_acl_node_t, z_next)); + aclp->z_version = vers; + if (vers == ZFS_ACL_VERSION_FUID) + aclp->z_ops = zfs_acl_fuid_ops; + else + aclp->z_ops = zfs_acl_v0_ops; return (aclp); } +static zfs_acl_node_t * +zfs_acl_node_alloc(size_t bytes) +{ + zfs_acl_node_t *aclnode; + + aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); + if (bytes) { + aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); + aclnode->z_allocdata = aclnode->z_acldata; + aclnode->z_allocsize = bytes; + aclnode->z_size = bytes; + } + + return (aclnode); +} + +static void +zfs_acl_node_free(zfs_acl_node_t *aclnode) +{ + if (aclnode->z_allocsize) + kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); + kmem_free(aclnode, sizeof (zfs_acl_node_t)); +} + +static void +zfs_acl_release_nodes(zfs_acl_t *aclp) +{ + zfs_acl_node_t *aclnode; + + while (aclnode = list_head(&aclp->z_acl)) { + list_remove(&aclp->z_acl, aclnode); + zfs_acl_node_free(aclnode); + } + aclp->z_acl_count = 0; + aclp->z_acl_bytes = 0; +} + void zfs_acl_free(zfs_acl_t *aclp) { - if (aclp->z_state == ACL_DATA_ALLOCED) { - kmem_free(aclp->z_acl, ZFS_ACL_SIZE(aclp->z_slots)); - } + zfs_acl_release_nodes(aclp); + list_destroy(&aclp->z_acl); kmem_free(aclp, sizeof (zfs_acl_t)); } -static uint32_t -zfs_v4_to_unix(uint32_t access_mask) +static boolean_t +zfs_acl_valid_ace_type(uint_t type, uint_t flags) { - uint32_t new_mask = 0; + uint16_t entry_type; + + switch (type) { + case ALLOW: + case DENY: + case ACE_SYSTEM_AUDIT_ACE_TYPE: + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry_type = flags & ACE_TYPE_FLAGS; + return (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE || entry_type == 0 || + entry_type == ACE_IDENTIFIER_GROUP); + default: + if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) + return (B_TRUE); + } + return (B_FALSE); +} +static boolean_t +zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) +{ /* - * This is used for mapping v4 permissions into permissions - * that can be passed to secpolicy_vnode_access() + * first check type of entry */ - if (access_mask & (ACE_READ_DATA | ACE_LIST_DIRECTORY | - ACE_READ_ATTRIBUTES | ACE_READ_ACL)) - new_mask |= S_IROTH; - if (access_mask & (ACE_WRITE_DATA | ACE_APPEND_DATA | - ACE_WRITE_ATTRIBUTES | ACE_ADD_FILE | ACE_WRITE_NAMED_ATTRS)) - new_mask |= S_IWOTH; - if (access_mask & (ACE_EXECUTE | ACE_READ_NAMED_ATTRS)) - new_mask |= S_IXOTH; - return (new_mask); + if (!zfs_acl_valid_ace_type(type, iflags)) + return (B_FALSE); + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (aclp->z_version < ZFS_ACL_VERSION_FUID) + return (B_FALSE); + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + } + + /* + * next check inheritance level flags + */ + + if (obj_type == VDIR && + (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void * 
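zfs_acl_alloc() above now selects an ops vector by ACL version, which is what lets the rest of this file handle both on-disk layouts through one interface. A sketch of layout-independent access through that vector (acep is assumed to point at a valid ACE of the matching version):

/*
 * Sketch: version-independent ACE access via the ops vector chosen
 * in zfs_acl_alloc().  The same calls work for the
 * ZFS_ACL_VERSION_INITIAL (zfs_oldace_t) and ZFS_ACL_VERSION_FUID
 * layouts.
 */
zfs_acl_t *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
uint32_t mask = aclp->z_ops.ace_mask_get(acep);
uint16_t type = aclp->z_ops.ace_type_get(acep);
size_t size = aclp->z_ops.ace_size(acep);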
+zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, + uint32_t *access_mask, uint16_t *iflags, uint16_t *type) +{ + zfs_acl_node_t *aclnode; + + if (start == NULL) { + aclnode = list_head(&aclp->z_acl); + if (aclnode == NULL) + return (NULL); + + aclp->z_next_ace = aclnode->z_acldata; + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + } + + aclnode = aclp->z_curr_node; + + if (aclnode == NULL) + return (NULL); + + if (aclnode->z_ace_idx >= aclnode->z_ace_count) { + aclnode = list_next(&aclp->z_acl, aclnode); + if (aclnode == NULL) + return (NULL); + else { + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + aclp->z_next_ace = aclnode->z_acldata; + } + } + + if (aclnode->z_ace_idx < aclnode->z_ace_count) { + void *acep = aclp->z_next_ace; + size_t ace_size; + + /* + * Make sure we don't overstep our bounds + */ + ace_size = aclp->z_ops.ace_size(acep); + + if (((caddr_t)acep + ace_size) > + ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { + return (NULL); + } + + *iflags = aclp->z_ops.ace_flags_get(acep); + *type = aclp->z_ops.ace_type_get(acep); + *access_mask = aclp->z_ops.ace_mask_get(acep); + *who = aclp->z_ops.ace_who_get(acep); + aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; + aclnode->z_ace_idx++; + return ((void *)acep); + } + return (NULL); +} + +/*ARGSUSED*/ +static uint64_t +zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, + uint16_t *flags, uint16_t *type, uint32_t *mask) +{ + zfs_acl_t *aclp = datap; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + uint64_t who; + + acep = zfs_acl_next_ace(aclp, acep, &who, mask, + flags, type); + return ((uint64_t)(uintptr_t)acep); +} + +static zfs_acl_node_t * +zfs_acl_curr_node(zfs_acl_t *aclp) +{ + ASSERT(aclp->z_curr_node); + return (aclp->z_curr_node); +} + +/* + * Copy ACE to internal ZFS format. + * While processing the ACL each ACE will be validated for correctness. + * ACE FUIDs will be created later. 
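zfs_acl_next_ace() above is the cursor the rest of this rewrite is built on: pass NULL to start, and it walks every node in the ACL list, bounds-checking as it goes. A self-contained sketch of a walker, counting ALLOW entries purely as an example:

/* Sketch: cursor-style iteration over a zfs_acl_t. */
static int
count_allow_aces(zfs_acl_t *aclp)
{
	void *acep = NULL;
	uint64_t who;
	uint32_t access_mask;
	uint16_t iflags, type;
	int n = 0;

	while ((acep = zfs_acl_next_ace(aclp, acep, &who,
	    &access_mask, &iflags, &type)) != NULL) {
		if (type == ALLOW)
			n++;
	}
	return (n);
}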
+ */ +int +zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap, + zfs_ace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + uint16_t entry_type; + zfs_ace_t *aceptr = z_acl; + ace_t *acep = datap; + zfs_object_ace_t *zobjacep; + ace_object_t *aceobjp; + + for (i = 0; i != aclcnt; i++) { + aceptr->z_hdr.z_access_mask = acep->a_access_mask; + aceptr->z_hdr.z_flags = acep->a_flags; + aceptr->z_hdr.z_type = acep->a_type; + entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; + if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE) { + if (!aclp->z_has_fuids) + aclp->z_has_fuids = IS_EPHEMERAL(acep->a_who); + aceptr->z_fuid = (uint64_t)acep->a_who; + } + + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, + aceptr->z_hdr.z_flags) != B_TRUE) + return (EINVAL); + + switch (acep->a_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjacep = (zfs_object_ace_t *)aceptr; + aceobjp = (ace_object_t *)acep; + + bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, + sizeof (aceobjp->a_obj_type)); + bcopy(aceobjp->a_inherit_obj_type, + zobjacep->z_inherit_type, + sizeof (aceobjp->a_inherit_obj_type)); + acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); + break; + default: + acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); + } + + aceptr = (zfs_ace_t *)((caddr_t)aceptr + + aclp->z_ops.ace_size(aceptr)); + } + + *size = (caddr_t)aceptr - (caddr_t)z_acl; + + return (0); +} + +/* + * Copy ZFS ACEs to fixed size ace_t layout + */ +static void +zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, + void *datap, int filter) +{ + uint64_t who; + uint32_t access_mask; + uint16_t iflags, type; + zfs_ace_hdr_t *zacep = NULL; + ace_t *acep = datap; + ace_object_t *objacep; + zfs_object_ace_t *zobjacep; + size_t ace_size; + uint16_t entry_type; + + while (zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type)) { + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (filter) { + continue; + } + zobjacep = (zfs_object_ace_t *)zacep; + objacep = (ace_object_t *)acep; + bcopy(zobjacep->z_object_type, + objacep->a_obj_type, + sizeof (zobjacep->z_object_type)); + bcopy(zobjacep->z_inherit_type, + objacep->a_inherit_obj_type, + sizeof (zobjacep->z_inherit_type)); + ace_size = sizeof (ace_object_t); + break; + default: + ace_size = sizeof (ace_t); + break; + } + + entry_type = (iflags & ACE_TYPE_FLAGS); + if ((entry_type != ACE_OWNER && + entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE)) { + acep->a_who = zfs_fuid_map_id(zfsvfs, who, + cr, (entry_type & ACE_IDENTIFIER_GROUP) ? 
+ ZFS_ACE_GROUP : ZFS_ACE_USER); + } else { + acep->a_who = (uid_t)(int64_t)who; + } + acep->a_access_mask = access_mask; + acep->a_flags = iflags; + acep->a_type = type; + acep = (ace_t *)((caddr_t)acep + ace_size); + } +} + +static int +zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, + zfs_oldace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + zfs_oldace_t *aceptr = z_acl; + + for (i = 0; i != aclcnt; i++, aceptr++) { + aceptr->z_access_mask = acep[i].a_access_mask; + aceptr->z_type = acep[i].a_type; + aceptr->z_flags = acep[i].a_flags; + aceptr->z_fuid = acep[i].a_who; + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_type, + aceptr->z_flags) != B_TRUE) + return (EINVAL); + } + *size = (caddr_t)aceptr - (caddr_t)z_acl; + return (0); +} + +/* + * convert old ACL format to new + */ +void +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp) +{ + zfs_oldace_t *oldaclp; + int i; + uint16_t type, iflags; + uint32_t access_mask; + uint64_t who; + void *cookie = NULL; + zfs_acl_node_t *newaclnode; + + ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); + /* + * First create the ACE in a contiguous piece of memory + * for zfs_copy_ace_2_fuid(). + * + * We only convert an ACL once, so this won't happen + * everytime. + */ + oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, + KM_SLEEP); + i = 0; + while (cookie = zfs_acl_next_ace(aclp, cookie, &who, + &access_mask, &iflags, &type)) { + oldaclp[i].z_flags = iflags; + oldaclp[i].z_type = type; + oldaclp[i].z_fuid = who; + oldaclp[i++].z_access_mask = access_mask; + } + + newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * + sizeof (zfs_object_ace_t)); + aclp->z_ops = zfs_acl_fuid_ops; + VERIFY(zfs_copy_ace_2_fuid(ZTOV(zp)->v_type, aclp, oldaclp, + newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size) == 0); + newaclnode->z_ace_count = aclp->z_acl_count; + aclp->z_version = ZFS_ACL_VERSION; + kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); + + /* + * Release all previous ACL nodes + */ + + zfs_acl_release_nodes(aclp); + + list_insert_head(&aclp->z_acl, newaclnode); + + aclp->z_acl_bytes = newaclnode->z_size; + aclp->z_acl_count = newaclnode->z_ace_count; + } /* @@ -133,157 +739,213 @@ zfs_unix_to_v4(uint32_t access_mask) { uint32_t new_mask = 0; - if (access_mask & 01) - new_mask |= (ACE_EXECUTE); - if (access_mask & 02) { - new_mask |= (ACE_WRITE_DATA); - } if (access_mask & 04) { + if (access_mask & S_IXOTH) + new_mask |= ACE_EXECUTE; + if (access_mask & S_IWOTH) + new_mask |= ACE_WRITE_DATA; + if (access_mask & S_IROTH) new_mask |= ACE_READ_DATA; - } return (new_mask); } static void -zfs_set_ace(ace_t *zacep, uint32_t access_mask, int access_type, - uid_t uid, int entry_type) +zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, + uint16_t access_type, uint64_t fuid, uint16_t entry_type) { - zacep->a_access_mask = access_mask; - zacep->a_type = access_type; - zacep->a_who = uid; - zacep->a_flags = entry_type; + uint16_t type = entry_type & ACE_TYPE_FLAGS; + + aclp->z_ops.ace_mask_set(acep, access_mask); + aclp->z_ops.ace_type_set(acep, access_type); + aclp->z_ops.ace_flags_set(acep, entry_type); + if ((type != ACE_OWNER && type != OWNING_GROUP && + type != ACE_EVERYONE)) + aclp->z_ops.ace_who_set(acep, fuid); } +/* + * Determine mode of file based on ACL. 
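The zfs_unix_to_v4() cleanup above only swaps the bare octal constants for the S_I*OTH macros; behavior is unchanged. For reference, read directly off the function body:

/*
 * Worked example, derived from zfs_unix_to_v4() above:
 *
 *   zfs_unix_to_v4(S_IROTH)                 == ACE_READ_DATA
 *   zfs_unix_to_v4(S_IWOTH)                 == ACE_WRITE_DATA
 *   zfs_unix_to_v4(S_IXOTH)                 == ACE_EXECUTE
 *   zfs_unix_to_v4(S_IROTH|S_IWOTH|S_IXOTH) == ACE_READ_DATA |
 *                                              ACE_WRITE_DATA |
 *                                              ACE_EXECUTE
 */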
+ * Also, create FUIDs for any User/Group ACEs + */ static uint64_t -zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) -{ - int i; - int entry_type; - mode_t mode = (zp->z_phys->zp_mode & - (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); - mode_t seen = 0; - ace_t *acep; - - for (i = 0, acep = aclp->z_acl; - i != aclp->z_acl_count; i++, acep++) { - entry_type = (acep->a_flags & ACE_TYPE_FLAGS); +zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, + zfs_fuid_info_t **fuidp, dmu_tx_t *tx) +{ + int entry_type; + mode_t mode; + mode_t seen = 0; + zfs_ace_hdr_t *acep = NULL; + uint64_t who; + uint16_t iflags, type; + uint32_t access_mask; + + mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + + while (acep = zfs_acl_next_ace(aclp, acep, &who, + &access_mask, &iflags, &type)) { + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + /* + * Skip over owner@, group@ or everyone@ inherit only ACEs + */ + if ((iflags & ACE_INHERIT_ONLY_ACE) && + (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || + entry_type == OWNING_GROUP)) + continue; + if (entry_type == ACE_OWNER) { - if ((acep->a_access_mask & ACE_READ_DATA) && + if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRUSR))) { seen |= S_IRUSR; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IRUSR; } } - if ((acep->a_access_mask & ACE_WRITE_DATA) && + if ((access_mask & ACE_WRITE_DATA) && (!(seen & S_IWUSR))) { seen |= S_IWUSR; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IWUSR; } } - if ((acep->a_access_mask & ACE_EXECUTE) && + if ((access_mask & ACE_EXECUTE) && (!(seen & S_IXUSR))) { seen |= S_IXUSR; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IXUSR; } } } else if (entry_type == OWNING_GROUP) { - if ((acep->a_access_mask & ACE_READ_DATA) && + if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRGRP))) { seen |= S_IRGRP; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IRGRP; } } - if ((acep->a_access_mask & ACE_WRITE_DATA) && + if ((access_mask & ACE_WRITE_DATA) && (!(seen & S_IWGRP))) { seen |= S_IWGRP; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IWGRP; } } - if ((acep->a_access_mask & ACE_EXECUTE) && + if ((access_mask & ACE_EXECUTE) && (!(seen & S_IXGRP))) { seen |= S_IXGRP; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IXGRP; } } } else if (entry_type == ACE_EVERYONE) { - if ((acep->a_access_mask & ACE_READ_DATA)) { + if ((access_mask & ACE_READ_DATA)) { if (!(seen & S_IRUSR)) { seen |= S_IRUSR; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IRUSR; } } if (!(seen & S_IRGRP)) { seen |= S_IRGRP; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IRGRP; } } if (!(seen & S_IROTH)) { seen |= S_IROTH; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IROTH; } } } - if ((acep->a_access_mask & ACE_WRITE_DATA)) { + if ((access_mask & ACE_WRITE_DATA)) { if (!(seen & S_IWUSR)) { seen |= S_IWUSR; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IWUSR; } } if (!(seen & S_IWGRP)) { seen |= S_IWGRP; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IWGRP; } } if (!(seen & S_IWOTH)) { seen |= S_IWOTH; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IWOTH; } } } - if ((acep->a_access_mask & ACE_EXECUTE)) { + if ((access_mask & ACE_EXECUTE)) { if (!(seen & S_IXUSR)) { seen |= S_IXUSR; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IXUSR; } } if (!(seen & S_IXGRP)) { seen |= 
S_IXGRP; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IXGRP; } } if (!(seen & S_IXOTH)) { seen |= S_IXOTH; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IXOTH; } } } } + /* + * Now handle FUID create for user/group ACEs + */ + if (entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP) { + aclp->z_ops.ace_who_set(acep, + zfs_fuid_create(zp->z_zfsvfs, who, cr, + (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP, + tx, fuidp)); + } } return (mode); } static zfs_acl_t * -zfs_acl_node_read_internal(znode_t *zp) +zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify) { zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; - aclp = zfs_acl_alloc(0); - aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; - aclp->z_acl = &zp->z_phys->zp_acl.z_ace_data[0]; + aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); + + /* + * Version 0 to 1 znode_acl_phys has the size/count fields swapped. + * Version 0 didn't have a size field, only a count. + */ + if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { + aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_size; + aclp->z_acl_bytes = ZFS_ACL_SIZE(aclp->z_acl_count); + } else { + aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; + aclp->z_acl_bytes = zp->z_phys->zp_acl.z_acl_size; + } + + aclnode = zfs_acl_node_alloc(will_modify ? aclp->z_acl_bytes : 0); + aclnode->z_ace_count = aclp->z_acl_count; + if (will_modify) { + bcopy(zp->z_phys->zp_acl.z_ace_data, aclnode->z_acldata, + aclp->z_acl_bytes); + } else { + aclnode->z_size = aclp->z_acl_bytes; + aclnode->z_acldata = &zp->z_phys->zp_acl.z_ace_data[0]; + } + + list_insert_head(&aclp->z_acl, aclnode); return (aclp); } @@ -292,212 +954,176 @@ zfs_acl_node_read_internal(znode_t *zp) * Read an external acl object. */ static int -zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp) +zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) { uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj; zfs_acl_t *aclp; + size_t aclsize; + size_t acl_count; + zfs_acl_node_t *aclnode; int error; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { - *aclpp = zfs_acl_node_read_internal(zp); + *aclpp = zfs_acl_node_read_internal(zp, will_modify); return (0); } - aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count); + aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); + if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { + zfs_acl_phys_v0_t *zacl0 = + (zfs_acl_phys_v0_t *)&zp->z_phys->zp_acl; + aclsize = ZFS_ACL_SIZE(zacl0->z_acl_count); + acl_count = zacl0->z_acl_count; + } else { + aclsize = zp->z_phys->zp_acl.z_acl_size; + acl_count = zp->z_phys->zp_acl.z_acl_count; + if (aclsize == 0) + aclsize = acl_count * sizeof (zfs_ace_t); + } + aclnode = zfs_acl_node_alloc(aclsize); + list_insert_head(&aclp->z_acl, aclnode); error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, - ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl); + aclsize, aclnode->z_acldata); + aclnode->z_ace_count = acl_count; + aclp->z_acl_count = acl_count; + aclp->z_acl_bytes = aclsize; + if (error != 0) { zfs_acl_free(aclp); + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = EIO; return (error); } - aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; - *aclpp = aclp; return (0); } -static boolean_t -zfs_acl_valid(znode_t *zp, ace_t *uace, int aclcnt, int *inherit) -{ - ace_t *acep; - int i; - - *inherit = 0; - - if (aclcnt > MAX_ACL_ENTRIES || aclcnt <= 0) { - return (B_FALSE); - } - - for (i = 0, acep = uace; i != 
aclcnt; i++, acep++) { - - /* - * first check type of entry - */ - - switch (acep->a_flags & ACE_TYPE_FLAGS) { - case ACE_OWNER: - acep->a_who = -1; - break; - case (ACE_IDENTIFIER_GROUP | ACE_GROUP): - case ACE_IDENTIFIER_GROUP: - if (acep->a_flags & ACE_GROUP) { - acep->a_who = -1; - } - break; - case ACE_EVERYONE: - acep->a_who = -1; - break; - } - - /* - * next check inheritance level flags - */ - - if (acep->a_type != ALLOW && acep->a_type != DENY) - return (B_FALSE); - - /* - * Only directories should have inheritance flags. - */ - if (ZTOV(zp)->v_type != VDIR && (acep->a_flags & - (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE| - ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) { - return (B_FALSE); - } - - if (acep->a_flags & - (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)) - *inherit = 1; - - if (acep->a_flags & - (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { - if ((acep->a_flags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE)) == 0) { - return (B_FALSE); - } - } - } - - return (B_TRUE); -} /* - * common code for setting acl's. + * common code for setting ACLs. * * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's * already checked the acl and knows whether to inherit. */ int -zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, dmu_tx_t *tx, int *ihp) +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, + zfs_fuid_info_t **fuidp, dmu_tx_t *tx) { - int inherit = 0; int error; znode_phys_t *zphys = zp->z_phys; - zfs_znode_acl_t *zacl = &zphys->zp_acl; - uint32_t acl_phys_size = ZFS_ACL_SIZE(aclp->z_acl_count); + zfs_acl_phys_t *zacl = &zphys->zp_acl; zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t aoid = zphys->zp_acl.z_acl_extern_obj; + uint64_t off = 0; + dmu_object_type_t otype; + zfs_acl_node_t *aclnode; ASSERT(MUTEX_HELD(&zp->z_lock)); ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - if (ihp) - inherit = *ihp; /* already determined by caller */ - else if (!zfs_acl_valid(zp, aclp->z_acl, - aclp->z_acl_count, &inherit)) { - return (EINVAL); - } - dmu_buf_will_dirty(zp->z_dbuf, tx); + zphys->zp_mode = zfs_mode_fuid_compute(zp, aclp, cr, fuidp, tx); + /* - * Will ACL fit internally? + * Decide which object type to use. If we are forced to + * use the old ACL format then transform the ACL into + * zfs_oldace_t layout. */ - if (aclp->z_acl_count > ACE_SLOT_CNT) { + if (!zfsvfs->z_use_fuids) { + otype = DMU_OT_OLDACL; + } else { + if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && + (zfsvfs->z_version >= ZPL_VERSION_FUID)) + zfs_acl_xform(zp, aclp); + ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); + otype = DMU_OT_ACL; + } + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && aclp->z_version != zacl->z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, + zp->z_phys->zp_acl.z_acl_extern_obj, tx); + if (error) + return (error); + aoid = 0; + } if (aoid == 0) { aoid = dmu_object_alloc(zfsvfs->z_os, - DMU_OT_ACL, acl_phys_size, DMU_OT_NONE, 0, tx); + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ?
DN_MAX_BONUSLEN : 0, tx); } else { (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, - acl_phys_size, 0, tx); + aclp->z_acl_bytes, 0, tx); } zphys->zp_acl.z_acl_extern_obj = aoid; - zphys->zp_acl.z_acl_count = aclp->z_acl_count; - dmu_write(zfsvfs->z_os, aoid, 0, - acl_phys_size, aclp->z_acl, tx); + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } } else { + void *start = zacl->z_ace_data; /* * Migrating back embedded? */ if (zphys->zp_acl.z_acl_extern_obj) { error = dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx); + zp->z_phys->zp_acl.z_acl_extern_obj, tx); if (error) return (error); zphys->zp_acl.z_acl_extern_obj = 0; } - bcopy(aclp->z_acl, zacl->z_ace_data, - aclp->z_acl_count * sizeof (ace_t)); - zacl->z_acl_count = aclp->z_acl_count; + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } } - zp->z_phys->zp_flags &= ~(ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE); - if (inherit) { - zp->z_phys->zp_flags |= ZFS_INHERIT_ACE; - } else if (ace_trivial(zacl->z_ace_data, zacl->z_acl_count) == 0) { - zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. + */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + zphys->zp_acl.z_acl_size = aclp->z_acl_count; + zphys->zp_acl.z_acl_count = aclp->z_acl_bytes; + } else { + zphys->zp_acl.z_acl_size = aclp->z_acl_bytes; + zphys->zp_acl.z_acl_count = aclp->z_acl_count; } - zphys->zp_mode = zfs_mode_compute(zp, aclp); - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + zphys->zp_acl.z_acl_version = aclp->z_version; - return (0); -} + /* + * Replace ACL wide bits, but first clear them. + */ + zp->z_phys->zp_flags &= ~ZFS_ACL_WIDE_FLAGS; -/* - * Create space for slots_needed ACEs to be append - * to aclp. - */ -static void -zfs_acl_append(zfs_acl_t *aclp, int slots_needed) -{ - ace_t *newacep; - ace_t *oldaclp; - int slot_cnt; - int slots_left = aclp->z_slots - aclp->z_acl_count; + zp->z_phys->zp_flags |= aclp->z_hints; - if (aclp->z_state == ACL_DATA_ALLOCED) - ASSERT(aclp->z_slots >= aclp->z_acl_count); - if (slots_left < slots_needed || aclp->z_state != ACL_DATA_ALLOCED) { - slot_cnt = aclp->z_slots + 1 + (slots_needed - slots_left); - newacep = kmem_alloc(ZFS_ACL_SIZE(slot_cnt), KM_SLEEP); - bcopy(aclp->z_acl, newacep, - ZFS_ACL_SIZE(aclp->z_acl_count)); - oldaclp = aclp->z_acl; - if (aclp->z_state == ACL_DATA_ALLOCED) - kmem_free(oldaclp, ZFS_ACL_SIZE(aclp->z_slots)); - aclp->z_acl = newacep; - aclp->z_slots = slot_cnt; - aclp->z_state = ACL_DATA_ALLOCED; - } -} + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; -/* - * Remove "slot" ACE from aclp - */ -static void -zfs_ace_remove(zfs_acl_t *aclp, int slot) -{ - if (aclp->z_acl_count > 1) { - (void) memmove(&aclp->z_acl[slot], - &aclp->z_acl[slot +1], sizeof (ace_t) * - (--aclp->z_acl_count - slot)); - } else - aclp->z_acl_count--; + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + return (0); } /* @@ -506,16 +1132,24 @@ zfs_ace_remove(zfs_acl_t *aclp, int slot) * This applies the "groupmask" value for aclmode property. 
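The groupmask fixup below is simpler than it looks: for each right the original allow ACE carried, the prepended deny ACE gets that bit exactly when the new mode withholds it. A user-space sketch under simplified masks (demo_ names are illustrative, not part of zfs_acl.c):

	#include <stdint.h>

	#define DEMO_READ	0x01	/* stands in for ACE_READ_DATA */
	#define DEMO_WRITE	0x02	/* ACE_WRITE_DATA/ACE_APPEND_DATA */
	#define DEMO_EXEC	0x04	/* ACE_EXECUTE */

	static uint32_t
	demo_groupmask_deny(uint32_t origmask, int may_r, int may_w, int may_x)
	{
		uint32_t deny = 0;

		/* deny only what the allow ACE granted but the mode withholds */
		if ((origmask & DEMO_READ) && !may_r)
			deny |= DEMO_READ;
		if ((origmask & DEMO_WRITE) && !may_w)
			deny |= DEMO_WRITE;
		if ((origmask & DEMO_EXEC) && !may_x)
			deny |= DEMO_EXEC;
		return (deny);
	}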
*/ static void -zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner) +zfs_acl_prepend_fixup(zfs_acl_t *aclp, void *acep, void *origacep, + mode_t mode, uint64_t owner) { - int rmask, wmask, xmask; int user_ace; + uint16_t aceflags; + uint32_t origmask, acepmask; + uint64_t fuid; - user_ace = (!(acep->a_flags & + aceflags = aclp->z_ops.ace_flags_get(acep); + fuid = aclp->z_ops.ace_who_get(acep); + origmask = aclp->z_ops.ace_mask_get(origacep); + acepmask = aclp->z_ops.ace_mask_get(acep); + + user_ace = (!(aceflags & (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP))); - if (user_ace && (acep->a_who == owner)) { + if (user_ace && (fuid == owner)) { rmask = S_IRUSR; wmask = S_IWUSR; xmask = S_IXUSR; @@ -525,33 +1159,38 @@ zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner) xmask = S_IXGRP; } - if (origacep->a_access_mask & ACE_READ_DATA) { - if (mode & rmask) - acep->a_access_mask &= ~ACE_READ_DATA; - else - acep->a_access_mask |= ACE_READ_DATA; + if (origmask & ACE_READ_DATA) { + if (mode & rmask) { + acepmask &= ~ACE_READ_DATA; + } else { + acepmask |= ACE_READ_DATA; + } } - if (origacep->a_access_mask & ACE_WRITE_DATA) { - if (mode & wmask) - acep->a_access_mask &= ~ACE_WRITE_DATA; - else - acep->a_access_mask |= ACE_WRITE_DATA; + if (origmask & ACE_WRITE_DATA) { + if (mode & wmask) { + acepmask &= ~ACE_WRITE_DATA; + } else { + acepmask |= ACE_WRITE_DATA; + } } - if (origacep->a_access_mask & ACE_APPEND_DATA) { - if (mode & wmask) - acep->a_access_mask &= ~ACE_APPEND_DATA; - else - acep->a_access_mask |= ACE_APPEND_DATA; + if (origmask & ACE_APPEND_DATA) { + if (mode & wmask) { + acepmask &= ~ACE_APPEND_DATA; + } else { + acepmask |= ACE_APPEND_DATA; + } } - if (origacep->a_access_mask & ACE_EXECUTE) { - if (mode & xmask) - acep->a_access_mask &= ~ACE_EXECUTE; - else - acep->a_access_mask |= ACE_EXECUTE; + if (origmask & ACE_EXECUTE) { + if (mode & xmask) { + acepmask &= ~ACE_EXECUTE; + } else { + acepmask |= ACE_EXECUTE; + } } + aclp->z_ops.ace_mask_set(acep, acepmask); } /* @@ -560,116 +1199,156 @@ zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner) static void zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode) { - int cnt; - ace_t *acep; + zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl); + void *acep; + int maskoff = aclp->z_ops.ace_mask_off(); + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + + ASSERT(aclnode != NULL); - cnt = aclp->z_acl_count -1; - acep = aclp->z_acl; + acep = (void *)((caddr_t)aclnode->z_acldata + + aclnode->z_size - (abstract_size * 6)); /* * Fixup final ACEs to match the mode */ - ASSERT(cnt >= 5); - adjust_ace_pair(&acep[cnt - 1], mode); /* everyone@ */ - adjust_ace_pair(&acep[cnt - 3], (mode & 0070) >> 3); /* group@ */ - adjust_ace_pair(&acep[cnt - 5], (mode & 0700) >> 6); /* owner@ */ + adjust_ace_pair_common(acep, maskoff, abstract_size, + (mode & 0700) >> 6); /* owner@ */ + + acep = (caddr_t)acep + (abstract_size * 2); + + adjust_ace_pair_common(acep, maskoff, abstract_size, + (mode & 0070) >> 3); /* group@ */ + + acep = (caddr_t)acep + (abstract_size * 2); + adjust_ace_pair_common(acep, maskoff, + abstract_size, mode); /* everyone@ */ } static int -zfs_acl_ace_match(ace_t *acep, int allow_deny, int type, int mask) +zfs_acl_ace_match(zfs_acl_t *aclp, void *acep, int allow_deny, + int entry_type, int accessmask) { - return (acep->a_access_mask == mask && acep->a_type == allow_deny && - ((acep->a_flags & ACE_TYPE_FLAGS) == type)); + uint32_t mask = 
aclp->z_ops.ace_mask_get(acep); + uint16_t type = aclp->z_ops.ace_type_get(acep); + uint16_t flags = aclp->z_ops.ace_flags_get(acep); + + return (mask == accessmask && type == allow_deny && + ((flags & ACE_TYPE_FLAGS) == entry_type)); } /* * Can prepended ACE be reused? */ static int -zfs_reuse_deny(ace_t *acep, int i) +zfs_reuse_deny(zfs_acl_t *aclp, void *acep, void *prevacep) { int okay_masks; + uint16_t prevtype; + uint16_t prevflags; + uint16_t flags; + uint32_t mask, prevmask; - if (i < 1) + if (prevacep == NULL) return (B_FALSE); - if (acep[i-1].a_type != DENY) + prevtype = aclp->z_ops.ace_type_get(prevacep); + prevflags = aclp->z_ops.ace_flags_get(prevacep); + flags = aclp->z_ops.ace_flags_get(acep); + mask = aclp->z_ops.ace_mask_get(acep); + prevmask = aclp->z_ops.ace_mask_get(prevacep); + + if (prevtype != DENY) return (B_FALSE); - if (acep[i-1].a_flags != (acep[i].a_flags & ACE_IDENTIFIER_GROUP)) + if (prevflags != (flags & ACE_IDENTIFIER_GROUP)) return (B_FALSE); - okay_masks = (acep[i].a_access_mask & OKAY_MASK_BITS); + okay_masks = (mask & OKAY_MASK_BITS); - if (acep[i-1].a_access_mask & ~okay_masks) + if (prevmask & ~okay_masks) return (B_FALSE); return (B_TRUE); } + /* - * Create space to prepend an ACE + * Insert new ACL node into chain of zfs_acl_node_t's + * + * This will result in two possible results. + * 1. If the ACL is currently just a single zfs_acl_node and + * we are prepending the entry then current acl node will have + * a new node inserted above it. + * + * 2. If we are inserting in the middle of current acl node then + * the current node will be split in two and new node will be inserted + * in between the two split nodes. */ -static void -zfs_acl_prepend(zfs_acl_t *aclp, int i) -{ - ace_t *oldaclp = NULL; - ace_t *to, *from; - int slots_left = aclp->z_slots - aclp->z_acl_count; - int oldslots; - int need_free = 0; - - if (aclp->z_state == ACL_DATA_ALLOCED) - ASSERT(aclp->z_slots >= aclp->z_acl_count); - - if (slots_left == 0 || aclp->z_state != ACL_DATA_ALLOCED) { - - to = kmem_alloc(ZFS_ACL_SIZE(aclp->z_acl_count + - OGE_PAD), KM_SLEEP); - if (aclp->z_state == ACL_DATA_ALLOCED) - need_free++; - from = aclp->z_acl; - oldaclp = aclp->z_acl; - (void) memmove(to, from, - sizeof (ace_t) * aclp->z_acl_count); - aclp->z_state = ACL_DATA_ALLOCED; - } else { - from = aclp->z_acl; - to = aclp->z_acl; +static zfs_acl_node_t * +zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep) +{ + zfs_acl_node_t *newnode; + zfs_acl_node_t *trailernode = NULL; + zfs_acl_node_t *currnode = zfs_acl_curr_node(aclp); + int curr_idx = aclp->z_curr_node->z_ace_idx; + int trailer_count; + size_t oldsize; + + newnode = zfs_acl_node_alloc(aclp->z_ops.ace_size(acep)); + newnode->z_ace_count = 1; + + oldsize = currnode->z_size; + + if (curr_idx != 1) { + trailernode = zfs_acl_node_alloc(0); + trailernode->z_acldata = acep; + + trailer_count = currnode->z_ace_count - curr_idx + 1; + currnode->z_ace_count = curr_idx - 1; + currnode->z_size = (caddr_t)acep - (caddr_t)currnode->z_acldata; + trailernode->z_size = oldsize - currnode->z_size; + trailernode->z_ace_count = trailer_count; } - - (void) memmove(&to[i + 1], &from[i], - sizeof (ace_t) * (aclp->z_acl_count - i)); - - if (oldaclp) { - aclp->z_acl = to; - oldslots = aclp->z_slots; - aclp->z_slots = aclp->z_acl_count + OGE_PAD; - if (need_free) - kmem_free(oldaclp, ZFS_ACL_SIZE(oldslots)); + aclp->z_acl_count += 1; + aclp->z_acl_bytes += aclp->z_ops.ace_size(acep); + + if (curr_idx == 1) + list_insert_before(&aclp->z_acl, currnode, newnode); + else 
+ list_insert_after(&aclp->z_acl, currnode, newnode); + if (trailernode) { + list_insert_after(&aclp->z_acl, newnode, trailernode); + aclp->z_curr_node = trailernode; + trailernode->z_ace_idx = 1; } + return (newnode); } /* * Prepend deny ACE */ -static void -zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, int i, +static void * +zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep, mode_t mode) { - ace_t *acep; - - zfs_acl_prepend(aclp, i); - - acep = aclp->z_acl; - zfs_set_ace(&acep[i], 0, DENY, acep[i + 1].a_who, - (acep[i + 1].a_flags & ACE_TYPE_FLAGS)); - zfs_acl_prepend_fixup(&acep[i], &acep[i+1], mode, zp->z_phys->zp_uid); - aclp->z_acl_count++; + zfs_acl_node_t *aclnode; + void *newacep; + uint64_t fuid; + uint16_t flags; + + aclnode = zfs_acl_ace_insert(aclp, acep); + newacep = aclnode->z_acldata; + fuid = aclp->z_ops.ace_who_get(acep); + flags = aclp->z_ops.ace_flags_get(acep); + zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS)); + zfs_acl_prepend_fixup(aclp, newacep, acep, mode, zp->z_phys->zp_uid); + + return (newacep); } /* @@ -677,41 +1356,74 @@ zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, int i, * and original ACE with inheritance flags stripped off. */ static void -zfs_acl_split_ace(zfs_acl_t *aclp, int i) +zfs_acl_split_ace(zfs_acl_t *aclp, zfs_ace_hdr_t *acep) { - ace_t *acep = aclp->z_acl; - - zfs_acl_prepend(aclp, i); - acep = aclp->z_acl; - acep[i] = acep[i + 1]; - acep[i].a_flags |= ACE_INHERIT_ONLY_ACE; - acep[i + 1].a_flags &= ~ALL_INHERIT; - aclp->z_acl_count++; + zfs_acl_node_t *aclnode; + zfs_acl_node_t *currnode; + void *newacep; + uint16_t type, flags; + uint32_t mask; + uint64_t fuid; + + type = aclp->z_ops.ace_type_get(acep); + flags = aclp->z_ops.ace_flags_get(acep); + mask = aclp->z_ops.ace_mask_get(acep); + fuid = aclp->z_ops.ace_who_get(acep); + + aclnode = zfs_acl_ace_insert(aclp, acep); + newacep = aclnode->z_acldata; + + aclp->z_ops.ace_type_set(newacep, type); + aclp->z_ops.ace_flags_set(newacep, flags | ACE_INHERIT_ONLY_ACE); + aclp->z_ops.ace_mask_set(newacep, mask); + aclp->z_ops.ace_type_set(newacep, type); + aclp->z_ops.ace_who_set(newacep, fuid); + aclp->z_next_ace = acep; + flags &= ~ALL_INHERIT; + aclp->z_ops.ace_flags_set(acep, flags); + currnode = zfs_acl_curr_node(aclp); + ASSERT(currnode->z_ace_idx >= 1); + currnode->z_ace_idx -= 1; } /* * Are ACES started at index i, the canonical six ACES? 
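For reference, the six abstract entries matched by the canonical-six check below are, in order (a sketch assuming the ALLOW/DENY, entry-type, and mask constants already used throughout this file):

	struct demo_abstract_ace {
		uint16_t	type;	/* ALLOW or DENY */
		uint16_t	flags;	/* entry type */
		uint32_t	mask;
	};

	static const struct demo_abstract_ace demo_canonical_six[6] = {
		{ DENY,  ACE_OWNER,	0 },
		{ ALLOW, ACE_OWNER,	OWNER_ALLOW_MASK },
		{ DENY,  OWNING_GROUP,	0 },
		{ ALLOW, OWNING_GROUP,	0 },
		{ DENY,  ACE_EVERYONE,	EVERYONE_DENY_MASK },
		{ ALLOW, ACE_EVERYONE,	EVERYONE_ALLOW_MASK },
	};

zfs_have_canonical_six() walks the last six abstract-size slots at the tail of the final ACL node and compares each against this table.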
*/ static int -zfs_have_canonical_six(zfs_acl_t *aclp, int i) +zfs_have_canonical_six(zfs_acl_t *aclp) { - ace_t *acep = aclp->z_acl; + void *acep; + zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl); + int i = 0; + size_t abstract_size = aclp->z_ops.ace_abstract_size(); - if ((zfs_acl_ace_match(&acep[i], + ASSERT(aclnode != NULL); + + if (aclnode->z_ace_count < 6) + return (0); + + acep = (void *)((caddr_t)aclnode->z_acldata + + aclnode->z_size - (aclp->z_ops.ace_abstract_size() * 6)); + + if ((zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY, ACE_OWNER, 0) && - zfs_acl_ace_match(&acep[i + 1], ALLOW, ACE_OWNER, - OWNER_ALLOW_MASK) && zfs_acl_ace_match(&acep[i + 2], - DENY, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 3], - ALLOW, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 4], + zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), + ALLOW, ACE_OWNER, OWNER_ALLOW_MASK) && + zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY, + OWNING_GROUP, 0) && zfs_acl_ace_match(aclp, (caddr_t)acep + + (abstract_size * i++), + ALLOW, OWNING_GROUP, 0) && + zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) && - zfs_acl_ace_match(&acep[i + 5], ALLOW, ACE_EVERYONE, - EVERYONE_ALLOW_MASK))) { + zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), + ALLOW, ACE_EVERYONE, EVERYONE_ALLOW_MASK))) { return (1); } else { return (0); } } + /* * Apply step 1g, to group entries * @@ -721,73 +1433,89 @@ zfs_have_canonical_six(zfs_acl_t *aclp, int i) * group has. */ static void -zfs_fixup_group_entries(ace_t *acep, mode_t mode) +zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep, + mode_t mode) { + uint32_t prevmask = aclp->z_ops.ace_mask_get(prevacep); + uint32_t mask = aclp->z_ops.ace_mask_get(acep); + uint16_t prevflags = aclp->z_ops.ace_flags_get(prevacep); mode_t extramode = (mode >> 3) & 07; mode_t ownermode = (mode >> 6); - if (acep[0].a_flags & ACE_IDENTIFIER_GROUP) { + if (prevflags & ACE_IDENTIFIER_GROUP) { extramode &= ~ownermode; if (extramode) { - if (extramode & 04) { - acep[0].a_access_mask &= ~ACE_READ_DATA; - acep[1].a_access_mask &= ~ACE_READ_DATA; + if (extramode & S_IROTH) { + prevmask &= ~ACE_READ_DATA; + mask &= ~ACE_READ_DATA; } - if (extramode & 02) { - acep[0].a_access_mask &= - ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - acep[1].a_access_mask &= - ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + if (extramode & S_IWOTH) { + prevmask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); } - if (extramode & 01) { - acep[0].a_access_mask &= ~ACE_EXECUTE; - acep[1].a_access_mask &= ~ACE_EXECUTE; + if (extramode & S_IXOTH) { + prevmask &= ~ACE_EXECUTE; + mask &= ~ACE_EXECUTE; } } } + aclp->z_ops.ace_mask_set(acep, mask); + aclp->z_ops.ace_mask_set(prevacep, prevmask); } /* * Apply the chmod algorithm as described * in PSARC/2002/240 */ -static int -zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp, - dmu_tx_t *tx) +static void +zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ace_t *acep; + void *acep = NULL, *prevacep = NULL; + uint64_t who; int i; - int error; int entry_type; int reuse_deny; int need_canonical_six = 1; - int inherit = 0; - int iflags; + uint16_t iflags, type; + uint32_t access_mask; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); ASSERT(MUTEX_HELD(&zp->z_lock)); - i = 0; - while (i < aclp->z_acl_count) { - acep = aclp->z_acl; - entry_type = (acep[i].a_flags & ACE_TYPE_FLAGS); - iflags = 
(acep[i].a_flags & ALL_INHERIT); + aclp->z_hints = (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); - if ((acep[i].a_type != ALLOW && acep[i].a_type != DENY) || - (iflags & ACE_INHERIT_ONLY_ACE)) { - i++; - if (iflags) - inherit = 1; - continue; - } + /* + * If discard then just discard all ACL nodes which + * represent the ACEs. + * + * New owner@/group@/everyone@ ACEs will be added + * later. + */ + if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) + zfs_acl_release_nodes(aclp); + while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type)) { - if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) { - zfs_ace_remove(aclp, i); - continue; + entry_type = (iflags & ACE_TYPE_FLAGS); + iflags = (iflags & ALL_INHERIT); + + if ((type != ALLOW && type != DENY) || + (iflags & ACE_INHERIT_ONLY_ACE)) { + if (iflags) + aclp->z_hints |= ZFS_INHERIT_ACE; + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + break; + } + goto nextace; } /* @@ -796,20 +1524,19 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp, if ((iflags & (ACE_FILE_INHERIT_ACE| ACE_DIRECTORY_INHERIT_ACE)) && (!(iflags & ACE_INHERIT_ONLY_ACE))) { - zfs_acl_split_ace(aclp, i); - i++; - inherit = 1; - continue; + zfs_acl_split_ace(aclp, acep); + aclp->z_hints |= ZFS_INHERIT_ACE; + goto nextace; } if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || (entry_type == OWNING_GROUP)) { - acep[i].a_access_mask &= ~OGE_CLEAR; - i++; - continue; - + access_mask &= ~OGE_CLEAR; + aclp->z_ops.ace_mask_set(acep, access_mask); + goto nextace; } else { - if (acep[i].a_type == ALLOW) { + reuse_deny = B_TRUE; + if (type == ALLOW) { /* * Check preceding ACE if any, to see @@ -819,25 +1546,27 @@ */ if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) { - reuse_deny = zfs_reuse_deny(acep, i); + reuse_deny = zfs_reuse_deny(aclp, acep, + prevacep); - if (reuse_deny == B_FALSE) { - zfs_acl_prepend_deny(zp, aclp, - i, mode); - i++; - acep = aclp->z_acl; + if (!reuse_deny) { + prevacep = + zfs_acl_prepend_deny(zp, + aclp, acep, mode); } else { zfs_acl_prepend_fixup( - &acep[i - 1], - &acep[i], mode, + aclp, prevacep, + acep, mode, zp->z_phys->zp_uid); } - zfs_fixup_group_entries(&acep[i - 1], - mode); + zfs_fixup_group_entries(aclp, acep, + prevacep, mode); + } } - i++; } +nextace: + prevacep = acep; } /* @@ -845,51 +1574,56 @@ */ if (aclp->z_acl_count >= 6) { - i = aclp->z_acl_count - 6; - - if (zfs_have_canonical_six(aclp, i)) { + if (zfs_have_canonical_six(aclp)) { need_canonical_six = 0; } } if (need_canonical_six) { - - zfs_acl_append(aclp, 6); - i = aclp->z_acl_count; - acep = aclp->z_acl; - zfs_set_ace(&acep[i++], 0, DENY, -1, ACE_OWNER); - zfs_set_ace(&acep[i++], OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER); - zfs_set_ace(&acep[i++], 0, DENY, -1, OWNING_GROUP); - zfs_set_ace(&acep[i++], 0, ALLOW, -1, OWNING_GROUP); - zfs_set_ace(&acep[i++], EVERYONE_DENY_MASK, - DENY, -1, ACE_EVERYONE); - zfs_set_ace(&acep[i++], EVERYONE_ALLOW_MASK, - ALLOW, -1, ACE_EVERYONE); + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + void *zacep; + zfs_acl_node_t *aclnode = + zfs_acl_node_alloc(abstract_size * 6); + + aclnode->z_size = abstract_size * 6; + aclnode->z_ace_count = 6; + aclp->z_acl_bytes += aclnode->z_size; + list_insert_tail(&aclp->z_acl, aclnode); + + zacep =
aclnode->z_acldata; + + i = 0; + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + 0, DENY, -1, ACE_OWNER); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0, + DENY, -1, OWNING_GROUP); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0, + ALLOW, -1, OWNING_GROUP); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + EVERYONE_DENY_MASK, DENY, -1, ACE_EVERYONE); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + EVERYONE_ALLOW_MASK, ALLOW, -1, ACE_EVERYONE); aclp->z_acl_count += 6; } zfs_acl_fixup_canonical_six(aclp, mode); - - zp->z_phys->zp_mode = mode; - error = zfs_aclset_common(zp, aclp, tx, &inherit); - return (error); } - int -zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx) +zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) { - zfs_acl_t *aclp = NULL; int error; - ASSERT(MUTEX_HELD(&zp->z_lock)); + mutex_enter(&zp->z_lock); mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, &aclp); + *aclp = NULL; + error = zfs_acl_node_read(zp, aclp, B_TRUE); if (error == 0) - error = zfs_acl_chmod(zp, mode, aclp, tx); + zfs_acl_chmod(zp, mode, *aclp); mutex_exit(&zp->z_acl_lock); - if (aclp) - zfs_acl_free(aclp); + mutex_exit(&zp->z_lock); return (error); } @@ -897,104 +1631,159 @@ zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx) * strip off write_owner and write_acl */ static void -zfs_securemode_update(zfsvfs_t *zfsvfs, ace_t *acep) +zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep) { - if ((zfsvfs->z_acl_inherit == ZFS_ACL_SECURE) && - (acep->a_type == ALLOW)) - acep->a_access_mask &= ~SECURE_CLEAR; + uint32_t mask = aclp->z_ops.ace_mask_get(acep); + + if ((zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) && + (aclp->z_ops.ace_type_get(acep) == ALLOW)) { + mask &= ~RESTRICTED_CLEAR; + aclp->z_ops.ace_mask_set(acep, mask); + } +} + +/* + * Should ACE be inherited? 
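As a reading aid, the inheritance predicate that follows can be restated stand-alone with its truth table; demo_ace_can_use() is hypothetical and assumes iflags has already been reduced to the inheritance bits, as the real function does with (acep_flags & 0xf):

	/*
	 *   object     inheritance flags            inherited?
	 *   directory  DIRECTORY_INHERIT            yes
	 *   directory  FILE_INHERIT only            yes (kept inherit-only)
	 *   directory  FILE_INHERIT|NO_PROPAGATE    no
	 *   file       FILE_INHERIT                 yes
	 *   file       DIRECTORY_INHERIT only       no
	 */
	static int
	demo_ace_can_use(int is_dir, uint16_t iflags)
	{
		if (is_dir && (iflags & ACE_DIRECTORY_INHERIT_ACE))
			return (1);
		if (iflags & ACE_FILE_INHERIT_ACE)
			return (!(is_dir &&
			    (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
		return (0);
	}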
+ */ +static int +zfs_ace_can_use(znode_t *zp, uint16_t acep_flags) +{ + int vtype = ZTOV(zp)->v_type; + int iflags = (acep_flags & 0xf); + + if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!((vtype == VDIR) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); } /* * inherit inheritable ACEs from parent */ static zfs_acl_t * -zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp) +zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, boolean_t *need_chmod) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ace_t *pacep; - ace_t *acep; - int ace_cnt = 0; - int pace_cnt; - int i, j; + void *pacep; + void *acep, *acep2; + zfs_acl_node_t *aclnode, *aclnode2; zfs_acl_t *aclp = NULL; - - i = j = 0; - pace_cnt = paclp->z_acl_count; - pacep = paclp->z_acl; + uint64_t who; + uint32_t access_mask; + uint16_t iflags, newflags, type; + size_t ace_size; + void *data1, *data2; + size_t data1sz, data2sz; + enum vtype vntype = ZTOV(zp)->v_type; + + *need_chmod = B_TRUE; + pacep = NULL; + aclp = zfs_acl_alloc(paclp->z_version); if (zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) { - for (i = 0; i != pace_cnt; i++) { + while (pacep = zfs_acl_next_ace(paclp, pacep, &who, + &access_mask, &iflags, &type)) { - if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW && - pacep[i].a_type == ALLOW) + /* + * don't inherit bogus ACEs + */ + if (!zfs_acl_valid_ace_type(type, iflags)) continue; - if (zfs_ace_can_use(zp, &pacep[i])) { - ace_cnt++; - if (!(pacep[i].a_flags & - ACE_NO_PROPAGATE_INHERIT_ACE)) - ace_cnt++; - } - } - } - - aclp = zfs_acl_alloc(ace_cnt + OGE_PAD); - if (ace_cnt && zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) { - acep = aclp->z_acl; - pacep = paclp->z_acl; - for (i = 0; i != pace_cnt; i++) { - if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW && - pacep[i].a_type == ALLOW) + type == ALLOW) continue; - if (zfs_ace_can_use(zp, &pacep[i])) { + ace_size = aclp->z_ops.ace_size(pacep); - /* - * Now create entry for inherited ace - */ - - acep[j] = pacep[i]; + if (!zfs_ace_can_use(zp, iflags)) + continue; - /* - * When AUDIT/ALARM a_types are supported - * they should be inherited here. - */ + /* + * If owner@, group@, or everyone@ inheritable + * then zfs_acl_chmod() isn't needed. 
+ */ + if (zfsvfs->z_acl_inherit == + ZFS_ACL_PASSTHROUGH && + ((iflags & (ACE_OWNER|ACE_EVERYONE)) || + ((iflags & OWNING_GROUP) == + OWNING_GROUP)) && (vntype == VREG || + (vntype == VDIR && + (iflags & ACE_DIRECTORY_INHERIT_ACE)))) + *need_chmod = B_FALSE; + + aclnode = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode); + acep = aclnode->z_acldata; + zfs_set_ace(aclp, acep, access_mask, type, + who, iflags|ACE_INHERITED_ACE); - if ((pacep[i].a_flags & - ACE_NO_PROPAGATE_INHERIT_ACE) || - (ZTOV(zp)->v_type != VDIR)) { - acep[j].a_flags &= ~ALL_INHERIT; - zfs_securemode_update(zfsvfs, &acep[j]); - j++; - continue; - } + /* + * Copy special opaque data if any + */ + if ((data1sz = paclp->z_ops.ace_data(pacep, + &data1)) != 0) { + VERIFY((data2sz = aclp->z_ops.ace_data(acep, + &data2)) == data1sz); + bcopy(data1, data2, data2sz); + } + aclp->z_acl_count++; + aclnode->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + newflags = aclp->z_ops.ace_flags_get(acep); + + if (vntype == VDIR) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if ((iflags & ACE_NO_PROPAGATE_INHERIT_ACE) || + (vntype != VDIR)) { + newflags &= ~ALL_INHERIT; + aclp->z_ops.ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + zfs_restricted_update(zfsvfs, aclp, acep); + continue; + } - ASSERT(ZTOV(zp)->v_type == VDIR); + ASSERT(vntype == VDIR); + + newflags = aclp->z_ops.ace_flags_get(acep); + if ((iflags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) != + ACE_FILE_INHERIT_ACE) { + aclnode2 = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode2); + acep2 = aclnode2->z_acldata; + zfs_set_ace(aclp, acep2, + access_mask, type, who, + iflags|ACE_INHERITED_ACE); + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops.ace_flags_set(acep, newflags); + newflags &= ~ALL_INHERIT; + aclp->z_ops.ace_flags_set(acep2, + newflags|ACE_INHERITED_ACE); /* - * If we are inheriting an ACE targeted for - * only files, then make sure inherit_only - * is on for future propagation. 
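The duplication above is the crux of propagating inheritance: an inherited ACE that must keep propagating is split into an entry that acts on the new directory and a carrier that exists only for future children. A sketch of the flag arithmetic, assuming the ALL_INHERIT, ACE_INHERIT_ONLY_ACE and ACE_INHERITED_ACE definitions this file already relies on (the demo_ name is hypothetical):

	static void
	demo_split_inherited(uint16_t iflags, uint16_t *carrier,
	    uint16_t *effective)
	{
		/* keeps propagating, but has no effect on this directory */
		*carrier = iflags | ACE_INHERIT_ONLY_ACE | ACE_INHERITED_ACE;
		/* takes effect here, but stops propagating */
		*effective = (iflags & ~ALL_INHERIT) | ACE_INHERITED_ACE;
	}

In the code above the effective copy is additionally passed through zfs_restricted_update(), so write_owner/write_acl can be stripped under the restricted inherit mode.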
+ * Copy special opaque data if any */ - if ((pacep[i].a_flags & (ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE)) != - ACE_FILE_INHERIT_ACE) { - j++; - acep[j] = acep[j-1]; - acep[j-1].a_flags |= - ACE_INHERIT_ONLY_ACE; - acep[j].a_flags &= ~ALL_INHERIT; - } else { - acep[j].a_flags |= ACE_INHERIT_ONLY_ACE; + if ((data1sz = aclp->z_ops.ace_data(acep, + &data1)) != 0) { + VERIFY((data2sz = + aclp->z_ops.ace_data(acep2, + &data2)) == data1sz); + bcopy(data1, data2, data1sz); } - zfs_securemode_update(zfsvfs, &acep[j]); - j++; + aclp->z_acl_count++; + aclnode2->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + zfs_restricted_update(zfsvfs, aclp, acep2); + } else { + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops.ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); } } } - aclp->z_acl_count = j; - ASSERT(aclp->z_slots >= aclp->z_acl_count); - return (aclp); } @@ -1004,14 +1793,20 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp) */ void zfs_perm_init(znode_t *zp, znode_t *parent, int flag, - vattr_t *vap, dmu_tx_t *tx, cred_t *cr) + vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + zfs_acl_t *setaclp, zfs_fuid_info_t **fuidp) { - uint64_t mode; - uid_t uid; - gid_t gid; + uint64_t mode, fuid, fgid; int error; - int pull_down; - zfs_acl_t *aclp, *paclp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_acl_t *aclp = NULL; + zfs_acl_t *paclp; + xvattr_t *xvap = (xvattr_t *)vap; + gid_t gid; + boolean_t need_chmod = B_TRUE; + + if (setaclp) + aclp = setaclp; mode = MAKEIMODE(vap->va_type, vap->va_mode); @@ -1020,22 +1815,38 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag, */ if ((flag & (IS_ROOT_NODE | IS_REPLAY)) || ((flag & IS_XATTR) && (vap->va_type == VDIR))) { - uid = vap->va_uid; + fuid = zfs_fuid_create(zfsvfs, vap->va_uid, cr, + ZFS_OWNER, tx, fuidp); + fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, + ZFS_GROUP, tx, fuidp); gid = vap->va_gid; } else { - uid = crgetuid(cr); - if ((vap->va_mask & AT_GID) && - ((vap->va_gid == parent->z_phys->zp_gid) || - groupmember(vap->va_gid, cr) || - secpolicy_vnode_create_gid(cr) == 0)) + fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, tx, cr, fuidp); + fgid = 0; + if (vap->va_mask & AT_GID) { + fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, + ZFS_GROUP, tx, fuidp); gid = vap->va_gid; - else + if (fgid != parent->z_phys->zp_gid && + !groupmember(vap->va_gid, cr) && + secpolicy_vnode_create_gid(cr) != 0) + fgid = 0; + } + if (fgid == 0) { + if (parent->z_phys->zp_mode & S_ISGID) { + fgid = parent->z_phys->zp_gid; + gid = zfs_fuid_map_id(zfsvfs, fgid, + cr, ZFS_GROUP); + } else { + fgid = zfs_fuid_create_cred(zfsvfs, + ZFS_GROUP, tx, cr, fuidp); #ifdef __FreeBSD__ - gid = parent->z_phys->zp_gid; + gid = parent->z_phys->zp_gid; #else - gid = (parent->z_phys->zp_mode & S_ISGID) ? - parent->z_phys->zp_gid : crgetgid(cr); + gid = crgetgid(cr); #endif + } + } } /* @@ -1045,55 +1856,57 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag, * file's new group, clear the file's set-GID bit. 
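Condensed, the set-GID rule implemented below reads as follows; setgid_ok stands in for a successful secpolicy_vnode_setids_setgids() check and the demo_ name is hypothetical:

	#include <sys/stat.h>

	static mode_t
	demo_fixup_sgid(mode_t mode, mode_t parent_mode, int is_dir,
	    int setgid_ok)
	{
		/* directories under a set-GID parent always inherit it */
		if ((parent_mode & S_ISGID) && is_dir)
			return (mode | S_ISGID);
		/* otherwise a requested S_ISGID survives only if policy allows */
		if ((mode & S_ISGID) && !setgid_ok)
			return (mode & ~S_ISGID);
		return (mode);
	}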
*/ - if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) + if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) { mode |= S_ISGID; - else { + } else { if ((mode & S_ISGID) && - secpolicy_vnode_setids_setgids(cr, gid) != 0) + secpolicy_vnode_setids_setgids(ZTOV(zp), cr, gid) != 0) mode &= ~S_ISGID; } - zp->z_phys->zp_uid = uid; - zp->z_phys->zp_gid = gid; + zp->z_phys->zp_uid = fuid; + zp->z_phys->zp_gid = fgid; zp->z_phys->zp_mode = mode; - mutex_enter(&parent->z_lock); - pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE); - if (pull_down) { - mutex_enter(&parent->z_acl_lock); - VERIFY(0 == zfs_acl_node_read(parent, &paclp)); - mutex_exit(&parent->z_acl_lock); - aclp = zfs_acl_inherit(zp, paclp); - zfs_acl_free(paclp); + if (aclp == NULL) { + mutex_enter(&parent->z_lock); + if ((ZTOV(parent)->v_type == VDIR && + (parent->z_phys->zp_flags & ZFS_INHERIT_ACE)) && + !(zp->z_phys->zp_flags & ZFS_XATTR)) { + mutex_enter(&parent->z_acl_lock); + VERIFY(0 == zfs_acl_node_read(parent, &paclp, B_FALSE)); + mutex_exit(&parent->z_acl_lock); + aclp = zfs_acl_inherit(zp, paclp, &need_chmod); + zfs_acl_free(paclp); + } else { + aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + } + mutex_exit(&parent->z_lock); + mutex_enter(&zp->z_lock); + mutex_enter(&zp->z_acl_lock); + if (need_chmod) + zfs_acl_chmod(zp, mode, aclp); } else { - aclp = zfs_acl_alloc(6); + mutex_enter(&zp->z_lock); + mutex_enter(&zp->z_acl_lock); } - mutex_exit(&parent->z_lock); - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); - error = zfs_acl_chmod(zp, mode, aclp, tx); + + /* Force auto_inherit on all new directory objects */ + if (vap->va_type == VDIR) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + + error = zfs_aclset_common(zp, aclp, cr, fuidp, tx); + + /* Set optional attributes if any */ + if (vap->va_mask & AT_XVATTR) + zfs_xvattr_set(zp, xvap); + mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); ASSERT3U(error, ==, 0); - zfs_acl_free(aclp); -} - -/* - * Should ACE be inherited? - */ -static int -zfs_ace_can_use(znode_t *zp, ace_t *acep) -{ - int vtype = ZTOV(zp)->v_type; - int iflags = (acep->a_flags & 0xf); - - if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) - return (1); - else if (iflags & ACE_FILE_INHERIT_ACE) - return (!((vtype == VDIR) && - (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); - return (0); + if (aclp != setaclp) + zfs_acl_free(aclp); } #ifdef TODO @@ -1101,42 +1914,89 @@ zfs_ace_can_use(znode_t *zp, ace_t *acep) * Retrieve a files ACL */ int -zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) { zfs_acl_t *aclp; - ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + ulong_t mask; int error; + int count = 0; + int largeace = 0; - if (error = zfs_zaccess(zp, ACE_READ_ACL, cr)) { - /* - * If owner of file then allow reading of the - * ACL. 
- */ - if (crgetuid(cr) != zp->z_phys->zp_uid) - return (error); - } + mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | + VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); + + if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)) + return (error); if (mask == 0) return (ENOSYS); mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, &aclp); + error = zfs_acl_node_read(zp, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); } + /* + * Scan ACL to determine number of ACEs + */ + if ((zp->z_phys->zp_flags & ZFS_ACL_OBJ_ACE) && + !(mask & VSA_ACE_ALLTYPES)) { + void *zacep = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t type, iflags; + + while (zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type)) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + largeace++; + continue; + default: + count++; + } + } + vsecp->vsa_aclcnt = count; + } else + count = aclp->z_acl_count; if (mask & VSA_ACECNT) { - vsecp->vsa_aclcnt = aclp->z_acl_count; + vsecp->vsa_aclcnt = count; } if (mask & VSA_ACE) { - vsecp->vsa_aclentp = kmem_alloc(aclp->z_acl_count * - sizeof (ace_t), KM_SLEEP); - bcopy(aclp->z_acl, vsecp->vsa_aclentp, - aclp->z_acl_count * sizeof (ace_t)); + size_t aclsz; + + zfs_acl_node_t *aclnode = list_head(&aclp->z_acl); + + aclsz = count * sizeof (ace_t) + + sizeof (ace_object_t) * largeace; + + vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); + vsecp->vsa_aclentsz = aclsz; + + if (aclp->z_version == ZFS_ACL_VERSION_FUID) + zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, + vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); + else { + bcopy(aclnode->z_acldata, vsecp->vsa_aclentp, + count * sizeof (ace_t)); + } + } + if (mask & VSA_ACE_ACLFLAGS) { + vsecp->vsa_aclflags = 0; + if (zp->z_phys->zp_flags & ZFS_ACL_DEFAULTED) + vsecp->vsa_aclflags |= ACL_DEFAULTED; + if (zp->z_phys->zp_flags & ZFS_ACL_PROTECTED) + vsecp->vsa_aclflags |= ACL_PROTECTED; + if (zp->z_phys->zp_flags & ZFS_ACL_AUTO_INHERIT) + vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; } mutex_exit(&zp->z_acl_lock); @@ -1147,37 +2007,100 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) } #endif /* TODO */ +int +zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, + vsecattr_t *vsecp, zfs_acl_t **zaclp) +{ + zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + int aclcnt = vsecp->vsa_aclcnt; + int error; + + if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) + return (EINVAL); + + aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); + + aclp->z_hints = 0; + aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + if ((error = zfs_copy_ace_2_oldace(obj_type, aclp, + (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, + aclcnt, &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } else { + if ((error = zfs_copy_ace_2_fuid(obj_type, aclp, + vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, + &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } + aclp->z_acl_bytes = aclnode->z_size; + aclnode->z_ace_count = aclcnt; + aclp->z_acl_count = aclcnt; + list_insert_head(&aclp->z_acl, aclnode); + + /* + * If flags are being set then add them to z_hints + */ + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { + if (vsecp->vsa_aclflags & ACL_PROTECTED) + aclp->z_hints |= ZFS_ACL_PROTECTED; + if 
(vsecp->vsa_aclflags & ACL_DEFAULTED) + aclp->z_hints |= ZFS_ACL_DEFAULTED; + if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + } + + *zaclp = aclp; + + return (0); +} + #ifdef TODO /* * Set a files ACL */ int -zfs_setacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; - ace_t *acep = vsecp->vsa_aclentp; - int aclcnt = vsecp->vsa_aclcnt; ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); dmu_tx_t *tx; int error; - int inherit; zfs_acl_t *aclp; + zfs_fuid_info_t *fuidp = NULL; if (mask == 0) - return (EINVAL); + return (ENOSYS); - if (!zfs_acl_valid(zp, acep, aclcnt, &inherit)) - return (EINVAL); + if (zp->z_phys->zp_flags & ZFS_IMMUTABLE) + return (EPERM); + + if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) + return (error); + + error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, &aclp); + if (error) + return (error); + + /* + * If ACL wide flags aren't being set then preserve any + * existing flags. + */ + if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { + aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); + } top: - error = zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr); - if (error == EACCES || error == ACCESS_UNDETERMINED) { - if ((error = secpolicy_vnode_setdac(cr, - zp->z_phys->zp_uid)) != 0) { - return (error); - } - } else if (error) { - return (error == EROFS ? error : EPERM); + if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) { + zfs_acl_free(aclp); + return (error); } mutex_enter(&zp->z_lock); @@ -1187,10 +2110,34 @@ top: dmu_tx_hold_bonus(tx, zp->z_id); if (zp->z_phys->zp_acl.z_acl_extern_obj) { - dmu_tx_hold_write(tx, zp->z_phys->zp_acl.z_acl_extern_obj, - 0, ZFS_ACL_SIZE(aclcnt)); - } else if (aclcnt > ACE_SLOT_CNT) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ZFS_ACL_SIZE(aclcnt)); + /* Are we upgrading ACL? 
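For orientation, a hypothetical in-kernel caller of zfs_setacl() (still fenced off by #ifdef TODO in this port) would fill in a vsecattr_t and let zfs_vsec_2_aclp() above do the conversion and validation; a sketch, with error handling trimmed:

	static int
	demo_set_acl(znode_t *zp, ace_t *aces, int count, cred_t *cr)
	{
		vsecattr_t vsec;

		vsec.vsa_mask = VSA_ACE | VSA_ACECNT;
		vsec.vsa_aclentp = aces;
		vsec.vsa_aclcnt = count;
		vsec.vsa_aclentsz = count * sizeof (ace_t);

		/* B_FALSE: do not skip the ACE_WRITE_ACL access check */
		return (zfs_setacl(zp, &vsec, B_FALSE, cr));
	}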
*/ + if (zfsvfs->z_version <= ZPL_VERSION_FUID && + zp->z_phys->zp_acl.z_acl_version == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, + zp->z_phys->zp_acl.z_acl_extern_obj, + 0, DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, + zp->z_phys->zp_acl.z_acl_extern_obj, + 0, aclp->z_acl_bytes); + } + } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); + } + if (aclp->z_has_fuids) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } } error = dmu_tx_assign(tx, zfsvfs->z_assign); @@ -1204,17 +2151,18 @@ top: goto top; } dmu_tx_abort(tx); + zfs_acl_free(aclp); return (error); } - aclp = zfs_acl_alloc(aclcnt); - bcopy(acep, aclp->z_acl, sizeof (ace_t) * aclcnt); - aclp->z_acl_count = aclcnt; - error = zfs_aclset_common(zp, aclp, tx, &inherit); + error = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); ASSERT(error == 0); + zfs_log_acl(zilog, tx, zp, vsecp, fuidp); + + if (fuidp) + zfs_fuid_info_free(fuidp); zfs_acl_free(aclp); - zfs_log_acl(zilog, tx, TX_ACL, zp, aclcnt, acep); dmu_tx_commit(tx); done: mutex_exit(&zp->z_acl_lock); @@ -1224,46 +2172,34 @@ done: } #endif /* TODO */ +/* + * working_mode returns the permissions that were not granted + */ static int -zfs_ace_access(ace_t *zacep, int *working_mode) -{ - if (*working_mode == 0) { - return (0); - } - - if (zacep->a_access_mask & *working_mode) { - if (zacep->a_type == ALLOW) { - *working_mode &= - ~(*working_mode & zacep->a_access_mask); - if (*working_mode == 0) - return (0); - } else if (zacep->a_type == DENY) { - return (EACCES); - } - } - - /* - * haven't been specifcally denied at this point - * so return UNDETERMINED. - */ - - return (ACCESS_UNDETERMINED); -} - - -static int -zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr) +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) { zfs_acl_t *aclp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ace_t *zacep; - gid_t gid; - int cnt; - int i; int error; - int access_deny = ACCESS_UNDETERMINED; - uint_t entry_type; uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t fowner; + uid_t gowner; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0) + return (0); + + *check_privs = B_TRUE; if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ *working_mode = 0; @@ -1275,93 +2211,155 @@ zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr) if ((v4_mode & WRITE_MASK) && (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && (!IS_DEVVP(ZTOV(zp)))) { + *check_privs = B_FALSE; return (EROFS); } + /* + * Only check for READONLY on non-directories. 
+ */ + if ((v4_mode & WRITE_MASK_DATA) && + (((ZTOV(zp)->v_type != VDIR) && + (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) || + (ZTOV(zp)->v_type == VDIR && + (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) { + *check_privs = B_FALSE; + return (EPERM); + } + + if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && + (zp->z_phys->zp_flags & ZFS_NOUNLINK)) { + *check_privs = B_FALSE; + return (EPERM); + } + + if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && + (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) { + *check_privs = B_FALSE; + return (EACCES); + } + + /* + * The caller requested that the ACL check be skipped. This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, &aclp); + error = zfs_acl_node_read(zp, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); } + while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type)) { - zacep = aclp->z_acl; - cnt = aclp->z_acl_count; + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; - for (i = 0; i != cnt; i++) { + if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) + continue; - DTRACE_PROBE2(zfs__access__common, - ace_t *, &zacep[i], int, *working_mode); + entry_type = (iflags & ACE_TYPE_FLAGS); - if (zacep[i].a_flags & ACE_INHERIT_ONLY_ACE) - continue; + checkit = B_FALSE; - entry_type = (zacep[i].a_flags & ACE_TYPE_FLAGS); switch (entry_type) { case ACE_OWNER: - if (uid == zp->z_phys->zp_uid) { - access_deny = zfs_ace_access(&zacep[i], - working_mode); - } + if (uid == fowner) + checkit = B_TRUE; break; - case (ACE_IDENTIFIER_GROUP | ACE_GROUP): + case OWNING_GROUP: + who = gowner; + /*FALLTHROUGH*/ case ACE_IDENTIFIER_GROUP: - /* - * Owning group gid is in znode not ACL - */ - if (entry_type == (ACE_IDENTIFIER_GROUP | ACE_GROUP)) - gid = zp->z_phys->zp_gid; - else - gid = zacep[i].a_who; - - if (groupmember(gid, cr)) { - access_deny = zfs_ace_access(&zacep[i], - working_mode); - } + checkit = zfs_groupmember(zfsvfs, who, cr); break; case ACE_EVERYONE: - access_deny = zfs_ace_access(&zacep[i], working_mode); + checkit = B_TRUE; break; /* USER Entry */ default: if (entry_type == 0) { - if (uid == zacep[i].a_who) { - access_deny = zfs_ace_access(&zacep[i], - working_mode); - } + uid_t newid; + + newid = zfs_fuid_map_id(zfsvfs, who, cr, + ZFS_ACE_USER); + if (newid != IDMAP_WK_CREATOR_OWNER_UID && + uid == newid) + checkit = B_TRUE; break; + } else { + zfs_acl_free(aclp); + mutex_exit(&zp->z_acl_lock); + return (EIO); + } + } + + if (checkit) { + uint32_t mask_matched = (access_mask & *working_mode); + + if (mask_matched) { + if (type == DENY) + deny_mask |= mask_matched; + + *working_mode &= ~mask_matched; } - zfs_acl_free(aclp); - mutex_exit(&zp->z_acl_lock); - return (EIO); } - if (access_deny != ACCESS_UNDETERMINED) + /* Are we done? 
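The loop above is the entire NFSv4 evaluation algorithm, so a stand-alone model of the working_mode/deny_mask bookkeeping may help; demo_ names are hypothetical, and the -1 mirrors the undetermined result handed to the privilege fallback in zfs_zaccess():

	#include <errno.h>
	#include <stdint.h>

	struct demo_ace {
		int		applies;	/* entry matches the caller? */
		int		is_deny;
		uint32_t	mask;
	};

	static int
	demo_eval(const struct demo_ace *aces, int n, uint32_t wanted)
	{
		uint32_t working = wanted, denied = 0;
		int i;

		for (i = 0; i < n && working != 0; i++) {
			uint32_t hit;

			if (!aces[i].applies)
				continue;
			hit = aces[i].mask & working;
			if (hit == 0)
				continue;
			if (aces[i].is_deny)
				denied |= hit;	/* sticks even if allowed later */
			working &= ~hit;
		}
		if (denied)
			return (EACCES);
		return (working ? -1 : 0);	/* -1: undetermined bits remain */
	}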
*/ + if (*working_mode == 0) break; } mutex_exit(&zp->z_acl_lock); zfs_acl_free(aclp); - return (access_deny); + /* Put the found 'denies' back on the working mode */ + if (deny_mask) { + *working_mode |= deny_mask; + return (EACCES); + } else if (*working_mode) { + return (-1); + } + + return (0); } +static int +zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, + cred_t *cr) +{ + if (*working_mode != ACE_WRITE_DATA) + return (EACCES); + + return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, + check_privs, B_FALSE, cr)); +} /* * Determine whether Access should be granted/denied, invoking least * priv subsytem when a deny is determined. */ int -zfs_zaccess(znode_t *zp, int mode, cred_t *cr) +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) { - int working_mode; - int error; - int is_attr; - znode_t *xzp; - znode_t *check_zp = zp; + uint32_t working_mode; + int error; + int is_attr; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + boolean_t check_privs; + znode_t *xzp; + znode_t *check_zp = zp; is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR)); @@ -1374,7 +2372,9 @@ zfs_zaccess(znode_t *zp, int mode, cred_t *cr) zp->z_phys->zp_parent, &xzp)) != 0) { return (error); } + check_zp = xzp; + /* * fixup mode to map to xattr perms */ @@ -1390,18 +2390,76 @@ zfs_zaccess(znode_t *zp, int mode, cred_t *cr) } } - error = zfs_zaccess_common(check_zp, mode, &working_mode, cr); + if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, + &check_privs, skipaclchk, cr)) == 0) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (0); + } - if (error == EROFS) { + if (error && !check_privs) { if (is_attr) VN_RELE(ZTOV(xzp)); return (error); } - if (error || working_mode) { - working_mode = (zfs_v4_to_unix(working_mode) << 6); - error = secpolicy_vnode_access(cr, ZTOV(check_zp), - check_zp->z_phys->zp_uid, working_mode); + if (error && (flags & V_APPEND)) { + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + } + + if (error && check_privs) { + uid_t owner; + mode_t checkmode = 0; + + owner = zfs_fuid_map_id(zfsvfs, check_zp->z_phys->zp_uid, cr, + ZFS_OWNER); + + /* + * First check for implicit owner permission on + * read_acl/read_attributes + */ + + error = 0; + ASSERT(working_mode != 0); + + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && + owner == crgetuid(cr))) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VWRITE; + if (working_mode & ACE_EXECUTE) + checkmode |= VEXEC; + + if (checkmode) + error = secpolicy_vnode_access(cr, ZTOV(check_zp), + owner, checkmode); + + if (error == 0 && (working_mode & ACE_WRITE_OWNER)) + error = secpolicy_vnode_chown(ZTOV(check_zp), cr, B_TRUE); + if (error == 0 && (working_mode & ACE_WRITE_ACL)) + error = secpolicy_vnode_setdac(ZTOV(check_zp), cr, owner); + + if (error == 0 && (working_mode & + (ACE_DELETE|ACE_DELETE_CHILD))) + error = secpolicy_vnode_remove(ZTOV(check_zp), cr); + + if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { + error = secpolicy_vnode_chown(ZTOV(check_zp), cr, B_FALSE); + } + if (error == 0) { + /* + * See if any bits other than those already checked + * for are still present. 
If so then return EACCES + */ + if (working_mode & ~(ZFS_CHECKED_MASKS)) { + error = EACCES; + } + } } if (is_attr) @@ -1411,38 +2469,37 @@ zfs_zaccess(znode_t *zp, int mode, cred_t *cr) } /* - * Special zaccess function to check for special nfsv4 perm. - * doesn't call secpolicy_vnode_access() for failure, since that - * would probably be the wrong policy function to call. - * instead its up to the caller to handle that situation. + * Translate traditional unix VREAD/VWRITE/VEXEC mode into + * native ACL format and call zfs_zaccess() */ - int -zfs_zaccess_v4_perm(znode_t *zp, int mode, cred_t *cr) +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) { - int working_mode = 0; - return (zfs_zaccess_common(zp, mode, &working_mode, cr)); + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); } /* - * Translate tradition unix VREAD/VWRITE/VEXEC mode into - * native ACL format and call zfs_zaccess() + * Access function for secpolicy_vnode_setattr */ int -zfs_zaccess_rwx(znode_t *zp, mode_t mode, cred_t *cr) +zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) { int v4_mode = zfs_unix_to_v4(mode >> 6); - return (zfs_zaccess(zp, v4_mode, cr)); + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); } static int -zfs_delete_final_check(znode_t *zp, znode_t *dzp, cred_t *cr) +zfs_delete_final_check(znode_t *zp, znode_t *dzp, + mode_t missing_perms, cred_t *cr) { int error; + uid_t downer; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; - error = secpolicy_vnode_access(cr, ZTOV(zp), - dzp->z_phys->zp_uid, S_IWRITE|S_IEXEC); + downer = zfs_fuid_map_id(zfsvfs, dzp->z_phys->zp_uid, cr, ZFS_OWNER); + + error = secpolicy_vnode_access(cr, ZTOV(dzp), downer, missing_perms); if (error == 0) error = zfs_sticky_remove_access(dzp, zp, cr); @@ -1488,83 +2545,88 @@ zfs_delete_final_check(znode_t *zp, znode_t *dzp, cred_t *cr) int zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) { - int dzp_working_mode = 0; - int zp_working_mode = 0; + uint32_t dzp_working_mode = 0; + uint32_t zp_working_mode = 0; int dzp_error, zp_error; + mode_t missing_perms; + boolean_t dzpcheck_privs = B_TRUE; + boolean_t zpcheck_privs = B_TRUE; /* - * Arghh, this check is going to require a couple of questions - * to be asked. We want specific DELETE permissions to + * We want specific DELETE permissions to * take precedence over WRITE/EXECUTE. We don't * want an ACL such as this to mess us up. * user:joe:write_data:deny,user:joe:delete:allow * * However, deny permissions may ultimately be overridden * by secpolicy_vnode_access(). + * + * We will ask for all of the necessary permissions and then + * look at the working modes from the directory and target object + * to determine what was found. */ - dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, - &dzp_working_mode, cr); - zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, cr); - - if (dzp_error == EROFS || zp_error == EROFS) - return (dzp_error); + if (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) + return (EPERM); /* - * First check the first row. - * We only need to see if parent Allows delete_child + * First row + * If the directory permissions allow the delete, we are done. */ - if ((dzp_working_mode & ACE_DELETE_CHILD) == 0) + if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) return (0); /* - * Second row - * we already have the necessary information in - * zp_working_mode, zp_error and dzp_error. 
+ * If target object has delete permission then we are done */ - - if ((zp_working_mode & ACE_DELETE) == 0) + if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, + &zpcheck_privs, B_FALSE, cr)) == 0) return (0); + ASSERT(dzp_error && zp_error); + + if (!dzpcheck_privs) + return (dzp_error); + if (!zpcheck_privs) + return (zp_error); + /* - * Now zp_error should either be EACCES which indicates - * a "deny" delete entry or ACCESS_UNDETERMINED if the "delete" - * entry exists on the target. + * Second row * - * dzp_error should be either EACCES which indicates a "deny" - * entry for delete_child or ACCESS_UNDETERMINED if no delete_child - * entry exists. If value is EACCES then we are done - * and zfs_delete_final_check() will make the final decision - * regarding to allow the delete. + * If directory returns EACCES then delete_child was denied + * due to deny delete_child. In this case send the request through + * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() + * since that *could* allow the delete based on write/execute permission + * and we want delete permissions to override write/execute. */ - ASSERT(zp_error != 0 && dzp_error != 0); if (dzp_error == EACCES) - return (zfs_delete_final_check(zp, dzp, cr)); + return (secpolicy_vnode_remove(ZTOV(dzp), cr)); /* XXXPJD: s/dzp/zp/ ? */ /* * Third Row - * Only need to check for write/execute on parent + * only need to see if we have write/execute on directory. */ - dzp_error = zfs_zaccess_common(dzp, ACE_WRITE_DATA|ACE_EXECUTE, - &dzp_working_mode, cr); + if ((dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) + return (zfs_sticky_remove_access(dzp, zp, cr)); - if (dzp_error == EROFS) + if (!dzpcheck_privs) return (dzp_error); - if ((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) == 0) - return (zfs_sticky_remove_access(dzp, zp, cr)); - /* - * Fourth Row + * Fourth row */ - if (((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) != 0) && - ((zp_working_mode & ACE_DELETE) == 0)) - return (zfs_sticky_remove_access(dzp, zp, cr)); + missing_perms = (dzp_working_mode & ACE_WRITE_DATA) ? VWRITE : 0; + missing_perms |= (dzp_working_mode & ACE_EXECUTE) ? VEXEC : 0; + + ASSERT(missing_perms); + + return (zfs_delete_final_check(zp, dzp, missing_perms, cr)); - return (zfs_delete_final_check(zp, dzp, cr)); } int @@ -1574,6 +2636,9 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, int add_perm; int error; + if (szp->z_phys->zp_flags & ZFS_AV_QUARANTINED) + return (EACCES); + add_perm = (ZTOV(szp)->v_type == VDIR) ? ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; @@ -1586,7 +2651,7 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, * to another. 
*/ if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) { - if (error = zfs_zaccess(szp, ACE_WRITE_DATA, cr)) + if (error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr)) return (error); } @@ -1610,7 +2675,7 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, /* * Now check for add permissions */ - error = zfs_zaccess(tdzp, add_perm, cr); + error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c index c8450d488bdb..b6c43f4245f2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,7 +32,7 @@ #include <sys/zfs_acl.h> void -zfs_ace_byteswap(ace_t *ace, int ace_cnt) +zfs_oldace_byteswap(ace_t *ace, int ace_cnt) { int i; @@ -45,9 +44,78 @@ zfs_ace_byteswap(ace_t *ace, int ace_cnt) } } +/* + * swap ace_t and ace_object_t + */ +void +zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) +{ +#ifdef TODO + caddr_t end; + caddr_t ptr; + zfs_ace_t *zacep; + ace_t *acep; + uint16_t entry_type; + size_t entry_size; + int ace_type; + + end = (caddr_t)buf + size; + ptr = buf; + + while (ptr < end) { + if (zfs_layout) { + zacep = (zfs_ace_t *)ptr; + zacep->z_hdr.z_access_mask = + BSWAP_32(zacep->z_hdr.z_access_mask); + zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags); + ace_type = zacep->z_hdr.z_type = + BSWAP_16(zacep->z_hdr.z_type); + entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS; + } else { + acep = (ace_t *)ptr; + acep->a_access_mask = BSWAP_32(acep->a_access_mask); + acep->a_flags = BSWAP_16(acep->a_flags); + ace_type = acep->a_type = BSWAP_16(acep->a_type); + acep->a_who = BSWAP_32(acep->a_who); + entry_type = acep->a_flags & ACE_TYPE_FLAGS; + } + switch (entry_type) { + case ACE_OWNER: + case ACE_EVERYONE: + case (ACE_IDENTIFIER_GROUP | ACE_GROUP): + entry_size = zfs_layout ? + sizeof (zfs_ace_hdr_t) : sizeof (ace_t); + break; + case ACE_IDENTIFIER_GROUP: + default: + if (zfs_layout) { + zacep->z_fuid = BSWAP_64(zacep->z_fuid); + } + switch (ace_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + entry_size = zfs_layout ? + sizeof (zfs_object_ace_t) : + sizeof (ace_object_t); + break; + default: + entry_size = zfs_layout ? 
sizeof (zfs_ace_t) : + sizeof (ace_t); + break; + } + } + ptr = ptr + entry_size; + } +#else /* TODO */ + panic("%s:%u: TODO", __func__, __LINE__); +#endif /* TODO */ +} + /* ARGSUSED */ void -zfs_acl_byteswap(void *buf, size_t size) +zfs_oldacl_byteswap(void *buf, size_t size) { int cnt; @@ -58,7 +126,14 @@ cnt = size / sizeof (ace_t); - zfs_ace_byteswap((ace_t *)buf, cnt); + zfs_oldace_byteswap((ace_t *)buf, cnt); +} + +/* ARGSUSED */ +void +zfs_acl_byteswap(void *buf, size_t size) +{ + zfs_ace_byteswap(buf, size, B_TRUE); } void @@ -86,14 +161,19 @@ zfs_znode_byteswap(void *buf, size_t size) zp->zp_flags = BSWAP_64(zp->zp_flags); zp->zp_uid = BSWAP_64(zp->zp_uid); zp->zp_gid = BSWAP_64(zp->zp_gid); + zp->zp_zap = BSWAP_64(zp->zp_zap); zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]); zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]); zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]); - zp->zp_pad[3] = BSWAP_64(zp->zp_pad[3]); zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj); - zp->zp_acl.z_acl_count = BSWAP_32(zp->zp_acl.z_acl_count); + zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size); zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version); - zp->zp_acl.z_acl_pad = BSWAP_16(zp->zp_acl.z_acl_pad); - zfs_ace_byteswap(&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT); + zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count); + if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) { + zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0], + ZFS_ACE_SPACE); + } else + zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0], + ACE_SLOT_CNT); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c index 286fe97e1142..654d2f949b3f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -53,6 +53,17 @@ * reliable way to auto-unmount the filesystem when it's "no longer in use". * When the user unmounts a filesystem, we call zfsctl_unmount(), which * unmounts any snapshots within the snapshot directory. + * + * The '.zfs', '.zfs/snapshot', and all directories created under + * '.zfs/snapshot' (i.e., '.zfs/snapshot/<snapname>') are all GFS nodes and + * share the same vfs_t as the head filesystem (what '.zfs' lives under). + * + * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>' + * (i.e., snapshots) are ZFS nodes and have their own unique vfs_t. + * However, vnodes within these mounted-on file systems have their v_vfsp + * fields set to the head filesystem to make NFS happy (see + * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t + * so that it cannot be freed until all snapshots have been unmounted. 
*/ #include <sys/zfs_context.h> @@ -63,7 +74,23 @@ #include <sys/gfs.h> #include <sys/stat.h> #include <sys/dmu.h> +#include <sys/dsl_deleg.h> #include <sys/mount.h> +#include <sys/sunddi.h> + +#include "zfs_namecheck.h" + +typedef struct zfsctl_node { + gfs_dir_t zc_gfs_private; + uint64_t zc_id; + timestruc_t zc_cmtime; /* ctime and mtime, always the same */ +} zfsctl_node_t; + +typedef struct zfsctl_snapdir { + zfsctl_node_t sd_node; + kmutex_t sd_lock; + avl_tree_t sd_snaps; +} zfsctl_snapdir_t; typedef struct { char *se_name; @@ -92,18 +119,7 @@ static struct vop_vector zfsctl_ops_snapshot; static vnode_t *zfsctl_mknode_snapdir(vnode_t *); static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); - -typedef struct zfsctl_node { - gfs_dir_t zc_gfs_private; - uint64_t zc_id; - timestruc_t zc_cmtime; /* ctime and mtime, always the same */ -} zfsctl_node_t; - -typedef struct zfsctl_snapdir { - zfsctl_node_t sd_node; - kmutex_t sd_lock; - avl_tree_t sd_snaps; -} zfsctl_snapdir_t; +static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *); /* * Root directory elements. We have only a single static entry, 'snapshot'. @@ -237,14 +253,14 @@ static int zfsctl_common_access(ap) struct vop_access_args /* { struct vnode *a_vp; - accmode_t a_accmode; + int a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { - accmode_t accmode = ap->a_accmode; + int mode = ap->a_accmode; - if (accmode & VWRITE) + if (mode & VWRITE) return (EACCES); return (0); @@ -283,6 +299,7 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) vap->va_flags = 0; } +/*ARGSUSED*/ static int zfsctl_common_fid(ap) struct vop_fid_args /* { @@ -360,6 +377,7 @@ zfsctl_root_getattr(ap) struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; + struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; @@ -382,11 +400,18 @@ zfsctl_root_getattr(ap) /* ARGSUSED */ int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, - int flags, vnode_t *rdir, cred_t *cr) + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; int err; + /* + * No extended attributes allowed under .zfs + */ + if (flags & LOOKUP_XATTR) + return (EINVAL); + ZFS_ENTER(zfsvfs); if (strcmp(nm, "..") == 0) { @@ -394,7 +419,8 @@ zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, if (err == 0) VOP_UNLOCK(*vpp, 0); } else { - err = gfs_dir_lookup(dvp, nm, vpp); + err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp); } ZFS_EXIT(zfsvfs); @@ -407,7 +433,7 @@ zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, */ /* ARGSUSED */ int -zfsctl_root_lookup_vop(ap) +zfsctl_freebsd_root_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; @@ -428,7 +454,7 @@ zfsctl_root_lookup_vop(ap) ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); - err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr); + err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr, NULL, NULL, NULL); if (err == 0 && (nm[0] != '.' 
|| nm[1] != '\0')) vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); @@ -443,7 +469,7 @@ static struct vop_vector zfsctl_ops_root = { .vop_getattr = zfsctl_root_getattr, .vop_access = zfsctl_common_access, .vop_readdir = gfs_vop_readdir, - .vop_lookup = zfsctl_root_lookup_vop, + .vop_lookup = zfsctl_freebsd_root_lookup, .vop_inactive = gfs_vop_inactive, .vop_reclaim = zfsctl_common_reclaim, .vop_fid = zfsctl_common_fid, @@ -454,6 +480,8 @@ zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) { objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + if (snapshot_namecheck(name, NULL, NULL) != 0) + return (EILSEQ); dmu_objset_name(os, zname); if (strlen(zname) + 1 + strlen(name) >= len) return (ENAMETOOLONG); @@ -463,38 +491,18 @@ zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) } static int -zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr) +zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr) { - zfsctl_snapdir_t *sdp = dvp->v_data; - zfs_snapentry_t search, *sep; - struct vop_inactive_args ap; - avl_index_t where; - int err; - - ASSERT(MUTEX_HELD(&sdp->sd_lock)); - - search.se_name = (char *)name; - if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) - return (ENOENT); + vnode_t *svp = sep->se_root; + int error; - ASSERT(vn_ismntpt(sep->se_root)); + ASSERT(vn_ismntpt(svp)); /* this will be dropped by dounmount() */ - if ((err = vn_vfswlock(sep->se_root)) != 0) - return (err); - - err = dounmount(vn_mountedvfs(sep->se_root), force, curthread); - if (err) - return (err); - ASSERT(sep->se_root->v_count == 1); - ap.a_vp = sep->se_root; - gfs_vop_inactive(&ap); - - avl_remove(&sdp->sd_snaps, sep); - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - kmem_free(sep, sizeof (zfs_snapentry_t)); + if ((error = vn_vfswlock(svp)) != 0) + return (error); - return (0); + return (dounmount(vn_mountedvfs(svp), fflags, curthread)); } #if 0 @@ -553,20 +561,40 @@ zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) #endif #if 0 +/*ARGSUSED*/ static int zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, - cred_t *cr) + cred_t *cr, caller_context_t *ct, int flags) { zfsctl_snapdir_t *sdp = sdvp->v_data; zfs_snapentry_t search, *sep; + zfsvfs_t *zfsvfs; avl_index_t where; char from[MAXNAMELEN], to[MAXNAMELEN]; + char real[MAXNAMELEN]; int err; + zfsvfs = sdvp->v_vfsp->vfs_data; + ZFS_ENTER(zfsvfs); + + if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + err = dmu_snapshot_realname(zfsvfs->z_os, snm, real, + MAXNAMELEN, NULL); + if (err == 0) { + snm = real; + } else if (err != ENOTSUP) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + ZFS_EXIT(zfsvfs); + err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from); - if (err) - return (err); - err = zfs_secpolicy_write(from, cr); + if (!err) + err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to); + if (!err) + err = zfs_secpolicy_rename_perms(from, to, cr); if (err) return (err); @@ -579,10 +607,6 @@ zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, if (strcmp(snm, tnm) == 0) return (0); - err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to); - if (err) - return (err); - mutex_enter(&sdp->sd_lock); search.se_name = (char *)snm; @@ -604,29 +628,55 @@ zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, #if 0 /* ARGSUSED */ static int -zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) +zfsctl_snapdir_remove(vnode_t *dvp, char 
*name, vnode_t *cwd, cred_t *cr, + caller_context_t *ct, int flags) { zfsctl_snapdir_t *sdp = dvp->v_data; + zfs_snapentry_t *sep; + zfs_snapentry_t search; + zfsvfs_t *zfsvfs; char snapname[MAXNAMELEN]; + char real[MAXNAMELEN]; int err; + zfsvfs = dvp->v_vfsp->vfs_data; + ZFS_ENTER(zfsvfs); + + if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + + err = dmu_snapshot_realname(zfsvfs->z_os, name, real, + MAXNAMELEN, NULL); + if (err == 0) { + name = real; + } else if (err != ENOTSUP) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + ZFS_EXIT(zfsvfs); + err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname); - if (err) - return (err); - err = zfs_secpolicy_write(snapname, cr); + if (!err) + err = zfs_secpolicy_destroy_perms(snapname, cr); if (err) return (err); mutex_enter(&sdp->sd_lock); - err = zfsctl_unmount_snap(dvp, name, 0, cr); - if (err) { - mutex_exit(&sdp->sd_lock); - return (err); + search.se_name = name; + sep = avl_find(&sdp->sd_snaps, &search, NULL); + if (sep) { + avl_remove(&sdp->sd_snaps, sep); + err = zfsctl_unmount_snap(sep, MS_FORCE, cr); + if (err) + avl_add(&sdp->sd_snaps, sep); + else + err = dmu_objset_destroy(snapname); + } else { + err = ENOENT; } - err = dmu_objset_destroy(snapname); - mutex_exit(&sdp->sd_lock); return (err); @@ -634,6 +684,57 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) #endif /* + * This creates a snapshot under '.zfs/snapshot'. + */ +/* ARGSUSED */ +static int +zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, + cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp) +{ + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + char name[MAXNAMELEN]; + int err; + static enum symfollow follow = NO_FOLLOW; + static enum uio_seg seg = UIO_SYSSPACE; + + if (snapshot_namecheck(dirname, NULL, NULL) != 0) + return (EILSEQ); + + dmu_objset_name(zfsvfs->z_os, name); + + *vpp = NULL; + + err = zfs_secpolicy_snapshot_perms(name, cr); + if (err) + return (err); + + if (err == 0) { + err = dmu_objset_snapshot(name, dirname, B_FALSE); + if (err) + return (err); + err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); + } + + return (err); +} + +static int +zfsctl_freebsd_snapdir_mkdir(ap) + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + + ASSERT(ap->a_cnp->cn_flags & SAVENAME); + + return (zfsctl_snapdir_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, NULL, + ap->a_vpp, ap->a_cnp->cn_cred, NULL, 0, NULL)); +} + +/* * Lookup entry point for the 'snapshot' directory. Try to open the * snapshot if it exists, creating the pseudo filesystem vnode as necessary. * Perform a mount of the associated dataset on top of the vnode. 
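As background for the lookup path in the next hunk: zfsctl_snapshot_zname(), modified earlier in this file, builds the full dataset name by joining the objset name and the snapshot component with '@', rejecting names that fail snapshot_namecheck() or overflow the buffer. A minimal sketch of that construction, assuming only standard C (the helper name is illustrative, not part of the patch):

#include <errno.h>
#include <stdio.h>
#include <string.h>

/*
 * Sketch: form "dataset@snapshot" the way zfsctl_snapshot_zname()
 * does, returning ENAMETOOLONG when the combined name cannot fit,
 * mirroring the length check in the patched function.
 */
static int
snapshot_zname_sketch(const char *dataset, const char *snap,
    char *zname, size_t len)
{
	if (strlen(dataset) + 1 + strlen(snap) >= len)
		return (ENAMETOOLONG);
	(void) snprintf(zname, len, "%s@%s", dataset, snap);
	return (0);
}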
@@ -649,17 +750,25 @@ zfsctl_snapdir_lookup(ap) { vnode_t *dvp = ap->a_dvp; vnode_t **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; zfsctl_snapdir_t *sdp = dvp->v_data; objset_t *snap; char snapname[MAXNAMELEN]; + char real[MAXNAMELEN]; char *mountpoint; zfs_snapentry_t *sep, search; size_t mountpoint_len; avl_index_t where; zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; int err; + int flags = 0; + /* + * No extended attributes allowed under .zfs + */ + if (flags & LOOKUP_XATTR) + return (EINVAL); ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); @@ -681,6 +790,26 @@ zfsctl_snapdir_lookup(ap) ZFS_ENTER(zfsvfs); + if (flags & FIGNORECASE) { + boolean_t conflict = B_FALSE; + + err = dmu_snapshot_realname(zfsvfs->z_os, nm, real, + MAXNAMELEN, &conflict); + if (err == 0) { + strlcpy(nm, real, sizeof(nm)); + } else if (err != ENOTSUP) { + ZFS_EXIT(zfsvfs); + return (err); + } +#if 0 + if (realpnp) + (void) strlcpy(realpnp->pn_buf, nm, + realpnp->pn_bufsize); + if (conflict && direntflags) + *direntflags = ED_CASE_CONFLICT; +#endif + } + mutex_enter(&sdp->sd_lock); search.se_name = (char *)nm; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { @@ -692,6 +821,13 @@ zfsctl_snapdir_lookup(ap) * try to remount it. */ goto domount; + } else { + /* + * VROOT was set during the traverse call. We need + * to clear it since we're pretending to be part + * of our parent's vfs. + */ + (*vpp)->v_flag &= ~VROOT; } vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); mutex_exit(&sdp->sd_lock); @@ -706,13 +842,25 @@ zfsctl_snapdir_lookup(ap) if (err) { mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); - return (err); + /* + * Handle "ls *" or "?" in a graceful manner by forcing + * EILSEQ to ENOENT, since the shell ultimately passes + * "*" or "?" as the name to look up. + */ + return (err == EILSEQ ? ENOENT : err); } if (dmu_objset_open(snapname, DMU_OST_ZFS, - DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) { + DS_MODE_USER | DS_MODE_READONLY, &snap) != 0) { mutex_exit(&sdp->sd_lock); + /* Translate errors and add SAVENAME when needed. */ + if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE) { + err = EJUSTRETURN; + cnp->cn_flags |= SAVENAME; + } else { + err = ENOENT; + } ZFS_EXIT(zfsvfs); - return (ENOENT); + return (err); } sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP); @@ -735,7 +883,6 @@ domount: if (err == 0) vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); mutex_exit(&sdp->sd_lock); - /* * If we had an error, drop our hold on the vnode and * zfsctl_snapshot_inactive() will clean up. 
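The lookup above locates mounted snapshots by name via avl_find() on sd_snaps, keyed by search.se_name. The tree's comparator lies outside these hunks; a plausible sketch of it, assuming the zfs_snapentry_t layout declared near the top of this file (the _sketch names and stand-in type are illustrative):

#include <string.h>

typedef struct {
	char *se_name;	/* stand-in for zfs_snapentry_t's key field */
} snapentry_sketch_t;

/* AVL comparators must collapse strcmp() results to -1/0/+1. */
static int
snapentry_compare_sketch(const void *a, const void *b)
{
	const snapentry_sketch_t *sa = a;
	const snapentry_sketch_t *sb = b;
	int ret = strcmp(sa->se_name, sb->se_name);

	if (ret < 0)
		return (-1);
	if (ret > 0)
		return (1);
	return (0);
}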
@@ -750,25 +897,41 @@ domount: /* ARGSUSED */ static int -zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, - offset_t *offp, offset_t *nextp, void *data) +zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, + offset_t *offp, offset_t *nextp, void *data, int flags) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; char snapname[MAXNAMELEN]; uint64_t id, cookie; + boolean_t case_conflict; + int error; ZFS_ENTER(zfsvfs); cookie = *offp; - if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, - &cookie) == ENOENT) { - *eofp = 1; + error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, + &cookie, &case_conflict); + if (error) { ZFS_EXIT(zfsvfs); - return (0); + if (error == ENOENT) { + *eofp = 1; + return (0); + } + return (error); } - (void) strcpy(dp->d_name, snapname); - dp->d_ino = ZFSCTL_INO_SNAP(id); + if (flags & V_RDDIR_ENTFLAGS) { + edirent_t *eodp = dp; + + (void) strcpy(eodp->ed_name, snapname); + eodp->ed_ino = ZFSCTL_INO_SNAP(id); + eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0; + } else { + struct dirent64 *odp = dp; + + (void) strcpy(odp->d_name, snapname); + odp->d_ino = ZFSCTL_INO_SNAP(id); + } *nextp = cookie; ZFS_EXIT(zfsvfs); @@ -776,6 +939,13 @@ zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, return (0); } +/* + * pvp is the '.zfs' directory (zfsctl_node_t). + * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). + * + * This function is the callback to create a GFS vnode for '.zfs/snapshot' + * when a lookup is performed on .zfs for "snapshot". + */ vnode_t * zfsctl_mknode_snapdir(vnode_t *pvp) { @@ -802,6 +972,7 @@ zfsctl_snapdir_getattr(ap) struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; + struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; @@ -847,6 +1018,7 @@ static struct vop_vector zfsctl_ops_snapdir = { .vop_ioctl = VOP_EINVAL, .vop_getattr = zfsctl_snapdir_getattr, .vop_access = zfsctl_common_access, + .vop_mkdir = zfsctl_freebsd_snapdir_mkdir, .vop_readdir = gfs_vop_readdir, .vop_lookup = zfsctl_snapdir_lookup, .vop_inactive = zfsctl_snapdir_inactive, @@ -854,6 +1026,13 @@ static struct vop_vector zfsctl_ops_snapdir = { .vop_fid = zfsctl_common_fid, }; +/* + * pvp is the GFS vnode '.zfs/snapshot'. + * + * This creates a GFS node under '.zfs/snapshot' representing each + * snapshot. This newly created GFS node is what we mount snapshot + * vfs_t's on top of. + */ static vnode_t * zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) { @@ -862,8 +1041,10 @@ zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp, &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); + VN_HOLD(vp); zcp = vp->v_data; zcp->zc_id = objset; + VFS_HOLD(vp->v_vfsp); VOP_UNLOCK(vp, 0); return (vp); @@ -877,13 +1058,14 @@ zfsctl_snapshot_inactive(ap) } */ *ap; { vnode_t *vp = ap->a_vp; + cred_t *cr = ap->a_td->td_ucred; struct vop_inactive_args iap; zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; int locked; vnode_t *dvp; - VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0); + VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0); sdp = dvp->v_data; VOP_UNLOCK(dvp, 0); @@ -914,6 +1096,7 @@ zfsctl_snapshot_inactive(ap) if (!locked) mutex_exit(&sdp->sd_lock); VN_RELE(dvp); + VFS_RELE(vp->v_vfsp); /* * Dispose of the vnode for the snapshot mount point. @@ -931,7 +1114,6 @@ zfsctl_traverse_begin(vnode_t **vpp, int lktype) { VN_HOLD(*vpp); - /* Snapshot should be already mounted, but just in case. 
*/ if (vn_mountedvfs(*vpp) == NULL) return (ENOENT); @@ -983,6 +1165,36 @@ zfsctl_snapshot_fid(ap) return (err); } +static int +zfsctl_snapshot_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + cred_t *cr = ap->a_cnp->cn_cred; + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + int error; + + if (cnp->cn_namelen != 2 || cnp->cn_nameptr[0] != '.' || + cnp->cn_nameptr[1] != '.') { + return (ENOENT); + } + + ASSERT(dvp->v_type == VDIR); + ASSERT(zfsvfs->z_ctldir != NULL); + + error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", vpp, + NULL, 0, NULL, cr, NULL, NULL, NULL); + if (error == 0) + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); + return (error); +} + /* * These VP's should never see the light of day. They should always * be covered. @@ -990,6 +1202,7 @@ zfsctl_snapshot_fid(ap) static struct vop_vector zfsctl_ops_snapshot = { .vop_default = &default_vnodeops, .vop_inactive = zfsctl_snapshot_inactive, + .vop_lookup = zfsctl_snapshot_lookup, .vop_reclaim = zfsctl_common_reclaim, .vop_getattr = zfsctl_snapshot_getattr, .vop_fid = zfsctl_snapshot_fid, @@ -1007,7 +1220,7 @@ zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, - NULL, 0, NULL, kcred); + NULL, 0, NULL, kcred, NULL, NULL, NULL); if (error != 0) return (error); sdp = dvp->v_data; @@ -1025,6 +1238,12 @@ zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) if (sep != NULL) { VN_HOLD(vp); + /* + * Return the mounted root rather than the covered mount point. + * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid> + * and returns the ZFS vnode mounted on top of the GFS node. + * This ZFS vnode is the root of the vfs for objset 'objsetid'. + */ error = traverse(&vp, LK_SHARED | LK_RETRY); if (error == 0) { if (vp == sep->se_root) @@ -1055,16 +1274,15 @@ zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) int zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) { - struct vop_inactive_args ap; zfsvfs_t *zfsvfs = vfsp->vfs_data; - vnode_t *dvp, *svp; + vnode_t *dvp; zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; int error; ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, - NULL, 0, NULL, cr); + NULL, 0, NULL, cr, NULL, NULL, NULL); if (error != 0) return (error); sdp = dvp->v_data; @@ -1073,7 +1291,6 @@ zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { - svp = sep->se_root; next = AVL_NEXT(&sdp->sd_snaps, sep); /* @@ -1081,40 +1298,16 @@ zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) * have just been unmounted by somebody else, and * will be cleaned up by zfsctl_snapdir_inactive(). */ - if (vn_ismntpt(svp)) { - if ((error = vn_vfswlock(svp)) != 0) - goto out; - - /* - * Increase usecount, so dounmount() won't vrele() it - * to 0 and call zfsctl_snapdir_inactive(). 
- */ - VN_HOLD(svp); - vfsp = vn_mountedvfs(svp); - mtx_lock(&Giant); - error = dounmount(vfsp, fflags, curthread); - mtx_unlock(&Giant); - if (error != 0) { - VN_RELE(svp); - goto out; + if (vn_ismntpt(sep->se_root)) { + error = zfsctl_unmount_snap(sep, fflags, cr); + if (error) { + avl_add(&sdp->sd_snaps, sep); + break; } - - avl_remove(&sdp->sd_snaps, sep); - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - kmem_free(sep, sizeof (zfs_snapentry_t)); - - /* - * We can't use VN_RELE(), as that will try to - * invoke zfsctl_snapdir_inactive(), and that - * would lead to an attempt to re-grab the sd_lock. - */ - ASSERT3U(svp->v_count, ==, 1); - ap.a_vp = svp; - gfs_vop_inactive(&ap); } sep = next; } -out: + mutex_exit(&sdp->sd_lock); VN_RELE(dvp); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c index f233b8f61e8e..45ec88b7b735 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -40,6 +38,7 @@ #include <sys/errno.h> #include <sys/stat.h> #include <sys/unistd.h> +#include <sys/sunddi.h> #include <sys/random.h> #include <sys/policy.h> #include <sys/kcondvar.h> @@ -52,7 +51,50 @@ #include <sys/dmu.h> #include <sys/atomic.h> #include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> #include <sys/dnlc.h> +#include <sys/extdirent.h> + +/* + * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups + * of names after deciding which is the appropriate lookup interface. + */ +static int +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact, + boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) +{ + int error; + + if (zfsvfs->z_norm) { + matchtype_t mt = MT_FIRST; + boolean_t conflict = B_FALSE; + size_t bufsz = 0; + char *buf = NULL; + + if (rpnp) { + buf = rpnp->pn_buf; + bufsz = rpnp->pn_bufsize; + } + if (exact) + mt = MT_EXACT; + /* + * In the non-mixed case we only expect there would ever + * be one match, but we need to use the normalizing lookup. + */ + error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, + zoid, mt, buf, bufsz, &conflict); + if (!error && deflags) + *deflags = conflict ? ED_CASE_CONFLICT : 0; + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); + } + *zoid = ZFS_DIRENT_OBJ(*zoid); + + if (error == ENOENT && update) + dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE); + + return (error); +} /* * Lock a directory entry. A dirlock on <dzp, name> protects that name @@ -67,24 +109,38 @@ * ZEXISTS: if the entry does not exist, fail with ENOENT. * ZSHARED: allow concurrent access with other ZSHARED callers. * ZXATTR: we want dzp's xattr directory + * ZCILOOK: On a mixed sensitivity file system, + * this lookup should be case-insensitive. + * ZCIEXACT: On a purely case-insensitive file system, + * this lookup should be case-sensitive. 
+ * ZRENAMING: we are locking for renaming, force narrow locks * * Output arguments: * zpp - pointer to the znode for the entry (NULL if there isn't one) * dlpp - pointer to the dirlock for this entry (NULL on error) + * direntflags - (case-insensitive lookup only) + * flags if multiple case-sensitive matches exist in directory + * realpnp - (case-insensitive lookup only) + * actual name matched within the directory * * Return value: 0 on success or errno on failure. * * NOTE: Always checks for, and rejects, '.' and '..'. + * NOTE: For case-insensitive file systems we take wide locks (see below), + * but return znode pointers to a single match. */ int zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - int flag) + int flag, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zfs_dirlock_t *dl; + boolean_t update; + boolean_t exact; uint64_t zoid; - int error; - vnode_t *vp; + vnode_t *vp = NULL; + int error = 0; + int cmpflags; *zpp = NULL; *dlpp = NULL; @@ -98,6 +154,59 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, return (EEXIST); /* + * Case sensitivity and normalization preferences are set when + * the file system is created. These are stored in the + * zfsvfs->z_case and zfsvfs->z_norm fields. These choices + * affect what vnodes can be cached in the DNLC, how we + * perform zap lookups, and the "width" of our dirlocks. + * + * A normal dirlock locks a single name. Note that with + * normalization a name can be composed multiple ways, but + * when normalized, these names all compare equal. A wide + * dirlock locks multiple names. We need these when the file + * system is supporting mixed-mode access. It is sometimes + * necessary to lock all case permutations of file name at + * once so that simultaneous case-insensitive/case-sensitive + * behaves as rationally as possible. + */ + + /* + * Decide if exact matches should be requested when performing + * a zap lookup on file systems supporting case-insensitive + * access. + */ + exact = + ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) || + ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK)); + + /* + * Only look in or update the DNLC if we are looking for the + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name. + * + * Maybe can add TO-UPPERed version of name to dnlc in ci-only + * case for performance improvement? + */ + update = !zfsvfs->z_norm || + ((zfsvfs->z_case == ZFS_CASE_MIXED) && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); + + /* + * ZRENAMING indicates we are in a situation where we should + * take narrow locks regardless of the file system's + * preferences for normalizing and case folding. This will + * prevent us deadlocking trying to grab the same wide lock + * twice if the two names happen to be case-insensitive + * matches. + */ + if (flag & ZRENAMING) + cmpflags = 0; + else + cmpflags = zfsvfs->z_norm; + + /* * Wait until there are no locks on this name. 
*/ rw_enter(&dzp->z_name_lock, RW_READER); @@ -108,9 +217,16 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, rw_exit(&dzp->z_name_lock); return (ENOENT); } - for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) - if (strcmp(name, dl->dl_name) == 0) + for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { + if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, + U8_UNICODE_LATEST, &error) == 0) || error != 0) break; + } + if (error != 0) { + mutex_exit(&dzp->z_lock); + rw_exit(&dzp->z_name_lock); + return (ENOENT); + } if (dl == NULL) { /* * Allocate a new dirlock and add it to the list. @@ -156,7 +272,8 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, zoid = dzp->z_phys->zp_xattr; error = (zoid == 0 ? ENOENT : 0); } else { - vp = dnlc_lookup(ZTOV(dzp), name); + if (update) + vp = dnlc_lookup(ZTOV(dzp), name); if (vp == DNLC_NO_VNODE) { VN_RELE(vp); error = ENOENT; @@ -170,11 +287,8 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, *zpp = VTOZ(vp); return (0); } else { - error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, - 8, 1, &zoid); - zoid = ZFS_DIRENT_OBJ(zoid); - if (error == ENOENT) - dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE); + error = zfs_match_find(zfsvfs, dzp, name, exact, + update, direntflags, realpnp, &zoid); } } if (error) { @@ -192,7 +306,7 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, zfs_dirent_unlock(dl); return (error); } - if (!(flag & ZXATTR)) + if (!(flag & ZXATTR) && update) dnlc_update(ZTOV(dzp), name, ZTOV(*zpp)); } @@ -239,7 +353,8 @@ zfs_dirent_unlock(zfs_dirlock_t *dl) * special pseudo-directory. */ int -zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp) +zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags, + int *deflg, pathname_t *rpnp) { zfs_dirlock_t *dl; znode_t *zp; @@ -257,7 +372,8 @@ zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp) if (dzp->z_phys->zp_parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, - "snapshot", vpp, NULL, 0, NULL, kcred); + "snapshot", vpp, NULL, 0, NULL, kcred, + NULL, NULL, NULL); return (error); } rw_enter(&dzp->z_parent_lock, RW_READER); @@ -268,30 +384,25 @@ zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp) } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { *vpp = zfsctl_root(dzp); } else { - error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED); + int zf; + + zf = ZEXISTS | ZSHARED; + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); if (error == 0) { *vpp = ZTOV(zp); zfs_dirent_unlock(dl); dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ } + rpnp = NULL; } - return (error); -} - -static char * -zfs_unlinked_hexname(char namebuf[17], uint64_t x) -{ - char *name = &namebuf[16]; - const char digits[16] = "0123456789abcdef"; - - *name = '\0'; - do { - *--name = digits[x & 0xf]; - x >>= 4; - } while (x != 0); + if ((flags & FIGNORECASE) && rpnp && !error) + (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); - return (name); + return (error); } /* @@ -312,15 +423,12 @@ void zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; - char obj_name[17]; - int error; ASSERT(zp->z_unlinked); ASSERT3U(zp->z_phys->zp_links, ==, 0); - error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj, - zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx); - ASSERT3U(error, ==, 0); + VERIFY3U(0, ==, + 
zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); } /* @@ -377,7 +485,9 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs) /* * Delete the entire contents of a directory. Return a count - * of the number of entries that could not be deleted. + * of the number of entries that could not be deleted. If we encounter + * an error, return a count of at least one so that the directory stays + * in the unlinked set. * * NOTE: this function assumes that the directory is inactive, * so there is no need to lock its entries before deletion. @@ -401,7 +511,10 @@ zfs_purgedir(znode_t *dzp) zap_cursor_advance(&zc)) { error = zfs_zget(zfsvfs, ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); - ASSERT3U(error, ==, 0); + if (error) { + skipped += 1; + continue; + } ASSERT((ZTOV(xzp)->v_type == VREG) || (ZTOV(xzp)->v_type == VLNK)); @@ -423,13 +536,15 @@ zfs_purgedir(znode_t *dzp) dl.dl_name = zap.za_name; error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); - ASSERT3U(error, ==, 0); + if (error) + skipped += 1; dmu_tx_commit(tx); VN_RELE(ZTOV(xzp)); } zap_cursor_fini(&zc); - ASSERT(error == ENOENT); + if (error != ENOENT) + skipped += 1; return (skipped); } @@ -439,7 +554,6 @@ zfs_rmnode(znode_t *zp) zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; znode_t *xzp = NULL; - char obj_name[17]; dmu_tx_t *tx; uint64_t acl_obj; int error; @@ -450,6 +564,24 @@ zfs_rmnode(znode_t *zp) ASSERT(zp->z_phys->zp_links == 0); /* + * If this is a ZIL replay then leave the object in the unlinked set. + * Otherwise we can get a deadlock, because the delete can be + * quite large and span multiple tx's and txgs, but each replay + * creates a tx to atomically run the replay function and mark the + * replay record as complete. We deadlock trying to start a tx in + * a new txg to further the deletion but can't because the replay + * tx hasn't finished. + * + * We actually delete the object if we get a failure to create an + * object in zil_replay_log_record(), or after calling zil_replay(). + */ + if (zfsvfs->z_assign >= TXG_INITIAL) { + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + + /* * If this is an attribute directory, purge its contents. */ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && @@ -457,14 +589,29 @@ zfs_rmnode(znode_t *zp) if (zfs_purgedir(zp) != 0) { /* * Not enough space to delete some xattrs. - * Leave it on the unlinked set. + * Leave it in the unlinked set. */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); VFS_UNLOCK_GIANT(vfslocked); return; } } /* + * Free up all the data in the file. + */ + error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); + if (error) { + /* + * Not enough space. Leave the file in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + + /* * If the file has extended attributes, we're going to unlink * the xattr dir. */ @@ -476,7 +623,7 @@ zfs_rmnode(znode_t *zp) acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; /* - * Set up the transaction. + * Set up the final transaction. */ tx = dmu_tx_create(os); dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); @@ -495,8 +642,9 @@ zfs_rmnode(znode_t *zp) * which point we'll call zfs_unlinked_drain() to process it). 
*/ dmu_tx_abort(tx); - VFS_UNLOCK_GIANT(vfslocked); - return; + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + goto out; } if (xzp) { @@ -509,19 +657,27 @@ zfs_rmnode(znode_t *zp) } /* Remove this znode from the unlinked set */ - error = zap_remove(os, zfsvfs->z_unlinkedobj, - zfs_unlinked_hexname(obj_name, zp->z_id), tx); - ASSERT3U(error, ==, 0); + VERIFY3U(0, ==, + zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); zfs_znode_delete(zp, tx); dmu_tx_commit(tx); - +out: if (xzp) VN_RELE(ZTOV(xzp)); VFS_UNLOCK_GIANT(vfslocked); } +static uint64_t +zfs_dirent(znode_t *zp) +{ + uint64_t de = zp->z_id; + if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) + de |= IFTODT((zp)->z_phys->zp_mode) << 60; + return (de); +} + /* * Link zp into dl. Can only fail if zp has been unlinked. */ @@ -558,10 +714,7 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); mutex_exit(&dzp->z_lock); - /* - * MacOS X will fill in the 4-bit object type here. - */ - value = ZFS_DIRENT_MAKE(IFTODT(zp->z_phys->zp_mode), zp->z_id); + value = zfs_dirent(zp); error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, 8, 1, &value, tx); ASSERT(error == 0); @@ -632,7 +785,20 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); mutex_exit(&dzp->z_lock); - error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, tx); + if (zp->z_zfsvfs->z_norm) { + if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && + (flag & ZCIEXACT)) || + ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) && + !(flag & ZCILOOK))) + error = zap_remove_norm(zp->z_zfsvfs->z_os, + dzp->z_id, dl->dl_name, MT_EXACT, tx); + else + error = zap_remove_norm(zp->z_zfsvfs->z_os, + dzp->z_id, dl->dl_name, MT_FIRST, tx); + } else { + error = zap_remove(zp->z_zfsvfs->z_os, + dzp->z_id, dl->dl_name, tx); + } ASSERT(error == 0); if (unlinkedp != NULL) @@ -660,17 +826,29 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_t *xzp; dmu_tx_t *tx; - uint64_t xoid; int error; + zfs_fuid_info_t *fuidp = NULL; *xvpp = NULL; - if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, cr)) + if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) return (error); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) @@ -678,13 +856,15 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) dmu_tx_abort(tx); return (error); } - zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0); - ASSERT(xzp->z_id == xoid); + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp); ASSERT(xzp->z_phys->zp_parent == zp->z_id); dmu_buf_will_dirty(zp->z_dbuf, tx); - zp->z_phys->zp_xattr = xoid; + zp->z_phys->zp_xattr = xzp->z_id; - (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, ""); + (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, + xzp, "", 
NULL, fuidp, vap); + if (fuidp) + zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); *xvpp = ZTOV(xzp); @@ -714,7 +894,7 @@ zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) vattr_t va; int error; top: - error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR); + error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); if (error) return (error); @@ -751,8 +931,7 @@ top: va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; va.va_type = VDIR; va.va_mode = S_IFDIR | S_ISVTX | 0777; - va.va_uid = (uid_t)zp->z_phys->zp_uid; - va.va_gid = (gid_t)zp->z_phys->zp_gid; + zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); error = zfs_make_xattrdir(zp, &va, xvpp, cr); zfs_dirent_unlock(dl); @@ -782,16 +961,23 @@ int zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) { uid_t uid; + uid_t downer; + uid_t fowner; + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ return (0); - if ((zdp->z_phys->zp_mode & S_ISVTX) == 0 || - (uid = crgetuid(cr)) == zdp->z_phys->zp_uid || - uid == zp->z_phys->zp_uid || + if ((zdp->z_phys->zp_mode & S_ISVTX) == 0) + return (0); + + downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER); + + if ((uid = crgetuid(cr)) == downer || uid == fowner || (ZTOV(zp)->v_type == VREG && - zfs_zaccess(zp, ACE_WRITE_DATA, cr) == 0)) + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) return (0); else - return (secpolicy_vnode_remove(cr)); + return (secpolicy_vnode_remove(ZTOV(zp), cr)); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c index e2385a0ba2c4..17e4b0a09c9b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/spa_impl.h> #include <sys/vdev.h> @@ -53,7 +51,7 @@ extern void devctl_notify(const char *__system, const char *__subsystem, * pool X * * If we are in a loading state, all errors are chained together by the same - * SPA-wide ENA. + * SPA-wide ENA (Error Numeric Association). * * For isolated I/O requests, we get the ENA from the zio_t. The propagation * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want @@ -90,11 +88,10 @@ extern void devctl_notify(const char *__system, const char *__subsystem, * We keep track of the ENA for a ZIO chain through the 'io_logical' member. * When a new logical I/O is issued, we set this to point to itself. Child I/Os * then inherit this pointer, so that when it is first set subsequent failures - * will use the same ENA. If a physical I/O is issued (by passing the - * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a - * unique ENA will be generated. For an aggregate I/O, this pointer is set to - * NULL, and no ereport will be generated (since it doesn't actually correspond - * to any particular device or piece of data). + * will use the same ENA. 
For vdev cache fill and queue aggregation I/O, + * this pointer is set to NULL, and no ereport will be generated (since it + * doesn't actually correspond to any particular device or piece of data, + * and the caller will always retry without caching or queueing anyway). */ void zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, @@ -104,6 +101,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, char buf[1024]; struct sbuf sb; struct timespec ts; + int state; /* * If we are doing a spa_tryimport(), ignore errors. @@ -120,21 +118,33 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, spa->spa_last_open_failed) return; - /* - * Ignore any errors from I/Os that we are going to retry anyway - we - * only generate errors from the final failure. - */ - if (zio && zio_should_retry(zio)) - return; + if (zio != NULL) { + /* + * If this is not a read or write zio, ignore the error. This + * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. + */ + if (zio->io_type != ZIO_TYPE_READ && + zio->io_type != ZIO_TYPE_WRITE) + return; - /* - * If this is not a read or write zio, ignore the error. This can occur - * if the DKIOCFLUSHWRITECACHE ioctl fails. - */ - if (zio && zio->io_type != ZIO_TYPE_READ && - zio->io_type != ZIO_TYPE_WRITE) - return; + /* + * Ignore any errors from speculative I/Os, as failure is an + * expected result. + */ + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + return; + /* + * If the vdev has already been marked as failing due to a + * failed probe, then ignore any subsequent I/O errors, as the + * DE will automatically fault the vdev on the first such + * failure. + */ + if (vd != NULL && + (!vdev_readable(vd) || !vdev_writeable(vd)) && + strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0) + return; + } nanotime(&ts); sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); @@ -187,22 +197,28 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, */ /* + * If we are importing a faulted pool, then we treat it like an open, + * not an import. Otherwise, the DE will ignore all faults during + * import, since the default behavior is to mark the devices as + * persistently unavailable, not leave them in the faulted state. + */ + state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state; + + /* * Generic payload members common to all ereports. - * - * The direct reference to spa_name is used rather than spa_name() - * because of the asynchronous nature of the zio pipeline. spa_name() - * asserts that the config lock is held in some form. This is always - * the case in I/O context, but because the check for RW_WRITER compares - * against 'curthread', we may be in an asynchronous context and blow - * this assert. Rather than loosen this assert, we acknowledge that all - * contexts in which this function is called (pool open, I/O) are safe, - * and dereference the name directly. */ - sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa->spa_name); + sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)); sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)); - sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, - spa->spa_load_state); + sbuf_printf(&sb, " %s=%d", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, state); + + if (spa != NULL) { + sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, + spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? + FM_EREPORT_FAILMODE_WAIT : + spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? 
+ FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC); + } if (vd != NULL) { vdev_t *pvd = vd->vdev_parent; @@ -290,7 +306,6 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, mutex_exit(&spa->spa_errlist_lock); sbuf_finish(&sb); - ZFS_LOG(1, "%s", sbuf_data(&sb)); devctl_notify("ZFS", spa->spa_name, subclass, sbuf_data(&sb)); if (sbuf_overflowed(&sb)) printf("ZFS WARNING: sbuf overflowed\n"); @@ -298,13 +313,8 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, #endif } -/* - * The 'resource.fs.zfs.ok' event is an internal signal that the associated - * resource (pool or disk) has been identified by ZFS as healthy. This will - * then trigger the DE to close the associated case, if any. - */ -void -zfs_post_ok(spa_t *spa, vdev_t *vd) +static void +zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) { #ifdef _KERNEL char buf[1024]; @@ -318,7 +328,7 @@ zfs_post_ok(spa_t *spa, vdev_t *vd) sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec); snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE, - ZFS_ERROR_CLASS, FM_RESOURCE_OK); + ZFS_ERROR_CLASS, name); sbuf_printf(&sb, " %s=%hhu", FM_VERSION, FM_RSRC_VERSION); sbuf_printf(&sb, " %s=%s", FM_CLASS, class); sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, @@ -327,9 +337,33 @@ zfs_post_ok(spa_t *spa, vdev_t *vd) sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid); sbuf_finish(&sb); + ZFS_LOG(1, "%s", sbuf_data(&sb)); devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb)); if (sbuf_overflowed(&sb)) printf("ZFS WARNING: sbuf overflowed\n"); sbuf_delete(&sb); #endif } + +/* + * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev + * has been removed from the system. This will cause the DE to ignore any + * recent I/O errors, inferring that they are due to the asynchronous device + * removal. + */ +void +zfs_post_remove(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_REMOVED); +} + +/* + * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool + * has the 'autoreplace' property set, and therefore any broken vdevs will be + * handled by higher level logic, and no vdev fault should be generated. + */ +void +zfs_post_autoreplace(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c new file mode 100644 index 000000000000..dfec3ed903bc --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c @@ -0,0 +1,716 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. 
All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/sunddi.h> +#include <sys/dmu.h> +#include <sys/avl.h> +#include <sys/zap.h> +#include <sys/refcount.h> +#include <sys/nvpair.h> +#ifdef _KERNEL +#include <sys/kidmap.h> +#include <sys/sid.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> +#endif +#include <sys/zfs_fuid.h> + +/* + * FUID Domain table(s). + * + * The FUID table is stored as a packed nvlist of an array + * of nvlists which contain an index, domain string and offset + * + * During file system initialization the nvlist(s) are read and + * two AVL trees are created. One tree is keyed by the index number + * and the other by the domain string. Nodes are never removed from + * trees, but new entries may be added. If a new entry is added then the + * on-disk packed nvlist will also be updated. + */ + +#define FUID_IDX "fuid_idx" +#define FUID_DOMAIN "fuid_domain" +#define FUID_OFFSET "fuid_offset" +#define FUID_NVP_ARRAY "fuid_nvlist" + +typedef struct fuid_domain { + avl_node_t f_domnode; + avl_node_t f_idxnode; + ksiddomain_t *f_ksid; + uint64_t f_idx; +} fuid_domain_t; + +static char *nulldomain = ""; + +/* + * Compare two indexes. + */ +static int +idx_compare(const void *arg1, const void *arg2) +{ + const fuid_domain_t *node1 = arg1; + const fuid_domain_t *node2 = arg2; + + if (node1->f_idx < node2->f_idx) + return (-1); + else if (node1->f_idx > node2->f_idx) + return (1); + return (0); +} + +/* + * Compare two domain strings. + */ +static int +domain_compare(const void *arg1, const void *arg2) +{ + const fuid_domain_t *node1 = arg1; + const fuid_domain_t *node2 = arg2; + int val; + + val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); + if (val == 0) + return (0); + return (val > 0 ? 1 : -1); +} + +/* + * load initial fuid domain and idx trees. This function is used by + * both the kernel and zdb. 
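The on-disk shape described above is easier to see in code than in prose. The sketch below builds a one-entry FUID table nvlist of exactly that form; it is illustrative only (the SID string is invented, error handling is elided), not part of the patch:

/*
 * Construct a packed-nvlist FUID table: FUID_NVP_ARRAY is an array of
 * nvlists, each carrying { FUID_IDX, FUID_DOMAIN, FUID_OFFSET }.
 */
static nvlist_t *
fuid_table_sketch(void)
{
	nvlist_t *nvp, *entry;

	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_alloc(&entry, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(entry, FUID_IDX, 1) == 0);
	VERIFY(nvlist_add_string(entry, FUID_DOMAIN,
	    "S-1-5-21-300000-400000-500000") == 0);	/* made-up SID */
	VERIFY(nvlist_add_uint64(entry, FUID_OFFSET, 0) == 0);
	VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, &entry, 1) == 0);
	nvlist_free(entry);
	return (nvp);
}

Packing the result with nvlist_pack(..., NV_ENCODE_XDR, ...) yields the byte stream that zfs_fuid_table_load() below unpacks.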
+ */ +uint64_t +zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, + avl_tree_t *domain_tree) +{ + dmu_buf_t *db; + uint64_t fuid_size; + + avl_create(idx_tree, idx_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); + avl_create(domain_tree, domain_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); + + VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db)); + fuid_size = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + if (fuid_size) { + nvlist_t **fuidnvp; + nvlist_t *nvp = NULL; + uint_t count; + char *packed; + int i; + + packed = kmem_alloc(fuid_size, KM_SLEEP); + VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0); + VERIFY(nvlist_unpack(packed, fuid_size, + &nvp, 0) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, + &fuidnvp, &count) == 0); + + for (i = 0; i != count; i++) { + fuid_domain_t *domnode; + char *domain; + uint64_t idx; + + VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, + &domain) == 0); + VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX, + &idx) == 0); + + domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); + + domnode->f_idx = idx; + domnode->f_ksid = ksid_lookupdomain(domain); + avl_add(idx_tree, domnode); + avl_add(domain_tree, domnode); + } + nvlist_free(nvp); + kmem_free(packed, fuid_size); + } + return (fuid_size); +} + +void +zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree) +{ + fuid_domain_t *domnode; + void *cookie; + + cookie = NULL; + while (domnode = avl_destroy_nodes(domain_tree, &cookie)) + ksiddomain_rele(domnode->f_ksid); + + avl_destroy(domain_tree); + cookie = NULL; + while (domnode = avl_destroy_nodes(idx_tree, &cookie)) + kmem_free(domnode, sizeof (fuid_domain_t)); + avl_destroy(idx_tree); +} + +char * +zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) +{ + fuid_domain_t searchnode, *findnode; + avl_index_t loc; + + searchnode.f_idx = idx; + + findnode = avl_find(idx_tree, &searchnode, &loc); + + return (findnode ? findnode->f_ksid->kd_name : nulldomain); +} + +#ifdef _KERNEL +/* + * Load the fuid table(s) into memory. + */ +static void +zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + int error = 0; + + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + + if (zfsvfs->z_fuid_loaded) { + rw_exit(&zfsvfs->z_fuid_lock); + return; + } + + if (zfsvfs->z_fuid_obj == 0) { + + /* first make sure we need to allocate object */ + + error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); + if (error == ENOENT && tx != NULL) { + zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, + DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, + sizeof (uint64_t), tx); + VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, sizeof (uint64_t), 1, + &zfsvfs->z_fuid_obj, tx) == 0); + } + } + + if (zfsvfs->z_fuid_obj != 0) { + zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os, + zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, + &zfsvfs->z_fuid_domain); + zfsvfs->z_fuid_loaded = B_TRUE; + } + + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * Query domain table for a given domain. + * + * If domain isn't found it is added to AVL trees and + * the results are pushed out to disk. + */ +int +zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, + dmu_tx_t *tx) +{ + fuid_domain_t searchnode, *findnode; + avl_index_t loc; + krw_t rw = RW_READER; + + /* + * If the dummy "nobody" domain then return an index of 0 + * to cause the created FUID to be a standard POSIX id + * for the user nobody. 
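The index handed out here lands in the top 32 bits of the resulting FUID, with the Windows RID in the low 32 bits; the FUID_ENCODE/FUID_INDEX/FUID_RID macros used throughout this file express that split. A worked example (the index and RID values are invented):

uint64_t fuid = FUID_ENCODE(3, 1105);	/* domain index 3, RID 1105 */

ASSERT3U(FUID_INDEX(fuid), ==, 3);	/* upper 32 bits: table index */
ASSERT3U(FUID_RID(fuid), ==, 1105);	/* lower 32 bits: Windows RID */
ASSERT3U(FUID_INDEX(1001ULL), ==, 0);	/* index 0: plain POSIX id */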
+ */ + if (domain[0] == '\0') { + *retdomain = nulldomain; + return (0); + } + + searchnode.f_ksid = ksid_lookupdomain(domain); + if (retdomain) { + *retdomain = searchnode.f_ksid->kd_name; + } + if (!zfsvfs->z_fuid_loaded) + zfs_fuid_init(zfsvfs, tx); + +retry: + rw_enter(&zfsvfs->z_fuid_lock, rw); + findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc); + + if (findnode) { + rw_exit(&zfsvfs->z_fuid_lock); + ksiddomain_rele(searchnode.f_ksid); + return (findnode->f_idx); + } else { + fuid_domain_t *domnode; + nvlist_t *nvp; + nvlist_t **fuids; + uint64_t retidx; + size_t nvsize = 0; + char *packed; + dmu_buf_t *db; + int i = 0; + + if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) { + rw_exit(&zfsvfs->z_fuid_lock); + rw = RW_WRITER; + goto retry; + } + + domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); + domnode->f_ksid = searchnode.f_ksid; + + retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1; + + avl_add(&zfsvfs->z_fuid_domain, domnode); + avl_add(&zfsvfs->z_fuid_idx, domnode); + /* + * Now resync the on-disk nvlist. + */ + VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + domnode = avl_first(&zfsvfs->z_fuid_domain); + fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP); + while (domnode) { + VERIFY(nvlist_alloc(&fuids[i], + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, + domnode->f_idx) == 0); + VERIFY(nvlist_add_uint64(fuids[i], + FUID_OFFSET, 0) == 0); + VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN, + domnode->f_ksid->kd_name) == 0); + domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode); + } + VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, + fuids, retidx) == 0); + for (i = 0; i != retidx; i++) + nvlist_free(fuids[i]); + kmem_free(fuids, retidx * sizeof (void *)); + VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); + packed = kmem_alloc(nvsize, KM_SLEEP); + VERIFY(nvlist_pack(nvp, &packed, &nvsize, + NV_ENCODE_XDR, KM_SLEEP) == 0); + nvlist_free(nvp); + zfsvfs->z_fuid_size = nvsize; + dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, + zfsvfs->z_fuid_size, packed, tx); + kmem_free(packed, zfsvfs->z_fuid_size); + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, + FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; + dmu_buf_rele(db, FTAG); + + rw_exit(&zfsvfs->z_fuid_lock); + return (retidx); + } +} + +/* + * Query domain table by index, returning domain string + * + * Returns a pointer from an avl node of the domain string. 
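The retry: logic in zfs_fuid_find_by_domain() above is the usual reader-to-writer upgrade idiom. Distilled, with lookup_hit() and do_insert() as placeholder helpers (they are not real functions) and an assumed krwlock_t named lock:

	krw_t rw = RW_READER;
retry:
	rw_enter(&lock, rw);
	if (!lookup_hit()) {
		/*
		 * Nothing may slip in between the failed lookup and the
		 * insert, so upgrade in place if possible; otherwise drop
		 * the lock, retake it as writer, and redo the lookup,
		 * since the entry may have appeared in the meantime.
		 */
		if (rw == RW_READER && !rw_tryupgrade(&lock)) {
			rw_exit(&lock);
			rw = RW_WRITER;
			goto retry;
		}
		do_insert();
	}
	rw_exit(&lock);

One subtlety the real code relies on: after a successful rw_tryupgrade() the earlier lookup result is still valid, because the lock was never dropped.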
+ * + */ +static char * +zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) +{ + char *domain; + + if (idx == 0 || !zfsvfs->z_use_fuids) + return (NULL); + + if (!zfsvfs->z_fuid_loaded) + zfs_fuid_init(zfsvfs, NULL); + + rw_enter(&zfsvfs->z_fuid_lock, RW_READER); + + if (zfsvfs->z_fuid_obj) + domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx); + else + domain = nulldomain; + rw_exit(&zfsvfs->z_fuid_lock); + + ASSERT(domain); + return (domain); +} + +void +zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) +{ + *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_uid, + cr, ZFS_OWNER); + *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_gid, + cr, ZFS_GROUP); +} + +uid_t +zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, + cred_t *cr, zfs_fuid_type_t type) +{ + uint32_t index = FUID_INDEX(fuid); + char *domain; + uid_t id; + + if (index == 0) + return (fuid); + + domain = zfs_fuid_find_by_idx(zfsvfs, index); + ASSERT(domain != NULL); + +#ifdef TODO + if (type == ZFS_OWNER || type == ZFS_ACE_USER) { + (void) kidmap_getuidbysid(crgetzone(cr), domain, + FUID_RID(fuid), &id); + } else { + (void) kidmap_getgidbysid(crgetzone(cr), domain, + FUID_RID(fuid), &id); + } +#else + panic(__func__); +#endif + return (id); +} + +/* + * Add a FUID node to the list of FUIDs being created for this + * ACL. + * + * If the ACL has multiple domains, keep only one copy of each unique + * domain. + */ +static void +zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, + uint64_t idx, uint64_t id, zfs_fuid_type_t type) +{ + zfs_fuid_t *fuid; + zfs_fuid_domain_t *fuid_domain; + zfs_fuid_info_t *fuidp; + uint64_t fuididx; + boolean_t found = B_FALSE; + + if (*fuidpp == NULL) + *fuidpp = zfs_fuid_info_alloc(); + + fuidp = *fuidpp; + /* + * First find the fuid domain index in the linked list. + * + * If one isn't found then create an entry. + */ + + for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains); + fuid_domain; fuid_domain = list_next(&fuidp->z_domains, + fuid_domain), fuididx++) { + if (idx == fuid_domain->z_domidx) { + found = B_TRUE; + break; + } + } + + if (!found) { + fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP); + fuid_domain->z_domain = domain; + fuid_domain->z_domidx = idx; + list_insert_tail(&fuidp->z_domains, fuid_domain); + fuidp->z_domain_str_sz += strlen(domain) + 1; + fuidp->z_domain_cnt++; + } + + if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { + /* + * Now allocate a fuid entry and add it to the end of the list. + */ + + fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); + fuid->z_id = id; + fuid->z_domidx = idx; + fuid->z_logfuid = FUID_ENCODE(fuididx, rid); + + list_insert_tail(&fuidp->z_fuids, fuid); + fuidp->z_fuid_cnt++; + } else { + if (type == ZFS_OWNER) + fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid); + else + fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid); + } +} + +/* + * Create a file system FUID based on information in the user's cred. + */ +uint64_t +zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, + dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp) +{ + uint64_t idx; + ksid_t *ksid; + uint32_t rid; + char *kdomain; + const char *domain; + uid_t id; + + VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); + + if (type == ZFS_OWNER) + id = crgetuid(cr); + else + id = crgetgid(cr); + + if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id)) + return ((uint64_t)id); + +#ifdef TODO + ksid = crgetsid(cr, (type == ZFS_OWNER) ?
KSID_OWNER : KSID_GROUP); + + VERIFY(ksid != NULL); + rid = ksid_getrid(ksid); + domain = ksid_getdomain(ksid); + + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + + zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); + + return (FUID_ENCODE(idx, rid)); +#else + panic(__func__); +#endif +} + +/* + * Create a file system FUID for an ACL ace + * or a chown/chgrp of the file. + * This is similar to zfs_fuid_create_cred, except that + * we can't find the domain + rid information in the + * cred. Instead we have to query Winchester for the + * domain and rid. + * + * During replay operations the domain+rid information is + * found in the zfs_fuid_info_t that the replay code has + * attached to the zfsvfs of the file system. + */ +uint64_t +zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, + zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp) +{ + const char *domain; + char *kdomain; + uint32_t fuid_idx = FUID_INDEX(id); + uint32_t rid; + idmap_stat status; + uint64_t idx; + boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL); + zfs_fuid_t *zfuid = NULL; + zfs_fuid_info_t *fuidp; + + /* + * If POSIX ID, or entry is already a FUID then + * just return the id + * + * We may also be handed an already FUID'ized id via + * chmod. + */ + + if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) + return (id); + + if (is_replay) { + fuidp = zfsvfs->z_fuid_replay; + + /* + * If we are passed an ephemeral id, but no + * fuid_info was logged then return NOBODY. + * This is most likely a result of idmap service + * not being available. + */ + if (fuidp == NULL) + return (UID_NOBODY); + + switch (type) { + case ZFS_ACE_USER: + case ZFS_ACE_GROUP: + zfuid = list_head(&fuidp->z_fuids); + rid = FUID_RID(zfuid->z_logfuid); + idx = FUID_INDEX(zfuid->z_logfuid); + break; + case ZFS_OWNER: + rid = FUID_RID(fuidp->z_fuid_owner); + idx = FUID_INDEX(fuidp->z_fuid_owner); + break; + case ZFS_GROUP: + rid = FUID_RID(fuidp->z_fuid_group); + idx = FUID_INDEX(fuidp->z_fuid_group); + break; + }; + domain = fuidp->z_domain_table[idx -1]; + } else { +#ifdef TODO + if (type == ZFS_OWNER || type == ZFS_ACE_USER) + status = kidmap_getsidbyuid(crgetzone(cr), id, + &domain, &rid); + else + status = kidmap_getsidbygid(crgetzone(cr), id, + &domain, &rid); + + if (status != 0) { + /* + * When returning nobody we will need to + * make a dummy fuid table entry for logging + * purposes. 
+ */ + rid = UID_NOBODY; + domain = nulldomain; + } +#else + panic(__func__); +#endif + } + + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + + if (!is_replay) + zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type); + else if (zfuid != NULL) { + list_remove(&fuidp->z_fuids, zfuid); + kmem_free(zfuid, sizeof (zfs_fuid_t)); + } + return (FUID_ENCODE(idx, rid)); +} + +void +zfs_fuid_destroy(zfsvfs_t *zfsvfs) +{ + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + if (!zfsvfs->z_fuid_loaded) { + rw_exit(&zfsvfs->z_fuid_lock); + return; + } + zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * Allocate zfs_fuid_info for tracking FUIDs created during + * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR() + */ +zfs_fuid_info_t * +zfs_fuid_info_alloc(void) +{ + zfs_fuid_info_t *fuidp; + + fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP); + list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t), + offsetof(zfs_fuid_domain_t, z_next)); + list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t), + offsetof(zfs_fuid_t, z_next)); + return (fuidp); +} + +/* + * Release all memory associated with zfs_fuid_info_t + */ +void +zfs_fuid_info_free(zfs_fuid_info_t *fuidp) +{ + zfs_fuid_t *zfuid; + zfs_fuid_domain_t *zdomain; + + while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { + list_remove(&fuidp->z_fuids, zfuid); + kmem_free(zfuid, sizeof (zfs_fuid_t)); + } + + if (fuidp->z_domain_table != NULL) + kmem_free(fuidp->z_domain_table, + (sizeof (char **)) * fuidp->z_domain_cnt); + + while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { + list_remove(&fuidp->z_domains, zdomain); + kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); + } + + kmem_free(fuidp, sizeof (zfs_fuid_info_t)); +} + +/* + * Check to see if id is a groupmember. If cred + * has ksid info then sidlist is checked first + * and if still not found then POSIX groups are checked + * + * Will use a straight FUID compare when possible. + */ +boolean_t +zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) +{ + ksid_t *ksid = crgetsid(cr, KSID_GROUP); + uid_t gid; + +#ifdef TODO + if (ksid) { + int i; + ksid_t *ksid_groups; + ksidlist_t *ksidlist = crgetsidlist(cr); + uint32_t idx = FUID_INDEX(id); + uint32_t rid = FUID_RID(id); + + ASSERT(ksidlist); + ksid_groups = ksidlist->ksl_sids; + + for (i = 0; i != ksidlist->ksl_nsid; i++) { + if (idx == 0) { + if (id != IDMAP_WK_CREATOR_GROUP_GID && + id == ksid_groups[i].ks_id) { + return (B_TRUE); + } + } else { + char *domain; + + domain = zfs_fuid_find_by_idx(zfsvfs, idx); + ASSERT(domain != NULL); + + if (strcmp(domain, + IDMAP_WK_CREATOR_SID_AUTHORITY) == 0) + return (B_FALSE); + + if ((strcmp(domain, + ksid_groups[i].ks_domain->kd_name) == 0) && + rid == ksid_groups[i].ks_rid) + return (B_TRUE); + } + } + } +#endif + + /* + * Not found in ksidlist, check posix groups + */ + gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); + return (groupmember(gid, cr)); +} +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 8699922ccf09..a6829eb1f122 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/systm.h> @@ -43,6 +41,7 @@ #include <sys/cmn_err.h> #include <sys/stat.h> #include <sys/zfs_ioctl.h> +#include <sys/zfs_znode.h> #include <sys/zap.h> #include <sys/spa.h> #include <sys/spa_impl.h> @@ -52,6 +51,8 @@ #include <sys/dsl_dir.h> #include <sys/dsl_dataset.h> #include <sys/dsl_prop.h> +#include <sys/dsl_deleg.h> +#include <sys/dmu_objset.h> #include <sys/sunddi.h> #include <sys/policy.h> #include <sys/zone.h> @@ -62,10 +63,13 @@ #include <sys/varargs.h> #include <sys/fs/zfs.h> #include <sys/zfs_ctldir.h> +#include <sys/zfs_dir.h> #include <sys/zvol.h> +#include <sys/dmu_objset.h> #include "zfs_namecheck.h" #include "zfs_prop.h" +#include "zfs_deleg.h" CTASSERT(sizeof(zfs_cmd_t) <= PAGE_SIZE); @@ -75,18 +79,24 @@ extern void zfs_init(void); extern void zfs_fini(void); typedef int zfs_ioc_func_t(zfs_cmd_t *); -typedef int zfs_secpolicy_func_t(const char *, cred_t *); +typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); typedef struct zfs_ioc_vec { zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; enum { - no_name, - pool_name, - dataset_name - } zvec_namecheck; + NO_NAME, + POOL_NAME, + DATASET_NAME + } zvec_namecheck; + boolean_t zvec_his_log; } zfs_ioc_vec_t; +static void clear_props(char *dataset, nvlist_t *props); +static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, + boolean_t *); +int zfs_set_prop_nvlist(const char *, nvlist_t *); + /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ void __dprintf(const char *file, const char *func, int line, const char *fmt, ...) @@ -123,13 +133,122 @@ __dprintf(const char *file, const char *func, int line, const char *fmt, ...) char *, newfile, char *, func, int, line, char *, buf); } +static void +history_str_free(char *buf) +{ + kmem_free(buf, HIS_MAX_RECORD_LEN); +} + +static char * +history_str_get(zfs_cmd_t *zc) +{ + char *buf; + + if (zc->zc_history == 0) + return (NULL); + + buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); + if (copyinstr((void *)(uintptr_t)zc->zc_history, + buf, HIS_MAX_RECORD_LEN, NULL) != 0) { + history_str_free(buf); + return (NULL); + } + + buf[HIS_MAX_RECORD_LEN -1] = '\0'; + + return (buf); +} + +/* + * Check to see if the named dataset is currently defined as bootable + */ +static boolean_t +zfs_is_bootfs(const char *name) +{ + spa_t *spa; + boolean_t ret = B_FALSE; + + if (spa_open(name, &spa, FTAG) == 0) { + if (spa->spa_bootfs) { + objset_t *os; + + if (dmu_objset_open(name, DMU_OST_ZFS, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + ret = (dmu_objset_id(os) == spa->spa_bootfs); + dmu_objset_close(os); + } + } + spa_close(spa, FTAG); + } + return (ret); +} + +/* + * zfs_earlier_version + * + * Return non-zero if the spa version is less than requested version. + */ +static int +zfs_earlier_version(const char *name, int version) +{ + spa_t *spa; + + if (spa_open(name, &spa, FTAG) == 0) { + if (spa_version(spa) < version) { + spa_close(spa, FTAG); + return (1); + } + spa_close(spa, FTAG); + } + return (0); +} + +/* + * zpl_earlier_version + * + * Return TRUE if the ZPL version is less than requested version. 
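These two helpers are what the property-validation code further down leans on to keep new features off old on-disk formats. Typical guards, mirroring the checks in zfs_set_prop_nvlist() later in this file (dsname stands in for the dataset being validated):

/* gzip compression needs a pool at the gzip support revision */
if (zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION))
	return (ENOTSUP);

/* SMB sharing needs a FUID-aware ZPL */
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
	return (ENOTSUP);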
+ */ +static boolean_t +zpl_earlier_version(const char *name, int version) +{ + objset_t *os; + boolean_t rc = B_TRUE; + + if (dmu_objset_open(name, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + uint64_t zplversion; + + if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) + rc = zplversion < version; + dmu_objset_close(os); + } + return (rc); +} + +static void +zfs_log_history(zfs_cmd_t *zc) +{ + spa_t *spa; + char *buf; + + if ((buf = history_str_get(zc)) == NULL) + return; + + if (spa_open(zc->zc_name, &spa, FTAG) == 0) { + if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) + (void) spa_history_log(spa, buf, LOG_CMD_NORMAL); + spa_close(spa, FTAG); + } + history_str_free(buf); +} + /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ /* ARGSUSED */ static int -zfs_secpolicy_none(const char *unused1, cred_t *cr) +zfs_secpolicy_none(zfs_cmd_t *zc, cred_t *cr) { return (0); } @@ -140,10 +259,10 @@ zfs_secpolicy_none(const char *unused1, cred_t *cr) */ /* ARGSUSED */ static int -zfs_secpolicy_read(const char *dataset, cred_t *cr) +zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr) { - if (INGLOBALZONE(curproc) || - zone_dataset_visible(dataset, NULL)) + if (INGLOBALZONE(curthread) || + zone_dataset_visible(zc->zc_name, NULL)) return (0); return (ENOENT); @@ -159,14 +278,14 @@ zfs_dozonecheck(const char *dataset, cred_t *cr) * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ - if (!INGLOBALZONE(curproc) && + if (!INGLOBALZONE(curthread) && !zone_dataset_visible(dataset, &writable)) return (ENOENT); if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL)) return (ENOENT); - if (INGLOBALZONE(curproc)) { + if (INGLOBALZONE(curthread)) { /* * If the fs is zoned, only root can access it from the * global zone. @@ -187,47 +306,324 @@ zfs_dozonecheck(const char *dataset, cred_t *cr) return (0); } -/* - * Policy for dataset write operations (create children, set properties, etc). - * Requires SYS_MOUNT privilege, and must be writable in the local zone. - */ int -zfs_secpolicy_write(const char *dataset, cred_t *cr) +zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) +{ + int error; + + error = zfs_dozonecheck(name, cr); + if (error == 0) { + error = secpolicy_zfs(cr); + if (error) + error = dsl_deleg_access(name, perm, cr); + } + return (error); +} + +static int +zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr) +{ + /* + * Check permissions for special properties. + */ + switch (prop) { + case ZFS_PROP_ZONED: + /* + * Disallow setting of 'zoned' from within a local zone. + */ + if (!INGLOBALZONE(curthread)) + return (EPERM); + break; + + case ZFS_PROP_QUOTA: + if (!INGLOBALZONE(curthread)) { + uint64_t zoned; + char setpoint[MAXNAMELEN]; + /* + * Unprivileged users are allowed to modify the + * quota on things *under* (ie. contained by) + * the thing they own. 
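The containment test that follows is purely lexical: dsl_prop_get_integer() reports in setpoint the dataset on which the property was actually set, and a name can only be a proper descendant of that setpoint if it is strictly longer. A worked example with invented names:

/*
 * setpoint = "tank/jail"      (where the "zoned" property was set)
 *
 * name = "tank/jail/home"     strlen(name) >  strlen(setpoint): allowed
 * name = "tank/jail"          strlen(name) == strlen(setpoint): EPERM,
 *                             this is the delegated root itself
 */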
+ */ + if (dsl_prop_get_integer(name, "zoned", &zoned, + setpoint)) + return (EPERM); + if (!zoned || strlen(name) <= strlen(setpoint)) + return (EPERM); + } + break; + } + + return (zfs_secpolicy_write_perms(name, zfs_prop_to_name(prop), cr)); +} + +int +zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) { int error; - if (error = zfs_dozonecheck(dataset, cr)) + error = zfs_dozonecheck(zc->zc_name, cr); + if (error) return (error); - return (secpolicy_zfs(cr)); + /* + * permission to set permissions will be evaluated later in + * dsl_deleg_can_allow() + */ + return (0); +} + +int +zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_ROLLBACK, cr); + if (error == 0) + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr); + return (error); +} + +int +zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND, cr)); +} + +int +zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) +{ + if (!INGLOBALZONE(curthread)) + return (EPERM); + + if (secpolicy_nfs(cr) == 0) { + return (0); + } else { + vnode_t *vp; + int error; + + if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, + NO_FOLLOW, NULL, &vp)) != 0) + return (error); + + /* Now make sure mntpnt and dataset are ZFS */ + + if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || + (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), + zc->zc_name) != 0)) { + VN_RELE(vp); + return (EPERM); + } + + VN_RELE(vp); + return (dsl_deleg_access(zc->zc_name, + ZFS_DELEG_PERM_SHARE, cr)); + } } -/* - * Policy for operations that want to write a dataset's parent: - * create, destroy, snapshot, clone, restore. - */ static int -zfs_secpolicy_parent(const char *dataset, cred_t *cr) +zfs_get_parent(const char *datasetname, char *parent, int parentsize) { - char parentname[MAXNAMELEN]; char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. 
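A few concrete results for the helper being introduced here (dataset names invented):

char parent[MAXNAMELEN];

(void) zfs_get_parent("tank/home/fred@monday", parent, sizeof (parent));
/* parent: "tank/home/fred" -- snapshot suffix cut at '@' */

(void) zfs_get_parent("tank/home/fred", parent, sizeof (parent));
/* parent: "tank/home" -- last component cut at '/' */

/* "tank" contains neither '@' nor '/', so the call returns ENOENT. */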
*/ - (void) strncpy(parentname, dataset, sizeof (parentname)); - cp = strrchr(parentname, '@'); + (void) strncpy(parent, datasetname, parentsize); + cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { - cp = strrchr(parentname, '/'); + cp = strrchr(parent, '/'); if (cp == NULL) return (ENOENT); cp[0] = '\0'; + } + + return (0); +} + +int +zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); +} + +static int +zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); +} + +/* + * Must have sys_config privilege to check the iscsi permission + */ +/* ARGSUSED */ +static int +zfs_secpolicy_iscsi(zfs_cmd_t *zc, cred_t *cr) +{ + return (secpolicy_zfs(cr)); +} + +int +zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) +{ + char parentname[MAXNAMELEN]; + int error; + + if ((error = zfs_secpolicy_write_perms(from, + ZFS_DELEG_PERM_RENAME, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(from, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + if ((error = zfs_get_parent(to, parentname, + sizeof (parentname))) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_CREATE, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (error); +} + +static int +zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); +} +static int +zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) +{ + char parentname[MAXNAMELEN]; + objset_t *clone; + int error; + + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_PROMOTE, cr); + if (error) + return (error); + + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &clone); + + if (error == 0) { + dsl_dataset_t *pclone = NULL; + dsl_dir_t *dd; + dd = clone->os->os_dsl_dataset->ds_dir; + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_origin_obj, FTAG, &pclone); + rw_exit(&dd->dd_pool->dp_config_rwlock); + if (error) { + dmu_objset_close(clone); + return (error); + } + + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr); + + dsl_dataset_name(pclone, parentname); + dmu_objset_close(clone); + dsl_dataset_rele(pclone, FTAG); + if (error == 0) + error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_PROMOTE, cr); } + return (error); +} + +static int +zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_RECEIVE, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); - return (zfs_secpolicy_write(parentname, cr)); + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_CREATE, cr)); +} + +int +zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_SNAPSHOT, cr)) != 0) + return (error); + + error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_MOUNT, cr); + + return (error); +} + +static int +zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) +{ + + return 
(zfs_secpolicy_snapshot_perms(zc->zc_name, cr)); +} + +static int +zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) +{ + char parentname[MAXNAMELEN]; + int error; + + if ((error = zfs_get_parent(zc->zc_name, parentname, + sizeof (parentname))) != 0) + return (error); + + if (zc->zc_value[0] != '\0') { + if ((error = zfs_secpolicy_write_perms(zc->zc_value, + ZFS_DELEG_PERM_CLONE, cr)) != 0) + return (error); + } + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_CREATE, cr)) != 0) + return (error); + + error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_MOUNT, cr); + + return (error); +} + +static int +zfs_secpolicy_umount(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + + error = secpolicy_fs_unmount(cr, NULL); + if (error) { + error = dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr); + } + return (error); } /* @@ -236,7 +632,7 @@ zfs_secpolicy_parent(const char *dataset, cred_t *cr) */ /* ARGSUSED */ static int -zfs_secpolicy_config(const char *unused, cred_t *cr) +zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) { if (secpolicy_sys_config(cr, B_FALSE) != 0) return (EPERM); @@ -245,15 +641,48 @@ zfs_secpolicy_config(const char *unused, cred_t *cr) } /* + * Just like zfs_secpolicy_config, except that we will check for + * mount permission on the dataset for permission to create/remove + * the minor nodes. + */ +static int +zfs_secpolicy_minor(zfs_cmd_t *zc, cred_t *cr) +{ + if (secpolicy_sys_config(cr, B_FALSE) != 0) { + return (dsl_deleg_access(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr)); + } + + return (0); +} + +/* * Policy for fault injection. Requires all privileges. */ /* ARGSUSED */ static int -zfs_secpolicy_inject(const char *unused, cred_t *cr) +zfs_secpolicy_inject(zfs_cmd_t *zc, cred_t *cr) { return (secpolicy_zinject(cr)); } +static int +zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) +{ + zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); + + if (prop == ZPROP_INVAL) { + if (!zfs_prop_user(zc->zc_value)) + return (EINVAL); + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_USERPROP, cr)); + } else { + if (!zfs_prop_inheritable(prop)) + return (EINVAL); + return (zfs_secpolicy_setprop(zc->zc_name, prop, cr)); + } +} + /* * Policy for dataset backup operations (sendbackup). * Requires SYS_MOUNT privilege, and must be writable in the local zone. @@ -263,7 +692,7 @@ zfs_secpolicy_operator(const char *dataset, cred_t *cr) { int writable = 1; - if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) + if (!INGLOBALZONE(curthread) && !zone_dataset_visible(dataset, &writable)) return (ENOENT); if (secpolicy_zfs(cr) != 0 && !groupmember(GID_OPERATOR, cr)) return (EPERM); @@ -274,35 +703,33 @@ zfs_secpolicy_operator(const char *dataset, cred_t *cr) * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int -get_nvlist(zfs_cmd_t *zc, nvlist_t **nvp) +get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) { char *packed; - size_t size; int error; - nvlist_t *config = NULL; + nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. 
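get_nvlist() is the kernel half of a contract whose userland half packs the list and passes an address/size pair through zfs_cmd_t. A minimal caller-side sketch, with error handling elided and zfs_fd assumed to be an open /dev/zfs descriptor (not lifted from libzfs):

nvlist_t *nvl;
char *packed = NULL;
size_t size = 0;
zfs_cmd_t zc = { 0 };

(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
(void) nvlist_add_uint64(nvl, "quota", 10ULL << 30);	/* 10 GiB */
(void) nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, 0);

(void) strlcpy(zc.zc_name, "tank/home", sizeof (zc.zc_name));
zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
zc.zc_nvlist_src_size = size;
(void) ioctl(zfs_fd, ZFS_IOC_SET_PROP, &zc);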
*/ - if ((size = zc->zc_nvlist_src_size) == 0) + if (size == 0) return (EINVAL); packed = kmem_alloc(size, KM_SLEEP); - if ((error = xcopyin((void *)(uintptr_t)zc->zc_nvlist_src, packed, - size)) != 0) { + if ((error = xcopyin((void *)(uintptr_t)nvl, packed, size)) != 0) { kmem_free(packed, size); return (error); } - if ((error = nvlist_unpack(packed, size, &config, 0)) != 0) { + if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { kmem_free(packed, size); return (error); } kmem_free(packed, size); - *nvp = config; + *nvp = list; return (0); } @@ -326,6 +753,7 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) */ error = 0; } else { + packed = kmem_alloc(size, KM_SLEEP); VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, KM_SLEEP) == 0); error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, @@ -341,15 +769,67 @@ static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; - nvlist_t *config; + nvlist_t *config, *props = NULL; + nvlist_t *rootprops = NULL; + nvlist_t *zplprops = NULL; + char *buf; + + if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config)) + return (error); - if ((error = get_nvlist(zc, &config)) != 0) + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + nvlist_free(config); return (error); + } + + if (props) { + nvlist_t *nvl = NULL; + uint64_t version = SPA_VERSION; + + (void) nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); + if (version < SPA_VERSION_INITIAL || version > SPA_VERSION) { + error = EINVAL; + goto pool_props_bad; + } + (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); + if (nvl) { + error = nvlist_dup(nvl, &rootprops, KM_SLEEP); + if (error != 0) { + nvlist_free(config); + nvlist_free(props); + return (error); + } + (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); + } + VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + error = zfs_fill_zplprops_root(version, rootprops, + zplprops, NULL); + if (error) + goto pool_props_bad; + } + + buf = history_str_get(zc); - error = spa_create(zc->zc_name, config, zc->zc_value[0] == '\0' ? - NULL : zc->zc_value); + error = spa_create(zc->zc_name, config, props, buf, zplprops); + /* + * Set the remaining root properties + */ + if (!error && + (error = zfs_set_prop_nvlist(zc->zc_name, rootprops)) != 0) + (void) spa_destroy(zc->zc_name); + + if (buf != NULL) + history_str_free(buf); + +pool_props_bad: + nvlist_free(rootprops); + nvlist_free(zplprops); nvlist_free(config); + nvlist_free(props); return (error); } @@ -357,35 +837,55 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { - return (spa_destroy(zc->zc_name)); + int error; + zfs_log_history(zc); + error = spa_destroy(zc->zc_name); + return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { int error; - nvlist_t *config; + nvlist_t *config, *props = NULL; uint64_t guid; - if ((error = get_nvlist(zc, &config)) != 0) + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config)) != 0) return (error); + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + nvlist_free(config); + return (error); + } + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = EINVAL; + else if (zc->zc_cookie) + error = spa_import_faulted(zc->zc_name, config, + props); else - error = spa_import(zc->zc_name, config, - zc->zc_value[0] == '\0' ? 
NULL : zc->zc_value); + error = spa_import(zc->zc_name, config, props); nvlist_free(config); + if (props) + nvlist_free(props); + return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { - return (spa_export(zc->zc_name, NULL)); + int error; + boolean_t force = (boolean_t)zc->zc_cookie; + + zfs_log_history(zc); + error = spa_export(zc->zc_name, NULL, force); + return (error); } static int @@ -441,7 +941,8 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) nvlist_t *tryconfig, *config; int error; - if ((error = get_nvlist(zc, &tryconfig)) != 0) + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); @@ -466,7 +967,7 @@ zfs_ioc_pool_scrub(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = spa_scrub(spa, zc->zc_cookie, B_FALSE); + error = spa_scrub(spa, zc->zc_cookie); spa_close(spa, FTAG); @@ -496,8 +997,12 @@ zfs_ioc_pool_upgrade(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - spa_upgrade(spa); + if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) { + spa_close(spa, FTAG); + return (EINVAL); + } + spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); @@ -517,7 +1022,7 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) { + if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (ENOTSUP); } @@ -525,7 +1030,8 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) hist_buf = kmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { - error = xcopyout(hist_buf, (char *)(uintptr_t)zc->zc_history, + error = xcopyout(hist_buf, + (char *)(uintptr_t)zc->zc_history, zc->zc_history_len); } @@ -535,45 +1041,6 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) } static int -zfs_ioc_pool_log_history(zfs_cmd_t *zc) -{ - spa_t *spa; - char *history_str = NULL; - size_t size; - int error; - - size = zc->zc_history_len; - if (size == 0 || size > HIS_MAX_RECORD_LEN) - return (EINVAL); - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) { - spa_close(spa, FTAG); - return (ENOTSUP); - } - - /* add one for the NULL delimiter */ - size++; - history_str = kmem_alloc(size, KM_SLEEP); - if ((error = xcopyin((void *)(uintptr_t)zc->zc_history, history_str, - size)) != 0) { - spa_close(spa, FTAG); - kmem_free(history_str, size); - return (error); - } - history_str[size - 1] = '\0'; - - error = spa_history_log(spa, history_str, zc->zc_history_offset); - - spa_close(spa, FTAG); - kmem_free(history_str, size); - - return (error); -} - -static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { int error; @@ -591,9 +1058,8 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc) int error; if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS, - DS_MODE_NONE | DS_MODE_READONLY, &osp)) != 0) + DS_MODE_USER | DS_MODE_READONLY, &osp)) != 0) return (error); - error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_close(osp); @@ -606,26 +1072,40 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; - nvlist_t *config; + nvlist_t *config, **l2cache, **spares; + uint_t nl2cache = 0, nspares = 0; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); + error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config); + (void) 
nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache); + + (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, + &spares, &nspares); + + /* * A root pool with concatenated devices is not supported. - * Thus, can not add a device to a root pool with one device. + * Thus, a device can not be added to a root pool. + * + * An intent log device can not be added to a root pool, because + * during mountroot the ZIL is replayed, and a separate log device + * can not be accessed at that time. + * + * l2cache and spare devices are OK to be added to a root pool. */ - if (spa->spa_root_vdev->vdev_children == 1 && spa->spa_bootfs != 0) { + if (spa->spa_bootfs != 0 && nl2cache == 0 && nspares == 0) { spa_close(spa, FTAG); return (EDOM); } - if ((error = get_nvlist(zc, &config)) == 0) { + if (error == 0) { error = spa_vdev_add(spa, config); nvlist_free(config); } - spa_close(spa, FTAG); return (error); } @@ -645,28 +1125,35 @@ zfs_ioc_vdev_remove(zfs_cmd_t *zc) } static int -zfs_ioc_vdev_online(zfs_cmd_t *zc) +zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; + vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = vdev_online(spa, zc->zc_guid); - spa_close(spa, FTAG); - return (error); -} + switch (zc->zc_cookie) { + case VDEV_STATE_ONLINE: + error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); + break; -static int -zfs_ioc_vdev_offline(zfs_cmd_t *zc) -{ - spa_t *spa; - int istmp = zc->zc_cookie; - int error; + case VDEV_STATE_OFFLINE: + error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); + break; - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - error = vdev_offline(spa, zc->zc_guid, istmp); + case VDEV_STATE_FAULTED: + error = vdev_fault(spa, zc->zc_guid); + break; + + case VDEV_STATE_DEGRADED: + error = vdev_degrade(spa, zc->zc_guid); + break; + + default: + error = EINVAL; + } + zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } @@ -682,7 +1169,8 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - if ((error = get_nvlist(zc, &config)) == 0) { + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); nvlist_free(config); } @@ -723,6 +1211,16 @@ zfs_ioc_vdev_setpath(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { @@ -730,44 +1228,29 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc) int error; nvlist_t *nv; -retry: - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &os); - if (error != 0) { - /* - * This is ugly: dmu_objset_open() can return EBUSY if - * the objset is held exclusively. Fortunately this hold is - * only for a short while, so we retry here. - * This avoids user code having to handle EBUSY, - * for example for a "zfs list".
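The consolidated zfs_ioc_vdev_set_state() entry point above takes the requested state in zc_cookie and state-specific flags in zc_obj, and hands the state actually reached back in zc_cookie. A sketch of a userland caller; zfs_fd and vdev_guid are assumed to be in hand, and ZFS_IOC_VDEV_SET_STATE is the ioctl this handler is assumed to be wired to:

zfs_cmd_t zc = { 0 };

(void) strlcpy(zc.zc_name, "tank", sizeof (zc.zc_name));
zc.zc_guid = vdev_guid;			/* leaf vdev to act on */
zc.zc_cookie = VDEV_STATE_OFFLINE;	/* requested state */
zc.zc_obj = 0;				/* flags, e.g. temporary offline */

if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
	printf("vdev now in state %ju\n", (uintmax_t)zc.zc_cookie);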
- */ - if (error == EBUSY) { - delay(1); - goto retry; - } + if (error = dmu_objset_open(zc->zc_name, + DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) return (error); - } dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (zc->zc_nvlist_dst != 0 && - (error = dsl_prop_get_all(os, &nv)) == 0) { + (error = dsl_prop_get_all(os, &nv, FALSE)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a - * DS_MODE_STANDARD open, because it could be + * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... */ - if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) - VERIFY(zvol_get_stats(os, nv) == 0); + if (!zc->zc_objset_stats.dds_inconsistent) { + if (dmu_objset_type(os) == DMU_OST_ZVOL) + VERIFY(zvol_get_stats(os, nv) == 0); + } error = put_nvlist(zc, nv); nvlist_free(nv); } - spa_altroot(dmu_objset_spa(os), zc->zc_value, sizeof (zc->zc_value)); - dmu_objset_close(os); if (error == ENOMEM) error = 0; @@ -775,27 +1258,87 @@ retry: } static int +nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) +{ + uint64_t value; + int error; + + /* + * zfs_get_zplprop() will either find a value or give us + * the default value (if there is one). + */ + if ((error = zfs_get_zplprop(os, prop, &value)) != 0) + return (error); + VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); + return (0); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for zpl property nvlist + * + * outputs: + * zc_nvlist_dst zpl property nvlist + * zc_nvlist_dst_size size of zpl property nvlist + */ +static int +zfs_ioc_objset_zplprops(zfs_cmd_t *zc) +{ + objset_t *os; + int err; + + if (err = dmu_objset_open(zc->zc_name, + DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) + return (err); + + dmu_objset_fast_stat(os, &zc->zc_objset_stats); + + /* + * NB: nvl_add_zplprop() will read the objset contents, + * which we aren't supposed to do with a DS_MODE_USER + * hold, because it could be inconsistent. + */ + if (zc->zc_nvlist_dst != 0 && + !zc->zc_objset_stats.dds_inconsistent && + dmu_objset_type(os) == DMU_OST_ZFS) { + nvlist_t *nv; + + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) + err = put_nvlist(zc, nv); + nvlist_free(nv); + } else { + err = ENOENT; + } + dmu_objset_close(os); + return (err); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_name name of next filesystem + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ +static int zfs_ioc_dataset_list_next(zfs_cmd_t *zc) { objset_t *os; int error; char *p; -retry: - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &os); - if (error != 0) { - /* - * This is ugly: dmu_objset_open() can return EBUSY if - * the objset is held exclusively. Fortunately this hold is - * only for a short while, so we retry here. - * This avoids user code having to handle EBUSY, - * for example for a "zfs list". 
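The ENOMEM-to-0 squash in zfs_ioc_objset_stats() above works because put_nvlist() records the required length in zc_nvlist_dst_size even when the copyout is skipped, so a consumer can size its buffer iteratively. A sketch of that convention (not lifted from libzfs; error handling abbreviated):

size_t alloc = 1024;
char *buf;
zfs_cmd_t zc = { 0 };

(void) strlcpy(zc.zc_name, "tank/home", sizeof (zc.zc_name));
for (;;) {
	buf = malloc(alloc);
	zc.zc_nvlist_dst = (uint64_t)(uintptr_t)buf;
	zc.zc_nvlist_dst_size = alloc;
	if (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) {
		free(buf);
		break;				/* hard failure */
	}
	if (zc.zc_nvlist_dst_size <= alloc)
		break;				/* nvlist fit: unpack buf */
	alloc = zc.zc_nvlist_dst_size;		/* grow and retry */
	free(buf);
}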
- */ - if (error == EBUSY) { - delay(1); - goto retry; - } + if (error = dmu_objset_open(zc->zc_name, + DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) { if (error == ENOENT) error = ESRCH; return (error); @@ -812,8 +1355,9 @@ retry: NULL, &zc->zc_cookie); if (error == ENOENT) error = ESRCH; - } while (error == 0 && !INGLOBALZONE(curproc) && + } while (error == 0 && !INGLOBALZONE(curthread) && !zone_dataset_visible(zc->zc_name, NULL)); + dmu_objset_close(os); /* * If it's a hidden dataset (ie. with a '$' in its name), don't @@ -822,35 +1366,31 @@ retry: if (error == 0 && strchr(zc->zc_name, '$') == NULL) error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - dmu_objset_close(os); return (error); } +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_name name of next snapshot + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; int error; -retry: - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &os); - if (error != 0) { - /* - * This is ugly: dmu_objset_open() can return EBUSY if - * the objset is held exclusively. Fortunately this hold is - * only for a short while, so we retry here. - * This avoids user code having to handle EBUSY, - * for example for a "zfs list". - */ - if (error == EBUSY) { - delay(1); - goto retry; - } - if (error == ENOENT) - error = ESRCH; - return (error); - } + error = dmu_objset_open(zc->zc_name, + DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os); + if (error) + return (error == ENOENT ? ESRCH : error); /* * A dataset name of maximum length cannot have any snapshots, @@ -863,36 +1403,36 @@ retry: error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), - zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie); - if (error == ENOENT) - error = ESRCH; - + zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL); + dmu_objset_close(os); if (error == 0) error = zfs_ioc_objset_stats(zc); /* fill in the stats */ + else if (error == ENOENT) + error = ESRCH; - dmu_objset_close(os); + /* if we failed, undo the @ that we tacked on to zc_name */ + if (error) + *strchr(zc->zc_name, '@') = '\0'; return (error); } -static int -zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl) +int +zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) { nvpair_t *elem; int error; - const char *propname; - zfs_prop_t prop; uint64_t intval; char *strval; - char buf[MAXNAMELEN]; - const char *p; - spa_t *spa; + /* + * First validate permission to set all of the properties + */ elem = NULL; while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - propname = nvpair_name(elem); + const char *propname = nvpair_name(elem); + zfs_prop_t prop = zfs_name_to_prop(propname); - if ((prop = zfs_name_to_prop(propname)) == - ZFS_PROP_INVAL) { + if (prop == ZPROP_INVAL) { /* * If this is a user-defined property, it must be a * string, and there is no further validation to do. 
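zfs_set_prop_nvlist() walks the list twice on purpose: nothing is applied until every pair has passed its permission and version checks, so a permission failure cannot leave the dataset half-modified (pass two can still fail for other reasons, and callers handle that). The skeleton of the idiom, with check_one() and apply_one() as stand-ins for the per-property logic:

nvpair_t *elem;
int error;

/* pass 1: validate everything, with no side effects */
for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL;
    elem = nvlist_next_nvpair(nvl, elem))
	if ((error = check_one(elem)) != 0)
		return (error);

/* pass 2: apply, now that the whole batch is known to be allowed */
for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL;
    elem = nvlist_next_nvpair(nvl, elem))
	if ((error = apply_one(elem)) != 0)
		return (error);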
@@ -901,51 +1441,19 @@ zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl) nvpair_type(elem) != DATA_TYPE_STRING) return (EINVAL); - VERIFY(nvpair_value_string(elem, &strval) == 0); - error = dsl_prop_set(name, propname, 1, - strlen(strval) + 1, strval); - if (error == 0) - continue; - else + if (error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_USERPROP, CRED())) return (error); + continue; } + if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0) + return (error); + /* - * Check permissions for special properties. + * Check that this value is valid for this pool version */ switch (prop) { - case ZFS_PROP_ZONED: - /* - * Disallow setting of 'zoned' from within a local zone. - */ - if (!INGLOBALZONE(curproc)) - return (EPERM); - break; - - case ZFS_PROP_QUOTA: - if (error = zfs_dozonecheck(name, cr)) - return (error); - - if (!INGLOBALZONE(curproc)) { - uint64_t zoned; - char setpoint[MAXNAMELEN]; - int dslen; - /* - * Unprivileged users are allowed to modify the - * quota on things *under* (ie. contained by) - * the thing they own. - */ - if (dsl_prop_get_integer(name, "jailed", &zoned, - setpoint)) - return (EPERM); - if (!zoned) /* this shouldn't happen */ - return (EPERM); - dslen = strlen(name); - if (dslen <= strlen(setpoint)) - return (EPERM); - } - break; - case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure @@ -953,35 +1461,64 @@ zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl) * we'll catch them later. */ if (nvpair_type(elem) == DATA_TYPE_UINT64 && - nvpair_value_uint64(elem, &intval) == 0 && - intval >= ZIO_COMPRESS_GZIP_1 && - intval <= ZIO_COMPRESS_GZIP_9) { - if ((p = strchr(name, '/')) == NULL) { - p = name; - } else { - bcopy(name, buf, p - name); - buf[p - name] = '\0'; - p = buf; - } - - if (spa_open(p, &spa, FTAG) == 0) { - if (spa_version(spa) < - ZFS_VERSION_GZIP_COMPRESSION) { - spa_close(spa, FTAG); - return (ENOTSUP); - } + nvpair_value_uint64(elem, &intval) == 0) { + if (intval >= ZIO_COMPRESS_GZIP_1 && + intval <= ZIO_COMPRESS_GZIP_9 && + zfs_earlier_version(name, + SPA_VERSION_GZIP_COMPRESSION)) + return (ENOTSUP); - spa_close(spa, FTAG); - } + /* + * If this is a bootable dataset then + * verify that the compression algorithm + * is supported for booting. We must return + * something other than ENOTSUP since it + * implies a downrev pool version. 
+ */ + if (zfs_is_bootfs(name) && + !BOOTFS_COMPRESS_VALID(intval)) + return (ERANGE); } break; + + case ZFS_PROP_COPIES: + if (zfs_earlier_version(name, + SPA_VERSION_DITTO_BLOCKS)) + return (ENOTSUP); + break; + + case ZFS_PROP_SHARESMB: + if (zpl_earlier_version(name, ZPL_VERSION_FUID)) + return (ENOTSUP); + break; + } + } + + elem = NULL; + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + const char *propname = nvpair_name(elem); + zfs_prop_t prop = zfs_name_to_prop(propname); + + if (prop == ZPROP_INVAL) { + VERIFY(nvpair_value_string(elem, &strval) == 0); + error = dsl_prop_set(name, propname, 1, + strlen(strval) + 1, strval); + if (error == 0) + continue; + else + return (error); } switch (prop) { case ZFS_PROP_QUOTA: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dir_set_quota(name, - intval)) != 0) + (error = dsl_dir_set_quota(name, intval)) != 0) + return (error); + break; + + case ZFS_PROP_REFQUOTA: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = dsl_dataset_set_quota(name, intval)) != 0) return (error); break; @@ -992,24 +1529,36 @@ zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl) return (error); break; - case ZFS_PROP_VOLSIZE: + case ZFS_PROP_REFRESERVATION: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volsize(name, dev, + (error = dsl_dataset_set_reservation(name, intval)) != 0) return (error); break; + case ZFS_PROP_VOLSIZE: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = zvol_set_volsize(name, + ddi_driver_major(zfs_dip), intval)) != 0) + return (error); + break; + case ZFS_PROP_VOLBLOCKSIZE: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volblocksize(name, - intval)) != 0) + (error = zvol_set_volblocksize(name, intval)) != 0) + return (error); + break; + + case ZFS_PROP_VERSION: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = zfs_set_version(name, intval)) != 0) return (error); break; default: if (nvpair_type(elem) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != - prop_type_string) + PROP_TYPE_STRING) return (EINVAL); VERIFY(nvpair_value_string(elem, &strval) == 0); if ((error = dsl_prop_set(name, @@ -1022,22 +1571,18 @@ zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl) VERIFY(nvpair_value_uint64(elem, &intval) == 0); switch (zfs_prop_get_type(prop)) { - case prop_type_number: + case PROP_TYPE_NUMBER: break; - case prop_type_boolean: - if (intval > 1) - return (EINVAL); - break; - case prop_type_string: + case PROP_TYPE_STRING: return (EINVAL); - case prop_type_index: + case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) return (EINVAL); break; default: - cmn_err(CE_PANIC, "unknown property " - "type"); + cmn_err(CE_PANIC, + "unknown property type"); break; } @@ -1054,127 +1599,79 @@ zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl) return (0); } +/* + * inputs: + * zc_name name of filesystem + * zc_value name of property to inherit + * zc_nvlist_src{_size} nvlist of properties to apply + * zc_cookie clear existing local props? + * + * outputs: none + */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; int error; - zfs_prop_t prop; - /* - * If zc_value is set, then this is an attempt to inherit a value. - * Otherwise, zc_nvlist refers to a list of properties to set. 
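Properties with no dedicated setter in the switch above fall through to the generic dsl_prop_set() path. Spelled out for one numeric and one string property (the dataset name and values are invented; integers travel as a single 8-byte value, strings byte-wise including the NUL):

uint64_t intval = ZIO_COMPRESS_GZIP_1;
int error;

/* numeric property: intsz = 8, numints = 1 */
error = dsl_prop_set("tank/home",
    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &intval);

/* string property: intsz = 1, numints = strlen + 1 */
error = dsl_prop_set("tank/home",
    zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1,
    strlen("/export/home") + 1, "/export/home");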
- */ - if (zc->zc_value[0] != '\0') { - if (!zfs_prop_user(zc->zc_value) && - ((prop = zfs_name_to_prop(zc->zc_value)) == - ZFS_PROP_INVAL || - !zfs_prop_inheritable(prop))) - return (EINVAL); + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvl)) != 0) + return (error); + + if (zc->zc_cookie) { + nvlist_t *origprops; + objset_t *os; + + if (dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + if (dsl_prop_get_all(os, &origprops, TRUE) == 0) { + clear_props(zc->zc_name, origprops); + nvlist_free(origprops); + } + dmu_objset_close(os); + } - return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL)); } - if ((error = get_nvlist(zc, &nvl)) != 0) - return (error); + error = zfs_set_prop_nvlist(zc->zc_name, nvl); - error = zfs_set_prop_nvlist(zc->zc_name, zc->zc_dev, - (cred_t *)(uintptr_t)zc->zc_cred, nvl); nvlist_free(nvl); return (error); } +/* + * inputs: + * zc_name name of filesystem + * zc_value name of property to inherit + * + * outputs: none + */ +static int +zfs_ioc_inherit_prop(zfs_cmd_t *zc) +{ + /* the property name has been validated by zfs_secpolicy_inherit() */ + return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL)); +} + static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { - nvlist_t *nvl; - int error, reset_bootfs = 0; - uint64_t objnum; - zpool_prop_t prop; - nvpair_t *elem; - char *propname, *strval; + nvlist_t *props; spa_t *spa; - vdev_t *rvdev; - char *vdev_type; - objset_t *os; + int error; - if ((error = get_nvlist(zc, &nvl)) != 0) + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &props))) return (error); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { - nvlist_free(nvl); + nvlist_free(props); return (error); } - if (spa_version(spa) < ZFS_VERSION_BOOTFS) { - nvlist_free(nvl); - spa_close(spa, FTAG); - return (ENOTSUP); - } - - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - - propname = nvpair_name(elem); - - if ((prop = zpool_name_to_prop(propname)) == - ZFS_PROP_INVAL) { - nvlist_free(nvl); - spa_close(spa, FTAG); - return (EINVAL); - } - - switch (prop) { - case ZFS_PROP_BOOTFS: - /* - * A bootable filesystem can not be on a RAIDZ pool - * nor a striped pool with more than 1 device. 
- */ - rvdev = spa->spa_root_vdev; - vdev_type = - rvdev->vdev_child[0]->vdev_ops->vdev_op_type; - if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || - (strcmp(vdev_type, VDEV_TYPE_MIRROR) != 0 && - rvdev->vdev_children > 1)) { - error = ENOTSUP; - break; - } - - reset_bootfs = 1; - - VERIFY(nvpair_value_string(elem, &strval) == 0); - if (strval == NULL || strval[0] == '\0') { - objnum = - zfs_prop_default_numeric(ZFS_PROP_BOOTFS); - break; - } - - if (error = dmu_objset_open(strval, DMU_OST_ZFS, - DS_MODE_STANDARD | DS_MODE_READONLY, &os)) - break; - objnum = dmu_objset_id(os); - dmu_objset_close(os); - break; + error = spa_prop_set(spa, props); - default: - error = EINVAL; - } - - if (error) - break; - } - if (error == 0) { - if (reset_bootfs) { - VERIFY(nvlist_remove(nvl, - zpool_prop_to_name(ZFS_PROP_BOOTFS), - DATA_TYPE_STRING) == 0); - VERIFY(nvlist_add_uint64(nvl, - zpool_prop_to_name(ZFS_PROP_BOOTFS), objnum) == 0); - } - error = spa_set_props(spa, nvl); - } - - nvlist_free(nvl); + nvlist_free(props); spa_close(spa, FTAG); return (error); @@ -1190,7 +1687,7 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = spa_get_props(spa, &nvp); + error = spa_prop_get(spa, &nvp); if (error == 0 && zc->zc_nvlist_dst != 0) error = put_nvlist(zc, nvp); @@ -1205,11 +1702,145 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) } static int +zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc) +{ +#ifdef TODO + nvlist_t *nvp; + int error; + uint32_t uid; + uint32_t gid; + uint32_t *groups; + uint_t group_cnt; + cred_t *usercred; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvp)) != 0) { + return (error); + } + + if ((error = nvlist_lookup_uint32(nvp, + ZFS_DELEG_PERM_UID, &uid)) != 0) { + nvlist_free(nvp); + return (EPERM); + } + + if ((error = nvlist_lookup_uint32(nvp, + ZFS_DELEG_PERM_GID, &gid)) != 0) { + nvlist_free(nvp); + return (EPERM); + } + + if ((error = nvlist_lookup_uint32_array(nvp, ZFS_DELEG_PERM_GROUPS, + &groups, &group_cnt)) != 0) { + nvlist_free(nvp); + return (EPERM); + } + usercred = cralloc(); + if ((crsetugid(usercred, uid, gid) != 0) || + (crsetgroups(usercred, group_cnt, (gid_t *)groups) != 0)) { + nvlist_free(nvp); + crfree(usercred); + return (EPERM); + } + nvlist_free(nvp); + error = dsl_deleg_access(zc->zc_name, + zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred); + crfree(usercred); + return (error); +#else + return (EPERM); +#endif +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_src{_size} nvlist of delegated permissions + * zc_perm_action allow/unallow flag + * + * outputs: none + */ +static int +zfs_ioc_set_fsacl(zfs_cmd_t *zc) +{ + int error; + nvlist_t *fsaclnv = NULL; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &fsaclnv)) != 0) + return (error); + + /* + * Verify nvlist is constructed correctly + */ + if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { + nvlist_free(fsaclnv); + return (EINVAL); + } + + /* + * If we don't have PRIV_SYS_MOUNT, then validate + * that user is allowed to hand out each permission in + * the nvlist(s) + */ + + error = secpolicy_zfs(CRED()); + if (error) { + if (zc->zc_perm_action == B_FALSE) { + error = dsl_deleg_can_allow(zc->zc_name, + fsaclnv, CRED()); + } else { + error = dsl_deleg_can_unallow(zc->zc_name, + fsaclnv, CRED()); + } + } + + if (error == 0) + error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); + + nvlist_free(fsaclnv); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * 
+ * outputs: + * zc_nvlist_src{_size} nvlist of delegated permissions + */ +static int +zfs_ioc_get_fsacl(zfs_cmd_t *zc) +{ + nvlist_t *nvp; + int error; + + if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { + error = put_nvlist(zc, nvp); + nvlist_free(nvp); + } + + return (error); +} + +/* + * inputs: + * zc_name name of volume + * + * outputs: none + */ +static int zfs_ioc_create_minor(zfs_cmd_t *zc) { - return (zvol_create_minor(zc->zc_name, zc->zc_dev)); + return (zvol_create_minor(zc->zc_name, ddi_driver_major(zfs_dip))); } +/* + * inputs: + * zc_name name of volume + * + * outputs: none + */ static int zfs_ioc_remove_minor(zfs_cmd_t *zc) { @@ -1228,7 +1859,7 @@ zfs_get_vfs(const char *resource) mtx_lock(&mountlist_mtx); TAILQ_FOREACH(vfsp, &mountlist, mnt_list) { - if (strcmp(vfsp->mnt_stat.f_mntfromname, resource) == 0) { + if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) { VFS_HOLD(vfsp); break; } @@ -1237,21 +1868,183 @@ zfs_get_vfs(const char *resource) return (vfsp); } +/* ARGSUSED */ static void -zfs_create_cb(objset_t *os, void *arg, dmu_tx_t *tx) +zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) +{ + zfs_creat_t *zct = arg; + + zfs_create_fs(os, cr, zct->zct_zplprops, tx); +} + +#define ZFS_PROP_UNDEFINED ((uint64_t)-1) + +/* + * inputs: + * createprops list of properties requested by creator + * default_zplver zpl version to use if unspecified in createprops + * fuids_ok fuids allowed in this version of the spa? + * os parent objset pointer (NULL if root fs) + * + * outputs: + * zplprops values for the zplprops we attach to the master node object + * is_ci true if requested file system will be purely case-insensitive + * + * Determine the settings for utf8only, normalization and + * casesensitivity. Specific values may have been requested by the + * creator and/or we can inherit values from the parent dataset. If + * the file system is of too early a vintage, a creator can not + * request settings for these properties, even if the requested + * setting is the default value. We don't actually want to create dsl + * properties for these, so remove them from the source nvlist after + * processing. + */ +static int +zfs_fill_zplprops_impl(objset_t *os, uint64_t default_zplver, + boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops, + boolean_t *is_ci) +{ + uint64_t zplver = default_zplver; + uint64_t sense = ZFS_PROP_UNDEFINED; + uint64_t norm = ZFS_PROP_UNDEFINED; + uint64_t u8 = ZFS_PROP_UNDEFINED; + + ASSERT(zplprops != NULL); + + /* + * Pull out creator prop choices, if any. + */ + if (createprops) { + (void) nvlist_lookup_uint64(createprops, + zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); + (void) nvlist_lookup_uint64(createprops, + zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); + (void) nvlist_remove_all(createprops, + zfs_prop_to_name(ZFS_PROP_NORMALIZE)); + (void) nvlist_lookup_uint64(createprops, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); + (void) nvlist_remove_all(createprops, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); + (void) nvlist_lookup_uint64(createprops, + zfs_prop_to_name(ZFS_PROP_CASE), &sense); + (void) nvlist_remove_all(createprops, + zfs_prop_to_name(ZFS_PROP_CASE)); + } + + /* + * If the zpl version requested is whacky or the file system + * or pool is version is too "young" to support normalization + * and the creator tried to set a value for one of the props, + * error out. 
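/*
 * The version/normalization validation below can be read as a single
 * predicate. A restatement under the same conventions (a value of
 * ZFS_PROP_UNDEFINED means the creator did not request that property):
 */
static boolean_t
zplprops_request_ok(uint64_t zplver, boolean_t fuids_ok,
    uint64_t norm, uint64_t u8, uint64_t sense)
{
	if (zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION)
		return (B_FALSE);	/* unknown ZPL version */
	if (zplver >= ZPL_VERSION_FUID && !fuids_ok)
		return (B_FALSE);	/* pool too old for a FUID-era ZPL */
	if (zplver < ZPL_VERSION_NORMALIZATION &&
	    (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
	    sense != ZFS_PROP_UNDEFINED))
		return (B_FALSE);	/* normalization props need newer ZPL */
	return (B_TRUE);
}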
+ */ + if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || + (zplver >= ZPL_VERSION_FUID && !fuids_ok) || + (zplver < ZPL_VERSION_NORMALIZATION && + (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || + sense != ZFS_PROP_UNDEFINED))) + return (ENOTSUP); + + /* + * Put the version in the zplprops + */ + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); + + if (norm == ZFS_PROP_UNDEFINED) + VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); + + /* + * If we're normalizing, names must always be valid UTF-8 strings. + */ + if (norm) + u8 = 1; + if (u8 == ZFS_PROP_UNDEFINED) + VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); + + if (sense == ZFS_PROP_UNDEFINED) + VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); + + if (is_ci) + *is_ci = (sense == ZFS_CASE_INSENSITIVE); + + return (0); +} + +static int +zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, + nvlist_t *zplprops, boolean_t *is_ci) +{ + boolean_t fuids_ok = B_TRUE; + uint64_t zplver = ZPL_VERSION; + objset_t *os = NULL; + char parentname[MAXNAMELEN]; + char *cp; + int error; + + (void) strlcpy(parentname, dataset, sizeof (parentname)); + cp = strrchr(parentname, '/'); + ASSERT(cp != NULL); + cp[0] = '\0'; + + if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) { + zplver = ZPL_VERSION_FUID - 1; + fuids_ok = B_FALSE; + } + + /* + * Open parent object set so we can inherit zplprop values. + */ + if ((error = dmu_objset_open(parentname, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os)) != 0) + return (error); + + error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, createprops, + zplprops, is_ci); + dmu_objset_close(os); + return (error); +} + +static int +zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, + nvlist_t *zplprops, boolean_t *is_ci) { - zfs_create_data_t *zc = arg; + boolean_t fuids_ok = B_TRUE; + uint64_t zplver = ZPL_VERSION; + int error; - zfs_create_fs(os, (cred_t *)(uintptr_t)zc->zc_cred, tx); + if (spa_vers < SPA_VERSION_FUID) { + zplver = ZPL_VERSION_FUID - 1; + fuids_ok = B_FALSE; + } + + error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, createprops, + zplprops, is_ci); + return (error); } +/* + * inputs: + * zc_objset_type type of objset to create (fs vs zvol) + * zc_name name of new objset + * zc_value name of snapshot to clone from (may be empty) + * zc_nvlist_src{_size} nvlist of properties to apply + * + * outputs: none + */ static int zfs_ioc_create(zfs_cmd_t *zc) { objset_t *clone; int error = 0; - zfs_create_data_t cbdata = { 0 }; - void (*cbfunc)(objset_t *os, void *arg, dmu_tx_t *tx); + zfs_creat_t zct; + nvlist_t *nvprops = NULL; + void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); dmu_objset_type_t type = zc->zc_objset_type; switch (type) { @@ -1266,16 +2059,19 @@ zfs_ioc_create(zfs_cmd_t *zc) default: cbfunc = NULL; + break; } - if (strchr(zc->zc_name, '@')) + if (strchr(zc->zc_name, '@') || + strchr(zc->zc_name, '%')) return (EINVAL); if (zc->zc_nvlist_src != 0 && - (error = get_nvlist(zc, &cbdata.zc_props)) != 0) + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvprops)) != 0) return (error); - cbdata.zc_cred = (cred_t *)(uintptr_t)zc->zc_cred; - cbdata.zc_dev = (dev_t)zc->zc_dev; + 
zct.zct_zplprops = NULL; + zct.zct_props = nvprops; if (zc->zc_value[0] != '\0') { /* @@ -1283,39 +2079,48 @@ zfs_ioc_create(zfs_cmd_t *zc) */ zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) { - nvlist_free(cbdata.zc_props); + nvlist_free(nvprops); return (EINVAL); } error = dmu_objset_open(zc->zc_value, type, - DS_MODE_STANDARD | DS_MODE_READONLY, &clone); + DS_MODE_USER | DS_MODE_READONLY, &clone); + if (error) { + nvlist_free(nvprops); + return (error); + } + + error = dmu_objset_create(zc->zc_name, type, clone, 0, + NULL, NULL); if (error) { - nvlist_free(cbdata.zc_props); + dmu_objset_close(clone); + nvlist_free(nvprops); return (error); } - error = dmu_objset_create(zc->zc_name, type, clone, NULL, NULL); dmu_objset_close(clone); } else { + boolean_t is_insensitive = B_FALSE; + if (cbfunc == NULL) { - nvlist_free(cbdata.zc_props); + nvlist_free(nvprops); return (EINVAL); } if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; - if (cbdata.zc_props == NULL || - nvlist_lookup_uint64(cbdata.zc_props, + if (nvprops == NULL || + nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) { - nvlist_free(cbdata.zc_props); + nvlist_free(nvprops); return (EINVAL); } - if ((error = nvlist_lookup_uint64(cbdata.zc_props, + if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) { - nvlist_free(cbdata.zc_props); + nvlist_free(nvprops); return (EINVAL); } @@ -1327,56 +2132,127 @@ zfs_ioc_create(zfs_cmd_t *zc) volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) { - nvlist_free(cbdata.zc_props); + nvlist_free(nvprops); return (error); } - } + } else if (type == DMU_OST_ZFS) { + int error; - error = dmu_objset_create(zc->zc_name, type, NULL, cbfunc, - &cbdata); + /* + * We have to have normalization and + * case-folding flags correct when we do the + * file system creation, so go figure them out + * now. + */ + VERIFY(nvlist_alloc(&zct.zct_zplprops, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + error = zfs_fill_zplprops(zc->zc_name, nvprops, + zct.zct_zplprops, &is_insensitive); + if (error != 0) { + nvlist_free(nvprops); + nvlist_free(zct.zct_zplprops); + return (error); + } + } + error = dmu_objset_create(zc->zc_name, type, NULL, + is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); + nvlist_free(zct.zct_zplprops); } /* * It would be nice to do this atomically. 
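/*
 * What follows is the compensating-action idiom: the create and the
 * property application are separate operations, so a failed property
 * application is "undone" by destroying the objset that was just
 * created. A minimal sketch, with hypothetical stand-ins for
 * dmu_objset_create(), zfs_set_prop_nvlist() and dmu_objset_destroy():
 */
static int
create_with_props(const char *name,
    int (*create_fn)(const char *),
    int (*setprops_fn)(const char *),
    int (*destroy_fn)(const char *))
{
	int error;

	if ((error = create_fn(name)) != 0)
		return (error);
	if ((error = setprops_fn(name)) != 0)
		(void) destroy_fn(name);	/* best-effort rollback */
	return (error);
}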
*/ if (error == 0) { - if ((error = zfs_set_prop_nvlist(zc->zc_name, - zc->zc_dev, (cred_t *)(uintptr_t)zc->zc_cred, - cbdata.zc_props)) != 0) + if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0) (void) dmu_objset_destroy(zc->zc_name); } - - nvlist_free(cbdata.zc_props); + nvlist_free(nvprops); return (error); } +struct snap_prop_arg { + nvlist_t *nvprops; + const char *snapname; +}; + +static int +set_snap_props(char *name, void *arg) +{ + struct snap_prop_arg *snpa = arg; + int len = strlen(name) + strlen(snpa->snapname) + 2; + char *buf = kmem_alloc(len, KM_SLEEP); + int err; + + (void) snprintf(buf, len, "%s@%s", name, snpa->snapname); + err = zfs_set_prop_nvlist(buf, snpa->nvprops); + if (err) + (void) dmu_objset_destroy(buf); + kmem_free(buf, len); + return (err); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value short name of snapshot + * zc_cookie recursive flag + * + * outputs: none + */ static int zfs_ioc_snapshot(zfs_cmd_t *zc) { + nvlist_t *nvprops = NULL; + int error; + boolean_t recursive = zc->zc_cookie; + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) return (EINVAL); - return (dmu_objset_snapshot(zc->zc_name, - zc->zc_value, zc->zc_cookie)); + + if (zc->zc_nvlist_src != 0 && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvprops)) != 0) + return (error); + + error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, recursive); + + /* + * It would be nice to do this atomically. + */ + if (error == 0) { + struct snap_prop_arg snpa; + snpa.nvprops = nvprops; + snpa.snapname = zc->zc_value; + if (recursive) { + error = dmu_objset_find(zc->zc_name, + set_snap_props, &snpa, DS_FIND_CHILDREN); + if (error) { + (void) dmu_snapshots_destroy(zc->zc_name, + zc->zc_value); + } + } else { + error = set_snap_props(zc->zc_name, &snpa); + } + } + nvlist_free(nvprops); + return (error); } int zfs_unmount_snap(char *name, void *arg) { - char *snapname = arg; - char *cp; vfs_t *vfsp = NULL; - /* - * Snapshots (which are under .zfs control) must be unmounted - * before they can be destroyed. - */ + if (arg) { + char *snapname = arg; + int len = strlen(name) + strlen(snapname) + 2; + char *buf = kmem_alloc(len, KM_SLEEP); - if (snapname) { - (void) strcat(name, "@"); - (void) strcat(name, snapname); - vfsp = zfs_get_vfs(name); - cp = strchr(name, '@'); - *cp = '\0'; + (void) strcpy(buf, name); + (void) strcat(buf, "@"); + (void) strcat(buf, snapname); + vfsp = zfs_get_vfs(buf); + kmem_free(buf, len); } else if (strchr(name, '@')) { vfsp = zfs_get_vfs(name); } @@ -1400,6 +2276,13 @@ zfs_unmount_snap(char *name, void *arg) return (0); } +/* + * inputs: + * zc_name name of filesystem + * zc_value short name of snapshot + * + * outputs: none + */ static int zfs_ioc_destroy_snaps(zfs_cmd_t *zc) { @@ -1414,6 +2297,13 @@ zfs_ioc_destroy_snaps(zfs_cmd_t *zc) return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value)); } +/* + * inputs: + * zc_name name of dataset to destroy + * zc_objset_type type of objset + * + * outputs: none + */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { @@ -1426,19 +2316,76 @@ zfs_ioc_destroy(zfs_cmd_t *zc) return (dmu_objset_destroy(zc->zc_name)); } +/* + * inputs: + * zc_name name of dataset to rollback (to most recent snapshot) + * + * outputs: none + */ static int zfs_ioc_rollback(zfs_cmd_t *zc) { - return (dmu_objset_rollback(zc->zc_name)); + objset_t *os; + int error; + zfsvfs_t *zfsvfs = NULL; + + /* + * Get the zfsvfs for the receiving objset. 
There + * won't be one if we're operating on a zvol, if the + * objset doesn't exist yet, or is not mounted. + */ + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) + return (error); + + if (dmu_objset_type(os) == DMU_OST_ZFS) { + mutex_enter(&os->os->os_user_ptr_lock); + zfsvfs = dmu_objset_get_user(os); + if (zfsvfs != NULL) + VFS_HOLD(zfsvfs->z_vfs); + mutex_exit(&os->os->os_user_ptr_lock); + } + + if (zfsvfs != NULL) { + char osname[MAXNAMELEN]; + int mode; + + error = zfs_suspend_fs(zfsvfs, osname, &mode); + if (error == 0) { + int resume_err; + + ASSERT(strcmp(osname, zc->zc_name) == 0); + error = dmu_objset_rollback(os); + resume_err = zfs_resume_fs(zfsvfs, osname, mode); + error = error ? error : resume_err; + } else { + dmu_objset_close(os); + } + VFS_RELE(zfsvfs->z_vfs); + } else { + error = dmu_objset_rollback(os); + } + /* Note, the dmu_objset_rollback() releases the objset for us. */ + + return (error); } +/* + * inputs: + * zc_name old name of dataset + * zc_value new name of dataset + * zc_cookie recursive flag (only valid for snapshots) + * + * outputs: none + */ static int zfs_ioc_rename(zfs_cmd_t *zc) { - int recursive = zc->zc_cookie & 1; + boolean_t recursive = zc->zc_cookie & 1; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; - if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || + strchr(zc->zc_value, '%')) return (EINVAL); /* @@ -1452,48 +2399,199 @@ zfs_ioc_rename(zfs_cmd_t *zc) if (err) return (err); } - return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive)); } +static void +clear_props(char *dataset, nvlist_t *props) +{ + zfs_cmd_t *zc; + nvpair_t *prop; + + if (props == NULL) + return; + zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); + (void) strcpy(zc->zc_name, dataset); + for (prop = nvlist_next_nvpair(props, NULL); prop; + prop = nvlist_next_nvpair(props, prop)) { + (void) strcpy(zc->zc_value, nvpair_name(prop)); + if (zfs_secpolicy_inherit(zc, CRED()) == 0) + (void) zfs_ioc_inherit_prop(zc); + } + kmem_free(zc, sizeof (zfs_cmd_t)); +} + +/* + * inputs: + * zc_name name of containing filesystem + * zc_nvlist_src{_size} nvlist of properties to apply + * zc_value name of snapshot to create + * zc_string name of clone origin (if DRR_FLAG_CLONE) + * zc_cookie file descriptor to recv from + * zc_begin_record the BEGIN record of the stream (not byteswapped) + * zc_guid force flag + * + * outputs: + * zc_cookie number of bytes read + */ static int -zfs_ioc_recvbackup(zfs_cmd_t *zc) +zfs_ioc_recv(zfs_cmd_t *zc) { - kthread_t *td = curthread; - struct file *fp; - int error; - offset_t new_off; + file_t *fp; + objset_t *os; + dmu_recv_cookie_t drc; + zfsvfs_t *zfsvfs = NULL; + boolean_t force = (boolean_t)zc->zc_guid; + int error, fd; + offset_t off; + nvlist_t *props = NULL; + nvlist_t *origprops = NULL; + objset_t *origin = NULL; + char *tosnap; + char tofs[ZFS_MAXNAMELEN]; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || - strchr(zc->zc_value, '@') == NULL) + strchr(zc->zc_value, '@') == NULL || + strchr(zc->zc_value, '%')) return (EINVAL); - error = fget_read(td, zc->zc_cookie, &fp); - if (error) + (void) strcpy(tofs, zc->zc_value); + tosnap = strchr(tofs, '@'); + *tosnap = '\0'; + tosnap++; + + if (zc->zc_nvlist_src != 0 && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &props)) != 0) return (error); - error = dmu_recvbackup(zc->zc_value, &zc->zc_begin_record, - &zc->zc_cookie, (boolean_t)zc->zc_guid, fp, - fp->f_offset); + fd = 
zc->zc_cookie; + fp = getf(fd, 0); + if (fp == NULL) { + nvlist_free(props); + return (EBADF); + } - new_off = fp->f_offset + zc->zc_cookie; - fp->f_offset = new_off; + if (dmu_objset_open(tofs, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + /* + * Try to get the zfsvfs for the receiving objset. + * There won't be one if we're operating on a zvol, + * if the objset doesn't exist yet, or is not mounted. + */ + mutex_enter(&os->os->os_user_ptr_lock); + if (zfsvfs = dmu_objset_get_user(os)) { + if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) { + mutex_exit(&os->os->os_user_ptr_lock); + dmu_objset_close(os); + zfsvfs = NULL; + error = EBUSY; + goto out; + } + VFS_HOLD(zfsvfs->z_vfs); + } + mutex_exit(&os->os->os_user_ptr_lock); + + /* + * If new properties are supplied, they are to completely + * replace the existing ones, so stash away the existing ones. + */ + if (props) + (void) dsl_prop_get_all(os, &origprops, TRUE); + + dmu_objset_close(os); + } + + if (zc->zc_string[0]) { + error = dmu_objset_open(zc->zc_string, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &origin); + if (error) + goto out; + } + + error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record, + force, origin, zfsvfs != NULL, &drc); + if (origin) + dmu_objset_close(origin); + if (error) + goto out; + + /* + * Reset properties. We do this before we receive the stream + * so that the properties are applied to the new data. + */ + if (props) { + clear_props(tofs, origprops); + /* + * XXX - Note, this is all-or-nothing; should be best-effort. + */ + (void) zfs_set_prop_nvlist(tofs, props); + } + + off = fp->f_offset; + error = dmu_recv_stream(&drc, fp, &off); + + if (error == 0 && zfsvfs) { + char osname[MAXNAMELEN]; + int mode; + + /* online recv */ + error = zfs_suspend_fs(zfsvfs, osname, &mode); + if (error == 0) { + int resume_err; + + error = dmu_recv_end(&drc); + resume_err = zfs_resume_fs(zfsvfs, osname, mode); + error = error ? error : resume_err; + } else { + dmu_recv_abort_cleanup(&drc); + } + } else if (error == 0) { + error = dmu_recv_end(&drc); + } - fdrop(fp, td); + zc->zc_cookie = off - fp->f_offset; + if (off >= 0 && off <= MAXOFFSET_T) + fp->f_offset = off; + + /* + * On error, restore the original props. 
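/*
 * The receive path replaces the dataset's local properties before the
 * stream is consumed and reinstates the stashed originals if anything
 * fails. A sketch of that stash/replace/restore sequence; body() stands
 * in for the dmu_recv_stream()/dmu_recv_end() work, and clear_fn() and
 * set_fn() for clear_props() and zfs_set_prop_nvlist():
 */
static int
with_replaced_props(const char *fs, nvlist_t *newprops, nvlist_t *origprops,
    int (*body)(const char *),
    void (*clear_fn)(const char *, nvlist_t *),
    int (*set_fn)(const char *, nvlist_t *))
{
	int error;

	clear_fn(fs, origprops);	/* drop the existing local props */
	(void) set_fn(fs, newprops);	/* all-or-nothing, as noted above */

	error = body(fs);

	if (error != 0) {
		clear_fn(fs, newprops);		/* back out our props */
		(void) set_fn(fs, origprops);	/* reinstate the originals */
	}
	return (error);
}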
+ */ + if (error && props) { + clear_props(tofs, props); + (void) zfs_set_prop_nvlist(tofs, origprops); + } +out: + if (zfsvfs) { + mutex_exit(&zfsvfs->z_online_recv_lock); + VFS_RELE(zfsvfs->z_vfs); + } + nvlist_free(props); + nvlist_free(origprops); + releasef(fp); return (error); } +/* + * inputs: + * zc_name name of snapshot to send + * zc_value short name of incremental fromsnap (may be empty) + * zc_cookie file descriptor to send stream to + * zc_obj fromorigin flag (mutually exclusive with zc_value) + * + * outputs: none + */ static int -zfs_ioc_sendbackup(zfs_cmd_t *zc) +zfs_ioc_send(zfs_cmd_t *zc) { - kthread_t *td = curthread; - struct file *fp; objset_t *fromsnap = NULL; objset_t *tosnap; - int error, fd; + file_t *fp; + int error; + offset_t off; error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap); + DS_MODE_USER | DS_MODE_READONLY, &tosnap); if (error) return (error); @@ -1507,25 +2605,27 @@ zfs_ioc_sendbackup(zfs_cmd_t *zc) *(cp+1) = 0; (void) strlcat(buf, zc->zc_value, sizeof (buf)); error = dmu_objset_open(buf, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap); + DS_MODE_USER | DS_MODE_READONLY, &fromsnap); if (error) { dmu_objset_close(tosnap); return (error); } } - fd = zc->zc_cookie; - error = fget_write(td, fd, &fp); - if (error) { + fp = getf(zc->zc_cookie, 1); + if (fp == NULL) { dmu_objset_close(tosnap); if (fromsnap) dmu_objset_close(fromsnap); - return (error); + return (EBADF); } - error = dmu_sendbackup(tosnap, fromsnap, fp); + off = fp->f_offset; + error = dmu_sendbackup(tosnap, fromsnap, zc->zc_obj, fp, &off); - fdrop(fp, td); + if (off >= 0 && off <= MAXOFFSET_T) + fp->f_offset = off; + releasef(fp); if (fromsnap) dmu_objset_close(fromsnap); dmu_objset_close(tosnap); @@ -1595,28 +2695,58 @@ zfs_ioc_clear(zfs_cmd_t *zc) vdev_t *vd; int error; + /* + * On zpool clear we also fix up missing slogs + */ + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(zc->zc_name); + if (spa == NULL) { + mutex_exit(&spa_namespace_lock); + return (EIO); + } + if (spa->spa_log_state == SPA_LOG_MISSING) { + /* we need to let spa_open/spa_load clear the chains */ + spa->spa_log_state = SPA_LOG_CLEAR; + } + mutex_exit(&spa_namespace_lock); + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - spa_config_enter(spa, RW_WRITER, FTAG); + spa_vdev_state_enter(spa); if (zc->zc_guid == 0) { vd = NULL; - } else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) { - spa_config_exit(spa, FTAG); - spa_close(spa, FTAG); - return (ENODEV); + } else { + vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); + if (vd == NULL) { + (void) spa_vdev_state_exit(spa, NULL, ENODEV); + spa_close(spa, FTAG); + return (ENODEV); + } } vdev_clear(spa, vd); - spa_config_exit(spa, FTAG); + (void) spa_vdev_state_exit(spa, NULL, 0); + + /* + * Resume any suspended I/Os. + */ + zio_resume(spa); spa_close(spa, FTAG); return (0); } +/* + * inputs: + * zc_name name of filesystem + * zc_value name of origin snapshot + * + * outputs: none + */ static int zfs_ioc_promote(zfs_cmd_t *zc) { @@ -1634,68 +2764,221 @@ zfs_ioc_promote(zfs_cmd_t *zc) return (dsl_dataset_promote(zc->zc_name)); } +#ifdef TODO +/* + * We don't want to have a hard dependency + * against some special symbols in sharefs + * nfs, and smbsrv. Determine them if needed when + * the first file system is shared. + * Neither sharefs, nfs or smbsrv are unloadable modules. 
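/*
 * The share code binds to nfs, smbsrv and sharefs lazily, on first use,
 * so zfs carries no hard symbol dependency on them. A condensed sketch
 * of that double-checked lazy-binding idiom using the Solaris DDI
 * interfaces named above (the module path and symbol name are taken
 * from the surrounding code):
 */
static int (*znfsexport)(void *);
static ddi_modhandle_t nfs_hdl;
static int nfs_inited;

static int
bind_nfs_once(kmutex_t *lock)
{
	int error;

	if (nfs_inited)			/* fast path: already bound */
		return (0);
	mutex_enter(lock);
	if (nfs_hdl == NULL && (nfs_hdl = ddi_modopen("fs/nfs",
	    KRTLD_MODE_FIRST, &error)) == NULL) {
		mutex_exit(lock);
		return (ENOSYS);
	}
	if (znfsexport == NULL && (znfsexport = (int (*)(void *))
	    ddi_modsym(nfs_hdl, "nfs_export", &error)) == NULL) {
		mutex_exit(lock);
		return (ENOSYS);
	}
	nfs_inited = 1;
	mutex_exit(lock);
	return (0);
}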
+ */ +int (*znfsexport_fs)(void *arg); +int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); +int (*zsmbexport_fs)(void *arg, boolean_t add_share); + +int zfs_nfsshare_inited; +int zfs_smbshare_inited; + +ddi_modhandle_t nfs_mod; +ddi_modhandle_t sharefs_mod; +ddi_modhandle_t smbsrv_mod; +#endif +kmutex_t zfs_share_lock; + +#ifdef TODO +static int +zfs_init_sharefs() +{ + int error; + + ASSERT(MUTEX_HELD(&zfs_share_lock)); + /* Both NFS and SMB shares also require sharetab support. */ + if (sharefs_mod == NULL && ((sharefs_mod = + ddi_modopen("fs/sharefs", + KRTLD_MODE_FIRST, &error)) == NULL)) { + return (ENOSYS); + } + if (zshare_fs == NULL && ((zshare_fs = + (int (*)(enum sharefs_sys_op, share_t *, uint32_t)) + ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) { + return (ENOSYS); + } + return (0); +} +#endif + +static int +zfs_ioc_share(zfs_cmd_t *zc) +{ +#ifdef TODO + int error; + int opcode; + + switch (zc->zc_share.z_sharetype) { + case ZFS_SHARE_NFS: + case ZFS_UNSHARE_NFS: + if (zfs_nfsshare_inited == 0) { + mutex_enter(&zfs_share_lock); + if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs", + KRTLD_MODE_FIRST, &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + if (znfsexport_fs == NULL && + ((znfsexport_fs = (int (*)(void *)) + ddi_modsym(nfs_mod, + "nfs_export", &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + error = zfs_init_sharefs(); + if (error) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + zfs_nfsshare_inited = 1; + mutex_exit(&zfs_share_lock); + } + break; + case ZFS_SHARE_SMB: + case ZFS_UNSHARE_SMB: + if (zfs_smbshare_inited == 0) { + mutex_enter(&zfs_share_lock); + if (smbsrv_mod == NULL && ((smbsrv_mod = + ddi_modopen("drv/smbsrv", + KRTLD_MODE_FIRST, &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + if (zsmbexport_fs == NULL && ((zsmbexport_fs = + (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod, + "smb_server_share", &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + error = zfs_init_sharefs(); + if (error) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + zfs_smbshare_inited = 1; + mutex_exit(&zfs_share_lock); + } + break; + default: + return (EINVAL); + } + + switch (zc->zc_share.z_sharetype) { + case ZFS_SHARE_NFS: + case ZFS_UNSHARE_NFS: + if (error = + znfsexport_fs((void *) + (uintptr_t)zc->zc_share.z_exportdata)) + return (error); + break; + case ZFS_SHARE_SMB: + case ZFS_UNSHARE_SMB: + if (error = zsmbexport_fs((void *) + (uintptr_t)zc->zc_share.z_exportdata, + zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? + B_TRUE : B_FALSE)) { + return (error); + } + break; + } + + opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || + zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? + SHAREFS_ADD : SHAREFS_REMOVE; + + /* + * Add or remove share from sharetab + */ + error = zshare_fs(opcode, + (void *)(uintptr_t)zc->zc_share.z_sharedata, + zc->zc_share.z_sharemax); + + return (error); +#else + return (ENOSYS); +#endif +} + +/* + * pool create, destroy, and export don't log the history as part of + * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export + * do the logging of those commands. 
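/*
 * Each entry in the zfs_ioc_vec[] table below pairs a handler with a
 * security policy, a name-check class and a pool-history flag, and
 * zfsdev_ioctl() drives them in a fixed order. A reduced sketch of
 * that dispatch sequence, using the types from the surrounding code:
 */
static int
dispatch_ioc(const zfs_ioc_vec_t *vec, zfs_cmd_t *zc, cred_t *cr)
{
	int error;

	error = vec->zvec_secpolicy(zc, cr);	/* permission check first */

	if (error == 0 && vec->zvec_namecheck == POOL_NAME &&
	    pool_namecheck(zc->zc_name, NULL, NULL) != 0)
		error = EINVAL;
	else if (error == 0 && vec->zvec_namecheck == DATASET_NAME &&
	    dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
		error = EINVAL;

	if (error == 0)
		error = vec->zvec_func(zc);	/* the ioctl body */

	if (vec->zvec_his_log == B_TRUE)	/* record in pool history */
		zfs_log_history(zc);

	return (error);
}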
+ */ static int zfs_ioc_jail(zfs_cmd_t *zc) { - return (zone_dataset_attach((cred_t *)(uintptr_t)zc->zc_cred, - zc->zc_name, (int)zc->zc_jailid)); + return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_jailid)); } static int zfs_ioc_unjail(zfs_cmd_t *zc) { - return (zone_dataset_detach((cred_t *)(uintptr_t)zc->zc_cred, - zc->zc_name, (int)zc->zc_jailid)); + return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_jailid)); } static zfs_ioc_vec_t zfs_ioc_vec[] = { - { zfs_ioc_pool_create, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_import, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_export, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_configs, zfs_secpolicy_none, no_name }, - { zfs_ioc_pool_stats, zfs_secpolicy_read, pool_name }, - { zfs_ioc_pool_tryimport, zfs_secpolicy_config, no_name }, - { zfs_ioc_pool_scrub, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_freeze, zfs_secpolicy_config, no_name }, - { zfs_ioc_pool_upgrade, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_get_history, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_log_history, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_add, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_remove, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_online, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_offline, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_attach, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_detach, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_setpath, zfs_secpolicy_config, pool_name }, - { zfs_ioc_objset_stats, zfs_secpolicy_read, dataset_name }, - { zfs_ioc_dataset_list_next, zfs_secpolicy_read, dataset_name }, - { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, dataset_name }, - { zfs_ioc_set_prop, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_create_minor, zfs_secpolicy_config, dataset_name }, - { zfs_ioc_remove_minor, zfs_secpolicy_config, dataset_name }, - { zfs_ioc_create, zfs_secpolicy_parent, dataset_name }, - { zfs_ioc_destroy, zfs_secpolicy_parent, dataset_name }, - { zfs_ioc_rollback, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_rename, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_sendbackup, zfs_secpolicy_operator, dataset_name }, - { zfs_ioc_inject_fault, zfs_secpolicy_inject, no_name }, - { zfs_ioc_clear_fault, zfs_secpolicy_inject, no_name }, - { zfs_ioc_inject_list_next, zfs_secpolicy_inject, no_name }, - { zfs_ioc_error_log, zfs_secpolicy_inject, pool_name }, - { zfs_ioc_clear, zfs_secpolicy_config, pool_name }, - { zfs_ioc_promote, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_destroy_snaps, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_snapshot, zfs_secpolicy_operator, dataset_name }, - { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, pool_name }, - { zfs_ioc_obj_to_path, zfs_secpolicy_config, no_name }, - { zfs_ioc_pool_set_props, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_get_props, zfs_secpolicy_read, pool_name }, - { zfs_ioc_jail, zfs_secpolicy_config, dataset_name }, - { zfs_ioc_unjail, zfs_secpolicy_config, dataset_name } + { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_configs, zfs_secpolicy_none, 
NO_NAME, B_FALSE }, + { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE }, + { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE }, + { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, + { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, + { zfs_ioc_dataset_list_next, zfs_secpolicy_read, + DATASET_NAME, B_FALSE }, + { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, + DATASET_NAME, B_FALSE }, + { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE }, + { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, + { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, + { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE }, + { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, + { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE }, + { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE }, + { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE }, + { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE }, + { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, + { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, + { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE }, + { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE }, + { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE }, + { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, + { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE }, + { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE }, + { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE }, + { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE }, + { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, + { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, + DATASET_NAME, B_FALSE }, + { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE }, + { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE }, + { zfs_ioc_jail, zfs_secpolicy_config, DATASET_NAME, B_TRUE }, + { zfs_ioc_unjail, zfs_secpolicy_config, DATASET_NAME, B_TRUE } }; static int @@ -1711,9 +2994,7 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (EINVAL); - zc->zc_cred = (uintptr_t)td->td_ucred; - zc->zc_dev = (uintptr_t)dev; - error = zfs_ioc_vec[vec].zvec_secpolicy(zc->zc_name, td->td_ucred); + error = zfs_ioc_vec[vec].zvec_secpolicy(zc, td->td_ucred); /* * Ensure that all pool/dataset names are valid before we pass down to @@ -1722,17 +3003,17 @@ 
zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, if (error == 0) { zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (zfs_ioc_vec[vec].zvec_namecheck) { - case pool_name: + case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; break; - case dataset_name: + case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; break; - case no_name: + case NO_NAME: break; } } @@ -1740,6 +3021,9 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, if (error == 0) error = zfs_ioc_vec[vec].zvec_func(zc); + if (zfs_ioc_vec[vec].zvec_his_log == B_TRUE) + zfs_log_history(zc); + return (error); } @@ -1761,7 +3045,7 @@ static struct cdevsw zfs_cdevsw = { static void zfsdev_init(void) { - zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0660, + zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666, ZFS_DEV_NAME); } @@ -1775,6 +3059,10 @@ zfsdev_fini(void) static struct task zfs_start_task; static struct root_hold_token *zfs_root_token; + +uint_t zfs_fsyncer_key; +extern uint_t rrw_tsd_key; + static void zfs_start(void *context __unused, int pending __unused) { @@ -1783,7 +3071,11 @@ zfs_start(void *context __unused, int pending __unused) spa_init(FREAD | FWRITE); zfs_init(); zvol_init(); - printf("ZFS storage pool version " ZFS_VERSION_STRING "\n"); + + tsd_create(&zfs_fsyncer_key, NULL); + tsd_create(&rrw_tsd_key, NULL); + + printf("ZFS storage pool version " SPA_VERSION_STRING "\n"); root_mount_rel(zfs_root_token); } @@ -1800,6 +3092,7 @@ zfs_modevent(module_t mod, int type, void *unused __unused) "feature in FreeBSD.\n"); TASK_INIT(&zfs_start_task, 0, zfs_start, NULL); taskqueue_enqueue(taskqueue_thread, &zfs_start_task); + mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); error = 0; break; case MOD_UNLOAD: @@ -1812,6 +3105,9 @@ zfs_modevent(module_t mod, int type, void *unused __unused) zfs_fini(); spa_fini(); zfsdev_fini(); + tsd_destroy(&zfs_fsyncer_key); + tsd_destroy(&rrw_tsd_key); + mutex_destroy(&zfs_share_lock); error = 0; break; } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c index dde9ec1a335c..5f99780d7544 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/systm.h> @@ -36,49 +34,282 @@ #include <sys/zfs_znode.h> #include <sys/zfs_dir.h> #include <sys/zil.h> +#include <sys/zil_impl.h> #include <sys/byteorder.h> #include <sys/policy.h> #include <sys/stat.h> #include <sys/acl.h> #include <sys/dmu.h> #include <sys/spa.h> +#include <sys/zfs_fuid.h> /* * All the functions in this file are used to construct the log entries - * to record transactions. They allocate * a intent log transaction + * to record transactions. They allocate * an intent log transaction * structure (itx_t) and save within it all the information necessary to * possibly replay the transaction. The itx is then assigned a sequence * number and inserted in the in-memory list anchored in the zilog. 
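/*
 * Every zfs_log_*() routine below follows the same shape: size the
 * record, zil_itx_create(), fill in the fixed lr_*_t body plus any
 * variable-length tail, then zil_itx_assign() and remember the
 * sequence number on the znode. A schematic of that shape, using the
 * TX_REMOVE layout (fixed header plus one name) as the simplest case:
 */
static void
log_name_op(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
    znode_t *dzp, const char *name)
{
	itx_t *itx;
	lr_remove_t *lr;
	size_t namesize = strlen(name) + 1;

	if (zilog == NULL)		/* ZIL disabled: nothing to record */
		return;

	itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
	lr = (lr_remove_t *)&itx->itx_lr;
	lr->lr_doid = dzp->z_id;
	bcopy(name, (char *)(lr + 1), namesize);	/* variable tail */

	dzp->z_last_itx = zil_itx_assign(zilog, itx, tx);
}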
*/ +int +zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap) +{ + int isxvattr = (vap->va_mask & AT_XVATTR); + switch (type) { + case Z_FILE: + if (vsecp == NULL && !isxvattr) + return (TX_CREATE); + if (vsecp && isxvattr) + return (TX_CREATE_ACL_ATTR); + if (vsecp) + return (TX_CREATE_ACL); + else + return (TX_CREATE_ATTR); + /*NOTREACHED*/ + case Z_DIR: + if (vsecp == NULL && !isxvattr) + return (TX_MKDIR); + if (vsecp && isxvattr) + return (TX_MKDIR_ACL_ATTR); + if (vsecp) + return (TX_MKDIR_ACL); + else + return (TX_MKDIR_ATTR); + case Z_XATTRDIR: + return (TX_MKXATTR); + } + ASSERT(0); + return (TX_MAX_TYPE); +} + +/* + * build up the log data necessary for logging xvattr_t + * First lr_attr_t is initialized. following the lr_attr_t + * is the mapsize and attribute bitmap copied from the xvattr_t. + * Following the bitmap and bitmapsize two 64 bit words are reserved + * for the create time which may be set. Following the create time + * records a single 64 bit integer which has the bits to set on + * replay for the xvattr. + */ +static void +zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) +{ + uint32_t *bitmap; + uint64_t *attrs; + uint64_t *crtime; + xoptattr_t *xoap; + void *scanstamp; + int i; + + xoap = xva_getxoptattr(xvap); + ASSERT(xoap); + + lrattr->lr_attr_masksize = xvap->xva_mapsize; + bitmap = &lrattr->lr_attr_bitmap; + for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) { + *bitmap = xvap->xva_reqattrmap[i]; + } + + /* Now pack the attributes up in a single uint64_t */ + attrs = (uint64_t *)bitmap; + crtime = attrs + 1; + scanstamp = (caddr_t)(crtime + 2); + *attrs = 0; + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) + *attrs |= (xoap->xoa_readonly == 0) ? 0 : + XAT0_READONLY; + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) + *attrs |= (xoap->xoa_hidden == 0) ? 0 : + XAT0_HIDDEN; + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) + *attrs |= (xoap->xoa_system == 0) ? 0 : + XAT0_SYSTEM; + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) + *attrs |= (xoap->xoa_archive == 0) ? 0 : + XAT0_ARCHIVE; + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) + *attrs |= (xoap->xoa_immutable == 0) ? 0 : + XAT0_IMMUTABLE; + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) + *attrs |= (xoap->xoa_nounlink == 0) ? 0 : + XAT0_NOUNLINK; + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) + *attrs |= (xoap->xoa_appendonly == 0) ? 0 : + XAT0_APPENDONLY; + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) + *attrs |= (xoap->xoa_opaque == 0) ? 0 : + XAT0_APPENDONLY; + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) + *attrs |= (xoap->xoa_nodump == 0) ? 0 : + XAT0_NODUMP; + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) + *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : + XAT0_AV_QUARANTINED; + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) + *attrs |= (xoap->xoa_av_modified == 0) ? 
0 : + XAT0_AV_MODIFIED; + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); +} + +static void * +zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start) +{ + zfs_fuid_t *zfuid; + uint64_t *fuidloc = start; + + /* First copy in the ACE FUIDs */ + for (zfuid = list_head(&fuidp->z_fuids); zfuid; + zfuid = list_next(&fuidp->z_fuids, zfuid)) { + *fuidloc++ = zfuid->z_logfuid; + } + return (fuidloc); +} + + +static void * +zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) +{ + zfs_fuid_domain_t *zdomain; + + /* now copy in the domain info, if any */ + if (fuidp->z_domain_str_sz != 0) { + for (zdomain = list_head(&fuidp->z_domains); zdomain; + zdomain = list_next(&fuidp->z_domains, zdomain)) { + bcopy((void *)zdomain->z_domain, start, + strlen(zdomain->z_domain) + 1); + start = (caddr_t)start + + strlen(zdomain->z_domain) + 1; + } + } + return (start); +} + /* - * zfs_log_create() is used to handle TX_CREATE, TX_MKDIR and TX_MKXATTR + * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, + * TX_MKDIR_ATTR and TX_MKXATTR * transactions. + * + * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID + * domain information appended prior to the name. In this case the + * uid/gid in the log record will be a log centric FUID. + * + * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that + * may contain attributes, ACL and optional fuid information. + * + * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify + * and ACL and normal users/groups in the ACEs. + * + * There may be an optional xvattr attribute information similar + * to zfs_log_setattr. + * + * Also, after the file name "domain" strings may be appended. */ void -zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *dzp, znode_t *zp, char *name) +zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp, + zfs_fuid_info_t *fuidp, vattr_t *vap) { itx_t *itx; uint64_t seq; lr_create_t *lr; + lr_acl_create_t *lracl; + size_t aclsize; + size_t xvatsize = 0; + size_t txsize; + xvattr_t *xvap = (xvattr_t *)vap; + void *end; + size_t lrsize; size_t namesize = strlen(name) + 1; + size_t fuidsz = 0; if (zilog == NULL) return; - itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + /* + * If we have FUIDs present then add in space for + * domains and ACE fuid's if any. + */ + if (fuidp) { + fuidsz += fuidp->z_domain_str_sz; + fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t); + } + + if (vap->va_mask & AT_XVATTR) + xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize); + + if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR || + (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR || + (int)txtype == TX_MKXATTR) { + txsize = sizeof (*lr) + namesize + fuidsz + xvatsize; + lrsize = sizeof (*lr); + } else { + aclsize = (vsecp) ? 
vsecp->vsa_aclentsz : 0; + txsize = + sizeof (lr_acl_create_t) + namesize + fuidsz + + ZIL_ACE_LENGTH(aclsize) + xvatsize; + lrsize = sizeof (lr_acl_create_t); + } + + itx = zil_itx_create(txtype, txsize); + lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_foid = zp->z_id; lr->lr_mode = zp->z_phys->zp_mode; - lr->lr_uid = zp->z_phys->zp_uid; - lr->lr_gid = zp->z_phys->zp_gid; + if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) { + lr->lr_uid = (uint64_t)zp->z_phys->zp_uid; + } else { + lr->lr_uid = fuidp->z_fuid_owner; + } + if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) { + lr->lr_gid = (uint64_t)zp->z_phys->zp_gid; + } else { + lr->lr_gid = fuidp->z_fuid_group; + } lr->lr_gen = zp->z_phys->zp_gen; lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; lr->lr_rdev = zp->z_phys->zp_rdev; - bcopy(name, (char *)(lr + 1), namesize); + + /* + * Fill in xvattr info if any + */ + if (vap->va_mask & AT_XVATTR) { + zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap); + end = (caddr_t)lr + lrsize + xvatsize; + } else { + end = (caddr_t)lr + lrsize; + } + + /* Now fill in any ACL info */ + + if (vsecp) { + lracl = (lr_acl_create_t *)&itx->itx_lr; + lracl->lr_aclcnt = vsecp->vsa_aclcnt; + lracl->lr_acl_bytes = aclsize; + lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; + lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; + if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS) + lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; + else + lracl->lr_acl_flags = 0; + + bcopy(vsecp->vsa_aclentp, end, aclsize); + end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize); + } + + /* drop in FUID info */ + if (fuidp) { + end = zfs_log_fuid_ids(fuidp, end); + end = zfs_log_fuid_domains(fuidp, end); + } + /* + * Now place file name in log record + */ + bcopy(name, end, namesize); seq = zil_itx_assign(zilog, itx, tx); dzp->z_last_itx = seq; @@ -89,7 +320,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype, * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions. */ void -zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype, +zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, char *name) { itx_t *itx; @@ -113,7 +344,7 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype, * zfs_log_link() handles TX_LINK transactions. */ void -zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype, +zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name) { itx_t *itx; @@ -139,8 +370,8 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype, * zfs_log_symlink() handles TX_SYMLINK transactions. */ void -zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *dzp, znode_t *zp, char *name, char *link) +zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, char *link) { itx_t *itx; uint64_t seq; @@ -173,7 +404,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype, * zfs_log_rename() handles TX_RENAME transactions. 
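/*
 * A TX_RENAME record carries two names back to back after the fixed
 * lr_rename_t header: the source name, then the target name, each
 * NUL-terminated. A sketch of that packing, assuming the conventional
 * back-to-back layout (the copy itself is not shown in this hunk):
 */
static void
pack_rename_names(lr_rename_t *lr, const char *sname, const char *dname)
{
	size_t snamesize = strlen(sname) + 1;
	char *tail = (char *)(lr + 1);

	bcopy(sname, tail, snamesize);			   /* source name */
	bcopy(dname, tail + snamesize, strlen(dname) + 1); /* target name */
}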
*/ void -zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, +zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) { itx_t *itx; @@ -203,15 +434,16 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, */ ssize_t zfs_immediate_write_sz = 32768; +#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \ + sizeof (lr_write_t)) + void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t len, int ioflag) + znode_t *zp, offset_t off, ssize_t resid, int ioflag) { - itx_t *itx; - uint64_t seq; - lr_write_t *lr; itx_wr_state_t write_state; - int err; + boolean_t slogging; + uintptr_t fsync_cnt; if (zilog == NULL || zp->z_unlinked) return; @@ -220,52 +452,84 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, * Writes are handled in three different ways: * * WR_INDIRECT: - * If the write is greater than zfs_immediate_write_sz then - * later *if* we need to log the write then dmu_sync() is used - * to immediately write the block and it's block pointer is put - * in the log record. + * In this mode, if we need to commit the write later, then the block + * is immediately written into the file system (using dmu_sync), + * and a pointer to the block is put into the log record. + * When the txg commits the block is linked in. + * This saves additionally writing the data into the log record. + * There are a few requirements for this to occur: + * - write is greater than zfs_immediate_write_sz + * - not using slogs (as slogs are assumed to always be faster + * than writing into the main pool) + * - the write occupies only one block * WR_COPIED: * If we know we'll immediately be committing the - * transaction (FDSYNC (O_DSYNC)), the we allocate a larger + * transaction (FSYNC or FDSYNC), the we allocate a larger * log record here for the data and copy the data in. * WR_NEED_COPY: * Otherwise we don't allocate a buffer, and *if* we need to * flush the write later then a buffer is allocated and * we retrieve the data using the dmu. */ - if (len > zfs_immediate_write_sz) + slogging = spa_has_slogs(zilog->zl_spa); + if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz) write_state = WR_INDIRECT; - else if (ioflag & FDSYNC) + else if (ioflag & (FSYNC | FDSYNC)) write_state = WR_COPIED; else write_state = WR_NEED_COPY; - itx = zil_itx_create(txtype, sizeof (*lr) + - (write_state == WR_COPIED ? len : 0)); - lr = (lr_write_t *)&itx->itx_lr; - if (write_state == WR_COPIED) { - err = dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1); - if (err) { + if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { + (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); + } + + while (resid) { + itx_t *itx; + lr_write_t *lr; + ssize_t len; + + /* + * If the write would overflow the largest block then split it. + */ + if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA) + len = SPA_MAXBLOCKSIZE >> 1; + else + len = resid; + + itx = zil_itx_create(txtype, sizeof (*lr) + + (write_state == WR_COPIED ? 
len : 0)); + lr = (lr_write_t *)&itx->itx_lr; + if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, + zp->z_id, off, len, lr + 1) != 0) { kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; write_state = WR_NEED_COPY; } - } - itx->itx_wr_state = write_state; - lr->lr_foid = zp->z_id; - lr->lr_offset = off; - lr->lr_length = len; - lr->lr_blkoff = 0; - BP_ZERO(&lr->lr_blkptr); + itx->itx_wr_state = write_state; + if (write_state == WR_NEED_COPY) + itx->itx_sod += len; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = len; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); - itx->itx_private = zp->z_zfsvfs; + itx->itx_private = zp->z_zfsvfs; - itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; + if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) || + (ioflag & (FSYNC | FDSYNC))) + itx->itx_sync = B_TRUE; + else + itx->itx_sync = B_FALSE; + + zp->z_last_itx = zil_itx_assign(zilog, itx, tx); + + off += len; + resid -= len; + } } /* @@ -298,25 +562,60 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, */ void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, vattr_t *vap, uint_t mask_applied) + znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) { - itx_t *itx; - uint64_t seq; - lr_setattr_t *lr; + itx_t *itx; + uint64_t seq; + lr_setattr_t *lr; + xvattr_t *xvap = (xvattr_t *)vap; + size_t recsize = sizeof (lr_setattr_t); + void *start; + if (zilog == NULL || zp->z_unlinked) return; - itx = zil_itx_create(txtype, sizeof (*lr)); + /* + * If XVATTR set, then log record size needs to allow + * for lr_attr_t + xvattr mask, mapsize and create time + * plus actual attribute values + */ + if (vap->va_mask & AT_XVATTR) + recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize); + + if (fuidp) + recsize += fuidp->z_domain_str_sz; + + itx = zil_itx_create(txtype, recsize); lr = (lr_setattr_t *)&itx->itx_lr; lr->lr_foid = zp->z_id; lr->lr_mask = (uint64_t)mask_applied; lr->lr_mode = (uint64_t)vap->va_mode; - lr->lr_uid = (uint64_t)vap->va_uid; - lr->lr_gid = (uint64_t)vap->va_gid; + if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid)) + lr->lr_uid = fuidp->z_fuid_owner; + else + lr->lr_uid = (uint64_t)vap->va_uid; + + if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid)) + lr->lr_gid = fuidp->z_fuid_group; + else + lr->lr_gid = (uint64_t)vap->va_gid; + lr->lr_size = (uint64_t)vap->va_size; ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); + start = (lr_setattr_t *)(lr + 1); + if (vap->va_mask & AT_XVATTR) { + zfs_log_xvattr((lr_attr_t *)start, xvap); + start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize); + } + + /* + * Now stick on domain information if any on end + */ + + if (fuidp) + (void) zfs_log_fuid_domains(fuidp, start); itx->itx_sync = (zp->z_sync_cnt != 0); seq = zil_itx_assign(zilog, itx, tx); @@ -327,21 +626,64 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, * zfs_log_acl() handles TX_ACL transactions. 
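/*
 * zfs_log_acl() below emits one of two record formats: pre-FUID
 * filesystems get the compact TX_ACL_V0 layout, and newer ones get
 * TX_ACL, whose ACL bytes are ACE-aligned and may be followed by FUID
 * ids and domain strings. A restatement of its sizing logic:
 */
static size_t
acl_record_size(int txtype, size_t aclbytes, zfs_fuid_info_t *fuidp)
{
	size_t sz = (txtype == TX_ACL) ?
	    sizeof (lr_acl_t) : sizeof (lr_acl_v0_t);

	sz += (txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes;
	if (fuidp != NULL) {
		sz += fuidp->z_domain_str_sz;		     /* domain strings */
		sz += fuidp->z_fuid_cnt * sizeof (uint64_t); /* FUID ids */
	}
	return (sz);
}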
*/ void -zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, int aclcnt, ace_t *z_ace) +zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, + vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) { itx_t *itx; uint64_t seq; + lr_acl_v0_t *lrv0; lr_acl_t *lr; + int txtype; + int lrsize; + size_t txsize; + size_t aclbytes = vsecp->vsa_aclentsz; if (zilog == NULL || zp->z_unlinked) return; - itx = zil_itx_create(txtype, sizeof (*lr) + aclcnt * sizeof (ace_t)); + txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ? + TX_ACL_V0 : TX_ACL; + + if (txtype == TX_ACL) + lrsize = sizeof (*lr); + else + lrsize = sizeof (*lrv0); + + txsize = lrsize + + ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) + + (fuidp ? fuidp->z_domain_str_sz : 0) + + sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0); + + itx = zil_itx_create(txtype, txsize); + lr = (lr_acl_t *)&itx->itx_lr; lr->lr_foid = zp->z_id; - lr->lr_aclcnt = (uint64_t)aclcnt; - bcopy(z_ace, (ace_t *)(lr + 1), aclcnt * sizeof (ace_t)); + if (txtype == TX_ACL) { + lr->lr_acl_bytes = aclbytes; + lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; + lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) + lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; + else + lr->lr_acl_flags = 0; + } + lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt; + + if (txtype == TX_ACL_V0) { + lrv0 = (lr_acl_v0_t *)lr; + bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes); + } else { + void *start = (ace_t *)(lr + 1); + + bcopy(vsecp->vsa_aclentp, start, aclbytes); + + start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes); + + if (fuidp) { + start = zfs_log_fuid_ids(fuidp, start); + (void) zfs_log_fuid_domains(fuidp, start); + } + } itx->itx_sync = (zp->z_sync_cnt != 0); seq = zil_itx_assign(zilog, itx, tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c index eb3215d79e62..573a82c98e19 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,6 +38,7 @@ #include <sys/zfs_znode.h> #include <sys/zfs_dir.h> #include <sys/zfs_acl.h> +#include <sys/zfs_fuid.h> #include <sys/spa.h> #include <sys/zil.h> #include <sys/byteorder.h> @@ -61,8 +62,8 @@ zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, vap->va_mask = (uint_t)mask; vap->va_type = IFTOVT(mode); vap->va_mode = mode & MODEMASK; - vap->va_uid = (uid_t)uid; - vap->va_gid = (gid_t)gid; + vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; + vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? 
-1 : gid; vap->va_rdev = zfs_cmpldev(rdev); vap->va_nodeid = nodeid; } @@ -74,24 +75,365 @@ zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap) return (ENOTSUP); } +static void +zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) +{ + xoptattr_t *xoap = NULL; + uint64_t *attrs; + uint64_t *crtime; + uint32_t *bitmap; + void *scanstamp; + int i; + + xvap->xva_vattr.va_mask |= AT_XVATTR; + if ((xoap = xva_getxoptattr(xvap)) == NULL) { + xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */ + return; + } + + ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize); + + bitmap = &lrattr->lr_attr_bitmap; + for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++) + xvap->xva_reqattrmap[i] = *bitmap; + + attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1); + crtime = attrs + 1; + scanstamp = (caddr_t)(crtime + 2); + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) + xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0); + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) + xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0); + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) + xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) + xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0); + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) + xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) + xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0); + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) + xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0); + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) + xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0); + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) + xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) + xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0); + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) + xoap->xoa_av_quarantined = + ((*attrs & XAT0_AV_QUARANTINED) != 0); + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); +} + +static int +zfs_replay_domain_cnt(uint64_t uid, uint64_t gid) +{ + uint64_t uid_idx; + uint64_t gid_idx; + int domcnt = 0; + + uid_idx = FUID_INDEX(uid); + gid_idx = FUID_INDEX(gid); + if (uid_idx) + domcnt++; + if (gid_idx > 0 && gid_idx != uid_idx) + domcnt++; + + return (domcnt); +} + +static void * +zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start, + int domcnt) +{ + int i; + + for (i = 0; i != domcnt; i++) { + fuid_infop->z_domain_table[i] = start; + start = (caddr_t)start + strlen(start) + 1; + } + + return (start); +} + +/* + * Set the uid/gid in the fuid_info structure. 
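zfs_replay_fuid_domain_common() above relies on the domain strings being packed back-to-back and NUL-terminated inside the log record; each table slot simply points into the record buffer and the cursor hops past each terminator. A self-contained sketch of that walk (the buffer contents are hypothetical):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* Two packed, NUL-terminated domain strings, as in a log record. */
	char buf[] = "EXAMPLE\0BUILTIN";
	const char *table[2];
	const char *start = buf;
	int i;

	/* The same walk zfs_replay_fuid_domain_common() performs. */
	for (i = 0; i != 2; i++) {
		table[i] = start;
		start += strlen(start) + 1;	/* hop over the NUL */
	}

	for (i = 0; i != 2; i++)
		printf("z_domain_table[%d] = %s\n", i, table[i]);
	return (0);
}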
+ */ +static void +zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid) +{ + /* + * If owner or group are log specific FUIDs then slurp up + * domain information and build zfs_fuid_info_t + */ + if (IS_EPHEMERAL(uid)) + fuid_infop->z_fuid_owner = uid; + + if (IS_EPHEMERAL(gid)) + fuid_infop->z_fuid_group = gid; +} + +/* + * Load fuid domains into fuid_info_t + */ +static zfs_fuid_info_t * +zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid) +{ + int domcnt; + + zfs_fuid_info_t *fuid_infop; + + fuid_infop = zfs_fuid_info_alloc(); + + domcnt = zfs_replay_domain_cnt(uid, gid); + + if (domcnt == 0) + return (fuid_infop); + + fuid_infop->z_domain_table = + kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); + + zfs_replay_fuid_ugid(fuid_infop, uid, gid); + + fuid_infop->z_domain_cnt = domcnt; + *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt); + return (fuid_infop); +} + +/* + * Load zfs_fuid_t's and fuid domains into fuid_info_t + */ +static zfs_fuid_info_t * +zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid, + uint64_t gid) +{ + uint64_t *log_fuid = (uint64_t *)start; + zfs_fuid_info_t *fuid_infop; + int i; + + fuid_infop = zfs_fuid_info_alloc(); + fuid_infop->z_domain_cnt = domcnt; + + fuid_infop->z_domain_table = + kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); + + for (i = 0; i != idcnt; i++) { + zfs_fuid_t *zfuid; + + zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); + zfuid->z_logfuid = *log_fuid; + zfuid->z_id = -1; + zfuid->z_domidx = 0; + list_insert_tail(&fuid_infop->z_fuids, zfuid); + log_fuid++; + } + + zfs_replay_fuid_ugid(fuid_infop, uid, gid); + + *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt); + return (fuid_infop); +} + +static void +zfs_replay_swap_attrs(lr_attr_t *lrattr) +{ + /* swap the lr_attr structure */ + byteswap_uint32_array(lrattr, sizeof (*lrattr)); + /* swap the bitmap */ + byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) * + sizeof (uint32_t)); + /* swap the attributes, create time + 64 bit word for attributes */ + byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) * + (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t)); +} + +/* + * Replay file create with optional ACL, xvattr information as well + * as optional FUID information.
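The FUID_INDEX() test in zfs_replay_domain_cnt() above assumes the packed FUID layout: domain-table index in the upper 32 bits, Windows RID in the lower 32, with index 0 meaning a plain POSIX id. A stand-alone sketch of that encoding; the macros are redefined locally so the example compiles on its own, while the real definitions are expected to live in sys/zfs_fuid.h:

#include <stdint.h>
#include <stdio.h>

/*
 * Local stand-ins for FUID_INDEX/FUID_RID/FUID_ENCODE: domain-table
 * index in the upper 32 bits, RID in the lower 32.
 */
#define	EX_FUID_INDEX(f)	((uint64_t)(f) >> 32)
#define	EX_FUID_RID(f)		((f) & 0xffffffffULL)
#define	EX_FUID_ENCODE(i, r)	(((uint64_t)(i) << 32) | (r))

int
main(void)
{
	uint64_t fuid = EX_FUID_ENCODE(1, 1105);	/* index 1, RID 1105 */

	/* Index 0 is a plain POSIX id: no domain table entry needed. */
	printf("index=%llu rid=%llu domain-entry=%s\n",
	    (unsigned long long)EX_FUID_INDEX(fuid),
	    (unsigned long long)EX_FUID_RID(fuid),
	    EX_FUID_INDEX(fuid) ? "yes" : "no");
	return (0);
}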
+ */ +static int +zfs_replay_create_acl(zfsvfs_t *zfsvfs, + lr_acl_create_t *lracl, boolean_t byteswap) +{ + char *name = NULL; /* location determined later */ + lr_create_t *lr = (lr_create_t *)lracl; + znode_t *dzp; + vnode_t *vp = NULL; + xvattr_t xva; + int vflg = 0; + vsecattr_t vsec = { 0 }; + lr_attr_t *lrattr; + void *aclstart; + void *fuidstart; + size_t xvatlen = 0; + uint64_t txtype; + int error; + + if (byteswap) { + byteswap_uint64_array(lracl, sizeof (*lracl)); + txtype = (int)lr->lr_common.lrc_txtype; + if (txtype == TX_CREATE_ACL_ATTR || + txtype == TX_MKDIR_ACL_ATTR) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + zfs_replay_swap_attrs(lrattr); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + } + + aclstart = (caddr_t)(lracl + 1) + xvatlen; + zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE); + /* swap fuids */ + if (lracl->lr_fuidcnt) { + byteswap_uint64_array((caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes), + lracl->lr_fuidcnt * sizeof (uint64_t)); + } + } + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); + + /* + * All forms of zfs create (create, mkdir, mkxattrdir, symlink) + * eventually end up in zfs_mknode(), which assigns the object's + * creation time and generation number. The generic VOP_CREATE() + * doesn't have either concept, so we smuggle the values inside + * the vattr's otherwise unused va_ctime and va_nblocks fields. + */ + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); + xva.xva_vattr.va_nblocks = lr->lr_gen; + + error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); + if (error != ENOENT) + goto bail; + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + switch ((int)lr->lr_common.lrc_txtype) { + case TX_CREATE_ACL: + aclstart = (caddr_t)(lracl + 1); + fuidstart = (caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + /*FALLTHROUGH*/ + case TX_CREATE_ACL_ATTR: + if (name == NULL) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + xva.xva_vattr.va_mask |= AT_XVATTR; + zfs_replay_xvattr(lrattr, &xva); + } + vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; + vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; + vsec.vsa_aclcnt = lracl->lr_aclcnt; + vsec.vsa_aclentsz = lracl->lr_acl_bytes; + vsec.vsa_aclflags = lracl->lr_acl_flags; + if (zfsvfs->z_fuid_replay == NULL) { + fuidstart = (caddr_t)(lracl + 1) + xvatlen + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + } + +#ifdef TODO + error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr, + 0, 0, &vp, kcred, vflg, NULL, &vsec); +#else + panic("%s:%u: unsupported condition", __func__, __LINE__); +#endif + break; + case TX_MKDIR_ACL: + aclstart = (caddr_t)(lracl + 1); + fuidstart = (caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + /*FALLTHROUGH*/ + case TX_MKDIR_ACL_ATTR: + if (name == NULL) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr(lrattr, &xva); + } + 
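A note on the (int)lr->lr_common.lrc_txtype casts used in the switch statements of this function and its callers: TX_CI rides in the high bit of the 64-bit transaction type, so narrowing strips the flag and leaves the bare record type for the case labels. A sketch of that encoding, assuming TX_CI is the top bit as its use with these casts suggests (the record-type value is hypothetical, and the narrowing behaves as truncation on the usual two's-complement ABIs):

#include <stdint.h>
#include <stdio.h>

#define	EX_TX_CI	((uint64_t)1 << 63)	/* case-insensitive flag */
#define	EX_TX_CREATE	5			/* hypothetical record type */

int
main(void)
{
	uint64_t txtype = EX_TX_CREATE | EX_TX_CI;
	int vflg = 0;

	if (txtype & EX_TX_CI)
		vflg |= 0x1;		/* stand-in for FIGNORECASE */

	/* Narrowing to int drops the flag, leaving the bare type. */
	switch ((int)txtype) {
	case EX_TX_CREATE:
		printf("create record, vflg=%#x\n", vflg);
		break;
	default:
		printf("unknown record\n");
	}
	return (0);
}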
vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; + vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; + vsec.vsa_aclcnt = lracl->lr_aclcnt; + vsec.vsa_aclentsz = lracl->lr_acl_bytes; + vsec.vsa_aclflags = lracl->lr_acl_flags; + if (zfsvfs->z_fuid_replay == NULL) { + fuidstart = (caddr_t)(lracl + 1) + xvatlen + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + } +#ifdef TODO + error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr, + &vp, kcred, NULL, vflg, &vsec); +#else + panic("%s:%u: unsupported condition", __func__, __LINE__); +#endif + break; + default: + error = ENOTSUP; + } + +bail: + if (error == 0 && vp != NULL) + VN_RELE(vp); + + VN_RELE(ZTOV(dzp)); + + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; + + return (error); +} + static int zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) { - char *name = (char *)(lr + 1); /* name follows lr_create_t */ + char *name = NULL; /* location determined later */ char *link; /* symlink content follows name */ znode_t *dzp; vnode_t *vp = NULL; - vattr_t va; + xvattr_t xva; + int vflg = 0; + size_t lrsize = sizeof (lr_create_t); + lr_attr_t *lrattr; + void *start; + size_t xvatlen; + uint64_t txtype; struct componentname cn; int error; - if (byteswap) + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); + txtype = (int)lr->lr_common.lrc_txtype; + if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) + zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); + } + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); - zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID, + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); /* @@ -101,34 +443,89 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) * doesn't have either concept, so we smuggle the values inside * the vattr's otherwise unused va_ctime and va_nblocks fields. */ - ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime); - va.va_nblocks = lr->lr_gen; + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); + xva.xva_vattr.va_nblocks = lr->lr_gen; + + error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); + if (error != ENOENT) + goto out; + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + /* + * Symlinks don't have fuid info, and CIFS never creates + * symlinks. + * + * The _ATTR versions will grab the fuid info in their subcases. 
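Both the TX_CREATE_ACL and TX_MKDIR_ACL paths above locate their trailing sections by the same arithmetic: the optional xvattr section first, then the ACL bytes rounded up by ZIL_ACE_LENGTH(), then the FUID array, then the domain strings and name. A sketch of that offset computation, assuming ZIL_ACE_LENGTH() rounds up to an 8-byte boundary (all sizes hypothetical):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed behavior of ZIL_ACE_LENGTH: round up to an 8-byte boundary. */
#define	EX_ACE_LENGTH(x)	(((x) + 7) & ~(size_t)7)

int
main(void)
{
	size_t hdr = 192;	/* hypothetical sizeof (lr_acl_create_t) */
	size_t xvatlen = 40;	/* optional xvattr section, 0 if absent */
	size_t aclbytes = 52;	/* raw ACE payload, not yet aligned */
	size_t fuidcnt = 2;	/* trailing 64-bit log FUIDs */

	size_t acl_off = hdr + xvatlen;
	size_t fuid_off = acl_off + EX_ACE_LENGTH(aclbytes);
	size_t dom_off = fuid_off + fuidcnt * sizeof (uint64_t);

	printf("aces@%zu fuids@%zu domains/name@%zu\n",
	    acl_off, fuid_off, dom_off);
	return (0);
}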
+ */ + if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && + (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && + (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { + start = (lr + 1); + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + } - cn.cn_nameptr = name; cn.cn_cred = kcred; cn.cn_thread = curthread; cn.cn_flags = SAVENAME; vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); switch ((int)lr->lr_common.lrc_txtype) { + case TX_CREATE_ATTR: + lrattr = (lr_attr_t *)(caddr_t)(lr + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); + start = (caddr_t)(lr + 1) + xvatlen; + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + name = (char *)start; + + /*FALLTHROUGH*/ case TX_CREATE: - error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &va); + if (name == NULL) + name = (char *)start; + + cn.cn_nameptr = name; + error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/); break; + case TX_MKDIR_ATTR: + lrattr = (lr_attr_t *)(caddr_t)(lr + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); + start = (caddr_t)(lr + 1) + xvatlen; + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + name = (char *)start; + + /*FALLTHROUGH*/ case TX_MKDIR: - error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &va); + if (name == NULL) + name = (char *)(lr + 1); + + cn.cn_nameptr = name; + error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/); break; case TX_MKXATTR: - error = zfs_make_xattrdir(dzp, &va, &vp, kcred); + name = (char *)(lr + 1); + error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred); break; case TX_SYMLINK: + name = (char *)(lr + 1); link = name + strlen(name) + 1; - error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &va, link); + cn.cn_nameptr = name; + error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &xva.xva_vattr, link /*,vflg*/); break; default: error = ENOTSUP; } VOP_UNLOCK(ZTOV(dzp), 0); +out: if (error == 0 && vp != NULL) { VOP_UNLOCK(vp, 0); VN_RELE(vp); @@ -136,6 +533,9 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) VN_RELE(ZTOV(dzp)); + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; return (error); } @@ -147,6 +547,7 @@ zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) struct componentname cn; vnode_t *vp; int error; + int vflg = 0; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -154,7 +555,8 @@ zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); - bzero(&cn, sizeof(cn)); + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; cn.cn_nameptr = name; cn.cn_namelen = strlen(name); cn.cn_nameiop = DELETE; @@ -171,10 +573,10 @@ zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) switch ((int)lr->lr_common.lrc_txtype) { case TX_REMOVE: - error = VOP_REMOVE(ZTOV(dzp), vp, &cn); + error = VOP_REMOVE(ZTOV(dzp), vp, &cn /*,vflg*/); break; case TX_RMDIR: - error = VOP_RMDIR(ZTOV(dzp), vp, &cn); + error = VOP_RMDIR(ZTOV(dzp), vp, &cn /*,vflg*/); break; default: error = ENOTSUP; @@ -194,6 +596,7 @@ zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) znode_t *dzp, *zp; struct componentname cn; int error; + int vflg = 0; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -206,6 +609,8 @@ 
zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) return (error); } + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; cn.cn_nameptr = name; cn.cn_cred = kcred; cn.cn_thread = curthread; @@ -213,7 +618,7 @@ zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); - error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn); + error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn /*,vflg*/); VOP_UNLOCK(ZTOV(zp), 0); VOP_UNLOCK(ZTOV(dzp), 0); @@ -233,6 +638,7 @@ zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) vnode_t *svp, *tvp; kthread_t *td = curthread; int error; + int vflg = 0; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -245,9 +651,10 @@ zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) return (error); } + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; svp = tvp = NULL; - bzero(&scn, sizeof(scn)); scn.cn_nameptr = sname; scn.cn_namelen = strlen(sname); scn.cn_nameiop = DELETE; @@ -262,7 +669,6 @@ zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) goto fail; VOP_UNLOCK(svp, 0); - bzero(&tcn, sizeof(tcn)); tcn.cn_nameptr = tname; tcn.cn_namelen = strlen(tname); tcn.cn_nameiop = RENAME; @@ -279,7 +685,7 @@ zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) goto fail; } - error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn); + error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn /*,vflg*/); return (error); fail: if (svp != NULL) @@ -334,13 +740,21 @@ static int zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) { znode_t *zp; - vattr_t va; + xvattr_t xva; + vattr_t *vap = &xva.xva_vattr; vnode_t *vp; int error; + void *start; - if (byteswap) + xva_init(&xva); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); + if ((lr->lr_mask & AT_XVATTR) && + zfsvfs->z_version >= ZPL_VERSION_INITIAL) + zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); + } + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { /* * As we can log setattrs out of order, it's possible the @@ -352,35 +766,112 @@ zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) return (error); } - zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode, + zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); - va.va_size = lr->lr_size; - ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime); - ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime); + vap->va_size = lr->lr_size; + ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime); + ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime); + + /* + * Fill in xvattr_t portions if necessary. 
+ */ + + start = (lr_setattr_t *)(lr + 1); + if (vap->va_mask & AT_XVATTR) { + zfs_replay_xvattr((lr_attr_t *)start, &xva); + start = (caddr_t)start + + ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize); + } else + xva.xva_vattr.va_mask &= ~AT_XVATTR; + + zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); vp = ZTOV(zp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - error = VOP_SETATTR(vp, &va, kcred); + error = VOP_SETATTR(vp, vap, kcred); VOP_UNLOCK(vp, 0); + + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; VN_RELE(vp); return (error); } static int -zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) +zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap) { ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ -#ifdef TODO vsecattr_t vsa; + znode_t *zp; + int error; + + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + zfs_oldace_byteswap(ace, lr->lr_aclcnt); + } + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log acls out of order, it's possible the + * file has been removed. In this case just drop the acl + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + bzero(&vsa, sizeof (vsa)); + vsa.vsa_mask = VSA_ACE | VSA_ACECNT; + vsa.vsa_aclcnt = lr->lr_aclcnt; + vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt; + vsa.vsa_aclflags = 0; + vsa.vsa_aclentp = ace; + +#ifdef TODO + error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL); +#else + panic("%s:%u: unsupported condition", __func__, __LINE__); #endif + + VN_RELE(ZTOV(zp)); + + return (error); +} + +/* + * Replaying ACLs is complicated by FUID support. + * The log record may contain some optional data + * to be used for replaying FUIDs. These pieces + * are the actual FUIDs that were created initially. + * The FUID table index may no longer be valid and + * during zfs_create() a new index may be assigned. + * Because of this the log will contain the original + * domain+rid in order to create a new FUID. + * + * The individual ACEs may contain an ephemeral uid/gid which is no + * longer valid and will need to be replaced with an actual FUID.
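Several of the replay handlers here share one more convention worth making explicit: because records can describe objects that were later removed, a failed zfs_zget() with ENOENT is folded into success rather than aborting replay. A compact stand-alone sketch of that error-folding shape (the lookup helper is hypothetical):

#include <errno.h>
#include <stdio.h>

/* Hypothetical lookup: even object ids exist, odd ones were removed. */
static int
ex_zget(unsigned long id)
{
	return ((id & 1) ? ENOENT : 0);
}

static int
ex_replay_one(unsigned long id)
{
	int error;

	if ((error = ex_zget(id)) != 0) {
		/* The record may postdate the file's removal. */
		if (error == ENOENT)
			error = 0;
		return (error);
	}
	/* ... apply the logged change to the object here ... */
	return (0);
}

int
main(void)
{
	printf("present: %d removed: %d\n",
	    ex_replay_one(2), ex_replay_one(3));
	return (0);
}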
+ * + */ +static int +zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) +{ + ace_t *ace = (ace_t *)(lr + 1); + vsecattr_t vsa; znode_t *zp; int error; if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); - zfs_ace_byteswap(ace, lr->lr_aclcnt); + zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); + if (lr->lr_fuidcnt) { + byteswap_uint64_array((caddr_t)ace + + ZIL_ACE_LENGTH(lr->lr_acl_bytes), + lr->lr_fuidcnt * sizeof (uint64_t)); + } } if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { @@ -396,15 +887,30 @@ zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) #ifdef TODO bzero(&vsa, sizeof (vsa)); - vsa.vsa_mask = VSA_ACE | VSA_ACECNT; + vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; vsa.vsa_aclcnt = lr->lr_aclcnt; vsa.vsa_aclentp = ace; + vsa.vsa_aclentsz = lr->lr_acl_bytes; + vsa.vsa_aclflags = lr->lr_acl_flags; + + if (lr->lr_fuidcnt) { + void *fuidstart = (caddr_t)ace + + ZIL_ACE_LENGTH(lr->lr_acl_bytes); + + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, &fuidstart, + lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); + } + + error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL); - error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred); + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); #else error = EOPNOTSUPP; #endif + zfsvfs->z_fuid_replay = NULL; VN_RELE(ZTOV(zp)); return (error); @@ -426,5 +932,12 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_write, /* TX_WRITE */ zfs_replay_truncate, /* TX_TRUNCATE */ zfs_replay_setattr, /* TX_SETATTR */ + zfs_replay_acl_v0, /* TX_ACL_V0 */ zfs_replay_acl, /* TX_ACL */ + zfs_replay_create_acl, /* TX_CREATE_ACL */ + zfs_replay_create, /* TX_CREATE_ATTR */ + zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */ + zfs_replay_create_acl, /* TX_MKDIR_ACL */ + zfs_replay_create, /* TX_MKDIR_ATTR */ + zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c index 07ec0f6b6e90..f0a75b5fa0d7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c @@ -472,10 +472,14 @@ zfs_range_unlock_reader(znode_t *zp, rl_t *remove) */ if (remove->r_cnt == 1) { avl_remove(tree, remove); - if (remove->r_write_wanted) + if (remove->r_write_wanted) { cv_broadcast(&remove->r_wr_cv); - if (remove->r_read_wanted) + cv_destroy(&remove->r_wr_cv); + } + if (remove->r_read_wanted) { cv_broadcast(&remove->r_rd_cv); + cv_destroy(&remove->r_rd_cv); + } } else { ASSERT3U(remove->r_cnt, ==, 0); ASSERT3U(remove->r_write_wanted, ==, 0); @@ -501,10 +505,14 @@ zfs_range_unlock_reader(znode_t *zp, rl_t *remove) rl->r_cnt--; if (rl->r_cnt == 0) { avl_remove(tree, rl); - if (rl->r_write_wanted) + if (rl->r_write_wanted) { cv_broadcast(&rl->r_wr_cv); - if (rl->r_read_wanted) + cv_destroy(&rl->r_wr_cv); + } + if (rl->r_read_wanted) { cv_broadcast(&rl->r_rd_cv); + cv_destroy(&rl->r_rd_cv); + } kmem_free(rl, sizeof (rl_t)); } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 28f3293ec435..5becdb46a9f1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/systm.h> @@ -44,6 +42,7 @@ #include <sys/dmu.h> #include <sys/dsl_prop.h> #include <sys/dsl_dataset.h> +#include <sys/dsl_deleg.h> #include <sys/spa.h> #include <sys/zap.h> #include <sys/varargs.h> @@ -51,17 +50,47 @@ #include <sys/atomic.h> #include <sys/zfs_ioctl.h> #include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> #include <sys/sunddi.h> #include <sys/dnlc.h> +#include <sys/dmu_objset.h> +#include <sys/spa_boot.h> +#include <sys/vdev_impl.h> /* VDEV_BOOT_VERSION */ struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); + SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); + +int zfs_super_owner = 0; +SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, + "File system owner can perform privileged operation on his file systems"); + int zfs_debug_level = 0; TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level); SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0, "Debug level"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); +static int zfs_version_acl = ZFS_ACL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, + "ZFS_ACL_VERSION"); +static int zfs_version_dmu_backup_header = DMU_BACKUP_HEADER_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_header, CTLFLAG_RD, + &zfs_version_dmu_backup_header, 0, "DMU_BACKUP_HEADER_VERSION"); +static int zfs_version_dmu_backup_stream = DMU_BACKUP_STREAM_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_stream, CTLFLAG_RD, + &zfs_version_dmu_backup_stream, 0, "DMU_BACKUP_STREAM_VERSION"); +static int zfs_version_spa = SPA_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, + "SPA_VERSION"); +static int zfs_version_vdev_boot = VDEV_BOOT_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, vdev_boot, CTLFLAG_RD, + &zfs_version_vdev_boot, 0, "VDEV_BOOT_VERSION"); +static int zfs_version_zpl = ZPL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, + "ZPL_VERSION"); + static int zfs_mount(vfs_t *vfsp, kthread_t *td); static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td); static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td); @@ -82,7 +111,7 @@ static struct vfsops zfs_vfsops = { .vfs_fhtovp = zfs_fhtovp, }; -VFS_SET(zfs_vfsops, zfs, VFCF_JAIL); +VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); /* * We need to keep a count of active fs's. @@ -235,6 +264,27 @@ exec_changed_cb(void *arg, uint64_t newval) } } +/* + * The nbmand mount option can be changed at mount time. 
+ * We can't allow it to be toggled on live file systems or incorrect + * behavior may be seen from cifs clients + * + * This property isn't registered via dsl_prop_register(), but this callback + * will be called when a file system is first mounted + */ +static void +nbmand_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == FALSE) { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); + } else { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); + } +} + static void snapdir_changed_cb(void *arg, uint64_t newval) { @@ -244,64 +294,27 @@ snapdir_changed_cb(void *arg, uint64_t newval) } static void -acl_mode_changed_cb(void *arg, uint64_t newval) +vscan_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; - zfsvfs->z_acl_mode = newval; + zfsvfs->z_vscan = newval; } static void -acl_inherit_changed_cb(void *arg, uint64_t newval) +acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; - zfsvfs->z_acl_inherit = newval; + zfsvfs->z_acl_mode = newval; } -static int -zfs_refresh_properties(vfs_t *vfsp) +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) { - zfsvfs_t *zfsvfs = vfsp->vfs_data; - - /* - * Remount operations default to "rw" unless "ro" is explicitly - * specified. - */ - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { - readonly_changed_cb(zfsvfs, B_TRUE); - } else { - if (!dmu_objset_is_snapshot(zfsvfs->z_os)) - readonly_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) - return (EROFS); - } - - if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { - setuid_changed_cb(zfsvfs, B_FALSE); - } else { - if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) - setuid_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) - setuid_changed_cb(zfsvfs, B_TRUE); - } - - if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) - exec_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) - exec_changed_cb(zfsvfs, B_TRUE); - - if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) - atime_changed_cb(zfsvfs, B_TRUE); - else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) - atime_changed_cb(zfsvfs, B_FALSE); - - if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) - xattr_changed_cb(zfsvfs, B_TRUE); - else if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) - xattr_changed_cb(zfsvfs, B_FALSE); + zfsvfs_t *zfsvfs = arg; - return (0); + zfsvfs->z_acl_inherit = newval; } static int @@ -310,10 +323,12 @@ zfs_register_callbacks(vfs_t *vfsp) struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; + uint64_t nbmand; int readonly, do_readonly = FALSE; int setuid, do_setuid = FALSE; int exec, do_exec = FALSE; int xattr, do_xattr = FALSE; + int atime, do_atime = FALSE; int error = 0; ASSERT(vfsp); @@ -360,6 +375,34 @@ zfs_register_callbacks(vfs_t *vfsp) xattr = B_TRUE; do_xattr = B_TRUE; } + if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { + atime = B_FALSE; + do_atime = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { + atime = B_TRUE; + do_atime = B_TRUE; + } + + /* + * nbmand is a special property. It can only be changed at + * mount time. + * + * This is weird, but it is documented to only be changeable + * at mount time. 
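The nbmand handling above follows the general precedence rule of zfs_register_callbacks(): an explicit mount option always wins, and the stored dataset property is consulted only when neither the positive nor the negative option was given. A minimal sketch of that resolution order (helper names hypothetical; dsl_prop_get_integer() is the real fallback used above):

#include <stdbool.h>
#include <stdio.h>

/*
 * Explicit "on"/"off" mount options take precedence; only when neither
 * was given is the stored dataset property used.
 */
static bool
ex_resolve(bool opt_on, bool opt_off, bool dataset_prop)
{
	if (opt_off)
		return (false);
	if (opt_on)
		return (true);
	return (dataset_prop);
}

int
main(void)
{
	/* Property says on, but the user mounted with the "no" option. */
	printf("nbmand=%d\n", ex_resolve(false, true, true));
	/* No option either way: the property value stands. */
	printf("nbmand=%d\n", ex_resolve(false, false, true));
	return (0);
}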
+ */ + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { + nbmand = B_FALSE; + } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { + nbmand = B_TRUE; + } else { + char osname[MAXNAMELEN]; + + dmu_objset_name(os, osname); + if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, + NULL)) { + return (error); + } + } /* * Register property callbacks. @@ -386,6 +429,8 @@ zfs_register_callbacks(vfs_t *vfsp) "aclmode", acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "vscan", vscan_changed_cb, zfsvfs); if (error) goto unregister; @@ -400,6 +445,10 @@ zfs_register_callbacks(vfs_t *vfsp) exec_changed_cb(zfsvfs, exec); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); + if (do_atime) + atime_changed_cb(zfsvfs, atime); + + nbmand_changed_cb(zfsvfs, nbmand); return (0); @@ -419,14 +468,73 @@ unregister: (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); return (error); } static int -zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td) +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + int error; + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + /* + * Set the objset user_ptr to track its zfsvfs. + */ + mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. + */ + if (mounting) { + boolean_t readonly; + + /* + * During replay we remove the read only flag to + * allow replays to succeed. + */ + readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + + /* + * Parse and replay the intent log. 
+ */ + zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, + zfs_replay_vector, zfs_unlinked_drain); + + zfs_unlinked_drain(zfsvfs); + zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ + } + + if (!zil_disable) + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + return (0); +} + +static void +zfs_freezfsvfs(zfsvfs_t *zfsvfs) +{ + mutex_destroy(&zfsvfs->z_znodes_lock); + mutex_destroy(&zfsvfs->z_online_recv_lock); + list_destroy(&zfsvfs->z_all_znodes); + rrw_destroy(&zfsvfs->z_teardown_lock); + rw_destroy(&zfsvfs->z_teardown_inactive_lock); + rw_destroy(&zfsvfs->z_fuid_lock); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} + +static int +zfs_domount(vfs_t *vfsp, char *osname) { - cred_t *cr = td->td_ucred; uint64_t recordsize, readonly; int error = 0; int mode; @@ -449,9 +557,12 @@ zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td) zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); - rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL); + rrw_init(&zfsvfs->z_teardown_lock); + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL)) @@ -466,14 +577,13 @@ zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td) if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) goto out; + mode = DS_MODE_OWNER; if (readonly) - mode = DS_MODE_PRIMARY | DS_MODE_READONLY; - else - mode = DS_MODE_PRIMARY; + mode |= DS_MODE_READONLY; error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); if (error == EROFS) { - mode = DS_MODE_PRIMARY | DS_MODE_READONLY; + mode = DS_MODE_OWNER | DS_MODE_READONLY; error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); } @@ -481,34 +591,40 @@ zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td) if (error) goto out; - if (error = zfs_init_fs(zfsvfs, &zp, cr)) + if (error = zfs_init_fs(zfsvfs, &zp)) goto out; + /* + * Set features for file system. + */ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_use_fuids) { + vfs_set_feature(vfsp, VFSFT_XVATTR); + vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS); + vfs_set_feature(vfsp, VFSFT_ACLONCREATE); + } + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); + } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + } + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { - uint64_t xattr; + uint64_t pval; ASSERT(mode & DS_MODE_READONLY); atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); - if (error = dsl_prop_get_integer(osname, "xattr", &xattr, NULL)) + if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) goto out; - xattr_changed_cb(zfsvfs, xattr); + xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; } else { - error = zfs_register_callbacks(vfsp); - if (error) - goto out; - - zfs_unlinked_drain(zfsvfs); - - /* - * Parse and replay the intent log. 
- */ - zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, - zfs_replay_vector); - - if (!zil_disable) - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + error = zfsvfs_setup(zfsvfs, B_TRUE); } vfs_mountedfrom(vfsp, osname); @@ -519,15 +635,12 @@ out: if (error) { if (zfsvfs->z_os) dmu_objset_close(zfsvfs->z_os); - rw_destroy(&zfsvfs->z_um_lock); - mutex_destroy(&zfsvfs->z_znodes_lock); - kmem_free(zfsvfs, sizeof (zfsvfs_t)); + zfs_freezfsvfs(zfsvfs); } else { atomic_add_32(&zfs_active_fs_count, 1); } return (error); - } void @@ -567,6 +680,9 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs) VERIFY(dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "vscan", + vscan_changed_cb, zfsvfs) == 0); } } @@ -574,22 +690,94 @@ static int zfs_mount(vfs_t *vfsp, kthread_t *td) { - char *from; - int error; + vnode_t *mvp = vfsp->mnt_vnodecovered; + cred_t *cr = td->td_ucred; + char *osname; + int error = 0; + int canwrite; + + if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) + return (EINVAL); + + /* + * If full-owner-access is enabled and delegated administration is + * turned on, we must set nosuid. + */ + if (zfs_super_owner && + dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { + secpolicy_fs_mount_clearopts(cr, vfsp); + } + + /* + * Check for mount privilege? + * + * If we don't have privilege then see if + * we have local permission to allow it + */ + error = secpolicy_fs_mount(cr, mvp, vfsp); + if (error) { + error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr); + if (error == 0) { + vattr_t vattr; + + /* + * Make sure user is the owner of the mount point + * or has sufficient privileges. + */ + + vattr.va_mask = AT_UID; + + if (error = VOP_GETATTR(mvp, &vattr, cr)) { + goto out; + } + +#if 0 /* CHECK THIS! Is probably needed for zfs_suser. */ + if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && + VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { + error = EPERM; + goto out; + } +#else + if (error = secpolicy_vnode_owner(mvp, cr, vattr.va_uid)) { + goto out; + } + + if (error = VOP_ACCESS(mvp, VWRITE, cr, td)) { + goto out; + } +#endif + + secpolicy_fs_mount_clearopts(cr, vfsp); + } else { + goto out; + } + } + + /* + * Refuse to mount a filesystem if we are in a local zone and the + * dataset is not visible. + */ + if (!INGLOBALZONE(curthread) && + (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { + error = EPERM; + goto out; + } /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ - if (vfsp->vfs_flag & MS_REMOUNT) - return (zfs_refresh_properties(vfsp)); - - if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&from, NULL)) - return (EINVAL); + if (vfsp->vfs_flag & MS_REMOUNT) { + /* refresh mount options */ + zfs_unregister_callbacks(vfsp->vfs_data); + error = zfs_register_callbacks(vfsp); + goto out; + } DROP_GIANT(); - error = zfs_domount(vfsp, from, td); + error = zfs_domount(vfsp, osname); PICKUP_GIANT(); +out: return (error); } @@ -671,18 +859,131 @@ zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td) return (error); } +/* + * Tear down the zfsvfs::z_os. + * + * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' + * and 'z_teardown_inactive_lock' held.
+ */ +static int +zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) +{ + znode_t *zp; + + rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + + if (!unmounting) { + /* + * We purge the parent filesystem's vfsp as the parent + * filesystem and all of its snapshots have their vnode's + * v_vfsp set to the parent's filesystem's vfsp. Note, + * 'z_parent' is self referential for non-snapshots. + */ + (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); + } + + /* + * Close the zil. NB: Can't close the zil while zfs_inactive + * threads are blocked as zil_close can call zfs_inactive. + */ + if (zfsvfs->z_log) { + zil_close(zfsvfs->z_log); + zfsvfs->z_log = NULL; + } + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); + + /* + * If we are not unmounting (ie: online recv) and someone already + * unmounted this file system while we were doing the switcheroo, + * or a reopen of z_os failed then just bail out now. + */ + if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrw_exit(&zfsvfs->z_teardown_lock, FTAG); + return (EIO); + } + + /* + * At this point there are no vops active, and any new vops will + * fail with EIO since we have z_teardown_lock for writer (only + * relevant for forced unmount). + * + * Release all holds on dbufs. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; + zp = list_next(&zfsvfs->z_all_znodes, zp)) + if (zp->z_dbuf) { + ASSERT(ZTOV(zp)->v_count > 0); + zfs_znode_dmu_fini(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * If we are unmounting, set the unmounted flag and let new vops + * unblock. zfs_inactive will have the unmounted behavior, and all + * other vops will fail with EIO. + */ + if (unmounting) { + zfsvfs->z_unmounted = B_TRUE; + rrw_exit(&zfsvfs->z_teardown_lock, FTAG); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + } + + /* + * z_os will be NULL if there was an error in attempting to reopen + * zfsvfs, so just return as the properties had already been + * unregistered and cached data had been evicted before. + */ + if (zfsvfs->z_os == NULL) + return (0); + + /* + * Unregister properties. + */ + zfs_unregister_callbacks(zfsvfs); + + /* + * Evict cached data + */ + if (dmu_objset_evict_dbufs(zfsvfs->z_os)) { + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + (void) dmu_objset_evict_dbufs(zfsvfs->z_os); + } + + return (0); +} + /*ARGSUSED*/ static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td) { zfsvfs_t *zfsvfs = vfsp->vfs_data; + objset_t *os; cred_t *cr = td->td_ucred; int ret; - if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0) - return (ret); + if (fflag & MS_FORCE) { + /* TODO: Force unmount is not well implemented yet, so deny it. */ + ZFS_LOG(0, "Force unmount is not supported, removing FORCE flag."); + fflag &= ~MS_FORCE; + } - (void) dnlc_purge_vfsp(vfsp, 0); + ret = secpolicy_fs_unmount(cr, vfsp); + if (ret) { + ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), + ZFS_DELEG_PERM_MOUNT, cr); + if (ret) + return (ret); + } + /* + * We purge the parent filesystem's vfsp as the parent filesystem + * and all of its snapshots have their vnode's v_vfsp set to the + * parent's filesystem's vfsp. Note, 'z_parent' is self + * referential for non-snapshots.
+ */ + (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); /* * Unmount any snapshots mounted under .zfs before unmounting the @@ -714,33 +1015,63 @@ zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td) return (ret); } - if (fflag & MS_FORCE) { + if (!(fflag & MS_FORCE)) { + /* + * Check the number of active vnodes in the file system. + * Our count is maintained in the vfs structure, but the + * number is off by 1 to indicate a hold on the vfs + * structure itself. + * + * The '.zfs' directory maintains a reference of its + * own, and any active references underneath are + * reflected in the vnode count. + */ + if (zfsvfs->z_ctldir == NULL) { + if (vfsp->vfs_count > 1) + return (EBUSY); + } else { + if (vfsp->vfs_count > 2 || + zfsvfs->z_ctldir->v_count > 1) + return (EBUSY); + } + } else { MNT_ILOCK(vfsp); vfsp->mnt_kern_flag |= MNTK_UNMOUNTF; MNT_IUNLOCK(vfsp); - zfsvfs->z_unmounted1 = B_TRUE; + } + + VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); + os = zfsvfs->z_os; + + /* + * z_os will be NULL if there was an error in + * attempting to reopen zfsvfs. + */ + if (os != NULL) { + /* + * Unset the objset user_ptr. + */ + mutex_enter(&os->os->os_user_ptr_lock); + dmu_objset_set_user(os, NULL); + mutex_exit(&os->os->os_user_ptr_lock); /* - * Wait for all zfs threads to leave zfs. - * Grabbing a rwlock as reader in all vops and - * as writer here doesn't work because it too easy to get - * multiple reader enters as zfs can re-enter itself. - * This can lead to deadlock if there is an intervening - * rw_enter as writer. - * So a file system threads ref count (z_op_cnt) is used. - * A polling loop on z_op_cnt may seem inefficient, but - * - this saves all threads on exit from having to grab a - * mutex in order to cv_signal - * - only occurs on forced unmount in the rare case when - * there are outstanding threads within the file system. + * Finally release the objset */ - while (zfsvfs->z_op_cnt) { - delay(1); - } + dmu_objset_close(os); } - zfs_objset_close(zfsvfs); - VFS_RELE(vfsp); + /* + * We can now safely destroy the '.zfs' directory node. + */ + if (zfsvfs->z_ctldir != NULL) + zfsctl_destroy(zfsvfs); + if (zfsvfs->z_issnap) { + vnode_t *svp = vfsp->mnt_vnodecovered; + + ASSERT(svp->v_count == 2); + VN_RELE(svp); + } zfs_freevfs(vfsp); return (0); @@ -772,7 +1103,6 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) { - kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; uint64_t object = 0; @@ -824,7 +1154,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) ASSERT(*vpp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, - 0, NULL, NULL) == 0); + 0, NULL, NULL, NULL, NULL, NULL) == 0); } else { VN_HOLD(*vpp); } @@ -854,84 +1184,79 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) *vpp = ZTOV(zp); /* XXX: LK_RETRY? */ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - vnode_create_vobject(*vpp, zp->z_phys->zp_size, td); + vnode_create_vobject(*vpp, zp->z_phys->zp_size, curthread); ZFS_EXIT(zfsvfs); return (0); } -static void -zfs_objset_close(zfsvfs_t *zfsvfs) +/* + * Block out VOPs and close zfsvfs_t::z_os + * + * Note, if successful, then we return with the 'z_teardown_lock' and + * 'z_teardown_inactive_lock' write held. 
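The asymmetric locking described here is the whole point of the suspend/resume pair: zfs_suspend_fs() deliberately returns with the teardown locks write-held so that every vnode operation stays blocked until zfs_resume_fs() drops them. A loose user-space sketch of that discipline, with a plain POSIX rwlock standing in for the re-entrant rrw lock (compile with -lpthread; the rrw lock's re-entrancy and FTAG bookkeeping are not modeled):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t teardown = PTHREAD_RWLOCK_INITIALIZER;

static void
ex_vop(void)
{
	/* Every "VOP" takes the lock shared, as ZFS_ENTER() does. */
	pthread_rwlock_rdlock(&teardown);
	printf("vop ran\n");
	pthread_rwlock_unlock(&teardown);
}

static void
ex_suspend_fs(void)
{
	/* Returns with the lock held: callers stay blocked out. */
	pthread_rwlock_wrlock(&teardown);
	printf("suspended: objset closed, vops blocked\n");
}

static void
ex_resume_fs(void)
{
	/* Reopen the objset, re-establish znodes, then unblock. */
	printf("resumed: objset reopened\n");
	pthread_rwlock_unlock(&teardown);
}

int
main(void)
{
	ex_vop();
	ex_suspend_fs();	/* e.g. around an online recv */
	ex_resume_fs();
	ex_vop();
	return (0);
}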
+ */ +int +zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode) { - znode_t *zp, *nextzp; - objset_t *os = zfsvfs->z_os; + int error; - /* - * For forced unmount, at this point all vops except zfs_inactive - * are erroring EIO. We need to now suspend zfs_inactive threads - * while we are freeing dbufs before switching zfs_inactive - * to use behaviour without a objset. - */ - rw_enter(&zfsvfs->z_um_lock, RW_WRITER); + if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) + return (error); - /* - * Release all holds on dbufs - * Note, although we have stopped all other vop threads and - * zfs_inactive(), the dmu can callback via znode_pageout_func() - * which can zfs_znode_free() the znode. - * So we lock z_all_znodes; search the list for a held - * dbuf; drop the lock (we know zp can't disappear if we hold - * a dbuf lock; then regrab the lock and restart. - */ - mutex_enter(&zfsvfs->z_znodes_lock); - for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) { - nextzp = list_next(&zfsvfs->z_all_znodes, zp); - if (zp->z_dbuf_held) { - /* dbufs should only be held when force unmounting */ - zp->z_dbuf_held = 0; - mutex_exit(&zfsvfs->z_znodes_lock); - dmu_buf_rele(zp->z_dbuf, NULL); - /* Start again */ - mutex_enter(&zfsvfs->z_znodes_lock); - nextzp = list_head(&zfsvfs->z_all_znodes); - } - } - mutex_exit(&zfsvfs->z_znodes_lock); + *mode = zfsvfs->z_os->os_mode; + dmu_objset_name(zfsvfs->z_os, name); + dmu_objset_close(zfsvfs->z_os); - /* - * Unregister properties. - */ - if (!dmu_objset_is_snapshot(os)) - zfs_unregister_callbacks(zfsvfs); + return (0); +} - /* - * Switch zfs_inactive to behaviour without an objset. - * It just tosses cached pages and frees the znode & vnode. - * Then re-enable zfs_inactive threads in that new behaviour. - */ - zfsvfs->z_unmounted2 = B_TRUE; - rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */ +/* + * Reopen zfsvfs_t::z_os and release VOPs. + */ +int +zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) +{ + int err; - /* - * Close the zil. Can't close the zil while zfs_inactive - * threads are blocked as zil_close can call zfs_inactive. - */ - if (zfsvfs->z_log) { - zil_close(zfsvfs->z_log); - zfsvfs->z_log = NULL; - } + ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); + if (err) { + zfsvfs->z_os = NULL; + } else { + znode_t *zp; + + VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + + /* + * Attempt to re-establish all the active znodes with + * their dbufs. If a zfs_rezget() fails, then we'll let + * any potential callers discover that via ZFS_ENTER_VERIFY_VP + * when they try to use their znode. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + (void) zfs_rezget(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); - /* - * Evict all dbufs so that cached znodes will be freed - */ - if (dmu_objset_evict_dbufs(os, 1)) { - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - (void) dmu_objset_evict_dbufs(os, 0); } - /* - * Finally close the objset - */ - dmu_objset_close(os); + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrw_exit(&zfsvfs->z_teardown_lock, FTAG); + + if (err) { + /* + * Since we couldn't reopen zfsvfs::z_os, force + * unmount this file system. 
+ */ + if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) + (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); + } + return (err); } static void @@ -942,9 +1267,9 @@ zfs_freevfs(vfs_t *vfsp) for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); - rw_destroy(&zfsvfs->z_um_lock); - mutex_destroy(&zfsvfs->z_znodes_lock); - kmem_free(zfsvfs, sizeof (zfsvfs_t)); + + zfs_fuid_destroy(zfsvfs); + zfs_freezfsvfs(zfsvfs); atomic_add_32(&zfs_active_fs_count, -1); } @@ -957,7 +1282,7 @@ static void zfs_vnodes_adjust(void) { #ifdef __i386__ - int val; + int newdesiredvnodes; desiredvnodes_backup = desiredvnodes; @@ -966,10 +1291,11 @@ zfs_vnodes_adjust(void) * vntblinit(). If it is equal to desiredvnodes, it means that * it wasn't tuned by the administrator and we can tune it down. */ - val = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size / - (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); - if (desiredvnodes == val) - desiredvnodes = (3 * desiredvnodes) / 4; + newdesiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * + vm_kmem_size / (5 * (sizeof(struct vm_object) + + sizeof(struct vnode)))); + if (newdesiredvnodes == desiredvnodes) + desiredvnodes = (3 * newdesiredvnodes) / 4; #endif } @@ -986,20 +1312,20 @@ void zfs_init(void) { - printf("ZFS filesystem version " ZFS_VERSION_STRING "\n"); + printf("ZFS filesystem version " SPA_VERSION_STRING "\n"); /* - * Initialize .zfs directory structures + * Initialize znode cache, vnode ops, etc... */ - zfsctl_init(); + zfs_znode_init(); /* - * Initialize znode cache, vnode ops, etc... + * Initialize .zfs directory structures */ - zfs_znode_init(); + zfsctl_init(); /* * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ @@ -1019,3 +1345,95 @@ zfs_busy(void) { return (zfs_active_fs_count != 0); } + +int +zfs_set_version(const char *name, uint64_t newvers) +{ + int error; + objset_t *os; + dmu_tx_t *tx; + uint64_t curvers; + + /* + * XXX for now, require that the filesystem be unmounted. Would + * be nice to find the zfsvfs_t and just update that if + * possible. + */ + + if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) + return (EINVAL); + + error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os); + if (error) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &curvers); + if (error) + goto out; + if (newvers < curvers) { + error = EINVAL; + goto out; + } + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto out; + } + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, + &newvers, tx); + + spa_history_internal_log(LOG_DS_UPGRADE, + dmu_objset_spa(os), tx, CRED(), + "oldver=%llu newver=%llu dataset = %llu", curvers, newvers, + dmu_objset_id(os)); + dmu_tx_commit(tx); + +out: + dmu_objset_close(os); + return (error); +} + +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + const char *pname; + int error = ENOENT; + + /* + * Look up the file system's value for the property. For the + * version property, we look up a slightly different string.
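zfs_get_zplprop() below is a lookup-with-defaults pattern: try the stored ZAP value first, and on ENOENT substitute a per-property default instead of failing. A stand-alone sketch of the same shape (the lookup helper and default values are hypothetical; only ENOENT is translated, other errors propagate):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

enum { EX_PROP_VERSION, EX_PROP_CASE };

/* Stand-in for zap_lookup(): pretend no value was ever stored. */
static int
ex_lookup(int prop, uint64_t *value)
{
	(void)prop;
	(void)value;
	return (ENOENT);
}

static int
ex_get_zplprop(int prop, uint64_t *value)
{
	int error = ex_lookup(prop, value);

	if (error == ENOENT) {
		switch (prop) {
		case EX_PROP_VERSION:
			*value = 3;	/* hypothetical ZPL_VERSION */
			break;
		case EX_PROP_CASE:
			*value = 0;	/* case sensitive */
			break;
		default:
			return (error);	/* no default: stay ENOENT */
		}
		error = 0;
	}
	return (error);
}

int
main(void)
{
	uint64_t v;

	if (ex_get_zplprop(EX_PROP_VERSION, &v) == 0)
		printf("version defaults to %llu\n", (unsigned long long)v);
	return (0);
}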
+ */ + if (prop == ZFS_PROP_VERSION) + pname = ZPL_VERSION_STR; + else + pname = zfs_prop_to_name(prop); + + if (os != NULL) + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + default: + return (error); + } + error = 0; + } + return (error); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index 49ea690a977a..d37c90e981c3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -19,14 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Portions Copyright 2007 Jeremy Teo */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -46,7 +44,6 @@ #include <sys/cmn_err.h> #include <sys/errno.h> #include <sys/unistd.h> -#include <sys/zfs_vfsops.h> #include <sys/zfs_dir.h> #include <sys/zfs_acl.h> #include <sys/zfs_ioctl.h> @@ -61,8 +58,11 @@ #include <sys/sunddi.h> #include <sys/filio.h> #include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> #include <sys/dnlc.h> #include <sys/zfs_rlock.h> +#include <sys/extdirent.h> +#include <sys/kidmap.h> #include <sys/bio.h> #include <sys/buf.h> #include <sys/sf_buf.h> @@ -74,14 +74,16 @@ * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, - * and wait the the intent log to commit if it's is a synchronous operation. - * Morover, the vnode ops must work in both normal and log replay context. + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). - * A ZFS_EXIT(zfsvfs) is needed before all returns. + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). 
This is for 3 reasons: @@ -154,26 +156,41 @@ * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ + /* ARGSUSED */ static int -zfs_open(vnode_t **vpp, int flag, cred_t *cr) +zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(*vpp); + if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && + ((flag & FAPPEND) == 0)) { + return (EPERM); + } + + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && + zp->z_phys->zp_size > 0) + if (fs_vscan(*vpp, cr, 0) != 0) + return (EACCES); + /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); + return (0); } /* ARGSUSED */ static int -zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) +zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) { znode_t *zp = VTOZ(vp); /* Decrement the synchronous opens in the znode */ - if (flag & (FSYNC | FDSYNC)) + if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); /* @@ -182,6 +199,12 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) cleanlocks(vp, ddi_get_pid(), 0); cleanshares(vp, ddi_get_pid()); + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && + zp->z_phys->zp_size > 0) + VERIFY(fs_vscan(vp, cr, 1) == 0); + return (0); } @@ -231,31 +254,34 @@ zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, - int *rvalp) + int *rvalp, caller_context_t *ct) { offset_t off; int error; zfsvfs_t *zfsvfs; + znode_t *zp; switch (com) { - case _FIOFFS: + case _FIOFFS: return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ - case _FIOGDIO: - case _FIOSDIO: + case _FIOGDIO: + case _FIOSDIO: return (0); - case _FIO_SEEK_DATA: - case _FIO_SEEK_HOLE: + case _FIO_SEEK_DATA: + case _FIO_SEEK_HOLE: if (ddi_copyin((void *)data, &off, sizeof (off), flag)) return (EFAULT); - zfsvfs = VTOZ(vp)->z_zfsvfs; + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); @@ -474,6 +500,7 @@ offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ * and return buffer. * ioflag - SYNC flags; used to provide FRSYNC semantics. * cr - credentials of caller. + * ct - caller context * * OUT: uio - updated offset and range, buffer filled. * @@ -489,12 +516,19 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os = zfsvfs->z_os; + objset_t *os; ssize_t n, nbytes; int error; rl_t *rl; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + os = zfsvfs->z_os; + + if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { + ZFS_EXIT(zfsvfs); + return (EACCES); + } /* * Validate file offset @@ -554,8 +588,12 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) error = mappedread(vp, nbytes, uio); else error = dmu_read_uio(os, zp->z_id, uio, nbytes); - if (error) + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = EIO; break; + } n -= nbytes; } @@ -623,6 +661,7 @@ zfs_prefault_write(ssize_t n, struct uio *uio) * and data buffer. 
* ioflag - IO_APPEND flag set if in append mode. * cr - credentials of caller. + * ct - caller context (NFS/CIFS fem monitor only) * * OUT: uio - updated offset and range. * @@ -643,11 +682,12 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) uint64_t end_size; dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; offset_t woff; ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; + uint64_t pflags; int error; /* @@ -661,6 +701,20 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) limit = MAXOFFSET_T; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * If immutable or not appending then return EPERM + */ + pflags = zp->z_phys->zp_flags; + if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || + ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && + (uio->uio_loffset < zp->z_phys->zp_size))) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + zilog = zfsvfs->z_log; /* * Pre-fault the pages to ensure slow (eg NFS) pages @@ -808,15 +862,18 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * It would be nice to do this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. + * + * Note: we don't call zfs_fuid_map_id() here because + * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, + secpolicy_vnode_setid_retain(vp, cr, (zp->z_phys->zp_mode & S_ISUID) != 0 && zp->z_phys->zp_uid == 0) != 0) { - zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); + zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); } mutex_exit(&zp->z_acl_lock); @@ -872,7 +929,7 @@ zfs_get_done(dmu_buf_t *db, void *vzgd) dmu_buf_rele(db, vzgd); zfs_range_unlock(rl); VN_RELE(vp); - zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp))); + zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); VFS_UNLOCK_GIANT(vfslocked); } @@ -957,11 +1014,10 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) lr->lr_blkoff = off - boff; error = dmu_sync(zio, db, &lr->lr_blkptr, lr->lr_common.lrc_txg, zfs_get_done, zgd); - ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz); - if (error == 0) { - zil_add_vdev(zfsvfs->z_log, - DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr))); - } + ASSERT((error && error != EINPROGRESS) || + lr->lr_length <= zp->z_blksz); + if (error == 0) + zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); /* * If we get EINPROGRESS, then we need to wait for a * write IO initiated by dmu_sync() to complete before @@ -981,14 +1037,21 @@ out: /*ARGSUSED*/ static int -zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr) +zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, + caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); - error = zfs_zaccess_rwx(zp, mode, cr); + ZFS_VERIFY_ZP(zp); + + if (flag & V_ACE_MASK) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); + else + error = zfs_zaccess_rwx(zp, mode, flag, cr); + ZFS_EXIT(zfsvfs); return (error); } @@ -1003,6 +1066,9 @@ zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr) * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller.
+ * ct - caller context + * direntflags - directory lookup flags + * realpnp - returned pathname. * * OUT: vpp - vnode of located entry, NULL if not found. * @@ -1015,19 +1081,21 @@ zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr) /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, - int nameiop, cred_t *cr, kthread_t *td) + int nameiop, cred_t *cr, kthread_t *td, int flags) { - znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error; + int *direntflags = NULL; + void *realpnp = NULL; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zdp); *vpp = NULL; -#ifdef TODO if (flags & LOOKUP_XATTR) { +#ifdef TODO /* * If the xattr property is off, refuse the lookup request. */ @@ -1035,6 +1103,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, ZFS_EXIT(zfsvfs); return (EINVAL); } +#endif /* * We don't allow recursive attributes.. @@ -1054,14 +1123,15 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, * Do we have permission to get into attribute directory? */ - if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) { + if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, + B_FALSE, cr)) { VN_RELE(*vpp); + *vpp = NULL; } ZFS_EXIT(zfsvfs); return (error); } -#endif /* TODO */ if (dvp->v_type != VDIR) { ZFS_EXIT(zfsvfs); @@ -1072,13 +1142,19 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, * Check accessibility of directory. */ - if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) { + if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } - if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) { + if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); + if (error == 0) { /* * Convert device special files */ @@ -1162,6 +1238,8 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. + * ct - caller context + * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. 
* @@ -1172,22 +1250,52 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ + /* ARGSUSED */ static int zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, - vnode_t **vpp, cred_t *cr) + vnode_t **vpp, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - objset_t *os = zfsvfs->z_os; + zilog_t *zilog; + objset_t *os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; - uint64_t zoid; + zfs_acl_t *aclp = NULL; + zfs_fuid_info_t *fuidp = NULL; + void *vsecp = NULL; + int flag = 0; + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || (vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr)))) + return (EINVAL); ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } top: *vpp = NULL; @@ -1204,22 +1312,40 @@ top: error = 0; } else { /* possible VN_HOLD(zp) */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) { + int zflg = 0; + + if (flag & FIGNORECASE) + zflg |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL); + if (error) { if (strcmp(name, "..") == 0) error = EISDIR; ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); + return (error); + } + } + if (vsecp && aclp == NULL) { + error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); + if (error) { + ZFS_EXIT(zfsvfs); + if (dl) + zfs_dirent_unlock(dl); return (error); } } - - zoid = zp ? zp->z_id : -1ULL; if (zp == NULL) { + uint64_t txtype; + /* * Create a new file object and update the directory * to reference it. 
*/ - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { goto out; } @@ -1235,11 +1361,26 @@ top: tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || + IS_EPHEMERAL(crgetgid(cr))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, + FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); + } error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { zfs_dirent_unlock(dl); @@ -1251,14 +1392,23 @@ top: } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); return (error); } - zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); - ASSERT(zp->z_id == zoid); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); (void) zfs_link_create(dl, zp, tx, ZNEW); - zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + if (flag & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, fuidp, vap); + if (fuidp) + zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); } else { + int aflags = (flag & FAPPEND) ? V_APPEND : 0; + /* * A directory entry already exists for this name. */ @@ -1279,7 +1429,7 @@ top: /* * Verify requested access to file. */ - if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) { + if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { goto out; } @@ -1292,13 +1442,12 @@ top: */ if ((ZTOV(zp)->v_type == VREG) && (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { + /* we can't hold any locks when calling zfs_freesp() */ + zfs_dirent_unlock(dl); + dl = NULL; error = zfs_freesp(zp, 0, 0, mode, TRUE); - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { - /* NB: we already did dmu_tx_wait() */ - zfs_dirent_unlock(dl); - VN_RELE(ZTOV(zp)); - goto top; + if (error == 0) { + vnevent_create(ZTOV(zp), ct); } } } @@ -1325,6 +1474,8 @@ out: *vpp = svp; } } + if (aclp) + zfs_acl_free(aclp); ZFS_EXIT(zfsvfs); return (error); @@ -1336,6 +1487,8 @@ out: * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. 
+ * ct - caller context + * flags - case flags * * RETURN: 0 if success * error code if failure @@ -1344,28 +1497,45 @@ out: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ +/*ARGSUSED*/ static int -zfs_remove(vnode_t *dvp, char *name, cred_t *cr) +zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, + int flags) { znode_t *zp, *dzp = VTOZ(dvp); znode_t *xzp = NULL; vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; uint64_t acl_obj, xattr_obj; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; - boolean_t unlinked; + boolean_t unlinked, toobig = FALSE; + uint64_t txtype; + pathname_t *realnmp = NULL; + pathname_t realnm; int error; + int zflg = ZEXISTS; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) { + zflg |= ZCILOOK; + pn_alloc(&realnm); + realnmp = &realnm; + } top: /* * Attempt to lock directory; fail if entry doesn't exist. */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { + if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, realnmp)) { + if (realnmp) + pn_free(realnmp); ZFS_EXIT(zfsvfs); return (error); } @@ -1384,9 +1554,12 @@ top: goto out; } - vnevent_remove(vp); + vnevent_remove(vp, dvp, name, ct); - dnlc_remove(dvp, name); + if (realnmp) + dnlc_remove(dvp, realnmp->pn_buf); + else + dnlc_remove(dvp, name); may_delete_now = FALSE; @@ -1399,8 +1572,13 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); - if (may_delete_now) - dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + if (may_delete_now) { + toobig = + zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; + /* if the file is too big, only hold_free a token amount */ + dmu_tx_hold_free(tx, zp->z_id, 0, + (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); + } /* are there any extended attributes? */ if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { @@ -1425,6 +1603,8 @@ top: dmu_tx_abort(tx); goto top; } + if (realnmp) + pn_free(realnmp); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); @@ -1433,7 +1613,7 @@ top: /* * Remove the directory entry. */ - error = zfs_link_destroy(dl, zp, tx, 0, &unlinked); + error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); if (error) { dmu_tx_commit(tx); @@ -1442,7 +1622,7 @@ top: if (0 && unlinked) { VI_LOCK(vp); - delete_now = may_delete_now && + delete_now = may_delete_now && !toobig && vp->v_count == 1 && !vn_has_cached_data(vp) && zp->z_phys->zp_xattr == xattr_obj && zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; @@ -1469,21 +1649,26 @@ top: VI_UNLOCK(vp); mutex_exit(&zp->z_lock); zfs_znode_delete(zp, tx); - VFS_RELE(zfsvfs->z_vfs); } else if (unlinked) { zfs_unlinked_add(zp, tx); } - zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name); + txtype = TX_REMOVE; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name); dmu_tx_commit(tx); out: + if (realnmp) + pn_free(realnmp); + zfs_dirent_unlock(dl); if (!delete_now) { VN_RELE(vp); } else if (xzp) { - /* this rele delayed to prevent nesting transactions */ + /* this rele is delayed to prevent nesting transactions */ VN_RELE(ZTOV(xzp)); } @@ -1499,6 +1684,8 @@ out: * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. + * ct - caller context + * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. 
* @@ -1509,49 +1696,104 @@ out: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ +/*ARGSUSED*/ static int -zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) +zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, + caller_context_t *ct, int flags, vsecattr_t *vsecp) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; zfs_dirlock_t *dl; - uint64_t zoid = 0; + uint64_t txtype; dmu_tx_t *tx; int error; + zfs_acl_t *aclp = NULL; + zfs_fuid_info_t *fuidp = NULL; + int zf = ZNEW; ASSERT(vap->va_type == VDIR); + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))|| + IS_EPHEMERAL(crgetgid(cr)))) + return (EINVAL); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; if (dzp->z_phys->zp_flags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (EINVAL); } -top: - *vpp = NULL; + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + if (vap->va_mask & AT_XVATTR) + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } /* * First make sure the new directory doesn't exist. */ - if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) { +top: + *vpp = NULL; + + if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, + NULL, NULL)) { ZFS_EXIT(zfsvfs); return (error); } - if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) { + if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } + if (vsecp && aclp == NULL) { + error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); + if (error) { + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || + IS_EPHEMERAL(crgetgid(cr))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } + if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); error = dmu_tx_assign(tx, zfsvfs->z_assign); @@ -1564,13 +1806,18 @@ top: } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); return (error); } /* * Create new node. */ - zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); + + if (aclp) + zfs_acl_free(aclp); /* * Now put new name in parent dir. 
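
[Ed. note] The dmu_tx_hold_*() block above (and its twins in zfs_create() earlier, and in zfs_symlink() and zfs_setattr() later in this diff) is the same FUID reservation each time: if the filesystem has no FUID object yet, hold a new bonus buffer, a write of FUID_SIZE_ESTIMATE() bytes, and the master-node ZAP entry that will point at it; otherwise hold the existing object for rewrite. A hypothetical factoring, shown only to make the repeated pattern explicit; the commit itself keeps this inline, and every DMU call below appears verbatim in the diff.

    /* Hypothetical helper, not part of the commit. */
    static void
    zfs_fuid_tx_hold(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
    {
            if (zfsvfs->z_fuid_obj == 0) {
                    /* First ephemeral id: the FUID object doesn't exist yet. */
                    dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
                    dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
                        FUID_SIZE_ESTIMATE(zfsvfs));
                    dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
            } else {
                    /* Reserve room to rewrite the existing FUID table. */
                    dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
                    dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
                        FUID_SIZE_ESTIMATE(zfsvfs));
            }
    }
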
@@ -1579,7 +1826,13 @@ top: *vpp = ZTOV(zp); - zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname); + txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap); + + if (fuidp) + zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); zfs_dirent_unlock(dl); @@ -1597,6 +1850,8 @@ top: * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. + * ct - caller context + * flags - case flags * * RETURN: 0 if success * error code if failure @@ -1604,27 +1859,35 @@ top: * Timestamps: * dvp - ctime|mtime updated */ +/*ARGSUSED*/ static int -zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) +zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, + caller_context_t *ct, int flags) { znode_t *dzp = VTOZ(dvp); znode_t *zp; vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; + int zflg = ZEXISTS; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + if (flags & FIGNORECASE) + zflg |= ZCILOOK; top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { + if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL)) { ZFS_EXIT(zfsvfs); return (error); } @@ -1645,7 +1908,7 @@ top: goto out; } - vnevent_rmdir(vp); + vnevent_rmdir(vp, dvp, name, ct); /* * Grab a lock on the directory to make sure that no one is @@ -1683,10 +1946,14 @@ top: cache_purge(dvp); #endif - error = zfs_link_destroy(dl, zp, tx, 0, NULL); + error = zfs_link_destroy(dl, zp, tx, zflg, NULL); - if (error == 0) - zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name); + if (error == 0) { + uint64_t txtype = TX_RMDIR; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name); + } dmu_tx_commit(tx); @@ -1713,6 +1980,8 @@ out: * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. + * ct - caller context + * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * @@ -1734,6 +2003,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon { znode_t *zp = VTOZ(vp); iovec_t *iovp; + edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; @@ -1747,11 +2017,14 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon int outcount; int error; uint8_t prefetch; + boolean_t check_sysattrs; uint8_t type; int ncooks; u_long *cooks = NULL; + int flags = 0; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); /* * If we are not given an eof variable, @@ -1809,6 +2082,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon bufsize = bytes_wanted; odp = (struct dirent64 *)iovp->iov_base; } + eodp = (struct edirent *)odp; if (ncookies != NULL) { /* @@ -1819,6 +2093,19 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon *cookies = cooks; *ncookies = ncooks; } + /* + * If this VFS supports the system attribute view interface; and + * we're looking at an extended attribute directory; and we care + * about normalization conflicts on this vfs; then we must check + * for normalization conflicts with the sysattr name space.
+ */ +#ifdef TODO + check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && + (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && + (flags & V_RDDIR_ENTFLAGS); +#else + check_sysattrs = 0; +#endif /* * Transform to file-system independent format @@ -1827,20 +2114,24 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; + off64_t *next; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); + zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); + zap.za_normalization_conflict = 0; objnum = zp->z_phys->zp_parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); + zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { @@ -1870,8 +2161,21 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); + + if (check_sysattrs && !zap.za_normalization_conflict) { +#ifdef TODO + zap.za_normalization_conflict = + xattr_sysattr_casechk(zap.za_name); +#else + panic("%s:%u: TODO", __func__, __LINE__); +#endif + } } - reclen = DIRENT64_RECLEN(strlen(zap.za_name)); + + if (flags & V_RDDIR_ENTFLAGS) + reclen = EDIRENT_RECLEN(strlen(zap.za_name)); + else + reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? @@ -1886,16 +2190,31 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon } break; } - /* - * Add this entry: - */ - odp->d_ino = objnum; - odp->d_reclen = reclen; - odp->d_namlen = strlen(zap.za_name); - (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); - odp->d_type = type; + if (flags & V_RDDIR_ENTFLAGS) { + /* + * Add extended flag entry: + */ + eodp->ed_ino = objnum; + eodp->ed_reclen = reclen; + /* NOTE: ed_off is the offset for the *next* entry */ + next = &(eodp->ed_off); + eodp->ed_eflags = zap.za_normalization_conflict ? + ED_CASE_CONFLICT : 0; + (void) strncpy(eodp->ed_name, zap.za_name, + EDIRENT_NAMELEN(reclen)); + eodp = (edirent_t *)((intptr_t)eodp + reclen); + } else { + /* + * Add normal entry: + */ + odp->d_ino = objnum; + odp->d_reclen = reclen; + odp->d_namlen = strlen(zap.za_name); + (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); + odp->d_type = type; + odp = (dirent64_t *)((intptr_t)odp + reclen); + } outcount += reclen; - odp = (dirent64_t *)((intptr_t)odp + reclen); ASSERT(outcount <= bufsize); @@ -1956,26 +2275,34 @@ update: return (error); } +ulong_t zfs_fsync_sync_cnt = 4; + static int -zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr) +zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; + (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } + /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. - * flags - [UNUSED] + * If AT_XVATTR set, then optional attrs are requested + * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. + * ct - caller context * * OUT: vap - attribute values. 
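
[Ed. note] The zfs_getattr() hunk that follows speaks the xvattr request/return protocol: the caller marks a request bit per optional attribute (XVA_ISSET_REQ) and the filesystem sets the matching return bit (XVA_SET_RTN) only for attributes it actually filled in, so a caller can tell a legitimate zero value from an unanswered question. Below is a simplified, self-contained model of that protocol; the struct, bit values, and getattr_model() are invented for illustration and do not match the real xvattr_t layout.

    #include <stdint.h>
    #include <stdio.h>

    #define XAT_ARCHIVE     (1u << 0)       /* illustrative bit choices */
    #define ZFS_ARCHIVE     (1u << 0)       /* fake zp_flags bit */

    struct xva_model {
            uint32_t xva_reqmask;   /* caller: "please fetch these" */
            uint32_t xva_rtnmask;   /* callee: "these are now valid" */
            int      xoa_archive;
    };

    static void
    getattr_model(struct xva_model *x, uint64_t zp_flags)
    {
            if (x->xva_reqmask & XAT_ARCHIVE) {
                    x->xoa_archive = (zp_flags & ZFS_ARCHIVE) != 0;
                    x->xva_rtnmask |= XAT_ARCHIVE;
            }
            /* ...one such block per optional attribute, as in the diff... */
    }

    int
    main(void)
    {
            struct xva_model x = { .xva_reqmask = XAT_ARCHIVE };

            getattr_model(&x, ZFS_ARCHIVE);
            printf("archive=%d answered=%d\n", x.xoa_archive,
                (x.xva_rtnmask & XAT_ARCHIVE) != 0);
            return (0);
    }
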
* @@ -1983,54 +2310,170 @@ zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr) */ /* ARGSUSED */ static int -zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) +zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_phys_t *pzp = zp->z_phys; + znode_phys_t *pzp; + int error = 0; uint32_t blksize; u_longlong_t nblocks; - int error; + uint64_t links; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap = NULL; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + pzp = zp->z_phys; + + mutex_enter(&zp->z_lock); + + /* + * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. + * Also, if we are the owner don't bother, since owner should + * always be allowed to read basic attributes of file. + */ + if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) && + (pzp->zp_uid != crgetuid(cr))) { + if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, + skipaclchk, cr)) { + mutex_exit(&zp->z_lock); + ZFS_EXIT(zfsvfs); + return (error); + } + } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ - mutex_enter(&zp->z_lock); vap->va_type = IFTOVT(pzp->zp_mode); vap->va_mode = pzp->zp_mode & ~S_IFMT; - vap->va_uid = zp->z_phys->zp_uid; - vap->va_gid = zp->z_phys->zp_gid; + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); +// vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; vap->va_nodeid = zp->z_id; - vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX); /* nlink_t limit! */ + if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) + links = pzp->zp_links + 1; + else + links = pzp->zp_links; + vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ vap->va_size = pzp->zp_size; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_rdev = zfs_cmpldev(pzp->zp_rdev); vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ - ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); - ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); - ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); - ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime); - /* - * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. - * Also, if we are the owner don't bother, since owner should - * always be allowed to read basic attributes of file. + * Add in any requested optional attributes and the create time. + * Also set the corresponding bits in the returned attribute bitmap. 
*/ - if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) && - (zp->z_phys->zp_uid != crgetuid(cr))) { - if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) { - mutex_exit(&zp->z_lock); - ZFS_EXIT(zfsvfs); - return (error); + if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + xoap->xoa_archive = + ((pzp->zp_flags & ZFS_ARCHIVE) != 0); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + xoap->xoa_readonly = + ((pzp->zp_flags & ZFS_READONLY) != 0); + XVA_SET_RTN(xvap, XAT_READONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + xoap->xoa_system = + ((pzp->zp_flags & ZFS_SYSTEM) != 0); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + xoap->xoa_hidden = + ((pzp->zp_flags & ZFS_HIDDEN) != 0); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + xoap->xoa_nounlink = + ((pzp->zp_flags & ZFS_NOUNLINK) != 0); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + xoap->xoa_immutable = + ((pzp->zp_flags & ZFS_IMMUTABLE) != 0); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + xoap->xoa_appendonly = + ((pzp->zp_flags & ZFS_APPENDONLY) != 0); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + xoap->xoa_nodump = + ((pzp->zp_flags & ZFS_NODUMP) != 0); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + xoap->xoa_opaque = + ((pzp->zp_flags & ZFS_OPAQUE) != 0); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + xoap->xoa_av_quarantined = + ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + xoap->xoa_av_modified = + ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && + vp->v_type == VREG && + (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) { + size_t len; + dmu_object_info_t doi; + + /* + * Only VREG files have anti-virus scanstamps, so we + * won't conflict with symlinks in the bonus buffer. + */ + dmu_object_info_from_db(zp->z_dbuf, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + sizeof (znode_phys_t); + if (len <= doi.doi_bonus_size) { + /* + * pzp points to the start of the + * znode_phys_t. pzp + 1 points to the + * first byte after the znode_phys_t. + */ + (void) memcpy(xoap->xoa_av_scanstamp, + pzp + 1, + sizeof (xoap->xoa_av_scanstamp)); + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime); + XVA_SET_RTN(xvap, XAT_CREATETIME); } } + ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); + ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); + ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); + ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime); + mutex_exit(&zp->z_lock); dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks); @@ -2054,8 +2497,11 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) * * IN: vp - vnode of file to be modified. * vap - new attribute values. + * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. 
+ * ct - caller context * * RETURN: 0 if success * error code if failure @@ -2068,10 +2514,10 @@ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { - struct znode *zp = VTOZ(vp); - znode_phys_t *pzp = zp->z_phys; + znode_t *zp = VTOZ(vp); + znode_phys_t *pzp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; uint_t mask = vap->va_mask; @@ -2081,6 +2527,11 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, znode_t *attrzp; int need_policy = FALSE; int err; + zfs_fuid_info_t *fuidp = NULL; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap; + zfs_acl_t *aclp = NULL; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; if (mask == 0) return (0); @@ -2088,13 +2539,69 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, if (mask & AT_NOSET) return (EINVAL); - if (mask & AT_SIZE && vp->v_type == VDIR) + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + pzp = zp->z_phys; + zilog = zfsvfs->z_log; + + /* + * Make sure that if we have ephemeral uid/gid or xvattr specified + * that file system is at proper version level + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || + (mask & AT_XVATTR))) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + if (mask & AT_SIZE && vp->v_type == VDIR) { + ZFS_EXIT(zfsvfs); return (EISDIR); + } - if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) + if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { + ZFS_EXIT(zfsvfs); return (EINVAL); + } - ZFS_ENTER(zfsvfs); + /* + * If this is an xvattr_t, then get a pointer to the structure of + * optional attributes. If this is NULL, then we have a vattr_t. + */ + xoap = xva_getxoptattr(xvap); + + /* + * Immutable files can only alter immutable bit and atime + */ + if ((pzp->zp_flags & ZFS_IMMUTABLE) && + ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || + ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + /* + * Verify timestamps don't overflow 32 bits. + * ZFS can handle large timestamps, but 32bit syscalls can't + * handle times greater than 2039. This check should be removed + * once large timestamps are fully supported. + */ + if (mask & (AT_ATIME | AT_MTIME)) { + if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || + ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { + ZFS_EXIT(zfsvfs); + return (EOVERFLOW); + } + } top: attrzp = NULL; @@ -2109,7 +2616,7 @@ top: */ if (mask & AT_SIZE) { - err = zfs_zaccess(zp, ACE_WRITE_DATA, cr); + err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); @@ -2120,18 +2627,22 @@ top: * block if there are locks present... this * should be addressed in openat(). */ - do { - err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); - /* NB: we already did dmu_tx_wait() if necessary */ - } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT); + /* XXX - would it be OK to generate a log record here?
*/ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } } - if (mask & (AT_ATIME|AT_MTIME)) - need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr); + if (mask & (AT_ATIME|AT_MTIME) || + ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + skipaclchk, cr); if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); @@ -2151,7 +2662,8 @@ top: */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); - take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr); + take_group = (mask & AT_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and @@ -2165,11 +2677,12 @@ top: if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { - if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ - secpolicy_setid_clear(vap, cr); + secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; @@ -2181,12 +2694,38 @@ top: mutex_enter(&zp->z_lock); oldva.va_mode = pzp->zp_mode; - oldva.va_uid = zp->z_phys->zp_uid; - oldva.va_gid = zp->z_phys->zp_gid; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & AT_XVATTR) { + if ((need_policy == FALSE) && + (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) && + xoap->xoa_appendonly != + ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) && + xoap->xoa_nounlink != + ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) && + xoap->xoa_immutable != + ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_NODUMP) && + xoap->xoa_nodump != + ((pzp->zp_flags & ZFS_NODUMP) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) && + xoap->xoa_av_modified != + ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) || + ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) && + ((vp->v_type != VREG && xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) || + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || + (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + mutex_exit(&zp->z_lock); if (mask & AT_MODE) { - if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { @@ -2211,10 +2750,9 @@ top: if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; - } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, - (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp); + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); @@ -2232,25 +2770,58 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); + if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, 
zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } if (mask & AT_MODE) { uint64_t pmode = pzp->zp_mode; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - if (zp->z_phys->zp_acl.z_acl_extern_obj) - dmu_tx_hold_write(tx, - pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE); - else + if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (err); + } + if (pzp->zp_acl.z_acl_extern_obj) { + /* Are we upgrading ACL from old V0 format to new V1 */ + if (zfsvfs->z_version <= ZPL_VERSION_FUID && + pzp->zp_acl.z_acl_version == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, + pzp->zp_acl.z_acl_extern_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, + pzp->zp_acl.z_acl_extern_obj, 0, + aclp->z_acl_bytes); + } + } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, ZFS_ACL_SIZE(MAX_ACL_SIZE)); + 0, aclp->z_acl_bytes); + } } - if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) { - err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp); + if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) { + err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); if (err) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); return (err); } dmu_tx_hold_bonus(tx, attrzp->z_id); @@ -2260,6 +2831,12 @@ top: if (err) { if (attrzp) VN_RELE(ZTOV(attrzp)); + + if (aclp) { + zfs_acl_free(aclp); + aclp = NULL; + } + if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); @@ -2283,26 +2860,36 @@ top: mutex_enter(&zp->z_lock); if (mask & AT_MODE) { - err = zfs_acl_chmod_setattr(zp, new_mode, tx); + mutex_enter(&zp->z_acl_lock); + zp->z_phys->zp_mode = new_mode; + err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); ASSERT3U(err, ==, 0); + mutex_exit(&zp->z_acl_lock); } if (attrzp) mutex_enter(&attrzp->z_lock); if (mask & AT_UID) { - zp->z_phys->zp_uid = (uint64_t)vap->va_uid; + pzp->zp_uid = zfs_fuid_create(zfsvfs, + vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); if (attrzp) { - attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid; + attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs, + vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); } } if (mask & AT_GID) { - zp->z_phys->zp_gid = (uint64_t)vap->va_gid; + pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid, + cr, ZFS_GROUP, tx, &fuidp); if (attrzp) - attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid; + attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs, + vap->va_gid, cr, ZFS_GROUP, tx, &fuidp); } + if (aclp) + zfs_acl_free(aclp); + if (attrzp) mutex_exit(&attrzp->z_lock); @@ -2312,14 +2899,38 @@ top: if (mask & AT_MTIME) ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); + /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE) zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); else if (mask != 0) zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & AT_XVATTR)) { + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + size_t len; + dmu_object_info_t doi; + + ASSERT(vp->v_type == VREG); + + /* Grow the bonus buffer if necessary. 
*/ + dmu_object_info_from_db(zp->z_dbuf, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + sizeof (znode_phys_t); + if (len > doi.doi_bonus_size) + VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0); + } + zfs_xvattr_set(zp, xvap); + } if (mask != 0) - zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask); + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + if (fuidp) + zfs_fuid_info_free(fuidp); mutex_exit(&zp->z_lock); if (attrzp) @@ -2436,6 +3047,8 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) * tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. + * ct - caller context + * flags - case flags * * RETURN: 0 if success * error code if failure @@ -2443,25 +3056,31 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) * Timestamps: * sdvp,tdvp - ctime|mtime updated */ +/*ARGSUSED*/ static int -zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr) +zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, + caller_context_t *ct, int flags) { znode_t *tdzp, *szp, *tzp; znode_t *sdzp = VTOZ(sdvp); zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; vnode_t *realvp; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; - int cmp, serr, terr, error; + int cmp, serr, terr; + int error = 0; + int zflg = 0; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(sdzp); + zilog = zfsvfs->z_log; /* * Make sure we have the real vp for the target directory. */ - if (VOP_REALVP(tdvp, &realvp) == 0) + if (VOP_REALVP(tdvp, &realvp, ct) == 0) tdvp = realvp; if (tdvp->v_vfsp != sdvp->v_vfsp) { @@ -2470,6 +3089,16 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr) } tdzp = VTOZ(tdvp); + ZFS_VERIFY_ZP(tdzp); + if (zfsvfs->z_utf8 && u8_validate(tnm, + strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; + top: szp = NULL; tzp = NULL; @@ -2497,7 +3126,14 @@ top: } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { - cmp = strcmp(snm, tnm); + /* + * First compare the two name arguments without + * considering any case folding. + */ + int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); + + cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); + ASSERT(error == 0 || !zfsvfs->z_utf8); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument @@ -2508,13 +3144,49 @@ top: ZFS_EXIT(zfsvfs); return (0); } + /* + * If the file system is case-folding, then we may + * have some more checking to do. A case-folding file + * system is either supporting mixed case sensitivity + * access or is completely case-insensitive. Note + * that the file system is always case preserving. + * + * In mixed sensitivity mode case sensitive behavior + * is the default. FIGNORECASE must be used to + * explicitly request case insensitive behavior. + * + * If the source and target names provided differ only + * by case (e.g., a request to rename 'tim' to 'Tim'), + * we will treat this as a special case in the + * case-insensitive mode: as long as the source name + * is an exact match, we will allow this to proceed as + * a name-change request. 
+ */ + if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + (zfsvfs->z_case == ZFS_CASE_MIXED && + flags & FIGNORECASE)) && + u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, + &error) == 0) { + /* + * case preserving rename request, require exact + * name matches + */ + zflg |= ZCIEXACT; + zflg &= ~ZCILOOK; + } } + if (cmp < 0) { - serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); - terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); + serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, + ZEXISTS | zflg, NULL, NULL); + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); } else { - terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); - serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, zflg, NULL, NULL); + serr = zfs_dirent_lock(&sdl, + sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, + NULL, NULL); } if (serr) { @@ -2588,9 +3260,17 @@ top: } } - vnevent_rename_src(ZTOV(szp)); + vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); if (tzp) - vnevent_rename_dest(ZTOV(tzp)); + vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); + + /* + * notify the target directory if it is not the same + * as source directory. + */ + if (tdvp != sdvp) { + vnevent_rename_dest_dir(tdvp, ct); + } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ @@ -2622,15 +3302,22 @@ top: } if (tzp) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdl, tzp, tx, 0, NULL); + error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); if (error == 0) { error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error == 0) { + szp->z_phys->zp_flags |= ZFS_AV_MODIFIED; + error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); ASSERT(error == 0); - zfs_log_rename(zilog, tx, TX_RENAME, sdzp, - sdl->dl_name, tdzp, tdl->dl_name, szp); + + zfs_log_rename(zilog, tx, + TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0), + sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); + + /* Update path information for the target vnode */ + vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm)); } #ifdef FREEBSD_NAMECACHE if (error == 0) { @@ -2665,6 +3352,8 @@ out: * vap - Attributes of new entry. * target - Target path of new symlink. * cr - credentials of caller. + * ct - caller context + * flags - case flags * * RETURN: 0 if success * error code if failure @@ -2672,23 +3361,37 @@ out: * Timestamps: * dvp - ctime|mtime updated */ +/*ARGSUSED*/ static int -zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) +zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, + cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - uint64_t zoid; + zilog_t *zilog; int len = strlen(link); int error; + int zflg = ZNEW; + zfs_fuid_info_t *fuidp = NULL; + int flags = 0; ASSERT(vap->va_type == VLNK); ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + if (flags & FIGNORECASE) + zflg |= ZCILOOK; top: - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } @@ -2701,7 +3404,8 @@ top: /* * Attempt to lock directory; fail if entry already exists. 
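
[Ed. note] The rename case-folding rules spelled out in the comment above compress to a small decision: start from ZCILOOK when FIGNORECASE was passed, and when the two names are equal under folding on a case-folding filesystem, switch to an exact-case source lookup (ZCIEXACT) so that 'tim' -> 'Tim' behaves as a pure name change. A compact userland model follows; the flag values, the case-mode enum, and strcasecmp() (standing in for u8_strcmp() with the normalization flags) are illustrative assumptions.

    #include <stdio.h>
    #include <strings.h>

    enum { CASE_SENSITIVE, CASE_INSENSITIVE, CASE_MIXED };
    #define ZCILOOK         0x04    /* illustrative values */
    #define ZCIEXACT        0x08

    static int
    rename_zflg(int casemode, int ignorecase, const char *snm,
        const char *tnm)
    {
            int zflg = ignorecase ? ZCILOOK : 0;

            if ((casemode == CASE_INSENSITIVE ||
                (casemode == CASE_MIXED && ignorecase)) &&
                strcasecmp(snm, tnm) == 0) {
                    /* Equal under folding: require exact source match. */
                    zflg |= ZCIEXACT;
                    zflg &= ~ZCILOOK;
            }
            return (zflg);
    }

    int
    main(void)
    {
            /* 'tim' -> 'Tim' on a case-insensitive fs: 0x8 (ZCIEXACT). */
            printf("0x%x\n", rename_zflg(CASE_INSENSITIVE, 0, "tim", "Tim"));
            return (0);
    }
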
*/ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) { + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); + if (error) { ZFS_EXIT(zfsvfs); return (error); } @@ -2712,6 +3416,18 @@ top: dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); + if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { zfs_dirent_unlock(dl); @@ -2732,23 +3448,22 @@ top: * Put the link content into bonus buffer if it will fit; * otherwise, store it just like any other file data. */ - zoid = 0; if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { - zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp); if (len != 0) bcopy(link, zp->z_phys + 1, len); } else { dmu_buf_t *dbp; - zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); - + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp); /* * Nothing can access the znode yet so no locking needed * for growing the znode's blocksize. */ zfs_grow_blocksize(zp, len, tx); - VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp)); + VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, + zp->z_id, 0, FTAG, &dbp)); dmu_buf_will_dirty(dbp, tx); ASSERT3U(len, <=, dbp->db_size); @@ -2763,9 +3478,14 @@ top: (void) zfs_link_create(dl, zp, tx, ZNEW); out: if (error == 0) { - zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link); + uint64_t txtype = TX_SYMLINK; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *vpp = ZTOV(zp); } + if (fuidp) + zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); @@ -2782,6 +3502,7 @@ out: * IN: vp - vnode of symbolic link. * uio - structure to contain the link path. * cr - credentials of caller. + * ct - caller context * * OUT: uio - structure to contain the link path. * @@ -2793,7 +3514,7 @@ out: */ /* ARGSUSED */ static int -zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) +zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; @@ -2801,6 +3522,7 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) int error; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); bufsz = (size_t)zp->z_phys->zp_size; if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { @@ -2830,6 +3552,7 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller.
+ * ct - caller context * * RETURN: 0 if success * error code if failure @@ -2840,30 +3563,44 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) */ /* ARGSUSED */ static int -zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr) +zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, + caller_context_t *ct, int flags) { znode_t *dzp = VTOZ(tdvp); znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; vnode_t *realvp; int error; + int zf = ZNEW; + uid_t owner; ASSERT(tdvp->v_type == VDIR); ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; - if (VOP_REALVP(svp, &realvp) == 0) + if (VOP_REALVP(svp, &realvp, ct) == 0) svp = realvp; if (svp->v_vfsp != tdvp->v_vfsp) { ZFS_EXIT(zfsvfs); return (EXDEV); } - szp = VTOZ(svp); + ZFS_VERIFY_ZP(szp); + + if (zfsvfs->z_utf8 && u8_validate(name, + strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + top: /* * We do not support links between attributes and non-attributes @@ -2886,13 +3623,14 @@ top: return (EPERM); } - if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) && - secpolicy_basic_link(cr) != 0) { + owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER); + if (owner != crgetuid(cr) && + secpolicy_basic_link(svp, cr) != 0) { ZFS_EXIT(zfsvfs); return (EPERM); } - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } @@ -2900,7 +3638,8 @@ top: /* * Attempt to lock directory; fail if entry already exists. */ - if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) { + error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); + if (error) { ZFS_EXIT(zfsvfs); return (error); } @@ -2923,40 +3662,45 @@ top: error = zfs_link_create(dl, szp, tx, 0); - if (error == 0) - zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name); + if (error == 0) { + uint64_t txtype = TX_LINK; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_link(zilog, tx, txtype, dzp, szp, name); + } dmu_tx_commit(tx); zfs_dirent_unlock(dl); + if (error == 0) { + vnevent_link(svp, ct); + } + ZFS_EXIT(zfsvfs); return (error); } +/*ARGSUSED*/ void -zfs_inactive(vnode_t *vp, cred_t *cr) +zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; - rw_enter(&zfsvfs->z_um_lock, RW_READER); - if (zfsvfs->z_unmounted2) { - ASSERT(zp->z_dbuf_held == 0); - + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_dbuf == NULL) { + /* + * The fs has been unmounted, or we did a + * suspend/resume and this file no longer exists. 
+ */ mutex_enter(&zp->z_lock); VI_LOCK(vp); vp->v_count = 0; /* count arrives as 1 */ - VI_UNLOCK(vp); - if (zp->z_dbuf == NULL) { - mutex_exit(&zp->z_lock); - zfs_znode_free(zp); - } else { - mutex_exit(&zp->z_lock); - } - rw_exit(&zfsvfs->z_um_lock); - VFS_RELE(zfsvfs->z_vfs); + mutex_exit(&zp->z_lock); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + zfs_znode_free(zp); return; } @@ -2977,23 +3721,26 @@ zfs_inactive(vnode_t *vp, cred_t *cr) } zfs_zinactive(zp); - rw_exit(&zfsvfs->z_um_lock); + rw_exit(&zfsvfs->z_teardown_inactive_lock); } CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); +/*ARGSUSED*/ static int -zfs_fid(vnode_t *vp, fid_t *fidp) +zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint32_t gen = (uint32_t)zp->z_phys->zp_gen; + uint32_t gen; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + gen = (uint32_t)zp->z_gen; size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; fidp->fid_len = size; @@ -3030,7 +3777,8 @@ zfs_fid(vnode_t *vp, fid_t *fidp) } static int -zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) +zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) { znode_t *zp, *xzp; zfsvfs_t *zfsvfs; @@ -3051,9 +3799,10 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); *valp = 0; error = zfs_dirent_lock(&dl, zp, "", &xzp, - ZXATTR | ZEXISTS | ZSHARED); + ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); if (error == 0) { zfs_dirent_unlock(dl); if (!zfs_dirempty(xzp)) @@ -3086,14 +3835,17 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) #ifdef TODO /*ARGSUSED*/ static int -zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) +zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, + caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); - error = zfs_getacl(zp, vsecp, cr); + ZFS_VERIFY_ZP(zp); + error = zfs_getacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); @@ -3103,14 +3855,17 @@ zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) #ifdef TODO /*ARGSUSED*/ static int -zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) +zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, + caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); - error = zfs_setacl(zp, vsecp, cr); + ZFS_VERIFY_ZP(zp); + error = zfs_setacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); } @@ -3129,7 +3884,7 @@ zfs_freebsd_open(ap) znode_t *zp = VTOZ(vp); int error; - error = zfs_open(&vp, ap->a_mode, ap->a_cred); + error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); if (error == 0) vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td); return (error); @@ -3145,7 +3900,7 @@ zfs_freebsd_close(ap) } */ *ap; { - return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred)); + return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL)); } static int @@ -3161,7 +3916,7 @@ zfs_freebsd_ioctl(ap) { return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, - ap->a_fflag, ap->a_cred, NULL)); + ap->a_fflag, ap->a_cred, NULL, NULL)); } static int @@ -3194,13 +3949,13 @@ static int zfs_freebsd_access(ap) struct vop_access_args /* { struct vnode *a_vp; - accmode_t a_accmode; + int a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { - return (zfs_access(ap->a_vp, ap->a_accmode, 0, ap->a_cred)); + return (zfs_access(ap->a_vp, ap->a_accmode, 0, ap->a_cred, NULL)); } static int @@ -3218,7 +3973,7 @@ zfs_freebsd_lookup(ap) strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, - cnp->cn_cred, cnp->cn_thread)); + cnp->cn_cred, cnp->cn_thread, 0)); } static int @@ -3240,7 +3995,7 @@ zfs_freebsd_create(ap) mode = vap->va_mode & ALLPERMS; return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, - ap->a_vpp, cnp->cn_cred)); + ap->a_vpp, cnp->cn_cred, cnp->cn_thread)); } static int @@ -3255,7 +4010,7 @@ zfs_freebsd_remove(ap) ASSERT(ap->a_cnp->cn_flags & SAVENAME); return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr, - ap->a_cnp->cn_cred)); + ap->a_cnp->cn_cred, NULL, 0)); } static int @@ -3274,7 +4029,7 @@ zfs_freebsd_mkdir(ap) vattr_init_mask(vap); return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, - ap->a_cnp->cn_cred)); + ap->a_cnp->cn_cred, NULL, 0, NULL)); } static int @@ -3289,7 +4044,7 @@ zfs_freebsd_rmdir(ap) ASSERT(cnp->cn_flags & SAVENAME); - return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred)); + return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0)); } static int @@ -3318,7 +4073,7 @@ zfs_freebsd_fsync(ap) { vop_stdfsync(ap); - return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred)); + return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); } static int @@ -3327,10 +4082,45 @@ zfs_freebsd_getattr(ap) struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; + struct thread *a_td; } */ *ap; { + vattr_t *vap = ap->a_vap; + xvattr_t xvap; + u_long fflags = 0; + int error; + + xva_init(&xvap); + xvap.xva_vattr = *vap; + xvap.xva_vattr.va_mask |= AT_XVATTR; + + /* Convert chflags into ZFS-type flags. */ + /* XXX: what about SF_SETTABLE?. */ + XVA_SET_REQ(&xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&xvap, XAT_APPENDONLY); + XVA_SET_REQ(&xvap, XAT_NOUNLINK); + XVA_SET_REQ(&xvap, XAT_NODUMP); + error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); + if (error != 0) + return (error); - return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred)); + /* Convert ZFS xattr into chflags. 
+#define FLAG_CHECK(fflag, xflag, xfield) do { \ + if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ + fflags |= (fflag); \ +} while (0) + FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, + xvap.xva_xoptattrs.xoa_immutable); + FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, + xvap.xva_xoptattrs.xoa_appendonly); + FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, + xvap.xva_xoptattrs.xoa_nounlink); + FLAG_CHECK(UF_NODUMP, XAT_NODUMP, + xvap.xva_xoptattrs.xoa_nodump); +#undef FLAG_CHECK + *vap = xvap.xva_vattr; + vap->va_flags = fflags; + return (0); } static int
@@ -3339,18 +4129,46 @@ zfs_freebsd_setattr(ap) struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; + struct thread *a_td; } */ *ap; { vattr_t *vap = ap->a_vap; - - /* No support for FreeBSD's chflags(2). */ - if (vap->va_flags != VNOVAL) - return (EOPNOTSUPP); + xvattr_t xvap; + u_long fflags; + uint64_t zflags; vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; - return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL)); + xva_init(&xvap); + xvap.xva_vattr = *vap; + + if (vap->va_flags != VNOVAL) { + fflags = vap->va_flags; + if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0) + return (EOPNOTSUPP); + zflags = VTOZ(ap->a_vp)->z_phys->zp_flags; +
+#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ + if (((fflags & (fflag)) && !(zflags & (zflag))) || \ + ((zflags & (zflag)) && !(fflags & (fflag)))) { \ + XVA_SET_REQ(&xvap, (xflag)); \ + (xfield) = ((fflags & (fflag)) != 0); \ + } \ +} while (0) + /* Convert chflags into ZFS-type flags. */ + /* XXX: what about SF_SETTABLE? */ + FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, + xvap.xva_xoptattrs.xoa_immutable); + FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, + xvap.xva_xoptattrs.xoa_appendonly); + FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, + xvap.xva_xoptattrs.xoa_nounlink); + FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, + xvap.xva_xoptattrs.xoa_nodump); +#undef FLAG_CHANGE + } + return (zfs_setattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL)); } static int
@@ -3374,7 +4192,7 @@ zfs_freebsd_rename(ap) ASSERT(ap->a_tcnp->cn_flags & SAVENAME); error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp, - ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred); + ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0); if (tdvp == tvp) VN_RELE(tdvp);
@@ -3419,7 +4237,7 @@ zfs_freebsd_readlink(ap) } */ *ap; { - return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred)); + return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); } static int
@@ -3434,7 +4252,7 @@ zfs_freebsd_link(ap) ASSERT(cnp->cn_flags & SAVENAME); - return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); + return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); } static int
@@ -3446,10 +4264,23 @@ zfs_freebsd_inactive(ap) { vnode_t *vp = ap->a_vp; - zfs_inactive(vp, ap->a_td->td_ucred); + zfs_inactive(vp, ap->a_td->td_ucred, NULL); return (0); }
+static void +zfs_reclaim_complete(void *arg, int pending) +{ + znode_t *zp = arg; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_LOG(1, "zp=%p", zp); + ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); + zfs_znode_free(zp); +} +
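
zfs_reclaim_complete() above runs from a task queue when zfs_freebsd_reclaim() below cannot take the per-object mutex without risking deadlock and must defer znode destruction. A minimal sketch of that FreeBSD deferral pattern (the task and function names here are illustrative, not from the patch):

	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/taskqueue.h>

	static struct task example_task;

	static void
	example_task_fn(void *arg, int pending)
	{
		/* Runs later in the taskqueue_thread context, where sleeping is safe. */
	}

	static void
	example_defer(void *arg)
	{
		TASK_INIT(&example_task, 0, example_task_fn, arg);
		taskqueue_enqueue(taskqueue_thread, &example_task);
	}
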
static int zfs_freebsd_reclaim(ap) struct vop_reclaim_args /* {
@@ -3460,7 +4291,6 @@ zfs_freebsd_reclaim(ap) vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs; - int rele = 1; ASSERT(zp != NULL);
@@ -3471,24 +4301,34 @@ zfs_freebsd_reclaim(ap) mutex_enter(&zp->z_lock); ASSERT(zp->z_phys); - ASSERT(zp->z_dbuf_held); - zfsvfs = zp->z_zfsvfs; + ZTOV(zp) = NULL; if (!zp->z_unlinked) { - zp->z_dbuf_held = 0; - ZTOV(zp) = NULL; + int locked; + + zfsvfs = zp->z_zfsvfs; mutex_exit(&zp->z_lock); - dmu_buf_rele(zp->z_dbuf, NULL); + locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 : + ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id); + if (locked == 0) { + /* + * Lock can't be obtained due to deadlock possibility, + * so defer znode destruction. + */ + TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp); + taskqueue_enqueue(taskqueue_thread, &zp->z_task); + } else { + zfs_znode_dmu_fini(zp); + if (locked == 1) + ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); + zfs_znode_free(zp); + } } else { mutex_exit(&zp->z_lock); } VI_LOCK(vp); - if (vp->v_count > 0) - rele = 0; vp->v_data = NULL; ASSERT(vp->v_holdcnt >= 1); VI_UNLOCK(vp); - if (!zp->z_unlinked && rele) - VFS_RELE(zfsvfs->z_vfs); return (0); }
@@ -3500,7 +4340,7 @@ zfs_freebsd_fid(ap) } */ *ap; { - return (zfs_fid(ap->a_vp, (void *)ap->a_fid)); + return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); } static int
@@ -3514,7 +4354,7 @@ zfs_freebsd_pathconf(ap) ulong_t val; int error; - error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred); + error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); if (error == 0) *ap->a_retval = val; else if (error == EOPNOTSUPP)
@@ -3522,52 +4362,408 @@ zfs_freebsd_pathconf(ap) return (error); }
+/* + * FreeBSD's extended attribute namespaces determine the file name prefix + * used for the underlying ZFS extended attribute name: + * + * NAMESPACE PREFIX + * system freebsd:system: + * user (none; can be used to access ZFS fsattr(5) attributes + * created on Solaris) + */ +static int +zfs_create_attrname(int attrnamespace, const char *name, char *attrname, + size_t size) +{ + const char *namespace, *prefix, *suffix; + + /* We don't allow the '/' character in an attribute name. */ + if (strchr(name, '/') != NULL) + return (EINVAL); + /* We don't allow attribute names that start with the "freebsd:" prefix. */ + if (strncmp(name, "freebsd:", 8) == 0) + return (EINVAL); + + bzero(attrname, size); + + switch (attrnamespace) { + case EXTATTR_NAMESPACE_USER: +#if 0 + prefix = "freebsd:"; + namespace = EXTATTR_NAMESPACE_USER_STRING; + suffix = ":"; +#else + /* + * This is the default namespace by which we can access all + * attributes created on Solaris. + */ + prefix = namespace = suffix = ""; +#endif + break; + case EXTATTR_NAMESPACE_SYSTEM: + prefix = "freebsd:"; + namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; + suffix = ":"; + break; + case EXTATTR_NAMESPACE_EMPTY: + default: + return (EINVAL); + } + if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, + name) >= size) { + return (ENAMETOOLONG); + } + return (0); +} +
+/* + * Vnode operation to retrieve a named extended attribute.
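 *
 * To illustrate the mapping performed by zfs_create_attrname() above
 * (the attribute names "md5" and "myattr" are hypothetical):
 *
 *	zfs_create_attrname(EXTATTR_NAMESPACE_SYSTEM, "md5", buf, size)
 *		stores "freebsd:system:md5" in buf
 *	zfs_create_attrname(EXTATTR_NAMESPACE_USER, "myattr", buf, size)
 *		stores "myattr" in buf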
+ */ +static int +zfs_getextattr(struct vop_getextattr_args *ap) +/* +vop_getextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + struct vattr va; + vnode_t *xvp = NULL, *vp; + int error, flags; + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof(attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + flags = FREAD; + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname, + xvp, td); + error = vn_open_cred(&nd, &flags, 0, ap->a_cred, NULL); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (ap->a_size != NULL) { + error = VOP_GETATTR(vp, &va, ap->a_cred); + if (error == 0) + *ap->a_size = (size_t)va.va_size; + } else if (ap->a_uio != NULL) + error = VOP_READ(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred); + + VOP_UNLOCK(vp, 0); + vn_close(vp, flags, ap->a_cred, td); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Vnode operation to remove a named attribute. + */ +int +zfs_deleteextattr(struct vop_deleteextattr_args *ap) +/* +vop_deleteextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + struct vattr va; + vnode_t *xvp = NULL, *vp; + int error, flags; + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof(attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE, + UIO_SYSSPACE, attrname, xvp, td); + error = namei(&nd); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + VOP_LEASE(nd.ni_dvp, td, ap->a_cred, LEASE_WRITE); + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); + + vput(nd.ni_dvp); + if (vp == nd.ni_dvp) + vrele(vp); + else + vput(vp); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Vnode operation to set a named attribute. 
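 *
 * From userland this path is reached through the extattr(2) family; a
 * hypothetical call (file path, attribute name, and buffer are
 * illustrative only) would be:
 *
 *	extattr_set_file("/tank/file", EXTATTR_NAMESPACE_USER,
 *	    "myattr", buf, buflen);
 *
 * The attribute ends up stored as a regular file of that name inside
 * the file's hidden extended-attribute directory.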
+ */ +static int +zfs_setextattr(struct vop_setextattr_args *ap) +/* +vop_setextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + struct vattr va; + vnode_t *xvp = NULL, *vp; + int error, flags; + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof(attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + flags = FFLAGS(O_WRONLY | O_CREAT); + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname, + xvp, td); + error = vn_open_cred(&nd, &flags, 0600, ap->a_cred, NULL); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + VOP_LEASE(vp, td, ap->a_cred, LEASE_WRITE); + VATTR_NULL(&va); + va.va_size = 0; + error = VOP_SETATTR(vp, &va, ap->a_cred); + if (error == 0) + VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred); + + VOP_UNLOCK(vp, 0); + vn_close(vp, flags, ap->a_cred, td); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Vnode operation to retrieve extended attributes on a vnode. + */ +static int +zfs_listextattr(struct vop_listextattr_args *ap) +/* +vop_listextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrprefix[16]; + u_char dirbuf[sizeof(struct dirent)]; + struct dirent *dp; + struct iovec aiov; + struct uio auio, *uio = ap->a_uio; + size_t *sizep = ap->a_size; + size_t plen; + vnode_t *xvp = NULL, *vp; + int done, error, eof, pos; + + error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, + sizeof(attrprefix)); + if (error != 0) + return (error); + plen = strlen(attrprefix); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE, UIO_SYSSPACE, + ".", xvp, td); + error = namei(&nd); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_rw = UIO_READ; + auio.uio_offset = 0; + + if (sizep != NULL) + *sizep = 0; + + do { + u_char nlen; + + aiov.iov_base = (void *)dirbuf; + aiov.iov_len = sizeof(dirbuf); + auio.uio_resid = sizeof(dirbuf); + error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); + done = sizeof(dirbuf) - auio.uio_resid; + if (error != 0) + break; + for (pos = 0; pos < done;) { + dp = (struct dirent *)(dirbuf + pos); + pos += dp->d_reclen; + /* + * XXX: Temporarily we also accept DT_UNKNOWN, as this + * is what we get when attribute was created on Solaris. 
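 *
 * For reference, a consumer-side sketch of the list format emitted
 * below (one length byte, then the name, repeated; buf and len are
 * illustrative):
 *
 *	for (pos = 0; pos < len; ) {
 *		u_char nlen = buf[pos++];
 *		printf("%.*s\n", (int)nlen, buf + pos);
 *		pos += nlen;
 *	}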
+ */ + if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) + continue; + if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) + continue; + else if (strncmp(dp->d_name, attrprefix, plen) != 0) + continue; + nlen = dp->d_namlen - plen; + if (sizep != NULL) + *sizep += 1 + nlen; + else if (uio != NULL) { + /* + * Format of extattr name entry is one byte for + * length and the rest for name. + */ + error = uiomove(&nlen, 1, uio->uio_rw, uio); + if (error == 0) { + error = uiomove(dp->d_name + plen, nlen, + uio->uio_rw, uio); + } + if (error != 0) + break; + } + } + } while (!eof && error == 0); + + vput(vp); + ZFS_EXIT(zfsvfs); + + return (error); +} + struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_vnodeops = { - .vop_default = &default_vnodeops, - .vop_inactive = zfs_freebsd_inactive, - .vop_reclaim = zfs_freebsd_reclaim, - .vop_access = zfs_freebsd_access, + .vop_default = &default_vnodeops, + .vop_inactive = zfs_freebsd_inactive, + .vop_reclaim = zfs_freebsd_reclaim, + .vop_access = zfs_freebsd_access, #ifdef FREEBSD_NAMECACHE - .vop_lookup = vfs_cache_lookup, - .vop_cachedlookup = zfs_freebsd_lookup, + .vop_lookup = vfs_cache_lookup, + .vop_cachedlookup = zfs_freebsd_lookup, #else - .vop_lookup = zfs_freebsd_lookup, + .vop_lookup = zfs_freebsd_lookup, #endif - .vop_getattr = zfs_freebsd_getattr, - .vop_setattr = zfs_freebsd_setattr, - .vop_create = zfs_freebsd_create, - .vop_mknod = zfs_freebsd_create, - .vop_mkdir = zfs_freebsd_mkdir, - .vop_readdir = zfs_freebsd_readdir, - .vop_fsync = zfs_freebsd_fsync, - .vop_open = zfs_freebsd_open, - .vop_close = zfs_freebsd_close, - .vop_rmdir = zfs_freebsd_rmdir, - .vop_ioctl = zfs_freebsd_ioctl, - .vop_link = zfs_freebsd_link, - .vop_symlink = zfs_freebsd_symlink, - .vop_readlink = zfs_freebsd_readlink, - .vop_read = zfs_freebsd_read, - .vop_write = zfs_freebsd_write, - .vop_remove = zfs_freebsd_remove, - .vop_rename = zfs_freebsd_rename, - .vop_pathconf = zfs_freebsd_pathconf, - .vop_bmap = VOP_EOPNOTSUPP, - .vop_fid = zfs_freebsd_fid, + .vop_getattr = zfs_freebsd_getattr, + .vop_setattr = zfs_freebsd_setattr, + .vop_create = zfs_freebsd_create, + .vop_mknod = zfs_freebsd_create, + .vop_mkdir = zfs_freebsd_mkdir, + .vop_readdir = zfs_freebsd_readdir, + .vop_fsync = zfs_freebsd_fsync, + .vop_open = zfs_freebsd_open, + .vop_close = zfs_freebsd_close, + .vop_rmdir = zfs_freebsd_rmdir, + .vop_ioctl = zfs_freebsd_ioctl, + .vop_link = zfs_freebsd_link, + .vop_symlink = zfs_freebsd_symlink, + .vop_readlink = zfs_freebsd_readlink, + .vop_read = zfs_freebsd_read, + .vop_write = zfs_freebsd_write, + .vop_remove = zfs_freebsd_remove, + .vop_rename = zfs_freebsd_rename, + .vop_pathconf = zfs_freebsd_pathconf, + .vop_bmap = VOP_EOPNOTSUPP, + .vop_fid = zfs_freebsd_fid, + .vop_getextattr = zfs_getextattr, + .vop_deleteextattr = zfs_deleteextattr, + .vop_setextattr = zfs_setextattr, + .vop_listextattr = zfs_listextattr, }; struct vop_vector zfs_fifoops = { - .vop_default = &fifo_specops, - .vop_fsync = VOP_PANIC, - .vop_access = zfs_freebsd_access, - .vop_getattr = zfs_freebsd_getattr, - .vop_inactive = zfs_freebsd_inactive, - .vop_read = VOP_PANIC, - .vop_reclaim = zfs_freebsd_reclaim, - .vop_setattr = zfs_freebsd_setattr, - .vop_write = VOP_PANIC, - .vop_fid = zfs_freebsd_fid, + .vop_default = &fifo_specops, + .vop_fsync = VOP_PANIC, + .vop_access = zfs_freebsd_access, + .vop_getattr = zfs_freebsd_getattr, + .vop_inactive = zfs_freebsd_inactive, + .vop_read = VOP_PANIC, + .vop_reclaim = zfs_freebsd_reclaim, + 
.vop_setattr = zfs_freebsd_setattr, + .vop_write = VOP_PANIC, + .vop_fid = zfs_freebsd_fid, }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c index a964ec257f30..86838df837f2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -19,14 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Portions Copyright 2007 Jeremy Teo */ -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef _KERNEL #include <sys/types.h> #include <sys/param.h> @@ -35,11 +33,12 @@ #include <sys/sysmacros.h> #include <sys/resource.h> #include <sys/mntent.h> +#include <sys/u8_textprep.h> +#include <sys/dsl_dataset.h> #include <sys/vfs.h> #include <sys/vnode.h> #include <sys/file.h> #include <sys/kmem.h> -#include <sys/cmn_err.h> #include <sys/errno.h> #include <sys/unistd.h> #include <sys/atomic.h> @@ -47,7 +46,9 @@ #include <sys/zfs_acl.h> #include <sys/zfs_ioctl.h> #include <sys/zfs_rlock.h> +#include <sys/zfs_fuid.h> #include <sys/fs/zfs.h> +#include <sys/kidmap.h> #endif /* _KERNEL */ #include <sys/dmu.h> @@ -57,26 +58,53 @@ #include <sys/zfs_znode.h> #include <sys/refcount.h> +#include "zfs_prop.h" + /* Used by fstat(1). */ SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t), "sizeof(znode_t)"); /* + * Define ZNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. + */ +#ifdef DEBUG +#define ZNODE_STATS +#endif /* DEBUG */ + +#ifdef ZNODE_STATS +#define ZNODE_STAT_ADD(stat) ((stat)++) +#else +#define ZNODE_STAT_ADD(stat) /* nothing */ +#endif /* ZNODE_STATS */ + +#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) +#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) + +/* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL -struct kmem_cache *znode_cache = NULL; +static kmem_cache_t *znode_cache = NULL; /*ARGSUSED*/ static void -znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr) +znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) { +#if 1 /* XXXPJD: From OpenSolaris. */ + /* + * We should never drop all dbuf refs without first clearing + * the eviction callback. + */ + panic("evicting znode %p\n", user_ptr); +#else /* XXXPJD */ znode_t *zp = user_ptr; vnode_t *vp; mutex_enter(&zp->z_lock); + zp->z_dbuf = NULL; vp = ZTOV(zp); if (vp == NULL) { mutex_exit(&zp->z_lock); @@ -85,16 +113,15 @@ znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr) ZTOV(zp) = NULL; vhold(vp); mutex_exit(&zp->z_lock); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); vrecycle(vp, curthread); VOP_UNLOCK(vp, 0); vdrop(vp); zfs_znode_free(zp); } else { - /* signal force unmount that this znode can be freed */ - zp->z_dbuf = NULL; mutex_exit(&zp->z_lock); } +#endif } extern struct vop_vector zfs_vnodeops; @@ -107,24 +134,29 @@ extern struct vop_vector zfs_fifoops; * 'cdrarg' is defined at kmem_cache_create() time. 
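 *
 * For reference, the cache itself is created in zfs_znode_init() below;
 * a sketch of that registration (the NULL constructor there is
 * deliberate, since the constructor is currently invoked by hand from
 * zfs_znode_alloc() and zfs_create_fs()):
 *
 *	znode_cache = kmem_cache_create("zfs_znode_cache",
 *	    sizeof (znode_t), 0, NULL, zfs_znode_cache_destructor,
 *	    NULL, NULL, NULL, 0);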
*/ static int -zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags) +zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { znode_t *zp = buf; vnode_t *vp; - vfs_t *vfsp = cdrarg; + vfs_t *vfsp = arg; int error; - if (cdrarg != NULL) { - error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp); - ASSERT(error == 0); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - zp->z_vnode = vp; - vp->v_data = (caddr_t)zp; - VN_LOCK_AREC(vp); - VN_LOCK_ASHARE(vp); - } else { - zp->z_vnode = NULL; - } + POINTER_INVALIDATE(&zp->z_zfsvfs); + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + ASSERT(vfsp != NULL); + + error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp); + if (error != 0 && (kmflags & KM_NOSLEEP)) + return (-1); + ASSERT(error == 0); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + zp->z_vnode = vp; + vp->v_data = (caddr_t)zp; + VN_LOCK_AREC(vp); + VN_LOCK_ASHARE(vp); + + list_link_init(&zp->z_link_node); + mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); @@ -135,29 +167,189 @@ zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags) avl_create(&zp->z_range_avl, zfs_range_compare, sizeof (rl_t), offsetof(rl_t, r_node)); - zp->z_dbuf_held = 0; - zp->z_dirlocks = 0; + zp->z_dbuf = NULL; + zp->z_dirlocks = NULL; return (0); } /*ARGSUSED*/ static void -zfs_znode_cache_destructor(void *buf, void *cdarg) +zfs_znode_cache_destructor(void *buf, void *arg) { znode_t *zp = buf; - ASSERT(zp->z_dirlocks == 0); + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + ASSERT(ZTOV(zp) == NULL); + vn_free(ZTOV(zp)); + ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); rw_destroy(&zp->z_map_lock); rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); - mutex_destroy(&zp->z_range_lock); avl_destroy(&zp->z_range_avl); + mutex_destroy(&zp->z_range_lock); + + ASSERT(zp->z_dbuf == NULL); + ASSERT(zp->z_dirlocks == NULL); +} + +#ifdef ZNODE_STATS +static struct { + uint64_t zms_zfsvfs_invalid; + uint64_t zms_zfsvfs_unmounted; + uint64_t zms_zfsvfs_recheck_invalid; + uint64_t zms_obj_held; + uint64_t zms_vnode_locked; + uint64_t zms_not_only_dnlc; +} znode_move_stats; +#endif /* ZNODE_STATS */ + +#if defined(sun) +static void +zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) +{ + vnode_t *vp; + + /* Copy fields. */ + nzp->z_zfsvfs = ozp->z_zfsvfs; + + /* Swap vnodes. */ + vp = nzp->z_vnode; + nzp->z_vnode = ozp->z_vnode; + ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ + ZTOV(ozp)->v_data = ozp; + ZTOV(nzp)->v_data = nzp; + + nzp->z_id = ozp->z_id; + ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ + ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); + nzp->z_unlinked = ozp->z_unlinked; + nzp->z_atime_dirty = ozp->z_atime_dirty; + nzp->z_zn_prefetch = ozp->z_zn_prefetch; + nzp->z_blksz = ozp->z_blksz; + nzp->z_seq = ozp->z_seq; + nzp->z_mapcnt = ozp->z_mapcnt; + nzp->z_last_itx = ozp->z_last_itx; + nzp->z_gen = ozp->z_gen; + nzp->z_sync_cnt = ozp->z_sync_cnt; + nzp->z_phys = ozp->z_phys; + nzp->z_dbuf = ozp->z_dbuf; + + /* Update back pointers. */ + (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, + znode_evict_error); - ASSERT(zp->z_dbuf_held == 0); + /* + * Invalidate the original znode by clearing fields that provide a + * pointer back to the znode. Set the low bit of the vfs pointer to + * ensure that zfs_znode_move() recognizes the znode as invalid in any + * subsequent callback. 
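 *
 * Why the low-bit trick works, in sketch form: znodes are at least
 * 4-byte aligned, so POINTER_IS_VALID() (defined above as
 * !((uintptr_t)(p) & 0x3)) holds only for a live, published znode.
 * A freed buffer holds a kmem debug pattern such as 0xdeadbeef
 * (low two bits 11) or 0xbaddcafe (low two bits 10), and
 * POINTER_INVALIDATE() sets bit 0 explicitly, so all of those cases
 * fail the test.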
+ */ + ozp->z_dbuf = NULL; + POINTER_INVALIDATE(&ozp->z_zfsvfs); } +/* + * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise + * returns a non-zero error code. + */ +static int +zfs_enter(zfsvfs_t *zfsvfs) +{ + ZFS_ENTER(zfsvfs); + return (0); +} + +/*ARGSUSED*/ +static kmem_cbrc_t +zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) +{ + znode_t *ozp = buf, *nzp = newbuf; + zfsvfs_t *zfsvfs; + vnode_t *vp; + + /* + * The znode is on the file system's list of known znodes if the vfs + * pointer is valid. We set the low bit of the vfs pointer when freeing + * the znode to invalidate it, and the memory patterns written by kmem + * (baddcafe and deadbeef) set at least one of the two low bits. A newly + * created znode sets the vfs pointer last of all to indicate that the + * znode is known and in a valid state to be moved by this function. + */ + zfsvfs = ozp->z_zfsvfs; + if (!POINTER_IS_VALID(zfsvfs)) { + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * Ensure that the filesystem is not unmounted during the move. + */ + if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */ + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); + return (KMEM_CBRC_DONT_KNOW); + } + + mutex_enter(&zfsvfs->z_znodes_lock); + /* + * Recheck the vfs pointer in case the znode was removed just before + * acquiring the lock. + */ + if (zfsvfs != ozp->z_zfsvfs) { + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * At this point we know that as long as we hold z_znodes_lock, the + * znode cannot be freed and fields within the znode can be safely + * accessed. Now, prevent a race with zfs_zget(). + */ + if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); + return (KMEM_CBRC_LATER); + } + + vp = ZTOV(ozp); + if (mutex_tryenter(&vp->v_lock) == 0) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); + return (KMEM_CBRC_LATER); + } + + /* Only move znodes that are referenced _only_ by the DNLC. */ + if (vp->v_count != 1 || !vn_in_dnlc(vp)) { + mutex_exit(&vp->v_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); + return (KMEM_CBRC_LATER); + } + + /* + * The znode is known and in a valid state to move. We're holding the + * locks needed to execute the critical section. + */ + zfs_znode_move_impl(ozp, nzp); + mutex_exit(&vp->v_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); + + list_link_replace(&ozp->z_link_node, &nzp->z_link_node); + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + + return (KMEM_CBRC_YES); +} +#endif /* sun */ + void zfs_znode_init(void) { @@ -168,6 +360,9 @@ zfs_znode_init(void) znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL, zfs_znode_cache_destructor, NULL, NULL, NULL, 0); +#if defined(sun) + kmem_cache_set_move(znode_cache, zfs_znode_move); +#endif } void @@ -186,44 +381,43 @@ zfs_znode_fini(void) * incore "master" object. Verify version compatibility. 
*/ int -zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) +zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp) { objset_t *os = zfsvfs->z_os; - uint64_t version = ZPL_VERSION; int i, error; - dmu_object_info_t doi; uint64_t fsid_guid; + uint64_t zval; *zpp = NULL; - /* - * XXX - hack to auto-create the pool root filesystem at - * the first attempted mount. - */ - if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) { - dmu_tx_t *tx = dmu_tx_create(os); - - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */ - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */ - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */ - error = dmu_tx_assign(tx, TXG_WAIT); - ASSERT3U(error, ==, 0); - zfs_create_fs(os, cr, tx); - dmu_tx_commit(tx); - } - - error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1, - &version); + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error) { return (error); - } else if (version != ZPL_VERSION) { + } else if (zfsvfs->z_version > ZPL_VERSION) { (void) printf("Mismatched versions: File system " - "is version %lld on-disk format, which is " + "is version %llu on-disk format, which is " "incompatible with this software version %lld!", - (u_longlong_t)version, ZPL_VERSION); + (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); return (ENOTSUP); } + if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) + return (error); + zfsvfs->z_norm = (int)zval; + if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) + return (error); + zfsvfs->z_utf8 = (zval != 0); + if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) + return (error); + zfsvfs->z_case = (uint_t)zval; + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a @@ -244,9 +438,10 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) return (error); ASSERT(zfsvfs->z_root != 0); - /* - * Create the per mount vop tables. - */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error) + return (error); /* * Initialize zget mutex's @@ -255,14 +450,21 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); - if (error) + if (error) { + /* + * On error, we destroy the mutexes here since it's not + * possible for the caller to determine if the mutexes were + * initialized properly. 
+ */ + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); return (error); + } ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, - &zfsvfs->z_unlinkedobj); - if (error) - return (error); + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error == ENOENT) + error = 0; return (0); }
@@ -307,6 +509,50 @@ zfs_cmpldev(uint64_t dev) return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); }
+static void +zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db) +{ + znode_t *nzp; + + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); + + mutex_enter(&zp->z_lock); + + ASSERT(zp->z_dbuf == NULL); + zp->z_dbuf = db; + nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error); + + /* + * There should be no + * concurrent zgets on this object. + */ + if (nzp != NULL) + panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db); + + /* + * Slap on VROOT if we are the root znode + */ + if (zp->z_id == zfsvfs->z_root) + ZTOV(zp)->v_flag |= VROOT; + + mutex_exit(&zp->z_lock); + vn_exists(ZTOV(zp)); +} +
+void +zfs_znode_dmu_fini(znode_t *zp) +{ + dmu_buf_t *db = zp->z_dbuf; + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || + zp->z_unlinked || + RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); + ASSERT(zp->z_dbuf != NULL); + zp->z_dbuf = NULL; + VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL)); + dmu_buf_rele(db, NULL); +}
+ /* * Construct a new znode/vnode and initialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * -zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz) +zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) { znode_t *zp; vnode_t *vp; - int error; zp = kmem_cache_alloc(znode_cache, KM_SLEEP); - zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0); + zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0); ASSERT(zp->z_dirlocks == NULL); + ASSERT(zp->z_dbuf == NULL); + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); - zp->z_phys = db->db_data; - zp->z_zfsvfs = zfsvfs; + /* + * Defer setting z_zfsvfs until the znode is ready to be a candidate for + * the zfs_znode_move() callback.
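 *
 * The publication pattern used further down is, in sketch form:
 *
 *	zp->z_id = ...;			initialize every other field
 *	membar_producer();		order the stores
 *	zp->z_zfsvfs = zfsvfs;		publish; znode is now movable
 *
 * so any zfs_znode_move() callback that observes a valid z_zfsvfs is
 * guaranteed to observe a fully constructed znode as well.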
+ */ + zp->z_phys = NULL; zp->z_unlinked = 0; zp->z_atime_dirty = 0; - zp->z_dbuf_held = 0; zp->z_mapcnt = 0; zp->z_last_itx = 0; - zp->z_dbuf = db; - zp->z_id = obj_num; + zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; - mutex_enter(&zfsvfs->z_znodes_lock); - list_insert_tail(&zfsvfs->z_all_znodes, zp); - mutex_exit(&zfsvfs->z_znodes_lock); - vp = ZTOV(zp); +#ifdef TODO + vn_reinit(vp); +#endif + + zfs_znode_dmu_init(zfsvfs, zp, db); + + zp->z_gen = zp->z_phys->zp_gen; + +#if 0 if (vp == NULL) return (zp); - - vp->v_vflag |= VV_FORCEINSMQ; - error = insmntque(vp, zfsvfs->z_vfs); - vp->v_vflag &= ~VV_FORCEINSMQ; - KASSERT(error == 0, ("insmntque() failed: error %d", error)); +#endif vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); switch (vp->v_type) { @@ -362,37 +611,18 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz) break; } - return (zp); -} - -static void -zfs_znode_dmu_init(znode_t *zp) -{ - znode_t *nzp; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - dmu_buf_t *db = zp->z_dbuf; - - mutex_enter(&zp->z_lock); - - nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_pageout_func); - - /* - * there should be no - * concurrent zgets on this object. - */ - ASSERT3P(nzp, ==, NULL); - + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + membar_producer(); /* - * Slap on VROOT if we are the root znode + * Everything else must be valid before assigning z_zfsvfs makes the + * znode eligible for zfs_znode_move(). */ - if (zp->z_id == zfsvfs->z_root) { - ZTOV(zp)->v_flag |= VROOT; - } + zp->z_zfsvfs = zfsvfs; + mutex_exit(&zfsvfs->z_znodes_lock); - ASSERT(zp->z_dbuf_held == 0); - zp->z_dbuf_held = 1; VFS_HOLD(zfsvfs->z_vfs); - mutex_exit(&zp->z_lock); + return (zp); } /* @@ -406,31 +636,34 @@ zfs_znode_dmu_init(znode_t *zp) * IS_ROOT_NODE - new object will be root * IS_XATTR - new object is an attribute * IS_REPLAY - intent log replay + * bonuslen - length of bonus buffer + * setaclp - File/Dir initial ACL + * fuidp - Tracks fuid allocation. 
* - * OUT: oid - ID of created object + * OUT: zpp - allocated znode * */ void -zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, - uint_t flag, znode_t **zpp, int bonuslen) +zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp, + zfs_fuid_info_t **fuidp) { - dmu_buf_t *dbp; + dmu_buf_t *db; znode_phys_t *pzp; - znode_t *zp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; timestruc_t now; - uint64_t gen; + uint64_t gen, obj; int err; ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ - *oid = vap->va_nodeid; + obj = vap->va_nodeid; flag |= IS_REPLAY; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ } else { - *oid = 0; + obj = 0; gethrestime(&now); gen = dmu_tx_get_txg(tx); } @@ -446,44 +679,45 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, */ if (vap->va_type == VDIR) { if (flag & IS_REPLAY) { - err = zap_create_claim(zfsvfs->z_os, *oid, - DMU_OT_DIRECTORY_CONTENTS, + err = zap_create_claim_norm(zfsvfs->z_os, obj, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); ASSERT3U(err, ==, 0); } else { - *oid = zap_create(zfsvfs->z_os, - DMU_OT_DIRECTORY_CONTENTS, + obj = zap_create_norm(zfsvfs->z_os, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } else { if (flag & IS_REPLAY) { - err = dmu_object_claim(zfsvfs->z_os, *oid, + err = dmu_object_claim(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); ASSERT3U(err, ==, 0); } else { - *oid = dmu_object_alloc(zfsvfs->z_os, + obj = dmu_object_alloc(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } - VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp)); - dmu_buf_will_dirty(dbp, tx); + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db)); + dmu_buf_will_dirty(db, tx); /* * Initialize the znode physical data to zero. */ - ASSERT(dbp->db_size >= sizeof (znode_phys_t)); - bzero(dbp->db_data, dbp->db_size); - pzp = dbp->db_data; + ASSERT(db->db_size >= sizeof (znode_phys_t)); + bzero(db->db_data, db->db_size); + pzp = db->db_data; /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { + dzp->z_dbuf = db; dzp->z_phys = pzp; - dzp->z_id = *oid; + dzp->z_id = obj; } /* @@ -496,6 +730,9 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, pzp->zp_rdev = zfs_expldev(vap->va_rdev); } + if (zfsvfs->z_use_fuids) + pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + if (vap->va_type == VDIR) { pzp->zp_size = 2; /* contents ("." and "..") */ pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; @@ -523,25 +760,91 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, } pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); - zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0); - - zfs_perm_init(zp, dzp, flag, vap, tx, cr); + if (!(flag & IS_ROOT_NODE)) { + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + *zpp = zfs_znode_alloc(zfsvfs, db, 0); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); + } else { + /* + * If we are creating the root node, the "parent" we + * passed in is the znode for the root. 
+ */ + *zpp = dzp; + } + zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp); + if (!(flag & IS_ROOT_NODE)) { + vnode_t *vp; + + vp = ZTOV(*zpp); + vp->v_vflag |= VV_FORCEINSMQ; + err = insmntque(vp, zfsvfs->z_vfs); + vp->v_vflag &= ~VV_FORCEINSMQ; + KASSERT(err == 0, ("insmntque() failed: error %d", err)); + } +} - if (zpp) { - kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp); +void +zfs_xvattr_set(znode_t *zp, xvattr_t *xvap) +{ + xoptattr_t *xoap; - mutex_enter(hash_mtx); - zfs_znode_dmu_init(zp); - mutex_exit(hash_mtx); + xoap = xva_getxoptattr(xvap); + ASSERT(xoap); - *zpp = zp; - } else { - if (ZTOV(zp) != NULL) { - ZTOV(zp)->v_count = 0; - VOP_UNLOCK(ZTOV(zp), 0); - } - dmu_buf_rele(dbp, NULL); - zfs_znode_free(zp); + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly); + XVA_SET_RTN(xvap, XAT_READONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, + xoap->xoa_av_quarantined); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp)); + zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP; + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } } @@ -552,10 +855,10 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) dmu_buf_t *db; znode_t *zp; vnode_t *vp; - int err; + int err, first = 1; *zpp = NULL; - +again: ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); @@ -572,84 +875,118 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) return (EINVAL); } - ASSERT(db->db_object == obj_num); - ASSERT(db->db_offset == -1); - ASSERT(db->db_data != NULL); - zp = dmu_buf_get_user(db); - if (zp != NULL) { mutex_enter(&zp->z_lock); + /* + * Since we do immediate eviction of the z_dbuf, we + * should never find a dbuf with a znode that doesn't + * know about the dbuf. 
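 *
 * On FreeBSD there is one extra wrinkle handled below: the znode may
 * still be attached to the dbuf while its vnode is being destroyed (a
 * "dying" znode). The lookup then backs off and retries; the shape of
 * that loop, as a sketch of the code below, is:
 *
 *	again:
 *		take ZFS_OBJ_HOLD and z_lock, look up zp
 *		if (ZTOV(zp) == NULL) {
 *			drop the locks
 *			tsleep(zp, 0, "zcollide", 1);
 *			goto again;
 *		}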
+ */ + ASSERT3P(zp->z_dbuf, ==, db); ASSERT3U(zp->z_id, ==, obj_num); if (zp->z_unlinked) { - dmu_buf_rele(db, NULL); - mutex_exit(&zp->z_lock); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (ENOENT); - } else if (zp->z_dbuf_held) { - dmu_buf_rele(db, NULL); + err = ENOENT; } else { - zp->z_dbuf_held = 1; - VFS_HOLD(zfsvfs->z_vfs); - } - - if (ZTOV(zp) != NULL) - VN_HOLD(ZTOV(zp)); - else { - err = getnewvnode("zfs", zfsvfs->z_vfs, &zfs_vnodeops, - &zp->z_vnode); - ASSERT(err == 0); - vp = ZTOV(zp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - vp->v_data = (caddr_t)zp; - VN_LOCK_AREC(vp); - VN_LOCK_ASHARE(vp); - vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); - if (vp->v_type == VDIR) - zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ - vp->v_vflag |= VV_FORCEINSMQ; - err = insmntque(vp, zfsvfs->z_vfs); - vp->v_vflag &= ~VV_FORCEINSMQ; - KASSERT(err == 0, ("insmntque() failed: error %d", err)); - VOP_UNLOCK(vp, 0); + if (ZTOV(zp) != NULL) + VN_HOLD(ZTOV(zp)); + else { + if (first) { + ZFS_LOG(1, "dying znode detected (zp=%p)", zp); + first = 0; + } + /* + * znode is dying so we can't reuse it, we must + * wait until destruction is completed. + */ + dmu_buf_rele(db, NULL); + mutex_exit(&zp->z_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + tsleep(zp, 0, "zcollide", 1); + goto again; + } + *zpp = zp; + err = 0; } + dmu_buf_rele(db, NULL); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - *zpp = zp; - return (0); + return (err); } /* * Not found create new znode/vnode */ - zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size); - ASSERT3U(zp->z_id, ==, obj_num); - zfs_znode_dmu_init(zp); + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); + + vp = ZTOV(zp); + vp->v_vflag |= VV_FORCEINSMQ; + err = insmntque(vp, zfsvfs->z_vfs); + vp->v_vflag &= ~VV_FORCEINSMQ; + KASSERT(err == 0, ("insmntque() failed: error %d", err)); + VOP_UNLOCK(vp, 0); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); *zpp = zp; - if ((vp = ZTOV(zp)) != NULL) - VOP_UNLOCK(vp, 0); return (0); } -void -zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +int +zfs_rezget(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; + dmu_object_info_t doi; + dmu_buf_t *db; + uint64_t obj_num = zp->z_id; + int err; + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); - ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); - if (zp->z_phys->zp_acl.z_acl_extern_obj) { - error = dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx); - ASSERT3U(error, ==, 0); + err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (err); } - error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx); - ASSERT3U(error, ==, 0); - zp->z_dbuf_held = 0; - ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); - dmu_buf_rele(zp->z_dbuf, NULL); + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_ZNODE || + doi.doi_bonus_size < sizeof (znode_phys_t)) { + dmu_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (EINVAL); + } + + if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) { + dmu_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (EIO); + } + + zfs_znode_dmu_init(zfsvfs, zp, db); + zp->z_unlinked = (zp->z_phys->zp_links == 0); + zp->z_blksz = doi.doi_data_block_size; + + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + + return (0); +} + +void +zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + uint64_t obj = zp->z_id; + uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; + + 
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + if (acl_obj) + VERIFY(0 == dmu_object_free(os, acl_obj, tx)); + VERIFY(0 == dmu_object_free(os, obj, tx)); + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); + zfs_znode_free(zp); } void
@@ -659,7 +996,7 @@ zfs_zinactive(znode_t *zp) zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t z_id = zp->z_id; - ASSERT(zp->z_dbuf_held && zp->z_phys); + ASSERT(zp->z_dbuf && zp->z_phys); /* * Don't allow a zfs_zget() while we're trying to release this znode
@@ -686,17 +1023,13 @@ * remove the file from the file system. */ if (zp->z_unlinked) { - ZTOV(zp) = NULL; mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); ASSERT(vp->v_count == 0); vrecycle(vp, curthread); zfs_rmnode(zp); - VFS_RELE(zfsvfs->z_vfs); return; } - ASSERT(zp->z_phys); - ASSERT(zp->z_dbuf_held); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); }
@@ -706,11 +1039,15 @@ zfs_znode_free(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; + ASSERT(ZTOV(zp) == NULL); mutex_enter(&zfsvfs->z_znodes_lock); + POINTER_INVALIDATE(&zp->z_zfsvfs); list_remove(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); kmem_cache_free(znode_cache, zp); + + VFS_RELE(zfsvfs->z_vfs); } void
@@ -733,11 +1070,17 @@ zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) if (flag & AT_ATIME) ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); - if (flag & AT_MTIME) + if (flag & AT_MTIME) { ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); + if (zp->z_zfsvfs->z_use_fuids) + zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); + } - if (flag & AT_CTIME) + if (flag & AT_CTIME) { ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); + if (zp->z_zfsvfs->z_use_fuids) + zp->z_phys->zp_flags |= ZFS_ARCHIVE; + } } /*
@@ -796,113 +1139,195 @@ zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) }
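
zfs_extend() and zfs_trunc() below (and the logging tail of zfs_freesp()) all share the same transaction-assignment retry idiom; condensed into a sketch, with every name taken from the surrounding code:

	top:
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, zp->z_id);
		error = dmu_tx_assign(tx, zfsvfs->z_assign);
		if (error) {
			if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
				/* txg was full: wait for the next one, then retry */
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			dmu_tx_abort(tx);
			return (error);
		}
		/* ... apply the changes covered by the holds ... */
		dmu_tx_commit(tx);
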
/* - * Free space in a file. + * Increase the file length * * IN: zp - znode of file to free data in. - * off - start of section to free. - * len - length of section to free (0 => to EOF). - * flag - current file open mode flags. + * end - new end-of-file * * RETURN: 0 if success * error code if failure */ -int -zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +static int +zfs_extend(znode_t *zp, uint64_t end) { - vnode_t *vp = ZTOV(zp); - dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + dmu_tx_t *tx; rl_t *rl; - uint64_t end = off + len; - uint64_t size, new_blksz; + uint64_t newblksz; int error; - if (ZTOV(zp)->v_type == VFIFO) - return (0); - /* - * If we will change zp_size then lock the whole file, - * otherwise just lock the range being freed. + * We will change zp_size, lock the whole file. */ - if (len == 0 || off + len > zp->z_phys->zp_size) { - rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); - } else { - rl = zfs_range_lock(zp, off, len, RL_WRITER); - /* recheck, in case zp_size changed */ - if (off + len > zp->z_phys->zp_size) { - /* lost race: file size changed, lock whole file */ - zfs_range_unlock(rl); - rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); - } - } + rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ - size = zp->z_phys->zp_size; - if (len == 0 && size == off && off != 0) { + if (end <= zp->z_phys->zp_size) { zfs_range_unlock(rl); return (0); } - +top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - new_blksz = 0; - if (end > size && + if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end, SPA_MAXBLOCKSIZE); + newblksz = MIN(end, SPA_MAXBLOCKSIZE); } else { - new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz); + newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); } - dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz)); - } else if (off < size) { - /* - * If len == 0, we are truncating the file. - */ - dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END); + dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); + } else { + newblksz = 0; } error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } dmu_tx_abort(tx); zfs_range_unlock(rl); return (error); } + dmu_buf_will_dirty(zp->z_dbuf, tx); + + if (newblksz) + zfs_grow_blocksize(zp, newblksz, tx); - if (new_blksz) - zfs_grow_blocksize(zp, new_blksz, tx); + zp->z_phys->zp_size = end; - if (end > size || len == 0) - zp->z_phys->zp_size = end; + zfs_range_unlock(rl); - if (off < size) { - objset_t *os = zfsvfs->z_os; - uint64_t rlen = len; + dmu_tx_commit(tx); - if (len == 0) - rlen = -1; - else if (end > size) - rlen = size - off; - VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx)); + rw_enter(&zp->z_map_lock, RW_WRITER); + error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); + ASSERT(error == 0); + vnode_pager_setsize(ZTOV(zp), end); + rw_exit(&zp->z_map_lock); + + return (0); +} +
+/* + * Free space in a file. + * + * IN: zp - znode of file to free data in. + * off - start of section to free. + * len - length of section to free. + * + * RETURN: 0 if success + * error code if failure + */ +static int +zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + rl_t *rl; + int error; + + /* + * Lock the range being freed. + */ + rl = zfs_range_lock(zp, off, len, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (off >= zp->z_phys->zp_size) { + zfs_range_unlock(rl); + return (0); } - if (log) { - zfs_time_stamper(zp, CONTENT_MODIFIED, tx); - zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + if (off + len > zp->z_phys->zp_size) + len = zp->z_phys->zp_size - off; + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); + + if (error == 0) { + /* + * In FreeBSD we cannot free a block in the middle of a file, + * but only at the end of a file. + */ + rw_enter(&zp->z_map_lock, RW_WRITER); + error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); + ASSERT(error == 0); + vnode_pager_setsize(ZTOV(zp), off); + rw_exit(&zp->z_map_lock); } zfs_range_unlock(rl); + return (error); +} +
+/* + * Truncate a file + * + * IN: zp - znode of file to free data in. + * end - new end-of-file. + * + * RETURN: 0 if success + * error code if failure + */ +static int +zfs_trunc(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + rl_t *rl; + int error; + + /* + * We will change zp_size, lock the whole file.
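 *
 * The locking idiom here is the same as in zfs_extend() above; in
 * sketch form:
 *
 *	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
 *	...examine and update zp_phys->zp_size...
 *	zfs_range_unlock(rl);
 *
 * while zfs_free_range() above locks only the (off, len) range it
 * actually touches.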
+ */ + rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end >= zp->z_phys->zp_size) { + zfs_range_unlock(rl); + return (0); + } + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); + if (error) { + zfs_range_unlock(rl); + return (error); + } +top: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_range_unlock(rl); + return (error); + } + dmu_buf_will_dirty(zp->z_dbuf, tx); + + zp->z_phys->zp_size = end; + dmu_tx_commit(tx); + zfs_range_unlock(rl); + /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of @@ -910,30 +1335,90 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) * about to invalidate. */ rw_enter(&zp->z_map_lock, RW_WRITER); - if (end > size) - vnode_pager_setsize(vp, end); - else if (len == 0) { #if 0 - error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE); + error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE); #else - error = vinvalbuf(vp, V_SAVE, 0, 0); - vnode_pager_setsize(vp, end); + error = vinvalbuf(vp, V_SAVE, 0, 0); + ASSERT(error == 0); + vnode_pager_setsize(vp, end); #endif - } rw_exit(&zp->z_map_lock); return (0); } +/* + * Free space in a file + * + * IN: zp - znode of file to free data in. + * off - start of range + * len - end of range (0 => EOF) + * flag - current file open mode flags. + * log - TRUE if this action should be logged + * + * RETURN: 0 if success + * error code if failure + */ +int +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +{ + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + int error; + + if (off > zp->z_phys->zp_size) { + error = zfs_extend(zp, off+len); + if (error == 0 && log) + goto log; + else + return (error); + } + + if (len == 0) { + error = zfs_trunc(zp, off); + } else { + if ((error = zfs_free_range(zp, off, len)) == 0 && + off + len > zp->z_phys->zp_size) + error = zfs_extend(zp, off+len); + } + if (error || !log) + return (error); +log: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto log; + } + dmu_tx_abort(tx); + return (error); + } + + zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + + dmu_tx_commit(tx); + return (0); +} + void -zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) +zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { zfsvfs_t zfsvfs; - uint64_t moid, doid, roid = 0; - uint64_t version = ZPL_VERSION; + uint64_t moid, doid, version; + uint64_t sense = ZFS_CASE_SENSITIVE; + uint64_t norm = 0; + nvpair_t *elem; int error; znode_t *rootzp = NULL; + vnode_t *vp; vattr_t vattr; + znode_t *zp; /* * First attempt to create master node. @@ -950,9 +1435,35 @@ zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) /* * Set starting attributes. 
*/ - - error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx); - ASSERT(error == 0); + if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) + version = ZPL_VERSION; + else + version = ZPL_VERSION_FUID - 1; + error = zap_update(os, moid, ZPL_VERSION_STR, + 8, 1, &version, tx); + elem = NULL; + while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { + /* For the moment we expect all zpl props to be uint64_ts */ + uint64_t val; + char *name; + + ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); + VERIFY(nvpair_value_uint64(elem, &val) == 0); + name = nvpair_name(elem); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { + version = val; + error = zap_update(os, moid, ZPL_VERSION_STR, + 8, 1, &version, tx); + } else { + error = zap_update(os, moid, name, 8, 1, &val, tx); + } + ASSERT(error == 0); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) + norm = val; + else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) + sense = val; + } + ASSERT(version != 0); /* * Create a delete queue. @@ -966,39 +1477,62 @@ zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) * Create root znode. Create minimal znode/vnode/zfsvfs * to allow zfs_mknode to work. */ + VATTR_NULL(&vattr); vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0755; - vattr.va_uid = UID_ROOT; - vattr.va_gid = GID_WHEEL; + vattr.va_uid = crgetuid(cr); + vattr.va_gid = crgetgid(cr); rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); - zfs_znode_cache_constructor(rootzp, NULL, 0); - rootzp->z_zfsvfs = &zfsvfs; + zfs_znode_cache_constructor(rootzp, &zfsvfs, 0); rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; - rootzp->z_dbuf_held = 0; + + vp = ZTOV(rootzp); + vp->v_type = VDIR; bzero(&zfsvfs, sizeof (zfsvfs_t)); zfsvfs.z_os = os; zfsvfs.z_assign = TXG_NOWAIT; zfsvfs.z_parent = &zfsvfs; + zfsvfs.z_version = version; + zfsvfs.z_use_fuids = USE_FUIDS(version, os); + zfsvfs.z_norm = norm; + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) + zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER; mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); - zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0); - ASSERT3U(rootzp->z_id, ==, roid); - error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx); + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); + rootzp->z_zfsvfs = &zfsvfs; + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL); + ASSERT3P(zp, ==, rootzp); + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); + POINTER_INVALIDATE(&rootzp->z_zfsvfs); + VI_LOCK(vp); + ZTOV(rootzp)->v_data = NULL; + ZTOV(rootzp)->v_count = 0; + ZTOV(rootzp)->v_holdcnt = 0; + ZTOV(rootzp) = NULL; + VOP_UNLOCK(vp, 0); + vdestroy(vp); + dmu_buf_rele(rootzp->z_dbuf, NULL); + rootzp->z_dbuf = NULL; mutex_destroy(&zfsvfs.z_znodes_lock); kmem_cache_free(znode_cache, rootzp); } -#endif /* _KERNEL */ +#endif /* _KERNEL */ /* * Given an object number, return its parent object number and whether * or not the object is an extended attribute directory. 
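
Given the signature visible in the hunk below, a hypothetical in-kernel caller of zfs_obj_to_path() would look roughly like this (the buffer size and the printf are illustrative only):

	char path[MAXPATHLEN];
	int error;

	/* walks parent links upward from obj, assembling the path */
	error = zfs_obj_to_path(osp, obj, path, sizeof(path));
	if (error == 0)
		printf("object %ju: %s\n", (uintmax_t)obj, path);
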
@@ -1058,7 +1592,8 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) if (is_xattrdir) { (void) sprintf(component + 1, "<xattrdir>"); } else { - error = zap_value_search(osp, pobj, obj, component + 1); + error = zap_value_search(osp, pobj, obj, + ZFS_DIRENT_OBJ(-1ULL), component + 1); if (error != 0) break; } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c index 69ee509d50ed..1f6fa0db9460 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/dmu.h> @@ -174,7 +172,11 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp) *abufpp = NULL; - error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array, + /* + * We shouldn't be doing any scrubbing while we're doing log + * replay, so it's OK not to lock. + */ + error = arc_read_nolock(NULL, zilog->zl_spa, &blk, arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb); @@ -185,17 +187,20 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp) zio_cksum_t cksum = bp->blk_cksum; /* + * Validate the checksummed log block. + * * Sequence numbers should be... sequential. The checksum * verifier for the next block should be bp's checksum plus 1. + * + * Also check the log chain linkage and size used. */ cksum.zc_word[ZIL_ZC_SEQ]++; - if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum))) - error = ESTALE; - else if (BP_IS_HOLE(&ztp->zit_next_blk)) - error = ENOENT; - else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) - error = EOVERFLOW; + if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, + sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) || + (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))) { + error = ECKSUM; + } if (error) { VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1); @@ -290,7 +295,8 @@ zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) */ if (bp->blk_birth >= first_txg && zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) { - err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL)); + err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED)); ASSERT(err == 0); } } @@ -430,6 +436,16 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) mutex_enter(&zilog->zl_lock); + /* + * It is possible for the ZIL to get the previously mounted zilog + * structure of the same dataset if it is quickly remounted and the + * dbuf eviction has not completed. In this case we can see a + * non-empty lwb list and keep_first will be set. We fix this by + * clearing keep_first. This will be slower, but it is very rare. + */ + if (!list_is_empty(&zilog->zl_lwb_list) && keep_first) + keep_first = B_FALSE; + ASSERT3U(zilog->zl_destroy_txg, <, txg); zilog->zl_destroy_txg = txg; zilog->zl_keep_first = keep_first; @@ -453,12 +469,37 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); +} + +/* + * zil_rollback_destroy() is only called by the rollback code. + * We already have a syncing tx.
Rollback has exclusive access to the + * dataset, so we don't have to worry about concurrent zil access. + * The actual freeing of any log blocks occurs in zil_sync() later in + * this txg syncing phase. + */ +void +zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx) +{ + const zil_header_t *zh = zilog->zl_header; + uint64_t txg; - if (keep_first) /* no need to wait in this case */ + if (BP_IS_HOLE(&zh->zh_log)) return; - txg_wait_synced(zilog->zl_dmu_pool, txg); - ASSERT(BP_IS_HOLE(&zh->zh_log)); + txg = dmu_tx_get_txg(tx); + ASSERT3U(zilog->zl_destroy_txg, <, txg); + zilog->zl_destroy_txg = txg; + zilog->zl_keep_first = B_FALSE; + + /* + * Ensure there's no outstanding ZIL IO. Either no lwbs at all, or + * just the unused one allocated in advance, is OK. + */ + ASSERT(zilog->zl_lwb_list.list_head.list_next == + zilog->zl_lwb_list.list_head.list_prev); + (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, + tx, zh->zh_claim_txg); } int @@ -471,9 +512,9 @@ zil_claim(char *osname, void *txarg) objset_t *os; int error; - error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os); + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); if (error) { - cmn_err(CE_WARN, "can't process intent log for %s", osname); + cmn_err(CE_WARN, "can't open objset for %s", osname); return (0); } @@ -500,104 +541,164 @@ zil_claim(char *osname, void *txarg) return (0); } -void -zil_add_vdev(zilog_t *zilog, uint64_t vdev) +/* + * Check the log by walking the log chain. + * Checksum errors are OK, as they indicate the end of the chain. + * Any other error (no device or read failure) is returned. + */ +/* ARGSUSED */ +int +zil_check_log_chain(char *osname, void *txarg) { - zil_vdev_t *zv, *new; - uint64_t bmap_sz = sizeof (zilog->zl_vdev_bmap) << 3; - uchar_t *cp; + zilog_t *zilog; + zil_header_t *zh; + blkptr_t blk; + arc_buf_t *abuf; + objset_t *os; + char *lrbuf; + zil_trailer_t *ztp; + int error; - if (zfs_nocacheflush) - return; + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) { + cmn_err(CE_WARN, "can't open objset for %s", osname); + return (0); + } - if (vdev < bmap_sz) { - cp = zilog->zl_vdev_bmap + (vdev / 8); - atomic_or_8(cp, 1 << (vdev % 8)); - } else { - /* - * insert into ordered list - */ - mutex_enter(&zilog->zl_lock); - for (zv = list_head(&zilog->zl_vdev_list); zv != NULL; - zv = list_next(&zilog->zl_vdev_list, zv)) { - if (zv->vdev == vdev) { - /* duplicate found - just return */ - mutex_exit(&zilog->zl_lock); - return; - } - if (zv->vdev > vdev) { - /* insert before this entry */ - new = kmem_alloc(sizeof (zil_vdev_t), - KM_SLEEP); - new->vdev = vdev; - list_insert_before(&zilog->zl_vdev_list, - zv, new); - mutex_exit(&zilog->zl_lock); - return; - } - } - /* ran off end of list, insert at the end */ - ASSERT(zv == NULL); - new = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP); - new->vdev = vdev; - list_insert_tail(&zilog->zl_vdev_list, new); - mutex_exit(&zilog->zl_lock); + zilog = dmu_objset_zil(os); + zh = zil_header_in_syncing_context(zilog); + blk = zh->zh_log; + if (BP_IS_HOLE(&blk)) { + dmu_objset_close(os); + return (0); /* no chain */ + } + + for (;;) { + error = zil_read_log_block(zilog, &blk, &abuf); + if (error) + break; + lrbuf = abuf->b_data; + ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1; + blk = ztp->zit_next_blk; + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + } + dmu_objset_close(os); + if (error == ECKSUM) + return (0); /* normal end of chain */ + return (error); } + +/* + * Clear a log chain + */ +/* ARGSUSED 
*/ +int +zil_clear_log_chain(char *osname, void *txarg) +{ + zilog_t *zilog; + zil_header_t *zh; + objset_t *os; + dmu_tx_t *tx; + int error; + + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) { + cmn_err(CE_WARN, "can't open objset for %s", osname); + return (0); } + + zilog = dmu_objset_zil(os); + tx = dmu_tx_create(zilog->zl_os); + (void) dmu_tx_assign(tx, TXG_WAIT); + zh = zil_header_in_syncing_context(zilog); + BP_ZERO(&zh->zh_log); + dsl_dataset_dirty(dmu_objset_ds(os), tx); + dmu_tx_commit(tx); + dmu_objset_close(os); + return (0); +} + +static int +zil_vdev_compare(const void *x1, const void *x2) +{ + uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; + uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; + + if (v1 < v2) + return (-1); + if (v1 > v2) + return (1); + + return (0); } -/* start an async flush of the write cache for this vdev */ void -zil_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio) +zil_add_block(zilog_t *zilog, blkptr_t *bp) { - vdev_t *vd; + avl_tree_t *t = &zilog->zl_vdev_tree; + avl_index_t where; + zil_vdev_node_t *zv, zvsearch; + int ndvas = BP_GET_NDVAS(bp); + int i; - if (*zio == NULL) - *zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + if (zfs_nocacheflush) + return; - vd = vdev_lookup_top(spa, vdev); - ASSERT(vd); + ASSERT(zilog->zl_writer); - (void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); + /* + * Even though we're zl_writer, we still need a lock because the + * zl_get_data() callbacks may have dmu_sync() done callbacks + * that will run concurrently. + */ + mutex_enter(&zilog->zl_vdev_lock); + for (i = 0; i < ndvas; i++) { + zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); + if (avl_find(t, &zvsearch, &where) == NULL) { + zv = kmem_alloc(sizeof (*zv), KM_SLEEP); + zv->zv_vdev = zvsearch.zv_vdev; + avl_insert(t, zv, where); + } + } + mutex_exit(&zilog->zl_vdev_lock); } void zil_flush_vdevs(zilog_t *zilog) { - zil_vdev_t *zv; - zio_t *zio = NULL; spa_t *spa = zilog->zl_spa; - uint64_t vdev; - uint8_t b; - int i, j; + avl_tree_t *t = &zilog->zl_vdev_tree; + void *cookie = NULL; + zil_vdev_node_t *zv; + zio_t *zio; ASSERT(zilog->zl_writer); - for (i = 0; i < sizeof (zilog->zl_vdev_bmap); i++) { - b = zilog->zl_vdev_bmap[i]; - if (b == 0) - continue; - for (j = 0; j < 8; j++) { - if (b & (1 << j)) { - vdev = (i << 3) + j; - zil_flush_vdev(spa, vdev, &zio); - } - } - zilog->zl_vdev_bmap[i] = 0; - } + /* + * We don't need zl_vdev_lock here because we're the zl_writer, + * and all zl_get_data() callbacks are done. + */ + if (avl_numnodes(t) == 0) + return; + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) { - zil_flush_vdev(spa, zv->vdev, &zio); - list_remove(&zilog->zl_vdev_list, zv); - kmem_free(zv, sizeof (zil_vdev_t)); + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { + vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); + if (vd != NULL) + zio_flush(zio, vd); + kmem_free(zv, sizeof (*zv)); } + /* * Wait for all the flushes to complete. Not all devices actually * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails. 
*/ - if (zio) - (void) zio_wait(zio); + (void) zio_wait(zio); + + spa_config_exit(spa, SCL_STATE, FTAG); } /* @@ -609,6 +710,15 @@ zil_lwb_write_done(zio_t *zio) lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; + ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); + ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG); + ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); + ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); + ASSERT(!BP_IS_GANG(zio->io_bp)); + ASSERT(!BP_IS_HOLE(zio->io_bp)); + ASSERT(zio->io_bp->blk_fill == 0); + /* * Now that we've written this log block, we have a stable pointer * to the next block in the chain, so it's OK to let the txg in @@ -619,19 +729,13 @@ zil_lwb_write_done(zio_t *zio) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_buf = NULL; - if (zio->io_error) { + if (zio->io_error) zilog->zl_log_error = B_TRUE; - mutex_exit(&zilog->zl_lock); - return; - } mutex_exit(&zilog->zl_lock); } /* * Initialize the io for a log block. - * - * Note, we should not initialize the IO until we are about - * to use it, since zio_rewrite() does a spa_config_enter(). */ static void zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) @@ -649,9 +753,9 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) } if (lwb->lwb_zio == NULL) { lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf, + 0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb, - ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_CANFAIL, &zb); } } @@ -751,8 +855,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) list_insert_tail(&zilog->zl_lwb_list, nlwb); mutex_exit(&zilog->zl_lock); - /* Record the vdev for later flushing */ - zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk)))); + /* Record the block for later vdev flushing */ + zil_add_block(zilog, &lwb->lwb_blk); /* * kick off the write for the old log block @@ -848,7 +952,7 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) } itx_t * -zil_itx_create(int txtype, size_t lrsize) +zil_itx_create(uint64_t txtype, size_t lrsize) { itx_t *itx; @@ -857,6 +961,7 @@ zil_itx_create(int txtype, size_t lrsize) itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; + itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */ itx->itx_lr.lrc_seq = 0; /* defensive */ return (itx); @@ -871,7 +976,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_itx_list, itx); - zilog->zl_itx_list_sz += itx->itx_lr.lrc_reclen; + zilog->zl_itx_list_sz += itx->itx_sod; itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq; mutex_exit(&zilog->zl_lock); @@ -907,7 +1012,7 @@ zil_itx_clean(zilog_t *zilog) while ((itx = list_head(&zilog->zl_itx_list)) != NULL && itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) { list_remove(&zilog->zl_itx_list, itx); - zilog->zl_itx_list_sz -= itx->itx_lr.lrc_reclen; + zilog->zl_itx_list_sz -= itx->itx_sod; list_insert_tail(&clean_list, itx); } cv_broadcast(&zilog->zl_cv_writer); @@ -941,18 +1046,17 @@ zil_clean(zilog_t *zilog) mutex_exit(&zilog->zl_lock); } -void +static void zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) { uint64_t txg; - uint64_t reclen; uint64_t commit_seq = 0; itx_t *itx, *itx_next = (itx_t *)-1; lwb_t 
*lwb; spa_t *spa; zilog->zl_writer = B_TRUE; - zilog->zl_root_zio = NULL; + ASSERT(zilog->zl_root_zio == NULL); spa = zilog->zl_spa; if (zilog->zl_suspend) { @@ -1009,10 +1113,9 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) if (itx == NULL) break; - reclen = itx->itx_lr.lrc_reclen; if ((itx->itx_lr.lrc_seq > seq) && ((lwb == NULL) || (lwb->lwb_nused == 0) || - (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)))) { + (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) { break; } @@ -1024,6 +1127,7 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) */ itx_next = list_next(&zilog->zl_itx_list, itx); list_remove(&zilog->zl_itx_list, itx); + zilog->zl_itx_list_sz -= itx->itx_sod; mutex_exit(&zilog->zl_lock); txg = itx->itx_lr.lrc_txg; ASSERT(txg); @@ -1034,7 +1138,6 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); mutex_enter(&zilog->zl_lock); - zilog->zl_itx_list_sz -= reclen; } DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); /* determine commit sequence number */ @@ -1058,9 +1161,9 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) if (zilog->zl_root_zio) { DTRACE_PROBE1(zil__cw3, zilog_t *, zilog); (void) zio_wait(zilog->zl_root_zio); + zilog->zl_root_zio = NULL; DTRACE_PROBE1(zil__cw4, zilog_t *, zilog); - if (!zfs_nocacheflush) - zil_flush_vdevs(zilog); + zil_flush_vdevs(zilog); } if (zilog->zl_log_error || lwb == NULL) { @@ -1195,8 +1298,6 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog->zl_destroy_txg = TXG_INITIAL - 1; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); - cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); list_create(&zilog->zl_itx_list, sizeof (itx_t), offsetof(itx_t, itx_node)); @@ -1204,8 +1305,13 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) list_create(&zilog->zl_lwb_list, sizeof (lwb_t), offsetof(lwb_t, lwb_node)); - list_create(&zilog->zl_vdev_list, sizeof (zil_vdev_t), - offsetof(zil_vdev_t, vdev_seq_node)); + mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&zilog->zl_vdev_tree, zil_vdev_compare, + sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); + + cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); + cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); return (zilog); } @@ -1214,7 +1320,6 @@ void zil_free(zilog_t *zilog) { lwb_t *lwb; - zil_vdev_t *zv; zilog->zl_stop_sync = 1; @@ -1226,38 +1331,36 @@ zil_free(zilog_t *zilog) } list_destroy(&zilog->zl_lwb_list); - while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) { - list_remove(&zilog->zl_vdev_list, zv); - kmem_free(zv, sizeof (zil_vdev_t)); - } - list_destroy(&zilog->zl_vdev_list); + avl_destroy(&zilog->zl_vdev_tree); + mutex_destroy(&zilog->zl_vdev_lock); ASSERT(list_head(&zilog->zl_itx_list) == NULL); list_destroy(&zilog->zl_itx_list); - cv_destroy(&zilog->zl_cv_suspend); - cv_destroy(&zilog->zl_cv_writer); mutex_destroy(&zilog->zl_lock); + cv_destroy(&zilog->zl_cv_writer); + cv_destroy(&zilog->zl_cv_suspend); + kmem_free(zilog, sizeof (zilog_t)); } /* * return true if the initial log block is not valid */ -static int +static boolean_t zil_empty(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; arc_buf_t *abuf = NULL; if (BP_IS_HOLE(&zh->zh_log)) - return (1); + return (B_TRUE); if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0) - return (1); + return (B_TRUE); VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); - return (0); + return 
(B_FALSE); } /* @@ -1326,7 +1429,6 @@ zil_suspend(zilog_t *zilog) */ while (zilog->zl_suspending) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); - ASSERT(BP_IS_HOLE(&zh->zh_log)); mutex_exit(&zilog->zl_lock); return (0); } @@ -1346,7 +1448,6 @@ zil_suspend(zilog_t *zilog) zil_destroy(zilog, B_FALSE); mutex_enter(&zilog->zl_lock); - ASSERT(BP_IS_HOLE(&zh->zh_log)); zilog->zl_suspending = B_FALSE; cv_broadcast(&zilog->zl_cv_suspend); mutex_exit(&zilog->zl_lock); @@ -1366,6 +1467,7 @@ zil_resume(zilog_t *zilog) typedef struct zil_replay_arg { objset_t *zr_os; zil_replay_func_t **zr_replay; + zil_replay_cleaner_t *zr_replay_cleaner; void *zr_arg; uint64_t *zr_txgp; boolean_t zr_byteswap; @@ -1391,6 +1493,9 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ return; + /* Strip case-insensitive bit, still present in log record */ + txtype &= ~TX_CI; + /* * Make a copy of the data so we can revise and extend it. */ @@ -1465,10 +1570,12 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) * On the first pass, arrange for the replay vector * to fail its dmu_tx_assign(). That's the only way * to ensure that those code paths remain well tested. + * + * Only byteswap (if needed) on the 1st pass. */ *zr->zr_txgp = replay_txg - (pass == 1); error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, - zr->zr_byteswap); + zr->zr_byteswap && pass == 1); *zr->zr_txgp = TXG_NOWAIT; } @@ -1491,6 +1598,8 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) * transaction. */ if (error != ERESTART && !sunk) { + if (zr->zr_replay_cleaner) + zr->zr_replay_cleaner(zr->zr_arg); txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); sunk = B_TRUE; continue; /* retry */ @@ -1510,8 +1619,9 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) name = kmem_alloc(MAXNAMELEN, KM_SLEEP); dmu_objset_name(zr->zr_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " - "dataset %s, seq 0x%llx, txtype %llu\n", - error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype); + "dataset %s, seq 0x%llx, txtype %llu %s\n", + error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype, + (lr->lrc_txtype & TX_CI) ? 
"CI" : ""); zilog->zl_stop_replay = 1; kmem_free(name, MAXNAMELEN); } @@ -1528,7 +1638,8 @@ zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) */ void zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE]) + zil_replay_func_t *replay_func[TX_MAX_TYPE], + zil_replay_cleaner_t *replay_cleaner) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; @@ -1542,6 +1653,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp, zr.zr_os = os; zr.zr_replay = replay_func; + zr.zr_replay_cleaner = replay_cleaner; zr.zr_arg = arg; zr.zr_txgp = txgp; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); @@ -1560,6 +1672,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp, kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE); zil_destroy(zilog, B_FALSE); + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); //printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index b5dd35f5599e..4650d42b7c2f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/fm/fs/zfs.h> #include <sys/spa.h> @@ -61,23 +59,9 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { char *zio_type_name[ZIO_TYPES] = { "null", "read", "write", "free", "claim", "ioctl" }; -/* At or above this size, force gang blocking - for testing */ -uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; - -/* Force an allocation failure when non-zero */ -uint16_t zio_zil_fail_shift = 0; - -typedef struct zio_sync_pass { - int zp_defer_free; /* defer frees after this pass */ - int zp_dontcompress; /* don't compress after this pass */ - int zp_rewrite; /* rewrite new bps after this pass */ -} zio_sync_pass_t; - -zio_sync_pass_t zio_sync_pass = { - 1, /* zp_defer_free */ - 4, /* zp_dontcompress */ - 1, /* zp_rewrite */ -}; +#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */ +#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ +#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */ /* * ========================================================================== @@ -94,6 +78,13 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; extern vmem_t *zio_alloc_arena; #endif +/* + * An allocating zio is one that either currently has the DVA allocate + * stage set or will have it later in its lifetime. 
+ */ +#define IO_IS_ALLOCATING(zio) \ + ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) + void zio_init(void) { @@ -107,7 +98,6 @@ zio_init(void) data_alloc_arena = zio_alloc_arena; #endif #endif - zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); @@ -144,9 +134,6 @@ zio_init(void) zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NODEBUG); - - dprintf("creating cache for size %5lx align %5lx\n", - size, align); } } @@ -212,7 +199,7 @@ zio_buf_alloc(size_t size) ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); + return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); #else return (kmem_alloc(size, KM_SLEEP)); #endif @@ -232,7 +219,7 @@ zio_data_buf_alloc(size_t size) ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP)); + return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); #else return (kmem_alloc(size, KM_SLEEP)); #endif @@ -272,13 +259,15 @@ zio_data_buf_free(void *buf, size_t size) * ========================================================================== */ static void -zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) +zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, + zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); - zt->zt_data = data; - zt->zt_size = size; + zt->zt_orig_data = zio->io_data; + zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; + zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; @@ -288,128 +277,233 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) } static void -zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) +zio_pop_transforms(zio_t *zio) { - zio_transform_t *zt = zio->io_transform_stack; + zio_transform_t *zt; + + while ((zt = zio->io_transform_stack) != NULL) { + if (zt->zt_transform != NULL) + zt->zt_transform(zio, + zt->zt_orig_data, zt->zt_orig_size); - *data = zt->zt_data; - *size = zt->zt_size; - *bufsize = zt->zt_bufsize; + zio_buf_free(zio->io_data, zt->zt_bufsize); - zio->io_transform_stack = zt->zt_next; - kmem_free(zt, sizeof (zio_transform_t)); + zio->io_data = zt->zt_orig_data; + zio->io_size = zt->zt_orig_size; + zio->io_transform_stack = zt->zt_next; - if ((zt = zio->io_transform_stack) != NULL) { - zio->io_data = zt->zt_data; - zio->io_size = zt->zt_size; + kmem_free(zt, sizeof (zio_transform_t)); } } +/* + * ========================================================================== + * I/O transform callbacks for subblocks and decompression + * ========================================================================== + */ +static void +zio_subblock(zio_t *zio, void *data, uint64_t size) +{ + ASSERT(zio->io_size > size); + + if (zio->io_type == ZIO_TYPE_READ) + bcopy(zio->io_data, data, size); +} + static void -zio_clear_transform_stack(zio_t *zio) +zio_decompress(zio_t *zio, void *data, uint64_t size) { - void *data; - uint64_t size, bufsize; + if (zio->io_error == 0 && + zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), + zio->io_data, zio->io_size, data, size) != 0) + zio->io_error = EIO; +} - ASSERT(zio->io_transform_stack != NULL); +/* + * ========================================================================== + * I/O parent/child relationships and pipeline interlocks + * 
========================================================================== + */ - zio_pop_transform(zio, &data, &size, &bufsize); - while (zio->io_transform_stack != NULL) { - zio_buf_free(data, bufsize); - zio_pop_transform(zio, &data, &size, &bufsize); +static void +zio_add_child(zio_t *pio, zio_t *zio) +{ + mutex_enter(&pio->io_lock); + if (zio->io_stage < ZIO_STAGE_READY) + pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; + if (zio->io_stage < ZIO_STAGE_DONE) + pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; + zio->io_sibling_prev = NULL; + zio->io_sibling_next = pio->io_child; + if (pio->io_child != NULL) + pio->io_child->io_sibling_prev = zio; + pio->io_child = zio; + zio->io_parent = pio; + mutex_exit(&pio->io_lock); +} + +static void +zio_remove_child(zio_t *pio, zio_t *zio) +{ + zio_t *next, *prev; + + ASSERT(zio->io_parent == pio); + + mutex_enter(&pio->io_lock); + next = zio->io_sibling_next; + prev = zio->io_sibling_prev; + if (next != NULL) + next->io_sibling_prev = prev; + if (prev != NULL) + prev->io_sibling_next = next; + if (pio->io_child == zio) + pio->io_child = next; + mutex_exit(&pio->io_lock); +} + +static boolean_t +zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) +{ + uint64_t *countp = &zio->io_children[child][wait]; + boolean_t waiting = B_FALSE; + + mutex_enter(&zio->io_lock); + ASSERT(zio->io_stall == NULL); + if (*countp != 0) { + zio->io_stage--; + zio->io_stall = countp; + waiting = B_TRUE; + } + mutex_exit(&zio->io_lock); + + return (waiting); +} + +static void +zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) +{ + uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; + int *errorp = &pio->io_child_error[zio->io_child_type]; + + mutex_enter(&pio->io_lock); + if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) + *errorp = zio_worst_error(*errorp, zio->io_error); + pio->io_reexecute |= zio->io_reexecute; + ASSERT3U(*countp, >, 0); + if (--*countp == 0 && pio->io_stall == countp) { + pio->io_stall = NULL; + mutex_exit(&pio->io_lock); + zio_execute(pio); + } else { + mutex_exit(&pio->io_lock); } } +static void +zio_inherit_child_errors(zio_t *zio, enum zio_child c) +{ + if (zio->io_child_error[c] != 0 && zio->io_error == 0) + zio->io_error = zio->io_child_error[c]; +} + /* * ========================================================================== - * Create the various types of I/O (read, write, free) + * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) + zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset, + const zbookmark_t *zb, uint8_t stage, uint32_t pipeline) { zio_t *zio; ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); + ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); + + ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); + ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); + ASSERT(vd || stage == ZIO_STAGE_OPEN); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); - zio->io_parent = pio; - zio->io_spa = spa; - zio->io_txg = txg; + + mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); + + if (vd != NULL) + zio->io_child_type = 
ZIO_CHILD_VDEV; + else if (flags & ZIO_FLAG_GANG_CHILD) + zio->io_child_type = ZIO_CHILD_GANG; + else + zio->io_child_type = ZIO_CHILD_LOGICAL; + if (bp != NULL) { zio->io_bp = bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; + if (type != ZIO_TYPE_WRITE) + zio->io_bp = &zio->io_bp_copy; /* so caller can free */ + if (zio->io_child_type == ZIO_CHILD_LOGICAL) { + if (BP_IS_GANG(bp)) + pipeline |= ZIO_GANG_STAGES; + zio->io_logical = zio; + } } + + zio->io_spa = spa; + zio->io_txg = txg; + zio->io_data = data; + zio->io_size = size; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; - zio->io_stage = stage; - zio->io_pipeline = pipeline; - zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; - zio->io_timestamp = lbolt64; - zio->io_flags = flags; - mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); - zio_push_transform(zio, data, size, size); + zio->io_vd = vd; + zio->io_offset = offset; + zio->io_orig_flags = zio->io_flags = flags; + zio->io_orig_stage = zio->io_stage = stage; + zio->io_orig_pipeline = zio->io_pipeline = pipeline; - /* - * Note on config lock: - * - * If CONFIG_HELD is set, then the caller already has the config - * lock, so we don't need it for this io. - * - * We set CONFIG_GRABBED to indicate that we have grabbed the - * config lock on behalf of this io, so it should be released - * in zio_done. - * - * Unless CONFIG_HELD is set, we will grab the config lock for - * any top-level (parent-less) io, *except* NULL top-level ios. - * The NULL top-level ios rarely have any children, so we delay - * grabbing the lock until the first child is added (but it is - * still grabbed on behalf of the top-level i/o, so additional - * children don't need to also grab it). This greatly reduces - * contention on the config lock. - */ - if (pio == NULL) { - if (type != ZIO_TYPE_NULL && - !(flags & ZIO_FLAG_CONFIG_HELD)) { - spa_config_enter(zio->io_spa, RW_READER, zio); - zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; - } - zio->io_root = zio; - } else { - zio->io_root = pio->io_root; - if (!(flags & ZIO_FLAG_NOBOOKMARK)) + if (zb != NULL) + zio->io_bookmark = *zb; + + if (pio != NULL) { + /* + * Logical I/Os can have logical, gang, or vdev children. + * Gang I/Os can have gang or vdev children. + * Vdev I/Os can only have vdev children. + * The following ASSERT captures all of these constraints. 
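+ * + * This works because the child types are ordered + * ZIO_CHILD_VDEV < ZIO_CHILD_GANG < ZIO_CHILD_LOGICAL, so a child's + * type can never exceed its parent's.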
+ */ + ASSERT(zio->io_child_type <= pio->io_child_type); + if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; - mutex_enter(&pio->io_lock); - if (pio->io_parent == NULL && - pio->io_type == ZIO_TYPE_NULL && - !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && - !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { - pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; - spa_config_enter(zio->io_spa, RW_READER, pio); - } - if (stage < ZIO_STAGE_READY) - pio->io_children_notready++; - pio->io_children_notdone++; - zio->io_sibling_next = pio->io_child; - zio->io_sibling_prev = NULL; - if (pio->io_child != NULL) - pio->io_child->io_sibling_prev = zio; - pio->io_child = zio; - zio->io_ndvas = pio->io_ndvas; - mutex_exit(&pio->io_lock); + zio_add_child(pio, zio); } return (zio); } +static void +zio_destroy(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + uint8_t async_root = zio->io_async_root; + + mutex_destroy(&zio->io_lock); + cv_destroy(&zio->io_cv); + kmem_cache_free(zio_cache, zio); + + if (async_root) { + mutex_enter(&spa->spa_async_root_lock); + if (--spa->spa_async_root_count == 0) + cv_broadcast(&spa->spa_async_root_cv); + mutex_exit(&spa->spa_async_root_lock); + } +} + zio_t * zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, int flags) @@ -417,8 +511,8 @@ zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, - ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, - ZIO_WAIT_FOR_CHILDREN_PIPELINE); + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, + ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } @@ -430,160 +524,89 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) } zio_t * -zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, - uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, zbookmark_t *zb) +zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, + void *data, uint64_t size, zio_done_func_t *done, void *private, + int priority, int flags, const zbookmark_t *zb) { zio_t *zio; - ASSERT3U(size, ==, BP_GET_LSIZE(bp)); - - zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, - ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, + zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp, + data, size, done, private, + ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); - zio->io_bookmark = *zb; - - zio->io_logical = zio; - - /* - * Work off our copy of the bp so the caller can free it. 
- */ - zio->io_bp = &zio->io_bp_copy; - - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { - uint64_t csize = BP_GET_PSIZE(bp); - void *cbuf = zio_buf_alloc(csize); - - zio_push_transform(zio, cbuf, csize, csize); - zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; - } - - if (BP_IS_GANG(bp)) { - uint64_t gsize = SPA_GANGBLOCKSIZE; - void *gbuf = zio_buf_alloc(gsize); - - zio_push_transform(zio, gbuf, gsize, gsize); - zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; - } return (zio); } zio_t * -zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, - uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb) +zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + void *data, uint64_t size, zio_prop_t *zp, + zio_done_func_t *ready, zio_done_func_t *done, void *private, + int priority, int flags, const zbookmark_t *zb) { zio_t *zio; - ASSERT(checksum >= ZIO_CHECKSUM_OFF && - checksum < ZIO_CHECKSUM_FUNCTIONS); - - ASSERT(compress >= ZIO_COMPRESS_OFF && - compress < ZIO_COMPRESS_FUNCTIONS); + ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && + zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && + zp->zp_compress >= ZIO_COMPRESS_OFF && + zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && + zp->zp_type < DMU_OT_NUMTYPES && + zp->zp_level < 32 && + zp->zp_ndvas > 0 && + zp->zp_ndvas <= spa_max_replication(spa)); + ASSERT(ready != NULL); zio = zio_create(pio, spa, txg, bp, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, + ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); zio->io_ready = ready; - - zio->io_bookmark = *zb; - - zio->io_logical = zio; - - zio->io_checksum = checksum; - zio->io_compress = compress; - zio->io_ndvas = ncopies; - - if (compress != ZIO_COMPRESS_OFF) - zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; - - if (bp->blk_birth != txg) { - /* XXX the bp usually (always?) 
gets re-zeroed later */ - BP_ZERO(bp); - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - } else { - /* Make sure someone doesn't change their mind on overwrites */ - ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), - spa_max_replication(spa)) == BP_GET_NDVAS(bp)); - } + zio->io_prop = *zp; return (zio); } zio_t * -zio_rewrite(zio_t *pio, spa_t *spa, int checksum, - uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags, - zbookmark_t *zb) +zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, + uint64_t size, zio_done_func_t *done, void *private, int priority, + int flags, zbookmark_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, + ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); - zio->io_bookmark = *zb; - zio->io_checksum = checksum; - zio->io_compress = ZIO_COMPRESS_OFF; - - if (pio != NULL) - ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); - - return (zio); -} - -static zio_t * -zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, - uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags) -{ - zio_t *zio; - - BP_ZERO(bp); - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - - zio = zio_create(pio, spa, txg, bp, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags, - ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); - - zio->io_checksum = checksum; - zio->io_compress = ZIO_COMPRESS_OFF; - return (zio); } zio_t * zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private) + zio_done_func_t *done, void *private, int flags) { zio_t *zio; ASSERT(!BP_IS_HOLE(bp)); + if (bp->blk_fill == BLK_FILL_ALREADY_FREED) + return (zio_null(pio, spa, NULL, NULL, flags)); + if (txg == spa->spa_syncing_txg && - spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { + spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) { bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); - return (zio_null(pio, spa, NULL, NULL, 0)); + return (zio_null(pio, spa, NULL, NULL, flags)); } - zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, - ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, - ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); - - zio->io_bp = &zio->io_bp_copy; + zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), + done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, + NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); return (zio); } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private) + zio_done_func_t *done, void *private, int flags) { zio_t *zio; @@ -601,11 +624,9 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); ASSERT3U(spa_first_txg(spa), <=, txg); - zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, - ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, - ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); - - zio->io_bp = &zio->io_bp_copy; + zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), + done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, + NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); return (zio); } @@ -619,10 +640,9 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, - ZIO_TYPE_IOCTL, priority, flags, + 
ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); - zio->io_vd = vd; zio->io_cmd = cmd; } else { zio = zio_null(pio, spa, NULL, NULL, flags); @@ -635,54 +655,23 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, return (zio); } -static void -zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, - int checksum) -{ - ASSERT(vd->vdev_children == 0); - - ASSERT(size <= SPA_MAXBLOCKSIZE); - ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); - ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); - - ASSERT(offset + size <= VDEV_LABEL_START_SIZE || - offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); - ASSERT3U(offset + size, <=, vd->vdev_psize); - - BP_ZERO(bp); - - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - - BP_SET_CHECKSUM(bp, checksum); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - if (checksum != ZIO_CHECKSUM_OFF) - ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); -} - zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, - int priority, int flags) + int priority, int flags, boolean_t labels) { zio_t *zio; - blkptr_t blk; - zio_phys_bp_init(vd, &blk, offset, size, checksum); + ASSERT(vd->vdev_children == 0); + ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT3U(offset + size, <=, vd->vdev_psize); - zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, - ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, + ZIO_TYPE_READ, priority, flags, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); - zio->io_vd = vd; - zio->io_offset = offset; - - /* - * Work off our copy of the bp so the caller can free it. - */ - zio->io_bp = &zio->io_bp_copy; + zio->io_prop.zp_checksum = checksum; return (zio); } @@ -690,53 +679,49 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, - int priority, int flags) + int priority, int flags, boolean_t labels) { - zio_block_tail_t *zbt; - void *wbuf; zio_t *zio; - blkptr_t blk; - zio_phys_bp_init(vd, &blk, offset, size, checksum); + ASSERT(vd->vdev_children == 0); + ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT3U(offset + size, <=, vd->vdev_psize); - zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, + ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); - zio->io_vd = vd; - zio->io_offset = offset; - - zio->io_bp = &zio->io_bp_copy; - zio->io_checksum = checksum; + zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_zbt) { /* * zbt checksums are necessarily destructive -- they modify - * one word of the write buffer to hold the verifier/checksum. + * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is - * being written to multiple places. + * being written to multiple places in parallel. 
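+ * (The copy is pushed as a transform with no transform callback, so + * zio_pop_transforms() will free it via zio_buf_free() when the I/O + * completes.)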
*/ - wbuf = zio_buf_alloc(size); + void *wbuf = zio_buf_alloc(size); bcopy(data, wbuf, size); - zio_push_transform(zio, wbuf, size, size); - - zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; - zbt->zbt_cksum = blk.blk_cksum; + zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* - * Create a child I/O to do some work for us. It has no associated bp. + * Create a child I/O to do some work for us. */ zio_t * -zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, +zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, int priority, int flags, zio_done_func_t *done, void *private) { uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; - zio_t *cio; + zio_t *zio; + + ASSERT(vd->vdev_parent == + (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); if (type == ZIO_TYPE_READ && bp != NULL) { /* @@ -746,517 +731,754 @@ zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, * eliminates redundant checksums in the interior nodes. */ pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; - zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); + pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); } - cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, + if (vd->vdev_children == 0) + offset += VDEV_LABEL_START_SIZE; + + zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, done, private, type, priority, - (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, + (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) | + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags, + vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START - 1, pipeline); - cio->io_vd = vd; - cio->io_offset = offset; - - return (cio); + return (zio); } -/* - * ========================================================================== - * Initiate I/O, either sync or async - * ========================================================================== - */ -int -zio_wait(zio_t *zio) +zio_t * +zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, + int type, int priority, int flags, zio_done_func_t *done, void *private) { - int error; - - ASSERT(zio->io_stage == ZIO_STAGE_OPEN); - - zio->io_waiter = curthread; + zio_t *zio; - zio_next_stage_async(zio); + ASSERT(vd->vdev_ops->vdev_op_leaf); - mutex_enter(&zio->io_lock); - while (zio->io_stalled != ZIO_STAGE_DONE) - cv_wait(&zio->io_cv, &zio->io_lock); - mutex_exit(&zio->io_lock); + zio = zio_create(NULL, vd->vdev_spa, 0, NULL, + data, size, done, private, type, priority, + flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, + vd, offset, NULL, + ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE); - error = zio->io_error; - cv_destroy(&zio->io_cv); - mutex_destroy(&zio->io_lock); - kmem_cache_free(zio_cache, zio); - - return (error); + return (zio); } void -zio_nowait(zio_t *zio) +zio_flush(zio_t *zio, vdev_t *vd) { - zio_next_stage_async(zio); + zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, + NULL, NULL, ZIO_PRIORITY_NOW, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } /* * ========================================================================== - * I/O pipeline interlocks: parent/child dependency scoreboarding + * Prepare to read and write logical blocks * ========================================================================== */ -static void -zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) + +static int +zio_read_bp_init(zio_t *zio) { - mutex_enter(&zio->io_lock); - if 
(*countp == 0) { - ASSERT(zio->io_stalled == 0); - mutex_exit(&zio->io_lock); - zio_next_stage(zio); - } else { - zio->io_stalled = stage; - mutex_exit(&zio->io_lock); + blkptr_t *bp = zio->io_bp; + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) { + uint64_t csize = BP_GET_PSIZE(bp); + void *cbuf = zio_buf_alloc(csize); + + zio_push_transform(zio, cbuf, csize, csize, zio_decompress); } + + if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) + zio->io_flags |= ZIO_FLAG_DONT_CACHE; + + return (ZIO_PIPELINE_CONTINUE); } -static void -zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) +static int +zio_write_bp_init(zio_t *zio) { - zio_t *pio = zio->io_parent; + zio_prop_t *zp = &zio->io_prop; + int compress = zp->zp_compress; + blkptr_t *bp = zio->io_bp; + void *cbuf; + uint64_t lsize = zio->io_size; + uint64_t csize = lsize; + uint64_t cbufsize = 0; + int pass = 1; - mutex_enter(&pio->io_lock); - if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) - pio->io_error = zio->io_error; - if (--*countp == 0 && pio->io_stalled == stage) { - pio->io_stalled = 0; - mutex_exit(&pio->io_lock); - zio_next_stage_async(pio); + /* + * If our children haven't all reached the ready stage, + * wait for them and then repeat this pipeline stage. + */ + if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || + zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) + return (ZIO_PIPELINE_STOP); + + if (!IO_IS_ALLOCATING(zio)) + return (ZIO_PIPELINE_CONTINUE); + + ASSERT(compress != ZIO_COMPRESS_INHERIT); + + if (bp->blk_birth == zio->io_txg) { + /* + * We're rewriting an existing block, which means we're + * working on behalf of spa_sync(). For spa_sync() to + * converge, it must eventually be the case that we don't + * have to allocate new blocks. But compression changes + * the blocksize, which forces a reallocate, and makes + * convergence take longer. Therefore, after the first + * few passes, stop compressing to ensure convergence. + */ + pass = spa_sync_pass(zio->io_spa); + ASSERT(pass > 1); + + if (pass > SYNC_PASS_DONT_COMPRESS) + compress = ZIO_COMPRESS_OFF; + + /* + * Only MOS (objset 0) data should need to be rewritten. + */ + ASSERT(zio->io_logical->io_bookmark.zb_objset == 0); + + /* Make sure someone doesn't change their mind on overwrites */ + ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp), + spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp)); + } + + if (compress != ZIO_COMPRESS_OFF) { + if (!zio_compress_data(compress, zio->io_data, zio->io_size, + &cbuf, &csize, &cbufsize)) { + compress = ZIO_COMPRESS_OFF; + } else if (csize != 0) { + zio_push_transform(zio, cbuf, csize, cbufsize, NULL); + } + } + + /* + * The final pass of spa_sync() must be all rewrites, but the first + * few passes offer a trade-off: allocating blocks defers convergence, + * but newly allocated blocks are sequential, so they can be written + * to disk faster. Therefore, we allow the first few passes of + * spa_sync() to allocate new blocks, but force rewrites after that. + * There should only be a handful of blocks after pass 1 in any case. 
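+ * + * Concretely, with the tunables defined above: SYNC_PASS_REWRITE == 1 + * means a block whose compressed size is unchanged is rewritten in + * place from pass 2 onward, and SYNC_PASS_DONT_COMPRESS == 4 disables + * compression from pass 5 onward, so block sizes stop changing.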
+ */ + if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && + pass > SYNC_PASS_REWRITE) { + ASSERT(csize != 0); + uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; + zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; + zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { - mutex_exit(&pio->io_lock); + BP_ZERO(bp); + zio->io_pipeline = ZIO_WRITE_PIPELINE; + } + + if (csize == 0) { + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + } else { + ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); + BP_SET_LSIZE(bp, lsize); + BP_SET_PSIZE(bp, csize); + BP_SET_COMPRESS(bp, compress); + BP_SET_CHECKSUM(bp, zp->zp_checksum); + BP_SET_TYPE(bp, zp->zp_type); + BP_SET_LEVEL(bp, zp->zp_level); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); } + + return (ZIO_PIPELINE_CONTINUE); } +/* + * ========================================================================== + * Execute the I/O pipeline + * ========================================================================== + */ + static void -zio_wait_children_ready(zio_t *zio) +zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) { - zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, - &zio->io_children_notready); -} + zio_type_t t = zio->io_type; -void -zio_wait_children_done(zio_t *zio) -{ - zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, - &zio->io_children_notdone); + /* + * If we're a config writer, the normal issue and interrupt threads + * may all be blocked waiting for the config lock. In this case, + * select the otherwise-unused taskq for ZIO_TYPE_NULL. + */ + if (zio->io_flags & ZIO_FLAG_CONFIG_WRITER) + t = ZIO_TYPE_NULL; + + /* + * A similar issue exists for the L2ARC write thread until L2ARC 2.0. + */ + if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) + t = ZIO_TYPE_NULL; + + (void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q], + (task_func_t *)zio_execute, zio, TQ_SLEEP); } -static void -zio_ready(zio_t *zio) +static boolean_t +zio_taskq_member(zio_t *zio, enum zio_taskq_type q) { - zio_t *pio = zio->io_parent; + kthread_t *executor = zio->io_executor; + spa_t *spa = zio->io_spa; - if (zio->io_ready) - zio->io_ready(zio); + for (zio_type_t t = 0; t < ZIO_TYPES; t++) + if (taskq_member(spa->spa_zio_taskq[t][q], executor)) + return (B_TRUE); - if (pio != NULL) - zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, - &pio->io_children_notready); + return (B_FALSE); +} - if (zio->io_bp) - zio->io_bp_copy = *zio->io_bp; +static int +zio_issue_async(zio_t *zio) +{ + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); - zio_next_stage(zio); + return (ZIO_PIPELINE_STOP); } -static void -zio_done(zio_t *zio) +void +zio_interrupt(zio_t *zio) { - zio_t *pio = zio->io_parent; - spa_t *spa = zio->io_spa; - blkptr_t *bp = zio->io_bp; - vdev_t *vd = zio->io_vd; + zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT); +} - ASSERT(zio->io_children_notready == 0); - ASSERT(zio->io_children_notdone == 0); +/* + * Execute the I/O pipeline until one of the following occurs: + * (1) the I/O completes; (2) the pipeline stalls waiting for + * dependent child I/Os; (3) the I/O issues, so we're waiting + * for an I/O completion interrupt; (4) the I/O is delegated by + * vdev-level caching or aggregation; (5) the I/O is deferred + * due to vdev-level queueing; (6) the I/O is handed off to + * another thread. In all cases, the pipeline stops whenever + * there's no CPU work; it never burns a thread in cv_wait(). 
+ * + * There's no locking on io_stage because there's no legitimate way + * for multiple threads to be attempting to process the same I/O. + */ +static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES]; - if (bp != NULL) { - ASSERT(bp->blk_pad[0] == 0); - ASSERT(bp->blk_pad[1] == 0); - ASSERT(bp->blk_pad[2] == 0); - ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); - if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && - !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { - ASSERT(!BP_SHOULD_BYTESWAP(bp)); - if (zio->io_ndvas != 0) - ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); - ASSERT(BP_COUNT_GANG(bp) == 0 || - (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); - } - } +void +zio_execute(zio_t *zio) +{ + zio->io_executor = curthread; - if (vd != NULL) - vdev_stat_update(zio); + while (zio->io_stage < ZIO_STAGE_DONE) { + uint32_t pipeline = zio->io_pipeline; + zio_stage_t stage = zio->io_stage; + int rv; - if (zio->io_error) { - /* - * If this I/O is attached to a particular vdev, - * generate an error message describing the I/O failure - * at the block level. We ignore these errors if the - * device is currently unavailable. - */ - if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) - zfs_ereport_post(FM_EREPORT_ZFS_IO, - zio->io_spa, vd, zio, 0, 0); + ASSERT(!MUTEX_HELD(&zio->io_lock)); - if ((zio->io_error == EIO || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && - zio->io_logical == zio) { - /* - * For root I/O requests, tell the SPA to log the error - * appropriately. Also, generate a logical data - * ereport. - */ - spa_log_error(zio->io_spa, zio); + while (((1U << ++stage) & pipeline) == 0) + continue; - zfs_ereport_post(FM_EREPORT_ZFS_DATA, - zio->io_spa, NULL, zio, 0, 0); - } + ASSERT(stage <= ZIO_STAGE_DONE); + ASSERT(zio->io_stall == NULL); /* - * For I/O requests that cannot fail, panic appropriately. + * If we are in interrupt context and this pipeline stage + * will grab a config lock that is held across I/O, + * issue async to avoid deadlock. */ - if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { - char *blkbuf; - - blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); - if (blkbuf) { - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, - bp ? bp : &zio->io_bp_copy); - } - panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " - "%d", zio->io_error == ECKSUM ? - "bad checksum" : "I/O failure", - zio_type_name[zio->io_type], - vdev_description(vd), - (u_longlong_t)zio->io_offset, - zio, blkbuf ? 
blkbuf : "", zio->io_error); + if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) && + zio->io_vd == NULL && + zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + return; } + + zio->io_stage = stage; + rv = zio_pipeline[stage](zio); + + if (rv == ZIO_PIPELINE_STOP) + return; + + ASSERT(rv == ZIO_PIPELINE_CONTINUE); } - zio_clear_transform_stack(zio); +} - if (zio->io_done) - zio->io_done(zio); +/* + * ========================================================================== + * Initiate I/O, either sync or async + * ========================================================================== + */ +int +zio_wait(zio_t *zio) +{ + int error; - ASSERT(zio->io_delegate_list == NULL); - ASSERT(zio->io_delegate_next == NULL); + ASSERT(zio->io_stage == ZIO_STAGE_OPEN); + ASSERT(zio->io_executor == NULL); - if (pio != NULL) { - zio_t *next, *prev; + zio->io_waiter = curthread; - mutex_enter(&pio->io_lock); - next = zio->io_sibling_next; - prev = zio->io_sibling_prev; - if (next != NULL) - next->io_sibling_prev = prev; - if (prev != NULL) - prev->io_sibling_next = next; - if (pio->io_child == zio) - pio->io_child = next; - mutex_exit(&pio->io_lock); + zio_execute(zio); - zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, - &pio->io_children_notdone); - } + mutex_enter(&zio->io_lock); + while (zio->io_executor != NULL) + cv_wait(&zio->io_cv, &zio->io_lock); + mutex_exit(&zio->io_lock); - /* - * Note: this I/O is now done, and will shortly be freed, so there is no - * need to clear this (or any other) flag. - */ - if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED) - spa_config_exit(spa, zio); + error = zio->io_error; + zio_destroy(zio); - if (zio->io_waiter != NULL) { - mutex_enter(&zio->io_lock); - ASSERT(zio->io_stage == ZIO_STAGE_DONE); - zio->io_stalled = zio->io_stage; - cv_broadcast(&zio->io_cv); - mutex_exit(&zio->io_lock); - } else { - cv_destroy(&zio->io_cv); - mutex_destroy(&zio->io_lock); - kmem_cache_free(zio_cache, zio); + return (error); +} + +void +zio_nowait(zio_t *zio) +{ + ASSERT(zio->io_executor == NULL); + + if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) { + /* + * This is a logical async I/O with no parent to wait for it. + * Attach it to the pool's global async root zio so that + * spa_unload() has a way of waiting for async I/O to finish. + */ + spa_t *spa = zio->io_spa; + zio->io_async_root = B_TRUE; + mutex_enter(&spa->spa_async_root_lock); + spa->spa_async_root_count++; + mutex_exit(&spa->spa_async_root_lock); } + + zio_execute(zio); } /* * ========================================================================== - * Compression support + * Reexecute or suspend/resume failed I/O * ========================================================================== */ + static void -zio_write_compress(zio_t *zio) +zio_reexecute(zio_t *pio) { - int compress = zio->io_compress; - blkptr_t *bp = zio->io_bp; - void *cbuf; - uint64_t lsize = zio->io_size; - uint64_t csize = lsize; - uint64_t cbufsize = 0; - int pass; + zio_t *zio, *zio_next; - if (bp->blk_birth == zio->io_txg) { + pio->io_flags = pio->io_orig_flags; + pio->io_stage = pio->io_orig_stage; + pio->io_pipeline = pio->io_orig_pipeline; + pio->io_reexecute = 0; + pio->io_error = 0; + for (int c = 0; c < ZIO_CHILD_TYPES; c++) + pio->io_child_error[c] = 0; + + if (IO_IS_ALLOCATING(pio)) { /* - * We're rewriting an existing block, which means we're - * working on behalf of spa_sync(). 
For spa_sync() to - * converge, it must eventually be the case that we don't - * have to allocate new blocks. But compression changes - * the blocksize, which forces a reallocate, and makes - * convergence take longer. Therefore, after the first - * few passes, stop compressing to ensure convergence. + * Remember the failed bp so that the io_ready() callback + * can update its accounting upon reexecution. The block + * was already freed in zio_done(); we indicate this with + * a fill count of -1 so that zio_free() knows to skip it. */ - pass = spa_sync_pass(zio->io_spa); - if (pass > zio_sync_pass.zp_dontcompress) - compress = ZIO_COMPRESS_OFF; - } else { - ASSERT(BP_IS_HOLE(bp)); - pass = 1; + blkptr_t *bp = pio->io_bp; + ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg); + bp->blk_fill = BLK_FILL_ALREADY_FREED; + pio->io_bp_orig = *bp; + BP_ZERO(bp); } - if (compress != ZIO_COMPRESS_OFF) - if (!zio_compress_data(compress, zio->io_data, zio->io_size, - &cbuf, &csize, &cbufsize)) - compress = ZIO_COMPRESS_OFF; - - if (compress != ZIO_COMPRESS_OFF && csize != 0) - zio_push_transform(zio, cbuf, csize, cbufsize); + /* + * As we reexecute pio's children, new children could be created. + * New children go to the head of the io_child list, however, + * so we will (correctly) not reexecute them. The key is that + * the remainder of the io_child list, from 'zio_next' onward, + * cannot be affected by any side effects of reexecuting 'zio'. + */ + for (zio = pio->io_child; zio != NULL; zio = zio_next) { + zio_next = zio->io_sibling_next; + mutex_enter(&pio->io_lock); + pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; + pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; + mutex_exit(&pio->io_lock); + zio_reexecute(zio); + } /* - * The final pass of spa_sync() must be all rewrites, but the first - * few passes offer a trade-off: allocating blocks defers convergence, - * but newly allocated blocks are sequential, so they can be written - * to disk faster. Therefore, we allow the first few passes of - * spa_sync() to reallocate new blocks, but force rewrites after that. - * There should only be a handful of blocks after pass 1 in any case. + * Now that all children have been reexecuted, execute the parent. 
*/ - if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && - pass > zio_sync_pass.zp_rewrite) { - ASSERT(csize != 0); - BP_SET_LSIZE(bp, lsize); - BP_SET_COMPRESS(bp, compress); - zio->io_pipeline = ZIO_REWRITE_PIPELINE; - } else { - if (bp->blk_birth == zio->io_txg) - BP_ZERO(bp); - if (csize == 0) { - BP_ZERO(bp); - zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; - } else { - ASSERT3U(BP_GET_NDVAS(bp), ==, 0); - BP_SET_LSIZE(bp, lsize); - BP_SET_PSIZE(bp, csize); - BP_SET_COMPRESS(bp, compress); - zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE; - } + zio_execute(pio); +} + +void +zio_suspend(spa_t *spa, zio_t *zio) +{ + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) + fm_panic("Pool '%s' has encountered an uncorrectable I/O " + "failure and the failure mode property for this pool " + "is set to panic.", spa_name(spa)); + + zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); + + mutex_enter(&spa->spa_suspend_lock); + + if (spa->spa_suspend_zio_root == NULL) + spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0); + + spa->spa_suspended = B_TRUE; + + if (zio != NULL) { + ASSERT(zio != spa->spa_suspend_zio_root); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(zio->io_parent == NULL); + ASSERT(zio->io_stage == ZIO_STAGE_DONE); + zio_add_child(spa->spa_suspend_zio_root, zio); } - zio_next_stage(zio); + mutex_exit(&spa->spa_suspend_lock); } -static void -zio_read_decompress(zio_t *zio) +void +zio_resume(spa_t *spa) { - blkptr_t *bp = zio->io_bp; - void *data; - uint64_t size; - uint64_t bufsize; - int compress = BP_GET_COMPRESS(bp); + zio_t *pio, *zio; - ASSERT(compress != ZIO_COMPRESS_OFF); + /* + * Reexecute all previously suspended i/o. + */ + mutex_enter(&spa->spa_suspend_lock); + spa->spa_suspended = B_FALSE; + cv_broadcast(&spa->spa_suspend_cv); + pio = spa->spa_suspend_zio_root; + spa->spa_suspend_zio_root = NULL; + mutex_exit(&spa->spa_suspend_lock); + + if (pio == NULL) + return; - zio_pop_transform(zio, &data, &size, &bufsize); + while ((zio = pio->io_child) != NULL) { + zio_remove_child(pio, zio); + zio->io_parent = NULL; + zio_reexecute(zio); + } - if (zio_decompress_data(compress, data, size, - zio->io_data, zio->io_size)) - zio->io_error = EIO; + ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0); - zio_buf_free(data, bufsize); + (void) zio_wait(pio); +} - zio_next_stage(zio); +void +zio_resume_wait(spa_t *spa) +{ + mutex_enter(&spa->spa_suspend_lock); + while (spa_suspended(spa)) + cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); + mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== - * Gang block support + * Gang blocks. + * + * A gang block is a collection of small blocks that looks to the DMU + * like one large block. When zio_dva_allocate() cannot find a block + * of the requested size, due to either severe fragmentation or the pool + * being nearly full, it calls zio_write_gang_block() to construct the + * block from smaller fragments. + * + * A gang block consists of a gang header (zio_gbh_phys_t) and up to + * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like + * an indirect block: it's an array of block pointers. It consumes + * only one sector and hence is allocatable regardless of fragmentation. + * The gang header's bps point to its gang members, which hold the data. + * + * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> + * as the verifier to ensure uniqueness of the SHA256 checksum. 
+ * Critically, the gang block bp's blk_cksum is the checksum of the data, + * not the gang header. This ensures that data block signatures (needed for + * deduplication) are independent of how the block is physically stored. + * + * Gang blocks can be nested: a gang member may itself be a gang block. + * Thus every gang block is a tree in which root and all interior nodes are + * gang headers, and the leaves are normal blocks that contain user data. + * The root of the gang tree is called the gang leader. + * + * To perform any operation (read, rewrite, free, claim) on a gang block, + * zio_gang_assemble() first assembles the gang tree (minus data leaves) + * in the io_gang_tree field of the original logical i/o by recursively + * reading the gang leader and all gang headers below it. This yields + * an in-core tree containing the contents of every gang header and the + * bps for every constituent of the gang block. + * + * With the gang tree now assembled, zio_gang_issue() just walks the gang tree + * and invokes a callback on each bp. To free a gang block, zio_gang_issue() + * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. + * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). + * zio_read_gang() is a wrapper around zio_read() that omits reading gang + * headers, since we already have those in io_gang_tree. zio_rewrite_gang() + * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() + * of the gang header plus zio_checksum_compute() of the data to update the + * gang header's blk_cksum as described above. + * + * The two-phase assemble/issue model solves the problem of partial failure -- + * what if you'd freed part of a gang block but then couldn't read the + * gang header for another part? Assembling the entire gang tree first + * ensures that all the necessary gang header I/O has succeeded before + * starting the actual work of free, claim, or write. Once the gang tree + * is assembled, free and claim are in-memory operations that cannot fail. + * + * In the event that a gang write fails, zio_dva_unallocate() walks the + * gang tree to immediately free (i.e. insert back into the space map) + * everything we've allocated. This ensures that we don't get ENOSPC + * errors during repeated suspend/resume cycles due to a flaky device. + * + * Gang rewrites only happen during sync-to-convergence. If we can't assemble + * the gang tree, we won't modify the block, so we can safely defer the free + * (knowing that the block is still intact). If we *can* assemble the gang + * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free + * each constituent bp and we can allocate a new block on the next sync pass. + * + * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ -static void -zio_gang_pipeline(zio_t *zio) + +static zio_t * +zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { - /* - * By default, the pipeline assumes that we're dealing with a gang - * block. If we're not, strip out any gang-specific stages. 
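The two-phase model described above can be shown in miniature. A hedged sketch, with invented names and an in-memory tree standing in for gang headers on disk: phase one walks the whole tree and fails without side effects if anything is unreadable; phase two, which runs only after phase one fully succeeded, applies the operation and therefore can never fail partway through.

	#define	NKIDS	3	/* mirrors SPA_GBH_NBLKPTRS */

	struct gang_node {
		int			unreadable;	/* header read failure */
		struct gang_node	*kid[NKIDS];
	};

	/* Phase 1: assemble -- prove every header is readable. */
	static int
	assemble(struct gang_node *gn)
	{
		if (gn == NULL)
			return (0);
		if (gn->unreadable)
			return (-1);	/* abort before any work is done */
		for (int g = 0; g < NKIDS; g++)
			if (assemble(gn->kid[g]) != 0)
				return (-1);
		return (0);
	}

	/* Phase 2: issue -- an in-memory walk that cannot fail midway. */
	static void
	issue(struct gang_node *gn, void (*func)(struct gang_node *))
	{
		if (gn == NULL)
			return;
		func(gn);
		for (int g = 0; g < NKIDS; g++)
			issue(gn->kid[g], func);
	}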
- */ - if (!BP_IS_GANG(zio->io_bp)) - zio->io_pipeline &= ~ZIO_GANG_STAGES; + if (gn != NULL) + return (pio); - zio_next_stage(zio); + return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), + NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark)); } -static void -zio_gang_byteswap(zio_t *zio) +zio_t * +zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { - ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); + zio_t *zio; - if (BP_SHOULD_BYTESWAP(zio->io_bp)) - byteswap_uint64_array(zio->io_data, zio->io_size); + if (gn != NULL) { + zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, + gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + /* + * As we rewrite each gang header, the pipeline will compute + * a new gang block header checksum for it; but no one will + * compute a new data checksum, so we do that here. The one + * exception is the gang leader: the pipeline already computed + * its data checksum because that stage precedes gang assembly. + * (Presently, nothing actually uses interior data checksums; + * this is just good hygiene.) + */ + if (gn != pio->io_logical->io_gang_tree) { + zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), + data, BP_GET_PSIZE(bp)); + } + } else { + zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, + data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + } + + return (zio); } -static void -zio_get_gang_header(zio_t *zio) +/* ARGSUSED */ +zio_t * +zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { - blkptr_t *bp = zio->io_bp; - uint64_t gsize = SPA_GANGBLOCKSIZE; - void *gbuf = zio_buf_alloc(gsize); + return (zio_free(pio, pio->io_spa, pio->io_txg, bp, + NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); +} - ASSERT(BP_IS_GANG(bp)); +/* ARGSUSED */ +zio_t * +zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +{ + return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, + NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); +} - zio_push_transform(zio, gbuf, gsize, gsize); +static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { + NULL, + zio_read_gang, + zio_rewrite_gang, + zio_free_gang, + zio_claim_gang, + NULL +}; - zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, - NULL, NULL, ZIO_TYPE_READ, zio->io_priority, - zio->io_flags & ZIO_FLAG_GANG_INHERIT, - ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); +static void zio_gang_tree_assemble_done(zio_t *zio); - zio_wait_children_done(zio); +static zio_gang_node_t * +zio_gang_node_alloc(zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn; + + ASSERT(*gnpp == NULL); + + gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); + gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); + *gnpp = gn; + + return (gn); } static void -zio_read_gang_members(zio_t *zio) +zio_gang_node_free(zio_gang_node_t **gnpp) { - zio_gbh_phys_t *gbh; - uint64_t gsize, gbufsize, loff, lsize; - int i; + zio_gang_node_t *gn = *gnpp; - ASSERT(BP_IS_GANG(zio->io_bp)); + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + ASSERT(gn->gn_child[g] == NULL); - zio_gang_byteswap(zio); - zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); + zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); + kmem_free(gn, sizeof (*gn)); + *gnpp = NULL; +} - for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; - lsize = BP_GET_PSIZE(gbp); +static void +zio_gang_tree_free(zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn = *gnpp; - 
ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); - ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); - ASSERT3U(loff + lsize, <=, zio->io_size); - ASSERT(i < SPA_GBH_NBLKPTRS); - ASSERT(!BP_IS_HOLE(gbp)); + if (gn == NULL) + return; - zio_nowait(zio_read(zio, zio->io_spa, gbp, - (char *)zio->io_data + loff, lsize, NULL, NULL, - zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, - &zio->io_bookmark)); - } + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + zio_gang_tree_free(&gn->gn_child[g]); - zio_buf_free(gbh, gbufsize); - zio_wait_children_done(zio); + zio_gang_node_free(gnpp); } static void -zio_rewrite_gang_members(zio_t *zio) +zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp) { - zio_gbh_phys_t *gbh; - uint64_t gsize, gbufsize, loff, lsize; - int i; + zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); - ASSERT(BP_IS_GANG(zio->io_bp)); - ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); + ASSERT(lio->io_logical == lio); + ASSERT(BP_IS_GANG(bp)); + + zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh, + SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, + lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark)); +} - zio_gang_byteswap(zio); - zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); +static void +zio_gang_tree_assemble_done(zio_t *zio) +{ + zio_t *lio = zio->io_logical; + zio_gang_node_t *gn = zio->io_private; + blkptr_t *bp = zio->io_bp; - ASSERT(gsize == gbufsize); + ASSERT(zio->io_parent == lio); + ASSERT(zio->io_child == NULL); - for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; - lsize = BP_GET_PSIZE(gbp); + if (zio->io_error) + return; - ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); - ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); - ASSERT3U(loff + lsize, <=, zio->io_size); - ASSERT(i < SPA_GBH_NBLKPTRS); - ASSERT(!BP_IS_HOLE(gbp)); + if (BP_SHOULD_BYTESWAP(bp)) + byteswap_uint64_array(zio->io_data, zio->io_size); - zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, - zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, - NULL, NULL, zio->io_priority, zio->io_flags, - &zio->io_bookmark)); - } + ASSERT(zio->io_data == gn->gn_gbh); + ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); + ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); - zio_push_transform(zio, gbh, gsize, gbufsize); - zio_wait_children_ready(zio); + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + if (!BP_IS_GANG(gbp)) + continue; + zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]); + } } static void -zio_free_gang_members(zio_t *zio) +zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) { - zio_gbh_phys_t *gbh; - uint64_t gsize, gbufsize; - int i; + zio_t *lio = pio->io_logical; + zio_t *zio; - ASSERT(BP_IS_GANG(zio->io_bp)); + ASSERT(BP_IS_GANG(bp) == !!gn); + ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp)); + ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree); - zio_gang_byteswap(zio); - zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); + /* + * If you're a gang header, your data is in gn->gn_gbh. + * If you're a gang member, your data is in 'data' and gn == NULL. 
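The gang-tree walk that follows hands each leaf its slice of one contiguous buffer. A standalone sketch of that cursor arithmetic, with invented types (the real code advances by BP_GET_PSIZE(gbp) rather than a stored size field):

	#include <stdio.h>

	#define	NKIDS	3

	struct gnode {
		size_t		psize;		/* leaf: bytes of user data */
		struct gnode	*kid[NKIDS];	/* any non-NULL => interior */
	};

	static char *
	issue(struct gnode *gn, char *data)
	{
		int g, interior = 0;

		for (g = 0; g < NKIDS; g++)
			if (gn->kid[g] != NULL)
				interior = 1;

		if (!interior) {
			/* Leaf: operate on data[0 .. psize), then advance. */
			printf("leaf: %zu bytes at %p\n", gn->psize,
			    (void *)data);
			return (data + gn->psize);
		}

		for (g = 0; g < NKIDS; g++)
			if (gn->kid[g] != NULL)
				data = issue(gn->kid[g], data);
		return (data);
	}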
+ */ + zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data); - for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; + if (gn != NULL) { + ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); - if (BP_IS_HOLE(gbp)) - continue; - zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, - gbp, NULL, NULL)); + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + if (BP_IS_HOLE(gbp)) + continue; + zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); + data = (char *)data + BP_GET_PSIZE(gbp); + } } - zio_buf_free(gbh, gbufsize); - zio_next_stage(zio); + if (gn == lio->io_gang_tree) + ASSERT3P((char *)lio->io_data + lio->io_size, ==, data); + + if (zio != pio) + zio_nowait(zio); } -static void -zio_claim_gang_members(zio_t *zio) +static int +zio_gang_assemble(zio_t *zio) { - zio_gbh_phys_t *gbh; - uint64_t gsize, gbufsize; - int i; + blkptr_t *bp = zio->io_bp; - ASSERT(BP_IS_GANG(zio->io_bp)); + ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical); - zio_gang_byteswap(zio); - zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); + zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); - for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; - if (BP_IS_HOLE(gbp)) - continue; - zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, - gbp, NULL, NULL)); - } + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_gang_issue(zio_t *zio) +{ + zio_t *lio = zio->io_logical; + blkptr_t *bp = zio->io_bp; + + if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) + return (ZIO_PIPELINE_STOP); + + ASSERT(BP_IS_GANG(bp) && zio == lio); + + if (zio->io_child_error[ZIO_CHILD_GANG] == 0) + zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data); + else + zio_gang_tree_free(&lio->io_gang_tree); + + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - zio_buf_free(gbh, gbufsize); - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } static void -zio_write_allocate_gang_member_done(zio_t *zio) +zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio->io_parent; + zio_t *lio = zio->io_logical; dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; - int d; - ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); + if (BP_IS_HOLE(zio->io_bp)) + return; + + ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); + + ASSERT(zio->io_child_type == ZIO_CHILD_GANG); + ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas); + ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); + ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); - ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); - ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); - for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { + for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); @@ -1265,97 +1487,77 @@ zio_write_allocate_gang_member_done(zio_t *zio) mutex_exit(&pio->io_lock); } -static void -zio_write_allocate_gang_members(zio_t *zio) +static int +zio_write_gang_block(zio_t *pio) { - blkptr_t *bp = zio->io_bp; - dva_t *dva = bp->blk_dva; - spa_t *spa = zio->io_spa; + spa_t *spa = pio->io_spa; + blkptr_t *bp = pio->io_bp; + zio_t *lio = pio->io_logical; + zio_t *zio; + zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; - uint64_t txg = zio->io_txg; - uint64_t resid = zio->io_size; - uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, 
SPA_MINBLOCKSIZE); - uint64_t gsize, loff, lsize; - uint32_t gbps_left; - int ndvas = zio->io_ndvas; + uint64_t txg = pio->io_txg; + uint64_t resid = pio->io_size; + uint64_t lsize; + int ndvas = lio->io_prop.zp_ndvas; int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); + zio_prop_t zp; int error; - int i, d; - - gsize = SPA_GANGBLOCKSIZE; - gbps_left = SPA_GBH_NBLKPTRS; - - error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE); - if (error == ENOSPC) - panic("can't allocate gang block header"); - ASSERT(error == 0); - - for (d = 0; d < gbh_ndvas; d++) - DVA_SET_GANG(&dva[d], 1); - - bp->blk_birth = txg; - - gbh = zio_buf_alloc(gsize); - bzero(gbh, gsize); - /* We need to test multi-level gang blocks */ - if (maxalloc >= zio_gang_bang && (LBOLT & 0x1) == 0) - maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE); + error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE, + bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp, + METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); + if (error) { + pio->io_error = error; + return (ZIO_PIPELINE_CONTINUE); + } - for (loff = 0, i = 0; loff != zio->io_size; - loff += lsize, resid -= lsize, gbps_left--, i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; - dva = gbp->blk_dva; + if (pio == lio) { + gnpp = &lio->io_gang_tree; + } else { + gnpp = pio->io_private; + ASSERT(pio->io_ready == zio_write_gang_member_ready); + } - ASSERT(gbps_left != 0); - maxalloc = MIN(maxalloc, resid); + gn = zio_gang_node_alloc(gnpp); + gbh = gn->gn_gbh; + bzero(gbh, SPA_GANGBLOCKSIZE); - while (resid <= maxalloc * gbps_left) { - error = metaslab_alloc(spa, maxalloc, gbp, ndvas, - txg, bp, B_FALSE); - if (error == 0) - break; - ASSERT3U(error, ==, ENOSPC); - if (maxalloc == SPA_MINBLOCKSIZE) - panic("really out of space"); - maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); - } + /* + * Create the gang header. + */ + zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); - if (resid <= maxalloc * gbps_left) { - lsize = maxalloc; - BP_SET_LSIZE(gbp, lsize); - BP_SET_PSIZE(gbp, lsize); - BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); - gbp->blk_birth = txg; - zio_nowait(zio_rewrite(zio, spa, - zio->io_checksum, txg, gbp, - (char *)zio->io_data + loff, lsize, - zio_write_allocate_gang_member_done, NULL, - zio->io_priority, zio->io_flags, - &zio->io_bookmark)); - } else { - lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); - ASSERT(lsize != SPA_MINBLOCKSIZE); - zio_nowait(zio_write_allocate(zio, spa, - zio->io_checksum, txg, gbp, - (char *)zio->io_data + loff, lsize, - zio_write_allocate_gang_member_done, NULL, - zio->io_priority, zio->io_flags)); - } + /* + * Create and nowait the gang children. 
+ */ + for (int g = 0; resid != 0; resid -= lsize, g++) { + lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), + SPA_MINBLOCKSIZE); + ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); + + zp.zp_checksum = lio->io_prop.zp_checksum; + zp.zp_compress = ZIO_COMPRESS_OFF; + zp.zp_type = DMU_OT_NONE; + zp.zp_level = 0; + zp.zp_ndvas = lio->io_prop.zp_ndvas; + + zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], + (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, + zio_write_gang_member_ready, NULL, &gn->gn_child[g], + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark)); } - ASSERT(resid == 0 && loff == zio->io_size); - - zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; - - zio_push_transform(zio, gbh, gsize, gsize); /* - * As much as we'd like this to be zio_wait_children_ready(), - * updating our ASIZE doesn't happen until the io_done callback, - * so we have to wait for that to finish in order for our BP - * to be stable. + * Set pio's pipeline to just wait for zio to finish. */ - zio_wait_children_done(zio); + pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + zio_nowait(zio); + + return (ZIO_PIPELINE_CONTINUE); } /* @@ -1363,59 +1565,139 @@ zio_write_allocate_gang_members(zio_t *zio) * Allocate and free blocks * ========================================================================== */ -static void + +static int zio_dva_allocate(zio_t *zio) { + spa_t *spa = zio->io_spa; + metaslab_class_t *mc = spa->spa_normal_class; blkptr_t *bp = zio->io_bp; int error; ASSERT(BP_IS_HOLE(bp)); ASSERT3U(BP_GET_NDVAS(bp), ==, 0); - ASSERT3U(zio->io_ndvas, >, 0); - ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa)); - - /* For testing, make some blocks above a certain size be gang blocks */ - if (zio->io_size >= zio_gang_bang && (LBOLT & 0x3) == 0) { - zio_write_allocate_gang_members(zio); - return; - } - + ASSERT3U(zio->io_prop.zp_ndvas, >, 0); + ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas, - zio->io_txg, NULL, B_FALSE); + error = metaslab_alloc(spa, mc, zio->io_size, bp, + zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0); - if (error == 0) { - bp->blk_birth = zio->io_txg; - } else if (error == ENOSPC) { - if (zio->io_size == SPA_MINBLOCKSIZE) - panic("really, truly out of space"); - zio_write_allocate_gang_members(zio); - return; - } else { + if (error) { + if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) + return (zio_write_gang_block(zio)); zio->io_error = error; } - zio_next_stage(zio); + + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_dva_free(zio_t *zio) { - blkptr_t *bp = zio->io_bp; + metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); - metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); + return (ZIO_PIPELINE_CONTINUE); +} - BP_ZERO(bp); +static int +zio_dva_claim(zio_t *zio) +{ + int error; - zio_next_stage(zio); + error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); + if (error) + zio->io_error = error; + + return (ZIO_PIPELINE_CONTINUE); } +/* + * Undo an allocation. This is used by zio_done() when an I/O fails + * and we want to give back the block we just allocated. + * This handles both normal blocks and gang blocks. 
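Stepping back to the gang-children loop a few lines up: it divides whatever remains evenly among the block pointers still available, rounding each piece up to SPA_MINBLOCKSIZE. A small worked example (userland, illustrative; 512 is SPA_MINBLOCKSIZE, 3 is SPA_GBH_NBLKPTRS, and P2ROUNDUP is the usual power-of-two round-up):

	#include <stdio.h>
	#include <stdint.h>

	#define	MINBS		512ULL
	#define	NBLKPTRS	3
	#define	P2ROUNDUP(x, a)	(-(-(x) & -(a)))

	int
	main(void)
	{
		uint64_t resid = 100 * 1024;	/* a 100K write being ganged */
		uint64_t lsize;

		for (int g = 0; resid != 0; resid -= lsize, g++) {
			lsize = P2ROUNDUP(resid / (NBLKPTRS - g), MINBS);
			/* prints 34304, 34304, 33792 for this input */
			printf("member %d: %llu bytes\n", g,
			    (unsigned long long)lsize);
		}
		return (0);
	}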
+ */ static void -zio_dva_claim(zio_t *zio) +zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { - zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); + spa_t *spa = zio->io_spa; + boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE); + + ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); + + if (zio->io_bp == bp && !now) { + /* + * This is a rewrite for sync-to-convergence. + * We can't do a metaslab_free(NOW) because bp wasn't allocated + * during this sync pass, which means that metaslab_sync() + * already committed the allocation. + */ + ASSERT(DVA_EQUAL(BP_IDENTITY(bp), + BP_IDENTITY(&zio->io_bp_orig))); + ASSERT(spa_sync_pass(spa) > 1); - zio_next_stage(zio); + if (BP_IS_GANG(bp) && gn == NULL) { + /* + * This is a gang leader whose gang header(s) we + * couldn't read now, so defer the free until later. + * The block should still be intact because without + * the headers, we'd never even start the rewrite. + */ + bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); + return; + } + } + + if (!BP_IS_HOLE(bp)) + metaslab_free(spa, bp, bp->blk_birth, now); + + if (gn != NULL) { + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + zio_dva_unallocate(zio, gn->gn_child[g], + &gn->gn_gbh->zg_blkptr[g]); + } + } +} + +/* + * Try to allocate an intent log block. Return 0 on success, errno on failure. + */ +int +zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, + uint64_t txg) +{ + int error; + + error = metaslab_alloc(spa, spa->spa_log_class, size, + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); + + if (error) + error = metaslab_alloc(spa, spa->spa_normal_class, size, + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); + + if (error == 0) { + BP_SET_LSIZE(new_bp, size); + BP_SET_PSIZE(new_bp, size); + BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); + BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); + BP_SET_LEVEL(new_bp, 0); + BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); + } + + return (error); +} + +/* + * Free an intent log block. We know it can't be a gang block, so there's + * nothing to do except metaslab_free() it. + */ +void +zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) +{ + ASSERT(!BP_IS_GANG(bp)); + + metaslab_free(spa, bp, txg, B_FALSE); } /* @@ -1425,150 +1707,223 @@ zio_dva_claim(zio_t *zio) */ static void -zio_vdev_io_start(zio_t *zio) +zio_vdev_io_probe_done(zio_t *zio) +{ + zio_t *dio; + vdev_t *vd = zio->io_private; + + mutex_enter(&vd->vdev_probe_lock); + ASSERT(vd->vdev_probe_zio == zio); + vd->vdev_probe_zio = NULL; + mutex_exit(&vd->vdev_probe_lock); + + while ((dio = zio->io_delegate_list) != NULL) { + zio->io_delegate_list = dio->io_delegate_next; + dio->io_delegate_next = NULL; + if (!vdev_accessible(vd, dio)) + dio->io_error = ENXIO; + zio_execute(dio); + } +} + +/* + * Probe the device to determine whether I/O failure is specific to this + * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged). + */ +static int +zio_vdev_io_probe(zio_t *zio) { vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd ? vd->vdev_top : NULL; - blkptr_t *bp = zio->io_bp; - uint64_t align; + zio_t *pio = NULL; + boolean_t created_pio = B_FALSE; - if (vd == NULL) { - /* The mirror_ops handle multiple DVAs in a single BP */ - vdev_mirror_ops.vdev_op_io_start(zio); - return; + /* + * Don't probe the probe. + */ + if (zio->io_flags & ZIO_FLAG_PROBE) + return (ZIO_PIPELINE_CONTINUE); + + /* + * To prevent 'probe storms' when a device fails, we create + * just one probe i/o at a time. 
All zios that want to probe + * this vdev will join the probe zio's io_delegate_list. + */ + mutex_enter(&vd->vdev_probe_lock); + + if ((pio = vd->vdev_probe_zio) == NULL) { + vd->vdev_probe_zio = pio = zio_root(zio->io_spa, + zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL); + created_pio = B_TRUE; + vd->vdev_probe_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_PROBE); } - align = 1ULL << tvd->vdev_ashift; + zio->io_delegate_next = pio->io_delegate_list; + pio->io_delegate_list = zio; + + mutex_exit(&vd->vdev_probe_lock); - if (zio->io_retries == 0 && vd == tvd) - zio->io_flags |= ZIO_FLAG_FAILFAST; + if (created_pio) { + zio_nowait(vdev_probe(vd, pio)); + zio_nowait(pio); + } + + return (ZIO_PIPELINE_STOP); +} + +static int +zio_vdev_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t align; + spa_t *spa = zio->io_spa; + + ASSERT(zio->io_error == 0); + ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); + + if (vd == NULL) { + if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) + spa_config_enter(spa, SCL_ZIO, zio, RW_READER); - if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && - vd->vdev_children == 0) { - zio->io_flags |= ZIO_FLAG_PHYSICAL; - zio->io_offset += VDEV_LABEL_START_SIZE; + /* + * The mirror_ops handle multiple DVAs in a single BP. + */ + return (vdev_mirror_ops.vdev_op_io_start(zio)); } + align = 1ULL << vd->vdev_top->vdev_ashift; + if (P2PHASE(zio->io_size, align) != 0) { uint64_t asize = P2ROUNDUP(zio->io_size, align); char *abuf = zio_buf_alloc(asize); - ASSERT(vd == tvd); + ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { bcopy(zio->io_data, abuf, zio->io_size); bzero(abuf + zio->io_size, asize - zio->io_size); } - zio_push_transform(zio, abuf, asize, asize); - ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); - zio->io_flags |= ZIO_FLAG_SUBBLOCK; + zio_push_transform(zio, abuf, asize, asize, zio_subblock); } ASSERT(P2PHASE(zio->io_offset, align) == 0); ASSERT(P2PHASE(zio->io_size, align) == 0); - ASSERT(bp == NULL || - P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); - vdev_io_start(zio); + if (vd->vdev_ops->vdev_op_leaf && + (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { - /* zio_next_stage_async() gets called from io completion interrupt */ -} + if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) + return (ZIO_PIPELINE_STOP); -static void -zio_vdev_io_done(zio_t *zio) -{ - if (zio->io_vd == NULL) - /* The mirror_ops handle multiple DVAs in a single BP */ - vdev_mirror_ops.vdev_op_io_done(zio); - else - vdev_io_done(zio); + if ((zio = vdev_queue_io(zio)) == NULL) + return (ZIO_PIPELINE_STOP); + + if (!vdev_accessible(vd, zio)) { + zio->io_error = ENXIO; + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); + } + + } + + return (vd->vdev_ops->vdev_op_io_start(zio)); } -/* XXPOLICY */ -boolean_t -zio_should_retry(zio_t *zio) +static int +zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; + vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; + boolean_t unexpected_error = B_FALSE; - if (zio->io_error == 0) - return (B_FALSE); - if (zio->io_delegate_list != NULL) - return (B_FALSE); - if (vd && vd != vd->vdev_top) - return (B_FALSE); - if (zio->io_flags & ZIO_FLAG_DONT_RETRY) - return (B_FALSE); - if (zio->io_retries > 0) - return (B_FALSE); + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) + return (ZIO_PIPELINE_STOP); - return (B_TRUE); + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + + if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { + + vdev_queue_io_done(zio); + + if (zio->io_type == ZIO_TYPE_WRITE) + vdev_cache_write(zio); + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injection(vd, EIO); + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_label_injection(zio, EIO); + + if (zio->io_error) { + if (!vdev_accessible(vd, zio)) { + zio->io_error = ENXIO; + } else { + unexpected_error = B_TRUE; + } + } + } + + ops->vdev_op_io_done(zio); + + if (unexpected_error) + return (zio_vdev_io_probe(zio)); + + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd ? vd->vdev_top : NULL; - - ASSERT(zio->io_vsd == NULL); - - if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { - void *abuf; - uint64_t asize; - ASSERT(vd == tvd); - zio_pop_transform(zio, &abuf, &asize, &asize); - if (zio->io_type == ZIO_TYPE_READ) - bcopy(abuf, zio->io_data, zio->io_size); - zio_buf_free(abuf, asize); - zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; + + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) + return (ZIO_PIPELINE_STOP); + + if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) + spa_config_exit(zio->io_spa, SCL_ZIO, zio); + + if (zio->io_vsd != NULL) { + zio->io_vsd_free(zio); + zio->io_vsd = NULL; } - if (zio_injection_enabled && !zio->io_error) + if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); /* * If the I/O failed, determine whether we should attempt to retry it. */ - /* XXPOLICY */ - if (zio_should_retry(zio)) { - ASSERT(tvd == vd); - - zio->io_retries++; + if (zio->io_error && vd == NULL && + !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { + ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; - zio->io_flags &= ZIO_FLAG_VDEV_INHERIT | - ZIO_FLAG_CONFIG_GRABBED; - /* XXPOLICY */ - zio->io_flags &= ~ZIO_FLAG_FAILFAST; - zio->io_flags |= ZIO_FLAG_DONT_CACHE; + zio->io_flags |= ZIO_FLAG_IO_RETRY | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + return (ZIO_PIPELINE_STOP); + } - dprintf("retry #%d for %s to %s offset %llx\n", - zio->io_retries, zio_type_name[zio->io_type], - vdev_description(vd), zio->io_offset); + /* + * If we got an error on a leaf device, convert it to ENXIO + * if the device is not accessible at all. + */ + if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && + !vdev_accessible(vd, zio)) + zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; - } + /* + * If we can't write to an interior vdev (mirror or RAID-Z), + * set vdev_cant_write so that we stop trying to allocate from it. 
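The retry path in zio_vdev_io_assess() above can be summarized in a small sketch. All names here are invented stand-ins, not the kernel's: on a retryable failure the error is cleared, the request is flagged so it will not loop forever, the stage counter is rewound to just before the vdev I/O stage, and the request is handed back to a worker queue.

	enum { STAGE_VDEV_IO_START = 11 };	/* stand-in stage number */

	#define	RF_IO_RETRY	0x1
	#define	RF_DONT_RETRY	0x2

	struct req {
		int	error;
		int	flags;
		int	stage;
	};

	static void
	dispatch(struct req *r)
	{
		/* stand-in for zio_taskq_dispatch(): requeue the request */
		(void)r;
	}

	static int
	assess(struct req *r)
	{
		if (r->error != 0 &&
		    !(r->flags & (RF_DONT_RETRY | RF_IO_RETRY))) {
			r->error = 0;
			r->flags |= RF_IO_RETRY;
			r->stage = STAGE_VDEV_IO_START - 1;
			dispatch(r);
			return (1);	/* stop: the retry owns the request */
		}
		return (0);		/* continue the pipeline */
	}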
+ */ + if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && + vd != NULL && !vd->vdev_ops->vdev_op_leaf) + vd->vdev_cant_write = B_TRUE; - if (zio->io_error != 0 && zio->io_error != ECKSUM && - !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) { - /* - * Poor man's hotplug support. Even if we're done retrying this - * I/O, try to reopen the vdev to see if it's still attached. - * To avoid excessive thrashing, we only try it once a minute. - * This also has the effect of detecting when missing devices - * have come back, by polling the device once a minute. - * - * We need to do this asynchronously because we can't grab - * all the necessary locks way down here. - */ - if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) { - vd->vdev_last_try = gethrtime(); - tvd->vdev_reopen_wanted = 1; - spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN); - } - } + if (zio->io_error) + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } void @@ -1603,49 +1958,63 @@ zio_vdev_io_bypass(zio_t *zio) * Generate and verify checksums * ========================================================================== */ -static void +static int zio_checksum_generate(zio_t *zio) { - int checksum = zio->io_checksum; blkptr_t *bp = zio->io_bp; + enum zio_checksum checksum; - ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); + if (bp == NULL) { + /* + * This is zio_write_phys(). + * We're either generating a label checksum, or none at all. + */ + checksum = zio->io_prop.zp_checksum; - BP_SET_CHECKSUM(bp, checksum); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + if (checksum == ZIO_CHECKSUM_OFF) + return (ZIO_PIPELINE_CONTINUE); - zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); + ASSERT(checksum == ZIO_CHECKSUM_LABEL); + } else { + if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { + ASSERT(!IO_IS_ALLOCATING(zio)); + checksum = ZIO_CHECKSUM_GANG_HEADER; + } else { + checksum = BP_GET_CHECKSUM(bp); + } + } - zio_next_stage(zio); + zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); + + return (ZIO_PIPELINE_CONTINUE); } -static void -zio_gang_checksum_generate(zio_t *zio) +static int +zio_checksum_verify(zio_t *zio) { - zio_cksum_t zc; - zio_gbh_phys_t *gbh = zio->io_data; - - ASSERT(BP_IS_GANG(zio->io_bp)); - ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); - - zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); + blkptr_t *bp = zio->io_bp; + int error; - zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); + if (bp == NULL) { + /* + * This is zio_read_phys(). + * We're either verifying a label checksum, or nothing at all. + */ + if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) + return (ZIO_PIPELINE_CONTINUE); - zio_next_stage(zio); -} + ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); + } -static void -zio_checksum_verify(zio_t *zio) -{ - if (zio->io_bp != NULL) { - zio->io_error = zio_checksum_error(zio); - if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) + if ((error = zio_checksum_error(zio)) != 0) { + zio->io_error = error; + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, zio->io_spa, zio->io_vd, zio, 0, 0); + } } - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } /* @@ -1658,204 +2027,263 @@ zio_checksum_verified(zio_t *zio) } /* - * Set the external verifier for a gang block based on stuff in the bp + * ========================================================================== + * Error rank. 
Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
+ * An error of 0 indicates success. ENXIO indicates whole-device failure,
+ * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
+ * indicate errors that are specific to one I/O, and most likely permanent.
+ * Any other error is presumed to be worse because we weren't expecting it.
+ * ==========================================================================
  */
-void
-zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
+int
+zio_worst_error(int e1, int e2)
 {
-	blkptr_t *bp = zio->io_bp;
+	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
+	int r1, r2;
 
-	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
-	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
-	zcp->zc_word[2] = bp->blk_birth;
-	zcp->zc_word[3] = 0;
+	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
+		if (e1 == zio_error_rank[r1])
+			break;
+
+	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
+		if (e2 == zio_error_rank[r2])
+			break;
+
+	return (r1 > r2 ? e1 : e2);
 }
 
 /*
  * ==========================================================================
- * Define the pipeline
+ * I/O completion
  * ==========================================================================
  */
-typedef void zio_pipe_stage_t(zio_t *zio);
-
-static void
-zio_badop(zio_t *zio)
+static int
+zio_ready(zio_t *zio)
 {
-	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
-}
-
-zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
-	zio_badop,
-	zio_wait_children_ready,
-	zio_write_compress,
-	zio_checksum_generate,
-	zio_gang_pipeline,
-	zio_get_gang_header,
-	zio_rewrite_gang_members,
-	zio_free_gang_members,
-	zio_claim_gang_members,
-	zio_dva_allocate,
-	zio_dva_free,
-	zio_dva_claim,
-	zio_gang_checksum_generate,
-	zio_ready,
-	zio_vdev_io_start,
-	zio_vdev_io_done,
-	zio_vdev_io_assess,
-	zio_wait_children_done,
-	zio_checksum_verify,
-	zio_read_gang_members,
-	zio_read_decompress,
-	zio_done,
-	zio_badop
-};
+	blkptr_t *bp = zio->io_bp;
+	zio_t *pio = zio->io_parent;
 
-/*
- * Move an I/O to the next stage of the pipeline and execute that stage.
- * There's no locking on io_stage because there's no legitimate way for
- * multiple threads to be attempting to process the same I/O.
- */
-void
-zio_next_stage(zio_t *zio)
-{
-	uint32_t pipeline = zio->io_pipeline;
+	if (zio->io_ready) {
+		if (BP_IS_GANG(bp) &&
+		    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
+			return (ZIO_PIPELINE_STOP);
 
-	ASSERT(!MUTEX_HELD(&zio->io_lock));
+		ASSERT(IO_IS_ALLOCATING(zio));
+		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
+		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
 
-	if (zio->io_error) {
-		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
-		    zio, vdev_description(zio->io_vd),
-		    zio->io_offset, zio->io_stage, zio->io_error);
-		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
-			pipeline &= ZIO_ERROR_PIPELINE_MASK;
+		zio->io_ready(zio);
 	}
 
-	while (((1U << ++zio->io_stage) & pipeline) == 0)
-		continue;
+	if (bp != NULL && bp != &zio->io_bp_copy)
+		zio->io_bp_copy = *bp;
 
-	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
-	ASSERT(zio->io_stalled == 0);
+	if (zio->io_error)
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
-	/*
-	 * See the comment in zio_next_stage_async() about per-CPU taskqs.
-	 */
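As a quick usage example of the ranking just defined, here is a userland fold of several child errors into the single worst one. ECKSUM is Solaris-specific, so a hypothetical value is defined for the demo only; the function body mirrors zio_worst_error() above.

	#include <errno.h>
	#include <stdio.h>

	#ifndef ECKSUM
	#define	ECKSUM	200	/* hypothetical value, demo only */
	#endif

	static int
	worst_error(int e1, int e2)
	{
		static const int rank[] = { 0, ENXIO, ECKSUM, EIO };
		int r1, r2;

		for (r1 = 0; r1 < 4; r1++)
			if (e1 == rank[r1])
				break;
		for (r2 = 0; r2 < 4; r2++)
			if (e2 == rank[r2])
				break;

		return (r1 > r2 ? e1 : e2);	/* unknown errors rank worst */
	}

	int
	main(void)
	{
		int errs[] = { EIO, 0, ENXIO, ECKSUM };
		int worst = 0;

		for (int i = 0; i < 4; i++)
			worst = worst_error(worst, errs[i]);

		printf("worst = %d (EIO = %d)\n", worst, EIO); /* EIO wins */
		return (0);
	}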
-	if (((1U << zio->io_stage) & zio->io_async_stages) &&
-	    (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
-	    !(zio->io_flags & ZIO_FLAG_METADATA)) {
-		taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
-		(void) taskq_dispatch(tq,
-		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
-	} else {
-		zio_pipeline[zio->io_stage](zio);
-	}
+	if (pio != NULL)
+		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-void
-zio_next_stage_async(zio_t *zio)
+static int
+zio_done(zio_t *zio)
 {
-	taskq_t *tq;
-	uint32_t pipeline = zio->io_pipeline;
-
-	ASSERT(!MUTEX_HELD(&zio->io_lock));
+	spa_t *spa = zio->io_spa;
+	zio_t *pio = zio->io_parent;
+	zio_t *lio = zio->io_logical;
+	blkptr_t *bp = zio->io_bp;
+	vdev_t *vd = zio->io_vd;
+	uint64_t psize = zio->io_size;
 
-	if (zio->io_error) {
-		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
-		    zio, vdev_description(zio->io_vd),
-		    zio->io_offset, zio->io_stage, zio->io_error);
-		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
-			pipeline &= ZIO_ERROR_PIPELINE_MASK;
-	}
+	/*
+	 * If our children haven't all completed,
+	 * wait for them and then repeat this pipeline stage.
+	 */
+	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
+	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
+	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
+		return (ZIO_PIPELINE_STOP);
 
-	while (((1U << ++zio->io_stage) & pipeline) == 0)
-		continue;
+	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
+		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+			ASSERT(zio->io_children[c][w] == 0);
 
-	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
-	ASSERT(zio->io_stalled == 0);
+	if (bp != NULL) {
+		ASSERT(bp->blk_pad[0] == 0);
+		ASSERT(bp->blk_pad[1] == 0);
+		ASSERT(bp->blk_pad[2] == 0);
+		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
+		    (pio != NULL && bp == pio->io_bp));
+		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
+			ASSERT(!BP_SHOULD_BYTESWAP(bp));
+			ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp));
+			ASSERT(BP_COUNT_GANG(bp) == 0 ||
+			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
+		}
+	}
 
 	/*
-	 * For performance, we'll probably want two sets of task queues:
-	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
-	 * part is for read performance: since we have to make a pass over
-	 * the data to checksum it anyway, we want to do this on the same CPU
-	 * that issued the read, because (assuming CPU scheduling affinity)
-	 * that thread is probably still there.  Getting this optimization
-	 * right avoids performance-hostile cache-to-cache transfers.
-	 *
-	 * Note that having two sets of task queues is also necessary for
-	 * correctness: if all of the issue threads get bogged down waiting
-	 * for dependent reads (e.g. metaslab freelist) to complete, then
-	 * there won't be any threads available to service I/O completion
-	 * interrupts.
+	 * If there were child vdev or gang errors, they apply to us now.
*/ - if ((1U << zio->io_stage) & zio->io_async_stages) { - if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) - tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; - else - tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; - (void) taskq_dispatch(tq, - (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); - } else { - zio_pipeline[zio->io_stage](zio); - } -} + zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); + zio_inherit_child_errors(zio, ZIO_CHILD_GANG); -static boolean_t -zio_alloc_should_fail(void) -{ - static uint16_t allocs = 0; + zio_pop_transforms(zio); /* note: may set zio->io_error */ - return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0); -} + vdev_stat_update(zio, psize); -/* - * Try to allocate an intent log block. Return 0 on success, errno on failure. - */ -int -zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, - uint64_t txg) -{ - int error; + if (zio->io_error) { + /* + * If this I/O is attached to a particular vdev, + * generate an error message describing the I/O failure + * at the block level. We ignore these errors if the + * device is currently unavailable. + */ + if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) + zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); - spa_config_enter(spa, RW_READER, FTAG); + if ((zio->io_error == EIO || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) { + /* + * For logical I/O requests, tell the SPA to log the + * error and generate a logical data ereport. + */ + spa_log_error(spa, zio); + zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, + 0, 0); + } + } - if (zio_zil_fail_shift && zio_alloc_should_fail()) { - spa_config_exit(spa, FTAG); - return (ENOSPC); + if (zio->io_error && zio == lio) { + /* + * Determine whether zio should be reexecuted. This will + * propagate all the way to the root via zio_notify_parent(). + */ + ASSERT(vd == NULL && bp != NULL); + + if (IO_IS_ALLOCATING(zio)) + if (zio->io_error != ENOSPC) + zio->io_reexecute |= ZIO_REEXECUTE_NOW; + else + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + + if ((zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_FREE) && + zio->io_error == ENXIO && + spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + + if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } /* - * We were passed the previous log blocks dva_t in bp->blk_dva[0]. + * If there were logical child errors, they apply to us now. + * We defer this until now to avoid conflating logical child + * errors with errors that happened to the zio itself when + * updating vdev stats and reporting FMA events above. */ - error = metaslab_alloc(spa, size, new_bp, 1, txg, old_bp, B_TRUE); + zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); - if (error == 0) { - BP_SET_LSIZE(new_bp, size); - BP_SET_PSIZE(new_bp, size); - BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); - BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); - BP_SET_LEVEL(new_bp, 0); - BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); - new_bp->blk_birth = txg; + if (zio->io_reexecute) { + /* + * This is a logical I/O that wants to reexecute. + * + * Reexecute is top-down. When an i/o fails, if it's not + * the root, it simply notifies its parent and sticks around. + * The parent, seeing that it still has children in zio_done(), + * does the same. This percolates all the way up to the root. + * The root i/o will reexecute or suspend the entire tree. 
+ * + * This approach ensures that zio_reexecute() honors + * all the original i/o dependency relationships, e.g. + * parents not executing until children are ready. + */ + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (IO_IS_ALLOCATING(zio)) + zio_dva_unallocate(zio, zio->io_gang_tree, bp); + + zio_gang_tree_free(&zio->io_gang_tree); + + if (pio != NULL) { + /* + * We're not a root i/o, so there's nothing to do + * but notify our parent. Don't propagate errors + * upward since we haven't permanently failed yet. + */ + zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; + zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { + /* + * We'd fail again if we reexecuted now, so suspend + * until conditions improve (e.g. device comes online). + */ + zio_suspend(spa, zio); + } else { + /* + * Reexecution is potentially a huge amount of work. + * Hand it off to the otherwise-unused claim taskq. + */ + (void) taskq_dispatch( + spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], + (task_func_t *)zio_reexecute, zio, TQ_SLEEP); + } + return (ZIO_PIPELINE_STOP); } - spa_config_exit(spa, FTAG); + ASSERT(zio->io_child == NULL); + ASSERT(zio->io_reexecute == 0); + ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); - return (error); -} + if (zio->io_done) + zio->io_done(zio); -/* - * Free an intent log block. We know it can't be a gang block, so there's - * nothing to do except metaslab_free() it. - */ -void -zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) -{ - ASSERT(!BP_IS_GANG(bp)); + zio_gang_tree_free(&zio->io_gang_tree); - spa_config_enter(spa, RW_READER, FTAG); + ASSERT(zio->io_delegate_list == NULL); + ASSERT(zio->io_delegate_next == NULL); - metaslab_free(spa, bp, txg, B_FALSE); + if (pio != NULL) { + zio_remove_child(pio, zio); + zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + } + + if (zio->io_waiter != NULL) { + mutex_enter(&zio->io_lock); + zio->io_executor = NULL; + cv_broadcast(&zio->io_cv); + mutex_exit(&zio->io_lock); + } else { + zio_destroy(zio); + } - spa_config_exit(spa, FTAG); + return (ZIO_PIPELINE_STOP); } + +/* + * ========================================================================== + * I/O pipeline definition + * ========================================================================== + */ +static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = { + NULL, + zio_issue_async, + zio_read_bp_init, + zio_write_bp_init, + zio_checksum_generate, + zio_gang_assemble, + zio_gang_issue, + zio_dva_allocate, + zio_dva_free, + zio_dva_claim, + zio_ready, + zio_vdev_io_start, + zio_vdev_io_done, + zio_vdev_io_assess, + zio_checksum_verify, + zio_done +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c index f0d9a1463580..bf7fe733fe0c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/zio.h> @@ -96,25 +94,59 @@ zio_checksum_select(uint8_t child, uint8_t parent) } /* + * Set the external verifier for a gang block based on <vdev, offset, txg>, + * a tuple which is guaranteed to be unique for the life of the pool. 
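The verifier construction described here is simple enough to show directly; the old zio_set_gang_verifier() removed earlier in this diff packed exactly these words, and the new code does the same via ZIO_SET_CHECKSUM(). A standalone rendering, with a local cksum type standing in for zio_cksum_t:

	#include <stdint.h>

	typedef struct cksum {
		uint64_t	word[4];
	} cksum_t;

	static void
	set_gang_verifier(cksum_t *zcp, uint64_t vdev, uint64_t offset,
	    uint64_t txg)
	{
		zcp->word[0] = vdev;	/* DVA_GET_VDEV(BP_IDENTITY(bp)) */
		zcp->word[1] = offset;	/* DVA_GET_OFFSET(BP_IDENTITY(bp)) */
		zcp->word[2] = txg;	/* bp->blk_birth */
		zcp->word[3] = 0;
	}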
+ */ +static void +zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp) +{ + dva_t *dva = BP_IDENTITY(bp); + uint64_t txg = bp->blk_birth; + + ASSERT(BP_IS_GANG(bp)); + + ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); +} + +/* + * Set the external verifier for a label block based on its offset. + * The vdev is implicit, and the txg is unknowable at pool open time -- + * hence the logic in vdev_uberblock_load() to find the most recent copy. + */ +static void +zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) +{ + ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); +} + +/* * Generate the checksum. */ void -zio_checksum(uint_t checksum, zio_cksum_t *zcp, void *data, uint64_t size) +zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, + void *data, uint64_t size) { + blkptr_t *bp = zio->io_bp; + uint64_t offset = zio->io_offset; zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t zbt_cksum; - ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); + ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); ASSERT(ci->ci_func[0] != NULL); if (ci->ci_zbt) { - *zcp = zbt->zbt_cksum; + if (checksum == ZIO_CHECKSUM_GANG_HEADER) + zio_checksum_gang_verifier(&zbt->zbt_cksum, bp); + else if (checksum == ZIO_CHECKSUM_LABEL) + zio_checksum_label_verifier(&zbt->zbt_cksum, offset); + else + bp->blk_cksum = zbt->zbt_cksum; zbt->zbt_magic = ZBT_MAGIC; ci->ci_func[0](data, size, &zbt_cksum); zbt->zbt_cksum = zbt_cksum; } else { - ci->ci_func[0](data, size, zcp); + ci->ci_func[0](data, size, &bp->blk_cksum); } } @@ -122,47 +154,49 @@ int zio_checksum_error(zio_t *zio) { blkptr_t *bp = zio->io_bp; - zio_cksum_t zc = bp->blk_cksum; - uint_t checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : - BP_GET_CHECKSUM(bp); - int byteswap = BP_SHOULD_BYTESWAP(bp); + uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + int byteswap; void *data = zio->io_data; - uint64_t size = ZIO_GET_IOSIZE(zio); + uint64_t size = (bp == NULL ? zio->io_size : + (BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
+	uint64_t offset = zio->io_offset;
 	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
-	zio_cksum_t actual_cksum, expected_cksum;
+	zio_cksum_t actual_cksum, expected_cksum, verifier;
 
 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
 		return (EINVAL);
 
 	if (ci->ci_zbt) {
 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
-			zio_set_gang_verifier(zio, &zc);
+			zio_checksum_gang_verifier(&verifier, bp);
+		else if (checksum == ZIO_CHECKSUM_LABEL)
+			zio_checksum_label_verifier(&verifier, offset);
+		else
+			verifier = bp->blk_cksum;
+
+		byteswap = (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC));
 
-		if (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)) {
-			expected_cksum = zbt->zbt_cksum;
+		if (byteswap)
+			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
+
+		expected_cksum = zbt->zbt_cksum;
+		zbt->zbt_cksum = verifier;
+		ci->ci_func[byteswap](data, size, &actual_cksum);
+		zbt->zbt_cksum = expected_cksum;
+
+		if (byteswap)
 			byteswap_uint64_array(&expected_cksum,
 			    sizeof (zio_cksum_t));
-			zbt->zbt_cksum = zc;
-			byteswap_uint64_array(&zbt->zbt_cksum,
-			    sizeof (zio_cksum_t));
-			ci->ci_func[1](data, size, &actual_cksum);
-			zbt->zbt_cksum = expected_cksum;
-			byteswap_uint64_array(&zbt->zbt_cksum,
-			    sizeof (zio_cksum_t));
-		} else {
-			expected_cksum = zbt->zbt_cksum;
-			zbt->zbt_cksum = zc;
-			ci->ci_func[0](data, size, &actual_cksum);
-			zbt->zbt_cksum = expected_cksum;
-		}
-		zc = expected_cksum;
 	} else {
 		ASSERT(!BP_IS_GANG(bp));
+		byteswap = BP_SHOULD_BYTESWAP(bp);
+		expected_cksum = bp->blk_cksum;
 		ci->ci_func[byteswap](data, size, &actual_cksum);
 	}
 
-	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc))
+	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
 		return (ECKSUM);
 
 	if (zio_injection_enabled && !zio->io_error)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
index 4cada09d835c..b3469fdd5c24 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * ZFS fault injection
  *
@@ -47,6 +45,7 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
 
 uint32_t zio_injection_enabled;
 
@@ -145,6 +144,56 @@ zio_handle_fault_injection(zio_t *zio, int error)
 	return (ret);
 }
 
+/*
+ * Determine if the zio is part of a label update and has an injection
+ * handler associated with that portion of the label. Currently, we
+ * allow error injection in either the nvlist or the uberblock region
+ * of the vdev label.
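A hedged sketch of the label-offset mapping used by the injection code below: ZFS keeps four 256K labels, two at the front of the device and two at the end, and a relative offset within label l maps to an absolute device offset as follows (this mirrors the shape of vdev_label_offset(), including aligning psize down to a label multiple; names here are local stand-ins):

	#include <stdint.h>

	#define	LABEL_SIZE	(256ULL << 10)	/* 256K per label */
	#define	NLABELS		4
	#define	P2ALIGN(x, a)	((x) & -(a))

	static uint64_t
	label_offset(uint64_t psize, int l, uint64_t offset)
	{
		uint64_t osize = P2ALIGN(psize, LABEL_SIZE);

		return (offset + l * LABEL_SIZE + (l < NLABELS / 2 ? 0 :
		    osize - NLABELS * LABEL_SIZE));
	}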
+ */ +int +zio_handle_label_injection(zio_t *zio, int error) +{ + inject_handler_t *handler; + vdev_t *vd = zio->io_vd; + uint64_t offset = zio->io_offset; + int label; + int ret = 0; + + if (offset + zio->io_size > VDEV_LABEL_START_SIZE && + offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + uint64_t start = handler->zi_record.zi_start; + uint64_t end = handler->zi_record.zi_end; + + /* Ignore device only faults */ + if (handler->zi_record.zi_start == 0) + continue; + + /* + * The injection region is the relative offsets within a + * vdev label. We must determine the label which is being + * updated and adjust our region accordingly. + */ + label = vdev_label_number(vd->vdev_psize, offset); + start = vdev_label_offset(vd->vdev_psize, label, start); + end = vdev_label_offset(vd->vdev_psize, label, end); + + if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid && + (offset >= start && offset <= end)) { + ret = error; + break; + } + } + rw_exit(&inject_lock); + return (ret); +} + + int zio_handle_device_injection(vdev_t *vd, int error) { @@ -156,6 +205,10 @@ zio_handle_device_injection(vdev_t *vd, int error) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { + /* Ignore label specific faults */ + if (handler->zi_record.zi_start != 0) + continue; + if (vd->vdev_guid == handler->zi_record.zi_guid) { if (handler->zi_record.zi_error == error) { /* @@ -230,7 +283,7 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) * fault injection isn't a performance critical path. */ if (flags & ZINJECT_FLUSH_ARC) - arc_flush(); + arc_flush(NULL); return (0); } @@ -304,6 +357,7 @@ zio_clear_fault(int id) void zio_inject_init(void) { + rw_init(&inject_lock, NULL, RW_DEFAULT, NULL); list_create(&inject_handlers, sizeof (inject_handler_t), offsetof(inject_handler_t, zi_link)); } @@ -312,4 +366,5 @@ void zio_inject_fini(void) { list_destroy(&inject_handlers); + rw_destroy(&inject_lock); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c index fedae03e5107..db0ebf29b7ca 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -23,12 +23,10 @@ * All rights reserved. */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * ZFS volume emulation driver. * @@ -57,6 +55,9 @@ #include <sys/zap.h> #include <sys/spa.h> #include <sys/zio.h> +#include <sys/dmu_traverse.h> +#include <sys/dnode.h> +#include <sys/dsl_dataset.h> #include <sys/dsl_prop.h> #include <sys/dkio.h> #include <sys/byteorder.h> @@ -69,10 +70,14 @@ #include <sys/refcount.h> #include <sys/zfs_znode.h> #include <sys/zfs_rlock.h> +#include <sys/vdev_impl.h> +#include <sys/zvol.h> #include <geom/geom.h> #include "zfs_namecheck.h" +#define ZVOL_DUMPSIZE "dumpsize" + struct g_class zfs_zvol_class = { .name = "ZFS::ZVOL", .version = G_VERSION, @@ -80,11 +85,31 @@ struct g_class zfs_zvol_class = { DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); -#define ZVOL_OBJ 1ULL -#define ZVOL_ZAP_OBJ 2ULL - +/* + * This lock protects the zvol_state structure from being modified + * while it's being used, e.g. 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
index fedae03e5107..db0ebf29b7ca 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -23,12 +23,10 @@
  * All rights reserved.
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * ZFS volume emulation driver.
  *
@@ -57,6 +55,9 @@
 #include <sys/zap.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dnode.h>
+#include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dkio.h>
 #include <sys/byteorder.h>
@@ -69,10 +70,14 @@
 #include <sys/refcount.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_rlock.h>
+#include <sys/vdev_impl.h>
+#include <sys/zvol.h>
 #include <geom/geom.h>
 
 #include "zfs_namecheck.h"
 
+#define	ZVOL_DUMPSIZE	"dumpsize"
+
 struct g_class zfs_zvol_class = {
 	.name = "ZFS::ZVOL",
 	.version = G_VERSION,
@@ -80,11 +85,31 @@ struct g_class zfs_zvol_class = {
 
 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
 
-#define	ZVOL_OBJ	1ULL
-#define	ZVOL_ZAP_OBJ	2ULL
-
+/*
+ * This lock protects the zvol_state structure from being modified
+ * while it's being used, e.g. an open that comes in before a create
+ * finishes. It also protects temporary opens of the dataset so that,
+ * e.g., an open doesn't get a spurious EBUSY.
+ */
+static kmutex_t zvol_state_lock;
 static uint32_t zvol_minors;
 
+#define	NUM_EXTENTS	((SPA_MAXBLOCKSIZE) / sizeof (zvol_extent_t))
+
+typedef struct zvol_extent {
+	dva_t		ze_dva;		/* dva associated with this extent */
+	uint64_t	ze_stride;	/* extent stride */
+	uint64_t	ze_size;	/* number of blocks in extent */
+} zvol_extent_t;
+
+/*
+ * The list of extents associated with the dump device
+ */
+typedef struct zvol_ext_list {
+	zvol_extent_t	zl_extents[NUM_EXTENTS];
+	struct zvol_ext_list *zl_next;
+} zvol_ext_list_t;
+
 /*
  * The in-core state of each volume.
  */
@@ -94,11 +119,12 @@ typedef struct zvol_state {
 	uint64_t	zv_volblocksize; /* volume block size */
 	struct g_provider *zv_provider;	/* GEOM provider */
 	uint8_t		zv_min_bs;	/* minimum addressable block shift */
-	uint8_t		zv_readonly;	/* hard readonly; like write-protect */
+	uint8_t		zv_flags;	/* readonly; dumpified */
 	objset_t	*zv_objset;	/* objset handle */
 	uint32_t	zv_mode;	/* DS_MODE_* flags at open time */
 	uint32_t	zv_total_opens;	/* total open count */
 	zilog_t		*zv_zilog;	/* ZIL handle */
+	zvol_ext_list_t	*zv_list;	/* List of extents for dump */
 	uint64_t	zv_txg_assign;	/* txg to assign during ZIL replay */
 	znode_t		zv_znode;	/* for range locking */
 	int		zv_state;
@@ -107,11 +133,28 @@ typedef struct zvol_state {
 } zvol_state_t;
 
 /*
+ * zvol specific flags
+ */
+#define	ZVOL_RDONLY	0x1
+#define	ZVOL_DUMPIFIED	0x2
+#define	ZVOL_EXCL	0x4
+
+/*
  * zvol maximum transfer in one DMU tx.
  */
 int zvol_maxphys = DMU_MAX_ACCESS/2;
 
+extern int zfs_set_prop_nvlist(const char *, nvlist_t *);
 static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
+static int zvol_dumpify(zvol_state_t *zv);
+static int zvol_dump_fini(zvol_state_t *zv);
+static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
+
+static void
+zvol_size_changed(zvol_state_t *zv, major_t maj)
+{
+
+}
 
 int
 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
@@ -145,7 +188,10 @@ zvol_readonly_changed_cb(void *arg, uint64_t newval)
 {
 	zvol_state_t *zv = arg;
 
-	zv->zv_readonly = (uint8_t)newval;
+	if (newval)
+		zv->zv_flags |= ZVOL_RDONLY;
+	else
+		zv->zv_flags &= ~ZVOL_RDONLY;
 }
 
 int
@@ -179,6 +225,7 @@ zvol_minor_lookup(const char *name)
 	struct g_geom *gp;
 
 	g_topology_assert();
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
 
 	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
 		LIST_FOREACH(pp, &gp->provider, provider) {
@@ -196,21 +243,29 @@ zvol_access(struct g_provider *pp, int acr, int acw, int ace)
 	zvol_state_t *zv;
 
 	g_topology_assert();
+	mutex_enter(&zvol_state_lock);
 
 	zv = pp->private;
 	if (zv == NULL) {
 		if (acr <= 0 && acw <= 0 && ace <= 0)
 			return (0);
+		mutex_exit(&zvol_state_lock);
 		return (pp->error);
 	}
 
 	ASSERT(zv->zv_objset != NULL);
 
-	if (acw > 0 && (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)))
+	if (acw > 0 &&
+	    ((zv->zv_flags & ZVOL_RDONLY) ||
+	    (zv->zv_mode & DS_MODE_READONLY))) {
+		mutex_exit(&zvol_state_lock);
 		return (EROFS);
+	}
 
 	zv->zv_total_opens += acr + acw + ace;
 
+	mutex_exit(&zvol_state_lock);
+
 	return (0);
 }
 
@@ -324,8 +379,12 @@ zvol_serve_one(zvol_state_t *zv, struct bio *bp)
 				dmu_tx_commit(tx);
 			}
 		}
-		if (error)
+		if (error) {
+			/* convert checksum errors into IO errors */
+			if (error == ECKSUM)
+				error = EIO;
 			break;
+		}
 		off += size;
 		addr += size;
 		resid -= size;
@@ -368,7 +427,7 @@ zvol_worker(void *arg)
 			break;
 		}
 
-		if (bp->bio_cmd != BIO_READ && !zil_disable)
+		if (bp->bio_cmd == BIO_FLUSH && !zil_disable)
 			zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
 
 		g_io_deliver(bp, bp->bio_error);
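
zvol_serve_one() above moves each request in bounded chunks, wraps every write chunk in its own transaction, and flattens ECKSUM into EIO before completing the BIO. A minimal model of that loop, with a hypothetical chunk_write() standing in for the dmu_tx_create()/dmu_tx_hold_write()/dmu_write()/dmu_tx_commit() sequence, and a locally defined ECKSUM (the real value is private to ZFS):

#include <stddef.h>
#include <stdint.h>
#include <errno.h>

#define	ECKSUM	122	/* stand-in; ZFS uses a private checksum errno */

/* Hypothetical one-chunk transactional write (create/hold/assign/commit). */
int chunk_write(uint64_t off, size_t len, const char *src);

/*
 * Move a request in bounded chunks, one transaction per chunk, and
 * surface checksum failures as plain I/O errors, as the loop above does.
 */
static int
serve_write(uint64_t off, size_t resid, const char *addr, size_t maxphys)
{
	while (resid != 0) {
		size_t size = resid < maxphys ? resid : maxphys;
		int error = chunk_write(off, size, addr);

		if (error) {
			if (error == ECKSUM)
				error = EIO;
			return (error);
		}
		off += size;
		addr += size;
		resid -= size;
	}
	return (0);
}
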
@@ -376,25 +435,152 @@
 }
 
 void
-zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
+zvol_init_extent(zvol_extent_t *ze, blkptr_t *bp)
 {
-	zfs_create_data_t *zc = arg;
+	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
+	ze->ze_stride = 0;
+	ze->ze_size = 1;
+}
+
+/* extent mapping arg */
+struct maparg {
+	zvol_ext_list_t	*ma_list;
+	zvol_extent_t	*ma_extent;
+	int		ma_gang;
+};
+
+/*ARGSUSED*/
+static int
+zvol_map_block(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+	zbookmark_t *zb = &bc->bc_bookmark;
+	blkptr_t *bp = &bc->bc_blkptr;
+	void *data = bc->bc_data;
+	dnode_phys_t *dnp = bc->bc_dnode;
+	struct maparg *ma = (struct maparg *)arg;
+	uint64_t stride;
+
+	/* If there is an error, then keep trying to make progress */
+	if (bc->bc_errno)
+		return (ERESTART);
+
+#ifdef ZFS_DEBUG
+	if (zb->zb_level == -1) {
+		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
+		ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
+	} else {
+		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
+		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+	}
+
+	if (zb->zb_level > 0) {
+		uint64_t fill = 0;
+		blkptr_t *bpx, *bpend;
+
+		for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx);
+		    bpx < bpend; bpx++) {
+			if (bpx->blk_birth != 0) {
+				fill += bpx->blk_fill;
+			} else {
+				ASSERT(bpx->blk_fill == 0);
+			}
+		}
+		ASSERT3U(fill, ==, bp->blk_fill);
+	}
+
+	if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) {
+		uint64_t fill = 0;
+		dnode_phys_t *dnx, *dnend;
+
+		for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp) >> DNODE_SHIFT);
+		    dnx < dnend; dnx++) {
+			if (dnx->dn_type != DMU_OT_NONE)
+				fill++;
+		}
+		ASSERT3U(fill, ==, bp->blk_fill);
+	}
+#endif
+
+	if (zb->zb_level || dnp->dn_type == DMU_OT_DNODE)
+		return (0);
+
+	/* Abort immediately if we have encountered gang blocks */
+	if (BP_IS_GANG(bp)) {
+		ma->ma_gang++;
+		return (EINTR);
+	}
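
zvol_map_block() above (continued below) coalesces successive block pointers into (dva, stride, size) extents: the first block seeds the extent, the second block fixes the stride, and later blocks extend the extent only while they land exactly one stride apart on the same vdev. A self-contained sketch of that accumulation rule; extent_t and extent_add() are illustrative names, not the driver's types:

#include <stdint.h>

/* Simplified extent: first-block offset, stride between blocks, count. */
typedef struct extent {
	uint64_t e_vdev;
	uint64_t e_start;
	uint64_t e_stride;
	uint64_t e_count;
} extent_t;

/*
 * Fold the next allocated block (vdev, offset) into the current extent
 * if it continues the run; return 0 on success, or -1 when the caller
 * must start a new extent (as the list-growing code below does).
 */
static int
extent_add(extent_t *e, uint64_t vdev, uint64_t offset)
{
	uint64_t next;

	if (e->e_count == 0) {			/* first block */
		e->e_vdev = vdev;
		e->e_start = offset;
		e->e_stride = 0;
		e->e_count = 1;
		return (0);
	}
	if (vdev == e->e_vdev && e->e_stride == 0) {	/* second block */
		e->e_stride = offset - e->e_start;
		e->e_count++;
		return (0);
	}
	next = e->e_start + e->e_count * e->e_stride;
	if (vdev == e->e_vdev && offset == next) {	/* same stride */
		e->e_count++;
		return (0);
	}
	return (-1);
}
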
+	/* first time? */
+	if (ma->ma_extent->ze_size == 0) {
+		zvol_init_extent(ma->ma_extent, bp);
+		return (0);
+	}
+
+	stride = (DVA_GET_OFFSET(&bp->blk_dva[0])) -
+	    ((DVA_GET_OFFSET(&ma->ma_extent->ze_dva)) +
+	    (ma->ma_extent->ze_size - 1) * (ma->ma_extent->ze_stride));
+	if (DVA_GET_VDEV(BP_IDENTITY(bp)) ==
+	    DVA_GET_VDEV(&ma->ma_extent->ze_dva)) {
+		if (ma->ma_extent->ze_stride == 0) {
+			/* second block in this extent */
+			ma->ma_extent->ze_stride = stride;
+			ma->ma_extent->ze_size++;
+			return (0);
+		} else if (ma->ma_extent->ze_stride == stride) {
+			/*
+			 * the block we allocated has the same
+			 * stride
+			 */
+			ma->ma_extent->ze_size++;
+			return (0);
+		}
+	}
+
+	/*
+	 * dtrace -n 'zfs-dprintf
+	 * /stringof(arg0) == "zvol.c"/
+	 * {
+	 *	printf("%s: %s", stringof(arg1), stringof(arg3))
+	 * } '
+	 */
+	dprintf("ma_extent 0x%lx mrstride 0x%lx stride %lx\n",
+	    ma->ma_extent->ze_size, ma->ma_extent->ze_stride, stride);
+	dprintf_bp(bp, "%s", "next blkptr:");
+	/* start a new extent */
+	if (ma->ma_extent == &ma->ma_list->zl_extents[NUM_EXTENTS - 1]) {
+		ma->ma_list->zl_next = kmem_zalloc(sizeof (zvol_ext_list_t),
+		    KM_SLEEP);
+		ma->ma_list = ma->ma_list->zl_next;
+		ma->ma_extent = &ma->ma_list->zl_extents[0];
+	} else {
+		ma->ma_extent++;
+	}
+	zvol_init_extent(ma->ma_extent, bp);
+	return (0);
+}
+
+/* ARGSUSED */
+void
+zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+	zfs_creat_t *zct = arg;
+	nvlist_t *nvprops = zct->zct_props;
 	int error;
 	uint64_t volblocksize, volsize;
 
-	VERIFY(nvlist_lookup_uint64(zc->zc_props,
+	VERIFY(nvlist_lookup_uint64(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
-	if (nvlist_lookup_uint64(zc->zc_props,
+	if (nvlist_lookup_uint64(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
 		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
 
 	/*
-	 * These properites must be removed from the list so the generic
+	 * These properties must be removed from the list so the generic
 	 * property setting step won't apply to them.
 	 */
-	VERIFY(nvlist_remove_all(zc->zc_props,
+	VERIFY(nvlist_remove_all(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
-	(void) nvlist_remove_all(zc->zc_props,
+	(void) nvlist_remove_all(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
 
 	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
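
zvol_create_cb() above pulls volsize and volblocksize out of the caller's property list before the generic property-setting pass runs, because those two are consumed right here. The same extract-and-remove step, modeled on a flat array instead of an nvlist; prop_t and prop_extract() are illustrative, not libnvpair:

#include <stdint.h>
#include <string.h>

typedef struct prop {
	const char *p_name;
	uint64_t p_val;
	int p_valid;
} prop_t;

/*
 * Pull one property out of a list so a later generic "apply everything"
 * pass won't see it; returns nonzero if the property was present.
 */
static int
prop_extract(prop_t *props, int nprops, const char *name, uint64_t *valp)
{
	for (int i = 0; i < nprops; i++) {
		if (props[i].p_valid && strcmp(props[i].p_name, name) == 0) {
			*valp = props[i].p_val;
			props[i].p_valid = 0;
			return (1);
		}
	}
	return (0);
}
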
@@ -467,10 +653,110 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
 };
 
 /*
- * Create a minor node for the specified volume.
+ * reconstruct dva that gets us to the desired offset (offset
+ * is in bytes)
 */
 int
-zvol_create_minor(const char *name, dev_t dev)
+zvol_get_dva(zvol_state_t *zv, uint64_t offset, dva_t *dva)
+{
+	zvol_ext_list_t	*zl;
+	zvol_extent_t	*ze;
+	int		idx;
+	uint64_t	tmp;
+
+	if ((zl = zv->zv_list) == NULL)
+		return (EIO);
+	idx = 0;
+	ze = &zl->zl_extents[0];
+	while (offset >= ze->ze_size * zv->zv_volblocksize) {
+		offset -= ze->ze_size * zv->zv_volblocksize;
+
+		if (idx == NUM_EXTENTS - 1) {
+			/* we've reached the end of this array */
+			ASSERT(zl->zl_next != NULL);
+			if (zl->zl_next == NULL)
+				return (-1);
+			zl = zl->zl_next;
+			ze = &zl->zl_extents[0];
+			idx = 0;
+		} else {
+			ze++;
+			idx++;
+		}
+	}
+	DVA_SET_VDEV(dva, DVA_GET_VDEV(&ze->ze_dva));
+	tmp = DVA_GET_OFFSET((&ze->ze_dva));
+	tmp += (ze->ze_stride * (offset / zv->zv_volblocksize));
+	DVA_SET_OFFSET(dva, tmp);
+	return (0);
+}
+
+static void
+zvol_free_extents(zvol_state_t *zv)
+{
+	zvol_ext_list_t *zl;
+	zvol_ext_list_t *tmp;
+
+	if (zv->zv_list != NULL) {
+		zl = zv->zv_list;
+		while (zl != NULL) {
+			tmp = zl->zl_next;
+			kmem_free(zl, sizeof (zvol_ext_list_t));
+			zl = tmp;
+		}
+		zv->zv_list = NULL;
+	}
+}
+
+int
+zvol_get_lbas(zvol_state_t *zv)
+{
+	struct maparg	ma;
+	zvol_ext_list_t	*zl;
+	zvol_extent_t	*ze;
+	uint64_t	blocks = 0;
+	int		err;
+
+	ma.ma_list = zl = kmem_zalloc(sizeof (zvol_ext_list_t), KM_SLEEP);
+	ma.ma_extent = &ma.ma_list->zl_extents[0];
+	ma.ma_gang = 0;
+	zv->zv_list = ma.ma_list;
+
+	err = traverse_zvol(zv->zv_objset, ADVANCE_PRE, zvol_map_block, &ma);
+	if (err == EINTR && ma.ma_gang) {
+		/*
+		 * We currently don't support dump devices when the pool
+		 * is so fragmented that our allocation has resulted in
+		 * gang blocks.
+		 */
+		zvol_free_extents(zv);
+		return (EFRAGS);
+	}
+	ASSERT3U(err, ==, 0);
+
+	ze = &zl->zl_extents[0];
+	while (ze) {
+		blocks += ze->ze_size;
+		if (ze == &zl->zl_extents[NUM_EXTENTS - 1]) {
+			zl = zl->zl_next;
+			ze = &zl->zl_extents[0];
+		} else {
+			ze++;
+		}
+	}
+	if (blocks != (zv->zv_volsize / zv->zv_volblocksize)) {
+		zvol_free_extents(zv);
+		return (EIO);
+	}
+
+	return (0);
+}
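
zvol_get_dva() above resolves a logical byte offset by walking the chained fixed-size extent arrays and then applying the matching extent's stride. The sketch below reproduces the walk with simplified types; ext_t, resolve() and NUM_EXT are illustrative, and only the arithmetic mirrors the code above:

#include <stddef.h>
#include <stdint.h>

#define	NUM_EXT	16	/* extents per chunk; the driver derives its own */

typedef struct ext {
	uint64_t e_start;	/* device offset of the first block */
	uint64_t e_stride;	/* distance between consecutive blocks */
	uint64_t e_nblks;	/* blocks covered by this extent */
} ext_t;

typedef struct ext_list {
	ext_t el_ext[NUM_EXT];
	struct ext_list *el_next;
} ext_list_t;

/*
 * Resolve a logical byte offset to a device offset by walking the
 * chained extent arrays, then adding stride * (block index) within
 * the extent that covers it.
 */
static int
resolve(ext_list_t *el, uint64_t off, uint64_t blksz, uint64_t *devoff)
{
	int idx = 0;

	while (el != NULL && off >= el->el_ext[idx].e_nblks * blksz) {
		off -= el->el_ext[idx].e_nblks * blksz;
		if (++idx == NUM_EXT) {		/* spill into the next chunk */
			el = el->el_next;
			idx = 0;
		}
	}
	if (el == NULL)
		return (-1);
	*devoff = el->el_ext[idx].e_start +
	    (off / blksz) * el->el_ext[idx].e_stride;
	return (0);
}
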
+
+/*
+ * Create a minor node (plus a whole lot more) for the specified volume.
+ */
+int
+zvol_create_minor(const char *name, major_t maj)
 {
 	struct g_provider *pp;
 	struct g_geom *gp;
@@ -478,11 +764,12 @@ zvol_create_minor(const char *name, dev_t dev)
 	objset_t *os;
 	dmu_object_info_t doi;
 	uint64_t volsize;
-	int ds_mode = DS_MODE_PRIMARY;
+	int ds_mode = DS_MODE_OWNER;
 	int error;
 
 	DROP_GIANT();
 	g_topology_lock();
+	mutex_enter(&zvol_state_lock);
 
 	if ((zv = zvol_minor_lookup(name)) != NULL) {
 		error = EEXIST;
@@ -496,11 +783,7 @@ zvol_create_minor(const char *name, dev_t dev)
 	if (error)
 		goto end;
 
-	g_topology_unlock();
-	PICKUP_GIANT();
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
-	DROP_GIANT();
-	g_topology_lock();
 	if (error) {
 		dmu_objset_close(os);
 		goto end;
@@ -524,14 +807,12 @@ zvol_create_minor(const char *name, dev_t dev)
 	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
 	    sizeof (rl_t), offsetof(rl_t, r_node));
-
-	/* get and cache the blocksize */
 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
 	ASSERT(error == 0);
 	zv->zv_volblocksize = doi.doi_data_block_size;
 
-	zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector);
+	zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL);
 
 	/* XXX this should handle the possible i/o error */
 	VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
@@ -547,6 +828,7 @@ zvol_create_minor(const char *name, dev_t dev)
 	zvol_minors++;
 end:
+	mutex_exit(&zvol_state_lock);
 	g_topology_unlock();
 	PICKUP_GIANT();
 
@@ -565,6 +847,7 @@ zvol_remove_minor(const char *name)
 
 	DROP_GIANT();
 	g_topology_lock();
+	mutex_enter(&zvol_state_lock);
 
 	if ((zv = zvol_minor_lookup(name)) == NULL) {
 		error = ENXIO;
@@ -602,6 +885,7 @@ zvol_remove_minor(const char *name)
 	zvol_minors--;
 end:
+	mutex_exit(&zvol_state_lock);
 	g_topology_unlock();
 	PICKUP_GIANT();
 
@@ -609,55 +893,143 @@ end:
 }
 
 int
-zvol_set_volsize(const char *name, dev_t dev, uint64_t volsize)
+zvol_prealloc(zvol_state_t *zv)
+{
+	objset_t *os = zv->zv_objset;
+	dmu_tx_t *tx;
+	void *data;
+	uint64_t refd, avail, usedobjs, availobjs;
+	uint64_t resid = zv->zv_volsize;
+	uint64_t off = 0;
+
+	/* Check the space usage before attempting to allocate the space */
+	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
+	if (avail < zv->zv_volsize)
+		return (ENOSPC);
+
+	/* Free old extents if they exist */
+	zvol_free_extents(zv);
+
+	/* allocate the blocks by writing each one */
+	data = kmem_zalloc(SPA_MAXBLOCKSIZE, KM_SLEEP);
+
+	while (resid != 0) {
+		int error;
+		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
+
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			kmem_free(data, SPA_MAXBLOCKSIZE);
+			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
+			return (error);
+		}
+		dmu_write(os, ZVOL_OBJ, off, bytes, data, tx);
+		dmu_tx_commit(tx);
+		off += bytes;
+		resid -= bytes;
+	}
+	kmem_free(data, SPA_MAXBLOCKSIZE);
+	txg_wait_synced(dmu_objset_pool(os), 0);
+
+	return (0);
+}
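
zvol_prealloc() above forces every block of the volume to be allocated by writing zero-filled chunks across it, and rolls back by freeing the already-written range if any chunk fails. Its shape in miniature, with hypothetical backend_write()/backend_free() hooks in place of dmu_write() and dmu_free_long_range():

#include <stdint.h>
#include <stdlib.h>

#define	CHUNK	(128 * 1024)	/* arbitrary chunk size for the sketch */

/* Hypothetical backend hooks. */
int backend_write(uint64_t off, const void *buf, size_t len);
int backend_free(uint64_t off, uint64_t len);

/*
 * Force allocation by writing zeros across the whole volume, undoing
 * the partial work if any write fails.
 */
static int
prealloc(uint64_t volsize)
{
	void *zeros = calloc(1, CHUNK);
	uint64_t off = 0, resid = volsize;
	int error = 0;

	if (zeros == NULL)
		return (-1);
	while (resid != 0) {
		size_t bytes = resid < CHUNK ? resid : CHUNK;

		if ((error = backend_write(off, zeros, bytes)) != 0) {
			(void) backend_free(0, off);	/* roll back */
			break;
		}
		off += bytes;
		resid -= bytes;
	}
	free(zeros);
	return (error);
}
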
+
+int
+zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize)
 {
-	zvol_state_t *zv;
 	dmu_tx_t *tx;
 	int error;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+
+	tx = dmu_tx_create(zv->zv_objset);
+	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		return (error);
+	}
+
+	error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
+	    &volsize, tx);
+	dmu_tx_commit(tx);
+
+	if (error == 0)
+		error = dmu_free_long_range(zv->zv_objset,
+		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
+
+	/*
+	 * If we are using a faked-up state (zv_provider == NULL) then don't
+	 * try to update the in-core zvol state.
+	 */
+	if (error == 0 && zv->zv_provider) {
+		zv->zv_volsize = volsize;
+		zvol_size_changed(zv, maj);
+	}
+	return (error);
+}
+
+int
+zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
+{
+	zvol_state_t *zv;
+	int error;
 	dmu_object_info_t doi;
+	uint64_t old_volsize = 0ULL;
+	zvol_state_t state = { 0 };
 
 	DROP_GIANT();
 	g_topology_lock();
+	mutex_enter(&zvol_state_lock);
 
 	if ((zv = zvol_minor_lookup(name)) == NULL) {
-		error = ENXIO;
-		goto end;
+		/*
+		 * If we are doing a "zfs clone -o volsize=", then the
+		 * minor node won't exist yet.
+		 */
+		error = dmu_objset_open(name, DMU_OST_ZVOL, DS_MODE_OWNER,
+		    &state.zv_objset);
+		if (error != 0)
+			goto out;
+		zv = &state;
 	}
+	old_volsize = zv->zv_volsize;
 
 	if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
 	    (error = zvol_check_volsize(volsize,
-	    doi.doi_data_block_size)) != 0) {
-		goto end;
-	}
+	    doi.doi_data_block_size)) != 0)
+		goto out;
 
-	if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
+	if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) {
 		error = EROFS;
-		goto end;
+		goto out;
 	}
 
-	tx = dmu_tx_create(zv->zv_objset);
-	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
-	dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END);
-	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
-		dmu_tx_abort(tx);
-		goto end;
-	}
+	error = zvol_update_volsize(zv, maj, volsize);
 
-	error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
-	    &volsize, tx);
-	if (error == 0) {
-		error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, volsize,
-		    DMU_OBJECT_END, tx);
+#if 0
+	/*
+	 * Reinitialize the dump area to the new size.  If we
+	 * failed to resize the dump area then restore it back to
+	 * its original size.
+	 */
+	if (error == 0 && zv->zv_flags & ZVOL_DUMPIFIED) {
+		if ((error = zvol_dumpify(zv)) != 0 ||
+		    (error = dumpvp_resize()) != 0) {
+			(void) zvol_update_volsize(zv, maj, old_volsize);
+			error = zvol_dumpify(zv);
+		}
 	}
+#endif
 
-	dmu_tx_commit(tx);
+out:
+	if (state.zv_objset)
+		dmu_objset_close(state.zv_objset);
 
-	if (error == 0) {
-		zv->zv_volsize = volsize;
-		zv->zv_provider->mediasize = volsize;	/* XXX: Not supported. */
-	}
-end:
+	mutex_exit(&zvol_state_lock);
 	g_topology_unlock();
 	PICKUP_GIANT();
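
zvol_update_volsize() above orders a resize as: persist the new size first, then free everything past the new end (a shrink trims blocks, a grow frees an empty range and is a no-op on the data side). The same ordering in miniature; meta_set_size() and data_free_range() are hypothetical stand-ins for zap_update() and dmu_free_long_range():

#include <stdint.h>

/* Hypothetical metadata and data hooks. */
int meta_set_size(uint64_t newsize);
int data_free_range(uint64_t off, uint64_t len);

#define	OBJ_END	(~0ULL)		/* "to end of object", like DMU_OBJECT_END */

/*
 * Record the new size, then release any blocks past the new end.
 */
static int
volume_resize(uint64_t newsize)
{
	int error = meta_set_size(newsize);

	if (error == 0)
		error = data_free_range(newsize, OBJ_END);
	return (error);
}
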
*/ boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t); @@ -766,8 +1139,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) error = dmu_sync(zio, db, &lr->lr_blkptr, lr->lr_common.lrc_txg, zvol_get_done, zgd); if (error == 0) - zil_add_vdev(zv->zv_zilog, - DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr))); + zil_add_block(zv->zv_zilog, &lr->lr_blkptr); /* * If we get EINPROGRESS, then we need to wait for a * write IO initiated by dmu_sync() to complete before @@ -791,11 +1163,230 @@ zvol_busy(void) void zvol_init(void) { + mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); ZFS_LOG(1, "ZVOL Initialized."); } void zvol_fini(void) { + mutex_destroy(&zvol_state_lock); ZFS_LOG(1, "ZVOL Deinitialized."); } + +static boolean_t +zvol_is_swap(zvol_state_t *zv) +{ + vnode_t *vp; + boolean_t ret = B_FALSE; + char *devpath; + size_t devpathlen; + int error; + +#if 0 + devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(zv->zv_name) + 1; + devpath = kmem_alloc(devpathlen, KM_SLEEP); + (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name); + error = lookupname(devpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + kmem_free(devpath, devpathlen); + + ret = !error && IS_SWAPVP(common_specvp(vp)); + + if (vp != NULL) + VN_RELE(vp); +#endif + + return (ret); +} + +static int +zvol_dump_init(zvol_state_t *zv, boolean_t resize) +{ + dmu_tx_t *tx; + int error = 0; + objset_t *os = zv->zv_objset; + nvlist_t *nv = NULL; + uint64_t checksum, compress, refresrv; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + /* + * If we are resizing the dump device then we only need to + * update the refreservation to match the newly updated + * zvolsize. Otherwise, we save off the original state of the + * zvol so that we can restore them if the zvol is ever undumpified. + */ + if (resize) { + error = zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, + &zv->zv_volsize, tx); + } else { + error = dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL); + error = error ? error : dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL); + error = error ? error : dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL); + + error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, + &compress, tx); + error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx); + error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, + &refresrv, tx); + } + dmu_tx_commit(tx); + + /* Truncate the file */ + if (!error) + error = dmu_free_long_range(zv->zv_objset, + ZVOL_OBJ, 0, DMU_OBJECT_END); + + if (error) + return (error); + + /* + * We only need update the zvol's property if we are initializing + * the dump area for the first time. 
+ */ + if (!resize) { + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0); + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), + ZIO_COMPRESS_OFF) == 0); + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), + ZIO_CHECKSUM_OFF) == 0); + + error = zfs_set_prop_nvlist(zv->zv_name, nv); + nvlist_free(nv); + + if (error) + return (error); + } + + /* Allocate the space for the dump */ + error = zvol_prealloc(zv); + return (error); +} + +static int +zvol_dumpify(zvol_state_t *zv) +{ + int error = 0; + uint64_t dumpsize = 0; + dmu_tx_t *tx; + objset_t *os = zv->zv_objset; + + if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) + return (EROFS); + + /* + * We do not support swap devices acting as dump devices. + */ + if (zvol_is_swap(zv)) + return (ENOTSUP); + + if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, + 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) { + boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE; + + if ((error = zvol_dump_init(zv, resize)) != 0) { + (void) zvol_dump_fini(zv); + return (error); + } + } + + /* + * Build up our lba mapping. + */ + error = zvol_get_lbas(zv); + if (error) { + (void) zvol_dump_fini(zv); + return (error); + } + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + (void) zvol_dump_fini(zv); + return (error); + } + + zv->zv_flags |= ZVOL_DUMPIFIED; + error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1, + &zv->zv_volsize, tx); + dmu_tx_commit(tx); + + if (error) { + (void) zvol_dump_fini(zv); + return (error); + } + + txg_wait_synced(dmu_objset_pool(os), 0); + return (0); +} + +static int +zvol_dump_fini(zvol_state_t *zv) +{ + dmu_tx_t *tx; + objset_t *os = zv->zv_objset; + nvlist_t *nv; + int error = 0; + uint64_t checksum, compress, refresrv; + + /* + * Attempt to restore the zvol back to its pre-dumpified state. + * This is a best-effort attempt as it's possible that not all + * of these properties were initialized during the dumpify process + * (i.e. error during zvol_dump_init). + */ + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx); + dmu_tx_commit(tx); + + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum); + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress); + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv); + + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum); + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress); + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv); + (void) zfs_set_prop_nvlist(zv->zv_name, nv); + nvlist_free(nv); + + zvol_free_extents(zv); + zv->zv_flags &= ~ZVOL_DUMPIFIED; + (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END); + + return (0); +} |