Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs')
110 files changed, 27710 insertions, 9431 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c index dd2aa82304ab..d9eb88a40202 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c @@ -20,7 +20,7 @@ */ /* Portions Copyright 2007 Shivakumar GN */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +35,7 @@ #include <sys/mutex.h> #include <sys/sysmacros.h> #include <sys/systm.h> +#include <sys/sunddi.h> #include <sys/uio.h> #include <sys/vfs.h> #include <sys/vnode.h> @@ -60,7 +61,7 @@ * * These routines are designed to play a support role for existing * pseudo-filesystems (such as procfs). They simplify common tasks, - * without enforcing the filesystem to hand over management to GFS. The + * without forcing the filesystem to hand over management to GFS. The * routines covered are: * * gfs_readdir_init() @@ -116,6 +117,42 @@ */ /* + * gfs_get_parent_ino: used to obtain a parent inode number and the + * inode number of the given vnode in preparation for calling gfs_readdir_init. + */ +int +gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct, + ino64_t *pino, ino64_t *ino) +{ + vnode_t *parent; + gfs_dir_t *dp = dvp->v_data; + int error; + + *ino = dp->gfsd_file.gfs_ino; + parent = dp->gfsd_file.gfs_parent; + + if (parent == NULL) { + *pino = *ino; /* root of filesystem */ + } else if (dvp->v_flag & V_XATTRDIR) { +#ifdef TODO + vattr_t va; + + va.va_mask = AT_NODEID; + error = VOP_GETATTR(parent, &va, 0, cr, ct); + if (error) + return (error); + *pino = va.va_nodeid; +#else + panic("%s:%u: not implemented", __func__, __LINE__); +#endif + } else { + *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino; + } + + return (0); +} + +/* * gfs_readdir_init: initiate a generic readdir * st - a pointer to an uninitialized gfs_readdir_state_t structure * name_max - the directory's maximum file name length @@ -123,6 +160,7 @@ * uiop - the uiop passed to readdir * parent - the parent directory's inode * self - this directory's inode + * flags - flags from VOP_READDIR * * Returns 0 or a non-zero errno. 
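/*
 * [Editor's sketch, not part of this changeset.]  Intended call sequence
 * for the new gfs_get_parent_ino() together with the extended
 * gfs_readdir_init(); "myfs_readdir" and MYFS_NAME_MAX are hypothetical.
 */
static int
myfs_readdir(vnode_t *dvp, uio_t *uiop, cred_t *cr, int *eofp, int flags)
{
	gfs_readdir_state_t gstate;
	ino64_t pino, ino;
	int error;

	/* Resolve this directory's inode number and its parent's. */
	error = gfs_get_parent_ino(dvp, cr, NULL, &pino, &ino);
	if (error != 0)
		return (error);

	/*
	 * The VOP_READDIR flags are now passed through so the scratch
	 * entry can be sized as a dirent64_t or an edirent_t.
	 */
	error = gfs_readdir_init(&gstate, MYFS_NAME_MAX, 1, uiop,
	    pino, ino, flags);
	if (error != 0)
		return (error);

	/* ... gfs_readdir_pred()/gfs_readdir_emit() loop ... */
	return (gfs_readdir_fini(&gstate, error, eofp, 1));
}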
* @@ -153,8 +191,10 @@ */ int gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, - uio_t *uiop, ino64_t parent, ino64_t self) + uio_t *uiop, ino64_t parent, ino64_t self, int flags) { + size_t dirent_size; + if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 || (uiop->uio_loffset % ureclen) != 0) return (EINVAL); @@ -162,9 +202,14 @@ gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, st->grd_ureclen = ureclen; st->grd_oresid = uiop->uio_resid; st->grd_namlen = name_max; - st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP); + if (flags & V_RDDIR_ENTFLAGS) + dirent_size = EDIRENT_RECLEN(st->grd_namlen); + else + dirent_size = DIRENT64_RECLEN(st->grd_namlen); + st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP); st->grd_parent = parent; st->grd_self = self; + st->grd_flags = flags; return (0); } @@ -172,8 +217,8 @@ gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, /* * gfs_readdir_emit_int: internal routine to emit directory entry * - * st - the current readdir state, which must have d_ino and d_name - * set + * st - the current readdir state, which must have d_ino/ed_ino + * and d_name/ed_name set * uiop - caller-supplied uio pointer * next - the offset of the next entry */ @@ -182,9 +227,18 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, int *ncookies, u_long **cookies) { int reclen, namlen; + dirent64_t *dp; + edirent_t *edp; - namlen = strlen(st->grd_dirent->d_name); - reclen = DIRENT64_RECLEN(namlen); + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edp = st->grd_dirent; + namlen = strlen(edp->ed_name); + reclen = EDIRENT_RECLEN(namlen); + } else { + dp = st->grd_dirent; + namlen = strlen(dp->d_name); + reclen = DIRENT64_RECLEN(namlen); + } if (reclen > uiop->uio_resid) { /* @@ -195,10 +249,15 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, return (-1); } - /* XXX: This can change in the future. */ - st->grd_dirent->d_type = DT_DIR; - st->grd_dirent->d_reclen = (ushort_t)reclen; - st->grd_dirent->d_namlen = namlen; + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edp->ed_off = next; + edp->ed_reclen = (ushort_t)reclen; + } else { + /* XXX: This can change in the future. */ + dp->d_reclen = (ushort_t)reclen; + dp->d_type = DT_DIR; + dp->d_namlen = namlen; + } if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop)) return (EFAULT); @@ -219,6 +278,7 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, * voff - the virtual offset (obtained from gfs_readdir_pred) * ino - the entry's inode * name - the entry's name + * eflags - value for ed_eflags (if processing edirent_t) * * Returns a 0 on success, a non-zero errno on failure, or -1 if the * readdir loop should terminate. 
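/*
 * [Editor's sketch, not part of this changeset.]  The flag-dependent
 * record sizing above recurs in gfs_readdir_init(),
 * gfs_readdir_emit_int(), and gfs_readdir_fini(); the rule they share,
 * written as a hypothetical helper:
 */
static size_t
gfs_dirent_size(int flags, int namlen)
{
	/* Extended entries carry ed_eflags and use their own layout. */
	if (flags & V_RDDIR_ENTFLAGS)
		return (EDIRENT_RECLEN(namlen));
	return (DIRENT64_RECLEN(namlen));
}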
A non-zero result (either errno or @@ -227,12 +287,22 @@ gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, */ int gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, - ino64_t ino, const char *name, int *ncookies, u_long **cookies) + ino64_t ino, const char *name, int eflags, int *ncookies, u_long **cookies) { offset_t off = (voff + 2) * st->grd_ureclen; - st->grd_dirent->d_ino = ino; - (void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen); + if (st->grd_flags & V_RDDIR_ENTFLAGS) { + edirent_t *edp = st->grd_dirent; + + edp->ed_ino = ino; + (void) strncpy(edp->ed_name, name, st->grd_namlen); + edp->ed_eflags = eflags; + } else { + dirent64_t *dp = st->grd_dirent; + + dp->d_ino = ino; + (void) strncpy(dp->d_name, name, st->grd_namlen); + } /* * Inter-entry offsets are invalid, so we assume a record size of @@ -266,11 +336,11 @@ top: voff = off - 2; if (off == 0) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self, - ".", ncookies, cookies)) == 0) + ".", 0, ncookies, cookies)) == 0) goto top; } else if (off == 1) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent, - "..", ncookies, cookies)) == 0) + "..", 0, ncookies, cookies)) == 0) goto top; } else { *voffp = voff; @@ -292,7 +362,13 @@ top: int gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof) { - kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen)); + size_t dirent_size; + + if (st->grd_flags & V_RDDIR_ENTFLAGS) + dirent_size = EDIRENT_RECLEN(st->grd_namlen); + else + dirent_size = DIRENT64_RECLEN(st->grd_namlen); + kmem_free(st->grd_dirent, dirent_size); if (error > 0) return (error); if (eofp) @@ -485,7 +561,7 @@ gfs_file_inactive(vnode_t *vp) gfs_dir_t *dp = NULL; void *data; - if (fp->gfs_parent == NULL) + if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR)) goto found; dp = fp->gfs_parent->v_data; @@ -511,6 +587,8 @@ gfs_file_inactive(vnode_t *vp) ge = NULL; found: + if (vp->v_flag & V_XATTRDIR) + VI_LOCK(fp->gfs_parent); VI_LOCK(vp); ASSERT(vp->v_count < 2); /* @@ -535,7 +613,8 @@ found: * Free vnode and release parent */ if (fp->gfs_parent) { - gfs_dir_unlock(dp); + if (dp) + gfs_dir_unlock(dp); VI_LOCK(fp->gfs_parent); fp->gfs_parent->v_usecount--; VI_UNLOCK(fp->gfs_parent); @@ -543,6 +622,8 @@ found: ASSERT(vp->v_vfsp != NULL); VFS_RELE(vp->v_vfsp); } + if (vp->v_flag & V_XATTRDIR) + VI_UNLOCK(fp->gfs_parent); return (data); } @@ -570,55 +651,119 @@ gfs_dir_inactive(vnode_t *vp) } /* - * gfs_dir_lookup() + * gfs_dir_lookup_dynamic() * - * Looks up the given name in the directory and returns the corresponding vnode, - * if found. + * This routine looks up the provided name amongst the dynamic entries + * in the gfs directory and returns the corresponding vnode, if found. * - * First, we search statically defined entries, if any. If a match is found, - * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the - * existing vnode. Otherwise, we call the static entry's callback routine, - * caching the result if necessary. + * The gfs directory is expected to be locked by the caller prior to + * calling this function. The directory will be unlocked during the + * execution of this function, but will be locked upon return from the + * function. This function returns 0 on success, non-zero on error. * - * If no static entry is found, we invoke the lookup callback, if any. 
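/*
 * [Editor's sketch, not part of this changeset.]  The canonical consumer
 * loop for the pred/emit/fini helpers above; compare gfs_dir_readdir()
 * later in this patch.  myfs_nentries(), myfs_entry_ino() and
 * myfs_entry_name() are hypothetical.
 */
static int
myfs_emit_all(gfs_readdir_state_t *st, uio_t *uiop, int *eofp,
    int *ncookies, u_long **cookies)
{
	offset_t voff;
	int error, eof = 0;

	while ((error = gfs_readdir_pred(st, uiop, &voff,
	    ncookies, cookies)) == 0 && !eof) {
		if (voff >= myfs_nentries()) {
			eof = 1;
			break;
		}
		/* eflags is 0 here: plain entries, no case conflicts. */
		error = gfs_readdir_emit(st, uiop, voff,
		    myfs_entry_ino(voff), myfs_entry_name(voff), 0,
		    ncookies, cookies);
		if (error != 0)
			break;
	}
	/* fini() maps -1 ("out of room") to success and frees the state. */
	return (gfs_readdir_fini(st, error, eofp, eof));
}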
The - * arguments to this callback are: + * The dynamic lookups are performed by invoking the lookup + * callback, which is passed to this function as the first argument. + * The arguments to the callback are: * - * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp); + * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr, + * int flags, int *deflgs, pathname_t *rpnp); * * pvp - parent vnode * nm - name of entry * vpp - pointer to resulting vnode + * cr - pointer to cred + * flags - flags value from lookup request + * ignored here; currently only used to request + * insensitive lookups + * direntflgs - output parameter, directory entry flags + * ignored here; currently only used to indicate a lookup + * has more than one possible match when case is not considered + * realpnp - output parameter, real pathname + * ignored here; when lookup was performed case-insensitively, + * this field contains the "real" name of the file. * * Returns 0 on success, non-zero on error. */ -int -gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) +static int +gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp, + const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags, + int *direntflags, pathname_t *realpnp) { - int i; - gfs_dirent_t *ge; - vnode_t *vp; - gfs_dir_t *dp = dvp->v_data; - int ret = 0; - - ASSERT(dvp->v_type == VDIR); + gfs_file_t *fp; + ino64_t ino; + int ret; - if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) - return (0); + ASSERT(GFS_DIR_LOCKED(dp)); + /* + * Drop the directory lock, as the lookup routine + * will need to allocate memory, or otherwise deadlock on this + * directory. + */ + gfs_dir_unlock(dp); + ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp); gfs_dir_lock(dp); /* + * The callback for extended attributes returns a vnode + * with v_data from an underlying fs. + */ + if (ret == 0 && !IS_XATTRDIR(dvp)) { + fp = (gfs_file_t *)((*vpp)->v_data); + fp->gfs_index = -1; + fp->gfs_ino = ino; + } + + return (ret); +} + +/* + * gfs_dir_lookup_static() + * + * This routine looks up the provided name amongst the static entries + * in the gfs directory and returns the corresponding vnode, if found. + * The first argument to the function is a pointer to the comparison + * function this function should use to decide if names are a match. + * + * If a match is found, and GFS_CACHE_VNODE is set and the vnode + * exists, we simply return the existing vnode. Otherwise, we call + * the static entry's callback routine, caching the result if + * necessary. If the idx pointer argument is non-NULL, we use it to + * return the index of the matching static entry. + * + * The gfs directory is expected to be locked by the caller prior to calling + * this function. The directory may be unlocked during the execution of + * this function, but will be locked upon return from the function. + * + * This function returns 0 if a match is found, ENOENT if not. + */ +static int +gfs_dir_lookup_static(int (*compare)(const char *, const char *), + gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx, + vnode_t **vpp, pathname_t *rpnp) +{ + gfs_dirent_t *ge; + vnode_t *vp = NULL; + int i; + + ASSERT(GFS_DIR_LOCKED(dp)); + + /* * Search static entries. 
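/*
 * [Editor's sketch, not part of this changeset.]  A hypothetical
 * dynamic-lookup callback matching the invocation above; note the call
 * site also passes an ino64_t * for the new entry's inode number, which
 * the prototype in the block comment elides.
 */
static int
myfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, ino64_t *inop,
    cred_t *cr, int flags, int *direntflags, pathname_t *realpnp)
{
	if (strcmp(nm, "status") != 0)
		return (ENOENT);

	/* KM_SLEEP allocation is safe: the directory lock was dropped. */
	*vpp = myfs_make_status_node(pvp);	/* hypothetical ctor */
	*inop = MYFS_STATUS_INO;		/* hypothetical inode no. */
	return (0);
}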
*/ for (i = 0; i < dp->gfsd_nstatic; i++) { ge = &dp->gfsd_static[i]; - if (strcmp(ge->gfse_name, nm) == 0) { + if (compare(ge->gfse_name, nm) == 0) { + if (rpnp) + (void) strlcpy(rpnp->pn_buf, ge->gfse_name, + rpnp->pn_bufsize); + if (ge->gfse_vnode) { ASSERT(ge->gfse_flags & GFS_CACHE_VNODE); vp = ge->gfse_vnode; VN_HOLD(vp); - goto out; + break; } /* @@ -626,8 +771,8 @@ gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) * need to do KM_SLEEP allocations. If we return from * the constructor only to find that a parallel * operation has completed, and GFS_CACHE_VNODE is set - * for this entry, we discard the result in favor of the - * cached vnode. + * for this entry, we discard the result in favor of + * the cached vnode. */ gfs_dir_unlock(dp); vp = ge->gfse_ctor(dvp); @@ -660,49 +805,94 @@ gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp) gfs_dir_lock(dp); } } - - goto out; + break; } } - /* - * See if there is a dynamic constructor. - */ - if (dp->gfsd_lookup) { - ino64_t ino; - gfs_file_t *fp; + if (vp == NULL) + return (ENOENT); + else if (idx) + *idx = i; + *vpp = vp; + return (0); +} - /* - * Once again, drop the directory lock, as the lookup routine - * will need to allocate memory, or otherwise deadlock on this - * directory. - */ - gfs_dir_unlock(dp); - ret = dp->gfsd_lookup(dvp, nm, &vp, &ino); - gfs_dir_lock(dp); - if (ret != 0) - goto out; +/* + * gfs_dir_lookup() + * + * Looks up the given name in the directory and returns the corresponding + * vnode, if found. + * + * First, we search statically defined entries, if any, with a call to + * gfs_dir_lookup_static(). If no static entry is found, and we have + * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic(). + * + * This function returns 0 on success, non-zero on error. + */ +int +gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr, + int flags, int *direntflags, pathname_t *realpnp) +{ + gfs_dir_t *dp = dvp->v_data; + boolean_t casecheck; + vnode_t *dynvp = NULL; + vnode_t *vp = NULL; + int (*compare)(const char *, const char *); + int error, idx; - fp = (gfs_file_t *)vp->v_data; - fp->gfs_index = -1; - fp->gfs_ino = ino; - } else { - /* - * No static entry found, and there is no lookup callback, so - * return ENOENT. - */ - ret = ENOENT; + ASSERT(dvp->v_type == VDIR); + + if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) + return (0); + + casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL; + if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) || + (flags & FIGNORECASE)) + compare = strcasecmp; + else + compare = strcmp; + + gfs_dir_lock(dp); + + error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp); + + if (vp && casecheck) { + gfs_dirent_t *ge; + int i; + + for (i = idx + 1; i < dp->gfsd_nstatic; i++) { + ge = &dp->gfsd_static[i]; + + if (strcasecmp(ge->gfse_name, nm) == 0) { + *direntflags |= ED_CASE_CONFLICT; + goto out; + } + } + } + + if ((error || casecheck) && dp->gfsd_lookup) + error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp, + &dynvp, cr, flags, direntflags, vp ? 
NULL : realpnp); + + if (vp && dynvp) { + /* static and dynamic entries are case-insensitive conflict */ + ASSERT(casecheck); + *direntflags |= ED_CASE_CONFLICT; + VN_RELE(dynvp); + } else if (vp == NULL) { + vp = dynvp; + } else if (error == ENOENT) { + error = 0; + } else if (error) { + VN_RELE(vp); + vp = NULL; } out: gfs_dir_unlock(dp); - if (ret == 0) - *vpp = vp; - else - *vpp = NULL; - - return (ret); + *vpp = vp; + return (error); } /* @@ -731,13 +921,15 @@ out: * This is significantly more complex, thanks to the particulars of * VOP_READDIR(). * - * int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, - * offset_t *off, offset_t *nextoff, void *data) + * int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp, + * offset_t *off, offset_t *nextoff, void *data, int flags) * * vp - directory vnode * dp - directory entry, sized according to maxlen given to * gfs_dir_create(). callback must fill in d_name and - * d_ino. + * d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags + * (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS + * is set in 'flags'. * eofp - callback must set to 1 when EOF has been reached * off - on entry, the last offset read from the directory. Callback * must set to the offset of the current entry, typically left @@ -745,12 +937,13 @@ out: * nextoff - callback must set to offset of next entry. Typically * (off + 1) * data - caller-supplied data + * flags - VOP_READDIR flags * * Return 0 on success, or error on failure. */ int gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, - u_long **cookies, void *data) + u_long **cookies, void *data, cred_t *cr, int flags) { gfs_readdir_state_t gstate; int error, eof = 0; @@ -758,16 +951,12 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, offset_t off, next; gfs_dir_t *dp = dvp->v_data; - ino = dp->gfsd_file.gfs_ino; - - if (dp->gfsd_file.gfs_parent == NULL) - pino = ino; /* root of filesystem */ - else - pino = ((gfs_file_t *) - (dp->gfsd_file.gfs_parent->v_data))->gfs_ino; + error = gfs_get_parent_ino(dvp, cr, NULL, &pino, &ino); + if (error) + return (error); if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop, - pino, ino)) != 0) + pino, ino, flags)) != 0) return (error); while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies, @@ -777,8 +966,8 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, ino = dp->gfsd_inode(dvp, off); if ((error = gfs_readdir_emit(&gstate, uiop, - off, ino, dp->gfsd_static[off].gfse_name, ncookies, - cookies)) != 0) + off, ino, dp->gfsd_static[off].gfse_name, 0, + ncookies, cookies)) != 0) break; } else if (dp->gfsd_readdir) { @@ -786,7 +975,7 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, if ((error = dp->gfsd_readdir(dvp, gstate.grd_dirent, &eof, &off, &next, - data)) != 0 || eof) + data, flags)) != 0 || eof) break; off += dp->gfsd_nstatic + 2; @@ -808,6 +997,21 @@ gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, } /* + * gfs_vop_lookup: VOP_LOOKUP() entry point + * + * For use directly in vnode ops table. Given a GFS directory, calls + * gfs_dir_lookup() as necessary. + */ +/* ARGSUSED */ +int +gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp)); +} + +/* * gfs_vop_readdir: VOP_READDIR() entry point * * For use directly in vnode ops table. 
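/*
 * [Editor's sketch, not part of this changeset.]  A hypothetical
 * gfsd_readdir callback honoring the contract documented above: fill a
 * dirent64_t normally, or an edirent_t when V_RDDIR_ENTFLAGS is set.
 * The myfs_* helpers and MYFS_NAME_MAX are assumptions.
 */
static int
myfs_readdir_cb(vnode_t *vp, void *dp, int *eofp, offset_t *offp,
    offset_t *nextp, void *data, int flags)
{
	const char *nm;
	ino64_t ino;

	if (*offp >= myfs_nentries()) {
		*eofp = 1;
		return (0);
	}
	nm = myfs_entry_name(*offp);
	ino = myfs_entry_ino(*offp);

	if (flags & V_RDDIR_ENTFLAGS) {
		edirent_t *edp = dp;

		edp->ed_ino = ino;
		(void) strlcpy(edp->ed_name, nm, MYFS_NAME_MAX);
		edp->ed_eflags = 0;
	} else {
		dirent64_t *dep = dp;

		dep->d_ino = ino;
		(void) strlcpy(dep->d_name, nm, MYFS_NAME_MAX);
	}
	*nextp = *offp + 1;	/* "typically (off + 1)", per the contract */
	return (0);
}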
Given a GFS directory, calls @@ -827,6 +1031,7 @@ gfs_vop_readdir(ap) { vnode_t *vp = ap->a_vp; uio_t *uiop = ap->a_uio; + cred_t *cr = ap->a_cred; int *eofp = ap->a_eofflag; int ncookies = 0; u_long *cookies = NULL; @@ -842,7 +1047,8 @@ gfs_vop_readdir(ap) *ap->a_ncookies = ncookies; } - error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL); + error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL, + cr, 0); if (error == 0) { /* Subtract unused cookies */ @@ -882,6 +1088,9 @@ gfs_vop_inactive(ap) if (data != NULL) kmem_free(data, fp->gfs_size); + + VI_LOCK(vp); vp->v_data = NULL; + VI_UNLOCK(vp); return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c new file mode 100644 index 000000000000..00a10aae8ec9 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/vnode.h> + +/* Extensible attribute (xva) routines. */ + +/* + * Zero out the structure, set the size of the requested/returned bitmaps, + * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer + * to the returned attributes array. + */ +void +xva_init(xvattr_t *xvap) +{ + bzero(xvap, sizeof (xvattr_t)); + xvap->xva_mapsize = XVA_MAPSIZE; + xvap->xva_magic = XVA_MAGIC; + xvap->xva_vattr.va_mask = AT_XVATTR; + xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; +} + +/* + * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t + * structure. Otherwise, returns NULL. + */ +xoptattr_t * +xva_getxoptattr(xvattr_t *xvap) +{ + xoptattr_t *xoap = NULL; + if (xvap->xva_vattr.va_mask & AT_XVATTR) + xoap = &xvap->xva_xoptattrs; + return (xoap); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 420f802f360d..7ca528033c4f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * DVA-based Adjustable Replacement Cache * @@ -47,13 +45,13 @@ * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we - * implement a "cache throttle" that slowes the flow of new data - * into the cache until we can make space avaiable. + * implement a "cache throttle" that slows the flow of new data + * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with - * high use, but also tries to react to memory preasure from the + * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * @@ -75,7 +73,7 @@ * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, - * or 2) via one of the ARC lists. The arc_read() inerface + * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal arc algorithms for * adjusting the cache use method 2. We therefor provide two * types of locks: 1) the hash table lock array, and 2) the @@ -109,6 +107,14 @@ * * Note that the majority of the performance stats are manipulated * with atomic operations. + * + * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: + * + * - L2ARC buflist creation + * - L2ARC buflist eviction + * - L2ARC write completion, which walks L2ARC buflists + * - ARC header destruction, as it removes from L2ARC buflists + * - ARC header release, as it removes from L2ARC buflists */ #include <sys/spa.h> @@ -117,6 +123,7 @@ #include <sys/zfs_context.h> #include <sys/arc.h> #include <sys/refcount.h> +#include <sys/vdev.h> #ifdef _KERNEL #include <sys/dnlc.h> #endif @@ -128,6 +135,10 @@ static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ static uint8_t arc_thread_exit; +extern int zfs_write_limit_shift; +extern uint64_t zfs_write_limit_max; +extern kmutex_t zfs_write_limit_lock; + #define ARC_REDUCE_DNLC_PERCENT 3 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; @@ -148,28 +159,45 @@ static int arc_min_prefetch_lifespan; static int arc_dead; /* + * The arc has filled available memory and has now warmed up. + */ +static boolean_t arc_warm; + +/* * These tunables are for performance analysis. 
*/ -u_long zfs_arc_max; -u_long zfs_arc_min; -TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max); -TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min); +uint64_t zfs_arc_max; +uint64_t zfs_arc_min; +uint64_t zfs_arc_meta_limit = 0; +int zfs_mdcomp_disable = 0; + +TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); +TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); +TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); +TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); SYSCTL_DECL(_vfs_zfs); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, "Maximum ARC size"); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, "Minimum ARC size"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, + &zfs_mdcomp_disable, 0, "Disable metadata compression"); /* - * Note that buffers can be on one of 5 states: + * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recentely used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache - * When there are no active references to the buffer, they - * are linked onto one of the lists in arc. These are the - * only buffers that can be evicted or deleted. + * ARC_l2c_only - exists in L2ARC but not other states + * When there are no active references to the buffer, they are + * are linked onto a list in one of these arc states. These are + * the only buffers that can be evicted or deleted. Within each + * state there are multiple lists, one for meta-data and one for + * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, + * etc.) is tracked separately so that it can be managed more + * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies @@ -177,21 +205,30 @@ SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will aquire a DVA * as they are written and migrate onto the arc_mru list. + * + * The ARC_l2c_only state is for buffers that are in the second + * level ARC but no longer in any of the ARC_m* lists. The second + * level ARC itself may also contain buffers that are in any of + * the ARC_m* states - meaning that a buffer can exist in two + * places. The reason for the ARC_l2c_only state is to keep the + * buffer header in the hash table, so that reads that hit the + * second level ARC benefit from these fast lookups. 
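/*
 * [Editor's sketch, not part of this changeset.]  With the split into
 * per-type lists (see arc_state_t just below), evictable space becomes a
 * per-type quantity; e.g. the total evictable metadata across the two
 * resident states:
 */
static uint64_t
arc_evictable_meta(void)
{
	/* arcs_lsize[] counts only unreferenced (evictable) bytes. */
	return (arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
	    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
}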
*/ typedef struct arc_state { - list_t arcs_list; /* linked list of evictable buffer in state */ - uint64_t arcs_lsize; /* total size of buffers in the linked list */ - uint64_t arcs_size; /* total size of all buffers in this state */ + list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ + uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ + uint64_t arcs_size; /* total amount of data in this state */ kmutex_t arcs_mtx; } arc_state_t; -/* The 5 states: */ +/* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; static arc_state_t ARC_mru_ghost; static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; +static arc_state_t ARC_l2c_only; typedef struct arc_stats { kstat_named_t arcstat_hits; @@ -222,6 +259,24 @@ typedef struct arc_stats { kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; + kstat_named_t arcstat_hdr_size; + kstat_named_t arcstat_l2_hits; + kstat_named_t arcstat_l2_misses; + kstat_named_t arcstat_l2_feeds; + kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_writes_sent; + kstat_named_t arcstat_l2_writes_done; + kstat_named_t arcstat_l2_writes_error; + kstat_named_t arcstat_l2_writes_hdr_miss; + kstat_named_t arcstat_l2_evict_lock_retry; + kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_abort_lowmem; + kstat_named_t arcstat_l2_cksum_bad; + kstat_named_t arcstat_l2_io_error; + kstat_named_t arcstat_l2_size; + kstat_named_t arcstat_l2_hdr_size; + kstat_named_t arcstat_memory_throttle_count; } arc_stats_t; static arc_stats_t arc_stats = { @@ -252,7 +307,25 @@ static arc_stats_t arc_stats = { { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, - { "size", KSTAT_DATA_UINT64 } + { "size", KSTAT_DATA_UINT64 }, + { "hdr_size", KSTAT_DATA_UINT64 }, + { "l2_hits", KSTAT_DATA_UINT64 }, + { "l2_misses", KSTAT_DATA_UINT64 }, + { "l2_feeds", KSTAT_DATA_UINT64 }, + { "l2_rw_clash", KSTAT_DATA_UINT64 }, + { "l2_writes_sent", KSTAT_DATA_UINT64 }, + { "l2_writes_done", KSTAT_DATA_UINT64 }, + { "l2_writes_error", KSTAT_DATA_UINT64 }, + { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, + { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, + { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_free_on_write", KSTAT_DATA_UINT64 }, + { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, + { "l2_cksum_bad", KSTAT_DATA_UINT64 }, + { "l2_io_error", KSTAT_DATA_UINT64 }, + { "l2_size", KSTAT_DATA_UINT64 }, + { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "memory_throttle_count", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -299,6 +372,7 @@ static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; static arc_state_t *arc_mfu_ghost; +static arc_state_t *arc_l2c_only; /* * There are several ARC variables that are critical to export as kstats -- @@ -316,13 +390,21 @@ static arc_state_t *arc_mfu_ghost; static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; +static uint64_t arc_meta_used; +static uint64_t arc_meta_limit; +static uint64_t arc_meta_max = 0; +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN, + &arc_meta_used, 0, "ARC metadata used"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN, + &arc_meta_limit, 0, "ARC metadata limit"); + +typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; arc_done_func_t *acb_done; - 
arc_byteswap_func_t *acb_byteswap; arc_buf_t *acb_buf; zio_t *acb_zio_dummy; arc_callback_t *acb_next; @@ -368,6 +450,9 @@ struct arc_buf_hdr { /* self protecting */ refcount_t b_refcnt; + + l2arc_buf_hdr_t *b_l2hdr; + list_node_t b_l2node; }; static arc_buf_t *arc_eviction_list; @@ -375,9 +460,12 @@ static kmutex_t arc_eviction_mtx; static arc_buf_hdr_t arc_eviction_hdr; static void arc_get_data_buf(arc_buf_t *buf); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); +static int arc_evict_needed(arc_buf_contents_t type); +static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); #define GHOST_STATE(state) \ - ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) + ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ + (state) == arc_l2c_only) /* * Private ARC flags. These flags are private ARC only flags that will show up @@ -393,12 +481,31 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ +#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ +#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ +#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ +#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ +#define ARC_STORED (1 << 19) /* has been store()d to */ #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) +#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) +#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) +#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ + (hdr)->b_l2hdr != NULL) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) + +/* + * Other sizes + */ + +#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) /* * Hash table routines @@ -431,8 +538,90 @@ static buf_hash_table_t buf_hash_table; uint64_t zfs_crc64_table[256]; +/* + * Level 2 ARC + */ + +#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 4 /* num of writes */ +#define L2ARC_FEED_SECS 1 /* caching interval */ + +#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) +#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) + +/* + * L2ARC Performance Tunables + */ +uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ +uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ + +/* + * L2ARC Internals + */ +typedef struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; /* next write location */ + uint64_t l2ad_write; /* desired write size, bytes */ + uint64_t l2ad_boost; /* warmup write boost, bytes */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr 
on device */ + uint64_t l2ad_evict; /* last addr eviction reached */ + boolean_t l2ad_first; /* first sweep through */ + list_t *l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list node */ +} l2arc_dev_t; + +static list_t L2ARC_dev_list; /* device list */ +static list_t *l2arc_dev_list; /* device list pointer */ +static kmutex_t l2arc_dev_mtx; /* device list mutex */ +static l2arc_dev_t *l2arc_dev_last; /* last device used */ +static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ +static list_t L2ARC_free_on_write; /* free after write buf list */ +static list_t *l2arc_free_on_write; /* free after write list ptr */ +static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ +static uint64_t l2arc_ndev; /* number of devices */ + +typedef struct l2arc_read_callback { + arc_buf_t *l2rcb_buf; /* read buffer */ + spa_t *l2rcb_spa; /* spa */ + blkptr_t l2rcb_bp; /* original blkptr */ + zbookmark_t l2rcb_zb; /* original bookmark */ + int l2rcb_flags; /* original flags */ +} l2arc_read_callback_t; + +typedef struct l2arc_write_callback { + l2arc_dev_t *l2wcb_dev; /* device info */ + arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ +} l2arc_write_callback_t; + +struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + daddr_t b_daddr; /* disk address, offset byte */ +}; + +typedef struct l2arc_data_free { + /* protected by l2arc_free_on_write_mtx */ + void *l2df_data; + size_t l2df_size; + void (*l2df_func)(void *, size_t); + list_node_t l2df_list_node; +} l2arc_data_free_t; + +static kmutex_t l2arc_feed_thr_lock; +static kcondvar_t l2arc_feed_thr_cv; +static uint8_t l2arc_thread_exit; + +static void l2arc_read_done(zio_t *zio); +static void l2arc_hdr_stat_add(void); +static void l2arc_hdr_stat_remove(void); + static uint64_t -buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) +buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) { uintptr_t spav = (uintptr_t)spa; uint8_t *vdva = (uint8_t *)dva; @@ -460,7 +649,7 @@ buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) ((buf)->b_birth == birth) && ((buf)->b_spa == spa) static arc_buf_hdr_t * -buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) +buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); @@ -579,6 +768,20 @@ hdr_cons(void *vbuf, void *unused, int kmflag) bzero(buf, sizeof (arc_buf_hdr_t)); refcount_create(&buf->b_refcnt); cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + + ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); + return (0); +} + +/* ARGSUSED */ +static int +buf_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_t)); + rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); return (0); } @@ -594,6 +797,18 @@ hdr_dest(void *vbuf, void *unused) refcount_destroy(&buf->b_refcnt); cv_destroy(&buf->b_cv); + mutex_destroy(&buf->b_freeze_lock); + + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); +} + +/* ARGSUSED */ +static void +buf_dest(void *vbuf, void *unused) +{ + arc_buf_t *buf = vbuf; + + rw_destroy(&buf->b_lock); } /* @@ -639,7 +854,7 @@ retry: hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); + 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; 
i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) @@ -673,10 +888,24 @@ arc_cksum_verify(arc_buf_t *buf) mutex_exit(&buf->b_hdr->b_freeze_lock); } +static int +arc_cksum_equal(arc_buf_t *buf) +{ + zio_cksum_t zc; + int equal; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); + mutex_exit(&buf->b_hdr->b_freeze_lock); + + return (equal); +} + static void -arc_cksum_compute(arc_buf_t *buf) +arc_cksum_compute(arc_buf_t *buf, boolean_t force) { - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) return; mutex_enter(&buf->b_hdr->b_freeze_lock); @@ -693,14 +922,14 @@ arc_cksum_compute(arc_buf_t *buf) void arc_buf_thaw(arc_buf_t *buf) { - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (buf->b_hdr->b_state != arc_anon) + panic("modifying non-anon buffer!"); + if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + panic("modifying buffer while i/o in progress!"); + arc_cksum_verify(buf); + } - if (buf->b_hdr->b_state != arc_anon) - panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) - panic("modifying buffer while i/o in progress!"); - arc_cksum_verify(buf); mutex_enter(&buf->b_hdr->b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); @@ -717,7 +946,7 @@ arc_buf_freeze(arc_buf_t *buf) ASSERT(buf->b_hdr->b_freeze_cksum != NULL || buf->b_hdr->b_state == arc_anon); - arc_cksum_compute(buf); + arc_cksum_compute(buf, B_FALSE); } static void @@ -728,21 +957,23 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) if ((refcount_add(&ab->b_refcnt, tag) == 1) && (ab->b_state != arc_anon)) { uint64_t delta = ab->b_size * ab->b_datacnt; + list_t *list = &ab->b_state->arcs_list[ab->b_type]; + uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); mutex_enter(&ab->b_state->arcs_mtx); ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&ab->b_state->arcs_list, ab); + list_remove(list, ab); if (GHOST_STATE(ab->b_state)) { ASSERT3U(ab->b_datacnt, ==, 0); ASSERT3P(ab->b_buf, ==, NULL); delta = ab->b_size; } ASSERT(delta > 0); - ASSERT3U(ab->b_state->arcs_lsize, >=, delta); - atomic_add_64(&ab->b_state->arcs_lsize, -delta); + ASSERT3U(*size, >=, delta); + atomic_add_64(size, -delta); mutex_exit(&ab->b_state->arcs_mtx); - /* remove the prefetch flag is we get a reference */ + /* remove the prefetch flag if we get a reference */ if (ab->b_flags & ARC_PREFETCH) ab->b_flags &= ~ARC_PREFETCH; } @@ -759,13 +990,14 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && (state != arc_anon)) { + uint64_t *size = &state->arcs_lsize[ab->b_type]; + ASSERT(!MUTEX_HELD(&state->arcs_mtx)); mutex_enter(&state->arcs_mtx); ASSERT(!list_link_active(&ab->b_arc_node)); - list_insert_head(&state->arcs_list, ab); + list_insert_head(&state->arcs_list[ab->b_type], ab); ASSERT(ab->b_datacnt > 0); - atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt); - ASSERT3U(state->arcs_size, >=, state->arcs_lsize); + atomic_add_64(size, ab->b_size * ab->b_datacnt); mutex_exit(&state->arcs_mtx); } return (cnt); @@ -796,12 +1028,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) if (refcnt == 0) { if (old_state != arc_anon) { int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); + 
uint64_t *size = &old_state->arcs_lsize[ab->b_type]; if (use_mutex) mutex_enter(&old_state->arcs_mtx); ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&old_state->arcs_list, ab); + list_remove(&old_state->arcs_list[ab->b_type], ab); /* * If prefetching out of the ghost cache, @@ -812,19 +1045,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT(ab->b_buf == NULL); from_delta = ab->b_size; } - ASSERT3U(old_state->arcs_lsize, >=, from_delta); - atomic_add_64(&old_state->arcs_lsize, -from_delta); + ASSERT3U(*size, >=, from_delta); + atomic_add_64(size, -from_delta); if (use_mutex) mutex_exit(&old_state->arcs_mtx); } if (new_state != arc_anon) { int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); + uint64_t *size = &new_state->arcs_lsize[ab->b_type]; if (use_mutex) mutex_enter(&new_state->arcs_mtx); - list_insert_head(&new_state->arcs_list, ab); + list_insert_head(&new_state->arcs_list[ab->b_type], ab); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { @@ -832,9 +1066,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT(ab->b_buf == NULL); to_delta = ab->b_size; } - atomic_add_64(&new_state->arcs_lsize, to_delta); - ASSERT3U(new_state->arcs_size + to_delta, >=, - new_state->arcs_lsize); + atomic_add_64(size, to_delta); if (use_mutex) mutex_exit(&new_state->arcs_mtx); @@ -842,7 +1074,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon && old_state != arc_anon) { + if (new_state == arc_anon) { buf_hash_remove(ab); } @@ -854,6 +1086,47 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) atomic_add_64(&old_state->arcs_size, -from_delta); } ab->b_state = new_state; + + /* adjust l2arc hdr stats */ + if (new_state == arc_l2c_only) + l2arc_hdr_stat_add(); + else if (old_state == arc_l2c_only) + l2arc_hdr_stat_remove(); +} + +void +arc_space_consume(uint64_t space) +{ + atomic_add_64(&arc_meta_used, space); + atomic_add_64(&arc_size, space); +} + +void +arc_space_return(uint64_t space) +{ + ASSERT(arc_meta_used >= space); + if (arc_meta_max < arc_meta_used) + arc_meta_max = arc_meta_used; + atomic_add_64(&arc_meta_used, -space); + ASSERT(arc_size >= space); + atomic_add_64(&arc_size, -space); +} + +void * +arc_data_buf_alloc(uint64_t size) +{ + if (arc_evict_needed(ARC_BUFC_DATA)) + cv_signal(&arc_reclaim_thr_cv); + atomic_add_64(&arc_size, size); + return (zio_data_buf_alloc(size)); +} + +void +arc_data_buf_free(void *buf, uint64_t size) +{ + zio_data_buf_free(buf, size); + ASSERT(arc_size >= size); + atomic_add_64(&arc_size, -size); } arc_buf_t * @@ -863,15 +1136,14 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) arc_buf_t *buf; ASSERT3U(size, >, 0); - hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); + hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; hdr->b_spa = spa; hdr->b_state = arc_anon; hdr->b_arc_access = 0; - mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; @@ -894,7 +1166,7 @@ arc_buf_clone(arc_buf_t *from) arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = 
NULL; @@ -914,28 +1186,21 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) kmutex_t *hash_lock; /* - * Check to see if this buffer is currently being evicted via - * arc_do_user_evicts(). + * Check to see if this buffer is evicted. Callers + * must verify b_data != NULL to know if the add_ref + * was successful. */ - mutex_enter(&arc_eviction_mtx); - hdr = buf->b_hdr; - if (hdr == NULL) { - mutex_exit(&arc_eviction_mtx); + rw_enter(&buf->b_lock, RW_READER); + if (buf->b_data == NULL) { + rw_exit(&buf->b_lock); return; } + hdr = buf->b_hdr; + ASSERT(hdr != NULL); hash_lock = HDR_LOCK(hdr); - mutex_exit(&arc_eviction_mtx); - mutex_enter(hash_lock); - if (buf->b_data == NULL) { - /* - * This buffer is evicted. - */ - mutex_exit(hash_lock); - return; - } + rw_exit(&buf->b_lock); - ASSERT(buf->b_hdr == hdr); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); add_reference(hdr, hash_lock, tag); arc_access(hdr, hash_lock); @@ -946,6 +1211,29 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) data, metadata, hits); } +/* + * Free the arc data buffer. If it is an l2arc write in progress, + * the buffer is placed on l2arc_free_on_write to be freed later. + */ +static void +arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), + void *data, size_t size) +{ + if (HDR_L2_WRITING(hdr)) { + l2arc_data_free_t *df; + df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df->l2df_data = data; + df->l2df_size = size; + df->l2df_func = free_func; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + free_func(data, size); + } +} + static void arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) { @@ -960,18 +1248,24 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) arc_cksum_verify(buf); if (!recycle) { if (type == ARC_BUFC_METADATA) { - zio_buf_free(buf->b_data, size); + arc_buf_data_free(buf->b_hdr, zio_buf_free, + buf->b_data, size); + arc_space_return(size); } else { ASSERT(type == ARC_BUFC_DATA); - zio_data_buf_free(buf->b_data, size); + arc_buf_data_free(buf->b_hdr, + zio_data_buf_free, buf->b_data, size); + atomic_add_64(&arc_size, -size); } - atomic_add_64(&arc_size, -size); } if (list_link_active(&buf->b_hdr->b_arc_node)) { + uint64_t *cnt = &state->arcs_lsize[type]; + ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); ASSERT(state != arc_anon); - ASSERT3U(state->arcs_lsize, >=, size); - atomic_add_64(&state->arcs_lsize, -size); + + ASSERT3U(*cnt, >=, size); + atomic_add_64(cnt, -size); } ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); @@ -1002,6 +1296,35 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ASSERT(refcount_is_zero(&hdr->b_refcnt)); ASSERT3P(hdr->b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!(hdr->b_flags & ARC_STORED)); + + if (hdr->b_l2hdr != NULL) { + if (!MUTEX_HELD(&l2arc_buflist_mtx)) { + /* + * To prevent arc_free() and l2arc_evict() from + * attempting to free the same buffer at the same time, + * a FREE_IN_PROGRESS flag is given to arc_free() to + * give it priority. l2arc_evict() can't destroy this + * header while we are waiting on l2arc_buflist_mtx. + * + * The hdr may be removed from l2ad_buflist before we + * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
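/*
 * [Editor's sketch, not part of this changeset.]  The consumer side of
 * the free-on-write list built by arc_buf_data_free() above is outside
 * this excerpt; after an L2ARC write completes, the drain would look
 * roughly like this (hypothetical name):
 */
static void
l2arc_drain_free_on_write(void)
{
	l2arc_data_free_t *df, *df_prev;

	mutex_enter(&l2arc_free_on_write_mtx);
	for (df = list_tail(l2arc_free_on_write); df; df = df_prev) {
		df_prev = list_prev(l2arc_free_on_write, df);
		/* Safe now: the in-flight L2ARC write has finished. */
		list_remove(l2arc_free_on_write, df);
		df->l2df_func(df->l2df_data, df->l2df_size);
		kmem_free(df, sizeof (l2arc_data_free_t));
	}
	mutex_exit(&l2arc_free_on_write_mtx);
}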
+ */ + mutex_enter(&l2arc_buflist_mtx); + if (hdr->b_l2hdr != NULL) { + list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, + hdr); + } + mutex_exit(&l2arc_buflist_mtx); + } else { + list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr); + } + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t)); + if (hdr->b_state == arc_l2c_only) + l2arc_hdr_stat_remove(); + hdr->b_l2hdr = NULL; + } if (!BUF_EMPTY(hdr)) { ASSERT(!HDR_IN_HASH_TABLE(hdr)); @@ -1014,12 +1337,14 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) if (buf->b_efunc) { mutex_enter(&arc_eviction_mtx); + rw_enter(&buf->b_lock, RW_WRITER); ASSERT(buf->b_hdr != NULL); arc_buf_destroy(hdr->b_buf, FALSE, FALSE); hdr->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; + rw_exit(&buf->b_lock); mutex_exit(&arc_eviction_mtx); } else { arc_buf_destroy(hdr->b_buf, FALSE, TRUE); @@ -1029,7 +1354,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } - mutex_destroy(&hdr->b_freeze_lock); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT3P(hdr->b_hash_next, ==, NULL); @@ -1124,14 +1448,19 @@ arc_buf_size(arc_buf_t *buf) * - return the data block from this buffer rather than freeing it. * This flag is used by callers that are trying to make space for a * new buffer in a full arc cache. + * + * This function makes a "best effort". It skips over any buffers + * it can't get a hash_lock on, and so may not catch all candidates. + * It may also return without evicting as much space as requested. */ static void * -arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, +arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { arc_state_t *evicted_state; uint64_t bytes_evicted = 0, skipped = 0, missed = 0; arc_buf_hdr_t *ab, *ab_prev = NULL; + list_t *list = &state->arcs_list[type]; kmutex_t *hash_lock; boolean_t have_lock; void *stolen = NULL; @@ -1143,10 +1472,11 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); - for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { - ab_prev = list_prev(&state->arcs_list, ab); + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(ab) || + (spa && ab->b_spa != spa) || (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) { skipped++; @@ -1163,10 +1493,15 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, ASSERT(ab->b_datacnt > 0); while (ab->b_buf) { arc_buf_t *buf = ab->b_buf; + if (!rw_tryenter(&buf->b_lock, RW_WRITER)) { + missed += 1; + break; + } if (buf->b_data) { bytes_evicted += ab->b_size; if (recycle && ab->b_type == type && - ab->b_size == bytes) { + ab->b_size == bytes && + !HDR_L2_WRITING(ab)) { stolen = buf->b_data; recycle = FALSE; } @@ -1180,16 +1515,20 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&arc_eviction_mtx); + rw_exit(&buf->b_lock); } else { + rw_exit(&buf->b_lock); arc_buf_destroy(buf, buf->b_data == stolen, TRUE); } } - ASSERT(ab->b_datacnt == 0); - arc_change_state(evicted_state, ab, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(ab)); - ab->b_flags = ARC_IN_HASH_TABLE; - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + if (ab->b_datacnt == 0) { + 
arc_change_state(evicted_state, ab, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(ab)); + ab->b_flags |= ARC_IN_HASH_TABLE; + ab->b_flags &= ~ARC_BUF_AVAILABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + } if (!have_lock) mutex_exit(hash_lock); if (bytes >= 0 && bytes_evicted >= bytes) @@ -1212,6 +1551,27 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, if (missed) ARCSTAT_INCR(arcstat_mutex_miss, missed); + /* + * We have just evicted some date into the ghost state, make + * sure we also adjust the ghost state size if necessary. + */ + if (arc_no_grow && + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { + int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + + arc_mru_ghost->arcs_size - arc_c; + + if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { + int64_t todelete = + MIN(arc_mru_ghost->arcs_lsize[type], mru_over); + arc_evict_ghost(arc_mru_ghost, NULL, todelete); + } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { + int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], + arc_mru_ghost->arcs_size + + arc_mfu_ghost->arcs_size - arc_c); + arc_evict_ghost(arc_mfu_ghost, NULL, todelete); + } + } + return (stolen); } @@ -1220,9 +1580,10 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, * bytes. Destroy the buffers that are removed. */ static void -arc_evict_ghost(arc_state_t *state, int64_t bytes) +arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; + list_t *list = &state->arcs_list[ARC_BUFC_DATA]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; @@ -1230,17 +1591,30 @@ arc_evict_ghost(arc_state_t *state, int64_t bytes) ASSERT(GHOST_STATE(state)); top: mutex_enter(&state->arcs_mtx); - for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { - ab_prev = list_prev(&state->arcs_list, ab); + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); + if (spa && ab->b_spa != spa) + continue; hash_lock = HDR_LOCK(ab); if (mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(ab)); ASSERT(ab->b_buf == NULL); - arc_change_state(arc_anon, ab, hash_lock); - mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_deleted); bytes_deleted += ab->b_size; - arc_hdr_destroy(ab); + + if (ab->b_l2hdr != NULL) { + /* + * This buffer is cached on the 2nd Level ARC; + * don't destroy the header. 
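/*
 * [Editor's sketch, not part of this changeset.]  Why the header
 * survives in ARC_l2c_only rather than being destroyed: it stays in the
 * hash table, so a later read can still discover the L2ARC copy with a
 * cheap lookup, roughly:
 */
static boolean_t
arc_hdr_in_l2(spa_t *spa, const dva_t *dva, uint64_t birth)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	boolean_t inl2;

	hdr = buf_hash_find(spa, dva, birth, &hash_lock);
	if (hdr == NULL)
		return (B_FALSE);
	inl2 = (hdr->b_l2hdr != NULL);	/* device and offset live here */
	mutex_exit(hash_lock);
	return (inl2);
}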
+ */ + arc_change_state(arc_l2c_only, ab, hash_lock); + mutex_exit(hash_lock); + } else { + arc_change_state(arc_anon, ab, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(ab); + } + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); if (bytes >= 0 && bytes_deleted >= bytes) break; @@ -1256,6 +1630,12 @@ top: } mutex_exit(&state->arcs_mtx); + if (list == &state->arcs_list[ARC_BUFC_DATA] && + (bytes < 0 || bytes_deleted < bytes)) { + list = &state->arcs_list[ARC_BUFC_METADATA]; + goto top; + } + if (bufs_skipped) { ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); ASSERT(bytes >= 0); @@ -1271,38 +1651,58 @@ arc_adjust(void) { int64_t top_sz, mru_over, arc_over, todelete; - top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used; + + if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { + int64_t toevict = + MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p); + (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA); + top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + } - if (top_sz > arc_p && arc_mru->arcs_lsize > 0) { - int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p); - (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF); + if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t toevict = + MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); + (void) arc_evict(arc_mru, NULL, toevict, FALSE, + ARC_BUFC_METADATA); top_sz = arc_anon->arcs_size + arc_mru->arcs_size; } mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; if (mru_over > 0) { - if (arc_mru_ghost->arcs_lsize > 0) { - todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over); - arc_evict_ghost(arc_mru_ghost, todelete); + if (arc_mru_ghost->arcs_size > 0) { + todelete = MIN(arc_mru_ghost->arcs_size, mru_over); + arc_evict_ghost(arc_mru_ghost, NULL, todelete); } } if ((arc_over = arc_size - arc_c) > 0) { int64_t tbl_over; - if (arc_mfu->arcs_lsize > 0) { - int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over); - (void) arc_evict(arc_mfu, toevict, FALSE, - ARC_BUFC_UNDEF); + if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { + int64_t toevict = + MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); + (void) arc_evict(arc_mfu, NULL, toevict, FALSE, + ARC_BUFC_DATA); + arc_over = arc_size - arc_c; + } + + if (arc_over > 0 && + arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t toevict = + MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], + arc_over); + (void) arc_evict(arc_mfu, NULL, toevict, FALSE, + ARC_BUFC_METADATA); } - tbl_over = arc_size + arc_mru_ghost->arcs_lsize + - arc_mfu_ghost->arcs_lsize - arc_c*2; + tbl_over = arc_size + arc_mru_ghost->arcs_size + + arc_mfu_ghost->arcs_size - arc_c * 2; - if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) { - todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over); - arc_evict_ghost(arc_mfu_ghost, todelete); + if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) { + todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); + arc_evict_ghost(arc_mfu_ghost, NULL, todelete); } } } @@ -1314,7 +1714,9 @@ arc_do_user_evicts(void) while (arc_eviction_list != NULL) { arc_buf_t *buf = arc_eviction_list; arc_eviction_list = buf->b_next; + rw_enter(&buf->b_lock, RW_WRITER); buf->b_hdr = NULL; + rw_exit(&buf->b_lock); mutex_exit(&arc_eviction_mtx); if (buf->b_efunc != NULL) @@ -1329,24 +1731,40 @@ arc_do_user_evicts(void) } /* - * Flush all *evictable* data from the cache. + * Flush all *evictable* data from the cache for the given spa. * NOTE: this will not touch "active" (i.e. 
referenced) data. */ void -arc_flush(void) +arc_flush(spa_t *spa) { - while (list_head(&arc_mru->arcs_list)) - (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF); - while (list_head(&arc_mfu->arcs_list)) - (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF); + while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { + (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); + if (spa) + break; + } + while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { + (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); + if (spa) + break; + } + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { + (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); + if (spa) + break; + } + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { + (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); + if (spa) + break; + } - arc_evict_ghost(arc_mru_ghost, -1); - arc_evict_ghost(arc_mfu_ghost, -1); + arc_evict_ghost(arc_mru_ghost, spa, -1); + arc_evict_ghost(arc_mfu_ghost, spa, -1); mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); mutex_exit(&arc_reclaim_thr_lock); - ASSERT(arc_eviction_list == NULL); + ASSERT(spa || arc_eviction_list == NULL); } int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ @@ -1380,7 +1798,7 @@ arc_shrink(void) arc_adjust(); } -static int zfs_needfree = 0; +static int needfree = 0; static int arc_reclaim_needed(void) @@ -1391,13 +1809,28 @@ arc_reclaim_needed(void) #ifdef _KERNEL - if (zfs_needfree) + if (needfree) return (1); #if 0 /* + * take 'desfree' extra pages, so we reclaim sooner, rather than later + */ + extra = desfree; + + /* + * check that we're out of range of the pageout scanner. It starts to + * schedule paging if freemem is less than lotsfree and needfree. + * lotsfree is the high-water mark for pageout, and needfree is the + * number of needed free pages. We add extra pages here to make sure + * the scanner doesn't start up while we're freeing memory. + */ + if (freemem < lotsfree + needfree + extra) + return (1); + + /* * check to make sure that swapfs has enough space so that anon - * reservations can still succeeed. anon_resvmem() checks that the + * reservations can still succeed. anon_resvmem() checks that the * availrmem is greater than swapfs_minfree, and the number of reserved * swap pages. We also add a bit of extra here just to prevent * circumstances from getting really dire. @@ -1405,23 +1838,6 @@ arc_reclaim_needed(void) if (availrmem < swapfs_minfree + swapfs_reserve + extra) return (1); - /* - * If zio data pages are being allocated out of a separate heap segment, - * then check that the size of available vmem for this area remains - * above 1/4th free. This needs to be done when the size of the - * non-default segment is smaller than physical memory, so we could - * conceivably run out of VA in that segment before running out of - * physical memory. - */ - if (zio_arena != NULL) { - size_t arc_ziosize = - btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC)); - - if ((physmem > arc_ziosize) && - (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2)) - return (1); - } - #if defined(__i386) /* * If we're on an i386 platform, it's possible that we'll exhaust the @@ -1431,7 +1847,7 @@ arc_reclaim_needed(void) * can have in the system. However, this is generally fixed at 25 pages * which is so low that it's useless. In this comparison, we seek to * calculate the total heap-size, and reclaim if more than 3/4ths of the - * heap is allocated. 
(Or, in the caclulation, if less than 1/4th is + * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ if (btop(vmem_size(heap_arena, VMEM_FREE)) < @@ -1462,12 +1878,13 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) #endif #ifdef _KERNEL - /* - * First purge some DNLC entries, in case the DNLC is using - * up too much memory. - */ - dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); - + if (arc_meta_used >= arc_meta_limit) { + /* + * We are exceeding our meta-data cache limit. + * Purge some DNLC entries to release holds on meta-data. + */ + dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); + } #if defined(__i386) /* * Reclaim unused memory from all kmem caches. @@ -1477,7 +1894,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) #endif /* - * An agressive reclamation will shrink the cache size as well as + * An aggressive reclamation will shrink the cache size as well as * reap free buffers from the arc kmem caches. */ if (strat == ARC_RECLAIM_AGGR) @@ -1526,11 +1943,10 @@ arc_reclaim_thread(void *dummy __unused) /* reset the growth delay for every reclaim */ growtime = LBOLT + (arc_grow_retry * hz); - ASSERT(growtime > 0); - if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) { + if (needfree && last_reclaim == ARC_RECLAIM_CONS) { /* - * If zfs_needfree is TRUE our vm_lowmem hook + * If needfree is TRUE our vm_lowmem hook * was called and in that case we must free some * memory, so switch to aggressive mode. */ @@ -1538,11 +1954,13 @@ arc_reclaim_thread(void *dummy __unused) last_reclaim = ARC_RECLAIM_AGGR; } arc_kmem_reap_now(last_reclaim); - } else if ((growtime > 0) && ((growtime - LBOLT) <= 0)) { + arc_warm = B_TRUE; + + } else if (arc_no_grow && LBOLT >= growtime) { arc_no_grow = FALSE; } - if (zfs_needfree || + if (needfree || (2 * arc_c < arc_size + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)) arc_adjust(); @@ -1551,9 +1969,9 @@ arc_reclaim_thread(void *dummy __unused) arc_do_user_evicts(); if (arc_reclaim_needed()) { - zfs_needfree = 0; + needfree = 0; #ifdef _KERNEL - wakeup(&zfs_needfree); + wakeup(&needfree); #endif } @@ -1580,6 +1998,9 @@ arc_adapt(int bytes, arc_state_t *state) { int mult; + if (state == arc_l2c_only) + return; + ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: @@ -1634,8 +2055,25 @@ arc_adapt(int bytes, arc_state_t *state) * prior to insert. */ static int -arc_evict_needed() +arc_evict_needed(arc_buf_contents_t type) { + if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) + return (1); + +#if 0 +#ifdef _KERNEL + /* + * If zio data pages are being allocated out of a separate heap segment, + * then enforce that the size of available vmem for this area remains + * above about 1/32nd free. + */ + if (type == ARC_BUFC_DATA && zio_arena != NULL && + vmem_size(zio_arena, VMEM_FREE) < + (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) + return (1); +#endif +#endif + if (arc_reclaim_needed()) return (1); @@ -1678,14 +2116,15 @@ arc_get_data_buf(arc_buf_t *buf) * We have not yet reached cache maximum size, * just allocate a new buffer. 
*/ - if (!arc_evict_needed()) { + if (!arc_evict_needed(type)) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); + arc_space_consume(size); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + atomic_add_64(&arc_size, size); } - atomic_add_64(&arc_size, size); goto out; } @@ -1700,20 +2139,23 @@ arc_get_data_buf(arc_buf_t *buf) if (state == arc_mru || state == arc_anon) { uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; - state = (arc_p > mru_used) ? arc_mfu : arc_mru; + state = (arc_mfu->arcs_lsize[type] > 0 && + arc_p > mru_used) ? arc_mfu : arc_mru; } else { /* MFU cases */ uint64_t mfu_space = arc_c - arc_p; - state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; + state = (arc_mru->arcs_lsize[type] > 0 && + mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; } - if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { + if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); + arc_space_consume(size); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + atomic_add_64(&arc_size, size); } - atomic_add_64(&arc_size, size); ARCSTAT_BUMP(arcstat_recycle_miss); } ASSERT(buf->b_data != NULL); @@ -1728,7 +2170,7 @@ out: atomic_add_64(&hdr->b_state->arcs_size, size); if (list_link_active(&hdr->b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_refcnt)); - atomic_add_64(&hdr->b_state->arcs_lsize, size); + atomic_add_64(&hdr->b_state->arcs_lsize[type], size); } /* * If we are growing the cache, and we are adding anonymous @@ -1773,10 +2215,6 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) if ((buf->b_flags & ARC_PREFETCH) != 0) { if (refcount_count(&buf->b_refcnt) == 0) { ASSERT(list_link_active(&buf->b_arc_node)); - mutex_enter(&arc_mru->arcs_mtx); - list_remove(&arc_mru->arcs_list, buf); - list_insert_head(&arc_mru->arcs_list, buf); - mutex_exit(&arc_mru->arcs_mtx); } else { buf->b_flags &= ~ARC_PREFETCH; ARCSTAT_BUMP(arcstat_mru_hits); @@ -1836,10 +2274,6 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) if ((buf->b_flags & ARC_PREFETCH) != 0) { ASSERT(refcount_count(&buf->b_refcnt) == 0); ASSERT(list_link_active(&buf->b_arc_node)); - mutex_enter(&arc_mfu->arcs_mtx); - list_remove(&arc_mfu->arcs_list, buf); - list_insert_head(&arc_mfu->arcs_list, buf); - mutex_exit(&arc_mfu->arcs_mtx); } ARCSTAT_BUMP(arcstat_mfu_hits); buf->b_arc_access = LBOLT; @@ -1865,6 +2299,14 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) arc_change_state(new_state, buf, hash_lock); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + } else if (buf->b_state == arc_l2c_only) { + /* + * This buffer is on the 2nd Level ARC. 
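+		 * A renewed access means it is useful again, so it is
+		 * promoted back into the MFU state below.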
+		 */
+
+		buf->b_arc_access = LBOLT;
+		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+		arc_change_state(arc_mfu, buf, hash_lock);
 	} else {
 		ASSERT(!"invalid arc state");
 	}
@@ -1879,7 +2321,7 @@ arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
 	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
 }
 
-/* a generic arc_done_func_t which you can use */
+/* a generic arc_done_func_t */
 void
 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
@@ -1917,15 +2359,24 @@ arc_read_done(zio_t *zio)
 	    &hash_lock);
 
 	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
-	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
+	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+	    (found == hdr && HDR_L2_READING(hdr)));
+
+	hdr->b_flags &= ~ARC_L2_EVICTED;
+	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
+		hdr->b_flags &= ~ARC_L2CACHE;
 
 	/* byteswap if necessary */
 	callback_list = hdr->b_acb;
 	ASSERT(callback_list != NULL);
-	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
-		callback_list->acb_byteswap(buf->b_data, hdr->b_size);
+	if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
+		    byteswap_uint64_array :
+		    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
+		func(buf->b_data, hdr->b_size);
+	}
 
-	arc_cksum_compute(buf);
+	arc_cksum_compute(buf, B_FALSE);
 
 	/* create copies of the data buffer for the callers */
 	abuf = buf;
@@ -1952,9 +2403,6 @@ arc_read_done(zio_t *zio)
 		if (HDR_IN_HASH_TABLE(hdr))
 			buf_hash_remove(hdr);
 		freeable = refcount_is_zero(&hdr->b_refcnt);
-		/* convert checksum errors into IO errors */
-		if (zio->io_error == ECKSUM)
-			zio->io_error = EIO;
 	}
 
 	/*
@@ -2020,16 +2468,40 @@ arc_read_done(zio_t *zio)
  *
  * arc_read_done() will invoke all the requested "done" functions
  * for readers of this block.
+ *
+ * Normal callers should use arc_read and pass the arc buffer and offset
+ * for the bp.  But if you know you don't need locking, you can use
+ * arc_read_nolock.
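+ *
+ * A minimal caller sketch (variable names hypothetical, shown for
+ * illustration only):
+ *
+ *	uint32_t aflags = ARC_WAIT;
+ *	arc_buf_t *abuf = NULL;
+ *	(void) arc_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf,
+ *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);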
*/ int -arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, - arc_done_func_t *done, void *private, int priority, int flags, - uint32_t *arc_flags, zbookmark_t *zb) +arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) +{ + int err; + arc_buf_hdr_t *hdr = pbuf->b_hdr; + + ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); + ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); + rw_enter(&pbuf->b_lock, RW_READER); + + err = arc_read_nolock(pio, spa, bp, done, private, priority, + zio_flags, arc_flags, zb); + + ASSERT3P(hdr, ==, pbuf->b_hdr); + rw_exit(&pbuf->b_lock); + return (err); +} + +int +arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr; arc_buf_t *buf; kmutex_t *hash_lock; - zio_t *rzio; + zio_t *rzio; top: hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); @@ -2053,10 +2525,9 @@ top: KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - acb->acb_byteswap = swap; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, - spa, NULL, NULL, flags); + spa, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); acb->acb_next = hdr->b_acb; @@ -2093,6 +2564,8 @@ top: } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), @@ -2104,6 +2577,8 @@ top: } else { uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; + vdev_t *vd = NULL; + daddr_t addr; if (hdr == NULL) { /* this block is not in the cache */ @@ -2130,6 +2605,8 @@ top: private); hdr->b_flags |= ARC_PREFETCH; } + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; if (BP_GET_LEVEL(bp) > 0) hdr->b_flags |= ARC_INDIRECT; } else { @@ -2144,7 +2621,9 @@ top: hdr->b_flags |= ARC_PREFETCH; else add_reference(hdr, hash_lock, private); - buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + if (*arc_flags & ARC_L2CACHE) + hdr->b_flags |= ARC_L2CACHE; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; @@ -2160,7 +2639,6 @@ top: acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - acb->acb_byteswap = swap; ASSERT(hdr->b_acb == NULL); hdr->b_acb = acb; @@ -2176,6 +2654,18 @@ top: if (GHOST_STATE(hdr->b_state)) arc_access(hdr, hash_lock); + + if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && + (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { + addr = hdr->b_l2hdr->b_daddr; + /* + * Lock out device removal. + */ + if (vdev_is_dead(vd) || + !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) + vd = NULL; + } + mutex_exit(hash_lock); ASSERT3U(hdr->b_size, ==, size); @@ -2186,8 +2676,65 @@ top: demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, misses); + if (vd != NULL) { + /* + * Read from the L2ARC if the following are true: + * 1. The L2ARC vdev was previously cached. + * 2. This buffer still has L2ARC metadata. + * 3. This buffer isn't currently writing to the L2ARC. + * 4. The L2ARC entry wasn't evicted, which may + * also have invalidated the vdev. 
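+			 *
+			 * Note that the physical read below is issued
+			 * with ZIO_CHECKSUM_OFF; the buffer's integrity
+			 * is instead verified against the ARC's own
+			 * freeze checksum in l2arc_read_done().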
+ */ + if (hdr->b_l2hdr != NULL && + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { + l2arc_read_callback_t *cb; + + DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_hits); + + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), + KM_SLEEP); + cb->l2rcb_buf = buf; + cb->l2rcb_spa = spa; + cb->l2rcb_bp = *bp; + cb->l2rcb_zb = *zb; + cb->l2rcb_flags = zio_flags; + + /* + * l2arc read. The SCL_L2ARC lock will be + * released by l2arc_read_done(). + */ + rzio = zio_read_phys(pio, vd, addr, size, + buf->b_data, ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, zio_flags | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, + zio_t *, rzio); + + if (*arc_flags & ARC_NOWAIT) { + zio_nowait(rzio); + return (0); + } + + ASSERT(*arc_flags & ARC_WAIT); + if (zio_wait(rzio) == 0) + return (0); + + /* l2arc read error; goto zio_read() */ + } else { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + if (HDR_L2_WRITING(hdr)) + ARCSTAT_BUMP(arcstat_l2_rw_clash); + spa_config_exit(spa, SCL_L2ARC, vd); + } + } + rzio = zio_read(pio, spa, bp, buf->b_data, size, - arc_read_done, buf, priority, flags, zb); + arc_read_done, buf, priority, zio_flags, zb); if (*arc_flags & ARC_WAIT) return (zio_wait(rzio)); @@ -2254,45 +2801,28 @@ arc_buf_evict(arc_buf_t *buf) kmutex_t *hash_lock; arc_buf_t **bufp; - mutex_enter(&arc_eviction_mtx); + rw_enter(&buf->b_lock, RW_WRITER); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(). */ ASSERT(buf->b_data == NULL); - mutex_exit(&arc_eviction_mtx); + rw_exit(&buf->b_lock); return (0); - } - hash_lock = HDR_LOCK(hdr); - mutex_exit(&arc_eviction_mtx); - - mutex_enter(hash_lock); - - if (buf->b_data == NULL) { + } else if (buf->b_data == NULL) { + arc_buf_t copy = *buf; /* structure assignment */ /* - * We are on the eviction list. + * We are on the eviction list; process this buffer now + * but let arc_do_user_evicts() do the reaping. */ - mutex_exit(hash_lock); - mutex_enter(&arc_eviction_mtx); - if (buf->b_hdr == NULL) { - /* - * We are already in arc_do_user_evicts(). - */ - mutex_exit(&arc_eviction_mtx); - return (0); - } else { - arc_buf_t copy = *buf; /* structure assignment */ - /* - * Process this buffer now - * but let arc_do_user_evicts() do the reaping. - */ - buf->b_efunc = NULL; - mutex_exit(&arc_eviction_mtx); - VERIFY(copy.b_efunc(©) == 0); - return (1); - } + buf->b_efunc = NULL; + rw_exit(&buf->b_lock); + VERIFY(copy.b_efunc(©) == 0); + return (1); } + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); ASSERT(buf->b_hdr == hdr); ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); @@ -2323,12 +2853,14 @@ arc_buf_evict(arc_buf_t *buf) arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags = ARC_IN_HASH_TABLE; + hdr->b_flags |= ARC_IN_HASH_TABLE; + hdr->b_flags &= ~ARC_BUF_AVAILABLE; mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&old_state->arcs_mtx); } mutex_exit(hash_lock); + rw_exit(&buf->b_lock); VERIFY(buf->b_efunc(buf) == 0); buf->b_efunc = NULL; @@ -2342,16 +2874,22 @@ arc_buf_evict(arc_buf_t *buf) * Release this buffer from the cache. This must be done * after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make - * make a new hdr for the buffer. + * a new hdr for the buffer. 
*/ void arc_release(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock = HDR_LOCK(hdr); + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + l2arc_buf_hdr_t *l2hdr; + uint64_t buf_size; + + rw_enter(&buf->b_lock, RW_WRITER); + hdr = buf->b_hdr; /* this buffer is not on any list */ ASSERT(refcount_count(&hdr->b_refcnt) > 0); + ASSERT(!(hdr->b_flags & ARC_STORED)); if (hdr->b_state == arc_anon) { /* this buffer is already released */ @@ -2359,22 +2897,32 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(BUF_EMPTY(hdr)); ASSERT(buf->b_efunc == NULL); arc_buf_thaw(buf); + rw_exit(&buf->b_lock); return; } + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + l2hdr = hdr->b_l2hdr; + if (l2hdr) { + mutex_enter(&l2arc_buflist_mtx); + hdr->b_l2hdr = NULL; + buf_size = hdr->b_size; + } + /* * Do we have more than one buf? */ - if (hdr->b_buf != buf || buf->b_next != NULL) { + if (hdr->b_datacnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; spa_t *spa = hdr->b_spa; arc_buf_contents_t type = hdr->b_type; + uint32_t flags = hdr->b_flags; - ASSERT(hdr->b_datacnt > 1); + ASSERT(hdr->b_buf != buf || buf->b_next != NULL); /* * Pull the data off of this buf and attach it to * a new anonymous buf. @@ -2389,37 +2937,39 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); if (refcount_is_zero(&hdr->b_refcnt)) { - ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size); - atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size); + uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; + ASSERT3U(*size, >=, hdr->b_size); + atomic_add_64(size, -hdr->b_size); } hdr->b_datacnt -= 1; arc_cksum_verify(buf); mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); + nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); nhdr->b_size = blksz; nhdr->b_spa = spa; nhdr->b_type = type; nhdr->b_buf = buf; nhdr->b_state = arc_anon; nhdr->b_arc_access = 0; - nhdr->b_flags = 0; + nhdr->b_flags = flags & ARC_L2_WRITING; + nhdr->b_l2hdr = NULL; nhdr->b_datacnt = 1; nhdr->b_freeze_cksum = NULL; - mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); (void) refcount_add(&nhdr->b_refcnt, tag); buf->b_hdr = nhdr; + rw_exit(&buf->b_lock); atomic_add_64(&arc_anon->arcs_size, blksz); - - hdr = nhdr; } else { + rw_exit(&buf->b_lock); ASSERT(refcount_count(&hdr->b_refcnt) == 1); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; mutex_exit(hash_lock); + bzero(&hdr->b_dva, sizeof (dva_t)); hdr->b_birth = 0; hdr->b_cksum0 = 0; @@ -2427,25 +2977,47 @@ arc_release(arc_buf_t *buf, void *tag) } buf->b_efunc = NULL; buf->b_private = NULL; + + if (l2hdr) { + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -buf_size); + mutex_exit(&l2arc_buflist_mtx); + } } int arc_released(arc_buf_t *buf) { - return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + int released; + + rw_enter(&buf->b_lock, RW_READER); + released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + rw_exit(&buf->b_lock); + return (released); } int arc_has_callback(arc_buf_t *buf) { - return (buf->b_efunc != NULL); + int callback; + + rw_enter(&buf->b_lock, RW_READER); + callback = (buf->b_efunc != NULL); + rw_exit(&buf->b_lock); + return (callback); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { - return 
(refcount_count(&buf->b_hdr->b_refcnt)); + int referenced; + + rw_enter(&buf->b_lock, RW_READER); + referenced = (refcount_count(&buf->b_hdr->b_refcnt)); + rw_exit(&buf->b_lock); + return (referenced); } #endif @@ -2454,12 +3026,27 @@ arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; - if (callback->awcb_ready) { - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); - callback->awcb_ready(zio, buf, callback->awcb_private); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); + callback->awcb_ready(zio, buf, callback->awcb_private); + + /* + * If the IO is already in progress, then this is a re-write + * attempt, so we need to thaw and re-compute the cksum. + * It is the responsibility of the callback to handle the + * accounting for any re-write attempt. + */ + if (HDR_IO_IN_PROGRESS(hdr)) { + mutex_enter(&hdr->b_freeze_lock); + if (hdr->b_freeze_cksum != NULL) { + kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_freeze_cksum = NULL; + } + mutex_exit(&hdr->b_freeze_lock); } - arc_cksum_compute(buf); + arc_cksum_compute(buf, B_FALSE); + hdr->b_flags |= ARC_IO_IN_PROGRESS; } static void @@ -2471,9 +3058,6 @@ arc_write_done(zio_t *zio) hdr->b_acb = NULL; - /* this buffer is on no lists and is not in the hash table */ - ASSERT3P(hdr->b_state, ==, arc_anon); - hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = zio->io_bp->blk_birth; hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; @@ -2496,6 +3080,7 @@ arc_write_done(zio_t *zio) * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ + ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE); ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), BP_IDENTITY(zio->io_bp))); ASSERT3U(zio->io_bp_orig.blk_birth, ==, @@ -2509,7 +3094,9 @@ arc_write_done(zio_t *zio) ASSERT3P(exists, ==, NULL); } hdr->b_flags &= ~ARC_IO_IN_PROGRESS; - arc_access(hdr, hash_lock); + /* if it's not anon, we are doing a scrub */ + if (hdr->b_state == arc_anon) + arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else if (callback->awcb_done == NULL) { int destroy_hdr; @@ -2526,6 +3113,7 @@ arc_write_done(zio_t *zio) } else { hdr->b_flags &= ~ARC_IO_IN_PROGRESS; } + hdr->b_flags &= ~ARC_STORED; if (callback->awcb_done) { ASSERT(!refcount_is_zero(&hdr->b_refcnt)); @@ -2535,31 +3123,74 @@ arc_write_done(zio_t *zio) kmem_free(callback, sizeof (arc_write_callback_t)); } +static void +write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp) +{ + boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata); + + /* Determine checksum setting */ + if (ismd) { + /* + * Metadata always gets checksummed. If the data + * checksum is multi-bit correctable, and it's not a + * ZBT-style checksum, then it's suitable for metadata + * as well. Otherwise, the metadata checksum defaults + * to fletcher4. + */ + if (zio_checksum_table[wp->wp_oschecksum].ci_correctable && + !zio_checksum_table[wp->wp_oschecksum].ci_zbt) + zp->zp_checksum = wp->wp_oschecksum; + else + zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4; + } else { + zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum, + wp->wp_oschecksum); + } + + /* Determine compression setting */ + if (ismd) { + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + zp->zp_compress = zfs_mdcomp_disable ? 
ZIO_COMPRESS_EMPTY : + ZIO_COMPRESS_LZJB; + } else { + zp->zp_compress = zio_compress_select(wp->wp_dncompress, + wp->wp_oscompress); + } + + zp->zp_type = wp->wp_type; + zp->zp_level = wp->wp_level; + zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa)); +} + zio_t * -arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, - uint64_t txg, blkptr_t *bp, arc_buf_t *buf, +arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, + boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb) + int zio_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; - zio_t *zio; + zio_t *zio; + zio_prop_t zp; - /* this is a private buffer - no locking required */ - ASSERT3P(hdr->b_state, ==, arc_anon); - ASSERT(BUF_EMPTY(hdr)); + ASSERT(ready != NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); ASSERT(hdr->b_acb == 0); + if (l2arc) + hdr->b_flags |= ARC_L2CACHE; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; - hdr->b_flags |= ARC_IO_IN_PROGRESS; - zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, - buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, - priority, flags, zb); + + write_policy(spa, wp, &zp); + zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp, + arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); return (zio); } @@ -2584,7 +3215,9 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, * nonzero, it should match what we have in the cache. */ ASSERT(bp->blk_cksum.zc_word[0] == 0 || - ab->b_cksum0 == bp->blk_cksum.zc_word[0]); + bp->blk_cksum.zc_word[0] == ab->b_cksum0 || + bp->blk_fill == BLK_FILL_ALREADY_FREED); + if (ab->b_state != arc_anon) arc_change_state(arc_anon, ab, hash_lock); if (HDR_IO_IN_PROGRESS(ab)) { @@ -2604,6 +3237,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ab->b_buf->b_private = NULL; mutex_exit(hash_lock); } else if (refcount_is_zero(&ab->b_refcnt)) { + ab->b_flags |= ARC_FREE_IN_PROGRESS; mutex_exit(hash_lock); arc_hdr_destroy(ab); ARCSTAT_BUMP(arcstat_deleted); @@ -2624,7 +3258,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, } } - zio = zio_free(pio, spa, txg, bp, done, private); + zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED); if (arc_flags & ARC_WAIT) return (zio_wait(zio)); @@ -2635,16 +3269,75 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, return (0); } +static int +arc_memory_throttle(uint64_t reserve, uint64_t txg) +{ +#ifdef _KERNEL + uint64_t inflight_data = arc_anon->arcs_size; + uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count); + static uint64_t page_load = 0; + static uint64_t last_txg = 0; + +#if 0 +#if defined(__i386) + available_memory = + MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); +#endif +#endif + if (available_memory >= zfs_write_limit_max) + return (0); + + if (txg > last_txg) { + last_txg = txg; + page_load = 0; + } + /* + * If we are in pageout, we know that memory is already tight, + * the arc is already going to be evicting, so we just want to + * continue to let page writes occur as quickly as possible. 
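+	 *
+	 * (Each pageout write below charges a deflated reserve / 8
+	 * against page_load; once page_load exceeds a quarter of
+	 * available memory, ERESTART is returned until a new txg
+	 * resets the count.)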
+	 */
+	if (curproc == pageproc) {
+		if (page_load > available_memory / 4)
+			return (ERESTART);
+		/* Note: reserve is inflated, so we deflate */
+		page_load += reserve / 8;
+		return (0);
+	} else if (page_load > 0 && arc_reclaim_needed()) {
+		/* memory is low, delay before restarting */
+		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+		return (EAGAIN);
+	}
+	page_load = 0;
+
+	if (arc_size > arc_c_min) {
+		uint64_t evictable_memory =
+		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
+		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
+		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
+		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
+		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
+	}
+
+	if (inflight_data > available_memory / 4) {
+		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+		return (ERESTART);
+	}
+#endif
+	return (0);
+}
+
 void
-arc_tempreserve_clear(uint64_t tempreserve)
+arc_tempreserve_clear(uint64_t reserve)
 {
-	atomic_add_64(&arc_tempreserve, -tempreserve);
+	atomic_add_64(&arc_tempreserve, -reserve);
 	ASSERT((int64_t)arc_tempreserve >= 0);
 }
 
 int
-arc_tempreserve_space(uint64_t tempreserve)
+arc_tempreserve_space(uint64_t reserve, uint64_t txg)
 {
+	int error;
+
 #ifdef ZFS_DEBUG
 	/*
 	 * Once in a while, fail for no reason.  Everything should cope.
@@ -2654,31 +3347,37 @@ arc_tempreserve_space(uint64_t tempreserve)
 		return (ERESTART);
 	}
 #endif
-	if (tempreserve > arc_c/4 && !arc_no_grow)
-		arc_c = MIN(arc_c_max, tempreserve * 4);
-	if (tempreserve > arc_c)
+	if (reserve > arc_c/4 && !arc_no_grow)
+		arc_c = MIN(arc_c_max, reserve * 4);
+	if (reserve > arc_c)
 		return (ENOMEM);
 
 	/*
+	 * Writes will almost always require additional memory allocations
+	 * in order to compress/encrypt/etc the data.  We therefore need to
+	 * make sure that there is sufficient available memory for this.
+	 */
+	if (error = arc_memory_throttle(reserve, txg))
+		return (error);
+
+	/*
 	 * Throttle writes when the amount of dirty data in the cache
 	 * gets too large.  We try to keep the cache less than half full
 	 * of dirty blocks so that our sync times don't grow too large.
 	 * Note: if two requests come in concurrently, we might let them
 	 * both succeed, when one of them should fail.  Not a huge deal.
-	 *
-	 * XXX The limit should be adjusted dynamically to keep the time
-	 * to sync a dataset fixed (around 1-5 seconds?).
 	 */
-
-	if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
-	    arc_tempreserve + arc_anon->arcs_size > arc_c / 4) {
-		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
-		    "tempreserve=%lluK arc_c=%lluK\n",
-		    arc_tempreserve>>10, arc_anon->arcs_lsize>>10,
-		    tempreserve>>10, arc_c>>10);
+	if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
+	    arc_anon->arcs_size > arc_c / 4) {
+		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
+		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
+		    arc_tempreserve>>10,
+		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
+		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
+		    reserve>>10, arc_c>>10);
 		return (ERESTART);
 	}
-	atomic_add_64(&arc_tempreserve, tempreserve);
+	atomic_add_64(&arc_tempreserve, reserve);
 	return (0);
 }
 
@@ -2692,10 +3391,10 @@ arc_lowmem(void *arg __unused, int howto __unused)
 
 	/* Serialize access via arc_lowmem_lock.
*/ mutex_enter(&arc_lowmem_lock); - zfs_needfree = 1; + needfree = 1; cv_signal(&arc_reclaim_thr_cv); - while (zfs_needfree) - tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5); + while (needfree) + tsleep(&needfree, 0, "zfs:lowmem", hz / 5); mutex_exit(&arc_lowmem_lock); } #endif @@ -2743,6 +3442,16 @@ arc_init(void) arc_c = arc_c_max; arc_p = (arc_c >> 1); + /* limit meta-data to 1/4 of the arc capacity */ + arc_meta_limit = arc_c_max / 4; + + /* Allow the tunable to override if it is reasonable */ + if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) + arc_meta_limit = zfs_arc_meta_limit; + + if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) + arc_c_min = arc_meta_limit / 2; + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -2757,6 +3466,7 @@ arc_init(void) arc_mru_ghost = &ARC_mru_ghost; arc_mfu = &ARC_mfu; arc_mfu_ghost = &ARC_mfu_ghost; + arc_l2c_only = &ARC_l2c_only; arc_size = 0; mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -2764,15 +3474,28 @@ arc_init(void) mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - - list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); + mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + + list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); buf_init(); @@ -2798,6 +3521,13 @@ arc_init(void) #endif arc_dead = FALSE; + arc_warm = B_FALSE; + + if (zfs_write_limit_max == 0) + zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; + else + zfs_write_limit_shift = 0; + mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); #ifdef _KERNEL /* Warn about ZFS memory and address space requirements. 
*/ @@ -2808,9 +3538,9 @@ arc_init(void) if (kmem_size() < 512 * (1 << 20)) { printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " "expect unstable behavior.\n"); - printf(" Consider tuning vm.kmem_size and " + printf(" Consider tuning vm.kmem_size and " "vm.kmem_size_max\n"); - printf(" in /boot/loader.conf.\n"); + printf(" in /boot/loader.conf.\n"); } #endif } @@ -2818,6 +3548,7 @@ arc_init(void) void arc_fini(void) { + mutex_enter(&arc_reclaim_thr_lock); arc_thread_exit = 1; cv_signal(&arc_reclaim_thr_cv); @@ -2825,7 +3556,7 @@ arc_fini(void) cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); mutex_exit(&arc_reclaim_thr_lock); - arc_flush(); + arc_flush(NULL); arc_dead = TRUE; @@ -2838,10 +3569,14 @@ arc_fini(void) mutex_destroy(&arc_reclaim_thr_lock); cv_destroy(&arc_reclaim_thr_cv); - list_destroy(&arc_mru->arcs_list); - list_destroy(&arc_mru_ghost->arcs_list); - list_destroy(&arc_mfu->arcs_list); - list_destroy(&arc_mfu_ghost->arcs_list); + list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); mutex_destroy(&arc_anon->arcs_mtx); mutex_destroy(&arc_mru->arcs_mtx); @@ -2849,6 +3584,8 @@ arc_fini(void) mutex_destroy(&arc_mfu->arcs_mtx); mutex_destroy(&arc_mfu_ghost->arcs_mtx); + mutex_destroy(&zfs_write_limit_lock); + buf_fini(); mutex_destroy(&arc_lowmem_lock); @@ -2857,3 +3594,985 @@ arc_fini(void) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); #endif } + +/* + * Level 2 ARC + * + * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. + * It uses dedicated storage devices to hold cached data, which are populated + * using large infrequent writes. The main role of this cache is to boost + * the performance of random read workloads. The intended L2ARC devices + * include short-stroked disks, solid state disks, and other media with + * substantially faster read latency than disk. + * + * +-----------------------+ + * | ARC | + * +-----------------------+ + * | ^ ^ + * | | | + * l2arc_feed_thread() arc_read() + * | | | + * | l2arc read | + * V | | + * +---------------+ | + * | L2ARC | | + * +---------------+ | + * | ^ | + * l2arc_write() | | + * | | | + * V | | + * +-------+ +-------+ + * | vdev | | vdev | + * | cache | | cache | + * +-------+ +-------+ + * +=========+ .-----. + * : L2ARC : |-_____-| + * : devices : | Disks | + * +=========+ `-_____-' + * + * Read requests are satisfied from the following sources, in order: + * + * 1) ARC + * 2) vdev cache of L2ARC devices + * 3) L2ARC devices + * 4) vdev cache of disks + * 5) disks + * + * Some L2ARC device types exhibit extremely slow write performance. + * To accommodate for this there are some significant differences between + * the L2ARC and traditional cache design: + * + * 1. There is no eviction path from the ARC to the L2ARC. Evictions from + * the ARC behave as usual, freeing buffers and placing headers on ghost + * lists. The ARC does not send buffers to the L2ARC during eviction as + * this would add inflated write latencies for all ARC memory pressure. + * + * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 
+ * It does this by periodically scanning buffers from the eviction-end of + * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are + * not already there. It scans until a headroom of buffers is satisfied, + * which itself is a buffer for ARC eviction. The thread that does this is + * l2arc_feed_thread(), illustrated below; example sizes are included to + * provide a better sense of ratio than this diagram: + * + * head --> tail + * +---------------------+----------+ + * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC + * +---------------------+----------+ | o L2ARC eligible + * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer + * +---------------------+----------+ | + * 15.9 Gbytes ^ 32 Mbytes | + * headroom | + * l2arc_feed_thread() + * | + * l2arc write hand <--[oooo]--' + * | 8 Mbyte + * | write max + * V + * +==============================+ + * L2ARC dev |####|#|###|###| |####| ... | + * +==============================+ + * 32 Gbytes + * + * 3. If an ARC buffer is copied to the L2ARC but then hit instead of + * evicted, then the L2ARC has cached a buffer much sooner than it probably + * needed to, potentially wasting L2ARC device bandwidth and storage. It is + * safe to say that this is an uncommon case, since buffers at the end of + * the ARC lists have moved there due to inactivity. + * + * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, + * then the L2ARC simply misses copying some buffers. This serves as a + * pressure valve to prevent heavy read workloads from both stalling the ARC + * with waits and clogging the L2ARC with writes. This also helps prevent + * the potential for the L2ARC to churn if it attempts to cache content too + * quickly, such as during backups of the entire pool. + * + * 5. After system boot and before the ARC has filled main memory, there are + * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru + * lists can remain mostly static. Instead of searching from tail of these + * lists as pictured, the l2arc_feed_thread() will search from the list heads + * for eligible buffers, greatly increasing its chance of finding them. + * + * The L2ARC device write speed is also boosted during this time so that + * the L2ARC warms up faster. Since there have been no ARC evictions yet, + * there are no L2ARC reads, and no fear of degrading read performance + * through increased writes. + * + * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that + * the vdev queue can aggregate them into larger and fewer writes. Each + * device is written to in a rotor fashion, sweeping writes through + * available space then repeating. + * + * 7. The L2ARC does not store dirty content. It never needs to flush + * write buffers back to disk based storage. + * + * 8. If an ARC buffer is written (and dirtied) which also exists in the + * L2ARC, the now stale L2ARC buffer is immediately dropped. + * + * The performance of the L2ARC can be tweaked by a number of tunables, which + * may be necessary for different workloads: + * + * l2arc_write_max max write bytes per interval + * l2arc_write_boost extra write bytes during device warmup + * l2arc_noprefetch skip caching prefetched buffers + * l2arc_headroom number of max device writes to precache + * l2arc_feed_secs seconds between L2ARC writing + * + * Tunables may be removed or added as future performance improvements are + * integrated, and also may become zpool properties. 
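+ *
+ * As a rough worked example (assuming the shipped defaults of an
+ * 8 Mbyte l2arc_write_max, an 8 Mbyte l2arc_write_boost and a one
+ * second l2arc_feed_secs), each cache device would be fed at up to
+ * 16 Mbytes/sec while arc_warm is false, and at up to 8 Mbytes/sec
+ * thereafter.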
+ */ + +static void +l2arc_hdr_stat_add(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); + ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); +} + +static void +l2arc_hdr_stat_remove(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); + ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); +} + +/* + * Cycle through L2ARC devices. This is how L2ARC load balances. + * If a device is returned, this also returns holding the spa config lock. + */ +static l2arc_dev_t * +l2arc_dev_get_next(void) +{ + l2arc_dev_t *first, *next = NULL; + + /* + * Lock out the removal of spas (spa_namespace_lock), then removal + * of cache devices (l2arc_dev_mtx). Once a device has been selected, + * both locks will be dropped and a spa config lock held instead. + */ + mutex_enter(&spa_namespace_lock); + mutex_enter(&l2arc_dev_mtx); + + /* if there are no vdevs, there is nothing to do */ + if (l2arc_ndev == 0) + goto out; + + first = NULL; + next = l2arc_dev_last; + do { + /* loop around the list looking for a non-faulted vdev */ + if (next == NULL) { + next = list_head(l2arc_dev_list); + } else { + next = list_next(l2arc_dev_list, next); + if (next == NULL) + next = list_head(l2arc_dev_list); + } + + /* if we have come back to the start, bail out */ + if (first == NULL) + first = next; + else if (next == first) + break; + + } while (vdev_is_dead(next->l2ad_vdev)); + + /* if we were unable to find any usable vdevs, return NULL */ + if (vdev_is_dead(next->l2ad_vdev)) + next = NULL; + + l2arc_dev_last = next; + +out: + mutex_exit(&l2arc_dev_mtx); + + /* + * Grab the config lock to prevent the 'next' device from being + * removed while we are writing to it. + */ + if (next != NULL) + spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); + mutex_exit(&spa_namespace_lock); + + return (next); +} + +/* + * Free buffers that were tagged for destruction. + */ +static void +l2arc_do_free_on_write() +{ + list_t *buflist; + l2arc_data_free_t *df, *df_prev; + + mutex_enter(&l2arc_free_on_write_mtx); + buflist = l2arc_free_on_write; + + for (df = list_tail(buflist); df; df = df_prev) { + df_prev = list_prev(buflist, df); + ASSERT(df->l2df_data != NULL); + ASSERT(df->l2df_func != NULL); + df->l2df_func(df->l2df_data, df->l2df_size); + list_remove(buflist, df); + kmem_free(df, sizeof (l2arc_data_free_t)); + } + + mutex_exit(&l2arc_free_on_write_mtx); +} + +/* + * A write to a cache device has completed. Update all headers to allow + * reads from these buffers to begin. + */ +static void +l2arc_write_done(zio_t *zio) +{ + l2arc_write_callback_t *cb; + l2arc_dev_t *dev; + list_t *buflist; + arc_buf_hdr_t *head, *ab, *ab_prev; + l2arc_buf_hdr_t *abl2; + kmutex_t *hash_lock; + + cb = zio->io_private; + ASSERT(cb != NULL); + dev = cb->l2wcb_dev; + ASSERT(dev != NULL); + head = cb->l2wcb_head; + ASSERT(head != NULL); + buflist = dev->l2ad_buflist; + ASSERT(buflist != NULL); + DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, + l2arc_write_callback_t *, cb); + + if (zio->io_error != 0) + ARCSTAT_BUMP(arcstat_l2_writes_error); + + mutex_enter(&l2arc_buflist_mtx); + + /* + * All writes completed, or an error was hit. + */ + for (ab = list_prev(buflist, head); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * This buffer misses out. It may be in a stage + * of eviction. Its ARC_L2_WRITING flag will be + * left set, denying reads to this buffer. 
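+			 * (l2arc_evict() clears such a leftover
+			 * ARC_L2_WRITING flag when this region of the
+			 * device is eventually reclaimed.)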
+ */ + ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); + continue; + } + + if (zio->io_error != 0) { + /* + * Error - drop L2ARC entry. + */ + list_remove(buflist, ab); + abl2 = ab->b_l2hdr; + ab->b_l2hdr = NULL; + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + } + + /* + * Allow ARC to begin reads to this L2ARC entry. + */ + ab->b_flags &= ~ARC_L2_WRITING; + + mutex_exit(hash_lock); + } + + atomic_inc_64(&l2arc_writes_done); + list_remove(buflist, head); + kmem_cache_free(hdr_cache, head); + mutex_exit(&l2arc_buflist_mtx); + + l2arc_do_free_on_write(); + + kmem_free(cb, sizeof (l2arc_write_callback_t)); +} + +/* + * A read to a cache device completed. Validate buffer contents before + * handing over to the regular ARC routines. + */ +static void +l2arc_read_done(zio_t *zio) +{ + l2arc_read_callback_t *cb; + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + kmutex_t *hash_lock; + int equal; + + ASSERT(zio->io_vd != NULL); + ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); + + spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); + + cb = zio->io_private; + ASSERT(cb != NULL); + buf = cb->l2rcb_buf; + ASSERT(buf != NULL); + hdr = buf->b_hdr; + ASSERT(hdr != NULL); + + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + /* + * Check this survived the L2ARC journey. + */ + equal = arc_cksum_equal(buf); + if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + mutex_exit(hash_lock); + zio->io_private = buf; + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + arc_read_done(zio); + } else { + mutex_exit(hash_lock); + /* + * Buffer didn't survive caching. Increment stats and + * reissue to the original storage device. + */ + if (zio->io_error != 0) { + ARCSTAT_BUMP(arcstat_l2_io_error); + } else { + zio->io_error = EIO; + } + if (!equal) + ARCSTAT_BUMP(arcstat_l2_cksum_bad); + + /* + * If there's no waiter, issue an async i/o to the primary + * storage now. If there *is* a waiter, the caller must + * issue the i/o in a context where it's OK to block. + */ + if (zio->io_waiter == NULL) + zio_nowait(zio_read(zio->io_parent, + cb->l2rcb_spa, &cb->l2rcb_bp, + buf->b_data, zio->io_size, arc_read_done, buf, + zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + } + + kmem_free(cb, sizeof (l2arc_read_callback_t)); +} + +/* + * This is the list priority from which the L2ARC will search for pages to + * cache. This is used within loops (0..3) to cycle through lists in the + * desired order. This order can have a significant effect on cache + * performance. + * + * Currently the metadata lists are hit first, MFU then MRU, followed by + * the data lists. This function returns a locked list, and also returns + * the lock pointer. + */ +static list_t * +l2arc_list_locked(int list_num, kmutex_t **lock) +{ + list_t *list; + + ASSERT(list_num >= 0 && list_num <= 3); + + switch (list_num) { + case 0: + list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; + *lock = &arc_mfu->arcs_mtx; + break; + case 1: + list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; + *lock = &arc_mru->arcs_mtx; + break; + case 2: + list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; + *lock = &arc_mfu->arcs_mtx; + break; + case 3: + list = &arc_mru->arcs_list[ARC_BUFC_DATA]; + *lock = &arc_mru->arcs_mtx; + break; + } + + ASSERT(!(MUTEX_HELD(*lock))); + mutex_enter(*lock); + return (list); +} + +/* + * Evict buffers from the device write hand to the distance specified in + * bytes. This distance may span populated buffers, it may span nothing. 
+ * This is clearing a region on the L2ARC device ready for writing. + * If the 'all' boolean is set, every buffer is evicted. + */ +static void +l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) +{ + list_t *buflist; + l2arc_buf_hdr_t *abl2; + arc_buf_hdr_t *ab, *ab_prev; + kmutex_t *hash_lock; + uint64_t taddr; + + buflist = dev->l2ad_buflist; + + if (buflist == NULL) + return; + + if (!all && dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. + */ + return; + } + + if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { + /* + * When nearing the end of the device, evict to the end + * before the device write hand jumps to the start. + */ + taddr = dev->l2ad_end; + } else { + taddr = dev->l2ad_hand + distance; + } + DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, + uint64_t, taddr, boolean_t, all); + +top: + mutex_enter(&l2arc_buflist_mtx); + for (ab = list_tail(buflist); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * Missed the hash lock. Retry. + */ + ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); + mutex_exit(&l2arc_buflist_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; + } + + if (HDR_L2_WRITE_HEAD(ab)) { + /* + * We hit a write head node. Leave it for + * l2arc_write_done(). + */ + list_remove(buflist, ab); + mutex_exit(hash_lock); + continue; + } + + if (!all && ab->b_l2hdr != NULL && + (ab->b_l2hdr->b_daddr > taddr || + ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { + /* + * We've evicted to the target address, + * or the end of the device. + */ + mutex_exit(hash_lock); + break; + } + + if (HDR_FREE_IN_PROGRESS(ab)) { + /* + * Already on the path to destruction. + */ + mutex_exit(hash_lock); + continue; + } + + if (ab->b_state == arc_l2c_only) { + ASSERT(!HDR_L2_READING(ab)); + /* + * This doesn't exist in the ARC. Destroy. + * arc_hdr_destroy() will call list_remove() + * and decrement arcstat_l2_size. + */ + arc_change_state(arc_anon, ab, hash_lock); + arc_hdr_destroy(ab); + } else { + /* + * Invalidate issued or about to be issued + * reads, since we may be about to write + * over this location. + */ + if (HDR_L2_READING(ab)) { + ARCSTAT_BUMP(arcstat_l2_evict_reading); + ab->b_flags |= ARC_L2_EVICTED; + } + + /* + * Tell ARC this no longer exists in L2ARC. + */ + if (ab->b_l2hdr != NULL) { + abl2 = ab->b_l2hdr; + ab->b_l2hdr = NULL; + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + } + list_remove(buflist, ab); + + /* + * This may have been leftover after a + * failed write. + */ + ab->b_flags &= ~ARC_L2_WRITING; + } + mutex_exit(hash_lock); + } + mutex_exit(&l2arc_buflist_mtx); + + spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict)); + dev->l2ad_evict = taddr; +} + +/* + * Find and write ARC buffers to the L2ARC device. + * + * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid + * for reading until they have completed writing. 
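+ *
+ * As an illustration of the headroom logic (numbers hypothetical):
+ * with a target_sz of 8 Mbytes and l2arc_headroom of 2, each ARC
+ * list is scanned until either 16 Mbytes of buffers have been
+ * passed over or the 8 Mbyte write target has been filled,
+ * whichever comes first.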
+ */ +static void +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) +{ + arc_buf_hdr_t *ab, *ab_prev, *head; + l2arc_buf_hdr_t *hdrl2; + list_t *list; + uint64_t passed_sz, write_sz, buf_sz, headroom; + void *buf_data; + kmutex_t *hash_lock, *list_lock; + boolean_t have_lock, full; + l2arc_write_callback_t *cb; + zio_t *pio, *wzio; + int try; + + ASSERT(dev->l2ad_vdev != NULL); + + pio = NULL; + write_sz = 0; + full = B_FALSE; + head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + head->b_flags |= ARC_L2_WRITE_HEAD; + + /* + * Copy buffers for L2ARC writing. + */ + mutex_enter(&l2arc_buflist_mtx); + for (try = 0; try <= 3; try++) { + list = l2arc_list_locked(try, &list_lock); + passed_sz = 0; + + /* + * L2ARC fast warmup. + * + * Until the ARC is warm and starts to evict, read from the + * head of the ARC lists rather than the tail. + */ + headroom = target_sz * l2arc_headroom; + if (arc_warm == B_FALSE) + ab = list_head(list); + else + ab = list_tail(list); + + for (; ab; ab = ab_prev) { + if (arc_warm == B_FALSE) + ab_prev = list_next(list, ab); + else + ab_prev = list_prev(list, ab); + + hash_lock = HDR_LOCK(ab); + have_lock = MUTEX_HELD(hash_lock); + if (!have_lock && !mutex_tryenter(hash_lock)) { + /* + * Skip this buffer rather than waiting. + */ + continue; + } + + passed_sz += ab->b_size; + if (passed_sz > headroom) { + /* + * Searched too far. + */ + mutex_exit(hash_lock); + break; + } + + if (ab->b_spa != spa) { + mutex_exit(hash_lock); + continue; + } + + if (ab->b_l2hdr != NULL) { + /* + * Already in L2ARC. + */ + mutex_exit(hash_lock); + continue; + } + + if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) { + mutex_exit(hash_lock); + continue; + } + + if ((write_sz + ab->b_size) > target_sz) { + full = B_TRUE; + mutex_exit(hash_lock); + break; + } + + if (ab->b_buf == NULL) { + DTRACE_PROBE1(l2arc__buf__null, void *, ab); + mutex_exit(hash_lock); + continue; + } + + if (pio == NULL) { + /* + * Insert a dummy header on the buflist so + * l2arc_write_done() can find where the + * write buffers begin without searching. + */ + list_insert_head(dev->l2ad_buflist, head); + + cb = kmem_alloc( + sizeof (l2arc_write_callback_t), KM_SLEEP); + cb->l2wcb_dev = dev; + cb->l2wcb_head = head; + pio = zio_root(spa, l2arc_write_done, cb, + ZIO_FLAG_CANFAIL); + } + + /* + * Create and add a new L2ARC header. + */ + hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); + hdrl2->b_dev = dev; + hdrl2->b_daddr = dev->l2ad_hand; + + ab->b_flags |= ARC_L2_WRITING; + ab->b_l2hdr = hdrl2; + list_insert_head(dev->l2ad_buflist, ab); + buf_data = ab->b_buf->b_data; + buf_sz = ab->b_size; + + /* + * Compute and store the buffer cksum before + * writing. On debug the cksum is verified first. + */ + arc_cksum_verify(ab->b_buf); + arc_cksum_compute(ab->b_buf, B_TRUE); + + mutex_exit(hash_lock); + + wzio = zio_write_phys(pio, dev->l2ad_vdev, + dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, + NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); + + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + (void) zio_nowait(wzio); + + /* + * Keep the clock hand suitably device-aligned. 
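+			 * vdev_psize_to_asize() rounds the buffer size
+			 * up to the device's allocation size, so
+			 * l2ad_hand always advances to an aligned
+			 * offset for the next write.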
+ */ + buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); + + write_sz += buf_sz; + dev->l2ad_hand += buf_sz; + } + + mutex_exit(list_lock); + + if (full == B_TRUE) + break; + } + mutex_exit(&l2arc_buflist_mtx); + + if (pio == NULL) { + ASSERT3U(write_sz, ==, 0); + kmem_cache_free(hdr_cache, head); + return; + } + + ASSERT3U(write_sz, <=, target_sz); + ARCSTAT_BUMP(arcstat_l2_writes_sent); + ARCSTAT_INCR(arcstat_l2_size, write_sz); + spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz); + + /* + * Bump device hand to the device start if it is approaching the end. + * l2arc_evict() will already have evicted ahead for this case. + */ + if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { + spa_l2cache_space_update(dev->l2ad_vdev, 0, + dev->l2ad_end - dev->l2ad_hand); + dev->l2ad_hand = dev->l2ad_start; + dev->l2ad_evict = dev->l2ad_start; + dev->l2ad_first = B_FALSE; + } + + (void) zio_wait(pio); +} + +/* + * This thread feeds the L2ARC at regular intervals. This is the beating + * heart of the L2ARC. + */ +static void +l2arc_feed_thread(void *dummy __unused) +{ + callb_cpr_t cpr; + l2arc_dev_t *dev; + spa_t *spa; + uint64_t size; + + CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&l2arc_feed_thr_lock); + + while (l2arc_thread_exit == 0) { + /* + * Pause for l2arc_feed_secs seconds between writes. + */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, + hz * l2arc_feed_secs); + CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); + + /* + * Quick check for L2ARC devices. + */ + mutex_enter(&l2arc_dev_mtx); + if (l2arc_ndev == 0) { + mutex_exit(&l2arc_dev_mtx); + continue; + } + mutex_exit(&l2arc_dev_mtx); + + /* + * This selects the next l2arc device to write to, and in + * doing so the next spa to feed from: dev->l2ad_spa. This + * will return NULL if there are now no l2arc devices or if + * they are all faulted. + * + * If a device is returned, its spa's config lock is also + * held to prevent device removal. l2arc_dev_get_next() + * will grab and release l2arc_dev_mtx. + */ + if ((dev = l2arc_dev_get_next()) == NULL) + continue; + + spa = dev->l2ad_spa; + ASSERT(spa != NULL); + + /* + * Avoid contributing to memory pressure. + */ + if (arc_reclaim_needed()) { + ARCSTAT_BUMP(arcstat_l2_abort_lowmem); + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + ARCSTAT_BUMP(arcstat_l2_feeds); + + size = dev->l2ad_write; + if (arc_warm == B_FALSE) + size += dev->l2ad_boost; + + /* + * Evict L2ARC buffers that will be overwritten. + */ + l2arc_evict(dev, size, B_FALSE); + + /* + * Write ARC buffers. + */ + l2arc_write_buffers(spa, dev, size); + spa_config_exit(spa, SCL_L2ARC, dev); + } + + l2arc_thread_exit = 0; + cv_broadcast(&l2arc_feed_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ + thread_exit(); +} + +boolean_t +l2arc_vdev_present(vdev_t *vd) +{ + l2arc_dev_t *dev; + + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev != NULL; + dev = list_next(l2arc_dev_list, dev)) { + if (dev->l2ad_vdev == vd) + break; + } + mutex_exit(&l2arc_dev_mtx); + + return (dev != NULL); +} + +/* + * Add a vdev for use by the L2ARC. By this point the spa has already + * validated the vdev and opened it. + */ +void +l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) +{ + l2arc_dev_t *adddev; + + ASSERT(!l2arc_vdev_present(vd)); + + /* + * Create a new l2arc device entry. 
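+	 * The per-device write size and boost are seeded from the
+	 * global tunables, so a device picks up whatever values are
+	 * current at the time it is added.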
+ */ + adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); + adddev->l2ad_spa = spa; + adddev->l2ad_vdev = vd; + adddev->l2ad_write = l2arc_write_max; + adddev->l2ad_boost = l2arc_write_boost; + adddev->l2ad_start = start; + adddev->l2ad_end = end; + adddev->l2ad_hand = adddev->l2ad_start; + adddev->l2ad_evict = adddev->l2ad_start; + adddev->l2ad_first = B_TRUE; + ASSERT3U(adddev->l2ad_write, >, 0); + + /* + * This is a list of all ARC buffers that are still valid on the + * device. + */ + adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); + list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l2node)); + + spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0); + + /* + * Add device to global list + */ + mutex_enter(&l2arc_dev_mtx); + list_insert_head(l2arc_dev_list, adddev); + atomic_inc_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); +} + +/* + * Remove a vdev from the L2ARC. + */ +void +l2arc_remove_vdev(vdev_t *vd) +{ + l2arc_dev_t *dev, *nextdev, *remdev = NULL; + + /* + * Find the device by vdev + */ + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { + nextdev = list_next(l2arc_dev_list, dev); + if (vd == dev->l2ad_vdev) { + remdev = dev; + break; + } + } + ASSERT(remdev != NULL); + + /* + * Remove device from global list + */ + list_remove(l2arc_dev_list, remdev); + l2arc_dev_last = NULL; /* may have been invalidated */ + atomic_dec_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); + + /* + * Clear all buflists and ARC references. L2ARC device flush. + */ + l2arc_evict(remdev, 0, B_TRUE); + list_destroy(remdev->l2ad_buflist); + kmem_free(remdev->l2ad_buflist, sizeof (list_t)); + kmem_free(remdev, sizeof (l2arc_dev_t)); +} + +void +l2arc_init(void) +{ + l2arc_thread_exit = 0; + l2arc_ndev = 0; + l2arc_writes_sent = 0; + l2arc_writes_done = 0; + + mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); + + l2arc_dev_list = &L2ARC_dev_list; + l2arc_free_on_write = &L2ARC_free_on_write; + list_create(l2arc_dev_list, sizeof (l2arc_dev_t), + offsetof(l2arc_dev_t, l2ad_node)); + list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), + offsetof(l2arc_data_free_t, l2df_list_node)); +} + +void +l2arc_fini(void) +{ + /* + * This is called from dmu_fini(), which is called from spa_fini(); + * Because of this, we can assume that all l2arc devices have + * already been removed when the pools themselves were removed. 
+ */ + + l2arc_do_free_on_write(); + + mutex_destroy(&l2arc_feed_thr_lock); + cv_destroy(&l2arc_feed_thr_cv); + mutex_destroy(&l2arc_dev_mtx); + mutex_destroy(&l2arc_buflist_mtx); + mutex_destroy(&l2arc_free_on_write_mtx); + + list_destroy(l2arc_dev_list); + list_destroy(l2arc_free_on_write); +} + +void +l2arc_start(void) +{ + if (!(spa_mode & FWRITE)) + return; + + (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); +} + +void +l2arc_stop(void) +{ + if (!(spa_mode & FWRITE)) + return; + + mutex_enter(&l2arc_feed_thr_lock); + cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ + l2arc_thread_exit = 1; + while (l2arc_thread_exit != 0) + cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); + mutex_exit(&l2arc_feed_thr_lock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c index 4442b1f28ac8..93b7741d77be 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/bplist.h> #include <sys/zfs_context.h> @@ -47,7 +45,7 @@ bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx) { int size; - size = spa_version(dmu_objset_spa(mos)) < ZFS_VERSION_BPLIST_ACCOUNT ? + size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ? BPLIST_SIZE_V0 : sizeof (bplist_phys_t); return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize, @@ -181,7 +179,7 @@ bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp) } int -bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx) +bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx) { uint64_t blk, off; blkptr_t *bparray; @@ -229,7 +227,7 @@ bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx) * Deferred entry; will be written later by bplist_sync(). */ void -bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp) +bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp) { bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP); @@ -278,9 +276,7 @@ bplist_vacate(bplist_t *bpl, dmu_tx_t *tx) int bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { - uint64_t itor = 0, comp = 0, uncomp = 0; int err; - blkptr_t bp; mutex_enter(&bpl->bpl_lock); @@ -298,6 +294,9 @@ bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) mutex_exit(&bpl->bpl_lock); if (!bpl->bpl_havecomp) { + uint64_t itor = 0, comp = 0, uncomp = 0; + blkptr_t bp; + while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { comp += BP_GET_PSIZE(&bp); uncomp += BP_GET_UCSIZE(&bp); @@ -310,3 +309,41 @@ bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) return (err); } + +/* + * Return (in *dasizep) the amount of space on the deadlist which is: + * mintxg < blk_birth <= maxtxg + */ +int +bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg, + uint64_t *dasizep) +{ + uint64_t size = 0; + uint64_t itor = 0; + blkptr_t bp; + int err; + + /* + * As an optimization, if they want the whole txg range, just + * get bpl_bytes rather than iterating over the bps. 
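The iteration below charges a block to the range exactly when mintxg < blk_birth <= maxtxg. The predicate in isolation, as a sketch (the helper name is assumed):

static boolean_t
birth_in_range(uint64_t birth, uint64_t mintxg, uint64_t maxtxg)
{
	/* open at the low bound, closed at the high bound */
	return (birth > mintxg && birth <= maxtxg);
}

Since every allocated block is born at or after TXG_INITIAL, a mintxg below TXG_INITIAL together with maxtxg == UINT64_MAX matches everything, which is why the fast path can return bpl_bytes without iterating.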
+ */ + if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) { + mutex_enter(&bpl->bpl_lock); + err = bplist_hold(bpl); + if (err == 0) + *dasizep = bpl->bpl_phys->bpl_bytes; + mutex_exit(&bpl->bpl_lock); + return (err); + } + + while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { + if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) { + size += + bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp); + } + } + if (err == ENOENT) + err = 0; + *dasizep = size; + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 94c63081478a..2494c1e7f9d1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/dmu.h> #include <sys/dmu_impl.h> @@ -39,17 +37,10 @@ static void dbuf_destroy(dmu_buf_impl_t *db); static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, - int compress, dmu_tx_t *tx); +static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static arc_done_func_t dbuf_write_ready; static arc_done_func_t dbuf_write_done; -int zfs_mdcomp_disable = 0; -SYSCTL_DECL(_vfs_zfs); -TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); -SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, - &zfs_mdcomp_disable, 0, "Disable metadata compression"); - /* * Global data structures and functions for the dbuf cache. */ @@ -311,7 +302,7 @@ dbuf_verify(dmu_buf_impl_t *db) } if (db->db_blkid == DB_BONUS_BLKID) { ASSERT(dn != NULL); - ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); @@ -460,45 +451,45 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { - blkptr_t *bp; + dnode_t *dn = db->db_dnode; zbookmark_t zb; uint32_t aflags = ARC_NOWAIT; + arc_buf_t *pbuf; ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ - ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); if (db->db_blkid == DB_BONUS_BLKID) { - ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); + int bonuslen = dn->dn_bonuslen; + + ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); - if (db->db.db_size < DN_MAX_BONUSLEN) + arc_space_consume(DN_MAX_BONUSLEN); + if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); - bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data, - db->db.db_size); + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, + bonuslen); dbuf_update_data(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return; } - if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) - bp = NULL; - else - bp = db->db_blkptr; - - if (bp == NULL) - dprintf_dbuf(db, "blkptr: %s\n", "NULL"); - else - dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); - - if (bp == NULL || BP_IS_HOLE(bp)) { + /* + * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() + * processes the delete record and clears the bp while we are waiting + * for the dn_mtx (resulting in a "no" from block_freed). + */ + if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || + (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || + BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - ASSERT(bp == NULL || BP_IS_HOLE(bp)); - dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; @@ -510,6 +501,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) db->db_state = DB_READ; mutex_exit(&db->db_mtx); + if (DBUF_IS_L2CACHEABLE(db)) + aflags |= ARC_L2CACHE; + zb.zb_objset = db->db_objset->os_dsl_dataset ? db->db_objset->os_dsl_dataset->ds_object : 0; zb.zb_object = db->db.db_object; @@ -518,10 +512,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) dbuf_add_ref(db, NULL); /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ - ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES); - (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, - db->db_level > 0 ? byteswap_uint64_array : - dmu_ot[db->db_dnode->dn_type].ot_byteswap, + + if (db->db_parent) + pbuf = db->db_parent->db_buf; + else + pbuf = db->db_objset->os_phys_buf; + + (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? 
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); @@ -546,7 +543,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL; + (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL && + DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { @@ -661,6 +659,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) if (db->db_blkid == DB_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; @@ -690,7 +689,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) /* free this block */ if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { /* XXX can get silent EIO here */ - (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, + (void) dsl_free(NULL, + spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; @@ -705,22 +705,50 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) arc_release(dr->dt.dl.dr_data, db); } +/* + * Evict (if it's unreferenced) or clear (if it's referenced) any level-0 + * data blocks in the free range, so that any future readers will find + * empty blocks. Also, if we happen across any level-1 dbufs in the + * range that have not already been marked dirty, mark them dirty so + * they stay in memory. + */ void -dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) { dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t first_l1 = start >> epbs; + uint64_t last_l1 = end >> epbs; - dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks); + if (end > dn->dn_maxblkid) { + end = dn->dn_maxblkid; + last_l1 = end >> epbs; + } + dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); mutex_enter(&dn->dn_dbufs_mtx); for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DB_BONUS_BLKID); + + if (db->db_level == 1 && + db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { + mutex_enter(&db->db_mtx); + if (db->db_last_dirty && + db->db_last_dirty->dr_txg < txg) { + dbuf_add_ref(db, FTAG); + mutex_exit(&db->db_mtx); + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } else { + mutex_exit(&db->db_mtx); + } + } + if (db->db_level != 0) continue; dprintf_dbuf(db, "found buf %s\n", ""); - if (db->db_blkid < blkid || - db->db_blkid >= blkid+nblks) + if (db->db_blkid < start || db->db_blkid > end) continue; /* found a level 0 buffer in the range */ @@ -783,31 +811,28 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) } static int -dbuf_new_block(dmu_buf_impl_t *db) +dbuf_block_freeable(dmu_buf_impl_t *db) { dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; uint64_t birth_txg = 0; - /* Don't count meta-objects */ - if (ds == NULL) - return (FALSE); - /* * We don't need any locking to protect db_blkptr: * If it's syncing, then db_last_dirty will be set * so we'll ignore db_blkptr.
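A worked instance of the epbs arithmetic in dbuf_free_range() above, with assumed example values: SPA_BLKPTRSHIFT is 7 (a 128-byte blkptr_t), so an indirect block of 16K (dn_indblkshift = 14) gives epbs = 14 - 7 = 7, and each level-1 block covers 1 << 7 = 128 level-0 blocks. A free range over L0 blkids 0..127 therefore maps to first_l1 = last_l1 = 0, and only that single level-1 dbuf needs to be dirtied.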
*/ ASSERT(MUTEX_HELD(&db->db_mtx)); - /* If we have been dirtied since the last snapshot, its not new */ if (db->db_last_dirty) birth_txg = db->db_last_dirty->dr_txg; else if (db->db_blkptr) birth_txg = db->db_blkptr->blk_birth; + /* If we don't exist or are in a snapshot, we can't be freed */ if (birth_txg) - return (!dsl_dataset_block_freeable(ds, birth_txg)); + return (ds == NULL || + dsl_dataset_block_freeable(ds, birth_txg)); else - return (TRUE); + return (FALSE); } void @@ -865,6 +890,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) objset_impl_t *os = dn->dn_objset; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; + boolean_t do_free_accounting = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(tx->tx_txg != 0); @@ -922,20 +948,20 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) drp = &db->db_last_dirty; ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); - while (*drp && (*drp)->dr_txg > tx->tx_txg) - drp = &(*drp)->dr_next; - if (*drp && (*drp)->dr_txg == tx->tx_txg) { + while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) + drp = &dr->dr_next; + if (dr && dr->dr_txg == tx->tx_txg) { if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ - dbuf_unoverride(*drp); + dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT) arc_buf_thaw(db->db_buf); } mutex_exit(&db->db_mtx); - return (*drp); + return (dr); } /* @@ -966,6 +992,18 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + if (db->db_blkid != DB_BONUS_BLKID) { + /* + * Update the accounting. + * Note: we delay "free accounting" until after we drop + * the db_mtx. This keeps us from grabbing other locks + * (and possibly deadlocking) in bp_get_dasize() while + * also holding the db_mtx. + */ + dnode_willuse_space(dn, db->db.db_size, tx); + do_free_accounting = dbuf_block_freeable(db); + } + /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this @@ -1015,25 +1053,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_freed_in_flight = FALSE; } - if (db->db_blkid != DB_BONUS_BLKID) { - /* - * Update the accounting. - */ - if (!dbuf_new_block(db) && db->db_blkptr) { - /* - * This is only a guess -- if the dbuf is dirty - * in a previous txg, we don't know how much - * space it will use on disk yet. We should - * really have the struct_rwlock to access - * db_blkptr, but since this is just a guess, - * it's OK if we get an odd answer. - */ - dnode_willuse_space(dn, - -bp_get_dasize(os->os_spa, db->db_blkptr), tx); - } - dnode_willuse_space(dn, db->db.db_size, tx); - } - /* * This buffer is now part of this txg */ @@ -1050,11 +1069,19 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); return (dr); - } - - if (db->db_level == 0) { - dnode_new_blkid(dn, db->db_blkid, tx); - ASSERT(dn->dn_maxblkid >= db->db_blkid); + } else if (do_free_accounting) { + blkptr_t *bp = db->db_blkptr; + int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? + bp_get_dasize(os->os_spa, bp) : db->db.db_size; + /* + * This is only a guess -- if the dbuf is dirty + * in a previous txg, we don't know how much + * space it will use on disk yet. We should + * really have the struct_rwlock to access + * db_blkptr, but since this is just a guess, + * it's OK if we get an odd answer. 
+ */ + dnode_willuse_space(dn, -willfree, tx); } if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { @@ -1062,6 +1089,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) drop_struct_lock = TRUE; } + if (db->db_level == 0) { + dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); + ASSERT(dn->dn_maxblkid >= db->db_blkid); + } + if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; @@ -1115,7 +1147,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn = db->db_dnode; uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t *dr; + dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); ASSERT(db->db_blkid != DB_BONUS_BLKID); @@ -1125,7 +1157,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * If this buffer is not dirty, we're done. */ - for (dr = db->db_last_dirty; dr; dr = dr->dr_next) + for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; if (dr == NULL || dr->dr_txg < txg) { @@ -1155,14 +1187,14 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* XXX would be nice to fix up dn_towrite_space[] */ - db->db_last_dirty = dr->dr_next; + *drp = dr->dr_next; if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_level+1 == dn->dn_nlevels) { - ASSERT3P(db->db_parent, ==, dn->dn_dbuf); + ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); @@ -1178,8 +1210,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } else { ASSERT(db->db_buf != NULL); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -1204,7 +1236,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - int rf = DB_RF_MUST_SUCCEED; + int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); @@ -1282,8 +1314,10 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DB_BONUS_BLKID) + if (db->db_blkid == DB_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN); + } db->db.db_data = NULL; db->db_state = DB_UNCACHED; } @@ -1297,6 +1331,7 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { list_remove(&dn->dn_dbufs, db); dnode_rele(dn, db); + db->db_dnode = NULL; } if (db->db_buf) @@ -1397,10 +1432,13 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, if (blkid == DB_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); - db->db.db_size = dn->dn_bonuslen; + db->db.db_size = DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DB_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ + arc_space_consume(sizeof (dmu_buf_impl_t)); return (db); } else { int blocksize = @@ -1427,6 +1465,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, list_insert_head(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); + arc_space_consume(sizeof (dmu_buf_impl_t)); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); @@ -1469,31 +1508,33 @@ 
dbuf_destroy(dmu_buf_impl_t *db) ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_blkid != DB_BONUS_BLKID) { - dnode_t *dn = db->db_dnode; - /* * If this dbuf is still on the dn_dbufs list, * remove it from that list. */ - if (list_link_active(&db->db_link)) { + if (db->db_dnode) { + dnode_t *dn = db->db_dnode; + mutex_enter(&dn->dn_dbufs_mtx); list_remove(&dn->dn_dbufs, db); mutex_exit(&dn->dn_dbufs_mtx); dnode_rele(dn, db); + db->db_dnode = NULL; } dbuf_hash_remove(db); } db->db_parent = NULL; - db->db_dnode = NULL; db->db_buf = NULL; + ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t)); } void @@ -1525,6 +1566,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { if (bp && !BP_IS_HOLE(bp)) { + arc_buf_t *pbuf; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; zb.zb_objset = dn->dn_objset->os_dsl_dataset ? @@ -1533,9 +1575,13 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) zb.zb_level = 0; zb.zb_blkid = blkid; - (void) arc_read(NULL, dn->dn_objset->os_spa, bp, - dmu_ot[dn->dn_type].ot_byteswap, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + if (db) + pbuf = db->db_buf; + else + pbuf = dn->dn_objset->os_phys_buf; + + (void) arc_read(NULL, dn->dn_objset->os_spa, + bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); } @@ -1652,16 +1698,13 @@ dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) return (err ? NULL : db); } -dmu_buf_impl_t * +void dbuf_create_bonus(dnode_t *dn) { - dmu_buf_impl_t *db = dn->dn_bonus; - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); - db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); - return (db); + dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); } #pragma weak dmu_buf_add_ref = dbuf_add_ref @@ -1716,7 +1759,10 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag) dbuf_evict(db); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); - mutex_exit(&db->db_mtx); + if (!DBUF_IS_CACHEABLE(db)) + dbuf_clear(db); + else + mutex_exit(&db->db_mtx); } } else { mutex_exit(&db->db_mtx); @@ -1852,15 +1898,8 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) db->db_data_pending = dr; - arc_release(db->db_buf, db); mutex_exit(&db->db_mtx); - - /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. - */ - dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4, - zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx); + dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); @@ -1878,7 +1917,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dnode_t *dn = db->db_dnode; objset_impl_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; - int checksum, compress; int blksz; ASSERT(dmu_tx_is_syncing(tx)); @@ -1909,23 +1947,21 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ if (db->db_blkid == DB_BONUS_BLKID) { dbuf_dirty_record_t **drp; - /* - * Use dn_phys->dn_bonuslen since db.db_size is the length - * of the bonus buffer in the open transaction rather than - * the syncing transaction. 
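The bonus sizing introduced in dbuf_create() above, db_size = DN_MAX_BONUSLEN - (dn_nblkptr - 1) * sizeof (blkptr_t), worked through with the classic on-disk layout (assumed here: a 512-byte dnode with a 64-byte core and 128-byte block pointers, so DN_MAX_BONUSLEN = 512 - 64 - 128 = 320): a dnode with dn_nblkptr = 1 exposes the full 320-byte bonus buffer, while dn_nblkptr = 3 leaves 320 - 2 * 128 = 64 bytes. Because dn_bonuslen may now be smaller than this allocated space, the code can only assert db_size >= dn_bonuslen rather than equality.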
- */ + ASSERT(*datap != NULL); ASSERT3U(db->db_level, ==, 0); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); - if (*datap != db->db.db_data) + if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN); + } db->db_data_pending = NULL; drp = &db->db_last_dirty; while (*drp != dr) drp = &(*drp)->dr_next; - ASSERT((*drp)->dr_next == NULL); - *drp = NULL; + ASSERT(dr->dr_next == NULL); + *drp = dr->dr_next; if (dr->dr_dbuf->db_level != 0) { list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); @@ -1939,6 +1975,14 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } /* + * This function may have dropped the db_mtx lock allowing a dmu_sync + * operation to sneak in. As a result, we need to ensure that we + * don't check the dr_override_state until we have returned from + * dbuf_check_blkptr. + */ + dbuf_check_blkptr(dn, db); + + /* * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ @@ -1948,8 +1992,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } - dbuf_check_blkptr(dn, db); - /* * If this dbuf has already been written out via an immediate write, * just complete the write by copying over the new block pointer and @@ -1963,6 +2005,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) zio_fake.io_bp = db->db_blkptr; zio_fake.io_bp_orig = *db->db_blkptr; zio_fake.io_txg = txg; + zio_fake.io_flags = 0; *db->db_blkptr = dr->dt.dl.dr_overridden_by; dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; @@ -1970,8 +2013,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dr->dr_zio = &zio_fake; mutex_exit(&db->db_mtx); + ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp), + BP_IDENTITY(&zio_fake.io_bp_orig)) || + BP_IS_HOLE(zio_fake.io_bp)); + if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg)) - dsl_dataset_block_kill(os->os_dsl_dataset, + (void) dsl_dataset_block_kill(os->os_dsl_dataset, &zio_fake.io_bp_orig, dn->dn_zio, tx); dbuf_write_ready(&zio_fake, db->db_buf, db); @@ -1997,14 +2044,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) *datap = arc_buf_alloc(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } - } else { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - arc_release(db->db_buf, db); } ASSERT(*datap != NULL); @@ -2012,22 +2051,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) mutex_exit(&db->db_mtx); - /* - * Allow dnode settings to override objset settings, - * except for metadata checksums.
- */ - if (dmu_ot[dn->dn_type].ot_metadata) { - checksum = os->os_md_checksum; - compress = zio_compress_select(dn->dn_compress, - os->os_md_compress); - } else { - checksum = zio_checksum_select(dn->dn_checksum, - os->os_checksum); - compress = zio_compress_select(dn->dn_compress, - os->os_compress); - } - - dbuf_write(dr, *datap, checksum, compress, tx); + dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) @@ -2063,8 +2087,7 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx) } static void -dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, - int compress, dmu_tx_t *tx) +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = db->db_dnode; @@ -2072,8 +2095,23 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_t zb; + writeprops_t wp = { 0 }; zio_t *zio; - int zio_flags; + + if (!BP_IS_HOLE(db->db_blkptr) && + (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. + */ + arc_release(data, db); + } else { + ASSERT(arc_released(data)); + /* XXX why do we need to thaw here? */ + arc_buf_thaw(data); + } if (parent != dn->dn_dbuf) { ASSERT(parent && parent->db_data_pending); @@ -2096,17 +2134,22 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, zb.zb_level = db->db_level; zb.zb_blkid = db->db_blkid; - zio_flags = ZIO_FLAG_MUSTSUCCEED; - if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0) - zio_flags |= ZIO_FLAG_METADATA; + wp.wp_type = dn->dn_type; + wp.wp_level = db->db_level; + wp.wp_copies = os->os_copies; + wp.wp_dncompress = dn->dn_compress; + wp.wp_oscompress = os->os_compress; + wp.wp_dnchecksum = dn->dn_checksum; + wp.wp_oschecksum = os->os_checksum; + if (BP_IS_OLDER(db->db_blkptr, txg)) - dsl_dataset_block_kill( + (void) dsl_dataset_block_kill( os->os_dsl_dataset, db->db_blkptr, zio, tx); - dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress, - dmu_get_replication_level(os, &zb, dn->dn_type), txg, - db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb); + dr->dr_zio = arc_write(zio, os->os_spa, &wp, + DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr, + data, dbuf_write_ready, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } /* ARGSUSED */ @@ -2116,27 +2159,33 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) dmu_buf_impl_t *db = vdb; dnode_t *dn = db->db_dnode; objset_impl_t *os = dn->dn_objset; + blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint64_t fill = 0; int old_size, new_size, i; + ASSERT(db->db_blkptr == bp); + dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", ""); old_size = bp_get_dasize(os->os_spa, bp_orig); - new_size = bp_get_dasize(os->os_spa, zio->io_bp); + new_size = bp_get_dasize(os->os_spa, bp); - dnode_diduse_space(dn, new_size-old_size); + dnode_diduse_space(dn, new_size - old_size); - if (BP_IS_HOLE(zio->io_bp)) { + if (BP_IS_HOLE(bp)) { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; if (bp_orig->blk_birth == tx->tx_txg) - dsl_dataset_block_kill(ds, bp_orig, NULL, tx); - ASSERT3U(db->db_blkptr->blk_fill, ==, 0); + (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); + 
ASSERT3U(bp->blk_fill, ==, 0); return; } + ASSERT(BP_GET_TYPE(bp) == dn->dn_type); + ASSERT(BP_GET_LEVEL(bp) == db->db_level); + mutex_enter(&db->db_mtx); if (db->db_level == 0) { @@ -2156,32 +2205,31 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) fill = 1; } } else { - blkptr_t *bp = db->db.db_data; + blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); - for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) { - if (BP_IS_HOLE(bp)) + for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { + if (BP_IS_HOLE(ibp)) continue; - ASSERT3U(BP_GET_LSIZE(bp), ==, + ASSERT3U(BP_GET_LSIZE(ibp), ==, db->db_level == 1 ? dn->dn_datablksz : (1<<dn->dn_phys->dn_indblkshift)); - fill += bp->blk_fill; + fill += ibp->blk_fill; } } - db->db_blkptr->blk_fill = fill; - BP_SET_TYPE(db->db_blkptr, dn->dn_type); - BP_SET_LEVEL(db->db_blkptr, db->db_level); + bp->blk_fill = fill; mutex_exit(&db->db_mtx); - /* We must do this after we've set the bp's type and level */ - if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) { + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + } else { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; if (bp_orig->blk_birth == tx->tx_txg) - dsl_dataset_block_kill(ds, bp_orig, NULL, tx); - dsl_dataset_block_born(ds, zio->io_bp, tx); + (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); + dsl_dataset_block_born(ds, bp, tx); } } @@ -2198,13 +2246,12 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) mutex_enter(&db->db_mtx); drp = &db->db_last_dirty; - while (*drp != db->db_data_pending) - drp = &(*drp)->dr_next; - ASSERT(!list_link_active(&(*drp)->dr_dirty_node)); - ASSERT((*drp)->dr_txg == txg); - ASSERT((*drp)->dr_next == NULL); - dr = *drp; - *drp = NULL; + while ((dr = *drp) != db->db_data_pending) + drp = &dr->dr_next; + ASSERT(!list_link_active(&dr->dr_dirty_node)); + ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_next == NULL); + *drp = dr->dr_next; if (db->db_level == 0) { ASSERT(db->db_blkid != DB_BONUS_BLKID); @@ -2230,8 +2277,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) >> (db->db_level * epbs), >=, db->db_blkid); arc_set_callback(db->db_buf, dbuf_do_evict, db); } - list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index d3be6b4ff22e..377efb9d105e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dmu_impl.h> #include <sys/dmu_tx.h> @@ -42,6 +40,7 @@ #include <sys/zfs_ioctl.h> #include <sys/zap.h> #include <sys/zio_checksum.h> +#include <sys/zfs_znode.h> const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint8_array, TRUE, "unallocated" }, @@ -62,7 +61,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { zap_byteswap, TRUE, "DSL props" }, { byteswap_uint64_array, TRUE, "DSL dataset" }, { zfs_znode_byteswap, TRUE, "ZFS znode" }, - { zfs_acl_byteswap, TRUE, "ZFS ACL" }, + { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, { byteswap_uint8_array, FALSE, "ZFS plain file" }, { zap_byteswap, TRUE, "ZFS directory" }, { zap_byteswap, TRUE, "ZFS master node" }, @@ -75,7 +74,14 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { zap_byteswap, TRUE, "persistent error log" }, { byteswap_uint8_array, TRUE, "SPA history" }, { byteswap_uint64_array, TRUE, "SPA history offsets" }, - { zap_byteswap, TRUE, "Pool properties" }, + { zap_byteswap, TRUE, "Pool properties" }, + { zap_byteswap, TRUE, "DSL permissions" }, + { zfs_acl_byteswap, TRUE, "ZFS ACL" }, + { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, + { byteswap_uint8_array, TRUE, "FUID table" }, + { byteswap_uint64_array, TRUE, "FUID table size" }, + { zap_byteswap, TRUE, "DSL dataset next clones"}, + { zap_byteswap, TRUE, "scrub work queue" }, }; int @@ -115,6 +121,19 @@ dmu_bonus_max(void) return (DN_MAX_BONUSLEN); } +int +dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx) +{ + dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + + if (dn->dn_bonus != (dmu_buf_impl_t *)db) + return (EINVAL); + if (newsize < 0 || newsize > db->db_size) + return (EINVAL); + dnode_setbonuslen(dn, newsize, tx); + return (0); +} + /* * returns ENOENT, EIO, or 0. 
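A hedged usage sketch for the new dmu_set_bonus() (error handling elided; the object number, the newsize value, and an assigned tx are assumed to exist):

	dmu_buf_t *db;

	VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	/* newsize must lie in [0, db->db_size], else EINVAL */
	VERIFY(0 == dmu_set_bonus(db, newsize, tx));
	dmu_buf_rele(db, FTAG);

Note that dmu_set_bonus() only adjusts the advertised length via dnode_setbonuslen(); the backing buffer stays at its allocated size.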
*/ @@ -122,27 +141,27 @@ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) { dnode_t *dn; - int err, count; dmu_buf_impl_t *db; + int error; - err = dnode_hold(os->os, object, FTAG, &dn); - if (err) - return (err); + error = dnode_hold(os->os, object, FTAG, &dn); + if (error) + return (error); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus == NULL) - dn->dn_bonus = dbuf_create_bonus(dn); + dbuf_create_bonus(dn); } db = dn->dn_bonus; rw_exit(&dn->dn_struct_rwlock); - mutex_enter(&db->db_mtx); - count = refcount_add(&db->db_holds, tag); - mutex_exit(&db->db_mtx); - if (count == 1) - dnode_add_ref(dn, db); + + /* as long as the bonus buf is held, the dnode will be held */ + if (refcount_add(&db->db_holds, tag) == 1) + VERIFY(dnode_add_ref(dn, db)); + dnode_rele(dn, FTAG); VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); @@ -161,11 +180,13 @@ static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { + dsl_pool_t *dp = NULL; dmu_buf_t **dbp; uint64_t blkid, nblks, i; uint32_t flags; int err; zio_t *zio; + hrtime_t start; ASSERT(length <= DMU_MAX_ACCESS); @@ -192,7 +213,11 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE); + if (dn->dn_objset->os_dsl_dataset) + dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; + if (dp && dsl_pool_sync_context(dp)) + start = gethrtime(); + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); @@ -214,6 +239,9 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, /* wait for async i/o */ err = zio_wait(zio); + /* track read overhead when we are in sync context */ + if (dp && dsl_pool_sync_context(dp)) + dp->dp_read_overhead += gethrtime() - start; if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); @@ -343,6 +371,155 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) dnode_rele(dn, FTAG); } +static int +get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit) +{ + uint64_t len = *offset - limit; + uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT; + uint64_t subchunk = + dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); + + ASSERT(limit <= *offset); + + if (len <= chunk_len) { + *offset = limit; + return (0); + } + + ASSERT(ISP2(subchunk)); + + while (*offset > limit) { + uint64_t initial_offset = P2ROUNDUP(*offset, subchunk); + uint64_t delta; + int err; + + /* skip over allocated data */ + err = dnode_next_offset(dn, + DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0); + if (err == ESRCH) + *offset = limit; + else if (err) + return (err); + + ASSERT3U(*offset, <=, initial_offset); + *offset = P2ALIGN(*offset, subchunk); + delta = initial_offset - *offset; + if (delta >= chunk_len) { + *offset += delta - chunk_len; + return (0); + } + chunk_len -= delta; + + /* skip over unallocated data */ + err = dnode_next_offset(dn, + DNODE_FIND_BACKWARDS, offset, 1, 1, 0); + if (err == ESRCH) + *offset = limit; + else if (err) + return (err); + + if (*offset < limit) + *offset = limit; + ASSERT3U(*offset, <, initial_offset); + } + return (0); +} + +static int +dmu_free_long_range_impl(objset_t *os, dnode_t 
*dn, uint64_t offset, + uint64_t length, boolean_t free_dnode) +{ + dmu_tx_t *tx; + uint64_t object_size, start, end, len; + boolean_t trunc = (length == DMU_OBJECT_END); + int align, err; + + align = 1 << dn->dn_datablkshift; + ASSERT(align > 0); + object_size = align == 1 ? dn->dn_datablksz : + (dn->dn_maxblkid + 1) << dn->dn_datablkshift; + + if (trunc || (end = offset + length) > object_size) + end = object_size; + if (end <= offset) + return (0); + length = end - offset; + + while (length) { + start = end; + err = get_next_chunk(dn, &start, offset); + if (err) + return (err); + len = trunc ? DMU_OBJECT_END : end - start; + + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, dn->dn_object, start, len); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + dnode_free_range(dn, start, trunc ? -1 : len, tx); + + if (start == 0 && free_dnode) { + ASSERT(trunc); + dnode_free(dn, tx); + } + + length -= end - start; + + dmu_tx_commit(tx); + end = start; + } + return (0); +} + +int +dmu_free_long_range(objset_t *os, uint64_t object, + uint64_t offset, uint64_t length) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err != 0) + return (err); + err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_free_object(objset_t *os, uint64_t object) +{ + dnode_t *dn; + dmu_tx_t *tx; + int err; + + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err != 0) + return (err); + if (dn->dn_nlevels == 1) { + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, object); + dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err == 0) { + dnode_free_range(dn, 0, DMU_OBJECT_END, tx); + dnode_free(dn, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + } else { + err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); + } + dnode_rele(dn, FTAG); + return (err); +} + int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) @@ -384,7 +561,6 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); - int err; /* * NB: we could do this block-at-a-time, but it's nice @@ -393,7 +569,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp); if (err) - return (err); + break; for (i = 0; i < numbufs; i++) { int tocpy; @@ -414,7 +590,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } dnode_rele(dn, FTAG); - return (0); + return (err); } void @@ -590,9 +766,9 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, for (copied = 0; copied < tocpy; copied += PAGESIZE) { ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); thiscpy = MIN(PAGESIZE, tocpy - copied); - va = ppmapin(pp, PROT_READ, (caddr_t)-1); + va = zfs_map_page(pp, S_READ); bcopy(va, (char *)db->db_data + bufoff, thiscpy); - ppmapout(va); + zfs_unmap_page(pp, va); pp = pp->p_next; bufoff += PAGESIZE; } @@ -620,6 +796,22 @@ typedef struct { /* ARGSUSED */ static void +dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) +{ + blkptr_t *bp = zio->io_bp; + + if (!BP_IS_HOLE(bp)) { + dmu_sync_arg_t *in = varg; + dbuf_dirty_record_t *dr = in->dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + ASSERT(BP_GET_TYPE(bp) == 
db->db_dnode->dn_type); + ASSERT(BP_GET_LEVEL(bp) == 0); + bp->blk_fill = 1; + } +} + +/* ARGSUSED */ +static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *in = varg; @@ -627,12 +819,6 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) dmu_buf_impl_t *db = dr->dr_dbuf; dmu_sync_cb_t *done = in->done; - if (!BP_IS_HOLE(zio->io_bp)) { - zio->io_bp->blk_fill = 1; - BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type); - BP_SET_LEVEL(zio->io_bp, 0); - } - mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ @@ -679,14 +865,13 @@ dmu_sync(zio_t *pio, dmu_buf_t *db_fake, dbuf_dirty_record_t *dr; dmu_sync_arg_t *in; zbookmark_t zb; + writeprops_t wp = { 0 }; zio_t *zio; - int zio_flags; int err; ASSERT(BP_IS_HOLE(bp)); ASSERT(txg != 0); - dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); @@ -791,15 +976,20 @@ dmu_sync(zio_t *pio, dmu_buf_t *db_fake, zb.zb_object = db->db.db_object; zb.zb_level = db->db_level; zb.zb_blkid = db->db_blkid; - zio_flags = ZIO_FLAG_MUSTSUCCEED; - if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0) - zio_flags |= ZIO_FLAG_METADATA; - zio = arc_write(pio, os->os_spa, - zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum), - zio_compress_select(db->db_dnode->dn_compress, os->os_compress), - dmu_get_replication_level(os, &zb, db->db_dnode->dn_type), - txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in, - ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb); + + wp.wp_type = db->db_dnode->dn_type; + wp.wp_level = db->db_level; + wp.wp_copies = os->os_copies; + wp.wp_dnchecksum = db->db_dnode->dn_checksum; + wp.wp_oschecksum = os->os_checksum; + wp.wp_dncompress = db->db_dnode->dn_compress; + wp.wp_oscompress = os->os_compress; + + ASSERT(BP_IS_HOLE(bp)); + + zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db), + txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); if (pio) { zio_nowait(zio); @@ -855,21 +1045,6 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, } int -dmu_get_replication_level(objset_impl_t *os, - zbookmark_t *zb, dmu_object_type_t ot) -{ - int ncopies = os->os_copies; - - /* If it's the mos, it should have max copies set. */ - ASSERT(zb->zb_objset != 0 || - ncopies == spa_max_replication(os->os_spa)); - - if (dmu_ot[ot].ot_metadata || zb->zb_level != 0) - ncopies++; - return (MIN(ncopies, spa_max_replication(os->os_spa))); -} - -int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; @@ -894,7 +1069,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) return (err); } - err = dnode_next_offset(dn, hole, off, 1, 1, 0); + err = dnode_next_offset(dn, (hole ? 
DNODE_FIND_HOLE : 0), off, 1, 1, 0); dnode_rele(dn, FTAG); return (err); @@ -1018,6 +1193,7 @@ dmu_init(void) dbuf_init(); dnode_init(); arc_init(); + l2arc_init(); } void @@ -1026,4 +1202,5 @@ dmu_fini(void) arc_fini(); dnode_fini(); dbuf_fini(); + l2arc_fini(); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c index 93168cc8901f..1b9247d66e65 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -54,7 +54,8 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, if (P2PHASE(object, L2_dnode_count) == 0) { uint64_t offset = restarted ? object << DNODE_SHIFT : 0; int error = dnode_next_offset(osi->os_meta_dnode, - B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0); + DNODE_FIND_HOLE, + &offset, 2, DNODES_PER_BLOCK >> 2, 0); restarted = B_TRUE; if (error == 0) object = offset >> DNODE_SHIFT; @@ -139,6 +140,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) return (err); ASSERT(dn->dn_type != DMU_OT_NONE); + dnode_free_range(dn, 0, DMU_OBJECT_END, tx); dnode_free(dn, tx); dnode_rele(dn, FTAG); @@ -152,7 +154,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) int error; error = dnode_next_offset(os->os->os_meta_dnode, - hole, &offset, 0, DNODES_PER_BLOCK, txg); + (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); *objectp = offset >> DNODE_SHIFT; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index 378fe8c15bc0..7981e06825c4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - +#include <sys/cred.h> #include <sys/zfs_context.h> #include <sys/dmu_objset.h> #include <sys/dsl_dir.h> @@ -32,6 +31,7 @@ #include <sys/dsl_prop.h> #include <sys/dsl_pool.h> #include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> #include <sys/dnode.h> #include <sys/dbuf.h> #include <sys/zvol.h> @@ -40,7 +40,7 @@ #include <sys/zap.h> #include <sys/zil.h> #include <sys/dmu_impl.h> - +#include <sys/zfs_ioctl.h> spa_t * dmu_objset_spa(objset_t *os) @@ -131,6 +131,34 @@ copies_changed_cb(void *arg, uint64_t newval) osi->os_copies = newval; } +static void +primary_cache_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || + newval == ZFS_CACHE_METADATA); + + osi->os_primary_cache = newval; +} + +static void +secondary_cache_changed_cb(void *arg, uint64_t newval) +{ + objset_impl_t *osi = arg; + + /* + * Inheritance and range checking should have been done by now. 
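Both cache callbacks accept exactly three values. An illustrative reading of the policy they store, as an editor's sketch with an assumed helper (in the patch itself the decision is encoded by the DBUF_IS_CACHEABLE and DBUF_IS_L2CACHEABLE macros):

static boolean_t
cache_level_allows(uint64_t prop, boolean_t is_metadata)
{
	/* ZFS_CACHE_ALL admits everything; _METADATA, only metadata */
	return (prop == ZFS_CACHE_ALL ||
	    (prop == ZFS_CACHE_METADATA && is_metadata));
}

ZFS_CACHE_NONE makes the expression false for every buffer, keeping the dataset's blocks out of the corresponding cache.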
+ */ + ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || + newval == ZFS_CACHE_METADATA); + + osi->os_secondary_cache = newval; +} + void dmu_objset_byteswap(void *buf, size_t size) { @@ -146,8 +174,10 @@ int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, objset_impl_t **osip) { - objset_impl_t *winner, *osi; - int i, err, checksum; + objset_impl_t *osi; + int i, err; + + ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP); osi->os.os = osi; @@ -161,18 +191,26 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zb.zb_object = 0; zb.zb_level = -1; zb.zb_blkid = 0; + if (DMU_OS_IS_L2CACHEABLE(osi)) + aflags |= ARC_L2CACHE; dprintf_bp(osi->os_rootbp, "reading %s", ""); - err = arc_read(NULL, spa, osi->os_rootbp, - dmu_ot[DMU_OT_OBJSET].ot_byteswap, + /* + * NB: when bprewrite scrub can change the bp, + * and this is called from dmu_objset_open_ds_os, the bp + * could change, and we'll need a lock. + */ + err = arc_read_nolock(NULL, spa, osi->os_rootbp, arc_getbuf_func, &osi->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err) { kmem_free(osi, sizeof (objset_impl_t)); + /* convert checksum errors into IO errors */ + if (err == ECKSUM) + err = EIO; return (err); } osi->os_phys = osi->os_phys_buf->b_data; - arc_release(osi->os_phys_buf, &osi->os_phys_buf); } else { osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t), &osi->os_phys_buf, ARC_BUFC_METADATA); @@ -183,18 +221,26 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* * Note: the changed_cb will be called once before the register * func returns, thus changing the checksum/compression from the - * default (fletcher2/off). Snapshots don't need to know, and - * registering would complicate clone promotion. + * default (fletcher2/off). Snapshots don't need to know about + * checksum/compression/copies. */ - if (ds && ds->ds_phys->ds_num_children == 0) { - err = dsl_prop_register(ds, "checksum", - checksum_changed_cb, osi); - if (err == 0) - err = dsl_prop_register(ds, "compression", - compression_changed_cb, osi); + if (ds) { + err = dsl_prop_register(ds, "primarycache", + primary_cache_changed_cb, osi); if (err == 0) - err = dsl_prop_register(ds, "copies", - copies_changed_cb, osi); + err = dsl_prop_register(ds, "secondarycache", + secondary_cache_changed_cb, osi); + if (!dsl_dataset_is_snapshot(ds)) { + if (err == 0) + err = dsl_prop_register(ds, "checksum", + checksum_changed_cb, osi); + if (err == 0) + err = dsl_prop_register(ds, "compression", + compression_changed_cb, osi); + if (err == 0) + err = dsl_prop_register(ds, "copies", + copies_changed_cb, osi); + } if (err) { VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); @@ -206,24 +252,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4; osi->os_compress = ZIO_COMPRESS_LZJB; osi->os_copies = spa_max_replication(spa); + osi->os_primary_cache = ZFS_CACHE_ALL; + osi->os_secondary_cache = ZFS_CACHE_ALL; } - osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header); - - /* - * Metadata always gets compressed and checksummed. - * If the data checksum is multi-bit correctable, and it's not - * a ZBT-style checksum, then it's suitable for metadata as well. - * Otherwise, the metadata checksum defaults to fletcher4. 
- */ - checksum = osi->os_checksum; - - if (zio_checksum_table[checksum].ci_correctable && - !zio_checksum_table[checksum].ci_zbt) - osi->os_md_checksum = checksum; - else - osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4; - osi->os_md_compress = ZIO_COMPRESS_LZJB; + osi->os_zil_header = osi->os_phys->os_zil_header; + osi->os_zil = zil_alloc(&osi->os, &osi->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t), @@ -238,70 +272,118 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); osi->os_meta_dnode = dnode_special_open(osi, &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); - if (ds != NULL) { - winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict); - if (winner) { - dmu_objset_evict(ds, osi); - osi = winner; - } + /* + * We should be the only thread trying to do this because we + * have ds_opening_lock + */ + if (ds) { + VERIFY(NULL == dsl_dataset_set_user_ptr(ds, osi, + dmu_objset_evict)); } *osip = osi; return (0); } -/* called from zpl */ -int -dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp) +static int +dmu_objset_open_ds_os(dsl_dataset_t *ds, objset_t *os, dmu_objset_type_t type) { - dsl_dataset_t *ds; - int err; - objset_t *os; objset_impl_t *osi; - os = kmem_alloc(sizeof (objset_t), KM_SLEEP); - err = dsl_dataset_open(name, mode, os, &ds); - if (err) { - kmem_free(os, sizeof (objset_t)); - return (err); - } - + mutex_enter(&ds->ds_opening_lock); osi = dsl_dataset_get_user_ptr(ds); if (osi == NULL) { + int err; + err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, &ds->ds_phys->ds_bp, &osi); if (err) { - dsl_dataset_close(ds, mode, os); - kmem_free(os, sizeof (objset_t)); + mutex_exit(&ds->ds_opening_lock); return (err); } } + mutex_exit(&ds->ds_opening_lock); os->os = osi; - os->os_mode = mode; + os->os_mode = DS_MODE_NOHOLD; - if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) { - dmu_objset_close(os); + if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) return (EINVAL); - } - *osp = os; return (0); } +int +dmu_objset_open_ds(dsl_dataset_t *ds, dmu_objset_type_t type, objset_t **osp) +{ + objset_t *os; + int err; + + os = kmem_alloc(sizeof (objset_t), KM_SLEEP); + err = dmu_objset_open_ds_os(ds, os, type); + if (err) + kmem_free(os, sizeof (objset_t)); + else + *osp = os; + return (err); +} + +/* called from zpl */ +int +dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, + objset_t **osp) +{ + objset_t *os; + dsl_dataset_t *ds; + int err; + + ASSERT(DS_MODE_TYPE(mode) == DS_MODE_USER || + DS_MODE_TYPE(mode) == DS_MODE_OWNER); + + os = kmem_alloc(sizeof (objset_t), KM_SLEEP); + if (DS_MODE_TYPE(mode) == DS_MODE_USER) + err = dsl_dataset_hold(name, os, &ds); + else + err = dsl_dataset_own(name, mode, os, &ds); + if (err) { + kmem_free(os, sizeof (objset_t)); + return (err); + } + + err = dmu_objset_open_ds_os(ds, os, type); + if (err) { + if (DS_MODE_TYPE(mode) == DS_MODE_USER) + dsl_dataset_rele(ds, os); + else + dsl_dataset_disown(ds, os); + kmem_free(os, sizeof (objset_t)); + } else { + os->os_mode = mode; + *osp = os; + } + return (err); +} + void dmu_objset_close(objset_t *os) { - dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os); + ASSERT(DS_MODE_TYPE(os->os_mode) == DS_MODE_USER || + DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER || + 
DS_MODE_TYPE(os->os_mode) == DS_MODE_NOHOLD); + + if (DS_MODE_TYPE(os->os_mode) == DS_MODE_USER) + dsl_dataset_rele(os->os->os_dsl_dataset, os); + else if (DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER) + dsl_dataset_disown(os->os->os_dsl_dataset, os); kmem_free(os, sizeof (objset_t)); } int -dmu_objset_evict_dbufs(objset_t *os, int try) +dmu_objset_evict_dbufs(objset_t *os) { objset_impl_t *osi = os->os; dnode_t *dn; @@ -319,34 +401,25 @@ dmu_objset_evict_dbufs(objset_t *os, int try) * skip. */ for (dn = list_head(&osi->os_dnodes); - dn && refcount_is_zero(&dn->dn_holds); + dn && !dnode_add_ref(dn, FTAG); dn = list_next(&osi->os_dnodes, dn)) continue; - if (dn) - dnode_add_ref(dn, FTAG); while (dn) { dnode_t *next_dn = dn; do { next_dn = list_next(&osi->os_dnodes, next_dn); - } while (next_dn && refcount_is_zero(&next_dn->dn_holds)); - if (next_dn) - dnode_add_ref(next_dn, FTAG); + } while (next_dn && !dnode_add_ref(next_dn, FTAG)); mutex_exit(&osi->os_lock); - if (dnode_evict_dbufs(dn, try)) { - dnode_rele(dn, FTAG); - if (next_dn) - dnode_rele(next_dn, FTAG); - return (1); - } + dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); mutex_enter(&osi->os_lock); dn = next_dn; } mutex_exit(&osi->os_lock); - return (0); + return (list_head(&osi->os_dnodes) != osi->os_meta_dnode); } void @@ -361,13 +434,19 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg) ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL); } - if (ds && ds->ds_phys->ds_num_children == 0) { - VERIFY(0 == dsl_prop_unregister(ds, "checksum", - checksum_changed_cb, osi)); - VERIFY(0 == dsl_prop_unregister(ds, "compression", - compression_changed_cb, osi)); - VERIFY(0 == dsl_prop_unregister(ds, "copies", - copies_changed_cb, osi)); + if (ds) { + if (!dsl_dataset_is_snapshot(ds)) { + VERIFY(0 == dsl_prop_unregister(ds, "checksum", + checksum_changed_cb, osi)); + VERIFY(0 == dsl_prop_unregister(ds, "compression", + compression_changed_cb, osi)); + VERIFY(0 == dsl_prop_unregister(ds, "copies", + copies_changed_cb, osi)); + } + VERIFY(0 == dsl_prop_unregister(ds, "primarycache", + primary_cache_changed_cb, osi)); + VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", + secondary_cache_changed_cb, osi)); } /* @@ -375,7 +454,7 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg) * nothing can be added to the list at this point. 
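dmu_objset_close() above dispatches on how the objset was opened; the underlying dataset references pair up as follows (a sketch; the DS_MODE_OWNER flag usage is inferred from the calls visible in this patch):

	dsl_dataset_t *ds;

	/* lightweight reference */
	VERIFY(0 == dsl_dataset_hold(name, FTAG, &ds));
	dsl_dataset_rele(ds, FTAG);

	/* exclusive use */
	VERIFY(0 == dsl_dataset_own(name, DS_MODE_OWNER, FTAG, &ds));
	dsl_dataset_disown(ds, FTAG);

Releasing with the wrong routine (rele after own, or disown after hold) would be a bug; the close path's checks on os_mode guard against exactly that.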
*/ os.os = osi; - (void) dmu_objset_evict_dbufs(&os, 0); + (void) dmu_objset_evict_dbufs(&os); ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); @@ -387,6 +466,7 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg) VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); mutex_destroy(&osi->os_lock); mutex_destroy(&osi->os_obj_lock); + mutex_destroy(&osi->os_user_ptr_lock); kmem_free(osi, sizeof (objset_impl_t)); } @@ -399,7 +479,11 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); + if (ds) + mutex_enter(&ds->ds_opening_lock); VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi)); + if (ds) + mutex_exit(&ds->ds_opening_lock); mdn = osi->os_meta_dnode; dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, @@ -443,14 +527,15 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, } struct oscarg { - void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx); + void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *userarg; dsl_dataset_t *clone_parent; const char *lastname; dmu_objset_type_t type; + uint64_t flags; }; -/* ARGSUSED */ +/*ARGSUSED*/ static int dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -478,11 +563,12 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) if (oa->clone_parent->ds_phys->ds_num_children == 0) return (EINVAL); } + return (0); } static void -dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct oscarg *oa = arg2; @@ -493,10 +579,9 @@ dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); dsobj = dsl_dataset_create_sync(dd, oa->lastname, - oa->clone_parent, tx); + oa->clone_parent, oa->flags, cr, tx); - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL, - DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds)); + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, FTAG, &ds)); bp = dsl_dataset_get_blkptr(ds); if (BP_IS_HOLE(bp)) { objset_impl_t *osi; @@ -506,15 +591,19 @@ dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) ds, bp, oa->type, tx); if (oa->userfunc) - oa->userfunc(&osi->os, oa->userarg, tx); + oa->userfunc(&osi->os, oa->userarg, cr, tx); } - dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG); + + spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa, + tx, cr, "dataset = %llu", dsobj); + + dsl_dataset_rele(ds, FTAG); } int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, - void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg) + objset_t *clone_parent, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { dsl_dir_t *pdd; const char *tail; @@ -536,6 +625,8 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, oa.userarg = arg; oa.lastname = tail; oa.type = type; + oa.flags = flags; + if (clone_parent != NULL) { /* * You can't clone to a different type. @@ -564,33 +655,47 @@ dmu_objset_destroy(const char *name) * It would be nicer to do this in dsl_dataset_destroy_sync(), * but the replay log objset is modified in open context. 
 	 */
-	error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
+	error = dmu_objset_open(name, DMU_OST_ANY,
+	    DS_MODE_OWNER|DS_MODE_READONLY|DS_MODE_INCONSISTENT, &os);
 	if (error == 0) {
+		dsl_dataset_t *ds = os->os->os_dsl_dataset;
 		zil_destroy(dmu_objset_zil(os), B_FALSE);
-		dmu_objset_close(os);
+
+		error = dsl_dataset_destroy(ds, os);
+		/*
+		 * dsl_dataset_destroy() closes the ds.
+		 */
+		kmem_free(os, sizeof (objset_t));
 	}
-	return (dsl_dataset_destroy(name));
+	return (error);
 }
 
+/*
+ * This will close the objset.
+ */
 int
-dmu_objset_rollback(const char *name)
+dmu_objset_rollback(objset_t *os)
 {
 	int err;
-	objset_t *os;
+	dsl_dataset_t *ds;
 
-	err = dmu_objset_open(name, DMU_OST_ANY,
-	    DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
-	if (err == 0) {
-		err = zil_suspend(dmu_objset_zil(os));
-		if (err == 0)
-			zil_resume(dmu_objset_zil(os));
-		if (err == 0) {
-			/* XXX uncache everything? */
-			err = dsl_dataset_rollback(os->os->os_dsl_dataset);
-		}
+	ds = os->os->os_dsl_dataset;
+
+	if (!dsl_dataset_tryown(ds, TRUE, os)) {
 		dmu_objset_close(os);
+		return (EBUSY);
 	}
+
+	err = dsl_dataset_rollback(ds, os->os->os_phys->os_type);
+
+	/*
+	 * NB: we close the objset manually because the rollback
+	 * has implicitly called dmu_objset_evict(), thus freeing
+	 * the objset_impl_t.
+	 */
+	dsl_dataset_disown(ds, os);
+	kmem_free(os, sizeof (objset_t));
 	return (err);
 }
 
@@ -598,6 +703,13 @@ struct snaparg {
 	dsl_sync_task_group_t *dstg;
 	char *snapname;
 	char failed[MAXPATHLEN];
+	boolean_t checkperms;
+	list_t objsets;
+};
+
+struct osnode {
+	list_node_t node;
+	objset_t *os;
 };
 
 static int
@@ -605,20 +717,25 @@ dmu_objset_snapshot_one(char *name, void *arg)
 {
 	struct snaparg *sn = arg;
 	objset_t *os;
-	dmu_objset_stats_t stat;
 	int err;
 
 	(void) strcpy(sn->failed, name);
 
-	err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_STANDARD, &os);
+	/*
+	 * Check permissions only when requested. This only applies when
+	 * doing a recursive snapshot. The permission checks for the starting
+	 * dataset have already been performed in zfs_secpolicy_snapshot().
+	 */
+	if (sn->checkperms == B_TRUE &&
+	    (err = zfs_secpolicy_snapshot_perms(name, CRED())))
+		return (err);
+
+	err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_USER, &os);
 	if (err != 0)
 		return (err);
 
-	/*
-	 * If the objset is in an inconsistent state, return busy.
- */ - dmu_objset_fast_stat(os, &stat); - if (stat.dds_inconsistent) { + /* If the objset is in an inconsistent state, return busy */ + if (os->os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { dmu_objset_close(os); return (EBUSY); } @@ -630,8 +747,13 @@ dmu_objset_snapshot_one(char *name, void *arg) */ err = zil_suspend(dmu_objset_zil(os)); if (err == 0) { + struct osnode *osn; dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check, - dsl_dataset_snapshot_sync, os, sn->snapname, 3); + dsl_dataset_snapshot_sync, os->os->os_dsl_dataset, + sn->snapname, 3); + osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP); + osn->os = os; + list_insert_tail(&sn->objsets, osn); } else { dmu_objset_close(os); } @@ -643,31 +765,28 @@ int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) { dsl_sync_task_t *dst; + struct osnode *osn; struct snaparg sn = { 0 }; - char *cp; spa_t *spa; int err; (void) strcpy(sn.failed, fsname); - cp = strchr(fsname, '/'); - if (cp) { - *cp = '\0'; - err = spa_open(fsname, &spa, FTAG); - *cp = '/'; - } else { - err = spa_open(fsname, &spa, FTAG); - } + err = spa_open(fsname, &spa, FTAG); if (err) return (err); sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); sn.snapname = snapname; + list_create(&sn.objsets, sizeof (struct osnode), + offsetof(struct osnode, node)); if (recursive) { + sn.checkperms = B_TRUE; err = dmu_objset_find(fsname, dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN); } else { + sn.checkperms = B_FALSE; err = dmu_objset_snapshot_one(fsname, &sn); } @@ -678,13 +797,20 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) for (dst = list_head(&sn.dstg->dstg_tasks); dst; dst = list_next(&sn.dstg->dstg_tasks, dst)) { - objset_t *os = dst->dst_arg1; + dsl_dataset_t *ds = dst->dst_arg1; if (dst->dst_err) - dmu_objset_name(os, sn.failed); - zil_resume(dmu_objset_zil(os)); - dmu_objset_close(os); + dsl_dataset_name(ds, sn.failed); } + out: + while (osn = list_head(&sn.objsets)) { + list_remove(&sn.objsets, osn); + zil_resume(dmu_objset_zil(osn->os)); + dmu_objset_close(osn->os); + kmem_free(osn, sizeof (struct osnode)); + } + list_destroy(&sn.objsets); + if (err) (void) strcpy(fsname, sn.failed); dsl_sync_task_group_destroy(sn.dstg); @@ -717,39 +843,30 @@ dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) static void ready(zio_t *zio, arc_buf_t *abuf, void *arg) { + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; objset_impl_t *os = arg; - blkptr_t *bp = os->os_rootbp; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; - int i; + + ASSERT(bp == os->os_rootbp); + ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET); + ASSERT(BP_GET_LEVEL(bp) == 0); /* * Update rootbp fill count. 
*/ bp->blk_fill = 1; /* count the meta-dnode */ - for (i = 0; i < dnp->dn_nblkptr; i++) + for (int i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += dnp->dn_blkptr[i].blk_fill; -} -/* ARGSUSED */ -static void -killer(zio_t *zio, arc_buf_t *abuf, void *arg) -{ - objset_impl_t *os = arg; - - ASSERT3U(zio->io_error, ==, 0); - - BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET); - BP_SET_LEVEL(zio->io_bp, 0); - - if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), - BP_IDENTITY(&zio->io_bp_orig))) { + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + } else { if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) - dsl_dataset_block_kill(os->os_dsl_dataset, - &zio->io_bp_orig, NULL, os->os_synctx); - dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp, - os->os_synctx); + (void) dsl_dataset_block_kill(os->os_dsl_dataset, + &zio->io_bp_orig, zio, os->os_synctx); + dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx); } - arc_release(os->os_phys_buf, &os->os_phys_buf); } /* called from dsl */ @@ -758,10 +875,10 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; zbookmark_t zb; + writeprops_t wp = { 0 }; zio_t *zio; list_t *list; dbuf_dirty_record_t *dr; - int zio_flags; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); @@ -783,19 +900,24 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) */ zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; zb.zb_object = 0; - zb.zb_level = -1; + zb.zb_level = -1; /* for block ordering; it's level 0 on disk */ zb.zb_blkid = 0; - zio_flags = ZIO_FLAG_MUSTSUCCEED; - if (dmu_ot[DMU_OT_OBJSET].ot_metadata || zb.zb_level != 0) - zio_flags |= ZIO_FLAG_METADATA; - if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) - dsl_dataset_block_kill(os->os_dsl_dataset, + + wp.wp_type = DMU_OT_OBJSET; + wp.wp_level = 0; /* on-disk BP level; see above */ + wp.wp_copies = os->os_copies; + wp.wp_oschecksum = os->os_checksum; + wp.wp_oscompress = os->os_compress; + + if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) { + (void) dsl_dataset_block_kill(os->os_dsl_dataset, os->os_rootbp, pio, tx); - zio = arc_write(pio, os->os_spa, os->os_md_checksum, - os->os_md_compress, - dmu_get_replication_level(os, &zb, DMU_OT_OBJSET), - tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os, - ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb); + } + + arc_release(os->os_phys_buf, &os->os_phys_buf); + zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os), + tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* * Sync meta-dnode - the parent IO for the sync is the root block @@ -819,6 +941,7 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) * Free intent log blocks up to this tx. 
*/ zil_sync(os->os_zil, tx); + os->os_phys->os_zil_header = os->os_zil_header; zio_nowait(zio); } @@ -867,8 +990,23 @@ dmu_objset_is_snapshot(objset_t *os) } int +dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, + boolean_t *conflict) +{ + dsl_dataset_t *ds = os->os->os_dsl_dataset; + uint64_t ignored; + + if (ds->ds_phys->ds_snapnames_zapobj == 0) + return (ENOENT); + + return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, + real, maxlen, conflict)); +} + +int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - uint64_t *idp, uint64_t *offp) + uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) { dsl_dataset_t *ds = os->os->os_dsl_dataset; zap_cursor_t cursor; @@ -894,6 +1032,8 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name, (void) strcpy(name, attr.za_name); if (idp) *idp = attr.za_first_integer; + if (case_conflict) + *case_conflict = attr.za_normalization_conflict; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); @@ -938,48 +1078,80 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, return (0); } +struct findarg { + int (*func)(char *, void *); + void *arg; +}; + +/* ARGSUSED */ +static int +findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + struct findarg *fa = arg; + return (fa->func((char *)dsname, fa->arg)); +} + /* * Find all objsets under name, and for each, call 'func(child_name, arg)'. + * Perhaps change all callers to use dmu_objset_find_spa()? */ int dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) { + struct findarg fa; + fa.func = func; + fa.arg = arg; + return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags)); +} + +/* + * Find all objsets under name, call func on each + */ +int +dmu_objset_find_spa(spa_t *spa, const char *name, + int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags) +{ dsl_dir_t *dd; - objset_t *os; - uint64_t snapobj; + dsl_pool_t *dp; + dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; char *child; - int do_self, err; + uint64_t thisobj; + int err; - err = dsl_dir_open(name, FTAG, &dd, NULL); + if (name == NULL) + name = spa_name(spa); + err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL); if (err) return (err); - /* NB: the $MOS dir doesn't have a head dataset */ - do_self = (dd->dd_phys->dd_head_dataset_obj != 0); + /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ + if (dd->dd_myname[0] == '$') { + dsl_dir_close(dd, FTAG); + return (0); + } + + thisobj = dd->dd_phys->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + dp = dd->dd_pool; /* * Iterate over all children. */ if (flags & DS_FIND_CHILDREN) { - for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, + for (zap_cursor_init(&zc, dp->dp_meta_objset, dd->dd_phys->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT(attr->za_integer_length == sizeof (uint64_t)); ASSERT(attr->za_num_integers == 1); - /* - * No separating '/' because parent's name ends in /. 
- */ child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - /* XXX could probably just use name here */ - dsl_dir_name(dd, child); + (void) strcpy(child, name); (void) strcat(child, "/"); (void) strcat(child, attr->za_name); - err = dmu_objset_find(child, func, arg, flags); + err = dmu_objset_find_spa(spa, child, func, arg, flags); kmem_free(child, MAXPATHLEN); if (err) break; @@ -996,30 +1168,36 @@ dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) /* * Iterate over all snapshots. */ - if ((flags & DS_FIND_SNAPSHOTS) && - dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) { - - snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj; - dmu_objset_close(os); - - for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj); - zap_cursor_retrieve(&zc, attr) == 0; - (void) zap_cursor_advance(&zc)) { - ASSERT(attr->za_integer_length == sizeof (uint64_t)); - ASSERT(attr->za_num_integers == 1); + if (flags & DS_FIND_SNAPSHOTS) { + if (!dsl_pool_sync_context(dp)) + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + if (!dsl_pool_sync_context(dp)) + rw_exit(&dp->dp_config_rwlock); - child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - /* XXX could probably just use name here */ - dsl_dir_name(dd, child); - (void) strcat(child, "@"); - (void) strcat(child, attr->za_name); - err = func(child, arg); - kmem_free(child, MAXPATHLEN); - if (err) - break; + if (err == 0) { + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + dsl_dataset_rele(ds, FTAG); + + for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT(attr->za_integer_length == + sizeof (uint64_t)); + ASSERT(attr->za_num_integers == 1); + + child = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) strcpy(child, name); + (void) strcat(child, "@"); + (void) strcat(child, attr->za_name); + err = func(spa, attr->za_first_integer, + child, arg); + kmem_free(child, MAXPATHLEN); + if (err) + break; + } + zap_cursor_fini(&zc); } - zap_cursor_fini(&zc); } dsl_dir_close(dd, FTAG); @@ -1031,7 +1209,20 @@ dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) /* * Apply to self if appropriate. */ - if (do_self) - err = func(name, arg); + err = func(spa, thisobj, name, arg); return (err); } + +void +dmu_objset_set_user(objset_t *os, void *user_ptr) +{ + ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); + os->os->os_user_ptr = user_ptr; +} + +void * +dmu_objset_get_user(objset_t *os) +{ + ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); + return (os->os->os_user_ptr); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index 3e55dc301620..1294581a7133 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -41,10 +41,13 @@ #include <sys/zap.h> #include <sys/zio_checksum.h> +static char *dmu_recv_tag = "dmu_recv_tag"; + struct backuparg { dmu_replay_record_t *drr; kthread_t *td; struct file *fp; + offset_t *off; objset_t *os; zio_cksum_t zc; int err; @@ -77,6 +80,7 @@ dump_bytes(struct backuparg *ba, void *buf, int len) fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); ba->err = EOPNOTSUPP; #endif + *ba->off += len; return (ba->err); } @@ -179,7 +183,7 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) void *data = bc->bc_data; int err = 0; - if (SIGPENDING(curthread)) + if (issig(JUSTLOOKING) && issig(FORREAL)) return (EINTR); ASSERT(data || bp == NULL); @@ -215,10 +219,9 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) zb.zb_object = object; zb.zb_level = level; zb.zb_blkid = blkid; - (void) arc_read(NULL, spa, bp, - dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED, - &aflags, &zb); + (void) arc_read_nolock(NULL, spa, bp, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); if (abuf) { err = dump_data(ba, type, object, blkid * blksz, @@ -236,13 +239,15 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) } int -dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp) +dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, + struct file *fp, offset_t *off) { dsl_dataset_t *ds = tosnap->os->os_dsl_dataset; dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL; dmu_replay_record_t *drr; struct backuparg ba; int err; + uint64_t fromtxg = 0; /* tosnap must be a snapshot */ if (ds->ds_phys->ds_next_snap_obj == 0) @@ -250,26 +255,55 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp) /* fromsnap must be an earlier snapshot from the same fs as tosnap */ if (fromds && (ds->ds_dir != fromds->ds_dir || - fromds->ds_phys->ds_creation_txg >= - ds->ds_phys->ds_creation_txg)) + fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) return (EXDEV); + if (fromorigin) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (fromsnap) + return (EINVAL); + + if (dsl_dir_is_clone(ds->ds_dir)) { + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); + rw_exit(&dp->dp_config_rwlock); + if (err) + return (err); + } else { + fromorigin = B_FALSE; + } + } + + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION; + drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION; drr->drr_u.drr_begin.drr_creation_time = ds->ds_phys->ds_creation_time; drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type; + if (fromorigin) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; + if (fromds) drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); + if (fromds) + fromtxg = fromds->ds_phys->ds_creation_txg; + if (fromorigin) + dsl_dataset_rele(fromds, FTAG); + ba.drr = drr; ba.td = curthread; ba.fp = fp; ba.os = tosnap; + ba.off = off; ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { @@ -277,8 +311,7 @@ dmu_sendbackup(objset_t *tosnap, 
objset_t *fromsnap, struct file *fp) return (ba.err); } - err = traverse_dsl_dataset(ds, - fromds ? fromds->ds_phys->ds_creation_txg : 0, + err = traverse_dsl_dataset(ds, fromtxg, ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK, backup_cb, &ba); @@ -303,164 +336,384 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp) return (0); } -struct restorearg { - int err; - int byteswap; - kthread_t *td; - struct file *fp; - char *buf; - uint64_t voff; - int buflen; /* number of valid bytes in buf */ - int bufoff; /* next offset to read */ - int bufsize; /* amount of memory allocated for buf */ - zio_cksum_t zc; +struct recvbeginsyncarg { + const char *tofs; + const char *tosnap; + dsl_dataset_t *origin; + uint64_t fromguid; + dmu_objset_type_t type; + void *tag; + boolean_t force; + uint64_t dsflags; + char clonelastname[MAXNAMELEN]; + dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ }; +static dsl_dataset_t * +recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type, + cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds; + + /* This should always work, since we just created it */ + /* XXX - create should return an owned ds */ + VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, + DS_MODE_INCONSISTENT, dmu_recv_tag, &ds)); + + if (type != DMU_OST_NONE) { + (void) dmu_objset_create_impl(dp->dp_spa, + ds, &ds->ds_phys->ds_bp, type, tx); + } + + spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC, + dp->dp_spa, tx, cr, "dataset = %lld", dsobj); + + return (ds); +} + /* ARGSUSED */ static int -replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) +recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - struct drr_begin *drrb = arg2; - const char *snapname; - int err; + dsl_dir_t *dd = arg1; + struct recvbeginsyncarg *rbsa = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t val; + int err; - /* must already be a snapshot of this fs */ - if (ds->ds_phys->ds_prev_snap_obj == 0) - return (ENODEV); + err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, + strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); - /* most recent snapshot must match fromguid */ - if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) - return (ENODEV); - /* must not have any changes since most recent snapshot */ - if (ds->ds_phys->ds_bp.blk_birth > - ds->ds_prev->ds_phys->ds_creation_txg) - return (ETXTBSY); + if (err != ENOENT) + return (err ? err : EEXIST); - /* new snapshot name must not exist */ - snapname = strrchr(drrb->drr_toname, '@'); - if (snapname == NULL) - return (EEXIST); + if (rbsa->origin) { + /* make sure it's a snap in the same pool */ + if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) + return (EXDEV); + if (rbsa->origin->ds_phys->ds_num_children == 0) + return (EINVAL); + if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + } - snapname++; - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) + return (0); +} + +static void +recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct recvbeginsyncarg *rbsa = arg2; + uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; + uint64_t dsobj; + + dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, + rbsa->origin, flags, cr, tx); + + rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, + rbsa->origin ? 
DMU_OST_NONE : rbsa->type, cr, tx); +} + +static int +recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + int err; + + /* must be a head ds */ + if (ds->ds_phys->ds_next_snap_obj != 0) + return (EINVAL); + + /* must not be a clone ds */ + if (dsl_dir_is_clone(ds->ds_dir)) + return (EINVAL); + + err = dsl_dataset_destroy_check(ds, rbsa->tag, tx); + if (err) return (err); + if (rbsa->origin) { + /* make sure it's a snap in the same pool */ + if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool) + return (EXDEV); + if (rbsa->origin->ds_phys->ds_num_children == 0) + return (EINVAL); + if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + } + return (0); } -/* ARGSUSED */ static void -replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx) +recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + struct recvbeginsyncarg *rbsa = arg2; + dsl_dir_t *dd = ds->ds_dir; + uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; + uint64_t dsobj; + + /* + * NB: caller must provide an extra hold on the dsl_dir_t, so it + * won't go away when dsl_dataset_destroy_sync() closes the + * dataset. + */ + dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx); + + dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx); + + rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, + rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx); } /* ARGSUSED */ static int -replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx) +recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - struct drr_begin *drrb = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - char *cp; - uint64_t val; + dsl_dataset_t *ds = arg1; + struct recvbeginsyncarg *rbsa = arg2; int err; + uint64_t val; - cp = strchr(drrb->drr_toname, '@'); - *cp = '\0'; - err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, - strrchr(drrb->drr_toname, '/') + 1, - sizeof (uint64_t), 1, &val); - *cp = '@'; + /* must not have any changes since most recent snapshot */ + if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) + return (ETXTBSY); + + /* must already be a snapshot of this fs */ + if (ds->ds_phys->ds_prev_snap_obj == 0) + return (ENODEV); + + /* most recent snapshot must match fromguid */ + if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + /* temporary clone name must not exist */ + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_dir->dd_phys->dd_child_dir_zapobj, + rbsa->clonelastname, 8, 1, &val); + if (err == 0) + return (EEXIST); if (err != ENOENT) - return (err ? 
err : EEXIST); + return (err); + /* new snapshot name must not exist */ + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); return (0); } +/* ARGSUSED */ static void -replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx) +recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - struct drr_begin *drrb = arg2; - char *cp; - dsl_dataset_t *ds; + dsl_dataset_t *ohds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + dsl_pool_t *dp = ohds->ds_dir->dd_pool; + dsl_dataset_t *ods, *cds; + uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; uint64_t dsobj; - cp = strchr(drrb->drr_toname, '@'); - *cp = '\0'; - dsobj = dsl_dataset_create_sync(dd, strrchr(drrb->drr_toname, '/') + 1, - NULL, tx); - *cp = '@'; + /* create the temporary clone */ + VERIFY(0 == dsl_dataset_hold_obj(dp, ohds->ds_phys->ds_prev_snap_obj, + FTAG, &ods)); + dsobj = dsl_dataset_create_sync(ohds->ds_dir, + rbsa->clonelastname, ods, flags, cr, tx); + dsl_dataset_rele(ods, FTAG); + + /* open the temporary clone */ + VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, + DS_MODE_INCONSISTENT, dmu_recv_tag, &cds)); - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL, - DS_MODE_EXCLUSIVE, FTAG, &ds)); + /* copy the refquota from the target fs to the clone */ + if (ohds->ds_quota > 0) + dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx); - (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds), - ds, &ds->ds_phys->ds_bp, drrb->drr_type, tx); + rbsa->ds = cds; + + spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, + dp->dp_spa, tx, cr, "dataset = %lld", dsobj); +} + +/* ARGSUSED */ +static void +recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld", + ds->ds_object); } -static int -replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx) +/* + * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() + * succeeds; otherwise we will leak the holds on the datasets. + */ +int +dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, + boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc) { - objset_t *os = arg1; - struct drr_begin *drrb = arg2; - char *snapname; + int err = 0; + boolean_t byteswap; + struct recvbeginsyncarg rbsa; + uint64_t version; + int flags; + dsl_dataset_t *ds; - /* XXX verify that drr_toname is in dd */ + if (drrb->drr_magic == DMU_BACKUP_MAGIC) + byteswap = FALSE; + else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + byteswap = TRUE; + else + return (EINVAL); - snapname = strchr(drrb->drr_toname, '@'); - if (snapname == NULL) + rbsa.tofs = tofs; + rbsa.tosnap = tosnap; + rbsa.origin = origin ? 
origin->os->os_dsl_dataset : NULL; + rbsa.fromguid = drrb->drr_fromguid; + rbsa.type = drrb->drr_type; + rbsa.tag = FTAG; + rbsa.dsflags = 0; + version = drrb->drr_version; + flags = drrb->drr_flags; + + if (byteswap) { + rbsa.type = BSWAP_32(rbsa.type); + rbsa.fromguid = BSWAP_64(rbsa.fromguid); + version = BSWAP_64(version); + flags = BSWAP_32(flags); + } + + if (version != DMU_BACKUP_STREAM_VERSION || + rbsa.type >= DMU_OST_NUMTYPES || + ((flags & DRR_FLAG_CLONE) && origin == NULL)) return (EINVAL); - snapname++; - return (dsl_dataset_snapshot_check(os, snapname, tx)); -} + if (flags & DRR_FLAG_CI_DATA) + rbsa.dsflags = DS_FLAG_CI_DATASET; -static void -replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - objset_t *os = arg1; - struct drr_begin *drrb = arg2; - char *snapname; - dsl_dataset_t *ds, *hds; + bzero(drc, sizeof (dmu_recv_cookie_t)); + drc->drc_drrb = drrb; + drc->drc_tosnap = tosnap; + drc->drc_force = force; - snapname = strchr(drrb->drr_toname, '@') + 1; + /* + * Process the begin in syncing context. + */ + if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) { + /* offline incremental receive */ + err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds); + if (err) + return (err); - dsl_dataset_snapshot_sync(os, snapname, tx); + /* + * Only do the rollback if the most recent snapshot + * matches the incremental source + */ + if (force) { + if (ds->ds_prev == NULL || + ds->ds_prev->ds_phys->ds_guid != + rbsa.fromguid) { + dsl_dataset_disown(ds, dmu_recv_tag); + return (ENODEV); + } + (void) dsl_dataset_rollback(ds, DMU_OST_NONE); + } + rbsa.force = B_FALSE; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_incremental_check, + recv_offline_incremental_sync, ds, &rbsa, 1); + if (err) { + dsl_dataset_disown(ds, dmu_recv_tag); + return (err); + } + drc->drc_logical_ds = drc->drc_real_ds = ds; + } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) { + /* online incremental receive */ - /* set snapshot's creation time and guid */ - hds = os->os->os_dsl_dataset; - VERIFY(0 == dsl_dataset_open_obj(hds->ds_dir->dd_pool, - hds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT, - FTAG, &ds)); + /* tmp clone name is: tofs/%tosnap" */ + (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), + "%%%s", tosnap); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_creation_time = drrb->drr_creation_time; - ds->ds_phys->ds_guid = drrb->drr_toguid; - ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + /* open the dataset we are logically receiving into */ + err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); + if (err) + return (err); - dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG); + rbsa.force = force; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_incremental_check, + recv_online_incremental_sync, ds, &rbsa, 5); + if (err) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (err); + } + drc->drc_logical_ds = ds; + drc->drc_real_ds = rbsa.ds; + } else { + /* create new fs -- full backup or clone */ + dsl_dir_t *dd = NULL; + const char *tail; + + err = dsl_dir_open(tofs, FTAG, &dd, &tail); + if (err) + return (err); + if (tail == NULL) { + if (!force) { + dsl_dir_close(dd, FTAG); + return (EEXIST); + } + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + err = dsl_dataset_own_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, + DS_MODE_INCONSISTENT, FTAG, &ds); + rw_exit(&dd->dd_pool->dp_config_rwlock); + if (err) { + dsl_dir_close(dd, FTAG); + return (err); + } + + dsl_dataset_make_exclusive(ds, FTAG); + err = 
dsl_sync_task_do(dd->dd_pool, + recv_full_existing_check, + recv_full_existing_sync, ds, &rbsa, 5); + dsl_dataset_disown(ds, FTAG); + } else { + err = dsl_sync_task_do(dd->dd_pool, recv_full_check, + recv_full_sync, dd, &rbsa, 5); + } + dsl_dir_close(dd, FTAG); + if (err) + return (err); + drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; + drc->drc_newfs = B_TRUE; + } - dmu_buf_will_dirty(hds->ds_dbuf, tx); - hds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + return (0); } +struct restorearg { + int err; + int byteswap; + kthread_t *td; + struct file *fp; + char *buf; + uint64_t voff; + int bufsize; /* amount of memory allocated for buf */ + zio_cksum_t cksum; +}; + static int restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, int *resid) { @@ -491,37 +744,31 @@ static void * restore_read(struct restorearg *ra, int len) { void *rv; + int done = 0; /* some things will require 8-byte alignment, so everything must */ ASSERT3U(len % 8, ==, 0); - while (ra->buflen - ra->bufoff < len) { + while (done < len) { int resid; - int leftover = ra->buflen - ra->bufoff; - (void) memmove(ra->buf, ra->buf + ra->bufoff, leftover); + ra->err = restore_bytes(ra, (caddr_t)ra->buf + done, + len - done, ra->voff, &resid); - ra->err = restore_bytes(ra, (caddr_t)ra->buf + leftover, - ra->bufsize - leftover, ra->voff, &resid); - - ra->voff += ra->bufsize - leftover - resid; - ra->buflen = ra->bufsize - resid; - ra->bufoff = 0; - if (resid == ra->bufsize - leftover) + if (resid == len - done) ra->err = EINVAL; + ra->voff += len - done - resid; + done = len - resid; if (ra->err) return (NULL); - /* Could compute checksum here? */ } - ASSERT3U(ra->bufoff % 8, ==, 0); - ASSERT3U(ra->buflen - ra->bufoff, >=, len); - rv = ra->buf + ra->bufoff; - ra->bufoff += len; + ASSERT3U(done, ==, len); + rv = ra->buf; if (ra->byteswap) - fletcher_4_incremental_byteswap(rv, len, &ra->zc); + fletcher_4_incremental_byteswap(rv, len, &ra->cksum); else - fletcher_4_incremental_native(rv, len, &ra->zc); + fletcher_4_incremental_native(rv, len, &ra->cksum); return (rv); } @@ -531,12 +778,14 @@ backup_byteswap(dmu_replay_record_t *drr) #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) drr->drr_type = BSWAP_32(drr->drr_type); + drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); DO64(drr_begin.drr_version); DO64(drr_begin.drr_creation_time); DO32(drr_begin.drr_type); + DO32(drr_begin.drr_flags); DO64(drr_begin.drr_toguid); DO64(drr_begin.drr_fromguid); break; @@ -643,13 +892,13 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, ==, drro->drr_bonuslen); - data = restore_read(ra, P2ROUNDUP(db->db_size, 8)); + ASSERT3U(db->db_size, >=, drro->drr_bonuslen); + data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); if (data == NULL) { dmu_tx_commit(tx); return (ra->err); } - bcopy(data, db->db_data, db->db_size); + bcopy(data, db->db_data, drro->drr_bonuslen); if (ra->byteswap) { dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, drro->drr_bonuslen); @@ -673,23 +922,14 @@ restore_freeobjects(struct restorearg *ra, objset_t *os, for (obj = drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs; (void) dmu_object_next(os, &obj, FALSE, 0)) { - dmu_tx_t *tx; int err; if (dmu_object_info(os, obj, NULL) != 0) continue; - tx = dmu_tx_create(os); - 
dmu_tx_hold_bonus(tx, obj); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); + err = dmu_free_object(os, obj); + if (err) return (err); - } - err = dmu_object_free(os, obj, tx); - dmu_tx_commit(tx); - if (err && err != ENOENT) - return (EINVAL); } return (0); } @@ -735,7 +975,6 @@ static int restore_free(struct restorearg *ra, objset_t *os, struct drr_free *drrf) { - dmu_tx_t *tx; int err; if (drrf->drr_length != -1ULL && @@ -745,66 +984,65 @@ restore_free(struct restorearg *ra, objset_t *os, if (dmu_object_info(os, drrf->drr_object, NULL) != 0) return (EINVAL); - tx = dmu_tx_create(os); - - dmu_tx_hold_free(tx, drrf->drr_object, + err = dmu_free_long_range(os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - err = dmu_free_range(os, drrf->drr_object, - drrf->drr_offset, drrf->drr_length, tx); - dmu_tx_commit(tx); return (err); } +void +dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc) +{ + if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) { + /* + * online incremental or new fs: destroy the fs (which + * may be a clone) that we created + */ + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + if (drc->drc_real_ds != drc->drc_logical_ds) + dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); + } else { + /* + * offline incremental: rollback to most recent snapshot. + */ + (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE); + dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag); + } +} + +/* + * NB: callers *must* call dmu_recv_end() if this succeeds. + */ int -dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, - boolean_t force, struct file *fp, uint64_t voffset) +dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp) { kthread_t *td = curthread; - struct restorearg ra; + struct restorearg ra = { 0 }; dmu_replay_record_t *drr; - char *cp; - objset_t *os = NULL; - zio_cksum_t pzc; - - bzero(&ra, sizeof (ra)); - ra.td = td; - ra.fp = fp; - ra.voff = voffset; - ra.bufsize = 1<<20; - ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); + objset_t *os; + zio_cksum_t pcksum; - if (drrb->drr_magic == DMU_BACKUP_MAGIC) { - ra.byteswap = FALSE; - } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { + if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) ra.byteswap = TRUE; - } else { - ra.err = EINVAL; - goto out; - } - /* - * NB: this assumes that struct drr_begin will be the largest in - * dmu_replay_record_t's drr_u, and thus we don't need to pad it - * with zeros to make it the same length as we wrote out. 
- */ - ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN; - ((dmu_replay_record_t *)ra.buf)->drr_pad = 0; - ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb; - if (ra.byteswap) { - fletcher_4_incremental_byteswap(ra.buf, - sizeof (dmu_replay_record_t), &ra.zc); - } else { - fletcher_4_incremental_native(ra.buf, - sizeof (dmu_replay_record_t), &ra.zc); + { + /* compute checksum of drr_begin record */ + dmu_replay_record_t *drr; + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); + + drr->drr_type = DRR_BEGIN; + drr->drr_u.drr_begin = *drc->drc_drrb; + if (ra.byteswap) { + fletcher_4_incremental_byteswap(drr, + sizeof (dmu_replay_record_t), &ra.cksum); + } else { + fletcher_4_incremental_native(drr, + sizeof (dmu_replay_record_t), &ra.cksum); + } + kmem_free(drr, sizeof (dmu_replay_record_t)); } - (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */ if (ra.byteswap) { + struct drr_begin *drrb = drc->drc_drrb; drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_version = BSWAP_64(drrb->drr_version); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); @@ -813,94 +1051,30 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } - ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - - if (drrb->drr_version != DMU_BACKUP_VERSION || - drrb->drr_type >= DMU_OST_NUMTYPES || - strchr(drrb->drr_toname, '@') == NULL) { - ra.err = EINVAL; - goto out; - } - - /* - * Process the begin in syncing context. - */ - if (drrb->drr_fromguid) { - /* incremental backup */ - dsl_dataset_t *ds = NULL; - - cp = strchr(tosnap, '@'); - *cp = '\0'; - ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds); - *cp = '@'; - if (ra.err) - goto out; - - /* - * Only do the rollback if the most recent snapshot - * matches the incremental source - */ - if (force) { - if (ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_guid != - drrb->drr_fromguid) { - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - kmem_free(ra.buf, ra.bufsize); - return (ENODEV); - } - (void) dsl_dataset_rollback(ds); - } - ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool, - replay_incremental_check, replay_incremental_sync, - ds, drrb, 1); - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - } else { - /* full backup */ - dsl_dir_t *dd = NULL; - const char *tail; - - /* can't restore full backup into topmost fs, for now */ - if (strrchr(drrb->drr_toname, '/') == NULL) { - ra.err = EINVAL; - goto out; - } - - cp = strchr(tosnap, '@'); - *cp = '\0'; - ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail); - *cp = '@'; - if (ra.err) - goto out; - if (tail == NULL) { - ra.err = EEXIST; - goto out; - } + ra.td = td; + ra.fp = fp; + ra.voff = *voffp; + ra.bufsize = 1<<20; + ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); - ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check, - replay_full_sync, dd, drrb, 5); - dsl_dir_close(dd, FTAG); - } - if (ra.err) - goto out; + /* these were verified in dmu_recv_begin */ + ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION); + ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); /* * Open the objset we are modifying. */ + VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0); - cp = strchr(tosnap, '@'); - *cp = '\0'; - ra.err = dmu_objset_open(tosnap, DMU_OST_ANY, - DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os); - *cp = '@'; - ASSERT3U(ra.err, ==, 0); + ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); /* * Read records and process them. 
*/ - pzc = ra.zc; + pcksum = ra.cksum; while (ra.err == 0 && NULL != (drr = restore_read(&ra, sizeof (*drr)))) { - if (SIGPENDING(td)) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { ra.err = EINTR; goto out; } @@ -947,63 +1121,116 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, * value, because the stored checksum is of * everything before the DRR_END record. */ - if (drre.drr_checksum.zc_word[0] != 0 && - !ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pzc)) { + if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) ra.err = ECKSUM; - goto out; - } - - ra.err = dsl_sync_task_do(dmu_objset_ds(os)-> - ds_dir->dd_pool, replay_end_check, replay_end_sync, - os, drrb, 3); goto out; } default: ra.err = EINVAL; goto out; } - pzc = ra.zc; + pcksum = ra.cksum; } + ASSERT(ra.err != 0); out: - if (os) - dmu_objset_close(os); + dmu_objset_close(os); - /* - * Make sure we don't rollback/destroy unless we actually - * processed the begin properly. 'os' will only be set if this - * is the case. - */ - if (ra.err && os && tosnap && strchr(tosnap, '@')) { + if (ra.err != 0) { /* * rollback or destroy what we created, so we don't * leave it in the restoring state. */ - dsl_dataset_t *ds; - int err; - - cp = strchr(tosnap, '@'); - *cp = '\0'; - err = dsl_dataset_open(tosnap, - DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, - FTAG, &ds); - if (err == 0) { - txg_wait_synced(ds->ds_dir->dd_pool, 0); - if (drrb->drr_fromguid) { - /* incremental: rollback to most recent snap */ - (void) dsl_dataset_rollback(ds); - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - } else { - /* full: destroy whole fs */ - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - (void) dsl_dataset_destroy(tosnap); - } - } - *cp = '@'; + txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); + dmu_recv_abort_cleanup(drc); } kmem_free(ra.buf, ra.bufsize); - if (sizep) - *sizep = ra.voff; + *voffp = ra.voff; return (ra.err); } + +struct recvendsyncarg { + char *tosnap; + uint64_t creation_time; + uint64_t toguid; +}; + +static int +recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvendsyncarg *resa = arg2; + + return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); +} + +static void +recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvendsyncarg *resa = arg2; + + dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx); + + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; + ds->ds_prev->ds_phys->ds_guid = resa->toguid; + ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; +} + +int +dmu_recv_end(dmu_recv_cookie_t *drc) +{ + struct recvendsyncarg resa; + dsl_dataset_t *ds = drc->drc_logical_ds; + int err; + + /* + * XXX hack; seems the ds is still dirty and + * dsl_pool_zil_clean() expects it to have a ds_user_ptr + * (and zil), but clone_swap() can close it. 
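+	 * The txg_wait_synced() below presumably lets the pending
+	 * zil_clean finish before the swap can close the objset.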
+ */ + txg_wait_synced(ds->ds_dir->dd_pool, 0); + + if (ds != drc->drc_real_ds) { + /* we are doing an online recv */ + if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { + err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, + drc->drc_force); + if (err) + dsl_dataset_disown(ds, dmu_recv_tag); + } else { + err = EBUSY; + dsl_dataset_rele(ds, dmu_recv_tag); + } + /* dsl_dataset_destroy() will disown the ds */ + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + if (err) + return (err); + } + + resa.creation_time = drc->drc_drrb->drr_creation_time; + resa.toguid = drc->drc_drrb->drr_toguid; + resa.tosnap = drc->drc_tosnap; + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_end_check, recv_end_sync, ds, &resa, 3); + if (err) { + if (drc->drc_newfs) { + ASSERT(ds == drc->drc_real_ds); + (void) dsl_dataset_destroy(ds, dmu_recv_tag); + return (err); + } else { + (void) dsl_dataset_rollback(ds, DMU_OST_NONE); + } + } + + /* release the hold from dmu_recv_begin */ + dsl_dataset_disown(ds, dmu_recv_tag); + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index 3d2bc3e47678..43bf82e7a682 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +35,7 @@ #include <sys/spa.h> #include <sys/zio.h> #include <sys/dmu_impl.h> +#include <sys/zvol.h> #define BP_SPAN_SHIFT(level, width) ((level) * (width)) @@ -261,6 +262,16 @@ advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance) return (EAGAIN); } +/* + * The traverse_callback function will call the function specified in th_func. + * In the event of an error the callee, specified by th_func, must return + * one of the following errors: + * + * EINTR - Indicates that the callee wants the traversal to + * abort immediately. + * ERESTART - The callee has acknowledged the error and would + * like to continue. + */ static int traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc) { @@ -603,7 +614,10 @@ traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) th->th_locked = 0; } - rc = traverse_read(th, bc, &dsp->ds_bp, dn); + if (BP_IS_HOLE(&dsp->ds_bp)) + rc = ERESTART; + else + rc = traverse_read(th, bc, &dsp->ds_bp, dn); if (rc != 0) { if (rc == ERESTART) @@ -722,6 +736,24 @@ traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance, } int +traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg) +{ + spa_t *spa = dmu_objset_spa(os); + traverse_handle_t *th; + int err; + + th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL); + + traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ); + + while ((err = traverse_more(th)) == EAGAIN) + continue; + + traverse_fini(th); + return (err); +} + +int traverse_more(traverse_handle_t *th) { zseg_t *zseg = list_head(&th->th_seglist); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index 13fd8d4d9dce..000c3ce64eb5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dmu_impl.h> #include <sys/dbuf.h> @@ -157,7 +155,7 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) rw_exit(&dn->dn_struct_rwlock); if (db == NULL) return (EIO); - err = dbuf_read(db, zio, DB_RF_CANFAIL); + err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); dbuf_rele(db, FTAG); return (err); } @@ -294,6 +292,8 @@ dmu_tx_count_dnode(dmu_tx_hold_t *txh) txh->txh_space_tooverwrite += space; } else { txh->txh_space_towrite += space; + if (dn && dn->dn_dbuf->db_blkptr) + txh->txh_space_tounref += space; } } @@ -318,39 +318,25 @@ dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) static void dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { - uint64_t blkid, nblks; - uint64_t space = 0; + uint64_t blkid, nblks, lastblk; + uint64_t space = 0, unref = 0, skipped = 0; dnode_t *dn = txh->txh_dnode; dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; spa_t *spa = txh->txh_tx->tx_pool->dp_spa; - int dirty; + int epbs; - /* - * We don't need to use any locking to check for dirtyness - * because it's OK if we get stale data -- the dnode may become - * dirty immediately after our check anyway. This is just a - * means to avoid the expensive count when we aren't sure we - * need it. We need to be able to deal with a dirty dnode. - */ - dirty = list_link_active(&dn->dn_dirty_link[0]) | - list_link_active(&dn->dn_dirty_link[1]) | - list_link_active(&dn->dn_dirty_link[2]) | - list_link_active(&dn->dn_dirty_link[3]); - if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0) + if (dn->dn_nlevels == 0) return; /* - * the struct_rwlock protects us against dn_phys->dn_nlevels + * The struct_rwlock protects us against dn_nlevels * changing, in case (against all odds) we manage to dirty & * sync out the changes after we check for being dirty. - * also, dbuf_hold_impl() wants us to have the struct_rwlock. - * - * It's fine to use dn_datablkshift rather than the dn_phys - * equivalent because if it is changing, maxblkid==0 and we will - * bail. + * Also, dbuf_hold_level() wants us to have the struct_rwlock. 
*/ rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_phys->dn_maxblkid == 0) { + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + if (dn->dn_maxblkid == 0) { if (off == 0 && len >= dn->dn_datablksz) { blkid = 0; nblks = 1; @@ -360,78 +346,120 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) } } else { blkid = off >> dn->dn_datablkshift; - nblks = (off + len) >> dn->dn_datablkshift; + nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; - if (blkid >= dn->dn_phys->dn_maxblkid) { + if (blkid >= dn->dn_maxblkid) { rw_exit(&dn->dn_struct_rwlock); return; } - if (blkid + nblks > dn->dn_phys->dn_maxblkid) - nblks = dn->dn_phys->dn_maxblkid - blkid; + if (blkid + nblks > dn->dn_maxblkid) + nblks = dn->dn_maxblkid - blkid; - /* don't bother after 128,000 blocks */ - nblks = MIN(nblks, 128*1024); } - - if (dn->dn_phys->dn_nlevels == 1) { + if (dn->dn_nlevels == 1) { int i; for (i = 0; i < nblks; i++) { blkptr_t *bp = dn->dn_phys->dn_blkptr; - ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr); + ASSERT3U(blkid + i, <, dn->dn_nblkptr); bp += blkid + i; if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { dprintf_bp(bp, "can free old%s", ""); space += bp_get_dasize(spa, bp); } + unref += BP_GET_ASIZE(bp); } nblks = 0; } + /* + * Add in memory requirements of higher-level indirects. + * This assumes a worst-possible scenario for dn_nlevels. + */ + { + uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs); + int level = (dn->dn_nlevels > 1) ? 2 : 1; + + while (level++ < DN_MAX_LEVELS) { + txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift; + blkcnt = 1 + (blkcnt >> epbs); + } + ASSERT(blkcnt <= dn->dn_nblkptr); + } + + lastblk = blkid + nblks - 1; while (nblks) { dmu_buf_impl_t *dbuf; - int err, epbs, blkoff, tochk; - - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - blkoff = P2PHASE(blkid, 1<<epbs); - tochk = MIN((1<<epbs) - blkoff, nblks); - - err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf); - if (err == 0) { - int i; - blkptr_t *bp; - - err = dbuf_read(dbuf, NULL, - DB_RF_HAVESTRUCT | DB_RF_CANFAIL); - if (err != 0) { - txh->txh_tx->tx_err = err; - dbuf_rele(dbuf, FTAG); - break; - } + uint64_t ibyte, new_blkid; + int epb = 1 << epbs; + int err, i, blkoff, tochk; + blkptr_t *bp; + + ibyte = blkid << dn->dn_datablkshift; + err = dnode_next_offset(dn, + DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); + new_blkid = ibyte >> dn->dn_datablkshift; + if (err == ESRCH) { + skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; + break; + } + if (err) { + txh->txh_tx->tx_err = err; + break; + } + if (new_blkid > lastblk) { + skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; + break; + } - bp = dbuf->db.db_data; - bp += blkoff; + if (new_blkid > blkid) { + ASSERT((new_blkid >> epbs) > (blkid >> epbs)); + skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1; + nblks -= new_blkid - blkid; + blkid = new_blkid; + } + blkoff = P2PHASE(blkid, epb); + tochk = MIN(epb - blkoff, nblks); - for (i = 0; i < tochk; i++) { - if (dsl_dataset_block_freeable(ds, - bp[i].blk_birth)) { - dprintf_bp(&bp[i], - "can free old%s", ""); - space += bp_get_dasize(spa, &bp[i]); - } - } + dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); + + txh->txh_memory_tohold += dbuf->db.db_size; + if (txh->txh_memory_tohold > DMU_MAX_ACCESS) { + txh->txh_tx->tx_err = E2BIG; dbuf_rele(dbuf, FTAG); + break; } - if (err && err != ENOENT) { + err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); + if (err != 0) { txh->txh_tx->tx_err = err; + dbuf_rele(dbuf, FTAG); break; } + bp = dbuf->db.db_data; + bp += 
blkoff; + + for (i = 0; i < tochk; i++) { + if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) { + dprintf_bp(&bp[i], "can free old%s", ""); + space += bp_get_dasize(spa, &bp[i]); + } + unref += BP_GET_ASIZE(bp); + } + dbuf_rele(dbuf, FTAG); + blkid += tochk; nblks -= tochk; } rw_exit(&dn->dn_struct_rwlock); + /* account for new level 1 indirect blocks that might show up */ + if (skipped > 0) { + txh->txh_fudge += skipped << dn->dn_indblkshift; + skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); + txh->txh_memory_tohold += skipped << dn->dn_indblkshift; + } txh->txh_space_tofree += space; + txh->txh_space_tounref += unref; } void @@ -466,7 +494,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) /* * For i/o error checking, read the first and last level-0 * blocks, and all the level-1 blocks. The above count_write's - * will take care of the level-0 blocks. + * have already taken care of the level-0 blocks. */ if (dn->dn_nlevels > 1) { shift = dn->dn_datablkshift + dn->dn_indblkshift - @@ -478,7 +506,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) NULL, NULL, ZIO_FLAG_CANFAIL); for (i = start; i <= end; i++) { uint64_t ibyte = i << shift; - err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0); + err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; if (err == ESRCH) break; @@ -550,10 +578,13 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) * the size will change between now and the dbuf dirty call. */ if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_phys->dn_blkptr[0].blk_birth)) + dn->dn_phys->dn_blkptr[0].blk_birth)) { txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; - else + } else { txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + txh->txh_space_tounref += + BP_GET_ASIZE(dn->dn_phys->dn_blkptr); + } return; } @@ -575,7 +606,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks */ dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz, - (3 + add ? 3 : 0) << dn->dn_datablkshift); + (3 + (add ? 3 : 0)) << dn->dn_datablkshift); /* * If the modified blocks are scattered to the four winds, @@ -698,12 +729,13 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) match_offset = TRUE; break; case THT_FREE: - if (blkid == beginblk && - (txh->txh_arg1 != 0 || - dn->dn_maxblkid == 0)) - match_offset = TRUE; - if (blkid == endblk && - txh->txh_arg2 != DMU_OBJECT_END) + /* + * We will dirty all the level 1 blocks in + * the free range and perhaps the first and + * last level 0 block. + */ + if (blkid >= beginblk && (blkid <= endblk || + txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; case THT_BONUS: @@ -733,12 +765,32 @@ static int dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) { dmu_tx_hold_t *txh; - uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite; + spa_t *spa = tx->tx_pool->dp_spa; + uint64_t memory, asize, fsize, usize; + uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; ASSERT3U(tx->tx_txg, ==, 0); + if (tx->tx_err) return (tx->tx_err); + if (spa_suspended(spa)) { + /* + * If the user has indicated a blocking failure mode + * then return ERESTART which will block in dmu_tx_wait(). + * Otherwise, return EIO so that an error can get + * propagated back to the VOP calls. + * + * Note that we always honor the txg_how flag regardless + * of the failuremode setting. 
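+		 * (So a TXG_WAIT caller still gets ERESTART, and blocks,
+		 * even when failmode=continue.)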
+ */ + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && + txg_how != TXG_WAIT) + return (EIO); + + return (ERESTART); + } + tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); tx->tx_needassign_txh = NULL; @@ -748,7 +800,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) * dmu_tx_unassign() logic. */ - towrite = tofree = tooverwrite = 0; + towrite = tofree = tooverwrite = tounref = tohold = fudge = 0; for (txh = list_head(&tx->tx_holds); txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; @@ -768,6 +820,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) towrite += txh->txh_space_towrite; tofree += txh->txh_space_tofree; tooverwrite += txh->txh_space_tooverwrite; + tounref += txh->txh_space_tounref; + tohold += txh->txh_memory_tohold; + fudge += txh->txh_fudge; } /* @@ -788,22 +843,31 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) tooverwrite = tofree = 0; } - /* - * Convert logical size to worst-case allocated size. - */ + /* needed allocation: worst-case estimate of write space */ + asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite); + /* freed space estimate: worst-case overwrite + free estimate */ fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; - lsize = towrite + tooverwrite; - asize = spa_get_asize(tx->tx_pool->dp_spa, lsize); + /* convert unrefd space to worst-case estimate */ + usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); + /* calculate memory footprint estimate */ + memory = towrite + tooverwrite + tohold; #ifdef ZFS_DEBUG - tx->tx_space_towrite = asize; + /* + * Add in 'tohold' to account for our dirty holds on this memory + * XXX - the "fudge" factor is to account for skipped blocks that + * we missed because dnode_next_offset() misses in-core-only blocks. + */ + tx->tx_space_towrite = asize + + spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge); tx->tx_space_tofree = tofree; tx->tx_space_tooverwrite = tooverwrite; + tx->tx_space_tounref = tounref; #endif if (tx->tx_dir && asize != 0) { - int err = dsl_dir_tempreserve_space(tx->tx_dir, - lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx); + int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, + asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); if (err) return (err); } @@ -885,10 +949,18 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) void dmu_tx_wait(dmu_tx_t *tx) { + spa_t *spa = tx->tx_pool->dp_spa; + ASSERT(tx->tx_txg == 0); - ASSERT(tx->tx_lasttried_txg != 0); - if (tx->tx_needassign_txh) { + /* + * It's possible that the pool has become active after this thread + * has tried to obtain a tx. If that's the case then its + * tx_lasttried_txg would not have been assigned. 
+ */ + if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { + txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1); + } else if (tx->tx_needassign_txh) { dnode_t *dn = tx->tx_needassign_txh->txh_dnode; mutex_enter(&dn->dn_mtx); @@ -948,6 +1020,7 @@ dmu_tx_commit(dmu_tx_t *tx) if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); + list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", tx->tx_space_towrite, refcount_count(&tx->tx_space_written), @@ -975,6 +1048,7 @@ dmu_tx_abort(dmu_tx_t *tx) if (dn != NULL) dnode_rele(dn, tx); } + list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG refcount_destroy_many(&tx->tx_space_written, refcount_count(&tx->tx_space_written)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c index b25cc898c37d..8dba38176527 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c @@ -38,10 +38,6 @@ */ int zfs_prefetch_disable = 0; -SYSCTL_DECL(_vfs_zfs); -TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable); -SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN, - &zfs_prefetch_disable, 0, "Disable prefetch"); /* max # of streams per zfetch */ uint32_t zfetch_max_streams = 8; @@ -52,6 +48,25 @@ uint32_t zfetch_block_cap = 256; /* number of bytes in an array_read at which we stop prefetching (1Mb) */ uint64_t zfetch_array_rd_sz = 1024 * 1024; +SYSCTL_DECL(_vfs_zfs); +TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable); +SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN, + &zfs_prefetch_disable, 0, "Disable prefetch"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH"); +TUNABLE_INT("vfs.zfs.zfetch.max_streams", &zfetch_max_streams); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RDTUN, + &zfetch_max_streams, 0, "Max # of streams per zfetch"); +TUNABLE_INT("vfs.zfs.zfetch.min_sec_reap", &zfetch_min_sec_reap); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RDTUN, + &zfetch_min_sec_reap, 0, "Min time before stream reclaim"); +TUNABLE_INT("vfs.zfs.zfetch.block_cap", &zfetch_block_cap); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, block_cap, CTLFLAG_RDTUN, + &zfetch_block_cap, 0, "Max number of blocks to fetch at a time"); +TUNABLE_QUAD("vfs.zfs.zfetch.array_rd_sz", &zfetch_array_rd_sz); +SYSCTL_QUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RDTUN, + &zfetch_array_rd_sz, 0, + "Number of bytes in an array_read at which we stop prefetching"); + /* forward decls for static routines */ static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index ca502857b1fa..5adbc3c0ff5d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/dbuf.h> #include <sys/dnode.h> @@ -242,6 +240,23 @@ free_range_compar(const void *node1, const void *node2) else return (0); } +void +dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + + dnode_setdirty(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + ASSERT3U(newsize, <=, DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t)); + dn->dn_bonuslen = newsize; + if (newsize == 0) + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; + else + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; + rw_exit(&dn->dn_struct_rwlock); +} + static void dnode_setdblksz(dnode_t *dn, int size) { @@ -285,6 +300,7 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, list_insert_head(&os->os_dnodes, dn); mutex_exit(&os->os_lock); + arc_space_consume(sizeof (dnode_t)); return (dn); } @@ -319,6 +335,7 @@ dnode_destroy(dnode_t *dn) dn->dn_bonus = NULL; } kmem_cache_free(dnode_cache, dn); + arc_space_return(sizeof (dnode_t)); } void @@ -362,6 +379,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, for (i = 0; i < TXG_SIZE; i++) { ASSERT3U(dn->dn_next_nlevels[i], ==, 0); ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); + ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); ASSERT3U(dn->dn_next_blksz[i], ==, 0); ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); @@ -389,6 +407,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dnode_setdirty(dn, tx); dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; } @@ -396,7 +415,7 @@ void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - int i; + int i, old_nblkptr; dmu_buf_impl_t *db = NULL; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); @@ -413,7 +432,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ASSERT(!list_link_active(&dn->dn_dirty_link[i])); /* clean up any unreferenced dbufs */ - (void) dnode_evict_dbufs(dn, 0); + dnode_evict_dbufs(dn); ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); /* @@ -436,38 +455,18 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, } dnode_setdblksz(dn, blocksize); dnode_setdirty(dn, tx); + dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; rw_exit(&dn->dn_struct_rwlock); - if (db) { + if (db) dbuf_rele(db, FTAG); - db = NULL; - } /* change type */ dn->dn_type = ot; - if (dn->dn_bonuslen != bonuslen) { - /* change bonus size */ - if (bonuslen == 0) - bonuslen = 1; /* XXX */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (dn->dn_bonus == NULL) - dn->dn_bonus = dbuf_create_bonus(dn); - db = dn->dn_bonus; - rw_exit(&dn->dn_struct_rwlock); - if (refcount_add(&db->db_holds, FTAG) == 1) - dnode_add_ref(dn, db); - VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); - mutex_enter(&db->db_mtx); - ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); - ASSERT(db->db.db_data != NULL); - db->db.db_size = bonuslen; - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); - } - /* change bonus size and type */ mutex_enter(&dn->dn_mtx); + old_nblkptr = dn->dn_nblkptr; dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) 
>> SPA_BLKPTRSHIFT); @@ -475,12 +474,15 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dn->dn_compress = ZIO_COMPRESS_INHERIT; ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - /* - * NB: we have to do the dbuf_rele after we've changed the - * dn_bonuslen, for the sake of dbuf_verify(). - */ - if (db) - dbuf_rele(db, FTAG); + /* XXX - for now, we can't make nblkptr smaller */ + ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr); + + /* fix up the bonus db_size if dn_nblkptr has changed */ + if (dn->dn_bonus && dn->dn_bonuslen != old_nblkptr) { + dn->dn_bonus->db.db_size = + DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); + } dn->dn_allocated_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); @@ -559,6 +561,12 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, dmu_buf_impl_t *db; dnode_t **children_dnodes; + /* + * If you are holding the spa config lock as writer, you shouldn't + * be asking the DMU to do *anything*. + */ + ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0); + if (object == 0 || object >= DN_MAX_OBJECT) return (EINVAL); @@ -602,9 +610,10 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, } if ((dn = children_dnodes[idx]) == NULL) { + dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx; dnode_t *winner; - dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx, - db, object); + + dn = dnode_create(os, dnp, db, object); winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn); if (winner != NULL) { dnode_destroy(dn); @@ -644,11 +653,22 @@ dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp) return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); } -void +/* + * Can only add a reference if there is already at least one + * reference on the dnode. Returns FALSE if unable to add a + * new reference. + */ +boolean_t dnode_add_ref(dnode_t *dn, void *tag) { - ASSERT(refcount_count(&dn->dn_holds) > 0); - (void) refcount_add(&dn->dn_holds, tag); + mutex_enter(&dn->dn_mtx); + if (refcount_is_zero(&dn->dn_holds)) { + mutex_exit(&dn->dn_mtx); + return (FALSE); + } + VERIFY(1 < refcount_add(&dn->dn_holds, tag)); + mutex_exit(&dn->dn_mtx); + return (TRUE); } void @@ -656,7 +676,9 @@ dnode_rele(dnode_t *dn, void *tag) { uint64_t refs; + mutex_enter(&dn->dn_mtx); refs = refcount_remove(&dn->dn_holds, tag); + mutex_exit(&dn->dn_mtx); /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ if (refs == 0 && dn->dn_dbuf) dbuf_rele(dn->dn_dbuf, dn); @@ -692,6 +714,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); + ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0); ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0); dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", @@ -714,7 +737,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) * dnode will hang around after we finish processing its * children. 
*/ - dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg); + VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); (void) dbuf_dirty(dn->dn_dbuf, tx); @@ -762,7 +785,7 @@ int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { dmu_buf_impl_t *db, *db_next; - int have_db0 = FALSE; + int err; if (size == 0) size = SPA_MINBLOCKSIZE; @@ -787,9 +810,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); - if (db->db_blkid == 0) { - have_db0 = TRUE; - } else if (db->db_blkid != DB_BONUS_BLKID) { + if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); goto fail; } @@ -799,12 +820,12 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) if (ibs && dn->dn_nlevels != 1) goto fail; - db = NULL; - if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) { - /* obtain the old block */ - db = dbuf_hold(dn, 0, FTAG); + /* resize the old block */ + err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); + if (err == 0) dbuf_new_size(db, size, tx); - } + else if (err != ENOENT) + goto fail; dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); @@ -813,7 +834,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) dn->dn_indblkshift = ibs; dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; } - + /* rele after we have fixed the blocksize in the dnode */ if (db) dbuf_rele(db, FTAG); @@ -825,19 +846,32 @@ fail: return (ENOTSUP); } +/* read-holding callers must not rely on the lock being continuously held */ void -dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) +dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) { uint64_t txgoff = tx->tx_txg & TXG_MASK; - int drop_struct_lock = FALSE; int epbs, new_nlevels; uint64_t sz; ASSERT(blkid != DB_BONUS_BLKID); - if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - drop_struct_lock = TRUE; + ASSERT(have_read ? + RW_READ_HELD(&dn->dn_struct_rwlock) : + RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + /* + * if we have a read-lock, check to see if we need to do any work + * before upgrading to a write-lock. 
+ */ + if (have_read) { + if (blkid <= dn->dn_maxblkid) + return; + + if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + } } if (blkid <= dn->dn_maxblkid) @@ -889,8 +923,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) } out: - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); + if (have_read) + rw_downgrade(&dn->dn_struct_rwlock); } void @@ -951,15 +985,15 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { dmu_buf_impl_t *db; uint64_t blkoff, blkid, nblks; - int blksz, head; + int blksz, blkshift, head, tail; int trunc = FALSE; + int epbs; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); blksz = dn->dn_datablksz; + blkshift = dn->dn_datablkshift; + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - /* If the range is past the end of the file, this is a no-op */ - if (off >= blksz * (dn->dn_maxblkid+1)) - goto out; if (len == -1ULL) { len = UINT64_MAX - off; trunc = TRUE; @@ -971,11 +1005,18 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (ISP2(blksz)) { head = P2NPHASE(off, blksz); blkoff = P2PHASE(off, blksz); + if ((off >> blkshift) > dn->dn_maxblkid) + goto out; } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { - /* Freeing the whole block; don't do any head. */ - head = 0; + /* Freeing the whole block; fast-track this request */ + blkid = 0; + nblks = 1; + goto done; + } else if (off >= blksz) { + /* Freeing past end-of-data */ + goto out; } else { /* Freeing part of the block. */ head = blksz - off; @@ -1008,88 +1049,95 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) } /* If the range was less than one block, we're done */ - if (len == 0 || off >= blksz * (dn->dn_maxblkid+1)) + if (len == 0) goto out; - if (!ISP2(blksz)) { - /* - * They are freeing the whole block of a - * non-power-of-two blocksize file. Skip all the messy - * math. 
- */ - ASSERT3U(off, ==, 0); - ASSERT3U(len, >=, blksz); - blkid = 0; - nblks = 1; - } else { - int tail; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - int blkshift = dn->dn_datablkshift; - - /* If the remaining range is past end of file, we're done */ - if (off > dn->dn_maxblkid << blkshift) - goto out; + /* If the remaining range is past end of file, we're done */ + if ((off >> blkshift) > dn->dn_maxblkid) + goto out; - if (off + len == UINT64_MAX) - tail = 0; - else - tail = P2PHASE(len, blksz); - - ASSERT3U(P2PHASE(off, blksz), ==, 0); - /* zero out any partial block data at the end of the range */ - if (tail) { - if (len < tail) - tail = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), - TRUE, FTAG, &db) == 0) { - /* don't dirty if not on disk and not dirty */ - if (db->db_last_dirty || - (db->db_blkptr && - !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); - dbuf_will_dirty(db, tx); - rw_enter(&dn->dn_struct_rwlock, - RW_WRITER); - bzero(db->db.db_data, tail); - } - dbuf_rele(db, FTAG); + ASSERT(ISP2(blksz)); + if (trunc) + tail = 0; + else + tail = P2PHASE(len, blksz); + + ASSERT3U(P2PHASE(off, blksz), ==, 0); + /* zero out any partial block data at the end of the range */ + if (tail) { + if (len < tail) + tail = len; + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), + TRUE, FTAG, &db) == 0) { + /* don't dirty if not on disk and not dirty */ + if (db->db_last_dirty || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { + rw_exit(&dn->dn_struct_rwlock); + dbuf_will_dirty(db, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + bzero(db->db.db_data, tail); } - len -= tail; + dbuf_rele(db, FTAG); } - /* If the range did not include a full block, we are done */ - if (len == 0) - goto out; + len -= tail; + } - /* dirty the left indirects */ - if (dn->dn_nlevels > 1 && off != 0) { - db = dbuf_hold_level(dn, 1, - (off - head) >> (blkshift + epbs), FTAG); + /* If the range did not include a full block, we are done */ + if (len == 0) + goto out; + + ASSERT(IS_P2ALIGNED(off, blksz)); + ASSERT(trunc || IS_P2ALIGNED(len, blksz)); + blkid = off >> blkshift; + nblks = len >> blkshift; + if (trunc) + nblks += 1; + + /* + * Read in and mark all the level-1 indirects dirty, + * so that they will stay in memory until syncing phase. + * Always dirty the first and last indirect to make sure + * we dirty all the partial indirects. + */ + if (dn->dn_nlevels > 1) { + uint64_t i, first, last; + int shift = epbs + dn->dn_datablkshift; + + first = blkid >> epbs; + if (db = dbuf_hold_level(dn, 1, first, FTAG)) { dbuf_will_dirty(db, tx); dbuf_rele(db, FTAG); } - - /* dirty the right indirects */ - if (dn->dn_nlevels > 1 && !trunc) { - db = dbuf_hold_level(dn, 1, - (off + len + tail - 1) >> (blkshift + epbs), FTAG); + if (trunc) + last = dn->dn_maxblkid >> epbs; + else + last = (blkid + nblks - 1) >> epbs; + if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) { dbuf_will_dirty(db, tx); dbuf_rele(db, FTAG); } - - /* - * Finally, add this range to the dnode range list, we - * will finish up this free operation in the syncing phase. - */ - ASSERT(IS_P2ALIGNED(off, 1<<blkshift)); - ASSERT(off + len == UINT64_MAX || - IS_P2ALIGNED(len, 1<<blkshift)); - blkid = off >> blkshift; - nblks = len >> blkshift; - - if (trunc) - dn->dn_maxblkid = (blkid ? 
blkid - 1 : 0); + for (i = first + 1; i < last; i++) { + uint64_t ibyte = i << shift; + int err; + + err = dnode_next_offset(dn, + DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0); + i = ibyte >> shift; + if (err == ESRCH || i >= last) + break; + ASSERT(err == 0); + db = dbuf_hold_level(dn, 1, i, FTAG); + if (db) { + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } + } } - +done: + /* + * Add this range to the dnode range list. + * We will finish up this free operation in the syncing phase. + */ mutex_enter(&dn->dn_mtx); dnode_clear_range(dn, blkid, nblks, tx); { @@ -1109,9 +1157,12 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) } mutex_exit(&dn->dn_mtx); - dbuf_free_range(dn, blkid, nblks, tx); + dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); out: + if (trunc && dn->dn_maxblkid >= (off >> blkshift)) + dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0); + rw_exit(&dn->dn_struct_rwlock); } @@ -1179,7 +1230,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta) ASSERT3U(space, >=, -delta); /* no underflow */ } space += delta; - if (spa_version(dn->dn_objset->os_spa) < ZFS_VERSION_DNODE_BYTES) { + if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0); dn->dn_phys->dn_used = space >> DEV_BSHIFT; @@ -1211,7 +1262,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) } static int -dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, +dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, int lvl, uint64_t blkfill, uint64_t txg) { dmu_buf_impl_t *db = NULL; @@ -1219,11 +1270,16 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t epb = 1ULL << epbs; uint64_t minfill, maxfill; - int i, error, span; + boolean_t hole; + int i, inc, error, span; dprintf("probing object %llu offset %llx level %d of %u\n", dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); + hole = flags & DNODE_FIND_HOLE; + inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; + ASSERT(txg == 0 || !hole); + if (lvl == dn->dn_phys->dn_nlevels) { error = 0; epb = dn->dn_phys->dn_nblkptr; @@ -1232,9 +1288,18 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); if (error) { - if (error == ENOENT) - return (hole ? 0 : ESRCH); - return (error); + if (error != ENOENT) + return (error); + if (hole) + return (0); + /* + * This can only happen when we are searching up + * the block tree for data. We don't really need to + * adjust the offset, as we will just end up looking + * at the pointer to this block in its parent, and it's + * going to be unallocated, so we will skip over it. + */ + return (ESRCH); } error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); if (error) { @@ -1246,13 +1311,18 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, if (db && txg && (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) { + /* + * This can only happen when we are searching up the tree + * and these conditions mean that we need to keep climbing. 
+ */ error = ESRCH; } else if (lvl == 0) { dnode_phys_t *dnp = data; span = DNODE_SHIFT; ASSERT(dn->dn_type == DMU_OT_DNODE); - for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) { + for (i = (*offset >> span) & (blkfill - 1); + i >= 0 && i < blkfill; i += inc) { boolean_t newcontents = B_TRUE; if (txg) { int j; @@ -1264,9 +1334,9 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, } if (!dnp[i].dn_type == hole && newcontents) break; - *offset += 1ULL << span; + *offset += (1ULL << span) * inc; } - if (i == blkfill) + if (i < 0 || i == blkfill) error = ESRCH; } else { blkptr_t *bp = data; @@ -1280,14 +1350,17 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, minfill++; for (i = (*offset >> span) & ((1ULL << epbs) - 1); - i < epb; i++) { + i >= 0 && i < epb; i += inc) { if (bp[i].blk_fill >= minfill && bp[i].blk_fill <= maxfill && - bp[i].blk_birth > txg) + (hole || bp[i].blk_birth > txg)) break; - *offset += 1ULL << span; + if (inc < 0 && *offset < (1ULL << span)) + *offset = 0; + else + *offset += (1ULL << span) * inc; } - if (i >= epb) + if (i < 0 || i == epb) error = ESRCH; } @@ -1306,64 +1379,66 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, * * Examples: * - * dnode_next_offset(dn, hole, offset, 1, 1, 0); - * Finds the next hole/data in a file. + * dnode_next_offset(dn, flags, offset, 1, 1, 0); + * Finds the next/previous hole/data in a file. * Used in dmu_offset_next(). * - * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg); + * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); * Finds the next free/allocated dnode in an objset's meta-dnode. * Only finds objects that have new contents since txg (ie. * bonus buffer changes and content removal are ignored). * Used in dmu_object_next(). * - * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0); + * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); * Finds the next L2 meta-dnode bp that's at most 1/4 full. * Used in dmu_object_alloc(). */ int -dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset, +dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { + uint64_t initial_offset = *offset; int lvl, maxlvl; int error = 0; - uint64_t initial_offset = *offset; - rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (!(flags & DNODE_FIND_HAVELOCK)) + rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_phys->dn_nlevels == 0) { - rw_exit(&dn->dn_struct_rwlock); - return (ESRCH); + error = ESRCH; + goto out; } if (dn->dn_datablkshift == 0) { if (*offset < dn->dn_datablksz) { - if (hole) + if (flags & DNODE_FIND_HOLE) *offset = dn->dn_datablksz; } else { error = ESRCH; } - rw_exit(&dn->dn_struct_rwlock); - return (error); + goto out; } maxlvl = dn->dn_phys->dn_nlevels; for (lvl = minlvl; lvl <= maxlvl; lvl++) { error = dnode_next_offset_level(dn, - hole, offset, lvl, blkfill, txg); + flags, offset, lvl, blkfill, txg); if (error != ESRCH) break; } - while (--lvl >= minlvl && error == 0) { + while (error == 0 && --lvl >= minlvl) { error = dnode_next_offset_level(dn, - hole, offset, lvl, blkfill, txg); + flags, offset, lvl, blkfill, txg); } - rw_exit(&dn->dn_struct_rwlock); - - if (error == 0 && initial_offset > *offset) + if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? 
+ initial_offset < *offset : initial_offset > *offset)) error = ESRCH; +out: + if (!(flags & DNODE_FIND_HAVELOCK)) + rw_exit(&dn->dn_struct_rwlock); return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 9e8c7adbda01..a46d4e70abc8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -55,9 +55,8 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) ASSERT(db != NULL); dn->dn_phys->dn_nlevels = new_level; - dprintf("os=%p obj=%llu, increase to %d\n", - dn->dn_objset, dn->dn_object, - dn->dn_phys->dn_nlevels); + dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, + dn->dn_object, dn->dn_phys->dn_nlevels); /* check for existing blkptrs in the dnode */ for (i = 0; i < nblkptr; i++) @@ -110,25 +109,26 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) rw_exit(&dn->dn_struct_rwlock); } -static void +static int free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) { - objset_impl_t *os = dn->dn_objset; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint64_t bytesfreed = 0; - int i; + int i, blocks_freed = 0; - dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num); + dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); for (i = 0; i < num; i++, bp++) { if (BP_IS_HOLE(bp)) continue; - bytesfreed += bp_get_dasize(os->os_spa, bp); + bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); - dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx); bzero(bp, sizeof (blkptr_t)); + blocks_freed += 1; } dnode_diduse_space(dn, -bytesfreed); + return (blocks_freed); } #ifdef ZFS_DEBUG @@ -160,7 +160,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(db->db_dnode, db->db_level-1, - (db->db_blkid << epbs) + i, TRUE, FTAG, &child); + (db->db_blkid << epbs) + i, TRUE, FTAG, &child); rw_exit(&db->db_dnode->dn_struct_rwlock); if (err == ENOENT) continue; @@ -178,7 +178,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) if (buf[j] != 0) { panic("freed data not zero: " "child=%p i=%d off=%d num=%d\n", - child, i, off, num); + (void *)child, i, off, num); } } } @@ -195,7 +195,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) if (buf[j] != 0) { panic("freed data not zero: " "child=%p i=%d off=%d num=%d\n", - child, i, off, num); + (void *)child, i, off, num); } } } @@ -206,6 +206,8 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) } #endif +#define ALL -1 + static int free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, dmu_tx_t *tx) @@ -216,8 +218,18 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, uint64_t start, end, dbstart, dbend, i; int epbs, shift, err; int all = TRUE; + int blocks_freed = 0; + + /* + * There is a small possibility that this block will not be cached: + * 1 - if level > 1 and there are no children with level <= 1 + * 2 - if we didn't get a dirty hold (because this block had just + * finished being written -- and so had no holds), and then this + * block 
got evicted before we got here. + */ + if (db->db_state != DB_CACHED) + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); arc_release(db->db_buf, db); bp = (blkptr_t *)db->db.db_data; @@ -241,10 +253,10 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, if (db->db_level == 1) { FREE_VERIFY(db, start, end, tx); - free_blocks(dn, bp, end-start+1, tx); + blocks_freed = free_blocks(dn, bp, end-start+1, tx); arc_buf_freeze(db->db_buf); - ASSERT(all || db->db_last_dirty); - return (all); + ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + return (all ? ALL : blocks_freed); } for (i = start; i <= end; i++, bp++) { @@ -255,9 +267,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, ASSERT3U(err, ==, 0); rw_exit(&dn->dn_struct_rwlock); - if (free_children(subdb, blkid, nblks, trunc, tx)) { + if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) { ASSERT3P(subdb->db_blkptr, ==, bp); - free_blocks(dn, bp, 1, tx); + blocks_freed += free_blocks(dn, bp, 1, tx); } else { all = FALSE; } @@ -274,8 +286,8 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, ASSERT3U(bp->blk_birth, ==, 0); } #endif - ASSERT(all || db->db_last_dirty); - return (all); + ASSERT(all || blocks_freed == 0 || db->db_last_dirty); + return (all ? ALL : blocks_freed); } /* @@ -305,15 +317,14 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) return; } ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); - free_blocks(dn, bp + blkid, nblks, tx); + (void) free_blocks(dn, bp + blkid, nblks, tx); if (trunc) { uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, FALSE, &off, - 1, 1, 0) != 0); + dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); } return; } @@ -331,9 +342,9 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) ASSERT3U(err, ==, 0); rw_exit(&dn->dn_struct_rwlock); - if (free_children(db, blkid, nblks, trunc, tx)) { + if (free_children(db, blkid, nblks, trunc, tx) == ALL) { ASSERT3P(db->db_blkptr, ==, bp); - free_blocks(dn, bp, 1, tx); + (void) free_blocks(dn, bp, 1, tx); } dbuf_rele(db, FTAG); } @@ -343,15 +354,15 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0); + dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); } } /* * Try to kick all the dnode's dbufs out of the cache... 
*/ -int -dnode_evict_dbufs(dnode_t *dn, int try) +void +dnode_evict_dbufs(dnode_t *dn) { int progress; int pass = 0; @@ -367,6 +378,7 @@ dnode_evict_dbufs(dnode_t *dn, int try) for (; db != ▮ db = list_head(&dn->dn_dbufs)) { list_remove(&dn->dn_dbufs, db); list_insert_tail(&dn->dn_dbufs, db); + ASSERT3P(db->db_dnode, ==, dn); mutex_enter(&db->db_mtx); if (db->db_state == DB_EVICTING) { @@ -375,7 +387,6 @@ dnode_evict_dbufs(dnode_t *dn, int try) mutex_exit(&db->db_mtx); } else if (refcount_is_zero(&db->db_holds)) { progress = TRUE; - ASSERT(!arc_released(db->db_buf)); dbuf_clear(db); /* exits db_mtx for us */ } else { mutex_exit(&db->db_mtx); @@ -397,21 +408,6 @@ dnode_evict_dbufs(dnode_t *dn, int try) ASSERT(pass < 100); /* sanity check */ } while (progress); - /* - * This function works fine even if it can't evict everything. - * If were only asked to try to evict everything then - * return an error if we can't. Otherwise panic as the caller - * expects total eviction. - */ - if (list_head(&dn->dn_dbufs) != NULL) { - if (try) { - return (1); - } else { - panic("dangling dbufs (dn=%p, dbuf=%p)\n", - dn, list_head(&dn->dn_dbufs)); - } - } - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); @@ -419,7 +415,6 @@ dnode_evict_dbufs(dnode_t *dn, int try) dn->dn_bonus = NULL; } rw_exit(&dn->dn_struct_rwlock); - return (0); } static void @@ -460,8 +455,15 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); + /* + * Our contents should have been freed in dnode_sync() by the + * free range record inserted by the caller of dnode_free(). + */ + ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0); + ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr)); + dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); - (void) dnode_evict_dbufs(dn, 0); + dnode_evict_dbufs(dn); ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); /* @@ -479,10 +481,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dn->dn_next_indblkshift[txgoff] = 0; dn->dn_next_blksz[txgoff] = 0; - /* free up all the blocks in the file. 
*/ - dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx); - ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0); - /* ASSERT(blkptrs are zero); */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ASSERT(dn->dn_type != DMU_OT_NONE); @@ -496,6 +494,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dn->dn_type = DMU_OT_NONE; dn->dn_maxblkid = 0; dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; mutex_exit(&dn->dn_mtx); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); @@ -558,7 +557,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], SPA_MINBLOCKSIZE) == 0); ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || - list_head(list) != NULL || + dn->dn_maxblkid == 0 || list_head(list) != NULL || dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == dnp->dn_datablkszsec); dnp->dn_datablkszsec = @@ -566,6 +565,15 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dn->dn_next_blksz[txgoff] = 0; } + if (dn->dn_next_bonuslen[txgoff]) { + if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN) + dnp->dn_bonuslen = 0; + else + dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; + ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN); + dn->dn_next_bonuslen[txgoff] = 0; + } + if (dn->dn_next_indblkshift[txgoff]) { ASSERT(dnp->dn_nlevels == 1); dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; @@ -583,20 +591,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); /* process all the "freed" ranges in the file */ - if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) { - for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL; - rp = AVL_PREV(&dn->dn_ranges[txgoff], rp)) - dnode_sync_free_range(dn, - rp->fr_blkid, rp->fr_nblks, tx); + while (rp = avl_last(&dn->dn_ranges[txgoff])) { + dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx); + /* grab the mutex so we don't race with dnode_block_freed() */ + mutex_enter(&dn->dn_mtx); + avl_remove(&dn->dn_ranges[txgoff], rp); + mutex_exit(&dn->dn_mtx); + kmem_free(rp, sizeof (free_range_t)); } - mutex_enter(&dn->dn_mtx); - for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) { - free_range_t *last = rp; - rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp); - avl_remove(&dn->dn_ranges[txgoff], last); - kmem_free(last, sizeof (free_range_t)); - } - mutex_exit(&dn->dn_mtx); if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) { dnode_sync_free(dn, tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index 7d4689f3352a..20d8ec85cc91 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu_objset.h> #include <sys/dsl_dataset.h> #include <sys/dsl_dir.h> @@ -38,35 +36,44 @@ #include <sys/unique.h> #include <sys/zfs_context.h> #include <sys/zfs_ioctl.h> +#include <sys/spa.h> +#include <sys/zfs_znode.h> +#include <sys/sunddi.h> + +static char *dsl_reaper = "the grim reaper"; static dsl_checkfunc_t dsl_dataset_destroy_begin_check; static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; static dsl_checkfunc_t dsl_dataset_rollback_check; static dsl_syncfunc_t dsl_dataset_rollback_sync; -static dsl_checkfunc_t dsl_dataset_destroy_check; -static dsl_syncfunc_t dsl_dataset_destroy_sync; +static dsl_syncfunc_t dsl_dataset_set_reservation_sync; #define DS_REF_MAX (1ULL << 62) #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE +#define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) + + /* - * We use weighted reference counts to express the various forms of exclusion - * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open - * is DS_REF_MAX, and a PRIMARY open is little more than half of an EXCLUSIVE. - * This makes the exclusion logic simple: the total refcnt for all opens cannot - * exceed DS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their - * weight (DS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume - * just over half of the refcnt space, so there can't be more than one, but it - * can peacefully coexist with any number of STANDARD opens. + * Figure out how much of this delta should be propagated to the dsl_dir + * layer. If there's a refreservation, that space has already been + * partially accounted for in our ancestors. */ -static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = { - 0, /* DS_MODE_NONE - invalid */ - 1, /* DS_MODE_STANDARD - unlimited number */ - (DS_REF_MAX >> 1) + 1, /* DS_MODE_PRIMARY - only one of these */ - DS_REF_MAX /* DS_MODE_EXCLUSIVE - no other opens */ -}; +static int64_t +parent_delta(dsl_dataset_t *ds, int64_t delta) +{ + uint64_t old_bytes, new_bytes; + if (ds->ds_reserved == 0) + return (delta); + + old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); + new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved); + + ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); + return (new_bytes - old_bytes); +} void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) @@ -74,6 +81,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); + int64_t delta; dprintf_bp(bp, "born, ds=%p\n", ds); @@ -89,23 +97,28 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) * dsl_dir. 
*/ ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, + dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, used, compressed, uncompressed, tx); dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); return; } dmu_buf_will_dirty(ds->ds_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); + delta = parent_delta(ds, used); ds->ds_phys->ds_used_bytes += used; ds->ds_phys->ds_compressed_bytes += compressed; ds->ds_phys->ds_uncompressed_bytes += uncompressed; ds->ds_phys->ds_unique_bytes += used; mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, - used, compressed, uncompressed, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, + compressed, uncompressed, tx); + dsl_dir_transfer_space(ds->ds_dir, used - delta, + DD_USED_REFRSRV, DD_USED_HEAD, tx); + mutex_exit(&ds->ds_dir->dd_lock); } -void +int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, dmu_tx_t *tx) { @@ -113,10 +126,11 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); + ASSERT(pio != NULL); ASSERT(dmu_tx_is_syncing(tx)); /* No block pointer => nothing to free */ if (BP_IS_HOLE(bp)) - return; + return (0); ASSERT(used > 0); if (ds == NULL) { @@ -125,51 +139,59 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, * Account for the meta-objset space in its placeholder * dataset. */ - err = arc_free(pio, tx->tx_pool->dp_spa, - tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT); + err = dsl_free(pio, tx->tx_pool, + tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); ASSERT(err == 0); - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, + dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, -used, -compressed, -uncompressed, tx); dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); - return; + return (used); } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); + ASSERT(!dsl_dataset_is_snapshot(ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { int err; + int64_t delta; dprintf_bp(bp, "freeing: %s", ""); - err = arc_free(pio, tx->tx_pool->dp_spa, - tx->tx_txg, bp, NULL, NULL, pio ? 
ARC_NOWAIT: ARC_WAIT); + err = dsl_free(pio, tx->tx_pool, + tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); ASSERT(err == 0); + mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); - /* XXX unique_bytes is not accurate for head datasets */ - /* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */ + ASSERT(ds->ds_phys->ds_unique_bytes >= used || + !DS_UNIQUE_IS_ACCURATE(ds)); + delta = parent_delta(ds, -used); ds->ds_phys->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, - -used, -compressed, -uncompressed, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + delta, -compressed, -uncompressed, tx); + dsl_dir_transfer_space(ds->ds_dir, -used - delta, + DD_USED_REFRSRV, DD_USED_HEAD, tx); + mutex_exit(&ds->ds_dir->dd_lock); } else { dprintf_bp(bp, "putting on dead list: %s", ""); VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); + ASSERT3U(ds->ds_prev->ds_object, ==, + ds->ds_phys->ds_prev_snap_obj); + ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ - if (ds->ds_phys->ds_prev_snap_obj != 0) { - ASSERT3U(ds->ds_prev->ds_object, ==, - ds->ds_phys->ds_prev_snap_obj); - ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); - if (ds->ds_prev->ds_phys->ds_next_snap_obj == - ds->ds_object && bp->blk_birth > - ds->ds_prev->ds_phys->ds_prev_snap_txg) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - mutex_enter(&ds->ds_prev->ds_lock); - ds->ds_prev->ds_phys->ds_unique_bytes += - used; - mutex_exit(&ds->ds_prev->ds_lock); - } + if (ds->ds_prev->ds_phys->ds_next_snap_obj == + ds->ds_object && bp->blk_birth > + ds->ds_prev->ds_phys->ds_prev_snap_txg) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + mutex_enter(&ds->ds_prev->ds_lock); + ds->ds_prev->ds_phys->ds_unique_bytes += used; + mutex_exit(&ds->ds_prev->ds_lock); + } + if (bp->blk_birth > ds->ds_origin_txg) { + dsl_dir_transfer_space(ds->ds_dir, used, + DD_USED_HEAD, DD_USED_SNAP, tx); } } mutex_enter(&ds->ds_lock); @@ -180,6 +202,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); ds->ds_phys->ds_uncompressed_bytes -= uncompressed; mutex_exit(&ds->ds_lock); + + return (used); } uint64_t @@ -216,32 +240,38 @@ static void dsl_dataset_evict(dmu_buf_t *db, void *dsv) { dsl_dataset_t *ds = dsv; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - /* open_refcount == DS_REF_MAX when deleting */ - ASSERT(ds->ds_open_refcount == 0 || - ds->ds_open_refcount == DS_REF_MAX); + ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); dprintf_ds(ds, "evicting %s\n", ""); - unique_remove(ds->ds_phys->ds_fsid_guid); + unique_remove(ds->ds_fsid_guid); if (ds->ds_user_ptr != NULL) ds->ds_user_evict_func(ds, ds->ds_user_ptr); if (ds->ds_prev) { - dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds); + dsl_dataset_drop_ref(ds->ds_prev, ds); ds->ds_prev = NULL; } bplist_close(&ds->ds_deadlist); - dsl_dir_close(ds->ds_dir, ds); + if (ds->ds_dir) + dsl_dir_close(ds->ds_dir, ds); - if (list_link_active(&ds->ds_synced_link)) - list_remove(&dp->dp_synced_objsets, ds); + ASSERT(!list_link_active(&ds->ds_synced_link)); + if (mutex_owned(&ds->ds_lock)) + mutex_exit(&ds->ds_lock); mutex_destroy(&ds->ds_lock); + if (mutex_owned(&ds->ds_opening_lock)) + mutex_exit(&ds->ds_opening_lock); + mutex_destroy(&ds->ds_opening_lock); + if (mutex_owned(&ds->ds_deadlist.bpl_lock)) + mutex_exit(&ds->ds_deadlist.bpl_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); + rw_destroy(&ds->ds_rwlock); + 
cv_destroy(&ds->ds_exclusive_cv); kmem_free(ds, sizeof (dsl_dataset_t)); } @@ -266,16 +296,54 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds) return (err); headphys = headdbuf->db_data; err = zap_value_search(dp->dp_meta_objset, - headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname); + headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); dmu_buf_rele(headdbuf, FTAG); return (err); } -int -dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, - int mode, void *tag, dsl_dataset_t **dsp) +static int +dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + matchtype_t mt; + int err; + + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_FIRST; + else + mt = MT_EXACT; + + err = zap_lookup_norm(mos, snapobj, name, 8, 1, + value, mt, NULL, 0, NULL); + if (err == ENOTSUP && mt == MT_FIRST) + err = zap_lookup(mos, snapobj, name, 8, 1, value); + return (err); +} + +static int +dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + matchtype_t mt; + int err; + + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_FIRST; + else + mt = MT_EXACT; + + err = zap_remove_norm(mos, snapobj, name, mt, tx); + if (err == ENOTSUP && mt == MT_FIRST) + err = zap_remove(mos, snapobj, name, tx); + return (err); +} + +static int +dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, + dsl_dataset_t **dsp) { - uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)]; objset_t *mos = dp->dp_meta_objset; dmu_buf_t *dbuf; dsl_dataset_t *ds; @@ -297,8 +365,11 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, ds->ds_phys = dbuf->db_data; mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&ds->ds_rwlock, 0, 0, 0); + cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); err = bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); @@ -312,42 +383,65 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, * just opened it. 
*/ mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); + rw_destroy(&ds->ds_rwlock); + cv_destroy(&ds->ds_exclusive_cv); kmem_free(ds, sizeof (dsl_dataset_t)); dmu_buf_rele(dbuf, tag); return (err); } - if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) { + if (!dsl_dataset_is_snapshot(ds)) { ds->ds_snapname[0] = '\0'; if (ds->ds_phys->ds_prev_snap_obj) { - err = dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_NONE, ds, &ds->ds_prev); + err = dsl_dataset_get_ref(dp, + ds->ds_phys->ds_prev_snap_obj, + ds, &ds->ds_prev); } - } else { - if (snapname) { -#ifdef ZFS_DEBUG - dsl_dataset_phys_t *headphys; - dmu_buf_t *headdbuf; - err = dmu_bonus_hold(mos, - ds->ds_dir->dd_phys->dd_head_dataset_obj, - FTAG, &headdbuf); + + if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) { + dsl_dataset_t *origin; + + err = dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, + FTAG, &origin); if (err == 0) { - headphys = headdbuf->db_data; - uint64_t foundobj; - err = zap_lookup(dp->dp_meta_objset, - headphys->ds_snapnames_zapobj, - snapname, sizeof (foundobj), 1, - &foundobj); - ASSERT3U(foundobj, ==, dsobj); - dmu_buf_rele(headdbuf, FTAG); + ds->ds_origin_txg = + origin->ds_phys->ds_creation_txg; + dsl_dataset_rele(origin, FTAG); } -#endif - (void) strcat(ds->ds_snapname, snapname); - } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { - err = dsl_dataset_get_snapname(ds); } + } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { + err = dsl_dataset_get_snapname(ds); + } + + if (err == 0 && !dsl_dataset_is_snapshot(ds)) { + /* + * In sync context, we're called with either no lock + * or with the write lock. If we're not syncing, + * we're always called with the read lock held. + */ + boolean_t need_lock = + !RW_WRITE_HELD(&dp->dp_config_rwlock) && + dsl_pool_sync_context(dp); + + if (need_lock) + rw_enter(&dp->dp_config_rwlock, RW_READER); + + err = dsl_prop_get_ds(ds, + "refreservation", sizeof (uint64_t), 1, + &ds->ds_reserved, NULL); + if (err == 0) { + err = dsl_prop_get_ds(ds, + "refquota", sizeof (uint64_t), 1, + &ds->ds_quota, NULL); + } + + if (need_lock) + rw_exit(&dp->dp_config_rwlock); + } else { + ds->ds_reserved = ds->ds_quota = 0; } if (err == 0) { @@ -356,13 +450,14 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, } if (err || winner) { bplist_close(&ds->ds_deadlist); - if (ds->ds_prev) { - dsl_dataset_close(ds->ds_prev, - DS_MODE_NONE, ds); - } + if (ds->ds_prev) + dsl_dataset_drop_ref(ds->ds_prev, ds); dsl_dir_close(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_deadlist.bpl_lock); + rw_destroy(&ds->ds_rwlock); + cv_destroy(&ds->ds_exclusive_cv); kmem_free(ds, sizeof (dsl_dataset_t)); if (err) { dmu_buf_rele(dbuf, tag); @@ -370,101 +465,175 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, } ds = winner; } else { - uint64_t new = + ds->ds_fsid_guid = unique_insert(ds->ds_phys->ds_fsid_guid); - if (new != ds->ds_phys->ds_fsid_guid) { - /* XXX it won't necessarily be synced... 
*/ - ds->ds_phys->ds_fsid_guid = new; - } } } ASSERT3P(ds->ds_dbuf, ==, dbuf); ASSERT3P(ds->ds_phys, ==, dbuf->db_data); - + ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || + spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || + dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); mutex_enter(&ds->ds_lock); - if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY && - (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) && - !DS_MODE_IS_INCONSISTENT(mode)) || - (ds->ds_open_refcount + weight > DS_REF_MAX)) { + if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) { mutex_exit(&ds->ds_lock); - dsl_dataset_close(ds, DS_MODE_NONE, tag); - return (EBUSY); + dmu_buf_rele(ds->ds_dbuf, tag); + return (ENOENT); } - ds->ds_open_refcount += weight; mutex_exit(&ds->ds_lock); - *dsp = ds; return (0); } +static int +dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + /* + * In syncing context we don't want the rwlock: there + * may be an existing writer waiting for sync phase to + * finish. We don't need to worry about such writers, since + * sync phase is single-threaded, so the writer can't be + * doing anything while we are active. + */ + if (dsl_pool_sync_context(dp)) { + ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); + return (0); + } + + /* + * Normal users will hold the ds_rwlock as a READER until they + * are finished (i.e., call dsl_dataset_rele()). "Owners" will + * drop their READER lock after they set the ds_owner field. + * + * If the dataset is being destroyed, the destroy thread will + * obtain a WRITER lock for exclusive access after it's done its + * open-context work and then change the ds_owner to + * dsl_reaper once destruction is assured. So threads + * may block here temporarily, until the "destructibility" of + * the dataset is determined. 
+ */ + ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); + mutex_enter(&ds->ds_lock); + while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { + rw_exit(&dp->dp_config_rwlock); + cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); + if (DSL_DATASET_IS_DESTROYED(ds)) { + mutex_exit(&ds->ds_lock); + dsl_dataset_drop_ref(ds, tag); + rw_enter(&dp->dp_config_rwlock, RW_READER); + return (ENOENT); + } + rw_enter(&dp->dp_config_rwlock, RW_READER); + } + mutex_exit(&ds->ds_lock); + return (0); +} + +int +dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, + dsl_dataset_t **dsp) +{ + int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); + + if (err) + return (err); + return (dsl_dataset_hold_ref(*dsp, tag)); +} + int -dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, - void *tag, dsl_dataset_t **dsp) +dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner, + dsl_dataset_t **dsp) +{ + int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp); + + ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER); + + if (err) + return (err); + if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { + dsl_dataset_rele(*dsp, owner); + return (EBUSY); + } + return (0); +} + +int +dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) { dsl_dir_t *dd; dsl_pool_t *dp; - const char *tail; + const char *snapname; uint64_t obj; - dsl_dataset_t *ds = NULL; int err = 0; - err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail); + err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); if (err) return (err); dp = dd->dd_pool; obj = dd->dd_phys->dd_head_dataset_obj; rw_enter(&dp->dp_config_rwlock, RW_READER); - if (obj == 0) { - /* A dataset with no associated objset */ + if (obj) + err = dsl_dataset_get_ref(dp, obj, tag, dsp); + else err = ENOENT; + if (err) goto out; - } - if (tail != NULL) { - objset_t *mos = dp->dp_meta_objset; + err = dsl_dataset_hold_ref(*dsp, tag); - err = dsl_dataset_open_obj(dp, obj, NULL, - DS_MODE_NONE, tag, &ds); - if (err) - goto out; - obj = ds->ds_phys->ds_snapnames_zapobj; - dsl_dataset_close(ds, DS_MODE_NONE, tag); - ds = NULL; + /* we may be looking for a snapshot */ + if (err == 0 && snapname != NULL) { + dsl_dataset_t *ds = NULL; - if (tail[0] != '@') { + if (*snapname++ != '@') { + dsl_dataset_rele(*dsp, tag); err = ENOENT; goto out; } - tail++; - /* Look for a snapshot */ - if (!DS_MODE_IS_READONLY(mode)) { - err = EROFS; - goto out; + dprintf("looking for snapshot '%s'\n", snapname); + err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); + if (err == 0) + err = dsl_dataset_get_ref(dp, obj, tag, &ds); + dsl_dataset_rele(*dsp, tag); + + ASSERT3U((err == 0), ==, (ds != NULL)); + + if (ds) { + mutex_enter(&ds->ds_lock); + if (ds->ds_snapname[0] == 0) + (void) strlcpy(ds->ds_snapname, snapname, + sizeof (ds->ds_snapname)); + mutex_exit(&ds->ds_lock); + err = dsl_dataset_hold_ref(ds, tag); + *dsp = err ? 
NULL : ds; } - dprintf("looking for snapshot '%s'\n", tail); - err = zap_lookup(mos, obj, tail, 8, 1, &obj); - if (err) - goto out; } - err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds); - out: rw_exit(&dp->dp_config_rwlock); dsl_dir_close(dd, FTAG); - - ASSERT3U((err == 0), ==, (ds != NULL)); - /* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */ - - *dsp = ds; return (err); } int -dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp) +dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp) { - return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp)); + int err = dsl_dataset_hold(name, owner, dsp); + if (err) + return (err); + if ((*dsp)->ds_phys->ds_num_children > 0 && + !DS_MODE_IS_READONLY(flags)) { + dsl_dataset_rele(*dsp, owner); + return (EROFS); + } + if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { + dsl_dataset_rele(*dsp, owner); + return (EBUSY); + } + return (0); } void @@ -477,11 +646,11 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name) VERIFY(0 == dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { (void) strcat(name, "@"); + /* + * We use a "recursive" mutex so that we + * can call dprintf_ds() with ds_lock held. + */ if (!MUTEX_HELD(&ds->ds_lock)) { - /* - * We use a "recursive" mutex so that we - * can call dprintf_ds() with ds_lock held. - */ mutex_enter(&ds->ds_lock); (void) strcat(name, ds->ds_snapname); mutex_exit(&ds->ds_lock); @@ -505,7 +674,6 @@ dsl_dataset_namelen(dsl_dataset_t *ds) if (ds->ds_snapname[0]) { ++result; /* adding one for the @-sign */ if (!MUTEX_HELD(&ds->ds_lock)) { - /* see dsl_datset_name */ mutex_enter(&ds->ds_lock); result += strlen(ds->ds_snapname); mutex_exit(&ds->ds_lock); @@ -519,119 +687,160 @@ dsl_dataset_namelen(dsl_dataset_t *ds) } void -dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag) +dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) { - uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)]; + dmu_buf_rele(ds->ds_dbuf, tag); +} + +void +dsl_dataset_rele(dsl_dataset_t *ds, void *tag) +{ + if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { + rw_exit(&ds->ds_rwlock); + } + dsl_dataset_drop_ref(ds, tag); +} + +void +dsl_dataset_disown(dsl_dataset_t *ds, void *owner) +{ + ASSERT((ds->ds_owner == owner && ds->ds_dbuf) || + (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); + mutex_enter(&ds->ds_lock); - ASSERT3U(ds->ds_open_refcount, >=, weight); - ds->ds_open_refcount -= weight; - dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n", - mode, ds->ds_open_refcount); + ds->ds_owner = NULL; + if (RW_WRITE_HELD(&ds->ds_rwlock)) { + rw_exit(&ds->ds_rwlock); + cv_broadcast(&ds->ds_exclusive_cv); + } mutex_exit(&ds->ds_lock); + if (ds->ds_dbuf) + dsl_dataset_drop_ref(ds, owner); + else + dsl_dataset_evict(ds->ds_dbuf, ds); +} - dmu_buf_rele(ds->ds_dbuf, tag); +boolean_t +dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner) +{ + boolean_t gotit = FALSE; + + mutex_enter(&ds->ds_lock); + if (ds->ds_owner == NULL && + (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { + ds->ds_owner = owner; + if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) + rw_exit(&ds->ds_rwlock); + gotit = TRUE; + } + mutex_exit(&ds->ds_lock); + return (gotit); } void -dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx) +dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) { - objset_t *mos = dp->dp_meta_objset; + ASSERT3P(owner, ==, ds->ds_owner); + if (!RW_WRITE_HELD(&ds->ds_rwlock)) + rw_enter(&ds->ds_rwlock, RW_WRITER); +} 
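The functions above complete the new reference model that replaces the weighted-refcount open modes: shared access takes a hold (dsl_dataset_hold/dsl_dataset_rele), while exclusive use registers an owner (dsl_dataset_own/dsl_dataset_disown), with dsl_dataset_tryown() failing rather than blocking when another owner exists. As a reading aid, here is a minimal sketch of the intended calling pattern against the signatures added in this change; the wrapper function itself and the flags value 0 are illustrative assumptions, not code from this commit:

/*
 * Hypothetical caller of the hold/own API introduced above.
 * Holds give shared, read-mostly access; owning is required for
 * operations that must exclude other owners (e.g. destroy, rollback).
 */
static int
example_ds_usage(const char *name)
{
	dsl_dataset_t *ds;
	int err;

	/* Shared access: take a hold, inspect, release. */
	err = dsl_dataset_hold(name, FTAG, &ds);
	if (err)
		return (err);
	/* ... read ds->ds_phys fields under the hold ... */
	dsl_dataset_rele(ds, FTAG);

	/*
	 * Exclusive access: become the owner. dsl_dataset_tryown(),
	 * called from dsl_dataset_own(), returns EBUSY if the dataset
	 * already has an owner. Passing 0 (no DS_MODE_* bits set) is
	 * an assumption made for this sketch.
	 */
	err = dsl_dataset_own(name, 0, FTAG, &ds);
	if (err)
		return (err);
	/* ... operations requiring ownership ... */
	dsl_dataset_disown(ds, FTAG);
	return (0);
}

Note that an owner still holds the dataset (the owner tag doubles as the hold tag), which is why dsl_dataset_disown() drops the underlying reference itself rather than requiring a separate dsl_dataset_rele().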
+ +uint64_t +dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, + uint64_t flags, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dd->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; - dsl_dataset_t *ds; uint64_t dsobj; - dsl_dir_t *dd; + objset_t *mos = dp->dp_meta_objset; - dsl_dir_create_root(mos, ddobjp, tx); - VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd)); + if (origin == NULL) + origin = dp->dp_origin_snap; + + ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); + ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; + bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = dd->dd_object; + dsphys->ds_flags = flags; dsphys->ds_fsid_guid = unique_create(); - unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); dsphys->ds_snapnames_zapobj = - zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); + zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, + DMU_OT_NONE, 0, tx); dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = tx->tx_txg; + dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; dsphys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + + if (origin) { + dsphys->ds_prev_snap_obj = origin->ds_object; + dsphys->ds_prev_snap_txg = + origin->ds_phys->ds_creation_txg; + dsphys->ds_used_bytes = + origin->ds_phys->ds_used_bytes; + dsphys->ds_compressed_bytes = + origin->ds_phys->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = + origin->ds_phys->ds_uncompressed_bytes; + dsphys->ds_bp = origin->ds_phys->ds_bp; + dsphys->ds_flags |= origin->ds_phys->ds_flags; + + dmu_buf_will_dirty(origin->ds_dbuf, tx); + origin->ds_phys->ds_num_children++; + + if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { + if (origin->ds_phys->ds_next_clones_obj == 0) { + origin->ds_phys->ds_next_clones_obj = + zap_create(mos, + DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY(0 == zap_add_int(mos, + origin->ds_phys->ds_next_clones_obj, + dsobj, tx)); + } + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dd->dd_phys->dd_origin_obj = origin->ds_object; + } + + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_head_dataset_obj = dsobj; - dsl_dir_close(dd, FTAG); - VERIFY(0 == - dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds)); - (void) dmu_objset_create_impl(dp->dp_spa, ds, - &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + return (dsobj); } uint64_t -dsl_dataset_create_sync(dsl_dir_t *pdd, - const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx) +dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, + dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) { dsl_pool_t *dp = pdd->dd_pool; - dmu_buf_t *dbuf; - dsl_dataset_phys_t *dsphys; uint64_t dsobj, ddobj; - objset_t *mos = dp->dp_meta_objset; dsl_dir_t *dd; - ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp); - ASSERT(clone_parent == NULL || - clone_parent->ds_phys->ds_num_children > 0); ASSERT(lastname[0] != '@'); - 
ASSERT(dmu_tx_is_syncing(tx)); - ddobj = dsl_dir_create_sync(pdd, lastname, tx); + ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); - dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, - DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; - dsphys->ds_dir_obj = dd->dd_object; - dsphys->ds_fsid_guid = unique_create(); - unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ - (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, - sizeof (dsphys->ds_guid)); - dsphys->ds_snapnames_zapobj = - zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); - dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = tx->tx_txg; - dsphys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - if (clone_parent) { - dsphys->ds_prev_snap_obj = clone_parent->ds_object; - dsphys->ds_prev_snap_txg = - clone_parent->ds_phys->ds_creation_txg; - dsphys->ds_used_bytes = - clone_parent->ds_phys->ds_used_bytes; - dsphys->ds_compressed_bytes = - clone_parent->ds_phys->ds_compressed_bytes; - dsphys->ds_uncompressed_bytes = - clone_parent->ds_phys->ds_uncompressed_bytes; - dsphys->ds_bp = clone_parent->ds_phys->ds_bp; + dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); - dmu_buf_will_dirty(clone_parent->ds_dbuf, tx); - clone_parent->ds_phys->ds_num_children++; + dsl_deleg_set_create_perms(dd, tx, cr); - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object; - } - dmu_buf_rele(dbuf, FTAG); - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_head_dataset_obj = dsobj; dsl_dir_close(dd, FTAG); return (dsobj); @@ -653,21 +862,24 @@ dsl_snapshot_destroy_one(char *name, void *arg) (void) strcat(name, "@"); (void) strcat(name, da->snapname); - err = dsl_dataset_open(name, - DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, + err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT, da->dstg, &ds); cp = strchr(name, '@'); *cp = '\0'; - if (err == ENOENT) - return (0); - if (err) { + if (err == 0) { + dsl_dataset_make_exclusive(ds, da->dstg); + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } + dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, ds, da->dstg, 0); + } else if (err == ENOENT) { + err = 0; + } else { (void) strcpy(da->failed, name); - return (err); } - - dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, da->dstg, 0); - return (0); + return (err); } /* @@ -681,16 +893,8 @@ dsl_snapshots_destroy(char *fsname, char *snapname) struct destroyarg da; dsl_sync_task_t *dst; spa_t *spa; - char *cp; - cp = strchr(fsname, '/'); - if (cp) { - *cp = '\0'; - err = spa_open(fsname, &spa, FTAG); - *cp = '/'; - } else { - err = spa_open(fsname, &spa, FTAG); - } + err = spa_open(fsname, &spa, FTAG); if (err) return (err); da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); @@ -706,17 +910,14 @@ dsl_snapshots_destroy(char *fsname, char *snapname) for (dst = list_head(&da.dstg->dstg_tasks); dst; dst = list_next(&da.dstg->dstg_tasks, dst)) { dsl_dataset_t *ds = dst->dst_arg1; + /* + * Return the file system name that triggered the error + */ if (dst->dst_err) { dsl_dataset_name(ds, fsname); - cp = strchr(fsname, '@'); - *cp = '\0'; + *strchr(fsname, '@') = '\0'; } - /* - * If it was successful, 
destroy_sync would have - * closed the ds - */ - if (err) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, da.dstg); + dsl_dataset_disown(ds, da.dstg); } dsl_sync_task_group_destroy(da.dstg); @@ -724,36 +925,33 @@ dsl_snapshots_destroy(char *fsname, char *snapname) return (err); } +/* + * ds must be opened as OWNER. On return (whether successful or not), + * ds will be closed and caller can no longer dereference it. + */ int -dsl_dataset_destroy(const char *name) +dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) { int err; dsl_sync_task_group_t *dstg; objset_t *os; - dsl_dataset_t *ds; dsl_dir_t *dd; uint64_t obj; - if (strchr(name, '@')) { + if (dsl_dataset_is_snapshot(ds)) { /* Destroying a snapshot is simpler */ - err = dsl_dataset_open(name, - DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, - FTAG, &ds); - if (err) - return (err); + dsl_dataset_make_exclusive(ds, tag); + + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, - ds, FTAG, 0); - if (err) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - return (err); + ds, tag, 0); + goto out; } - err = dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os); - if (err) - return (err); - ds = os->os->os_dsl_dataset; dd = ds->ds_dir; /* @@ -762,10 +960,12 @@ dsl_dataset_destroy(const char *name) */ err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, dsl_dataset_destroy_begin_sync, ds, NULL, 0); - if (err) { - dmu_objset_close(os); - return (err); - } + if (err) + goto out; + + err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os); + if (err) + goto out; /* * remove the objects in open context, so that we won't @@ -773,66 +973,73 @@ dsl_dataset_destroy(const char *name) */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, ds->ds_phys->ds_prev_snap_txg)) { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END); - dmu_tx_hold_bonus(tx, obj); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - /* - * Perhaps there is not enough disk - * space. Just deal with it from - * dsl_dataset_destroy_sync(). - */ - dmu_tx_abort(tx); - continue; - } - VERIFY(0 == dmu_object_free(os, obj, tx)); - dmu_tx_commit(tx); + /* + * Ignore errors, if there is not enough disk space + * we will deal with it in dsl_dataset_destroy_sync(). + */ + (void) dmu_free_object(os, obj); } - /* Make sure it's not dirty before we finish destroying it. */ - txg_wait_synced(dd->dd_pool, 0); dmu_objset_close(os); if (err != ESRCH) - return (err); + goto out; + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); + rw_exit(&dd->dd_pool->dp_config_rwlock); - err = dsl_dataset_open(name, - DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT, - FTAG, &ds); if (err) - return (err); + goto out; - err = dsl_dir_open(name, FTAG, &dd, NULL); - if (err) { - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - return (err); + if (ds->ds_user_ptr) { + /* + * We need to sync out all in-flight IO before we try + * to evict (the dataset evict func is trying to clear + * the cached entries for this dataset in the ARC). + */ + txg_wait_synced(dd->dd_pool, 0); } /* * Blow away the dsl_dir + head dataset. 
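 * (By this point dsl_dataset_destroy_begin_sync() has marked the dataset inconsistent on disk and the dmu_free_object() loop above has reclaimed its objects in open context, so the sync tasks below only need to remove the dsl_dir and the head dataset themselves.)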
*/ + dsl_dataset_make_exclusive(ds, tag); + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); dsl_sync_task_create(dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, FTAG, 0); + dsl_dataset_destroy_sync, ds, tag, 0); dsl_sync_task_create(dstg, dsl_dir_destroy_check, dsl_dir_destroy_sync, dd, FTAG, 0); err = dsl_sync_task_group_wait(dstg); dsl_sync_task_group_destroy(dstg); - /* if it is successful, *destroy_sync will close the ds+dd */ - if (err) { - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + /* if it is successful, dsl_dir_destroy_sync will close the dd */ + if (err) dsl_dir_close(dd, FTAG); - } +out: + dsl_dataset_disown(ds, tag); return (err); } int -dsl_dataset_rollback(dsl_dataset_t *ds) +dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost) { - ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX); - return (dsl_sync_task_do(ds->ds_dir->dd_pool, + int err; + + ASSERT(ds->ds_owner); + + dsl_dataset_make_exclusive(ds, ds->ds_owner); + err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_rollback_check, dsl_dataset_rollback_sync, - ds, NULL, 0)); + ds, &ost, 0); + /* drop exclusive access */ + mutex_enter(&ds->ds_lock); + rw_exit(&ds->ds_rwlock); + cv_broadcast(&ds->ds_exclusive_cv); + mutex_exit(&ds->ds_lock); + return (err); } void * @@ -904,14 +1111,56 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) } } +/* + * The unique space in the head dataset can be calculated by subtracting + * the space used in the most recent snapshot, that is still being used + * in this file system, from the space currently in use. To figure out + * the space in the most recent snapshot still in use, we need to take + * the total space used in the snapshot and subtract out the space that + * has been freed up since the snapshot was taken. + */ +static void +dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) +{ + uint64_t mrs_used; + uint64_t dlused, dlcomp, dluncomp; + + ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj); + + if (ds->ds_phys->ds_prev_snap_obj != 0) + mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; + else + mrs_used = 0; + + VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp, + &dluncomp)); + + ASSERT3U(dlused, <=, mrs_used); + ds->ds_phys->ds_unique_bytes = + ds->ds_phys->ds_used_bytes - (mrs_used - dlused); + + if (!DS_UNIQUE_IS_ACCURATE(ds) && + spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; +} + +static uint64_t +dsl_dataset_unique(dsl_dataset_t *ds) +{ + if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds)) + dsl_dataset_recalc_head_uniq(ds); + + return (ds->ds_phys->ds_unique_bytes); +} + struct killarg { - uint64_t *usedp; - uint64_t *compressedp; - uint64_t *uncompressedp; + dsl_dataset_t *ds; zio_t *zio; dmu_tx_t *tx; }; +/* ARGSUSED */ static int kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg) { @@ -920,16 +1169,9 @@ kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg) ASSERT3U(bc->bc_errno, ==, 0); - /* - * Since this callback is not called concurrently, no lock is - * needed on the accounting values. - */ - *ka->usedp += bp_get_dasize(spa, bp); - *ka->compressedp += BP_GET_PSIZE(bp); - *ka->uncompressedp += BP_GET_UCSIZE(bp); - /* XXX check for EIO? 
*/ - (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL, - ARC_NOWAIT); + ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); + (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx); + return (0); } @@ -938,14 +1180,12 @@ static int dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + dmu_objset_type_t *ost = arg2; /* - * There must be a previous snapshot. I suppose we could roll - * it back to being empty (and re-initialize the upper (ZPL) - * layer). But for now there's no way to do this via the user - * interface. + * We can only roll back to emptiness if it is a ZPL objset. */ - if (ds->ds_phys->ds_prev_snap_txg == 0) + if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0) return (EINVAL); /* @@ -966,13 +1206,44 @@ dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + dmu_objset_type_t *ost = arg2; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; dmu_buf_will_dirty(ds->ds_dbuf, tx); + /* + * Before the roll back, destroy the zil. + */ + if (ds->ds_user_ptr != NULL) { + zil_rollback_destroy( + ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx); + + /* + * We need to make sure that the objset_impl_t is reopened after + * we do the rollback, otherwise it will have the wrong + * objset_phys_t. Normally this would happen when this + * dataset-open is closed, thus causing the + * dataset to be immediately evicted. But when doing "zfs recv + * -F", we reopen the objset before that, so that there is no + * window where the dataset is closed and inconsistent. + */ + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } + + /* Transfer space that was freed since last snap back to the head. */ + { + uint64_t used; + + VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist, + ds->ds_origin_txg, UINT64_MAX, &used)); + dsl_dir_transfer_space(ds->ds_dir, used, + DD_USED_SNAP, DD_USED_HEAD, tx); + } + + /* Zero out the deadlist. 
*/ bplist_close(&ds->ds_deadlist); bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); @@ -984,39 +1255,65 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx) { /* Free blkptrs that we gave birth to */ zio_t *zio; - uint64_t used = 0, compressed = 0, uncompressed = 0; struct killarg ka; zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - ka.usedp = &used; - ka.compressedp = &compressed; - ka.uncompressedp = &uncompressed; + ka.ds = ds; ka.zio = zio; ka.tx = tx; (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, ADVANCE_POST, kill_blkptr, &ka); (void) zio_wait(zio); - - dsl_dir_diduse_space(ds->ds_dir, - -used, -compressed, -uncompressed, tx); } - /* Change our contents to that of the prev snapshot */ - ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj); - ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; - ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes; - ds->ds_phys->ds_compressed_bytes = - ds->ds_prev->ds_phys->ds_compressed_bytes; - ds->ds_phys->ds_uncompressed_bytes = - ds->ds_prev->ds_phys->ds_uncompressed_bytes; - ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags; - ds->ds_phys->ds_unique_bytes = 0; + ASSERT(!(ds->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) || + ds->ds_phys->ds_unique_bytes == 0); + + if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) { + /* Change our contents to that of the prev snapshot */ + + ASSERT3U(ds->ds_prev->ds_object, ==, + ds->ds_phys->ds_prev_snap_obj); + ASSERT3U(ds->ds_phys->ds_used_bytes, <=, + ds->ds_prev->ds_phys->ds_used_bytes); + + ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; + ds->ds_phys->ds_used_bytes = + ds->ds_prev->ds_phys->ds_used_bytes; + ds->ds_phys->ds_compressed_bytes = + ds->ds_prev->ds_phys->ds_compressed_bytes; + ds->ds_phys->ds_uncompressed_bytes = + ds->ds_prev->ds_phys->ds_uncompressed_bytes; + ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags; + + if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ds->ds_prev->ds_phys->ds_unique_bytes = 0; + } + } else { + objset_impl_t *osi; + + ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0); + ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0); + ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0); - if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ds->ds_prev->ds_phys->ds_unique_bytes = 0; + bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t)); + ds->ds_phys->ds_flags = 0; + ds->ds_phys->ds_unique_bytes = 0; + if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + + osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds, + &ds->ds_phys->ds_bp, *ost, tx); +#ifdef _KERNEL + zfs_create_fs(&osi->os, kcred, NULL, tx); +#endif } + + spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa, + tx, cr, "dataset = %llu", ds->ds_object); } /* ARGSUSED */ @@ -1024,6 +1321,9 @@ static int dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t count; + int err; /* * Can't delete a head dataset if there are snapshots of it. 
@@ -1034,26 +1334,44 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) return (EINVAL); + /* + * This is really a dsl_dir thing, but check it here so that + * we'll be less likely to leave this dataset inconsistent & + * nearly destroyed. + */ + err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); + if (err) + return (err); + if (count != 0) + return (EEXIST); + return (0); } /* ARGSUSED */ static void -dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + dsl_pool_t *dp = ds->ds_dir->dd_pool; /* Mark it as inconsistent on-disk, in case we crash */ dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + + spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, + cr, "dataset = %llu", ds->ds_object); } /* ARGSUSED */ -static int +int dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + /* we have an owner hold, so no one else can destroy us */ + ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); + /* Can't delete a branch point. */ if (ds->ds_phys->ds_num_children > 1) return (EEXIST); @@ -1078,11 +1396,50 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); } +struct refsarg { + kmutex_t lock; + boolean_t gone; + kcondvar_t cv; +}; + +/* ARGSUSED */ +static void +dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) +{ + struct refsarg *arg = argv; + + mutex_enter(&arg->lock); + arg->gone = TRUE; + cv_signal(&arg->cv); + mutex_exit(&arg->lock); +} + static void -dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) +dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) +{ + struct refsarg arg; + + mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); + arg.gone = FALSE; + (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, + dsl_dataset_refs_gone); + dmu_buf_rele(ds->ds_dbuf, tag); + mutex_enter(&arg.lock); + while (!arg.gone) + cv_wait(&arg.cv, &arg.lock); + ASSERT(arg.gone); + mutex_exit(&arg.lock); + ds->ds_dbuf = NULL; + ds->ds_phys = NULL; + mutex_destroy(&arg.lock); + cv_destroy(&arg.cv); +} + +void +dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t used = 0, compressed = 0, uncompressed = 0; zio_t *zio; int err; int after_branch_point = FALSE; @@ -1091,29 +1448,53 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) dsl_dataset_t *ds_prev = NULL; uint64_t obj; - ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX); + ASSERT(ds->ds_owner); ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); ASSERT(ds->ds_prev == NULL || ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + /* signal any waiters that this dataset is going away */ + mutex_enter(&ds->ds_lock); + ds->ds_owner = dsl_reaper; + cv_broadcast(&ds->ds_exclusive_cv); + mutex_exit(&ds->ds_lock); + + /* Remove our reservation */ + if (ds->ds_reserved != 0) { + uint64_t val = 0; + dsl_dataset_set_reservation_sync(ds, &val, cr, tx); + ASSERT3U(ds->ds_reserved, ==, 0); + } + ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); + dsl_pool_ds_destroyed(ds, tx); + obj = ds->ds_object; if (ds->ds_phys->ds_prev_snap_obj != 0) { if (ds->ds_prev) { ds_prev = ds->ds_prev; } else { - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, - 
DS_MODE_NONE, FTAG, &ds_prev)); + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); } after_branch_point = (ds_prev->ds_phys->ds_next_snap_obj != obj); dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); if (after_branch_point && + ds_prev->ds_phys->ds_next_clones_obj != 0) { + VERIFY(0 == zap_remove_int(mos, + ds_prev->ds_phys->ds_next_clones_obj, obj, tx)); + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(0 == zap_add_int(mos, + ds_prev->ds_phys->ds_next_clones_obj, + ds->ds_phys->ds_next_snap_obj, tx)); + } + } + if (after_branch_point && ds->ds_phys->ds_next_snap_obj == 0) { /* This clone is toast. */ ASSERT(ds_prev->ds_phys->ds_num_children > 1); @@ -1130,14 +1511,15 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) blkptr_t bp; dsl_dataset_t *ds_next; uint64_t itor = 0; + uint64_t old_unique; + int64_t used = 0, compressed = 0, uncompressed = 0; - spa_scrub_restart(dp->dp_spa, tx->tx_txg); - - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_next_snap_obj, NULL, - DS_MODE_NONE, FTAG, &ds_next)); + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); + old_unique = dsl_dataset_unique(ds_next); + dmu_buf_will_dirty(ds_next->ds_dbuf, tx); ds_next->ds_phys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; @@ -1154,8 +1536,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) * * XXX we're doing this long task with the config lock held */ - while (bplist_iterate(&ds_next->ds_deadlist, &itor, - &bp) == 0) { + while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) { if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) { VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, &bp, tx)); @@ -1170,16 +1551,23 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) compressed += BP_GET_PSIZE(&bp); uncompressed += BP_GET_UCSIZE(&bp); /* XXX check return value? 
*/ - (void) arc_free(zio, dp->dp_spa, tx->tx_txg, + (void) dsl_free(zio, dp, tx->tx_txg, &bp, NULL, NULL, ARC_NOWAIT); } } + ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); + + /* change snapused */ + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -used, -compressed, -uncompressed, tx); + /* free next's deadlist */ bplist_close(&ds_next->ds_deadlist); bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx); /* set next's deadlist to our deadlist */ + bplist_close(&ds->ds_deadlist); ds_next->ds_phys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos, @@ -1200,51 +1588,50 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) * config lock held */ dsl_dataset_t *ds_after_next; + uint64_t space; - VERIFY(0 == dsl_dataset_open_obj(dp, - ds_next->ds_phys->ds_next_snap_obj, NULL, - DS_MODE_NONE, FTAG, &ds_after_next)); - itor = 0; - while (bplist_iterate(&ds_after_next->ds_deadlist, - &itor, &bp) == 0) { - if (bp.blk_birth > - ds->ds_phys->ds_prev_snap_txg && - bp.blk_birth <= - ds->ds_phys->ds_creation_txg) { - ds_next->ds_phys->ds_unique_bytes += - bp_get_dasize(dp->dp_spa, &bp); - } - } + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds_next->ds_phys->ds_next_snap_obj, + FTAG, &ds_after_next)); + + VERIFY(0 == + bplist_space_birthrange(&ds_after_next->ds_deadlist, + ds->ds_phys->ds_prev_snap_txg, + ds->ds_phys->ds_creation_txg, &space)); + ds_next->ds_phys->ds_unique_bytes += space; - dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG); + dsl_dataset_rele(ds_after_next, FTAG); ASSERT3P(ds_next->ds_prev, ==, NULL); } else { - /* - * It would be nice to update the head dataset's - * unique. To do so we would have to traverse - * it for blocks born after ds_prev, which is - * pretty expensive just to maintain something - * for debugging purposes. - */ ASSERT3P(ds_next->ds_prev, ==, ds); - dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE, - ds_next); + dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); + ds_next->ds_prev = NULL; if (ds_prev) { - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_NONE, ds_next, &ds_next->ds_prev)); - } else { - ds_next->ds_prev = NULL; + VERIFY(0 == dsl_dataset_get_ref(dp, + ds->ds_phys->ds_prev_snap_obj, + ds_next, &ds_next->ds_prev)); } - } - dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG); - /* - * NB: unique_bytes is not accurate for head objsets - * because we don't update it when we delete the most - * recent snapshot -- see above comment. - */ - ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); + dsl_dataset_recalc_head_uniq(ds_next); + + /* + * Reduce the amount of our unconsumed refreservation + * being charged to our parent by the amount of + * new unique data we have gained. + */ + if (old_unique < ds_next->ds_reserved) { + int64_t mrsdelta; + uint64_t new_unique = + ds_next->ds_phys->ds_unique_bytes; + + ASSERT(old_unique <= new_unique); + mrsdelta = MIN(new_unique - old_unique, + ds_next->ds_reserved - old_unique); + dsl_dir_diduse_space(ds->ds_dir, + DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); + } + } + dsl_dataset_rele(ds_next, FTAG); } else { /* * There's no next snapshot, so this is a head dataset. @@ -1263,76 +1650,106 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) * Free everything that we point to (that's born after * the previous snapshot, if we are a clone) * - * XXX we're doing this long task with the config lock held + * NB: this should be very quick, because we already + * freed all the objects in open context. 
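+	 * (That is the dmu_free_object() loop in dsl_dataset_destroy(); by now kill_blkptr() only sees blocks born after ds_prev_snap_txg.)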
*/ - ka.usedp = &used; - ka.compressedp = &compressed; - ka.uncompressedp = &uncompressed; + ka.ds = ds; ka.zio = zio; ka.tx = tx; err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, ADVANCE_POST, kill_blkptr, &ka); ASSERT3U(err, ==, 0); + ASSERT(spa_version(dp->dp_spa) < SPA_VERSION_UNIQUE_ACCURATE || + ds->ds_phys->ds_unique_bytes == 0); } err = zio_wait(zio); ASSERT3U(err, ==, 0); - dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx); - - if (ds->ds_phys->ds_snapnames_zapobj) { - err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); - ASSERT(err == 0); - } - if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { - /* Erase the link in the dataset */ + /* Erase the link in the dir */ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; - /* - * dsl_dir_sync_destroy() called us, they'll destroy - * the dataset. - */ + ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); + err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); + ASSERT(err == 0); } else { /* remove from snapshot namespace */ dsl_dataset_t *ds_head; - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL, - DS_MODE_NONE, FTAG, &ds_head)); + ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); VERIFY(0 == dsl_dataset_get_snapname(ds)); #ifdef ZFS_DEBUG { uint64_t val; + + err = dsl_dataset_snap_lookup(ds_head, + ds->ds_snapname, &val); - err = zap_lookup(mos, - ds_head->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, 8, 1, &val); ASSERT3U(err, ==, 0); ASSERT3U(val, ==, obj); } #endif - err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, tx); + err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); ASSERT(err == 0); - dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG); + dsl_dataset_rele(ds_head, FTAG); } if (ds_prev && ds->ds_prev != ds_prev) - dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG); - - spa_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag); + dsl_dataset_rele(ds_prev, FTAG); + + spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx, + cr, "dataset = %llu", ds->ds_object); + + if (ds->ds_phys->ds_next_clones_obj != 0) { + uint64_t count; + ASSERT(0 == zap_count(mos, + ds->ds_phys->ds_next_clones_obj, &count) && count == 0); + VERIFY(0 == dmu_object_free(mos, + ds->ds_phys->ds_next_clones_obj, tx)); + } + if (ds->ds_phys->ds_props_obj != 0) + VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); + dsl_dir_close(ds->ds_dir, ds); + ds->ds_dir = NULL; + dsl_dataset_drain_refs(ds, tag); VERIFY(0 == dmu_object_free(mos, obj, tx)); +} +static int +dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t asize; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + /* + * If there's an fs-only reservation, any blocks that might become + * owned by the snapshot dataset must be accommodated by space + * outside of the reservation. + */ + asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE)) + return (ENOSPC); + + /* + * Propagate any reserved space for this snapshot to other + * snapshot checks in this sync group. 
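+	 * For example, if two datasets with refreservations are snapshotted in the same sync group, the first check records its asize via dsl_dir_willuse_space(), so the second check's dsl_dir_space_available() call already accounts for it.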
+ */ + if (asize > 0) + dsl_dir_willuse_space(ds->ds_dir, asize, tx); + + return (0); } /* ARGSUSED */ int dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) { - objset_t *os = arg1; - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = arg1; const char *snapname = arg2; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; int err; uint64_t value; @@ -1346,8 +1763,7 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) /* * Check for a conflicting snapshot name. */ - err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj, - snapname, 8, 1, &value); + err = dsl_dataset_snap_lookup(ds, snapname, &value); if (err == 0) return (EEXIST); if (err != ENOENT) @@ -1360,34 +1776,44 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) return (ENAMETOOLONG); + err = dsl_dataset_snapshot_reserve_space(ds, tx); + if (err) + return (err); + ds->ds_trysnap_txg = tx->tx_txg; return (0); } void -dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { - objset_t *os = arg1; - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = arg1; const char *snapname = arg2; dsl_pool_t *dp = ds->ds_dir->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; - uint64_t dsobj; + uint64_t dsobj, crtxg; objset_t *mos = dp->dp_meta_objset; int err; - spa_scrub_restart(dp->dp_spa, tx->tx_txg); ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); + /* + * The origin's ds_creation_txg has to be < TXG_INITIAL + */ + if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) + crtxg = 1; + else + crtxg = tx->tx_txg; + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; + bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = ds->ds_dir->dd_object; dsphys->ds_fsid_guid = unique_create(); - unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; @@ -1395,7 +1821,7 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) dsphys->ds_next_snap_obj = ds->ds_object; dsphys->ds_num_children = 1; dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = tx->tx_txg; + dsphys->ds_creation_txg = crtxg; dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; @@ -1406,6 +1832,8 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); if (ds->ds_prev) { + uint64_t next_clones_obj = + ds->ds_prev->ds_phys->ds_next_clones_obj; ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object || ds->ds_prev->ds_phys->ds_num_children > 1); @@ -1414,15 +1842,33 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, ds->ds_prev->ds_phys->ds_creation_txg); ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; + } else if (next_clones_obj != 0) { + VERIFY3U(0, ==, zap_remove_int(mos, + next_clones_obj, dsphys->ds_next_snap_obj, tx)); + VERIFY3U(0, ==, zap_add_int(mos, + next_clones_obj, dsobj, tx)); } } + /* + * If we have a reference-reservation on this dataset, we will + * 
need to increase the amount of refreservation being charged + * since our unique space is going to zero. + */ + if (ds->ds_reserved) { + int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, + add, 0, 0, tx); + } + bplist_close(&ds->ds_deadlist); dmu_buf_will_dirty(ds->ds_dbuf, tx); - ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg); + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); ds->ds_phys->ds_prev_snap_obj = dsobj; - ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg; + ds->ds_phys->ds_prev_snap_txg = crtxg; ds->ds_phys->ds_unique_bytes = 0; + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; ds->ds_phys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, @@ -1434,10 +1880,14 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT(err == 0); if (ds->ds_prev) - dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds); - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, snapname, - DS_MODE_NONE, ds, &ds->ds_prev)); + dsl_dataset_drop_ref(ds->ds_prev, ds); + VERIFY(0 == dsl_dataset_get_ref(dp, + ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); + + dsl_pool_ds_snapshotted(ds, tx); + + spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr, + "dataset = %llu", dsobj); } void @@ -1447,22 +1897,38 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) ASSERT(ds->ds_user_ptr != NULL); ASSERT(ds->ds_phys->ds_next_snap_obj == 0); + /* + * in case we had to change ds_fsid_guid when we opened it, + * sync it out now. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; + dsl_dir_dirty(ds->ds_dir, tx); dmu_objset_sync(ds->ds_user_ptr, zio, tx); - /* Unneeded? bplist_close(&ds->ds_deadlist); */ } void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { + uint64_t refd, avail, uobjs, aobjs; + dsl_dir_stats(ds->ds_dir, nv); + dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, ds->ds_phys->ds_creation_time); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, ds->ds_phys->ds_creation_txg); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, - ds->ds_phys->ds_used_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, + ds->ds_quota); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, + ds->ds_reserved); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, + ds->ds_phys->ds_guid); if (ds->ds_phys->ds_next_snap_obj) { /* @@ -1483,29 +1949,29 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) { stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; + stat->dds_guid = ds->ds_phys->ds_guid; if (ds->ds_phys->ds_next_snap_obj) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; } /* clone origin is really a dsl_dir thing... 
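 * (the origin is recorded in dd_phys->dd_origin_obj, which this change renames from dd_clone_parent_obj)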
*/ - if (ds->ds_dir->dd_phys->dd_clone_parent_obj) { + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + if (dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_t *ods; - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool, - ds->ds_dir->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_NONE, FTAG, &ods)); - dsl_dataset_name(ods, stat->dds_clone_of); - dsl_dataset_close(ods, DS_MODE_NONE, FTAG); - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, + ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); + dsl_dataset_name(ods, stat->dds_origin); + dsl_dataset_drop_ref(ods, FTAG); } + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds) { - return (ds->ds_phys->ds_fsid_guid); + return (ds->ds_fsid_guid); } void @@ -1515,10 +1981,37 @@ dsl_dataset_space(dsl_dataset_t *ds, { *refdbytesp = ds->ds_phys->ds_used_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); + if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) + *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; + if (ds->ds_quota != 0) { + /* + * Adjust available bytes according to refquota + */ + if (*refdbytesp < ds->ds_quota) + *availbytesp = MIN(*availbytesp, + ds->ds_quota - *refdbytesp); + else + *availbytesp = 0; + } *usedobjsp = ds->ds_phys->ds_bp.blk_fill; *availobjsp = DN_MAX_OBJECT - *usedobjsp; } +boolean_t +dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || + dsl_pool_sync_context(dp)); + if (ds->ds_prev == NULL) + return (B_FALSE); + if (ds->ds_phys->ds_bp.blk_birth > + ds->ds_prev->ds_phys->ds_creation_txg) + return (B_TRUE); + return (B_FALSE); +} + /* ARGSUSED */ static int dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) @@ -1526,20 +2019,18 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) dsl_dataset_t *ds = arg1; char *newsnapname = arg2; dsl_dir_t *dd = ds->ds_dir; - objset_t *mos = dd->dd_pool->dp_meta_objset; dsl_dataset_t *hds; uint64_t val; int err; - err = dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds); + err = dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); if (err) return (err); /* new name better not be in use */ - err = zap_lookup(mos, hds->ds_phys->ds_snapnames_zapobj, - newsnapname, 8, 1, &val); - dsl_dataset_close(hds, DS_MODE_NONE, FTAG); + err = dsl_dataset_snap_lookup(hds, newsnapname, &val); + dsl_dataset_rele(hds, FTAG); if (err == 0) err = EEXIST; @@ -1554,10 +2045,11 @@ dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, + cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - char *newsnapname = arg2; + const char *newsnapname = arg2; dsl_dir_t *dd = ds->ds_dir; objset_t *mos = dd->dd_pool->dp_meta_objset; dsl_dataset_t *hds; @@ -1565,12 +2057,11 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT(ds->ds_phys->ds_next_snap_obj != 0); - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds)); + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); VERIFY(0 == 
dsl_dataset_get_snapname(ds)); - err = zap_remove(mos, hds->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, tx); + err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); ASSERT3U(err, ==, 0); mutex_enter(&ds->ds_lock); (void) strcpy(ds->ds_snapname, newsnapname); @@ -1579,10 +2070,12 @@ ds->ds_snapname, 8, 1, &ds->ds_object, tx); ASSERT3U(err, ==, 0); - dsl_dataset_close(hds, DS_MODE_NONE, FTAG); + spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, + cr, "dataset = %llu", ds->ds_object); + dsl_dataset_rele(hds, FTAG); } -struct renamearg { +struct renamesnaparg { dsl_sync_task_group_t *dstg; char failed[MAXPATHLEN]; char *oldsnap; @@ -1592,7 +2085,7 @@ struct renamearg { static int dsl_snapshot_rename_one(char *name, void *arg) { - struct renamearg *ra = arg; + struct renamesnaparg *ra = arg; dsl_dataset_t *ds = NULL; char *cp; int err; @@ -1600,25 +2093,33 @@ dsl_snapshot_rename_one(char *name, void *arg) cp = name + strlen(name); *cp = '@'; (void) strcpy(cp + 1, ra->oldsnap); - err = dsl_dataset_open(name, DS_MODE_READONLY | DS_MODE_STANDARD, - ra->dstg, &ds); + + /* + * For recursive snapshot renames the parent won't be changing + * so we just pass name for both the to/from arguments. + */ + err = zfs_secpolicy_rename_perms(name, name, CRED()); if (err == ENOENT) { - *cp = '\0'; return (0); - } - if (err) { + } else if (err) { (void) strcpy(ra->failed, name); - *cp = '\0'; - dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg); return (err); } #ifdef _KERNEL - /* for all filesystems undergoing rename, we'll need to unmount it */ + /* + * For all filesystems undergoing rename, we'll need to unmount them. + */ (void) zfs_unmount_snap(name, NULL); #endif - + err = dsl_dataset_hold(name, ra->dstg, &ds); *cp = '\0'; + if (err == ENOENT) { + return (0); + } else if (err) { + (void) strcpy(ra->failed, name); + return (err); + } dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); @@ -1630,7 +2131,7 @@ static int dsl_recursive_rename(char *oldname, const char *newname) { int err; - struct renamearg *ra; + struct renamesnaparg *ra; dsl_sync_task_t *dst; spa_t *spa; char *cp, *fsname = spa_strdup(oldname); @@ -1640,19 +2141,12 @@ dsl_recursive_rename(char *oldname, const char *newname) cp = strchr(fsname, '@'); *cp = '\0'; - cp = strchr(fsname, '/'); - if (cp) { - *cp = '\0'; - err = spa_open(fsname, &spa, FTAG); - *cp = '/'; - } else { - err = spa_open(fsname, &spa, FTAG); - } + err = spa_open(fsname, &spa, FTAG); if (err) { kmem_free(fsname, len + 1); return (err); } - ra = kmem_alloc(sizeof (struct renamearg), KM_SLEEP); + ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ra->oldsnap = strchr(oldname, '@') + 1; @@ -1675,21 +2169,32 @@ dsl_recursive_rename(char *oldname, const char *newname) (void) strcat(ra->failed, "@"); (void) strcat(ra->failed, ra->newsnap); } - dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg); + dsl_dataset_rele(ds, ra->dstg); } - (void) strcpy(oldname, ra->failed); + if (err) + (void) strcpy(oldname, ra->failed); dsl_sync_task_group_destroy(ra->dstg); - kmem_free(ra, sizeof (struct renamearg)); + kmem_free(ra, sizeof (struct renamesnaparg)); spa_close(spa, FTAG); return (err); } +static int +dsl_valid_rename(char *oldname, void *arg) +{ + int delta = *(int *)arg; + + if (strlen(oldname) + delta >= MAXNAMELEN) + return (ENAMETOOLONG); + + return (0); +} + 
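A worked instance of the dsl_valid_rename() check just added: renaming tank/a to tank/alphabet grows every descendant name by delta = 7 bytes, so any descendant filesystem or snapshot whose name is already within 7 bytes of MAXNAMELEN makes the dmu_objset_find() walk in dsl_dataset_rename() below fail up front with ENAMETOOLONG, instead of leaving the tree half-renamed.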
#pragma weak dmu_objset_rename = dsl_dataset_rename int -dsl_dataset_rename(char *oldname, const char *newname, - boolean_t recursive) +dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) { dsl_dir_t *dd; dsl_dataset_t *ds; @@ -1700,7 +2205,15 @@ dsl_dataset_rename(char *oldname, const char *newname, if (err) return (err); if (tail == NULL) { - err = dsl_dir_rename(dd, newname); + int delta = strlen(newname) - strlen(oldname); + + /* if we're growing, validate child name lengths */ + if (delta > 0) + err = dmu_objset_find(oldname, dsl_valid_rename, + &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + + if (!err) + err = dsl_dir_rename(dd, newname); dsl_dir_close(dd, FTAG); return (err); } @@ -1723,8 +2236,7 @@ dsl_dataset_rename(char *oldname, const char *newname, if (recursive) { err = dsl_recursive_rename(oldname, newname); } else { - err = dsl_dataset_open(oldname, - DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds); + err = dsl_dataset_hold(oldname, FTAG, &ds); if (err) return (err); @@ -1732,278 +2244,640 @@ dsl_dataset_rename(char *oldname, const char *newname, dsl_dataset_snapshot_rename_check, dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); - dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG); + dsl_dataset_rele(ds, FTAG); } return (err); } +struct promotenode { + list_node_t link; + dsl_dataset_t *ds; +}; + struct promotearg { - uint64_t used, comp, uncomp, unique; - uint64_t newnext_obj, snapnames_obj; + list_t shared_snaps, origin_snaps, clone_snaps; + dsl_dataset_t *origin_origin, *origin_head; + uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; }; +static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); + +/* ARGSUSED */ static int dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; - dsl_dir_t *dd = hds->ds_dir; - dsl_pool_t *dp = hds->ds_dir->dd_pool; - dsl_dir_t *pdd = NULL; - dsl_dataset_t *ds = NULL; - dsl_dataset_t *pivot_ds = NULL; - dsl_dataset_t *newnext_ds = NULL; + struct promotenode *snap = list_head(&pa->shared_snaps); + dsl_dataset_t *origin_ds = snap->ds; int err; - char *name = NULL; - uint64_t itor = 0; - blkptr_t bp; - - bzero(pa, sizeof (*pa)); - /* Check that it is a clone */ - if (dd->dd_phys->dd_clone_parent_obj == 0) + /* Check that it is a real clone */ + if (!dsl_dir_is_clone(hds->ds_dir)) return (EINVAL); /* Since this is so expensive, don't do the preliminary check */ if (!dmu_tx_is_syncing(tx)) return (0); - if (err = dsl_dataset_open_obj(dp, - dd->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds)) - goto out; - pdd = pivot_ds->ds_dir; - - { - dsl_dataset_t *phds; - if (err = dsl_dataset_open_obj(dd->dd_pool, - pdd->dd_phys->dd_head_dataset_obj, - NULL, DS_MODE_NONE, FTAG, &phds)) - goto out; - pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj; - dsl_dataset_close(phds, DS_MODE_NONE, FTAG); - } - - if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) { - err = EXDEV; - goto out; - } - - /* find pivot point's new next ds */ - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object, - NULL, DS_MODE_NONE, FTAG, &newnext_ds)); - while (newnext_ds->ds_phys->ds_prev_snap_obj != pivot_ds->ds_object) { - dsl_dataset_t *prev; - - if (err = dsl_dataset_open_obj(dd->dd_pool, - newnext_ds->ds_phys->ds_prev_snap_obj, - NULL, DS_MODE_NONE, FTAG, &prev)) - goto out; - dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG); - newnext_ds = prev; - } - pa->newnext_obj = newnext_ds->ds_object; + if 
(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) + return (EXDEV); - /* compute pivot point's new unique space */ - while ((err = bplist_iterate(&newnext_ds->ds_deadlist, - &itor, &bp)) == 0) { - if (bp.blk_birth > pivot_ds->ds_phys->ds_prev_snap_txg) - pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp); - } - if (err != ENOENT) - goto out; + /* compute origin's new unique space */ + snap = list_tail(&pa->clone_snaps); + ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); + err = bplist_space_birthrange(&snap->ds->ds_deadlist, + origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique); + if (err) + return (err); - /* Walk the snapshots that we are moving */ - name = kmem_alloc(MAXPATHLEN, KM_SLEEP); - ds = pivot_ds; - /* CONSTCOND */ - while (TRUE) { + /* + * Walk the snapshots that we are moving + * + * Compute space to transfer. Consider the incremental changes + * to used for each snapshot: + * (my used) = (prev's used) + (blocks born) - (blocks killed) + * So each snapshot gave birth to: + * (blocks born) = (my used) - (prev's used) + (blocks killed) + * So a sequence would look like: + * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) + * Which simplifies to: + * uN + kN + kN-1 + ... + k1 + k0 + * Note however, if we stop before we reach the ORIGIN we get: + * uN + kN + kN-1 + ... + kM - uM-1 + */ + pa->used = origin_ds->ds_phys->ds_used_bytes; + pa->comp = origin_ds->ds_phys->ds_compressed_bytes; + pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; + for (snap = list_head(&pa->shared_snaps); snap; + snap = list_next(&pa->shared_snaps, snap)) { uint64_t val, dlused, dlcomp, dluncomp; - dsl_dataset_t *prev; + dsl_dataset_t *ds = snap->ds; /* Check that the snapshot name does not conflict */ - dsl_dataset_name(ds, name); - err = zap_lookup(dd->dd_pool->dp_meta_objset, - hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, - 8, 1, &val); - if (err != ENOENT) { - if (err == 0) - err = EEXIST; - goto out; - } - - /* - * compute space to transfer. Each snapshot gave birth to: - * (my used) - (prev's used) + (deadlist's used) - */ - pa->used += ds->ds_phys->ds_used_bytes; - pa->comp += ds->ds_phys->ds_compressed_bytes; - pa->uncomp += ds->ds_phys->ds_uncompressed_bytes; + VERIFY(0 == dsl_dataset_get_snapname(ds)); + err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); - /* If we reach the first snapshot, we're done. */ + /* The very first snapshot does not have a deadlist */ if (ds->ds_phys->ds_prev_snap_obj == 0) - break; + continue; if (err = bplist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp)) - goto out; - if (err = dsl_dataset_open_obj(dd->dd_pool, - ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE, - FTAG, &prev)) - goto out; - pa->used += dlused - prev->ds_phys->ds_used_bytes; - pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes; - pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes; + return (err); + pa->used += dlused; + pa->comp += dlcomp; + pa->uncomp += dluncomp; + } - /* - * We could be a clone of a clone. If we reach our - * parent's branch point, we're done. - */ - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { - dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG); - break; - } - if (ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - ds = prev; + /* + * If we are a clone of a clone then we never reached ORIGIN, + * so we need to subtract out the clone origin's used space. 
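+	 * Worked example of the truncated sum: with uN = 4G, deadlists kN + ... + kM = 3G, and a clone origin that referenced uM-1 = 1G, the loop above accumulates 4G + 3G = 7G and the subtraction below leaves 6G to transfer.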
+ */ + if (pa->origin_origin) { + pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; + pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; + pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; } /* Check that there is enough space here */ - err = dsl_dir_transfer_possible(pdd, dd, pa->used); + err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, + pa->used); + if (err) + return (err); -out: - if (ds && ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - if (pivot_ds) - dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG); - if (newnext_ds) - dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG); - if (name) - kmem_free(name, MAXPATHLEN); - return (err); + /* + * Compute the amounts of space that will be used by snapshots + * after the promotion (for both origin and clone). For each, + * it is the amount of space that will be on all of their + * deadlists (that was not born before their new origin). + */ + if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + uint64_t space; + + /* + * Note, typically this will not be a clone of a clone, + * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so + * these snaplist_space() -> bplist_space_birthrange() + * calls will be fast because they do not have to + * iterate over all bps. + */ + snap = list_head(&pa->origin_snaps); + err = snaplist_space(&pa->shared_snaps, + snap->ds->ds_origin_txg, &pa->cloneusedsnap); + if (err) + return (err); + + err = snaplist_space(&pa->clone_snaps, + snap->ds->ds_origin_txg, &space); + if (err) + return (err); + pa->cloneusedsnap += space; + } + if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + err = snaplist_space(&pa->origin_snaps, + origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); + if (err) + return (err); + } + + return (0); } static void -dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; + struct promotenode *snap = list_head(&pa->shared_snaps); + dsl_dataset_t *origin_ds = snap->ds; + dsl_dataset_t *origin_head; dsl_dir_t *dd = hds->ds_dir; dsl_pool_t *dp = hds->ds_dir->dd_pool; - dsl_dir_t *pdd = NULL; - dsl_dataset_t *ds, *pivot_ds; - char *name; + dsl_dir_t *odd = NULL; + uint64_t oldnext_obj; + int64_t delta; - ASSERT(dd->dd_phys->dd_clone_parent_obj != 0); ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); - VERIFY(0 == dsl_dataset_open_obj(dp, - dd->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds)); + snap = list_head(&pa->origin_snaps); + origin_head = snap->ds; + /* - * We need to explicitly open pdd, since pivot_ds's pdd will be + * We need to explicitly open odd, since origin_ds's dd will be * changing. 
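 * (each moved snapshot's ds_dir is closed and reopened against the clone's dir in the loop below)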
*/ - VERIFY(0 == dsl_dir_open_obj(dp, pivot_ds->ds_dir->dd_object, - NULL, FTAG, &pdd)); + VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, + NULL, FTAG, &odd)); + + /* change origin's next snap */ + dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); + oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; + snap = list_tail(&pa->clone_snaps); + ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); + origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; + + /* change the origin's next clone */ + if (origin_ds->ds_phys->ds_next_clones_obj) { + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + origin_ds->ds_phys->ds_next_clones_obj, + origin_ds->ds_phys->ds_next_snap_obj, tx)); + VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, + origin_ds->ds_phys->ds_next_clones_obj, + oldnext_obj, tx)); + } - /* move snapshots to this dir */ - name = kmem_alloc(MAXPATHLEN, KM_SLEEP); - ds = pivot_ds; - /* CONSTCOND */ - while (TRUE) { - dsl_dataset_t *prev; + /* change origin */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); + dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; + hds->ds_origin_txg = origin_head->ds_origin_txg; + dmu_buf_will_dirty(odd->dd_dbuf, tx); + odd->dd_phys->dd_origin_obj = origin_ds->ds_object; + origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg; + /* move snapshots to this dir */ + for (snap = list_head(&pa->shared_snaps); snap; + snap = list_next(&pa->shared_snaps, snap)) { + dsl_dataset_t *ds = snap->ds; + + /* unregister props as dsl_dir is changing */ + if (ds->ds_user_ptr) { + ds->ds_user_evict_func(ds, ds->ds_user_ptr); + ds->ds_user_ptr = NULL; + } /* move snap name entry */ - dsl_dataset_name(ds, name); - VERIFY(0 == zap_remove(dp->dp_meta_objset, - pa->snapnames_obj, ds->ds_snapname, tx)); + VERIFY(0 == dsl_dataset_get_snapname(ds)); + VERIFY(0 == dsl_dataset_snap_remove(origin_head, + ds->ds_snapname, tx)); VERIFY(0 == zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); - /* change containing dsl_dir */ dmu_buf_will_dirty(ds->ds_dbuf, tx); - ASSERT3U(ds->ds_phys->ds_dir_obj, ==, pdd->dd_object); + ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); ds->ds_phys->ds_dir_obj = dd->dd_object; - ASSERT3P(ds->ds_dir, ==, pdd); + ASSERT3P(ds->ds_dir, ==, odd); dsl_dir_close(ds->ds_dir, ds); VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, NULL, ds, &ds->ds_dir)); ASSERT3U(dsl_prop_numcb(ds), ==, 0); + } - if (ds->ds_phys->ds_prev_snap_obj == 0) - break; + /* + * Change space accounting. + * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either + * both be valid, or both be 0 (resulting in delta == 0). This + * is true for each of {clone,origin} independently. 
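+	 * E.g. with pa->used = 7G, pa->cloneusedsnap = 3G, and 1G already charged to the clone's DD_USED_SNAP: delta = 2G goes to DD_USED_SNAP and the remaining 5G of pa->used to DD_USED_HEAD.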
+ */ + + delta = pa->cloneusedsnap - + dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, >=, 0); + ASSERT3U(pa->used, >=, delta); + dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(dd, DD_USED_HEAD, + pa->used - delta, pa->comp, pa->uncomp, tx); + + delta = pa->originusedsnap - + odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, <=, 0); + ASSERT3U(pa->used, >=, -delta); + dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(odd, DD_USED_HEAD, + -pa->used - delta, -pa->comp, -pa->uncomp, tx); + + origin_ds->ds_phys->ds_unique_bytes = pa->unique; + + /* log history record */ + spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, + cr, "dataset = %llu", hds->ds_object); + + dsl_dir_close(odd, FTAG); +} + +static char *snaplist_tag = "snaplist"; +/* + * Make a list of dsl_dataset_t's for the snapshots between first_obj + * (exclusive) and last_obj (inclusive). The list will be in reverse + * order (last_obj will be the list_head()). If first_obj == 0, do all + * snapshots back to this dataset's origin. + */ +static int +snaplist_make(dsl_pool_t *dp, boolean_t own, + uint64_t first_obj, uint64_t last_obj, list_t *l) +{ + uint64_t obj = last_obj; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); + + list_create(l, sizeof (struct promotenode), + offsetof(struct promotenode, link)); - VERIFY(0 == dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE, - FTAG, &prev)); + while (obj != first_obj) { + dsl_dataset_t *ds; + struct promotenode *snap; + int err; - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { - dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG); - break; + if (own) { + err = dsl_dataset_own_obj(dp, obj, + 0, snaplist_tag, &ds); + if (err == 0) + dsl_dataset_make_exclusive(ds, snaplist_tag); + } else { + err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); + } + if (err == ENOENT) { + /* lost race with snapshot destroy */ + struct promotenode *last = list_tail(l); + ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); + obj = last->ds->ds_phys->ds_prev_snap_obj; + continue; + } else if (err) { + return (err); } - if (ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - ds = prev; + + if (first_obj == 0) + first_obj = ds->ds_dir->dd_phys->dd_origin_obj; + + snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); + snap->ds = ds; + list_insert_tail(l, snap); + obj = ds->ds_phys->ds_prev_snap_obj; } - if (ds != pivot_ds) - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); - /* change pivot point's next snap */ - dmu_buf_will_dirty(pivot_ds->ds_dbuf, tx); - pivot_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj; + return (0); +} - /* change clone_parent-age */ - dmu_buf_will_dirty(dd->dd_dbuf, tx); - ASSERT3U(dd->dd_phys->dd_clone_parent_obj, ==, pivot_ds->ds_object); - dd->dd_phys->dd_clone_parent_obj = pdd->dd_phys->dd_clone_parent_obj; - dmu_buf_will_dirty(pdd->dd_dbuf, tx); - pdd->dd_phys->dd_clone_parent_obj = pivot_ds->ds_object; +static int +snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) +{ + struct promotenode *snap; - /* change space accounting */ - dsl_dir_diduse_space(pdd, -pa->used, -pa->comp, -pa->uncomp, tx); - dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx); - pivot_ds->ds_phys->ds_unique_bytes = pa->unique; + *spacep = 0; + for (snap = list_head(l); snap; snap = list_next(l, snap)) { + uint64_t used; + int err = bplist_space_birthrange(&snap->ds->ds_deadlist, + mintxg, UINT64_MAX, &used); + if (err) + 
return (err); + *spacep += used; + } + return (0); +} - dsl_dir_close(pdd, FTAG); - dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG); - kmem_free(name, MAXPATHLEN); +static void +snaplist_destroy(list_t *l, boolean_t own) +{ + struct promotenode *snap; + + if (!list_link_active(&l->list_head)) + return; + + while ((snap = list_tail(l)) != NULL) { + list_remove(l, snap); + if (own) + dsl_dataset_disown(snap->ds, snaplist_tag); + else + dsl_dataset_rele(snap->ds, snaplist_tag); + kmem_free(snap, sizeof (struct promotenode)); + } + list_destroy(l); } +/* + * Promote a clone. Nomenclature note: + * "clone" or "cds": the original clone which is being promoted + * "origin" or "ods": the snapshot which is originally clone's origin + * "origin head" or "ohds": the dataset which is the head + * (filesystem/volume) for the origin + * "origin origin": the origin of the origin's filesystem (typically + * NULL, indicating that the clone is not a clone of a clone). + */ int dsl_dataset_promote(const char *name) { dsl_dataset_t *ds; - int err; + dsl_dir_t *dd; + dsl_pool_t *dp; dmu_object_info_t doi; - struct promotearg pa; + struct promotearg pa = { 0 }; + struct promotenode *snap; + int err; - err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds); + err = dsl_dataset_hold(name, FTAG, &ds); if (err) return (err); + dd = ds->ds_dir; + dp = dd->dd_pool; - err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset, + err = dmu_object_info(dp->dp_meta_objset, ds->ds_phys->ds_snapnames_zapobj, &doi); if (err) { - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + dsl_dataset_rele(ds, FTAG); return (err); } + if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } + + /* + * We are going to inherit all the snapshots taken before our + * origin (i.e., our new origin will be our parent's origin). + * Take ownership of them so that we can rename them into our + * namespace. + */ + rw_enter(&dp->dp_config_rwlock, RW_READER); + + err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, + &pa.shared_snaps); + if (err != 0) + goto out; + + err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); + if (err != 0) + goto out; + + snap = list_head(&pa.shared_snaps); + ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); + err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, + snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); + if (err != 0) + goto out; + + if (dsl_dir_is_clone(snap->ds->ds_dir)) { + err = dsl_dataset_own_obj(dp, + snap->ds->ds_dir->dd_phys->dd_origin_obj, + 0, FTAG, &pa.origin_origin); + if (err != 0) + goto out; + } + +out: + rw_exit(&dp->dp_config_rwlock); + /* * Add in 128x the snapnames zapobj size, since we will be moving * a bunch of snapnames to the promoted ds, and dirtying their * bonus buffers. 
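snaplist_make()/snaplist_destroy() above follow a strict hold/release protocol: every node appended to the list owns exactly one hold, so teardown can always release precisely what was taken, even for a partially built list. The skeleton of that pattern, with malloc/free standing in for dataset holds (error handling and the own-versus-hold distinction elided):

	#include <stdlib.h>

	typedef struct node {
		struct node *prev;	/* analogue of ds_prev_snap_obj */
		int id;
	} node_t;

	typedef struct item {
		struct item *next;
		node_t *held;		/* the "hold" this entry owns */
	} item_t;

	/* Walk back from 'last' until 'first' (exclusive), taking one
	 * hold per list entry; sketch assumes allocation succeeds. */
	static item_t *
	list_make(node_t *last, node_t *first)
	{
		item_t *head = NULL, *tail = NULL;
		for (node_t *n = last; n != first; n = n->prev) {
			item_t *it = malloc(sizeof (*it));
			it->held = n;	/* take the hold */
			it->next = NULL;
			if (tail == NULL)
				head = tail = it;
			else
				tail = tail->next = it;
		}
		return (head);	/* head is 'last', as in snaplist_make() */
	}

	static void
	list_destroy(item_t *head)
	{
		while (head != NULL) {
			item_t *next = head->next;
			/* releasing the hold would go here */
			free(head);
			head = next;
		}
	}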
*/ - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_promote_check, - dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + if (err == 0) { + err = dsl_sync_task_do(dp, dsl_dataset_promote_check, + dsl_dataset_promote_sync, ds, &pa, + 2 + 2 * doi.doi_physical_blks); + } + + snaplist_destroy(&pa.shared_snaps, B_TRUE); + snaplist_destroy(&pa.clone_snaps, B_FALSE); + snaplist_destroy(&pa.origin_snaps, B_FALSE); + if (pa.origin_origin) + dsl_dataset_disown(pa.origin_origin, FTAG); + dsl_dataset_rele(ds, FTAG); return (err); } +struct cloneswaparg { + dsl_dataset_t *cds; /* clone dataset */ + dsl_dataset_t *ohds; /* origin's head dataset */ + boolean_t force; + int64_t unused_refres_delta; /* change in unconsumed refreservation */ +}; + +/* ARGSUSED */ +static int +dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + struct cloneswaparg *csa = arg1; + + /* they should both be heads */ + if (dsl_dataset_is_snapshot(csa->cds) || + dsl_dataset_is_snapshot(csa->ohds)) + return (EINVAL); + + /* the branch point should be just before them */ + if (csa->cds->ds_prev != csa->ohds->ds_prev) + return (EINVAL); + + /* cds should be the clone */ + if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj != + csa->ohds->ds_object) + return (EINVAL); + + /* the clone should be a child of the origin */ + if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) + return (EINVAL); + + /* ohds shouldn't be modified unless 'force' */ + if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) + return (ETXTBSY); + + /* adjust amount of any unconsumed refreservation */ + csa->unused_refres_delta = + (int64_t)MIN(csa->ohds->ds_reserved, + csa->ohds->ds_phys->ds_unique_bytes) - + (int64_t)MIN(csa->ohds->ds_reserved, + csa->cds->ds_phys->ds_unique_bytes); + + if (csa->unused_refres_delta > 0 && + csa->unused_refres_delta > + dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + struct cloneswaparg *csa = arg1; + dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; + + ASSERT(csa->cds->ds_reserved == 0); + ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota); + + dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); + dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); + dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx); + + if (csa->cds->ds_user_ptr != NULL) { + csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr); + csa->cds->ds_user_ptr = NULL; + } + + if (csa->ohds->ds_user_ptr != NULL) { + csa->ohds->ds_user_evict_func(csa->ohds, + csa->ohds->ds_user_ptr); + csa->ohds->ds_user_ptr = NULL; + } + + /* reset origin's unique bytes */ + VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, + csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &csa->cds->ds_prev->ds_phys->ds_unique_bytes)); + + /* swap blkptrs */ + { + blkptr_t tmp; + tmp = csa->ohds->ds_phys->ds_bp; + csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; + csa->cds->ds_phys->ds_bp = tmp; + } + + /* set dd_*_bytes */ + { + int64_t dused, dcomp, duncomp; + uint64_t cdl_used, cdl_comp, cdl_uncomp; + uint64_t odl_used, odl_comp, odl_uncomp; + + ASSERT3U(csa->cds->ds_dir->dd_phys-> + dd_used_breakdown[DD_USED_SNAP], ==, 0); + + VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used, + &cdl_comp, &cdl_uncomp)); + VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used, + &odl_comp, &odl_uncomp)); + + dused = 
csa->cds->ds_phys->ds_used_bytes + cdl_used - + (csa->ohds->ds_phys->ds_used_bytes + odl_used); + dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - + (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); + duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + + cdl_uncomp - + (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); + + dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, + dused, dcomp, duncomp, tx); + dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, + -dused, -dcomp, -duncomp, tx); + + /* + * The difference in the space used by snapshots is the + * difference in snapshot space due to the head's + * deadlist (since that's the only thing that's + * changing that affects the snapused). + */ + VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, + csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used)); + VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist, + csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used)); + dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, + DD_USED_HEAD, DD_USED_SNAP, tx); + } + +#define SWITCH64(x, y) \ + { \ + uint64_t __tmp = (x); \ + (x) = (y); \ + (y) = __tmp; \ + } + + /* swap ds_*_bytes */ + SWITCH64(csa->ohds->ds_phys->ds_used_bytes, + csa->cds->ds_phys->ds_used_bytes); + SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, + csa->cds->ds_phys->ds_compressed_bytes); + SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, + csa->cds->ds_phys->ds_uncompressed_bytes); + SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, + csa->cds->ds_phys->ds_unique_bytes); + + /* apply any parent delta for change in unconsumed refreservation */ + dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, + csa->unused_refres_delta, 0, 0, tx); + + /* swap deadlists */ + bplist_close(&csa->cds->ds_deadlist); + bplist_close(&csa->ohds->ds_deadlist); + SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, + csa->cds->ds_phys->ds_deadlist_obj); + VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, + csa->cds->ds_phys->ds_deadlist_obj)); + VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, + csa->ohds->ds_phys->ds_deadlist_obj)); +} + +/* + * Swap 'clone' with its origin head file system. Used at the end + * of "online recv" to swizzle the file system to the new version. + */ +int +dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, + boolean_t force) +{ + struct cloneswaparg csa; + int error; + + ASSERT(clone->ds_owner); + ASSERT(origin_head->ds_owner); +retry: + /* Need exclusive access for the swap */ + rw_enter(&clone->ds_rwlock, RW_WRITER); + if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { + rw_exit(&clone->ds_rwlock); + rw_enter(&origin_head->ds_rwlock, RW_WRITER); + if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { + rw_exit(&origin_head->ds_rwlock); + goto retry; + } + } + csa.cds = clone; + csa.ohds = origin_head; + csa.force = force; + error = dsl_sync_task_do(clone->ds_dir->dd_pool, + dsl_dataset_clone_swap_check, + dsl_dataset_clone_swap_sync, &csa, NULL, 9); + return (error); +} + /* * Given a pool name and a dataset object number in that pool, * return the name of that dataset. 
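dsl_dataset_clone_swap() above must take both ds_rwlock's as writer with no natural lock ordering, so it uses a block/try/back-off loop to avoid deadlocking against a thread acquiring the same pair in the opposite order. The same pattern, sketched with POSIX rwlocks:

	#include <pthread.h>

	/*
	 * Acquire both rwlocks as writer without a global lock order:
	 * block on one, try the other, and on failure drop everything
	 * and retry with the roles swapped (mirrors the retry: loop in
	 * dsl_dataset_clone_swap()).
	 */
	static void
	lock_both(pthread_rwlock_t *a, pthread_rwlock_t *b)
	{
		for (;;) {
			pthread_rwlock_wrlock(a);
			if (pthread_rwlock_trywrlock(b) == 0)
				return;
			pthread_rwlock_unlock(a);
			pthread_rwlock_wrlock(b);
			if (pthread_rwlock_trywrlock(a) == 0)
				return;
			pthread_rwlock_unlock(b);
		}
	}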
@@ -2013,23 +2887,220 @@ dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) { spa_t *spa; dsl_pool_t *dp; - dsl_dataset_t *ds = NULL; + dsl_dataset_t *ds; int error; if ((error = spa_open(pname, &spa, FTAG)) != 0) return (error); dp = spa_get_dsl(spa); rw_enter(&dp->dp_config_rwlock, RW_READER); - if ((error = dsl_dataset_open_obj(dp, obj, - NULL, DS_MODE_NONE, FTAG, &ds)) != 0) { - rw_exit(&dp->dp_config_rwlock); - spa_close(spa, FTAG); - return (error); + if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { + dsl_dataset_name(ds, buf); + dsl_dataset_rele(ds, FTAG); } - dsl_dataset_name(ds, buf); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); rw_exit(&dp->dp_config_rwlock); spa_close(spa, FTAG); + return (error); +} + +int +dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) +{ + int error = 0; + + ASSERT3S(asize, >, 0); + + /* + * *ref_rsrv is the portion of asize that will come from any + * unconsumed refreservation space. + */ + *ref_rsrv = 0; + + mutex_enter(&ds->ds_lock); + /* + * Make a space adjustment for reserved bytes. + */ + if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { + ASSERT3U(*used, >=, + ds->ds_reserved - ds->ds_phys->ds_unique_bytes); + *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); + *ref_rsrv = + asize - MIN(asize, parent_delta(ds, asize + inflight)); + } + + if (!check_quota || ds->ds_quota == 0) { + mutex_exit(&ds->ds_lock); + return (0); + } + /* + * If they are requesting more space, and our current estimate + * is over quota, they get to try again unless the actual + * on-disk is over quota and there are no pending changes (which + * may free up space for us). + */ + if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { + if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) + error = ERESTART; + else + error = EDQUOT; + } + mutex_exit(&ds->ds_lock); + + return (error); +} + +/* ARGSUSED */ +static int +dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *quotap = arg2; + uint64_t new_quota = *quotap; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) + return (ENOTSUP); + + if (new_quota == 0) + return (0); + + if (new_quota < ds->ds_phys->ds_used_bytes || + new_quota < ds->ds_reserved) + return (ENOSPC); + return (0); } + +/* ARGSUSED */ +void +dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *quotap = arg2; + uint64_t new_quota = *quotap; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + ds->ds_quota = new_quota; + + dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx); + + spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa, + tx, cr, "%lld dataset = %llu ", + (longlong_t)new_quota, ds->ds_object); +} + +int +dsl_dataset_set_quota(const char *dsname, uint64_t quota) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + if (quota != ds->ds_quota) { + /* + * If someone removes a file, then tries to set the quota, we + * want to make sure the file freeing takes effect. 
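The heart of dsl_dataset_check_quota() above is the EDQUOT-versus-ERESTART decision: fail hard only when the on-disk figure alone exceeds the quota and nothing in flight could free space; otherwise ask the caller to retry after the pending txg syncs. That decision in isolation (ERESTART is a kernel-private errno, so a placeholder is defined for this sketch):

	#include <errno.h>
	#include <stdint.h>

	#ifndef ERESTART
	#define	ERESTART	(-1)	/* placeholder for the kernel errno */
	#endif

	/* Returns 0, ERESTART (retry after sync), or EDQUOT (hard stop). */
	static int
	check_quota(uint64_t used_on_disk, uint64_t inflight, uint64_t quota)
	{
		if (quota == 0 || used_on_disk + inflight < quota)
			return (0);
		/*
		 * Estimate is over quota; only fail hard if the on-disk
		 * number alone is over and no pending changes can help.
		 */
		if (inflight > 0 || used_on_disk < quota)
			return (ERESTART);
		return (EDQUOT);
	}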
+ */ + txg_wait_open(ds->ds_dir->dd_pool, 0); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, + ds, &quota, 0); + } + dsl_dataset_rele(ds, FTAG); + return (err); +} + +static int +dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *reservationp = arg2; + uint64_t new_reservation = *reservationp; + int64_t delta; + uint64_t unique; + + if (new_reservation > INT64_MAX) + return (EOVERFLOW); + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < + SPA_VERSION_REFRESERVATION) + return (ENOTSUP); + + if (dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + /* + * If we are doing the preliminary check in open context, the + * space estimates may be inaccurate. + */ + if (!dmu_tx_is_syncing(tx)) + return (0); + + mutex_enter(&ds->ds_lock); + unique = dsl_dataset_unique(ds); + delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved); + mutex_exit(&ds->ds_lock); + + if (delta > 0 && + delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); + if (delta > 0 && ds->ds_quota > 0 && + new_reservation > ds->ds_quota) + return (ENOSPC); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, + dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *reservationp = arg2; + uint64_t new_reservation = *reservationp; + uint64_t unique; + int64_t delta; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + mutex_enter(&ds->ds_dir->dd_lock); + mutex_enter(&ds->ds_lock); + unique = dsl_dataset_unique(ds); + delta = MAX(0, (int64_t)(new_reservation - unique)) - + MAX(0, (int64_t)(ds->ds_reserved - unique)); + ds->ds_reserved = new_reservation; + mutex_exit(&ds->ds_lock); + + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); + mutex_exit(&ds->ds_dir->dd_lock); + dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation", + new_reservation, cr, tx); + + spa_history_internal_log(LOG_DS_REFRESERV, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", + (longlong_t)new_reservation, ds->ds_object); +} + +int +dsl_dataset_set_reservation(const char *dsname, uint64_t reservation) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_set_reservation_check, + dsl_dataset_set_reservation_sync, ds, &reservation, 0); + dsl_dataset_rele(ds, FTAG); + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c new file mode 100644 index 000000000000..2ce16fe20e12 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c @@ -0,0 +1,735 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * DSL permissions are stored in a two level zap attribute + * mechanism. The first level identifies the "class" of + * entry. The class is identified by the first 2 letters of + * the attribute. The second letter "l" or "d" identifies whether + * it is a local or descendent permission. The first letter + * identifies the type of entry. + * + * ul$<id> identifies permissions granted locally for this userid. + * ud$<id> identifies permissions granted on descendent datasets for + * this userid. + * Ul$<id> identifies permission sets granted locally for this userid. + * Ud$<id> identifies permission sets granted on descendent datasets for + * this userid. + * gl$<id> identifies permissions granted locally for this groupid. + * gd$<id> identifies permissions granted on descendent datasets for + * this groupid. + * Gl$<id> identifies permission sets granted locally for this groupid. + * Gd$<id> identifies permission sets granted on descendent datasets for + * this groupid. + * el$ identifies permissions granted locally for everyone. + * ed$ identifies permissions granted on descendent datasets + * for everyone. + * El$ identifies permission sets granted locally for everyone. + * Ed$ identifies permission sets granted to descendent datasets for + * everyone. + * c-$ identifies permission to create at dataset creation time. + * C-$ identifies permission sets to grant locally at dataset creation + * time. + * s-$@<name> permissions defined in specified set @<name> + * S-$@<name> Sets defined in named set @<name> + * + * Each of the above entities points to another zap attribute that contains one + * attribute for each allowed permission, such as create, destroy,... + * All of the "upper" case class types will specify permission set names + * rather than permissions. + * + * Basically it looks something like this: + * ul$12 -> ZAP OBJ -> permissions... + * + * The ZAP OBJ is referred to as the jump object. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zio_checksum.h> /* for the default checksum value */ +#include <sys/zap.h> +#include <sys/fs/zfs.h> +#include <sys/cred.h> +#include <sys/sunddi.h> + +#include "zfs_deleg.h" + +/* + * Validate that user is allowed to delegate specified permissions. + * + * In order to delegate "create" you must have "create" + * and "allow". 
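Given the naming scheme above, a whokey is just the class letter, the scope letter, '$', and an id. A hypothetical encoder for the user/group cases (the real zfs_deleg_whokey() also covers the everyone, create, and named-set entries):

	#include <stdio.h>
	#include <stdint.h>

	/*
	 * Build a delegation whokey such as "ul$12" (user 12, local) or
	 * "gd$100" (group 100, descendent). Illustrative only: this
	 * mirrors the naming scheme documented above, not the exact
	 * zfs_deleg_whokey() implementation.
	 */
	static void
	make_whokey(char *buf, size_t len, char type, char scope, uint64_t id)
	{
		(void) snprintf(buf, len, "%c%c$%llu", type, scope,
		    (unsigned long long)id);
	}

	/* make_whokey(buf, sizeof (buf), 'u', 'l', 12) yields "ul$12" */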
+ */ +int +dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr) +{ + nvpair_t *whopair = NULL; + int error; + + if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) + return (error); + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + nvlist_t *perms; + nvpair_t *permpair = NULL; + + VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); + + while (permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + + if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0) + return (EPERM); + + if ((error = dsl_deleg_access(ddname, perm, cr)) != 0) + return (error); + } + } + return (0); +} + +/* + * Validate that user is allowed to unallow specified permissions. They + * must have the 'allow' permission, and even then can only unallow + * perms for their uid. + */ +int +dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) +{ + nvpair_t *whopair = NULL; + int error; + char idstr[32]; + + if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) + return (error); + + (void) snprintf(idstr, sizeof (idstr), "%lld", + (longlong_t)crgetuid(cr)); + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + zfs_deleg_who_type_t type = nvpair_name(whopair)[0]; + + if (type != ZFS_DELEG_USER && + type != ZFS_DELEG_USER_SETS) + return (EPERM); + + if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0) + return (EPERM); + } + return (0); +} + +static void +dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + nvlist_t *nvp = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + nvpair_t *whopair = NULL; + uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + + if (zapobj == 0) { + dmu_buf_will_dirty(dd->dd_dbuf, tx); + zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, + DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + } + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + const char *whokey = nvpair_name(whopair); + nvlist_t *perms; + nvpair_t *permpair = NULL; + uint64_t jumpobj; + + VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); + + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { + jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, + DMU_OT_NONE, 0, tx); + VERIFY(zap_update(mos, zapobj, + whokey, 8, 1, &jumpobj, tx) == 0); + } + + while (permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + uint64_t n = 0; + + VERIFY(zap_update(mos, jumpobj, + perm, 8, 1, &n, tx) == 0); + spa_history_internal_log(LOG_DS_PERM_UPDATE, + dd->dd_pool->dp_spa, tx, cr, + "%s %s dataset = %llu", whokey, perm, + dd->dd_phys->dd_head_dataset_obj); + } + } +} + +static void +dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + nvlist_t *nvp = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + nvpair_t *whopair = NULL; + uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + + if (zapobj == 0) + return; + + while (whopair = nvlist_next_nvpair(nvp, whopair)) { + const char *whokey = nvpair_name(whopair); + nvlist_t *perms; + nvpair_t *permpair = NULL; + uint64_t jumpobj; + + if (nvpair_value_nvlist(whopair, &perms) != 0) { + if (zap_lookup(mos, zapobj, whokey, 8, + 1, &jumpobj) == 0) { + (void) zap_remove(mos, zapobj, whokey, tx); + VERIFY(0 == zap_destroy(mos, jumpobj, tx)); + } + spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE, + dd->dd_pool->dp_spa, tx, cr, + "%s dataset = %llu", whokey, + dd->dd_phys->dd_head_dataset_obj); + continue; + } + + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) + continue; + + while 
(permpair = nvlist_next_nvpair(perms, permpair)) { + const char *perm = nvpair_name(permpair); + uint64_t n = 0; + + (void) zap_remove(mos, jumpobj, perm, tx); + if (zap_count(mos, jumpobj, &n) == 0 && n == 0) { + (void) zap_remove(mos, zapobj, + whokey, tx); + VERIFY(0 == zap_destroy(mos, + jumpobj, tx)); + } + spa_history_internal_log(LOG_DS_PERM_REMOVE, + dd->dd_pool->dp_spa, tx, cr, + "%s %s dataset = %llu", whokey, perm, + dd->dd_phys->dd_head_dataset_obj); + } + } +} + +int +dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) +{ + dsl_dir_t *dd; + int error; + nvpair_t *whopair = NULL; + int blocks_modified = 0; + + error = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (error) + return (error); + + if (spa_version(dmu_objset_spa(dd->dd_pool->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) { + dsl_dir_close(dd, FTAG); + return (ENOTSUP); + } + + while (whopair = nvlist_next_nvpair(nvp, whopair)) + blocks_modified++; + + error = dsl_sync_task_do(dd->dd_pool, NULL, + unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, + dd, nvp, blocks_modified); + dsl_dir_close(dd, FTAG); + + return (error); +} + +/* + * Find all 'allow' permissions from a given point and then continue + * traversing up to the root. + * + * This function constructs an nvlist of nvlists. + * each setpoint is an nvlist composed of an nvlist of an nvlist + * of the individual * users/groups/everyone/create + * permissions. + * + * The nvlist will look like this. + * + * { source fsname -> { whokeys { permissions,...}, ...}} + * + * The fsname nvpairs will be arranged in a bottom up order. For example, + * if we have the following structure a/b/c then the nvpairs for the fsnames + * will be ordered a/b/c, a/b, a. + */ +int +dsl_deleg_get(const char *ddname, nvlist_t **nvp) +{ + dsl_dir_t *dd, *startdd; + dsl_pool_t *dp; + int error; + objset_t *mos; + + error = dsl_dir_open(ddname, FTAG, &startdd, NULL); + if (error) + return (error); + + dp = startdd->dd_pool; + mos = dp->dp_meta_objset; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + rw_enter(&dp->dp_config_rwlock, RW_READER); + for (dd = startdd; dd != NULL; dd = dd->dd_parent) { + zap_cursor_t basezc; + zap_attribute_t baseza; + nvlist_t *sp_nvp; + uint64_t n; + char source[MAXNAMELEN]; + + if (dd->dd_phys->dd_deleg_zapobj && + (zap_count(mos, dd->dd_phys->dd_deleg_zapobj, + &n) == 0) && n) { + VERIFY(nvlist_alloc(&sp_nvp, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + } else { + continue; + } + + for (zap_cursor_init(&basezc, mos, + dd->dd_phys->dd_deleg_zapobj); + zap_cursor_retrieve(&basezc, &baseza) == 0; + zap_cursor_advance(&basezc)) { + zap_cursor_t zc; + zap_attribute_t za; + nvlist_t *perms_nvp; + + ASSERT(baseza.za_integer_length == 8); + ASSERT(baseza.za_num_integers == 1); + + VERIFY(nvlist_alloc(&perms_nvp, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + for (zap_cursor_init(&zc, mos, baseza.za_first_integer); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + VERIFY(nvlist_add_boolean(perms_nvp, + za.za_name) == 0); + } + zap_cursor_fini(&zc); + VERIFY(nvlist_add_nvlist(sp_nvp, baseza.za_name, + perms_nvp) == 0); + nvlist_free(perms_nvp); + } + + zap_cursor_fini(&basezc); + + dsl_dir_name(dd, source); + VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0); + nvlist_free(sp_nvp); + } + rw_exit(&dp->dp_config_rwlock); + + dsl_dir_close(startdd, FTAG); + return (0); +} + +/* + * Routines for dsl_deleg_access() -- access checking. 
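For example, if tank/a/b and tank/a both carry allows, the list returned by dsl_deleg_get() pairs each fsname with its whokey-to-permissions map, bottom-up. Assembling that shape by hand with the userland flavor of the libnvpair calls used above (error checks elided; the dataset names, whokeys, and permissions are illustrative):

	#include <libnvpair.h>

	/*
	 * Builds: { "tank/a/b" -> { "ul$12" -> { create, mount } },
	 *           "tank/a"   -> { "ed$"   -> { snapshot } } }
	 */
	static nvlist_t *
	example_perms(void)
	{
		nvlist_t *top, *sp, *perms;

		(void) nvlist_alloc(&top, NV_UNIQUE_NAME, 0);

		(void) nvlist_alloc(&perms, NV_UNIQUE_NAME, 0);
		(void) nvlist_add_boolean(perms, "create");
		(void) nvlist_add_boolean(perms, "mount");
		(void) nvlist_alloc(&sp, NV_UNIQUE_NAME, 0);
		(void) nvlist_add_nvlist(sp, "ul$12", perms);
		(void) nvlist_add_nvlist(top, "tank/a/b", sp);
		nvlist_free(perms);
		nvlist_free(sp);

		(void) nvlist_alloc(&perms, NV_UNIQUE_NAME, 0);
		(void) nvlist_add_boolean(perms, "snapshot");
		(void) nvlist_alloc(&sp, NV_UNIQUE_NAME, 0);
		(void) nvlist_add_nvlist(sp, "ed$", perms);
		(void) nvlist_add_nvlist(top, "tank/a", sp);
		nvlist_free(perms);
		nvlist_free(sp);

		return (top);
	}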
+ */ +typedef struct perm_set { + avl_node_t p_node; + boolean_t p_matched; + char p_setname[ZFS_MAX_DELEG_NAME]; +} perm_set_t; + +static int +perm_set_compare(const void *arg1, const void *arg2) +{ + const perm_set_t *node1 = arg1; + const perm_set_t *node2 = arg2; + int val; + + val = strcmp(node1->p_setname, node2->p_setname); + if (val == 0) + return (0); + return (val > 0 ? 1 : -1); +} + +/* + * Determine whether a specified permission exists. + * + * First the base attribute has to be retrieved. i.e. ul$12 + * Once the base object has been retrieved the actual permission + * is lookup up in the zap object the base object points to. + * + * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if + * there is no perm in that jumpobj. + */ +static int +dsl_check_access(objset_t *mos, uint64_t zapobj, + char type, char checkflag, void *valp, const char *perm) +{ + int error; + uint64_t jumpobj, zero; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, type, checkflag, valp); + error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); + if (error == 0) { + error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero); + if (error == ENOENT) + error = EPERM; + } + return (error); +} + +/* + * check a specified user/group for a requested permission + */ +static int +dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm, + int checkflag, cred_t *cr) +{ + const gid_t *gids; + int ngids; + int i; + uint64_t id; + + /* check for user */ + id = crgetuid(cr); + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_USER, checkflag, &id, perm) == 0) + return (0); + + /* check for users primary group */ + id = crgetgid(cr); + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) + return (0); + + /* check for everyone entry */ + id = -1; + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0) + return (0); + + /* check each supplemental group user is a member of */ + ngids = crgetngroups(cr); + gids = crgetgroups(cr); + for (i = 0; i != ngids; i++) { + id = gids[i]; + if (dsl_check_access(mos, zapobj, + ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) + return (0); + } + + return (EPERM); +} + +/* + * Iterate over the sets specified in the specified zapobj + * and load them into the permsets avl tree. + */ +static int +dsl_load_sets(objset_t *mos, uint64_t zapobj, + char type, char checkflag, void *valp, avl_tree_t *avl) +{ + zap_cursor_t zc; + zap_attribute_t za; + perm_set_t *permnode; + avl_index_t idx; + uint64_t jumpobj; + int error; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, type, checkflag, valp); + + error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); + if (error != 0) + return (error); + + for (zap_cursor_init(&zc, mos, jumpobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP); + (void) strlcpy(permnode->p_setname, za.za_name, + sizeof (permnode->p_setname)); + permnode->p_matched = B_FALSE; + + if (avl_find(avl, permnode, &idx) == NULL) { + avl_insert(avl, permnode, idx); + } else { + kmem_free(permnode, sizeof (perm_set_t)); + } + } + zap_cursor_fini(&zc); + return (0); +} + +/* + * Load all permissions user based on cred belongs to. 
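dsl_deleg_access() below expands permission sets to a fixpoint, since a set may name other sets. The shape of that loop, with callbacks standing in for the zap lookups (the real code also distinguishes a set that is absent at this zapobj from one that lacks the permission):

	#include <stdbool.h>

	typedef struct set {
		const char *name;
		bool matched;	/* already checked, no need to revisit */
	} set_t;

	/*
	 * Expand sets until either one grants 'perm' or no pass adds new
	 * sets. grants(set, perm) answers "does this set directly list
	 * perm?"; expand(set, sets, nsets) appends any sets named inside
	 * 'set' and returns how many were new.
	 */
	static bool
	sets_grant(set_t *sets, int *nsets, const char *perm,
	    bool (*grants)(const char *, const char *),
	    int (*expand)(const char *, set_t *, int *))
	{
		bool expanded;
		do {
			expanded = false;
			for (int i = 0; i < *nsets; i++) {
				if (sets[i].matched)
					continue;
				if (grants(sets[i].name, perm))
					return (true);
				sets[i].matched = true;
				if (expand(sets[i].name, sets, nsets) > 0)
					expanded = true;
			}
		} while (expanded);	/* new sets force another pass */
		return (false);
	}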
+ */ +static void +dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, + char checkflag, cred_t *cr) +{ + const gid_t *gids; + int ngids, i; + uint64_t id; + + id = crgetuid(cr); + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_USER_SETS, checkflag, &id, avl); + + id = crgetgid(cr); + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); + + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl); + + ngids = crgetngroups(cr); + gids = crgetgroups(cr); + for (i = 0; i != ngids; i++) { + id = gids[i]; + (void) dsl_load_sets(mos, zapobj, + ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); + } +} + +/* + * Check if user has requested permission. + */ +int +dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) +{ + dsl_dataset_t *ds; + dsl_dir_t *dd; + dsl_pool_t *dp; + void *cookie; + int error; + char checkflag = ZFS_DELEG_LOCAL; + objset_t *mos; + avl_tree_t permsets; + perm_set_t *setnode; + + error = dsl_dataset_hold(dsname, FTAG, &ds); + if (error) + return (error); + + dp = ds->ds_dir->dd_pool; + mos = dp->dp_meta_objset; + + if (dsl_delegation_on(mos) == B_FALSE) { + dsl_dataset_rele(ds, FTAG); + return (ECANCELED); + } + + if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) { + dsl_dataset_rele(ds, FTAG); + return (EPERM); + } + + avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), + offsetof(perm_set_t, p_node)); + + rw_enter(&dp->dp_config_rwlock, RW_READER); + for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, + checkflag = ZFS_DELEG_DESCENDENT) { + uint64_t zapobj; + boolean_t expanded; + + /* + * If not in global zone then make sure + * the zoned property is set + */ + if (!INGLOBALZONE(curthread)) { + uint64_t zoned; + + if (dsl_prop_get_dd(dd, + zfs_prop_to_name(ZFS_PROP_ZONED), + 8, 1, &zoned, NULL) != 0) + break; + if (!zoned) + break; + } + zapobj = dd->dd_phys->dd_deleg_zapobj; + + if (zapobj == 0) + continue; + + dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr); +again: + expanded = B_FALSE; + for (setnode = avl_first(&permsets); setnode; + setnode = AVL_NEXT(&permsets, setnode)) { + if (setnode->p_matched == B_TRUE) + continue; + + /* See if this set directly grants this permission */ + error = dsl_check_access(mos, zapobj, + ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm); + if (error == 0) + goto success; + if (error == EPERM) + setnode->p_matched = B_TRUE; + + /* See if this set includes other sets */ + error = dsl_load_sets(mos, zapobj, + ZFS_DELEG_NAMED_SET_SETS, 0, + setnode->p_setname, &permsets); + if (error == 0) + setnode->p_matched = expanded = B_TRUE; + } + /* + * If we expanded any sets, that will define more sets, + * which we need to check. + */ + if (expanded) + goto again; + + error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr); + if (error == 0) + goto success; + } + error = EPERM; +success: + rw_exit(&dp->dp_config_rwlock); + dsl_dataset_rele(ds, FTAG); + + cookie = NULL; + while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) + kmem_free(setnode, sizeof (perm_set_t)); + + return (error); +} + +/* + * Other routines. + */ + +static void +copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj, + boolean_t dosets, uint64_t uid, dmu_tx_t *tx) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t jumpobj, pjumpobj; + uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + zap_cursor_t zc; + zap_attribute_t za; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, + dosets ? 
ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE, + ZFS_DELEG_LOCAL, NULL); + if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0) + return; + + if (zapobj == 0) { + dmu_buf_will_dirty(dd->dd_dbuf, tx); + zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, + DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + } + + zfs_deleg_whokey(whokey, + dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER, + ZFS_DELEG_LOCAL, &uid); + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) { + jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0); + } + + for (zap_cursor_init(&zc, mos, pjumpobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t zero = 0; + ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); + + VERIFY(zap_update(mos, jumpobj, za.za_name, + 8, 1, &zero, tx) == 0); + } + zap_cursor_fini(&zc); +} + +/* + * set all create time permission on new dataset. + */ +void +dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr) +{ + dsl_dir_t *dd; + uint64_t uid = crgetuid(cr); + + if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) + return; + + for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) { + uint64_t pzapobj = dd->dd_phys->dd_deleg_zapobj; + + if (pzapobj == 0) + continue; + + copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx); + copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx); + } +} + +int +dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + + if (zapobj == 0) + return (0); + + for (zap_cursor_init(&zc, mos, zapobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); + VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx)); + } + zap_cursor_fini(&zc); + VERIFY(0 == zap_destroy(mos, zapobj, tx)); + return (0); +} + +boolean_t +dsl_delegation_on(objset_t *os) +{ + return (os->os->os_spa->spa_delegation); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c index 5e563b632909..48d87f97f669 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c @@ -19,26 +19,28 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> +#include <sys/dmu_objset.h> #include <sys/dmu_tx.h> #include <sys/dsl_dataset.h> #include <sys/dsl_dir.h> #include <sys/dsl_prop.h> #include <sys/dsl_synctask.h> +#include <sys/dsl_deleg.h> #include <sys/spa.h> #include <sys/zap.h> #include <sys/zio.h> #include <sys/arc.h> +#include <sys/sunddi.h> #include "zfs_namecheck.h" -static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd); -static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx); +static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); +static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, + cred_t *cr, dmu_tx_t *tx); /* ARGSUSED */ @@ -55,8 +57,6 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) ASSERT(dd->dd_space_towrite[t] == 0); } - ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes); - if (dd->dd_parent) dsl_dir_close(dd->dd_parent, dd); @@ -91,9 +91,9 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dmu_object_info_t doi; dmu_object_info_from_db(dbuf, &doi); ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); } #endif - /* XXX assert bonus buffer size is correct */ if (dd == NULL) { dsl_dir_t *winner; int err; @@ -103,7 +103,6 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_dbuf = dbuf; dd->dd_pool = dp; dd->dd_phys = dbuf->db_data; - dd->dd_used_bytes = dd->dd_phys->dd_used_bytes; mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), @@ -112,36 +111,25 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, if (dd->dd_phys->dd_parent_obj) { err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, NULL, dd, &dd->dd_parent); - if (err) { - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dmu_buf_rele(dbuf, tag); - return (err); - } + if (err) + goto errout; if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; err = zap_lookup(dp->dp_meta_objset, - dd->dd_parent->dd_phys-> - dd_child_dir_zapobj, + dd->dd_parent->dd_phys->dd_child_dir_zapobj, tail, sizeof (foundobj), 1, &foundobj); ASSERT(err || foundobj == ddobj); #endif (void) strcpy(dd->dd_myname, tail); } else { err = zap_value_search(dp->dp_meta_objset, - dd->dd_parent->dd_phys-> - dd_child_dir_zapobj, - ddobj, dd->dd_myname); - } - if (err) { - dsl_dir_close(dd->dd_parent, dd); - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dmu_buf_rele(dbuf, tag); - return (err); + dd->dd_parent->dd_phys->dd_child_dir_zapobj, + ddobj, 0, dd->dd_myname); } + if (err) + goto errout; } else { (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); } @@ -174,6 +162,15 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, ASSERT3P(dd->dd_dbuf, ==, dbuf); *ddp = dd; return (0); + +errout: + if (dd->dd_parent) + dsl_dir_close(dd->dd_parent, dd); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } void @@ -404,27 +401,37 @@ dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) } uint64_t -dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx) +dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, + dmu_tx_t *tx) { - objset_t *mos = pds->dd_pool->dp_meta_objset; + objset_t *mos = dp->dp_meta_objset; uint64_t ddobj; dsl_dir_phys_t *dsphys; dmu_buf_t *dbuf; ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); - VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, - name, sizeof (uint64_t), 1, 
&ddobj, tx)); + if (pds) { + VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, + name, sizeof (uint64_t), 1, &ddobj, tx)); + } else { + /* it's the root dir */ + VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); + } VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; dsphys->dd_creation_time = gethrestime_sec(); - dsphys->dd_parent_obj = pds->dd_object; + if (pds) + dsphys->dd_parent_obj = pds->dd_object; dsphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); dsphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); + if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) + dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); @@ -461,23 +468,27 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) } void -dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) +dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t val, obj; + dd_used_t t; ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); /* Remove our reservation. */ val = 0; - dsl_dir_set_reservation_sync(dd, &val, tx); - ASSERT3U(dd->dd_used_bytes, ==, 0); + dsl_dir_set_reservation_sync(dd, &val, cr, tx); + ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0); ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); + for (t = 0; t < DD_USED_NUM; t++) + ASSERT3U(dd->dd_phys->dd_used_breakdown[t], ==, 0); VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); + VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); VERIFY(0 == zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); @@ -486,65 +497,53 @@ dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) VERIFY(0 == dmu_object_free(mos, obj, tx)); } -void -dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx) +boolean_t +dsl_dir_is_clone(dsl_dir_t *dd) { - dsl_dir_phys_t *dsp; - dmu_buf_t *dbuf; - int error; - - *ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, - DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); - - error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, - sizeof (uint64_t), 1, ddobjp, tx); - ASSERT3U(error, ==, 0); - - VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - dsp = dbuf->db_data; - - dsp->dd_creation_time = gethrestime_sec(); - dsp->dd_props_zapobj = zap_create(mos, - DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); - dsp->dd_child_dir_zapobj = zap_create(mos, - DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); - - dmu_buf_rele(dbuf, FTAG); + return (dd->dd_phys->dd_origin_obj && + (dd->dd_pool->dp_origin_snap == NULL || + dd->dd_phys->dd_origin_obj != + dd->dd_pool->dp_origin_snap->ds_object)); } void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, - dsl_dir_space_available(dd, NULL, 0, TRUE)); - mutex_enter(&dd->dd_lock); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_used_bytes); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, - dd->dd_phys->dd_quota); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, + dd->dd_phys->dd_used_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, 
dd->dd_phys->dd_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, dd->dd_phys->dd_compressed_bytes == 0 ? 100 : (dd->dd_phys->dd_uncompressed_bytes * 100 / dd->dd_phys->dd_compressed_bytes)); + if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, + dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, + dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, + dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, + dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] + + dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]); + } mutex_exit(&dd->dd_lock); - if (dd->dd_phys->dd_clone_parent_obj) { + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + if (dsl_dir_is_clone(dd)) { dsl_dataset_t *ds; char buf[MAXNAMELEN]; - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_clone_parent_obj, - NULL, DS_MODE_NONE, FTAG, &ds)); + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); - rw_exit(&dd->dd_pool->dp_config_rwlock); - + dsl_dataset_rele(ds, FTAG); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); } + rw_exit(&dd->dd_pool->dp_config_rwlock); } void @@ -580,7 +579,6 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; - dd->dd_phys->dd_used_bytes = dd->dd_used_bytes; mutex_exit(&dd->dd_lock); /* release the hold from dsl_dir_dirty */ @@ -588,15 +586,13 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) } static uint64_t -dsl_dir_estimated_space(dsl_dir_t *dd) +dsl_dir_space_towrite(dsl_dir_t *dd) { - int64_t space; + uint64_t space = 0; int i; ASSERT(MUTEX_HELD(&dd->dd_lock)); - space = dd->dd_phys->dd_used_bytes; - ASSERT(space >= 0); for (i = 0; i < TXG_SIZE; i++) { space += dd->dd_space_towrite[i&TXG_MASK]; ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); @@ -630,13 +626,9 @@ dsl_dir_space_available(dsl_dir_t *dd, mutex_enter(&dd->dd_lock); if (dd->dd_phys->dd_quota != 0) quota = dd->dd_phys->dd_quota; - if (ondiskonly) { - used = dd->dd_used_bytes; - } else { - used = dsl_dir_estimated_space(dd); - } - if (dd == ancestor) - used += delta; + used = dd->dd_phys->dd_used_bytes; + if (!ondiskonly) + used += dsl_dir_space_towrite(dd); if (dd->dd_parent == NULL) { uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); @@ -651,6 +643,14 @@ dsl_dir_space_available(dsl_dir_t *dd, parentspace += dd->dd_phys->dd_reserved - used; } + if (dd == ancestor) { + ASSERT(delta <= 0); + ASSERT(used >= -delta); + used += delta; + if (parentspace != UINT64_MAX) + parentspace -= delta; + } + if (used > quota) { /* over quota */ myspace = 0; @@ -678,50 +678,68 @@ dsl_dir_space_available(dsl_dir_t *dd, struct tempreserve { list_node_t tr_node; + dsl_pool_t *tr_dp; dsl_dir_t *tr_ds; uint64_t tr_size; }; -/* - * Reserve space in this dsl_dir, to be used in this tx's txg. - * After the space has been dirtied (and thus - * dsl_dir_willuse_space() has been called), the reservation should - * be canceled, using dsl_dir_tempreserve_clear(). 
- */ static int -dsl_dir_tempreserve_impl(dsl_dir_t *dd, - uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx) +dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, + boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, + dmu_tx_t *tx, boolean_t first) { uint64_t txg = tx->tx_txg; - uint64_t est_used, quota, parent_rsrv; - int edquot = EDQUOT; + uint64_t est_inflight, used_on_disk, quota, parent_rsrv; + struct tempreserve *tr; + int enospc = EDQUOT; int txgidx = txg & TXG_MASK; int i; - struct tempreserve *tr; + uint64_t ref_rsrv = 0; ASSERT3U(txg, !=, 0); - ASSERT3S(asize, >=, 0); + ASSERT3S(asize, >, 0); mutex_enter(&dd->dd_lock); + /* * Check against the dsl_dir's quota. We don't add in the delta * when checking for over-quota because they get one free hit. */ - est_used = dsl_dir_estimated_space(dd); + est_inflight = dsl_dir_space_towrite(dd); for (i = 0; i < TXG_SIZE; i++) - est_used += dd->dd_tempreserved[i]; + est_inflight += dd->dd_tempreserved[i]; + used_on_disk = dd->dd_phys->dd_used_bytes; - quota = UINT64_MAX; + /* + * On the first iteration, fetch the dataset's used-on-disk and + * refreservation values. Also, if checkrefquota is set, test if + * allocating this space would exceed the dataset's refquota. + */ + if (first && tx->tx_objset) { + int error; + dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset; + + error = dsl_dataset_check_quota(ds, checkrefquota, + asize, est_inflight, &used_on_disk, &ref_rsrv); + if (error) { + mutex_exit(&dd->dd_lock); + return (error); + } + } - if (dd->dd_phys->dd_quota) + /* + * If this transaction will result in a net free of space, + * we want to let it through. + */ + if (ignorequota || netfree || dd->dd_phys->dd_quota == 0) + quota = UINT64_MAX; + else quota = dd->dd_phys->dd_quota; /* - * If this transaction will result in a net free of space, we want - * to let it through, but we have to be careful: the space that it - * frees won't become available until *after* this txg syncs. - * Therefore, to ensure that it's possible to remove files from - * a full pool without inducing transient overcommits, we throttle + * Adjust the quota against the actual pool size at the root. + * To ensure that it's possible to remove files from a full + * pool without inducing transient overcommits, we throttle * netfree transactions against a quota that is slightly larger, * but still within the pool's allocation slop. In cases where * we're very close to full, this will allow a steady trickle of @@ -731,47 +749,45 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); if (poolsize < quota) { quota = poolsize; - edquot = ENOSPC; + enospc = ENOSPC; } - } else if (netfree) { - quota = UINT64_MAX; } /* * If they are requesting more space, and our current estimate - * is over quota. They get to try again unless the actual + * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes (which * may free up space for us). 
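A toy stand-in for the pool-size adjustment described above; the 1/64 slop fraction below is an assumption for illustration only, the real figure lives in dsl_pool_adjustedsize():

	#include <stdint.h>
	#include <stdbool.h>

	/*
	 * Reserve some slop off the top of the pool, but let net-free
	 * transactions use part of that slop so a full pool can still
	 * delete its way back to health.
	 */
	static uint64_t
	adjusted_size(uint64_t poolsize, bool netfree)
	{
		uint64_t slop = poolsize >> 6;	/* assumed 1/64 slop */
		return (poolsize - (netfree ? slop / 2 : slop));
	}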
*/ - if (asize > 0 && est_used > quota) { - if (dd->dd_space_towrite[txg & TXG_MASK] != 0 || - dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 || - dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 || - dd->dd_used_bytes < quota) - edquot = ERESTART; - dprintf_dd(dd, "failing: used=%lluK est_used = %lluK " + if (used_on_disk + est_inflight > quota) { + if (est_inflight > 0 || used_on_disk < quota) + enospc = ERESTART; + dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK err=%d\n", - dd->dd_used_bytes>>10, est_used>>10, - quota>>10, asize>>10, edquot); + used_on_disk>>10, est_inflight>>10, + quota>>10, asize>>10, enospc); mutex_exit(&dd->dd_lock); - return (edquot); + return (enospc); } /* We need to up our estimated delta before dropping dd_lock */ dd->dd_tempreserved[txgidx] += asize; - parent_rsrv = parent_delta(dd, est_used, asize); + parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, + asize - ref_rsrv); mutex_exit(&dd->dd_lock); - tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); + tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_ds = dd; tr->tr_size = asize; list_insert_tail(tr_list, tr); /* see if it's OK with our parent */ if (dd->dd_parent && parent_rsrv) { + boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0); + return (dsl_dir_tempreserve_impl(dd->dd_parent, - parent_rsrv, netfree, tr_list, tx)); + parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); } else { return (0); } @@ -779,42 +795,62 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, /* * Reserve space in this dsl_dir, to be used in this tx's txg. - * After the space has been dirtied (and thus - * dsl_dir_willuse_space() has been called), the reservation should - * be canceled, using dsl_dir_tempreserve_clear(). + * After the space has been dirtied (and dsl_dir_willuse_space() + * has been called), the reservation should be canceled, using + * dsl_dir_tempreserve_clear(). 
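The tr_list protocol used by the functions below is reservation-with-undo: record every reservation taken while walking up the parent chain, and on any failure unwind them all. A compact sketch of the idea (the real code also shrinks the amount via parent_delta() as it ascends):

	#include <stdlib.h>

	typedef struct dir {
		struct dir *parent;
		unsigned long long quota, reserved;	/* bytes */
	} dir_t;

	typedef struct resv {
		struct resv *next;
		dir_t *dir;
		unsigned long long size;
	} resv_t;

	/*
	 * Reserve 'size' bytes in 'd' and every ancestor, recording each
	 * reservation so it can be undone; on failure the caller runs
	 * clear(), exactly as dsl_dir_tempreserve_clear() is used.
	 */
	static int
	reserve(dir_t *d, unsigned long long size, resv_t **list)
	{
		for (; d != NULL; d = d->parent) {
			if (d->quota && d->reserved + size > d->quota)
				return (-1);
			resv_t *r = calloc(1, sizeof (*r));
			if (r == NULL)
				return (-1);
			d->reserved += size;
			r->dir = d;
			r->size = size;
			r->next = *list;
			*list = r;
		}
		return (0);
	}

	static void
	clear(resv_t **list)
	{
		while (*list != NULL) {
			resv_t *r = *list;
			*list = r->next;
			r->dir->reserved -= r->size;
			free(r);
		}
	}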
*/ int -dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, - uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx) +dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, + uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx) { - int err = 0; + int err; list_t *tr_list; + if (asize == 0) { + *tr_cookiep = NULL; + return (0); + } + tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); - ASSERT3S(asize, >=, 0); + ASSERT3S(asize, >, 0); ASSERT3S(fsize, >=, 0); - err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, - tr_list, tx); - + err = arc_tempreserve_space(lsize, tx->tx_txg); if (err == 0) { struct tempreserve *tr; - err = arc_tempreserve_space(lsize); - if (err == 0) { - tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); - tr->tr_ds = NULL; - tr->tr_size = lsize; - list_insert_tail(tr_list, tr); + tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_size = lsize; + list_insert_tail(tr_list, tr); + + err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx); + } else { + if (err == EAGAIN) { + txg_delay(dd->dd_pool, tx->tx_txg, 1); + err = ERESTART; } + dsl_pool_memory_pressure(dd->dd_pool); + } + + if (err == 0) { + struct tempreserve *tr; + + tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_dp = dd->dd_pool; + tr->tr_size = asize; + list_insert_tail(tr_list, tr); + + err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, + FALSE, asize > usize, tr_list, tx, TRUE); } if (err) dsl_dir_tempreserve_clear(tr_list, tx); else *tr_cookiep = tr_list; + return (err); } @@ -831,15 +867,20 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) ASSERT3U(tx->tx_txg, !=, 0); + if (tr_cookie == NULL) + return; + while (tr = list_head(tr_list)) { - if (tr->tr_ds == NULL) { - arc_tempreserve_clear(tr->tr_size); - } else { + if (tr->tr_dp) { + dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx); + } else if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, tr->tr_size); tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; mutex_exit(&tr->tr_ds->dd_lock); + } else { + arc_tempreserve_clear(tr->tr_size); } list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); @@ -848,13 +889,8 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) kmem_free(tr_list, sizeof (list_t)); } -/* - * Call in open context when we think we're going to write/free space, - * eg. when dirtying data. Be conservative (ie. OK to write less than - * this or free more than this, but don't write more or free less). - */ -void -dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) +static void +dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) { int64_t parent_space; uint64_t est_used; @@ -863,7 +899,7 @@ dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) if (space > 0) dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; - est_used = dsl_dir_estimated_space(dd); + est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes; parent_space = parent_delta(dd, est_used, space); mutex_exit(&dd->dd_lock); @@ -872,39 +908,96 @@ dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) /* XXX this is potentially expensive and unnecessary... 
*/ if (parent_space && dd->dd_parent) - dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); + dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx); +} + +/* + * Call in open context when we think we're going to write/free space, + * eg. when dirtying data. Be conservative (ie. OK to write less than + * this or free more than this, but don't write more or free less). + */ +void +dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) +{ + dsl_pool_willuse_space(dd->dd_pool, space, tx); + dsl_dir_willuse_space_impl(dd, space, tx); } /* call from syncing context when we actually write/free space for this dd */ void -dsl_dir_diduse_space(dsl_dir_t *dd, +dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) { int64_t accounted_delta; + boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(type < DD_USED_NUM); dsl_dir_dirty(dd, tx); - mutex_enter(&dd->dd_lock); - accounted_delta = parent_delta(dd, dd->dd_used_bytes, used); - ASSERT(used >= 0 || dd->dd_used_bytes >= -used); + if (needlock) + mutex_enter(&dd->dd_lock); + accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used); + ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used); ASSERT(compressed >= 0 || dd->dd_phys->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); - dd->dd_used_bytes += used; + dd->dd_phys->dd_used_bytes += used; dd->dd_phys->dd_uncompressed_bytes += uncompressed; dd->dd_phys->dd_compressed_bytes += compressed; - mutex_exit(&dd->dd_lock); + + if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + ASSERT(used > 0 || + dd->dd_phys->dd_used_breakdown[type] >= -used); + dd->dd_phys->dd_used_breakdown[type] += used; +#ifdef DEBUG + dd_used_t t; + uint64_t u = 0; + for (t = 0; t < DD_USED_NUM; t++) + u += dd->dd_phys->dd_used_breakdown[t]; + ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes); +#endif + } + if (needlock) + mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { - dsl_dir_diduse_space(dd->dd_parent, + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, accounted_delta, compressed, uncompressed, tx); + dsl_dir_transfer_space(dd->dd_parent, + used - accounted_delta, + DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); } } -/* ARGSUSED */ +void +dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) +{ + boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(oldtype < DD_USED_NUM); + ASSERT(newtype < DD_USED_NUM); + + if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN)) + return; + + dsl_dir_dirty(dd, tx); + if (needlock) + mutex_enter(&dd->dd_lock); + ASSERT(delta > 0 ? + dd->dd_phys->dd_used_breakdown[oldtype] >= delta : + dd->dd_phys->dd_used_breakdown[newtype] >= -delta); + ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta)); + dd->dd_phys->dd_used_breakdown[oldtype] -= delta; + dd->dd_phys->dd_used_breakdown[newtype] += delta; + if (needlock) + mutex_exit(&dd->dd_lock); +} + static int dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -921,22 +1014,22 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) /* * If we are doing the preliminary check in open context, and * there are pending changes, then don't fail it, since the - * pending changes could under-estimat the amount of space to be + * pending changes could under-estimate the amount of space to be * freed up. 
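
The pre-check spelled out just below reads more easily out of kernel dress. As a standalone sketch with invented names: in open context a nonzero towrite (pending frees or writes) gives the new quota the benefit of the doubt; otherwise the quota must cover both the reservation and used-plus-dirty space.

	/* Illustrative stand-in only, not the kernel interfaces. */
	#include <errno.h>
	#include <stdint.h>

	static int
	quota_precheck(uint64_t new_quota, uint64_t reserved, uint64_t used,
	    uint64_t towrite, int syncing)
	{
		if ((syncing || towrite == 0) &&
		    (new_quota < reserved || new_quota < used + towrite))
			return (ENOSPC);
		return (0);
	}
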
 */
-	towrite = dd->dd_space_towrite[0] + dd->dd_space_towrite[1] +
-	    dd->dd_space_towrite[2] + dd->dd_space_towrite[3];
+	towrite = dsl_dir_space_towrite(dd);
 	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
 	    (new_quota < dd->dd_phys->dd_reserved ||
-	    new_quota < dsl_dir_estimated_space(dd))) {
+	    new_quota < dd->dd_phys->dd_used_bytes + towrite)) {
 		err = ENOSPC;
 	}
 	mutex_exit(&dd->dd_lock);
 	return (err);
 }

+/* ARGSUSED */
 static void
-dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	uint64_t *quotap = arg2;
@@ -947,6 +1040,10 @@ dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 	mutex_enter(&dd->dd_lock);
 	dd->dd_phys->dd_quota = new_quota;
 	mutex_exit(&dd->dd_lock);
+
+	spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
+	    tx, cr, "%lld dataset = %llu ",
+	    (longlong_t)new_quota, dd->dd_phys->dd_head_dataset_obj);
 }

 int
@@ -958,20 +1055,22 @@ dsl_dir_set_quota(const char *ddname, uint64_t quota)
 	err = dsl_dir_open(ddname, FTAG, &dd, NULL);
 	if (err)
 		return (err);
-	/*
-	 * If someone removes a file, then tries to set the quota, we
-	 * want to make sure the file freeing takes effect.
-	 */
-	txg_wait_open(dd->dd_pool, 0);
-	err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
-	    dsl_dir_set_quota_sync, dd, &quota, 0);
+	if (quota != dd->dd_phys->dd_quota) {
+		/*
+		 * If someone removes a file, then tries to set the quota, we
+		 * want to make sure the file freeing takes effect.
+		 */
+		txg_wait_open(dd->dd_pool, 0);
+
+		err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
+		    dsl_dir_set_quota_sync, dd, &quota, 0);
+	}
 	dsl_dir_close(dd, FTAG);
 	return (err);
 }

-/* ARGSUSED */
-static int
+int
 dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
@@ -991,7 +1090,7 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 		return (0);

 	mutex_enter(&dd->dd_lock);
-	used = dd->dd_used_bytes;
+	used = dd->dd_phys->dd_used_bytes;
 	delta = MAX(used, new_reservation) -
 	    MAX(used, dd->dd_phys->dd_reserved);
 	mutex_exit(&dd->dd_lock);
@@ -1011,8 +1110,9 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 	return (0);
 }

+/* ARGSUSED */
 static void
-dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	uint64_t *reservationp = arg2;
@@ -1020,19 +1120,24 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 	uint64_t used;
 	int64_t delta;

+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
 	mutex_enter(&dd->dd_lock);
-	used = dd->dd_used_bytes;
+	used = dd->dd_phys->dd_used_bytes;
 	delta = MAX(used, new_reservation) -
 	    MAX(used, dd->dd_phys->dd_reserved);
-	mutex_exit(&dd->dd_lock);
-
-	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	dd->dd_phys->dd_reserved = new_reservation;

 	if (dd->dd_parent != NULL) {
 		/* Roll up this additional usage into our ancestors */
-		dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
+		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
+		    delta, 0, 0, tx);
 	}
+	mutex_exit(&dd->dd_lock);
+
+	spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
+	    tx, cr, "%lld dataset = %llu",
+	    (longlong_t)new_reservation, dd->dd_phys->dd_head_dataset_obj);
 }

 int
@@ -1074,7 +1179,7 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
 		return (delta);

 	mutex_enter(&dd->dd_lock);
-	delta = parent_delta(dd, dd->dd_used_bytes, delta);
+	delta = parent_delta(dd,
dd->dd_phys->dd_used_bytes, delta); mutex_exit(&dd->dd_lock); return (would_change(dd->dd_parent, delta, ancestor)); } @@ -1084,7 +1189,7 @@ struct renamearg { const char *mynewname; }; -/* ARGSUSED */ +/*ARGSUSED*/ static int dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -1110,7 +1215,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) if (ra->newparent != dd->dd_parent) { /* is there enough space? */ uint64_t myspace = - MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved); + MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved); /* no rename into our descendant */ if (closest_common_ancestor(dd, ra->newparent) == dd) @@ -1125,7 +1230,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct renamearg *ra = arg2; @@ -1136,15 +1241,24 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2); if (ra->newparent != dd->dd_parent) { - uint64_t myspace = - MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved); - - dsl_dir_diduse_space(dd->dd_parent, -myspace, + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, + -dd->dd_phys->dd_used_bytes, -dd->dd_phys->dd_compressed_bytes, -dd->dd_phys->dd_uncompressed_bytes, tx); - dsl_dir_diduse_space(ra->newparent, myspace, + dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD, + dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_compressed_bytes, dd->dd_phys->dd_uncompressed_bytes, tx); + + if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) { + uint64_t unused_rsrv = dd->dd_phys->dd_reserved - + dd->dd_phys->dd_used_bytes; + + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, + -unused_rsrv, 0, 0, tx); + dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV, + unused_rsrv, 0, 0, tx); + } } dmu_buf_will_dirty(dd->dd_dbuf, tx); @@ -1164,6 +1278,9 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, 8, 1, &dd->dd_object, tx); ASSERT3U(err, ==, 0); + + spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, + tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj); } int @@ -1189,7 +1306,6 @@ dsl_dir_rename(dsl_dir_t *dd, const char *newname) goto out; } - err = dsl_sync_task_do(dd->dd_pool, dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c index 00abf7ec2c6b..4585dc805fe5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dsl_pool.h> #include <sys/dsl_dataset.h> #include <sys/dsl_dir.h> @@ -36,20 +34,36 @@ #include <sys/zio.h> #include <sys/zfs_context.h> #include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/spa_impl.h> + +int zfs_no_write_throttle = 0; +int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ +int zfs_txg_synctime = 5; /* target secs to sync a txg */ + +uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ +uint64_t zfs_write_limit_max = 0; /* max data payload per txg */ +uint64_t zfs_write_limit_inflated = 0; +uint64_t zfs_write_limit_override = 0; +extern uint64_t zfs_write_limit_min; + +kmutex_t zfs_write_limit_lock; + +static pgcnt_t old_physmem = 0; static int -dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp) +dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) { uint64_t obj; int err; err = zap_lookup(dp->dp_meta_objset, dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, - MOS_DIR_NAME, sizeof (obj), 1, &obj); + name, sizeof (obj), 1, &obj); if (err) return (err); - return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp)); + return (dsl_dir_open_obj(dp, obj, name, dp, ddp)); } static dsl_pool_t * @@ -62,6 +76,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) dp->dp_spa = spa; dp->dp_meta_rootbp = *bp; rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); + dp->dp_write_limit = zfs_write_limit_min; txg_init(dp, txg); txg_list_create(&dp->dp_dirty_datasets, @@ -70,9 +85,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) offsetof(dsl_dir_t, dd_dirty_link)); txg_list_create(&dp->dp_sync_tasks, offsetof(dsl_sync_task_group_t, dstg_node)); - list_create(&dp->dp_synced_objsets, sizeof (dsl_dataset_t), + list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t), offsetof(dsl_dataset_t, ds_synced_link)); + mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL); + return (dp); } @@ -81,9 +99,11 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); + dsl_dir_t *dd; + dsl_dataset_t *ds; objset_impl_t *osi; - rw_enter(&dp->dp_config_rwlock, RW_READER); + rw_enter(&dp->dp_config_rwlock, RW_WRITER); err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi); if (err) goto out; @@ -100,10 +120,73 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; - err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir); + err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); if (err) goto out; + if (spa_version(spa) >= SPA_VERSION_ORIGIN) { + err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); + if (err) + goto out; + err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, + FTAG, &ds); + if (err) + goto out; + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + dp, &dp->dp_origin_snap); + if (err) + goto out; + dsl_dataset_rele(ds, FTAG); + dsl_dir_close(dd, dp); + } + + /* get scrub status */ + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, + &dp->dp_scrub_func); + if (err == 0) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, + &dp->dp_scrub_queue_obj); + if (err) + goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, + &dp->dp_scrub_min_txg); + if (err) + goto out; + err = 
zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, + &dp->dp_scrub_max_txg); + if (err) + goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + &dp->dp_scrub_bookmark); + if (err) + goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, + &spa->spa_scrub_errors); + if (err) + goto out; + if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { + /* + * A new-type scrub was in progress on an old + * pool. Restart from the beginning, since the + * old software may have changed the pool in the + * meantime. + */ + dsl_pool_scrub_restart(dp); + } + } else { + /* + * It's OK if there is no scrub in progress (and if + * there was an I/O error, ignore it). + */ + err = 0; + } + out: rw_exit(&dp->dp_config_rwlock); if (err) @@ -117,7 +200,15 @@ out: void dsl_pool_close(dsl_pool_t *dp) { - /* drop our reference from dsl_pool_open() */ + /* drop our references from dsl_pool_open() */ + + /* + * Since we held the origin_snap from "syncing" context (which + * includes pool-opening context), it actually only got a "ref" + * and not a hold, so just drop that here. + */ + if (dp->dp_origin_snap) + dsl_dataset_drop_ref(dp->dp_origin_snap, dp); if (dp->dp_mos_dir) dsl_dir_close(dp->dp_mos_dir, dp); if (dp->dp_root_dir) @@ -130,20 +221,27 @@ dsl_pool_close(dsl_pool_t *dp) txg_list_destroy(&dp->dp_dirty_datasets); txg_list_destroy(&dp->dp_dirty_dirs); txg_list_destroy(&dp->dp_sync_tasks); - list_destroy(&dp->dp_synced_objsets); + list_destroy(&dp->dp_synced_datasets); - arc_flush(); + arc_flush(dp->dp_spa); txg_fini(dp); rw_destroy(&dp->dp_config_rwlock); + mutex_destroy(&dp->dp_lock); + mutex_destroy(&dp->dp_scrub_cancel_lock); kmem_free(dp, sizeof (dsl_pool_t)); } dsl_pool_t * -dsl_pool_create(spa_t *spa, uint64_t txg) +dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); + objset_impl_t *osip; + dsl_dataset_t *ds; + uint64_t dsobj; + + /* create and open the MOS (meta-objset) */ dp->dp_meta_objset = &dmu_objset_create_impl(spa, NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os; @@ -153,13 +251,29 @@ dsl_pool_create(spa_t *spa, uint64_t txg) ASSERT3U(err, ==, 0); /* create and open the root dir */ - dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx); + dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir)); /* create and open the meta-objset dir */ - (void) dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx); - VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir)); + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); + VERIFY(0 == dsl_pool_open_special_dir(dp, + MOS_DIR_NAME, &dp->dp_mos_dir)); + + if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) + dsl_pool_create_origin(dp, tx); + + /* create the root dataset */ + dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); + + /* create the root objset */ + VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + osip = dmu_objset_create_impl(dp->dp_spa, ds, + dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); +#ifdef _KERNEL + zfs_create_fs(&osip->os, kcred, zplprops, tx); +#endif + dsl_dataset_rele(ds, FTAG); dmu_tx_commit(tx); @@ -175,26 +289,42 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dsl_dataset_t *ds; dsl_sync_task_group_t *dstg; 
objset_impl_t *mosi = dp->dp_meta_objset->os; + hrtime_t start, write_time; + uint64_t data_written; int err; tx = dmu_tx_create_assigned(dp, txg); + dp->dp_read_overhead = 0; zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { if (!list_link_active(&ds->ds_synced_link)) - list_insert_tail(&dp->dp_synced_objsets, ds); + list_insert_tail(&dp->dp_synced_datasets, ds); else dmu_buf_rele(ds->ds_dbuf, ds); dsl_dataset_sync(ds, zio, tx); } + DTRACE_PROBE(pool_sync__1setup); + + start = gethrtime(); err = zio_wait(zio); + write_time = gethrtime() - start; ASSERT(err == 0); + DTRACE_PROBE(pool_sync__2rootzio); while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) dsl_sync_task_group_sync(dstg, tx); + DTRACE_PROBE(pool_sync__3task); + + start = gethrtime(); while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) dsl_dir_sync(dd, tx); + write_time += gethrtime() - start; + + if (spa_sync_pass(dp->dp_spa) == 1) + dsl_pool_scrub_sync(dp, tx); + start = gethrtime(); if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL || list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) { zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); @@ -204,8 +334,51 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); } + write_time += gethrtime() - start; + DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time, + hrtime_t, dp->dp_read_overhead); + write_time -= dp->dp_read_overhead; dmu_tx_commit(tx); + + data_written = dp->dp_space_towrite[txg & TXG_MASK]; + dp->dp_space_towrite[txg & TXG_MASK] = 0; + ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); + + /* + * If the write limit max has not been explicitly set, set it + * to a fraction of available physical memory (default 1/8th). + * Note that we must inflate the limit because the spa + * inflates write sizes to account for data replication. + * Check this each sync phase to catch changing memory size. + */ + if (physmem != old_physmem && zfs_write_limit_shift) { + mutex_enter(&zfs_write_limit_lock); + old_physmem = physmem; + zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; + zfs_write_limit_inflated = MAX(zfs_write_limit_min, + spa_get_asize(dp->dp_spa, zfs_write_limit_max)); + mutex_exit(&zfs_write_limit_lock); + } + + /* + * Attempt to keep the sync time consistent by adjusting the + * amount of write traffic allowed into each transaction group. 
+ * Weight the throughput calculation towards the current value: + * thru = 3/4 old_thru + 1/4 new_thru + */ + ASSERT(zfs_write_limit_min > 0); + if (data_written > zfs_write_limit_min / 8 && write_time > 0) { + uint64_t throughput = (data_written * NANOSEC) / write_time; + if (dp->dp_throughput) + dp->dp_throughput = throughput / 4 + + 3 * dp->dp_throughput / 4; + else + dp->dp_throughput = throughput; + dp->dp_write_limit = MIN(zfs_write_limit_inflated, + MAX(zfs_write_limit_min, + dp->dp_throughput * zfs_txg_synctime)); + } } void @@ -213,8 +386,8 @@ dsl_pool_zil_clean(dsl_pool_t *dp) { dsl_dataset_t *ds; - while (ds = list_head(&dp->dp_synced_objsets)) { - list_remove(&dp->dp_synced_objsets, ds); + while (ds = list_head(&dp->dp_synced_datasets)) { + list_remove(&dp->dp_synced_datasets, ds); ASSERT(ds->ds_user_ptr != NULL); zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil); dmu_buf_rele(ds->ds_dbuf, ds); @@ -254,3 +427,187 @@ dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) return (space - resv); } + +int +dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) +{ + uint64_t reserved = 0; + uint64_t write_limit = (zfs_write_limit_override ? + zfs_write_limit_override : dp->dp_write_limit); + + if (zfs_no_write_throttle) { + atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], + space); + return (0); + } + + /* + * Check to see if we have exceeded the maximum allowed IO for + * this transaction group. We can do this without locks since + * a little slop here is ok. Note that we do the reserved check + * with only half the requested reserve: this is because the + * reserve requests are worst-case, and we really don't want to + * throttle based off of worst-case estimates. + */ + if (write_limit > 0) { + reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK] + + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2; + + if (reserved && reserved > write_limit) + return (ERESTART); + } + + atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space); + + /* + * If this transaction group is over 7/8ths capacity, delay + * the caller 1 clock tick. This will slow down the "fill" + * rate until the sync process can catch up with us. 
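
The weighting and clamp introduced above in dsl_pool_sync() can be shown as a standalone sketch (invented names; assumes write_ns > 0, which the caller guarantees by testing write_time before dividing):

	/* Illustrative stand-in only, not the kernel interfaces. */
	#include <stdint.h>

	#define	NANOSEC	1000000000ULL

	/*
	 * thru = 3/4 old_thru + 1/4 new_thru, then thru * synctime clamped
	 * into [limit_min, limit_inflated] becomes the next write limit.
	 */
	static uint64_t
	next_write_limit(uint64_t *thru, uint64_t bytes_written, uint64_t write_ns,
	    uint64_t limit_min, uint64_t limit_inflated, uint64_t synctime_sec)
	{
		uint64_t sample = bytes_written * NANOSEC / write_ns;
		uint64_t limit;

		*thru = (*thru == 0) ? sample : sample / 4 + 3 * (*thru) / 4;

		limit = *thru * synctime_sec;
		if (limit < limit_min)
			limit = limit_min;
		if (limit > limit_inflated)
			limit = limit_inflated;
		return (limit);
	}
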
+ */ + if (reserved && reserved > (write_limit - (write_limit >> 3))) + txg_delay(dp, tx->tx_txg, 1); + + return (0); +} + +void +dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) +{ + ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space); + atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space); +} + +void +dsl_pool_memory_pressure(dsl_pool_t *dp) +{ + uint64_t space_inuse = 0; + int i; + + if (dp->dp_write_limit == zfs_write_limit_min) + return; + + for (i = 0; i < TXG_SIZE; i++) { + space_inuse += dp->dp_space_towrite[i]; + space_inuse += dp->dp_tempreserved[i]; + } + dp->dp_write_limit = MAX(zfs_write_limit_min, + MIN(dp->dp_write_limit, space_inuse / 4)); +} + +void +dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) +{ + if (space > 0) { + mutex_enter(&dp->dp_lock); + dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space; + mutex_exit(&dp->dp_lock); + } +} + +/* ARGSUSED */ +static int +upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds, *prev = NULL; + int err; + dsl_pool_t *dp = spa_get_dsl(spa); + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err) + return (err); + + while (ds->ds_phys->ds_prev_snap_obj != 0) { + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) + break; + dsl_dataset_rele(ds, FTAG); + ds = prev; + prev = NULL; + } + + if (prev == NULL) { + prev = dp->dp_origin_snap; + + /* + * The $ORIGIN can't have any data, or the accounting + * will be wrong. + */ + ASSERT(prev->ds_phys->ds_bp.blk_birth == 0); + + /* The origin doesn't get attached to itself */ + if (ds->ds_object == prev->ds_object) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_prev_snap_obj = prev->ds_object; + ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg; + + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object; + + dmu_buf_will_dirty(prev->ds_dbuf, tx); + prev->ds_phys->ds_num_children++; + + if (ds->ds_phys->ds_next_snap_obj == 0) { + ASSERT(ds->ds_prev == NULL); + VERIFY(0 == dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); + } + } + + ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object); + ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); + + if (prev->ds_phys->ds_next_clones_obj == 0) { + prev->ds_phys->ds_next_clones_obj = + zap_create(dp->dp_meta_objset, + DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY(0 == zap_add_int(dp->dp_meta_objset, + prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); + + dsl_dataset_rele(ds, FTAG); + if (prev != dp->dp_origin_snap) + dsl_dataset_rele(prev, FTAG); + return (0); +} + +void +dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dp->dp_origin_snap != NULL); + + (void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, + tx, DS_FIND_CHILDREN); +} + +void +dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) +{ + uint64_t dsobj; + dsl_dataset_t *ds; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dp->dp_origin_snap == NULL); + + /* create the origin dir, ds, & snap-ds */ + rw_enter(&dp->dp_config_rwlock, RW_WRITER); + dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, + NULL, 0, kcred, tx); + VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, 
FTAG, &ds)); + dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx); + VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + dp, &dp->dp_origin_snap)); + dsl_dataset_rele(ds, FTAG); + rw_exit(&dp->dp_config_rwlock); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c index 2fff66d06b1e..212acbbc5968 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -44,14 +44,20 @@ dodefault(const char *propname, int intsz, int numint, void *buf) { zfs_prop_t prop; - if ((prop = zfs_name_to_prop(propname)) == ZFS_PROP_INVAL || - zfs_prop_readonly(prop)) + /* + * The setonce properties are read-only, BUT they still + * have a default value that can be used as the initial + * value. + */ + if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL || + (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop))) return (ENOENT); - if (zfs_prop_get_type(prop) == prop_type_string) { + if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { if (intsz != 1) return (EOVERFLOW); - (void) strncpy(buf, zfs_prop_default_string(prop), numint); + (void) strncpy(buf, zfs_prop_default_string(prop), + numint); } else { if (intsz != 8 || numint < 1) return (EOVERFLOW); @@ -62,13 +68,16 @@ dodefault(const char *propname, int intsz, int numint, void *buf) return (0); } -static int -dsl_prop_get_impl(dsl_dir_t *dd, const char *propname, +int +dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, int intsz, int numint, void *buf, char *setpoint) { int err = ENOENT; + objset_t *mos = dd->dd_pool->dp_meta_objset; zfs_prop_t prop; + ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); + if (setpoint) setpoint[0] = '\0'; @@ -79,7 +88,6 @@ dsl_prop_get_impl(dsl_dir_t *dd, const char *propname, * ouside this loop. */ for (; dd != NULL; dd = dd->dd_parent) { - objset_t *mos = dd->dd_pool->dp_meta_objset; ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, intsz, numint, buf); @@ -92,8 +100,7 @@ dsl_prop_get_impl(dsl_dir_t *dd, const char *propname, /* * Break out of this loop for non-inheritable properties. */ - if (prop != ZFS_PROP_INVAL && - !zfs_prop_inheritable(prop)) + if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) break; } if (err == ENOENT) @@ -102,6 +109,26 @@ dsl_prop_get_impl(dsl_dir_t *dd, const char *propname, return (err); } +int +dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, + int intsz, int numint, void *buf, char *setpoint) +{ + ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock)); + + if (ds->ds_phys->ds_props_obj) { + int err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_props_obj, propname, intsz, numint, buf); + if (err != ENOENT) { + if (setpoint) + dsl_dataset_name(ds, setpoint); + return (err); + } + } + + return (dsl_prop_get_dd(ds->ds_dir, propname, + intsz, numint, buf, setpoint)); +} + /* * Register interest in the named property. 
We'll call the callback * once to notify it of the current property value, and again each time @@ -114,18 +141,20 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, dsl_prop_changed_cb_t *callback, void *cbarg) { dsl_dir_t *dd = ds->ds_dir; + dsl_pool_t *dp = dd->dd_pool; uint64_t value; dsl_prop_cb_record_t *cbr; int err; int need_rwlock; - need_rwlock = !RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock); + need_rwlock = !RW_WRITE_HELD(&dp->dp_config_rwlock); if (need_rwlock) - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_prop_get_impl(dd, propname, 8, 1, &value, NULL); + err = dsl_prop_get_ds(ds, propname, 8, 1, &value, NULL); if (err != 0) { - rw_exit(&dd->dd_pool->dp_config_rwlock); + if (need_rwlock) + rw_exit(&dp->dp_config_rwlock); return (err); } @@ -141,46 +170,30 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, cbr->cbr_func(cbr->cbr_arg, value); - VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object, + VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, NULL, cbr, &dd)); if (need_rwlock) - rw_exit(&dd->dd_pool->dp_config_rwlock); - /* Leave dataset open until this callback is unregistered */ + rw_exit(&dp->dp_config_rwlock); + /* Leave dir open until this callback is unregistered */ return (0); } int -dsl_prop_get_ds(dsl_dir_t *dd, const char *propname, - int intsz, int numints, void *buf, char *setpoint) -{ - int err; - - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - err = dsl_prop_get_impl(dd, propname, intsz, numints, buf, setpoint); - rw_exit(&dd->dd_pool->dp_config_rwlock); - - return (err); -} - -int -dsl_prop_get(const char *ddname, const char *propname, +dsl_prop_get(const char *dsname, const char *propname, int intsz, int numints, void *buf, char *setpoint) { - dsl_dir_t *dd; - const char *tail; + dsl_dataset_t *ds; int err; - err = dsl_dir_open(ddname, FTAG, &dd, &tail); + err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); - if (tail && tail[0] != '@') { - dsl_dir_close(dd, FTAG); - return (ENOENT); - } - err = dsl_prop_get_ds(dd, propname, intsz, numints, buf, setpoint); + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + err = dsl_prop_get_ds(ds, propname, intsz, numints, buf, setpoint); + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); - dsl_dir_close(dd, FTAG); + dsl_dataset_rele(ds, FTAG); return (err); } @@ -264,8 +277,9 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, dsl_prop_cb_record_t *cbr; objset_t *mos = dp->dp_meta_objset; zap_cursor_t zc; - zap_attribute_t za; + zap_attribute_t *za; int err; + uint64_t dummyval; ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); @@ -278,7 +292,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, * being inherited here or below; stop the recursion. 
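
dsl_prop_changed_notify(), reworked in this section, walks the directory tree downward from the point of change and stops wherever the property is set locally, since nothing beneath an override inherits the new value. A simplified user-space rendering of that recursion (invented types; the real walk iterates a ZAP of child directories and heap-allocates its cursor record to spare the kernel stack):

	/* Illustrative stand-in only, not the kernel interfaces. */
	#include <stddef.h>
	#include <string.h>

	struct dir {
		struct dir **child;
		int nchildren;
		const char **local;	/* property names set on this dir */
		int nlocal;
		void (*cb)(struct dir *, unsigned long long);
	};

	static int
	set_locally(const struct dir *d, const char *prop)
	{
		int i;

		for (i = 0; i < d->nlocal; i++)
			if (strcmp(d->local[i], prop) == 0)
				return (1);
		return (0);
	}

	/* Push a new inherited value down, stopping wherever it is overridden. */
	static void
	changed_notify(struct dir *d, const char *prop, unsigned long long val,
	    int first)
	{
		int i;

		if (!first && set_locally(d, prop))
			return;		/* set here: nothing below inherits it */
		if (d->cb != NULL)
			d->cb(d, val);
		for (i = 0; i < d->nchildren; i++)
			changed_notify(d->child[i], prop, val, 0);
	}
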
*/ err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, - 8, 1, &value); + 8, 1, &dummyval); if (err == 0) { dsl_dir_close(dd, FTAG); return; @@ -287,22 +301,34 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, } mutex_enter(&dd->dd_lock); - for (cbr = list_head(&dd->dd_prop_cbs); - cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { - if (strcmp(cbr->cbr_propname, propname) == 0) { - cbr->cbr_func(cbr->cbr_arg, value); - } + for (cbr = list_head(&dd->dd_prop_cbs); cbr; + cbr = list_next(&dd->dd_prop_cbs, cbr)) { + uint64_t propobj = cbr->cbr_ds->ds_phys->ds_props_obj; + + if (strcmp(cbr->cbr_propname, propname) != 0) + continue; + + /* + * If the property is set on this ds, then it is not + * inherited here; don't call the callback. + */ + if (propobj && 0 == zap_lookup(mos, propobj, propname, + 8, 1, &dummyval)) + continue; + + cbr->cbr_func(cbr->cbr_arg, value); } mutex_exit(&dd->dd_lock); + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); for (zap_cursor_init(&zc, mos, dd->dd_phys->dd_child_dir_zapobj); - zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { - /* XXX recursion could blow stack; esp. za! */ - dsl_prop_changed_notify(dp, za.za_first_integer, + dsl_prop_changed_notify(dp, za->za_first_integer, propname, value, FALSE); } + kmem_free(za, sizeof (zap_attribute_t)); zap_cursor_fini(&zc); dsl_dir_close(dd, FTAG); } @@ -316,22 +342,37 @@ struct prop_set_arg { static void -dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; + dsl_dataset_t *ds = arg1; struct prop_set_arg *psa = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t zapobj = dd->dd_phys->dd_props_zapobj; - uint64_t intval; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t zapobj, intval; int isint; + char valbuf[32]; + char *valstr; isint = (dodefault(psa->name, 8, 1, &intval) == 0); + if (dsl_dataset_is_snapshot(ds)) { + ASSERT(spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_SNAP_PROPS); + if (ds->ds_phys->ds_props_obj == 0) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_props_obj = + zap_create(mos, + DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); + } + zapobj = ds->ds_phys->ds_props_obj; + } else { + zapobj = ds->ds_dir->dd_phys->dd_props_zapobj; + } + if (psa->numints == 0) { int err = zap_remove(mos, zapobj, psa->name, tx); ASSERT(err == 0 || err == ENOENT); if (isint) { - VERIFY(0 == dsl_prop_get_impl(dd->dd_parent, + VERIFY(0 == dsl_prop_get_ds(ds, psa->name, 8, 1, &intval, NULL)); } } else { @@ -342,32 +383,63 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) } if (isint) { - dsl_prop_changed_notify(dd->dd_pool, - dd->dd_object, psa->name, intval, TRUE); + if (dsl_dataset_is_snapshot(ds)) { + dsl_prop_cb_record_t *cbr; + /* + * It's a snapshot; nothing can inherit this + * property, so just look for callbacks on this + * ds here. 
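
For the snapshot case just described, notification degenerates to scanning the registered callback records for an exact dataset-and-name match, as the loop below does. Roughly (invented types mirroring that loop):

	/* Illustrative stand-in only, not the kernel interfaces. */
	#include <string.h>

	struct cbr {
		struct cbr *next;
		const void *ds;		/* dataset this callback watches */
		const char *propname;
		void (*func)(void *, unsigned long long);
		void *arg;
	};

	/* Snapshots inherit nothing, so only exact (ds, name) matches fire. */
	static void
	notify_snapshot(struct cbr *list, const void *ds, const char *name,
	    unsigned long long val)
	{
		struct cbr *c;

		for (c = list; c != NULL; c = c->next)
			if (c->ds == ds && strcmp(c->propname, name) == 0)
				c->func(c->arg, val);
	}
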
+ */ + mutex_enter(&ds->ds_dir->dd_lock); + for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr; + cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) { + if (cbr->cbr_ds == ds && + strcmp(cbr->cbr_propname, psa->name) == 0) + cbr->cbr_func(cbr->cbr_arg, intval); + } + mutex_exit(&ds->ds_dir->dd_lock); + } else { + dsl_prop_changed_notify(ds->ds_dir->dd_pool, + ds->ds_dir->dd_object, psa->name, intval, TRUE); + } + } + if (isint) { + (void) snprintf(valbuf, sizeof (valbuf), + "%lld", (longlong_t)intval); + valstr = valbuf; + } else { + valstr = (char *)psa->buf; } + spa_history_internal_log((psa->numints == 0) ? LOG_DS_INHERIT : + LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr, + "%s=%s dataset = %llu", psa->name, valstr, ds->ds_object); } -int -dsl_prop_set_dd(dsl_dir_t *dd, const char *propname, - int intsz, int numints, const void *buf) +void +dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, + cred_t *cr, dmu_tx_t *tx) { - struct prop_set_arg psa; + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t zapobj = dd->dd_phys->dd_props_zapobj; - psa.name = propname; - psa.intsz = intsz; - psa.numints = numints; - psa.buf = buf; + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY(0 == zap_update(mos, zapobj, name, sizeof (val), 1, &val, tx)); + + dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE); - return (dsl_sync_task_do(dd->dd_pool, - NULL, dsl_prop_set_sync, dd, &psa, 2)); + spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr, + "%s=%llu dataset = %llu", name, (u_longlong_t)val, + dd->dd_phys->dd_head_dataset_obj); } int -dsl_prop_set(const char *ddname, const char *propname, +dsl_prop_set(const char *dsname, const char *propname, int intsz, int numints, const void *buf) { - dsl_dir_t *dd; + dsl_dataset_t *ds; int err; + struct prop_set_arg psa; /* * We must do these checks before we get to the syncfunc, since @@ -378,11 +450,24 @@ dsl_prop_set(const char *ddname, const char *propname, if (intsz * numints >= ZAP_MAXVALUELEN) return (E2BIG); - err = dsl_dir_open(ddname, FTAG, &dd, NULL); + err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); - err = dsl_prop_set_dd(dd, propname, intsz, numints, buf); - dsl_dir_close(dd, FTAG); + + if (dsl_dataset_is_snapshot(ds) && + spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) { + dsl_dataset_rele(ds, FTAG); + return (ENOTSUP); + } + + psa.name = propname; + psa.intsz = intsz; + psa.numints = numints; + psa.buf = buf; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + NULL, dsl_prop_set_sync, ds, &psa, 2); + + dsl_dataset_rele(ds, FTAG); return (err); } @@ -390,45 +475,55 @@ dsl_prop_set(const char *ddname, const char *propname, * Iterate over all properties for this dataset and return them in an nvlist. 
*/ int -dsl_prop_get_all(objset_t *os, nvlist_t **nvp) +dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local) { dsl_dataset_t *ds = os->os->os_dsl_dataset; dsl_dir_t *dd = ds->ds_dir; + boolean_t snapshot = dsl_dataset_is_snapshot(ds); int err = 0; - dsl_pool_t *dp; - objset_t *mos; - - if (dsl_dataset_is_snapshot(ds)) { - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - return (0); - } + dsl_pool_t *dp = dd->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t propobj = ds->ds_phys->ds_props_obj; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - dp = dd->dd_pool; - mos = dp->dp_meta_objset; + if (local && snapshot && !propobj) + return (0); rw_enter(&dp->dp_config_rwlock, RW_READER); - for (; dd != NULL; dd = dd->dd_parent) { + while (dd != NULL) { char setpoint[MAXNAMELEN]; zap_cursor_t zc; zap_attribute_t za; + dsl_dir_t *dd_next; + + if (propobj) { + dsl_dataset_name(ds, setpoint); + dd_next = dd; + } else { + dsl_dir_name(dd, setpoint); + propobj = dd->dd_phys->dd_props_zapobj; + dd_next = dd->dd_parent; + } - dsl_dir_name(dd, setpoint); - - for (zap_cursor_init(&zc, mos, dd->dd_phys->dd_props_zapobj); + for (zap_cursor_init(&zc, mos, propobj); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { nvlist_t *propval; - zfs_prop_t prop; - /* - * Skip non-inheritable properties. - */ - if ((prop = zfs_name_to_prop(za.za_name)) != - ZFS_PROP_INVAL && !zfs_prop_inheritable(prop) && - dd != ds->ds_dir) + zfs_prop_t prop = zfs_name_to_prop(za.za_name); + + /* Skip non-inheritable properties. */ + if (prop != ZPROP_INVAL && + !zfs_prop_inheritable(prop) && + (dd != ds->ds_dir || (snapshot && dd != dd_next))) continue; + /* Skip properties not valid for this type. */ + if (snapshot && prop != ZPROP_INVAL && + !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT)) + continue; + + /* Skip properties already defined */ if (nvlist_lookup_nvlist(*nvp, za.za_name, &propval) == 0) continue; @@ -441,28 +536,26 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp) */ char *tmp = kmem_alloc(za.za_num_integers, KM_SLEEP); - err = zap_lookup(mos, - dd->dd_phys->dd_props_zapobj, - za.za_name, 1, za.za_num_integers, - tmp); + err = zap_lookup(mos, propobj, + za.za_name, 1, za.za_num_integers, tmp); if (err != 0) { kmem_free(tmp, za.za_num_integers); break; } - VERIFY(nvlist_add_string(propval, - ZFS_PROP_VALUE, tmp) == 0); + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, + tmp) == 0); kmem_free(tmp, za.za_num_integers); } else { /* * Integer property */ ASSERT(za.za_integer_length == 8); - (void) nvlist_add_uint64(propval, - ZFS_PROP_VALUE, za.za_first_integer); + (void) nvlist_add_uint64(propval, ZPROP_VALUE, + za.za_first_integer); } - VERIFY(nvlist_add_string(propval, - ZFS_PROP_SOURCE, setpoint) == 0); + VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, + setpoint) == 0); VERIFY(nvlist_add_nvlist(*nvp, za.za_name, propval) == 0); nvlist_free(propval); @@ -472,6 +565,14 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp) if (err != ENOENT) break; err = 0; + /* + * If we are just after the props that have been set + * locally, then we are done after the first iteration. 
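
dsl_prop_get_all() resolves shadowing by collecting from the most specific source outward (snapshot-local properties, then the directory, then each ancestor) and skipping any name already seen, and with the local flag it stops after the first source. A schematic sketch, with invented containers in place of the ZAP cursors:

	/* Illustrative stand-in only, not the kernel interfaces. */
	#include <string.h>

	/* Has name already been collected, i.e. set closer to the dataset? */
	static int
	already_set(const char *found[], int nfound, const char *name)
	{
		int i;

		for (i = 0; i < nfound; i++)
			if (strcmp(found[i], name) == 0)
				return (1);
		return (0);
	}

	/* Nearest setting of a name wins; local_only stops after one source. */
	static int
	gather(const char **src[], const int nsrc[], int nsources,
	    int local_only, const char *found[], int nfound)
	{
		int s, i;

		for (s = 0; s < nsources; s++) {
			for (i = 0; i < nsrc[s]; i++)
				if (!already_set(found, nfound, src[s][i]))
					found[nfound++] = src[s][i];
			if (local_only)
				break;
		}
		return (nfound);
	}
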
+ */ + if (local) + break; + dd = dd_next; + propobj = 0; } rw_exit(&dp->dp_config_rwlock); @@ -484,7 +585,7 @@ dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(propval, ZFS_PROP_VALUE, value) == 0); + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); nvlist_free(propval); } @@ -495,7 +596,7 @@ dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(propval, ZFS_PROP_VALUE, value) == 0); + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); nvlist_free(propval); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c new file mode 100644 index 000000000000..5f675b787df7 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c @@ -0,0 +1,929 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/dsl_pool.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_synctask.h> +#include <sys/dnode.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/arc.h> +#include <sys/zap.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/zil_impl.h> + +typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); + +static scrub_cb_t dsl_pool_scrub_clean_cb; +static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; + +int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */ +int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */ +boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ + +extern int zfs_txg_timeout; + +static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = { + NULL, + dsl_pool_scrub_clean_cb +}; + +#define SET_BOOKMARK(zb, objset, object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +/* ARGSUSED */ +static void +dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_pool_t *dp = arg1; + enum scrub_func *funcp = arg2; + dmu_object_type_t ot = 0; + boolean_t complete = B_FALSE; + + dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx); + + ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE); + ASSERT(*funcp > SCRUB_FUNC_NONE); + ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS); + + dp->dp_scrub_min_txg = 0; + dp->dp_scrub_max_txg = tx->tx_txg; + + if (*funcp == SCRUB_FUNC_CLEAN) { + vdev_t *rvd = dp->dp_spa->spa_root_vdev; + + /* rewrite all disk labels */ + vdev_config_dirty(rvd); + + if (vdev_resilver_needed(rvd, + &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) { + spa_event_notify(dp->dp_spa, NULL, + ESC_ZFS_RESILVER_START); + dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, + tx->tx_txg); + } + + /* zero out the scrub stats in all vdev_stat_t's */ + vdev_scrub_stat_update(rvd, + dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : + POOL_SCRUB_EVERYTHING, B_FALSE); + + dp->dp_spa->spa_scrub_started = B_TRUE; + } + + /* back to the generic stuff */ + + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) + ot = DMU_OT_ZAP_OTHER; + + dp->dp_scrub_func = *funcp; + dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, + ot ? 
ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); + bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + dp->dp_scrub_restart = B_FALSE; + dp->dp_spa->spa_scrub_errors = 0; + + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, + &dp->dp_scrub_func, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, + &dp->dp_scrub_queue_obj, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, + &dp->dp_scrub_min_txg, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, + &dp->dp_scrub_max_txg, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + &dp->dp_scrub_bookmark, tx)); + VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, + &dp->dp_spa->spa_scrub_errors, tx)); + + spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr, + "func=%u mintxg=%llu maxtxg=%llu", + *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg); +} + +int +dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func) +{ + return (dsl_sync_task_do(dp, NULL, + dsl_pool_scrub_setup_sync, dp, &func, 0)); +} + +/* ARGSUSED */ +static void +dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_pool_t *dp = arg1; + boolean_t *completep = arg2; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + mutex_enter(&dp->dp_scrub_cancel_lock); + + if (dp->dp_scrub_restart) { + dp->dp_scrub_restart = B_FALSE; + *completep = B_FALSE; + } + + /* XXX this is scrub-clean specific */ + mutex_enter(&dp->dp_spa->spa_scrub_lock); + while (dp->dp_spa->spa_scrub_inflight > 0) { + cv_wait(&dp->dp_spa->spa_scrub_io_cv, + &dp->dp_spa->spa_scrub_lock); + } + mutex_exit(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_started = B_FALSE; + dp->dp_spa->spa_scrub_active = B_FALSE; + + dp->dp_scrub_func = SCRUB_FUNC_NONE; + VERIFY(0 == dmu_object_free(dp->dp_meta_objset, + dp->dp_scrub_queue_obj, tx)); + dp->dp_scrub_queue_obj = 0; + bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_QUEUE, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MIN_TXG, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_MAX_TXG, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_FUNC, tx)); + VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_ERRORS, tx)); + + spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, + "complete=%u", *completep); + + /* below is scrub-clean specific */ + vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE, + *completep); + /* + * If the scrub/resilver completed, update all DTLs to reflect this. + * Whether it succeeded or not, vacate all temporary scrub DTLs. + */ + vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, + *completep ? 
dp->dp_scrub_max_txg : 0, B_TRUE); + if (dp->dp_scrub_min_txg && *completep) + spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH); + spa_errlog_rotate(dp->dp_spa); + + /* + * We may have finished replacing a device. + * Let the async thread assess this and handle the detach. + */ + spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE); + + dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0; + mutex_exit(&dp->dp_scrub_cancel_lock); +} + +int +dsl_pool_scrub_cancel(dsl_pool_t *dp) +{ + boolean_t complete = B_FALSE; + + return (dsl_sync_task_do(dp, NULL, + dsl_pool_scrub_cancel_sync, dp, &complete, 3)); +} + +int +dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, + zio_done_func_t *done, void *private, uint32_t arc_flags) +{ + /* + * This function will be used by bp-rewrite wad to intercept frees. + */ + return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp, + done, private, arc_flags)); +} + +static boolean_t +bookmark_is_zero(const zbookmark_t *zb) +{ + return (zb->zb_objset == 0 && zb->zb_object == 0 && + zb->zb_level == 0 && zb->zb_blkid == 0); +} + +/* dnp is the dnode for zb1->zb_object */ +static boolean_t +bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, + const zbookmark_t *zb2) +{ + uint64_t zb1nextL0, zb2thisobj; + + ASSERT(zb1->zb_objset == zb2->zb_objset); + ASSERT(zb1->zb_object != -1ULL); + ASSERT(zb2->zb_level == 0); + + /* + * A bookmark in the deadlist is considered to be after + * everything else. + */ + if (zb2->zb_object == -1ULL) + return (B_TRUE); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) + return (B_FALSE); + + zb1nextL0 = (zb1->zb_blkid + 1) << + ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + + zb2thisobj = zb2->zb_object ? zb2->zb_object : + zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + + if (zb1->zb_object == 0) { + uint64_t nextobj = zb1nextL0 * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; + return (nextobj <= zb2thisobj); + } + + if (zb1->zb_object < zb2thisobj) + return (B_TRUE); + if (zb1->zb_object > zb2thisobj) + return (B_FALSE); + if (zb2->zb_object == 0) + return (B_FALSE); + return (zb1nextL0 <= zb2->zb_blkid); +} + +static boolean_t +scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) +{ + int elapsed_ticks; + int mintime; + + if (dp->dp_scrub_pausing) + return (B_TRUE); /* we're already pausing */ + + if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) + return (B_FALSE); /* we're resuming */ + + /* We only know how to resume from level-0 blocks. */ + if (zb->zb_level != 0) + return (B_FALSE); + + mintime = dp->dp_scrub_isresilver ? 
zfs_resilver_min_time : + zfs_scrub_min_time; + elapsed_ticks = lbolt64 - dp->dp_scrub_start_time; + if (elapsed_ticks > hz * zfs_txg_timeout || + (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) { + dprintf("pausing at %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); + dp->dp_scrub_pausing = B_TRUE; + dp->dp_scrub_bookmark = *zb; + return (B_TRUE); + } + return (B_FALSE); +} + +typedef struct zil_traverse_arg { + dsl_pool_t *zta_dp; + zil_header_t *zta_zh; +} zil_traverse_arg_t; + +/* ARGSUSED */ +static void +traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + zil_traverse_arg_t *zta = arg; + dsl_pool_t *dp = zta->zta_dp; + zil_header_t *zh = zta->zta_zh; + zbookmark_t zb; + + if (bp->blk_birth <= dp->dp_scrub_min_txg) + return; + + if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) + return; + + zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET]; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; + VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); +} + +/* ARGSUSED */ +static void +traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +{ + if (lrc->lrc_txtype == TX_WRITE) { + zil_traverse_arg_t *zta = arg; + dsl_pool_t *dp = zta->zta_dp; + zil_header_t *zh = zta->zta_zh; + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_t zb; + + if (bp->blk_birth <= dp->dp_scrub_min_txg) + return; + + if (claim_txg == 0 || bp->blk_birth < claim_txg) + return; + + zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET]; + zb.zb_object = lr->lr_foid; + zb.zb_level = BP_GET_LEVEL(bp); + zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); + VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); + } +} + +static void +traverse_zil(dsl_pool_t *dp, zil_header_t *zh) +{ + uint64_t claim_txg = zh->zh_claim_txg; + zil_traverse_arg_t zta = { dp, zh }; + zilog_t *zilog; + + /* + * We only want to visit blocks that have been claimed but not yet + * replayed (or, in read-only mode, blocks that *would* be claimed). + */ + if (claim_txg == 0 && (spa_mode & FWRITE)) + return; + + zilog = zil_alloc(dp->dp_meta_objset, zh); + + (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta, + claim_txg); + + zil_free(zilog); +} + +static void +scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, + arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) +{ + int err; + arc_buf_t *buf = NULL; + + if (bp->blk_birth == 0) + return; + + if (bp->blk_birth <= dp->dp_scrub_min_txg) + return; + + if (scrub_pause(dp, zb)) + return; + + if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { + /* + * If we already visited this bp & everything below (in + * a prior txg), don't bother doing it again. + */ + if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark)) + return; + + /* + * If we found the block we're trying to resume from, or + * we went past it to a different object, zero it out to + * indicate that it's OK to start checking for pausing + * again. 
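
The pause test assembled in scrub_pause() above boils down to two time thresholds: always yield after a full txg timeout, and yield after the per-pass minimum if the txg sync is already waiting. As a standalone predicate (invented names, mirroring the condition above):

	/* Illustrative stand-in only, not the kernel interfaces. */
	#include <stdint.h>

	static int
	scrub_should_pause(int64_t elapsed_ticks, int hz, int txg_timeout,
	    int min_time, int sync_waiting)
	{
		return (elapsed_ticks > (int64_t)hz * txg_timeout ||
		    (elapsed_ticks > (int64_t)hz * min_time && sync_waiting));
	}
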
+ */ + if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 || + zb->zb_object > dp->dp_scrub_bookmark.zb_object) { + dprintf("resuming at %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + bzero(&dp->dp_scrub_bookmark, sizeof (*zb)); + } + } + + if (BP_GET_LEVEL(bp) > 0) { + uint32_t flags = ARC_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + + err = arc_read(NULL, dp->dp_spa, bp, pbuf, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + mutex_enter(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_errors++; + mutex_exit(&dp->dp_spa->spa_scrub_lock); + return; + } + cbp = buf->b_data; + + for (i = 0; i < epb; i++, cbp++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + scrub_visitbp(dp, dnp, buf, cbp, &czb); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + uint32_t flags = ARC_WAIT; + dnode_phys_t *child_dnp; + int i, j; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + err = arc_read(NULL, dp->dp_spa, bp, pbuf, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + mutex_enter(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_errors++; + mutex_exit(&dp->dp_spa->spa_scrub_lock); + return; + } + child_dnp = buf->b_data; + + for (i = 0; i < epb; i++, child_dnp++) { + for (j = 0; j < child_dnp->dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, + zb->zb_blkid * epb + i, + child_dnp->dn_nlevels - 1, j); + scrub_visitbp(dp, child_dnp, buf, + &child_dnp->dn_blkptr[j], &czb); + } + } + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + uint32_t flags = ARC_WAIT; + objset_phys_t *osp; + int j; + + err = arc_read_nolock(NULL, dp->dp_spa, bp, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) { + mutex_enter(&dp->dp_spa->spa_scrub_lock); + dp->dp_spa->spa_scrub_errors++; + mutex_exit(&dp->dp_spa->spa_scrub_lock); + return; + } + + osp = buf->b_data; + + traverse_zil(dp, &osp->os_zil_header); + + for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, 0, + osp->os_meta_dnode.dn_nlevels - 1, j); + scrub_visitbp(dp, &osp->os_meta_dnode, buf, + &osp->os_meta_dnode.dn_blkptr[j], &czb); + } + } + + (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); + if (buf) + (void) arc_buf_remove_ref(buf, &buf); +} + +static void +scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) +{ + zbookmark_t zb; + + SET_BOOKMARK(&zb, ds ? 
ds->ds_object : 0, 0, -1, 0); + scrub_visitbp(dp, NULL, NULL, bp, &zb); +} + +void +dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { + SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0); + } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, tx) != 0) { + return; + } + + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_phys->ds_next_snap_obj, tx) == 0); + } + ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); +} + +void +dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); + + if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { + dp->dp_scrub_bookmark.zb_objset = + ds->ds_phys->ds_prev_snap_obj; + } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, tx) == 0) { + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_phys->ds_prev_snap_obj, tx) == 0); + } +} + +struct enqueue_clones_arg { + dmu_tx_t *tx; + uint64_t originobj; +}; + +/* ARGSUSED */ +static int +enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + struct enqueue_clones_arg *eca = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp; + + err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); + if (err) + return (err); + dp = ds->ds_dir->dd_pool; + + if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { + while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + ds = prev; + } + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, eca->tx) == 0); + } + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) +{ + dsl_dataset_t *ds; + uint64_t min_txg_save; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + + /* + * Iterate over the bps in this ds. + */ + min_txg_save = dp->dp_scrub_min_txg; + dp->dp_scrub_min_txg = + MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg); + scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp); + dp->dp_scrub_min_txg = min_txg_save; + + if (dp->dp_scrub_pausing) + goto out; + + /* + * Add descendent datasets to work queue. 
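
The work queue referred to below is a ZAP object holding dataset object numbers, which gives it set semantics: membership is unique and removal by key is cheap. A user-space stand-in for that behavior (invented structure; a duplicate enqueue is simply ignored here, where the kernel code VERIFYs it cannot happen):

	/* Illustrative stand-in only, not the kernel interfaces. */
	#include <stdint.h>
	#include <stdlib.h>

	struct dsqueue {
		uint64_t *obj;
		int n, cap;
	};

	static void
	enqueue(struct dsqueue *q, uint64_t dsobj)
	{
		int i;

		for (i = 0; i < q->n; i++)
			if (q->obj[i] == dsobj)	/* set semantics: no duplicates */
				return;
		if (q->n == q->cap) {
			q->cap = q->cap ? q->cap * 2 : 8;
			q->obj = realloc(q->obj, q->cap * sizeof (uint64_t));
		}
		q->obj[q->n++] = dsobj;
	}
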
+ */ + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_phys->ds_next_snap_obj, tx) == 0); + } + if (ds->ds_phys->ds_num_children > 1) { + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + struct enqueue_clones_arg eca; + eca.tx = tx; + eca.originobj = ds->ds_object; + + (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, + NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); + } else { + VERIFY(zap_join(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, + dp->dp_scrub_queue_obj, tx) == 0); + } + } + +out: + dsl_dataset_rele(ds, FTAG); +} + +/* ARGSUSED */ +static int +enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds; + int err; + dsl_pool_t *dp; + + err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); + if (err) + return (err); + + dp = ds->ds_dir->dd_pool; + + while (ds->ds_phys->ds_prev_snap_obj != 0) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + /* + * If this is a clone, we don't need to worry about it for now. + */ + if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(prev, FTAG); + return (0); + } + dsl_dataset_rele(ds, FTAG); + ds = prev; + } + + VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, + ds->ds_object, tx) == 0); + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + boolean_t complete = B_TRUE; + + if (dp->dp_scrub_func == SCRUB_FUNC_NONE) + return; + + /* If the spa is not fully loaded, don't bother. */ + if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE) + return; + + if (dp->dp_scrub_restart) { + enum scrub_func func = dp->dp_scrub_func; + dp->dp_scrub_restart = B_FALSE; + dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); + } + + if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { + /* + * We must have resumed after rebooting; reset the vdev + * stats to know that we're doing a scrub (although it + * will think we're just starting now). + */ + vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, + dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : + POOL_SCRUB_EVERYTHING, B_FALSE); + } + + dp->dp_scrub_pausing = B_FALSE; + dp->dp_scrub_start_time = lbolt64; + dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); + dp->dp_spa->spa_scrub_active = B_TRUE; + + if (dp->dp_scrub_bookmark.zb_objset == 0) { + /* First do the MOS & ORIGIN */ + scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp); + if (dp->dp_scrub_pausing) + goto out; + + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, + NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); + } else { + scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); + } + ASSERT(!dp->dp_scrub_pausing); + } else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) { + /* + * If we were paused, continue from here. Note if the + * ds we were paused on was deleted, the zb_objset will + * be -1, so we will skip this and find a new objset + * below. + */ + scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx); + if (dp->dp_scrub_pausing) + goto out; + } + + /* + * In case we were paused right at the end of the ds, zero the + * bookmark so we don't think that we're still trying to resume. 
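scrub_visitds() and the dsl_pool_ds_destroyed()/dsl_pool_ds_snapshotted() hooks above all treat dp_scrub_queue_obj as a persistent set of dataset object numbers: successors and clones are enqueued with zap_add_int(), and entries for renamed or deleted datasets are dropped with zap_remove_int(). A toy in-memory model of that queue discipline (hypothetical names; no ZAP or transactions involved):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct qent {
	uint64_t obj;
	struct qent *next;
};

static struct qent *queue;

/* Model of zap_add_int(): put a dataset object number in the set. */
static void
queue_add(uint64_t obj)
{
	struct qent *e = malloc(sizeof (*e));

	if (e == NULL)
		abort();
	e->obj = obj;
	e->next = queue;
	queue = e;
}

/* Model of the zap_cursor loop: pull entries until the set is empty. */
static int
queue_take(uint64_t *objp)
{
	struct qent *e = queue;

	if (e == NULL)
		return (0);
	queue = e->next;
	*objp = e->obj;
	free(e);
	return (1);
}

int
main(void)
{
	uint64_t obj;

	queue_add(50);		/* e.g. a ds_next_snap_obj */
	queue_add(73);		/* e.g. a clone's object number */
	while (queue_take(&obj))
		printf("visit dataset %llu\n", (unsigned long long)obj);
	return (0);
}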
+ */ + bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + + /* keep pulling things out of the zap-object-as-queue */ + while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj), + zap_cursor_retrieve(&zc, &za) == 0) { + VERIFY(0 == zap_remove(dp->dp_meta_objset, + dp->dp_scrub_queue_obj, za.za_name, tx)); + scrub_visitds(dp, za.za_first_integer, tx); + if (dp->dp_scrub_pausing) + break; + zap_cursor_fini(&zc); + } + zap_cursor_fini(&zc); + if (dp->dp_scrub_pausing) + goto out; + + /* done. */ + + dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); + return; +out: + VERIFY(0 == zap_update(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + &dp->dp_scrub_bookmark, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, + &dp->dp_spa->spa_scrub_errors, tx)); + + /* XXX this is scrub-clean specific */ + mutex_enter(&dp->dp_spa->spa_scrub_lock); + while (dp->dp_spa->spa_scrub_inflight > 0) { + cv_wait(&dp->dp_spa->spa_scrub_io_cv, + &dp->dp_spa->spa_scrub_lock); + } + mutex_exit(&dp->dp_spa->spa_scrub_lock); +} + +void +dsl_pool_scrub_restart(dsl_pool_t *dp) +{ + mutex_enter(&dp->dp_scrub_cancel_lock); + dp->dp_scrub_restart = B_TRUE; + mutex_exit(&dp->dp_scrub_cancel_lock); +} + +/* + * scrub consumers + */ + +static void +dsl_pool_scrub_clean_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) + spa->spa_scrub_errors++; + mutex_exit(&spa->spa_scrub_lock); +} + +static int +dsl_pool_scrub_clean_cb(dsl_pool_t *dp, + const blkptr_t *bp, const zbookmark_t *zb) +{ + size_t size = BP_GET_LSIZE(bp); + int d; + spa_t *spa = dp->dp_spa; + boolean_t needs_io; + int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; + int zio_priority; + + if (dp->dp_scrub_isresilver == 0) { + /* It's a scrub */ + zio_flags |= ZIO_FLAG_SCRUB; + zio_priority = ZIO_PRIORITY_SCRUB; + needs_io = B_TRUE; + } else { + /* It's a resilver */ + zio_flags |= ZIO_FLAG_RESILVER; + zio_priority = ZIO_PRIORITY_RESILVER; + needs_io = B_FALSE; + } + + /* If it's an intent log block, failure is expected. */ + if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + for (d = 0; d < BP_GET_NDVAS(bp); d++) { + vdev_t *vd = vdev_lookup_top(spa, + DVA_GET_VDEV(&bp->blk_dva[d])); + + /* + * Keep track of how much data we've examined so that + * zpool(1M) status can make useful progress reports. + */ + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_scrub_examined += + DVA_GET_ASIZE(&bp->blk_dva[d]); + mutex_exit(&vd->vdev_stat_lock); + + /* if it's a resilver, this may not be in the target range */ + if (!needs_io) { + if (DVA_GET_GANG(&bp->blk_dva[d])) { + /* + * Gang members may be spread across multiple + * vdevs, so the best we can do is look at the + * pool-wide DTL. + * XXX -- it would be better to change our + * allocation policy to ensure that this can't + * happen. 
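In the out: path above, the in-core bookmark is persisted into the MOS pool directory as four 64-bit integers, which is why zap_update() is passed an integer size of sizeof (uint64_t) and a count of 4. A sketch of that flattening, assuming the bookmark really is four consecutive 64-bit words:

#include <stdint.h>
#include <string.h>
#include <assert.h>

struct bookmark {
	uint64_t objset, object, level, blkid;
};

int
main(void)
{
	struct bookmark in = { 21, 100, 0, 7 }, out;
	uint64_t words[4];	/* what zap_update() would store */

	/* Flatten: integer count 4, integer size sizeof (uint64_t). */
	memcpy(words, &in, sizeof (words));

	/* Reload on resume and check the round trip. */
	memcpy(&out, words, sizeof (out));
	assert(memcmp(&in, &out, sizeof (in)) == 0);
	return (0);
}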
+ */ + vd = spa->spa_root_vdev; + } + needs_io = vdev_dtl_contains(&vd->vdev_dtl_map, + bp->blk_birth, 1); + } + } + + if (needs_io && !zfs_no_scrub_io) { + void *data = zio_data_buf_alloc(size); + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(NULL, spa, bp, data, size, + dsl_pool_scrub_clean_done, NULL, zio_priority, + zio_flags, zb)); + } + + /* do not relocate this block */ + return (0); +} + +int +dsl_pool_scrub_clean(dsl_pool_t *dp) +{ + /* + * Purge all vdev caches. We do this here rather than in sync + * context because this requires a writer lock on the spa_config + * lock, which we can't do from sync context. The + * spa_scrub_reopen flag indicates that vdev_open() should not + * attempt to start another scrub. + */ + spa_config_enter(dp->dp_spa, SCL_ALL, FTAG, RW_WRITER); + dp->dp_spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(dp->dp_spa->spa_root_vdev); + dp->dp_spa->spa_scrub_reopen = B_FALSE; + spa_config_exit(dp->dp_spa, SCL_ALL, FTAG); + + return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c index 17deb569c4ab..21100225abf7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,6 +30,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_dir.h> #include <sys/dsl_synctask.h> +#include <sys/cred.h> #define DST_AVG_BLKSHIFT 14 @@ -49,6 +50,7 @@ dsl_sync_task_group_create(dsl_pool_t *dp) list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), offsetof(dsl_sync_task_t, dst_node)); dstg->dstg_pool = dp; + dstg->dstg_cr = CRED(); return (dstg); } @@ -123,6 +125,16 @@ top: } void +dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) +{ + uint64_t txg; + + dstg->dstg_nowaiter = B_TRUE; + txg = dmu_tx_get_txg(tx); + VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg)); +} + +void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg) { dsl_sync_task_t *dst; @@ -146,7 +158,7 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) * Check for sufficient space. 
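dsl_pool_scrub_clean_cb() bounds the number of scrub reads in flight with a counter under spa_scrub_lock: the issue path sleeps while spa_scrub_inflight is at spa_scrub_maxinflight, dsl_pool_scrub_clean_done() decrements and broadcasts, and the pause path drains by waiting for the counter to hit zero. A compact pthread model of the same throttle (all names hypothetical):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int inflight;
static const int maxinflight = 4;

/* Issue side: block while too many requests are outstanding. */
static void
issue_begin(void)
{
	pthread_mutex_lock(&lock);
	while (inflight >= maxinflight)
		pthread_cond_wait(&cv, &lock);
	inflight++;
	pthread_mutex_unlock(&lock);
}

/* Completion side: model of dsl_pool_scrub_clean_done(). */
static void
issue_done(void)
{
	pthread_mutex_lock(&lock);
	inflight--;
	pthread_cond_broadcast(&cv);
	pthread_mutex_unlock(&lock);
}

/* Pause path: drain everything before saving the bookmark. */
static void
drain(void)
{
	pthread_mutex_lock(&lock);
	while (inflight > 0)
		pthread_cond_wait(&cv, &lock);
	pthread_mutex_unlock(&lock);
}

static void *
worker(void *arg)
{
	(void) arg;
	issue_done();		/* pretend the read completed */
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	issue_begin();
	pthread_create(&t, NULL, worker, NULL);
	drain();		/* returns once the worker finishes */
	pthread_join(t, NULL);
	printf("drained, inflight=%d\n", inflight);
	return (0);
}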
*/ dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir, - dstg->dstg_space, dstg->dstg_space * 3, 0, &tr_cookie, tx); + dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx); /* don't bother trying again */ if (dstg->dstg_err == ERESTART) dstg->dstg_err = EAGAIN; @@ -171,12 +183,16 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) */ for (dst = list_head(&dstg->dstg_tasks); dst; dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx); + dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, + dstg->dstg_cr, tx); } } rw_exit(&dstg->dstg_pool->dp_config_rwlock); dsl_dir_tempreserve_clear(tr_cookie, tx); + + if (dstg->dstg_nowaiter) + dsl_sync_task_group_destroy(dstg); } int @@ -194,3 +210,16 @@ dsl_sync_task_do(dsl_pool_t *dp, dsl_sync_task_group_destroy(dstg); return (err); } + +void +dsl_sync_task_do_nowait(dsl_pool_t *dp, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx) +{ + dsl_sync_task_group_t *dstg; + + dstg = dsl_sync_task_group_create(dp); + dsl_sync_task_create(dstg, checkfunc, syncfunc, + arg1, arg2, blocks_modified); + dsl_sync_task_group_nowait(dstg, tx); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index 0dba134cef9b..22b56d617799 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa_impl.h> #include <sys/dmu.h> @@ -35,6 +33,7 @@ #include <sys/zio.h> uint64_t metaslab_aliquot = 512ULL << 10; +uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* * ========================================================================== @@ -341,7 +340,7 @@ metaslab_fini(metaslab_t *msp) int t; vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, - -msp->ms_smo.smo_alloc); + -msp->ms_smo.smo_alloc, B_TRUE); metaslab_group_remove(mg, msp); @@ -534,8 +533,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(smo, db->db_data, db->db_size); + ASSERT3U(db->db_size, >=, sizeof (*smo)); + bcopy(smo, db->db_data, sizeof (*smo)); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); @@ -569,10 +568,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) space_map_create(&msp->ms_freemap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); } - vdev_space_update(vd, sm->sm_size, 0); + vdev_space_update(vd, sm->sm_size, 0, B_TRUE); } - vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc); + vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE); ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); @@ -714,11 +713,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, * Allocate a block for the specified i/o. 
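dsl_sync_task_do_nowait() above queues a task group for the syncing context and returns without waiting; the dstg_nowaiter flag makes dsl_sync_task_group_sync() destroy the group itself after running it, since no caller remains to reap it. A userland model of that ownership hand-off (hypothetical types, not the DSL interfaces):

#include <stdlib.h>
#include <stdio.h>

typedef void task_func_t(void *);

struct task_group {
	task_func_t *func;
	void *arg;
	int nowaiter;		/* nobody will wait; sync side frees */
};

/* Model of the syncing context running a queued group. */
static void
group_sync(struct task_group *g)
{
	g->func(g->arg);
	if (g->nowaiter)	/* fire-and-forget: reap ourselves */
		free(g);
}

static void
hello(void *arg)
{
	printf("sync task ran: %s\n", (const char *)arg);
}

int
main(void)
{
	struct task_group *g = malloc(sizeof (*g));

	if (g == NULL)
		abort();
	g->func = hello;
	g->arg = "nowait";
	g->nowaiter = 1;	/* caller returns without joining */
	group_sync(g);		/* later, in "sync context" */
	return (0);
}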
*/ static int -metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, - dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid) +metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) { metaslab_group_t *mg, *rotor; - metaslab_class_t *mc; vdev_t *vd; int dshift = 3; int all_zero; @@ -728,7 +726,11 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, ASSERT(!DVA_IS_VALID(&dva[d])); - mc = spa_metaslab_class_select(spa); + /* + * For testing, make some blocks above a certain size be gang blocks. + */ + if (psize >= metaslab_gang_bang && (LBOLT & 3) == 0) + return (ENOSPC); /* * Start at the rotor and loop through all mgs until we find something. @@ -754,7 +756,7 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, */ if (hintdva) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); - if (hintdva_avoid) + if (flags & METASLAB_HINTBP_AVOID) mg = vd->vdev_mg->mg_next; else mg = vd->vdev_mg; @@ -764,12 +766,34 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d, } else { mg = mc->mc_rotor; } - rotor = mg; + /* + * If the hint put us into the wrong class, just follow the rotor. + */ + if (mg->mg_class != mc) + mg = mc->mc_rotor; + + rotor = mg; top: all_zero = B_TRUE; do { vd = mg->mg_vd; + /* + * Don't allocate from faulted devices. + */ + if (!vdev_writeable(vd)) + goto next; + /* + * Avoid writing single-copy data to a failing vdev + */ + if ((vd->vdev_stat.vs_write_errors > 0 || + vd->vdev_state < VDEV_STATE_HEALTHY) && + d == 0 && dshift == 3) { + all_zero = B_FALSE; + goto next; + } + + ASSERT(mg->mg_class == mc); distance = vd->vdev_asize >> dshift; if (distance <= (1ULL << vd->vdev_ms_shift)) @@ -818,11 +842,12 @@ top: DVA_SET_VDEV(&dva[d], vd->vdev_id); DVA_SET_OFFSET(&dva[d], offset); - DVA_SET_GANG(&dva[d], 0); + DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); DVA_SET_ASIZE(&dva[d], asize); return (0); } +next: mc->mc_rotor = mg->mg_next; mc->mc_allocated = 0; } while ((mg = mg->mg_next) != rotor); @@ -879,38 +904,6 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); - - /* - * verify that this region is actually allocated in - * either a ms_allocmap or the ms_map - */ - if (msp->ms_map.sm_loaded) { - boolean_t allocd = B_FALSE; - int i; - - if (!space_map_contains(&msp->ms_map, offset, size)) { - allocd = B_TRUE; - } else { - for (i = 0; i < TXG_CONCURRENT_STATES; i++) { - space_map_t *sm = &msp->ms_allocmap - [(txg - i) & TXG_MASK]; - if (space_map_contains(sm, - offset, size)) { - allocd = B_TRUE; - break; - } - } - } - - if (!allocd) { - zfs_panic_recover("freeing free segment " - "(vdev=%llu offset=%llx size=%llx)", - (longlong_t)vdev, (longlong_t)offset, - (longlong_t)size); - } - } - - } mutex_exit(&msp->ms_lock); @@ -946,16 +939,18 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); - if (error) { + if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); } - if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) - vdev_dirty(vd, VDD_METASLAB, msp, txg); - space_map_claim(&msp->ms_map, offset, size); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + + if (spa_mode & FWRITE) { /* don't dirty if 
we're zdb(1M) */ + if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + vdev_dirty(vd, VDD_METASLAB, msp, txg); + space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + } mutex_exit(&msp->ms_lock); @@ -963,32 +958,45 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) } int -metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas, - uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid) +metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, + int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; - int d; int error = 0; + ASSERT(bp->blk_birth == 0); + + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); + + if (mc->mc_rotor == NULL) { /* no vdevs in this class */ + spa_config_exit(spa, SCL_ALLOC, FTAG); + return (ENOSPC); + } + ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); - for (d = 0; d < ndvas; d++) { - error = metaslab_alloc_dva(spa, psize, dva, d, hintdva, - txg, hintbp_avoid); + for (int d = 0; d < ndvas; d++) { + error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, + txg, flags); if (error) { for (d--; d >= 0; d--) { metaslab_free_dva(spa, &dva[d], txg, B_TRUE); bzero(&dva[d], sizeof (dva_t)); } + spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); } } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); + spa_config_exit(spa, SCL_ALLOC, FTAG); + + bp->blk_birth = txg; + return (0); } @@ -997,12 +1005,16 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); - int d; ASSERT(!BP_IS_HOLE(bp)); + ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg); + + spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); - for (d = 0; d < ndvas; d++) + for (int d = 0; d < ndvas; d++) metaslab_free_dva(spa, &dva[d], txg, now); + + spa_config_exit(spa, SCL_FREE, FTAG); } int @@ -1010,14 +1022,28 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); - int d, error; - int last_error = 0; + int error = 0; ASSERT(!BP_IS_HOLE(bp)); - for (d = 0; d < ndvas; d++) + if (txg != 0) { + /* + * First do a dry run to make sure all DVAs are claimable, + * so we don't have to unwind from partial failures below. + */ + if ((error = metaslab_claim(spa, bp, 0)) != 0) + return (error); + } + + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); + + for (int d = 0; d < ndvas; d++) if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) - last_error = error; + break; + + spa_config_exit(spa, SCL_ALLOC, FTAG); + + ASSERT(error == 0 || txg == 0); - return (last_error); + return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c index a2f4614fed87..5fe4e638055a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -61,11 +60,13 @@ refcount_fini(void) void refcount_create(refcount_t *rc) { + mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&rc->rc_list, sizeof (reference_t), offsetof(reference_t, ref_link)); list_create(&rc->rc_removed, sizeof (reference_t), offsetof(reference_t, ref_link)); - mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); + rc->rc_count = 0; + rc->rc_removed_count = 0; } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c new file mode 100644 index 000000000000..db3b70fc68b0 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c @@ -0,0 +1,249 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/refcount.h> +#include <sys/rrwlock.h> + +/* + * This file contains the implementation of a re-entrant read + * reader/writer lock (aka "rrwlock"). + * + * This is a normal reader/writer lock with the additional feature + * of allowing threads that have already obtained a read lock to + * take another read lock (a re-entrant read), even if there are + * waiting writers. + * + * Callers that do not already hold a read lock yield priority to + * waiting writers. + * + * The rrwlock_t lock does not allow re-entrant writers, nor does it + * allow a re-entrant mix of reads and writes (that is, it does not + * allow a caller that has already obtained a read lock to be able to + * then grab a write lock without first dropping all read locks, and + * vice versa). + * + * The rrwlock_t uses tsd (thread specific data) to keep a list of + * nodes (rrw_node_t), where each node keeps track of which specific + * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering + * should be rare, a thread that grabs multiple reads on the same rrwlock_t + * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the + * tsd list may represent different rrwlock_ts, which allows a thread + * to hold read locks on multiple distinct rrwlock_ts at the same time. + * + * Since using tsd incurs some overhead, the rrwlock_t only needs to + * keep tsd data when writers are waiting. If no writers are waiting, then + * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd + * is needed. 
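The tsd list described here is what lets a reader decide cheaply whether it already holds this particular lock: rrn_find() walks a per-thread list of nodes, one per held read. A minimal model of that membership test using a C thread-local list (illustrative only; the kernel keys the list off rrw_tsd_key with tsd_get()/tsd_set()):

#include <stdio.h>
#include <stdlib.h>

struct rrwlock { int unused; };	/* opaque stand-in for rrwlock_t */

struct node {
	struct rrwlock *lock;
	struct node *next;
};

/* One list head per thread, like the tsd slot keyed by rrw_tsd_key. */
static _Thread_local struct node *held;

/* Model of rrn_add(): record that this thread took a read on 'l'. */
static void
note_read_hold(struct rrwlock *l)
{
	struct node *n = malloc(sizeof (*n));

	if (n == NULL)
		abort();
	n->lock = l;
	n->next = held;
	held = n;
}

/* Model of rrn_find(): does this thread already hold this lock? */
static int
holds(struct rrwlock *l)
{
	for (struct node *n = held; n != NULL; n = n->next)
		if (n->lock == l)
			return (1);
	return (0);
}

int
main(void)
{
	static struct rrwlock a, b;

	note_read_hold(&a);
	printf("holds a: %d, holds b: %d\n", holds(&a), holds(&b));
	return (0);
}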
Once a writer attempts to grab the lock, readers then + * keep tsd data and bump the linked readers count (rr_linked_rcount). + * + * If there are waiting writers and there are anonymous readers, then a + * reader cannot tell whether its request is re-entrant. But since it may + * be, we allow the read to proceed (otherwise it could deadlock). Once + * waiting writers are active, readers no longer bump the anonymous count, + * so the anonymous readers will eventually flush themselves out. At that + * point, a reader can tell whether it holds a re-entrant read lock (it + * has a rrw_node_t entry for the lock) or not. If it does, we must let + * it proceed; if it does not, the reader blocks behind the waiting + * writers. Hence, we do not starve writers. + */ + +/* global key for TSD */ +uint_t rrw_tsd_key; + +typedef struct rrw_node { + struct rrw_node *rn_next; + rrwlock_t *rn_rrl; +} rrw_node_t; + +static rrw_node_t * +rrn_find(rrwlock_t *rrl) +{ + rrw_node_t *rn; + + if (refcount_count(&rrl->rr_linked_rcount) == 0) + return (NULL); + + for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { + if (rn->rn_rrl == rrl) + return (rn); + } + return (NULL); +} + +/* + * Add a node to the head of the singly linked list. + */ +static void +rrn_add(rrwlock_t *rrl) +{ + rrw_node_t *rn; + + rn = kmem_alloc(sizeof (*rn), KM_SLEEP); + rn->rn_rrl = rrl; + rn->rn_next = tsd_get(rrw_tsd_key); + VERIFY(tsd_set(rrw_tsd_key, rn) == 0); +} + +/* + * If a node is found for 'rrl', then remove the node from this + * thread's list and return TRUE; otherwise return FALSE. + */ +static boolean_t +rrn_find_and_remove(rrwlock_t *rrl) +{ + rrw_node_t *rn; + rrw_node_t *prev = NULL; + + if (refcount_count(&rrl->rr_linked_rcount) == 0) + return (B_FALSE); + + for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { + if (rn->rn_rrl == rrl) { + if (prev) + prev->rn_next = rn->rn_next; + else + VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); + kmem_free(rn, sizeof (*rn)); + return (B_TRUE); + } + prev = rn; + } + return (B_FALSE); +} + +void +rrw_init(rrwlock_t *rrl) +{ + mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); + rrl->rr_writer = NULL; + refcount_create(&rrl->rr_anon_rcount); + refcount_create(&rrl->rr_linked_rcount); + rrl->rr_writer_wanted = B_FALSE; +} + +void +rrw_destroy(rrwlock_t *rrl) +{ + mutex_destroy(&rrl->rr_lock); + cv_destroy(&rrl->rr_cv); + ASSERT(rrl->rr_writer == NULL); + refcount_destroy(&rrl->rr_anon_rcount); + refcount_destroy(&rrl->rr_linked_rcount); +} + +static void +rrw_enter_read(rrwlock_t *rrl, void *tag) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(rrl->rr_writer != curthread); + ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); + + while (rrl->rr_writer || (rrl->rr_writer_wanted && + refcount_is_zero(&rrl->rr_anon_rcount) && + rrn_find(rrl) == NULL)) + cv_wait(&rrl->rr_cv, &rrl->rr_lock); + + if (rrl->rr_writer_wanted) { + /* may or may not be a re-entrant enter */ + rrn_add(rrl); + (void) refcount_add(&rrl->rr_linked_rcount, tag); + } else { + (void) refcount_add(&rrl->rr_anon_rcount, tag); + } + ASSERT(rrl->rr_writer == NULL); + mutex_exit(&rrl->rr_lock); +} + +static void +rrw_enter_write(rrwlock_t *rrl) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(rrl->rr_writer != curthread); + + while (refcount_count(&rrl->rr_anon_rcount) > 0 || + refcount_count(&rrl->rr_linked_rcount) > 0 || + rrl->rr_writer != NULL) { + rrl->rr_writer_wanted = B_TRUE; + cv_wait(&rrl->rr_cv, &rrl->rr_lock); + } + 
rrl->rr_writer_wanted = B_FALSE; + rrl->rr_writer = curthread; + mutex_exit(&rrl->rr_lock); +} + +void +rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) +{ + if (rw == RW_READER) + rrw_enter_read(rrl, tag); + else + rrw_enter_write(rrl); +} + +void +rrw_exit(rrwlock_t *rrl, void *tag) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || + !refcount_is_zero(&rrl->rr_linked_rcount) || + rrl->rr_writer != NULL); + + if (rrl->rr_writer == NULL) { + if (rrn_find_and_remove(rrl)) { + if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0) + cv_broadcast(&rrl->rr_cv); + + } else { + if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0) + cv_broadcast(&rrl->rr_cv); + } + } else { + ASSERT(rrl->rr_writer == curthread); + ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && + refcount_is_zero(&rrl->rr_linked_rcount)); + rrl->rr_writer = NULL; + cv_broadcast(&rrl->rr_cv); + } + mutex_exit(&rrl->rr_lock); +} + +boolean_t +rrw_held(rrwlock_t *rrl, krw_t rw) +{ + boolean_t held; + + mutex_enter(&rrl->rr_lock); + if (rw == RW_WRITER) { + held = (rrl->rr_writer == curthread); + } else { + held = (!refcount_is_zero(&rrl->rr_anon_rcount) || + !refcount_is_zero(&rrl->rr_linked_rcount)); + } + mutex_exit(&rrl->rr_lock); + + return (held); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c index ce5c26131af5..ca7076cb6fd9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,20 +30,20 @@ #include <sys/zio_checksum.h> /* - * SHA-256 checksum, as specified in FIPS 180-2, available at: - * http://csrc.nist.gov/cryptval + * SHA-256 checksum, as specified in FIPS 180-3, available at: + * http://csrc.nist.gov/publications/PubsFIPS.html * * This is a very compact implementation of SHA-256. * It is designed to be simple and portable, not to be fast. */ /* - * The literal definitions according to FIPS180-2 would be: + * The literal definitions of Ch() and Maj() according to FIPS 180-3 are: * - * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) - * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) + * Ch(x, y, z) (x & y) ^ (~x & z) + * Maj(x, y, z) (x & y) ^ (x & z) ^ (y & z) * - * We use logical equivalents which require one less op. + * We use equivalent logical reductions here that require one less op. 
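The reduced forms of Ch() and Maj() are pure bitwise functions, so their equivalence to the FIPS definitions can be checked exhaustively over single bits. A small verification program (the macro names are local to this sketch):

#include <stdint.h>
#include <assert.h>
#include <stdio.h>

/* Literal FIPS 180-3 forms. */
#define CH_REF(x, y, z)  (((x) & (y)) ^ (~(x) & (z)))
#define MAJ_REF(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

/* Reduced forms used by the checksum code (one op fewer). */
#define CH(x, y, z)  ((z) ^ ((x) & ((y) ^ (z))))
#define MAJ(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y))))

int
main(void)
{
	/*
	 * The functions operate bitwise, so checking every (x, y, z)
	 * pattern in {0,1}^3 proves equivalence for all word widths.
	 */
	for (uint32_t x = 0; x <= 1; x++)
		for (uint32_t y = 0; y <= 1; y++)
			for (uint32_t z = 0; z <= 1; z++) {
				assert(CH(x, y, z) == CH_REF(x, y, z));
				assert(MAJ(x, y, z) == MAJ_REF(x, y, z));
			}
	printf("Ch/Maj reductions verified\n");
	return (0);
}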
*/ #define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) #define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y)))) @@ -105,20 +104,19 @@ zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp) uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; uint8_t pad[128]; - int padsize = size & 63; - int i; + int i, padsize; - for (i = 0; i < size - padsize; i += 64) + for (i = 0; i < (size & ~63ULL); i += 64) SHA256Transform(H, (uint8_t *)buf + i); - for (i = 0; i < padsize; i++) - pad[i] = ((uint8_t *)buf)[i]; + for (padsize = 0; i < size; i++) + pad[padsize++] = *((uint8_t *)buf + i); for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++) pad[padsize] = 0; - for (i = 0; i < 8; i++) - pad[padsize++] = (size << 3) >> (56 - 8 * i); + for (i = 56; i >= 0; i -= 8) + pad[padsize++] = (size << 3) >> i; for (i = 0; i < padsize; i += 64) SHA256Transform(H, pad + i); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 6a7c525ae991..163b21572247 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a @@ -56,16 +54,388 @@ #include <sys/dsl_prop.h> #include <sys/dsl_synctask.h> #include <sys/fs/zfs.h> +#include <sys/arc.h> #include <sys/callb.h> #include <sys/sunddi.h> +#include <sys/spa_boot.h> + +#include "zfs_prop.h" +#include "zfs_comutil.h" -int zio_taskq_threads = 0; -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); -TUNABLE_INT("vfs.zfs.zio.taskq_threads", &zio_taskq_threads); -SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_threads, CTLFLAG_RW, - &zio_taskq_threads, 0, "Number of ZIO threads per ZIO type"); +int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = { + /* ISSUE INTR */ + { 1, 1 }, /* ZIO_TYPE_NULL */ + { 1, 8 }, /* ZIO_TYPE_READ */ + { 8, 1 }, /* ZIO_TYPE_WRITE */ + { 1, 1 }, /* ZIO_TYPE_FREE */ + { 1, 1 }, /* ZIO_TYPE_CLAIM */ + { 1, 1 }, /* ZIO_TYPE_IOCTL */ +}; +static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); +static boolean_t spa_has_active_shared_spare(spa_t *spa); + +/* + * ========================================================================== + * SPA properties routines + * ========================================================================== + */ + +/* + * Add a (source=src, propname=propval) list to an nvlist. + */ +static void +spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, + uint64_t intval, zprop_source_t src) +{ + const char *propname = zpool_prop_to_name(prop); + nvlist_t *propval; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); + + if (strval != NULL) + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); + else + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); + + VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); + nvlist_free(propval); +} + +/* + * Get property values from the spa configuration. 
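The rewritten padding loop above copies the partial tail block, appends the 0x80 terminator, zero-fills until the length is 56 mod 64, and then emits the message length in bits as eight big-endian bytes ((size << 3) >> i for i = 56 down to 0). A standalone sketch of just that padding construction:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const char msg[] = "abc";	/* 3-byte example message */
	uint64_t size = sizeof (msg) - 1;
	uint8_t pad[128];
	int i, padsize;

	/* Copy the tail that didn't fill a 64-byte block. */
	for (padsize = 0, i = 0; i < (int)size; i++)
		pad[padsize++] = (uint8_t)msg[i];

	/* 0x80 terminator, then zero-fill to 56 (mod 64). */
	for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
		pad[padsize] = 0;

	/* Bit length as eight big-endian bytes. */
	for (i = 56; i >= 0; i -= 8)
		pad[padsize++] = (uint8_t)((size << 3) >> i);

	printf("padded length: %d bytes (one block)\n", padsize);	/* 64 */
	return (0);
}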
+ */ +static void +spa_prop_get_config(spa_t *spa, nvlist_t **nvp) +{ + uint64_t size = spa_get_space(spa); + uint64_t used = spa_get_alloc(spa); + uint64_t cap, version; + zprop_source_t src = ZPROP_SRC_NONE; + spa_config_dirent_t *dp; + + ASSERT(MUTEX_HELD(&spa->spa_props_lock)); + + /* + * readonly properties + */ + spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src); + + cap = (size == 0) ? 0 : (used * 100 / size); + spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, + spa->spa_root_vdev->vdev_state, src); + + /* + * settable properties that are not stored in the pool property object. + */ + version = spa_version(spa); + if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); + + if (spa->spa_root != NULL) + spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, + 0, ZPROP_SRC_LOCAL); + + if ((dp = list_head(&spa->spa_config_list)) != NULL) { + if (dp->scd_path == NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + "none", 0, ZPROP_SRC_LOCAL); + } else if (strcmp(dp->scd_path, spa_config_path) != 0) { + spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + dp->scd_path, 0, ZPROP_SRC_LOCAL); + } + } +} + +/* + * Get zpool property values. + */ +int +spa_prop_get(spa_t *spa, nvlist_t **nvp) +{ + zap_cursor_t zc; + zap_attribute_t za; + objset_t *mos = spa->spa_meta_objset; + int err; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + mutex_enter(&spa->spa_props_lock); + + /* + * Get properties from the spa config. + */ + spa_prop_get_config(spa, nvp); + + /* If no pool property object, no more prop to get. */ + if (spa->spa_pool_props_object == 0) { + mutex_exit(&spa->spa_props_lock); + return (0); + } + + /* + * Get properties from the MOS pool property object. 
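spa_prop_add_list() wraps each property in its own nested nvlist carrying a source tag and either a string or an integer value, keyed under the property name in the caller's outer list. A userland sketch of the same shape (assumes libnvpair, as shipped with illumos/FreeBSD, is available; the literal "source"/"value" keys are stand-ins for the kernel's ZPROP_SOURCE/ZPROP_VALUE):

#include <stdio.h>
#include <assert.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *nvl, *propval;

	assert(nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) == 0);
	assert(nvlist_alloc(&propval, NV_UNIQUE_NAME, 0) == 0);

	/* One (source, value) pair per property... */
	assert(nvlist_add_uint64(propval, "source", 0) == 0);
	assert(nvlist_add_uint64(propval, "value", 12345) == 0);

	/* ...nested under the property's name in the outer list. */
	assert(nvlist_add_nvlist(nvl, "size", propval) == 0);
	nvlist_free(propval);	/* the outer list holds its own copy */

	nvlist_print(stdout, nvl);
	nvlist_free(nvl);
	return (0);
}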
+ */ + for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t intval = 0; + char *strval = NULL; + zprop_source_t src = ZPROP_SRC_DEFAULT; + zpool_prop_t prop; + + if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) + continue; + + switch (za.za_integer_length) { + case 8: + /* integer property */ + if (za.za_first_integer != + zpool_prop_default_numeric(prop)) + src = ZPROP_SRC_LOCAL; + + if (prop == ZPOOL_PROP_BOOTFS) { + dsl_pool_t *dp; + dsl_dataset_t *ds = NULL; + + dp = spa_get_dsl(spa); + rw_enter(&dp->dp_config_rwlock, RW_READER); + if (err = dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &ds)) { + rw_exit(&dp->dp_config_rwlock); + break; + } + + strval = kmem_alloc( + MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, + KM_SLEEP); + dsl_dataset_name(ds, strval); + dsl_dataset_rele(ds, FTAG); + rw_exit(&dp->dp_config_rwlock); + } else { + strval = NULL; + intval = za.za_first_integer; + } + + spa_prop_add_list(*nvp, prop, strval, intval, src); + + if (strval != NULL) + kmem_free(strval, + MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); + + break; + + case 1: + /* string property */ + strval = kmem_alloc(za.za_num_integers, KM_SLEEP); + err = zap_lookup(mos, spa->spa_pool_props_object, + za.za_name, 1, za.za_num_integers, strval); + if (err) { + kmem_free(strval, za.za_num_integers); + break; + } + spa_prop_add_list(*nvp, prop, strval, 0, src); + kmem_free(strval, za.za_num_integers); + break; + + default: + break; + } + } + zap_cursor_fini(&zc); + mutex_exit(&spa->spa_props_lock); +out: + if (err && err != ENOENT) { + nvlist_free(*nvp); + *nvp = NULL; + return (err); + } + + return (0); +} + +/* + * Validate the given pool properties nvlist and modify the list + * for the property values to be set. 
+ */ +static int +spa_prop_validate(spa_t *spa, nvlist_t *props) +{ + nvpair_t *elem; + int error = 0, reset_bootfs = 0; + uint64_t objnum; + + elem = NULL; + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + zpool_prop_t prop; + char *propname, *strval; + uint64_t intval; + objset_t *os; + char *slash; + + propname = nvpair_name(elem); + + if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) + return (EINVAL); + + switch (prop) { + case ZPOOL_PROP_VERSION: + error = nvpair_value_uint64(elem, &intval); + if (!error && + (intval < spa_version(spa) || intval > SPA_VERSION)) + error = EINVAL; + break; + + case ZPOOL_PROP_DELEGATION: + case ZPOOL_PROP_AUTOREPLACE: + case ZPOOL_PROP_LISTSNAPS: + error = nvpair_value_uint64(elem, &intval); + if (!error && intval > 1) + error = EINVAL; + break; + + case ZPOOL_PROP_BOOTFS: + if (spa_version(spa) < SPA_VERSION_BOOTFS) { + error = ENOTSUP; + break; + } + + /* + * Make sure the vdev config is bootable + */ + if (!vdev_is_bootable(spa->spa_root_vdev)) { + error = ENOTSUP; + break; + } + + reset_bootfs = 1; + + error = nvpair_value_string(elem, &strval); + + if (!error) { + uint64_t compress; + + if (strval == NULL || strval[0] == '\0') { + objnum = zpool_prop_default_numeric( + ZPOOL_PROP_BOOTFS); + break; + } + + if (error = dmu_objset_open(strval, DMU_OST_ZFS, + DS_MODE_USER | DS_MODE_READONLY, &os)) + break; + + /* We don't support gzip bootable datasets */ + if ((error = dsl_prop_get_integer(strval, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), + &compress, NULL)) == 0 && + !BOOTFS_COMPRESS_VALID(compress)) { + error = ENOTSUP; + } else { + objnum = dmu_objset_id(os); + } + dmu_objset_close(os); + } + break; + + case ZPOOL_PROP_FAILUREMODE: + error = nvpair_value_uint64(elem, &intval); + if (!error && (intval < ZIO_FAILURE_MODE_WAIT || + intval > ZIO_FAILURE_MODE_PANIC)) + error = EINVAL; + + /* + * This is a special case which only occurs when + * the pool has completely failed. This allows + * the user to change the in-core failmode property + * without syncing it out to disk (I/Os might + * currently be blocked). We do this by returning + * EIO to the caller (spa_prop_set) to trick it + * into thinking we encountered a property validation + * error. + */ + if (!error && spa_suspended(spa)) { + spa->spa_failmode = intval; + error = EIO; + } + break; + + case ZPOOL_PROP_CACHEFILE: + if ((error = nvpair_value_string(elem, &strval)) != 0) + break; + + if (strval[0] == '\0') + break; + + if (strcmp(strval, "none") == 0) + break; + + if (strval[0] != '/') { + error = EINVAL; + break; + } + + slash = strrchr(strval, '/'); + ASSERT(slash != NULL); + + if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || + strcmp(slash, "/..") == 0) + error = EINVAL; + break; + } + + if (error) + break; + } + + if (!error && reset_bootfs) { + error = nvlist_remove(props, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); + + if (!error) { + error = nvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); + } + } + + return (error); +} + +int +spa_prop_set(spa_t *spa, nvlist_t *nvp) +{ + int error; + + if ((error = spa_prop_validate(spa, nvp)) != 0) + return (error); + + return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, + spa, nvp, 3)); +} + +/* + * If the bootfs property value is dsobj, clear it. 
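The ZPOOL_PROP_CACHEFILE case above accepts an empty string or "none" verbatim, and otherwise requires an absolute path whose final component is a real name (not empty, ".", or ".."). A sketch of that validation in isolation:

#include <string.h>
#include <stdio.h>

/*
 * Model of the cachefile checks: empty and "none" are allowed as-is;
 * anything else must be an absolute path with a usable last component.
 */
static int
cachefile_ok(const char *strval)
{
	const char *slash;

	if (strval[0] == '\0' || strcmp(strval, "none") == 0)
		return (1);
	if (strval[0] != '/')
		return (0);
	slash = strrchr(strval, '/');
	if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
	    strcmp(slash, "/..") == 0)
		return (0);
	return (1);
}

int
main(void)
{
	printf("%d %d %d %d\n",
	    cachefile_ok("none"),			/* 1 */
	    cachefile_ok("/etc/zfs/zpool.cache"),	/* 1 */
	    cachefile_ok("relative/path"),		/* 0 */
	    cachefile_ok("/tmp/.."));			/* 0 */
	return (0);
}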
+ */ +void +spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) +{ + if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { + VERIFY(zap_remove(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); + spa->spa_bootfs = 0; + } +} /* * ========================================================================== @@ -117,40 +487,26 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) static void spa_activate(spa_t *spa) { - int t; - int nthreads = zio_taskq_threads; - char name[32]; ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_normal_class = metaslab_class_create(); + spa->spa_log_class = metaslab_class_create(); - if (nthreads == 0) - nthreads = max_ncpus; - for (t = 0; t < ZIO_TYPES; t++) { - snprintf(name, sizeof(name), "spa_zio_issue %d", t); - spa->spa_zio_issue_taskq[t] = taskq_create(name, nthreads, - maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); - snprintf(name, sizeof(name), "spa_zio_intr %d", t); - spa->spa_zio_intr_taskq[t] = taskq_create(name, nthreads, - maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + spa->spa_zio_taskq[t][q] = taskq_create("spa_zio", + zio_taskq_threads[t][q], maxclsyspri, 50, + INT_MAX, TASKQ_PREPOPULATE); + } } - rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); - - mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); - - list_create(&spa->spa_dirty_list, sizeof (vdev_t), - offsetof(vdev_t, vdev_dirty_node)); + list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_config_dirty_node)); + list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_state_dirty_node)); txg_list_create(&spa->spa_vdev_txg_list, offsetof(struct vdev, vdev_txg_node)); @@ -169,8 +525,6 @@ spa_activate(spa_t *spa) static void spa_deactivate(spa_t *spa) { - int t; - ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); @@ -179,18 +533,22 @@ spa_deactivate(spa_t *spa) txg_list_destroy(&spa->spa_vdev_txg_list); - list_destroy(&spa->spa_dirty_list); + list_destroy(&spa->spa_config_dirty_list); + list_destroy(&spa->spa_state_dirty_list); - for (t = 0; t < ZIO_TYPES; t++) { - taskq_destroy(spa->spa_zio_issue_taskq[t]); - taskq_destroy(spa->spa_zio_intr_taskq[t]); - spa->spa_zio_issue_taskq[t] = NULL; - spa->spa_zio_intr_taskq[t] = NULL; + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + taskq_destroy(spa->spa_zio_taskq[t][q]); + spa->spa_zio_taskq[t][q] = NULL; + } } metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; + metaslab_class_destroy(spa->spa_log_class); + spa->spa_log_class = NULL; + /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. 
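The per-type, per-queue thread counts replace the single vfs.zfs.zio.taskq_threads knob: reads get most of their threads on the interrupt side, writes on the issue side. A sketch of how spa_activate()'s nested loop consumes such a table (thread counts copied from the diff; the enum names are stand-ins for the kernel's):

#include <stdio.h>

enum { Q_ISSUE, Q_INTR, Q_TYPES };
enum { T_NULL, T_READ, T_WRITE, T_FREE, T_CLAIM, T_IOCTL, T_TYPES };

static const int nthreads[T_TYPES][Q_TYPES] = {
	{ 1, 1 },	/* NULL */
	{ 1, 8 },	/* READ: interrupt-side heavy */
	{ 8, 1 },	/* WRITE: issue-side heavy */
	{ 1, 1 },	/* FREE */
	{ 1, 1 },	/* CLAIM */
	{ 1, 1 },	/* IOCTL */
};

int
main(void)
{
	/* Model of spa_activate(): one thread pool per (type, queue). */
	for (int t = 0; t < T_TYPES; t++)
		for (int q = 0; q < Q_TYPES; q++)
			printf("taskq[type=%d][%s] = %d threads\n",
			    t, q == Q_ISSUE ? "issue" : "intr",
			    nthreads[t][q]);
	return (0);
}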
@@ -200,16 +558,6 @@ spa_deactivate(spa_t *spa) avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); - rw_destroy(&spa->spa_traverse_lock); - mutex_destroy(&spa->spa_uberblock_lock); - mutex_destroy(&spa->spa_errlog_lock); - mutex_destroy(&spa->spa_errlist_lock); - mutex_destroy(&spa->spa_config_lock.scl_lock); - cv_destroy(&spa->spa_config_lock.scl_cv); - mutex_destroy(&spa->spa_sync_bplist.bpl_lock); - mutex_destroy(&spa->spa_history_lock); - mutex_destroy(&spa->spa_props_lock); - spa->spa_state = POOL_STATE_UNINITIALIZED; } @@ -233,8 +581,13 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, if ((*vdp)->vdev_ops->vdev_op_leaf) return (0); - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) { + error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children); + + if (error == ENOENT) + return (0); + + if (error) { vdev_free(*vdp); *vdp = NULL; return (EINVAL); @@ -263,6 +616,8 @@ spa_unload(spa_t *spa) { int i; + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + /* * Stop async tasks. */ @@ -277,10 +632,17 @@ spa_unload(spa_t *spa) } /* - * Wait for any outstanding prefetch I/O to complete. + * Wait for any outstanding async I/O to complete. + */ + mutex_enter(&spa->spa_async_root_lock); + while (spa->spa_async_root_count != 0) + cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock); + mutex_exit(&spa->spa_async_root_lock); + + /* + * Drop and purge level 2 cache */ - spa_config_enter(spa, RW_WRITER, FTAG); - spa_config_exit(spa, FTAG); + spa_l2cache_drop(spa); /* * Close the dsl pool. @@ -297,16 +659,31 @@ spa_unload(spa_t *spa) vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); - for (i = 0; i < spa->spa_nspares; i++) - vdev_free(spa->spa_spares[i]); - if (spa->spa_spares) { - kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); - spa->spa_spares = NULL; + for (i = 0; i < spa->spa_spares.sav_count; i++) + vdev_free(spa->spa_spares.sav_vdevs[i]); + if (spa->spa_spares.sav_vdevs) { + kmem_free(spa->spa_spares.sav_vdevs, + spa->spa_spares.sav_count * sizeof (void *)); + spa->spa_spares.sav_vdevs = NULL; + } + if (spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; } - if (spa->spa_sparelist) { - nvlist_free(spa->spa_sparelist); - spa->spa_sparelist = NULL; + spa->spa_spares.sav_count = 0; + + for (i = 0; i < spa->spa_l2cache.sav_count; i++) + vdev_free(spa->spa_l2cache.sav_vdevs[i]); + if (spa->spa_l2cache.sav_vdevs) { + kmem_free(spa->spa_l2cache.sav_vdevs, + spa->spa_l2cache.sav_count * sizeof (void *)); + spa->spa_l2cache.sav_vdevs = NULL; } + if (spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; + } + spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; } @@ -314,8 +691,8 @@ spa_unload(spa_t *spa) /* * Load (or re-load) the current list of vdevs describing the active spares for * this pool. When this is called, we have some form of basic information in - * 'spa_sparelist'. We parse this into vdevs, try to open them, and then - * re-generate a more complete list including status information. + * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and + * then re-generate a more complete list including status information. 
*/ static void spa_load_spares(spa_t *spa) @@ -325,31 +702,34 @@ spa_load_spares(spa_t *spa) int i; vdev_t *vd, *tvd; + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + /* * First, close and free any existing spare vdevs. */ - for (i = 0; i < spa->spa_nspares; i++) { - vd = spa->spa_spares[i]; + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; /* Undo the call to spa_activate() below */ - if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && - tvd->vdev_isspare) + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, + B_FALSE)) != NULL && tvd->vdev_isspare) spa_spare_remove(tvd); vdev_close(vd); vdev_free(vd); } - if (spa->spa_spares) - kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); + if (spa->spa_spares.sav_vdevs) + kmem_free(spa->spa_spares.sav_vdevs, + spa->spa_spares.sav_count * sizeof (void *)); - if (spa->spa_sparelist == NULL) + if (spa->spa_spares.sav_config == NULL) nspares = 0; else - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, + VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); - spa->spa_nspares = (int)nspares; - spa->spa_spares = NULL; + spa->spa_spares.sav_count = (int)nspares; + spa->spa_spares.sav_vdevs = NULL; if (nspares == 0) return; @@ -363,15 +743,17 @@ spa_load_spares(spa_t *spa) * validate each vdev on the spare list. If the vdev also exists in the * active configuration, then we also mark this vdev as an active spare. */ - spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); - for (i = 0; i < spa->spa_nspares; i++) { + spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), + KM_SLEEP); + for (i = 0; i < spa->spa_spares.sav_count; i++) { VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); - spa->spa_spares[i] = vd; + spa->spa_spares.sav_vdevs[i] = vd; - if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, + B_FALSE)) != NULL) { if (!tvd->vdev_isspare) spa_spare_add(tvd); @@ -392,29 +774,167 @@ spa_load_spares(spa_t *spa) spa_spare_activate(tvd); } + vd->vdev_top = vd; + if (vdev_open(vd) != 0) continue; - vd->vdev_top = vd; - (void) vdev_validate_spare(vd); + if (vdev_validate_aux(vd) == 0) + spa_spare_add(vd); } /* * Recompute the stashed list of spares, with status information * this time. 
*/ - VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, + VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); - spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); - for (i = 0; i < spa->spa_nspares; i++) - spares[i] = vdev_config_generate(spa, spa->spa_spares[i], - B_TRUE, B_TRUE); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - spares, spa->spa_nspares) == 0); - for (i = 0; i < spa->spa_nspares; i++) + spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), + KM_SLEEP); + for (i = 0; i < spa->spa_spares.sav_count; i++) + spares[i] = vdev_config_generate(spa, + spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); + for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); - kmem_free(spares, spa->spa_nspares * sizeof (void *)); + kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); +} + +/* + * Load (or re-load) the current list of vdevs describing the active l2cache for + * this pool. When this is called, we have some form of basic information in + * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and + * then re-generate a more complete list including status information. + * Devices which are already active have their details maintained, and are + * not re-opened. + */ +static void +spa_load_l2cache(spa_t *spa) +{ + nvlist_t **l2cache; + uint_t nl2cache; + int i, j, oldnvdevs; + uint64_t guid, size; + vdev_t *vd, **oldvdevs, **newvdevs; + spa_aux_vdev_t *sav = &spa->spa_l2cache; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if (sav->sav_config != NULL) { + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); + } else { + nl2cache = 0; + } + + oldvdevs = sav->sav_vdevs; + oldnvdevs = sav->sav_count; + sav->sav_vdevs = NULL; + sav->sav_count = 0; + + /* + * Process new nvlist of vdevs. + */ + for (i = 0; i < nl2cache; i++) { + VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, + &guid) == 0); + + newvdevs[i] = NULL; + for (j = 0; j < oldnvdevs; j++) { + vd = oldvdevs[j]; + if (vd != NULL && guid == vd->vdev_guid) { + /* + * Retain previous vdev for add/remove ops. + */ + newvdevs[i] = vd; + oldvdevs[j] = NULL; + break; + } + } + + if (newvdevs[i] == NULL) { + /* + * Create new vdev + */ + VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, + VDEV_ALLOC_L2CACHE) == 0); + ASSERT(vd != NULL); + newvdevs[i] = vd; + + /* + * Commit this vdev as an l2cache device, + * even if it fails to open. 
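spa_load_l2cache() merges the new config against the already-open devices by GUID, retaining matching vdevs across add/remove operations and purging whatever remains of the old array. A toy model of that merge (plain structs instead of vdev_t; the layout is illustrative):

#include <stdint.h>
#include <stdio.h>

struct vdev { uint64_t guid; };

int
main(void)
{
	struct vdev old0 = { 111 }, old1 = { 222 };
	struct vdev *oldv[] = { &old0, &old1 };
	uint64_t newguids[] = { 222, 333 };	/* 111 was removed */
	struct vdev *newv[2] = { NULL, NULL };

	/* Retain step: reuse an old vdev whose GUID matches. */
	for (int i = 0; i < 2; i++) {
		for (int j = 0; j < 2; j++) {
			if (oldv[j] != NULL && oldv[j]->guid == newguids[i]) {
				newv[i] = oldv[j];
				oldv[j] = NULL;
			}
		}
		if (newv[i] == NULL)
			printf("create new vdev for guid %llu\n",
			    (unsigned long long)newguids[i]);
	}

	/* Purge step: anything left in the old array was dropped. */
	for (int j = 0; j < 2; j++)
		if (oldv[j] != NULL)
			printf("purge dropped vdev guid %llu\n",
			    (unsigned long long)oldv[j]->guid);
	return (0);
}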
+ */ + spa_l2cache_add(vd); + + vd->vdev_top = vd; + vd->vdev_aux = sav; + + spa_l2cache_activate(vd); + + if (vdev_open(vd) != 0) + continue; + + (void) vdev_validate_aux(vd); + + if (!vdev_is_dead(vd)) { + size = vdev_get_rsize(vd); + l2arc_add_vdev(spa, vd, + VDEV_LABEL_START_SIZE, + size - VDEV_LABEL_START_SIZE); + } + } + } + + /* + * Purge vdevs that were dropped + */ + for (i = 0; i < oldnvdevs; i++) { + uint64_t pool; + + vd = oldvdevs[i]; + if (vd != NULL) { + if ((spa_mode & FWRITE) && + spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && + l2arc_vdev_present(vd)) { + l2arc_remove_vdev(vd); + } + (void) vdev_close(vd); + spa_l2cache_remove(vd); + } + } + + if (oldvdevs) + kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); + + if (sav->sav_config == NULL) + goto out; + + sav->sav_vdevs = newvdevs; + sav->sav_count = (int)nl2cache; + + /* + * Recompute the stashed list of l2cache devices, with status + * information this time. + */ + VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, + DATA_TYPE_NVLIST_ARRAY) == 0); + + l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); + for (i = 0; i < sav->sav_count; i++) + l2cache[i] = vdev_config_generate(spa, + sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); + VERIFY(nvlist_add_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); +out: + for (i = 0; i < sav->sav_count; i++) + nvlist_free(l2cache[i]); + if (sav->sav_count) + kmem_free(l2cache, sav->sav_count * sizeof (void *)); } static int @@ -440,6 +960,50 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) } /* + * Checks to see if the given vdev could not be opened, in which case we post a + * sysevent to notify the autoreplace code that the device has been removed. + */ +static void +spa_check_removed(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + spa_check_removed(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { + zfs_post_autoreplace(vd->vdev_spa, vd); + spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); + } +} + +/* + * Check for missing log devices + */ +int +spa_check_logs(spa_t *spa) +{ + switch (spa->spa_log_state) { + case SPA_LOG_MISSING: + /* need to recheck in case slog has been restored */ + case SPA_LOG_UNKNOWN: + if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, + DS_FIND_CHILDREN)) { + spa->spa_log_state = SPA_LOG_MISSING; + return (1); + } + break; + + case SPA_LOG_CLEAR: + (void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL, + DS_FIND_CHILDREN); + break; + } + spa->spa_log_state = SPA_LOG_GOOD; + return (0); +} + +/* * Load an existing storage pool, using the pool's builtin spa_config as a * source of configuration information. */ @@ -453,7 +1017,10 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) uint64_t config_cache_txg = spa->spa_config_txg; uint64_t pool_guid; uint64_t version; - zio_t *zio; + uint64_t autoreplace = 0; + char *ereport = FM_EREPORT_ZFS_POOL; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa->spa_load_state = state; @@ -468,7 +1035,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * it's not present treat it as the initial version. 
*/ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) - version = ZFS_VERSION_INITIAL; + version = SPA_VERSION_INITIAL; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); @@ -486,10 +1053,10 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. */ - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_ubsync.ub_version = version; error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) goto out; @@ -500,18 +1067,19 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) /* * Try to open all vdevs, loading each label in the process. */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_open(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) goto out; /* * Validate the labels for all leaf vdevs. We need to grab the config - * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD - * flag. + * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER. */ - spa_config_enter(spa, RW_READER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) goto out; @@ -524,12 +1092,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) /* * Find the best uberblock. */ - bzero(ub, sizeof (uberblock_t)); - - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); - vdev_uberblock_load(zio, rvd, ub); - error = zio_wait(zio); + vdev_uberblock_load(NULL, rvd, ub); /* * If we weren't able to find a single valid uberblock, return failure. @@ -544,7 +1107,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) /* * If the pool is newer than the code, we can't open it. */ - if (ub->ub_version > ZFS_VERSION) { + if (ub->ub_version > SPA_VERSION) { vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_VERSION_NEWER); error = ENOTSUP; @@ -596,12 +1159,8 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) goto out; } - /* - * hostid is set after the root file system is mounted, so - * ignore the check until it's done. - */ - if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, - &hostid) == 0 && root_mounted()) { + if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig, + ZPOOL_CONFIG_HOSTID, &hostid) == 0) { char *hostname; unsigned long myhostid = 0; @@ -609,12 +1168,13 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); - if ((unsigned long)hostid != myhostid) { + if (hostid != 0 && myhostid != 0 && + (unsigned long)hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " - "another system (host: %s hostid: 0x%lx). " + "another system (host: %s hostid: 0x%lx). " "See: http://www.sun.com/msg/ZFS-8000-EY", - spa->spa_name, hostname, + spa_name(spa), hostname, (unsigned long)hostid); error = EBADF; goto out; @@ -695,7 +1255,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * Load any hot spares for this pool. 
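The relaxed hostid test above refuses the pool only when both hostids are known and differ; a zero on either side (no hostid recorded in the label, or none configured locally) is treated as unknown and allowed. A sketch of the predicate (hw_serial is modeled here as the decimal string the kernel parses with ddi_strtoul()):

#include <stdio.h>
#include <stdlib.h>

/*
 * Returns nonzero if the pool may be loaded; a caller following the
 * kernel logic would fail with EBADF on a mismatch of two live hostids.
 */
static int
hostid_ok(unsigned long pool_hostid, const char *hw_serial)
{
	unsigned long myhostid = strtoul(hw_serial, NULL, 10);

	if (pool_hostid != 0 && myhostid != 0 && pool_hostid != myhostid)
		return (0);
	return (1);
}

int
main(void)
{
	printf("%d %d %d\n",
	    hostid_ok(0xdeadUL, "57005"),	/* 0xdead == 57005: ok */
	    hostid_ok(0, "57005"),		/* unknown in label: ok */
	    hostid_ok(0xbeefUL, "57005"));	/* mismatch: refuse */
	return (0);
}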
*/ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); + DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); if (error != 0 && error != ENOENT) { vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -703,20 +1263,59 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) goto out; } if (error == 0) { - ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); - if (load_nvlist(spa, spa->spa_spares_object, - &spa->spa_sparelist) != 0) { + ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); + if (load_nvlist(spa, spa->spa_spares.sav_object, + &spa->spa_spares.sav_config) != 0) { vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); error = EIO; goto out; } - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); } + /* + * Load any level 2 ARC devices for this pool. + */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_L2CACHE, sizeof (uint64_t), 1, + &spa->spa_l2cache.sav_object); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + if (error == 0) { + ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); + if (load_nvlist(spa, spa->spa_l2cache.sav_object, + &spa->spa_l2cache.sav_config) != 0) { + vdev_set_state(rvd, B_TRUE, + VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + } + + if (spa_check_logs(spa)) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LOG); + error = ENXIO; + ereport = FM_EREPORT_ZFS_LOG_REPLAY; + goto out; + } + + + spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); @@ -730,11 +1329,33 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) if (error == 0) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), sizeof (uint64_t), 1, &spa->spa_bootfs); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), + sizeof (uint64_t), 1, &autoreplace); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_DELEGATION), + sizeof (uint64_t), 1, &spa->spa_delegation); + (void) zap_lookup(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), + sizeof (uint64_t), 1, &spa->spa_failmode); } /* + * If the 'autoreplace' property is set, then post a resource notifying + * the ZFS DE that it should not issue any faults for unopenable + * devices. We also iterate over the vdevs, and post a sysevent for any + * unopenable vdevs so that the normal autoreplace handler can take + * over. + */ + if (autoreplace && state != SPA_LOAD_TRYIMPORT) + spa_check_removed(spa->spa_root_vdev); + + /* * Load the vdev state for all toplevel vdevs. 
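/*
 * The property loads above share a default-then-override pattern: seed
 * the in-core field with the property default, then let an optional
 * ZAP entry overwrite it, deliberately ignoring a failed lookup.
 * Generic sketch; read_entry() is a hypothetical stand-in for the
 * zap_lookup() calls.
 */
static uint64_t
load_prop_or_default(int (*read_entry)(const char *, uint64_t *),
    const char *name, uint64_t def)
{
	uint64_t val = def;

	(void) read_entry(name, &val);	/* ENOENT keeps the default */
	return (val);
}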
*/ vdev_load(rvd); @@ -742,9 +1363,9 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) /* * Propagate the leaf DTLs we just loaded all the way up the tree. */ - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); /* * Check the state of the root vdev. If it can't be opened, it @@ -766,7 +1387,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), spa_first_txg(spa)); - (void) dmu_objset_find(spa->spa_name, + (void) dmu_objset_find(spa_name(spa), zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); @@ -800,8 +1421,9 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) error = 0; out: + spa->spa_minref = refcount_count(&spa->spa_refcount); if (error && error != EBADF) - zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); + zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); spa->spa_load_state = SPA_LOAD_NONE; spa->spa_ena = 0; @@ -814,7 +1436,7 @@ out: * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the - * POOL_STATE_UNITIALIZED state. + * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some @@ -825,7 +1447,6 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) { spa_t *spa; int error; - int loaded = B_FALSE; int locked = B_FALSE; *spapp = NULL; @@ -860,11 +1481,10 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ - zfs_post_ok(spa, NULL); spa_unload(spa); spa_deactivate(spa); + spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); - spa_config_sync(); if (locked) mutex_exit(&spa_namespace_lock); return (ENOENT); @@ -876,12 +1496,9 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ - if (config != NULL && spa->spa_root_vdev != NULL) { - spa_config_enter(spa, RW_READER, FTAG); + if (config != NULL && spa->spa_root_vdev != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - spa_config_exit(spa, FTAG); - } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = B_TRUE; @@ -890,30 +1507,19 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) *spapp = NULL; return (error); } else { - zfs_post_ok(spa, NULL); spa->spa_last_open_failed = B_FALSE; } - - loaded = B_TRUE; } spa_open_ref(spa, tag); + if (locked) mutex_exit(&spa_namespace_lock); *spapp = spa; - if (config != NULL) { - spa_config_enter(spa, RW_READER, FTAG); + if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - spa_config_exit(spa, FTAG); - } - - /* - * If we just loaded the pool, resilver anything that's out of date. - */ - if (loaded && (spa_mode & FWRITE)) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); return (0); } @@ -952,6 +1558,9 @@ spa_inject_delref(spa_t *spa) mutex_exit(&spa_namespace_lock); } +/* + * Add spares device information to the nvlist. 
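/*
 * Caller-side sketch for the open path above, assuming only the
 * spa_open()/spa_close() signatures used in this file; with_pool()
 * itself is hypothetical.  Every successful open must be balanced by a
 * close, or spa_refcount_zero() can never succeed at export time.
 */
static int
with_pool(const char *name, int (*fn)(spa_t *))
{
	spa_t *spa;
	int error;

	if ((error = spa_open(name, &spa, FTAG)) != 0)
		return (error);
	error = fn(spa);
	spa_close(spa, FTAG);
	return (error);
}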
+ */ static void spa_add_spares(spa_t *spa, nvlist_t *config) { @@ -963,12 +1572,12 @@ spa_add_spares(spa_t *spa, nvlist_t *config) uint_t vsc; uint64_t pool; - if (spa->spa_nspares == 0) + if (spa->spa_spares.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, + VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); if (nspares != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, @@ -984,7 +1593,8 @@ spa_add_spares(spa_t *spa, nvlist_t *config) for (i = 0; i < nspares; i++) { VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &guid) == 0); - if (spa_spare_exists(guid, &pool) && pool != 0ULL) { + if (spa_spare_exists(guid, &pool, NULL) && + pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( spares[i], ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); @@ -995,6 +1605,62 @@ spa_add_spares(spa_t *spa, nvlist_t *config) } } +/* + * Add l2cache device information to the nvlist, including vdev stats. + */ +static void +spa_add_l2cache(spa_t *spa, nvlist_t *config) +{ + nvlist_t **l2cache; + uint_t i, j, nl2cache; + nvlist_t *nvroot; + uint64_t guid; + vdev_t *vd; + vdev_stat_t *vs; + uint_t vsc; + + if (spa->spa_l2cache.sav_count == 0) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + VERIFY(nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + if (nl2cache != 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + + /* + * Update level 2 cache device stats. + */ + + for (i = 0; i < nl2cache; i++) { + VERIFY(nvlist_lookup_uint64(l2cache[i], + ZPOOL_CONFIG_GUID, &guid) == 0); + + vd = NULL; + for (j = 0; j < spa->spa_l2cache.sav_count; j++) { + if (guid == + spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { + vd = spa->spa_l2cache.sav_vdevs[j]; + break; + } + } + ASSERT(vd != NULL); + + VERIFY(nvlist_lookup_uint64_array(l2cache[i], + ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); + vdev_get_stats(vd, vs); + } + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { @@ -1008,7 +1674,12 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); + if (spa_suspended(spa)) + VERIFY(nvlist_add_uint64(*config, + ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); + spa_add_spares(spa, *config); + spa_add_l2cache(spa, *config); } /* @@ -1037,45 +1708,48 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) } /* - * Validate that the 'spares' array is well formed. We must have an array of - * nvlists, each which describes a valid leaf vdev. If this is an import (mode - * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long - * as they are well-formed. + * Validate that the auxiliary device array is well formed. We must have an + * array of nvlists, each of which describes a valid leaf vdev. If this is an + * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be + * specified, as long as they are well-formed.
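/*
 * In spa_add_l2cache() above, each config entry is paired with its
 * in-core vdev by guid.  Stripped of the nvlist plumbing, the lookup
 * is a linear scan (sketch; the helper name is hypothetical):
 */
static vdev_t *
aux_vdev_by_guid(spa_aux_vdev_t *sav, uint64_t guid)
{
	for (int j = 0; j < sav->sav_count; j++)
		if (sav->sav_vdevs[j]->vdev_guid == guid)
			return (sav->sav_vdevs[j]);
	return (NULL);		/* the caller ASSERTs this cannot happen */
}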
*/ static int -spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) +spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, + spa_aux_vdev_t *sav, const char *config, uint64_t version, + vdev_labeltype_t label) { - nvlist_t **spares; - uint_t i, nspares; + nvlist_t **dev; + uint_t i, ndev; vdev_t *vd; int error; + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + /* - * It's acceptable to have no spares specified. + * It's acceptable to have no devs specified. */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) != 0) + if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); - if (nspares == 0) + if (ndev == 0) return (EINVAL); /* - * Make sure the pool is formatted with a version that supports hot - * spares. + * Make sure the pool is formatted with a version that supports this + * device type. */ - if (spa_version(spa) < ZFS_VERSION_SPARES) + if (spa_version(spa) < version) return (ENOTSUP); /* - * Set the pending spare list so we correctly handle device in-use + * Set the pending device list so we correctly handle device in-use * checking. */ - spa->spa_pending_spares = spares; - spa->spa_pending_nspares = nspares; + sav->sav_pending = dev; + sav->sav_npending = ndev; - for (i = 0; i < nspares; i++) { - if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, + for (i = 0; i < ndev; i++) { + if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; @@ -1085,43 +1759,149 @@ spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) goto out; } + /* + * The L2ARC currently only supports disk devices in + * kernel context. For user-level testing, we allow it. + */ +#ifdef _KERNEL + if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && + strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { + error = ENOTBLK; + goto out; + } +#endif vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && - (error = vdev_label_init(vd, crtxg, - VDEV_LABEL_SPARE)) == 0) { - VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, + (error = vdev_label_init(vd, crtxg, label)) == 0) { + VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } vdev_free(vd); - if (error && mode != VDEV_ALLOC_SPARE) + if (error && + (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: - spa->spa_pending_spares = NULL; - spa->spa_pending_nspares = 0; + sav->sav_pending = NULL; + sav->sav_npending = 0; return (error); } +static int +spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) +{ + int error; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, + &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, + VDEV_LABEL_SPARE)) != 0) { + return (error); + } + + return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, + &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, + VDEV_LABEL_L2CACHE)); } + +static void +spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, + const char *config) +{ + int i; + + if (sav->sav_config != NULL) { + nvlist_t **olddevs; + uint_t oldndevs; + nvlist_t **newdevs; + + /* + * Generate new dev list by concatenating with the + * current dev list.
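/*
 * spa_validate_aux() above is spa_validate_aux_devs() applied once per
 * auxiliary device class.  The per-class parameters line up as in this
 * table; the table itself is illustrative only, though the constants
 * are the ones used in this file.
 */
static const struct aux_class {
	const char	*ac_config;	/* nvlist key in the vdev nvroot */
	uint64_t	ac_version;	/* minimum pool version required */
	vdev_labeltype_t ac_label;	/* label written by vdev_label_init() */
} aux_classes[] = {
	{ ZPOOL_CONFIG_SPARES,	SPA_VERSION_SPARES,	VDEV_LABEL_SPARE },
	{ ZPOOL_CONFIG_L2CACHE,	SPA_VERSION_L2CACHE,	VDEV_LABEL_L2CACHE },
};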
+ */ + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, + &olddevs, &oldndevs) == 0); + + newdevs = kmem_alloc(sizeof (void *) * + (ndevs + oldndevs), KM_SLEEP); + for (i = 0; i < oldndevs; i++) + VERIFY(nvlist_dup(olddevs[i], &newdevs[i], + KM_SLEEP) == 0); + for (i = 0; i < ndevs; i++) + VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], + KM_SLEEP) == 0); + + VERIFY(nvlist_remove(sav->sav_config, config, + DATA_TYPE_NVLIST_ARRAY) == 0); + + VERIFY(nvlist_add_nvlist_array(sav->sav_config, + config, newdevs, ndevs + oldndevs) == 0); + for (i = 0; i < oldndevs + ndevs; i++) + nvlist_free(newdevs[i]); + kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); + } else { + /* + * Generate a new dev list. + */ + VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, + devs, ndevs) == 0); + } +} + +/* + * Stop and drop level 2 ARC devices + */ +void +spa_l2cache_drop(spa_t *spa) +{ + vdev_t *vd; + int i; + spa_aux_vdev_t *sav = &spa->spa_l2cache; + + for (i = 0; i < sav->sav_count; i++) { + uint64_t pool; + + vd = sav->sav_vdevs[i]; + ASSERT(vd != NULL); + + if ((spa_mode & FWRITE) && + spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && + l2arc_vdev_present(vd)) { + l2arc_remove_vdev(vd); + } + if (vd->vdev_isl2cache) + spa_l2cache_remove(vd); + vdev_clear_stats(vd); + (void) vdev_close(vd); + } +} + /* * Pool Creation */ int -spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) +spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, + const char *history_str, nvlist_t *zplprops) { spa_t *spa; + char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int c, error = 0; uint64_t txg = TXG_INITIAL; - nvlist_t **spares; - uint_t nspares; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + uint64_t version; /* * If this pool already exists, return failure. @@ -1135,36 +1915,51 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) /* * Allocate a new spa_t structure. */ + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(pool, altroot); spa_activate(spa); spa->spa_uberblock.ub_txg = txg - 1; - spa->spa_uberblock.ub_version = ZFS_VERSION; + + if (props && (error = spa_prop_validate(spa, props))) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), + &version) != 0) + version = SPA_VERSION; + ASSERT(version <= SPA_VERSION); + spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; /* * Create the root vdev. 
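/*
 * Version selection at create time, as implemented above (sketch; the
 * helper name is hypothetical): honor a requested ZPOOL_PROP_VERSION
 * when present, otherwise create at the running code's SPA_VERSION,
 * and never accept a version newer than the code supports.
 */
static uint64_t
choose_create_version(nvlist_t *props)
{
	uint64_t version;

	if (props == NULL || nvlist_lookup_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0)
		version = SPA_VERSION;
	ASSERT(version <= SPA_VERSION);
	return (version);
}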
*/ - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); - if (error == 0 && rvd->vdev_children == 0) + if (error == 0 && !zfs_allocatable_devs(nvroot)) error = EINVAL; if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && - (error = spa_validate_spares(spa, nvroot, txg, + (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { for (c = 0; c < rvd->vdev_children; c++) vdev_init(rvd->vdev_child[c], txg); vdev_config_dirty(rvd); } - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); @@ -1179,17 +1974,32 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { - VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); - spa_config_exit(spa, FTAG); - spa->spa_sync_spares = B_TRUE; + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_spares.sav_sync = B_TRUE; + } + + /* + * Get the list of level 2 cache devices, if specified. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; } - spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); + spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; tx = dmu_tx_create_assigned(dp, txg); @@ -1198,7 +2008,7 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) * Create the pool config object. */ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_PACKED_NVLIST, 1 << 14, + DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, @@ -1207,12 +2017,14 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) cmn_err(CE_PANIC, "failed to add pool config"); } - /* Newly created pools are always deflated. */ - spa->spa_deflate = TRUE; - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { - cmn_err(CE_PANIC, "failed to add deflate"); + /* Newly created pools with the right version are always deflated. */ + if (version >= SPA_VERSION_RAIDZ_DEFLATE) { + spa->spa_deflate = TRUE; + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { + cmn_err(CE_PANIC, "failed to add deflate"); + } } /* @@ -1234,11 +2046,20 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) /* * Create the pool's history object. 
*/ - spa_history_create_obj(spa, tx); + if (version >= SPA_VERSION_ZPOOL_HISTORY) + spa_history_create_obj(spa, tx); + + /* + * Set pool properties. + */ + spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); + spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); + spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); + if (props) + spa_sync_props(spa, props, CRED(), tx); dmu_tx_commit(tx); - spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); @@ -1248,10 +2069,15 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) */ txg_wait_synced(spa->spa_dsl_pool, txg); - spa_config_sync(); + spa_config_sync(spa, B_FALSE, B_TRUE); + + if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) + (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); mutex_exit(&spa_namespace_lock); + spa->spa_minref = refcount_count(&spa->spa_refcount); + return (0); } @@ -1259,17 +2085,16 @@ spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) * Import the given pool into the system. We set up the necessary spa_t and * then call spa_load() to do the dirty work. */ -int -spa_import(const char *pool, nvlist_t *config, const char *altroot) +static int +spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, + boolean_t isroot, boolean_t allowfaulted) { spa_t *spa; - int error; + char *altroot = NULL; + int error, loaderr; nvlist_t *nvroot; - nvlist_t **spares; - uint_t nspares; - - if (!(spa_mode & FWRITE)) - return (EROFS); + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. @@ -1283,78 +2108,355 @@ spa_import(const char *pool, nvlist_t *config, const char *altroot) /* * Create and initialize the spa structure. */ + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(pool, altroot); spa_activate(spa); + if (allowfaulted) + spa->spa_import_faulted = B_TRUE; + spa->spa_is_root = isroot; + /* * Pass off the heavy lifting to spa_load(). - * Pass TRUE for mosconfig because the user-supplied config - * is actually the one to trust when doing an import. + * Pass TRUE for mosconfig (unless this is a root pool) because + * the user-supplied config is actually the one to trust when + * doing an import. */ - error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); + loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot); - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity anymore, * and conflicts with spa_has_spare(). 
*/ - if (spa->spa_sparelist) { - nvlist_free(spa->spa_sparelist); - spa->spa_sparelist = NULL; + if (!isroot && spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } + if (!isroot && spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; + spa_load_l2cache(spa); + } VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (error == 0) - error = spa_validate_spares(spa, nvroot, -1ULL, - VDEV_ALLOC_SPARE); - spa_config_exit(spa, FTAG); + error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); + if (error == 0) + error = spa_validate_aux(spa, nvroot, -1ULL, + VDEV_ALLOC_L2CACHE); + spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) { - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); + if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { + if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { + /* + * If we failed to load the pool, but 'allowfaulted' is + * set, then manually set the config as if the config + * passed in was specified in the cache file. + */ + error = 0; + spa->spa_import_faulted = B_FALSE; + if (spa->spa_config == NULL) + spa->spa_config = spa_config_generate(spa, + NULL, -1ULL, B_TRUE); + spa_unload(spa); + spa_deactivate(spa); + spa_config_sync(spa, B_FALSE, B_TRUE); + } else { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + } mutex_exit(&spa_namespace_lock); return (error); } /* - * Override any spares as specified by the user, as these may have - * correct device names/devids, etc. + * Override any spares and level 2 cache devices as specified by + * the user, as these may have correct device names/devids, etc. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { - if (spa->spa_sparelist) - VERIFY(nvlist_remove(spa->spa_sparelist, + if (spa->spa_spares.sav_config) + VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); else - VERIFY(nvlist_alloc(&spa->spa_sparelist, + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); - spa_config_exit(spa, FTAG); - spa->spa_sync_spares = B_TRUE; + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_spares.sav_sync = B_TRUE; + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + if (spa->spa_l2cache.sav_config) + VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; } + if (spa_mode & FWRITE) { + /* + * Update the config cache to include the newly-imported pool. 
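/*
 * The spare and l2cache overrides above share one shape: replace the
 * device array if a config already exists, otherwise allocate a fresh
 * config.  Condensed sketch; the helper is hypothetical, the calls are
 * stock libnvpair.
 */
static void
set_aux_array(nvlist_t **cfgp, const char *key, nvlist_t **devs,
    uint_t ndevs)
{
	if (*cfgp != NULL)
		VERIFY(nvlist_remove(*cfgp, key,
		    DATA_TYPE_NVLIST_ARRAY) == 0);
	else
		VERIFY(nvlist_alloc(cfgp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist_array(*cfgp, key, devs, ndevs) == 0);
}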
+ */ + spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot); + } + + spa->spa_import_faulted = B_FALSE; + mutex_exit(&spa_namespace_lock); + + return (0); +} + +#if defined(sun) +#ifdef _KERNEL +/* + * Build a "root" vdev for a top level vdev read in from a rootpool + * device label. + */ +static void +spa_build_rootpool_config(nvlist_t *config) +{ + nvlist_t *nvtop, *nvroot; + uint64_t pgid; + /* - * Update the config cache to include the newly-imported pool. + * Add this top-level vdev to the child array. */ - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) + == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) + == 0); - mutex_exit(&spa_namespace_lock); + /* + * Put this pool's top-level vdevs into a root vdev. + */ + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) + == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &nvtop, 1) == 0); /* - * Resilver anything that's out of date. + * Replace the existing vdev_tree with the new root vdev in + * this pool's configuration (remove the old, add the new). */ - if (spa_mode & FWRITE) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + nvlist_free(nvroot); +} + +/* + * Get the root pool information from the root disk, then import the root pool + * during the system boot up time. + */ +extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); + +int +spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, + uint64_t *besttxg) +{ + nvlist_t *config; + uint64_t txg; + int error; + + if (error = vdev_disk_read_rootlabel(devpath, devid, &config)) + return (error); + + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + if (bestconf != NULL) + *bestconf = config; + else + nvlist_free(config); + *besttxg = txg; return (0); } +boolean_t +spa_rootdev_validate(nvlist_t *nv) +{ + uint64_t ival; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) + return (B_FALSE); + + return (B_TRUE); +} + + +/* + * Given the boot device's physical path or devid, check if the device + * is in a valid state. If so, return the configuration from the vdev + * label. 
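/*
 * spa_get_rootconf() below boots from the mirror child whose label
 * carries the highest txg.  Stripped of the nvlist plumbing, the
 * selection is a running maximum (sketch; helper hypothetical):
 */
static int
best_label_txg(const uint64_t *txgs, int nchildren)
{
	int best = 0;

	for (int c = 1; c < nchildren; c++)
		if (txgs[c] > txgs[best])
			best = c;	/* only this child should boot */
	return (best);
}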
+ */ +int +spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) +{ + nvlist_t *conf = NULL; + uint64_t txg = 0; + nvlist_t *nvtop, **child; + char *type; + char *bootpath = NULL; + uint_t children, c; + char *tmp; + int error; + + if (devpath && ((tmp = strchr(devpath, ' ')) != NULL)) + *tmp = '\0'; + if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) { + cmn_err(CE_NOTE, "error reading device label"); + return (error); + } + if (txg == 0) { + cmn_err(CE_NOTE, "this device is detached"); + nvlist_free(conf); + return (EINVAL); + } + + VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0); + + if (strcmp(type, VDEV_TYPE_DISK) == 0) { + if (spa_rootdev_validate(nvtop)) { + goto out; + } else { + nvlist_free(conf); + return (EINVAL); + } + } + + ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0); + + VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0); + + /* + * Go through the vdevs in the mirror to see if the given device + * has the most recent txg. Only the device with the most + * recent txg has valid information and should be booted. + */ + for (c = 0; c < children; c++) { + char *cdevid, *cpath; + uint64_t tmptxg; + + if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, + &cpath) != 0) + return (EINVAL); + if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID, + &cdevid) != 0) + return (EINVAL); + if ((spa_check_rootconf(cpath, cdevid, NULL, + &tmptxg) == 0) && (tmptxg > txg)) { + txg = tmptxg; + VERIFY(nvlist_lookup_string(child[c], + ZPOOL_CONFIG_PATH, &bootpath) == 0); + } + } + + /* Does the best device match the one we've booted from? */ + if (bootpath) { + cmn_err(CE_NOTE, "try booting from '%s'", bootpath); + return (EINVAL); + } +out: + *bestconf = conf; + return (0); +} + +/* + * Import a root pool. + * + * For x86, devpath_list will consist of devid and/or physpath name of + * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). + * The GRUB "findroot" command will return the vdev we should boot. + * + * For Sparc, devpath_list consists of the physpath name of the booting + * device, no matter whether the rootpool is a single device pool or a + * mirrored pool. + * e.g. + * "/pci@1f,0/ide@d/disk@0,0:a" + */ +int +spa_import_rootpool(char *devpath, char *devid) +{ + nvlist_t *conf = NULL; + char *pname; + int error; + + /* + * Get the vdev pathname and configuration from the most + * recently updated vdev (highest txg). + */ + if (error = spa_get_rootconf(devpath, devid, &conf)) + goto msg_out; + + /* + * Add type "root" vdev to the config. + */ + spa_build_rootpool_config(conf); + + VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); + + /* + * We specify 'allowfaulted' for this to be treated like spa_open() + * instead of spa_import(). This prevents us from marking vdevs as + * persistently unavailable, and generates FMA ereports as if it were a + * pool open, not import. + */ + error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE); + if (error == EEXIST) + error = 0; + + nvlist_free(conf); + return (error); + +msg_out: + cmn_err(CE_NOTE, "\n" + " *************************************************** \n" + " * This device is not bootable! * \n" + " * It is either offlined or detached or faulted. * \n" + " * Please try to boot from a different device. * \n" + " *************************************************** "); + + return (error); +} +#endif +#endif + +/* + * Import a non-root pool into the system.
+ */ +int +spa_import(const char *pool, nvlist_t *config, nvlist_t *props) +{ + return (spa_import_common(pool, config, props, B_FALSE, B_FALSE)); +} + +int +spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props) +{ + return (spa_import_common(pool, config, props, B_FALSE, B_TRUE)); +} + + /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. @@ -1393,9 +2495,7 @@ spa_tryimport(nvlist_t *tryconfig) * If 'tryconfig' was at least parsable, return the current config. */ if (spa->spa_root_vdev != NULL) { - spa_config_enter(spa, RW_READER, FTAG); config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - spa_config_exit(spa, FTAG); VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, @@ -1404,9 +2504,42 @@ spa_tryimport(nvlist_t *tryconfig) spa->spa_uberblock.ub_timestamp) == 0); /* - * Add the list of hot spares. + * If the bootfs property exists on this pool then we + * copy it out so that external consumers can tell which + * pools are bootable. + */ + if (spa->spa_bootfs) { + char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + /* + * We have to play games with the name since the + * pool was opened as TRYIMPORT_NAME. + */ + if (dsl_dsobj_to_dsname(spa_name(spa), + spa->spa_bootfs, tmpname) == 0) { + char *cp; + char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + cp = strchr(tmpname, '/'); + if (cp == NULL) { + (void) strlcpy(dsname, tmpname, + MAXPATHLEN); + } else { + (void) snprintf(dsname, MAXPATHLEN, + "%s/%s", poolname, ++cp); + } + VERIFY(nvlist_add_string(config, + ZPOOL_CONFIG_BOOTFS, dsname) == 0); + kmem_free(dsname, MAXPATHLEN); + } + kmem_free(tmpname, MAXPATHLEN); + } + + /* + * Add the list of hot spares and level 2 cache devices. */ spa_add_spares(spa, config); + spa_add_l2cache(spa, config); } spa_unload(spa); @@ -1426,7 +2559,8 @@ spa_tryimport(nvlist_t *tryconfig) * configuration from the cache afterwards. */ static int -spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) +spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, + boolean_t force) { spa_t *spa; @@ -1461,7 +2595,6 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) * Objsets may be open only because they're dirty, so we * have to force it to sync before checking spa_refcnt. */ - spa_scrub_suspend(spa); txg_wait_synced(spa->spa_dsl_pool, 0); /* @@ -1472,14 +2605,23 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0 && new_state != POOL_STATE_UNINITIALIZED)) { - spa_scrub_resume(spa); spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (EBUSY); } - spa_scrub_resume(spa); - VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); + /* + * A pool cannot be exported if it has an active shared spare. + * This is to prevent other pools stealing the active spare + * from an exported pool. At the user's explicit request, such + * a pool can be forcibly exported. + */ + if (!force && new_state == POOL_STATE_EXPORTED && + spa_has_active_shared_spare(spa)) { + spa_async_resume(spa); + mutex_exit(&spa_namespace_lock); + return (EXDEV); + } /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out.
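/*
 * The export gate above, reduced to its two checks (sketch; the helper
 * is hypothetical and omits the spa_inject_ref test that the real code
 * folds into the refcount check):
 */
static int
export_allowed(spa_t *spa, int new_state, boolean_t force)
{
	if (!spa_refcount_zero(spa))
		return (EBUSY);		/* pool still has consumers */
	if (!force && new_state == POOL_STATE_EXPORTED &&
	    spa_has_active_shared_spare(spa))
		return (EXDEV);		/* spare is active in another pool */
	return (0);
}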
*/ if (new_state != POOL_STATE_UNINITIALIZED) { - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; spa->spa_final_txg = spa_last_synced_txg(spa) + 1; vdev_config_dirty(spa->spa_root_vdev); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); } } + spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); + if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); @@ -1504,8 +2648,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { + spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); - spa_config_sync(); } mutex_exit(&spa_namespace_lock); @@ -1518,16 +2662,16 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) int spa_destroy(char *pool) { - return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); + return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE)); } /* * Export a storage pool. */ int -spa_export(char *pool, nvlist_t **oldconfig) +spa_export(char *pool, nvlist_t **oldconfig, boolean_t force) { - return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); + return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force)); } /* @@ -1537,10 +2681,10 @@ spa_export(char *pool, nvlist_t **oldconfig) int spa_reset(char *pool) { - return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); + return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, + B_FALSE)); } - /* * ========================================================================== * Device manipulation @@ -1548,7 +2692,7 @@ spa_reset(char *pool) */ /* - * Add capacity to a storage pool. + * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) @@ -1557,8 +2701,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) int c, error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; - nvlist_t **spares; - uint_t i, nspares; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; txg = spa_vdev_enter(spa); @@ -1566,35 +2710,29 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, error)); - spa->spa_pending_vdev = vd; + spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) != 0) + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, + &nspares) != 0) nspares = 0; - if (vd->vdev_children == 0 && nspares == 0) { - spa->spa_pending_vdev = NULL; + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, + &nl2cache) != 0) + nl2cache = 0; + + if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); - } - if (vd->vdev_children != 0) { - if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { - spa->spa_pending_vdev = NULL; - return (spa_vdev_exit(spa, vd, txg, error)); - } - } + if (vd->vdev_children != 0 && + (error = vdev_create(vd, txg, B_FALSE)) != 0) + return (spa_vdev_exit(spa, vd, txg, error)); /* - * We must validate the spares after checking the children. Otherwise, - * vdev_inuse() will blindly overwrite the spare. + * We must validate the spares and l2cache devices after checking the + * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 
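/*
 * Every device-manipulation entry point below is bracketed by
 * spa_vdev_enter()/spa_vdev_exit(), which is what lets the rewritten
 * spa_vdev_add() drop its manual spa_pending_vdev cleanup.  Template
 * sketch of the idiom (the function itself is hypothetical):
 */
static int
vdev_op_template(spa_t *spa)
{
	uint64_t txg = spa_vdev_enter(spa);	/* config lock + open txg */
	int error = 0;

	/* ... mutate the vdev tree, setting error on failure ... */

	return (spa_vdev_exit(spa, NULL, txg, error));
}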
*/ - if ((error = spa_validate_spares(spa, nvroot, txg, - VDEV_ALLOC_ADD)) != 0) { - spa->spa_pending_vdev = NULL; + if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); - } - - spa->spa_pending_vdev = NULL; /* * Transfer each new top-level vdev from vd to rvd. @@ -1608,43 +2746,17 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } if (nspares != 0) { - if (spa->spa_sparelist != NULL) { - nvlist_t **oldspares; - uint_t oldnspares; - nvlist_t **newspares; - - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); - - newspares = kmem_alloc(sizeof (void *) * - (nspares + oldnspares), KM_SLEEP); - for (i = 0; i < oldnspares; i++) - VERIFY(nvlist_dup(oldspares[i], - &newspares[i], KM_SLEEP) == 0); - for (i = 0; i < nspares; i++) - VERIFY(nvlist_dup(spares[i], - &newspares[i + oldnspares], - KM_SLEEP) == 0); - - VERIFY(nvlist_remove(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); - - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, newspares, - nspares + oldnspares) == 0); - for (i = 0; i < oldnspares + nspares; i++) - nvlist_free(newspares[i]); - kmem_free(newspares, (oldnspares + nspares) * - sizeof (void *)); - } else { - VERIFY(nvlist_alloc(&spa->spa_sparelist, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - } - + spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, + ZPOOL_CONFIG_SPARES); spa_load_spares(spa); - spa->spa_sync_spares = B_TRUE; + spa->spa_spares.sav_sync = B_TRUE; + } + + if (nl2cache != 0) { + spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, + ZPOOL_CONFIG_L2CACHE); + spa_load_l2cache(spa); + spa->spa_l2cache.sav_sync = B_TRUE; } /* @@ -1676,7 +2788,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own - * mirror using the 'replacing' vdev, which is functionally idendical to + * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) @@ -1686,14 +2798,17 @@ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { uint64_t txg, open_txg; - int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; + dmu_tx_t *tx; + char *oldvdpath, *newvdpath; + int newvd_isspare; + int error; txg = spa_vdev_enter(spa); - oldvd = vdev_lookup_by_guid(rvd, guid); + oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -1704,7 +2819,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, - VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) + VDEV_ALLOC_ADD)) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; @@ -1715,6 +2833,12 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, 
newrootvd, txg, error)); + /* + * Spares can't replace logs + */ + if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root @@ -1828,6 +2952,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if (newvd->vdev_isspare) spa_spare_activate(newvd); + oldvdpath = spa_strdup(oldvd->vdev_path); + newvdpath = spa_strdup(newvd->vdev_path); + newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. @@ -1836,10 +2963,25 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + if (dmu_tx_assign(tx, TXG_WAIT) == 0) { + spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, + CRED(), "%s vdev=%s %s vdev=%s", + replacing && newvd_isspare ? "spare in" : + replacing ? "replace" : "attach", newvdpath, + replacing ? "for" : "to", oldvdpath); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + + spa_strfree(oldvdpath); + spa_strfree(newvdpath); + /* * Kick off a resilver to update newvd. */ - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); return (0); } @@ -1858,10 +3000,11 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid; + size_t len; txg = spa_vdev_enter(spa); - vd = vdev_lookup_by_guid(rvd, guid); + vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -1886,7 +3029,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) } ASSERT(pvd->vdev_ops != &vdev_spare_ops || - spa_version(spa) >= ZFS_VERSION_SPARES); + spa_version(spa) >= SPA_VERSION_SPARES); /* * Only mirror, replacing, and spare vdevs support detach. @@ -1925,13 +3068,26 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) break; } + if (c == pvd->vdev_children) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + /* - * If we are a replacing or spare vdev, then we can always detach the - * latter child, as that is how one cancels the operation. + * If we are detaching the second disk from a replacing vdev, then + * check to see if we changed the original vdev's path to have "/old" + * at the end in spa_vdev_attach(). If so, undo that change now. */ - if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && - c == pvd->vdev_children) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && + pvd->vdev_child[0]->vdev_path != NULL && + pvd->vdev_child[1]->vdev_path != NULL) { + ASSERT(pvd->vdev_child[1] == vd); + cvd = pvd->vdev_child[0]; + len = strlen(vd->vdev_path); + if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && + strcmp(cvd->vdev_path + len, "/old") == 0) { + spa_strfree(cvd->vdev_path); + cvd->vdev_path = spa_strdup(vd->vdev_path); + } + } /* * If we are detaching the original disk from a spare, then it implies @@ -1992,7 +3148,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) /* * Reevaluate the parent vdev state. 
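/*
 * The "/old" cleanup above is a plain suffix test: the remaining
 * child's path must equal the detached vdev's path plus "/old".
 * Sketch (hypothetical helper; plain string.h semantics):
 */
static int
path_is_old_twin(const char *cvd_path, const char *vd_path)
{
	size_t len = strlen(vd_path);

	return (strncmp(cvd_path, vd_path, len) == 0 &&
	    strcmp(cvd_path + len, "/old") == 0);
}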
*/ - vdev_propagate_state(cvd->vdev_parent); + vdev_propagate_state(cvd); /* * If the device we just detached was smaller than the others, it may be @@ -2015,6 +3171,8 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); + spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + error = spa_vdev_exit(spa, vd, txg, 0); /* @@ -2028,8 +3186,11 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) while ((spa = spa_next(spa)) != NULL) { if (spa->spa_state != POOL_STATE_ACTIVE) continue; - + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); } @@ -2037,100 +3198,125 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) return (error); } +static nvlist_t * +spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) +{ + for (int i = 0; i < count; i++) { + uint64_t guid; + + VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, + &guid) == 0); + + if (guid == target_guid) + return (nvpp[i]); + } + + return (NULL); +} + +static void +spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, + nvlist_t *dev_to_remove) +{ + nvlist_t **newdev = NULL; + + if (count > 1) + newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); + + for (int i = 0, j = 0; i < count; i++) { + if (dev[i] == dev_to_remove) + continue; + VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); + } + + VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); + VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); + + for (int i = 0; i < count - 1; i++) + nvlist_free(newdev[i]); + + if (count > 1) + kmem_free(newdev, (count - 1) * sizeof (void *)); +} + /* * Remove a device from the pool. Currently, this supports removing only hot - * spares. + * spares and level 2 ARC devices. */ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; - nvlist_t **spares, *nv, **newspares; - uint_t i, j, nspares; - int ret = 0; - - spa_config_enter(spa, RW_WRITER, FTAG); + nvlist_t **spares, **l2cache, *nv; + uint_t nspares, nl2cache; + uint64_t txg; + int error = 0; - vd = spa_lookup_by_guid(spa, guid); + txg = spa_vdev_enter(spa); - nv = NULL; - if (spa->spa_spares != NULL && - nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - for (i = 0; i < nspares; i++) { - uint64_t theguid; + vd = spa_lookup_by_guid(spa, guid, B_FALSE); - VERIFY(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &theguid) == 0); - if (theguid == guid) { - nv = spares[i]; - break; - } + if (spa->spa_spares.sav_vdevs != NULL && + nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && + (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { + /* + * Only remove the hot spare if it's not currently in use + * in this pool. + */ + if (vd == NULL || unspare) { + spa_vdev_remove_aux(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares, nv); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } else { + error = EBUSY; } - } - - /* - * We only support removing a hot spare, and only if it's not currently - * in use in this pool. 
- */ - if (nv == NULL && vd == NULL) { - ret = ENOENT; - goto out; - } - - if (nv == NULL && vd != NULL) { - ret = ENOTSUP; - goto out; - } - - if (!unspare && nv != NULL && vd != NULL) { - ret = EBUSY; - goto out; - } - - if (nspares == 1) { - newspares = NULL; + } else if (spa->spa_l2cache.sav_vdevs != NULL && + nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && + (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { + /* + * Cache devices can always be removed. + */ + spa_vdev_remove_aux(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); + spa_load_l2cache(spa); + spa->spa_l2cache.sav_sync = B_TRUE; + } else if (vd != NULL) { + /* + * Normal vdevs cannot be removed (yet). + */ + error = ENOTSUP; } else { - newspares = kmem_alloc((nspares - 1) * sizeof (void *), - KM_SLEEP); - for (i = 0, j = 0; i < nspares; i++) { - if (spares[i] != nv) - VERIFY(nvlist_dup(spares[i], - &newspares[j++], KM_SLEEP) == 0); - } + /* + * There is no vdev of any kind with the specified guid. + */ + error = ENOENT; } - VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - DATA_TYPE_NVLIST_ARRAY) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - newspares, nspares - 1) == 0); - for (i = 0; i < nspares - 1; i++) - nvlist_free(newspares[i]); - kmem_free(newspares, (nspares - 1) * sizeof (void *)); - spa_load_spares(spa); - spa->spa_sync_spares = B_TRUE; - -out: - spa_config_exit(spa, FTAG); - - return (ret); + return (spa_vdev_exit(spa, NULL, txg, error)); } /* - * Find any device that's done replacing, so we can detach it. + * Find any device that's done replacing, or a vdev marked 'unspare' that's + * currently spared, so we can detach it. */ static vdev_t * -spa_vdev_replace_done_hunt(vdev_t *vd) +spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; int c; for (c = 0; c < vd->vdev_children; c++) { - oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); + oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } + /* + * Check for a completed replacement. + */ if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { oldvd = vd->vdev_child[0]; newvd = vd->vdev_child[1]; @@ -2144,20 +3330,38 @@ spa_vdev_replace_done_hunt(vdev_t *vd) mutex_exit(&newvd->vdev_dtl_lock); } + /* + * Check for a completed resilver with the 'unspare' flag set.
+ */ + if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { + newvd = vd->vdev_child[0]; + oldvd = vd->vdev_child[1]; + + mutex_enter(&newvd->vdev_dtl_lock); + if (newvd->vdev_unspare && + newvd->vdev_dtl_map.sm_space == 0 && + newvd->vdev_dtl_scrub.sm_space == 0) { + newvd->vdev_unspare = 0; + mutex_exit(&newvd->vdev_dtl_lock); + return (oldvd); + } + mutex_exit(&newvd->vdev_dtl_lock); + } + return (NULL); } static void -spa_vdev_replace_done(spa_t *spa) +spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd; vdev_t *pvd; uint64_t guid; uint64_t pguid = 0; - spa_config_enter(spa, RW_READER, FTAG); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { + while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { guid = vd->vdev_guid; /* * If we have just finished replacing a hot spared device, then @@ -2171,15 +3375,15 @@ spa_vdev_replace_done(spa_t *spa) ASSERT(pvd->vdev_parent->vdev_children == 2); pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; } - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_CONFIG, FTAG); if (spa_vdev_detach(spa, guid, B_TRUE) != 0) return; if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) return; - spa_config_enter(spa, RW_READER, FTAG); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); } - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_CONFIG, FTAG); } /* @@ -2189,42 +3393,40 @@ spa_vdev_replace_done(spa_t *spa) int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) { - vdev_t *rvd, *vd; + vdev_t *vd; uint64_t txg; - rvd = spa->spa_root_vdev; - txg = spa_vdev_enter(spa); - if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) { /* - * Determine if this is a reference to a hot spare. In that - * case, update the path as stored in the spare list. + * Determine if this is a reference to a hot spare device. If + * it is, update the path manually as there is no associated + * vdev_t that can be synced to disk. 
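/*
 * Both "done" tests in spa_vdev_resilver_done_hunt() above come down
 * to the new child's dirty-time-log maps being empty, sampled under
 * vdev_dtl_lock.  Sketch (hypothetical helper):
 */
static boolean_t
resilver_complete(const space_map_t *dtl_map, const space_map_t *dtl_scrub)
{
	return (dtl_map->sm_space == 0 && dtl_scrub->sm_space == 0);
}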
*/ nvlist_t **spares; uint_t i, nspares; - if (spa->spa_sparelist != NULL) { - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + + if (spa->spa_spares.sav_config != NULL) { + VERIFY(nvlist_lookup_nvlist_array( + spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0); for (i = 0; i < nspares; i++) { uint64_t theguid; VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &theguid) == 0); - if (theguid == guid) - break; + if (theguid == guid) { + VERIFY(nvlist_add_string(spares[i], + ZPOOL_CONFIG_PATH, newpath) == 0); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + return (spa_vdev_exit(spa, NULL, txg, + 0)); + } } - - if (i == nspares) - return (spa_vdev_exit(spa, NULL, txg, ENOENT)); - - VERIFY(nvlist_add_string(spares[i], - ZPOOL_CONFIG_PATH, newpath) == 0); - spa_load_spares(spa); - spa->spa_sync_spares = B_TRUE; - return (spa_vdev_exit(spa, NULL, txg, 0)); - } else { - return (spa_vdev_exit(spa, NULL, txg, ENOENT)); } + + return (spa_vdev_exit(spa, NULL, txg, ENOENT)); } if (!vd->vdev_ops->vdev_op_leaf) @@ -2244,397 +3446,36 @@ spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) * ========================================================================== */ -static void -spa_scrub_io_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - - zio_data_buf_free(zio->io_data, zio->io_size); - - mutex_enter(&spa->spa_scrub_lock); - if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; - spa->spa_scrub_errors++; - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_errors++; - mutex_exit(&vd->vdev_stat_lock); - } - - if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) - cv_broadcast(&spa->spa_scrub_io_cv); - - ASSERT(spa->spa_scrub_inflight >= 0); - - mutex_exit(&spa->spa_scrub_lock); -} - -static void -spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, - zbookmark_t *zb) -{ - size_t size = BP_GET_LSIZE(bp); - void *data; - - mutex_enter(&spa->spa_scrub_lock); - /* - * Do not give too much work to vdev(s). - */ - while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - } - spa->spa_scrub_inflight++; - mutex_exit(&spa->spa_scrub_lock); - - data = zio_data_buf_alloc(size); - - if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) - flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ - - flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; - - zio_nowait(zio_read(NULL, spa, bp, data, size, - spa_scrub_io_done, NULL, priority, flags, zb)); -} - -/* ARGSUSED */ -static int -spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) -{ - blkptr_t *bp = &bc->bc_blkptr; - vdev_t *vd = spa->spa_root_vdev; - dva_t *dva = bp->blk_dva; - int needs_resilver = B_FALSE; - int d; - - if (bc->bc_errno) { - /* - * We can't scrub this block, but we can continue to scrub - * the rest of the pool. Note the error and move along. 
- */ - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_errors++; - mutex_exit(&spa->spa_scrub_lock); - - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_errors++; - mutex_exit(&vd->vdev_stat_lock); - - return (ERESTART); - } - - ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); - - for (d = 0; d < BP_GET_NDVAS(bp); d++) { - vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); - - ASSERT(vd != NULL); - - /* - * Keep track of how much data we've examined so that - * zpool(1M) status can make useful progress reports. - */ - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); - mutex_exit(&vd->vdev_stat_lock); - - if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { - if (DVA_GET_GANG(&dva[d])) { - /* - * Gang members may be spread across multiple - * vdevs, so the best we can do is look at the - * pool-wide DTL. - * XXX -- it would be better to change our - * allocation policy to ensure that this can't - * happen. - */ - vd = spa->spa_root_vdev; - } - if (vdev_dtl_contains(&vd->vdev_dtl_map, - bp->blk_birth, 1)) - needs_resilver = B_TRUE; - } - } - - if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) - spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, - ZIO_FLAG_SCRUB, &bc->bc_bookmark); - else if (needs_resilver) - spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, - ZIO_FLAG_RESILVER, &bc->bc_bookmark); - - return (0); -} - -static void -spa_scrub_thread(void *arg) -{ - spa_t *spa = arg; - callb_cpr_t cprinfo; - traverse_handle_t *th = spa->spa_scrub_th; - vdev_t *rvd = spa->spa_root_vdev; - pool_scrub_type_t scrub_type = spa->spa_scrub_type; - int error = 0; - boolean_t complete; - - CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); - - /* - * If we're restarting due to a snapshot create/delete, - * wait for that to complete. - */ - txg_wait_synced(spa_get_dsl(spa), 0); - - dprintf("start %s mintxg=%llu maxtxg=%llu\n", - scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", - spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); - - spa_config_enter(spa, RW_WRITER, FTAG); - vdev_reopen(rvd); /* purge all vdev caches */ - vdev_config_dirty(rvd); /* rewrite all disk labels */ - vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); - spa_config_exit(spa, FTAG); - - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_errors = 0; - spa->spa_scrub_active = 1; - ASSERT(spa->spa_scrub_inflight == 0); - - while (!spa->spa_scrub_stop) { - CALLB_CPR_SAFE_BEGIN(&cprinfo); - while (spa->spa_scrub_suspended) { - spa->spa_scrub_active = 0; - cv_broadcast(&spa->spa_scrub_cv); - cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); - spa->spa_scrub_active = 1; - } - CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); - - if (spa->spa_scrub_restart_txg != 0) - break; - - mutex_exit(&spa->spa_scrub_lock); - error = traverse_more(th); - mutex_enter(&spa->spa_scrub_lock); - if (error != EAGAIN) - break; - } - - while (spa->spa_scrub_inflight) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - - spa->spa_scrub_active = 0; - cv_broadcast(&spa->spa_scrub_cv); - - mutex_exit(&spa->spa_scrub_lock); - - spa_config_enter(spa, RW_WRITER, FTAG); - - mutex_enter(&spa->spa_scrub_lock); - - /* - * Note: we check spa_scrub_restart_txg under both spa_scrub_lock - * AND the spa config lock to synchronize with any config changes - * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 
- */ - if (spa->spa_scrub_restart_txg != 0) - error = ERESTART; - - if (spa->spa_scrub_stop) - error = EINTR; - - /* - * Even if there were uncorrectable errors, we consider the scrub - * completed. The downside is that if there is a transient error during - * a resilver, we won't resilver the data properly to the target. But - * if the damage is permanent (more likely) we will resilver forever, - * which isn't really acceptable. Since there is enough information for - * the user to know what has failed and why, this seems like a more - * tractable approach. - */ - complete = (error == 0); - - dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", - scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", - spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", - error, spa->spa_scrub_errors, spa->spa_scrub_stop); - - mutex_exit(&spa->spa_scrub_lock); - - /* - * If the scrub/resilver completed, update all DTLs to reflect this. - * Whether it succeeded or not, vacate all temporary scrub DTLs. - */ - vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, - complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); - vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); - spa_errlog_rotate(spa); - - spa_config_exit(spa, FTAG); - - mutex_enter(&spa->spa_scrub_lock); - - /* - * We may have finished replacing a device. - * Let the async thread assess this and handle the detach. - */ - spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); - - /* - * If we were told to restart, our final act is to start a new scrub. - */ - if (error == ERESTART) - spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? - SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); - - spa->spa_scrub_type = POOL_SCRUB_NONE; - spa->spa_scrub_active = 0; - spa->spa_scrub_thread = NULL; - cv_broadcast(&spa->spa_scrub_cv); - CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ - thread_exit(); -} - -void -spa_scrub_suspend(spa_t *spa) -{ - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_suspended++; - while (spa->spa_scrub_active) { - cv_broadcast(&spa->spa_scrub_cv); - cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); - } - while (spa->spa_scrub_inflight) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - mutex_exit(&spa->spa_scrub_lock); -} - -void -spa_scrub_resume(spa_t *spa) -{ - mutex_enter(&spa->spa_scrub_lock); - ASSERT(spa->spa_scrub_suspended != 0); - if (--spa->spa_scrub_suspended == 0) - cv_broadcast(&spa->spa_scrub_cv); - mutex_exit(&spa->spa_scrub_lock); -} - -void -spa_scrub_restart(spa_t *spa, uint64_t txg) -{ - /* - * Something happened (e.g. snapshot create/delete) that means - * we must restart any in-progress scrubs. The itinerary will - * fix this properly. - */ - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_restart_txg = txg; - mutex_exit(&spa->spa_scrub_lock); -} - int -spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) +spa_scrub(spa_t *spa, pool_scrub_type_t type) { - space_seg_t *ss; - uint64_t mintxg, maxtxg; - vdev_t *rvd = spa->spa_root_vdev; + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if ((uint_t)type >= POOL_SCRUB_TYPES) return (ENOTSUP); - mutex_enter(&spa->spa_scrub_lock); - /* - * If there's a scrub or resilver already in progress, stop it. + * If a resilver was requested, but there is no DTL on a + * writeable leaf device, we have nothing to do. */ - while (spa->spa_scrub_thread != NULL) { - /* - * Don't stop a resilver unless forced. 
- */ - if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { - mutex_exit(&spa->spa_scrub_lock); - return (EBUSY); - } - spa->spa_scrub_stop = 1; - cv_broadcast(&spa->spa_scrub_cv); - cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); - } - - /* - * Terminate the previous traverse. - */ - if (spa->spa_scrub_th != NULL) { - traverse_fini(spa->spa_scrub_th); - spa->spa_scrub_th = NULL; - } - - if (rvd == NULL) { - ASSERT(spa->spa_scrub_stop == 0); - ASSERT(spa->spa_scrub_type == type); - ASSERT(spa->spa_scrub_restart_txg == 0); - mutex_exit(&spa->spa_scrub_lock); + if (type == POOL_SCRUB_RESILVER && + !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } - mintxg = TXG_INITIAL - 1; - maxtxg = spa_last_synced_txg(spa) + 1; - - mutex_enter(&rvd->vdev_dtl_lock); + if (type == POOL_SCRUB_EVERYTHING && + spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && + spa->spa_dsl_pool->dp_scrub_isresilver) + return (EBUSY); - if (rvd->vdev_dtl_map.sm_space == 0) { - /* - * The pool-wide DTL is empty. - * If this is a resilver, there's nothing to do except - * check whether any in-progress replacements have completed. - */ - if (type == POOL_SCRUB_RESILVER) { - type = POOL_SCRUB_NONE; - spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); - } + if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { + return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); + } else if (type == POOL_SCRUB_NONE) { + return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); } else { - /* - * The pool-wide DTL is non-empty. - * If this is a normal scrub, upgrade to a resilver instead. - */ - if (type == POOL_SCRUB_EVERYTHING) - type = POOL_SCRUB_RESILVER; - } - - if (type == POOL_SCRUB_RESILVER) { - /* - * Determine the resilvering boundaries. - * - * Note: (mintxg, maxtxg) is an open interval, - * i.e. mintxg and maxtxg themselves are not included. - * - * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 - * so we don't claim to resilver a txg that's still changing. 
- */ - ss = avl_first(&rvd->vdev_dtl_map.sm_root); - mintxg = ss->ss_start - 1; - ss = avl_last(&rvd->vdev_dtl_map.sm_root); - maxtxg = MIN(ss->ss_end, maxtxg); - } - - mutex_exit(&rvd->vdev_dtl_lock); - - spa->spa_scrub_stop = 0; - spa->spa_scrub_type = type; - spa->spa_scrub_restart_txg = 0; - - if (type != POOL_SCRUB_NONE) { - spa->spa_scrub_mintxg = mintxg; - spa->spa_scrub_maxtxg = maxtxg; - spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, - ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, - ZIO_FLAG_CANFAIL); - traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); - spa->spa_scrub_thread = thread_create(NULL, 0, - spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); + return (EINVAL); } - - mutex_exit(&spa->spa_scrub_lock); - - return (0); } /* @@ -2644,23 +3485,29 @@ spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) */ static void -spa_async_reopen(spa_t *spa) +spa_async_remove(spa_t *spa, vdev_t *vd) { - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *tvd; - int c; + if (vd->vdev_remove_wanted) { + vd->vdev_remove_wanted = 0; + vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); + vdev_clear(spa, vd); + vdev_state_dirty(vd->vdev_top); + } - spa_config_enter(spa, RW_WRITER, FTAG); + for (int c = 0; c < vd->vdev_children; c++) + spa_async_remove(spa, vd->vdev_child[c]); +} - for (c = 0; c < rvd->vdev_children; c++) { - tvd = rvd->vdev_child[c]; - if (tvd->vdev_reopen_wanted) { - tvd->vdev_reopen_wanted = 0; - vdev_reopen(tvd); - } +static void +spa_async_probe(spa_t *spa, vdev_t *vd) +{ + if (vd->vdev_probe_wanted) { + vd->vdev_probe_wanted = 0; + vdev_reopen(vd); /* vdev_open() does the actual probe */ } - spa_config_exit(spa, FTAG); + for (int c = 0; c < vd->vdev_children; c++) + spa_async_probe(spa, vd->vdev_child[c]); } static void @@ -2686,28 +3533,38 @@ spa_async_thread(void *arg) } /* - * See if any devices need to be reopened. + * See if any devices need to be marked REMOVED. */ - if (tasks & SPA_ASYNC_REOPEN) - spa_async_reopen(spa); + if (tasks & SPA_ASYNC_REMOVE) { + spa_vdev_state_enter(spa); + spa_async_remove(spa, spa->spa_root_vdev); + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) + spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); + for (int i = 0; i < spa->spa_spares.sav_count; i++) + spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); + (void) spa_vdev_state_exit(spa, NULL, 0); + } /* - * If any devices are done replacing, detach them. + * See if any devices need to be probed. */ - if (tasks & SPA_ASYNC_REPLACE_DONE) - spa_vdev_replace_done(spa); + if (tasks & SPA_ASYNC_PROBE) { + spa_vdev_state_enter(spa); + spa_async_probe(spa, spa->spa_root_vdev); + (void) spa_vdev_state_exit(spa, NULL, 0); + } /* - * Kick off a scrub. + * If any devices are done replacing, detach them. */ - if (tasks & SPA_ASYNC_SCRUB) - VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); + if (tasks & SPA_ASYNC_RESILVER_DONE) + spa_vdev_resilver_done(spa); /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); /* * Let the world know that we're done. 
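Aside: the hunks above replace the old reopen-based recovery with a single task bitmask consumed by spa_async_thread(): producers set a bit via spa_async_request() and the thread services whatever bits it finds set (REMOVE, PROBE, RESILVER_DONE, RESILVER). A minimal sketch of the producer side — illustrative only, not part of the patch, and assuming the spa_async_tasks field and spa_async_lock used elsewhere in this file:

	/*
	 * Sketch: request deferred work by setting a task bit under
	 * spa_async_lock. Duplicate requests coalesce into one bit, which
	 * is why the consumers above re-walk the whole vdev tree per task
	 * instead of recording which vdev triggered the request.
	 */
	void
	example_async_request(spa_t *spa, int task)
	{
		mutex_enter(&spa->spa_async_lock);
		spa->spa_async_tasks |= task;
		mutex_exit(&spa->spa_async_lock);
	}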
@@ -2775,10 +3632,13 @@ spa_sync_deferred_frees(spa_t *spa, uint64_t txg) int error; uint8_t c = 1; - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - while (bplist_iterate(bpl, &itor, &blk) == 0) - zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); + while (bplist_iterate(bpl, &itor, &blk) == 0) { + ASSERT(blk.blk_birth < txg); + zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED)); + } error = zio_wait(zio); ASSERT3U(error, ==, 0); @@ -2798,19 +3658,27 @@ static void spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) { char *packed = NULL; + size_t bufsize; size_t nvsize = 0; dmu_buf_t *db; VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); - packed = kmem_alloc(nvsize, KM_SLEEP); + /* + * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration + * information. This avoids the dbuf_will_dirty() path and + * saves us a pre-read to get data we don't actually care about. + */ + bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); + packed = kmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); + bzero(packed + nvsize, bufsize - nvsize); - dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); + dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); - kmem_free(packed, nvsize); + kmem_free(packed, bufsize); VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); @@ -2819,50 +3687,49 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) } static void -spa_sync_spares(spa_t *spa, dmu_tx_t *tx) +spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, + const char *config, const char *entry) { nvlist_t *nvroot; - nvlist_t **spares; + nvlist_t **list; int i; - if (!spa->spa_sync_spares) + if (!sav->sav_sync) return; /* - * Update the MOS nvlist describing the list of available spares. - * spa_validate_spares() will have already made sure this nvlist is - * valid and the vdevs are labelled appropriately. + * Update the MOS nvlist describing the list of available devices. + * spa_validate_aux() will have already made sure this nvlist is + * valid and the vdevs are labeled appropriately. 
*/ - if (spa->spa_spares_object == 0) { - spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_PACKED_NVLIST, 1 << 14, - DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); + if (sav->sav_object == 0) { + sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, + sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, - sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); + DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, + &sav->sav_object, tx) == 0); } VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (spa->spa_nspares == 0) { - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - NULL, 0) == 0); + if (sav->sav_count == 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); } else { - spares = kmem_alloc(spa->spa_nspares * sizeof (void *), - KM_SLEEP); - for (i = 0; i < spa->spa_nspares; i++) - spares[i] = vdev_config_generate(spa, - spa->spa_spares[i], B_FALSE, B_TRUE); - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - spares, spa->spa_nspares) == 0); - for (i = 0; i < spa->spa_nspares; i++) - nvlist_free(spares[i]); - kmem_free(spares, spa->spa_nspares * sizeof (void *)); + list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); + for (i = 0; i < sav->sav_count; i++) + list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], + B_FALSE, B_FALSE, B_TRUE); + VERIFY(nvlist_add_nvlist_array(nvroot, config, list, + sav->sav_count) == 0); + for (i = 0; i < sav->sav_count; i++) + nvlist_free(list[i]); + kmem_free(list, sav->sav_count * sizeof (void *)); } - spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); + spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); - spa->spa_sync_spares = B_FALSE; + sav->sav_sync = B_FALSE; } static void @@ -2870,10 +3737,15 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; - if (list_is_empty(&spa->spa_dirty_list)) + if (list_is_empty(&spa->spa_config_dirty_list)) return; - config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + config = spa_config_generate(spa, spa->spa_root_vdev, + dmu_tx_get_txg(tx), B_FALSE); + + spa_config_exit(spa, SCL_STATE, FTAG); if (spa->spa_config_syncing) nvlist_free(spa->spa_config_syncing); @@ -2882,41 +3754,140 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } +/* + * Set zpool properties. + */ static void -spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) +spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { spa_t *spa = arg1; - nvlist_t *nvp = arg2; - nvpair_t *nvpair; objset_t *mos = spa->spa_meta_objset; - uint64_t zapobj; + nvlist_t *nvp = arg2; + nvpair_t *elem; + uint64_t intval; + char *strval; + zpool_prop_t prop; + const char *propname; + zprop_type_t proptype; + spa_config_dirent_t *dp; mutex_enter(&spa->spa_props_lock); - if (spa->spa_pool_props_object == 0) { - zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); - VERIFY(zapobj > 0); - spa->spa_pool_props_object = zapobj; + elem = NULL; + while ((elem = nvlist_next_nvpair(nvp, elem))) { + switch (prop = zpool_name_to_prop(nvpair_name(elem))) { + case ZPOOL_PROP_VERSION: + /* + * Only set version for non-zpool-creation cases + * (set/import). spa_create() needs special care + * for version setting. 
+ */ + if (tx->tx_txg != TXG_INITIAL) { + VERIFY(nvpair_value_uint64(elem, + &intval) == 0); + ASSERT(intval <= SPA_VERSION); + ASSERT(intval >= spa_version(spa)); + spa->spa_uberblock.ub_version = intval; + vdev_config_dirty(spa->spa_root_vdev); + } + break; - case ZPOOL_PROP_ALTROOT: + /* + * 'altroot' is a non-persistent property. It should + * have been set temporarily at creation or import time. + */ + ASSERT(spa->spa_root != NULL); + break; - nvpair = NULL; - while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { - switch (zpool_name_to_prop(nvpair_name(nvpair))) { - case ZFS_PROP_BOOTFS: - VERIFY(nvlist_lookup_uint64(nvp, - nvpair_name(nvpair), &spa->spa_bootfs) == 0); - VERIFY(zap_update(mos, - spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, - &spa->spa_bootfs, tx) == 0); + case ZPOOL_PROP_CACHEFILE: + /* + * 'cachefile' is a non-persistent property, but note + * an async request that the config cache needs to be + * updated. + */ + VERIFY(nvpair_value_string(elem, &strval) == 0); + + dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); + + if (strval[0] == '\0') + dp->scd_path = spa_strdup(spa_config_path); + else if (strcmp(strval, "none") == 0) + dp->scd_path = NULL; + else + dp->scd_path = spa_strdup(strval); + + list_insert_head(&spa->spa_config_list, dp); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); break; + default: + /* + * Set pool property values in the poolprops mos object. + */ + if (spa->spa_pool_props_object == 0) { + objset_t *mos = spa->spa_meta_objset; + + VERIFY((spa->spa_pool_props_object = + zap_create(mos, DMU_OT_POOL_PROPS, + DMU_OT_NONE, 0, tx)) > 0); + + VERIFY(zap_update(mos, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, + 8, 1, &spa->spa_pool_props_object, tx) + == 0); + } + + /* normalize the property name */ + propname = zpool_prop_to_name(prop); + proptype = zpool_prop_get_type(prop); + + if (nvpair_type(elem) == DATA_TYPE_STRING) { + ASSERT(proptype == PROP_TYPE_STRING); + VERIFY(nvpair_value_string(elem, &strval) == 0); + VERIFY(zap_update(mos, + spa->spa_pool_props_object, propname, + 1, strlen(strval) + 1, strval, tx) == 0); + + } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { + VERIFY(nvpair_value_uint64(elem, &intval) == 0); + + if (proptype == PROP_TYPE_INDEX) { + const char *unused; + VERIFY(zpool_prop_index_to_string( + prop, intval, &unused) == 0); + } + VERIFY(zap_update(mos, + spa->spa_pool_props_object, propname, + 8, 1, &intval, tx) == 0); + } else { + ASSERT(0); /* not allowed */ + } + + switch (prop) { + case ZPOOL_PROP_DELEGATION: + spa->spa_delegation = intval; + break; + case ZPOOL_PROP_BOOTFS: + spa->spa_bootfs = intval; + break; + case ZPOOL_PROP_FAILUREMODE: + spa->spa_failmode = intval; + break; + default: + break; + } + } + + /* log internal history if this is not a zpool create */ + if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && + tx->tx_txg != TXG_INITIAL) { + spa_history_internal_log(LOG_POOL_PROPSET, + spa, tx, cr, "%s %lld %s", + nvpair_name(elem), intval, spa_name(spa)); } } + + mutex_exit(&spa->spa_props_lock); } /* @@ -2933,25 +3904,37 @@ spa_sync(spa_t *spa, uint64_t txg) vdev_t *vd; dmu_tx_t *tx; int dirty_vdevs; + int error; /* * Lock out configuration changes. 
*/ - spa_config_enter(spa, RW_READER, FTAG); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; + /* + * If there are any pending vdev state changes, convert them + * into config changes that go out with this transaction group. + */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { + vdev_state_clean(vd); + vdev_config_dirty(vd); + } + spa_config_exit(spa, SCL_STATE, FTAG); + VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); tx = dmu_tx_create_assigned(dp, txg); /* - * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, + * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. */ - if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && - spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { + if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && + spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { int i; for (i = 0; i < rvd->vdev_children; i++) { @@ -2967,6 +3950,19 @@ spa_sync(spa_t *spa, uint64_t txg) } } + if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && + spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { + dsl_pool_create_origin(dp, tx); + + /* Keeping the origin open increases spa_minref */ + spa->spa_minref += 3; + } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && + spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { + dsl_pool_upgrade_clones(dp, tx); + } + /* * If anything has changed in this txg, push the deferred frees * from the previous txg. If not, leave them alone so that we @@ -2984,7 +3980,10 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_sync_pass++; spa_sync_config_object(spa, tx); - spa_sync_spares(spa, tx); + spa_sync_aux_dev(spa, &spa->spa_spares, tx, + ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); + spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, + ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); @@ -3005,35 +4004,52 @@ spa_sync(spa_t *spa, uint64_t txg) * Rewrite the vdev configuration (which includes the uberblock) * to commit the transaction group. * - * If there are any dirty vdevs, sync the uberblock to all vdevs. - * Otherwise, pick a random top-level vdev that's known to be - * visible in the config cache (see spa_vdev_add() for details). - * If the write fails, try the next vdev until we're tried them all. + * If there are no dirty vdevs, we sync the uberblock to a few + * random top-level vdevs that are known to be visible in the + * config cache (see spa_vdev_add() for a complete description). + * If there *are* dirty vdevs, sync the uberblock to all vdevs. */ - if (!list_is_empty(&spa->spa_dirty_list)) { - VERIFY(vdev_config_sync(rvd, txg) == 0); - } else { - int children = rvd->vdev_children; - int c0 = spa_get_random(children); - int c; - - for (c = 0; c < children; c++) { - vd = rvd->vdev_child[(c0 + c) % children]; - if (vd->vdev_ms_array == 0) - continue; - if (vdev_config_sync(vd, txg) == 0) - break; + for (;;) { + /* + * We hold SCL_STATE to prevent vdev open/close/etc. + * while we're attempting to write the vdev labels. 
+ */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + if (list_is_empty(&spa->spa_config_dirty_list)) { + vdev_t *svd[SPA_DVAS_PER_BP]; + int svdcount = 0; + int children = rvd->vdev_children; + int c0 = spa_get_random(children); + int c; + + for (c = 0; c < children; c++) { + vd = rvd->vdev_child[(c0 + c) % children]; + if (vd->vdev_ms_array == 0 || vd->vdev_islog) + continue; + svd[svdcount++] = vd; + if (svdcount == SPA_DVAS_PER_BP) + break; + } + error = vdev_config_sync(svd, svdcount, txg); + } else { + error = vdev_config_sync(rvd->vdev_child, + rvd->vdev_children, txg); } - if (c == children) - VERIFY(vdev_config_sync(rvd, txg) == 0); - } + spa_config_exit(spa, SCL_STATE, FTAG); + + if (error == 0) + break; + zio_suspend(spa, NULL); + zio_resume_wait(spa); + } dmu_tx_commit(tx); /* * Clear the dirty config list. */ - while ((vd = list_head(&spa->spa_dirty_list)) != NULL) + while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) vdev_config_clean(vd); /* @@ -3046,21 +4062,12 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_config_syncing = NULL; } - /* - * Make a stable copy of the fully synced uberblock. - * We use this as the root for pool traversals. - */ - spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ - - spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ - + spa->spa_traverse_wanted = B_TRUE; rw_enter(&spa->spa_traverse_lock, RW_WRITER); - spa->spa_traverse_wanted = 0; + spa->spa_traverse_wanted = B_FALSE; spa->spa_ubsync = spa->spa_uberblock; rw_exit(&spa->spa_traverse_lock); - spa_scrub_resume(spa); /* resume scrub with new ubsync */ - /* * Clean up the ZIL records for the synced txg. */ @@ -3081,7 +4088,7 @@ spa_sync(spa_t *spa, uint64_t txg) ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); ASSERT(bpl->bpl_queue == NULL); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_CONFIG, FTAG); /* * If any async tasks have been requested, kick them off. @@ -3100,7 +4107,7 @@ spa_sync_allpools(void) spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { - if (spa_state(spa) != POOL_STATE_ACTIVE) + if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); @@ -3139,7 +4146,6 @@ spa_evict_all(void) spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); - VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); @@ -3153,27 +4159,42 @@ spa_evict_all(void) } vdev_t * -spa_lookup_by_guid(spa_t *spa, uint64_t guid) +spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache) { - return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); + vdev_t *vd; + int i; + + if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) + return (vd); + + if (l2cache) { + for (i = 0; i < spa->spa_l2cache.sav_count; i++) { + vd = spa->spa_l2cache.sav_vdevs[i]; + if (vd->vdev_guid == guid) + return (vd); + } + } + + return (NULL); } void -spa_upgrade(spa_t *spa) +spa_upgrade(spa_t *spa, uint64_t version) { - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * This should only be called for a non-faulted pool, and since a * future version would result in an unopenable pool, this shouldn't be * possible. 
*/ - ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); + ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); + ASSERT(version >= spa->spa_uberblock.ub_version); - spa->spa_uberblock.ub_version = ZFS_VERSION; + spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); } @@ -3183,119 +4204,98 @@ spa_has_spare(spa_t *spa, uint64_t guid) { int i; uint64_t spareguid; + spa_aux_vdev_t *sav = &spa->spa_spares; - for (i = 0; i < spa->spa_nspares; i++) - if (spa->spa_spares[i]->vdev_guid == guid) + for (i = 0; i < sav->sav_count; i++) + if (sav->sav_vdevs[i]->vdev_guid == guid) return (B_TRUE); - for (i = 0; i < spa->spa_pending_nspares; i++) { - if (nvlist_lookup_uint64(spa->spa_pending_spares[i], - ZPOOL_CONFIG_GUID, &spareguid) == 0 && - spareguid == guid) + for (i = 0; i < sav->sav_npending; i++) { + if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, + &spareguid) == 0 && spareguid == guid) return (B_TRUE); } return (B_FALSE); } -int -spa_set_props(spa_t *spa, nvlist_t *nvp) -{ - return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, - spa, nvp, 3)); -} - -int -spa_get_props(spa_t *spa, nvlist_t **nvp) +/* + * Check if a pool has an active shared spare device. + * Note: the reference count of an active spare is 2: once as a spare and + * once as a replacement. + */ +static boolean_t +spa_has_active_shared_spare(spa_t *spa) { - zap_cursor_t zc; - zap_attribute_t za; - objset_t *mos = spa->spa_meta_objset; - zfs_source_t src; - zfs_prop_t prop; - nvlist_t *propval; - uint64_t value; - int err; - - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - mutex_enter(&spa->spa_props_lock); - /* If no props object, then just return empty nvlist */ - if (spa->spa_pool_props_object == 0) { - mutex_exit(&spa->spa_props_lock); - return (0); - } - - for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); - (err = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - - if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) - continue; - - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - switch (za.za_integer_length) { - case 8: - if (zfs_prop_default_numeric(prop) == - za.za_first_integer) - src = ZFS_SRC_DEFAULT; - else - src = ZFS_SRC_LOCAL; - value = za.za_first_integer; - - if (prop == ZFS_PROP_BOOTFS) { - dsl_pool_t *dp; - dsl_dataset_t *ds = NULL; - char strval[MAXPATHLEN]; - - dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); - if ((err = dsl_dataset_open_obj(dp, - za.za_first_integer, NULL, DS_MODE_NONE, - FTAG, &ds)) != 0) { - rw_exit(&dp->dp_config_rwlock); - break; - } - dsl_dataset_name(ds, strval); - dsl_dataset_close(ds, DS_MODE_NONE, FTAG); - rw_exit(&dp->dp_config_rwlock); + int i, refcnt; + uint64_t pool; + spa_aux_vdev_t *sav = &spa->spa_spares; - VERIFY(nvlist_add_uint64(propval, - ZFS_PROP_SOURCE, src) == 0); - VERIFY(nvlist_add_string(propval, - ZFS_PROP_VALUE, strval) == 0); - } else { - VERIFY(nvlist_add_uint64(propval, - ZFS_PROP_SOURCE, src) == 0); - VERIFY(nvlist_add_uint64(propval, - ZFS_PROP_VALUE, value) == 0); - } - VERIFY(nvlist_add_nvlist(*nvp, za.za_name, - propval) == 0); - break; - } - nvlist_free(propval); - } - zap_cursor_fini(&zc); - mutex_exit(&spa->spa_props_lock); - if (err && err != ENOENT) { - nvlist_free(*nvp); - return (err); + for (i = 0; i < sav->sav_count; i++) { + if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, + &refcnt) && pool != 0ULL && pool == 
spa_guid(spa) && + refcnt > 2) + return (B_TRUE); } - return (0); + return (B_FALSE); } /* - * If the bootfs property value is dsobj, clear it. + * Post a sysevent corresponding to the given event. The 'name' must be one of + * the event definitions in sys/sysevent/eventdefs.h. The payload will be + * filled in from the spa and (optionally) the vdev. This doesn't do anything + * in the userland libzpool, as we don't want consumers to misinterpret ztest + * or zdb as real changes. */ void -spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) +spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) { - if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { - VERIFY(zap_remove(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); - spa->spa_bootfs = 0; +#if 0 +#ifdef _KERNEL + sysevent_t *ev; + sysevent_attr_list_t *attr = NULL; + sysevent_value_t value; + sysevent_id_t eid; + + ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", + SE_SLEEP); + + value.value_type = SE_DATA_TYPE_STRING; + value.value.sv_string = spa_name(spa); + if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) + goto done; + + value.value_type = SE_DATA_TYPE_UINT64; + value.value.sv_uint64 = spa_guid(spa); + if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) + goto done; + + if (vd) { + value.value_type = SE_DATA_TYPE_UINT64; + value.value.sv_uint64 = vd->vdev_guid; + if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, + SE_SLEEP) != 0) + goto done; + + if (vd->vdev_path) { + value.value_type = SE_DATA_TYPE_STRING; + value.value.sv_string = vd->vdev_path; + if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, + &value, SE_SLEEP) != 0) + goto done; + } } + + if (sysevent_attach_attributes(ev, attr) != 0) + goto done; + attr = NULL; + + (void) log_sysevent(ev, SE_SLEEP, &eid); + +done: + if (attr) + sysevent_free_attr(attr); + sysevent_free(ev); +#endif +#endif } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c index 9e8bcf391158..1ffdb10dbfa5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/spa_impl.h> @@ -43,16 +41,18 @@ /* * Pool configuration repository. * - * The configuration for all pools, in addition to being stored on disk, is - * stored in /etc/zfs/zpool.cache as a packed nvlist. The kernel maintains - * this list as pools are created, destroyed, or modified. + * Pool configuration is stored as a packed nvlist on the filesystem. By + * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot + * (when the ZFS module is loaded). Pools can also have the 'cachefile' + * property set that allows them to be stored in an alternate location under + * the control of external software. * - * We have a single nvlist which holds all the configuration information. When - * the module loads, we read this information from the cache and populate the - * SPA namespace. This namespace is maintained independently in spa.c. 
- * Whenever the namespace is modified, or the configuration of a pool is - * changed, we call spa_config_sync(), which walks through all the active pools - * and writes the configuration to disk. + * For each cache file, we have a single nvlist which holds all the + * configuration information. When the module loads, we read this information + * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is + * maintained independently in spa.c. Whenever the namespace is modified, or + * the configuration of a pool is changed, we call spa_config_sync(), which + * walks through all the active pools and writes the configuration to disk. */ static uint64_t spa_config_generation = 1; @@ -61,7 +61,7 @@ static uint64_t spa_config_generation = 1; * This can be overridden in userland to preserve an alternate namespace for * userland pools when doing testing. */ -const char *spa_config_dir = ZPOOL_CACHE_DIR; +const char *spa_config_path = ZPOOL_CACHE; /* * Called when the module is first loaded, this routine loads the configuration @@ -75,17 +75,21 @@ spa_config_load(void) nvlist_t *nvlist, *child; nvpair_t *nvpair; spa_t *spa; - char pathname[128]; + char *pathname; struct _buf *file; uint64_t fsize; /* * Open the configuration file. */ - (void) snprintf(pathname, sizeof (pathname), "%s/%s", - spa_config_dir, ZPOOL_CACHE_FILE); + pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); file = kobj_open_file(pathname); + if (file == (struct _buf *)-1) { ZFS_LOG(1, "Cannot open %s.", pathname); + kmem_free(pathname, MAXPATHLEN); return; } + + kmem_free(pathname, MAXPATHLEN); @@ -148,47 +152,32 @@ out: kobj_close_file(file); } -/* - * Synchronize all pools to disk. This must be called with the namespace lock - * held. - */ -void -spa_config_sync(void) +static void +spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) { - spa_t *spa = NULL; - nvlist_t *config; size_t buflen; char *buf; vnode_t *vp; int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; - char pathname[128]; - char pathname2[128]; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0); + char *temp; /* - * Add all known pools to the configuration list, ignoring those with - * alternate root paths. + * If the nvlist is empty (NULL), then remove the old cachefile. */ - spa = NULL; - while ((spa = spa_next(spa)) != NULL) { - mutex_enter(&spa->spa_config_cache_lock); - if (spa->spa_config && spa->spa_name && spa->spa_root == NULL) - VERIFY(nvlist_add_nvlist(config, spa->spa_name, - spa->spa_config) == 0); - mutex_exit(&spa->spa_config_cache_lock); + if (nvl == NULL) { + (void) vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE); + return; } /* * Pack the configuration into a buffer. */ - VERIFY(nvlist_size(config, &buflen, NV_ENCODE_XDR) == 0); + VERIFY(nvlist_size(nvl, &buflen, NV_ENCODE_XDR) == 0); buf = kmem_alloc(buflen, KM_SLEEP); + temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR, + VERIFY(nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0); /* @@ -196,29 +185,92 @@ spa_config_sync(void) * 'write to temporary file, sync, move over original' to make sure we * always have a consistent view of the data. 
*/ - (void) snprintf(pathname, sizeof (pathname), "%s/%s", spa_config_dir, - ZPOOL_CACHE_TMP); + (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path); - if (vn_open(pathname, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) != 0) - goto out; + if (vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) == 0) { + if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, + 0, RLIM64_INFINITY, kcred, NULL) == 0 && + VOP_FSYNC(vp, FSYNC, kcred, NULL) == 0) { + (void) vn_rename(temp, dp->scd_path, UIO_SYSSPACE); + } + (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL); + VN_RELE(vp); + } + + (void) vn_remove(temp, UIO_SYSSPACE, RMFILE); + + kmem_free(buf, buflen); + kmem_free(temp, MAXPATHLEN); +} + +/* + * Synchronize pool configuration to disk. This must be called with the + * namespace lock held. + */ +void +spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) +{ + spa_config_dirent_t *dp, *tdp; + nvlist_t *nvl; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Iterate over all cachefiles for the pool, past or present. When the + * cachefile is changed, the new one is pushed onto this list, allowing + * us to update previous cachefiles that no longer contain this pool. + */ + for (dp = list_head(&target->spa_config_list); dp != NULL; + dp = list_next(&target->spa_config_list, dp)) { + spa_t *spa = NULL; + if (dp->scd_path == NULL) + continue; - if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, NULL) == 0 && - VOP_FSYNC(vp, FSYNC, kcred) == 0) { - (void) snprintf(pathname2, sizeof (pathname2), "%s/%s", - spa_config_dir, ZPOOL_CACHE_FILE); - (void) vn_rename(pathname, pathname2, UIO_SYSSPACE); + /* + * Iterate over all pools, adding any matching pools to 'nvl'. + */ + nvl = NULL; + while ((spa = spa_next(spa)) != NULL) { + if (spa == target && removing) + continue; + + mutex_enter(&spa->spa_props_lock); + tdp = list_head(&spa->spa_config_list); + if (spa->spa_config == NULL || + tdp->scd_path == NULL || + strcmp(tdp->scd_path, dp->scd_path) != 0) { + mutex_exit(&spa->spa_props_lock); + continue; + } + + if (nvl == NULL) + VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + + VERIFY(nvlist_add_nvlist(nvl, spa->spa_name, + spa->spa_config) == 0); + mutex_exit(&spa->spa_props_lock); + } + + spa_config_write(dp, nvl); + nvlist_free(nvl); } - (void) VOP_CLOSE(vp, oflags, 1, 0, kcred); - VN_RELE(vp); + /* + * Remove any config entries older than the current one. 
+ */ + dp = list_head(&target->spa_config_list); + while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { + list_remove(&target->spa_config_list, tdp); + if (tdp->scd_path != NULL) + spa_strfree(tdp->scd_path); + kmem_free(tdp, sizeof (spa_config_dirent_t)); + } -out: - (void) vn_remove(pathname, UIO_SYSSPACE, RMFILE); spa_config_generation++; - kmem_free(buf, buflen); - nvlist_free(config); + if (postsysevent) + spa_event_notify(target, NULL, ESC_ZFS_CONFIG_SYNC); } /* @@ -231,27 +283,25 @@ nvlist_t * spa_all_configs(uint64_t *generation) { nvlist_t *pools; - spa_t *spa; + spa_t *spa = NULL; if (*generation == spa_config_generation) return (NULL); VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0); - spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { - if (INGLOBALZONE(curproc) || + if (INGLOBALZONE(curthread) || zone_dataset_visible(spa_name(spa), NULL)) { - mutex_enter(&spa->spa_config_cache_lock); + mutex_enter(&spa->spa_props_lock); VERIFY(nvlist_add_nvlist(pools, spa_name(spa), spa->spa_config) == 0); - mutex_exit(&spa->spa_config_cache_lock); + mutex_exit(&spa->spa_props_lock); } } - mutex_exit(&spa_namespace_lock); - *generation = spa_config_generation; + mutex_exit(&spa_namespace_lock); return (pools); } @@ -259,11 +309,11 @@ spa_all_configs(uint64_t *generation) void spa_config_set(spa_t *spa, nvlist_t *config) { - mutex_enter(&spa->spa_config_cache_lock); + mutex_enter(&spa->spa_props_lock); if (spa->spa_config != NULL) nvlist_free(spa->spa_config); spa->spa_config = config; - mutex_exit(&spa->spa_config_cache_lock); + mutex_exit(&spa->spa_props_lock); } /* @@ -277,11 +327,16 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) nvlist_t *config, *nvroot; vdev_t *rvd = spa->spa_root_vdev; unsigned long hostid = 0; + boolean_t locked = B_FALSE; - ASSERT(spa_config_held(spa, RW_READER)); - - if (vd == NULL) + if (vd == NULL) { vd = rvd; + locked = B_TRUE; + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + } + + ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == + (SCL_CONFIG | SCL_STATE)); /* * If txg is -1, report the current value of spa->spa_config_txg. @@ -302,8 +357,10 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)) == 0); (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, - hostid) == 0); + if (hostid != 0) { + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, + hostid) == 0); + } VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname.nodename) == 0); @@ -315,30 +372,48 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) if (vd->vdev_isspare) VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, 1ULL) == 0); + if (vd->vdev_islog) + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, + 1ULL) == 0); vd = vd->vdev_top; /* label contains top config */ } - nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE); + nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); + if (locked) + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + return (config); } /* - * Update all disk labels, generate a fresh config based on the current - * in-core state, and sync the global config cache. 
+ * For a pool that's not currently a booting rootpool, update all disk labels, + * generate a fresh config based on the current in-core state, and sync the + * global config cache. */ void spa_config_update(spa_t *spa, int what) { + spa_config_update_common(spa, what, FALSE); +} + +/* + * Update all disk labels, generate a fresh config based on the current + * in-core state, and sync the global config cache (do not sync the config + * cache if this is a booting rootpool). + */ +void +spa_config_update_common(spa_t *spa, int what, boolean_t isroot) +{ vdev_t *rvd = spa->spa_root_vdev; uint64_t txg; int c; ASSERT(MUTEX_HELD(&spa_namespace_lock)); - spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); txg = spa_last_synced_txg(spa) + 1; if (what == SPA_CONFIG_UPDATE_POOL) { vdev_config_dirty(rvd); @@ -358,7 +433,7 @@ spa_config_update(spa_t *spa, int what) } } } - spa_config_exit(spa, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); /* * Wait for the mosconfig to be regenerated and synced. @@ -368,8 +443,9 @@ spa_config_update(spa_t *spa, int what) /* * Update the global config cache to reflect the new mosconfig. */ - spa_config_sync(); + if (!isroot) + spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); if (what == SPA_CONFIG_UPDATE_POOL) - spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); + spa_config_update_common(spa, SPA_CONFIG_UPDATE_VDEVS, isroot); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c index c52acaf30801..e5c395f63d2b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -298,10 +298,7 @@ void spa_errlog_rotate(spa_t *spa) { mutex_enter(&spa->spa_errlist_lock); - - ASSERT(!spa->spa_scrub_finished); spa->spa_scrub_finished = B_TRUE; - mutex_exit(&spa->spa_errlist_lock); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c index 66428013a784..8e20c4d32cd7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c @@ -20,15 +20,24 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" +#include <sys/spa.h> #include <sys/spa_impl.h> #include <sys/zap.h> #include <sys/dsl_synctask.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/utsname.h> +#include <sys/sunddi.h> +#ifdef _KERNEL +#include <sys/cmn_err.h> +#include <sys/zone.h> +#endif /* * Routines to manage the on-disk history log. @@ -59,16 +68,6 @@ * and permanently lost. 
*/ -typedef enum history_log_type { - LOG_CMD_CREATE, - LOG_CMD_NO_CREATE -} history_log_type_t; - -typedef struct history_arg { - const char *ha_history_str; - history_log_type_t ha_log_type; -} history_arg_t; - /* convert a logical offset to physical */ static uint64_t spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp) @@ -156,8 +155,9 @@ spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, /* see if we need to reset logical BOF */ while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - (shpp->sh_eof - shpp->sh_bof) <= len) { - if ((err = spa_history_advance_bof(spa, shpp)) != 0) + if ((err = spa_history_advance_bof(spa, shpp)) != 0) { return (err); + } } phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); @@ -175,11 +175,22 @@ spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, return (0); } +static char * +spa_history_zone() +{ +#ifdef _KERNEL + /* XXX: pr_host can be changed by default from within a jail! */ + if (jailed(curthread->td_ucred)) + return (curthread->td_ucred->cr_prison->pr_host); +#endif + return ("global"); +} + /* * Write out a history event. */ -void -spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) +static void +spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { spa_t *spa = arg1; history_arg_t *hap = arg2; @@ -193,9 +204,6 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) char *record_packed = NULL; int ret; - if (history_str == NULL) - return; - /* * If we have an older pool that doesn't have a command * history object, create it now. @@ -222,16 +230,39 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) } #endif - /* construct a nvlist of the current time and cmd string */ VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME, gethrestime_sec()) == 0); - VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, history_str) == 0); + VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, + (uint64_t)crgetuid(cr)) == 0); + if (hap->ha_zone[0] != '\0') + VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE, + hap->ha_zone) == 0); +#ifdef _KERNEL + VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_HOST, + utsname.nodename) == 0); +#endif + if (hap->ha_log_type == LOG_CMD_POOL_CREATE || + hap->ha_log_type == LOG_CMD_NORMAL) { + VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, + history_str) == 0); + } else { + VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT, + hap->ha_event) == 0); + VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TXG, + tx->tx_txg) == 0); + VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR, + history_str) == 0); + } + + VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0); + record_packed = kmem_alloc(reclen, KM_SLEEP); + VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen, NV_ENCODE_XDR, KM_SLEEP) == 0); mutex_enter(&spa->spa_history_lock); - if (hap->ha_log_type == LOG_CMD_CREATE) + if (hap->ha_log_type == LOG_CMD_POOL_CREATE) VERIFY(shpp->sh_eof == shpp->sh_pool_create_len); /* write out the packed length as little endian */ @@ -240,7 +271,7 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) if (!ret) ret = spa_history_write(spa, record_packed, reclen, shpp, tx); - if (!ret && hap->ha_log_type == LOG_CMD_CREATE) { + if (!ret && hap->ha_log_type == LOG_CMD_POOL_CREATE) { shpp->sh_pool_create_len += sizeof (le_len) + reclen; shpp->sh_bof = shpp->sh_pool_create_len; } @@ -249,18 +280,26 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) 
nvlist_free(nvrecord); kmem_free(record_packed, reclen); dmu_buf_rele(dbp, FTAG); + + if (hap->ha_log_type == LOG_INTERNAL) { + kmem_free((void*)hap->ha_history_str, HIS_MAX_RECORD_LEN); + kmem_free(hap, sizeof (history_arg_t)); + } } /* * Write out a history event. */ int -spa_history_log(spa_t *spa, const char *history_str, uint64_t pool_create) +spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what) { history_arg_t ha; + ASSERT(what != LOG_INTERNAL); + ha.ha_history_str = history_str; - ha.ha_log_type = pool_create ? LOG_CMD_CREATE : LOG_CMD_NO_CREATE; + ha.ha_log_type = what; + (void) strlcpy(ha.ha_zone, spa_history_zone(), sizeof (ha.ha_zone)); return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync, spa, &ha, 0)); } @@ -352,3 +391,39 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) dmu_buf_rele(dbp, FTAG); return (err); } + +void +spa_history_internal_log(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) +{ + history_arg_t *hap; + char *str; + va_list adx; + + /* + * If this is part of creating a pool, not everything is + * initialized yet, so don't bother logging the internal events. + */ + if (tx->tx_txg == TXG_INITIAL) + return; + + hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); + str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); + + va_start(adx, fmt); + (void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx); + va_end(adx); + + hap->ha_log_type = LOG_INTERNAL; + hap->ha_history_str = str; + hap->ha_event = event; + hap->ha_zone[0] = '\0'; + + if (dmu_tx_is_syncing(tx)) { + spa_history_log_sync(spa, hap, cr, tx); + } else { + dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, + spa_history_log_sync, spa, hap, 0, tx); + } + /* spa_history_log_sync() will free hap and str */ +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index 1e1f0ee93068..7a41d4ff5396 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa_impl.h> #include <sys/zio.h> @@ -44,6 +42,10 @@ #include <sys/dsl_dir.h> #include <sys/dsl_prop.h> #include <sys/fs/zfs.h> +#include <sys/metaslab_impl.h> +#include <sys/sunddi.h> +#include <sys/arc.h> +#include "zfs_prop.h" /* * SPA locking * @@ -72,25 +74,17 @@ * This reference count keeps track of any active users of the spa_t. The * spa_t cannot be destroyed or freed while this is non-zero. Internally, * the refcount is never really 'zero' - opening a pool implicitly keeps - * some references in the DMU. Internally we check against SPA_MINREF, but + * some references in the DMU. Internally we check against spa_minref, but * present the image of a zero/non-zero value to consumers. * - * spa_config_lock (per-spa crazy rwlock) + * spa_config_lock[] (per-spa array of rwlocks) * - * This SPA special is a recursive rwlock, capable of being acquired from - * asynchronous threads. 
It has protects the spa_t from config changes, - * and must be held in the following circumstances: + * This protects the spa_t from config changes, and must be held in + * the following circumstances: * * - RW_READER to perform I/O to the spa * - RW_WRITER to change the vdev config * - * spa_config_cache_lock (per-spa mutex) - * - * This mutex prevents the spa_config nvlist from being updated. No - * other locks are required to obtain this lock, although implicitly you - * must have the namespace lock or non-zero refcount to have any kind - * of spa_t pointer at all. - * * The locking order is fairly straightforward: * * spa_namespace_lock -> spa_refcount @@ -98,21 +92,20 @@ * The namespace lock must be acquired to increase the refcount from 0 * or to check if it is zero. * - * spa_refcount -> spa_config_lock + * spa_refcount -> spa_config_lock[] * * There must be at least one valid reference on the spa_t to acquire * the config lock. * - * spa_namespace_lock -> spa_config_lock + * spa_namespace_lock -> spa_config_lock[] * * The namespace lock must always be taken before the config lock. * * - * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and - * are globally visible. + * The spa_namespace_lock can be acquired directly and is globally visible. * - * The namespace is manipulated using the following functions, all which require - * the spa_namespace_lock to be held. + * The namespace is manipulated using the following functions, all of which + * require the spa_namespace_lock to be held. * * spa_lookup() Lookup a spa_t by name. * @@ -143,16 +136,70 @@ * zero. Must be called with spa_namespace_lock * held. * - * The spa_config_lock is manipulated using the following functions: + * The spa_config_lock[] is an array of rwlocks, ordered as follows: + * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. + * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). + * + * To read the configuration, it suffices to hold one of these locks as reader. + * To modify the configuration, you must hold all locks as writer. To modify + * vdev state without altering the vdev tree's topology (e.g. online/offline), + * you must hold SCL_STATE and SCL_ZIO as writer. + * + * We use these distinct config locks to avoid recursive lock entry. + * For example, spa_sync() (which holds SCL_CONFIG as reader) induces + * block allocations (SCL_ALLOC), which may require reading space maps + * from disk (dmu_read() -> zio_read() -> SCL_ZIO). + * + * The spa config locks cannot be normal rwlocks because we need the + * ability to hand off ownership. For example, SCL_ZIO is acquired + * by the issuing thread and later released by an interrupt thread. + * They do, however, obey the usual write-wanted semantics to prevent + * writer (i.e. system administrator) starvation. + * + * The lock acquisition rules are as follows: + * + * SCL_CONFIG + * Protects changes to the vdev tree topology, such as vdev + * add/remove/attach/detach. Protects the dirty config list + * (spa_config_dirty_list) and the set of spares and l2arc devices. + * + * SCL_STATE + * Protects changes to pool state and vdev state, such as vdev + * online/offline/fault/degrade/clear. Protects the dirty state list + * (spa_state_dirty_list) and global pool state (spa_state). * - * spa_config_enter() Acquire the config lock as RW_READER or - * RW_WRITER. At least one reference on the spa_t - * must exist. + * SCL_ALLOC + * Protects changes to metaslab groups and classes. 
+ * Held as reader by metaslab_alloc() and metaslab_claim(). * - * spa_config_exit() Release the config lock. + * SCL_ZIO + * Held by bp-level zios (those which have no io_vd upon entry) + * to prevent changes to the vdev tree. The bp-level zio implicitly + * protects all of its vdev child zios, which do not hold SCL_ZIO. * - * spa_config_held() Returns true if the config lock is currently - * held in the given state. + * SCL_FREE + * Protects changes to metaslab groups and classes. + * Held as reader by metaslab_free(). SCL_FREE is distinct from + * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free + * blocks in zio_done() while another i/o that holds either + * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. + * + * SCL_VDEV + * Held as reader to prevent changes to the vdev tree during trivial + * inquiries such as bp_get_dasize(). SCL_VDEV is distinct from the + * other locks, and lower than all of them, to ensure that it's safe + * to acquire regardless of caller context. + * + * In addition, the following rules apply: + * + * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. + * The lock ordering is SCL_CONFIG > spa_props_lock. + * + * (b) I/O operations on leaf vdevs. For any zio operation that takes + * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), + * or zio_write_phys() -- the caller must ensure that the config cannot + * change in the interim, and that the vdev cannot be reopened. + * SCL_STATE as reader suffices for both. * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * @@ -163,10 +210,12 @@ * to complete, sync the updated configs to the * cache, and release the namespace lock. * - * The spa_name() function also requires either the spa_namespace_lock - * or the spa_config_lock, as both are needed to do a rename. spa_rename() is - * also implemented within this file since is requires manipulation of the - * namespace. + * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). + * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual + * locking is, always, based on spa_namespace_lock and spa_config_lock[]. + * + * spa_rename() is also implemented within this file since it requires + * manipulation of the namespace. 
*/ static avl_tree_t spa_namespace_avl; @@ -177,12 +226,15 @@ int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; +static kmutex_t spa_l2cache_lock; +static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; int spa_mode; #ifdef ZFS_DEBUG -int zfs_flags = ~0; +/* Everything except dprintf is on by default in debug builds */ +int zfs_flags = ~ZFS_DEBUG_DPRINTF; #else int zfs_flags = 0; #endif @@ -198,7 +250,128 @@ TUNABLE_INT("vfs.zfs.recover", &zfs_recover); SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0, "Try to recover from otherwise-fatal errors."); -#define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */ + +/* + * ========================================================================== + * SPA config locking + * ========================================================================== + */ +static void +spa_config_lock_init(spa_t *spa) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&scl->scl_count); + scl->scl_writer = NULL; + scl->scl_write_wanted = 0; + } +} + +static void +spa_config_lock_destroy(spa_t *spa) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + mutex_destroy(&scl->scl_lock); + cv_destroy(&scl->scl_cv); + refcount_destroy(&scl->scl_count); + ASSERT(scl->scl_writer == NULL); + ASSERT(scl->scl_write_wanted == 0); + } +} + +int +spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + if (rw == RW_READER) { + if (scl->scl_writer || scl->scl_write_wanted) { + mutex_exit(&scl->scl_lock); + spa_config_exit(spa, locks ^ (1 << i), tag); + return (0); + } + } else { + ASSERT(scl->scl_writer != curthread); + if (!refcount_is_zero(&scl->scl_count)) { + mutex_exit(&scl->scl_lock); + spa_config_exit(spa, locks ^ (1 << i), tag); + return (0); + } + scl->scl_writer = curthread; + } + (void) refcount_add(&scl->scl_count, tag); + mutex_exit(&scl->scl_lock); + } + return (1); +} + +void +spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + if (rw == RW_READER) { + while (scl->scl_writer || scl->scl_write_wanted) { + cv_wait(&scl->scl_cv, &scl->scl_lock); + } + } else { + ASSERT(scl->scl_writer != curthread); + while (!refcount_is_zero(&scl->scl_count)) { + scl->scl_write_wanted++; + cv_wait(&scl->scl_cv, &scl->scl_lock); + scl->scl_write_wanted--; + } + scl->scl_writer = curthread; + } + (void) refcount_add(&scl->scl_count, tag); + mutex_exit(&scl->scl_lock); + } +} + +void +spa_config_exit(spa_t *spa, int locks, void *tag) +{ + for (int i = SCL_LOCKS - 1; i >= 0; i--) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + ASSERT(!refcount_is_zero(&scl->scl_count)); + if (refcount_remove(&scl->scl_count, tag) == 0) { + ASSERT(scl->scl_writer == NULL || + scl->scl_writer == curthread); + scl->scl_writer = NULL; /* OK in either case */ + cv_broadcast(&scl->scl_cv); + } + mutex_exit(&scl->scl_lock); + } +} + +int +spa_config_held(spa_t *spa, int locks, 
krw_t rw) +{ + int locks_held = 0; + + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) || + (rw == RW_WRITER && scl->scl_writer == curthread)) + locks_held |= 1 << i; + } + + return (locks_held); +} /* * ========================================================================== @@ -213,14 +386,30 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0, spa_t * spa_lookup(const char *name) { - spa_t search, *spa; + static spa_t search; /* spa_t is large; don't allocate on stack */ + spa_t *spa; avl_index_t where; + char c; + char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); - search.spa_name = (char *)name; + /* + * If it's a full dataset name, figure out the pool name and + * just use that. + */ + cp = strpbrk(name, "/@"); + if (cp) { + c = *cp; + *cp = '\0'; + } + + (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); spa = avl_find(&spa_namespace_avl, &search, &where); + if (cp) + *cp = c; + return (spa); } @@ -233,29 +422,40 @@ spa_t * spa_add(const char *name, const char *altroot) { spa_t *spa; + spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); - spa->spa_name = spa_strdup(name); - spa->spa_state = POOL_STATE_UNINITIALIZED; - spa->spa_freeze_txg = UINT64_MAX; - spa->spa_final_txg = UINT64_MAX; + rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); - mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_async_root_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&spa->spa_scrub_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_async_root_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); + + (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); + spa->spa_state = POOL_STATE_UNINITIALIZED; + spa->spa_freeze_txg = UINT64_MAX; + spa->spa_final_txg = UINT64_MAX; refcount_create(&spa->spa_refcount); - refcount_create(&spa->spa_config_lock.scl_count); + spa_config_lock_init(spa); avl_add(&spa_namespace_avl, spa); + mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); + /* * Set the alternate root, if there is one. 
*/ @@ -264,6 +464,16 @@ spa_add(const char *name, const char *altroot) spa_active_count++; } + /* + * Every pool starts with the default cachefile + */ + list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), + offsetof(spa_config_dirent_t, scd_link)); + + dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); + dp->scd_path = spa_strdup(spa_config_path); + list_insert_head(&spa->spa_config_list, dp); + return (spa); } @@ -275,9 +485,10 @@ spa_add(const char *name, const char *altroot) void spa_remove(spa_t *spa) { + spa_config_dirent_t *dp; + ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); - ASSERT(spa->spa_scrub_thread == NULL); avl_remove(&spa_namespace_avl, spa); cv_broadcast(&spa_namespace_cv); @@ -287,21 +498,37 @@ spa_remove(spa_t *spa) spa_active_count--; } - if (spa->spa_name) - spa_strfree(spa->spa_name); + while ((dp = list_head(&spa->spa_config_list)) != NULL) { + list_remove(&spa->spa_config_list, dp); + if (dp->scd_path != NULL) + spa_strfree(dp->scd_path); + kmem_free(dp, sizeof (spa_config_dirent_t)); + } + + list_destroy(&spa->spa_config_list); spa_config_set(spa, NULL); refcount_destroy(&spa->spa_refcount); - refcount_destroy(&spa->spa_config_lock.scl_count); + + spa_config_lock_destroy(spa); + + rw_destroy(&spa->spa_traverse_lock); cv_destroy(&spa->spa_async_cv); + cv_destroy(&spa->spa_async_root_cv); cv_destroy(&spa->spa_scrub_io_cv); - cv_destroy(&spa->spa_scrub_cv); + cv_destroy(&spa->spa_suspend_cv); - mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_async_lock); - mutex_destroy(&spa->spa_config_cache_lock); + mutex_destroy(&spa->spa_async_root_lock); + mutex_destroy(&spa->spa_scrub_lock); + mutex_destroy(&spa->spa_errlog_lock); + mutex_destroy(&spa->spa_errlist_lock); + mutex_destroy(&spa->spa_sync_bplist.bpl_lock); + mutex_destroy(&spa->spa_history_lock); + mutex_destroy(&spa->spa_props_lock); + mutex_destroy(&spa->spa_suspend_lock); kmem_free(spa, sizeof (spa_t)); } @@ -334,9 +561,8 @@ spa_next(spa_t *prev) void spa_open_ref(spa_t *spa, void *tag) { - ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF || + ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); - (void) refcount_add(&spa->spa_refcount, tag); } @@ -347,15 +573,14 @@ spa_open_ref(spa_t *spa, void *tag) void spa_close(spa_t *spa, void *tag) { - ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF || + ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); - (void) refcount_remove(&spa->spa_refcount, tag); } /* * Check to see if the spa refcount is zero. Must be called with - * spa_namespace_lock held. We really compare against SPA_MINREF, which is the + * spa_namespace_lock held. We really compare against spa_minref, which is the * number of references acquired when opening a pool */ boolean_t @@ -363,16 +588,119 @@ spa_refcount_zero(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); - return (refcount_count(&spa->spa_refcount) == SPA_MINREF); + return (refcount_count(&spa->spa_refcount) == spa->spa_minref); } /* * ========================================================================== - * SPA spare tracking + * SPA spare and l2cache tracking * ========================================================================== */ /* + * Hot spares and cache devices are tracked using the same code below, + * for 'auxiliary' devices. 
+ */ + +typedef struct spa_aux { + uint64_t aux_guid; + uint64_t aux_pool; + avl_node_t aux_avl; + int aux_count; +} spa_aux_t; + +static int +spa_aux_compare(const void *a, const void *b) +{ + const spa_aux_t *sa = a; + const spa_aux_t *sb = b; + + if (sa->aux_guid < sb->aux_guid) + return (-1); + else if (sa->aux_guid > sb->aux_guid) + return (1); + else + return (0); +} + +void +spa_aux_add(vdev_t *vd, avl_tree_t *avl) +{ + avl_index_t where; + spa_aux_t search; + spa_aux_t *aux; + + search.aux_guid = vd->vdev_guid; + if ((aux = avl_find(avl, &search, &where)) != NULL) { + aux->aux_count++; + } else { + aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); + aux->aux_guid = vd->vdev_guid; + aux->aux_count = 1; + avl_insert(avl, aux, where); + } +} + +void +spa_aux_remove(vdev_t *vd, avl_tree_t *avl) +{ + spa_aux_t search; + spa_aux_t *aux; + avl_index_t where; + + search.aux_guid = vd->vdev_guid; + aux = avl_find(avl, &search, &where); + + ASSERT(aux != NULL); + + if (--aux->aux_count == 0) { + avl_remove(avl, aux); + kmem_free(aux, sizeof (spa_aux_t)); + } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { + aux->aux_pool = 0ULL; + } +} + +boolean_t +spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) +{ + spa_aux_t search, *found; + + search.aux_guid = guid; + found = avl_find(avl, &search, NULL); + + if (pool) { + if (found) + *pool = found->aux_pool; + else + *pool = 0ULL; + } + + if (refcnt) { + if (found) + *refcnt = found->aux_count; + else + *refcnt = 0; + } + + return (found != NULL); +} + +void +spa_aux_activate(vdev_t *vd, avl_tree_t *avl) +{ + spa_aux_t search, *found; + avl_index_t where; + + search.aux_guid = vd->vdev_guid; + found = avl_find(avl, &search, &where); + ASSERT(found != NULL); + ASSERT(found->aux_pool == 0ULL); + + found->aux_pool = spa_guid(vd->vdev_spa); +} + +/* * Spares are tracked globally due to the following constraints: * * - A spare may be part of multiple pools. @@ -394,196 +722,110 @@ spa_refcount_zero(spa_t *spa) * be completely consistent with respect to other vdev configuration changes. 
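 *
 * Editor's illustration (not part of this change): because entries are
 * keyed on guid and reference counted, a device acting as a spare for two
 * pools appears once in the AVL tree with aux_count == 2. A hypothetical
 * caller, with the appropriate aux lock already held:
 *
 *	int refcnt;
 *
 *	spa_aux_add(vd, &spa_spare_avl);	(first pool: count -> 1)
 *	spa_aux_add(vd, &spa_spare_avl);	(second pool: count -> 2)
 *	(void) spa_aux_exists(vd->vdev_guid, NULL, &refcnt, &spa_spare_avl);
 *	ASSERT(refcnt == 2);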
*/ -typedef struct spa_spare { - uint64_t spare_guid; - uint64_t spare_pool; - avl_node_t spare_avl; - int spare_count; -} spa_spare_t; - static int spa_spare_compare(const void *a, const void *b) { - const spa_spare_t *sa = a; - const spa_spare_t *sb = b; - - if (sa->spare_guid < sb->spare_guid) - return (-1); - else if (sa->spare_guid > sb->spare_guid) - return (1); - else - return (0); + return (spa_aux_compare(a, b)); } void spa_spare_add(vdev_t *vd) { - avl_index_t where; - spa_spare_t search; - spa_spare_t *spare; - mutex_enter(&spa_spare_lock); ASSERT(!vd->vdev_isspare); - - search.spare_guid = vd->vdev_guid; - if ((spare = avl_find(&spa_spare_avl, &search, &where)) != NULL) { - spare->spare_count++; - } else { - spare = kmem_zalloc(sizeof (spa_spare_t), KM_SLEEP); - spare->spare_guid = vd->vdev_guid; - spare->spare_count = 1; - avl_insert(&spa_spare_avl, spare, where); - } + spa_aux_add(vd, &spa_spare_avl); vd->vdev_isspare = B_TRUE; - mutex_exit(&spa_spare_lock); } void spa_spare_remove(vdev_t *vd) { - spa_spare_t search; - spa_spare_t *spare; - avl_index_t where; - mutex_enter(&spa_spare_lock); - - search.spare_guid = vd->vdev_guid; - spare = avl_find(&spa_spare_avl, &search, &where); - ASSERT(vd->vdev_isspare); - ASSERT(spare != NULL); - - if (--spare->spare_count == 0) { - avl_remove(&spa_spare_avl, spare); - kmem_free(spare, sizeof (spa_spare_t)); - } else if (spare->spare_pool == spa_guid(vd->vdev_spa)) { - spare->spare_pool = 0ULL; - } - + spa_aux_remove(vd, &spa_spare_avl); vd->vdev_isspare = B_FALSE; mutex_exit(&spa_spare_lock); } boolean_t -spa_spare_exists(uint64_t guid, uint64_t *pool) +spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) { - spa_spare_t search, *found; - avl_index_t where; + boolean_t found; mutex_enter(&spa_spare_lock); - - search.spare_guid = guid; - found = avl_find(&spa_spare_avl, &search, &where); - - if (pool) { - if (found) - *pool = found->spare_pool; - else - *pool = 0ULL; - } - + found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); mutex_exit(&spa_spare_lock); - return (found != NULL); + return (found); } void spa_spare_activate(vdev_t *vd) { - spa_spare_t search, *found; - avl_index_t where; - mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); - - search.spare_guid = vd->vdev_guid; - found = avl_find(&spa_spare_avl, &search, &where); - ASSERT(found != NULL); - ASSERT(found->spare_pool == 0ULL); - - found->spare_pool = spa_guid(vd->vdev_spa); + spa_aux_activate(vd, &spa_spare_avl); mutex_exit(&spa_spare_lock); } /* - * ========================================================================== - * SPA config locking - * ========================================================================== + * Level 2 ARC devices are tracked globally for the same reasons as spares. + * Cache devices currently only support one pool per cache device, and so + * for these devices the aux reference count is currently unused beyond 1. */ -/* - * Acquire the config lock. The config lock is a special rwlock that allows for - * recursive enters. Because these enters come from the same thread as well as - * asynchronous threads working on behalf of the owner, we must unilaterally - * allow all reads access as long at least one reader is held (even if a write - * is requested). This has the side effect of write starvation, but write locks - * are extremely rare, and a solution to this problem would be significantly - * more complex (if even possible). 
- * - * We would like to assert that the namespace lock isn't held, but this is a - * valid use during create. - */ -void -spa_config_enter(spa_t *spa, krw_t rw, void *tag) +static int +spa_l2cache_compare(const void *a, const void *b) { - spa_config_lock_t *scl = &spa->spa_config_lock; - - mutex_enter(&scl->scl_lock); - - if (scl->scl_writer != curthread) { - if (rw == RW_READER) { - while (scl->scl_writer != NULL) - cv_wait(&scl->scl_cv, &scl->scl_lock); - } else { - while (scl->scl_writer != NULL || - !refcount_is_zero(&scl->scl_count)) - cv_wait(&scl->scl_cv, &scl->scl_lock); - scl->scl_writer = curthread; - } - } - - (void) refcount_add(&scl->scl_count, tag); + return (spa_aux_compare(a, b)); +} - mutex_exit(&scl->scl_lock); +void +spa_l2cache_add(vdev_t *vd) +{ + mutex_enter(&spa_l2cache_lock); + ASSERT(!vd->vdev_isl2cache); + spa_aux_add(vd, &spa_l2cache_avl); + vd->vdev_isl2cache = B_TRUE; + mutex_exit(&spa_l2cache_lock); } -/* - * Release the spa config lock, notifying any waiters in the process. - */ void -spa_config_exit(spa_t *spa, void *tag) +spa_l2cache_remove(vdev_t *vd) { - spa_config_lock_t *scl = &spa->spa_config_lock; + mutex_enter(&spa_l2cache_lock); + ASSERT(vd->vdev_isl2cache); + spa_aux_remove(vd, &spa_l2cache_avl); + vd->vdev_isl2cache = B_FALSE; + mutex_exit(&spa_l2cache_lock); +} - mutex_enter(&scl->scl_lock); +boolean_t +spa_l2cache_exists(uint64_t guid, uint64_t *pool) +{ + boolean_t found; - ASSERT(!refcount_is_zero(&scl->scl_count)); - if (refcount_remove(&scl->scl_count, tag) == 0) { - cv_broadcast(&scl->scl_cv); - scl->scl_writer = NULL; /* OK in either case */ - } + mutex_enter(&spa_l2cache_lock); + found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); + mutex_exit(&spa_l2cache_lock); - mutex_exit(&scl->scl_lock); + return (found); } -/* - * Returns true if the config lock is held in the given manner. - */ -boolean_t -spa_config_held(spa_t *spa, krw_t rw) +void +spa_l2cache_activate(vdev_t *vd) { - spa_config_lock_t *scl = &spa->spa_config_lock; - boolean_t held; - - mutex_enter(&scl->scl_lock); - if (rw == RW_WRITER) - held = (scl->scl_writer == curthread); - else - held = !refcount_is_zero(&scl->scl_count); - mutex_exit(&scl->scl_lock); + mutex_enter(&spa_l2cache_lock); + ASSERT(vd->vdev_isl2cache); + spa_aux_activate(vd, &spa_l2cache_avl); + mutex_exit(&spa_l2cache_lock); +} - return (held); +void +spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc) +{ + vdev_space_update(vd, space, alloc, B_FALSE); } /* @@ -600,14 +842,9 @@ spa_config_held(spa_t *spa, krw_t rw) uint64_t spa_vdev_enter(spa_t *spa) { - /* - * Suspend scrub activity while we mess with the config. - */ - spa_scrub_suspend(spa); - mutex_enter(&spa_namespace_lock); - spa_config_enter(spa, RW_WRITER, spa); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); return (spa_last_synced_txg(spa) + 1); } @@ -625,6 +862,8 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) ASSERT(txg > spa_last_synced_txg(spa)); + spa->spa_pending_vdev = NULL; + /* * Reassess the DTLs. */ @@ -633,17 +872,12 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) /* * If the config changed, notify the scrub thread that it must restart. */ - if (error == 0 && !list_is_empty(&spa->spa_dirty_list)) { + if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { + dsl_pool_scrub_restart(spa->spa_dsl_pool); config_changed = B_TRUE; - spa_scrub_restart(spa, txg); } - spa_config_exit(spa, spa); - - /* - * Allow scrubbing to resume. 
- */
-	spa_scrub_resume(spa);
+	spa_config_exit(spa, SCL_ALL, spa);

 	/*
 	 * Note: this txg_wait_synced() is important because it ensures
@@ -662,7 +896,7 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
 	 * If the config changed, update the config cache.
 	 */
 	if (config_changed)
-		spa_config_sync();
+		spa_config_sync(spa, B_FALSE, B_TRUE);

 	mutex_exit(&spa_namespace_lock);

@@ -670,6 +904,26 @@
 }

 /*
+ * Lock the given spa_t for the purpose of changing vdev state.
+ */
+void
+spa_vdev_state_enter(spa_t *spa)
+{
+	spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER);
+}
+
+int
+spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
+{
+	if (vd != NULL)
+		vdev_state_dirty(vd->vdev_top);
+
+	spa_config_exit(spa, SCL_STATE_ALL, spa);
+
+	return (error);
+}
+
+/*
  * ==========================================================================
  * Miscellaneous functions
  * ==========================================================================
@@ -696,11 +950,10 @@ spa_rename(const char *name, const char *newname)
 		return (err);
 	}

-	spa_config_enter(spa, RW_WRITER, FTAG);
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

 	avl_remove(&spa_namespace_avl, spa);
-	spa_strfree(spa->spa_name);
-	spa->spa_name = spa_strdup(newname);
+	(void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
 	avl_add(&spa_namespace_avl, spa);

 	/*
@@ -710,14 +963,14 @@
 	 */
 	vdev_config_dirty(spa->spa_root_vdev);

-	spa_config_exit(spa, FTAG);
+	spa_config_exit(spa, SCL_ALL, FTAG);

 	txg_wait_synced(spa->spa_dsl_pool, 0);

 	/*
 	 * Sync the updated config cache.
 	 */
-	spa_config_sync();
+	spa_config_sync(spa, B_FALSE, B_TRUE);

 	spa_close(spa, FTAG);

@@ -754,7 +1007,7 @@
 			break;

 		/*
-		 * Check any devices we may in the process of adding.
+		 * Check any devices we may be in the process of adding.
 		 */
 		if (spa->spa_pending_vdev) {
 			if (vdev_lookup_by_guid(spa->spa_pending_vdev,
@@ -848,12 +1101,12 @@ spa_freeze(spa_t *spa)
 {
 	uint64_t freeze_txg = 0;

-	spa_config_enter(spa, RW_WRITER, FTAG);
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	if (spa->spa_freeze_txg == UINT64_MAX) {
 		freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
 		spa->spa_freeze_txg = freeze_txg;
 	}
-	spa_config_exit(spa, FTAG);
+	spa_config_exit(spa, SCL_ALL, FTAG);
 	if (freeze_txg != 0)
 		txg_wait_synced(spa_get_dsl(spa), freeze_txg);
 }
@@ -880,7 +1133,7 @@ spa_traverse_rwlock(spa_t *spa)
 	return (&spa->spa_traverse_lock);
 }

-int
+boolean_t
 spa_traverse_wanted(spa_t *spa)
 {
 	return (spa->spa_traverse_wanted);
@@ -922,13 +1175,6 @@ spa_sync_pass(spa_t *spa)
 char *
 spa_name(spa_t *spa)
 {
-	/*
-	 * Accessing the name requires holding either the namespace lock or the
-	 * config lock, both of which are required to do a rename.
-	 */
-	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
-	    spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER));
-
 	return (spa->spa_name);
 }

@@ -972,16 +1218,6 @@ spa_freeze_txg(spa_t *spa)
 }

 /*
- * In the future, this may select among different metaslab classes
- * depending on the zdp. For now, there's no such distinction.
- */
-metaslab_class_t *
-spa_metaslab_class_select(spa_t *spa)
-{
-	return (spa->spa_normal_class);
-}
-
-/*
  * Return how much space is allocated in the pool (ie. sum of all asize)
  */
 uint64_t
@@ -1024,6 +1260,22 @@ spa_get_asize(spa_t *spa, uint64_t lsize)
 	return (lsize * 6);
 }
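Editor's aside (not part of this change): the spa_vdev_state_enter()/spa_vdev_state_exit() pair added above is conventionally used in an enter/modify/exit shape, in which every return path funnels through the exit function so the locks are always dropped and the top-level vdev is dirtied exactly once. A minimal sketch; the caller, its error policy, and the flag it sets are hypothetical:

static int
example_flag_vdev_offline(spa_t *spa, vdev_t *vd)
{
	/* Takes SCL_STATE_ALL as writer. */
	spa_vdev_state_enter(spa);

	if (vd == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	vd->vdev_offline = B_TRUE;

	/* Dirties vd->vdev_top, drops the locks, and returns 0. */
	return (spa_vdev_state_exit(spa, vd, 0));
}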
+/*
+ * Return the failure mode that has been set for this pool. The default
+ * behavior will be to block all I/Os when a complete failure occurs.
+ */
+uint8_t
+spa_get_failmode(spa_t *spa)
+{
+	return (spa->spa_failmode);
+}
+
+boolean_t
+spa_suspended(spa_t *spa)
+{
+	return (spa->spa_suspended);
+}
+
 uint64_t
 spa_version(spa_t *spa)
 {
@@ -1034,11 +1286,11 @@ int
 spa_max_replication(spa_t *spa)
 {
 	/*
-	 * As of ZFS_VERSION == ZFS_VERSION_DITTO_BLOCKS, we are able to
+	 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
 	 * handle BPs with more than one DVA allocated. Set our max
 	 * replication level accordingly.
 	 */
-	if (spa_version(spa) < ZFS_VERSION_DITTO_BLOCKS)
+	if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
 		return (1);
 	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
 }
@@ -1051,12 +1303,15 @@ bp_get_dasize(spa_t *spa, const blkptr_t *bp)
 {
 	if (!spa->spa_deflate)
 		return (BP_GET_ASIZE(bp));

+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
 		vdev_t *vd =
 		    vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
-		sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> SPA_MINBLOCKSHIFT) *
-		    vd->vdev_deflate_ratio;
+		if (vd)
+			sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
+			    SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
 	}
+	spa_config_exit(spa, SCL_VDEV, FTAG);
 	return (sz);
 }
@@ -1088,18 +1343,27 @@ spa_busy(void)
 }

 void
+spa_boot_init()
+{
+	spa_config_load();
+}
+
+void
 spa_init(int mode)
 {
 	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);

 	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
 	    offsetof(spa_t, spa_avl));

-	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
+	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
+	    offsetof(spa_aux_t, aux_avl));

-	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_spare_t),
-	    offsetof(spa_spare_t, spare_avl));
+	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
+	    offsetof(spa_aux_t, aux_avl));

 	spa_mode = mode;

@@ -1108,23 +1372,53 @@ spa_init(int mode)
 	zio_init();
 	dmu_init();
 	zil_init();
+	vdev_cache_stat_init();
+	zfs_prop_init();
+	zpool_prop_init();
 	spa_config_load();
+	l2arc_start();
 }

 void
 spa_fini(void)
 {
+	l2arc_stop();
+
 	spa_evict_all();
+	vdev_cache_stat_fini();

 	zil_fini();
 	dmu_fini();
 	zio_fini();
+	unique_fini();
 	refcount_fini();

 	avl_destroy(&spa_namespace_avl);
 	avl_destroy(&spa_spare_avl);
+	avl_destroy(&spa_l2cache_avl);
 	cv_destroy(&spa_namespace_cv);

 	mutex_destroy(&spa_namespace_lock);
 	mutex_destroy(&spa_spare_lock);
+	mutex_destroy(&spa_l2cache_lock);
+}
+
+/*
+ * Return whether this pool has slogs. No locking needed.
+ * It's not a problem if the wrong answer is returned as it's only for
+ * performance and not correctness.
+ */
+boolean_t
+spa_has_slogs(spa_t *spa)
+{
+	return (spa->spa_log_class->mc_rotor != NULL);
+}
+
+/*
+ * Return whether this pool is the root pool.
+ */
+boolean_t
+spa_is_root(spa_t *spa)
+{
+	return (spa->spa_is_root);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
index 23313a908ab4..8fdfa6200ea9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. */ @@ -300,6 +300,7 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, uint64_t *entry, *entry_map, *entry_map_end; uint64_t bufsize, size, offset, end, space; uint64_t mapstart = sm->sm_start; + int error = 0; ASSERT(MUTEX_HELD(sm->sm_lock)); @@ -337,9 +338,10 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, smo->smo_object, offset, size); mutex_exit(sm->sm_lock); - VERIFY3U(dmu_read(os, smo->smo_object, offset, size, - entry_map), ==, 0); + error = dmu_read(os, smo->smo_object, offset, size, entry_map); mutex_enter(sm->sm_lock); + if (error != 0) + break; entry_map_end = entry_map + (size / sizeof (uint64_t)); for (entry = entry_map; entry < entry_map_end; entry++) { @@ -354,20 +356,25 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, SM_RUN_DECODE(e) << sm->sm_shift); } } - VERIFY3U(sm->sm_space, ==, space); + + if (error == 0) { + VERIFY3U(sm->sm_space, ==, space); + + sm->sm_loaded = B_TRUE; + sm->sm_ops = ops; + if (ops != NULL) + ops->smop_load(sm); + } else { + space_map_vacate(sm, NULL, NULL); + } zio_buf_free(entry_map, bufsize); sm->sm_loading = B_FALSE; - sm->sm_loaded = B_TRUE; - sm->sm_ops = ops; cv_broadcast(&sm->sm_load_cv); - if (ops != NULL) - ops->smop_load(sm); - - return (0); + return (error); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h index f58ffc059f91..f3e00877a8e2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_ARC_H #define _SYS_ARC_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #ifdef __cplusplus @@ -35,11 +33,12 @@ extern "C" { #endif #include <sys/zio.h> +#include <sys/dmu.h> +#include <sys/spa.h> typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); -typedef void arc_byteswap_func_t(void *buf, size_t size); typedef int arc_evict_func_t(void *private); /* generic arc_done_func_t's which you can use */ @@ -49,15 +48,16 @@ arc_done_func_t arc_getbuf_func; struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; + krwlock_t b_lock; void *b_data; arc_evict_func_t *b_efunc; void *b_private; }; typedef enum arc_buf_contents { - ARC_BUFC_UNDEF, /* buffer contents undefined */ ARC_BUFC_DATA, /* buffer contains data */ - ARC_BUFC_METADATA /* buffer contains metadata */ + ARC_BUFC_METADATA, /* buffer contains metadata */ + ARC_BUFC_NUMTYPES } arc_buf_contents_t; /* * These are the flags we pass into calls to the arc @@ -66,7 +66,12 @@ typedef enum arc_buf_contents { #define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */ #define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ #define ARC_CACHED (1 << 4) /* I/O was already in cache */ +#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */ +void arc_space_consume(uint64_t space); +void arc_space_return(uint64_t space); +void *arc_data_buf_alloc(uint64_t space); +void arc_data_buf_free(void *buf, uint64_t space); arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type); void arc_buf_add_ref(arc_buf_t *buf, void *tag); @@ -81,13 +86,24 @@ void arc_buf_thaw(arc_buf_t *buf); int arc_referenced(arc_buf_t *buf); #endif -int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, +typedef struct writeprops { + dmu_object_type_t wp_type; + uint8_t wp_level; + uint8_t wp_copies; + uint8_t wp_dncompress, wp_oscompress; + uint8_t wp_dnchecksum, wp_oschecksum; +} writeprops_t; + +int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, + arc_done_func_t *done, void *private, int priority, int zio_flags, + uint32_t *arc_flags, const zbookmark_t *zb); +int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int flags, - uint32_t *arc_flags, zbookmark_t *zb); -zio_t *arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, - int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + uint32_t *arc_flags, const zbookmark_t *zb); +zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, + boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb); + int zio_flags, const zbookmark_t *zb); int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_done_func_t *done, void *private, uint32_t arc_flags); int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); @@ -95,13 +111,25 @@ int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); int arc_buf_evict(arc_buf_t *buf); -void arc_flush(void); -void arc_tempreserve_clear(uint64_t tempreserve); -int arc_tempreserve_space(uint64_t tempreserve); +void arc_flush(spa_t *spa); +void arc_tempreserve_clear(uint64_t reserve); +int arc_tempreserve_space(uint64_t reserve, uint64_t txg); void arc_init(void); void arc_fini(void); +/* + * Level 2 ARC + */ + +void l2arc_add_vdev(spa_t *spa, vdev_t *vd, 
uint64_t start, uint64_t end); +void l2arc_remove_vdev(vdev_t *vd); +boolean_t l2arc_vdev_present(vdev_t *vd); +void l2arc_init(void); +void l2arc_fini(void); +void l2arc_start(void); +void l2arc_stop(void); + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h index b4c83765c873..cdb93a6c35a3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_BPLIST_H #define _SYS_BPLIST_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/spa.h> #include <sys/txg.h> @@ -75,12 +73,14 @@ extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); extern void bplist_close(bplist_t *bpl); extern boolean_t bplist_empty(bplist_t *bpl); extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp); -extern int bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx); -extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp); +extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx); +extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp); extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx); extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); extern int bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +extern int bplist_space_birthrange(bplist_t *bpl, + uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index d33657b9e67c..b27d89fe2162 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -239,7 +239,7 @@ typedef struct dbuf_hash_table { uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); -dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn); +void dbuf_create_bonus(struct dnode *dn); dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, @@ -271,7 +271,7 @@ void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, dmu_tx_t *tx); -void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks, +void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, struct dmu_tx *); void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); @@ -279,10 +279,21 @@ void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); void dbuf_init(void); void dbuf_fini(void); -#define DBUF_GET_BUFC_TYPE(db) \ - ((((db)->db_level > 0) || \ - (dmu_ot[(db)->db_dnode->dn_type].ot_metadata)) ? \ - ARC_BUFC_METADATA : ARC_BUFC_DATA); +#define DBUF_IS_METADATA(db) \ + ((db)->db_level > 0 || dmu_ot[(db)->db_dnode->dn_type].ot_metadata) + +#define DBUF_GET_BUFC_TYPE(db) \ + (DBUF_IS_METADATA(db) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) + +#define DBUF_IS_CACHEABLE(db) \ + ((db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ + (DBUF_IS_METADATA(db) && \ + ((db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) + +#define DBUF_IS_L2CACHEABLE(db) \ + ((db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ + (DBUF_IS_METADATA(db) && \ + ((db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) #ifdef ZFS_DEBUG diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index 8c2a1fdaa823..4535c6864074 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,6 +38,7 @@ #include <sys/types.h> #include <sys/param.h> +#include <sys/cred.h> #ifdef __cplusplus extern "C" { @@ -91,7 +92,7 @@ typedef enum dmu_object_type { DMU_OT_DSL_DATASET, /* UINT64 */ /* zpl: */ DMU_OT_ZNODE, /* ZNODE */ - DMU_OT_ACL, /* ACL */ + DMU_OT_OLDACL, /* Old ACL */ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ DMU_OT_MASTER_NODE, /* ZAP */ @@ -108,7 +109,13 @@ typedef enum dmu_object_type { DMU_OT_SPA_HISTORY, /* UINT8 */ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ DMU_OT_POOL_PROPS, /* ZAP */ - + DMU_OT_DSL_PERMS, /* ZAP */ + DMU_OT_ACL, /* ACL */ + DMU_OT_SYSACL, /* SYSACL */ + DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ + DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ + DMU_OT_NEXT_CLONES, /* ZAP */ + DMU_OT_SCRUB_QUEUE, /* ZAP */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -127,15 +134,15 @@ void byteswap_uint32_array(void *buf, size_t size); void byteswap_uint16_array(void *buf, size_t size); void byteswap_uint8_array(void *buf, size_t size); void zap_byteswap(void *buf, size_t size); +void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); -#define DS_MODE_NONE 0 /* invalid, to aid debugging */ -#define DS_MODE_STANDARD 1 /* normal access, no special needs */ -#define DS_MODE_PRIMARY 2 /* the "main" access, e.g. a mount */ -#define DS_MODE_EXCLUSIVE 3 /* exclusive access, e.g. to destroy */ -#define DS_MODE_LEVELS 4 -#define DS_MODE_LEVEL(x) ((x) & (DS_MODE_LEVELS - 1)) +#define DS_MODE_NOHOLD 0 /* internal use only */ +#define DS_MODE_USER 1 /* simple access, no special needs */ +#define DS_MODE_OWNER 2 /* the "main" access, e.g. a mount */ +#define DS_MODE_TYPE_MASK 0x3 +#define DS_MODE_TYPE(x) ((x) & DS_MODE_TYPE_MASK) #define DS_MODE_READONLY 0x8 #define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY) #define DS_MODE_INCONSISTENT 0x10 @@ -149,20 +156,23 @@ void zfs_znode_byteswap(void *buf, size_t size); * operation, including metadata. */ #define DMU_MAX_ACCESS (10<<20) /* 10MB */ +#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ /* * Public routines to create, destroy, open, and close objsets. 
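 *
 * Editor's illustration (not part of this change): a hypothetical
 * read-only open/close pairing using the DS_MODE_* flags above:
 *
 *	objset_t *os;
 *	int err;
 *
 *	err = dmu_objset_open("tank/fs", DMU_OST_ZFS,
 *	    DS_MODE_USER | DS_MODE_READONLY, &os);
 *	if (err == 0) {
 *		(... use os ...)
 *		dmu_objset_close(os);
 *	}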
*/ int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, objset_t **osp); +int dmu_objset_open_ds(struct dsl_dataset *ds, dmu_objset_type_t type, + objset_t **osp); void dmu_objset_close(objset_t *os); -int dmu_objset_evict_dbufs(objset_t *os, int try); +int dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, - void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg); + objset_t *clone_parent, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); int dmu_objset_destroy(const char *name); int dmu_snapshots_destroy(char *fsname, char *snapname); -int dmu_objset_rollback(const char *name); +int dmu_objset_rollback(objset_t *os); int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); int dmu_objset_rename(const char *name, const char *newname, boolean_t recursive); @@ -180,11 +190,6 @@ typedef struct dmu_buf { typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); /* - * Callback function to perform byte swapping on a block. - */ -typedef void dmu_byteswap_func_t(void *buf, size_t size); - -/* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ #define DMU_POOL_DIRECTORY_OBJECT 1 @@ -197,6 +202,20 @@ typedef void dmu_byteswap_func_t(void *buf, size_t size); #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" +#define DMU_POOL_L2CACHE "l2cache" + +/* 4x8 zbookmark_t */ +#define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark" +/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */ +#define DMU_POOL_SCRUB_QUEUE "scrub_queue" +/* 1x8 txg */ +#define DMU_POOL_SCRUB_MIN_TXG "scrub_min_txg" +/* 1x8 txg */ +#define DMU_POOL_SCRUB_MAX_TXG "scrub_max_txg" +/* 1x4 enum scrub_func */ +#define DMU_POOL_SCRUB_FUNC "scrub_func" +/* 1x8 count */ +#define DMU_POOL_SCRUB_ERRORS "scrub_errors" /* * Allocate an object from this objset. The range of object numbers @@ -298,6 +317,7 @@ int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb, */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); +int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); /* * Obtain the DMU buffer from the specified object which contains the @@ -417,6 +437,9 @@ void dmu_tx_commit(dmu_tx_t *tx); */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); +int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size); +int dmu_free_object(objset_t *os, uint64_t object); /* * Convenience functions. 
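Editor's aside (not part of this change): the DMU_POOL_* names defined earlier in this header are keys in the MOS directory object, and the size annotations above give each zap entry's shape. For instance, the scrub bookmark annotated "4x8" could be read back as four uint64s through the standard zap_lookup() interface; the wrapper below is hypothetical:

static int
example_read_scrub_bookmark(objset_t *mos, zbookmark_t *zb)
{
	/* 4x8: four 8-byte integers holding the zbookmark_t fields. */
	return (zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, zb));
}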
@@ -458,8 +481,10 @@ typedef struct dmu_object_info {
 	uint64_t doi_max_block_offset;
 } dmu_object_info_t;

+typedef void arc_byteswap_func_t(void *buf, size_t size);
+
 typedef struct dmu_object_type_info {
-	dmu_byteswap_func_t	*ot_byteswap;
+	arc_byteswap_func_t	*ot_byteswap;
 	boolean_t		ot_metadata;
 	char			*ot_name;
 } dmu_object_type_info_t;

@@ -482,10 +507,11 @@ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
 typedef struct dmu_objset_stats {
 	uint64_t dds_num_clones; /* number of clones of this */
 	uint64_t dds_creation_txg;
+	uint64_t dds_guid;
 	dmu_objset_type_t dds_type;
 	uint8_t dds_is_snapshot;
 	uint8_t dds_inconsistent;
-	char dds_clone_of[MAXNAMELEN];
+	char dds_origin[MAXNAMELEN];
 } dmu_objset_stats_t;

 /*
@@ -531,9 +557,13 @@ extern void dmu_objset_name(objset_t *os, char *buf);
 extern dmu_objset_type_t dmu_objset_type(objset_t *os);
 extern uint64_t dmu_objset_id(objset_t *os);
 extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
-    uint64_t *id, uint64_t *offp);
+    uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
+extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
+    int maxlen, boolean_t *conflict);
 extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp);
+extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
+extern void *dmu_objset_get_user(objset_t *os);

 /*
  * Return the txg number for the given assigned transaction.
@@ -544,7 +574,7 @@ uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
  * Synchronous write.
  * If a parent zio is provided this function initiates a write on the
  * provided buffer as a child of the parent zio.
- * In the absense of a parent zio, the write is completed synchronously.
+ * In the absence of a parent zio, the write is completed synchronously.
  * At write completion, blk is filled with the bp of the written block.
  * Note that while the data covered by this function will be on stable
  * storage when the write completes this new data does not become a
@@ -572,9 +602,30 @@ typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
 void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
     dmu_traverse_cb_t cb, void *arg);

-int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp);
-int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
-    boolean_t force, struct file *fp, uint64_t voffset);
+int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
+    struct file *fp, offset_t *off);
+
+typedef struct dmu_recv_cookie {
+	/*
+	 * This structure is opaque!
+	 *
+	 * If logical and real are different, we are recving the stream
+	 * into the "real" temporary clone, and then switching it with
+	 * the "logical" target.
+	 */
+	struct dsl_dataset *drc_logical_ds;
+	struct dsl_dataset *drc_real_ds;
+	struct drr_begin *drc_drrb;
+	char *drc_tosnap;
+	boolean_t drc_newfs;
+	boolean_t drc_force;
+} dmu_recv_cookie_t;
+
+int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *,
+    boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *);
+int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp);
+int dmu_recv_end(dmu_recv_cookie_t *drc);
+void dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc);
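Editor's aside (not part of this change): the prototypes above form a begin/stream/end sequence around the opaque cookie. A minimal consumer sketch; the wrapper and its error handling are hypothetical, derived only from the declarations above:

static int
example_recv(char *tofs, char *tosnap, struct drr_begin *drrb,
    struct file *fp, offset_t *offp)
{
	dmu_recv_cookie_t drc;
	int err;

	/* Set up the cookie (no force, no origin, offline receive). */
	err = dmu_recv_begin(tofs, tosnap, drrb, B_FALSE, NULL, B_FALSE, &drc);
	if (err != 0)
		return (err);

	/* Pull the stream; on failure, tear down the temporary clone. */
	err = dmu_recv_stream(&drc, fp, offp);
	if (err != 0) {
		dmu_recv_abort_cleanup(&drc);
		return (err);
	}

	/* Commit: swap the temporary clone with the logical target. */
	return (dmu_recv_end(&drc));
}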
 /* CRC64 table */
 #define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
index 807011e94ffc..96ce688e1551 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
@@ -19,15 +19,13 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

 #ifndef _SYS_DMU_IMPL_H
 #define	_SYS_DMU_IMPL_H

-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/txg_impl.h>
 #include <sys/zio.h>
 #include <sys/dnode.h>
@@ -51,7 +49,7 @@ extern "C" {
 * XXX try to improve evicting path?
 *
 * dp_config_rwlock > os_obj_lock > dn_struct_rwlock >
- *	dn_dbufs_mtx > hash_mutexes > db_mtx > leafs
+ *	dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs
 *
 * dp_config_rwlock
 *    must be held before: everything
@@ -177,7 +175,10 @@ extern "C" {
 *    dmu_tx_try_assign: dn_notxholds(cv)
 *    dmu_tx_unassign: none
 *
- * dd_lock (leaf)
+ * dd_lock
+ *    must be held before:
+ *      ds_lock
+ *      ancestors' dd_lock
 *    protects:
 *      dd_prop_cbs
 *      dd_sync_*
@@ -207,13 +208,14 @@ extern "C" {
 *    dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes)
 *    dnode_free: none (dn_dirtyblksz, os_*_dnodes)
 *
- * ds_lock (leaf)
+ * ds_lock
 *    protects:
 *      ds_user_ptr
 *      ds_user_evict_func
 *      ds_open_refcount
 *      ds_snapname
 *      ds_phys accounting
+ *      ds_reserved
 *    held from:
 *      dsl_dataset_*
 *
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
index 8293a3b4076a..15df29a17799 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
@@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
*/ @@ -69,12 +69,13 @@ typedef struct objset_impl { uint8_t os_checksum; /* can change, under dsl_dir's locks */ uint8_t os_compress; /* can change, under dsl_dir's locks */ uint8_t os_copies; /* can change, under dsl_dir's locks */ - uint8_t os_md_checksum; - uint8_t os_md_compress; + uint8_t os_primary_cache; /* can change, under dsl_dir's locks */ + uint8_t os_secondary_cache; /* can change, under dsl_dir's locks */ /* no lock needed: */ struct dmu_tx *os_synctx; /* XXX sketchy */ blkptr_t *os_rootbp; + zil_header_t os_zil_header; /* Protected by os_obj_lock */ kmutex_t os_obj_lock; @@ -86,19 +87,27 @@ typedef struct objset_impl { list_t os_free_dnodes[TXG_SIZE]; list_t os_dnodes; list_t os_downgraded_dbufs; + + /* stuff we store for the user */ + kmutex_t os_user_ptr_lock; + void *os_user_ptr; } objset_impl_t; #define DMU_META_DNODE_OBJECT 0 +#define DMU_OS_IS_L2CACHEABLE(os) \ + ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ + (os)->os_secondary_cache == ZFS_CACHE_METADATA) + /* called from zpl */ int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, objset_t **osp); void dmu_objset_close(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, - void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg); + objset_t *clone_parent, uint64_t flags, + void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); int dmu_objset_destroy(const char *name); -int dmu_objset_rollback(const char *name); +int dmu_objset_rollback(objset_t *os); int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); void dmu_objset_stats(objset_t *os, nvlist_t *nv); void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); @@ -107,8 +116,10 @@ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t dmu_objset_fsid_guid(objset_t *os); int dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags); +int dmu_objset_find_spa(spa_t *spa, const char *name, + int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); -int dmu_objset_evict_dbufs(objset_t *os, int try); +int dmu_objset_evict_dbufs(objset_t *os); /* called from dsl */ void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h index ea9fa6c1e36c..05e5ffdbff5d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -100,6 +100,7 @@ struct traverse_handle { int traverse_dsl_dataset(struct dsl_dataset *ds, uint64_t txg_start, int advance, blkptr_cb_t func, void *arg); +int traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg); traverse_handle_t *traverse_init(spa_t *spa, blkptr_cb_t *func, void *arg, int advance, int zio_flags); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h index 89f4799b57fe..6aaf35dc038f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -64,6 +64,7 @@ struct dmu_tx { uint64_t tx_space_towrite; uint64_t tx_space_tofree; uint64_t tx_space_tooverwrite; + uint64_t tx_space_tounref; refcount_t tx_space_written; refcount_t tx_space_freed; #endif @@ -86,6 +87,9 @@ typedef struct dmu_tx_hold { uint64_t txh_space_towrite; uint64_t txh_space_tofree; uint64_t txh_space_tooverwrite; + uint64_t txh_space_tounref; + uint64_t txh_memory_tohold; + uint64_t txh_fudge; #ifdef ZFS_DEBUG enum dmu_tx_hold_type txh_type; uint64_t txh_arg1; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h index 327e538cf809..c79ff48a60c5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DNODE_H #define _SYS_DNODE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/avl.h> #include <sys/spa.h> @@ -41,12 +39,19 @@ extern "C" { #endif /* - * Flags. + * dnode_hold() flags. */ #define DNODE_MUST_BE_ALLOCATED 1 #define DNODE_MUST_BE_FREE 2 /* + * dnode_next_offset() flags. + */ +#define DNODE_FIND_HOLE 1 +#define DNODE_FIND_BACKWARDS 2 +#define DNODE_FIND_HAVELOCK 4 + +/* * Fixed constants. 
*/ #define DNODE_SHIFT 9 /* 512 bytes */ @@ -64,6 +69,7 @@ extern "C" { #define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) #define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) #define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) +#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1) #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) @@ -156,6 +162,7 @@ typedef struct dnode { uint64_t dn_maxblkid; uint8_t dn_next_nlevels[TXG_SIZE]; uint8_t dn_next_indblkshift[TXG_SIZE]; + uint16_t dn_next_bonuslen[TXG_SIZE]; uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ /* protected by os_lock: */ @@ -197,11 +204,12 @@ dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp, uint64_t object); void dnode_special_close(dnode_t *dn); +void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); int dnode_hold(struct objset_impl *dd, uint64_t object, void *ref, dnode_t **dnp); int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, void *ref, dnode_t **dnp); -void dnode_add_ref(dnode_t *dn, void *ref); +boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); void dnode_sync(dnode_t *dn, dmu_tx_t *tx); @@ -220,13 +228,13 @@ void dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx); void dnode_diduse_space(dnode_t *dn, int64_t space); void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx); -void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx); +void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); void dnode_init(void); void dnode_fini(void); -int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl, - uint64_t blkfill, uint64_t txg); -int dnode_evict_dbufs(dnode_t *dn, int try); +int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, + int minlvl, uint64_t blkfill, uint64_t txg); +void dnode_evict_dbufs(dnode_t *dn); #ifdef ZFS_DEBUG diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h index 8cfc1dcc9840..8665aec2dda8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DSL_DATASET_H #define _SYS_DSL_DATASET_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/spa.h> #include <sys/txg.h> @@ -47,6 +45,8 @@ struct dsl_pool; typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); #define DS_FLAG_INCONSISTENT (1ULL<<0) +#define DS_IS_INCONSISTENT(ds) \ + ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) /* * NB: nopromote can not yet be set, but we want support for it in this * on-disk version, so that we don't need to upgrade for it later. It @@ -55,16 +55,29 @@ typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); */ #define DS_FLAG_NOPROMOTE (1ULL<<1) +/* + * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly + * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE, + * refquota/refreservations). 
+ */ +#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) + +/* + * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose + * name lookups should be performed case-insensitively. + */ +#define DS_FLAG_CI_DATASET (1ULL<<16) + typedef struct dsl_dataset_phys { - uint64_t ds_dir_obj; - uint64_t ds_prev_snap_obj; + uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */ + uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */ uint64_t ds_prev_snap_txg; - uint64_t ds_next_snap_obj; - uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */ + uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */ + uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */ uint64_t ds_num_children; /* clone/snap children; ==0 for head */ uint64_t ds_creation_time; /* seconds since 1970 */ uint64_t ds_creation_txg; - uint64_t ds_deadlist_obj; + uint64_t ds_deadlist_obj; /* DMU_OT_BPLIST */ uint64_t ds_used_bytes; uint64_t ds_compressed_bytes; uint64_t ds_uncompressed_bytes; @@ -76,9 +89,11 @@ typedef struct dsl_dataset_phys { */ uint64_t ds_fsid_guid; uint64_t ds_guid; - uint64_t ds_flags; + uint64_t ds_flags; /* DS_FLAG_* */ blkptr_t ds_bp; - uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */ + uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */ + uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */ + uint64_t ds_pad[6]; /* pad out to 320 bytes for good measure */ } dsl_dataset_phys_t; typedef struct dsl_dataset { @@ -87,9 +102,11 @@ typedef struct dsl_dataset { dsl_dataset_phys_t *ds_phys; dmu_buf_t *ds_dbuf; uint64_t ds_object; + uint64_t ds_fsid_guid; - /* only used in syncing context: */ - struct dsl_dataset *ds_prev; /* only valid for non-snapshots */ + /* only used in syncing context, only valid for non-snapshots: */ + struct dsl_dataset *ds_prev; + uint64_t ds_origin_txg; /* has internal locking: */ bplist_t ds_deadlist; @@ -105,11 +122,23 @@ typedef struct dsl_dataset { kmutex_t ds_lock; void *ds_user_ptr; dsl_dataset_evict_func_t *ds_user_evict_func; - uint64_t ds_open_refcount; + + /* + * ds_owner is protected by the ds_rwlock and the ds_lock + */ + krwlock_t ds_rwlock; + kcondvar_t ds_exclusive_cv; + void *ds_owner; /* no locking; only for making guesses */ uint64_t ds_trysnap_txg; + /* for objset_open() */ + kmutex_t ds_opening_lock; + + uint64_t ds_reserved; /* cached refreservation */ + uint64_t ds_quota; /* cached refquota */ + /* Protected by ds_lock; keep at end of struct for better locality */ char ds_snapname[MAXNAMELEN]; } dsl_dataset_t; @@ -117,23 +146,38 @@ typedef struct dsl_dataset { #define dsl_dataset_is_snapshot(ds) \ ((ds)->ds_phys->ds_num_children != 0) -int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, - void *tag, dsl_dataset_t **dsp); -int dsl_dataset_open(const char *name, int mode, void *tag, +#define DS_UNIQUE_IS_ACCURATE(ds) \ + (((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) + +int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp); +int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, + void *tag, dsl_dataset_t **); +int dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp); -int dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj, - const char *tail, int mode, void *tag, dsl_dataset_t **); +int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, + int flags, void *owner, dsl_dataset_t **); void dsl_dataset_name(dsl_dataset_t *ds, char *name); -void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag); -uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, - const 
char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx); -int dsl_dataset_destroy(const char *name); +void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); +void dsl_dataset_disown(dsl_dataset_t *ds, void *owner); +void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag); +boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, + void *owner); +void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner); +uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, + dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); +uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, + uint64_t flags, dmu_tx_t *tx); +int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag); int dsl_snapshots_destroy(char *fsname, char *snapname); +dsl_checkfunc_t dsl_dataset_destroy_check; +dsl_syncfunc_t dsl_dataset_destroy_sync; dsl_checkfunc_t dsl_dataset_snapshot_check; dsl_syncfunc_t dsl_dataset_snapshot_sync; -int dsl_dataset_rollback(dsl_dataset_t *ds); +int dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost); int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); int dsl_dataset_promote(const char *name); +int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, + boolean_t force); void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds, void *p, dsl_dataset_evict_func_t func); @@ -144,10 +188,12 @@ void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); +boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds); + void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); -void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, +int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, dmu_tx_t *tx); int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); @@ -160,11 +206,19 @@ void dsl_dataset_space(dsl_dataset_t *ds, uint64_t *usedobjsp, uint64_t *availobjsp); uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); -void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp, - dmu_tx_t *tx); - int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); +int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used, + uint64_t *ref_rsrv); +int dsl_dataset_set_quota(const char *dsname, uint64_t quota); +void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, + dmu_tx_t *tx); +int dsl_dataset_set_reservation(const char *dsname, uint64_t reservation); +void dsl_dataset_set_flags(dsl_dataset_t *ds, uint64_t flags); +int64_t dsl_dataset_new_refreservation(dsl_dataset_t *ds, uint64_t reservation, + dmu_tx_t *tx); + #ifdef ZFS_DEBUG #define dprintf_ds(ds, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h new file mode 100644 index 000000000000..a29e44e67d0c --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DSL_DELEG_H +#define _SYS_DSL_DELEG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/dmu.h> +#include <sys/dsl_pool.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_DELEG_PERM_NONE "" +#define ZFS_DELEG_PERM_CREATE "create" +#define ZFS_DELEG_PERM_DESTROY "destroy" +#define ZFS_DELEG_PERM_SNAPSHOT "snapshot" +#define ZFS_DELEG_PERM_ROLLBACK "rollback" +#define ZFS_DELEG_PERM_CLONE "clone" +#define ZFS_DELEG_PERM_PROMOTE "promote" +#define ZFS_DELEG_PERM_RENAME "rename" +#define ZFS_DELEG_PERM_MOUNT "mount" +#define ZFS_DELEG_PERM_SHARE "share" +#define ZFS_DELEG_PERM_SEND "send" +#define ZFS_DELEG_PERM_RECEIVE "receive" +#define ZFS_DELEG_PERM_ALLOW "allow" +#define ZFS_DELEG_PERM_USERPROP "userprop" +#define ZFS_DELEG_PERM_VSCAN "vscan" + +/* + * Note: the names of properties that are marked delegatable are also + * valid delegated permissions + */ + +int dsl_deleg_get(const char *ddname, nvlist_t **nvp); +int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset); +int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr); +void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr); +int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr); +int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr); +int dsl_deleg_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx); +boolean_t dsl_delegation_on(objset_t *os); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DELEG_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h index e0595d3c368b..86b9636ceaab 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_DSL_DIR_H #define _SYS_DSL_DIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dsl_pool.h> #include <sys/dsl_synctask.h> @@ -40,11 +38,22 @@ extern "C" { struct dsl_dataset; +typedef enum dd_used { + DD_USED_HEAD, + DD_USED_SNAP, + DD_USED_CHILD, + DD_USED_CHILD_RSRV, + DD_USED_REFRSRV, + DD_USED_NUM +} dd_used_t; + +#define DD_FLAG_USED_BREAKDOWN (1<<0) + typedef struct dsl_dir_phys { uint64_t dd_creation_time; /* not actually used */ uint64_t dd_head_dataset_obj; uint64_t dd_parent_obj; - uint64_t dd_clone_parent_obj; + uint64_t dd_origin_obj; uint64_t dd_child_dir_zapobj; /* * how much space our children are accounting for; for leaf @@ -58,7 +67,10 @@ typedef struct dsl_dir_phys { /* Administrative reservation setting */ uint64_t dd_reserved; uint64_t dd_props_zapobj; - uint64_t dd_pad[21]; /* pad out to 256 bytes for good measure */ + uint64_t dd_deleg_zapobj; /* dataset delegation permissions */ + uint64_t dd_flags; + uint64_t dd_used_breakdown[DD_USED_NUM]; + uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */ } dsl_dir_phys_t; struct dsl_dir { @@ -78,9 +90,6 @@ struct dsl_dir { kmutex_t dd_lock; list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ - /* Accounting */ - /* reflects any changes to dd_phys->dd_used_bytes made this syncing */ - int64_t dd_used_bytes; /* gross estimate of space used by in-flight tx's */ uint64_t dd_tempreserved[TXG_SIZE]; /* amount of space we expect to write; == amount of dirty data */ @@ -99,8 +108,8 @@ int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, void dsl_dir_name(dsl_dir_t *dd, char *buf); int dsl_dir_namelen(dsl_dir_t *dd); int dsl_dir_is_private(dsl_dir_t *dd); -uint64_t dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx); -void dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx); +uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, + const char *name, dmu_tx_t *tx); dsl_checkfunc_t dsl_dir_destroy_check; dsl_syncfunc_t dsl_dir_destroy_sync; void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); @@ -109,18 +118,26 @@ uint64_t dsl_dir_space_available(dsl_dir_t *dd, void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, - uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx); + uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep, + dmu_tx_t *tx); void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); -void dsl_dir_diduse_space(dsl_dir_t *dd, +void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); +void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); int dsl_dir_set_quota(const char *ddname, uint64_t quota); int dsl_dir_set_reservation(const char *ddname, uint64_t reservation); int dsl_dir_rename(dsl_dir_t *dd, const char *newname); int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); +int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx); +boolean_t dsl_dir_is_clone(dsl_dir_t *dd); +void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, + uint64_t reservation, cred_t *cr, dmu_tx_t *tx); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" +#define ORIGIN_DIR_NAME "$ORIGIN" #ifdef ZFS_DEBUG #define dprintf_dd(dd, fmt, ...) 
do { \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h index f7ec67a0e062..4dd88fe6fa55 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h @@ -19,19 +19,18 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DSL_POOL_H #define _SYS_DSL_POOL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/txg.h> #include <sys/txg_impl.h> #include <sys/zfs_context.h> +#include <sys/zio.h> #ifdef __cplusplus extern "C" { @@ -39,6 +38,16 @@ extern "C" { struct objset; struct dsl_dir; +struct dsl_dataset; +struct dsl_pool; +struct dmu_tx; + +enum scrub_func { + SCRUB_FUNC_NONE, + SCRUB_FUNC_CLEAN, + SCRUB_FUNC_NUMFUNCS +}; + typedef struct dsl_pool { /* Immutable */ @@ -46,11 +55,31 @@ typedef struct dsl_pool { struct objset *dp_meta_objset; struct dsl_dir *dp_root_dir; struct dsl_dir *dp_mos_dir; + struct dsl_dataset *dp_origin_snap; uint64_t dp_root_dir_obj; /* No lock needed - sync context only */ blkptr_t dp_meta_rootbp; - list_t dp_synced_objsets; + list_t dp_synced_datasets; + hrtime_t dp_read_overhead; + uint64_t dp_throughput; + uint64_t dp_write_limit; + + /* Uses dp_lock */ + kmutex_t dp_lock; + uint64_t dp_space_towrite[TXG_SIZE]; + uint64_t dp_tempreserved[TXG_SIZE]; + + enum scrub_func dp_scrub_func; + uint64_t dp_scrub_queue_obj; + uint64_t dp_scrub_min_txg; + uint64_t dp_scrub_max_txg; + zbookmark_t dp_scrub_bookmark; + boolean_t dp_scrub_pausing; + boolean_t dp_scrub_isresilver; + uint64_t dp_scrub_start_time; + kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */ + boolean_t dp_scrub_restart; /* Has its own locking */ tx_state_t dp_tx; @@ -69,11 +98,26 @@ typedef struct dsl_pool { int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); void dsl_pool_close(dsl_pool_t *dp); -dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg); +dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); void dsl_pool_zil_clean(dsl_pool_t *dp); int dsl_pool_sync_context(dsl_pool_t *dp); uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); +int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx); +void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); +void dsl_pool_memory_pressure(dsl_pool_t *dp); +void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); +int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, + zio_done_func_t *done, void *private, uint32_t arc_flags); +void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); + +int dsl_pool_scrub_cancel(dsl_pool_t *dp); +int dsl_pool_scrub_clean(dsl_pool_t *dp); +void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_scrub_restart(dsl_pool_t *dp); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h index d2debff8b8c0..d66caa86cff6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h 
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,6 +37,7 @@ extern "C" { #endif struct dsl_dataset; +struct dsl_dir; /* The callback func may not call into the DMU or DSL! */ typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval); @@ -59,12 +60,16 @@ int dsl_prop_get(const char *ddname, const char *propname, int intsz, int numints, void *buf, char *setpoint); int dsl_prop_get_integer(const char *ddname, const char *propname, uint64_t *valuep, char *setpoint); -int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); +int dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local); +int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, + int intsz, int numints, void *buf, char *setpoint); +int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, + int intsz, int numints, void *buf, char *setpoint); int dsl_prop_set(const char *ddname, const char *propname, int intsz, int numints, const void *buf); -int dsl_prop_set_dd(dsl_dir_t *dd, const char *propname, - int intsz, int numints, const void *buf); +void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, + cred_t *cr, dmu_tx_t *tx); void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); void dsl_prop_nvlist_add_string(nvlist_t *nv, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h index e695b182f74b..4995bfe5acca 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -38,7 +38,7 @@ extern "C" { struct dsl_pool; typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); -typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *); +typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *); typedef struct dsl_sync_task { list_node_t dst_node; @@ -53,9 +53,11 @@ typedef struct dsl_sync_task_group { txg_node_t dstg_node; list_t dstg_tasks; struct dsl_pool *dstg_pool; + cred_t *dstg_cr; uint64_t dstg_txg; int dstg_err; int dstg_space; + boolean_t dstg_nowaiter; } dsl_sync_task_group_t; dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp); @@ -63,12 +65,16 @@ void dsl_sync_task_create(dsl_sync_task_group_t *dstg, dsl_checkfunc_t *, dsl_syncfunc_t *, void *arg1, void *arg2, int blocks_modified); int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg); +void dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg); void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); int dsl_sync_task_do(struct dsl_pool *dp, dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, void *arg1, void *arg2, int blocks_modified); +void dsl_sync_task_do_nowait(struct dsl_pool *dp, + dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, + void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h index 095dd3ce2464..1c9d89e8fd69 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_METASLAB_H #define _SYS_METASLAB_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/space_map.h> #include <sys/txg.h> @@ -47,8 +45,12 @@ extern void metaslab_fini(metaslab_t *msp); extern void metaslab_sync(metaslab_t *msp, uint64_t txg); extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); -extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, - int ncopies, uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid); +#define METASLAB_HINTBP_FAVOR 0x0 +#define METASLAB_HINTBP_AVOID 0x1 +#define METASLAB_GANG_HEADER 0x2 + +extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags); extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now); extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h index c64c6627f783..e84b1bf65f99 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -28,6 +28,8 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#include <sys/cdefs.h> +#include <sys/types.h> #include_next <sys/refcount.h> #include <sys/list.h> #include <sys/zfs_context.h> @@ -59,7 +61,7 @@ typedef struct refcount { int64_t rc_removed_count; } refcount_t; -/* Note: refcount_t should be initialized to zero before use. */ +/* Note: refcount_t must be initialized with refcount_create() */ void refcount_create(refcount_t *rc); void refcount_destroy(refcount_t *rc); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h new file mode 100644 index 000000000000..760fc822db56 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h @@ -0,0 +1,79 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_RR_RW_LOCK_H +#define _SYS_RR_RW_LOCK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +/* + * A reader-writer lock implementation that allows re-entrant reads, but + * still gives writers priority on "new" reads. + * + * See rrwlock.c for more details about the implementation. + * + * Fields of the rrwlock_t structure: + * - rr_lock: protects modification and reading of rrwlock_t fields + * - rr_cv: cv for waking up readers or waiting writers + * - rr_writer: thread id of the current writer + * - rr_anon_rcount: number of active anonymous readers + * - rr_linked_rcount: total number of non-anonymous active readers + * - rr_writer_wanted: a writer wants the lock + */ +typedef struct rrwlock { + kmutex_t rr_lock; + kcondvar_t rr_cv; + kthread_t *rr_writer; + refcount_t rr_anon_rcount; + refcount_t rr_linked_rcount; + boolean_t rr_writer_wanted; +} rrwlock_t; + +/* + * 'tag' is used in reference count tracking. The + * 'tag' must be the same in an rrw_enter() as in its + * corresponding rrw_exit(). 
+ */ +void rrw_init(rrwlock_t *rrl); +void rrw_destroy(rrwlock_t *rrl); +void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag); +void rrw_exit(rrwlock_t *rrl, void *tag); +boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); + +#define RRW_READ_HELD(x) rrw_held(x, RW_READER) +#define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_RR_RW_LOCK_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index f0eb2e171aad..99bcb915911e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SPA_H #define _SYS_SPA_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/avl.h> #include <sys/zfs_context.h> #include <sys/nvpair.h> @@ -47,6 +45,7 @@ typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; typedef struct zilog zilog_t; typedef struct traverse_handle traverse_handle_t; +typedef struct spa_aux_vdev spa_aux_vdev_t; struct dsl_pool; /* @@ -88,6 +87,11 @@ struct dsl_pool; #define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1) /* + * Size of block to hold the configuration data (a packed nvlist) + */ +#define SPA_CONFIG_BLOCKSIZE (1 << 14) + +/* * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. * The ASIZE encoding should be at least 64 times larger (6 more bits) * to support up to 4-way RAID-Z mirror mode with worst-case gang block @@ -258,7 +262,6 @@ typedef struct blkptr { ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ ((zc1).zc_word[3] - (zc2).zc_word[3]))) - #define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) #define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ @@ -291,6 +294,8 @@ typedef struct blkptr { ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } +#define BLK_FILL_ALREADY_FREED (-1ULL) + /* * Note: the byteorder is either 0 or -1, both of which are palindromes. * This simplifies the endianness handling a bit. 
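The rrwlock interface introduced above is easiest to follow with a usage sketch. The consumer below is purely illustrative (the lock variable and functions are hypothetical, not part of this change); it assumes a kernel/zfs_context build where the conventional ZFS FTAG macro is available to serve as the reference-count tag:

	#include <sys/rrwlock.h>

	static rrwlock_t example_lock;		/* hypothetical consumer lock */

	static void
	example_init(void)
	{
		rrw_init(&example_lock);
	}

	static void
	example_read_side(void)
	{
		/*
		 * Unlike a plain krwlock_t, a thread that already holds the
		 * lock as RW_READER may enter it again even while a writer
		 * is waiting; only "new" readers defer to a waiting writer.
		 */
		rrw_enter(&example_lock, RW_READER, FTAG);
		rrw_enter(&example_lock, RW_READER, FTAG);	/* re-entrant */
		ASSERT(RRW_READ_HELD(&example_lock));
		rrw_exit(&example_lock, FTAG);	/* each exit pairs its enter's tag */
		rrw_exit(&example_lock, FTAG);
	}

	static void
	example_write_side(void)
	{
		rrw_enter(&example_lock, RW_WRITER, FTAG);
		ASSERT(RRW_WRITE_HELD(&example_lock));
		/* ... exclusive section ... */
		rrw_exit(&example_lock, FTAG);
	}

	static void
	example_fini(void)
	{
		rrw_destroy(&example_lock);
	}
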
@@ -318,23 +323,30 @@ typedef struct blkptr { extern int spa_open(const char *pool, spa_t **, void *tag); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); -extern int spa_create(const char *pool, nvlist_t *config, const char *altroot); -extern int spa_import(const char *pool, nvlist_t *config, const char *altroot); +extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, + const char *history_str, nvlist_t *zplprops); +extern int spa_check_rootconf(char *devpath, char *devid, + nvlist_t **bestconf, uint64_t *besttxg); +extern boolean_t spa_rootdev_validate(nvlist_t *nv); +extern int spa_import_rootpool(char *devpath, char *devid); +extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props); +extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); -extern int spa_export(char *pool, nvlist_t **oldconfig); +extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force); extern int spa_reset(char *pool); extern void spa_async_request(spa_t *spa, int flag); +extern void spa_async_unrequest(spa_t *spa, int flag); extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); -#define SPA_ASYNC_REOPEN 0x01 -#define SPA_ASYNC_REPLACE_DONE 0x02 -#define SPA_ASYNC_SCRUB 0x04 -#define SPA_ASYNC_RESILVER 0x08 -#define SPA_ASYNC_CONFIG_UPDATE 0x10 +#define SPA_ASYNC_CONFIG_UPDATE 0x01 +#define SPA_ASYNC_REMOVE 0x02 +#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_RESILVER_DONE 0x08 +#define SPA_ASYNC_RESILVER 0x10 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); @@ -347,19 +359,27 @@ extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); extern void spa_spare_remove(vdev_t *vd); -extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool); +extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); extern void spa_spare_activate(vdev_t *vd); +/* L2ARC state (which is global across all pools) */ +extern void spa_l2cache_add(vdev_t *vd); +extern void spa_l2cache_remove(vdev_t *vd); +extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); +extern void spa_l2cache_activate(vdev_t *vd); +extern void spa_l2cache_drop(spa_t *spa); +extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc); + /* scrubbing */ -extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force); -extern void spa_scrub_suspend(spa_t *spa); -extern void spa_scrub_resume(spa_t *spa); -extern void spa_scrub_restart(spa_t *spa, uint64_t txg); +extern int spa_scrub(spa_t *spa, pool_scrub_type_t type); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); +/* spa namespace global mutex */ +extern kmutex_t spa_namespace_lock; + /* * SPA configuration functions in spa_config.c */ @@ -367,13 +387,14 @@ extern void spa_sync_allpools(void); #define SPA_CONFIG_UPDATE_POOL 0 #define SPA_CONFIG_UPDATE_VDEVS 1 -extern void spa_config_sync(void); +extern void spa_config_sync(spa_t *, boolean_t, boolean_t); extern void spa_config_load(void); extern nvlist_t *spa_all_configs(uint64_t *); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t 
*spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); +extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot); /* * Miscellaneous SPA routines in spa_misc.c @@ -390,18 +411,34 @@ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); -/* Pool configuration lock */ -extern void spa_config_enter(spa_t *spa, krw_t rw, void *tag); -extern void spa_config_exit(spa_t *spa, void *tag); -extern boolean_t spa_config_held(spa_t *spa, krw_t rw); +#define SCL_CONFIG 0x01 +#define SCL_STATE 0x02 +#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ +#define SCL_ALLOC 0x08 +#define SCL_ZIO 0x10 +#define SCL_FREE 0x20 +#define SCL_VDEV 0x40 +#define SCL_LOCKS 7 +#define SCL_ALL ((1 << SCL_LOCKS) - 1) +#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) + +/* Pool configuration locks */ +extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); +extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); +extern void spa_config_exit(spa_t *spa, int locks, void *tag); +extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); +/* Pool vdev state change lock */ +extern void spa_vdev_state_enter(spa_t *spa); +extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); + /* Accessor functions */ extern krwlock_t *spa_traverse_rwlock(spa_t *spa); -extern int spa_traverse_wanted(spa_t *spa); +extern boolean_t spa_traverse_wanted(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); @@ -414,8 +451,6 @@ extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern int spa_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); -struct metaslab_class; -extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa); extern uint64_t spa_get_alloc(spa_t *spa); extern uint64_t spa_get_space(spa_t *spa); extern uint64_t spa_get_dspace(spa_t *spa); @@ -423,6 +458,8 @@ extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_version(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_busy(void); +extern uint8_t spa_get_failmode(spa_t *spa); +extern boolean_t spa_suspended(spa_t *spa); /* Miscellaneous support routines */ extern int spa_rename(const char *oldname, const char *newname); @@ -432,18 +469,38 @@ extern void spa_strfree(char *); extern uint64_t spa_get_random(uint64_t range); extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); -extern void spa_upgrade(spa_t *spa); +extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); -extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid); +extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, + boolean_t l2cache); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp); +extern boolean_t spa_has_slogs(spa_t *spa); +extern boolean_t spa_is_root(spa_t *spa); /* history logging */ +typedef enum history_log_type { + LOG_CMD_POOL_CREATE, + LOG_CMD_NORMAL, + LOG_INTERNAL +} history_log_type_t; + +typedef struct history_arg { + const char *ha_history_str; 
+ history_log_type_t ha_log_type; + history_internal_events_t ha_event; + char ha_zone[MAXPATHLEN]; +} history_arg_t; + +extern char *spa_his_ievent_table[]; + extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf, - uint64_t pool_create); + history_log_type_t what); +void spa_history_internal_log(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); /* error handling */ struct zbookmark; @@ -451,7 +508,8 @@ struct zio; extern void spa_log_error(spa_t *spa, struct zio *zio); extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, struct zio *zio, uint64_t stateoroffset, uint64_t length); -extern void zfs_post_ok(spa_t *spa, vdev_t *vd); +extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); extern void spa_errlog_rotate(spa_t *spa); @@ -459,15 +517,22 @@ extern void spa_errlog_drain(spa_t *spa); extern void spa_errlog_sync(spa_t *spa, uint64_t txg); extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); +/* vdev cache */ +extern void vdev_cache_stat_init(void); +extern void vdev_cache_stat_fini(void); + /* Initialization and termination */ extern void spa_init(int flags); extern void spa_fini(void); +extern void spa_boot_init(); /* properties */ -extern int spa_set_props(spa_t *spa, nvlist_t *nvp); -extern int spa_get_props(spa_t *spa, nvlist_t **nvp); -extern void spa_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); -extern boolean_t spa_has_bootfs(spa_t *spa); +extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); +extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); +extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); + +/* asynchronous event notification */ +extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h new file mode 100644 index 000000000000..b56073b97516 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_SPA_BOOT_H +#define _SYS_SPA_BOOT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/nvpair.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern char *spa_get_bootprop(char *prop); +extern void spa_free_bootprop(char *prop); +extern int spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf_p); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_BOOT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index 8c57123ad4b8..ab41ba605c6a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SPA_IMPL_H #define _SYS_SPA_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/vdev.h> #include <sys/metaslab.h> @@ -43,13 +41,6 @@ extern "C" { #endif -typedef struct spa_config_lock { - kmutex_t scl_lock; - refcount_t scl_count; - kthread_t *scl_writer; - kcondvar_t scl_cv; -} spa_config_lock_t; - typedef struct spa_error_entry { zbookmark_t se_bookmark; char *se_name; @@ -64,31 +55,61 @@ typedef struct spa_history_phys { uint64_t sh_records_lost; /* num of records overwritten */ } spa_history_phys_t; -typedef struct spa_props { - nvlist_t *spa_props_nvp; - list_node_t spa_list_node; -} spa_props_t; +struct spa_aux_vdev { + uint64_t sav_object; /* MOS object for device list */ + nvlist_t *sav_config; /* cached device config */ + vdev_t **sav_vdevs; /* devices */ + int sav_count; /* number devices */ + boolean_t sav_sync; /* sync the device list */ + nvlist_t **sav_pending; /* pending device additions */ + uint_t sav_npending; /* # pending devices */ +}; + +typedef struct spa_config_lock { + kmutex_t scl_lock; + kthread_t *scl_writer; + int scl_write_wanted; + kcondvar_t scl_cv; + refcount_t scl_count; +} spa_config_lock_t; + +typedef struct spa_config_dirent { + list_node_t scd_link; + char *scd_path; +} spa_config_dirent_t; + +typedef enum spa_log_state { + SPA_LOG_UNKNOWN = 0, /* unknown log state */ + SPA_LOG_MISSING, /* missing log(s) */ + SPA_LOG_CLEAR, /* clear the log(s) */ + SPA_LOG_GOOD, /* log(s) are good */ +} spa_log_state_t; + +enum zio_taskq_type { + ZIO_TASKQ_ISSUE = 0, + ZIO_TASKQ_INTERRUPT, + ZIO_TASKQ_TYPES +}; struct spa { /* * Fields protected by spa_namespace_lock. 
*/ - char *spa_name; /* pool name */ + char spa_name[MAXNAMELEN]; /* pool name */ avl_node_t spa_avl; /* node in spa_namespace_avl */ nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ uint64_t spa_config_txg; /* txg of last config change */ - kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */ int spa_sync_pass; /* iterate-to-convergence */ int spa_state; /* pool state */ int spa_inject_ref; /* injection references */ uint8_t spa_traverse_wanted; /* traverse lock wanted */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ - taskq_t *spa_zio_issue_taskq[ZIO_TYPES]; - taskq_t *spa_zio_intr_taskq[ZIO_TYPES]; + taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; metaslab_class_t *spa_normal_class; /* normal data class */ + metaslab_class_t *spa_log_class; /* intent log data class */ uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ uint64_t spa_freeze_txg; /* freeze pool at this txg */ @@ -96,12 +117,10 @@ struct spa { txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ uint64_t spa_load_guid; /* initial guid for spa_load */ - list_t spa_dirty_list; /* vdevs with dirty labels */ - uint64_t spa_spares_object; /* MOS object for spare list */ - nvlist_t *spa_sparelist; /* cached spare config */ - vdev_t **spa_spares; /* available hot spares */ - int spa_nspares; /* number of hot spares */ - boolean_t spa_sync_spares; /* sync the spares list */ + list_t spa_config_dirty_list; /* vdevs with dirty config */ + list_t spa_state_dirty_list; /* vdevs with dirty state */ + spa_aux_vdev_t spa_spares; /* hot spares */ + spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_syncing_txg; /* txg currently syncing */ uint64_t spa_sync_bplist_obj; /* object for deferred frees */ @@ -110,28 +129,24 @@ struct spa { uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ - kthread_t *spa_scrub_thread; /* scrub/resilver thread */ - traverse_handle_t *spa_scrub_th; /* scrub traverse handle */ - uint64_t spa_scrub_restart_txg; /* need to restart */ - uint64_t spa_scrub_mintxg; /* min txg we'll scrub */ - uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */ uint64_t spa_scrub_errors; /* scrub I/O error count */ - int spa_scrub_suspended; /* tell scrubber to suspend */ - kcondvar_t spa_scrub_cv; /* scrub thread state change */ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ - uint8_t spa_scrub_stop; /* tell scrubber to stop */ uint8_t spa_scrub_active; /* active or suspended? 
*/ uint8_t spa_scrub_type; /* type of scrub we're doing */ uint8_t spa_scrub_finished; /* indicator to rotate logs */ + uint8_t spa_scrub_started; /* started since last boot */ + uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ kmutex_t spa_async_lock; /* protect async state */ kthread_t *spa_async_thread; /* thread doing async task */ int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ uint16_t spa_async_tasks; /* async task mask */ + kmutex_t spa_async_root_lock; /* protects async root count */ + uint64_t spa_async_root_count; /* number of async root zios */ + kcondvar_t spa_async_root_cv; /* notify when count == 0 */ char *spa_root; /* alternate root directory */ - kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */ uint64_t spa_ena; /* spa-wide ereport ENA */ boolean_t spa_last_open_failed; /* true if last open failed */ kmutex_t spa_errlog_lock; /* error log lock */ @@ -144,22 +159,37 @@ struct spa { uint64_t spa_history; /* history object */ kmutex_t spa_history_lock; /* history lock */ vdev_t *spa_pending_vdev; /* pending vdev additions */ - nvlist_t **spa_pending_spares; /* pending spare additions */ - uint_t spa_pending_nspares; /* # pending spares */ kmutex_t spa_props_lock; /* property lock */ uint64_t spa_pool_props_object; /* object for properties */ uint64_t spa_bootfs; /* default boot filesystem */ + uint64_t spa_failmode; /* failure mode for the pool */ + uint64_t spa_delegation; /* delegation on/off */ + list_t spa_config_list; /* previous cache file(s) */ + zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ + kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ + kcondvar_t spa_suspend_cv; /* notification of resume */ + uint8_t spa_suspended; /* pool is suspended */ + boolean_t spa_import_faulted; /* allow faulted vdevs */ + boolean_t spa_is_root; /* pool is root */ + int spa_minref; /* num refs when first opened */ + spa_log_state_t spa_log_state; /* log state */ /* - * spa_refcnt must be the last element because it changes size based on - * compilation options. In order for the MDB module to function - * correctly, the other fields must remain in the same location. + * spa_refcnt & spa_config_lock must be the last elements + * because refcount_t changes size based on compilation options. + * In order for the MDB module to function correctly, the other + * fields must remain in the same location. */ - spa_config_lock_t spa_config_lock; /* configuration changes */ + spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ refcount_t spa_refcount; /* number of opens */ }; -extern const char *spa_config_dir; -extern kmutex_t spa_namespace_lock; +extern const char *spa_config_path; + +#define BOOTFS_COMPRESS_VALID(compress) \ + ((compress) == ZIO_COMPRESS_LZJB || \ + ((compress) == ZIO_COMPRESS_ON && \ + ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \ + (compress) == ZIO_COMPRESS_OFF) #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h index dae129c2e5a4..23bdff211b4a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. 
+ * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -76,6 +75,14 @@ extern void txg_suspend(struct dsl_pool *dp); extern void txg_resume(struct dsl_pool *dp); /* + * Delay the caller by the specified number of ticks or until + * the txg closes (whichever comes first). This is intended + * to be used to throttle writers when the system nears its + * capacity. + */ +extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks); + +/* * Wait until the given transaction group has finished syncing. * Try to make this happen as soon as possible (eg. kick off any * necessary syncs immediately). If txg==0, wait for the currently open @@ -95,7 +102,10 @@ extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg); * Returns TRUE if we are "backed up" waiting for the syncing * transaction to complete; otherwise returns FALSE. */ -extern int txg_stalled(struct dsl_pool *dp); +extern boolean_t txg_stalled(struct dsl_pool *dp); + +/* returns TRUE if someone is waiting for the next txg to sync */ +extern boolean_t txg_sync_waiting(struct dsl_pool *dp); /* * Per-txg object lists. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h index 45a138afaac3..a58be84be5af 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -59,7 +58,7 @@ typedef struct tx_state { kcondvar_t tx_sync_done_cv; kcondvar_t tx_quiesce_more_cv; kcondvar_t tx_quiesce_done_cv; - kcondvar_t tx_timeout_exit_cv; + kcondvar_t tx_timeout_cv; kcondvar_t tx_exit_cv; /* wait for all threads to exit */ uint8_t tx_threads; /* number of threads */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h index ab0f2dcf8c1b..55a0dd5aec0d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -49,7 +49,7 @@ extern "C" { struct uberblock { uint64_t ub_magic; /* UBERBLOCK_MAGIC */ - uint64_t ub_version; /* ZFS_VERSION */ + uint64_t ub_version; /* SPA_VERSION */ uint64_t ub_txg; /* txg of last sync */ uint64_t ub_guid_sum; /* sum of all vdev guids */ uint64_t ub_timestamp; /* UTC time of last sync */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h index c8c177e3ca6c..2ef3093edf1c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,8 +38,12 @@ extern "C" { #define UNIQUE_BITS 56 void unique_init(void); +void unique_fini(void); -/* Return a new unique value. */ +/* + * Return a new unique value (which will not be uniquified against until + * it is unique_insert()-ed). + */ uint64_t unique_create(void); /* Return a unique value, which equals the one passed in if possible. */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h index 31208116256d..013389501e51 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_VDEV_H #define _SYS_VDEV_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/zio.h> #include <sys/dmu.h> @@ -40,35 +38,31 @@ extern "C" { extern boolean_t zfs_nocacheflush; -/* - * Fault injection modes. 
- */ -#define VDEV_FAULT_NONE 0 -#define VDEV_FAULT_RANDOM 1 -#define VDEV_FAULT_COUNT 2 - extern int vdev_open(vdev_t *); extern int vdev_validate(vdev_t *); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); extern void vdev_init(vdev_t *, uint64_t txg); extern void vdev_reopen(vdev_t *); -extern int vdev_validate_spare(vdev_t *); +extern int vdev_validate_aux(vdev_t *vd); +extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); +extern boolean_t vdev_is_bootable(vdev_t *vd); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size); extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done); - -extern const char *vdev_description(vdev_t *vd); +extern boolean_t vdev_resilver_needed(vdev_t *vd, + uint64_t *minp, uint64_t *maxp); extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); -extern void vdev_stat_update(zio_t *zio); +extern void vdev_clear_stats(vdev_t *vd); +extern void vdev_stat_update(zio_t *zio, uint64_t psize); extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete); extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec); @@ -77,24 +71,27 @@ extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux); extern void vdev_space_update(vdev_t *vd, int64_t space_delta, - int64_t alloc_delta); + int64_t alloc_delta, boolean_t update_root); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); -extern void vdev_io_start(zio_t *zio); -extern void vdev_io_done(zio_t *zio); - -extern int vdev_online(spa_t *spa, uint64_t guid); -extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp); +extern int vdev_fault(spa_t *spa, uint64_t guid); +extern int vdev_degrade(spa_t *spa, uint64_t guid); +extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, + vdev_state_t *); +extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); extern void vdev_clear(spa_t *spa, vdev_t *vd); -extern int vdev_error_inject(vdev_t *vd, zio_t *zio); -extern int vdev_is_dead(vdev_t *vd); +extern boolean_t vdev_is_dead(vdev_t *vd); +extern boolean_t vdev_readable(vdev_t *vd); +extern boolean_t vdev_writeable(vdev_t *vd); +extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); extern void vdev_cache_init(vdev_t *vd); extern void vdev_cache_fini(vdev_t *vd); extern int vdev_cache_read(zio_t *zio); extern void vdev_cache_write(zio_t *zio); +extern void vdev_cache_purge(vdev_t *vd); extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); @@ -103,16 +100,20 @@ extern void vdev_queue_io_done(zio_t *zio); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); -extern int vdev_config_sync(vdev_t *vd, uint64_t txg); +extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); + +extern void vdev_state_dirty(vdev_t *vd); +extern void vdev_state_clean(vdev_t *vd); extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, - boolean_t getstats, boolean_t isspare); + boolean_t getstats, boolean_t isspare, boolean_t isl2cache); /* * Label routines */ struct uberblock; extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); +extern int 
vdev_label_number(uint64_t psize, uint64_t offset); extern nvlist_t *vdev_label_read_config(vdev_t *vd); extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub); @@ -120,7 +121,8 @@ typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ VDEV_LABEL_REPLACE, /* replace an existing device */ VDEV_LABEL_SPARE, /* add a new hot spare */ - VDEV_LABEL_REMOVE /* remove an existing device */ + VDEV_LABEL_REMOVE, /* remove an existing device */ + VDEV_LABEL_L2CACHE /* add an L2ARC cache device */ } vdev_labeltype_t; extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h index 95536a77db9a..b748571ea0c3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,6 +30,8 @@ #include <sys/vdev.h> #ifdef _KERNEL +#include <sys/buf.h> +#include <sys/ddi.h> #include <sys/sunldi.h> #include <sys/sunddi.h> #endif @@ -45,6 +46,9 @@ typedef struct vdev_disk { ldi_handle_t vd_lh; } vdev_disk_t; +#ifdef _KERNEL +extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); +#endif #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index aba756713f9c..7e24edea7f38 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_VDEV_IMPL_H #define _SYS_VDEV_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/avl.h> #include <sys/dmu.h> #include <sys/metaslab.h> @@ -61,7 +59,7 @@ typedef struct vdev_cache_entry vdev_cache_entry_t; typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); -typedef void vdev_io_start_func_t(zio_t *zio); +typedef int vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); @@ -140,9 +138,12 @@ struct vdev { txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ - uint8_t vdev_reopen_wanted; /* async reopen wanted? */ - list_node_t vdev_dirty_node; /* config dirty list */ + boolean_t vdev_remove_wanted; /* async remove wanted? 
*/ + boolean_t vdev_probe_wanted; /* async probe wanted? */ + list_node_t vdev_config_dirty_node; /* config dirty list */ + list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ + uint64_t vdev_islog; /* is an intent log device */ /* * Leaf vdev state. @@ -151,22 +152,30 @@ struct vdev { space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ - uint64_t vdev_offline; /* device taken offline? */ + uint64_t vdev_offline; /* persistent offline state */ + uint64_t vdev_faulted; /* persistent faulted state */ + uint64_t vdev_degraded; /* persistent degraded state */ + uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ - uint64_t vdev_fault_arg; /* fault injection paramater */ - int vdev_fault_mask; /* zio types to fault */ - uint8_t vdev_fault_mode; /* fault injection mode */ - uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */ + char *vdev_physpath; /* vdev device path (if any) */ + uint64_t vdev_not_present; /* not present during import */ + uint64_t vdev_unspare; /* unspare when resilvering done */ + hrtime_t vdev_last_try; /* last reopen time */ + boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + boolean_t vdev_checkremove; /* temporary online test */ + boolean_t vdev_forcefault; /* force online fault */ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ uint8_t vdev_detached; /* device detached? */ - uint64_t vdev_isspare; /* was a hot spare */ + uint8_t vdev_cant_read; /* vdev is failing all reads */ + uint8_t vdev_cant_write; /* vdev is failing all writes */ + uint64_t vdev_isspare; /* was a hot spare */ + uint64_t vdev_isl2cache; /* was a l2cache device */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ vdev_cache_t vdev_cache; /* physical block cache */ - uint64_t vdev_not_present; /* not present during import */ - hrtime_t vdev_last_try; /* last reopen time */ - boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */ + zio_t *vdev_probe_zio; /* root of current probe */ /* * For DTrace to work in userland (libzpool) context, these fields must @@ -177,6 +186,7 @@ struct vdev { */ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ kmutex_t vdev_stat_lock; /* vdev_stat */ + kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ }; #define VDEV_SKIP_SIZE (8 << 10) @@ -239,6 +249,7 @@ typedef struct vdev_label { #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 #define VDEV_ALLOC_SPARE 2 +#define VDEV_ALLOC_L2CACHE 3 /* * Allocate or free a vdev @@ -275,8 +286,8 @@ extern vdev_ops_t vdev_raidz_ops; extern vdev_ops_t vdev_geom_ops; #else extern vdev_ops_t vdev_disk_ops; -extern vdev_ops_t vdev_file_ops; #endif +extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_spare_ops; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h index f89d9385ea38..f88cc068bd57 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,7 +31,7 @@ /* * ZAP - ZFS Attribute Processor * - * The ZAP is a module which sits on top of the DMU (Data Managemnt + * The ZAP is a module which sits on top of the DMU (Data Management * Unit) and implements a higher-level storage primitive using DMU * objects. Its primary consumer is the ZPL (ZFS Posix Layer). * @@ -91,10 +91,38 @@ extern "C" { #define ZAP_MAXVALUELEN 1024 /* + * The matchtype specifies which entry will be accessed. + * MT_EXACT: only find an exact match (non-normalized) + * MT_FIRST: find the "first" normalized (case and Unicode + * form) match; the designated "first" match will not change as long + * as the set of entries with this normalization doesn't change + * MT_BEST: if there is an exact match, find that, otherwise find the + * first normalized match + */ +typedef enum matchtype +{ + MT_EXACT, + MT_BEST, + MT_FIRST +} matchtype_t; + +/* * Create a new zapobj with no attributes and return its object number. + * MT_EXACT will cause the zap object to only support MT_EXACT lookups, + * otherwise any matchtype can be used for lookups. + * + * normflags specifies what normalization will be done. values are: + * 0: no normalization (legacy on-disk format, supports MT_EXACT matching + * only) + * U8_TEXTPREP_TOLOWER: case normalization will be performed. + * MT_FIRST/MT_BEST matching will find entries that match without + * regard to case (eg. looking for "foo" can find an entry "Foo"). + * Eventually, other flags will permit unicode normalization as well. */ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); /* * Create a new zapobj with no attributes from the given (unallocated) @@ -102,6 +130,9 @@ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, */ int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +int zap_create_claim_norm(objset_t *ds, uint64_t obj, + int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); /* * The zapobj passed in must be a valid ZAP object for all of the @@ -140,9 +171,20 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); * If the attribute is longer than the buffer, as many integers as will * fit will be transferred to 'buf'. If the entire attribute was not * transferred, the call will return EOVERFLOW. + * + * If rn_len is nonzero, realname will be set to the name of the found + * entry (which may be different from the requested name if matchtype is + * not MT_EXACT). + * + * If normalization_conflictp is not NULL, it will be set if there is + * another name with the same case/unicode normalized form. */ int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *normalization_conflictp); /* * Create an attribute with the given name and value. @@ -182,6 +224,8 @@ int zap_length(objset_t *ds, uint64_t zapobj, const char *name, * return ENOENT. 
*/ int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); +int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, + matchtype_t mt, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap @@ -191,11 +235,28 @@ int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); /* - * Returns (in name) the name of the entry whose value + * Returns (in name) the name of the entry whose (value & mask) * (za_first_integer) is value, or ENOENT if not found. The string - * pointed to by name must be at least 256 bytes long. + * pointed to by name must be at least 256 bytes long. If mask==0, the + * match must be exact (ie, same as mask=-1ULL). */ -int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name); +int zap_value_search(objset_t *os, uint64_t zapobj, + uint64_t value, uint64_t mask, char *name); + +/* + * Transfer all the entries from fromobj into intoobj. Only works on + * int_size=8 num_integers=1 values. Fails if there are any duplicated + * entries. + */ +int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); + +/* + * Manipulate entries where the name + value are the "same" (the name is + * a stringified version of the value). + */ +int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); +int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); +int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); struct zap; struct zap_leaf; @@ -211,6 +272,11 @@ typedef struct zap_cursor { typedef struct { int za_integer_length; + /* + * za_normalization_conflict will be set if there are additional + * entries with this normalized form (eg, "foo" and "Foo"). + */ + boolean_t za_normalization_conflict; uint64_t za_num_integers; uint64_t za_first_integer; /* no sign extension for <8byte ints */ char za_name[MAXNAMELEN]; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h index 4e43f4ae49a1..0dc02ab6b0ac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -59,7 +59,8 @@ typedef struct mzap_ent_phys { typedef struct mzap_phys { uint64_t mz_block_type; /* ZBT_MICRO */ uint64_t mz_salt; - uint64_t mz_pad[6]; + uint64_t mz_normflags; + uint64_t mz_pad[5]; mzap_ent_phys_t mz_chunk[1]; /* actually variable size depending on block size */ } mzap_phys_t; @@ -127,6 +128,7 @@ typedef struct zap_phys { uint64_t zap_num_leafs; /* number of leafs */ uint64_t zap_num_entries; /* number of entries */ uint64_t zap_salt; /* salt to stir into hash function */ + uint64_t zap_normflags; /* flags for u8_textprep_str() */ /* * This structure is followed by padding, and then the embedded * pointer table. 
The embedded pointer table takes up second @@ -142,7 +144,8 @@ typedef struct zap { uint64_t zap_object; struct dmu_buf *zap_dbuf; krwlock_t zap_rwlock; - int zap_ismicro; + boolean_t zap_ismicro; + int zap_normflags; uint64_t zap_salt; union { struct { @@ -165,34 +168,45 @@ typedef struct zap { } zap_u; } zap_t; +typedef struct zap_name { + zap_t *zn_zap; + const char *zn_name_orij; + uint64_t zn_hash; + matchtype_t zn_matchtype; + const char *zn_name_norm; + char zn_normbuf[ZAP_MAXNAMELEN]; +} zap_name_t; + #define zap_f zap_u.zap_fat #define zap_m zap_u.zap_micro -uint64_t zap_hash(zap_t *zap, const char *name); +boolean_t zap_match(zap_name_t *zn, const char *matchname); int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, int fatreader, zap_t **zapp); + krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); void zap_unlockdir(zap_t *zap); void zap_evict(dmu_buf_t *db, void *vmzap); +zap_name_t *zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt); +void zap_name_free(zap_name_t *zn); #define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) void fzap_byteswap(void *buf, size_t size); int fzap_count(zap_t *zap, uint64_t *count); -int fzap_lookup(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf); -int fzap_add(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, +int fzap_lookup(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, void *buf, + char *realname, int rn_len, boolean_t *normalization_conflictp); +int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); -int fzap_update(zap_t *zap, const char *name, +int fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); -int fzap_length(zap_t *zap, const char *name, +int fzap_length(zap_name_t *zn, uint64_t *integer_size, uint64_t *num_integers); -int fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx); +int fzap_remove(zap_name_t *zn, dmu_tx_t *tx); int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); void fzap_get_stats(zap_t *zap, zap_stats_t *zs); void zap_put_leaf(struct zap_leaf *l); -int fzap_add_cd(zap_t *zap, const char *name, +int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, dmu_tx_t *tx); void fzap_upgrade(zap_t *zap, dmu_tx_t *tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h index 147fb7212454..14144e059e54 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -92,6 +92,8 @@ typedef enum zap_chunk_type { ZAP_CHUNK_TYPE_MAX = 250 } zap_chunk_type_t; +#define ZLF_ENTRIES_CDSORTED (1<<0) + /* * TAKE NOTE: * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. 
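To make the new matchtype plumbing concrete, here is an editorial sketch (not part of the diff) of how a consumer would combine zap_create_norm() and zap_lookup_norm() from zap.h above. It assumes an open objset_t *os and an assigned dmu_tx_t *tx, and uses zap_add(), whose prototype is unchanged by this commit:

	/* Create a ZAP whose names are case-normalized, then add one entry. */
	uint64_t obj = zap_create_norm(os, U8_TEXTPREP_TOLOWER,
	    DMU_OT_DIRECTORY_CONTENTS, DMU_OT_NONE, 0, tx);
	uint64_t val = 42, out;
	char realname[MAXNAMELEN];
	boolean_t conflict;

	VERIFY(zap_add(os, obj, "Foo", 8, 1, &val, tx) == 0);

	/*
	 * MT_FIRST matches on the normalized form, so looking up "foo"
	 * finds "Foo"; realname receives the stored spelling, and
	 * conflict is set only if another entry (say "FOO") shares the
	 * same normalized form.
	 */
	VERIFY(zap_lookup_norm(os, obj, "foo", 8, 1, &out, MT_FIRST,
	    realname, sizeof (realname), &conflict) == 0);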
@@ -109,7 +111,8 @@ typedef struct zap_leaf_phys { /* above is accessible to zap, below is zap_leaf private */ uint16_t lh_freelist; /* chunk head of free list */ - uint8_t lh_pad2[12]; + uint8_t lh_flags; /* ZLF_* flags */ + uint8_t lh_pad2[11]; } l_hdr; /* 2 24-byte chunks */ /* @@ -148,7 +151,7 @@ typedef union zap_leaf_chunk { } zap_leaf_chunk_t; typedef struct zap_leaf { - krwlock_t l_rwlock; /* only used on head of chain */ + krwlock_t l_rwlock; uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */ int l_bs; /* block size shift */ dmu_buf_t *l_dbuf; @@ -174,7 +177,7 @@ typedef struct zap_entry_handle { * value must equal zap_hash(name). */ extern int zap_leaf_lookup(zap_leaf_t *l, - const char *name, uint64_t h, zap_entry_handle_t *zeh); + zap_name_t *zn, zap_entry_handle_t *zeh); /* * Return a handle to the entry with this hash+cd, or the entry with the @@ -219,12 +222,19 @@ extern int zap_entry_create(zap_leaf_t *l, zap_entry_handle_t *zeh); /* + * Return true if there are additional entries with the same normalized + * form. + */ +extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, + zap_name_t *zn, const char *name, zap_t *zap); + +/* * Other stuff. */ -extern void zap_leaf_init(zap_leaf_t *l); +extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort); extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); -extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl); +extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort); extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs); #ifdef __cplusplus diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h index 3250b760fb07..fe953184db44 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,6 +33,7 @@ #endif #include <sys/acl.h> #include <sys/dmu.h> +#include <sys/zfs_fuid.h> #ifdef __cplusplus extern "C" { @@ -40,33 +41,131 @@ extern "C" { struct znode_phys; -#define ACCESS_UNDETERMINED -1 - #define ACE_SLOT_CNT 6 +#define ZFS_ACL_VERSION_INITIAL 0ULL +#define ZFS_ACL_VERSION_FUID 1ULL +#define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID + +/* + * ZFS ACLs are stored in various forms. + * Files created with ACL version ZFS_ACL_VERSION_INITIAL + * will all be created with fixed length ACEs of type + * zfs_oldace_t. + * + * Files with ACL version ZFS_ACL_VERSION_FUID will be created + * with various sized ACEs. The abstraction entries will utilize + * zfs_ace_hdr_t, normal user/group entries will use zfs_ace_t + * and some specialized CIFS ACEs will use zfs_object_ace_t. + */ + +/* + * All ACEs have a common hdr. For + * owner@, group@, and everyone@ this is all + * that's needed. + */ +typedef struct zfs_ace_hdr { + uint16_t z_type; + uint16_t z_flags; + uint32_t z_access_mask; +} zfs_ace_hdr_t; + +typedef zfs_ace_hdr_t zfs_ace_abstract_t; + +/* + * Standard ACE + */ +typedef struct zfs_ace { + zfs_ace_hdr_t z_hdr; + uint64_t z_fuid; +} zfs_ace_t; + +/* + * The following type only applies to ACE_ACCESS_ALLOWED|DENIED_OBJECT_ACE_TYPE + * and will only be set/retrieved in a CIFS context.
+ */ -typedef struct zfs_znode_acl { +typedef struct zfs_object_ace { + zfs_ace_t z_ace; + uint8_t z_object_type[16]; /* object type */ + uint8_t z_inherit_type[16]; /* inherited object type */ +} zfs_object_ace_t; + +typedef struct zfs_oldace { + uint32_t z_fuid; /* "who" */ + uint32_t z_access_mask; /* access mask */ + uint16_t z_flags; /* flags, i.e inheritance */ + uint16_t z_type; /* type of entry allow/deny */ +} zfs_oldace_t; + +typedef struct zfs_acl_phys_v0 { + uint64_t z_acl_extern_obj; /* ext acl pieces */ + uint32_t z_acl_count; /* Number of ACEs */ + uint16_t z_acl_version; /* acl version */ + uint16_t z_acl_pad; /* pad */ + zfs_oldace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ +} zfs_acl_phys_v0_t; + +#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT) + +typedef struct zfs_acl_phys { uint64_t z_acl_extern_obj; /* ext acl pieces */ - uint32_t z_acl_count; /* Number of ACEs */ + uint32_t z_acl_size; /* Number of bytes in ACL */ uint16_t z_acl_version; /* acl version */ - uint16_t z_acl_pad; /* pad */ - ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ -} zfs_znode_acl_t; - -#define ACL_DATA_ALLOCED 0x1 + uint16_t z_acl_count; /* ace count */ + uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ +} zfs_acl_phys_t; + + + +typedef struct acl_ops { + uint32_t (*ace_mask_get) (void *acep); /* get access mask */ + void (*ace_mask_set) (void *acep, + uint32_t mask); /* set access mask */ + uint16_t (*ace_flags_get) (void *acep); /* get flags */ + void (*ace_flags_set) (void *acep, + uint16_t flags); /* set flags */ + uint16_t (*ace_type_get)(void *acep); /* get type */ + void (*ace_type_set)(void *acep, + uint16_t type); /* set type */ + uint64_t (*ace_who_get)(void *acep); /* get who/fuid */ + void (*ace_who_set)(void *acep, + uint64_t who); /* set who/fuid */ + size_t (*ace_size)(void *acep); /* how big is this ace */ + size_t (*ace_abstract_size)(void); /* sizeof abstract entry */ + int (*ace_mask_off)(void); /* off of access mask in ace */ + int (*ace_data)(void *acep, void **datap); + /* ptr to data if any */ +} acl_ops_t; /* - * Max ACL size is prepended deny for all entries + the - * canonical six tacked on * the end. + * A zfs_acl_t structure is composed of a list of zfs_acl_node_t's. + * Each node will have one or more ACEs associated with it. You will + * only have multiple nodes during a chmod operation. Normally only + * one node is required. */ -#define MAX_ACL_SIZE (MAX_ACL_ENTRIES * 2 + 6) +typedef struct zfs_acl_node { + list_node_t z_next; /* Next chunk of ACEs */ + void *z_acldata; /* pointer into actual ACE(s) */ + void *z_allocdata; /* pointer to kmem allocated memory */ + size_t z_allocsize; /* Size of blob in bytes */ + size_t z_size; /* length of ACL data */ + int z_ace_count; /* number of ACEs in this acl node */ + int z_ace_idx; /* ace iterator positioned on */ +} zfs_acl_node_t; typedef struct zfs_acl { - int z_slots; /* number of allocated slots for ACEs */ - int z_acl_count; - uint_t z_state; - ace_t *z_acl; + int z_acl_count; /* Number of ACEs */ + size_t z_acl_bytes; /* Number of bytes in ACL */ + uint_t z_version; /* version of ACL */ + void *z_next_ace; /* pointer to next ACE */ + int z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */ + zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ + list_t z_acl; /* chunks of ACE data */ + acl_ops_t z_ops; /* ACL operations */ + boolean_t z_has_fuids; /* FUIDs present in ACL? 
*/ } zfs_acl_t; +#define ACL_DATA_ALLOCED 0x1 #define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) /* @@ -80,31 +179,34 @@ typedef struct zfs_acl { #define ZFS_ACL_NOALLOW 1 #define ZFS_ACL_GROUPMASK 2 #define ZFS_ACL_PASSTHROUGH 3 -#define ZFS_ACL_SECURE 4 +#define ZFS_ACL_RESTRICTED 4 struct znode; +struct zfsvfs; +struct zfs_fuid_info; #ifdef _KERNEL void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *, - dmu_tx_t *, cred_t *); -#ifdef TODO -int zfs_getacl(struct znode *, vsecattr_t *, cred_t *); -#endif -int zfs_mode_update(struct znode *, uint64_t, dmu_tx_t *); + dmu_tx_t *, cred_t *, zfs_acl_t *, zfs_fuid_info_t **); #ifdef TODO -int zfs_setacl(struct znode *, vsecattr_t *, cred_t *); +int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); +int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); #endif void zfs_acl_rele(void *); -void zfs_ace_byteswap(ace_t *, int); -extern int zfs_zaccess(struct znode *, int, cred_t *); -extern int zfs_zaccess_rwx(struct znode *, mode_t, cred_t *); +void zfs_oldace_byteswap(ace_t *, int); +void zfs_ace_byteswap(void *, size_t, boolean_t); +extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); +extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); +extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); extern int zfs_acl_access(struct znode *, int, cred_t *); -int zfs_acl_chmod_setattr(struct znode *, uint64_t, dmu_tx_t *); +int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); int zfs_zaccess_rename(struct znode *, struct znode *, struct znode *, struct znode *, cred_t *cr); -int zfs_zaccess_v4_perm(struct znode *, int, cred_t *); void zfs_acl_free(zfs_acl_t *); +int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, zfs_acl_t **); +int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, + struct zfs_fuid_info **, dmu_tx_t *); #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h index 4deeb3c9bf75..76fdc0dce7a5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -45,6 +44,7 @@ extern "C" { #include <sys/cmn_err.h> #include <sys/kmem.h> #include <sys/taskq.h> +#include <sys/taskqueue.h> #include <sys/systm.h> #include <sys/conf.h> #include <sys/mutex.h> @@ -73,11 +73,19 @@ extern "C" { #include <sys/ktr.h> #include <sys/stack.h> #include <sys/lockf.h> +#include <sys/pathname.h> #include <sys/policy.h> +#include <sys/refstr.h> #include <sys/zone.h> #include <sys/eventhandler.h> +#include <sys/extattr.h> #include <sys/misc.h> +#include <sys/sig.h> +#include <sys/osd.h> #include <sys/zfs_debug.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/u8_textprep.h> +#include <sys/fm/util.h> #include <machine/stdarg.h> @@ -99,6 +107,14 @@ extern "C" { #define CPU_SEQID (curcpu) +#define tsd_create(keyp, destructor) do { \ + *(keyp) = osd_thread_register((destructor)); \ + KASSERT(*(keyp) > 0, ("cannot register OSD")); \ +} while (0) +#define tsd_destroy(keyp) osd_thread_deregister(*(keyp)) +#define tsd_get(key) osd_thread_get(curthread, (key)) +#define tsd_set(key, value) osd_thread_set(curthread, (key), (value)) + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h index a676533ac4a2..905e8dd2c0e3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
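A quick editorial sketch (not part of the diff) of the tsd_*() shims that zfs_context.h above maps onto FreeBSD's per-thread OSD; example_key is hypothetical, while zfs_fsyncer_key (declared later in zfs_vfsops.h) is the in-tree consumer of these macros:

	static uint_t example_key;	/* hypothetical TSD key */
	void *val;

	tsd_create(&example_key, NULL);		/* registers an OSD slot; NULL destructor */
	tsd_set(example_key, (void *)curthread);	/* value is private to curthread */
	val = tsd_get(example_key);		/* NULL on threads that never set it */
	tsd_destroy(&example_key);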
*/ @@ -57,7 +56,8 @@ int zfsctl_destroy_snapshot(const char *snapname, int force); int zfsctl_umount_snapshots(vfs_t *, int, cred_t *); int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, - int flags, vnode_t *rdir, cred_t *cr); + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp); int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h index f60d614953f3..ebb66e8ae4e9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h @@ -28,6 +28,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#include <sys/pathname.h> #include <sys/dmu.h> #include <sys/zfs_znode.h> @@ -41,6 +42,8 @@ extern "C" { #define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ #define ZXATTR 0x0008 /* we want the xattr dir */ #define ZRENAMING 0x0010 /* znode is being renamed */ +#define ZCILOOK 0x0020 /* case-insensitive lookup requested */ +#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ /* mknode flags */ #define IS_ROOT_NODE 0x01 /* create a root node */ @@ -48,15 +51,17 @@ extern "C" { #define IS_REPLAY 0x04 /* we are replaying intent log */ extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, - int); + int, int *, pathname_t *); extern void zfs_dirent_unlock(zfs_dirlock_t *); extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, boolean_t *); -extern int zfs_dirlook(znode_t *, char *, vnode_t **); -extern void zfs_mknode(znode_t *, vattr_t *, uint64_t *, - dmu_tx_t *, cred_t *, uint_t, znode_t **, int); +extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *, + pathname_t *); +extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, + uint_t, znode_t **, int, zfs_acl_t *, zfs_fuid_info_t **); extern void zfs_rmnode(znode_t *); +extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); extern boolean_t zfs_dirempty(znode_t *); extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h new file mode 100644 index 000000000000..8d73b41938df --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_FS_ZFS_FUID_H +#define _SYS_FS_ZFS_FUID_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#ifdef _KERNEL +#include <sys/kidmap.h> +#include <sys/dmu.h> +#include <sys/zfs_vfsops.h> +#endif +#include <sys/avl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_OWNER, + ZFS_GROUP, + ZFS_ACE_USER, + ZFS_ACE_GROUP +} zfs_fuid_type_t; + +/* + * Estimate space needed for one more fuid table entry. + * for now assume its current size + 1K + */ +#define FUID_SIZE_ESTIMATE(z) (z->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) + +#define FUID_INDEX(x) (x >> 32) +#define FUID_RID(x) (x & 0xffffffff) +#define FUID_ENCODE(idx, rid) ((idx << 32) | rid) +/* + * FUIDs cause problems for the intent log + * we need to replay the creation of the FUID, + * but we can't count on the idmapper to be around + * and during replay the FUID index may be different than + * before. Also, if an ACL has 100 ACEs and 12 different + * domains we don't want to log 100 domain strings, but rather + * just the unique 12. + */ + +/* + * The FUIDs in the log will index into + * domain string table and the bottom half will be the rid. + * Used for mapping ephemeral uid/gid during ACL setting to FUIDs + */ +typedef struct zfs_fuid { + list_node_t z_next; + uint64_t z_id; /* uid/gid being converted to fuid */ + uint64_t z_domidx; /* index in AVL domain table */ + uint64_t z_logfuid; /* index for domain in log */ +} zfs_fuid_t; + +/* list of unique domains */ +typedef struct zfs_fuid_domain { + list_node_t z_next; + uint64_t z_domidx; /* AVL tree idx */ + const char *z_domain; /* domain string */ +} zfs_fuid_domain_t; + +/* + * FUID information necessary for logging create, setattr, and setacl. + */ +typedef struct zfs_fuid_info { + list_t z_fuids; + list_t z_domains; + uint64_t z_fuid_owner; + uint64_t z_fuid_group; + char **z_domain_table; /* Used during replay */ + uint32_t z_fuid_cnt; /* How many fuids in z_fuids */ + uint32_t z_domain_cnt; /* How many domains */ + size_t z_domain_str_sz; /* len of domain strings z_domain list */ +} zfs_fuid_info_t; + +#ifdef _KERNEL +struct znode; +extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); +extern void zfs_fuid_destroy(zfsvfs_t *); +extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t, + dmu_tx_t *, cred_t *, zfs_fuid_info_t **); +extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t, + dmu_tx_t *, zfs_fuid_info_t **); +extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, uid_t *uid, + uid_t *gid); +extern zfs_fuid_info_t *zfs_fuid_info_alloc(void); +extern void zfs_fuid_info_free(); +extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *); +#endif + +char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); +uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *); +void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_FUID_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h index 61a0a9ebdc2e..05a21c846ee8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
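An editorial aside (not part of the diff): the FUID_ENCODE()/FUID_INDEX()/FUID_RID() macros in zfs_fuid.h above pack a domain-table index into the upper 32 bits of a 64-bit FUID and a Windows-style RID into the lower 32. A tiny round-trip sketch with assumed values:

	uint64_t idx = 3;			/* index into the on-disk domain table */
	uint64_t rid = 1106;			/* relative ID within that domain */
	uint64_t fuid = FUID_ENCODE(idx, rid);	/* yields 0x0000000300000452 */

	ASSERT(FUID_INDEX(fuid) == idx);	/* top 32 bits come back out */
	ASSERT(FUID_RID(fuid) == rid);		/* bottom 32 bits come back out */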
*/ @@ -31,6 +31,11 @@ #include <sys/cred.h> #include <sys/dmu.h> #include <sys/zio.h> +#include <sys/dsl_deleg.h> + +#ifdef _KERNEL +#include <sys/nvpair.h> +#endif /* _KERNEL */ #ifdef __cplusplus extern "C" { @@ -42,9 +47,13 @@ extern "C" { #define ZFS_SNAPDIR_HIDDEN 0 #define ZFS_SNAPDIR_VISIBLE 1 -#define DMU_BACKUP_VERSION (1ULL) +#define DMU_BACKUP_STREAM_VERSION (1ULL) +#define DMU_BACKUP_HEADER_VERSION (2ULL) #define DMU_BACKUP_MAGIC 0x2F5bacbacULL +#define DRR_FLAG_CLONE (1<<0) +#define DRR_FLAG_CI_DATA (1<<1) + /* * zfs ioctl command structure */ @@ -53,14 +62,14 @@ typedef struct dmu_replay_record { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, DRR_WRITE, DRR_FREE, DRR_END, } drr_type; - uint32_t drr_pad; + uint32_t drr_payloadlen; union { struct drr_begin { uint64_t drr_magic; uint64_t drr_version; uint64_t drr_creation_time; dmu_objset_type_t drr_type; - uint32_t drr_pad; + uint32_t drr_flags; uint64_t drr_toguid; uint64_t drr_fromguid; char drr_toname[MAXNAMELEN]; @@ -109,48 +118,71 @@ typedef struct zinject_record { uint32_t zi_error; uint64_t zi_type; uint32_t zi_freq; + uint32_t zi_pad; /* pad out to 64 bit alignment */ } zinject_record_t; #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 +typedef struct zfs_share { + uint64_t z_exportdata; + uint64_t z_sharedata; + uint64_t z_sharetype; /* 0 = share, 1 = unshare */ + uint64_t z_sharemax; /* max length of share string */ +} zfs_share_t; + +/* + * ZFS file systems may behave the usual, POSIX-compliant way, where + * name lookups are case-sensitive. They may also be set up so that + * all the name lookups are case-insensitive, or so that only some + * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. + */ +typedef enum zfs_case { + ZFS_CASE_SENSITIVE, + ZFS_CASE_INSENSITIVE, + ZFS_CASE_MIXED +} zfs_case_t; + typedef struct zfs_cmd { char zc_name[MAXPATHLEN]; - char zc_value[MAXPATHLEN * 2]; + char zc_value[MAXPATHLEN]; + char zc_string[MAXNAMELEN]; uint64_t zc_guid; - uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_nvlist_src; /* really (char *) */ uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst; /* really (char *) */ uint64_t zc_nvlist_dst_size; uint64_t zc_cookie; - uint64_t zc_cred; - uint64_t zc_dev; uint64_t zc_objset_type; - uint64_t zc_history; /* really (char *) */ - uint64_t zc_history_len; + uint64_t zc_perm_action; + uint64_t zc_history; /* really (char *) */ + uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; + zfs_share_t zc_share; uint64_t zc_jailid; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; zinject_record_t zc_inject_record; } zfs_cmd_t; -#ifdef _KERNEL -typedef struct zfs_create_data { - cred_t *zc_cred; - dev_t zc_dev; - nvlist_t *zc_props; -} zfs_create_data_t; -#endif - #define ZVOL_MAX_MINOR (1 << 16) #define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) #ifdef _KERNEL -extern int zfs_secpolicy_write(const char *dataset, cred_t *cr); +typedef struct zfs_creat { + nvlist_t *zct_zplprops; + nvlist_t *zct_props; +} zfs_creat_t; + +extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); +extern int zfs_secpolicy_rename_perms(const char *from, + const char *to, cred_t *cr); +extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int zfs_busy(void); extern int zfs_unmount_snap(char *, void *); diff --git 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h index aa82cc178091..8d53c02b77aa 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,6 +31,8 @@ #include <sys/list.h> #include <sys/vfs.h> #include <sys/zil.h> +#include <sys/rrwlock.h> +#include <sys/zfs_ioctl.h> #ifdef __cplusplus extern "C" { @@ -46,35 +48,50 @@ struct zfsvfs { uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */ + uint64_t z_fuid_obj; /* fuid table object number */ + uint64_t z_fuid_size; /* fuid table size */ + avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ + avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ + krwlock_t z_fuid_lock; /* fuid lock */ + boolean_t z_fuid_loaded; /* fuid tables are loaded */ + struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_mode; /* acl chmod/mode behavior */ uint_t z_acl_inherit; /* acl inheritance behavior */ + zfs_case_t z_case; /* case-sense */ + boolean_t z_utf8; /* utf8-only */ + int z_norm; /* normalization flags */ boolean_t z_atime; /* enable atimes mount option */ - boolean_t z_unmounted1; /* unmounted phase 1 */ - boolean_t z_unmounted2; /* unmounted phase 2 */ - uint32_t z_op_cnt; /* vnode/vfs operations ref count */ - krwlock_t z_um_lock; /* rw lock for umount phase 2 */ + boolean_t z_unmounted; /* unmounted */ + rrwlock_t z_teardown_lock; + krwlock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all vnodes in the fs */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ vnode_t *z_ctldir; /* .zfs directory pointer */ boolean_t z_show_ctldir; /* expose .zfs in the root dir */ boolean_t z_issnap; /* true if this is a snapshot */ + boolean_t z_vscan; /* virus scan on/off */ + boolean_t z_use_fuids; /* version allows fuids */ + kmutex_t z_online_recv_lock; /* recv in prog grabs as WRITER */ + uint64_t z_version; /* ZPL version */ #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ }; /* - * The total file ID size is limited to 12 bytes (including the length - * field) in the NFSv2 protocol. For historical reasons, this same limit - * is currently being imposed by the Solaris NFSv3 implementation... - * although the protocol actually permits a maximum of 64 bytes. It will - * not be possible to expand beyond 12 bytes without abandoning support - * of NFSv2 and making some changes to the Solaris NFSv3 implementation. + * Normal filesystems (those not under .zfs/snapshot) have a total + * file ID size limited to 12 bytes (including the length field) due to + * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical + * reasons, this same limit is being imposed by the Solaris NFSv3 implementation + * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It + * is not possible to expand beyond 12 bytes without abandoning support + * of NFSv2. 
* - * For the time being, we will partition up the available space as follows: + * For normal filesystems, we partition up the available space as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) + * * We reserve only 48 bits for the object number, as this is the limit * currently defined and imposed by the DMU. */ @@ -84,6 +101,22 @@ typedef struct zfid_short { uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_short_t; +/* + * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes + * (including the length field). This makes files under .zfs/snapshot + * accessible by NFSv3 and NFSv4, but not NFSv2. + * + * For files under .zfs/snapshot, we partition up the available space + * as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * 6 bytes objset id (48 bits) + * 4 bytes currently just zero (32 bits) + * + * We reserve only 48 bits for the object number and objset id, as these are + * the limits currently defined and imposed by the DMU. + */ typedef struct zfid_long { zfid_short_t z_fid; uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ @@ -93,6 +126,12 @@ typedef struct zfid_long { #define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) #define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) +extern uint_t zfs_fsyncer_key; +extern int zfs_super_owner; + +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode); + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h index 6b2923298df2..a0cf44064970 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h @@ -19,19 +19,18 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_FS_ZFS_ZNODE_H #define _SYS_FS_ZFS_ZNODE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef _KERNEL #include <sys/list.h> #include <sys/dmu.h> #include <sys/zfs_vfsops.h> +#include <sys/rrwlock.h> #endif #include <sys/zfs_acl.h> #include <sys/zil.h> @@ -41,34 +40,62 @@ extern "C" { #endif /* - * Define special zfs pflags + * Additional file level attributes, that are stored + * in the upper half of zp_flags */ -#define ZFS_XATTR 0x1 /* is an extended attribute */ -#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ -#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ +#define ZFS_READONLY 0x0000000100000000 +#define ZFS_HIDDEN 0x0000000200000000 +#define ZFS_SYSTEM 0x0000000400000000 +#define ZFS_ARCHIVE 0x0000000800000000 +#define ZFS_IMMUTABLE 0x0000001000000000 +#define ZFS_NOUNLINK 0x0000002000000000 +#define ZFS_APPENDONLY 0x0000004000000000 +#define ZFS_NODUMP 0x0000008000000000 +#define ZFS_OPAQUE 0x0000010000000000 +#define ZFS_AV_QUARANTINED 0x0000020000000000 +#define ZFS_AV_MODIFIED 0x0000040000000000 + +#define ZFS_ATTR_SET(zp, attr, value) \ +{ \ + if (value) \ + zp->z_phys->zp_flags |= attr; \ + else \ + zp->z_phys->zp_flags &= ~attr; \ +} -#define MASTER_NODE_OBJ 1 +/* + * Define special zfs pflags + */ +#define ZFS_XATTR 0x1 /* is an extended attribute */ +#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ +#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ +#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ +#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */ +#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ +#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ +#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ /* - * special attributes for master node. + * Is ID ephemeral? */ +#define IS_EPHEMERAL(x) (x > MAXUID) -#define ZFS_FSID "FSID" -#define ZFS_UNLINKED_SET "DELETE_QUEUE" -#define ZFS_ROOT_OBJ "ROOT" -#define ZPL_VERSION_OBJ "VERSION" -#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE" -#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS" +/* + * Should we use FUIDs? + */ +#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID &&\ + spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) -#define ZFS_FLAG_BLOCKPERPAGE 0x1 -#define ZFS_FLAG_NOGROWBLOCKS 0x2 +#define MASTER_NODE_OBJ 1 /* - * ZPL version - rev'd whenever an incompatible on-disk format change - * occurs. Independent of SPA/DMU/ZAP versioning. + * Special attributes for master node. */ - -#define ZPL_VERSION 1ULL +#define ZFS_FSID "FSID" +#define ZFS_UNLINKED_SET "DELETE_QUEUE" +#define ZFS_ROOT_OBJ "ROOT" +#define ZPL_VERSION_STR "VERSION" +#define ZFS_FUID_TABLES "FUID" #define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) @@ -83,14 +110,20 @@ extern "C" { #define ZFS_MAXNAMELEN (MAXNAMELEN - 1) /* + * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in + * the directory entries. + */ +#ifndef IFTODT +#define IFTODT(mode) (((mode) & S_IFMT) >> 12) +#endif + +/* * The directory entry has the type (currently unused on Solaris) in the * top 4 bits, and the object number in the low 48 bits. The "middle" * 12 bits are unused. */ #define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) #define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) -#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)type << 60) | obj) - /* * This is the persistent portion of the znode. 
It is stored @@ -112,8 +145,9 @@ typedef struct znode_phys { uint64_t zp_flags; /* 120 - persistent flags */ uint64_t zp_uid; /* 128 - file owner */ uint64_t zp_gid; /* 136 - owning group */ - uint64_t zp_pad[4]; /* 144 - future */ - zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */ + uint64_t zp_zap; /* 144 - extra attributes */ + uint64_t zp_pad[3]; /* 152 - future */ + zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ /* * Data may pad out any remaining bytes in the znode buffer, eg: * @@ -121,7 +155,9 @@ typedef struct znode_phys { * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| * |<---- znode (264) ---->|<---- data (56) ---->| * - * At present, we only use this space to store symbolic links. + * At present, we use this space for the following: + * - symbolic links + * - 32-byte anti-virus scanstamp (regular files only) */ } znode_phys_t; @@ -153,12 +189,12 @@ typedef struct znode { avl_tree_t z_range_avl; /* avl tree of file range locks */ uint8_t z_unlinked; /* file has been unlinked */ uint8_t z_atime_dirty; /* atime needs to be synced */ - uint8_t z_dbuf_held; /* Is z_dbuf already held? */ uint8_t z_zn_prefetch; /* Prefetch znodes? */ uint_t z_blksz; /* block size in bytes */ uint_t z_seq; /* modification sequence number */ uint64_t z_mapcnt; /* number of pages mapped to file */ uint64_t z_last_itx; /* last ZIL itx on this znode */ + uint64_t z_gen; /* generation (same as zp_gen) */ uint32_t z_sync_cnt; /* synchronous open count */ kmutex_t z_acl_lock; /* acl data lock */ list_node_t z_link_node; /* all znodes in fs link */ @@ -167,6 +203,8 @@ typedef struct znode { */ znode_phys_t *z_phys; /* pointer to persistent znode */ dmu_buf_t *z_dbuf; /* buffer containing the z_phys */ + /* FreeBSD-specific field. */ + struct task z_task; } znode_t; @@ -195,42 +233,51 @@ typedef struct znode { /* * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation. * ZFS_EXIT() must be called before exitting the vop. + * ZFS_VERIFY_ZP() verifies the znode is valid. 
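As an editorial illustration (not part of the diff), the entry/exit protocol documented above and defined just below typically looks like this in a vop; VTOZ() is the usual vnode-to-znode accessor and is an assumption here:

	static int
	zfs_example_vop(vnode_t *vp)
	{
		znode_t *zp = VTOZ(vp);		/* assumed accessor, not in this diff */
		zfsvfs_t *zfsvfs = zp->z_zfsvfs;

		ZFS_ENTER(zfsvfs);	/* rrw-enter z_teardown_lock; EIO if unmounted */
		ZFS_VERIFY_ZP(zp);	/* EIO if the znode lost its dbuf (forced unmount) */

		/* ... the real work against zp goes here ... */

		ZFS_EXIT(zfsvfs);	/* drop z_teardown_lock */
		return (0);
	}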
*/ #define ZFS_ENTER(zfsvfs) \ { \ - atomic_add_32(&(zfsvfs)->z_op_cnt, 1); \ - if ((zfsvfs)->z_unmounted1) { \ + rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG); \ + if ((zfsvfs)->z_unmounted) { \ ZFS_EXIT(zfsvfs); \ return (EIO); \ } \ } -#define ZFS_EXIT(zfsvfs) atomic_add_32(&(zfsvfs)->z_op_cnt, -1) + +#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG) + +#define ZFS_VERIFY_ZP(zp) \ + if ((zp)->z_dbuf == NULL) { \ + ZFS_EXIT((zp)->z_zfsvfs); \ + return (EIO); \ + } \ /* * Macros for dealing with dmu_buf_hold */ -#define ZFS_OBJ_HASH(obj_num) (obj_num & (ZFS_OBJ_MTX_SZ - 1)) -#define ZFS_OBJ_MUTEX(zp) \ - (&zp->z_zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(zp->z_id)]) +#define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1)) +#define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \ + (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) #define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ - mutex_enter(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]); - + mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) +#define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \ + mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) #define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ - mutex_exit(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) + mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) /* * Macros to encode/decode ZFS stored time values from/to struct timespec */ #define ZFS_TIME_ENCODE(tp, stmp) \ { \ - stmp[0] = (uint64_t)(tp)->tv_sec; \ - stmp[1] = (uint64_t)(tp)->tv_nsec; \ + (stmp)[0] = (uint64_t)(tp)->tv_sec; \ + (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ } #define ZFS_TIME_DECODE(tp, stmp) \ { \ - (tp)->tv_sec = (time_t)stmp[0]; \ - (tp)->tv_nsec = (long)stmp[1]; \ + (tp)->tv_sec = (time_t)(stmp)[0]; \ + (tp)->tv_nsec = (long)(stmp)[1]; \ } /* @@ -244,9 +291,10 @@ typedef struct znode { if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ zfs_time_stamper(zp, ACCESSED, NULL) -extern int zfs_init_fs(zfsvfs_t *, znode_t **, cred_t *); +extern int zfs_init_fs(zfsvfs_t *, znode_t **); extern void zfs_set_dataprop(objset_t *); -extern void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx); +extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *, + dmu_tx_t *tx); extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *); extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *); extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); @@ -254,33 +302,43 @@ extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); extern void zfs_znode_init(void); extern void zfs_znode_fini(void); extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); +extern int zfs_rezget(znode_t *); extern void zfs_zinactive(znode_t *); extern void zfs_znode_delete(znode_t *, dmu_tx_t *); extern void zfs_znode_free(znode_t *); extern void zfs_remove_op_tables(); extern int zfs_create_op_tables(); extern dev_t zfs_cmpldev(uint64_t); - -extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *dzp, znode_t *zp, char *name); -extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype, +extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); +extern int zfs_set_version(const char *name, uint64_t newvers); +extern int zfs_get_stats(objset_t *os, nvlist_t *nv); +extern void zfs_znode_dmu_fini(znode_t *); + +extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, vsecattr_t *, zfs_fuid_info_t *, + vattr_t *vap); +extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp, + vattr_t *vap); +extern void 
zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, char *name); -extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype, +extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name); -extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype, +extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name, char *link); -extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, +extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t len, int ioflag); extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, uint64_t off, uint64_t len); extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, vattr_t *vap, uint_t mask_applied); + znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); #ifndef ZFS_NO_ACL -extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, int aclcnt, ace_t *z_ace); +extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, + vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); #endif +extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap); +extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern zil_get_data_t zfs_get_data; extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h index 947ba9fa6076..4d02d14f7075 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZIL_H #define _SYS_ZIL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/spa.h> #include <sys/zio.h> @@ -88,22 +86,61 @@ typedef struct zil_trailer { #define ZIL_ZC_OBJSET 2 #define ZIL_ZC_SEQ 3 +typedef enum zil_create { + Z_FILE, + Z_DIR, + Z_XATTRDIR, +} zil_create_t; + +/* + * Size of the xvattr log section. + * It's composed of an lr_attr_t + the xvattr bitmap + 2 64-bit timestamps + * for create time and a single 64-bit integer for all of the attributes, + * and 4 64-bit integers (32 bytes) for the scanstamp. + */ + +#define ZIL_XVAT_SIZE(mapsize) \ + sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \ + (sizeof (uint64_t) * 7) + +/* + * Size of ACL in log. The ACE data is padded out to properly align + * on an 8-byte boundary.
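Working the ZIL_XVAT_SIZE() arithmetic above through, as an editorial note (not part of the diff): lr_attr_t is 8 bytes (two uint32_t), and the seven uint64_t's cover the two create-time words, one attribute word, and four scanstamp words. Hypothetical compile-time checks:

	CTASSERT(sizeof (lr_attr_t) == 2 * sizeof (uint32_t));	/* 8 bytes */
	CTASSERT(ZIL_XVAT_SIZE(1) == 8 + 0 + 56);	/* one bitmap word: 64 bytes */
	CTASSERT(ZIL_XVAT_SIZE(3) == 8 + 2 * 4 + 56);	/* three bitmap words: 72 bytes */
	/* ZIL_ACE_LENGTH() (defined just below) rounds up to 8 bytes: */
	CTASSERT(ZIL_ACE_LENGTH(17) == 24);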
+ */ + +#define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t))) + /* * Intent log transaction types and record structures */ -#define TX_CREATE 1 /* Create file */ -#define TX_MKDIR 2 /* Make directory */ -#define TX_MKXATTR 3 /* Make XATTR directory */ -#define TX_SYMLINK 4 /* Create symbolic link to a file */ -#define TX_REMOVE 5 /* Remove file */ -#define TX_RMDIR 6 /* Remove directory */ -#define TX_LINK 7 /* Create hard link to a file */ -#define TX_RENAME 8 /* Rename a file */ -#define TX_WRITE 9 /* File write */ -#define TX_TRUNCATE 10 /* Truncate a file */ -#define TX_SETATTR 11 /* Set file attributes */ -#define TX_ACL 12 /* Set acl */ -#define TX_MAX_TYPE 13 /* Max transaction type */ +#define TX_CREATE 1 /* Create file */ +#define TX_MKDIR 2 /* Make directory */ +#define TX_MKXATTR 3 /* Make XATTR directory */ +#define TX_SYMLINK 4 /* Create symbolic link to a file */ +#define TX_REMOVE 5 /* Remove file */ +#define TX_RMDIR 6 /* Remove directory */ +#define TX_LINK 7 /* Create hard link to a file */ +#define TX_RENAME 8 /* Rename a file */ +#define TX_WRITE 9 /* File write */ +#define TX_TRUNCATE 10 /* Truncate a file */ +#define TX_SETATTR 11 /* Set file attributes */ +#define TX_ACL_V0 12 /* Set old formatted ACL */ +#define TX_ACL 13 /* Set ACL */ +#define TX_CREATE_ACL 14 /* create with ACL */ +#define TX_CREATE_ATTR 15 /* create + attrs */ +#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */ +#define TX_MKDIR_ACL 17 /* mkdir with ACL */ +#define TX_MKDIR_ATTR 18 /* mkdir with attr */ +#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ +#define TX_MAX_TYPE 20 /* Max transaction type */ + +/* + * The transactions for mkdir, symlink, remove, rmdir, link, and rename + * may have the following bit set, indicating the original request + * specified case-insensitive handling of names. + */ +#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ /* * Format of log records. @@ -124,6 +161,23 @@ typedef struct { /* common log record header */ uint64_t lrc_seq; /* see comment above */ } lr_t; +/* + * Handle optional extended vattr attributes. + * + * Whenever new attributes are added, the version number + * will need to be updated, as will the code in + * zfs_log.c and zfs_replay.c. + */ +typedef struct { + uint32_t lr_attr_masksize; /* number of elements in array */ + uint32_t lr_attr_bitmap; /* First entry of array */ + /* remainder of array and any additional fields */ +} lr_attr_t; + +/* + * Log record for creates without optional ACL. + * This log record does support optional xvattr_t attributes. + */ typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_doid; /* object id of directory */ @@ -136,8 +190,42 @@ typedef struct { uint64_t lr_rdev; /* rdev of object to create */ /* name of object to create follows this */ /* for symlinks, link content follows name */ + /* for creates with xvattr data, the name follows the xvattr info */ } lr_create_t; +/* + * FUID ACL record will be an array of ACEs from the original ACL. + * If this array includes ephemeral IDs, the record will also include + * an array of log-specific FUIDs to replace the ephemeral IDs. + * Only one copy of each unique domain will be present, so the log-specific + * FUIDs will use an index into a compressed domain table. On replay this + * information will be used to construct real FUIDs (and bypass idmap, + * since it may not be available).
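An editorial sketch (not part of the diff) of how the TX_CI bit above is meant to be used: producers OR it into the txtype when logging, and replay strips it before indexing zfs_replay_vector[]. The name handling here is a simplified assumption:

	const char *name = "file.txt";	/* assumed entry name */
	itx_t *itx;
	uint64_t txtype;

	/* Log a remove whose name was matched case-insensitively. */
	itx = zil_itx_create(TX_REMOVE | TX_CI,
	    sizeof (lr_remove_t) + strlen(name) + 1);
	/* ... fill in the record and zil_itx_assign() it ... */

	/* Replay side: mask the flag off before using txtype as an index. */
	txtype = itx->itx_lr.lrc_txtype & ~TX_CI;
	ASSERT(txtype < TX_MAX_TYPE);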
+ */ + +/* + * Log record for creates with optional ACL + * This log record is also used for recording any FUID + * information needed for replaying the create. If the + * file doesn't have any actual ACEs then the lr_aclcnt + * would be zero. + */ +typedef struct { + lr_create_t lr_create; /* common create portion */ + uint64_t lr_aclcnt; /* number of ACEs in ACL */ + uint64_t lr_domcnt; /* number of unique domains */ + uint64_t lr_fuidcnt; /* number of real fuids */ + uint64_t lr_acl_bytes; /* number of bytes in ACL */ + uint64_t lr_acl_flags; /* ACL flags */ + /* lr_acl_bytes number of variable sized ace's follows */ + /* if create is also setting xvattr's, then acl data follows xvattr */ + /* if ACE FUIDs are needed then they will follow the xvattr_t */ + /* Following the FUIDs will be the domain table information. */ + /* The FUIDs for the owner and group will be in the lr_create */ + /* portion of the record. */ + /* name follows ACL data */ +} lr_acl_create_t; + typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_doid; /* obj id of directory */ @@ -185,6 +273,7 @@ typedef struct { uint64_t lr_size; /* size to set */ uint64_t lr_atime[2]; /* access time */ uint64_t lr_mtime[2]; /* modification time */ + /* optional attribute lr_attr_t may be here */ } lr_setattr_t; typedef struct { @@ -192,6 +281,17 @@ typedef struct { uint64_t lr_foid; /* obj id of file */ uint64_t lr_aclcnt; /* number of acl entries */ /* lr_aclcnt number of ace_t entries follow this */ +} lr_acl_v0_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* obj id of file */ + uint64_t lr_aclcnt; /* number of ACEs in ACL */ + uint64_t lr_domcnt; /* number of unique domains */ + uint64_t lr_fuidcnt; /* number of real fuids */ + uint64_t lr_acl_bytes; /* number of bytes in ACL */ + uint64_t lr_acl_flags; /* ACL flags */ + /* lr_acl_bytes number of variable sized ace's follows */ } lr_acl_t; /* @@ -213,6 +313,7 @@ typedef struct itx { void *itx_private; /* type-specific opaque data */ itx_wr_state_t itx_wr_state; /* write state */ uint8_t itx_sync; /* synchronous transaction */ + uint64_t itx_sod; /* record size on disk */ lr_t itx_lr; /* common part of log record */ /* followed by type-specific part of lr_xx_t and its immediate data */ } itx_t; @@ -234,6 +335,7 @@ typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, uint64_t txg); typedef int zil_replay_func_t(); +typedef void zil_replay_cleaner_t(); typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, @@ -249,15 +351,19 @@ extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); extern void zil_close(zilog_t *zilog); extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE]); + zil_replay_func_t *replay_func[TX_MAX_TYPE], + zil_replay_cleaner_t *replay_cleaner); extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); +extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); -extern itx_t *zil_itx_create(int txtype, size_t lrsize); +extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid); extern int zil_claim(char *osname, void *txarg); +extern int zil_check_log_chain(char *osname, void 
*txarg); +extern int zil_clear_log_chain(char *osname, void *txarg); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_clean(zilog_t *zilog); extern int zil_is_committed(zilog_t *zilog); @@ -265,7 +371,7 @@ extern int zil_is_committed(zilog_t *zilog); extern int zil_suspend(zilog_t *zilog); extern void zil_resume(zilog_t *zilog); -extern void zil_add_vdev(zilog_t *zilog, uint64_t vdev); +extern void zil_add_block(zilog_t *zilog, blkptr_t *bp); extern int zil_disable; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h index 3ecf4e4debf5..0fc800b96dea 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -51,15 +51,13 @@ typedef struct lwb { } lwb_t; /* - * Vdev flushing: We use a bit map of size ZIL_VDEV_BMAP bytes. - * Any vdev numbers beyond that use a linked list of zil_vdev_t structures. + * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs + * we've touched so we know which ones need a write cache flush at the end. */ - -#define ZIL_VDEV_BMSZ 16 /* 16 * 8 = 128 vdevs */ -typedef struct zil_vdev { - uint64_t vdev; /* device written */ - list_node_t vdev_seq_node; /* zilog->zl_vdev_list linkage */ -} zil_vdev_t; +typedef struct zil_vdev_node { + uint64_t zv_vdev; /* vdev to be flushed */ + avl_node_t zv_node; /* AVL tree linkage */ +} zil_vdev_node_t; /* * Stable storage intent log management structure. One per dataset. @@ -91,8 +89,8 @@ struct zilog { uint64_t zl_cur_used; /* current commit log size used */ uint64_t zl_prev_used; /* previous commit log size used */ list_t zl_lwb_list; /* in-flight log write list */ - list_t zl_vdev_list; /* list of [vdev, seq] pairs */ - uint8_t zl_vdev_bmap[ZIL_VDEV_BMSZ]; /* bitmap of vdevs */ + kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */ + avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */ taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */ avl_tree_t zl_dva_tree; /* track DVAs during log parse */ clock_t zl_replay_time; /* lbolt of when replay started */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index b026ae6450c6..6331567498b5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -20,20 +20,17 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZIO_H #define _ZIO_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/txg.h> #include <sys/avl.h> -#include <sys/dkio.h> #include <sys/fs/zfs.h> #include <sys/zio_impl.h> @@ -60,10 +57,6 @@ typedef struct zio_block_tail { (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ sizeof (uint64_t)) -#define ZIO_GET_IOSIZE(zio) \ - (BP_IS_GANG((zio)->io_bp) ? 
\ - SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp)) - typedef struct zio_gbh { blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; uint64_t zg_filler[SPA_GBH_FILLER]; @@ -107,6 +100,10 @@ enum zio_compress { #define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF +#define ZIO_FAILURE_MODE_WAIT 0 +#define ZIO_FAILURE_MODE_CONTINUE 1 +#define ZIO_FAILURE_MODE_PANIC 2 + #define ZIO_PRIORITY_NOW (zio_priority_table[0]) #define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) #define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) @@ -121,51 +118,70 @@ enum zio_compress { #define ZIO_FLAG_MUSTSUCCEED 0x00000 #define ZIO_FLAG_CANFAIL 0x00001 -#define ZIO_FLAG_FAILFAST 0x00002 -#define ZIO_FLAG_CONFIG_HELD 0x00004 -#define ZIO_FLAG_CONFIG_GRABBED 0x00008 +#define ZIO_FLAG_SPECULATIVE 0x00002 +#define ZIO_FLAG_CONFIG_WRITER 0x00004 +#define ZIO_FLAG_DONT_RETRY 0x00008 #define ZIO_FLAG_DONT_CACHE 0x00010 #define ZIO_FLAG_DONT_QUEUE 0x00020 -#define ZIO_FLAG_DONT_PROPAGATE 0x00040 -#define ZIO_FLAG_DONT_RETRY 0x00080 - -#define ZIO_FLAG_PHYSICAL 0x00100 -#define ZIO_FLAG_IO_BYPASS 0x00200 -#define ZIO_FLAG_IO_REPAIR 0x00400 -#define ZIO_FLAG_SPECULATIVE 0x00800 +#define ZIO_FLAG_DONT_AGGREGATE 0x00040 +#define ZIO_FLAG_DONT_PROPAGATE 0x00080 -#define ZIO_FLAG_RESILVER 0x01000 -#define ZIO_FLAG_SCRUB 0x02000 -#define ZIO_FLAG_SCRUB_THREAD 0x04000 -#define ZIO_FLAG_SUBBLOCK 0x08000 +#define ZIO_FLAG_IO_BYPASS 0x00100 +#define ZIO_FLAG_IO_REPAIR 0x00200 +#define ZIO_FLAG_IO_RETRY 0x00400 +#define ZIO_FLAG_IO_REWRITE 0x00800 -#define ZIO_FLAG_NOBOOKMARK 0x10000 -#define ZIO_FLAG_USER 0x20000 +#define ZIO_FLAG_PROBE 0x01000 +#define ZIO_FLAG_RESILVER 0x02000 +#define ZIO_FLAG_SCRUB 0x04000 +#define ZIO_FLAG_SCRUB_THREAD 0x08000 -#define ZIO_FLAG_METADATA 0x40000 +#define ZIO_FLAG_GANG_CHILD 0x10000 #define ZIO_FLAG_GANG_INHERIT \ (ZIO_FLAG_CANFAIL | \ - ZIO_FLAG_FAILFAST | \ - ZIO_FLAG_CONFIG_HELD | \ - ZIO_FLAG_DONT_RETRY | \ - ZIO_FLAG_IO_REPAIR | \ ZIO_FLAG_SPECULATIVE | \ + ZIO_FLAG_CONFIG_WRITER | \ + ZIO_FLAG_DONT_RETRY | \ + ZIO_FLAG_DONT_CACHE | \ + ZIO_FLAG_DONT_AGGREGATE | \ ZIO_FLAG_RESILVER | \ ZIO_FLAG_SCRUB | \ ZIO_FLAG_SCRUB_THREAD) #define ZIO_FLAG_VDEV_INHERIT \ (ZIO_FLAG_GANG_INHERIT | \ - ZIO_FLAG_DONT_CACHE | \ - ZIO_FLAG_PHYSICAL) + ZIO_FLAG_IO_REPAIR | \ + ZIO_FLAG_IO_RETRY | \ + ZIO_FLAG_PROBE) + +#define ZIO_PIPELINE_CONTINUE 0x100 +#define ZIO_PIPELINE_STOP 0x101 + +#define ZIO_GANG_CHILD_FLAGS(zio) \ + (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ + ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) + +enum zio_child { + ZIO_CHILD_VDEV = 0, + ZIO_CHILD_GANG, + ZIO_CHILD_LOGICAL, + ZIO_CHILD_TYPES +}; + +enum zio_wait_type { + ZIO_WAIT_READY = 0, + ZIO_WAIT_DONE, + ZIO_WAIT_TYPES +}; /* - * We'll take the EILSEQ (Illegal byte sequence) errno - * to indicate checksum errors. + * We'll take the EILSEQ and ENOMSG to indicate checksum errors and + * fragmentation. 
*/ #define ECKSUM EILSEQ +#define EFRAGS ENOMSG typedef struct zio zio_t; typedef void zio_done_func_t(zio_t *zio); @@ -200,23 +216,64 @@ typedef struct zbookmark { uint64_t zb_blkid; } zbookmark_t; +typedef struct zio_prop { + enum zio_checksum zp_checksum; + enum zio_compress zp_compress; + dmu_object_type_t zp_type; + uint8_t zp_level; + uint8_t zp_ndvas; +} zio_prop_t; + +typedef struct zio_gang_node { + zio_gbh_phys_t *gn_gbh; + struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; +} zio_gang_node_t; + +typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, + zio_gang_node_t *gn, void *data); + +typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); + +typedef struct zio_transform { + void *zt_orig_data; + uint64_t zt_orig_size; + uint64_t zt_bufsize; + zio_transform_func_t *zt_transform; + struct zio_transform *zt_next; +} zio_transform_t; + +typedef int zio_pipe_stage_t(zio_t *zio); + +/* + * The io_reexecute flags are distinct from io_flags because the child must + * be able to propagate them to the parent. The normal io_flags are local + * to the zio, not protected by any lock, and not modifiable by children; + * the reexecute flags are protected by io_lock, modifiable by children, + * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. + */ +#define ZIO_REEXECUTE_NOW 0x01 +#define ZIO_REEXECUTE_SUSPEND 0x02 + struct zio { /* Core information about this I/O */ - zio_t *io_parent; - zio_t *io_root; - spa_t *io_spa; zbookmark_t io_bookmark; - enum zio_checksum io_checksum; - enum zio_compress io_compress; - int io_ndvas; + zio_prop_t io_prop; + zio_type_t io_type; + enum zio_child io_child_type; + int io_cmd; + uint8_t io_priority; + uint8_t io_reexecute; + uint8_t io_async_root; uint64_t io_txg; + spa_t *io_spa; blkptr_t *io_bp; blkptr_t io_bp_copy; + zio_t *io_parent; zio_t *io_child; zio_t *io_sibling_prev; zio_t *io_sibling_next; - zio_transform_t *io_transform_stack; zio_t *io_logical; + zio_transform_t *io_transform_stack; /* Callback info */ zio_done_func_t *io_ready; @@ -231,9 +288,9 @@ struct zio { /* Stuff for the vdev stack */ vdev_t *io_vd; void *io_vsd; + zio_done_func_t *io_vsd_free; uint64_t io_offset; uint64_t io_deadline; - uint64_t io_timestamp; avl_node_t io_offset_node; avl_node_t io_deadline_node; avl_tree_t *io_vdev_tree; @@ -242,19 +299,17 @@ struct zio { /* Internal pipeline state */ int io_flags; - enum zio_type io_type; - enum zio_stage io_stage; - uint8_t io_stalled; - uint8_t io_priority; - struct dk_callback io_dk_callback; - int io_cmd; - int io_retries; - int io_error; - uint32_t io_numerrors; + zio_stage_t io_stage; uint32_t io_pipeline; - uint32_t io_async_stages; - uint64_t io_children_notready; - uint64_t io_children_notdone; + int io_orig_flags; + zio_stage_t io_orig_stage; + uint32_t io_orig_pipeline; + int io_error; + int io_child_error[ZIO_CHILD_TYPES]; + uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; + uint64_t *io_stall; + zio_gang_node_t *io_gang_tree; + void *io_executor; void *io_waiter; kmutex_t io_lock; kcondvar_t io_cv; @@ -269,76 +324,76 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags); -extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, +extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, zbookmark_t *zb); + int priority, int flags, const zbookmark_t *zb); -extern zio_t 
*zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, - int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb); +extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + void *data, uint64_t size, zio_prop_t *zp, + zio_done_func_t *ready, zio_done_func_t *done, void *private, + int priority, int flags, const zbookmark_t *zb); -extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum, - uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags, - zbookmark_t *zb); +extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + void *data, uint64_t size, zio_done_func_t *done, void *private, + int priority, int flags, zbookmark_t *zb); extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private); + zio_done_func_t *done, void *private, int flags); extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private); + zio_done_func_t *done, void *private, int flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, int priority, int flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags); + zio_done_func_t *done, void *private, int priority, int flags, + boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags); + zio_done_func_t *done, void *private, int priority, int flags, + boolean_t labels); extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t txg); extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg); +extern void zio_flush(zio_t *zio, vdev_t *vd); extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); +extern void zio_execute(zio_t *zio); +extern void zio_interrupt(zio_t *zio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); -/* - * Move an I/O to the next stage of the pipeline and execute that stage. - * There's no locking on io_stage because there's no legitimate way for - * multiple threads to be attempting to process the same I/O. - */ -extern void zio_next_stage(zio_t *zio); -extern void zio_next_stage_async(zio_t *zio); -extern void zio_wait_children_done(zio_t *zio); +extern void zio_resubmit_stage_async(void *); -/* - * Delegate I/O to a child vdev. 
- */ extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, int priority, int flags, zio_done_func_t *done, void *private); +extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, + void *data, uint64_t size, int type, int priority, + int flags, zio_done_func_t *done, void *private); + extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); extern void zio_checksum_verified(zio_t *zio); -extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp); +extern int zio_worst_error(int e1, int e2); extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); -boolean_t zio_should_retry(zio_t *zio); +extern void zio_suspend(spa_t *spa, zio_t *zio); +extern void zio_resume(spa_t *spa); +extern void zio_resume_wait(spa_t *spa); /* * Initial setup and teardown. @@ -358,6 +413,7 @@ extern int zio_inject_list_next(int *id, char *name, size_t buflen, extern int zio_clear_fault(int id); extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, int error); +extern int zio_handle_label_injection(zio_t *zio, int error); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h index bb7bd41e0bb3..da407399da06 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZIO_CHECKSUM_H #define _SYS_ZIO_CHECKSUM_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zio.h> #ifdef __cplusplus @@ -64,7 +62,7 @@ extern zio_checksum_t fletcher_4_incremental_byteswap; extern zio_checksum_t zio_checksum_SHA256; -extern void zio_checksum(uint_t checksum, zio_cksum_t *zcp, +extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); extern int zio_checksum_error(zio_t *zio); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h index d2ddbc34e922..e7503b733cc0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZIO_IMPL_H #define _ZIO_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/zio.h> @@ -38,162 +36,102 @@ extern "C" { /* * I/O Groups: pipeline stage definitions. 
*/ - typedef enum zio_stage { ZIO_STAGE_OPEN = 0, /* RWFCI */ - ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */ - ZIO_STAGE_WRITE_COMPRESS, /* -W--- */ - ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ + ZIO_STAGE_ISSUE_ASYNC, /* -W--- */ - ZIO_STAGE_GANG_PIPELINE, /* -WFC- */ + ZIO_STAGE_READ_BP_INIT, /* R---- */ + ZIO_STAGE_WRITE_BP_INIT, /* -W--- */ + + ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ - ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */ - ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */ - ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */ - ZIO_STAGE_CLAIM_GANG_MEMBERS, /* ---C- */ + ZIO_STAGE_GANG_ASSEMBLE, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE, /* RWFC- */ ZIO_STAGE_DVA_ALLOCATE, /* -W--- */ ZIO_STAGE_DVA_FREE, /* --F-- */ ZIO_STAGE_DVA_CLAIM, /* ---C- */ - ZIO_STAGE_GANG_CHECKSUM_GENERATE, /* -W--- */ - ZIO_STAGE_READY, /* RWFCI */ ZIO_STAGE_VDEV_IO_START, /* RW--I */ ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ - ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */ - ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */ - ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */ - ZIO_STAGE_READ_DECOMPRESS, /* R---- */ - ZIO_STAGE_DONE /* RWFCI */ + ZIO_STAGE_DONE, /* RWFCI */ + ZIO_STAGES } zio_stage_t; -/* - * The stages for which there's some performance value in going async. - * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well. - */ -#define ZIO_ASYNC_PIPELINE_STAGES \ - ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_VDEV_IO_DONE) | \ - (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ - (1U << ZIO_STAGE_READ_DECOMPRESS)) +#define ZIO_INTERLOCK_STAGES \ + ((1U << ZIO_STAGE_READY) | \ + (1U << ZIO_STAGE_DONE)) -#define ZIO_VDEV_IO_PIPELINE \ +#define ZIO_INTERLOCK_PIPELINE \ + ZIO_INTERLOCK_STAGES + +#define ZIO_VDEV_IO_STAGES \ ((1U << ZIO_STAGE_VDEV_IO_START) | \ (1U << ZIO_STAGE_VDEV_IO_DONE) | \ (1U << ZIO_STAGE_VDEV_IO_ASSESS)) -#define ZIO_READ_PHYS_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ +#define ZIO_VDEV_CHILD_PIPELINE \ + (ZIO_VDEV_IO_STAGES | \ (1U << ZIO_STAGE_DONE)) -#define ZIO_READ_PIPELINE \ - ZIO_READ_PHYS_PIPELINE +#define ZIO_READ_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + (1U << ZIO_STAGE_CHECKSUM_VERIFY)) -#define ZIO_WRITE_PHYS_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) +#define ZIO_READ_PHYS_PIPELINE \ + ZIO_READ_COMMON_STAGES -#define ZIO_WRITE_COMMON_PIPELINE \ - ZIO_WRITE_PHYS_PIPELINE +#define ZIO_READ_PIPELINE \ + (ZIO_READ_COMMON_STAGES | \ + (1U << ZIO_STAGE_READ_BP_INIT)) -#define ZIO_WRITE_PIPELINE \ - ((1U << ZIO_STAGE_WRITE_COMPRESS) | \ - ZIO_WRITE_COMMON_PIPELINE) +#define ZIO_WRITE_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + (1U << ZIO_STAGE_ISSUE_ASYNC) | \ + (1U << ZIO_STAGE_CHECKSUM_GENERATE)) -#define ZIO_GANG_STAGES \ - ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_READ_GANG_MEMBERS)) +#define ZIO_WRITE_PHYS_PIPELINE \ + ZIO_WRITE_COMMON_STAGES #define ZIO_REWRITE_PIPELINE \ - ((1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << 
ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \ - ZIO_WRITE_COMMON_PIPELINE) + (ZIO_WRITE_COMMON_STAGES | \ + (1U << ZIO_STAGE_WRITE_BP_INIT)) -#define ZIO_WRITE_ALLOCATE_PIPELINE \ - ((1U << ZIO_STAGE_DVA_ALLOCATE) | \ - ZIO_WRITE_COMMON_PIPELINE) +#define ZIO_WRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + (1U << ZIO_STAGE_WRITE_BP_INIT) | \ + (1U << ZIO_STAGE_DVA_ALLOCATE)) -#define ZIO_GANG_FREE_STAGES \ - ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_FREE_GANG_MEMBERS)) +#define ZIO_GANG_STAGES \ + ((1U << ZIO_STAGE_GANG_ASSEMBLE) | \ + (1U << ZIO_STAGE_GANG_ISSUE)) #define ZIO_FREE_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_DVA_FREE) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) + (ZIO_INTERLOCK_STAGES | \ + (1U << ZIO_STAGE_DVA_FREE)) #define ZIO_CLAIM_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_DVA_CLAIM) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) + (ZIO_INTERLOCK_STAGES | \ + (1U << ZIO_STAGE_DVA_CLAIM)) #define ZIO_IOCTL_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) - -#define ZIO_WAIT_FOR_CHILDREN_PIPELINE \ - ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) - -#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \ - ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_DONE)) + (ZIO_INTERLOCK_STAGES | \ + (1U << ZIO_STAGE_VDEV_IO_START) | \ + (1U << ZIO_STAGE_VDEV_IO_ASSESS)) -#define ZIO_VDEV_CHILD_PIPELINE \ - (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \ - ZIO_VDEV_IO_PIPELINE) - -#define ZIO_ERROR_PIPELINE_MASK \ - ZIO_WAIT_FOR_CHILDREN_PIPELINE - -typedef struct zio_transform zio_transform_t; -struct zio_transform { - void *zt_data; - uint64_t zt_size; - uint64_t zt_bufsize; - zio_transform_t *zt_next; -}; +#define ZIO_CONFIG_LOCK_BLOCKING_STAGES \ + ((1U << ZIO_STAGE_VDEV_IO_START) | \ + (1U << ZIO_STAGE_DVA_ALLOCATE) | \ + (1U << ZIO_STAGE_DVA_CLAIM)) extern void zio_inject_init(void); extern void zio_inject_fini(void); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h index df85824d59bd..2a6452aa433c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -35,17 +35,21 @@ extern "C" { #endif +#define ZVOL_OBJ 1ULL +#define ZVOL_ZAP_OBJ 2ULL + #ifdef _KERNEL extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); extern int zvol_check_volblocksize(uint64_t volblocksize); extern int zvol_get_stats(objset_t *os, nvlist_t *nv); -extern void zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx); -extern int zvol_create_minor(const char *, dev_t); +extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); +extern int zvol_create_minor(const char *, major_t); extern int zvol_remove_minor(const char *); -extern int zvol_set_volsize(const char *, dev_t, uint64_t); +extern int zvol_set_volsize(const char *, major_t, uint64_t); extern int zvol_set_volblocksize(const char *, uint64_t); extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); +extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks); extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr); #ifndef __FreeBSD__ extern int zvol_strategy(buf_t *bp); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c index 844beb6864a5..040e4d70fc04 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/txg_impl.h> #include <sys/dmu_impl.h> @@ -37,9 +35,18 @@ static void txg_sync_thread(void *arg); static void txg_quiesce_thread(void *arg); -static void txg_timelimit_thread(void *arg); -int txg_time = 5; /* max 5 seconds worth of delta per txg */ +int zfs_txg_timeout = 30; /* max seconds worth of delta per txg */ +extern int zfs_txg_synctime; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS TXG"); +TUNABLE_INT("vfs.zfs.txg.timeout", &zfs_txg_timeout); +SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RDTUN, &zfs_txg_timeout, 0, + "Maximum seconds worth of delta per txg"); +TUNABLE_INT("vfs.zfs.txg.synctime", &zfs_txg_synctime); +SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, synctime, CTLFLAG_RDTUN, &zfs_txg_synctime, + 0, "Target seconds to sync a txg"); /* * Prepare the txg subsystem. 
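The zfs_txg_timeout/zfs_txg_synctime declarations above use the stock FreeBSD loader-tunable-plus-sysctl pairing. A minimal sketch of that pattern follows; the variable and OID names here (zfs_example_delay, vfs.zfs.example.delay) are hypothetical, only the shape mirrors the declarations above.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

/* Hypothetical tunable: the default applies unless overridden at boot. */
static int zfs_example_delay = 5;		/* seconds */

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, example, CTLFLAG_RW, 0, "ZFS example");
TUNABLE_INT("vfs.zfs.example.delay", &zfs_example_delay);
SYSCTL_INT(_vfs_zfs_example, OID_AUTO, delay, CTLFLAG_RDTUN,
	&zfs_example_delay, 0, "Example delay in seconds");

Because the real sysctls above are CTLFLAG_RDTUN, they are read-only at runtime (sysctl vfs.zfs.txg.timeout reports the value) and are set from loader.conf at boot.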
@@ -48,14 +55,19 @@ void txg_init(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; - int c, i; + int c; bzero(tx, sizeof (tx_state_t)); tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); + for (c = 0; c < max_ncpus; c++) { + int i; + mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); - for (i = 0; i < TXG_SIZE; i++) - cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL); + for (i = 0; i < TXG_SIZE; i++) { + cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, + NULL); + } } rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL); @@ -64,7 +76,6 @@ txg_init(dsl_pool_t *dp, uint64_t txg) cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_timeout_exit_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); tx->tx_open_txg = txg; @@ -77,12 +88,11 @@ void txg_fini(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; - int c, i; + int c; ASSERT(tx->tx_threads == 0); cv_destroy(&tx->tx_exit_cv); - cv_destroy(&tx->tx_timeout_exit_cv); cv_destroy(&tx->tx_quiesce_done_cv); cv_destroy(&tx->tx_quiesce_more_cv); cv_destroy(&tx->tx_sync_done_cv); @@ -91,9 +101,11 @@ txg_fini(dsl_pool_t *dp) mutex_destroy(&tx->tx_sync_lock); for (c = 0; c < max_ncpus; c++) { + int i; + + mutex_destroy(&tx->tx_cpu[c].tc_lock); for (i = 0; i < TXG_SIZE; i++) cv_destroy(&tx->tx_cpu[c].tc_cv[i]); - mutex_destroy(&tx->tx_cpu[c].tc_lock); } kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); @@ -115,15 +127,17 @@ txg_sync_start(dsl_pool_t *dp) ASSERT(tx->tx_threads == 0); - tx->tx_threads = 3; + tx->tx_threads = 2; tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, dp, 0, &p0, TS_RUN, minclsyspri); - tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread, - dp, 0, &p0, TS_RUN, minclsyspri); - - tx->tx_timelimit_thread = thread_create(NULL, 0, txg_timelimit_thread, + /* + * The sync thread can need a larger-than-default stack size on + * 32-bit x86. This is due in part to nested pools and + * scrub_visitbp() recursion. + */ + tx->tx_sync_thread = thread_create(NULL, 12<<10, txg_sync_thread, dp, 0, &p0, TS_RUN, minclsyspri); mutex_exit(&tx->tx_sync_lock); @@ -148,12 +162,12 @@ txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) } static void -txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, int secmax) +txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time) { CALLB_CPR_SAFE_BEGIN(cpr); - if (secmax) - (void) cv_timedwait(cv, &tx->tx_sync_lock, secmax * hz); + if (time) + (void) cv_timedwait(cv, &tx->tx_sync_lock, time); else cv_wait(cv, &tx->tx_sync_lock); @@ -172,22 +186,21 @@ txg_sync_stop(dsl_pool_t *dp) /* * Finish off any work in progress. */ - ASSERT(tx->tx_threads == 3); + ASSERT(tx->tx_threads == 2); txg_wait_synced(dp, 0); /* - * Wake all 3 sync threads (one per state) and wait for them to die. + * Wake all sync threads and wait for them to die. 
*/ mutex_enter(&tx->tx_sync_lock); - ASSERT(tx->tx_threads == 3); + ASSERT(tx->tx_threads == 2); tx->tx_exiting = 1; cv_broadcast(&tx->tx_quiesce_more_cv); cv_broadcast(&tx->tx_quiesce_done_cv); cv_broadcast(&tx->tx_sync_more_cv); - cv_broadcast(&tx->tx_timeout_exit_cv); while (tx->tx_threads != 0) cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); @@ -279,22 +292,29 @@ txg_sync_thread(void *arg) dsl_pool_t *dp = arg; tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; + uint64_t start, delta; txg_thread_enter(tx, &cpr); + start = delta = 0; for (;;) { + uint64_t timer, timeout = zfs_txg_timeout * hz; uint64_t txg; /* * We sync when there's someone waiting on us, or the - * quiesce thread has handed off a txg to us. + * quiesce thread has handed off a txg to us, or we have + * reached our timeout. */ - while (!tx->tx_exiting && + timer = (delta >= timeout ? 0 : timeout - delta); + while (!tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && tx->tx_quiesced_txg == 0) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); - txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, 0); + txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); + delta = LBOLT - start; + timer = (delta > timeout ? 0 : timeout - delta); } /* @@ -325,10 +345,13 @@ txg_sync_thread(void *arg) rw_exit(&tx->tx_suspend); dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, - tx->tx_sync_txg_waiting); + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); + + start = LBOLT; spa_sync(dp->dp_spa, txg); + delta = LBOLT - start; + mutex_enter(&tx->tx_sync_lock); rw_enter(&tx->tx_suspend, RW_WRITER); tx->tx_synced_txg = txg; @@ -383,13 +406,43 @@ txg_quiesce_thread(void *arg) } } +/* + * Delay this thread by 'ticks' if we are still in the open transaction + * group and there is already a waiting txg quiescing or quiesced. Abort + * the delay if this txg stalls or enters the quiescing state.
+ */ +void +txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) +{ + tx_state_t *tx = &dp->dp_tx; + int timeout = LBOLT + ticks; + + /* don't delay if this txg could transition to quiescing immediately */ + if (tx->tx_open_txg > txg || + tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) + return; + + mutex_enter(&tx->tx_sync_lock); + if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) { + mutex_exit(&tx->tx_sync_lock); + return; + } + + while (LBOLT < timeout && + tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) + (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, + timeout - LBOLT); + + mutex_exit(&tx->tx_sync_lock); +} + void txg_wait_synced(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; mutex_enter(&tx->tx_sync_lock); - ASSERT(tx->tx_threads == 3); + ASSERT(tx->tx_threads == 2); if (txg == 0) txg = tx->tx_open_txg; if (tx->tx_sync_txg_waiting < txg) @@ -412,7 +465,7 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg) tx_state_t *tx = &dp->dp_tx; mutex_enter(&tx->tx_sync_lock); - ASSERT(tx->tx_threads == 3); + ASSERT(tx->tx_threads == 2); if (txg == 0) txg = tx->tx_open_txg + 1; if (tx->tx_quiesce_txg_waiting < txg) @@ -426,37 +479,20 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg) mutex_exit(&tx->tx_sync_lock); } -static void -txg_timelimit_thread(void *arg) +boolean_t +txg_stalled(dsl_pool_t *dp) { - dsl_pool_t *dp = arg; tx_state_t *tx = &dp->dp_tx; - callb_cpr_t cpr; - - txg_thread_enter(tx, &cpr); - - while (!tx->tx_exiting) { - uint64_t txg = tx->tx_open_txg + 1; - - txg_thread_wait(tx, &cpr, &tx->tx_timeout_exit_cv, txg_time); - - if (tx->tx_quiesce_txg_waiting < txg) - tx->tx_quiesce_txg_waiting = txg; - - while (!tx->tx_exiting && tx->tx_open_txg < txg) { - dprintf("pushing out %llu\n", txg); - cv_broadcast(&tx->tx_quiesce_more_cv); - txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); - } - } - txg_thread_exit(tx, &cpr, &tx->tx_timelimit_thread); + return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); } -int -txg_stalled(dsl_pool_t *dp) +boolean_t +txg_sync_waiting(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; - return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); + + return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || + tx->tx_quiesced_txg != 0); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c index b52e729d6294..fbe7b619a29a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,8 +30,7 @@ #include <sys/unique.h> static avl_tree_t unique_avl; -static kmutex_t unique_mtx; /* Lock never initialized.
*/ -SX_SYSINIT(unique, &unique_mtx, "unique lock"); +static kmutex_t unique_mtx; typedef struct unique { avl_node_t un_link; @@ -58,12 +57,22 @@ unique_init(void) { avl_create(&unique_avl, unique_compare, sizeof (unique_t), offsetof(unique_t, un_link)); + mutex_init(&unique_mtx, NULL, MUTEX_DEFAULT, NULL); +} + +void +unique_fini(void) +{ + avl_destroy(&unique_avl); + mutex_destroy(&unique_mtx); } uint64_t unique_create(void) { - return (unique_insert(0)); + uint64_t value = unique_insert(0); + unique_remove(value); + return (value); } uint64_t diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index b966099f4640..7d0602c8ee36 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/fm/fs/zfs.h> #include <sys/spa.h> @@ -40,6 +38,7 @@ #include <sys/zio.h> #include <sys/zap.h> #include <sys/fs/zfs.h> +#include <sys/arc.h> SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); @@ -58,14 +57,18 @@ static vdev_ops_t *vdev_ops_table[] = { &vdev_geom_ops, #else &vdev_disk_ops, - &vdev_file_ops, #endif + &vdev_file_ops, &vdev_missing_ops, NULL }; -/* maximum scrub/resilver I/O queue */ -int zfs_scrub_limit = 70; +/* maximum scrub/resilver I/O queue per leaf vdev */ +int zfs_scrub_limit = 10; + +TUNABLE_INT("vfs.zfs.scrub_limit", &zfs_scrub_limit); +SYSCTL_INT(_vfs_zfs, OID_AUTO, scrub_limit, CTLFLAG_RDTUN, &zfs_scrub_limit, 0, + "Maximum scrub/resilver I/O queue"); /* * Given a vdev type, return the appropriate ops vector. 
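The truncated comment above introduces the ops-vector lookup over vdev_ops_table. The body itself lies outside this hunk, so the following is a hedged reconstruction, not the committed code; it relies only on the table being NULL-terminated, as its definition above shows.

/*
 * Scan the NULL-terminated vdev_ops_table for an entry whose
 * vdev_op_type string matches 'type'; return NULL if none does.
 */
static vdev_ops_t *
vdev_getops_sketch(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}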
@@ -143,8 +146,12 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; - if (vdev < rvd->vdev_children) + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + + if (vdev < rvd->vdev_children) { + ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); + } return (NULL); } @@ -173,7 +180,7 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) uint64_t id = cvd->vdev_id; vdev_t **newchild; - ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); + ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; @@ -256,7 +263,7 @@ vdev_compact_children(vdev_t *pvd) int oldc = pvd->vdev_children; int newc, c; - ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER)); + ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); for (c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) @@ -319,6 +326,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); txg_list_create(&vd->vdev_ms_list, @@ -326,44 +334,13 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) txg_list_create(&vd->vdev_dtl_list, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); + vdev_queue_init(vd); + vdev_cache_init(vd); return (vd); } /* - * Free a vdev_t that has been removed from service. - */ -static void -vdev_free_common(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - if (vd->vdev_path) - spa_strfree(vd->vdev_path); - if (vd->vdev_devid) - spa_strfree(vd->vdev_devid); - - if (vd->vdev_isspare) - spa_spare_remove(vd); - - txg_list_destroy(&vd->vdev_ms_list); - txg_list_destroy(&vd->vdev_dtl_list); - mutex_enter(&vd->vdev_dtl_lock); - space_map_unload(&vd->vdev_dtl_map); - space_map_destroy(&vd->vdev_dtl_map); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - space_map_destroy(&vd->vdev_dtl_scrub); - mutex_exit(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_stat_lock); - - if (vd == spa->spa_root_vdev) - spa->spa_root_vdev = NULL; - - kmem_free(vd, sizeof (vdev_t)); -} - -/* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. 
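The assertions in the hunks above show the reworked config lock: spa_config_held() now takes an SCL_* bitmask and an rw mode rather than a single reader/writer flag. A hypothetical reader-side caller, sketched on the assumption that spa_config_enter()/spa_config_exit() accept the same lock set:

/*
 * Hypothetical example (not part of the change): take only the vdev
 * topology lock for reading, verify it with spa_config_held(), and
 * release it on the way out.
 */
static uint64_t
example_count_children(spa_t *spa)
{
	uint64_t children;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) != 0);
	children = spa->spa_root_vdev->vdev_children;
	spa_config_exit(spa, SCL_VDEV, FTAG);

	return (children);
}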
@@ -374,10 +351,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, { vdev_ops_t *ops; char *type; - uint64_t guid = 0; + uint64_t guid = 0, islog, nparity; vdev_t *vd; - ASSERT(spa_config_held(spa, RW_WRITER)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (EINVAL); @@ -401,6 +378,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (EINVAL); + } else if (alloctype == VDEV_ALLOC_L2CACHE) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (EINVAL); } /* @@ -409,47 +389,61 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (EINVAL); - vd = vdev_alloc_common(spa, id, guid, ops); - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) - vd->vdev_path = spa_strdup(vd->vdev_path); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) - vd->vdev_devid = spa_strdup(vd->vdev_devid); + /* + * Determine whether we're a log vdev. + */ + islog = 0; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); + if (islog && spa_version(spa) < SPA_VERSION_SLOGS) + return (ENOTSUP); /* - * Set the nparity propery for RAID-Z vdevs. + * Set the nparity property for RAID-Z vdevs. */ + nparity = -1ULL; if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, - &vd->vdev_nparity) == 0) { + &nparity) == 0) { /* * Currently, we can only support 2 parity devices. */ - if (vd->vdev_nparity > 2) + if (nparity == 0 || nparity > 2) return (EINVAL); /* * Older versions can only support 1 parity device. */ - if (vd->vdev_nparity == 2 && - spa_version(spa) < ZFS_VERSION_RAID6) + if (nparity == 2 && + spa_version(spa) < SPA_VERSION_RAID6) return (ENOTSUP); - } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ - if (spa_version(spa) >= ZFS_VERSION_RAID6) + if (spa_version(spa) >= SPA_VERSION_RAID6) return (EINVAL); - /* * Otherwise, we default to 1 parity device for RAID-Z. */ - vd->vdev_nparity = 1; + nparity = 1; } } else { - vd->vdev_nparity = 0; + nparity = 0; } + ASSERT(nparity != -1ULL); + + vd = vdev_alloc_common(spa, id, guid, ops); + + vd->vdev_islog = islog; + vd->vdev_nparity = nparity; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) + vd->vdev_path = spa_strdup(vd->vdev_path); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) + vd->vdev_devid = spa_strdup(vd->vdev_devid); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, + &vd->vdev_physpath) == 0) + vd->vdev_physpath = spa_strdup(vd->vdev_physpath); /* * Set the whole_disk property. If it's not specified, leave the value @@ -463,8 +457,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &vd->vdev_not_present); + if (!spa->spa_import_faulted) + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &vd->vdev_not_present); /* * Get the alignment requirement. 
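The RAID-Z parity handling above reduces to a small rule set; restated here as a hypothetical stand-alone helper (example_parse_parity is illustration only, not part of the change):

/*
 * Restates the rules encoded in vdev_alloc() above: an explicit
 * nparity must be 1 or 2, nparity == 2 requires SPA_VERSION_RAID6,
 * pools at or past SPA_VERSION_RAID6 must state parity explicitly,
 * and older pools default to single parity.
 */
static int
example_parse_parity(nvlist_t *nv, uint64_t version, uint64_t *nparity)
{
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity) == 0) {
		if (*nparity == 0 || *nparity > 2)
			return (EINVAL);
		if (*nparity == 2 && version < SPA_VERSION_RAID6)
			return (ENOTSUP);
	} else {
		if (version >= SPA_VERSION_RAID6)
			return (EINVAL);
		*nparity = 1;
	}
	return (0);
}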
@@ -484,13 +479,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } /* - * If we're a leaf vdev, try to load the DTL object and offline state. + * If we're a leaf vdev, try to load the DTL object and other state. */ - if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) { - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, - &vd->vdev_dtl.smo_object); + if (vd->vdev_ops->vdev_op_leaf && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { + if (alloctype == VDEV_ALLOC_LOAD) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, + &vd->vdev_dtl.smo_object); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, + &vd->vdev_unspare); + } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); + + /* + * When importing a pool, we want to ignore the persistent fault + * state, as the diagnosis made on another system may not be + * valid in the current context. + */ + if (spa->spa_load_state == SPA_LOAD_OPEN) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, + &vd->vdev_faulted); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, + &vd->vdev_degraded); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, + &vd->vdev_removed); + } } /* @@ -507,6 +521,7 @@ void vdev_free(vdev_t *vd) { int c; + spa_t *spa = vd->vdev_spa; /* * vdev_free() implies closing the vdev first. This is simpler than @@ -514,7 +529,7 @@ vdev_free(vdev_t *vd) */ vdev_close(vd); - ASSERT(!list_link_active(&vd->vdev_dirty_node)); + ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); /* * Free all children. @@ -542,7 +557,40 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_parent == NULL); - vdev_free_common(vd); + /* + * Clean up vdev structure. + */ + vdev_queue_fini(vd); + vdev_cache_fini(vd); + + if (vd->vdev_path) + spa_strfree(vd->vdev_path); + if (vd->vdev_devid) + spa_strfree(vd->vdev_devid); + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + + if (vd->vdev_isspare) + spa_spare_remove(vd); + if (vd->vdev_isl2cache) + spa_l2cache_remove(vd); + + txg_list_destroy(&vd->vdev_ms_list); + txg_list_destroy(&vd->vdev_dtl_list); + mutex_enter(&vd->vdev_dtl_lock); + space_map_unload(&vd->vdev_dtl_map); + space_map_destroy(&vd->vdev_dtl_map); + space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + space_map_destroy(&vd->vdev_dtl_scrub); + mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_stat_lock); + mutex_destroy(&vd->vdev_probe_lock); + + if (vd == spa->spa_root_vdev) + spa->spa_root_vdev = NULL; + + kmem_free(vd, sizeof (vdev_t)); } /* @@ -592,16 +640,21 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } - if (list_link_active(&svd->vdev_dirty_node)) { + if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } - tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted; - svd->vdev_reopen_wanted = 0; + if (list_link_active(&svd->vdev_state_dirty_node)) { + vdev_state_clean(svd); + vdev_state_dirty(tvd); + } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; + + tvd->vdev_islog = svd->vdev_islog; + svd->vdev_islog = 0; } static void @@ -628,7 +681,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; - ASSERT(spa_config_held(spa, RW_WRITER)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); @@ -657,7 +710,7 @@ vdev_remove_parent(vdev_t *cvd) vdev_t *mvd = 
cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; - ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); + ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || @@ -667,22 +720,16 @@ vdev_remove_parent(vdev_t *cvd) vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); - cvd->vdev_id = mvd->vdev_id; - vdev_add_child(pvd, cvd); /* - * If we created a new toplevel vdev, then we need to change the child's - * vdev GUID to match the old toplevel vdev. Otherwise, we could have - * detached an offline device, and when we go to import the pool we'll - * think we have two toplevel vdevs, instead of a different version of - * the same toplevel vdev. + * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. + * Otherwise, we could have detached an offline device, and when we + * go to import the pool we'll think we have two top-level vdevs, + * instead of a different version of the same top-level vdev. */ - if (cvd->vdev_top == cvd) { - pvd->vdev_guid_sum -= cvd->vdev_guid; - cvd->vdev_guid_sum -= cvd->vdev_guid; - cvd->vdev_guid = mvd->vdev_guid; - cvd->vdev_guid_sum += mvd->vdev_guid; - pvd->vdev_guid_sum += cvd->vdev_guid; - } + if (mvd->vdev_top == mvd) + cvd->vdev_guid = cvd->vdev_guid_sum = mvd->vdev_guid; + cvd->vdev_id = mvd->vdev_id; + vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) @@ -697,7 +744,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; - metaslab_class_t *mc = spa_metaslab_class_select(spa); + metaslab_class_t *mc; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; @@ -707,10 +754,13 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ return (0); - dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc); - ASSERT(oldc <= newc); + if (vd->vdev_islog) + mc = spa->spa_log_class; + else + mc = spa->spa_normal_class; + if (vd->vdev_mg == NULL) vd->vdev_mg = metaslab_group_create(mc, vd); @@ -737,8 +787,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) error = dmu_bonus_hold(mos, object, FTAG, &db); if (error) return (error); - ASSERT3U(db->db_size, ==, sizeof (smo)); - bcopy(db->db_data, &smo, db->db_size); + ASSERT3U(db->db_size, >=, sizeof (smo)); + bcopy(db->db_data, &smo, sizeof (smo)); ASSERT3U(smo.smo_object, ==, object); dmu_buf_rele(db, FTAG); } @@ -765,6 +815,112 @@ vdev_metaslab_fini(vdev_t *vd) } } +typedef struct vdev_probe_stats { + boolean_t vps_readable; + boolean_t vps_writeable; + int vps_flags; + zio_t *vps_root; + vdev_t *vps_vd; +} vdev_probe_stats_t; + +static void +vdev_probe_done(zio_t *zio) +{ + vdev_probe_stats_t *vps = zio->io_private; + vdev_t *vd = vps->vps_vd; + + if (zio->io_type == ZIO_TYPE_READ) { + ASSERT(zio->io_vd == vd); + if (zio->io_error == 0) + vps->vps_readable = 1; + if (zio->io_error == 0 && (spa_mode & FWRITE)) { + zio_nowait(zio_write_phys(vps->vps_root, vd, + zio->io_offset, zio->io_size, zio->io_data, + ZIO_CHECKSUM_OFF, vdev_probe_done, vps, + ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); + } else { + zio_buf_free(zio->io_data, zio->io_size); + } + } else if (zio->io_type == ZIO_TYPE_WRITE) { + ASSERT(zio->io_vd == vd); + if (zio->io_error == 0) + vps->vps_writeable = 1; + zio_buf_free(zio->io_data, zio->io_size); + } else if (zio->io_type == ZIO_TYPE_NULL) { + ASSERT(zio->io_vd == NULL); + 
ASSERT(zio == vps->vps_root); + + vd->vdev_cant_read |= !vps->vps_readable; + vd->vdev_cant_write |= !vps->vps_writeable; + + if (vdev_readable(vd) && + (vdev_writeable(vd) || !(spa_mode & FWRITE))) { + zio->io_error = 0; + } else { + ASSERT(zio->io_error != 0); + zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, + zio->io_spa, vd, NULL, 0, 0); + zio->io_error = ENXIO; + } + kmem_free(vps, sizeof (*vps)); + } +} + +/* + * Determine whether this device is accessible by reading and writing + * to several known locations: the pad regions of each vdev label + * but the first (which we leave alone in case it contains a VTOC). + */ +zio_t * +vdev_probe(vdev_t *vd, zio_t *pio) +{ + spa_t *spa = vd->vdev_spa; + vdev_probe_stats_t *vps; + zio_t *zio; + + vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + + vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_DONT_RETRY; + + if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { + /* + * vdev_cant_read and vdev_cant_write can only transition + * from TRUE to FALSE when we have the SCL_ZIO lock as writer; + * otherwise they can only transition from FALSE to TRUE. + * This ensures that any zio looking at these values can + * assume that failures persist for the life of the I/O. + * That's important because when a device has intermittent + * connectivity problems, we want to ensure that they're + * ascribed to the device (ENXIO) and not the zio (EIO). + * + * Since we hold SCL_ZIO as writer here, clear both values + * so the probe can reevaluate from first principles. + */ + vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + } + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + zio = zio_null(pio, spa, vdev_probe_done, vps, vps->vps_flags); + + vps->vps_root = zio; + vps->vps_vd = vd; + + for (int l = 1; l < VDEV_LABELS; l++) { + zio_nowait(zio_read_phys(zio, vd, + vdev_label_offset(vd->vdev_psize, l, + offsetof(vdev_label_t, vl_pad)), + VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE), + ZIO_CHECKSUM_OFF, vdev_probe_done, vps, + ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); + } + + return (zio); +} + /* * Prepare a virtual device for access. 
*/ @@ -781,20 +937,14 @@ vdev_open(vdev_t *vd) vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); - if (vd->vdev_fault_mode == VDEV_FAULT_COUNT) - vd->vdev_fault_arg >>= 1; - else - vd->vdev_fault_mode = VDEV_FAULT_NONE; - vd->vdev_stat.vs_aux = VDEV_AUX_NONE; - if (vd->vdev_ops->vdev_op_leaf) { - vdev_cache_init(vd); - vdev_queue_init(vd); - vd->vdev_cache_active = B_TRUE; - } - - if (vd->vdev_offline) { + if (!vd->vdev_removed && vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); + return (ENXIO); + } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (ENXIO); @@ -805,16 +955,25 @@ vdev_open(vdev_t *vd) if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, ENXIO); - dprintf("%s = %d, osize %llu, state = %d\n", - vdev_description(vd), error, osize, vd->vdev_state); - if (error) { + if (vd->vdev_removed && + vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) + vd->vdev_removed = B_FALSE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); return (error); } - vd->vdev_state = VDEV_STATE_HEALTHY; + vd->vdev_removed = B_FALSE; + + if (vd->vdev_degraded) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } else { + vd->vdev_state = VDEV_STATE_HEALTHY; + } for (c = 0; c < vd->vdev_children; c++) if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { @@ -883,6 +1042,17 @@ vdev_open(vdev_t *vd) } /* + * Ensure we can issue some IO before declaring the + * vdev open for business. + */ + if (vd->vdev_ops->vdev_op_leaf && + (error = zio_wait(vdev_probe(vd, NULL))) != 0) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_IO_FAILURE); + return (error); + } + + /* * If this is a top-level vdev, compute the raidz-deflation * ratio. Note, we hard-code in 128k (1<<17) because it is the * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE @@ -895,16 +1065,17 @@ vdev_open(vdev_t *vd) } /* - * This allows the ZFS DE to close cases appropriately. If a device - * goes away and later returns, we want to close the associated case. - * But it's not enough to simply post this only when a device goes from - * CANT_OPEN -> HEALTHY. If we reboot the system and the device is - * back, we also need to close the case (otherwise we will try to replay - * it). So we have to post this notifier every time. Since this only - * occurs during pool open or error recovery, this should not be an - * issue. + * If a leaf vdev has a DTL, and seems healthy, then kick off a + * resilver. But don't do this if we are doing a reopen for a + * scrub, since this would just restart the scrub we are already + * doing. */ - zfs_post_ok(vd->vdev_spa, vd); + if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) { + mutex_enter(&vd->vdev_dtl_lock); + if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) + spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER); + mutex_exit(&vd->vdev_dtl_lock); + } return (0); } @@ -912,8 +1083,7 @@ vdev_open(vdev_t *vd) /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't - * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen() - * won't succeed if the device has been changed underneath. + * inadvertently do repair I/Os to the wrong device. 
* * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if @@ -926,7 +1096,7 @@ vdev_validate(vdev_t *vd) spa_t *spa = vd->vdev_spa; int c; nvlist_t *label; - uint64_t guid; + uint64_t guid, top_guid; uint64_t state; for (c = 0; c < vd->vdev_children; c++) @@ -938,7 +1108,7 @@ vdev_validate(vdev_t *vd) * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ - if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) { + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { if ((label = vdev_label_read_config(vd)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, @@ -954,8 +1124,20 @@ vdev_validate(vdev_t *vd) return (0); } + /* + * If this vdev just became a top-level vdev because its + * sibling was detached, it will have adopted the parent's + * vdev guid -- but the label may or may not be on disk yet. + * Fortunately, either version of the label will have the + * same top guid, so if we're a top-level vdev, we can + * safely compare to that instead. + */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, - &guid) != 0 || guid != vd->vdev_guid) { + &guid) != 0 || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, + &top_guid) != 0 || + (vd->vdev_guid != guid && + (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); @@ -975,14 +1157,15 @@ vdev_validate(vdev_t *vd) if (spa->spa_load_state == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (EBADF); - } - /* - * If we were able to open and validate a vdev that was previously - * marked permanently unavailable, clear that state now. - */ - if (vd->vdev_not_present) - vd->vdev_not_present = 0; + /* + * If we were able to open and validate a vdev that was + * previously marked permanently unavailable, clear that state + * now. + */ + if (vd->vdev_not_present) + vd->vdev_not_present = 0; + } return (0); } @@ -995,11 +1178,7 @@ vdev_close(vdev_t *vd) { vd->vdev_ops->vdev_op_close(vd); - if (vd->vdev_cache_active) { - vdev_cache_fini(vd); - vdev_queue_fini(vd); - vd->vdev_cache_active = B_FALSE; - } + vdev_cache_purge(vd); /* * We record the previous state before we close it, so that if we are @@ -1020,7 +1199,7 @@ vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - ASSERT(spa_config_held(spa, RW_WRITER)); + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); vdev_close(vd); (void) vdev_open(vd); @@ -1029,22 +1208,24 @@ vdev_reopen(vdev_t *vd) * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). - * - * The downside to this is that if the user is simply experimenting by - * overwriting an entire disk, we'll fault the device rather than - * demonstrate self-healing capabilities. On the other hand, with - * proper FMA integration, the series of errors we'd see from the device - * would result in a faulted device anyway. Given that this doesn't - * model any real-world corruption, it's better to catch this here and - * correctly identify that the device has either changed beneath us, or - * is corrupted beyond recognition. 
*/ - (void) vdev_validate(vd); + if (vd->vdev_aux) { + (void) vdev_validate_aux(vd); + if (vdev_readable(vd) && vdev_writeable(vd) && + !l2arc_vdev_present(vd)) { + uint64_t size = vdev_get_rsize(vd); + l2arc_add_vdev(spa, vd, + VDEV_LABEL_START_SIZE, + size - VDEV_LABEL_START_SIZE); + } + } else { + (void) vdev_validate(vd); + } /* - * Reassess root vdev's health. + * Reassess parent vdev's health. */ - vdev_propagate_state(spa->spa_root_vdev); + vdev_propagate_state(vd); } int @@ -1150,22 +1331,27 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) spa_t *spa = vd->vdev_spa; int c; - ASSERT(spa_config_held(spa, RW_WRITER)); + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); - /* - * We're successfully scrubbed everything up to scrub_txg. - * Therefore, excise all old DTLs up to that point, then - * fold in the DTLs for everything we couldn't scrub. - */ - if (scrub_txg != 0) { + if (scrub_txg != 0 && + (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { + /* XXX should check scrub_done? */ + /* + * We completed a scrub up to scrub_txg. If we + * did it without rebooting, then the scrub dtl + * will be valid, so excise the old region and + * fold in the scrub dtl. Otherwise, leave the + * dtl as-is if there was an error. + */ space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); } if (scrub_done) space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); mutex_exit(&vd->vdev_dtl_lock); + if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; @@ -1212,8 +1398,8 @@ vdev_dtl_load(vdev_t *vd) if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) return (error); - ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(db->db_data, smo, db->db_size); + ASSERT3U(db->db_size, >=, sizeof (*smo)); + bcopy(db->db_data, smo, sizeof (*smo)); dmu_buf_rele(db, FTAG); mutex_enter(&vd->vdev_dtl_lock); @@ -1235,9 +1421,6 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) dmu_buf_t *db; dmu_tx_t *tx; - dprintf("%s in txg %llu pass %d\n", - vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached) { @@ -1247,8 +1430,6 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) smo->smo_object = 0; } dmu_tx_commit(tx); - dprintf("detach %s committed in txg %llu\n", - vdev_description(vd), txg); return; } @@ -1283,13 +1464,56 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(smo, db->db_data, db->db_size); + ASSERT3U(db->db_size, >=, sizeof (*smo)); + bcopy(smo, db->db_data, sizeof (*smo)); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); } +/* + * Determine if resilver is needed, and if so the txg range. 
+ */ +boolean_t +vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) +{ + boolean_t needed = B_FALSE; + uint64_t thismin = UINT64_MAX; + uint64_t thismax = 0; + + if (vd->vdev_children == 0) { + mutex_enter(&vd->vdev_dtl_lock); + if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) { + space_seg_t *ss; + + ss = avl_first(&vd->vdev_dtl_map.sm_root); + thismin = ss->ss_start - 1; + ss = avl_last(&vd->vdev_dtl_map.sm_root); + thismax = ss->ss_end; + needed = B_TRUE; + } + mutex_exit(&vd->vdev_dtl_lock); + } else { + int c; + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + uint64_t cmin, cmax; + + if (vdev_resilver_needed(cvd, &cmin, &cmax)) { + thismin = MIN(thismin, cmin); + thismax = MAX(thismax, cmax); + needed = B_TRUE; + } + } + } + + if (needed && minp) { + *minp = thismin; + *maxp = thismax; + } + return (needed); +} + void vdev_load(vdev_t *vd) { @@ -1319,19 +1543,22 @@ } /* - * This special case of vdev_spare() is used for hot spares. It's sole purpose - * it to set the vdev state for the associated vdev. To do this, we make sure - * that we can open the underlying device, then try to read the label, and make - * sure that the label is sane and that it hasn't been repurposed to another - * pool. + * The special vdev case is used for hot spares and l2cache devices. Its + * sole purpose is to set the vdev state for the associated vdev. To do this, + * we make sure that we can open the underlying device, then try to read the + * label, and make sure that the label is sane and that it hasn't been + * repurposed to another pool. */ int -vdev_validate_spare(vdev_t *vd) +vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; + if (!vdev_readable(vd)) + return (0); + if ((label = vdev_label_read_config(vd)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -1339,7 +1566,7 @@ } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || - version > ZFS_VERSION || + version > SPA_VERSION || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { @@ -1349,8 +1576,6 @@ return (-1); } - spa_spare_add(vd); - /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. @@ -1364,8 +1589,6 @@ vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; - dprintf("%s txg %llu\n", vdev_description(vd), txg); - while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) metaslab_sync_done(msp, txg); } @@ -1378,9 +1601,6 @@ vdev_sync(vdev_t *vd, uint64_t txg) metaslab_t *msp; dmu_tx_t *tx; - dprintf("%s txg %llu pass %d\n", - vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); - if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { ASSERT(vd == vd->vdev_top); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -1408,81 +1628,139 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize) return (vd->vdev_ops->vdev_op_asize(vd, psize)); } -void -vdev_io_start(zio_t *zio) +/* + * Mark the given vdev faulted. A faulted vdev behaves as if the device could + * not be opened, and no I/O is attempted.
+ */ +int +vdev_fault(spa_t *spa, uint64_t guid) { - zio->io_vd->vdev_ops->vdev_op_io_start(zio); -} + vdev_t *vd; -void -vdev_io_done(zio_t *zio) -{ - zio->io_vd->vdev_ops->vdev_op_io_done(zio); + spa_vdev_state_enter(spa); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + /* + * Faulted state takes precedence over degraded. + */ + vd->vdev_faulted = 1ULL; + vd->vdev_degraded = 0ULL; + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); + + /* + * If marking the vdev as faulted causes the top-level vdev to become + * unavailable, then back off and simply mark the vdev as degraded + * instead. + */ + if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { + vd->vdev_degraded = 1ULL; + vd->vdev_faulted = 0ULL; + + /* + * If we reopen the device and it's not dead, only then do we + * mark it degraded. + */ + vdev_reopen(vd); + + if (vdev_readable(vd)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } + } + + return (spa_vdev_state_exit(spa, vd, 0)); } -const char * -vdev_description(vdev_t *vd) +/* + * Mark the given vdev degraded. A degraded vdev is purely an indication to the + * user that something is wrong. The vdev continues to operate as normal as far + * as I/O is concerned. + */ +int +vdev_degrade(spa_t *spa, uint64_t guid) { - if (vd == NULL || vd->vdev_ops == NULL) - return ("<unknown>"); + vdev_t *vd; + + spa_vdev_state_enter(spa); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); - if (vd->vdev_path != NULL) - return (vd->vdev_path); + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + /* + * If the vdev is already faulted, then don't do anything. + */ + if (vd->vdev_faulted || vd->vdev_degraded) + return (spa_vdev_state_exit(spa, NULL, 0)); - if (vd->vdev_parent == NULL) - return (spa_name(vd->vdev_spa)); + vd->vdev_degraded = 1ULL; + if (!vdev_is_dead(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); - return (vd->vdev_ops->vdev_op_type); + return (spa_vdev_state_exit(spa, vd, 0)); } +/* + * Online the given vdev. If 'unspare' is set, it implies two things. First, + * any attached spare device should be detached when the device finishes + * resilvering. Second, the online should be treated like a 'test' online case, + * so no FMA events are generated if the device fails to open.
+ */ int -vdev_online(spa_t *spa, uint64_t guid) +vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { - vdev_t *rvd, *vd; - uint64_t txg; - - txg = spa_vdev_enter(spa); + vdev_t *vd; - rvd = spa->spa_root_vdev; + spa_vdev_state_enter(spa); - if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) - return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - dprintf("ONLINE: %s\n", vdev_description(vd)); + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; + vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); + vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); vdev_reopen(vd->vdev_top); + vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; - vdev_config_dirty(vd->vdev_top); + if (newstate) + *newstate = vd->vdev_state; + if ((flags & ZFS_ONLINE_UNSPARE) && + !vdev_is_dead(vd) && vd->vdev_parent && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; - (void) spa_vdev_exit(spa, NULL, txg, 0); + (void) spa_vdev_state_exit(spa, vd, 0); - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); return (0); } int -vdev_offline(spa_t *spa, uint64_t guid, int istmp) +vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { - vdev_t *rvd, *vd; - uint64_t txg; - - txg = spa_vdev_enter(spa); + vdev_t *vd; - rvd = spa->spa_root_vdev; + spa_vdev_state_enter(spa); - if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) - return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - dprintf("OFFLINE: %s\n", vdev_description(vd)); + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); /* * If the device isn't already offline, try to offline it. @@ -1496,7 +1774,7 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp) * as long as the remaining devices don't have any DTL holes. */ if (vd->vdev_top->vdev_dtl_map.sm_space != 0) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* * Offline this device and reopen its top-level vdev. 
@@ -1505,18 +1783,16 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp) */ vd->vdev_offline = B_TRUE; vdev_reopen(vd->vdev_top); - if (vdev_is_dead(vd->vdev_top)) { + if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { vd->vdev_offline = B_FALSE; vdev_reopen(vd->vdev_top); - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + return (spa_vdev_state_exit(spa, NULL, EBUSY)); } } - vd->vdev_tmpoffline = istmp; + vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); - vdev_config_dirty(vd->vdev_top); - - return (spa_vdev_exit(spa, NULL, txg, 0)); + return (spa_vdev_state_exit(spa, vd, 0)); } /* @@ -1527,56 +1803,78 @@ vdev_offline(spa_t *spa, uint64_t guid, int istmp) void vdev_clear(spa_t *spa, vdev_t *vd) { - int c; + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) - vd = spa->spa_root_vdev; + vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); + + /* + * If we're in the FAULTED state or have experienced failed I/O, then + * clear the persistent state and attempt to reopen the device. We + * also mark the vdev config dirty, so that the new faulted state is + * written out to disk. + */ + if (vd->vdev_faulted || vd->vdev_degraded || + !vdev_readable(vd) || !vdev_writeable(vd)) { + + vd->vdev_faulted = vd->vdev_degraded = 0; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + + vdev_reopen(vd); + + if (vd != rvd) + vdev_state_dirty(vd->vdev_top); + + if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) + spa_async_request(spa, SPA_ASYNC_RESILVER); + + spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); + } } -int +boolean_t vdev_is_dead(vdev_t *vd) { - return (vd->vdev_state <= VDEV_STATE_CANT_OPEN); + return (vd->vdev_state < VDEV_STATE_DEGRADED); } -int -vdev_error_inject(vdev_t *vd, zio_t *zio) +boolean_t +vdev_readable(vdev_t *vd) +{ + return (!vdev_is_dead(vd) && !vd->vdev_cant_read); +} + +boolean_t +vdev_writeable(vdev_t *vd) { - int error = 0; + return (!vdev_is_dead(vd) && !vd->vdev_cant_write); +} - if (vd->vdev_fault_mode == VDEV_FAULT_NONE) - return (0); +boolean_t +vdev_accessible(vdev_t *vd, zio_t *zio) +{ + ASSERT(zio->io_vd == vd); - if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0) - return (0); + if (vdev_is_dead(vd) || vd->vdev_remove_wanted) + return (B_FALSE); - switch (vd->vdev_fault_mode) { - case VDEV_FAULT_RANDOM: - if (spa_get_random(vd->vdev_fault_arg) == 0) - error = EIO; - break; - - case VDEV_FAULT_COUNT: - if ((int64_t)--vd->vdev_fault_arg <= 0) - vd->vdev_fault_mode = VDEV_FAULT_NONE; - error = EIO; - break; - } + if (zio->io_type == ZIO_TYPE_READ) + return (!vd->vdev_cant_read); - if (error != 0) { - dprintf("returning %d for type %d on %s state %d offset %llx\n", - error, zio->io_type, vdev_description(vd), - vd->vdev_state, zio->io_offset); - } + if (zio->io_type == ZIO_TYPE_WRITE) + return (!vd->vdev_cant_write); - return (error); + return (B_TRUE); } /* @@ -1586,10 +1884,10 @@ void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { vdev_t *rvd = vd->vdev_spa->spa_root_vdev; - int c, t; mutex_enter(&vd->vdev_stat_lock); bcopy(&vd->vdev_stat, vs, sizeof (*vs)); + vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_rsize(vd); @@ -1600,49 +1898,80 @@ vdev_get_stats(vdev_t *vd, 
vdev_stat_t *vs) * over all top-level vdevs (i.e. the direct children of the root). */ if (vd == rvd) { - for (c = 0; c < rvd->vdev_children; c++) { + for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *cvd = rvd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; mutex_enter(&vd->vdev_stat_lock); - for (t = 0; t < ZIO_TYPES; t++) { + for (int t = 0; t < ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } - vs->vs_read_errors += cvs->vs_read_errors; - vs->vs_write_errors += cvs->vs_write_errors; - vs->vs_checksum_errors += cvs->vs_checksum_errors; vs->vs_scrub_examined += cvs->vs_scrub_examined; - vs->vs_scrub_errors += cvs->vs_scrub_errors; mutex_exit(&vd->vdev_stat_lock); } } } void -vdev_stat_update(zio_t *zio) +vdev_clear_stats(vdev_t *vd) { - vdev_t *vd = zio->io_vd; + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_space = 0; + vd->vdev_stat.vs_dspace = 0; + vd->vdev_stat.vs_alloc = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +void +vdev_stat_update(zio_t *zio, uint64_t psize) +{ + vdev_t *rvd = zio->io_spa->spa_root_vdev; + vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; zio_type_t type = zio->io_type; int flags = zio->io_flags; + /* + * If this i/o is a gang leader, it didn't do any actual work. + */ + if (zio->io_gang_tree) + return; + if (zio->io_error == 0) { + /* + * If this is a root i/o, don't count it -- we've already + * counted the top-level vdevs, and vdev_get_stats() will + * aggregate them when asked. This reduces contention on + * the root vdev_stat_lock and implicitly handles blocks + * that compress away to holes, for which there is no i/o. + * (Holes never create vdev children, so all the counters + * remain zero, which is what we want.) + * + * Note: this only applies to successful i/o (io_error == 0) + * because unlike i/o counts, errors are not additive. + * When reading a ditto block, for example, failure of + * one top-level vdev does not imply a root-level error. 
+ */ + if (vd == rvd) + return; + + ASSERT(vd == zio->io_vd); if (!(flags & ZIO_FLAG_IO_BYPASS)) { mutex_enter(&vd->vdev_stat_lock); vs->vs_ops[type]++; - vs->vs_bytes[type] += zio->io_size; + vs->vs_bytes[type] += psize; mutex_exit(&vd->vdev_stat_lock); } - if ((flags & ZIO_FLAG_IO_REPAIR) && - zio->io_delegate_list == NULL) { + if (flags & ZIO_FLAG_IO_REPAIR) { + ASSERT(zio->io_delegate_list == NULL); mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_SCRUB_THREAD) - vs->vs_scrub_repaired += zio->io_size; + vs->vs_scrub_repaired += psize; else - vs->vs_self_healed += zio->io_size; + vs->vs_self_healed += psize; mutex_exit(&vd->vdev_stat_lock); } return; @@ -1651,22 +1980,18 @@ vdev_stat_update(zio_t *zio) if (flags & ZIO_FLAG_SPECULATIVE) return; - if (!vdev_is_dead(vd)) { - mutex_enter(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_READ) { - if (zio->io_error == ECKSUM) - vs->vs_checksum_errors++; - else - vs->vs_read_errors++; - } - if (type == ZIO_TYPE_WRITE) - vs->vs_write_errors++; - mutex_exit(&vd->vdev_stat_lock); + mutex_enter(&vd->vdev_stat_lock); + if (type == ZIO_TYPE_READ) { + if (zio->io_error == ECKSUM) + vs->vs_checksum_errors++; + else + vs->vs_read_errors++; } + if (type == ZIO_TYPE_WRITE) + vs->vs_write_errors++; + mutex_exit(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_WRITE) { - if (txg == 0 || vd->vdev_children != 0) - return; + if (type == ZIO_TYPE_WRITE && txg != 0 && vd->vdev_children == 0) { if (flags & ZIO_FLAG_SCRUB_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) @@ -1705,7 +2030,6 @@ vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) vs->vs_scrub_complete = 0; vs->vs_scrub_examined = 0; vs->vs_scrub_repaired = 0; - vs->vs_scrub_errors = 0; vs->vs_scrub_start = gethrestime_sec(); vs->vs_scrub_end = 0; } @@ -1717,33 +2041,48 @@ vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) * Update the in-core space usage stats for this vdev and the root vdev. */ void -vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta) +vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, + boolean_t update_root) { - ASSERT(vd == vd->vdev_top); int64_t dspace_delta = space_delta; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; - do { - if (vd->vdev_ms_count) { - /* - * If this is a top-level vdev, apply the - * inverse of its psize-to-asize (ie. RAID-Z) - * space-expansion factor. We must calculate - * this here and not at the root vdev because - * the root vdev's psize-to-asize is simply the - * max of its childrens', thus not accurate - * enough for us. - */ - ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); - dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * - vd->vdev_deflate_ratio; - } + ASSERT(vd == vd->vdev_top); + + /* + * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion + * factor. We must calculate this here and not at the root vdev + * because the root vdev's psize-to-asize is simply the max of its + * children's, thus not accurate enough for us.
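
(Editor's worked example of the deflation arithmetic performed just below; the ratio value is illustrative and assumes vdev_deflate_ratio is derived from vdev_psize_to_asize() at open time.)

    /*
     * With SPA_MINBLOCKSHIFT == 9 (512-byte units):
     *
     *     dspace_delta = (space_delta >> 9) * vdev_deflate_ratio
     *
     * e.g. on a 3-disk single-parity RAID-Z, asize is roughly 3/2 of
     * psize, so the ratio is about 341 (512 * 2/3); an allocated delta
     * of 384K then deflates to (393216 >> 9) * 341 = 261888 bytes,
     * i.e. roughly the 256K of data actually stored.
     */
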
+ */ + ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); + dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * + vd->vdev_deflate_ratio; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_space += space_delta; + vd->vdev_stat.vs_alloc += alloc_delta; + vd->vdev_stat.vs_dspace += dspace_delta; + mutex_exit(&vd->vdev_stat_lock); + + if (update_root) { + ASSERT(rvd == vd->vdev_parent); + ASSERT(vd->vdev_ms_count != 0); + + /* + * Don't count non-normal (e.g. intent log) space as part of + * the pool's capacity. + */ + if (vd->vdev_mg->mg_class != spa->spa_normal_class) + return; - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_space += space_delta; - vd->vdev_stat.vs_alloc += alloc_delta; - vd->vdev_stat.vs_dspace += dspace_delta; - mutex_exit(&vd->vdev_stat_lock); - } while ((vd = vd->vdev_parent) != NULL); + mutex_enter(&rvd->vdev_stat_lock); + rvd->vdev_stat.vs_space += space_delta; + rvd->vdev_stat.vs_alloc += alloc_delta; + rvd->vdev_stat.vs_dspace += dspace_delta; + mutex_exit(&rvd->vdev_stat_lock); + } } /* @@ -1759,13 +2098,53 @@ vdev_config_dirty(vdev_t *vd) int c; /* - * The dirty list is protected by the config lock. The caller must - * either hold the config lock as writer, or must be the sync thread - * (which holds the lock as reader). There's only one sync thread, + * If this is an aux vdev (as with l2cache devices), then we update the + * vdev config manually and set the sync flag. + */ + if (vd->vdev_aux != NULL) { + spa_aux_vdev_t *sav = vd->vdev_aux; + nvlist_t **aux; + uint_t naux; + + for (c = 0; c < sav->sav_count; c++) { + if (sav->sav_vdevs[c] == vd) + break; + } + + if (c == sav->sav_count) { + /* + * We're being removed. There's nothing more to do. + */ + ASSERT(sav->sav_sync == B_TRUE); + return; + } + + sav->sav_sync = B_TRUE; + + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0); + + ASSERT(c < naux); + + /* + * Setting the nvlist in the middle of the array is a little + * sketchy, but it will work. + */ + nvlist_free(aux[c]); + aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE); + + return; + } + + /* + * The dirty list is protected by the SCL_CONFIG lock. The caller + * must either hold SCL_CONFIG as writer, or must be the sync thread + * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ - ASSERT(spa_config_held(spa, RW_WRITER) || - dsl_pool_sync_context(spa_get_dsl(spa))); + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) @@ -1773,8 +2152,8 @@ vdev_config_dirty(vdev_t *vd) } else { ASSERT(vd == vd->vdev_top); - if (!list_link_active(&vd->vdev_dirty_node)) - list_insert_head(&spa->spa_dirty_list, vd); + if (!list_link_active(&vd->vdev_config_dirty_node)) + list_insert_head(&spa->spa_config_dirty_list, vd); } } @@ -1783,14 +2162,58 @@ vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - ASSERT(spa_config_held(spa, RW_WRITER) || - dsl_pool_sync_context(spa_get_dsl(spa))); + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_CONFIG, RW_READER))); + + ASSERT(list_link_active(&vd->vdev_config_dirty_node)); + list_remove(&spa->spa_config_dirty_list, vd); +} + +/* + * Mark a top-level vdev's state as dirty, so that the next pass of + * spa_sync() can convert this into vdev_config_dirty().
We distinguish + * the state changes from larger config changes because they require + * much less locking, and are often needed for administrative actions. + */ +void +vdev_state_dirty(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(vd == vd->vdev_top); + + /* + * The state list is protected by the SCL_STATE lock. The caller + * must either hold SCL_STATE as writer, or must be the sync thread + * (which holds SCL_STATE as reader). There's only one sync thread, + * so this is sufficient to ensure mutual exclusion. + */ + ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_STATE, RW_READER))); - ASSERT(list_link_active(&vd->vdev_dirty_node)); - list_remove(&spa->spa_dirty_list, vd); + if (!list_link_active(&vd->vdev_state_dirty_node)) + list_insert_head(&spa->spa_state_dirty_list, vd); } void +vdev_state_clean(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_STATE, RW_READER))); + + ASSERT(list_link_active(&vd->vdev_state_dirty_node)); + list_remove(&spa->spa_state_dirty_list, vd); +} + +/* + * Propagate vdev state up from children to parent. + */ +void vdev_propagate_state(vdev_t *vd) { vdev_t *rvd = vd->vdev_spa->spa_root_vdev; @@ -1799,28 +2222,45 @@ vdev_propagate_state(vdev_t *vd) int c; vdev_t *child; - for (c = 0; c < vd->vdev_children; c++) { - child = vd->vdev_child[c]; - if (child->vdev_state <= VDEV_STATE_CANT_OPEN) - faulted++; - else if (child->vdev_state == VDEV_STATE_DEGRADED) - degraded++; - - if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) - corrupted++; - } + if (vd->vdev_children > 0) { + for (c = 0; c < vd->vdev_children; c++) { + child = vd->vdev_child[c]; + + if (!vdev_readable(child) || + (!vdev_writeable(child) && (spa_mode & FWRITE))) { + /* + * Root special: if there is a top-level log + * device, treat the root vdev as if it were + * degraded. + */ + if (child->vdev_islog && vd == rvd) + degraded++; + else + faulted++; + } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { + degraded++; + } - vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) + corrupted++; + } - /* - * Root special: if there is a toplevel vdev that cannot be - * opened due to corrupted metadata, then propagate the root - * vdev's aux state as 'corrupt' rather than 'insufficient - * replicas'. - */ - if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) - vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); + vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + + /* + * Root special: if there is a top-level vdev that cannot be + * opened due to corrupted metadata, then propagate the root + * vdev's aux state as 'corrupt' rather than 'insufficient + * replicas'. 
+ */ + if (corrupted && vd == rvd && + rvd->vdev_state == VDEV_STATE_CANT_OPEN) + vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + } + + if (vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); } /* @@ -1835,6 +2275,7 @@ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; + spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { vd->vdev_stat.vs_aux = aux; @@ -1857,14 +2298,36 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); - if (state == VDEV_STATE_CANT_OPEN) { + if (vd->vdev_removed && + state == VDEV_STATE_CANT_OPEN && + (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { + /* + * If the previous state is set to VDEV_STATE_REMOVED, then this + * device was previously marked removed and someone attempted to + * reopen it. If this failed due to a nonexistent device, then + * keep the device in the REMOVED state. We also let this be if + * it is one of our special test online cases, which is only + * attempting to online the device and shouldn't generate an FMA + * fault. + */ + vd->vdev_state = VDEV_STATE_REMOVED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + } else if (state == VDEV_STATE_REMOVED) { + /* + * Indicate to the ZFS DE that this device has been removed, and + * any recent errors should be ignored. + */ + zfs_post_remove(spa, vd); + vd->vdev_removed = B_TRUE; + } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import, we mark it as * "not available", which signifies that it was never there to * begin with. Failure to open such a device is not considered * an error. */ - if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT && + if (spa->spa_load_state == SPA_LOAD_IMPORT && + !spa->spa_import_faulted && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; @@ -1874,9 +2337,18 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. + * + * If the 'checkremove' flag is set, then this is an attempt to + * online the device in response to an insertion event. If we + * hit this case, then we have detected an insertion event for a + * faulted or offline device that wasn't in the removed state. + * In this scenario, we don't post an ereport because we are + * about to replace the device, or attempt an online with + * vdev_forcefault, which will generate the fault for us. 
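
(Editor's paraphrase: the suppression rules above collapse into one predicate, mirrored by the test that follows; the helper is hypothetical and shown only for clarity.)

    static boolean_t
    example_should_post_ereport(spa_t *spa, vdev_t *vd, vdev_state_t state)
    {
        /*
         * Post only for a real state change (or a forced fault), on a
         * device that was actually present, outside a checkremove
         * probe, and never for the root vdev itself.
         */
        return ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
            !vd->vdev_not_present && !vd->vdev_checkremove &&
            vd != spa->spa_root_vdev);
    }
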
*/ - if (vd->vdev_prevstate != state && !vd->vdev_not_present && - vd != vd->vdev_spa->spa_root_vdev) { + if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && + !vd->vdev_not_present && !vd->vdev_checkremove && + vd != spa->spa_root_vdev) { const char *class; switch (aux) { @@ -1898,18 +2370,54 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; + case VDEV_AUX_IO_FAILURE: + class = FM_EREPORT_ZFS_IO_FAILURE; + break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } - zfs_ereport_post(class, vd->vdev_spa, - vd, NULL, save_state, 0); + zfs_ereport_post(class, spa, vd, NULL, save_state, 0); } + + /* Erase any notion of persistent removed state */ + vd->vdev_removed = B_FALSE; + } else { + vd->vdev_removed = B_FALSE; } - if (isopen) - return; + if (!isopen) + vdev_propagate_state(vd); +} - if (vd->vdev_parent != NULL) - vdev_propagate_state(vd->vdev_parent); +/* + * Check the vdev configuration to ensure that it's capable of supporting + * a root pool. Currently, we do not support RAID-Z or partial configuration. + * In addition, only a single top-level vdev is allowed and none of the leaves + * can be wholedisks. + */ +boolean_t +vdev_is_bootable(vdev_t *vd) +{ + int c; + + if (!vd->vdev_ops->vdev_op_leaf) { + char *vdev_type = vd->vdev_ops->vdev_op_type; + + if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && + vd->vdev_children > 1) { + return (B_FALSE); + } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { + return (B_FALSE); + } + } else if (vd->vdev_wholedisk == 1) { + return (B_FALSE); + } + + for (c = 0; c < vd->vdev_children; c++) { + if (!vdev_is_bootable(vd->vdev_child[c])) + return (B_FALSE); + } + return (B_TRUE); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c index 4e419b678eb4..aa8f6f0e5a0f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c @@ -19,16 +19,15 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> #include <sys/zio.h> +#include <sys/kstat.h> /* * Virtual device read-ahead caching. @@ -36,15 +35,16 @@ * This file implements a simple LRU read-ahead cache. When the DMU reads * a given block, it will often want other, nearby blocks soon thereafter. * We take advantage of this by reading a larger disk region and caching - * the result. In the best case, this can turn 256 back-to-back 512-byte - * reads into a single 128k read followed by 255 cache hits; this reduces + * the result. In the best case, this can turn 128 back-to-back 512-byte + * reads into a single 64k read followed by 127 cache hits; this reduces * latency dramatically. In the worst case, it can turn an isolated 512-byte - * read into a 128k read, which doesn't affect latency all that much but is + * read into a 64k read, which doesn't affect latency all that much but is * terribly wasteful of bandwidth. A more intelligent version of the cache * could keep track of access patterns and not do read-ahead unless it sees - * at least two temporally close I/Os to the same region. 
It could also - take advantage of semantic information about the I/O. And it could use - something faster than an AVL tree; that was chosen solely for convenience. + at least two temporally close I/Os to the same region. Currently, only + metadata I/O is inflated. A further enhancement could take advantage of + more semantic information about the I/O. And it could use something + faster than an AVL tree; that was chosen solely for convenience. * * There are five cache operations: allocate, fill, read, write, evict. * @@ -69,13 +69,15 @@ /* * All i/os smaller than zfs_vdev_cache_max will be turned into * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software - * track buffer. At most zfs_vdev_cache_size bytes will be kept in each + * track buffer). At most zfs_vdev_cache_size bytes will be kept in each * vdev's vdev_cache. */ -int zfs_vdev_cache_max = 1<<14; -int zfs_vdev_cache_size = 10ULL << 20; +int zfs_vdev_cache_max = 1<<14; /* 16KB */ +int zfs_vdev_cache_size = 10ULL << 20; /* 10MB */ int zfs_vdev_cache_bshift = 16; +#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */ + SYSCTL_DECL(_vfs_zfs_vdev); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); TUNABLE_INT("vfs.zfs.vdev.cache.max", &zfs_vdev_cache_max); @@ -84,8 +86,25 @@ SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN, TUNABLE_INT("vfs.zfs.vdev.cache.size", &zfs_vdev_cache_size); SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN, &zfs_vdev_cache_size, 0, "Size of VDEV cache"); +TUNABLE_INT("vfs.zfs.vdev.cache.bshift", &zfs_vdev_cache_bshift); +SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, bshift, CTLFLAG_RDTUN, + &zfs_vdev_cache_bshift, 0, "Turn too small requests into 1 << this value"); + +kstat_t *vdc_ksp = NULL; + +typedef struct vdc_stats { + kstat_named_t vdc_stat_delegations; + kstat_named_t vdc_stat_hits; + kstat_named_t vdc_stat_misses; +} vdc_stats_t; + +static vdc_stats_t vdc_stats = { + { "delegations", KSTAT_DATA_UINT64 }, + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 } +}; -#define VCBS (1 << zfs_vdev_cache_bshift) +#define VDCSTAT_BUMP(stat) atomic_add_64(&vdc_stats.stat.value.ui64, 1); static int vdev_cache_offset_compare(const void *a1, const void *a2) @@ -127,10 +146,6 @@ vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) ASSERT(ve->ve_fill_io == NULL); ASSERT(ve->ve_data != NULL); - dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n", - vc, ve->ve_offset, ve->ve_lastused, LBOLT - ve->ve_lastused, - ve->ve_hits, ve->ve_missed_update); - avl_remove(&vc->vc_lastused_tree, ve); avl_remove(&vc->vc_offset_tree, ve); zio_buf_free(ve->ve_data, VCBS); @@ -161,10 +176,8 @@ vdev_cache_allocate(zio_t *zio) if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > zfs_vdev_cache_size) { ve = avl_first(&vc->vc_lastused_tree); - if (ve->ve_fill_io != NULL) { - dprintf("can't evict in %p, still filling\n", vc); + if (ve->ve_fill_io != NULL) return (NULL); - } ASSERT(ve->ve_hits != 0); vdev_cache_evict(vc, ve); } @@ -239,7 +252,7 @@ vdev_cache_fill(zio_t *zio) zio->io_delegate_list = dio->io_delegate_next; dio->io_delegate_next = NULL; dio->io_error = zio->io_error; - zio_next_stage(dio); + zio_execute(dio); } } @@ -287,6 +300,7 @@ vdev_cache_read(zio_t *zio) fio->io_delegate_list = zio; zio_vdev_io_bypass(zio); mutex_exit(&vc->vc_lock); + VDCSTAT_BUMP(vdc_stat_delegations); return (0); } @@ -294,7 +308,8 @@ vdev_cache_read(zio_t *zio) zio_vdev_io_bypass(zio); mutex_exit(&vc->vc_lock); -
zio_next_stage(zio); + zio_execute(zio); + VDCSTAT_BUMP(vdc_stat_hits); return (0); } @@ -305,11 +320,9 @@ vdev_cache_read(zio_t *zio) return (ENOMEM); } - fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset, + fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK, - vdev_cache_fill, ve); + ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); ve->ve_fill_io = fio; fio->io_delegate_list = zio; @@ -317,6 +330,7 @@ vdev_cache_read(zio_t *zio) mutex_exit(&vc->vc_lock); zio_nowait(fio); + VDCSTAT_BUMP(vdc_stat_misses); return (0); } @@ -361,6 +375,18 @@ vdev_cache_write(zio_t *zio) } void +vdev_cache_purge(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve; + + mutex_enter(&vc->vc_lock); + while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) + vdev_cache_evict(vc, ve); + mutex_exit(&vc->vc_lock); +} + +void vdev_cache_init(vdev_t *vd) { vdev_cache_t *vc = &vd->vdev_cache; @@ -380,15 +406,32 @@ void vdev_cache_fini(vdev_t *vd) { vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve; - mutex_enter(&vc->vc_lock); - while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) - vdev_cache_evict(vc, ve); - mutex_exit(&vc->vc_lock); + vdev_cache_purge(vd); avl_destroy(&vc->vc_offset_tree); avl_destroy(&vc->vc_lastused_tree); mutex_destroy(&vc->vc_lock); } + +void +vdev_cache_stat_init(void) +{ + vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", + KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (vdc_ksp != NULL) { + vdc_ksp->ks_data = &vdc_stats; + kstat_install(vdc_ksp); + } +} + +void +vdev_cache_stat_fini(void) +{ + if (vdc_ksp != NULL) { + kstat_delete(vdc_ksp); + vdc_ksp = NULL; + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c index b965b1c5f09f..35d4e2a9200d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c @@ -19,19 +19,19 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/refcount.h> #include <sys/vdev_disk.h> #include <sys/vdev_impl.h> #include <sys/fs/zfs.h> #include <sys/zio.h> #include <sys/sunldi.h> +#include <sys/fm/fs/zfs.h> /* * Virtual device vector for disks. @@ -50,6 +50,8 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) vdev_disk_t *dvd; struct dk_minfo dkm; int error; + dev_t dev; + int otyp; /* * We must have a pathname, and it must be absolute. @@ -77,6 +79,11 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * 3. Otherwise, the device may have moved. Try opening the device * by the devid instead. * + * If the vdev is part of the root pool, we avoid opening it by path. + * We do this because there is no /dev path available early in boot, + * and if we try to open the device by path at a later point, we can + * deadlock when devfsadm attempts to open the underlying backing store + * file. 
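
(Editor's gloss of the fallback chain implemented in vdev_disk_open() around this comment; it summarizes the code shown here, adding no behavior.)

    /*
     * Open order, most reliable first:
     *   1. devid + minor name  -- survives device renumbering
     *   2. physical path       -- tried only if the devid open failed
     *   3. logical /dev path   -- last resort, and skipped for root
     *                             pools to avoid the devfsadm deadlock
     *                             described above
     * Higher-level label validation catches a wrong device afterwards.
     */
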
*/ if (vd->vdev_devid != NULL) { if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, @@ -88,7 +95,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) error = EINVAL; /* presume failure */ - if (vd->vdev_path != NULL) { + if (vd->vdev_path != NULL && !spa_is_root(vd->vdev_spa)) { ddi_devid_t devid; if (vd->vdev_wholedisk == -1ULL) { @@ -141,12 +148,60 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, spa_mode, kcred, &dvd->vd_lh, zfs_li); + /* + * If all else fails, then try opening by physical path (if available) + * or the logical path (if we failed due to the devid check). While not + * as reliable as the devid, this will give us something, and the higher + * level vdev validation will prevent us from opening the wrong device. + */ + if (error) { + if (vd->vdev_physpath != NULL && + (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != ENODEV) + error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode, + kcred, &dvd->vd_lh, zfs_li); + + /* + * Note that we don't support the legacy auto-wholedisk support + * as above. This hasn't been used in a very long time and we + * don't need to propagate its oddities to this edge condition. + */ + if (error && vd->vdev_path != NULL && + !spa_is_root(vd->vdev_spa)) + error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, + &dvd->vd_lh, zfs_li); + } + if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } /* + * Once a device is opened, verify that the physical device path (if + * available) is up to date. + */ + if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && + ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { + char *physpath, *minorname; + + physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); + minorname = NULL; + if (ddi_dev_pathname(dev, otyp, physpath) == 0 && + ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && + (vd->vdev_physpath == NULL || + strcmp(vd->vdev_physpath, physpath) != 0)) { + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + (void) strlcat(physpath, ":", MAXPATHLEN); + (void) strlcat(physpath, minorname, MAXPATHLEN); + vd->vdev_physpath = spa_strdup(physpath); + } + if (minorname) + kmem_free(minorname, strlen(minorname) + 1); + kmem_free(physpath, MAXPATHLEN); + } + + /* * Determine the actual size of the device. */ if (ldi_get_size(dvd->vd_lh, psize) != 0) { @@ -191,10 +246,6 @@ vdev_disk_close(vdev_t *vd) if (dvd == NULL) return; - dprintf("removing disk %s, devid %s\n", - vd->vdev_path ? vd->vdev_path : "<none>", - vd->vdev_devid ? vd->vdev_devid : "<none>"); - if (dvd->vd_minor != NULL) ddi_devid_str_free(dvd->vd_minor); @@ -208,18 +259,59 @@ vdev_disk_close(vdev_t *vd) vd->vdev_tsd = NULL; } +int +vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size, + uint64_t offset, int flags) +{ + buf_t *bp; + int error = 0; + + if (vd_lh == NULL) + return (EINVAL); + + ASSERT(flags & B_READ || flags & B_WRITE); + + bp = getrbuf(KM_SLEEP); + bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST; + bp->b_bcount = size; + bp->b_un.b_addr = (void *)data; + bp->b_lblkno = lbtodb(offset); + bp->b_bufsize = size; + + error = ldi_strategy(vd_lh, bp); + ASSERT(error == 0); + if ((error = biowait(bp)) == 0 && bp->b_resid != 0) + error = EIO; + freerbuf(bp); + + return (error); +} + static void vdev_disk_io_intr(buf_t *bp) { vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp; zio_t *zio = vdb->vdb_io; - if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0) + /* + * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. 
+ * Rather than teach the rest of the stack about other error + * possibilities (EFAULT, etc), we normalize the error value here. + */ + zio->io_error = (geterror(bp) != 0 ? EIO : 0); + + if (zio->io_error == 0 && bp->b_resid != 0) zio->io_error = EIO; kmem_free(vdb, sizeof (vdev_disk_buf_t)); - zio_next_stage_async(zio); + zio_interrupt(zio); +} + +static void +vdev_disk_ioctl_free(zio_t *zio) +{ + kmem_free(zio->io_vsd, sizeof (struct dk_callback)); } static void @@ -229,26 +321,24 @@ vdev_disk_ioctl_done(void *zio_arg, int error) zio->io_error = error; - zio_next_stage_async(zio); + zio_interrupt(zio); } -static void +static int vdev_disk_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_disk_t *dvd = vd->vdev_tsd; vdev_disk_buf_t *vdb; + struct dk_callback *dkc; buf_t *bp; - int flags, error; + int error; if (zio->io_type == ZIO_TYPE_IOCTL) { - zio_vdev_io_bypass(zio); - /* XXPOLICY */ - if (vdev_is_dead(vd)) { + if (!vdev_readable(vd)) { zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } switch (zio->io_cmd) { @@ -263,12 +353,15 @@ vdev_disk_io_start(zio_t *zio) break; } - zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done; - zio->io_dk_callback.dkc_cookie = zio; + zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); + zio->io_vsd_free = vdev_disk_ioctl_free; + + dkc->dkc_callback = vdev_disk_ioctl_done; + dkc->dkc_flag = FLUSH_VOLATILE; + dkc->dkc_cookie = zio; error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, - (uintptr_t)&zio->io_dk_callback, - FKIOCTL, kcred, NULL); + (uintptr_t)dkc, FKIOCTL, kcred, NULL); if (error == 0) { /* @@ -276,13 +369,16 @@ vdev_disk_io_start(zio_t *zio) * and will call vdev_disk_ioctl_done() * upon completion. */ - return; - } else if (error == ENOTSUP) { + return (ZIO_PIPELINE_STOP); + } + + if (error == ENOTSUP || error == ENOTTY) { /* - * If we get ENOTSUP, we know that no future - * attempts will ever succeed. In this case we - * set a persistent bit so that we don't bother - * with the ioctl in the future. + * If we get ENOTSUP or ENOTTY, we know that + * no future attempts will ever succeed. + * In this case we set a persistent bit so + * that we don't bother with the ioctl in the + * future. */ vd->vdev_nowritecache = B_TRUE; } @@ -294,61 +390,51 @@ vdev_disk_io_start(zio_t *zio) zio->io_error = ENOTSUP; } - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return; - - if ((zio = vdev_queue_io(zio)) == NULL) - return; - - flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); - flags |= B_BUSY | B_NOCACHE; - if (zio->io_flags & ZIO_FLAG_FAILFAST) - flags |= B_FAILFAST; - vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP); vdb->vdb_io = zio; bp = &vdb->vdb_buf; bioinit(bp); - bp->b_flags = flags; + bp->b_flags = B_BUSY | B_NOCACHE | + (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE) | + ((zio->io_flags & ZIO_FLAG_IO_RETRY) ? 0 : B_FAILFAST); bp->b_bcount = zio->io_size; bp->b_un.b_addr = zio->io_data; bp->b_lblkno = lbtodb(zio->io_offset); bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; - /* XXPOLICY */ - error = vdev_is_dead(vd) ? 
ENXIO : vdev_error_inject(vd, zio); - if (error) { - zio->io_error = error; - bioerror(bp, error); - bp->b_resid = bp->b_bcount; - bp->b_iodone(bp); - return; - } - - error = ldi_strategy(dvd->vd_lh, bp); /* ldi_strategy() will return non-zero only on programming errors */ - ASSERT(error == 0); + VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); + + return (ZIO_PIPELINE_STOP); } static void vdev_disk_io_done(zio_t *zio) { - vdev_queue_io_done(zio); - - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); - - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + vdev_t *vd = zio->io_vd; - zio_next_stage(zio); + /* + * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if + * the device has been removed. If this is the case, then we trigger an + * asynchronous removal of the device. Otherwise, probe the device and + * make sure it's still accessible. + */ + if (zio->io_error == EIO) { + vdev_disk_t *dvd = vd->vdev_tsd; + int state = DKIO_NONE; + + if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, + FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } + } } vdev_ops_t vdev_disk_ops = { @@ -361,3 +447,80 @@ vdev_ops_t vdev_disk_ops = { VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; + +/* + * Given the root disk device devid or pathname, read the label from + * the device, and construct a configuration nvlist. + */ +int +vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) +{ + ldi_handle_t vd_lh; + vdev_label_t *label; + uint64_t s, size; + int l; + ddi_devid_t tmpdevid; + int error = -1; + char *minor_name; + + /* + * Read the device label and build the nvlist. 
+ */ + if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, + &minor_name) == 0) { + error = ldi_open_by_devid(tmpdevid, minor_name, + spa_mode, kcred, &vd_lh, zfs_li); + ddi_devid_free(tmpdevid); + ddi_devid_str_free(minor_name); + } + + if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, + zfs_li))) + return (error); + + if (ldi_get_size(vd_lh, &s)) { + (void) ldi_close(vd_lh, FREAD, kcred); + return (EIO); + } + + size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); + label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); + + for (l = 0; l < VDEV_LABELS; l++) { + uint64_t offset, state, txg = 0; + + /* read vdev label */ + offset = vdev_label_offset(size, l, 0); + if (vdev_disk_physio(vd_lh, (caddr_t)label, + VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE + + VDEV_PHYS_SIZE, offset, B_READ) != 0) + continue; + + if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, + sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state >= POOL_STATE_DESTROYED) { + nvlist_free(*config); + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0) { + nvlist_free(*config); + *config = NULL; + continue; + } + + break; + } + + kmem_free(label, sizeof (vdev_label_t)); + (void) ldi_close(vd_lh, FREAD, kcred); + + return (error); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c index ab2d34c08256..673b633f595b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c @@ -19,18 +19,17 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_file.h> #include <sys/vdev_impl.h> #include <sys/zio.h> #include <sys/fs/zfs.h> +#include <sys/fm/fs/zfs.h> /* * Virtual device vector for files. @@ -61,8 +60,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * to local zone users, so the underlying devices should be as well. */ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); - error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX, - 0, &vp, 0, 0, rootdir); + error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, + spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; @@ -80,12 +79,13 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) return (ENODEV); } #endif - /* * Determine the physical size of the file. 
*/ vattr.va_mask = AT_SIZE; - error = VOP_GETATTR(vp, &vattr, 0); + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(vp, &vattr, kcred); + VOP_UNLOCK(vp, 0); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); @@ -101,71 +101,46 @@ static void vdev_file_close(vdev_t *vd) { vdev_file_t *vf = vd->vdev_tsd; + int vfslocked; if (vf == NULL) return; if (vf->vf_vnode != NULL) { - (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred); - (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred); + vfslocked = VFS_LOCK_GIANT(vf->vf_vnode->v_mount); + (void)vn_close(vf->vf_vnode, spa_mode, kcred, curthread); VN_RELE(vf->vf_vnode); + VFS_UNLOCK_GIANT(vfslocked); } kmem_free(vf, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; } -static void +static int vdev_file_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; ssize_t resid; - int error; if (zio->io_type == ZIO_TYPE_IOCTL) { - zio_vdev_io_bypass(zio); - /* XXPOLICY */ - if (vdev_is_dead(vd)) { + if (!vdev_readable(vd)) { zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, - kcred); - dprintf("fsync(%s) = %d\n", vdev_description(vd), - zio->io_error); + kcred, NULL); break; default: zio->io_error = ENOTSUP; } - zio_next_stage_async(zio); - return; - } - - /* - * In the kernel, don't bother double-caching, but in userland, - * we want to test the vdev_cache code. - */ -#ifndef _KERNEL - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return; -#endif - - if ((zio = vdev_queue_io(zio)) == NULL) - return; - - /* XXPOLICY */ - error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); - if (error) { - zio->io_error = error; - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? @@ -176,23 +151,15 @@ vdev_file_io_start(zio_t *zio) if (resid != 0 && zio->io_error == 0) zio->io_error = ENOSPC; - zio_next_stage_async(zio); + zio_interrupt(zio); + + return (ZIO_PIPELINE_STOP); } +/* ARGSUSED */ static void vdev_file_io_done(zio_t *zio) { - vdev_queue_io_done(zio); - -#ifndef _KERNEL - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); -#endif - - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); - - zio_next_stage(zio); } vdev_ops_t vdev_file_ops = { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index eebc911edc4b..f151f83ff82d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -96,13 +96,9 @@ vdev_geom_orphan(struct g_consumer *cp) g_wither_geom(gp, error); } vdev_geom_release(vd); - /* Both methods below work, but in a bit different way. 
*/ -#if 0 - vd->vdev_reopen_wanted = 1; -#else - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); -#endif + + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); } static struct g_consumer * @@ -229,7 +225,7 @@ vdev_geom_worker(void *arg) vd->vdev_nowritecache = B_TRUE; } g_destroy_bio(bp); - zio_next_stage_async(zio); + zio_interrupt(zio); } } @@ -249,6 +245,194 @@ vdev_geom_get_id(struct g_consumer *cp) return (id); } +static uint64_t +nvlist_get_guid(nvlist_t *list) +{ + nvpair_t *elem = NULL; + uint64_t value; + + while ((elem = nvlist_next_nvpair(list, elem)) != NULL) { + if (nvpair_type(elem) == DATA_TYPE_UINT64 && + strcmp(nvpair_name(elem), "guid") == 0) { + VERIFY(nvpair_value_uint64(elem, &value) == 0); + return (value); + } + } + return (0); +} + +static char * +nvlist_get_devid(nvlist_t *list, uint64_t guid) +{ + nvpair_t *elem = NULL; + int progress; + char *id; + + progress = 0; + id = NULL; + + while ((elem = nvlist_next_nvpair(list, elem)) != NULL) { + switch (nvpair_type(elem)) { + case DATA_TYPE_STRING: + { + char *value; + + VERIFY(nvpair_value_string(elem, &value) == 0); + if (strcmp(nvpair_name(elem), "type") == 0 && + strcmp(value, "disk") == 0) { + progress |= 0x01; + } else if (strcmp(nvpair_name(elem), "devid") == 0) { + progress |= 0x02; + id = value; + } + break; + } + case DATA_TYPE_UINT64: + { + uint64_t value; + + VERIFY(nvpair_value_uint64(elem, &value) == 0); + if (strcmp(nvpair_name(elem), "guid") == 0 && + value == guid) { + progress |= 0x04; + } + break; + } + case DATA_TYPE_NVLIST: + { + nvlist_t *value; + char *lid; + + VERIFY(nvpair_value_nvlist(elem, &value) == 0); + lid = nvlist_get_devid(value, guid); + if (lid != NULL) + return (lid); + break; + } + case DATA_TYPE_NVLIST_ARRAY: + { + nvlist_t **value; + u_int c, count; + char *lid; + + VERIFY(nvpair_value_nvlist_array(elem, &value, + &count) == 0); + + for (c = 0; c < count; c++) { + lid = nvlist_get_devid(value[c], guid); + if (lid != NULL) + return (lid); + } + break; + } + } + if (progress == 0x07) + break; + } + if (progress != 0x07) + id = NULL; + return (id); +} + +static int +vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size) +{ + struct bio *bp; + u_char *p; + off_t off; + int error; + + ASSERT((offset % cp->provider->sectorsize) == 0); + ASSERT((size % cp->provider->sectorsize) == 0); + + bp = g_alloc_bio(); + off = offset; + offset += size; + p = data; + error = 0; + + for (; off < offset; off += MAXPHYS, p += MAXPHYS, size -= MAXPHYS) { + bzero(bp, sizeof(*bp)); + bp->bio_cmd = cmd; + bp->bio_done = NULL; + bp->bio_offset = off; + bp->bio_length = MIN(size, MAXPHYS); + bp->bio_data = p; + g_io_request(bp, cp); + error = biowait(bp, "vdev_geom_io"); + if (error != 0) + break; + } + + g_destroy_bio(bp); + return (error); +} + +static char * +vdev_geom_read_id(struct g_consumer *cp) +{ + struct g_provider *pp; + vdev_label_t *label; + char *p, *buf; + size_t buflen; + uint64_t psize; + off_t offset, size; + char *id; + int error, l, len; + + g_topology_assert_not(); + + pp = cp->provider; + + psize = pp->mediasize; + psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t)); + + size = sizeof(*label) + pp->sectorsize - + ((sizeof(*label) - 1) % pp->sectorsize) - 1; + + id = NULL; + label = kmem_alloc(size, KM_SLEEP); + buflen = sizeof(label->vl_vdev_phys.vp_nvlist); + + for (l = 0; l < VDEV_LABELS && id == NULL; l++) { + nvlist_t *config = NULL; + uint64_t 
guid; + + offset = vdev_label_offset(psize, l, 0); + if ((offset % pp->sectorsize) != 0) + continue; + + error = vdev_geom_io(cp, BIO_READ, label, offset, size); + if (error != 0) + continue; + buf = label->vl_vdev_phys.vp_nvlist; + + if (nvlist_unpack(buf, buflen, &config, 0) != 0) + continue; + + guid = nvlist_get_guid(config); + if (guid == 0) { + nvlist_free(config); + continue; + } + id = nvlist_get_devid(config, guid); + if (id != NULL) { + char *tmp; + + tmp = kmem_zalloc(DISK_IDENT_SIZE, KM_SLEEP); + strlcpy(tmp, id, DISK_IDENT_SIZE); + id = tmp; + } + + nvlist_free(config); + } + + kmem_free(label, size); + if (id != NULL) + ZFS_LOG(1, "ID of %s: %s", pp->name, id); + return (id); +} + static void vdev_geom_free_id(char *id) { @@ -290,6 +474,7 @@ vdev_geom_attach_by_id_event(void *arg, int flags __unused) zgp->orphan = vdev_geom_taste_orphan; zcp = g_new_consumer(zgp); + /* First round tries to get provider's ID without reading metadata. */ LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; @@ -324,6 +509,41 @@ vdev_geom_attach_by_id_event(void *arg, int flags __unused) } } } + /* Second round looks for ID by reading ZFS metadata. */ + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) + continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + if (pp->flags & G_PF_WITHER) + continue; + g_attach(zcp, pp); + if (g_access(zcp, 1, 0, 0) != 0) { + g_detach(zcp); + continue; + } + g_topology_unlock(); + id = vdev_geom_read_id(zcp); + g_topology_lock(); + g_access(zcp, -1, 0, 0); + g_detach(zcp); + if (id == NULL || strcmp(id, ap->id) != 0) { + vdev_geom_free_id(id); + continue; + } + vdev_geom_free_id(id); + ap->cp = vdev_geom_attach(pp, ap->write); + if (ap->cp == NULL) { + printf("ZFS WARNING: Cannot open %s " + "for writing.\n", pp->name); + continue; + } + goto end; + } + } + } ap->cp = NULL; end: g_destroy_consumer(zcp); @@ -345,25 +565,13 @@ vdev_geom_attach_by_id(const char *id, int write) return (cp); } -static int -vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +static struct g_consumer * +vdev_geom_open_by_path_and_devid(vdev_t *vd) { - vdev_geom_ctx_t *ctx; struct g_provider *pp; struct g_consumer *cp; - char *id = NULL; - int owned; - - /* - * We must have a pathname, and it must be absolute.
- */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); - } + char *id; - if ((owned = mtx_owned(&Giant))) - mtx_unlock(&Giant); cp = NULL; g_topology_lock(); pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1); @@ -380,40 +588,101 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) ZFS_LOG(1, "ID mismatch for provider %s: " "[%s]!=[%s].", vd->vdev_path, vd->vdev_devid, id); - goto next; - } - ZFS_LOG(1, "ID match for provider %s.", vd->vdev_path); + } else + ZFS_LOG(1, "ID match for provider %s.", + vd->vdev_path); + vdev_geom_free_id(id); } } -next: g_topology_unlock(); - vdev_geom_free_id(id); - if (cp == NULL && vd->vdev_devid != NULL) { - ZFS_LOG(1, "Searching by ID [%s].", vd->vdev_devid); - cp = vdev_geom_attach_by_id(vd->vdev_devid, - !!(spa_mode & FWRITE)); - if (cp != NULL) { - size_t len = strlen(cp->provider->name) + 6; /* 6 == strlen("/dev/") + 1 */ - char *buf = kmem_alloc(len, KM_SLEEP); - - snprintf(buf, len, "/dev/%s", cp->provider->name); - spa_strfree(vd->vdev_path); - vd->vdev_path = buf; - - ZFS_LOG(1, "Attach by ID [%s] succeeded, provider %s.", - vd->vdev_devid, vd->vdev_path); + + return (cp); +} + +static struct g_consumer * +vdev_geom_open_by_devid(vdev_t *vd) +{ + struct g_consumer *cp; + char *buf; + size_t len; + + /* + * We can't search by devid if it's missing. + */ + if (vd->vdev_devid == NULL) + return (NULL); + + ZFS_LOG(1, "Searching by ID [%s].", vd->vdev_devid); + cp = vdev_geom_attach_by_id(vd->vdev_devid, !!(spa_mode & FWRITE)); + if (cp != NULL) { + len = strlen(cp->provider->name) + strlen("/dev/") + 1; + buf = kmem_alloc(len, KM_SLEEP); + + snprintf(buf, len, "/dev/%s", cp->provider->name); + spa_strfree(vd->vdev_path); + vd->vdev_path = buf; + + ZFS_LOG(1, "Attach by ID [%s] succeeded, provider %s.", + vd->vdev_devid, vd->vdev_path); + } else + ZFS_LOG(1, "Search by ID [%s] failed.", vd->vdev_devid); + + return (cp); +} + +static int +vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + vdev_geom_ctx_t *ctx; + struct g_provider *pp; + struct g_consumer *cp; + int owned; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + vd->vdev_tsd = NULL; + + if ((owned = mtx_owned(&Giant))) + mtx_unlock(&Giant); + cp = vdev_geom_open_by_path_and_devid(vd); + if (cp == NULL) { + /* + * The device at vd->vdev_path doesn't have the right devid. + * The disks might have merely moved around so try all other + * geom providers to find one with the right devid. + */ + cp = vdev_geom_open_by_devid(vd); + if (cp == NULL) { + ZFS_LOG(1, "Provider %s not found.", vd->vdev_path); + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + if (owned) + mtx_lock(&Giant); + return (EACCES); } } if (owned) mtx_lock(&Giant); - if (cp == NULL) { - ZFS_LOG(1, "Provider %s (id=[%s]) not found.", vd->vdev_path, - vd->vdev_devid); - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (EACCES); - } + + cp->private = vd; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP); + bioq_init(&ctx->gc_queue); + mtx_init(&ctx->gc_queue_mtx, "zfs:vdev:geom:queue", NULL, MTX_DEF); + ctx->gc_consumer = cp; + ctx->gc_state = 0; + + vd->vdev_tsd = ctx; pp = cp->provider; + kproc_create(vdev_geom_worker, ctx, NULL, 0, 0, "vdev:worker %s", + pp->name); + /* * Determine the actual size of the device. 
 */ @@ -430,19 +699,6 @@ next: */ vd->vdev_nowritecache = B_FALSE; - cp->private = vd; - - ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP); - bioq_init(&ctx->gc_queue); - mtx_init(&ctx->gc_queue_mtx, "zfs:vdev:geom:queue", NULL, MTX_DEF); - ctx->gc_consumer = cp; - ctx->gc_state = 0; - - vd->vdev_tsd = ctx; - - kproc_create(vdev_geom_worker, ctx, NULL, 0, 0, "vdev:worker %s", - pp->name); - return (0); } @@ -469,13 +725,16 @@ vdev_geom_io_intr(struct bio *bp) zio = bp->bio_caller1; ctx = zio->io_vd->vdev_tsd; + if ((zio->io_error = bp->bio_error) == 0 && bp->bio_resid != 0) + zio->io_error = EIO; + mtx_lock(&ctx->gc_queue_mtx); bioq_insert_tail(&ctx->gc_queue, bp); wakeup_one(&ctx->gc_queue); mtx_unlock(&ctx->gc_queue_mtx); } -static void +static int vdev_geom_io_start(zio_t *zio) { vdev_t *vd; @@ -492,18 +751,19 @@ vdev_geom_io_start(zio_t *zio) cp = ctx->gc_consumer; if (zio->io_type == ZIO_TYPE_IOCTL) { - zio_vdev_io_bypass(zio); - /* XXPOLICY */ - if (vdev_is_dead(vd)) { + if (!vdev_readable(vd)) { zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: + + if (zfs_nocacheflush) + break; + if (vd->vdev_nowritecache) { zio->io_error = ENOTSUP; break; } @@ -514,27 +774,13 @@ ... zio->io_error = ENOTSUP; } - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } - - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return; - - if ((zio = vdev_queue_io(zio)) == NULL) - return; - sendreq: - - error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio); - if (error == 0 && cp == NULL) - error = ENXIO; - if (error) { - zio->io_error = error; - zio_next_stage_async(zio); - return; + if (cp == NULL) { + zio->io_error = ENXIO; + return (ZIO_PIPELINE_CONTINUE); } - bp = g_alloc_bio(); bp->bio_caller1 = zio; switch (zio->io_type) { @@ -555,20 +801,33 @@ sendreq: bp->bio_done = vdev_geom_io_intr; g_io_request(bp, cp); + + return (ZIO_PIPELINE_STOP); } static void vdev_geom_io_done(zio_t *zio) { - vdev_queue_io_done(zio); - - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); - - zio_next_stage(zio); + /* + * If the device returned ENXIO, then we should verify whether the + * GEOM provider has been removed. If this is the case, then we + * trigger an asynchronous removal of the device. + */ + if (zio->io_error == ENXIO) { + vdev_t *vd = zio->io_vd; + vdev_geom_ctx_t *ctx; + struct g_provider *pp = NULL; + + ctx = vd->vdev_tsd; + if (ctx != NULL && ctx->gc_consumer != NULL) + pp = ctx->gc_consumer->provider; + + if (pp == NULL || (pp->flags & G_PF_ORPHAN)) { + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } + } } vdev_ops_t vdev_geom_ops = { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c index 9d9f5556fa08..bf930466fbd6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms.
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Virtual Device Labels * --------------------- @@ -62,7 +60,7 @@ * or a device was added, we want to update all the labels such that we can deal * with fatal failure at any point. To this end, each disk has two labels which * are updated before and after the uberblock is synced. Assuming we have - * labels and an uberblock with the following transacation groups: + * labels and an uberblock with the following transaction groups: * * L1 UB L2 * +------+ +------+ +------+ @@ -153,34 +151,56 @@ uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset) { ASSERT(offset < sizeof (vdev_label_t)); + ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0); return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); } +/* + * Returns back the vdev label associated with the passed in offset. + */ +int +vdev_label_number(uint64_t psize, uint64_t offset) +{ + int l; + + if (offset >= psize - VDEV_LABEL_END_SIZE) { + offset -= psize - VDEV_LABEL_END_SIZE; + offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t); + } + l = offset / sizeof (vdev_label_t); + return (l < VDEV_LABELS ? l : -1); +} + static void vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private) + uint64_t size, zio_done_func_t *done, void *private, int flags) { - ASSERT(vd->vdev_children == 0); + ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == + SCL_STATE_ALL); + ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_read_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE)); + ZIO_PRIORITY_SYNC_READ, flags, B_TRUE)); } static void vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private) + uint64_t size, zio_done_func_t *done, void *private, int flags) { - ASSERT(vd->vdev_children == 0); + ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || + (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) == + (SCL_CONFIG | SCL_STATE) && + dsl_pool_sync_context(spa_get_dsl(zio->io_spa)))); + ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_write_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL)); + ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); } /* @@ -188,7 +208,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, */ nvlist_t * vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - boolean_t isspare) + boolean_t isspare, boolean_t isl2cache) { nvlist_t *nv = NULL; @@ -196,7 +216,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type) == 0); - if (!isspare) + if (!isspare && !isl2cache) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); @@ -209,6 +229,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid) == 0); + if (vd->vdev_physpath != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, + vd->vdev_physpath) == 0); + if (vd->vdev_nparity != 0) { 
ASSERT(strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_RAIDZ) == 0); @@ -219,7 +243,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, */ ASSERT(vd->vdev_nparity == 1 || (vd->vdev_nparity == 2 && - spa_version(spa) >= ZFS_VERSION_RAID6)); + spa_version(spa) >= SPA_VERSION_RAID6)); /* * Note that we'll add the nparity tag even on storage pools @@ -240,7 +264,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_isspare) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0); - if (!isspare && vd == vd->vdev_top) { + if (!isspare && !isl2cache && vd == vd->vdev_top) { VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, vd->vdev_ms_array) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, @@ -249,6 +273,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_ashift) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize) == 0); + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, + vd->vdev_islog) == 0); } if (vd->vdev_dtl.smo_object != 0) @@ -271,7 +297,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, for (c = 0; c < vd->vdev_children; c++) child[c] = vdev_config_generate(spa, vd->vdev_child[c], - getstats, isspare); + getstats, isspare, isl2cache); VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, vd->vdev_children) == 0); @@ -285,9 +311,18 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_offline && !vd->vdev_tmpoffline) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE) == 0); - else - (void) nvlist_remove(nv, ZPOOL_CONFIG_OFFLINE, - DATA_TYPE_UINT64); + if (vd->vdev_faulted) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, + B_TRUE) == 0); + if (vd->vdev_degraded) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, + B_TRUE) == 0); + if (vd->vdev_removed) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, + B_TRUE) == 0); + if (vd->vdev_unspare) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, + B_TRUE) == 0); } return (nv); @@ -300,23 +335,23 @@ vdev_label_read_config(vdev_t *vd) nvlist_t *config = NULL; vdev_phys_t *vp; zio_t *zio; - int l; + int flags = + ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; - ASSERT(spa_config_held(spa, RW_READER)); + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - if (vdev_is_dead(vd)) + if (!vdev_readable(vd)) return (NULL); vp = zio_buf_alloc(sizeof (vdev_phys_t)); - for (l = 0; l < VDEV_LABELS; l++) { + for (int l = 0; l < VDEV_LABELS; l++) { - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CONFIG_HELD); + zio = zio_root(spa, NULL, NULL, flags); vdev_label_read(zio, vd, l, vp, offsetof(vdev_label_t, vl_vdev_phys), - sizeof (vdev_phys_t), NULL, NULL); + sizeof (vdev_phys_t), NULL, NULL, flags); if (zio_wait(zio) == 0 && nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), @@ -340,7 +375,7 @@ vdev_label_read_config(vdev_t *vd) */ static boolean_t vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, - uint64_t *spare_guid) + uint64_t *spare_guid, uint64_t *l2cache_guid) { spa_t *spa = vd->vdev_spa; uint64_t state, pool_guid, device_guid, txg, spare_pool; @@ -349,6 +384,8 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, if (spare_guid) *spare_guid = 0ULL; + if (l2cache_guid) + *l2cache_guid = 0ULL; /* * Read the label, if any, and perform some basic sanity checks. 
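vdev_label_offset() and vdev_label_number() above are inverses of each other. A userland model, assuming the usual on-disk layout of four 256 KB labels, two at the front of the device (L0, L1) and two at the end (L2, L3); the 1 GB device size is illustrative:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define VDEV_LABELS             4
#define VDEV_LABEL_SIZE         (256ULL * 1024)    /* sizeof (vdev_label_t) */
#define VDEV_LABEL_END_SIZE     (2 * VDEV_LABEL_SIZE)

static uint64_t
label_offset(uint64_t psize, int l, uint64_t offset)
{
    return (offset + l * VDEV_LABEL_SIZE + (l < VDEV_LABELS / 2 ?
        0 : psize - VDEV_LABELS * VDEV_LABEL_SIZE));
}

static int
label_number(uint64_t psize, uint64_t offset)
{
    int l;

    if (offset >= psize - VDEV_LABEL_END_SIZE) {
        offset -= psize - VDEV_LABEL_END_SIZE;
        offset += (VDEV_LABELS / 2) * VDEV_LABEL_SIZE;
    }
    l = offset / VDEV_LABEL_SIZE;
    return (l < VDEV_LABELS ? l : -1);
}

int
main(void)
{
    uint64_t psize = 1ULL << 30;    /* 1 GB, a multiple of the label size */

    /* Each label's base offset round-trips to its own index. */
    for (int l = 0; l < VDEV_LABELS; l++)
        assert(label_number(psize, label_offset(psize, l, 0)) == l);
    printf("L2 starts at %llu\n",
        (unsigned long long)label_offset(psize, 2, 0));
    return (0);
}

The layout math only works when psize is a multiple of the label size, which is exactly what the new P2PHASE_TYPED assertion checks.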
@@ -367,7 +404,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, return (B_FALSE); } - if (state != POOL_STATE_SPARE && + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, @@ -383,9 +420,10 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, * be a part of. The only way this is allowed is if the device is a hot * spare (which we check for later on). */ - if (state != POOL_STATE_SPARE && + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && !spa_guid_exists(pool_guid, device_guid) && - !spa_spare_exists(device_guid, NULL)) + !spa_spare_exists(device_guid, NULL, NULL) && + !spa_l2cache_exists(device_guid, NULL)) return (B_FALSE); /* @@ -395,21 +433,23 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, * user has attempted to add the same vdev multiple times in the same * transaction. */ - if (state != POOL_STATE_SPARE && txg == 0 && vdtxg == crtxg) + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + txg == 0 && vdtxg == crtxg) return (B_TRUE); /* * Check to see if this is a spare device. We do an explicit check for * spa_has_spare() here because it may be on our pending list of spares - * to add. + * to add. We also check if it is an l2cache device. */ - if (spa_spare_exists(device_guid, &spare_pool) || + if (spa_spare_exists(device_guid, &spare_pool, NULL) || spa_has_spare(spa, device_guid)) { if (spare_guid) *spare_guid = device_guid; switch (reason) { case VDEV_LABEL_CREATE: + case VDEV_LABEL_L2CACHE: return (B_TRUE); case VDEV_LABEL_REPLACE: @@ -422,6 +462,12 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, } /* + * Check to see if this is an l2cache device. + */ + if (spa_l2cache_exists(device_guid, NULL)) + return (B_TRUE); + + /* * If the device is marked ACTIVE, then this device is in use by another * pool on the system. */ @@ -445,15 +491,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) vdev_boot_header_t *vb; uberblock_t *ub; zio_t *zio; - int l, c, n; char *buf; size_t buflen; int error; - uint64_t spare_guid; + uint64_t spare_guid, l2cache_guid; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; - ASSERT(spa_config_held(spa, RW_WRITER)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) if ((error = vdev_label_init(vd->vdev_child[c], crtxg, reason)) != 0) return (error); @@ -471,38 +517,56 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * Determine if the vdev is in use. */ if (reason != VDEV_LABEL_REMOVE && - vdev_inuse(vd, crtxg, reason, &spare_guid)) + vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) return (EBUSY); ASSERT(reason != VDEV_LABEL_REMOVE || - vdev_inuse(vd, crtxg, reason, NULL)); + vdev_inuse(vd, crtxg, reason, NULL, NULL)); /* - * If this is a request to add or replace a spare that is in use - * elsewhere on the system, then we must update the guid (which was - * initialized to a random value) to reflect the actual GUID (which is - * shared between multiple pools). + * If this is a request to add or replace a spare or l2cache device + * that is in use elsewhere on the system, then we must update the + * guid (which was initialized to a random value) to reflect the + * actual GUID (which is shared between multiple pools). 
*/ - if (reason != VDEV_LABEL_REMOVE && spare_guid != 0ULL) { - vdev_t *pvd = vd->vdev_parent; + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE && + spare_guid != 0ULL) { + uint64_t guid_delta = spare_guid - vd->vdev_guid; - for (; pvd != NULL; pvd = pvd->vdev_parent) { - pvd->vdev_guid_sum -= vd->vdev_guid; - pvd->vdev_guid_sum += spare_guid; - } + vd->vdev_guid += guid_delta; - vd->vdev_guid = vd->vdev_guid_sum = spare_guid; + for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += guid_delta; /* * If this is a replacement, then we want to fallthrough to the * rest of the code. If we're adding a spare, then it's already - * labelled appropriately and we can just return. + * labeled appropriately and we can just return. */ if (reason == VDEV_LABEL_SPARE) return (0); ASSERT(reason == VDEV_LABEL_REPLACE); } + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && + l2cache_guid != 0ULL) { + uint64_t guid_delta = l2cache_guid - vd->vdev_guid; + + vd->vdev_guid += guid_delta; + + for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += guid_delta; + + /* + * If this is a replacement, then we want to fallthrough to the + * rest of the code. If we're adding an l2cache, then it's + * already labeled appropriately and we can just return. + */ + if (reason == VDEV_LABEL_L2CACHE) + return (0); + ASSERT(reason == VDEV_LABEL_REPLACE); + } + /* * Initialize its label. */ @@ -532,6 +596,19 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) POOL_STATE_SPARE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); + } else if (reason == VDEV_LABEL_L2CACHE || + (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { + /* + * For level 2 ARC devices, add a special label. + */ + VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, + POOL_STATE_L2CACHE) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); } else { label = spa_config_generate(spa, vd, 0ULL, B_FALSE); @@ -576,23 +653,22 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Write everything in parallel. */ - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + zio = zio_root(spa, NULL, NULL, flags); - for (l = 0; l < VDEV_LABELS; l++) { + for (int l = 0; l < VDEV_LABELS; l++) { vdev_label_write(zio, vd, l, vp, offsetof(vdev_label_t, vl_vdev_phys), - sizeof (vdev_phys_t), NULL, NULL); + sizeof (vdev_phys_t), NULL, NULL, flags); vdev_label_write(zio, vd, l, vb, offsetof(vdev_label_t, vl_boot_header), - sizeof (vdev_boot_header_t), NULL, NULL); + sizeof (vdev_boot_header_t), NULL, NULL, flags); - for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_write(zio, vd, l, ub, VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), NULL, NULL); + VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags); } } @@ -605,14 +681,20 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * If this vdev hasn't been previously identified as a spare, then we - * mark it as such only if a) we are labelling it as a spare, or b) it - * exists as a spare elsewhere in the system. + * mark it as such only if a) we are labeling it as a spare, or b) it + * exists as a spare elsewhere in the system. Do the same for + * level 2 ARC devices. 
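The guid_delta hunks above replace the old subtract-then-add walk with a single delta applied to the leaf and to every ancestor's vdev_guid_sum. Unsigned wraparound makes the delta correct even when the adopted GUID is numerically smaller than the old one. A toy model with a two-level tree (all values made up):

#include <stdint.h>
#include <stdio.h>

struct vdev {
    uint64_t guid;
    uint64_t guid_sum;          /* self plus all descendants */
    struct vdev *parent;
};

static void
adopt_guid(struct vdev *vd, uint64_t new_guid)
{
    uint64_t delta = new_guid - vd->guid;   /* may wrap; that is fine */

    vd->guid += delta;
    for (struct vdev *pvd = vd; pvd != NULL; pvd = pvd->parent)
        pvd->guid_sum += delta;
}

int
main(void)
{
    struct vdev root = { 100, 100 + 7, NULL };
    struct vdev leaf = { 7, 7, &root };

    adopt_guid(&leaf, 5);       /* new GUID smaller than the old one */
    printf("leaf %llu root sum %llu\n",
        (unsigned long long)leaf.guid,
        (unsigned long long)root.guid_sum); /* 5 and 105 */
    return (0);
}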
*/ if (error == 0 && !vd->vdev_isspare && (reason == VDEV_LABEL_SPARE || - spa_spare_exists(vd->vdev_guid, NULL))) + spa_spare_exists(vd->vdev_guid, NULL, NULL))) spa_spare_add(vd); + if (error == 0 && !vd->vdev_isl2cache && + (reason == VDEV_LABEL_L2CACHE || + spa_l2cache_exists(vd->vdev_guid, NULL))) + spa_l2cache_add(vd); + return (error); } @@ -651,17 +733,17 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) static void vdev_uberblock_load_done(zio_t *zio) { + zio_t *rio = zio->io_private; uberblock_t *ub = zio->io_data; - uberblock_t *ubbest = zio->io_private; - spa_t *spa = zio->io_spa; + uberblock_t *ubbest = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd)); if (zio->io_error == 0 && uberblock_verify(ub) == 0) { - mutex_enter(&spa->spa_uberblock_lock); + mutex_enter(&rio->io_lock); if (vdev_uberblock_compare(ub, ubbest) > 0) *ubbest = *ub; - mutex_exit(&spa->spa_uberblock_lock); + mutex_exit(&rio->io_lock); } zio_buf_free(zio->io_data, zio->io_size); @@ -670,136 +752,169 @@ vdev_uberblock_load_done(zio_t *zio) void vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) { - int l, c, n; - - for (c = 0; c < vd->vdev_children; c++) - vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + int flags = + ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + + if (vd == rvd) { + ASSERT(zio == NULL); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + zio = zio_root(spa, NULL, ubbest, flags); + bzero(ubbest, sizeof (uberblock_t)); + } - if (!vd->vdev_ops->vdev_op_leaf) - return; + ASSERT(zio != NULL); - if (vdev_is_dead(vd)) - return; + for (int c = 0; c < vd->vdev_children; c++) + vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); - for (l = 0; l < VDEV_LABELS; l++) { - for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { - vdev_label_read(zio, vd, l, - zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), - VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), - vdev_uberblock_load_done, ubbest); + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + for (int l = 0; l < VDEV_LABELS; l++) { + for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + vdev_label_read(zio, vd, l, + zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd), + vdev_uberblock_load_done, zio, flags); + } } } + + if (vd == rvd) { + (void) zio_wait(zio); + spa_config_exit(spa, SCL_ALL, FTAG); + } } /* - * Write the uberblock to both labels of all leaves of the specified vdev. + * On success, increment root zio's count of good writes. * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). */ static void vdev_uberblock_sync_done(zio_t *zio) { - uint64_t *good_writes = zio->io_root->io_private; + uint64_t *good_writes = zio->io_private; if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) atomic_add_64(good_writes, 1); } +/* + * Write the uberblock to all labels of all leaves of the specified vdev. 
+ */ static void -vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, uint64_t txg) +vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) { - int l, c, n; + uberblock_t *ubbuf; + int n; - for (c = 0; c < vd->vdev_children; c++) - vdev_uberblock_sync(zio, ub, vd->vdev_child[c], txg); + for (int c = 0; c < vd->vdev_children; c++) + vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags); if (!vd->vdev_ops->vdev_op_leaf) return; - if (vdev_is_dead(vd)) + if (!vdev_writeable(vd)) return; - n = txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); + n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); - ASSERT(ub->ub_txg == txg); + ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); + bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); + *ubbuf = *ub; - for (l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ub, - VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), - vdev_uberblock_sync_done, NULL); + for (int l = 0; l < VDEV_LABELS; l++) + vdev_label_write(zio, vd, l, ubbuf, + VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), + vdev_uberblock_sync_done, zio->io_private, + flags | ZIO_FLAG_DONT_PROPAGATE); - dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg); + zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); } -static int -vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *vd, uint64_t txg) +int +vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { - uberblock_t *ubbuf; - size_t size = vd->vdev_top ? VDEV_UBERBLOCK_SIZE(vd) : SPA_MAXBLOCKSIZE; - uint64_t *good_writes; + spa_t *spa = svd[0]->vdev_spa; zio_t *zio; - int error; - - ubbuf = zio_buf_alloc(size); - bzero(ubbuf, size); - *ubbuf = *ub; + uint64_t good_writes = 0; - good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + zio = zio_root(spa, NULL, &good_writes, flags); - zio = zio_root(spa, NULL, good_writes, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + for (int v = 0; v < svdcount; v++) + vdev_uberblock_sync(zio, ub, svd[v], flags); - vdev_uberblock_sync(zio, ubbuf, vd, txg); - - error = zio_wait(zio); - - if (error && *good_writes != 0) { - dprintf("partial success: good_writes = %llu\n", *good_writes); - error = 0; - } + (void) zio_wait(zio); /* - * It's possible to have no good writes and no error if every vdev is in - * the CANT_OPEN state. + * Flush the uberblocks to disk. This ensures that the odd labels + * are no longer needed (because the new uberblocks and the even + * labels are safely on disk), so it is safe to overwrite them. */ - if (*good_writes == 0 && error == 0) - error = EIO; + zio = zio_root(spa, NULL, NULL, flags); - kmem_free(good_writes, sizeof (uint64_t)); - zio_buf_free(ubbuf, size); + for (int v = 0; v < svdcount; v++) + zio_flush(zio, svd[v]); - return (error); + (void) zio_wait(zio); + + return (good_writes >= 1 ? 0 : EIO); } /* - * Sync out an individual vdev. + * On success, increment the count of good writes for our top-level vdev. */ static void -vdev_sync_label_done(zio_t *zio) +vdev_label_sync_done(zio_t *zio) { - uint64_t *good_writes = zio->io_root->io_private; + uint64_t *good_writes = zio->io_private; if (zio->io_error == 0) atomic_add_64(good_writes, 1); } +/* + * If there weren't enough good writes, indicate failure to the parent. + */ +static void +vdev_label_sync_top_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_private; + + if (*good_writes == 0) + zio->io_error = EIO; + + kmem_free(good_writes, sizeof (uint64_t)); +} + +/* + * We ignore errors for log and cache devices, simply free the private data. 
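The slot selection above, n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1), relies on the uberblock count being a power of two, so consecutive txgs rotate through the ring and the oldest uberblock is always the one overwritten; on open, the best uberblock is simply the highest-ranked valid copy found anywhere in the ring. A sketch, assuming the 128-slot ring of a 512-byte-sector device (a 128 KB ring of 1 KB slots; the exact count depends on ashift):

#include <stdint.h>
#include <stdio.h>

#define UBERBLOCK_COUNT 128     /* must be a power of two */

static int
uberblock_slot(uint64_t txg)
{
    return (int)(txg & (UBERBLOCK_COUNT - 1));
}

int
main(void)
{
    for (uint64_t txg = 126; txg <= 130; txg++)
        printf("txg %llu -> slot %d\n",
            (unsigned long long)txg, uberblock_slot(txg));
    /* 126, 127, 0, 1, 2: the oldest uberblock is overwritten first. */
    return (0);
}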
+ */ +static void +vdev_label_sync_ignore_done(zio_t *zio) +{ + kmem_free(zio->io_private, sizeof (uint64_t)); +} + +/* + * Write all even or odd labels to all leaves of the specified vdev. + */ static void -vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg) +vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) { nvlist_t *label; vdev_phys_t *vp; char *buf; size_t buflen; - int c; - for (c = 0; c < vd->vdev_children; c++) - vdev_sync_label(zio, vd->vdev_child[c], l, txg); + for (int c = 0; c < vd->vdev_children; c++) + vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags); if (!vd->vdev_ops->vdev_op_leaf) return; - if (vdev_is_dead(vd)) + if (!vdev_writeable(vd)) return; /* @@ -813,107 +928,110 @@ vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg) buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); - if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) - vdev_label_write(zio, vd, l, vp, - offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), - vdev_sync_label_done, NULL); + if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) { + for (; l < VDEV_LABELS; l += 2) { + vdev_label_write(zio, vd, l, vp, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), + vdev_label_sync_done, zio->io_private, + flags | ZIO_FLAG_DONT_PROPAGATE); + } + } zio_buf_free(vp, sizeof (vdev_phys_t)); nvlist_free(label); - - dprintf("%s label %d txg %llu\n", vdev_description(vd), l, txg); } -static int -vdev_sync_labels(vdev_t *vd, int l, uint64_t txg) +int +vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) { - uint64_t *good_writes; + list_t *dl = &spa->spa_config_dirty_list; + vdev_t *vd; zio_t *zio; int error; - ASSERT(vd == vd->vdev_top); - - good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); - - zio = zio_root(vd->vdev_spa, NULL, good_writes, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); - /* - * Recursively kick off writes to all labels. + * Write the new labels to disk. */ - vdev_sync_label(zio, vd, l, txg); + zio = zio_root(spa, NULL, NULL, flags); + + for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { + uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t), + KM_SLEEP); + zio_t *vio = zio_null(zio, spa, + (vd->vdev_islog || vd->vdev_aux != NULL) ? + vdev_label_sync_ignore_done : vdev_label_sync_top_done, + good_writes, flags); + vdev_label_sync(vio, vd, l, txg, flags); + zio_nowait(vio); + } error = zio_wait(zio); - if (error && *good_writes != 0) { - dprintf("partial success: good_writes = %llu\n", *good_writes); - error = 0; - } + /* + * Flush the new labels to disk. + */ + zio = zio_root(spa, NULL, NULL, flags); - if (*good_writes == 0 && error == 0) - error = ENODEV; + for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) + zio_flush(zio, vd); - kmem_free(good_writes, sizeof (uint64_t)); + (void) zio_wait(zio); return (error); } /* - * Sync the entire vdev configuration. + * Sync the uberblock and any changes to the vdev configuration. * * The order of operations is carefully crafted to ensure that * if the system panics or loses power at any time, the state on disk * is still transactionally consistent. The in-line comments below * describe the failure semantics at each stage. * - * Moreover, it is designed to be idempotent: if spa_sync_labels() fails + * Moreover, vdev_config_sync() is designed to be idempotent: if it fails * at any time, you can just call it again, and it will resume its work. 
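The "l += 2" walk in vdev_label_sync() above is what makes the two-phase update possible: a pass with l = 0 touches only the even labels (L0, L2), a pass with l = 1 only the odd ones (L1, L3). A pseudo-driver for the full ordering described in the surrounding comments, with printf standing in for the real writes and cache flushes:

#include <stdio.h>

#define VDEV_LABELS 4

static void
write_labels(int first)     /* 0 = even (L0, L2), 1 = odd (L1, L3) */
{
    for (int l = first; l < VDEV_LABELS; l += 2)
        printf("write label L%d\n", l);
}

int
main(void)
{
    printf("flush txg data writes\n");
    write_labels(0);        /* die here: odd labels still valid */
    printf("flush\n");
    printf("write uberblocks\n");
    printf("flush\n");      /* odd labels now safely stale */
    write_labels(1);
    printf("flush\n");      /* config fully committed */
    return (0);
}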
*/ int -vdev_config_sync(vdev_t *uvd, uint64_t txg) +vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) { - spa_t *spa = uvd->vdev_spa; + spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; - vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; zio_t *zio; - int l, error; + int error; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; ASSERT(ub->ub_txg <= txg); /* - * If this isn't a resync due to I/O errors, and nothing changed - * in this transaction group, and the vdev configuration hasn't changed, + * If this isn't a resync due to I/O errors, + * and nothing changed in this transaction group, + * and the vdev configuration hasn't changed, * then there's nothing to do. */ - if (ub->ub_txg < txg && uberblock_update(ub, rvd, txg) == B_FALSE && - list_is_empty(&spa->spa_dirty_list)) { - dprintf("nothing to sync in %s in txg %llu\n", - spa_name(spa), txg); + if (ub->ub_txg < txg && + uberblock_update(ub, spa->spa_root_vdev, txg) == B_FALSE && + list_is_empty(&spa->spa_config_dirty_list)) return (0); - } if (txg > spa_freeze_txg(spa)) return (0); ASSERT(txg <= spa->spa_final_txg); - dprintf("syncing %s txg %llu\n", spa_name(spa), txg); - /* * Flush the write cache of every disk that's been written to * in this transaction group. This ensures that all blocks * written in this txg will be committed to stable storage * before any uberblock that references them. */ - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); + zio = zio_root(spa, NULL, NULL, flags); + for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd; - vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) { - zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); - } + vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) + zio_flush(zio, vd); + (void) zio_wait(zio); /* @@ -921,34 +1039,15 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) * system dies in the middle of this process, that's OK: all of the * even labels that made it to disk will be newer than any uberblock, * and will therefore be considered invalid. The odd labels (L1, L3), - * which have not yet been touched, will still be valid. - */ - for (vd = list_head(&spa->spa_dirty_list); vd != NULL; - vd = list_next(&spa->spa_dirty_list, vd)) { - for (l = 0; l < VDEV_LABELS; l++) { - if (l & 1) - continue; - if ((error = vdev_sync_labels(vd, l, txg)) != 0) - return (error); - } - } - - /* - * Flush the new labels to disk. This ensures that all even-label - * updates are committed to stable storage before the uberblock update. + * which have not yet been touched, will still be valid. We flush + * the new labels to disk to ensure that all even-label updates + * are committed to stable storage before the uberblock update. */ - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); - for (vd = list_head(&spa->spa_dirty_list); vd != NULL; - vd = list_next(&spa->spa_dirty_list, vd)) { - zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); - } - (void) zio_wait(zio); + if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) + return (error); /* - * Sync the uberblocks to all vdevs in the tree specified by uvd. + * Sync the uberblocks to all vdevs in svd[]. 
* If the system dies in the middle of this step, there are two cases * to consider, and the on-disk state is consistent either way: * @@ -962,50 +1061,18 @@ vdev_config_sync(vdev_t *uvd, uint64_t txg) * been successfully committed) will be valid with respect * to the new uberblocks. */ - if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0) + if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) return (error); /* - * Flush the uberblocks to disk. This ensures that the odd labels - * are no longer needed (because the new uberblocks and the even - * labels are safely on disk), so it is safe to overwrite them. - */ - (void) zio_wait(zio_ioctl(NULL, spa, uvd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); - - /* * Sync out odd labels for every dirty vdev. If the system dies * in the middle of this process, the even labels and the new * uberblocks will suffice to open the pool. The next time * the pool is opened, the first thing we'll do -- before any * user data is modified -- is mark every vdev dirty so that - * all labels will be brought up to date. + * all labels will be brought up to date. We flush the new labels + * to disk to ensure that all odd-label updates are committed to + * stable storage before the next transaction group begins. */ - for (vd = list_head(&spa->spa_dirty_list); vd != NULL; - vd = list_next(&spa->spa_dirty_list, vd)) { - for (l = 0; l < VDEV_LABELS; l++) { - if ((l & 1) == 0) - continue; - if ((error = vdev_sync_labels(vd, l, txg)) != 0) - return (error); - } - } - - /* - * Flush the new labels to disk. This ensures that all odd-label - * updates are committed to stable storage before the next - * transaction group begins. - */ - zio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL); - for (vd = list_head(&spa->spa_dirty_list); vd != NULL; - vd = list_next(&spa->spa_dirty_list, vd)) { - zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); - } - (void) zio_wait(zio); - - return (0); + return (vdev_label_sync_list(spa, 1, txg, flags)); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c index 73d1a83d9436..c4629ff45087 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
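The vdev_mirror.c hunks that follow stop freeing the mirror map by hand on every completion path; instead vdev_mirror_map_free() is registered as the zio's io_vsd_free destructor and the pipeline runs it exactly once. A minimal model of that ownership transfer (struct layout simplified):

#include <stdio.h>
#include <stdlib.h>

struct zio {
    void *io_vsd;                   /* vdev-specific data */
    void (*io_vsd_free)(struct zio *);
};

static void
map_free(struct zio *zio)
{
    printf("freeing mirror map\n");
    free(zio->io_vsd);
}

static void
zio_done(struct zio *zio)
{
    /* The pipeline, not each vdev done-path, runs the destructor. */
    if (zio->io_vsd != NULL && zio->io_vsd_free != NULL)
        zio->io_vsd_free(zio);
}

int
main(void)
{
    struct zio zio = { malloc(64), map_free };

    zio_done(&zio);
    return (0);
}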
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> @@ -39,8 +37,9 @@ typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; int mc_error; - short mc_tried; - short mc_skipped; + uint8_t mc_tried; + uint8_t mc_skipped; + uint8_t mc_speculative; } mirror_child_t; typedef struct mirror_map { @@ -53,6 +52,14 @@ typedef struct mirror_map { int vdev_mirror_shift = 21; +static void +vdev_mirror_map_free(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + + kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); +} + static mirror_map_t * vdev_mirror_map_alloc(zio_t *zio) { @@ -110,18 +117,10 @@ vdev_mirror_map_alloc(zio_t *zio) } zio->io_vsd = mm; + zio->io_vsd_free = vdev_mirror_map_free; return (mm); } -static void -vdev_mirror_map_free(zio_t *zio) -{ - mirror_map_t *mm = zio->io_vsd; - - kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); - zio->io_vsd = NULL; -} - static int vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { @@ -195,13 +194,6 @@ vdev_mirror_scrub_done(zio_t *zio) mc->mc_skipped = 0; } -static void -vdev_mirror_repair_done(zio_t *zio) -{ - ASSERT(zio->io_private == zio->io_parent); - vdev_mirror_map_free(zio->io_private); -} - /* * Try to find a child whose DTL doesn't contain the block we want to read. * If we can't, try the read on any vdev we haven't already tried. @@ -219,7 +211,7 @@ vdev_mirror_child_select(zio_t *zio) /* * Try to find a child whose DTL doesn't contain the block to read. * If a child is known to be completely inaccessible (indicated by - * vdev_is_dead() returning B_TRUE), don't even try. + * vdev_readable() returning B_FALSE), don't even try. */ for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { if (c >= mm->mm_children) @@ -227,7 +219,7 @@ vdev_mirror_child_select(zio_t *zio) mc = &mm->mm_child[c]; if (mc->mc_tried || mc->mc_skipped) continue; - if (vdev_is_dead(mc->mc_vd)) { + if (!vdev_readable(mc->mc_vd)) { mc->mc_error = ENXIO; mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; @@ -237,6 +229,7 @@ vdev_mirror_child_select(zio_t *zio) return (c); mc->mc_error = ESTALE; mc->mc_skipped = 1; + mc->mc_speculative = 1; } /* @@ -253,7 +246,7 @@ vdev_mirror_child_select(zio_t *zio) return (-1); } -static void +static int vdev_mirror_io_start(zio_t *zio) { mirror_map_t *mm; @@ -275,12 +268,10 @@ vdev_mirror_io_start(zio_t *zio) zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio_buf_alloc(zio->io_size), zio->io_size, - zio->io_type, zio->io_priority, - ZIO_FLAG_CANFAIL, + zio->io_type, zio->io_priority, 0, vdev_mirror_scrub_done, mc)); } - zio_wait_children_done(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } /* * For normal reads just pick one child. 
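A sketch of the child-selection loop above: start at the preferred child, skip anything already tried or skipped, treat unreadable children as ENXIO without issuing I/O, and mark DTL hits ESTALE and speculative. readable and dtl_contains are stand-in inputs for vdev_readable() and the DTL lookup:

#include <errno.h>
#include <stdio.h>

struct child {
    int tried, skipped, speculative, error;
    int readable;
    int dtl_contains;   /* block known-missing on this child? */
};

static int
child_select(struct child *mc, int children, int preferred)
{
    int c = preferred;

    for (int i = 0; i < children; i++, c++) {
        if (c >= children)
            c = 0;
        if (mc[c].tried || mc[c].skipped)
            continue;
        if (!mc[c].readable) {
            mc[c].error = ENXIO;
            mc[c].tried = 1;        /* don't even try */
            mc[c].skipped = 1;
            continue;
        }
        if (!mc[c].dtl_contains)
            return (c);
        mc[c].error = ESTALE;
        mc[c].skipped = 1;
        mc[c].speculative = 1;
    }
    return (-1);    /* caller falls back to any untried child */
}

int
main(void)
{
    struct child mc[3] = {
        { .readable = 0 },
        { .readable = 1, .dtl_contains = 1 },
        { .readable = 1, .dtl_contains = 0 },
    };

    printf("selected child %d\n", child_select(mc, 3, 0));  /* 2 */
    return (0);
}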
@@ -310,13 +301,27 @@ vdev_mirror_io_start(zio_t *zio) while (children--) { mc = &mm->mm_child[c]; zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, - zio->io_data, zio->io_size, zio->io_type, zio->io_priority, - ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc)); + mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_mirror_child_done, mc)); c++; } - zio_wait_children_done(zio); + return (ZIO_PIPELINE_CONTINUE); +} + +static int +vdev_mirror_worst_error(mirror_map_t *mm) +{ + int error[2] = { 0, 0 }; + + for (int c = 0; c < mm->mm_children; c++) { + mirror_child_t *mc = &mm->mm_child[c]; + int s = mc->mc_speculative; + error[s] = zio_worst_error(error[s], mc->mc_error); + } + + return (error[0] ? error[0] : error[1]); } static void @@ -328,41 +333,45 @@ vdev_mirror_io_done(zio_t *zio) int good_copies = 0; int unexpected_errors = 0; - zio->io_error = 0; - zio->io_numerrors = 0; - for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; - if (mc->mc_tried && mc->mc_error == 0) { - good_copies++; - continue; - } - - /* - * We preserve any EIOs because those may be worth retrying; - * whereas ECKSUM and ENXIO are more likely to be persistent. - */ if (mc->mc_error) { - if (zio->io_error != EIO) - zio->io_error = mc->mc_error; if (!mc->mc_skipped) unexpected_errors++; - zio->io_numerrors++; + } else if (mc->mc_tried) { + good_copies++; } } if (zio->io_type == ZIO_TYPE_WRITE) { /* * XXX -- for now, treat partial writes as success. - * XXX -- For a replacing vdev, we need to make sure the - * new child succeeds. + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate. */ /* XXPOLICY */ - if (good_copies != 0) - zio->io_error = 0; - vdev_mirror_map_free(zio); - zio_next_stage(zio); + if (good_copies != mm->mm_children) { + /* + * Always require at least one good copy. + * + * For ditto blocks (io_vd == NULL), require + * all copies to be good. + * + * XXX -- for replacing vdevs, there's no great answer. + * If the old device is really dead, we may not even + * be able to access it -- so we only want to + * require good writes to the new device. But if + * the new device turns out to be flaky, we want + * to be able to detach it -- which requires all + * writes to the old device to have succeeded. 
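vdev_mirror_worst_error() above keeps two running worst errors, one for real failures and one for speculative (ESTALE) skips, and only falls back to the speculative bucket when no real error exists. The sketch below assumes zio_worst_error() ranks errors by the { 0, ENXIO, ECKSUM, EIO } table in zio.c, with unlisted errors ranking worst:

#include <errno.h>
#include <stdio.h>

static int
worst_error(int e1, int e2)
{
    static const int rank[] = { 0, ENXIO, ECKSUM, EIO };
    int r1, r2;

    for (r1 = 0; r1 < 4; r1++)
        if (e1 == rank[r1])
            break;
    for (r2 = 0; r2 < 4; r2++)
        if (e2 == rank[r2])
            break;
    return (r1 > r2 ? e1 : e2);     /* unlisted errors rank worst */
}

struct child { int speculative, error; };

static int
mirror_worst_error(struct child *mc, int children)
{
    int error[2] = { 0, 0 };

    for (int c = 0; c < children; c++)
        error[mc[c].speculative] =
            worst_error(error[mc[c].speculative], mc[c].error);
    return (error[0] ? error[0] : error[1]);
}

int
main(void)
{
    struct child mc[2] = { { 1, ESTALE }, { 0, ENXIO } };

    /* The real ENXIO outranks the speculative ESTALE. */
    printf("%d\n", mirror_worst_error(mc, 2));
    return (0);
}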
+ */ + if (good_copies == 0 || zio->io_vd == NULL) + zio->io_error = vdev_mirror_worst_error(mm); + } return; } @@ -375,40 +384,27 @@ vdev_mirror_io_done(zio_t *zio) if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { ASSERT(c >= 0 && c < mm->mm_children); mc = &mm->mm_child[c]; - dprintf("retrying i/o (err=%d) on child %s\n", - zio->io_error, vdev_description(mc->mc_vd)); - zio->io_error = 0; zio_vdev_io_redone(zio); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, - ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL, + ZIO_TYPE_READ, zio->io_priority, 0, vdev_mirror_child_done, mc)); - zio_wait_children_done(zio); return; } /* XXPOLICY */ - if (good_copies) - zio->io_error = 0; - else + if (good_copies == 0) { + zio->io_error = vdev_mirror_worst_error(mm); ASSERT(zio->io_error != 0); + } if (good_copies && (spa_mode & FWRITE) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER) || ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { - zio_t *rio; - /* * Use the good data we have in hand to repair damaged children. - * - * We issue all repair I/Os as children of 'rio' to arrange - * that vdev_mirror_map_free(zio) will be invoked after all - * repairs complete, but before we advance to the next stage. */ - rio = zio_null(zio, zio->io_spa, - vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL); - for (c = 0; c < mm->mm_children; c++) { /* * Don't rewrite known good children. @@ -429,24 +425,13 @@ vdev_mirror_io_done(zio_t *zio) mc->mc_error = ESTALE; } - dprintf("resilvered %s @ 0x%llx error %d\n", - vdev_description(mc->mc_vd), mc->mc_offset, - mc->mc_error); - - zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd, - mc->mc_offset, zio->io_data, zio->io_size, + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, + zio->io_data, zio->io_size, ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE, NULL, NULL)); + ZIO_FLAG_IO_REPAIR, NULL, NULL)); } - - zio_nowait(rio); - zio_wait_children_done(zio); - return; } - - vdev_mirror_map_free(zio); - zio_next_stage(zio); } static void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c index b35f4a5bcd03..731f7d3dcec9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * The 'missing' vdev is a special vdev type used only during import. 
It * signifies a placeholder in the root vdev for some vdev that we know is @@ -63,18 +60,17 @@ vdev_missing_close(vdev_t *vd) } /* ARGSUSED */ -static void +static int vdev_missing_io_start(zio_t *zio) { zio->io_error = ENOTSUP; - zio_next_stage_async(zio); + return (ZIO_PIPELINE_CONTINUE); } /* ARGSUSED */ static void vdev_missing_io_done(zio_t *zio) { - zio_next_stage(zio); } vdev_ops_t vdev_missing_ops = { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c index 8ef524f71931..cd4d5aef241f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> @@ -55,6 +53,25 @@ int zfs_vdev_ramp_rate = 2; */ int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; +SYSCTL_DECL(_vfs_zfs_vdev); +TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RDTUN, + &zfs_vdev_max_pending, 0, "Maximum I/O requests pending on each device"); +TUNABLE_INT("vfs.zfs.vdev.min_pending", &zfs_vdev_min_pending); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RDTUN, + &zfs_vdev_min_pending, 0, + "Initial number of I/O requests pending to each device"); +TUNABLE_INT("vfs.zfs.vdev.time_shift", &zfs_vdev_time_shift); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, time_shift, CTLFLAG_RDTUN, + &zfs_vdev_time_shift, 0, "Used for calculating I/O request deadline"); +TUNABLE_INT("vfs.zfs.vdev.ramp_rate", &zfs_vdev_ramp_rate); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, ramp_rate, CTLFLAG_RDTUN, + &zfs_vdev_ramp_rate, 0, "Exponential I/O issue ramp-up rate"); +TUNABLE_INT("vfs.zfs.vdev.aggregation_limit", &zfs_vdev_aggregation_limit); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RDTUN, + &zfs_vdev_aggregation_limit, 0, + "I/O requests are aggregated up to this size"); + /* * Virtual device vector for disk I/O scheduling. 
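The new SYSCTL_INT knobs above are CTLFLAG_RDTUN: readable at runtime via sysctl(8) but settable only as boot-time tunables, which is why each one is paired with a TUNABLE_INT. For example, in /boot/loader.conf (values illustrative, not recommendations):

# CTLFLAG_RDTUN sysctls are set at boot, not at runtime
vfs.zfs.vdev.max_pending="35"
vfs.zfs.vdev.aggregation_limit="131072"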
*/ @@ -162,7 +179,7 @@ vdev_queue_agg_io_done(zio_t *aio) aio->io_delegate_list = dio->io_delegate_next; dio->io_delegate_next = NULL; dio->io_error = aio->io_error; - zio_next_stage(dio); + zio_execute(dio); } ASSERT3U(offset, ==, aio->io_size); @@ -172,11 +189,8 @@ vdev_queue_agg_io_done(zio_t *aio) #define IS_ADJACENT(io, nio) \ ((io)->io_offset + (io)->io_size == (nio)->io_offset) -typedef void zio_issue_func_t(zio_t *); - static zio_t * -vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, - zio_issue_func_t **funcp) +vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { zio_t *fio, *lio, *aio, *dio; avl_tree_t *tree; @@ -184,8 +198,6 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, ASSERT(MUTEX_HELD(&vq->vq_lock)); - *funcp = NULL; - if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || avl_numnodes(&vq->vq_deadline_tree) == 0) return (NULL); @@ -196,6 +208,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, size = fio->io_size; while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && + !((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && size + dio->io_size <= zfs_vdev_aggregation_limit) { dio->io_delegate_next = fio; fio = dio; @@ -203,6 +216,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, } while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && + !((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && size + dio->io_size <= zfs_vdev_aggregation_limit) { lio->io_delegate_next = dio; lio = dio; @@ -212,15 +226,12 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, if (fio != lio) { char *buf = zio_buf_alloc(size); uint64_t offset = 0; - int nagg = 0; ASSERT(size <= zfs_vdev_aggregation_limit); - aio = zio_vdev_child_io(fio, NULL, fio->io_vd, - fio->io_offset, buf, size, fio->io_type, - ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_NOBOOKMARK, + aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, + buf, size, fio->io_type, ZIO_PRIORITY_NOW, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_delegate_list = fio; @@ -233,19 +244,12 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, offset += dio->io_size; vdev_queue_io_remove(vq, dio); zio_vdev_io_bypass(dio); - nagg++; } ASSERT(offset == size); - dprintf("%5s T=%llu off=%8llx agg=%3d " - "old=%5llx new=%5llx\n", - zio_type_name[fio->io_type], - fio->io_deadline, fio->io_offset, nagg, fio->io_size, size); - avl_add(&vq->vq_pending_tree, aio); - *funcp = zio_nowait; return (aio); } @@ -254,8 +258,6 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, avl_add(&vq->vq_pending_tree, fio); - *funcp = zio_next_stage; - return (fio); } @@ -264,7 +266,6 @@ vdev_queue_io(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; - zio_issue_func_t *func; ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); @@ -280,42 +281,45 @@ vdev_queue_io(zio_t *zio) mutex_enter(&vq->vq_lock); - zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) + - zio->io_priority; + zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority; vdev_queue_io_add(vq, zio); - nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func); + nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending); mutex_exit(&vq->vq_lock); - if (nio == NULL || func != zio_nowait) - return (nio); + if (nio == NULL) + return (NULL); + + if (nio->io_done == 
vdev_queue_agg_io_done) { + zio_nowait(nio); + return (NULL); + } - func(nio); - return (NULL); + return (nio); } void vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; - zio_t *nio; - zio_issue_func_t *func; - int i; mutex_enter(&vq->vq_lock); avl_remove(&vq->vq_pending_tree, zio); - for (i = 0; i < zfs_vdev_ramp_rate; i++) { - nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func); + for (int i = 0; i < zfs_vdev_ramp_rate; i++) { + zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); if (nio == NULL) break; mutex_exit(&vq->vq_lock); - if (func == zio_next_stage) + if (nio->io_done == vdev_queue_agg_io_done) { + zio_nowait(nio); + } else { zio_vdev_io_reissue(nio); - func(nio); + zio_execute(nio); + } mutex_enter(&vq->vq_lock); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c index 0c866307653b..69e314468ee4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> @@ -194,6 +192,18 @@ vdev_raidz_exp2(uint_t a, int exp) return (vdev_raidz_pow2[exp]); } +static void +vdev_raidz_map_free(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + int c; + + for (c = 0; c < rm->rm_firstdatacol; c++) + zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + + kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); +} + static raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, uint64_t nparity) @@ -276,23 +286,11 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, } zio->io_vsd = rm; + zio->io_vsd_free = vdev_raidz_map_free; return (rm); } static void -vdev_raidz_map_free(zio_t *zio) -{ - raidz_map_t *rm = zio->io_vsd; - int c; - - for (c = 0; c < rm->rm_firstdatacol; c++) - zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); - - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); - zio->io_vsd = NULL; -} - -static void vdev_raidz_generate_parity_p(raidz_map_t *rm) { uint64_t *p, *src, pcount, ccount, i; @@ -632,14 +630,7 @@ vdev_raidz_child_done(zio_t *zio) rc->rc_skipped = 0; } -static void -vdev_raidz_repair_done(zio_t *zio) -{ - ASSERT(zio->io_private == zio->io_parent); - vdev_raidz_map_free(zio->io_private); -} - -static void +static int vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -669,11 +660,11 @@ vdev_raidz_io_start(zio_t *zio) cvd = vd->vdev_child[rc->rc_devidx]; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, - zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, + zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } - zio_wait_children_done(zio); - return; + + return (ZIO_PIPELINE_CONTINUE); } ASSERT(zio->io_type == ZIO_TYPE_READ); @@ -686,7 +677,7 @@ vdev_raidz_io_start(zio_t *zio) for (c = rm->rm_cols - 1; c >= 0; c--) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; - if (vdev_is_dead(cvd)) { + if (!vdev_readable(cvd)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else @@ -709,12 +700,12 @@ vdev_raidz_io_start(zio_t *zio) (zio->io_flags & ZIO_FLAG_SCRUB)) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, 
rc->rc_data, rc->rc_size, - zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, + zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } } - zio_wait_children_done(zio); + return (ZIO_PIPELINE_CONTINUE); } /* @@ -724,8 +715,6 @@ static void raidz_checksum_error(zio_t *zio, raidz_col_t *rc) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; - dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", - vdev_description(vd)); if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { mutex_enter(&vd->vdev_stat_lock); @@ -783,6 +772,17 @@ static uint64_t raidz_corrected_p; static uint64_t raidz_corrected_q; static uint64_t raidz_corrected_pq; +static int +vdev_raidz_worst_error(raidz_map_t *rm) +{ + int error = 0; + + for (int c = 0; c < rm->rm_cols; c++) + error = zio_worst_error(error, rm->rm_col[c].rc_error); + + return (error); +} + static void vdev_raidz_io_done(zio_t *zio) { @@ -794,26 +794,19 @@ vdev_raidz_io_done(zio_t *zio) int parity_errors = 0; int parity_untried = 0; int data_errors = 0; + int total_errors = 0; int n, c, c1; ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ - zio->io_error = 0; - zio->io_numerrors = 0; - ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; - /* - * We preserve any EIOs because those may be worth retrying; - * whereas ECKSUM and ENXIO are more likely to be persistent. - */ if (rc->rc_error) { - if (zio->io_error != EIO) - zio->io_error = rc->rc_error; + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ if (c < rm->rm_firstdatacol) parity_errors++; @@ -823,7 +816,7 @@ vdev_raidz_io_done(zio_t *zio) if (!rc->rc_skipped) unexpected_errors++; - zio->io_numerrors++; + total_errors++; } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { parity_untried++; } @@ -831,16 +824,19 @@ vdev_raidz_io_done(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE) { /* - * If this is not a failfast write, and we were able to - * write enough columns to reconstruct the data, good enough. + * XXX -- for now, treat partial writes as a success. + * (If we couldn't write enough columns to reconstruct + * the data, the I/O failed. Otherwise, good enough.) + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate. */ /* XXPOLICY */ - if (zio->io_numerrors <= rm->rm_firstdatacol && - !(zio->io_flags & ZIO_FLAG_FAILFAST)) - zio->io_error = 0; + if (total_errors > rm->rm_firstdatacol) + zio->io_error = vdev_raidz_worst_error(rm); - vdev_raidz_map_free(zio); - zio_next_stage(zio); return; } @@ -862,12 +858,10 @@ vdev_raidz_io_done(zio_t *zio) * has a valid checksum. Naturally, this case applies in the absence of * any errors. 
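The rewritten accounting above reduces the old io_numerrors bookkeeping to total_errors, compared against rm_firstdatacol, the number of parity columns. A compressed sketch of the resulting verdicts, folding the write-path check above and the ECKSUM inference in the hunk below into one function (EIO stands in for vdev_raidz_worst_error(), and the read and write paths are conflated for brevity):

#include <errno.h>
#include <stdio.h>

static int
raidz_verdict(int total_errors, int nparity, int reconstructed_ok)
{
    if (total_errors > nparity)
        return (EIO);       /* more bad columns than parity can cover */
    if (reconstructed_ok)
        return (0);         /* clean read, or reconstruction succeeded */
    /* Enough parity existed, yet no combination checksummed: at least
     * one column must have returned bad data silently. */
    return (ECKSUM);
}

int
main(void)
{
    printf("%d\n", raidz_verdict(1, 2, 1));     /* 0: recoverable */
    printf("%d\n", raidz_verdict(3, 2, 0));     /* EIO */
    printf("%d\n", raidz_verdict(2, 2, 0));     /* ECKSUM */
    return (0);
}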
*/ - if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) { + if (total_errors <= rm->rm_firstdatacol - parity_untried) { switch (data_errors) { case 0: if (zio_checksum_error(zio) == 0) { - zio->io_error = 0; - /* * If we read parity information (unnecessarily * as it happens since no reconstruction was @@ -919,7 +913,6 @@ vdev_raidz_io_done(zio_t *zio) } if (zio_checksum_error(zio) == 0) { - zio->io_error = 0; if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) atomic_inc_64(&raidz_corrected_p); else @@ -981,9 +974,7 @@ vdev_raidz_io_done(zio_t *zio) vdev_raidz_reconstruct_pq(rm, c1, c); if (zio_checksum_error(zio) == 0) { - zio->io_error = 0; atomic_inc_64(&raidz_corrected_pq); - goto done; } break; @@ -1009,7 +1000,6 @@ vdev_raidz_io_done(zio_t *zio) if (rm->rm_col[c].rc_tried) continue; - zio->io_error = 0; zio_vdev_io_redone(zio); do { rc = &rm->rm_col[c]; @@ -1018,11 +1008,10 @@ vdev_raidz_io_done(zio_t *zio) zio_nowait(zio_vdev_child_io(zio, NULL, vd->vdev_child[rc->rc_devidx], rc->rc_offset, rc->rc_data, rc->rc_size, - zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, + zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } while (++c < rm->rm_cols); - dprintf("rereading\n"); - zio_wait_children_done(zio); + return; } @@ -1034,8 +1023,15 @@ vdev_raidz_io_done(zio_t *zio) * in absent data. Before we attempt combinatorial reconstruction make * sure we have a chance of coming up with the right answer. */ - if (zio->io_numerrors >= rm->rm_firstdatacol) { - ASSERT(zio->io_error != 0); + if (total_errors >= rm->rm_firstdatacol) { + zio->io_error = vdev_raidz_worst_error(rm); + /* + * If there were exactly as many device errors as parity + * columns, yet we couldn't reconstruct the data, then at + * least one device must have returned bad data silently. + */ + if (total_errors == rm->rm_firstdatacol) + zio->io_error = zio_worst_error(zio->io_error, ECKSUM); goto done; } @@ -1053,7 +1049,6 @@ vdev_raidz_io_done(zio_t *zio) if (zio_checksum_error(zio) == 0) { zio_buf_free(orig, rc->rc_size); - zio->io_error = 0; atomic_inc_64(&raidz_corrected_p); /* @@ -1085,7 +1080,6 @@ vdev_raidz_io_done(zio_t *zio) if (zio_checksum_error(zio) == 0) { zio_buf_free(orig, rc->rc_size); - zio->io_error = 0; atomic_inc_64(&raidz_corrected_q); /* @@ -1127,7 +1121,6 @@ vdev_raidz_io_done(zio_t *zio) if (zio_checksum_error(zio) == 0) { zio_buf_free(orig, rc->rc_size); zio_buf_free(orig1, rc1->rc_size); - zio->io_error = 0; atomic_inc_64(&raidz_corrected_pq); /* @@ -1159,6 +1152,7 @@ vdev_raidz_io_done(zio_t *zio) * all children. */ zio->io_error = ECKSUM; + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; @@ -1173,18 +1167,9 @@ done: if (zio->io_error == 0 && (spa_mode & FWRITE) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { - zio_t *rio; - /* * Use the good data we have in hand to repair damaged children. - * - * We issue all repair I/Os as children of 'rio' to arrange - * that vdev_raidz_map_free(zio) will be invoked after all - * repairs complete, but before we advance to the next stage. 
*/ - rio = zio_null(zio, zio->io_spa, - vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL); - for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; @@ -1192,25 +1177,12 @@ done: if (rc->rc_error == 0) continue; - dprintf("%s resilvered %s @ 0x%llx error %d\n", - vdev_description(vd), - vdev_description(cvd), - zio->io_offset, rc->rc_error); - - zio_nowait(zio_vdev_child_io(rio, NULL, cvd, + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_CANFAIL, NULL, NULL)); + ZIO_FLAG_IO_REPAIR, NULL, NULL)); } - - zio_nowait(rio); - zio_wait_children_done(zio); - return; } - - vdev_raidz_map_free(zio); - zio_next_stage(zio); } static void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c index 0e8752c6ce83..88383f002b80 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> @@ -44,18 +42,17 @@ * probably fine. Adding bean counters during alloc/free can make this * future guesswork more accurate. */ -/*ARGSUSED*/ static int too_many_errors(vdev_t *vd, int numerrors) { + ASSERT3U(numerrors, <=, vd->vdev_children); return (numerrors > 0); } static int vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { - vdev_t *cvd; - int c, error; + int c; int lasterror = 0; int numerrors = 0; @@ -65,9 +62,11 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) } for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_t *cvd = vd->vdev_child[c]; + int error; - if ((error = vdev_open(cvd)) != 0) { + if ((error = vdev_open(cvd)) != 0 && + !cvd->vdev_islog) { lasterror = error; numerrors++; continue; @@ -97,13 +96,14 @@ vdev_root_close(vdev_t *vd) static void vdev_root_state_change(vdev_t *vd, int faulted, int degraded) { - if (too_many_errors(vd, faulted)) + if (too_many_errors(vd, faulted)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); - else if (degraded != 0) + } else if (degraded) { vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - else + } else { vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + } } vdev_ops_t vdev_root_ops = { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c index 4246ec0b0e6c..90fe3d094318 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -44,6 +44,7 @@ #include <sys/spa.h> #include <sys/dmu.h> #include <sys/zfs_context.h> +#include <sys/zfs_znode.h> #include <sys/zap.h> #include <sys/refcount.h> #include <sys/zap_impl.h> @@ -103,6 +104,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx) zp->zap_num_leafs = 1; zp->zap_num_entries = 0; zp->zap_salt = zap->zap_salt; + zp->zap_normflags = zap->zap_normflags; /* block 1 will be the first leaf */ for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++) @@ -119,7 +121,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx) l->l_dbuf = db; l->l_phys = db->db_data; - zap_leaf_init(l); + zap_leaf_init(l, zp->zap_normflags != 0); kmem_free(l, sizeof (zap_leaf_t)); dmu_buf_rele(db, FTAG); @@ -399,7 +401,7 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) ASSERT(winner == NULL); dmu_buf_will_dirty(l->l_dbuf, tx); - zap_leaf_init(l); + zap_leaf_init(l, zap->zap_normflags != 0); zap->zap_f.zap_phys->zap_num_leafs++; @@ -580,9 +582,10 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) } static int -zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx, - zap_leaf_t **lp) +zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp) { + zap_t *zap = zn->zn_zap; + uint64_t hash = zn->zn_hash; zap_leaf_t *nl; int prefix_diff, i, err; uint64_t sibling; @@ -602,7 +605,9 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx, zap_put_leaf(l); zap_unlockdir(zap); - err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap); + err = zap_lockdir(os, object, tx, RW_WRITER, + FALSE, FALSE, &zn->zn_zap); + zap = zn->zn_zap; if (err) return (err); ASSERT(!zap->zap_ismicro); @@ -643,7 +648,7 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx, } nl = zap_create_leaf(zap, tx); - zap_leaf_split(l, nl); + zap_leaf_split(l, nl, zap->zap_normflags != 0); /* set sibling pointers */ for (i = 0; i < (1ULL<<prefix_diff); i++) { @@ -664,8 +669,9 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx, } static void -zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx) +zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) { + zap_t *zap = zn->zn_zap; int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift && l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); @@ -685,7 +691,8 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx) zap_unlockdir(zap); err = zap_lockdir(os, zapobj, tx, - RW_WRITER, FALSE, &zap); + RW_WRITER, FALSE, FALSE, &zn->zn_zap); + zap = zn->zn_zap; if (err) return; } @@ -721,53 +728,58 @@ fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers) } /* - * Routines for maniplulating attributes. + * Routines for manipulating attributes. 
*/ int -fzap_lookup(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf) +fzap_lookup(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, void *buf, + char *realname, int rn_len, boolean_t *ncp) { zap_leaf_t *l; int err; - uint64_t hash; zap_entry_handle_t zeh; - err = fzap_checksize(name, integer_size, num_integers); + err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); if (err != 0) return (err); - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l); + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) return (err); - err = zap_leaf_lookup(l, name, hash, &zeh); - if (err == 0) + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { err = zap_entry_read(&zeh, integer_size, num_integers, buf); + (void) zap_entry_read_name(&zeh, rn_len, realname); + if (ncp) { + *ncp = zap_entry_normalization_conflict(&zeh, + zn, NULL, zn->zn_zap); + } + } zap_put_leaf(l); return (err); } int -fzap_add_cd(zap_t *zap, const char *name, +fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, dmu_tx_t *tx) { zap_leaf_t *l; - uint64_t hash; int err; zap_entry_handle_t zeh; + zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(!zap->zap_ismicro); - ASSERT(fzap_checksize(name, integer_size, num_integers) == 0); + ASSERT(fzap_checksize(zn->zn_name_orij, + integer_size, num_integers) == 0); - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); retry: - err = zap_leaf_lookup(l, name, hash, &zeh); + err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { err = EEXIST; goto out; @@ -775,63 +787,62 @@ retry: if (err != ENOENT) goto out; - err = zap_entry_create(l, name, hash, cd, + err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, cd, integer_size, num_integers, val, &zeh); if (err == 0) { zap_increment_num_entries(zap, 1, tx); } else if (err == EAGAIN) { - err = zap_expand_leaf(zap, l, hash, tx, &l); + err = zap_expand_leaf(zn, l, tx, &l); + zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } out: - zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); + if (zap != NULL) + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); return (err); } int -fzap_add(zap_t *zap, const char *name, +fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { - int err = fzap_checksize(name, integer_size, num_integers); + int err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); if (err != 0) return (err); - return (fzap_add_cd(zap, name, integer_size, num_integers, + return (fzap_add_cd(zn, integer_size, num_integers, val, ZAP_MAXCD, tx)); } int -fzap_update(zap_t *zap, const char *name, +fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_leaf_t *l; - uint64_t hash; int err, create; zap_entry_handle_t zeh; + zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - err = fzap_checksize(name, integer_size, num_integers); + err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); if (err != 0) return (err); - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); retry: - err = zap_leaf_lookup(l, name, hash, &zeh); + err = zap_leaf_lookup(l, zn, &zeh); create = (err 
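With the fzap entry points above rekeyed on zap_name_t, every caller follows the same lifecycle: build the name (which precomputes the hash over the normalized form), call the fzap routine, free the name. A minimal consumer sketch, assuming the zap_name_alloc()/zap_name_free() pair that appears later in this diff (zap_micro.c) and eliding the zap_lockdir() locking that real callers hold:

/*
 * Sketch, not part of the patch: driving fzap_lookup() by hand.
 * MT_EXACT asks for a byte-for-byte match; NULL/0 for the realname
 * arguments mirrors what the plain zap_lookup() wrapper passes.
 */
static int
example_fzap_lookup(zap_t *zap, const char *name, uint64_t *valp)
{
	zap_name_t *zn;
	int err;

	zn = zap_name_alloc(zap, name, MT_EXACT);
	if (zn == NULL)
		return (ENOTSUP);

	err = fzap_lookup(zn, 8, 1, valp, NULL, 0, NULL);
	zap_name_free(zn);
	return (err);
}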
== ENOENT); ASSERT(err == 0 || err == ENOENT); - /* XXX If this leaf is chained, split it if we can. */ - if (create) { - err = zap_entry_create(l, name, hash, ZAP_MAXCD, - integer_size, num_integers, val, &zeh); + err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, + ZAP_MAXCD, integer_size, num_integers, val, &zeh); if (err == 0) zap_increment_num_entries(zap, 1, tx); } else { @@ -839,29 +850,29 @@ retry: } if (err == EAGAIN) { - err = zap_expand_leaf(zap, l, hash, tx, &l); + err = zap_expand_leaf(zn, l, tx, &l); + zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } - zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); + if (zap != NULL) + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); return (err); } int -fzap_length(zap_t *zap, const char *name, +fzap_length(zap_name_t *zn, uint64_t *integer_size, uint64_t *num_integers) { zap_leaf_t *l; int err; - uint64_t hash; zap_entry_handle_t zeh; - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l); + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) return (err); - err = zap_leaf_lookup(l, name, hash, &zeh); + err = zap_leaf_lookup(l, zn, &zeh); if (err != 0) goto out; @@ -875,40 +886,44 @@ out: } int -fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx) +fzap_remove(zap_name_t *zn, dmu_tx_t *tx) { zap_leaf_t *l; - uint64_t hash; int err; zap_entry_handle_t zeh; - hash = zap_hash(zap, name); - err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); - err = zap_leaf_lookup(l, name, hash, &zeh); + err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { zap_entry_remove(&zeh); - zap_increment_num_entries(zap, -1, tx); + zap_increment_num_entries(zn->zn_zap, -1, tx); } zap_put_leaf(l); - dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n", - zap->zap_objset, zap->zap_object, name, err); return (err); } +/* + * Helper functions for consumers. 
+ */ + int -zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name) +zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, + char *name) { zap_cursor_t zc; zap_attribute_t *za; int err; + if (mask == 0) + mask = -1ULL; + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); for (zap_cursor_init(&zc, os, zapobj); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { - if (ZFS_DIRENT_OBJ(za->za_first_integer) == value) { + if ((za->za_first_integer & mask) == (value & mask)) { (void) strcpy(name, za->za_name); break; } @@ -918,6 +933,53 @@ zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name) return (err); } +int +zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err; + + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + if (za.za_integer_length != 8 || za.za_num_integers != 1) + return (EINVAL); + err = zap_add(os, intoobj, za.za_name, + 8, 1, &za.za_first_integer, tx); + if (err) + return (err); + } + zap_cursor_fini(&zc); + return (0); +} + +int +zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); + return (zap_add(os, obj, name, 8, 1, &value, tx)); +} + +int +zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); + return (zap_remove(os, obj, name, tx)); +} + +int +zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); + return (zap_lookup(os, obj, name, 8, 1, &value)); +} /* * Routines for iterating over the attributes. @@ -983,6 +1045,10 @@ again: err = zap_entry_read_name(&zeh, sizeof (za->za_name), za->za_name); ASSERT(err == 0); + + za->za_normalization_conflict = + zap_entry_normalization_conflict(&zeh, + NULL, za->za_name, zap); } rw_exit(&zc->zc_leaf->l_rwlock); return (err); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c index 5dff5145308a..da498b6bc9e3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,6 +38,8 @@ #include <sys/spa.h> #include <sys/dmu.h> +static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); + #define CHAIN_END 0xffff /* end of the chunk chain */ /* half the (current) minimum block size */ @@ -150,7 +152,7 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) } void -zap_leaf_init(zap_leaf_t *l) +zap_leaf_init(zap_leaf_t *l, boolean_t sort) { int i; @@ -165,6 +167,8 @@ zap_leaf_init(zap_leaf_t *l) l->l_phys->l_hdr.lh_block_type = ZBT_LEAF; l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC; l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); + if (sort) + l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; } /* @@ -327,19 +331,30 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, /* * Only to be used on 8-bit arrays. * array_len is actual len in bytes (not encoded le_value_length). - * buf is null-terminated. + * namenorm is null-terminated. 
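The new consumer helpers above (zap_join(), zap_add_int(), zap_remove_int(), zap_lookup_int()) key each entry on the hex rendering of a 64-bit value, turning a ZAP object into a simple set of numbers. A usage sketch, assuming setobj is an existing ZAP object and tx an assigned transaction:

/* Sketch: treating a ZAP object as a set of uint64_t members. */
uint64_t member = 1234;		/* stored under the name "4d2" */
int err;

err = zap_add_int(os, setobj, member, tx);	/* insert */
err = zap_lookup_int(os, setobj, member);	/* 0 if present, ENOENT if not */
err = zap_remove_int(os, setobj, member, tx);	/* remove it again */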
*/ -static int -zap_leaf_array_equal(zap_leaf_t *l, int chunk, - int array_len, const char *buf) +static boolean_t +zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_len) { int bseen = 0; + if (zn->zn_matchtype == MT_FIRST) { + char *thisname = kmem_alloc(array_len, KM_SLEEP); + boolean_t match; + + zap_leaf_array_read(l, chunk, 1, array_len, 1, + array_len, thisname); + match = zap_match(zn, thisname); + kmem_free(thisname, array_len); + return (match); + } + + /* Fast path for exact matching */ while (bseen < array_len) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - if (bcmp(la->la_array, buf + bseen, toread)) + if (bcmp(la->la_array, zn->zn_name_orij + bseen, toread)) break; chunk = la->la_next; bseen += toread; @@ -352,15 +367,15 @@ zap_leaf_array_equal(zap_leaf_t *l, int chunk, */ int -zap_leaf_lookup(zap_leaf_t *l, - const char *name, uint64_t h, zap_entry_handle_t *zeh) +zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) { uint16_t *chunkp; struct zap_leaf_entry *le; ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - for (chunkp = LEAF_HASH_ENTPTR(l, h); +again: + for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); *chunkp != CHAIN_END; chunkp = &le->le_next) { uint16_t chunk = *chunkp; le = ZAP_LEAF_ENTRY(l, chunk); @@ -368,11 +383,18 @@ zap_leaf_lookup(zap_leaf_t *l, ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - if (le->le_hash != h) + if (le->le_hash != zn->zn_hash) continue; - if (zap_leaf_array_equal(l, le->le_name_chunk, - le->le_name_length, name)) { + /* + * NB: the entry chain is always sorted by cd on + * normalized zap objects, so this will find the + * lowest-cd match for MT_FIRST. + */ + ASSERT(zn->zn_matchtype == MT_EXACT || + (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); + if (zap_leaf_array_match(l, zn, le->le_name_chunk, + le->le_name_length)) { zeh->zeh_num_integers = le->le_value_length; zeh->zeh_integer_size = le->le_int_size; zeh->zeh_cd = le->le_cd; @@ -383,6 +405,15 @@ zap_leaf_lookup(zap_leaf_t *l, } } + /* + * NB: we could of course do this in one pass, but that would be + * a pain. We'll see if MT_BEST is even used much. + */ + if (zn->zn_matchtype == MT_BEST) { + zn->zn_matchtype = MT_FIRST; + goto again; + } + return (ENOENT); } @@ -539,22 +570,41 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, return (E2BIG); if (cd == ZAP_MAXCD) { - for (cd = 0; cd < ZAP_MAXCD; cd++) { + /* find the lowest unused cd */ + if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { + cd = 0; + for (chunk = *LEAF_HASH_ENTPTR(l, h); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); - if (le->le_hash == h && - le->le_cd == cd) { + if (le->le_cd > cd) break; + if (le->le_hash == h) { + ASSERT3U(cd, ==, le->le_cd); + cd++; } } - /* If this cd is not in use, we are good. */ - if (chunk == CHAIN_END) - break; + } else { + /* old unsorted format; do it the O(n^2) way */ + for (cd = 0; cd < ZAP_MAXCD; cd++) { + for (chunk = *LEAF_HASH_ENTPTR(l, h); + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(l, chunk); + if (le->le_hash == h && + le->le_cd == cd) { + break; + } + } + /* If this cd is not in use, we are good. */ + if (chunk == CHAIN_END) + break; + } } - /* If we tried all the cd's, we lose. 
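The reworked zap_leaf_lookup() above is where the three match types get their operational meaning. A summary, as inferred from this hunk together with zap_match() later in zap_micro.c:

/*
 * Match-type behaviour (summary of the code above, not new logic):
 *
 *   MT_EXACT - compare the original, un-normalized name byte for
 *              byte (the fast path in zap_leaf_array_match()).
 *   MT_BEST  - first pass behaves like MT_EXACT; if nothing matches,
 *              zap_leaf_lookup() flips zn_matchtype to MT_FIRST and
 *              rescans (the "goto again").
 *   MT_FIRST - accept the first entry whose normalized form matches;
 *              on cd-sorted leaves that is the lowest-cd match.
 */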
*/ - if (cd == ZAP_MAXCD) - return (ENOSPC); + /* + * we would run out of space in a block before we could + * have ZAP_MAXCD entries + */ + ASSERT3U(cd, <, ZAP_MAXCD); } if (l->l_phys->l_hdr.lh_nfree < numchunks) @@ -574,9 +624,8 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, le->le_cd = cd; /* link it into the hash chain */ - chunkp = LEAF_HASH_ENTPTR(l, h); - le->le_next = *chunkp; - *chunkp = chunk; + /* XXX if we did the search above, we could just use that */ + chunkp = zap_leaf_rehash_entry(l, chunk); l->l_phys->l_hdr.lh_nentries++; @@ -591,16 +640,76 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, } /* + * Determine if there is another entry with the same normalized form. + * For performance purposes, either zn or name must be provided (the + * other can be NULL). Note, there usually won't be any hash + * conflicts, in which case we don't need the concatenated/normalized + * form of the name. But all callers have one of these on hand anyway, + * so might as well take advantage. A cleaner but slower interface + * would accept neither argument, and compute the normalized name as + * needed (using zap_name_alloc(zap_entry_read_name(zeh))). + */ +boolean_t +zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, + const char *name, zap_t *zap) +{ + uint64_t chunk; + struct zap_leaf_entry *le; + boolean_t allocdzn = B_FALSE; + + if (zap->zap_normflags == 0) + return (B_FALSE); + + for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk); + if (le->le_hash != zeh->zeh_hash) + continue; + if (le->le_cd == zeh->zeh_cd) + continue; + + if (zn == NULL) { + zn = zap_name_alloc(zap, name, MT_FIRST); + allocdzn = B_TRUE; + } + if (zap_leaf_array_match(zeh->zeh_leaf, zn, + le->le_name_chunk, le->le_name_length)) { + if (allocdzn) + zap_name_free(zn); + return (B_TRUE); + } + } + if (allocdzn) + zap_name_free(zn); + return (B_FALSE); +} + +/* * Routines for transferring entries between leafs. */ -static void +static uint16_t * zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); - uint16_t *ptr = LEAF_HASH_ENTPTR(l, le->le_hash); - le->le_next = *ptr; - *ptr = entry; + struct zap_leaf_entry *le2; + uint16_t *chunkp; + + /* + * keep the entry chain sorted by cd + * NB: this will not cause problems for unsorted leafs, though + * it is unnecessary there. + */ + for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash); + *chunkp != CHAIN_END; chunkp = &le2->le_next) { + le2 = ZAP_LEAF_ENTRY(l, *chunkp); + if (le2->le_cd > le->le_cd) + break; + } + + le->le_next = *chunkp; + *chunkp = entry; + return (chunkp); } static uint16_t @@ -644,7 +753,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ - zap_leaf_rehash_entry(nl, chunk); + (void) zap_leaf_rehash_entry(nl, chunk); nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); nle->le_value_chunk = @@ -660,7 +769,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) * Transfer the entries whose hash prefix ends in 1 to the new leaf. 
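The cd-sorted chains also pay off in zap_entry_create() above, where the old O(n^2) probe for a free collision differentiator becomes a single pass. A worked example with assumed state:

/*
 * Worked example (assumed): three entries share the same hash h and
 * sit in the chain in cd order with cds {0, 1, 3}.
 *
 *   cd = 0; entry with cd 0 matches h -> cd = 1
 *           entry with cd 1 matches h -> cd = 2
 *           entry with cd 3: le_cd > cd -> break
 *
 * Result: cd = 2, the lowest unused cd, found in one traversal
 * because zap_leaf_rehash_entry() keeps each chain sorted by cd.
 */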
*/ void -zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl) +zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { int i; int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len; @@ -674,6 +783,9 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl) /* break existing hash chains */ zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); + if (sort) + l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; + /* * Transfer entries whose hash bit 'bit' is set to nl; rehash * the remaining entries @@ -691,7 +803,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl) if (le->le_hash & (1ULL << bit)) zap_leaf_transfer_entry(l, i, nl); else - zap_leaf_rehash_entry(l, i); + (void) zap_leaf_rehash_entry(l, i); } } @@ -726,7 +838,7 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * - le->le_int_size); + le->le_int_size); n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_entries_using_n_chunks[n]++; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c index 9a882a5491e7..75b43a6f88da 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -34,9 +34,104 @@ #include <sys/zap_leaf.h> #include <sys/avl.h> +#ifdef _KERNEL +#include <sys/sunddi.h> +#endif + +static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx); + + +static uint64_t +zap_hash(zap_t *zap, const char *normname) +{ + const uint8_t *cp; + uint8_t c; + uint64_t crc = zap->zap_salt; + + /* NB: name must already be normalized, if necessary */ + + ASSERT(crc != 0); + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) { + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; + } + + /* + * Only use 28 bits, since we need 4 bits in the cookie for the + * collision differentiator. We MUST use the high bits, since + * those are the ones that we first pay attention to when + * chosing the bucket. + */ + crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); -static void mzap_upgrade(zap_t *zap, dmu_tx_t *tx); + return (crc); +} + +static int +zap_normalize(zap_t *zap, const char *name, char *namenorm) +{ + size_t inlen, outlen; + int err; + + inlen = strlen(name) + 1; + outlen = ZAP_MAXNAMELEN; + + err = 0; + (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, + zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST, + &err); + + return (err); +} + +boolean_t +zap_match(zap_name_t *zn, const char *matchname) +{ + if (zn->zn_matchtype == MT_FIRST) { + char norm[ZAP_MAXNAMELEN]; + if (zap_normalize(zn->zn_zap, matchname, norm) != 0) + return (B_FALSE); + + return (strcmp(zn->zn_name_norm, norm) == 0); + } else { + /* MT_BEST or MT_EXACT */ + return (strcmp(zn->zn_name_orij, matchname) == 0); + } +} + +void +zap_name_free(zap_name_t *zn) +{ + kmem_free(zn, sizeof (zap_name_t)); +} + +/* XXX combine this with zap_lockdir()? 
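Note that zap_hash() above now runs over the normalized name; that is what makes normalization-insensitive lookup workable, since all spellings that normalize alike land on the same hash chain and are told apart only by cd. An illustration, assuming zap_normflags requests case folding (the exact u8_textprep_str() behavior depends on the flags chosen at creation):

/*
 * Illustration (case folding assumed):
 *
 *   zap_normalize(zap, "README", buf) -> "readme" (say)
 *   zap_normalize(zap, "ReadMe", buf) -> "readme"
 *
 * Both spellings hash identically and share a chain; they may
 * coexist as distinct entries with different cds, which is exactly
 * the situation zap_entry_normalization_conflict() reports.
 */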
*/ +zap_name_t * +zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt) +{ + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + + zn->zn_zap = zap; + zn->zn_name_orij = name; + zn->zn_matchtype = mt; + if (zap->zap_normflags) { + if (zap_normalize(zap, name, zn->zn_normbuf) != 0) { + zap_name_free(zn); + return (NULL); + } + zn->zn_name_norm = zn->zn_normbuf; + } else { + if (mt != MT_EXACT) { + zap_name_free(zn); + return (NULL); + } + zn->zn_name_norm = zn->zn_name_orij; + } + + zn->zn_hash = zap_hash(zap, zn->zn_name_norm); + return (zn); +} static void mzap_byteswap(mzap_phys_t *buf, size_t size) @@ -44,6 +139,7 @@ mzap_byteswap(mzap_phys_t *buf, size_t size) int i, max; buf->mz_block_type = BSWAP_64(buf->mz_block_type); buf->mz_salt = BSWAP_64(buf->mz_salt); + buf->mz_normflags = BSWAP_64(buf->mz_normflags); max = (size / MZAP_ENT_LEN) - 1; for (i = 0; i < max; i++) { buf->mz_chunk[i].mze_value = @@ -93,7 +189,6 @@ mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ASSERT(mzep->mze_cd < ZAP_MAXCD); - ASSERT3U(zap_hash(zap, mzep->mze_name), ==, hash); mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; @@ -103,30 +198,34 @@ mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) } static mzap_ent_t * -mze_find(zap_t *zap, const char *name, uint64_t hash) +mze_find(zap_name_t *zn) { mzap_ent_t mze_tofind; mzap_ent_t *mze; avl_index_t idx; - avl_tree_t *avl = &zap->zap_m.zap_avl; + avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; - ASSERT(zap->zap_ismicro); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT3U(zap_hash(zap, name), ==, hash); + ASSERT(zn->zn_zap->zap_ismicro); + ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); - if (strlen(name) >= sizeof (mze_tofind.mze_phys.mze_name)) + if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name)) return (NULL); - mze_tofind.mze_hash = hash; + mze_tofind.mze_hash = zn->zn_hash; mze_tofind.mze_phys.mze_cd = 0; +again: mze = avl_find(avl, &mze_tofind, &idx); if (mze == NULL) mze = avl_nearest(avl, idx, AVL_AFTER); - for (; mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { - if (strcmp(name, mze->mze_phys.mze_name) == 0) + for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { + if (zap_match(zn, mze->mze_phys.mze_name)) return (mze); } + if (zn->zn_matchtype == MT_BEST) { + zn->zn_matchtype = MT_FIRST; + goto again; + } return (NULL); } @@ -193,7 +292,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap->zap_object = obj; zap->zap_dbuf = db; - if (((uint64_t *)db->db_data)[0] != ZBT_MICRO) { + if (*(uint64_t *)db->db_data != ZBT_MICRO) { mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, MUTEX_DEFAULT, 0); zap->zap_f.zap_block_shift = highbit(db->db_size) - 1; @@ -219,6 +318,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) if (zap->zap_ismicro) { zap->zap_salt = zap->zap_m.zap_phys->mz_salt; + zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; avl_create(&zap->zap_m.zap_avl, mze_compare, sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); @@ -227,13 +327,18 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; if (mze->mze_name[0]) { + zap_name_t *zn; + zap->zap_m.zap_num_entries++; - mze_insert(zap, i, - zap_hash(zap, mze->mze_name), mze); + zn = zap_name_alloc(zap, mze->mze_name, + MT_EXACT); + mze_insert(zap, i, 
zn->zn_hash, mze); + zap_name_free(zn); } } } else { zap->zap_salt = zap->zap_f.zap_phys->zap_salt; + zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags; ASSERT3U(sizeof (struct zap_leaf_header), ==, 2*ZAP_LEAF_CHUNKSIZE); @@ -260,7 +365,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, int fatreader, zap_t **zapp) + krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { zap_t *zap; dmu_buf_t *db; @@ -312,15 +417,14 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, ASSERT(!zap->zap_ismicro || zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); - if (zap->zap_ismicro && tx && + if (zap->zap_ismicro && tx && adding && zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; if (newsz > MZAP_MAX_BLKSZ) { dprintf("upgrading obj %llu: num_entries=%u\n", obj, zap->zap_m.zap_num_entries); - mzap_upgrade(zap, tx); *zapp = zap; - return (0); + return (mzap_upgrade(zapp, tx)); } err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); ASSERT3U(err, ==, 0); @@ -339,11 +443,12 @@ zap_unlockdir(zap_t *zap) dmu_buf_rele(zap->zap_dbuf, NULL); } -static void -mzap_upgrade(zap_t *zap, dmu_tx_t *tx) +static int +mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) { mzap_phys_t *mzp; int i, sz, nchunks, err; + zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -354,10 +459,14 @@ mzap_upgrade(zap_t *zap, dmu_tx_t *tx) err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx); - ASSERT(err == 0); + if (err) { + kmem_free(mzp, sz); + return (err); + } dprintf("upgrading obj=%llu with %u chunks\n", zap->zap_object, nchunks); + /* XXX destroy the avl later, so we can use the stored hash value */ mze_destroy(zap); fzap_upgrade(zap, tx); @@ -365,44 +474,25 @@ mzap_upgrade(zap_t *zap, dmu_tx_t *tx) for (i = 0; i < nchunks; i++) { int err; mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; + zap_name_t *zn; if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", mze->mze_name, mze->mze_value); - err = fzap_add_cd(zap, - mze->mze_name, 8, 1, &mze->mze_value, - mze->mze_cd, tx); - ASSERT3U(err, ==, 0); + zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); + err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx); + zap = zn->zn_zap; /* fzap_add_cd() may change zap */ + zap_name_free(zn); + if (err) + break; } kmem_free(mzp, sz); + *zapp = zap; + return (err); } -uint64_t -zap_hash(zap_t *zap, const char *name) -{ - const uint8_t *cp; - uint8_t c; - uint64_t crc = zap->zap_salt; - - ASSERT(crc != 0); - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; - - /* - * Only use 28 bits, since we need 4 bits in the cookie for the - * collision differentiator. We MUST use the high bits, since - * those are the onces that we first pay attention to when - * chosing the bucket. 
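The zap_lockdir() signature above gains an explicit adding flag alongside fatreader, and only callers that may insert pass adding = TRUE; that is now what authorizes growing a full microzap, or upgrading it to a fat ZAP, up front. The two calling patterns, taken from the wrappers later in this file:

/*
 * Calling patterns (from zap_add() and zap_lookup() below):
 *
 *   err = zap_lockdir(os, obj, tx, RW_WRITER, TRUE, TRUE, &zap);     insert path
 *   err = zap_lockdir(os, obj, NULL, RW_READER, TRUE, FALSE, &zap);  lookup path
 */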
- */ - crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); - - return (crc); -} - - static void -mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx) +mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx) { dmu_buf_t *db; mzap_phys_t *zp; @@ -421,7 +511,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx) zp = db->db_data; zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; - ASSERT(zp->mz_salt != 0); + zp->mz_normflags = normflags; dmu_buf_rele(db, FTAG); } @@ -429,12 +519,21 @@ int zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { + return (zap_create_claim_norm(os, obj, + 0, ot, bonustype, bonuslen, tx)); +} + +int +zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, + dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ int err; err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); if (err != 0) return (err); - mzap_create_impl(os, obj, tx); + mzap_create_impl(os, obj, normflags, tx); return (0); } @@ -442,9 +541,16 @@ uint64_t zap_create(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { + return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); +} + +uint64_t +zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); - mzap_create_impl(os, obj, tx); + mzap_create_impl(os, obj, normflags, tx); return (obj); } @@ -482,7 +588,7 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) zap_t *zap; int err; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); if (!zap->zap_ismicro) { @@ -495,36 +601,102 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) } /* - * Routines for maniplulating attributes. + * zn may be NULL; if not specified, it will be computed if needed. + * See also the comment above zap_entry_normalization_conflict(). + */ +static boolean_t +mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) +{ + mzap_ent_t *other; + int direction = AVL_BEFORE; + boolean_t allocdzn = B_FALSE; + + if (zap->zap_normflags == 0) + return (B_FALSE); + +again: + for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction); + other && other->mze_hash == mze->mze_hash; + other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { + + if (zn == NULL) { + zn = zap_name_alloc(zap, mze->mze_phys.mze_name, + MT_FIRST); + allocdzn = B_TRUE; + } + if (zap_match(zn, other->mze_phys.mze_name)) { + if (allocdzn) + zap_name_free(zn); + return (B_TRUE); + } + } + + if (direction == AVL_BEFORE) { + direction = AVL_AFTER; + goto again; + } + + if (allocdzn) + zap_name_free(zn); + return (B_FALSE); +} + +/* + * Routines for manipulating attributes. 
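The *_norm creation entry points above are how a normalization policy is attached to a ZAP at birth: normflags is written into the header (mz_normflags, and zap_normflags after an upgrade) and read back by mzap_open(). A usage sketch; the flag value is an assumption, any combination accepted by u8_textprep_str() would do:

/*
 * Sketch: create a ZAP whose names will be matched with case
 * folding.  U8_TEXTPREP_TOUPPER is assumed here for illustration.
 */
uint64_t obj;

obj = zap_create_norm(os, U8_TEXTPREP_TOUPPER,
    DMU_OT_DIRECTORY_CONTENTS, DMU_OT_NONE, 0, tx);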
*/ int zap_lookup(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { + return (zap_lookup_norm(os, zapobj, name, integer_size, + num_integers, buf, MT_EXACT, NULL, 0, NULL)); +} + +int +zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ zap_t *zap; int err; mzap_ent_t *mze; + zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); + zn = zap_name_alloc(zap, name, mt); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + if (!zap->zap_ismicro) { - err = fzap_lookup(zap, name, - integer_size, num_integers, buf); + err = fzap_lookup(zn, integer_size, num_integers, buf, + realname, rn_len, ncp); } else { - mze = mze_find(zap, name, zap_hash(zap, name)); + mze = mze_find(zn); if (mze == NULL) { err = ENOENT; } else { - if (num_integers < 1) + if (num_integers < 1) { err = EOVERFLOW; - else if (integer_size != 8) + } else if (integer_size != 8) { err = EINVAL; - else + } else { *(uint64_t *)buf = mze->mze_phys.mze_value; + (void) strlcpy(realname, + mze->mze_phys.mze_name, rn_len); + if (ncp) { + *ncp = mzap_normalization_conflict(zap, + zn, mze); + } + } } } + zap_name_free(zn); zap_unlockdir(zap); return (err); } @@ -536,14 +708,20 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, zap_t *zap; int err; mzap_ent_t *mze; + zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); + zn = zap_name_alloc(zap, name, MT_EXACT); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } if (!zap->zap_ismicro) { - err = fzap_length(zap, name, integer_size, num_integers); + err = fzap_length(zn, integer_size, num_integers); } else { - mze = mze_find(zap, name, zap_hash(zap, name)); + mze = mze_find(zn); if (mze == NULL) { err = ENOENT; } else { @@ -553,28 +731,31 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, *num_integers = 1; } } + zap_name_free(zn); zap_unlockdir(zap); return (err); } static void -mzap_addent(zap_t *zap, const char *name, uint64_t hash, uint64_t value) +mzap_addent(zap_name_t *zn, uint64_t value) { int i; + zap_t *zap = zn->zn_zap; int start = zap->zap_m.zap_alloc_next; uint32_t cd; - dprintf("obj=%llu %s=%llu\n", zap->zap_object, name, value); + dprintf("obj=%llu %s=%llu\n", zap->zap_object, + zn->zn_name_orij, value); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; - ASSERT(strcmp(name, mze->mze_name) != 0); + ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0); } #endif - cd = mze_find_unused_cd(zap, hash); + cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ ASSERT(cd != ZAP_MAXCD); @@ -584,13 +765,13 @@ again: if (mze->mze_name[0] == 0) { mze->mze_value = value; mze->mze_cd = cd; - (void) strcpy(mze->mze_name, name); + (void) strcpy(mze->mze_name, zn->zn_name_orij); zap->zap_m.zap_num_entries++; zap->zap_m.zap_alloc_next = i+1; if (zap->zap_m.zap_alloc_next == zap->zap_m.zap_num_chunks) zap->zap_m.zap_alloc_next = 0; - mze_insert(zap, i, hash, mze); + mze_insert(zap, i, zn->zn_hash, mze); return; } } @@ -610,29 +791,39 @@ zap_add(objset_t *os, 
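zap_lookup_norm() above is the consumer-facing entry point for all of this machinery; plain zap_lookup() is now just the MT_EXACT special case. A caller doing a normalization-insensitive lookup would use it roughly as follows (buffer size assumed):

/*
 * Sketch: normalization-insensitive lookup.  realname receives the
 * exact stored spelling; conflict reports whether another entry
 * shares the same normalized form.
 */
uint64_t value;
char realname[ZAP_MAXNAMELEN];
boolean_t conflict;
int err;

err = zap_lookup_norm(os, zapobj, "readme.txt", 8, 1, &value,
    MT_FIRST, realname, sizeof (realname), &conflict);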
uint64_t zapobj, const char *name, int err; mzap_ent_t *mze; const uint64_t *intval = val; - uint64_t hash; + zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); + zn = zap_name_alloc(zap, name, MT_EXACT); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } if (!zap->zap_ismicro) { - err = fzap_add(zap, name, integer_size, num_integers, val, tx); + err = fzap_add(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", zapobj, integer_size, num_integers, name); - mzap_upgrade(zap, tx); - err = fzap_add(zap, name, integer_size, num_integers, val, tx); + err = mzap_upgrade(&zn->zn_zap, tx); + if (err == 0) + err = fzap_add(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ } else { - hash = zap_hash(zap, name); - mze = mze_find(zap, name, hash); + mze = mze_find(zn); if (mze != NULL) { err = EEXIST; } else { - mzap_addent(zap, name, hash, *intval); + mzap_addent(zn, *intval); } } - zap_unlockdir(zap); + ASSERT(zap == zn->zn_zap); + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap); return (err); } @@ -643,68 +834,87 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, zap_t *zap; mzap_ent_t *mze; const uint64_t *intval = val; - uint64_t hash; + zap_name_t *zn; int err; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + zn = zap_name_alloc(zap, name, MT_EXACT); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } if (!zap->zap_ismicro) { - err = fzap_update(zap, name, - integer_size, num_integers, val, tx); + err = fzap_update(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", zapobj, integer_size, num_integers, name); - mzap_upgrade(zap, tx); - err = fzap_update(zap, name, - integer_size, num_integers, val, tx); + err = mzap_upgrade(&zn->zn_zap, tx); + if (err == 0) + err = fzap_update(zn, integer_size, num_integers, + val, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ } else { - hash = zap_hash(zap, name); - mze = mze_find(zap, name, hash); + mze = mze_find(zn); if (mze != NULL) { mze->mze_phys.mze_value = *intval; zap->zap_m.zap_phys->mz_chunk [mze->mze_chunkid].mze_value = *intval; } else { - mzap_addent(zap, name, hash, *intval); + mzap_addent(zn, *intval); } } - zap_unlockdir(zap); + ASSERT(zap == zn->zn_zap); + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ + zap_unlockdir(zap); return (err); } int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) { + return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx)); +} + +int +zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, + matchtype_t mt, dmu_tx_t *tx) +{ zap_t *zap; int err; mzap_ent_t *mze; + zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); if (err) return (err); + zn = zap_name_alloc(zap, name, mt); + if (zn == 
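zap_add() above also shows the contract that the zn->zn_zap reassignments enforce: any fzap_* call that can expand a leaf or upgrade the microzap may drop and retake the directory lock, so the caller must refresh its zap pointer afterwards and tolerate NULL. In outline:

/*
 * Contract illustrated by zap_add()/zap_update() above:
 *
 *   err = fzap_add(zn, ...);
 *   zap = zn->zn_zap;        refresh: relocking may have changed it
 *   ...
 *   if (zap != NULL)         NULL if reacquiring the lock failed
 *           zap_unlockdir(zap);
 */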
NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } if (!zap->zap_ismicro) { - err = fzap_remove(zap, name, tx); + err = fzap_remove(zn, tx); } else { - mze = mze_find(zap, name, zap_hash(zap, name)); + mze = mze_find(zn); if (mze == NULL) { - dprintf("fail: %s\n", name); err = ENOENT; } else { - dprintf("success: %s\n", name); zap->zap_m.zap_num_entries--; bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], sizeof (mzap_ent_phys_t)); mze_remove(zap, mze); } } + zap_name_free(zn); zap_unlockdir(zap); return (err); } - /* * Routines for iterating over the attributes. */ @@ -781,7 +991,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) if (zc->zc_zap == NULL) { err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, - RW_READER, TRUE, &zc->zc_zap); + RW_READER, TRUE, FALSE, &zc->zc_zap); if (err) return (err); } else { @@ -796,14 +1006,17 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) mze_tofind.mze_phys.mze_cd = zc->zc_cd; mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); - ASSERT(mze == NULL || 0 == bcmp(&mze->mze_phys, - &zc->zc_zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], - sizeof (mze->mze_phys))); if (mze == NULL) { mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, idx, AVL_AFTER); } if (mze) { + ASSERT(0 == bcmp(&mze->mze_phys, + &zc->zc_zap->zap_m.zap_phys->mz_chunk + [mze->mze_chunkid], sizeof (mze->mze_phys))); + + za->za_normalization_conflict = + mzap_normalization_conflict(zc->zc_zap, NULL, mze); za->za_integer_length = 8; za->za_num_integers = 1; za->za_first_integer = mze->mze_phys.mze_value; @@ -839,7 +1052,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) int err; zap_t *zap; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c index 33c2909316e3..ec7d29e64f70 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -43,15 +41,19 @@ #include <sys/fs/zfs.h> #include <sys/policy.h> #include <sys/zfs_znode.h> +#include <sys/zfs_fuid.h> #include <sys/zfs_acl.h> #include <sys/zfs_dir.h> #include <sys/zfs_vfsops.h> #include <sys/dmu.h> +#include <sys/dnode.h> #include <sys/zap.h> #include <acl/acl_common.h> #define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE #define DENY ACE_ACCESS_DENIED_ACE_TYPE +#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE +#define MIN_ACE_TYPE ALLOW #define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) #define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ @@ -60,8 +62,15 @@ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) #define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) -#define WRITE_MASK (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS| \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|ACE_WRITE_OWNER) +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) + +#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ + ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ + ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) + +#define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\ + ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) #define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) @@ -70,59 +79,656 @@ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) #define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ - ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE) + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) + +#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ + ZFS_ACL_PROTECTED) + +#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ + ZFS_ACL_OBJ_ACE) + +static uint16_t +zfs_ace_v0_get_type(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_v0_get_flags(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_v0_get_mask(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_v0_get_who(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_fuid); +} + +static void +zfs_ace_v0_set_type(void *acep, uint16_t type) +{ + ((zfs_oldace_t *)acep)->z_type = type; +} -#define SECURE_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) +static void +zfs_ace_v0_set_flags(void *acep, uint16_t flags) +{ + ((zfs_oldace_t *)acep)->z_flags = flags; +} -#define OGE_PAD 6 /* traditional owner/group/everyone ACES */ +static void +zfs_ace_v0_set_mask(void *acep, uint32_t mask) +{ + ((zfs_oldace_t *)acep)->z_access_mask = mask; +} -static int zfs_ace_can_use(znode_t *zp, ace_t *); +static void +zfs_ace_v0_set_who(void *acep, uint64_t who) +{ + ((zfs_oldace_t *)acep)->z_fuid = who; +} + +/*ARGSUSED*/ +static size_t +zfs_ace_v0_size(void *acep) +{ + return (sizeof (zfs_oldace_t)); +} + +static size_t +zfs_ace_v0_abstract_size(void) +{ + return (sizeof (zfs_oldace_t)); +} + +static int +zfs_ace_v0_mask_off(void) +{ + return (offsetof(zfs_oldace_t, z_access_mask)); +} + +/*ARGSUSED*/ +static int +zfs_ace_v0_data(void *acep, void **datap) +{ + *datap = NULL; + return (0); 
+} + +static acl_ops_t zfs_acl_v0_ops = { + zfs_ace_v0_get_mask, + zfs_ace_v0_set_mask, + zfs_ace_v0_get_flags, + zfs_ace_v0_set_flags, + zfs_ace_v0_get_type, + zfs_ace_v0_set_type, + zfs_ace_v0_get_who, + zfs_ace_v0_set_who, + zfs_ace_v0_size, + zfs_ace_v0_abstract_size, + zfs_ace_v0_mask_off, + zfs_ace_v0_data +}; + +static uint16_t +zfs_ace_fuid_get_type(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_fuid_get_flags(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_fuid_get_mask(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_fuid_get_who(void *args) +{ + uint16_t entry_type; + zfs_ace_t *acep = args; + + entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (-1); + return (((zfs_ace_t *)acep)->z_fuid); +} + +static void +zfs_ace_fuid_set_type(void *acep, uint16_t type) +{ + ((zfs_ace_hdr_t *)acep)->z_type = type; +} + +static void +zfs_ace_fuid_set_flags(void *acep, uint16_t flags) +{ + ((zfs_ace_hdr_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_fuid_set_mask(void *acep, uint32_t mask) +{ + ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_fuid_set_who(void *arg, uint64_t who) +{ + zfs_ace_t *acep = arg; + + uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return; + acep->z_fuid = who; +} + +static size_t +zfs_ace_fuid_size(void *acep) +{ + zfs_ace_hdr_t *zacep = acep; + uint16_t entry_type; + + switch (zacep->z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + return (sizeof (zfs_object_ace_t)); + case ALLOW: + case DENY: + entry_type = + (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); + if (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (sizeof (zfs_ace_hdr_t)); + /*FALLTHROUGH*/ + default: + return (sizeof (zfs_ace_t)); + } +} + +static size_t +zfs_ace_fuid_abstract_size(void) +{ + return (sizeof (zfs_ace_hdr_t)); +} + +static int +zfs_ace_fuid_mask_off(void) +{ + return (offsetof(zfs_ace_hdr_t, z_access_mask)); +} + +static int +zfs_ace_fuid_data(void *acep, void **datap) +{ + zfs_ace_t *zacep = acep; + zfs_object_ace_t *zobjp; + + switch (zacep->z_hdr.z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjp = acep; + *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); + return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); + default: + *datap = NULL; + return (0); + } +} + +static acl_ops_t zfs_acl_fuid_ops = { + zfs_ace_fuid_get_mask, + zfs_ace_fuid_set_mask, + zfs_ace_fuid_get_flags, + zfs_ace_fuid_set_flags, + zfs_ace_fuid_get_type, + zfs_ace_fuid_set_type, + zfs_ace_fuid_get_who, + zfs_ace_fuid_set_who, + zfs_ace_fuid_size, + zfs_ace_fuid_abstract_size, + zfs_ace_fuid_mask_off, + zfs_ace_fuid_data +}; + +static int +zfs_acl_version(int version) +{ + if (version < ZPL_VERSION_FUID) + return (ZFS_ACL_VERSION_INITIAL); + else + return (ZFS_ACL_VERSION_FUID); +} + +static int +zfs_acl_version_zp(znode_t *zp) +{ + return (zfs_acl_version(zp->z_zfsvfs->z_version)); +} static zfs_acl_t * -zfs_acl_alloc(int slots) 
+zfs_acl_alloc(int vers) { zfs_acl_t *aclp; aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); - if (slots != 0) { - aclp->z_acl = kmem_alloc(ZFS_ACL_SIZE(slots), KM_SLEEP); - aclp->z_acl_count = 0; - aclp->z_state = ACL_DATA_ALLOCED; - } else { - aclp->z_state = 0; - } - aclp->z_slots = slots; + list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), + offsetof(zfs_acl_node_t, z_next)); + aclp->z_version = vers; + if (vers == ZFS_ACL_VERSION_FUID) + aclp->z_ops = zfs_acl_fuid_ops; + else + aclp->z_ops = zfs_acl_v0_ops; return (aclp); } +static zfs_acl_node_t * +zfs_acl_node_alloc(size_t bytes) +{ + zfs_acl_node_t *aclnode; + + aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); + if (bytes) { + aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); + aclnode->z_allocdata = aclnode->z_acldata; + aclnode->z_allocsize = bytes; + aclnode->z_size = bytes; + } + + return (aclnode); +} + +static void +zfs_acl_node_free(zfs_acl_node_t *aclnode) +{ + if (aclnode->z_allocsize) + kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); + kmem_free(aclnode, sizeof (zfs_acl_node_t)); +} + +static void +zfs_acl_release_nodes(zfs_acl_t *aclp) +{ + zfs_acl_node_t *aclnode; + + while (aclnode = list_head(&aclp->z_acl)) { + list_remove(&aclp->z_acl, aclnode); + zfs_acl_node_free(aclnode); + } + aclp->z_acl_count = 0; + aclp->z_acl_bytes = 0; +} + void zfs_acl_free(zfs_acl_t *aclp) { - if (aclp->z_state == ACL_DATA_ALLOCED) { - kmem_free(aclp->z_acl, ZFS_ACL_SIZE(aclp->z_slots)); - } + zfs_acl_release_nodes(aclp); + list_destroy(&aclp->z_acl); kmem_free(aclp, sizeof (zfs_acl_t)); } -static uint32_t -zfs_v4_to_unix(uint32_t access_mask) +static boolean_t +zfs_acl_valid_ace_type(uint_t type, uint_t flags) { - uint32_t new_mask = 0; + uint16_t entry_type; + + switch (type) { + case ALLOW: + case DENY: + case ACE_SYSTEM_AUDIT_ACE_TYPE: + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry_type = flags & ACE_TYPE_FLAGS; + return (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE || entry_type == 0 || + entry_type == ACE_IDENTIFIER_GROUP); + default: + if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) + return (B_TRUE); + } + return (B_FALSE); +} +static boolean_t +zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) +{ /* - * This is used for mapping v4 permissions into permissions - * that can be passed to secpolicy_vnode_access() + * first check type of entry */ - if (access_mask & (ACE_READ_DATA | ACE_LIST_DIRECTORY | - ACE_READ_ATTRIBUTES | ACE_READ_ACL)) - new_mask |= S_IROTH; - if (access_mask & (ACE_WRITE_DATA | ACE_APPEND_DATA | - ACE_WRITE_ATTRIBUTES | ACE_ADD_FILE | ACE_WRITE_NAMED_ATTRS)) - new_mask |= S_IWOTH; - if (access_mask & (ACE_EXECUTE | ACE_READ_NAMED_ATTRS)) - new_mask |= S_IXOTH; - return (new_mask); + if (!zfs_acl_valid_ace_type(type, iflags)) + return (B_FALSE); + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (aclp->z_version < ZFS_ACL_VERSION_FUID) + return (B_FALSE); + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + } + + /* + * next check inheritance level flags + */ + + if (obj_type == VDIR && + (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void * 
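zfs_acl_alloc() above now selects an ops vector by ACL version, which is what lets the rest of this file handle both on-disk layouts through one interface. A sketch of layout-independent access through that vector (acep is assumed to point at a valid ACE of the matching version):

/*
 * Sketch: version-independent ACE access via the ops vector chosen
 * in zfs_acl_alloc().  The same calls work for the
 * ZFS_ACL_VERSION_INITIAL (zfs_oldace_t) and ZFS_ACL_VERSION_FUID
 * layouts.
 */
zfs_acl_t *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
uint32_t mask = aclp->z_ops.ace_mask_get(acep);
uint16_t type = aclp->z_ops.ace_type_get(acep);
size_t size = aclp->z_ops.ace_size(acep);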
+zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, + uint32_t *access_mask, uint16_t *iflags, uint16_t *type) +{ + zfs_acl_node_t *aclnode; + + if (start == NULL) { + aclnode = list_head(&aclp->z_acl); + if (aclnode == NULL) + return (NULL); + + aclp->z_next_ace = aclnode->z_acldata; + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + } + + aclnode = aclp->z_curr_node; + + if (aclnode == NULL) + return (NULL); + + if (aclnode->z_ace_idx >= aclnode->z_ace_count) { + aclnode = list_next(&aclp->z_acl, aclnode); + if (aclnode == NULL) + return (NULL); + else { + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + aclp->z_next_ace = aclnode->z_acldata; + } + } + + if (aclnode->z_ace_idx < aclnode->z_ace_count) { + void *acep = aclp->z_next_ace; + size_t ace_size; + + /* + * Make sure we don't overstep our bounds + */ + ace_size = aclp->z_ops.ace_size(acep); + + if (((caddr_t)acep + ace_size) > + ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { + return (NULL); + } + + *iflags = aclp->z_ops.ace_flags_get(acep); + *type = aclp->z_ops.ace_type_get(acep); + *access_mask = aclp->z_ops.ace_mask_get(acep); + *who = aclp->z_ops.ace_who_get(acep); + aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; + aclnode->z_ace_idx++; + return ((void *)acep); + } + return (NULL); +} + +/*ARGSUSED*/ +static uint64_t +zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, + uint16_t *flags, uint16_t *type, uint32_t *mask) +{ + zfs_acl_t *aclp = datap; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + uint64_t who; + + acep = zfs_acl_next_ace(aclp, acep, &who, mask, + flags, type); + return ((uint64_t)(uintptr_t)acep); +} + +static zfs_acl_node_t * +zfs_acl_curr_node(zfs_acl_t *aclp) +{ + ASSERT(aclp->z_curr_node); + return (aclp->z_curr_node); +} + +/* + * Copy ACE to internal ZFS format. + * While processing the ACL each ACE will be validated for correctness. + * ACE FUIDs will be created later. 
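zfs_acl_next_ace() above is the cursor the rest of this rewrite is built on: pass NULL to start, and it walks every node in the ACL list, bounds-checking as it goes. A self-contained sketch of a walker, counting ALLOW entries purely as an example:

/* Sketch: cursor-style iteration over a zfs_acl_t. */
static int
count_allow_aces(zfs_acl_t *aclp)
{
	void *acep = NULL;
	uint64_t who;
	uint32_t access_mask;
	uint16_t iflags, type;
	int n = 0;

	while ((acep = zfs_acl_next_ace(aclp, acep, &who,
	    &access_mask, &iflags, &type)) != NULL) {
		if (type == ALLOW)
			n++;
	}
	return (n);
}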
+ */ +int +zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap, + zfs_ace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + uint16_t entry_type; + zfs_ace_t *aceptr = z_acl; + ace_t *acep = datap; + zfs_object_ace_t *zobjacep; + ace_object_t *aceobjp; + + for (i = 0; i != aclcnt; i++) { + aceptr->z_hdr.z_access_mask = acep->a_access_mask; + aceptr->z_hdr.z_flags = acep->a_flags; + aceptr->z_hdr.z_type = acep->a_type; + entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; + if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE) { + if (!aclp->z_has_fuids) + aclp->z_has_fuids = IS_EPHEMERAL(acep->a_who); + aceptr->z_fuid = (uint64_t)acep->a_who; + } + + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, + aceptr->z_hdr.z_flags) != B_TRUE) + return (EINVAL); + + switch (acep->a_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjacep = (zfs_object_ace_t *)aceptr; + aceobjp = (ace_object_t *)acep; + + bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, + sizeof (aceobjp->a_obj_type)); + bcopy(aceobjp->a_inherit_obj_type, + zobjacep->z_inherit_type, + sizeof (aceobjp->a_inherit_obj_type)); + acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); + break; + default: + acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); + } + + aceptr = (zfs_ace_t *)((caddr_t)aceptr + + aclp->z_ops.ace_size(aceptr)); + } + + *size = (caddr_t)aceptr - (caddr_t)z_acl; + + return (0); +} + +/* + * Copy ZFS ACEs to fixed size ace_t layout + */ +static void +zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, + void *datap, int filter) +{ + uint64_t who; + uint32_t access_mask; + uint16_t iflags, type; + zfs_ace_hdr_t *zacep = NULL; + ace_t *acep = datap; + ace_object_t *objacep; + zfs_object_ace_t *zobjacep; + size_t ace_size; + uint16_t entry_type; + + while (zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type)) { + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (filter) { + continue; + } + zobjacep = (zfs_object_ace_t *)zacep; + objacep = (ace_object_t *)acep; + bcopy(zobjacep->z_object_type, + objacep->a_obj_type, + sizeof (zobjacep->z_object_type)); + bcopy(zobjacep->z_inherit_type, + objacep->a_inherit_obj_type, + sizeof (zobjacep->z_inherit_type)); + ace_size = sizeof (ace_object_t); + break; + default: + ace_size = sizeof (ace_t); + break; + } + + entry_type = (iflags & ACE_TYPE_FLAGS); + if ((entry_type != ACE_OWNER && + entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE)) { + acep->a_who = zfs_fuid_map_id(zfsvfs, who, + cr, (entry_type & ACE_IDENTIFIER_GROUP) ? 
+ ZFS_ACE_GROUP : ZFS_ACE_USER); + } else { + acep->a_who = (uid_t)(int64_t)who; + } + acep->a_access_mask = access_mask; + acep->a_flags = iflags; + acep->a_type = type; + acep = (ace_t *)((caddr_t)acep + ace_size); + } +} + +static int +zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, + zfs_oldace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + zfs_oldace_t *aceptr = z_acl; + + for (i = 0; i != aclcnt; i++, aceptr++) { + aceptr->z_access_mask = acep[i].a_access_mask; + aceptr->z_type = acep[i].a_type; + aceptr->z_flags = acep[i].a_flags; + aceptr->z_fuid = acep[i].a_who; + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_type, + aceptr->z_flags) != B_TRUE) + return (EINVAL); + } + *size = (caddr_t)aceptr - (caddr_t)z_acl; + return (0); +} + +/* + * convert old ACL format to new + */ +void +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp) +{ + zfs_oldace_t *oldaclp; + int i; + uint16_t type, iflags; + uint32_t access_mask; + uint64_t who; + void *cookie = NULL; + zfs_acl_node_t *newaclnode; + + ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); + /* + * First create the ACE in a contiguous piece of memory + * for zfs_copy_ace_2_fuid(). + * + * We only convert an ACL once, so this won't happen + * everytime. + */ + oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, + KM_SLEEP); + i = 0; + while (cookie = zfs_acl_next_ace(aclp, cookie, &who, + &access_mask, &iflags, &type)) { + oldaclp[i].z_flags = iflags; + oldaclp[i].z_type = type; + oldaclp[i].z_fuid = who; + oldaclp[i++].z_access_mask = access_mask; + } + + newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * + sizeof (zfs_object_ace_t)); + aclp->z_ops = zfs_acl_fuid_ops; + VERIFY(zfs_copy_ace_2_fuid(ZTOV(zp)->v_type, aclp, oldaclp, + newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size) == 0); + newaclnode->z_ace_count = aclp->z_acl_count; + aclp->z_version = ZFS_ACL_VERSION; + kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); + + /* + * Release all previous ACL nodes + */ + + zfs_acl_release_nodes(aclp); + + list_insert_head(&aclp->z_acl, newaclnode); + + aclp->z_acl_bytes = newaclnode->z_size; + aclp->z_acl_count = newaclnode->z_ace_count; + } /* @@ -133,157 +739,213 @@ zfs_unix_to_v4(uint32_t access_mask) { uint32_t new_mask = 0; - if (access_mask & 01) - new_mask |= (ACE_EXECUTE); - if (access_mask & 02) { - new_mask |= (ACE_WRITE_DATA); - } if (access_mask & 04) { + if (access_mask & S_IXOTH) + new_mask |= ACE_EXECUTE; + if (access_mask & S_IWOTH) + new_mask |= ACE_WRITE_DATA; + if (access_mask & S_IROTH) new_mask |= ACE_READ_DATA; - } return (new_mask); } static void -zfs_set_ace(ace_t *zacep, uint32_t access_mask, int access_type, - uid_t uid, int entry_type) +zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, + uint16_t access_type, uint64_t fuid, uint16_t entry_type) { - zacep->a_access_mask = access_mask; - zacep->a_type = access_type; - zacep->a_who = uid; - zacep->a_flags = entry_type; + uint16_t type = entry_type & ACE_TYPE_FLAGS; + + aclp->z_ops.ace_mask_set(acep, access_mask); + aclp->z_ops.ace_type_set(acep, access_type); + aclp->z_ops.ace_flags_set(acep, entry_type); + if ((type != ACE_OWNER && type != OWNING_GROUP && + type != ACE_EVERYONE)) + aclp->z_ops.ace_who_set(acep, fuid); } +/* + * Determine mode of file based on ACL. 
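The zfs_unix_to_v4() cleanup above only swaps the bare octal constants for the S_I*OTH macros; behavior is unchanged. For reference, read directly off the function body:

/*
 * Worked example, derived from zfs_unix_to_v4() above:
 *
 *   zfs_unix_to_v4(S_IROTH)                 == ACE_READ_DATA
 *   zfs_unix_to_v4(S_IWOTH)                 == ACE_WRITE_DATA
 *   zfs_unix_to_v4(S_IXOTH)                 == ACE_EXECUTE
 *   zfs_unix_to_v4(S_IROTH|S_IWOTH|S_IXOTH) == ACE_READ_DATA |
 *                                              ACE_WRITE_DATA |
 *                                              ACE_EXECUTE
 */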
+ * Also, create FUIDs for any User/Group ACEs + */ static uint64_t -zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) -{ - int i; - int entry_type; - mode_t mode = (zp->z_phys->zp_mode & - (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); - mode_t seen = 0; - ace_t *acep; - - for (i = 0, acep = aclp->z_acl; - i != aclp->z_acl_count; i++, acep++) { - entry_type = (acep->a_flags & ACE_TYPE_FLAGS); +zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, + zfs_fuid_info_t **fuidp, dmu_tx_t *tx) +{ + int entry_type; + mode_t mode; + mode_t seen = 0; + zfs_ace_hdr_t *acep = NULL; + uint64_t who; + uint16_t iflags, type; + uint32_t access_mask; + + mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + + while (acep = zfs_acl_next_ace(aclp, acep, &who, + &access_mask, &iflags, &type)) { + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + /* + * Skip over owner@, group@ or everyone@ inherit only ACEs + */ + if ((iflags & ACE_INHERIT_ONLY_ACE) && + (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || + entry_type == OWNING_GROUP)) + continue; + if (entry_type == ACE_OWNER) { - if ((acep->a_access_mask & ACE_READ_DATA) && + if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRUSR))) { seen |= S_IRUSR; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IRUSR; } } - if ((acep->a_access_mask & ACE_WRITE_DATA) && + if ((access_mask & ACE_WRITE_DATA) && (!(seen & S_IWUSR))) { seen |= S_IWUSR; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IWUSR; } } - if ((acep->a_access_mask & ACE_EXECUTE) && + if ((access_mask & ACE_EXECUTE) && (!(seen & S_IXUSR))) { seen |= S_IXUSR; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IXUSR; } } } else if (entry_type == OWNING_GROUP) { - if ((acep->a_access_mask & ACE_READ_DATA) && + if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRGRP))) { seen |= S_IRGRP; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IRGRP; } } - if ((acep->a_access_mask & ACE_WRITE_DATA) && + if ((access_mask & ACE_WRITE_DATA) && (!(seen & S_IWGRP))) { seen |= S_IWGRP; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IWGRP; } } - if ((acep->a_access_mask & ACE_EXECUTE) && + if ((access_mask & ACE_EXECUTE) && (!(seen & S_IXGRP))) { seen |= S_IXGRP; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IXGRP; } } } else if (entry_type == ACE_EVERYONE) { - if ((acep->a_access_mask & ACE_READ_DATA)) { + if ((access_mask & ACE_READ_DATA)) { if (!(seen & S_IRUSR)) { seen |= S_IRUSR; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IRUSR; } } if (!(seen & S_IRGRP)) { seen |= S_IRGRP; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IRGRP; } } if (!(seen & S_IROTH)) { seen |= S_IROTH; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IROTH; } } } - if ((acep->a_access_mask & ACE_WRITE_DATA)) { + if ((access_mask & ACE_WRITE_DATA)) { if (!(seen & S_IWUSR)) { seen |= S_IWUSR; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IWUSR; } } if (!(seen & S_IWGRP)) { seen |= S_IWGRP; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IWGRP; } } if (!(seen & S_IWOTH)) { seen |= S_IWOTH; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IWOTH; } } } - if ((acep->a_access_mask & ACE_EXECUTE)) { + if ((access_mask & ACE_EXECUTE)) { if (!(seen & S_IXUSR)) { seen |= S_IXUSR; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IXUSR; } } if (!(seen & S_IXGRP)) { seen |= 
S_IXGRP; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IXGRP; } } if (!(seen & S_IXOTH)) { seen |= S_IXOTH; - if (acep->a_type == ALLOW) { + if (type == ALLOW) { mode |= S_IXOTH; } } } } + /* + * Now handle FUID create for user/group ACEs + */ + if (entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP) { + aclp->z_ops.ace_who_set(acep, + zfs_fuid_create(zp->z_zfsvfs, who, cr, + (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP, + tx, fuidp)); + } } return (mode); } static zfs_acl_t * -zfs_acl_node_read_internal(znode_t *zp) +zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify) { zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; - aclp = zfs_acl_alloc(0); - aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; - aclp->z_acl = &zp->z_phys->zp_acl.z_ace_data[0]; + aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); + + /* + * Version 0 to 1 znode_acl_phys has the size/count fields swapped. + * Version 0 didn't have a size field, only a count. + */ + if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { + aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_size; + aclp->z_acl_bytes = ZFS_ACL_SIZE(aclp->z_acl_count); + } else { + aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; + aclp->z_acl_bytes = zp->z_phys->zp_acl.z_acl_size; + } + + aclnode = zfs_acl_node_alloc(will_modify ? aclp->z_acl_bytes : 0); + aclnode->z_ace_count = aclp->z_acl_count; + if (will_modify) { + bcopy(zp->z_phys->zp_acl.z_ace_data, aclnode->z_acldata, + aclp->z_acl_bytes); + } else { + aclnode->z_size = aclp->z_acl_bytes; + aclnode->z_acldata = &zp->z_phys->zp_acl.z_ace_data[0]; + } + + list_insert_head(&aclp->z_acl, aclnode); return (aclp); } @@ -292,212 +954,176 @@ zfs_acl_node_read_internal(znode_t *zp) * Read an external acl object. */ static int -zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp) +zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) { uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj; zfs_acl_t *aclp; + size_t aclsize; + size_t acl_count; + zfs_acl_node_t *aclnode; int error; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { - *aclpp = zfs_acl_node_read_internal(zp); + *aclpp = zfs_acl_node_read_internal(zp, will_modify); return (0); } - aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count); + aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); + if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { + zfs_acl_phys_v0_t *zacl0 = + (zfs_acl_phys_v0_t *)&zp->z_phys->zp_acl; + aclsize = ZFS_ACL_SIZE(zacl0->z_acl_count); + acl_count = zacl0->z_acl_count; + } else { + aclsize = zp->z_phys->zp_acl.z_acl_size; + acl_count = zp->z_phys->zp_acl.z_acl_count; + if (aclsize == 0) + aclsize = acl_count * sizeof (zfs_ace_t); + } + aclnode = zfs_acl_node_alloc(aclsize); + list_insert_head(&aclp->z_acl, aclnode); error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, - ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl); + aclsize, aclnode->z_acldata); + aclnode->z_ace_count = acl_count; + aclp->z_acl_count = acl_count; + aclp->z_acl_bytes = aclsize; + if (error != 0) { zfs_acl_free(aclp); + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = EIO; return (error); } - aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; - *aclpp = aclp; return (0); } -static boolean_t -zfs_acl_valid(znode_t *zp, ace_t *uace, int aclcnt, int *inherit) -{ - ace_t *acep; - int i; - - *inherit = 0; - - if (aclcnt > MAX_ACL_ENTRIES || aclcnt <= 0) { - return (B_FALSE); - } - - for (i = 0, acep = uace; i != 
aclcnt; i++, acep++) { - - /* - * first check type of entry - */ - - switch (acep->a_flags & ACE_TYPE_FLAGS) { - case ACE_OWNER: - acep->a_who = -1; - break; - case (ACE_IDENTIFIER_GROUP | ACE_GROUP): - case ACE_IDENTIFIER_GROUP: - if (acep->a_flags & ACE_GROUP) { - acep->a_who = -1; - } - break; - case ACE_EVERYONE: - acep->a_who = -1; - break; - } - - /* - * next check inheritance level flags - */ - - if (acep->a_type != ALLOW && acep->a_type != DENY) - return (B_FALSE); - - /* - * Only directories should have inheritance flags. - */ - if (ZTOV(zp)->v_type != VDIR && (acep->a_flags & - (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE| - ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) { - return (B_FALSE); - } - - if (acep->a_flags & - (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)) - *inherit = 1; - - if (acep->a_flags & - (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { - if ((acep->a_flags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE)) == 0) { - return (B_FALSE); - } - } - } - - return (B_TRUE); -} /* - * common code for setting acl's. + * common code for setting ACLs. * * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's * already checked the acl and knows whether to inherit. */ int -zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, dmu_tx_t *tx, int *ihp) +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, + zfs_fuid_info_t **fuidp, dmu_tx_t *tx) { - int inherit = 0; int error; znode_phys_t *zphys = zp->z_phys; - zfs_znode_acl_t *zacl = &zphys->zp_acl; - uint32_t acl_phys_size = ZFS_ACL_SIZE(aclp->z_acl_count); + zfs_acl_phys_t *zacl = &zphys->zp_acl; zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t aoid = zphys->zp_acl.z_acl_extern_obj; + uint64_t off = 0; + dmu_object_type_t otype; + zfs_acl_node_t *aclnode; ASSERT(MUTEX_HELD(&zp->z_lock)); ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - if (ihp) - inherit = *ihp; /* already determined by caller */ - else if (!zfs_acl_valid(zp, aclp->z_acl, - aclp->z_acl_count, &inherit)) { - return (EINVAL); - } - dmu_buf_will_dirty(zp->z_dbuf, tx); + zphys->zp_mode = zfs_mode_fuid_compute(zp, aclp, cr, fuidp, tx); + /* - * Will ACL fit internally? + * Decide which object type to use. If we are forced to + * use the old ACL format then transform the ACL into + * zfs_oldace_t layout. */ - if (aclp->z_acl_count > ACE_SLOT_CNT) { + if (!zfsvfs->z_use_fuids) { + otype = DMU_OT_OLDACL; + } else { + if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && + (zfsvfs->z_version >= ZPL_VERSION_FUID)) + zfs_acl_xform(zp, aclp); + ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); + otype = DMU_OT_ACL; + } + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && aclp->z_version != zacl->z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, + zp->z_phys->zp_acl.z_acl_extern_obj, tx); + if (error) + return (error); + aoid = 0; + } if (aoid == 0) { aoid = dmu_object_alloc(zfsvfs->z_os, - DMU_OT_ACL, acl_phys_size, DMU_OT_NONE, 0, tx); + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ?
DN_MAX_BONUSLEN : 0, tx); } else { (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, - acl_phys_size, 0, tx); + aclp->z_acl_bytes, 0, tx); } zphys->zp_acl.z_acl_extern_obj = aoid; - zphys->zp_acl.z_acl_count = aclp->z_acl_count; - dmu_write(zfsvfs->z_os, aoid, 0, - acl_phys_size, aclp->z_acl, tx); + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } } else { + void *start = zacl->z_ace_data; /* * Migrating back embedded? */ if (zphys->zp_acl.z_acl_extern_obj) { error = dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx); + zp->z_phys->zp_acl.z_acl_extern_obj, tx); if (error) return (error); zphys->zp_acl.z_acl_extern_obj = 0; } - bcopy(aclp->z_acl, zacl->z_ace_data, - aclp->z_acl_count * sizeof (ace_t)); - zacl->z_acl_count = aclp->z_acl_count; + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } } - zp->z_phys->zp_flags &= ~(ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE); - if (inherit) { - zp->z_phys->zp_flags |= ZFS_INHERIT_ACE; - } else if (ace_trivial(zacl->z_ace_data, zacl->z_acl_count) == 0) { - zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. + */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + zphys->zp_acl.z_acl_size = aclp->z_acl_count; + zphys->zp_acl.z_acl_count = aclp->z_acl_bytes; + } else { + zphys->zp_acl.z_acl_size = aclp->z_acl_bytes; + zphys->zp_acl.z_acl_count = aclp->z_acl_count; } - zphys->zp_mode = zfs_mode_compute(zp, aclp); - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + zphys->zp_acl.z_acl_version = aclp->z_version; - return (0); -} + /* + * Replace ACL wide bits, but first clear them. + */ + zp->z_phys->zp_flags &= ~ZFS_ACL_WIDE_FLAGS; -/* - * Create space for slots_needed ACEs to be append - * to aclp. - */ -static void -zfs_acl_append(zfs_acl_t *aclp, int slots_needed) -{ - ace_t *newacep; - ace_t *oldaclp; - int slot_cnt; - int slots_left = aclp->z_slots - aclp->z_acl_count; + zp->z_phys->zp_flags |= aclp->z_hints; - if (aclp->z_state == ACL_DATA_ALLOCED) - ASSERT(aclp->z_slots >= aclp->z_acl_count); - if (slots_left < slots_needed || aclp->z_state != ACL_DATA_ALLOCED) { - slot_cnt = aclp->z_slots + 1 + (slots_needed - slots_left); - newacep = kmem_alloc(ZFS_ACL_SIZE(slot_cnt), KM_SLEEP); - bcopy(aclp->z_acl, newacep, - ZFS_ACL_SIZE(aclp->z_acl_count)); - oldaclp = aclp->z_acl; - if (aclp->z_state == ACL_DATA_ALLOCED) - kmem_free(oldaclp, ZFS_ACL_SIZE(aclp->z_slots)); - aclp->z_acl = newacep; - aclp->z_slots = slot_cnt; - aclp->z_state = ACL_DATA_ALLOCED; - } -} + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; -/* - * Remove "slot" ACE from aclp - */ -static void -zfs_ace_remove(zfs_acl_t *aclp, int slot) -{ - if (aclp->z_acl_count > 1) { - (void) memmove(&aclp->z_acl[slot], - &aclp->z_acl[slot +1], sizeof (ace_t) * - (--aclp->z_acl_count - slot)); - } else - aclp->z_acl_count--; + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + return (0); } /* @@ -506,16 +1132,24 @@ zfs_ace_remove(zfs_acl_t *aclp, int slot) * This applies the "groupmask" value for aclmode property. 
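The groupmask fixup below is simpler than it looks: for each right the original allow ACE carried, the prepended deny ACE gets that bit exactly when the new mode withholds it. A user-space sketch under simplified masks (demo_ names are illustrative, not part of zfs_acl.c):

	#include <stdint.h>

	#define DEMO_READ	0x01	/* stands in for ACE_READ_DATA */
	#define DEMO_WRITE	0x02	/* ACE_WRITE_DATA/ACE_APPEND_DATA */
	#define DEMO_EXEC	0x04	/* ACE_EXECUTE */

	static uint32_t
	demo_groupmask_deny(uint32_t origmask, int may_r, int may_w, int may_x)
	{
		uint32_t deny = 0;

		/* deny only what the allow ACE granted but the mode withholds */
		if ((origmask & DEMO_READ) && !may_r)
			deny |= DEMO_READ;
		if ((origmask & DEMO_WRITE) && !may_w)
			deny |= DEMO_WRITE;
		if ((origmask & DEMO_EXEC) && !may_x)
			deny |= DEMO_EXEC;
		return (deny);
	}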
*/ static void -zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner) +zfs_acl_prepend_fixup(zfs_acl_t *aclp, void *acep, void *origacep, + mode_t mode, uint64_t owner) { - int rmask, wmask, xmask; int user_ace; + uint16_t aceflags; + uint32_t origmask, acepmask; + uint64_t fuid; - user_ace = (!(acep->a_flags & + aceflags = aclp->z_ops.ace_flags_get(acep); + fuid = aclp->z_ops.ace_who_get(acep); + origmask = aclp->z_ops.ace_mask_get(origacep); + acepmask = aclp->z_ops.ace_mask_get(acep); + + user_ace = (!(aceflags & (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP))); - if (user_ace && (acep->a_who == owner)) { + if (user_ace && (fuid == owner)) { rmask = S_IRUSR; wmask = S_IWUSR; xmask = S_IXUSR; @@ -525,33 +1159,38 @@ zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner) xmask = S_IXGRP; } - if (origacep->a_access_mask & ACE_READ_DATA) { - if (mode & rmask) - acep->a_access_mask &= ~ACE_READ_DATA; - else - acep->a_access_mask |= ACE_READ_DATA; + if (origmask & ACE_READ_DATA) { + if (mode & rmask) { + acepmask &= ~ACE_READ_DATA; + } else { + acepmask |= ACE_READ_DATA; + } } - if (origacep->a_access_mask & ACE_WRITE_DATA) { - if (mode & wmask) - acep->a_access_mask &= ~ACE_WRITE_DATA; - else - acep->a_access_mask |= ACE_WRITE_DATA; + if (origmask & ACE_WRITE_DATA) { + if (mode & wmask) { + acepmask &= ~ACE_WRITE_DATA; + } else { + acepmask |= ACE_WRITE_DATA; + } } - if (origacep->a_access_mask & ACE_APPEND_DATA) { - if (mode & wmask) - acep->a_access_mask &= ~ACE_APPEND_DATA; - else - acep->a_access_mask |= ACE_APPEND_DATA; + if (origmask & ACE_APPEND_DATA) { + if (mode & wmask) { + acepmask &= ~ACE_APPEND_DATA; + } else { + acepmask |= ACE_APPEND_DATA; + } } - if (origacep->a_access_mask & ACE_EXECUTE) { - if (mode & xmask) - acep->a_access_mask &= ~ACE_EXECUTE; - else - acep->a_access_mask |= ACE_EXECUTE; + if (origmask & ACE_EXECUTE) { + if (mode & xmask) { + acepmask &= ~ACE_EXECUTE; + } else { + acepmask |= ACE_EXECUTE; + } } + aclp->z_ops.ace_mask_set(acep, acepmask); } /* @@ -560,116 +1199,156 @@ zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner) static void zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode) { - int cnt; - ace_t *acep; + zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl); + void *acep; + int maskoff = aclp->z_ops.ace_mask_off(); + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + + ASSERT(aclnode != NULL); - cnt = aclp->z_acl_count -1; - acep = aclp->z_acl; + acep = (void *)((caddr_t)aclnode->z_acldata + + aclnode->z_size - (abstract_size * 6)); /* * Fixup final ACEs to match the mode */ - ASSERT(cnt >= 5); - adjust_ace_pair(&acep[cnt - 1], mode); /* everyone@ */ - adjust_ace_pair(&acep[cnt - 3], (mode & 0070) >> 3); /* group@ */ - adjust_ace_pair(&acep[cnt - 5], (mode & 0700) >> 6); /* owner@ */ + adjust_ace_pair_common(acep, maskoff, abstract_size, + (mode & 0700) >> 6); /* owner@ */ + + acep = (caddr_t)acep + (abstract_size * 2); + + adjust_ace_pair_common(acep, maskoff, abstract_size, + (mode & 0070) >> 3); /* group@ */ + + acep = (caddr_t)acep + (abstract_size * 2); + adjust_ace_pair_common(acep, maskoff, + abstract_size, mode); /* everyone@ */ } static int -zfs_acl_ace_match(ace_t *acep, int allow_deny, int type, int mask) +zfs_acl_ace_match(zfs_acl_t *aclp, void *acep, int allow_deny, + int entry_type, int accessmask) { - return (acep->a_access_mask == mask && acep->a_type == allow_deny && - ((acep->a_flags & ACE_TYPE_FLAGS) == type)); + uint32_t mask = 
aclp->z_ops.ace_mask_get(acep); + uint16_t type = aclp->z_ops.ace_type_get(acep); + uint16_t flags = aclp->z_ops.ace_flags_get(acep); + + return (mask == accessmask && type == allow_deny && + ((flags & ACE_TYPE_FLAGS) == entry_type)); } /* * Can prepended ACE be reused? */ static int -zfs_reuse_deny(ace_t *acep, int i) +zfs_reuse_deny(zfs_acl_t *aclp, void *acep, void *prevacep) { int okay_masks; + uint16_t prevtype; + uint16_t prevflags; + uint16_t flags; + uint32_t mask, prevmask; - if (i < 1) + if (prevacep == NULL) return (B_FALSE); - if (acep[i-1].a_type != DENY) + prevtype = aclp->z_ops.ace_type_get(prevacep); + prevflags = aclp->z_ops.ace_flags_get(prevacep); + flags = aclp->z_ops.ace_flags_get(acep); + mask = aclp->z_ops.ace_mask_get(acep); + prevmask = aclp->z_ops.ace_mask_get(prevacep); + + if (prevtype != DENY) return (B_FALSE); - if (acep[i-1].a_flags != (acep[i].a_flags & ACE_IDENTIFIER_GROUP)) + if (prevflags != (flags & ACE_IDENTIFIER_GROUP)) return (B_FALSE); - okay_masks = (acep[i].a_access_mask & OKAY_MASK_BITS); + okay_masks = (mask & OKAY_MASK_BITS); - if (acep[i-1].a_access_mask & ~okay_masks) + if (prevmask & ~okay_masks) return (B_FALSE); return (B_TRUE); } + /* - * Create space to prepend an ACE + * Insert new ACL node into chain of zfs_acl_node_t's + * + * This will result in two possible results. + * 1. If the ACL is currently just a single zfs_acl_node and + * we are prepending the entry then current acl node will have + * a new node inserted above it. + * + * 2. If we are inserting in the middle of current acl node then + * the current node will be split in two and new node will be inserted + * in between the two split nodes. */ -static void -zfs_acl_prepend(zfs_acl_t *aclp, int i) -{ - ace_t *oldaclp = NULL; - ace_t *to, *from; - int slots_left = aclp->z_slots - aclp->z_acl_count; - int oldslots; - int need_free = 0; - - if (aclp->z_state == ACL_DATA_ALLOCED) - ASSERT(aclp->z_slots >= aclp->z_acl_count); - - if (slots_left == 0 || aclp->z_state != ACL_DATA_ALLOCED) { - - to = kmem_alloc(ZFS_ACL_SIZE(aclp->z_acl_count + - OGE_PAD), KM_SLEEP); - if (aclp->z_state == ACL_DATA_ALLOCED) - need_free++; - from = aclp->z_acl; - oldaclp = aclp->z_acl; - (void) memmove(to, from, - sizeof (ace_t) * aclp->z_acl_count); - aclp->z_state = ACL_DATA_ALLOCED; - } else { - from = aclp->z_acl; - to = aclp->z_acl; +static zfs_acl_node_t * +zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep) +{ + zfs_acl_node_t *newnode; + zfs_acl_node_t *trailernode = NULL; + zfs_acl_node_t *currnode = zfs_acl_curr_node(aclp); + int curr_idx = aclp->z_curr_node->z_ace_idx; + int trailer_count; + size_t oldsize; + + newnode = zfs_acl_node_alloc(aclp->z_ops.ace_size(acep)); + newnode->z_ace_count = 1; + + oldsize = currnode->z_size; + + if (curr_idx != 1) { + trailernode = zfs_acl_node_alloc(0); + trailernode->z_acldata = acep; + + trailer_count = currnode->z_ace_count - curr_idx + 1; + currnode->z_ace_count = curr_idx - 1; + currnode->z_size = (caddr_t)acep - (caddr_t)currnode->z_acldata; + trailernode->z_size = oldsize - currnode->z_size; + trailernode->z_ace_count = trailer_count; } - - (void) memmove(&to[i + 1], &from[i], - sizeof (ace_t) * (aclp->z_acl_count - i)); - - if (oldaclp) { - aclp->z_acl = to; - oldslots = aclp->z_slots; - aclp->z_slots = aclp->z_acl_count + OGE_PAD; - if (need_free) - kmem_free(oldaclp, ZFS_ACL_SIZE(oldslots)); + aclp->z_acl_count += 1; + aclp->z_acl_bytes += aclp->z_ops.ace_size(acep); + + if (curr_idx == 1) + list_insert_before(&aclp->z_acl, currnode, newnode); + else 
+ list_insert_after(&aclp->z_acl, currnode, newnode); + if (trailernode) { + list_insert_after(&aclp->z_acl, newnode, trailernode); + aclp->z_curr_node = trailernode; + trailernode->z_ace_idx = 1; } + return (newnode); } /* * Prepend deny ACE */ -static void -zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, int i, +static void * +zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep, mode_t mode) { - ace_t *acep; - - zfs_acl_prepend(aclp, i); - - acep = aclp->z_acl; - zfs_set_ace(&acep[i], 0, DENY, acep[i + 1].a_who, - (acep[i + 1].a_flags & ACE_TYPE_FLAGS)); - zfs_acl_prepend_fixup(&acep[i], &acep[i+1], mode, zp->z_phys->zp_uid); - aclp->z_acl_count++; + zfs_acl_node_t *aclnode; + void *newacep; + uint64_t fuid; + uint16_t flags; + + aclnode = zfs_acl_ace_insert(aclp, acep); + newacep = aclnode->z_acldata; + fuid = aclp->z_ops.ace_who_get(acep); + flags = aclp->z_ops.ace_flags_get(acep); + zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS)); + zfs_acl_prepend_fixup(aclp, newacep, acep, mode, zp->z_phys->zp_uid); + + return (newacep); } /* @@ -677,41 +1356,74 @@ zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, int i, * and original ACE with inheritance flags stripped off. */ static void -zfs_acl_split_ace(zfs_acl_t *aclp, int i) +zfs_acl_split_ace(zfs_acl_t *aclp, zfs_ace_hdr_t *acep) { - ace_t *acep = aclp->z_acl; - - zfs_acl_prepend(aclp, i); - acep = aclp->z_acl; - acep[i] = acep[i + 1]; - acep[i].a_flags |= ACE_INHERIT_ONLY_ACE; - acep[i + 1].a_flags &= ~ALL_INHERIT; - aclp->z_acl_count++; + zfs_acl_node_t *aclnode; + zfs_acl_node_t *currnode; + void *newacep; + uint16_t type, flags; + uint32_t mask; + uint64_t fuid; + + type = aclp->z_ops.ace_type_get(acep); + flags = aclp->z_ops.ace_flags_get(acep); + mask = aclp->z_ops.ace_mask_get(acep); + fuid = aclp->z_ops.ace_who_get(acep); + + aclnode = zfs_acl_ace_insert(aclp, acep); + newacep = aclnode->z_acldata; + + aclp->z_ops.ace_type_set(newacep, type); + aclp->z_ops.ace_flags_set(newacep, flags | ACE_INHERIT_ONLY_ACE); + aclp->z_ops.ace_mask_set(newacep, mask); + aclp->z_ops.ace_type_set(newacep, type); + aclp->z_ops.ace_who_set(newacep, fuid); + aclp->z_next_ace = acep; + flags &= ~ALL_INHERIT; + aclp->z_ops.ace_flags_set(acep, flags); + currnode = zfs_acl_curr_node(aclp); + ASSERT(currnode->z_ace_idx >= 1); + currnode->z_ace_idx -= 1; } /* * Are ACES started at index i, the canonical six ACES? 
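For reference, the six abstract entries matched by the canonical-six check below are, in order (a sketch assuming the ALLOW/DENY, entry-type, and mask constants already used throughout this file):

	struct demo_abstract_ace {
		uint16_t	type;	/* ALLOW or DENY */
		uint16_t	flags;	/* entry type */
		uint32_t	mask;
	};

	static const struct demo_abstract_ace demo_canonical_six[6] = {
		{ DENY,  ACE_OWNER,	0 },
		{ ALLOW, ACE_OWNER,	OWNER_ALLOW_MASK },
		{ DENY,  OWNING_GROUP,	0 },
		{ ALLOW, OWNING_GROUP,	0 },
		{ DENY,  ACE_EVERYONE,	EVERYONE_DENY_MASK },
		{ ALLOW, ACE_EVERYONE,	EVERYONE_ALLOW_MASK },
	};

zfs_have_canonical_six() walks the last six abstract-size slots at the tail of the final ACL node and compares each against this table.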
*/ static int -zfs_have_canonical_six(zfs_acl_t *aclp, int i) +zfs_have_canonical_six(zfs_acl_t *aclp) { - ace_t *acep = aclp->z_acl; + void *acep; + zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl); + int i = 0; + size_t abstract_size = aclp->z_ops.ace_abstract_size(); - if ((zfs_acl_ace_match(&acep[i], + ASSERT(aclnode != NULL); + + if (aclnode->z_ace_count < 6) + return (0); + + acep = (void *)((caddr_t)aclnode->z_acldata + + aclnode->z_size - (aclp->z_ops.ace_abstract_size() * 6)); + + if ((zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY, ACE_OWNER, 0) && - zfs_acl_ace_match(&acep[i + 1], ALLOW, ACE_OWNER, - OWNER_ALLOW_MASK) && zfs_acl_ace_match(&acep[i + 2], - DENY, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 3], - ALLOW, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 4], + zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), + ALLOW, ACE_OWNER, OWNER_ALLOW_MASK) && + zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY, + OWNING_GROUP, 0) && zfs_acl_ace_match(aclp, (caddr_t)acep + + (abstract_size * i++), + ALLOW, OWNING_GROUP, 0) && + zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) && - zfs_acl_ace_match(&acep[i + 5], ALLOW, ACE_EVERYONE, - EVERYONE_ALLOW_MASK))) { + zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), + ALLOW, ACE_EVERYONE, EVERYONE_ALLOW_MASK))) { return (1); } else { return (0); } } + /* * Apply step 1g, to group entries * @@ -721,73 +1433,89 @@ zfs_have_canonical_six(zfs_acl_t *aclp, int i) * group has. */ static void -zfs_fixup_group_entries(ace_t *acep, mode_t mode) +zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep, + mode_t mode) { + uint32_t prevmask = aclp->z_ops.ace_mask_get(prevacep); + uint32_t mask = aclp->z_ops.ace_mask_get(acep); + uint16_t prevflags = aclp->z_ops.ace_flags_get(prevacep); mode_t extramode = (mode >> 3) & 07; mode_t ownermode = (mode >> 6); - if (acep[0].a_flags & ACE_IDENTIFIER_GROUP) { + if (prevflags & ACE_IDENTIFIER_GROUP) { extramode &= ~ownermode; if (extramode) { - if (extramode & 04) { - acep[0].a_access_mask &= ~ACE_READ_DATA; - acep[1].a_access_mask &= ~ACE_READ_DATA; + if (extramode & S_IROTH) { + prevmask &= ~ACE_READ_DATA; + mask &= ~ACE_READ_DATA; } - if (extramode & 02) { - acep[0].a_access_mask &= - ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - acep[1].a_access_mask &= - ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + if (extramode & S_IWOTH) { + prevmask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); } - if (extramode & 01) { - acep[0].a_access_mask &= ~ACE_EXECUTE; - acep[1].a_access_mask &= ~ACE_EXECUTE; + if (extramode & S_IXOTH) { + prevmask &= ~ACE_EXECUTE; + mask &= ~ACE_EXECUTE; } } } + aclp->z_ops.ace_mask_set(acep, mask); + aclp->z_ops.ace_mask_set(prevacep, prevmask); } /* * Apply the chmod algorithm as described * in PSARC/2002/240 */ -static int -zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp, - dmu_tx_t *tx) +static void +zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ace_t *acep; + void *acep = NULL, *prevacep = NULL; + uint64_t who; int i; - int error; int entry_type; int reuse_deny; int need_canonical_six = 1; - int inherit = 0; - int iflags; + uint16_t iflags, type; + uint32_t access_mask; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); ASSERT(MUTEX_HELD(&zp->z_lock)); - i = 0; - while (i < aclp->z_acl_count) { - acep = aclp->z_acl; - entry_type = (acep[i].a_flags & ACE_TYPE_FLAGS); - iflags = 
(acep[i].a_flags & ALL_INHERIT); + aclp->z_hints = (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); - if ((acep[i].a_type != ALLOW && acep[i].a_type != DENY) || - (iflags & ACE_INHERIT_ONLY_ACE)) { - i++; - if (iflags) - inherit = 1; - continue; - } + /* + * If discard then just discard all ACL nodes which + * represent the ACEs. + * + * New owner@/group@/everyone@ ACEs will be added + * later. + */ + if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) + zfs_acl_release_nodes(aclp); + while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type)) { - if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) { - zfs_ace_remove(aclp, i); - continue; + entry_type = (iflags & ACE_TYPE_FLAGS); + iflags = (iflags & ALL_INHERIT); + + if ((type != ALLOW && type != DENY) || + (iflags & ACE_INHERIT_ONLY_ACE)) { + if (iflags) + aclp->z_hints |= ZFS_INHERIT_ACE; + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + break; + } + goto nextace; } /* @@ -796,20 +1524,19 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp, if ((iflags & (ACE_FILE_INHERIT_ACE| ACE_DIRECTORY_INHERIT_ACE)) && (!(iflags & ACE_INHERIT_ONLY_ACE))) { - zfs_acl_split_ace(aclp, i); - i++; - inherit = 1; - continue; + zfs_acl_split_ace(aclp, acep); + aclp->z_hints |= ZFS_INHERIT_ACE; + goto nextace; } if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || (entry_type == OWNING_GROUP)) { - acep[i].a_access_mask &= ~OGE_CLEAR; - i++; - continue; - + access_mask &= ~OGE_CLEAR; + aclp->z_ops.ace_mask_set(acep, access_mask); + goto nextace; } else { - if (acep[i].a_type == ALLOW) { + reuse_deny = B_TRUE; + if (type == ALLOW) { /* * Check preceding ACE if any, to see @@ -819,25 +1546,27 @@ */ if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) { - reuse_deny = zfs_reuse_deny(acep, i); + reuse_deny = zfs_reuse_deny(aclp, acep, + prevacep); - if (reuse_deny == B_FALSE) { - zfs_acl_prepend_deny(zp, aclp, - i, mode); - i++; - acep = aclp->z_acl; + if (!reuse_deny) { + prevacep = + zfs_acl_prepend_deny(zp, + aclp, acep, mode); } else { zfs_acl_prepend_fixup( - &acep[i - 1], - &acep[i], mode, + aclp, prevacep, + acep, mode, zp->z_phys->zp_uid); } - zfs_fixup_group_entries(&acep[i - 1], - mode); + zfs_fixup_group_entries(aclp, acep, + prevacep, mode); + } } - i++; } +nextace: + prevacep = acep; } /* @@ -845,51 +1574,56 @@ */ if (aclp->z_acl_count >= 6) { - i = aclp->z_acl_count - 6; - - if (zfs_have_canonical_six(aclp, i)) { + if (zfs_have_canonical_six(aclp)) { need_canonical_six = 0; } } if (need_canonical_six) { - - zfs_acl_append(aclp, 6); - i = aclp->z_acl_count; - acep = aclp->z_acl; - zfs_set_ace(&acep[i++], 0, DENY, -1, ACE_OWNER); - zfs_set_ace(&acep[i++], OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER); - zfs_set_ace(&acep[i++], 0, DENY, -1, OWNING_GROUP); - zfs_set_ace(&acep[i++], 0, ALLOW, -1, OWNING_GROUP); - zfs_set_ace(&acep[i++], EVERYONE_DENY_MASK, - DENY, -1, ACE_EVERYONE); - zfs_set_ace(&acep[i++], EVERYONE_ALLOW_MASK, - ALLOW, -1, ACE_EVERYONE); + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + void *zacep; + zfs_acl_node_t *aclnode = + zfs_acl_node_alloc(abstract_size * 6); + + aclnode->z_size = abstract_size * 6; + aclnode->z_ace_count = 6; + aclp->z_acl_bytes += aclnode->z_size; + list_insert_tail(&aclp->z_acl, aclnode); + + zacep =
aclnode->z_acldata; + + i = 0; + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + 0, DENY, -1, ACE_OWNER); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0, + DENY, -1, OWNING_GROUP); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0, + ALLOW, -1, OWNING_GROUP); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + EVERYONE_DENY_MASK, DENY, -1, ACE_EVERYONE); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + EVERYONE_ALLOW_MASK, ALLOW, -1, ACE_EVERYONE); aclp->z_acl_count += 6; } zfs_acl_fixup_canonical_six(aclp, mode); - - zp->z_phys->zp_mode = mode; - error = zfs_aclset_common(zp, aclp, tx, &inherit); - return (error); } - int -zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx) +zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) { - zfs_acl_t *aclp = NULL; int error; - ASSERT(MUTEX_HELD(&zp->z_lock)); + mutex_enter(&zp->z_lock); mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, &aclp); + *aclp = NULL; + error = zfs_acl_node_read(zp, aclp, B_TRUE); if (error == 0) - error = zfs_acl_chmod(zp, mode, aclp, tx); + zfs_acl_chmod(zp, mode, *aclp); mutex_exit(&zp->z_acl_lock); - if (aclp) - zfs_acl_free(aclp); + mutex_exit(&zp->z_lock); return (error); } @@ -897,104 +1631,159 @@ zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx) * strip off write_owner and write_acl */ static void -zfs_securemode_update(zfsvfs_t *zfsvfs, ace_t *acep) +zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep) { - if ((zfsvfs->z_acl_inherit == ZFS_ACL_SECURE) && - (acep->a_type == ALLOW)) - acep->a_access_mask &= ~SECURE_CLEAR; + uint32_t mask = aclp->z_ops.ace_mask_get(acep); + + if ((zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) && + (aclp->z_ops.ace_type_get(acep) == ALLOW)) { + mask &= ~RESTRICTED_CLEAR; + aclp->z_ops.ace_mask_set(acep, mask); + } +} + +/* + * Should ACE be inherited? 
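As a reading aid, the inheritance predicate that follows can be restated stand-alone with its truth table; demo_ace_can_use() is hypothetical and assumes iflags has already been reduced to the inheritance bits, as the real function does with (acep_flags & 0xf):

	/*
	 *   object     inheritance flags            inherited?
	 *   directory  DIRECTORY_INHERIT            yes
	 *   directory  FILE_INHERIT only            yes (kept inherit-only)
	 *   directory  FILE_INHERIT|NO_PROPAGATE    no
	 *   file       FILE_INHERIT                 yes
	 *   file       DIRECTORY_INHERIT only       no
	 */
	static int
	demo_ace_can_use(int is_dir, uint16_t iflags)
	{
		if (is_dir && (iflags & ACE_DIRECTORY_INHERIT_ACE))
			return (1);
		if (iflags & ACE_FILE_INHERIT_ACE)
			return (!(is_dir &&
			    (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
		return (0);
	}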
+ */ +static int +zfs_ace_can_use(znode_t *zp, uint16_t acep_flags) +{ + int vtype = ZTOV(zp)->v_type; + int iflags = (acep_flags & 0xf); + + if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!((vtype == VDIR) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); } /* * inherit inheritable ACEs from parent */ static zfs_acl_t * -zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp) +zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, boolean_t *need_chmod) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ace_t *pacep; - ace_t *acep; - int ace_cnt = 0; - int pace_cnt; - int i, j; + void *pacep; + void *acep, *acep2; + zfs_acl_node_t *aclnode, *aclnode2; zfs_acl_t *aclp = NULL; - - i = j = 0; - pace_cnt = paclp->z_acl_count; - pacep = paclp->z_acl; + uint64_t who; + uint32_t access_mask; + uint16_t iflags, newflags, type; + size_t ace_size; + void *data1, *data2; + size_t data1sz, data2sz; + enum vtype vntype = ZTOV(zp)->v_type; + + *need_chmod = B_TRUE; + pacep = NULL; + aclp = zfs_acl_alloc(paclp->z_version); if (zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) { - for (i = 0; i != pace_cnt; i++) { + while (pacep = zfs_acl_next_ace(paclp, pacep, &who, + &access_mask, &iflags, &type)) { - if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW && - pacep[i].a_type == ALLOW) + /* + * don't inherit bogus ACEs + */ + if (!zfs_acl_valid_ace_type(type, iflags)) continue; - if (zfs_ace_can_use(zp, &pacep[i])) { - ace_cnt++; - if (!(pacep[i].a_flags & - ACE_NO_PROPAGATE_INHERIT_ACE)) - ace_cnt++; - } - } - } - - aclp = zfs_acl_alloc(ace_cnt + OGE_PAD); - if (ace_cnt && zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) { - acep = aclp->z_acl; - pacep = paclp->z_acl; - for (i = 0; i != pace_cnt; i++) { - if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW && - pacep[i].a_type == ALLOW) + type == ALLOW) continue; - if (zfs_ace_can_use(zp, &pacep[i])) { + ace_size = aclp->z_ops.ace_size(pacep); - /* - * Now create entry for inherited ace - */ - - acep[j] = pacep[i]; + if (!zfs_ace_can_use(zp, iflags)) + continue; - /* - * When AUDIT/ALARM a_types are supported - * they should be inherited here. - */ + /* + * If owner@, group@, or everyone@ inheritable + * then zfs_acl_chmod() isn't needed. 
+ */ + if (zfsvfs->z_acl_inherit == + ZFS_ACL_PASSTHROUGH && + ((iflags & (ACE_OWNER|ACE_EVERYONE)) || + ((iflags & OWNING_GROUP) == + OWNING_GROUP)) && (vntype == VREG || + (vntype == VDIR && + (iflags & ACE_DIRECTORY_INHERIT_ACE)))) + *need_chmod = B_FALSE; + + aclnode = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode); + acep = aclnode->z_acldata; + zfs_set_ace(aclp, acep, access_mask, type, + who, iflags|ACE_INHERITED_ACE); - if ((pacep[i].a_flags & - ACE_NO_PROPAGATE_INHERIT_ACE) || - (ZTOV(zp)->v_type != VDIR)) { - acep[j].a_flags &= ~ALL_INHERIT; - zfs_securemode_update(zfsvfs, &acep[j]); - j++; - continue; - } + /* + * Copy special opaque data if any + */ + if ((data1sz = paclp->z_ops.ace_data(pacep, + &data1)) != 0) { + VERIFY((data2sz = aclp->z_ops.ace_data(acep, + &data2)) == data1sz); + bcopy(data1, data2, data2sz); + } + aclp->z_acl_count++; + aclnode->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + newflags = aclp->z_ops.ace_flags_get(acep); + + if (vntype == VDIR) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if ((iflags & ACE_NO_PROPAGATE_INHERIT_ACE) || + (vntype != VDIR)) { + newflags &= ~ALL_INHERIT; + aclp->z_ops.ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + zfs_restricted_update(zfsvfs, aclp, acep); + continue; + } - ASSERT(ZTOV(zp)->v_type == VDIR); + ASSERT(vntype == VDIR); + + newflags = aclp->z_ops.ace_flags_get(acep); + if ((iflags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) != + ACE_FILE_INHERIT_ACE) { + aclnode2 = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode2); + acep2 = aclnode2->z_acldata; + zfs_set_ace(aclp, acep2, + access_mask, type, who, + iflags|ACE_INHERITED_ACE); + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops.ace_flags_set(acep, newflags); + newflags &= ~ALL_INHERIT; + aclp->z_ops.ace_flags_set(acep2, + newflags|ACE_INHERITED_ACE); /* - * If we are inheriting an ACE targeted for - * only files, then make sure inherit_only - * is on for future propagation. 
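The duplication above is the crux of propagating inheritance: an inherited ACE that must keep propagating is split into an entry that acts on the new directory and a carrier that exists only for future children. A sketch of the flag arithmetic, assuming the ALL_INHERIT, ACE_INHERIT_ONLY_ACE and ACE_INHERITED_ACE definitions this file already relies on (the demo_ name is hypothetical):

	static void
	demo_split_inherited(uint16_t iflags, uint16_t *carrier,
	    uint16_t *effective)
	{
		/* keeps propagating, but has no effect on this directory */
		*carrier = iflags | ACE_INHERIT_ONLY_ACE | ACE_INHERITED_ACE;
		/* takes effect here, but stops propagating */
		*effective = (iflags & ~ALL_INHERIT) | ACE_INHERITED_ACE;
	}

In the code above the effective copy is additionally passed through zfs_restricted_update(), so write_owner/write_acl can be stripped under the restricted inherit mode.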
+ * Copy special opaque data if any */ - if ((pacep[i].a_flags & (ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE)) != - ACE_FILE_INHERIT_ACE) { - j++; - acep[j] = acep[j-1]; - acep[j-1].a_flags |= - ACE_INHERIT_ONLY_ACE; - acep[j].a_flags &= ~ALL_INHERIT; - } else { - acep[j].a_flags |= ACE_INHERIT_ONLY_ACE; + if ((data1sz = aclp->z_ops.ace_data(acep, + &data1)) != 0) { + VERIFY((data2sz = + aclp->z_ops.ace_data(acep2, + &data2)) == data1sz); + bcopy(data1, data2, data1sz); } - zfs_securemode_update(zfsvfs, &acep[j]); - j++; + aclp->z_acl_count++; + aclnode2->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + zfs_restricted_update(zfsvfs, aclp, acep2); + } else { + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops.ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); } } } - aclp->z_acl_count = j; - ASSERT(aclp->z_slots >= aclp->z_acl_count); - return (aclp); } @@ -1004,14 +1793,20 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp) */ void zfs_perm_init(znode_t *zp, znode_t *parent, int flag, - vattr_t *vap, dmu_tx_t *tx, cred_t *cr) + vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + zfs_acl_t *setaclp, zfs_fuid_info_t **fuidp) { - uint64_t mode; - uid_t uid; - gid_t gid; + uint64_t mode, fuid, fgid; int error; - int pull_down; - zfs_acl_t *aclp, *paclp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_acl_t *aclp = NULL; + zfs_acl_t *paclp; + xvattr_t *xvap = (xvattr_t *)vap; + gid_t gid; + boolean_t need_chmod = B_TRUE; + + if (setaclp) + aclp = setaclp; mode = MAKEIMODE(vap->va_type, vap->va_mode); @@ -1020,22 +1815,38 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag, */ if ((flag & (IS_ROOT_NODE | IS_REPLAY)) || ((flag & IS_XATTR) && (vap->va_type == VDIR))) { - uid = vap->va_uid; + fuid = zfs_fuid_create(zfsvfs, vap->va_uid, cr, + ZFS_OWNER, tx, fuidp); + fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, + ZFS_GROUP, tx, fuidp); gid = vap->va_gid; } else { - uid = crgetuid(cr); - if ((vap->va_mask & AT_GID) && - ((vap->va_gid == parent->z_phys->zp_gid) || - groupmember(vap->va_gid, cr) || - secpolicy_vnode_create_gid(cr) == 0)) + fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, tx, cr, fuidp); + fgid = 0; + if (vap->va_mask & AT_GID) { + fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, + ZFS_GROUP, tx, fuidp); gid = vap->va_gid; - else + if (fgid != parent->z_phys->zp_gid && + !groupmember(vap->va_gid, cr) && + secpolicy_vnode_create_gid(cr) != 0) + fgid = 0; + } + if (fgid == 0) { + if (parent->z_phys->zp_mode & S_ISGID) { + fgid = parent->z_phys->zp_gid; + gid = zfs_fuid_map_id(zfsvfs, fgid, + cr, ZFS_GROUP); + } else { + fgid = zfs_fuid_create_cred(zfsvfs, + ZFS_GROUP, tx, cr, fuidp); #ifdef __FreeBSD__ - gid = parent->z_phys->zp_gid; + gid = parent->z_phys->zp_gid; #else - gid = (parent->z_phys->zp_mode & S_ISGID) ? - parent->z_phys->zp_gid : crgetgid(cr); + gid = crgetgid(cr); #endif + } + } } /* @@ -1045,55 +1856,57 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag, * file's new group, clear the file's set-GID bit. 
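Condensed, the set-GID rule implemented below reads as follows; setgid_ok stands in for a successful secpolicy_vnode_setids_setgids() check and the demo_ name is hypothetical:

	#include <sys/stat.h>

	static mode_t
	demo_fixup_sgid(mode_t mode, mode_t parent_mode, int is_dir,
	    int setgid_ok)
	{
		/* directories under a set-GID parent always inherit it */
		if ((parent_mode & S_ISGID) && is_dir)
			return (mode | S_ISGID);
		/* otherwise a requested S_ISGID survives only if policy allows */
		if ((mode & S_ISGID) && !setgid_ok)
			return (mode & ~S_ISGID);
		return (mode);
	}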
*/ - if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) + if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) { mode |= S_ISGID; - else { + } else { if ((mode & S_ISGID) && - secpolicy_vnode_setids_setgids(cr, gid) != 0) + secpolicy_vnode_setids_setgids(ZTOV(zp), cr, gid) != 0) mode &= ~S_ISGID; } - zp->z_phys->zp_uid = uid; - zp->z_phys->zp_gid = gid; + zp->z_phys->zp_uid = fuid; + zp->z_phys->zp_gid = fgid; zp->z_phys->zp_mode = mode; - mutex_enter(&parent->z_lock); - pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE); - if (pull_down) { - mutex_enter(&parent->z_acl_lock); - VERIFY(0 == zfs_acl_node_read(parent, &paclp)); - mutex_exit(&parent->z_acl_lock); - aclp = zfs_acl_inherit(zp, paclp); - zfs_acl_free(paclp); + if (aclp == NULL) { + mutex_enter(&parent->z_lock); + if ((ZTOV(parent)->v_type == VDIR && + (parent->z_phys->zp_flags & ZFS_INHERIT_ACE)) && + !(zp->z_phys->zp_flags & ZFS_XATTR)) { + mutex_enter(&parent->z_acl_lock); + VERIFY(0 == zfs_acl_node_read(parent, &paclp, B_FALSE)); + mutex_exit(&parent->z_acl_lock); + aclp = zfs_acl_inherit(zp, paclp, &need_chmod); + zfs_acl_free(paclp); + } else { + aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + } + mutex_exit(&parent->z_lock); + mutex_enter(&zp->z_lock); + mutex_enter(&zp->z_acl_lock); + if (need_chmod) + zfs_acl_chmod(zp, mode, aclp); } else { - aclp = zfs_acl_alloc(6); + mutex_enter(&zp->z_lock); + mutex_enter(&zp->z_acl_lock); } - mutex_exit(&parent->z_lock); - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); - error = zfs_acl_chmod(zp, mode, aclp, tx); + + /* Force auto_inherit on all new directory objects */ + if (vap->va_type == VDIR) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + + error = zfs_aclset_common(zp, aclp, cr, fuidp, tx); + + /* Set optional attributes if any */ + if (vap->va_mask & AT_XVATTR) + zfs_xvattr_set(zp, xvap); + mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); ASSERT3U(error, ==, 0); - zfs_acl_free(aclp); -} - -/* - * Should ACE be inherited? - */ -static int -zfs_ace_can_use(znode_t *zp, ace_t *acep) -{ - int vtype = ZTOV(zp)->v_type; - int iflags = (acep->a_flags & 0xf); - - if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) - return (1); - else if (iflags & ACE_FILE_INHERIT_ACE) - return (!((vtype == VDIR) && - (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); - return (0); + if (aclp != setaclp) + zfs_acl_free(aclp); } #ifdef TODO @@ -1101,42 +1914,89 @@ zfs_ace_can_use(znode_t *zp, ace_t *acep) * Retrieve a files ACL */ int -zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) { zfs_acl_t *aclp; - ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + ulong_t mask; int error; + int count = 0; + int largeace = 0; - if (error = zfs_zaccess(zp, ACE_READ_ACL, cr)) { - /* - * If owner of file then allow reading of the - * ACL. 
- */ - if (crgetuid(cr) != zp->z_phys->zp_uid) - return (error); - } + mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | + VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); + + if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)) + return (error); if (mask == 0) return (ENOSYS); mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, &aclp); + error = zfs_acl_node_read(zp, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); } + /* + * Scan ACL to determine number of ACEs + */ + if ((zp->z_phys->zp_flags & ZFS_ACL_OBJ_ACE) && + !(mask & VSA_ACE_ALLTYPES)) { + void *zacep = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t type, iflags; + + while (zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type)) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + largeace++; + continue; + default: + count++; + } + } + vsecp->vsa_aclcnt = count; + } else + count = aclp->z_acl_count; if (mask & VSA_ACECNT) { - vsecp->vsa_aclcnt = aclp->z_acl_count; + vsecp->vsa_aclcnt = count; } if (mask & VSA_ACE) { - vsecp->vsa_aclentp = kmem_alloc(aclp->z_acl_count * - sizeof (ace_t), KM_SLEEP); - bcopy(aclp->z_acl, vsecp->vsa_aclentp, - aclp->z_acl_count * sizeof (ace_t)); + size_t aclsz; + + zfs_acl_node_t *aclnode = list_head(&aclp->z_acl); + + aclsz = count * sizeof (ace_t) + + sizeof (ace_object_t) * largeace; + + vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); + vsecp->vsa_aclentsz = aclsz; + + if (aclp->z_version == ZFS_ACL_VERSION_FUID) + zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, + vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); + else { + bcopy(aclnode->z_acldata, vsecp->vsa_aclentp, + count * sizeof (ace_t)); + } + } + if (mask & VSA_ACE_ACLFLAGS) { + vsecp->vsa_aclflags = 0; + if (zp->z_phys->zp_flags & ZFS_ACL_DEFAULTED) + vsecp->vsa_aclflags |= ACL_DEFAULTED; + if (zp->z_phys->zp_flags & ZFS_ACL_PROTECTED) + vsecp->vsa_aclflags |= ACL_PROTECTED; + if (zp->z_phys->zp_flags & ZFS_ACL_AUTO_INHERIT) + vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; } mutex_exit(&zp->z_acl_lock); @@ -1147,37 +2007,100 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) } #endif /* TODO */ +int +zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, + vsecattr_t *vsecp, zfs_acl_t **zaclp) +{ + zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + int aclcnt = vsecp->vsa_aclcnt; + int error; + + if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) + return (EINVAL); + + aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); + + aclp->z_hints = 0; + aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + if ((error = zfs_copy_ace_2_oldace(obj_type, aclp, + (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, + aclcnt, &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } else { + if ((error = zfs_copy_ace_2_fuid(obj_type, aclp, + vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, + &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } + aclp->z_acl_bytes = aclnode->z_size; + aclnode->z_ace_count = aclcnt; + aclp->z_acl_count = aclcnt; + list_insert_head(&aclp->z_acl, aclnode); + + /* + * If flags are being set then add them to z_hints + */ + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { + if (vsecp->vsa_aclflags & ACL_PROTECTED) + aclp->z_hints |= ZFS_ACL_PROTECTED; + if 
(vsecp->vsa_aclflags & ACL_DEFAULTED) + aclp->z_hints |= ZFS_ACL_DEFAULTED; + if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + } + + *zaclp = aclp; + + return (0); +} + #ifdef TODO /* * Set a files ACL */ int -zfs_setacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; - ace_t *acep = vsecp->vsa_aclentp; - int aclcnt = vsecp->vsa_aclcnt; ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); dmu_tx_t *tx; int error; - int inherit; zfs_acl_t *aclp; + zfs_fuid_info_t *fuidp = NULL; if (mask == 0) - return (EINVAL); + return (ENOSYS); - if (!zfs_acl_valid(zp, acep, aclcnt, &inherit)) - return (EINVAL); + if (zp->z_phys->zp_flags & ZFS_IMMUTABLE) + return (EPERM); + + if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) + return (error); + + error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, &aclp); + if (error) + return (error); + + /* + * If ACL wide flags aren't being set then preserve any + * existing flags. + */ + if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { + aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); + } top: - error = zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr); - if (error == EACCES || error == ACCESS_UNDETERMINED) { - if ((error = secpolicy_vnode_setdac(cr, - zp->z_phys->zp_uid)) != 0) { - return (error); - } - } else if (error) { - return (error == EROFS ? error : EPERM); + if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) { + zfs_acl_free(aclp); + return (error); } mutex_enter(&zp->z_lock); @@ -1187,10 +2110,34 @@ top: dmu_tx_hold_bonus(tx, zp->z_id); if (zp->z_phys->zp_acl.z_acl_extern_obj) { - dmu_tx_hold_write(tx, zp->z_phys->zp_acl.z_acl_extern_obj, - 0, ZFS_ACL_SIZE(aclcnt)); - } else if (aclcnt > ACE_SLOT_CNT) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ZFS_ACL_SIZE(aclcnt)); + /* Are we upgrading ACL? 
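For orientation, a hypothetical in-kernel caller of zfs_setacl() (still fenced off by #ifdef TODO in this port) would fill in a vsecattr_t and let zfs_vsec_2_aclp() above do the conversion and validation; a sketch, with error handling trimmed:

	static int
	demo_set_acl(znode_t *zp, ace_t *aces, int count, cred_t *cr)
	{
		vsecattr_t vsec;

		vsec.vsa_mask = VSA_ACE | VSA_ACECNT;
		vsec.vsa_aclentp = aces;
		vsec.vsa_aclcnt = count;
		vsec.vsa_aclentsz = count * sizeof (ace_t);

		/* B_FALSE: do not skip the ACE_WRITE_ACL access check */
		return (zfs_setacl(zp, &vsec, B_FALSE, cr));
	}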
*/ + if (zfsvfs->z_version <= ZPL_VERSION_FUID && + zp->z_phys->zp_acl.z_acl_version == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, + zp->z_phys->zp_acl.z_acl_extern_obj, + 0, DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, + zp->z_phys->zp_acl.z_acl_extern_obj, + 0, aclp->z_acl_bytes); + } + } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); + } + if (aclp->z_has_fuids) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } } error = dmu_tx_assign(tx, zfsvfs->z_assign); @@ -1204,17 +2151,18 @@ top: goto top; } dmu_tx_abort(tx); + zfs_acl_free(aclp); return (error); } - aclp = zfs_acl_alloc(aclcnt); - bcopy(acep, aclp->z_acl, sizeof (ace_t) * aclcnt); - aclp->z_acl_count = aclcnt; - error = zfs_aclset_common(zp, aclp, tx, &inherit); + error = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); ASSERT(error == 0); + zfs_log_acl(zilog, tx, zp, vsecp, fuidp); + + if (fuidp) + zfs_fuid_info_free(fuidp); zfs_acl_free(aclp); - zfs_log_acl(zilog, tx, TX_ACL, zp, aclcnt, acep); dmu_tx_commit(tx); done: mutex_exit(&zp->z_acl_lock); @@ -1224,46 +2172,34 @@ done: } #endif /* TODO */ +/* + * working_mode returns the permissions that were not granted + */ static int -zfs_ace_access(ace_t *zacep, int *working_mode) -{ - if (*working_mode == 0) { - return (0); - } - - if (zacep->a_access_mask & *working_mode) { - if (zacep->a_type == ALLOW) { - *working_mode &= - ~(*working_mode & zacep->a_access_mask); - if (*working_mode == 0) - return (0); - } else if (zacep->a_type == DENY) { - return (EACCES); - } - } - - /* - * haven't been specifcally denied at this point - * so return UNDETERMINED. - */ - - return (ACCESS_UNDETERMINED); -} - - -static int -zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr) +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) { zfs_acl_t *aclp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ace_t *zacep; - gid_t gid; - int cnt; - int i; int error; - int access_deny = ACCESS_UNDETERMINED; - uint_t entry_type; uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t fowner; + uid_t gowner; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0) + return (0); + + *check_privs = B_TRUE; if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ *working_mode = 0; @@ -1275,93 +2211,155 @@ zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr) if ((v4_mode & WRITE_MASK) && (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && (!IS_DEVVP(ZTOV(zp)))) { + *check_privs = B_FALSE; return (EROFS); } + /* + * Only check for READONLY on non-directories. 
+ */ + if ((v4_mode & WRITE_MASK_DATA) && + (((ZTOV(zp)->v_type != VDIR) && + (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) || + (ZTOV(zp)->v_type == VDIR && + (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) { + *check_privs = B_FALSE; + return (EPERM); + } + + if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && + (zp->z_phys->zp_flags & ZFS_NOUNLINK)) { + *check_privs = B_FALSE; + return (EPERM); + } + + if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && + (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) { + *check_privs = B_FALSE; + return (EACCES); + } + + /* + * The caller requested that the ACL check be skipped. This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + mutex_enter(&zp->z_acl_lock); - error = zfs_acl_node_read(zp, &aclp); + error = zfs_acl_node_read(zp, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); } + while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type)) { - zacep = aclp->z_acl; - cnt = aclp->z_acl_count; + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; - for (i = 0; i != cnt; i++) { + if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) + continue; - DTRACE_PROBE2(zfs__access__common, - ace_t *, &zacep[i], int, *working_mode); + entry_type = (iflags & ACE_TYPE_FLAGS); - if (zacep[i].a_flags & ACE_INHERIT_ONLY_ACE) - continue; + checkit = B_FALSE; - entry_type = (zacep[i].a_flags & ACE_TYPE_FLAGS); switch (entry_type) { case ACE_OWNER: - if (uid == zp->z_phys->zp_uid) { - access_deny = zfs_ace_access(&zacep[i], - working_mode); - } + if (uid == fowner) + checkit = B_TRUE; break; - case (ACE_IDENTIFIER_GROUP | ACE_GROUP): + case OWNING_GROUP: + who = gowner; + /*FALLTHROUGH*/ case ACE_IDENTIFIER_GROUP: - /* - * Owning group gid is in znode not ACL - */ - if (entry_type == (ACE_IDENTIFIER_GROUP | ACE_GROUP)) - gid = zp->z_phys->zp_gid; - else - gid = zacep[i].a_who; - - if (groupmember(gid, cr)) { - access_deny = zfs_ace_access(&zacep[i], - working_mode); - } + checkit = zfs_groupmember(zfsvfs, who, cr); break; case ACE_EVERYONE: - access_deny = zfs_ace_access(&zacep[i], working_mode); + checkit = B_TRUE; break; /* USER Entry */ default: if (entry_type == 0) { - if (uid == zacep[i].a_who) { - access_deny = zfs_ace_access(&zacep[i], - working_mode); - } + uid_t newid; + + newid = zfs_fuid_map_id(zfsvfs, who, cr, + ZFS_ACE_USER); + if (newid != IDMAP_WK_CREATOR_OWNER_UID && + uid == newid) + checkit = B_TRUE; break; + } else { + zfs_acl_free(aclp); + mutex_exit(&zp->z_acl_lock); + return (EIO); + } + } + + if (checkit) { + uint32_t mask_matched = (access_mask & *working_mode); + + if (mask_matched) { + if (type == DENY) + deny_mask |= mask_matched; + + *working_mode &= ~mask_matched; } - zfs_acl_free(aclp); - mutex_exit(&zp->z_acl_lock); - return (EIO); } - if (access_deny != ACCESS_UNDETERMINED) + /* Are we done? 
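The loop above is the entire NFSv4 evaluation algorithm, so a stand-alone model of the working_mode/deny_mask bookkeeping may help; demo_ names are hypothetical, and the -1 mirrors the undetermined result handed to the privilege fallback in zfs_zaccess():

	#include <errno.h>
	#include <stdint.h>

	struct demo_ace {
		int		applies;	/* entry matches the caller? */
		int		is_deny;
		uint32_t	mask;
	};

	static int
	demo_eval(const struct demo_ace *aces, int n, uint32_t wanted)
	{
		uint32_t working = wanted, denied = 0;
		int i;

		for (i = 0; i < n && working != 0; i++) {
			uint32_t hit;

			if (!aces[i].applies)
				continue;
			hit = aces[i].mask & working;
			if (hit == 0)
				continue;
			if (aces[i].is_deny)
				denied |= hit;	/* sticks even if allowed later */
			working &= ~hit;
		}
		if (denied)
			return (EACCES);
		return (working ? -1 : 0);	/* -1: undetermined bits remain */
	}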
*/ + if (*working_mode == 0) break; } mutex_exit(&zp->z_acl_lock); zfs_acl_free(aclp); - return (access_deny); + /* Put the found 'denies' back on the working mode */ + if (deny_mask) { + *working_mode |= deny_mask; + return (EACCES); + } else if (*working_mode) { + return (-1); + } + + return (0); } +static int +zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, + cred_t *cr) +{ + if (*working_mode != ACE_WRITE_DATA) + return (EACCES); + + return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, + check_privs, B_FALSE, cr)); +} /* * Determine whether Access should be granted/denied, invoking least * priv subsytem when a deny is determined. */ int -zfs_zaccess(znode_t *zp, int mode, cred_t *cr) +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) { - int working_mode; - int error; - int is_attr; - znode_t *xzp; - znode_t *check_zp = zp; + uint32_t working_mode; + int error; + int is_attr; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + boolean_t check_privs; + znode_t *xzp; + znode_t *check_zp = zp; is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR)); @@ -1374,7 +2372,9 @@ zfs_zaccess(znode_t *zp, int mode, cred_t *cr) zp->z_phys->zp_parent, &xzp)) != 0) { return (error); } + check_zp = xzp; + /* * fixup mode to map to xattr perms */ @@ -1390,18 +2390,76 @@ zfs_zaccess(znode_t *zp, int mode, cred_t *cr) } } - error = zfs_zaccess_common(check_zp, mode, &working_mode, cr); + if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, + &check_privs, skipaclchk, cr)) == 0) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (0); + } - if (error == EROFS) { + if (error && !check_privs) { if (is_attr) VN_RELE(ZTOV(xzp)); return (error); } - if (error || working_mode) { - working_mode = (zfs_v4_to_unix(working_mode) << 6); - error = secpolicy_vnode_access(cr, ZTOV(check_zp), - check_zp->z_phys->zp_uid, working_mode); + if (error && (flags & V_APPEND)) { + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + } + + if (error && check_privs) { + uid_t owner; + mode_t checkmode = 0; + + owner = zfs_fuid_map_id(zfsvfs, check_zp->z_phys->zp_uid, cr, + ZFS_OWNER); + + /* + * First check for implicit owner permission on + * read_acl/read_attributes + */ + + error = 0; + ASSERT(working_mode != 0); + + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && + owner == crgetuid(cr))) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VWRITE; + if (working_mode & ACE_EXECUTE) + checkmode |= VEXEC; + + if (checkmode) + error = secpolicy_vnode_access(cr, ZTOV(check_zp), + owner, checkmode); + + if (error == 0 && (working_mode & ACE_WRITE_OWNER)) + error = secpolicy_vnode_chown(ZTOV(check_zp), cr, B_TRUE); + if (error == 0 && (working_mode & ACE_WRITE_ACL)) + error = secpolicy_vnode_setdac(ZTOV(check_zp), cr, owner); + + if (error == 0 && (working_mode & + (ACE_DELETE|ACE_DELETE_CHILD))) + error = secpolicy_vnode_remove(ZTOV(check_zp), cr); + + if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { + error = secpolicy_vnode_chown(ZTOV(check_zp), cr, B_FALSE); + } + if (error == 0) { + /* + * See if any bits other than those already checked + * for are still present. 
If so then return EACCES + */ + if (working_mode & ~(ZFS_CHECKED_MASKS)) { + error = EACCES; + } + } } if (is_attr) @@ -1411,38 +2469,37 @@ zfs_zaccess(znode_t *zp, int mode, cred_t *cr) } /* - * Special zaccess function to check for special nfsv4 perm. - * doesn't call secpolicy_vnode_access() for failure, since that - * would probably be the wrong policy function to call. - * instead its up to the caller to handle that situation. + * Translate traditional unix VREAD/VWRITE/VEXEC mode into + * native ACL format and call zfs_zaccess() */ - int -zfs_zaccess_v4_perm(znode_t *zp, int mode, cred_t *cr) +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) { - int working_mode = 0; - return (zfs_zaccess_common(zp, mode, &working_mode, cr)); + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); } /* - * Translate tradition unix VREAD/VWRITE/VEXEC mode into - * native ACL format and call zfs_zaccess() + * Access function for secpolicy_vnode_setattr */ int -zfs_zaccess_rwx(znode_t *zp, mode_t mode, cred_t *cr) +zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) { int v4_mode = zfs_unix_to_v4(mode >> 6); - return (zfs_zaccess(zp, v4_mode, cr)); + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); } static int -zfs_delete_final_check(znode_t *zp, znode_t *dzp, cred_t *cr) +zfs_delete_final_check(znode_t *zp, znode_t *dzp, + mode_t missing_perms, cred_t *cr) { int error; + uid_t downer; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; - error = secpolicy_vnode_access(cr, ZTOV(zp), - dzp->z_phys->zp_uid, S_IWRITE|S_IEXEC); + downer = zfs_fuid_map_id(zfsvfs, dzp->z_phys->zp_uid, cr, ZFS_OWNER); + + error = secpolicy_vnode_access(cr, ZTOV(dzp), downer, missing_perms); if (error == 0) error = zfs_sticky_remove_access(dzp, zp, cr); @@ -1488,83 +2545,88 @@ zfs_delete_final_check(znode_t *zp, znode_t *dzp, cred_t *cr) int zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) { - int dzp_working_mode = 0; - int zp_working_mode = 0; + uint32_t dzp_working_mode = 0; + uint32_t zp_working_mode = 0; int dzp_error, zp_error; + mode_t missing_perms; + boolean_t dzpcheck_privs = B_TRUE; + boolean_t zpcheck_privs = B_TRUE; /* - * Arghh, this check is going to require a couple of questions - * to be asked. We want specific DELETE permissions to + * We want specific DELETE permissions to * take precedence over WRITE/EXECUTE. We don't * want an ACL such as this to mess us up. * user:joe:write_data:deny,user:joe:delete:allow * * However, deny permissions may ultimately be overridden * by secpolicy_vnode_access(). + * + * We will ask for all of the necessary permissions and then + * look at the working modes from the directory and target object + * to determine what was found. */ - dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, - &dzp_working_mode, cr); - zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, cr); - - if (dzp_error == EROFS || zp_error == EROFS) - return (dzp_error); + if (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) + return (EPERM); /* - * First check the first row. - * We only need to see if parent Allows delete_child + * First row + * If the directory permissions allow the delete, we are done. */ - if ((dzp_working_mode & ACE_DELETE_CHILD) == 0) + if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) return (0); /* - * Second row - * we already have the necessary information in - * zp_working_mode, zp_error and dzp_error. 
+ * If target object has delete permission then we are done */ - - if ((zp_working_mode & ACE_DELETE) == 0) + if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, + &zpcheck_privs, B_FALSE, cr)) == 0) return (0); + ASSERT(dzp_error && zp_error); + + if (!dzpcheck_privs) + return (dzp_error); + if (!zpcheck_privs) + return (zp_error); + /* - * Now zp_error should either be EACCES which indicates - * a "deny" delete entry or ACCESS_UNDETERMINED if the "delete" - * entry exists on the target. + * Second row * - * dzp_error should be either EACCES which indicates a "deny" - * entry for delete_child or ACCESS_UNDETERMINED if no delete_child - * entry exists. If value is EACCES then we are done - * and zfs_delete_final_check() will make the final decision - * regarding to allow the delete. + * If directory returns EACCES then delete_child was denied + * due to deny delete_child. In this case send the request through + * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() + * since that *could* allow the delete based on write/execute permission + * and we want delete permissions to override write/execute. */ - ASSERT(zp_error != 0 && dzp_error != 0); if (dzp_error == EACCES) - return (zfs_delete_final_check(zp, dzp, cr)); + return (secpolicy_vnode_remove(ZTOV(dzp), cr)); /* XXXPJD: s/dzp/zp/ ? */ /* * Third Row - * Only need to check for write/execute on parent + * only need to see if we have write/execute on directory. */ - dzp_error = zfs_zaccess_common(dzp, ACE_WRITE_DATA|ACE_EXECUTE, - &dzp_working_mode, cr); + if ((dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) + return (zfs_sticky_remove_access(dzp, zp, cr)); - if (dzp_error == EROFS) + if (!dzpcheck_privs) return (dzp_error); - if ((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) == 0) - return (zfs_sticky_remove_access(dzp, zp, cr)); - /* - * Fourth Row + * Fourth row */ - if (((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) != 0) && - ((zp_working_mode & ACE_DELETE) == 0)) - return (zfs_sticky_remove_access(dzp, zp, cr)); + missing_perms = (dzp_working_mode & ACE_WRITE_DATA) ? VWRITE : 0; + missing_perms |= (dzp_working_mode & ACE_EXECUTE) ? VEXEC : 0; + + ASSERT(missing_perms); + + return (zfs_delete_final_check(zp, dzp, missing_perms, cr)); - return (zfs_delete_final_check(zp, dzp, cr)); } int @@ -1574,6 +2636,9 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, int add_perm; int error; + if (szp->z_phys->zp_flags & ZFS_AV_QUARANTINED) + return (EACCES); + add_perm = (ZTOV(szp)->v_type == VDIR) ? ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; @@ -1586,7 +2651,7 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, * to another. 
*/ if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) { - if (error = zfs_zaccess(szp, ACE_WRITE_DATA, cr)) + if (error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr)) return (error); } @@ -1610,7 +2675,7 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, /* * Now check for add permissions */ - error = zfs_zaccess(tdzp, add_perm, cr); + error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c index c8450d488bdb..b6c43f4245f2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,7 +32,7 @@ #include <sys/zfs_acl.h> void -zfs_ace_byteswap(ace_t *ace, int ace_cnt) +zfs_oldace_byteswap(ace_t *ace, int ace_cnt) { int i; @@ -45,9 +44,78 @@ zfs_ace_byteswap(ace_t *ace, int ace_cnt) } } +/* + * swap ace_t and ace_object_t + */ +void +zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) +{ +#ifdef TODO + caddr_t end; + caddr_t ptr; + zfs_ace_t *zacep; + ace_t *acep; + uint16_t entry_type; + size_t entry_size; + int ace_type; + + end = (caddr_t)buf + size; + ptr = buf; + + while (ptr < end) { + if (zfs_layout) { + zacep = (zfs_ace_t *)ptr; + zacep->z_hdr.z_access_mask = + BSWAP_32(zacep->z_hdr.z_access_mask); + zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags); + ace_type = zacep->z_hdr.z_type = + BSWAP_16(zacep->z_hdr.z_type); + entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS; + } else { + acep = (ace_t *)ptr; + acep->a_access_mask = BSWAP_32(acep->a_access_mask); + acep->a_flags = BSWAP_16(acep->a_flags); + ace_type = acep->a_type = BSWAP_16(acep->a_type); + acep->a_who = BSWAP_32(acep->a_who); + entry_type = acep->a_flags & ACE_TYPE_FLAGS; + } + switch (entry_type) { + case ACE_OWNER: + case ACE_EVERYONE: + case (ACE_IDENTIFIER_GROUP | ACE_GROUP): + entry_size = zfs_layout ? + sizeof (zfs_ace_hdr_t) : sizeof (ace_t); + break; + case ACE_IDENTIFIER_GROUP: + default: + if (zfs_layout) { + zacep->z_fuid = BSWAP_64(zacep->z_fuid); + } + switch (ace_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + entry_size = zfs_layout ? + sizeof (zfs_object_ace_t) : + sizeof (ace_object_t); + break; + default: + entry_size = zfs_layout ? 
sizeof (zfs_ace_t) : + sizeof (ace_t); + break; + } + } + ptr = ptr + entry_size; + } +#else /* TODO */ + panic("%s:%u: TODO", __func__, __LINE__); +#endif /* TODO */ +} + /* ARGSUSED */ void -zfs_acl_byteswap(void *buf, size_t size) +zfs_oldacl_byteswap(void *buf, size_t size) { int cnt; @@ -58,7 +126,14 @@ cnt = size / sizeof (ace_t); - zfs_ace_byteswap((ace_t *)buf, cnt); + zfs_oldace_byteswap((ace_t *)buf, cnt); +} + +/* ARGSUSED */ +void +zfs_acl_byteswap(void *buf, size_t size) +{ + zfs_ace_byteswap(buf, size, B_TRUE); } void @@ -86,14 +161,19 @@ zfs_znode_byteswap(void *buf, size_t size) zp->zp_flags = BSWAP_64(zp->zp_flags); zp->zp_uid = BSWAP_64(zp->zp_uid); zp->zp_gid = BSWAP_64(zp->zp_gid); + zp->zp_zap = BSWAP_64(zp->zp_zap); zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]); zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]); zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]); - zp->zp_pad[3] = BSWAP_64(zp->zp_pad[3]); zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj); - zp->zp_acl.z_acl_count = BSWAP_32(zp->zp_acl.z_acl_count); + zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size); zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version); - zp->zp_acl.z_acl_pad = BSWAP_16(zp->zp_acl.z_acl_pad); - zfs_ace_byteswap(&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT); + zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count); + if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) { + zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0], + ZFS_ACE_SPACE); + } else + zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0], + ACE_SLOT_CNT); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c index 286fe97e1142..654d2f949b3f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -53,6 +53,17 @@ * reliable way to auto-unmount the filesystem when it's "no longer in use". * When the user unmounts a filesystem, we call zfsctl_unmount(), which * unmounts any snapshots within the snapshot directory. + * + * The '.zfs', '.zfs/snapshot', and all directories created under + * '.zfs/snapshot' (i.e., '.zfs/snapshot/<snapname>') are all GFS nodes and + * share the same vfs_t as the head filesystem (what '.zfs' lives under). + * + * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>' + * (i.e., snapshots) are ZFS nodes and have their own unique vfs_t. + * However, vnodes within these mounted-on file systems have their v_vfsp + * fields set to the head filesystem to make NFS happy (see + * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t + * so that it cannot be freed until all snapshots have been unmounted. 
*/ #include <sys/zfs_context.h> @@ -63,7 +74,23 @@ #include <sys/gfs.h> #include <sys/stat.h> #include <sys/dmu.h> +#include <sys/dsl_deleg.h> #include <sys/mount.h> +#include <sys/sunddi.h> + +#include "zfs_namecheck.h" + +typedef struct zfsctl_node { + gfs_dir_t zc_gfs_private; + uint64_t zc_id; + timestruc_t zc_cmtime; /* ctime and mtime, always the same */ +} zfsctl_node_t; + +typedef struct zfsctl_snapdir { + zfsctl_node_t sd_node; + kmutex_t sd_lock; + avl_tree_t sd_snaps; +} zfsctl_snapdir_t; typedef struct { char *se_name; @@ -92,18 +119,7 @@ static struct vop_vector zfsctl_ops_snapshot; static vnode_t *zfsctl_mknode_snapdir(vnode_t *); static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); - -typedef struct zfsctl_node { - gfs_dir_t zc_gfs_private; - uint64_t zc_id; - timestruc_t zc_cmtime; /* ctime and mtime, always the same */ -} zfsctl_node_t; - -typedef struct zfsctl_snapdir { - zfsctl_node_t sd_node; - kmutex_t sd_lock; - avl_tree_t sd_snaps; -} zfsctl_snapdir_t; +static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *); /* * Root directory elements. We have only a single static entry, 'snapshot'. @@ -237,14 +253,14 @@ static int zfsctl_common_access(ap) struct vop_access_args /* { struct vnode *a_vp; - accmode_t a_accmode; + int a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { - accmode_t accmode = ap->a_accmode; + int mode = ap->a_accmode; - if (accmode & VWRITE) + if (mode & VWRITE) return (EACCES); return (0); @@ -283,6 +299,7 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) vap->va_flags = 0; } +/*ARGSUSED*/ static int zfsctl_common_fid(ap) struct vop_fid_args /* { @@ -360,6 +377,7 @@ zfsctl_root_getattr(ap) struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; + struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; @@ -382,11 +400,18 @@ zfsctl_root_getattr(ap) /* ARGSUSED */ int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, - int flags, vnode_t *rdir, cred_t *cr) + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; int err; + /* + * No extended attributes allowed under .zfs + */ + if (flags & LOOKUP_XATTR) + return (EINVAL); + ZFS_ENTER(zfsvfs); if (strcmp(nm, "..") == 0) { @@ -394,7 +419,8 @@ zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, if (err == 0) VOP_UNLOCK(*vpp, 0); } else { - err = gfs_dir_lookup(dvp, nm, vpp); + err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp); } ZFS_EXIT(zfsvfs); @@ -407,7 +433,7 @@ zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, */ /* ARGSUSED */ int -zfsctl_root_lookup_vop(ap) +zfsctl_freebsd_root_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; @@ -428,7 +454,7 @@ zfsctl_root_lookup_vop(ap) ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); - err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr); + err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr, NULL, NULL, NULL); if (err == 0 && (nm[0] != '.' 
|| nm[1] != '\0')) vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); @@ -443,7 +469,7 @@ static struct vop_vector zfsctl_ops_root = { .vop_getattr = zfsctl_root_getattr, .vop_access = zfsctl_common_access, .vop_readdir = gfs_vop_readdir, - .vop_lookup = zfsctl_root_lookup_vop, + .vop_lookup = zfsctl_freebsd_root_lookup, .vop_inactive = gfs_vop_inactive, .vop_reclaim = zfsctl_common_reclaim, .vop_fid = zfsctl_common_fid, @@ -454,6 +480,8 @@ zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) { objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + if (snapshot_namecheck(name, NULL, NULL) != 0) + return (EILSEQ); dmu_objset_name(os, zname); if (strlen(zname) + 1 + strlen(name) >= len) return (ENAMETOOLONG); @@ -463,38 +491,18 @@ zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) } static int -zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr) +zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr) { - zfsctl_snapdir_t *sdp = dvp->v_data; - zfs_snapentry_t search, *sep; - struct vop_inactive_args ap; - avl_index_t where; - int err; - - ASSERT(MUTEX_HELD(&sdp->sd_lock)); - - search.se_name = (char *)name; - if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) - return (ENOENT); + vnode_t *svp = sep->se_root; + int error; - ASSERT(vn_ismntpt(sep->se_root)); + ASSERT(vn_ismntpt(svp)); /* this will be dropped by dounmount() */ - if ((err = vn_vfswlock(sep->se_root)) != 0) - return (err); - - err = dounmount(vn_mountedvfs(sep->se_root), force, curthread); - if (err) - return (err); - ASSERT(sep->se_root->v_count == 1); - ap.a_vp = sep->se_root; - gfs_vop_inactive(&ap); - - avl_remove(&sdp->sd_snaps, sep); - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - kmem_free(sep, sizeof (zfs_snapentry_t)); + if ((error = vn_vfswlock(svp)) != 0) + return (error); - return (0); + return (dounmount(vn_mountedvfs(svp), fflags, curthread)); } #if 0 @@ -553,20 +561,40 @@ zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) #endif #if 0 +/*ARGSUSED*/ static int zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, - cred_t *cr) + cred_t *cr, caller_context_t *ct, int flags) { zfsctl_snapdir_t *sdp = sdvp->v_data; zfs_snapentry_t search, *sep; + zfsvfs_t *zfsvfs; avl_index_t where; char from[MAXNAMELEN], to[MAXNAMELEN]; + char real[MAXNAMELEN]; int err; + zfsvfs = sdvp->v_vfsp->vfs_data; + ZFS_ENTER(zfsvfs); + + if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + err = dmu_snapshot_realname(zfsvfs->z_os, snm, real, + MAXNAMELEN, NULL); + if (err == 0) { + snm = real; + } else if (err != ENOTSUP) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + ZFS_EXIT(zfsvfs); + err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from); - if (err) - return (err); - err = zfs_secpolicy_write(from, cr); + if (!err) + err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to); + if (!err) + err = zfs_secpolicy_rename_perms(from, to, cr); if (err) return (err); @@ -579,10 +607,6 @@ zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, if (strcmp(snm, tnm) == 0) return (0); - err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to); - if (err) - return (err); - mutex_enter(&sdp->sd_lock); search.se_name = (char *)snm; @@ -604,29 +628,55 @@ zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, #if 0 /* ARGSUSED */ static int -zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) +zfsctl_snapdir_remove(vnode_t *dvp, char 
*name, vnode_t *cwd, cred_t *cr, + caller_context_t *ct, int flags) { zfsctl_snapdir_t *sdp = dvp->v_data; + zfs_snapentry_t *sep; + zfs_snapentry_t search; + zfsvfs_t *zfsvfs; char snapname[MAXNAMELEN]; + char real[MAXNAMELEN]; int err; + zfsvfs = dvp->v_vfsp->vfs_data; + ZFS_ENTER(zfsvfs); + + if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + + err = dmu_snapshot_realname(zfsvfs->z_os, name, real, + MAXNAMELEN, NULL); + if (err == 0) { + name = real; + } else if (err != ENOTSUP) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + ZFS_EXIT(zfsvfs); + err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname); - if (err) - return (err); - err = zfs_secpolicy_write(snapname, cr); + if (!err) + err = zfs_secpolicy_destroy_perms(snapname, cr); if (err) return (err); mutex_enter(&sdp->sd_lock); - err = zfsctl_unmount_snap(dvp, name, 0, cr); - if (err) { - mutex_exit(&sdp->sd_lock); - return (err); + search.se_name = name; + sep = avl_find(&sdp->sd_snaps, &search, NULL); + if (sep) { + avl_remove(&sdp->sd_snaps, sep); + err = zfsctl_unmount_snap(sep, MS_FORCE, cr); + if (err) + avl_add(&sdp->sd_snaps, sep); + else + err = dmu_objset_destroy(snapname); + } else { + err = ENOENT; } - err = dmu_objset_destroy(snapname); - mutex_exit(&sdp->sd_lock); return (err); @@ -634,6 +684,57 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) #endif /* + * This creates a snapshot under '.zfs/snapshot'. + */ +/* ARGSUSED */ +static int +zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, + cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp) +{ + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + char name[MAXNAMELEN]; + int err; + static enum symfollow follow = NO_FOLLOW; + static enum uio_seg seg = UIO_SYSSPACE; + + if (snapshot_namecheck(dirname, NULL, NULL) != 0) + return (EILSEQ); + + dmu_objset_name(zfsvfs->z_os, name); + + *vpp = NULL; + + err = zfs_secpolicy_snapshot_perms(name, cr); + if (err) + return (err); + + if (err == 0) { + err = dmu_objset_snapshot(name, dirname, B_FALSE); + if (err) + return (err); + err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); + } + + return (err); +} + +static int +zfsctl_freebsd_snapdir_mkdir(ap) + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + + ASSERT(ap->a_cnp->cn_flags & SAVENAME); + + return (zfsctl_snapdir_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, NULL, + ap->a_vpp, ap->a_cnp->cn_cred, NULL, 0, NULL)); +} + +/* * Lookup entry point for the 'snapshot' directory. Try to open the * snapshot if it exists, creating the pseudo filesystem vnode as necessary. * Perform a mount of the associated dataset on top of the vnode. 
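As background for the lookup path in the next hunk: zfsctl_snapshot_zname(), modified earlier in this file, builds the full dataset name by joining the objset name and the snapshot component with '@', rejecting names that fail snapshot_namecheck() or overflow the buffer. A minimal sketch of that construction, assuming only standard C (the helper name is illustrative, not part of the patch):

#include <errno.h>
#include <stdio.h>
#include <string.h>

/*
 * Sketch: form "dataset@snapshot" the way zfsctl_snapshot_zname()
 * does, returning ENAMETOOLONG when the combined name cannot fit,
 * mirroring the length check in the patched function.
 */
static int
snapshot_zname_sketch(const char *dataset, const char *snap,
    char *zname, size_t len)
{
	if (strlen(dataset) + 1 + strlen(snap) >= len)
		return (ENAMETOOLONG);
	(void) snprintf(zname, len, "%s@%s", dataset, snap);
	return (0);
}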
@@ -649,17 +750,25 @@ zfsctl_snapdir_lookup(ap) { vnode_t *dvp = ap->a_dvp; vnode_t **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; zfsctl_snapdir_t *sdp = dvp->v_data; objset_t *snap; char snapname[MAXNAMELEN]; + char real[MAXNAMELEN]; char *mountpoint; zfs_snapentry_t *sep, search; size_t mountpoint_len; avl_index_t where; zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; int err; + int flags = 0; + /* + * No extended attributes allowed under .zfs + */ + if (flags & LOOKUP_XATTR) + return (EINVAL); ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); @@ -681,6 +790,26 @@ zfsctl_snapdir_lookup(ap) ZFS_ENTER(zfsvfs); + if (flags & FIGNORECASE) { + boolean_t conflict = B_FALSE; + + err = dmu_snapshot_realname(zfsvfs->z_os, nm, real, + MAXNAMELEN, &conflict); + if (err == 0) { + strlcpy(nm, real, sizeof(nm)); + } else if (err != ENOTSUP) { + ZFS_EXIT(zfsvfs); + return (err); + } +#if 0 + if (realpnp) + (void) strlcpy(realpnp->pn_buf, nm, + realpnp->pn_bufsize); + if (conflict && direntflags) + *direntflags = ED_CASE_CONFLICT; +#endif + } + mutex_enter(&sdp->sd_lock); search.se_name = (char *)nm; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { @@ -692,6 +821,13 @@ zfsctl_snapdir_lookup(ap) * try to remount it. */ goto domount; + } else { + /* + * VROOT was set during the traverse call. We need + * to clear it since we're pretending to be part + * of our parent's vfs. + */ + (*vpp)->v_flag &= ~VROOT; } vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); mutex_exit(&sdp->sd_lock); @@ -706,13 +842,25 @@ zfsctl_snapdir_lookup(ap) if (err) { mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); - return (err); + /* + * Handle "ls *" or "?" in a graceful manner by forcing + * EILSEQ to ENOENT, since the shell ultimately passes + * "*" or "?" as the name to look up. + */ + return (err == EILSEQ ? ENOENT : err); } if (dmu_objset_open(snapname, DMU_OST_ZFS, - DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) { + DS_MODE_USER | DS_MODE_READONLY, &snap) != 0) { mutex_exit(&sdp->sd_lock); + /* Translate errors and add SAVENAME when needed. */ + if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE) { + err = EJUSTRETURN; + cnp->cn_flags |= SAVENAME; + } else { + err = ENOENT; + } ZFS_EXIT(zfsvfs); - return (ENOENT); + return (err); } sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP); @@ -735,7 +883,6 @@ domount: if (err == 0) vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); mutex_exit(&sdp->sd_lock); - /* * If we had an error, drop our hold on the vnode and * zfsctl_snapshot_inactive() will clean up. 
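The lookup above locates mounted snapshots by name via avl_find() on sd_snaps, keyed by search.se_name. The tree's comparator lies outside these hunks; a plausible sketch of it, assuming the zfs_snapentry_t layout declared near the top of this file (the _sketch names and stand-in type are illustrative):

#include <string.h>

typedef struct {
	char *se_name;	/* stand-in for zfs_snapentry_t's key field */
} snapentry_sketch_t;

/* AVL comparators must collapse strcmp() results to -1/0/+1. */
static int
snapentry_compare_sketch(const void *a, const void *b)
{
	const snapentry_sketch_t *sa = a;
	const snapentry_sketch_t *sb = b;
	int ret = strcmp(sa->se_name, sb->se_name);

	if (ret < 0)
		return (-1);
	if (ret > 0)
		return (1);
	return (0);
}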
@@ -750,25 +897,41 @@ domount: /* ARGSUSED */ static int -zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, - offset_t *offp, offset_t *nextp, void *data) +zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, + offset_t *offp, offset_t *nextp, void *data, int flags) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; char snapname[MAXNAMELEN]; uint64_t id, cookie; + boolean_t case_conflict; + int error; ZFS_ENTER(zfsvfs); cookie = *offp; - if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, - &cookie) == ENOENT) { - *eofp = 1; + error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, + &cookie, &case_conflict); + if (error) { ZFS_EXIT(zfsvfs); - return (0); + if (error == ENOENT) { + *eofp = 1; + return (0); + } + return (error); } - (void) strcpy(dp->d_name, snapname); - dp->d_ino = ZFSCTL_INO_SNAP(id); + if (flags & V_RDDIR_ENTFLAGS) { + edirent_t *eodp = dp; + + (void) strcpy(eodp->ed_name, snapname); + eodp->ed_ino = ZFSCTL_INO_SNAP(id); + eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0; + } else { + struct dirent64 *odp = dp; + + (void) strcpy(odp->d_name, snapname); + odp->d_ino = ZFSCTL_INO_SNAP(id); + } *nextp = cookie; ZFS_EXIT(zfsvfs); @@ -776,6 +939,13 @@ zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, return (0); } +/* + * pvp is the '.zfs' directory (zfsctl_node_t). + * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). + * + * This function is the callback to create a GFS vnode for '.zfs/snapshot' + * when a lookup is performed on .zfs for "snapshot". + */ vnode_t * zfsctl_mknode_snapdir(vnode_t *pvp) { @@ -802,6 +972,7 @@ zfsctl_snapdir_getattr(ap) struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; + struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; @@ -847,6 +1018,7 @@ static struct vop_vector zfsctl_ops_snapdir = { .vop_ioctl = VOP_EINVAL, .vop_getattr = zfsctl_snapdir_getattr, .vop_access = zfsctl_common_access, + .vop_mkdir = zfsctl_freebsd_snapdir_mkdir, .vop_readdir = gfs_vop_readdir, .vop_lookup = zfsctl_snapdir_lookup, .vop_inactive = zfsctl_snapdir_inactive, @@ -854,6 +1026,13 @@ static struct vop_vector zfsctl_ops_snapdir = { .vop_fid = zfsctl_common_fid, }; +/* + * pvp is the GFS vnode '.zfs/snapshot'. + * + * This creates a GFS node under '.zfs/snapshot' representing each + * snapshot. This newly created GFS node is what we mount snapshot + * vfs_t's on top of. + */ static vnode_t * zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) { @@ -862,8 +1041,10 @@ zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp, &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); + VN_HOLD(vp); zcp = vp->v_data; zcp->zc_id = objset; + VFS_HOLD(vp->v_vfsp); VOP_UNLOCK(vp, 0); return (vp); @@ -877,13 +1058,14 @@ zfsctl_snapshot_inactive(ap) } */ *ap; { vnode_t *vp = ap->a_vp; + cred_t *cr = ap->a_td->td_ucred; struct vop_inactive_args iap; zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; int locked; vnode_t *dvp; - VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0); + VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0); sdp = dvp->v_data; VOP_UNLOCK(dvp, 0); @@ -914,6 +1096,7 @@ zfsctl_snapshot_inactive(ap) if (!locked) mutex_exit(&sdp->sd_lock); VN_RELE(dvp); + VFS_RELE(vp->v_vfsp); /* * Dispose of the vnode for the snapshot mount point. @@ -931,7 +1114,6 @@ zfsctl_traverse_begin(vnode_t **vpp, int lktype) { VN_HOLD(*vpp); - /* Snapshot should be already mounted, but just in case. 
*/ if (vn_mountedvfs(*vpp) == NULL) return (ENOENT); @@ -983,6 +1165,36 @@ zfsctl_snapshot_fid(ap) return (err); } +static int +zfsctl_snapshot_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + cred_t *cr = ap->a_cnp->cn_cred; + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + int error; + + if (cnp->cn_namelen != 2 || cnp->cn_nameptr[0] != '.' || + cnp->cn_nameptr[1] != '.') { + return (ENOENT); + } + + ASSERT(dvp->v_type == VDIR); + ASSERT(zfsvfs->z_ctldir != NULL); + + error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", vpp, + NULL, 0, NULL, cr, NULL, NULL, NULL); + if (error == 0) + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); + return (error); +} + /* * These VP's should never see the light of day. They should always * be covered. @@ -990,6 +1202,7 @@ zfsctl_snapshot_fid(ap) static struct vop_vector zfsctl_ops_snapshot = { .vop_default = &default_vnodeops, .vop_inactive = zfsctl_snapshot_inactive, + .vop_lookup = zfsctl_snapshot_lookup, .vop_reclaim = zfsctl_common_reclaim, .vop_getattr = zfsctl_snapshot_getattr, .vop_fid = zfsctl_snapshot_fid, @@ -1007,7 +1220,7 @@ zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, - NULL, 0, NULL, kcred); + NULL, 0, NULL, kcred, NULL, NULL, NULL); if (error != 0) return (error); sdp = dvp->v_data; @@ -1025,6 +1238,12 @@ zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) if (sep != NULL) { VN_HOLD(vp); + /* + * Return the mounted root rather than the covered mount point. + * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid> + * and returns the ZFS vnode mounted on top of the GFS node. + * This ZFS vnode is the root of the vfs for objset 'objsetid'. + */ error = traverse(&vp, LK_SHARED | LK_RETRY); if (error == 0) { if (vp == sep->se_root) @@ -1055,16 +1274,15 @@ zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) int zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) { - struct vop_inactive_args ap; zfsvfs_t *zfsvfs = vfsp->vfs_data; - vnode_t *dvp, *svp; + vnode_t *dvp; zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; int error; ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, - NULL, 0, NULL, cr); + NULL, 0, NULL, cr, NULL, NULL, NULL); if (error != 0) return (error); sdp = dvp->v_data; @@ -1073,7 +1291,6 @@ zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { - svp = sep->se_root; next = AVL_NEXT(&sdp->sd_snaps, sep); /* @@ -1081,40 +1298,16 @@ zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) * have just been unmounted by somebody else, and * will be cleaned up by zfsctl_snapdir_inactive(). */ - if (vn_ismntpt(svp)) { - if ((error = vn_vfswlock(svp)) != 0) - goto out; - - /* - * Increase usecount, so dounmount() won't vrele() it - * to 0 and call zfsctl_snapdir_inactive(). 
- */ - VN_HOLD(svp); - vfsp = vn_mountedvfs(svp); - mtx_lock(&Giant); - error = dounmount(vfsp, fflags, curthread); - mtx_unlock(&Giant); - if (error != 0) { - VN_RELE(svp); - goto out; + if (vn_ismntpt(sep->se_root)) { + error = zfsctl_unmount_snap(sep, fflags, cr); + if (error) { + avl_add(&sdp->sd_snaps, sep); + break; } - - avl_remove(&sdp->sd_snaps, sep); - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - kmem_free(sep, sizeof (zfs_snapentry_t)); - - /* - * We can't use VN_RELE(), as that will try to - * invoke zfsctl_snapdir_inactive(), and that - * would lead to an attempt to re-grab the sd_lock. - */ - ASSERT3U(svp->v_count, ==, 1); - ap.a_vp = svp; - gfs_vop_inactive(&ap); } sep = next; } -out: + mutex_exit(&sdp->sd_lock); VN_RELE(dvp); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c index f233b8f61e8e..45ec88b7b735 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -40,6 +38,7 @@ #include <sys/errno.h> #include <sys/stat.h> #include <sys/unistd.h> +#include <sys/sunddi.h> #include <sys/random.h> #include <sys/policy.h> #include <sys/kcondvar.h> @@ -52,7 +51,50 @@ #include <sys/dmu.h> #include <sys/atomic.h> #include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> #include <sys/dnlc.h> +#include <sys/extdirent.h> + +/* + * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups + * of names after deciding which is the appropriate lookup interface. + */ +static int +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact, + boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) +{ + int error; + + if (zfsvfs->z_norm) { + matchtype_t mt = MT_FIRST; + boolean_t conflict = B_FALSE; + size_t bufsz = 0; + char *buf = NULL; + + if (rpnp) { + buf = rpnp->pn_buf; + bufsz = rpnp->pn_bufsize; + } + if (exact) + mt = MT_EXACT; + /* + * In the non-mixed case we only expect there would ever + * be one match, but we need to use the normalizing lookup. + */ + error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, + zoid, mt, buf, bufsz, &conflict); + if (!error && deflags) + *deflags = conflict ? ED_CASE_CONFLICT : 0; + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); + } + *zoid = ZFS_DIRENT_OBJ(*zoid); + + if (error == ENOENT && update) + dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE); + + return (error); +} /* * Lock a directory entry. A dirlock on <dzp, name> protects that name @@ -67,24 +109,38 @@ * ZEXISTS: if the entry does not exist, fail with ENOENT. * ZSHARED: allow concurrent access with other ZSHARED callers. * ZXATTR: we want dzp's xattr directory + * ZCILOOK: On a mixed sensitivity file system, + * this lookup should be case-insensitive. + * ZCIEXACT: On a purely case-insensitive file system, + * this lookup should be case-sensitive. 
+ * ZRENAMING: we are locking for renaming, force narrow locks * * Output arguments: * zpp - pointer to the znode for the entry (NULL if there isn't one) * dlpp - pointer to the dirlock for this entry (NULL on error) + * direntflags - (case-insensitive lookup only) + * flags if multiple case-sensitive matches exist in directory + * realpnp - (case-insensitive lookup only) + * actual name matched within the directory * * Return value: 0 on success or errno on failure. * * NOTE: Always checks for, and rejects, '.' and '..'. + * NOTE: For case-insensitive file systems we take wide locks (see below), + * but return znode pointers to a single match. */ int zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - int flag) + int flag, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zfs_dirlock_t *dl; + boolean_t update; + boolean_t exact; uint64_t zoid; - int error; - vnode_t *vp; + vnode_t *vp = NULL; + int error = 0; + int cmpflags; *zpp = NULL; *dlpp = NULL; @@ -98,6 +154,59 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, return (EEXIST); /* + * Case sensitivity and normalization preferences are set when + * the file system is created. These are stored in the + * zfsvfs->z_case and zfsvfs->z_norm fields. These choices + * affect what vnodes can be cached in the DNLC, how we + * perform zap lookups, and the "width" of our dirlocks. + * + * A normal dirlock locks a single name. Note that with + * normalization a name can be composed multiple ways, but + * when normalized, these names all compare equal. A wide + * dirlock locks multiple names. We need these when the file + * system is supporting mixed-mode access. It is sometimes + * necessary to lock all case permutations of file name at + * once so that simultaneous case-insensitive/case-sensitive + * behaves as rationally as possible. + */ + + /* + * Decide if exact matches should be requested when performing + * a zap lookup on file systems supporting case-insensitive + * access. + */ + exact = + ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) || + ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK)); + + /* + * Only look in or update the DNLC if we are looking for the + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name. + * + * Maybe can add TO-UPPERed version of name to dnlc in ci-only + * case for performance improvement? + */ + update = !zfsvfs->z_norm || + ((zfsvfs->z_case == ZFS_CASE_MIXED) && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); + + /* + * ZRENAMING indicates we are in a situation where we should + * take narrow locks regardless of the file system's + * preferences for normalizing and case folding. This will + * prevent us deadlocking trying to grab the same wide lock + * twice if the two names happen to be case-insensitive + * matches. + */ + if (flag & ZRENAMING) + cmpflags = 0; + else + cmpflags = zfsvfs->z_norm; + + /* * Wait until there are no locks on this name. 
*/ rw_enter(&dzp->z_name_lock, RW_READER); @@ -108,9 +217,16 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, rw_exit(&dzp->z_name_lock); return (ENOENT); } - for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) - if (strcmp(name, dl->dl_name) == 0) + for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { + if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, + U8_UNICODE_LATEST, &error) == 0) || error != 0) break; + } + if (error != 0) { + mutex_exit(&dzp->z_lock); + rw_exit(&dzp->z_name_lock); + return (ENOENT); + } if (dl == NULL) { /* * Allocate a new dirlock and add it to the list. @@ -156,7 +272,8 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, zoid = dzp->z_phys->zp_xattr; error = (zoid == 0 ? ENOENT : 0); } else { - vp = dnlc_lookup(ZTOV(dzp), name); + if (update) + vp = dnlc_lookup(ZTOV(dzp), name); if (vp == DNLC_NO_VNODE) { VN_RELE(vp); error = ENOENT; @@ -170,11 +287,8 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, *zpp = VTOZ(vp); return (0); } else { - error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, - 8, 1, &zoid); - zoid = ZFS_DIRENT_OBJ(zoid); - if (error == ENOENT) - dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE); + error = zfs_match_find(zfsvfs, dzp, name, exact, + update, direntflags, realpnp, &zoid); } } if (error) { @@ -192,7 +306,7 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, zfs_dirent_unlock(dl); return (error); } - if (!(flag & ZXATTR)) + if (!(flag & ZXATTR) && update) dnlc_update(ZTOV(dzp), name, ZTOV(*zpp)); } @@ -239,7 +353,8 @@ zfs_dirent_unlock(zfs_dirlock_t *dl) * special pseudo-directory. */ int -zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp) +zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags, + int *deflg, pathname_t *rpnp) { zfs_dirlock_t *dl; znode_t *zp; @@ -257,7 +372,8 @@ zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp) if (dzp->z_phys->zp_parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, - "snapshot", vpp, NULL, 0, NULL, kcred); + "snapshot", vpp, NULL, 0, NULL, kcred, + NULL, NULL, NULL); return (error); } rw_enter(&dzp->z_parent_lock, RW_READER); @@ -268,30 +384,25 @@ zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp) } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { *vpp = zfsctl_root(dzp); } else { - error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED); + int zf; + + zf = ZEXISTS | ZSHARED; + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); if (error == 0) { *vpp = ZTOV(zp); zfs_dirent_unlock(dl); dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ } + rpnp = NULL; } - return (error); -} - -static char * -zfs_unlinked_hexname(char namebuf[17], uint64_t x) -{ - char *name = &namebuf[16]; - const char digits[16] = "0123456789abcdef"; - - *name = '\0'; - do { - *--name = digits[x & 0xf]; - x >>= 4; - } while (x != 0); + if ((flags & FIGNORECASE) && rpnp && !error) + (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); - return (name); + return (error); } /* @@ -312,15 +423,12 @@ void zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; - char obj_name[17]; - int error; ASSERT(zp->z_unlinked); ASSERT3U(zp->z_phys->zp_links, ==, 0); - error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj, - zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx); - ASSERT3U(error, ==, 0); + VERIFY3U(0, ==, + 
zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); } /* @@ -377,7 +485,9 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs) /* * Delete the entire contents of a directory. Return a count - * of the number of entries that could not be deleted. + * of the number of entries that could not be deleted. If we encounter + * an error, return a count of at least one so that the directory stays + * in the unlinked set. * * NOTE: this function assumes that the directory is inactive, * so there is no need to lock its entries before deletion. @@ -401,7 +511,10 @@ zfs_purgedir(znode_t *dzp) zap_cursor_advance(&zc)) { error = zfs_zget(zfsvfs, ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); - ASSERT3U(error, ==, 0); + if (error) { + skipped += 1; + continue; + } ASSERT((ZTOV(xzp)->v_type == VREG) || (ZTOV(xzp)->v_type == VLNK)); @@ -423,13 +536,15 @@ zfs_purgedir(znode_t *dzp) dl.dl_name = zap.za_name; error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); - ASSERT3U(error, ==, 0); + if (error) + skipped += 1; dmu_tx_commit(tx); VN_RELE(ZTOV(xzp)); } zap_cursor_fini(&zc); - ASSERT(error == ENOENT); + if (error != ENOENT) + skipped += 1; return (skipped); } @@ -439,7 +554,6 @@ zfs_rmnode(znode_t *zp) zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; znode_t *xzp = NULL; - char obj_name[17]; dmu_tx_t *tx; uint64_t acl_obj; int error; @@ -450,6 +564,24 @@ zfs_rmnode(znode_t *zp) ASSERT(zp->z_phys->zp_links == 0); /* + * If this is a ZIL replay then leave the object in the unlinked set. + * Otherwise we can get a deadlock, because the delete can be + * quite large and span multiple tx's and txgs, but each replay + * creates a tx to atomically run the replay function and mark the + * replay record as complete. We deadlock trying to start a tx in + * a new txg to further the deletion but can't because the replay + * tx hasn't finished. + * + * We actually delete the object if we get a failure to create an + * object in zil_replay_log_record(), or after calling zil_replay(). + */ + if (zfsvfs->z_assign >= TXG_INITIAL) { + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + + /* * If this is an attribute directory, purge its contents. */ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && @@ -457,14 +589,29 @@ zfs_rmnode(znode_t *zp) if (zfs_purgedir(zp) != 0) { /* * Not enough space to delete some xattrs. - * Leave it on the unlinked set. + * Leave it in the unlinked set. */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); VFS_UNLOCK_GIANT(vfslocked); return; } } /* + * Free up all the data in the file. + */ + error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); + if (error) { + /* + * Not enough space. Leave the file in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + + /* * If the file has extended attributes, we're going to unlink * the xattr dir. */ @@ -476,7 +623,7 @@ zfs_rmnode(znode_t *zp) acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; /* - * Set up the transaction. + * Set up the final transaction. */ tx = dmu_tx_create(os); dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); @@ -495,8 +642,9 @@ zfs_rmnode(znode_t *zp) * which point we'll call zfs_unlinked_drain() to process it). 
*/ dmu_tx_abort(tx); - VFS_UNLOCK_GIANT(vfslocked); - return; + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + goto out; } if (xzp) { @@ -509,19 +657,27 @@ zfs_rmnode(znode_t *zp) } /* Remove this znode from the unlinked set */ - error = zap_remove(os, zfsvfs->z_unlinkedobj, - zfs_unlinked_hexname(obj_name, zp->z_id), tx); - ASSERT3U(error, ==, 0); + VERIFY3U(0, ==, + zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); zfs_znode_delete(zp, tx); dmu_tx_commit(tx); - +out: if (xzp) VN_RELE(ZTOV(xzp)); VFS_UNLOCK_GIANT(vfslocked); } +static uint64_t +zfs_dirent(znode_t *zp) +{ + uint64_t de = zp->z_id; + if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) + de |= IFTODT((zp)->z_phys->zp_mode) << 60; + return (de); +} + /* * Link zp into dl. Can only fail if zp has been unlinked. */ @@ -558,10 +714,7 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); mutex_exit(&dzp->z_lock); - /* - * MacOS X will fill in the 4-bit object type here. - */ - value = ZFS_DIRENT_MAKE(IFTODT(zp->z_phys->zp_mode), zp->z_id); + value = zfs_dirent(zp); error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, 8, 1, &value, tx); ASSERT(error == 0); @@ -632,7 +785,20 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); mutex_exit(&dzp->z_lock); - error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, tx); + if (zp->z_zfsvfs->z_norm) { + if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && + (flag & ZCIEXACT)) || + ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) && + !(flag & ZCILOOK))) + error = zap_remove_norm(zp->z_zfsvfs->z_os, + dzp->z_id, dl->dl_name, MT_EXACT, tx); + else + error = zap_remove_norm(zp->z_zfsvfs->z_os, + dzp->z_id, dl->dl_name, MT_FIRST, tx); + } else { + error = zap_remove(zp->z_zfsvfs->z_os, + dzp->z_id, dl->dl_name, tx); + } ASSERT(error == 0); if (unlinkedp != NULL) @@ -660,17 +826,29 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_t *xzp; dmu_tx_t *tx; - uint64_t xoid; int error; + zfs_fuid_info_t *fuidp = NULL; *xvpp = NULL; - if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, cr)) + if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) return (error); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) @@ -678,13 +856,15 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) dmu_tx_abort(tx); return (error); } - zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0); - ASSERT(xzp->z_id == xoid); + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp); ASSERT(xzp->z_phys->zp_parent == zp->z_id); dmu_buf_will_dirty(zp->z_dbuf, tx); - zp->z_phys->zp_xattr = xoid; + zp->z_phys->zp_xattr = xzp->z_id; - (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, ""); + (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, + xzp, "", 
NULL, fuidp, vap); + if (fuidp) + zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); *xvpp = ZTOV(xzp); @@ -714,7 +894,7 @@ zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) vattr_t va; int error; top: - error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR); + error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); if (error) return (error); @@ -751,8 +931,7 @@ top: va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; va.va_type = VDIR; va.va_mode = S_IFDIR | S_ISVTX | 0777; - va.va_uid = (uid_t)zp->z_phys->zp_uid; - va.va_gid = (gid_t)zp->z_phys->zp_gid; + zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); error = zfs_make_xattrdir(zp, &va, xvpp, cr); zfs_dirent_unlock(dl); @@ -782,16 +961,23 @@ int zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) { uid_t uid; + uid_t downer; + uid_t fowner; + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ return (0); - if ((zdp->z_phys->zp_mode & S_ISVTX) == 0 || - (uid = crgetuid(cr)) == zdp->z_phys->zp_uid || - uid == zp->z_phys->zp_uid || + if ((zdp->z_phys->zp_mode & S_ISVTX) == 0) + return (0); + + downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER); + + if ((uid = crgetuid(cr)) == downer || uid == fowner || (ZTOV(zp)->v_type == VREG && - zfs_zaccess(zp, ACE_WRITE_DATA, cr) == 0)) + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) return (0); else - return (secpolicy_vnode_remove(cr)); + return (secpolicy_vnode_remove(ZTOV(zp), cr)); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c index e2385a0ba2c4..17e4b0a09c9b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/spa_impl.h> #include <sys/vdev.h> @@ -53,7 +51,7 @@ extern void devctl_notify(const char *__system, const char *__subsystem, * pool X * * If we are in a loading state, all errors are chained together by the same - * SPA-wide ENA. + * SPA-wide ENA (Error Numeric Association). * * For isolated I/O requests, we get the ENA from the zio_t. The propagation * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want @@ -90,11 +88,10 @@ extern void devctl_notify(const char *__system, const char *__subsystem, * We keep track of the ENA for a ZIO chain through the 'io_logical' member. * When a new logical I/O is issued, we set this to point to itself. Child I/Os * then inherit this pointer, so that when it is first set subsequent failures - * will use the same ENA. If a physical I/O is issued (by passing the - * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a - * unique ENA will be generated. For an aggregate I/O, this pointer is set to - * NULL, and no ereport will be generated (since it doesn't actually correspond - * to any particular device or piece of data). + * will use the same ENA. 
For vdev cache fill and queue aggregation I/O, + * this pointer is set to NULL, and no ereport will be generated (since it + * doesn't actually correspond to any particular device or piece of data, + * and the caller will always retry without caching or queueing anyway). */ void zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, @@ -104,6 +101,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, char buf[1024]; struct sbuf sb; struct timespec ts; + int state; /* * If we are doing a spa_tryimport(), ignore errors. @@ -120,21 +118,33 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, spa->spa_last_open_failed) return; - /* - * Ignore any errors from I/Os that we are going to retry anyway - we - * only generate errors from the final failure. - */ - if (zio && zio_should_retry(zio)) - return; + if (zio != NULL) { + /* + * If this is not a read or write zio, ignore the error. This + * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. + */ + if (zio->io_type != ZIO_TYPE_READ && + zio->io_type != ZIO_TYPE_WRITE) + return; - /* - * If this is not a read or write zio, ignore the error. This can occur - * if the DKIOCFLUSHWRITECACHE ioctl fails. - */ - if (zio && zio->io_type != ZIO_TYPE_READ && - zio->io_type != ZIO_TYPE_WRITE) - return; + /* + * Ignore any errors from speculative I/Os, as failure is an + * expected result. + */ + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + return; + /* + * If the vdev has already been marked as failing due to a + * failed probe, then ignore any subsequent I/O errors, as the + * DE will automatically fault the vdev on the first such + * failure. + */ + if (vd != NULL && + (!vdev_readable(vd) || !vdev_writeable(vd)) && + strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0) + return; + } nanotime(&ts); sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); @@ -187,22 +197,28 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, */ /* + * If we are importing a faulted pool, then we treat it like an open, + * not an import. Otherwise, the DE will ignore all faults during + * import, since the default behavior is to mark the devices as + * persistently unavailable, not leave them in the faulted state. + */ + state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state; + + /* * Generic payload members common to all ereports. - * - * The direct reference to spa_name is used rather than spa_name() - * because of the asynchronous nature of the zio pipeline. spa_name() - * asserts that the config lock is held in some form. This is always - * the case in I/O context, but because the check for RW_WRITER compares - * against 'curthread', we may be in an asynchronous context and blow - * this assert. Rather than loosen this assert, we acknowledge that all - * contexts in which this function is called (pool open, I/O) are safe, - * and dereference the name directly. */ - sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa->spa_name); + sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)); sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)); - sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, - spa->spa_load_state); + sbuf_printf(&sb, " %s=%d", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, state); + + if (spa != NULL) { + sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, + spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? + FM_EREPORT_FAILMODE_WAIT : + spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? 
+ FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC); + } if (vd != NULL) { vdev_t *pvd = vd->vdev_parent; @@ -290,7 +306,6 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, mutex_exit(&spa->spa_errlist_lock); sbuf_finish(&sb); - ZFS_LOG(1, "%s", sbuf_data(&sb)); devctl_notify("ZFS", spa->spa_name, subclass, sbuf_data(&sb)); if (sbuf_overflowed(&sb)) printf("ZFS WARNING: sbuf overflowed\n"); @@ -298,13 +313,8 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, #endif } -/* - * The 'resource.fs.zfs.ok' event is an internal signal that the associated - * resource (pool or disk) has been identified by ZFS as healthy. This will - * then trigger the DE to close the associated case, if any. - */ -void -zfs_post_ok(spa_t *spa, vdev_t *vd) +static void +zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) { #ifdef _KERNEL char buf[1024]; @@ -318,7 +328,7 @@ zfs_post_ok(spa_t *spa, vdev_t *vd) sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec); snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE, - ZFS_ERROR_CLASS, FM_RESOURCE_OK); + ZFS_ERROR_CLASS, name); sbuf_printf(&sb, " %s=%hhu", FM_VERSION, FM_RSRC_VERSION); sbuf_printf(&sb, " %s=%s", FM_CLASS, class); sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, @@ -327,9 +337,33 @@ zfs_post_ok(spa_t *spa, vdev_t *vd) sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid); sbuf_finish(&sb); + ZFS_LOG(1, "%s", sbuf_data(&sb)); devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb)); if (sbuf_overflowed(&sb)) printf("ZFS WARNING: sbuf overflowed\n"); sbuf_delete(&sb); #endif } + +/* + * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev + * has been removed from the system. This will cause the DE to ignore any + * recent I/O errors, inferring that they are due to the asynchronous device + * removal. + */ +void +zfs_post_remove(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_REMOVED); +} + +/* + * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool + * has the 'autoreplace' property set, and therefore any broken vdevs will be + * handled by higher level logic, and no vdev fault should be generated. + */ +void +zfs_post_autoreplace(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c new file mode 100644 index 000000000000..dfec3ed903bc --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c @@ -0,0 +1,716 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. 
All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/zfs_context.h> +#include <sys/sunddi.h> +#include <sys/dmu.h> +#include <sys/avl.h> +#include <sys/zap.h> +#include <sys/refcount.h> +#include <sys/nvpair.h> +#ifdef _KERNEL +#include <sys/kidmap.h> +#include <sys/sid.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> +#endif +#include <sys/zfs_fuid.h> + +/* + * FUID Domain table(s). + * + * The FUID table is stored as a packed nvlist of an array + * of nvlists which contain an index, domain string and offset + * + * During file system initialization the nvlist(s) are read and + * two AVL trees are created. One tree is keyed by the index number + * and the other by the domain string. Nodes are never removed from + * trees, but new entries may be added. If a new entry is added then the + * on-disk packed nvlist will also be updated. + */ + +#define FUID_IDX "fuid_idx" +#define FUID_DOMAIN "fuid_domain" +#define FUID_OFFSET "fuid_offset" +#define FUID_NVP_ARRAY "fuid_nvlist" + +typedef struct fuid_domain { + avl_node_t f_domnode; + avl_node_t f_idxnode; + ksiddomain_t *f_ksid; + uint64_t f_idx; +} fuid_domain_t; + +static char *nulldomain = ""; + +/* + * Compare two indexes. + */ +static int +idx_compare(const void *arg1, const void *arg2) +{ + const fuid_domain_t *node1 = arg1; + const fuid_domain_t *node2 = arg2; + + if (node1->f_idx < node2->f_idx) + return (-1); + else if (node1->f_idx > node2->f_idx) + return (1); + return (0); +} + +/* + * Compare two domain strings. + */ +static int +domain_compare(const void *arg1, const void *arg2) +{ + const fuid_domain_t *node1 = arg1; + const fuid_domain_t *node2 = arg2; + int val; + + val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); + if (val == 0) + return (0); + return (val > 0 ? 1 : -1); +} + +/* + * load initial fuid domain and idx trees. This function is used by + * both the kernel and zdb. 
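The on-disk shape described above is easier to see in code than in prose. The sketch below builds a one-entry FUID table nvlist of exactly that form; it is illustrative only (the SID string is invented, error handling is elided), not part of the patch:

/*
 * Construct a packed-nvlist FUID table: FUID_NVP_ARRAY is an array of
 * nvlists, each carrying { FUID_IDX, FUID_DOMAIN, FUID_OFFSET }.
 */
static nvlist_t *
fuid_table_sketch(void)
{
	nvlist_t *nvp, *entry;

	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_alloc(&entry, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(entry, FUID_IDX, 1) == 0);
	VERIFY(nvlist_add_string(entry, FUID_DOMAIN,
	    "S-1-5-21-300000-400000-500000") == 0);	/* made-up SID */
	VERIFY(nvlist_add_uint64(entry, FUID_OFFSET, 0) == 0);
	VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, &entry, 1) == 0);
	nvlist_free(entry);
	return (nvp);
}

Packing the result with nvlist_pack(..., NV_ENCODE_XDR, ...) yields the byte stream that zfs_fuid_table_load() below unpacks.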
+ */ +uint64_t +zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, + avl_tree_t *domain_tree) +{ + dmu_buf_t *db; + uint64_t fuid_size; + + avl_create(idx_tree, idx_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); + avl_create(domain_tree, domain_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); + + VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db)); + fuid_size = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + if (fuid_size) { + nvlist_t **fuidnvp; + nvlist_t *nvp = NULL; + uint_t count; + char *packed; + int i; + + packed = kmem_alloc(fuid_size, KM_SLEEP); + VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0); + VERIFY(nvlist_unpack(packed, fuid_size, + &nvp, 0) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, + &fuidnvp, &count) == 0); + + for (i = 0; i != count; i++) { + fuid_domain_t *domnode; + char *domain; + uint64_t idx; + + VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, + &domain) == 0); + VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX, + &idx) == 0); + + domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); + + domnode->f_idx = idx; + domnode->f_ksid = ksid_lookupdomain(domain); + avl_add(idx_tree, domnode); + avl_add(domain_tree, domnode); + } + nvlist_free(nvp); + kmem_free(packed, fuid_size); + } + return (fuid_size); +} + +void +zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree) +{ + fuid_domain_t *domnode; + void *cookie; + + cookie = NULL; + while (domnode = avl_destroy_nodes(domain_tree, &cookie)) + ksiddomain_rele(domnode->f_ksid); + + avl_destroy(domain_tree); + cookie = NULL; + while (domnode = avl_destroy_nodes(idx_tree, &cookie)) + kmem_free(domnode, sizeof (fuid_domain_t)); + avl_destroy(idx_tree); +} + +char * +zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) +{ + fuid_domain_t searchnode, *findnode; + avl_index_t loc; + + searchnode.f_idx = idx; + + findnode = avl_find(idx_tree, &searchnode, &loc); + + return (findnode ? findnode->f_ksid->kd_name : nulldomain); +} + +#ifdef _KERNEL +/* + * Load the fuid table(s) into memory. + */ +static void +zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + int error = 0; + + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + + if (zfsvfs->z_fuid_loaded) { + rw_exit(&zfsvfs->z_fuid_lock); + return; + } + + if (zfsvfs->z_fuid_obj == 0) { + + /* first make sure we need to allocate object */ + + error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); + if (error == ENOENT && tx != NULL) { + zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, + DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, + sizeof (uint64_t), tx); + VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, sizeof (uint64_t), 1, + &zfsvfs->z_fuid_obj, tx) == 0); + } + } + + if (zfsvfs->z_fuid_obj != 0) { + zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os, + zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, + &zfsvfs->z_fuid_domain); + zfsvfs->z_fuid_loaded = B_TRUE; + } + + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * Query domain table for a given domain. + * + * If domain isn't found it is added to AVL trees and + * the results are pushed out to disk. + */ +int +zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, + dmu_tx_t *tx) +{ + fuid_domain_t searchnode, *findnode; + avl_index_t loc; + krw_t rw = RW_READER; + + /* + * If the dummy "nobody" domain then return an index of 0 + * to cause the created FUID to be a standard POSIX id + * for the user nobody. 
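The index handed out here lands in the top 32 bits of the resulting FUID, with the Windows RID in the low 32 bits; the FUID_ENCODE/FUID_INDEX/FUID_RID macros used throughout this file express that split. A worked example (the index and RID values are invented):

uint64_t fuid = FUID_ENCODE(3, 1105);	/* domain index 3, RID 1105 */

ASSERT3U(FUID_INDEX(fuid), ==, 3);	/* upper 32 bits: table index */
ASSERT3U(FUID_RID(fuid), ==, 1105);	/* lower 32 bits: Windows RID */
ASSERT3U(FUID_INDEX(1001ULL), ==, 0);	/* index 0: plain POSIX id */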
+ */ + if (domain[0] == '\0') { + *retdomain = nulldomain; + return (0); + } + + searchnode.f_ksid = ksid_lookupdomain(domain); + if (retdomain) { + *retdomain = searchnode.f_ksid->kd_name; + } + if (!zfsvfs->z_fuid_loaded) + zfs_fuid_init(zfsvfs, tx); + +retry: + rw_enter(&zfsvfs->z_fuid_lock, rw); + findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc); + + if (findnode) { + rw_exit(&zfsvfs->z_fuid_lock); + ksiddomain_rele(searchnode.f_ksid); + return (findnode->f_idx); + } else { + fuid_domain_t *domnode; + nvlist_t *nvp; + nvlist_t **fuids; + uint64_t retidx; + size_t nvsize = 0; + char *packed; + dmu_buf_t *db; + int i = 0; + + if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) { + rw_exit(&zfsvfs->z_fuid_lock); + rw = RW_WRITER; + goto retry; + } + + domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); + domnode->f_ksid = searchnode.f_ksid; + + retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1; + + avl_add(&zfsvfs->z_fuid_domain, domnode); + avl_add(&zfsvfs->z_fuid_idx, domnode); + /* + * Now resync the on-disk nvlist. + */ + VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + domnode = avl_first(&zfsvfs->z_fuid_domain); + fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP); + while (domnode) { + VERIFY(nvlist_alloc(&fuids[i], + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, + domnode->f_idx) == 0); + VERIFY(nvlist_add_uint64(fuids[i], + FUID_OFFSET, 0) == 0); + VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN, + domnode->f_ksid->kd_name) == 0); + domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode); + } + VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, + fuids, retidx) == 0); + for (i = 0; i != retidx; i++) + nvlist_free(fuids[i]); + kmem_free(fuids, retidx * sizeof (void *)); + VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); + packed = kmem_alloc(nvsize, KM_SLEEP); + VERIFY(nvlist_pack(nvp, &packed, &nvsize, + NV_ENCODE_XDR, KM_SLEEP) == 0); + nvlist_free(nvp); + zfsvfs->z_fuid_size = nvsize; + dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, + zfsvfs->z_fuid_size, packed, tx); + kmem_free(packed, zfsvfs->z_fuid_size); + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, + FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; + dmu_buf_rele(db, FTAG); + + rw_exit(&zfsvfs->z_fuid_lock); + return (retidx); + } +} + +/* + * Query domain table by index, returning domain string + * + * Returns a pointer from an avl node of the domain string. 
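The retry: logic in zfs_fuid_find_by_domain() above is the usual reader-to-writer upgrade idiom. Distilled, with lookup_hit() and do_insert() as placeholder helpers (they are not real functions) and an assumed krwlock_t named lock:

	krw_t rw = RW_READER;
retry:
	rw_enter(&lock, rw);
	if (!lookup_hit()) {
		/*
		 * Nothing may slip in between the failed lookup and the
		 * insert, so upgrade in place if possible; otherwise drop
		 * the lock, retake it as writer, and redo the lookup,
		 * since the entry may have appeared in the meantime.
		 */
		if (rw == RW_READER && !rw_tryupgrade(&lock)) {
			rw_exit(&lock);
			rw = RW_WRITER;
			goto retry;
		}
		do_insert();
	}
	rw_exit(&lock);

One subtlety the real code relies on: after a successful rw_tryupgrade() the earlier lookup result is still valid, because the lock was never dropped.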
+ * + */ +static char * +zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) +{ + char *domain; + + if (idx == 0 || !zfsvfs->z_use_fuids) + return (NULL); + + if (!zfsvfs->z_fuid_loaded) + zfs_fuid_init(zfsvfs, NULL); + + rw_enter(&zfsvfs->z_fuid_lock, RW_READER); + + if (zfsvfs->z_fuid_obj) + domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx); + else + domain = nulldomain; + rw_exit(&zfsvfs->z_fuid_lock); + + ASSERT(domain); + return (domain); +} + +void +zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) +{ + *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_uid, + cr, ZFS_OWNER); + *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_gid, + cr, ZFS_GROUP); +} + +uid_t +zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, + cred_t *cr, zfs_fuid_type_t type) +{ + uint32_t index = FUID_INDEX(fuid); + char *domain; + uid_t id; + + if (index == 0) + return (fuid); + + domain = zfs_fuid_find_by_idx(zfsvfs, index); + ASSERT(domain != NULL); + +#ifdef TODO + if (type == ZFS_OWNER || type == ZFS_ACE_USER) { + (void) kidmap_getuidbysid(crgetzone(cr), domain, + FUID_RID(fuid), &id); + } else { + (void) kidmap_getgidbysid(crgetzone(cr), domain, + FUID_RID(fuid), &id); + } +#else + panic(__func__); +#endif + return (id); +} + +/* + * Add a FUID node to the list of FUIDs being created for this + * ACL. + * + * If the ACL has multiple domains, keep only one copy of each unique + * domain. + */ +static void +zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, + uint64_t idx, uint64_t id, zfs_fuid_type_t type) +{ + zfs_fuid_t *fuid; + zfs_fuid_domain_t *fuid_domain; + zfs_fuid_info_t *fuidp; + uint64_t fuididx; + boolean_t found = B_FALSE; + + if (*fuidpp == NULL) + *fuidpp = zfs_fuid_info_alloc(); + + fuidp = *fuidpp; + /* + * First find the fuid domain index in the linked list. + * + * If one isn't found then create an entry. + */ + + for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains); + fuid_domain; fuid_domain = list_next(&fuidp->z_domains, + fuid_domain), fuididx++) { + if (idx == fuid_domain->z_domidx) { + found = B_TRUE; + break; + } + } + + if (!found) { + fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP); + fuid_domain->z_domain = domain; + fuid_domain->z_domidx = idx; + list_insert_tail(&fuidp->z_domains, fuid_domain); + fuidp->z_domain_str_sz += strlen(domain) + 1; + fuidp->z_domain_cnt++; + } + + if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { + /* + * Now allocate a fuid entry and add it to the end of the list. + */ + + fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); + fuid->z_id = id; + fuid->z_domidx = idx; + fuid->z_logfuid = FUID_ENCODE(fuididx, rid); + + list_insert_tail(&fuidp->z_fuids, fuid); + fuidp->z_fuid_cnt++; + } else { + if (type == ZFS_OWNER) + fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid); + else + fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid); + } +} + +/* + * Create a file system FUID based on information in the user's cred. + */ +uint64_t +zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, + dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp) +{ + uint64_t idx; + ksid_t *ksid; + uint32_t rid; + char *kdomain; + const char *domain; + uid_t id; + + VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); + + if (type == ZFS_OWNER) + id = crgetuid(cr); + else + id = crgetgid(cr); + + if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id)) + return ((uint64_t)id); + +#ifdef TODO + ksid = crgetsid(cr, (type == ZFS_OWNER) ?
KSID_OWNER : KSID_GROUP); + + VERIFY(ksid != NULL); + rid = ksid_getrid(ksid); + domain = ksid_getdomain(ksid); + + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + + zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); + + return (FUID_ENCODE(idx, rid)); +#else + panic(__func__); +#endif +} + +/* + * Create a file system FUID for an ACL ace + * or a chown/chgrp of the file. + * This is similar to zfs_fuid_create_cred, except that + * we can't find the domain + rid information in the + * cred. Instead we have to query Winchester for the + * domain and rid. + * + * During replay operations the domain+rid information is + * found in the zfs_fuid_info_t that the replay code has + * attached to the zfsvfs of the file system. + */ +uint64_t +zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, + zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp) +{ + const char *domain; + char *kdomain; + uint32_t fuid_idx = FUID_INDEX(id); + uint32_t rid; + idmap_stat status; + uint64_t idx; + boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL); + zfs_fuid_t *zfuid = NULL; + zfs_fuid_info_t *fuidp; + + /* + * If POSIX ID, or entry is already a FUID then + * just return the id + * + * We may also be handed an already FUID'ized id via + * chmod. + */ + + if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) + return (id); + + if (is_replay) { + fuidp = zfsvfs->z_fuid_replay; + + /* + * If we are passed an ephemeral id, but no + * fuid_info was logged then return NOBODY. + * This is most likely a result of idmap service + * not being available. + */ + if (fuidp == NULL) + return (UID_NOBODY); + + switch (type) { + case ZFS_ACE_USER: + case ZFS_ACE_GROUP: + zfuid = list_head(&fuidp->z_fuids); + rid = FUID_RID(zfuid->z_logfuid); + idx = FUID_INDEX(zfuid->z_logfuid); + break; + case ZFS_OWNER: + rid = FUID_RID(fuidp->z_fuid_owner); + idx = FUID_INDEX(fuidp->z_fuid_owner); + break; + case ZFS_GROUP: + rid = FUID_RID(fuidp->z_fuid_group); + idx = FUID_INDEX(fuidp->z_fuid_group); + break; + }; + domain = fuidp->z_domain_table[idx -1]; + } else { +#ifdef TODO + if (type == ZFS_OWNER || type == ZFS_ACE_USER) + status = kidmap_getsidbyuid(crgetzone(cr), id, + &domain, &rid); + else + status = kidmap_getsidbygid(crgetzone(cr), id, + &domain, &rid); + + if (status != 0) { + /* + * When returning nobody we will need to + * make a dummy fuid table entry for logging + * purposes. 
+ */ + rid = UID_NOBODY; + domain = nulldomain; + } +#else + panic(__func__); +#endif + } + + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + + if (!is_replay) + zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type); + else if (zfuid != NULL) { + list_remove(&fuidp->z_fuids, zfuid); + kmem_free(zfuid, sizeof (zfs_fuid_t)); + } + return (FUID_ENCODE(idx, rid)); +} + +void +zfs_fuid_destroy(zfsvfs_t *zfsvfs) +{ + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + if (!zfsvfs->z_fuid_loaded) { + rw_exit(&zfsvfs->z_fuid_lock); + return; + } + zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * Allocate zfs_fuid_info for tracking FUIDs created during + * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR() + */ +zfs_fuid_info_t * +zfs_fuid_info_alloc(void) +{ + zfs_fuid_info_t *fuidp; + + fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP); + list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t), + offsetof(zfs_fuid_domain_t, z_next)); + list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t), + offsetof(zfs_fuid_t, z_next)); + return (fuidp); +} + +/* + * Release all memory associated with zfs_fuid_info_t + */ +void +zfs_fuid_info_free(zfs_fuid_info_t *fuidp) +{ + zfs_fuid_t *zfuid; + zfs_fuid_domain_t *zdomain; + + while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { + list_remove(&fuidp->z_fuids, zfuid); + kmem_free(zfuid, sizeof (zfs_fuid_t)); + } + + if (fuidp->z_domain_table != NULL) + kmem_free(fuidp->z_domain_table, + (sizeof (char **)) * fuidp->z_domain_cnt); + + while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { + list_remove(&fuidp->z_domains, zdomain); + kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); + } + + kmem_free(fuidp, sizeof (zfs_fuid_info_t)); +} + +/* + * Check to see if id is a groupmember. If cred + * has ksid info then sidlist is checked first + * and if still not found then POSIX groups are checked + * + * Will use a straight FUID compare when possible. + */ +boolean_t +zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) +{ + ksid_t *ksid = crgetsid(cr, KSID_GROUP); + uid_t gid; + +#ifdef TODO + if (ksid) { + int i; + ksid_t *ksid_groups; + ksidlist_t *ksidlist = crgetsidlist(cr); + uint32_t idx = FUID_INDEX(id); + uint32_t rid = FUID_RID(id); + + ASSERT(ksidlist); + ksid_groups = ksidlist->ksl_sids; + + for (i = 0; i != ksidlist->ksl_nsid; i++) { + if (idx == 0) { + if (id != IDMAP_WK_CREATOR_GROUP_GID && + id == ksid_groups[i].ks_id) { + return (B_TRUE); + } + } else { + char *domain; + + domain = zfs_fuid_find_by_idx(zfsvfs, idx); + ASSERT(domain != NULL); + + if (strcmp(domain, + IDMAP_WK_CREATOR_SID_AUTHORITY) == 0) + return (B_FALSE); + + if ((strcmp(domain, + ksid_groups[i].ks_domain->kd_name) == 0) && + rid == ksid_groups[i].ks_rid) + return (B_TRUE); + } + } + } +#endif + + /* + * Not found in ksidlist, check posix groups + */ + gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); + return (groupmember(gid, cr)); +} +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 8699922ccf09..a6829eb1f122 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/systm.h> @@ -43,6 +41,7 @@ #include <sys/cmn_err.h> #include <sys/stat.h> #include <sys/zfs_ioctl.h> +#include <sys/zfs_znode.h> #include <sys/zap.h> #include <sys/spa.h> #include <sys/spa_impl.h> @@ -52,6 +51,8 @@ #include <sys/dsl_dir.h> #include <sys/dsl_dataset.h> #include <sys/dsl_prop.h> +#include <sys/dsl_deleg.h> +#include <sys/dmu_objset.h> #include <sys/sunddi.h> #include <sys/policy.h> #include <sys/zone.h> @@ -62,10 +63,13 @@ #include <sys/varargs.h> #include <sys/fs/zfs.h> #include <sys/zfs_ctldir.h> +#include <sys/zfs_dir.h> #include <sys/zvol.h> +#include <sys/dmu_objset.h> #include "zfs_namecheck.h" #include "zfs_prop.h" +#include "zfs_deleg.h" CTASSERT(sizeof(zfs_cmd_t) <= PAGE_SIZE); @@ -75,18 +79,24 @@ extern void zfs_init(void); extern void zfs_fini(void); typedef int zfs_ioc_func_t(zfs_cmd_t *); -typedef int zfs_secpolicy_func_t(const char *, cred_t *); +typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); typedef struct zfs_ioc_vec { zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; enum { - no_name, - pool_name, - dataset_name - } zvec_namecheck; + NO_NAME, + POOL_NAME, + DATASET_NAME + } zvec_namecheck; + boolean_t zvec_his_log; } zfs_ioc_vec_t; +static void clear_props(char *dataset, nvlist_t *props); +static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, + boolean_t *); +int zfs_set_prop_nvlist(const char *, nvlist_t *); + /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ void __dprintf(const char *file, const char *func, int line, const char *fmt, ...) @@ -123,13 +133,122 @@ __dprintf(const char *file, const char *func, int line, const char *fmt, ...) char *, newfile, char *, func, int, line, char *, buf); } +static void +history_str_free(char *buf) +{ + kmem_free(buf, HIS_MAX_RECORD_LEN); +} + +static char * +history_str_get(zfs_cmd_t *zc) +{ + char *buf; + + if (zc->zc_history == 0) + return (NULL); + + buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); + if (copyinstr((void *)(uintptr_t)zc->zc_history, + buf, HIS_MAX_RECORD_LEN, NULL) != 0) { + history_str_free(buf); + return (NULL); + } + + buf[HIS_MAX_RECORD_LEN -1] = '\0'; + + return (buf); +} + +/* + * Check to see if the named dataset is currently defined as bootable + */ +static boolean_t +zfs_is_bootfs(const char *name) +{ + spa_t *spa; + boolean_t ret = B_FALSE; + + if (spa_open(name, &spa, FTAG) == 0) { + if (spa->spa_bootfs) { + objset_t *os; + + if (dmu_objset_open(name, DMU_OST_ZFS, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + ret = (dmu_objset_id(os) == spa->spa_bootfs); + dmu_objset_close(os); + } + } + spa_close(spa, FTAG); + } + return (ret); +} + +/* + * zfs_earlier_version + * + * Return non-zero if the spa version is less than requested version. + */ +static int +zfs_earlier_version(const char *name, int version) +{ + spa_t *spa; + + if (spa_open(name, &spa, FTAG) == 0) { + if (spa_version(spa) < version) { + spa_close(spa, FTAG); + return (1); + } + spa_close(spa, FTAG); + } + return (0); +} + +/* + * zpl_earlier_version + * + * Return TRUE if the ZPL version is less than requested version. 
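These two helpers are what the property-validation code further down leans on to keep new features off old on-disk formats. Typical guards, mirroring the checks in zfs_set_prop_nvlist() later in this file (dsname stands in for the dataset being validated):

/* gzip compression needs a pool at the gzip support revision */
if (zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION))
	return (ENOTSUP);

/* SMB sharing needs a FUID-aware ZPL */
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
	return (ENOTSUP);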
+ */ +static boolean_t +zpl_earlier_version(const char *name, int version) +{ + objset_t *os; + boolean_t rc = B_TRUE; + + if (dmu_objset_open(name, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + uint64_t zplversion; + + if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) + rc = zplversion < version; + dmu_objset_close(os); + } + return (rc); +} + +static void +zfs_log_history(zfs_cmd_t *zc) +{ + spa_t *spa; + char *buf; + + if ((buf = history_str_get(zc)) == NULL) + return; + + if (spa_open(zc->zc_name, &spa, FTAG) == 0) { + if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) + (void) spa_history_log(spa, buf, LOG_CMD_NORMAL); + spa_close(spa, FTAG); + } + history_str_free(buf); +} + /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ /* ARGSUSED */ static int -zfs_secpolicy_none(const char *unused1, cred_t *cr) +zfs_secpolicy_none(zfs_cmd_t *zc, cred_t *cr) { return (0); } @@ -140,10 +259,10 @@ zfs_secpolicy_none(const char *unused1, cred_t *cr) */ /* ARGSUSED */ static int -zfs_secpolicy_read(const char *dataset, cred_t *cr) +zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr) { - if (INGLOBALZONE(curproc) || - zone_dataset_visible(dataset, NULL)) + if (INGLOBALZONE(curthread) || + zone_dataset_visible(zc->zc_name, NULL)) return (0); return (ENOENT); @@ -159,14 +278,14 @@ zfs_dozonecheck(const char *dataset, cred_t *cr) * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ - if (!INGLOBALZONE(curproc) && + if (!INGLOBALZONE(curthread) && !zone_dataset_visible(dataset, &writable)) return (ENOENT); if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL)) return (ENOENT); - if (INGLOBALZONE(curproc)) { + if (INGLOBALZONE(curthread)) { /* * If the fs is zoned, only root can access it from the * global zone. @@ -187,47 +306,324 @@ zfs_dozonecheck(const char *dataset, cred_t *cr) return (0); } -/* - * Policy for dataset write operations (create children, set properties, etc). - * Requires SYS_MOUNT privilege, and must be writable in the local zone. - */ int -zfs_secpolicy_write(const char *dataset, cred_t *cr) +zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) +{ + int error; + + error = zfs_dozonecheck(name, cr); + if (error == 0) { + error = secpolicy_zfs(cr); + if (error) + error = dsl_deleg_access(name, perm, cr); + } + return (error); +} + +static int +zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr) +{ + /* + * Check permissions for special properties. + */ + switch (prop) { + case ZFS_PROP_ZONED: + /* + * Disallow setting of 'zoned' from within a local zone. + */ + if (!INGLOBALZONE(curthread)) + return (EPERM); + break; + + case ZFS_PROP_QUOTA: + if (!INGLOBALZONE(curthread)) { + uint64_t zoned; + char setpoint[MAXNAMELEN]; + /* + * Unprivileged users are allowed to modify the + * quota on things *under* (ie. contained by) + * the thing they own. 
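The containment test that follows is purely lexical: dsl_prop_get_integer() reports in setpoint the dataset on which the property was actually set, and a name can only be a proper descendant of that setpoint if it is strictly longer. A worked example with invented names:

/*
 * setpoint = "tank/jail"      (where the "zoned" property was set)
 *
 * name = "tank/jail/home"     strlen(name) >  strlen(setpoint): allowed
 * name = "tank/jail"          strlen(name) == strlen(setpoint): EPERM,
 *                             this is the delegated root itself
 */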
+ */ + if (dsl_prop_get_integer(name, "zoned", &zoned, + setpoint)) + return (EPERM); + if (!zoned || strlen(name) <= strlen(setpoint)) + return (EPERM); + } + break; + } + + return (zfs_secpolicy_write_perms(name, zfs_prop_to_name(prop), cr)); +} + +int +zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) { int error; - if (error = zfs_dozonecheck(dataset, cr)) + error = zfs_dozonecheck(zc->zc_name, cr); + if (error) return (error); - return (secpolicy_zfs(cr)); + /* + * permission to set permissions will be evaluated later in + * dsl_deleg_can_allow() + */ + return (0); +} + +int +zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_ROLLBACK, cr); + if (error == 0) + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr); + return (error); +} + +int +zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND, cr)); +} + +int +zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) +{ + if (!INGLOBALZONE(curthread)) + return (EPERM); + + if (secpolicy_nfs(cr) == 0) { + return (0); + } else { + vnode_t *vp; + int error; + + if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, + NO_FOLLOW, NULL, &vp)) != 0) + return (error); + + /* Now make sure mntpnt and dataset are ZFS */ + + if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || + (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), + zc->zc_name) != 0)) { + VN_RELE(vp); + return (EPERM); + } + + VN_RELE(vp); + return (dsl_deleg_access(zc->zc_name, + ZFS_DELEG_PERM_SHARE, cr)); + } } -/* - * Policy for operations that want to write a dataset's parent: - * create, destroy, snapshot, clone, restore. - */ static int -zfs_secpolicy_parent(const char *dataset, cred_t *cr) +zfs_get_parent(const char *datasetname, char *parent, int parentsize) { - char parentname[MAXNAMELEN]; char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. 
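A few concrete results for the helper being introduced here (dataset names invented):

char parent[MAXNAMELEN];

(void) zfs_get_parent("tank/home/fred@monday", parent, sizeof (parent));
/* parent: "tank/home/fred" -- snapshot suffix cut at '@' */

(void) zfs_get_parent("tank/home/fred", parent, sizeof (parent));
/* parent: "tank/home" -- last component cut at '/' */

/* "tank" contains neither '@' nor '/', so the call returns ENOENT. */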
*/ - (void) strncpy(parentname, dataset, sizeof (parentname)); - cp = strrchr(parentname, '@'); + (void) strncpy(parent, datasetname, parentsize); + cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { - cp = strrchr(parentname, '/'); + cp = strrchr(parent, '/'); if (cp == NULL) return (ENOENT); cp[0] = '\0'; + } + + return (0); +} + +int +zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); +} + +static int +zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); +} + +/* + * Must have sys_config privilege to check the iscsi permission + */ +/* ARGSUSED */ +static int +zfs_secpolicy_iscsi(zfs_cmd_t *zc, cred_t *cr) +{ + return (secpolicy_zfs(cr)); +} + +int +zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) +{ + char parentname[MAXNAMELEN]; + int error; + + if ((error = zfs_secpolicy_write_perms(from, + ZFS_DELEG_PERM_RENAME, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(from, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + if ((error = zfs_get_parent(to, parentname, + sizeof (parentname))) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_CREATE, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (error); +} + +static int +zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); +} +static int +zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) +{ + char parentname[MAXNAMELEN]; + objset_t *clone; + int error; + + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_PROMOTE, cr); + if (error) + return (error); + + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &clone); + + if (error == 0) { + dsl_dataset_t *pclone = NULL; + dsl_dir_t *dd; + dd = clone->os->os_dsl_dataset->ds_dir; + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dd->dd_pool, + dd->dd_phys->dd_origin_obj, FTAG, &pclone); + rw_exit(&dd->dd_pool->dp_config_rwlock); + if (error) { + dmu_objset_close(clone); + return (error); + } + + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr); + + dsl_dataset_name(pclone, parentname); + dmu_objset_close(clone); + dsl_dataset_rele(pclone, FTAG); + if (error == 0) + error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_PROMOTE, cr); } + return (error); +} + +static int +zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_RECEIVE, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); - return (zfs_secpolicy_write(parentname, cr)); + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_CREATE, cr)); +} + +int +zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_SNAPSHOT, cr)) != 0) + return (error); + + error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_MOUNT, cr); + + return (error); +} + +static int +zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) +{ + + return 
(zfs_secpolicy_snapshot_perms(zc->zc_name, cr)); +} + +static int +zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) +{ + char parentname[MAXNAMELEN]; + int error; + + if ((error = zfs_get_parent(zc->zc_name, parentname, + sizeof (parentname))) != 0) + return (error); + + if (zc->zc_value[0] != '\0') { + if ((error = zfs_secpolicy_write_perms(zc->zc_value, + ZFS_DELEG_PERM_CLONE, cr)) != 0) + return (error); + } + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_CREATE, cr)) != 0) + return (error); + + error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_MOUNT, cr); + + return (error); +} + +static int +zfs_secpolicy_umount(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + + error = secpolicy_fs_unmount(cr, NULL); + if (error) { + error = dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr); + } + return (error); } /* @@ -236,7 +632,7 @@ zfs_secpolicy_parent(const char *dataset, cred_t *cr) */ /* ARGSUSED */ static int -zfs_secpolicy_config(const char *unused, cred_t *cr) +zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) { if (secpolicy_sys_config(cr, B_FALSE) != 0) return (EPERM); @@ -245,15 +641,48 @@ zfs_secpolicy_config(const char *unused, cred_t *cr) } /* + * Just like zfs_secpolicy_config, except that we will check for + * mount permission on the dataset for permission to create/remove + * the minor nodes. + */ +static int +zfs_secpolicy_minor(zfs_cmd_t *zc, cred_t *cr) +{ + if (secpolicy_sys_config(cr, B_FALSE) != 0) { + return (dsl_deleg_access(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr)); + } + + return (0); +} + +/* * Policy for fault injection. Requires all privileges. */ /* ARGSUSED */ static int -zfs_secpolicy_inject(const char *unused, cred_t *cr) +zfs_secpolicy_inject(zfs_cmd_t *zc, cred_t *cr) { return (secpolicy_zinject(cr)); } +static int +zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) +{ + zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); + + if (prop == ZPROP_INVAL) { + if (!zfs_prop_user(zc->zc_value)) + return (EINVAL); + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_USERPROP, cr)); + } else { + if (!zfs_prop_inheritable(prop)) + return (EINVAL); + return (zfs_secpolicy_setprop(zc->zc_name, prop, cr)); + } +} + /* * Policy for dataset backup operations (sendbackup). * Requires SYS_MOUNT privilege, and must be writable in the local zone. @@ -263,7 +692,7 @@ zfs_secpolicy_operator(const char *dataset, cred_t *cr) { int writable = 1; - if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) + if (!INGLOBALZONE(curthread) && !zone_dataset_visible(dataset, &writable)) return (ENOENT); if (secpolicy_zfs(cr) != 0 && !groupmember(GID_OPERATOR, cr)) return (EPERM); @@ -274,35 +703,33 @@ zfs_secpolicy_operator(const char *dataset, cred_t *cr) * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int -get_nvlist(zfs_cmd_t *zc, nvlist_t **nvp) +get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) { char *packed; - size_t size; int error; - nvlist_t *config = NULL; + nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. 
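get_nvlist() is the kernel half of a contract whose userland half packs the list and passes an address/size pair through zfs_cmd_t. A minimal caller-side sketch, with error handling elided and zfs_fd assumed to be an open /dev/zfs descriptor (not lifted from libzfs):

nvlist_t *nvl;
char *packed = NULL;
size_t size = 0;
zfs_cmd_t zc = { 0 };

(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
(void) nvlist_add_uint64(nvl, "quota", 10ULL << 30);	/* 10 GiB */
(void) nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, 0);

(void) strlcpy(zc.zc_name, "tank/home", sizeof (zc.zc_name));
zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
zc.zc_nvlist_src_size = size;
(void) ioctl(zfs_fd, ZFS_IOC_SET_PROP, &zc);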
*/ - if ((size = zc->zc_nvlist_src_size) == 0) + if (size == 0) return (EINVAL); packed = kmem_alloc(size, KM_SLEEP); - if ((error = xcopyin((void *)(uintptr_t)zc->zc_nvlist_src, packed, - size)) != 0) { + if ((error = xcopyin((void *)(uintptr_t)nvl, packed, size)) != 0) { kmem_free(packed, size); return (error); } - if ((error = nvlist_unpack(packed, size, &config, 0)) != 0) { + if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { kmem_free(packed, size); return (error); } kmem_free(packed, size); - *nvp = config; + *nvp = list; return (0); } @@ -326,6 +753,7 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) */ error = 0; } else { + packed = kmem_alloc(size, KM_SLEEP); VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, KM_SLEEP) == 0); error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, @@ -341,15 +769,67 @@ static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; - nvlist_t *config; + nvlist_t *config, *props = NULL; + nvlist_t *rootprops = NULL; + nvlist_t *zplprops = NULL; + char *buf; + + if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config)) + return (error); - if ((error = get_nvlist(zc, &config)) != 0) + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + nvlist_free(config); return (error); + } + + if (props) { + nvlist_t *nvl = NULL; + uint64_t version = SPA_VERSION; + + (void) nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); + if (version < SPA_VERSION_INITIAL || version > SPA_VERSION) { + error = EINVAL; + goto pool_props_bad; + } + (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); + if (nvl) { + error = nvlist_dup(nvl, &rootprops, KM_SLEEP); + if (error != 0) { + nvlist_free(config); + nvlist_free(props); + return (error); + } + (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); + } + VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + error = zfs_fill_zplprops_root(version, rootprops, + zplprops, NULL); + if (error) + goto pool_props_bad; + } + + buf = history_str_get(zc); - error = spa_create(zc->zc_name, config, zc->zc_value[0] == '\0' ? - NULL : zc->zc_value); + error = spa_create(zc->zc_name, config, props, buf, zplprops); + /* + * Set the remaining root properties + */ + if (!error && + (error = zfs_set_prop_nvlist(zc->zc_name, rootprops)) != 0) + (void) spa_destroy(zc->zc_name); + + if (buf != NULL) + history_str_free(buf); + +pool_props_bad: + nvlist_free(rootprops); + nvlist_free(zplprops); nvlist_free(config); + nvlist_free(props); return (error); } @@ -357,35 +837,55 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { - return (spa_destroy(zc->zc_name)); + int error; + zfs_log_history(zc); + error = spa_destroy(zc->zc_name); + return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { int error; - nvlist_t *config; + nvlist_t *config, *props = NULL; uint64_t guid; - if ((error = get_nvlist(zc, &config)) != 0) + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config)) != 0) return (error); + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + nvlist_free(config); + return (error); + } + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = EINVAL; + else if (zc->zc_cookie) + error = spa_import_faulted(zc->zc_name, config, + props); else - error = spa_import(zc->zc_name, config, - zc->zc_value[0] == '\0' ? 
NULL : zc->zc_value); + error = spa_import(zc->zc_name, config, props); nvlist_free(config); + if (props) + nvlist_free(props); + return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { - return (spa_export(zc->zc_name, NULL)); + int error; + boolean_t force = (boolean_t)zc->zc_cookie; + + zfs_log_history(zc); + error = spa_export(zc->zc_name, NULL, force); + return (error); } static int @@ -441,7 +941,8 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) nvlist_t *tryconfig, *config; int error; - if ((error = get_nvlist(zc, &tryconfig)) != 0) + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); @@ -466,7 +967,7 @@ zfs_ioc_pool_scrub(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = spa_scrub(spa, zc->zc_cookie, B_FALSE); + error = spa_scrub(spa, zc->zc_cookie); spa_close(spa, FTAG); @@ -496,8 +997,12 @@ zfs_ioc_pool_upgrade(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - spa_upgrade(spa); + if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) { + spa_close(spa, FTAG); + return (EINVAL); + } + spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); @@ -517,7 +1022,7 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) { + if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (ENOTSUP); } @@ -525,7 +1030,8 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) hist_buf = kmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { - error = xcopyout(hist_buf, (char *)(uintptr_t)zc->zc_history, + error = xcopyout(hist_buf, + (char *)(uintptr_t)zc->zc_history, zc->zc_history_len); } @@ -535,45 +1041,6 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) } static int -zfs_ioc_pool_log_history(zfs_cmd_t *zc) -{ - spa_t *spa; - char *history_str = NULL; - size_t size; - int error; - - size = zc->zc_history_len; - if (size == 0 || size > HIS_MAX_RECORD_LEN) - return (EINVAL); - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) { - spa_close(spa, FTAG); - return (ENOTSUP); - } - - /* add one for the NULL delimiter */ - size++; - history_str = kmem_alloc(size, KM_SLEEP); - if ((error = xcopyin((void *)(uintptr_t)zc->zc_history, history_str, - size)) != 0) { - spa_close(spa, FTAG); - kmem_free(history_str, size); - return (error); - } - history_str[size - 1] = '\0'; - - error = spa_history_log(spa, history_str, zc->zc_history_offset); - - spa_close(spa, FTAG); - kmem_free(history_str, size); - - return (error); -} - -static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { int error; @@ -591,9 +1058,8 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc) int error; if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS, - DS_MODE_NONE | DS_MODE_READONLY, &osp)) != 0) + DS_MODE_USER | DS_MODE_READONLY, &osp)) != 0) return (error); - error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_close(osp); @@ -606,26 +1072,40 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; - nvlist_t *config; + nvlist_t *config, **l2cache, **spares; + uint_t nl2cache = 0, nspares = 0; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); + error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config); + (void) 
nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache); + + (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, + &spares, &nspares); + + /* * A root pool with concatenated devices is not supported. - * Thus, can not add a device to a root pool with one device. + * Thus, a device can not be added to a root pool. + * + * An intent log device can not be added to a root pool, because + * during mountroot the ZIL is replayed, and a separate log device + * can not be accessed at that time. + * + * l2cache and spare devices are OK to be added to a root pool. */ - if (spa->spa_root_vdev->vdev_children == 1 && spa->spa_bootfs != 0) { + if (spa->spa_bootfs != 0 && nl2cache == 0 && nspares == 0) { spa_close(spa, FTAG); return (EDOM); } - if ((error = get_nvlist(zc, &config)) == 0) { + if (error == 0) { error = spa_vdev_add(spa, config); nvlist_free(config); } - spa_close(spa, FTAG); return (error); } @@ -645,28 +1125,35 @@ zfs_ioc_vdev_remove(zfs_cmd_t *zc) } static int -zfs_ioc_vdev_online(zfs_cmd_t *zc) +zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; + vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = vdev_online(spa, zc->zc_guid); - spa_close(spa, FTAG); - return (error); -} + switch (zc->zc_cookie) { + case VDEV_STATE_ONLINE: + error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); + break; -static int -zfs_ioc_vdev_offline(zfs_cmd_t *zc) -{ - spa_t *spa; - int istmp = zc->zc_cookie; - int error; + case VDEV_STATE_OFFLINE: + error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); + break; - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - error = vdev_offline(spa, zc->zc_guid, istmp); + case VDEV_STATE_FAULTED: + error = vdev_fault(spa, zc->zc_guid); + break; + + case VDEV_STATE_DEGRADED: + error = vdev_degrade(spa, zc->zc_guid); + break; + + default: + error = EINVAL; + } + zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } @@ -682,7 +1169,8 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - if ((error = get_nvlist(zc, &config)) == 0) { + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); nvlist_free(config); } @@ -723,6 +1211,16 @@ zfs_ioc_vdev_setpath(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { @@ -730,44 +1228,29 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc) int error; nvlist_t *nv; -retry: - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &os); - if (error != 0) { - /* - * This is ugly: dmu_objset_open() can return EBUSY if - * the objset is held exclusively. Fortunately this hold is - * only for a short while, so we retry here. - * This avoids user code having to handle EBUSY, - * for example for a "zfs list".
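The consolidated zfs_ioc_vdev_set_state() entry point above takes the requested state in zc_cookie and state-specific flags in zc_obj, and hands the state actually reached back in zc_cookie. A sketch of a userland caller; zfs_fd and vdev_guid are assumed to be in hand, and ZFS_IOC_VDEV_SET_STATE is the ioctl this handler is assumed to be wired to:

zfs_cmd_t zc = { 0 };

(void) strlcpy(zc.zc_name, "tank", sizeof (zc.zc_name));
zc.zc_guid = vdev_guid;			/* leaf vdev to act on */
zc.zc_cookie = VDEV_STATE_OFFLINE;	/* requested state */
zc.zc_obj = 0;				/* flags, e.g. temporary offline */

if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
	printf("vdev now in state %ju\n", (uintmax_t)zc.zc_cookie);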
- */ - if (error == EBUSY) { - delay(1); - goto retry; - } + if (error = dmu_objset_open(zc->zc_name, + DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) return (error); - } dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (zc->zc_nvlist_dst != 0 && - (error = dsl_prop_get_all(os, &nv)) == 0) { + (error = dsl_prop_get_all(os, &nv, FALSE)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a - * DS_MODE_STANDARD open, because it could be + * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... */ - if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) - VERIFY(zvol_get_stats(os, nv) == 0); + if (!zc->zc_objset_stats.dds_inconsistent) { + if (dmu_objset_type(os) == DMU_OST_ZVOL) + VERIFY(zvol_get_stats(os, nv) == 0); + } error = put_nvlist(zc, nv); nvlist_free(nv); } - spa_altroot(dmu_objset_spa(os), zc->zc_value, sizeof (zc->zc_value)); - dmu_objset_close(os); if (error == ENOMEM) error = 0; @@ -775,27 +1258,87 @@ retry: } static int +nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) +{ + uint64_t value; + int error; + + /* + * zfs_get_zplprop() will either find a value or give us + * the default value (if there is one). + */ + if ((error = zfs_get_zplprop(os, prop, &value)) != 0) + return (error); + VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); + return (0); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for zpl property nvlist + * + * outputs: + * zc_nvlist_dst zpl property nvlist + * zc_nvlist_dst_size size of zpl property nvlist + */ +static int +zfs_ioc_objset_zplprops(zfs_cmd_t *zc) +{ + objset_t *os; + int err; + + if (err = dmu_objset_open(zc->zc_name, + DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) + return (err); + + dmu_objset_fast_stat(os, &zc->zc_objset_stats); + + /* + * NB: nvl_add_zplprop() will read the objset contents, + * which we aren't supposed to do with a DS_MODE_USER + * hold, because it could be inconsistent. + */ + if (zc->zc_nvlist_dst != 0 && + !zc->zc_objset_stats.dds_inconsistent && + dmu_objset_type(os) == DMU_OST_ZFS) { + nvlist_t *nv; + + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) + err = put_nvlist(zc, nv); + nvlist_free(nv); + } else { + err = ENOENT; + } + dmu_objset_close(os); + return (err); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_name name of next filesystem + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ +static int zfs_ioc_dataset_list_next(zfs_cmd_t *zc) { objset_t *os; int error; char *p; -retry: - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &os); - if (error != 0) { - /* - * This is ugly: dmu_objset_open() can return EBUSY if - * the objset is held exclusively. Fortunately this hold is - * only for a short while, so we retry here. - * This avoids user code having to handle EBUSY, - * for example for a "zfs list". 
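The ENOMEM-to-0 squash in zfs_ioc_objset_stats() above works because put_nvlist() records the required length in zc_nvlist_dst_size even when the copyout is skipped, so a consumer can size its buffer iteratively. A sketch of that convention (not lifted from libzfs; error handling abbreviated):

size_t alloc = 1024;
char *buf;
zfs_cmd_t zc = { 0 };

(void) strlcpy(zc.zc_name, "tank/home", sizeof (zc.zc_name));
for (;;) {
	buf = malloc(alloc);
	zc.zc_nvlist_dst = (uint64_t)(uintptr_t)buf;
	zc.zc_nvlist_dst_size = alloc;
	if (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) {
		free(buf);
		break;				/* hard failure */
	}
	if (zc.zc_nvlist_dst_size <= alloc)
		break;				/* nvlist fit: unpack buf */
	alloc = zc.zc_nvlist_dst_size;		/* grow and retry */
	free(buf);
}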
- */ - if (error == EBUSY) { - delay(1); - goto retry; - } + if (error = dmu_objset_open(zc->zc_name, + DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) { if (error == ENOENT) error = ESRCH; return (error); @@ -812,8 +1355,9 @@ retry: NULL, &zc->zc_cookie); if (error == ENOENT) error = ESRCH; - } while (error == 0 && !INGLOBALZONE(curproc) && + } while (error == 0 && !INGLOBALZONE(curthread) && !zone_dataset_visible(zc->zc_name, NULL)); + dmu_objset_close(os); /* * If it's a hidden dataset (ie. with a '$' in its name), don't @@ -822,35 +1366,31 @@ retry: if (error == 0 && strchr(zc->zc_name, '$') == NULL) error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - dmu_objset_close(os); return (error); } +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_name name of next snapshot + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; int error; -retry: - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &os); - if (error != 0) { - /* - * This is ugly: dmu_objset_open() can return EBUSY if - * the objset is held exclusively. Fortunately this hold is - * only for a short while, so we retry here. - * This avoids user code having to handle EBUSY, - * for example for a "zfs list". - */ - if (error == EBUSY) { - delay(1); - goto retry; - } - if (error == ENOENT) - error = ESRCH; - return (error); - } + error = dmu_objset_open(zc->zc_name, + DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os); + if (error) + return (error == ENOENT ? ESRCH : error); /* * A dataset name of maximum length cannot have any snapshots, @@ -863,36 +1403,36 @@ retry: error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), - zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie); - if (error == ENOENT) - error = ESRCH; - + zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL); + dmu_objset_close(os); if (error == 0) error = zfs_ioc_objset_stats(zc); /* fill in the stats */ + else if (error == ENOENT) + error = ESRCH; - dmu_objset_close(os); + /* if we failed, undo the @ that we tacked on to zc_name */ + if (error) + *strchr(zc->zc_name, '@') = '\0'; return (error); } -static int -zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl) +int +zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) { nvpair_t *elem; int error; - const char *propname; - zfs_prop_t prop; uint64_t intval; char *strval; - char buf[MAXNAMELEN]; - const char *p; - spa_t *spa; + /* + * First validate permission to set all of the properties + */ elem = NULL; while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - propname = nvpair_name(elem); + const char *propname = nvpair_name(elem); + zfs_prop_t prop = zfs_name_to_prop(propname); - if ((prop = zfs_name_to_prop(propname)) == - ZFS_PROP_INVAL) { + if (prop == ZPROP_INVAL) { /* * If this is a user-defined property, it must be a * string, and there is no further validation to do. 
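zfs_set_prop_nvlist() walks the list twice on purpose: nothing is applied until every pair has passed its permission and version checks, so a permission failure cannot leave the dataset half-modified (pass two can still fail for other reasons, and callers handle that). The skeleton of the idiom, with check_one() and apply_one() as stand-ins for the per-property logic:

nvpair_t *elem;
int error;

/* pass 1: validate everything, with no side effects */
for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL;
    elem = nvlist_next_nvpair(nvl, elem))
	if ((error = check_one(elem)) != 0)
		return (error);

/* pass 2: apply, now that the whole batch is known to be allowed */
for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL;
    elem = nvlist_next_nvpair(nvl, elem))
	if ((error = apply_one(elem)) != 0)
		return (error);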
@@ -901,51 +1441,19 @@ zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl) nvpair_type(elem) != DATA_TYPE_STRING) return (EINVAL); - VERIFY(nvpair_value_string(elem, &strval) == 0); - error = dsl_prop_set(name, propname, 1, - strlen(strval) + 1, strval); - if (error == 0) - continue; - else + if (error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_USERPROP, CRED())) return (error); + continue; } + if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0) + return (error); + /* - * Check permissions for special properties. + * Check that this value is valid for this pool version */ switch (prop) { - case ZFS_PROP_ZONED: - /* - * Disallow setting of 'zoned' from within a local zone. - */ - if (!INGLOBALZONE(curproc)) - return (EPERM); - break; - - case ZFS_PROP_QUOTA: - if (error = zfs_dozonecheck(name, cr)) - return (error); - - if (!INGLOBALZONE(curproc)) { - uint64_t zoned; - char setpoint[MAXNAMELEN]; - int dslen; - /* - * Unprivileged users are allowed to modify the - * quota on things *under* (ie. contained by) - * the thing they own. - */ - if (dsl_prop_get_integer(name, "jailed", &zoned, - setpoint)) - return (EPERM); - if (!zoned) /* this shouldn't happen */ - return (EPERM); - dslen = strlen(name); - if (dslen <= strlen(setpoint)) - return (EPERM); - } - break; - case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure @@ -953,35 +1461,64 @@ zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl) * we'll catch them later. */ if (nvpair_type(elem) == DATA_TYPE_UINT64 && - nvpair_value_uint64(elem, &intval) == 0 && - intval >= ZIO_COMPRESS_GZIP_1 && - intval <= ZIO_COMPRESS_GZIP_9) { - if ((p = strchr(name, '/')) == NULL) { - p = name; - } else { - bcopy(name, buf, p - name); - buf[p - name] = '\0'; - p = buf; - } - - if (spa_open(p, &spa, FTAG) == 0) { - if (spa_version(spa) < - ZFS_VERSION_GZIP_COMPRESSION) { - spa_close(spa, FTAG); - return (ENOTSUP); - } + nvpair_value_uint64(elem, &intval) == 0) { + if (intval >= ZIO_COMPRESS_GZIP_1 && + intval <= ZIO_COMPRESS_GZIP_9 && + zfs_earlier_version(name, + SPA_VERSION_GZIP_COMPRESSION)) + return (ENOTSUP); - spa_close(spa, FTAG); - } + /* + * If this is a bootable dataset then + * verify that the compression algorithm + * is supported for booting. We must return + * something other than ENOTSUP since it + * implies a downrev pool version. 
+ */ + if (zfs_is_bootfs(name) && + !BOOTFS_COMPRESS_VALID(intval)) + return (ERANGE); } break; + + case ZFS_PROP_COPIES: + if (zfs_earlier_version(name, + SPA_VERSION_DITTO_BLOCKS)) + return (ENOTSUP); + break; + + case ZFS_PROP_SHARESMB: + if (zpl_earlier_version(name, ZPL_VERSION_FUID)) + return (ENOTSUP); + break; + } + } + + elem = NULL; + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + const char *propname = nvpair_name(elem); + zfs_prop_t prop = zfs_name_to_prop(propname); + + if (prop == ZPROP_INVAL) { + VERIFY(nvpair_value_string(elem, &strval) == 0); + error = dsl_prop_set(name, propname, 1, + strlen(strval) + 1, strval); + if (error == 0) + continue; + else + return (error); } switch (prop) { case ZFS_PROP_QUOTA: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dir_set_quota(name, - intval)) != 0) + (error = dsl_dir_set_quota(name, intval)) != 0) + return (error); + break; + + case ZFS_PROP_REFQUOTA: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = dsl_dataset_set_quota(name, intval)) != 0) return (error); break; @@ -992,24 +1529,36 @@ zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl) return (error); break; - case ZFS_PROP_VOLSIZE: + case ZFS_PROP_REFRESERVATION: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volsize(name, dev, + (error = dsl_dataset_set_reservation(name, intval)) != 0) return (error); break; + case ZFS_PROP_VOLSIZE: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = zvol_set_volsize(name, + ddi_driver_major(zfs_dip), intval)) != 0) + return (error); + break; + case ZFS_PROP_VOLBLOCKSIZE: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volblocksize(name, - intval)) != 0) + (error = zvol_set_volblocksize(name, intval)) != 0) + return (error); + break; + + case ZFS_PROP_VERSION: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = zfs_set_version(name, intval)) != 0) return (error); break; default: if (nvpair_type(elem) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != - prop_type_string) + PROP_TYPE_STRING) return (EINVAL); VERIFY(nvpair_value_string(elem, &strval) == 0); if ((error = dsl_prop_set(name, @@ -1022,22 +1571,18 @@ zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl) VERIFY(nvpair_value_uint64(elem, &intval) == 0); switch (zfs_prop_get_type(prop)) { - case prop_type_number: + case PROP_TYPE_NUMBER: break; - case prop_type_boolean: - if (intval > 1) - return (EINVAL); - break; - case prop_type_string: + case PROP_TYPE_STRING: return (EINVAL); - case prop_type_index: + case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) return (EINVAL); break; default: - cmn_err(CE_PANIC, "unknown property " - "type"); + cmn_err(CE_PANIC, + "unknown property type"); break; } @@ -1054,127 +1599,79 @@ zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl) return (0); } +/* + * inputs: + * zc_name name of filesystem + * zc_value name of property to inherit + * zc_nvlist_src{_size} nvlist of properties to apply + * zc_cookie clear existing local props? + * + * outputs: none + */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; int error; - zfs_prop_t prop; - /* - * If zc_value is set, then this is an attempt to inherit a value. - * Otherwise, zc_nvlist refers to a list of properties to set. 
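Properties with no dedicated setter in the switch above fall through to the generic dsl_prop_set() path. Spelled out for one numeric and one string property (the dataset name and values are invented; integers travel as a single 8-byte value, strings byte-wise including the NUL):

uint64_t intval = ZIO_COMPRESS_GZIP_1;
int error;

/* numeric property: intsz = 8, numints = 1 */
error = dsl_prop_set("tank/home",
    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &intval);

/* string property: intsz = 1, numints = strlen + 1 */
error = dsl_prop_set("tank/home",
    zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1,
    strlen("/export/home") + 1, "/export/home");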
- */ - if (zc->zc_value[0] != '\0') { - if (!zfs_prop_user(zc->zc_value) && - ((prop = zfs_name_to_prop(zc->zc_value)) == - ZFS_PROP_INVAL || - !zfs_prop_inheritable(prop))) - return (EINVAL); + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvl)) != 0) + return (error); + + if (zc->zc_cookie) { + nvlist_t *origprops; + objset_t *os; + + if (dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + if (dsl_prop_get_all(os, &origprops, TRUE) == 0) { + clear_props(zc->zc_name, origprops); + nvlist_free(origprops); + } + dmu_objset_close(os); + } - return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL)); } - if ((error = get_nvlist(zc, &nvl)) != 0) - return (error); + error = zfs_set_prop_nvlist(zc->zc_name, nvl); - error = zfs_set_prop_nvlist(zc->zc_name, zc->zc_dev, - (cred_t *)(uintptr_t)zc->zc_cred, nvl); nvlist_free(nvl); return (error); } +/* + * inputs: + * zc_name name of filesystem + * zc_value name of property to inherit + * + * outputs: none + */ +static int +zfs_ioc_inherit_prop(zfs_cmd_t *zc) +{ + /* the property name has been validated by zfs_secpolicy_inherit() */ + return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL)); +} + static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { - nvlist_t *nvl; - int error, reset_bootfs = 0; - uint64_t objnum; - zpool_prop_t prop; - nvpair_t *elem; - char *propname, *strval; + nvlist_t *props; spa_t *spa; - vdev_t *rvdev; - char *vdev_type; - objset_t *os; + int error; - if ((error = get_nvlist(zc, &nvl)) != 0) + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &props))) return (error); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { - nvlist_free(nvl); + nvlist_free(props); return (error); } - if (spa_version(spa) < ZFS_VERSION_BOOTFS) { - nvlist_free(nvl); - spa_close(spa, FTAG); - return (ENOTSUP); - } - - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - - propname = nvpair_name(elem); - - if ((prop = zpool_name_to_prop(propname)) == - ZFS_PROP_INVAL) { - nvlist_free(nvl); - spa_close(spa, FTAG); - return (EINVAL); - } - - switch (prop) { - case ZFS_PROP_BOOTFS: - /* - * A bootable filesystem can not be on a RAIDZ pool - * nor a striped pool with more than 1 device. 
- */ - rvdev = spa->spa_root_vdev; - vdev_type = - rvdev->vdev_child[0]->vdev_ops->vdev_op_type; - if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || - (strcmp(vdev_type, VDEV_TYPE_MIRROR) != 0 && - rvdev->vdev_children > 1)) { - error = ENOTSUP; - break; - } - - reset_bootfs = 1; - - VERIFY(nvpair_value_string(elem, &strval) == 0); - if (strval == NULL || strval[0] == '\0') { - objnum = - zfs_prop_default_numeric(ZFS_PROP_BOOTFS); - break; - } - - if (error = dmu_objset_open(strval, DMU_OST_ZFS, - DS_MODE_STANDARD | DS_MODE_READONLY, &os)) - break; - objnum = dmu_objset_id(os); - dmu_objset_close(os); - break; + error = spa_prop_set(spa, props); - default: - error = EINVAL; - } - - if (error) - break; - } - if (error == 0) { - if (reset_bootfs) { - VERIFY(nvlist_remove(nvl, - zpool_prop_to_name(ZFS_PROP_BOOTFS), - DATA_TYPE_STRING) == 0); - VERIFY(nvlist_add_uint64(nvl, - zpool_prop_to_name(ZFS_PROP_BOOTFS), objnum) == 0); - } - error = spa_set_props(spa, nvl); - } - - nvlist_free(nvl); + nvlist_free(props); spa_close(spa, FTAG); return (error); @@ -1190,7 +1687,7 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = spa_get_props(spa, &nvp); + error = spa_prop_get(spa, &nvp); if (error == 0 && zc->zc_nvlist_dst != 0) error = put_nvlist(zc, nvp); @@ -1205,11 +1702,145 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) } static int +zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc) +{ +#ifdef TODO + nvlist_t *nvp; + int error; + uint32_t uid; + uint32_t gid; + uint32_t *groups; + uint_t group_cnt; + cred_t *usercred; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvp)) != 0) { + return (error); + } + + if ((error = nvlist_lookup_uint32(nvp, + ZFS_DELEG_PERM_UID, &uid)) != 0) { + nvlist_free(nvp); + return (EPERM); + } + + if ((error = nvlist_lookup_uint32(nvp, + ZFS_DELEG_PERM_GID, &gid)) != 0) { + nvlist_free(nvp); + return (EPERM); + } + + if ((error = nvlist_lookup_uint32_array(nvp, ZFS_DELEG_PERM_GROUPS, + &groups, &group_cnt)) != 0) { + nvlist_free(nvp); + return (EPERM); + } + usercred = cralloc(); + if ((crsetugid(usercred, uid, gid) != 0) || + (crsetgroups(usercred, group_cnt, (gid_t *)groups) != 0)) { + nvlist_free(nvp); + crfree(usercred); + return (EPERM); + } + nvlist_free(nvp); + error = dsl_deleg_access(zc->zc_name, + zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred); + crfree(usercred); + return (error); +#else + return (EPERM); +#endif +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_src{_size} nvlist of delegated permissions + * zc_perm_action allow/unallow flag + * + * outputs: none + */ +static int +zfs_ioc_set_fsacl(zfs_cmd_t *zc) +{ + int error; + nvlist_t *fsaclnv = NULL; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &fsaclnv)) != 0) + return (error); + + /* + * Verify nvlist is constructed correctly + */ + if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { + nvlist_free(fsaclnv); + return (EINVAL); + } + + /* + * If we don't have PRIV_SYS_MOUNT, then validate + * that user is allowed to hand out each permission in + * the nvlist(s) + */ + + error = secpolicy_zfs(CRED()); + if (error) { + if (zc->zc_perm_action == B_FALSE) { + error = dsl_deleg_can_allow(zc->zc_name, + fsaclnv, CRED()); + } else { + error = dsl_deleg_can_unallow(zc->zc_name, + fsaclnv, CRED()); + } + } + + if (error == 0) + error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); + + nvlist_free(fsaclnv); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * 
+ * outputs: + * zc_nvlist_src{_size} nvlist of delegated permissions + */ +static int +zfs_ioc_get_fsacl(zfs_cmd_t *zc) +{ + nvlist_t *nvp; + int error; + + if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { + error = put_nvlist(zc, nvp); + nvlist_free(nvp); + } + + return (error); +} + +/* + * inputs: + * zc_name name of volume + * + * outputs: none + */ +static int zfs_ioc_create_minor(zfs_cmd_t *zc) { - return (zvol_create_minor(zc->zc_name, zc->zc_dev)); + return (zvol_create_minor(zc->zc_name, ddi_driver_major(zfs_dip))); } +/* + * inputs: + * zc_name name of volume + * + * outputs: none + */ static int zfs_ioc_remove_minor(zfs_cmd_t *zc) { @@ -1228,7 +1859,7 @@ zfs_get_vfs(const char *resource) mtx_lock(&mountlist_mtx); TAILQ_FOREACH(vfsp, &mountlist, mnt_list) { - if (strcmp(vfsp->mnt_stat.f_mntfromname, resource) == 0) { + if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) { VFS_HOLD(vfsp); break; } @@ -1237,21 +1868,183 @@ zfs_get_vfs(const char *resource) return (vfsp); } +/* ARGSUSED */ static void -zfs_create_cb(objset_t *os, void *arg, dmu_tx_t *tx) +zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) +{ + zfs_creat_t *zct = arg; + + zfs_create_fs(os, cr, zct->zct_zplprops, tx); +} + +#define ZFS_PROP_UNDEFINED ((uint64_t)-1) + +/* + * inputs: + * createprops list of properties requested by creator + * default_zplver zpl version to use if unspecified in createprops + * fuids_ok fuids allowed in this version of the spa? + * os parent objset pointer (NULL if root fs) + * + * outputs: + * zplprops values for the zplprops we attach to the master node object + * is_ci true if requested file system will be purely case-insensitive + * + * Determine the settings for utf8only, normalization and + * casesensitivity. Specific values may have been requested by the + * creator and/or we can inherit values from the parent dataset. If + * the file system is of too early a vintage, a creator can not + * request settings for these properties, even if the requested + * setting is the default value. We don't actually want to create dsl + * properties for these, so remove them from the source nvlist after + * processing. + */ +static int +zfs_fill_zplprops_impl(objset_t *os, uint64_t default_zplver, + boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops, + boolean_t *is_ci) +{ + uint64_t zplver = default_zplver; + uint64_t sense = ZFS_PROP_UNDEFINED; + uint64_t norm = ZFS_PROP_UNDEFINED; + uint64_t u8 = ZFS_PROP_UNDEFINED; + + ASSERT(zplprops != NULL); + + /* + * Pull out creator prop choices, if any. + */ + if (createprops) { + (void) nvlist_lookup_uint64(createprops, + zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); + (void) nvlist_lookup_uint64(createprops, + zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); + (void) nvlist_remove_all(createprops, + zfs_prop_to_name(ZFS_PROP_NORMALIZE)); + (void) nvlist_lookup_uint64(createprops, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); + (void) nvlist_remove_all(createprops, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); + (void) nvlist_lookup_uint64(createprops, + zfs_prop_to_name(ZFS_PROP_CASE), &sense); + (void) nvlist_remove_all(createprops, + zfs_prop_to_name(ZFS_PROP_CASE)); + } + + /* + * If the zpl version requested is whacky or the file system + * or pool is version is too "young" to support normalization + * and the creator tried to set a value for one of the props, + * error out. 
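/*
 * The version/normalization validation below can be read as a single
 * predicate. A restatement under the same conventions (a value of
 * ZFS_PROP_UNDEFINED means the creator did not request that property):
 */
static boolean_t
zplprops_request_ok(uint64_t zplver, boolean_t fuids_ok,
    uint64_t norm, uint64_t u8, uint64_t sense)
{
	if (zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION)
		return (B_FALSE);	/* unknown ZPL version */
	if (zplver >= ZPL_VERSION_FUID && !fuids_ok)
		return (B_FALSE);	/* pool too old for a FUID-era ZPL */
	if (zplver < ZPL_VERSION_NORMALIZATION &&
	    (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
	    sense != ZFS_PROP_UNDEFINED))
		return (B_FALSE);	/* normalization props need newer ZPL */
	return (B_TRUE);
}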
+ */ + if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || + (zplver >= ZPL_VERSION_FUID && !fuids_ok) || + (zplver < ZPL_VERSION_NORMALIZATION && + (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || + sense != ZFS_PROP_UNDEFINED))) + return (ENOTSUP); + + /* + * Put the version in the zplprops + */ + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); + + if (norm == ZFS_PROP_UNDEFINED) + VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); + + /* + * If we're normalizing, names must always be valid UTF-8 strings. + */ + if (norm) + u8 = 1; + if (u8 == ZFS_PROP_UNDEFINED) + VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); + + if (sense == ZFS_PROP_UNDEFINED) + VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); + + if (is_ci) + *is_ci = (sense == ZFS_CASE_INSENSITIVE); + + return (0); +} + +static int +zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, + nvlist_t *zplprops, boolean_t *is_ci) +{ + boolean_t fuids_ok = B_TRUE; + uint64_t zplver = ZPL_VERSION; + objset_t *os = NULL; + char parentname[MAXNAMELEN]; + char *cp; + int error; + + (void) strlcpy(parentname, dataset, sizeof (parentname)); + cp = strrchr(parentname, '/'); + ASSERT(cp != NULL); + cp[0] = '\0'; + + if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) { + zplver = ZPL_VERSION_FUID - 1; + fuids_ok = B_FALSE; + } + + /* + * Open parent object set so we can inherit zplprop values. + */ + if ((error = dmu_objset_open(parentname, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os)) != 0) + return (error); + + error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, createprops, + zplprops, is_ci); + dmu_objset_close(os); + return (error); +} + +static int +zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, + nvlist_t *zplprops, boolean_t *is_ci) { - zfs_create_data_t *zc = arg; + boolean_t fuids_ok = B_TRUE; + uint64_t zplver = ZPL_VERSION; + int error; - zfs_create_fs(os, (cred_t *)(uintptr_t)zc->zc_cred, tx); + if (spa_vers < SPA_VERSION_FUID) { + zplver = ZPL_VERSION_FUID - 1; + fuids_ok = B_FALSE; + } + + error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, createprops, + zplprops, is_ci); + return (error); } +/* + * inputs: + * zc_objset_type type of objset to create (fs vs zvol) + * zc_name name of new objset + * zc_value name of snapshot to clone from (may be empty) + * zc_nvlist_src{_size} nvlist of properties to apply + * + * outputs: none + */ static int zfs_ioc_create(zfs_cmd_t *zc) { objset_t *clone; int error = 0; - zfs_create_data_t cbdata = { 0 }; - void (*cbfunc)(objset_t *os, void *arg, dmu_tx_t *tx); + zfs_creat_t zct; + nvlist_t *nvprops = NULL; + void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); dmu_objset_type_t type = zc->zc_objset_type; switch (type) { @@ -1266,16 +2059,19 @@ zfs_ioc_create(zfs_cmd_t *zc) default: cbfunc = NULL; + break; } - if (strchr(zc->zc_name, '@')) + if (strchr(zc->zc_name, '@') || + strchr(zc->zc_name, '%')) return (EINVAL); if (zc->zc_nvlist_src != 0 && - (error = get_nvlist(zc, &cbdata.zc_props)) != 0) + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvprops)) != 0) return (error); - cbdata.zc_cred = (cred_t *)(uintptr_t)zc->zc_cred; - cbdata.zc_dev = (dev_t)zc->zc_dev; + 
zct.zct_zplprops = NULL; + zct.zct_props = nvprops; if (zc->zc_value[0] != '\0') { /* @@ -1283,39 +2079,48 @@ zfs_ioc_create(zfs_cmd_t *zc) */ zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) { - nvlist_free(cbdata.zc_props); + nvlist_free(nvprops); return (EINVAL); } error = dmu_objset_open(zc->zc_value, type, - DS_MODE_STANDARD | DS_MODE_READONLY, &clone); + DS_MODE_USER | DS_MODE_READONLY, &clone); + if (error) { + nvlist_free(nvprops); + return (error); + } + + error = dmu_objset_create(zc->zc_name, type, clone, 0, + NULL, NULL); if (error) { - nvlist_free(cbdata.zc_props); + dmu_objset_close(clone); + nvlist_free(nvprops); return (error); } - error = dmu_objset_create(zc->zc_name, type, clone, NULL, NULL); dmu_objset_close(clone); } else { + boolean_t is_insensitive = B_FALSE; + if (cbfunc == NULL) { - nvlist_free(cbdata.zc_props); + nvlist_free(nvprops); return (EINVAL); } if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; - if (cbdata.zc_props == NULL || - nvlist_lookup_uint64(cbdata.zc_props, + if (nvprops == NULL || + nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) { - nvlist_free(cbdata.zc_props); + nvlist_free(nvprops); return (EINVAL); } - if ((error = nvlist_lookup_uint64(cbdata.zc_props, + if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) { - nvlist_free(cbdata.zc_props); + nvlist_free(nvprops); return (EINVAL); } @@ -1327,56 +2132,127 @@ zfs_ioc_create(zfs_cmd_t *zc) volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) { - nvlist_free(cbdata.zc_props); + nvlist_free(nvprops); return (error); } - } + } else if (type == DMU_OST_ZFS) { + int error; - error = dmu_objset_create(zc->zc_name, type, NULL, cbfunc, - &cbdata); + /* + * We have to have normalization and + * case-folding flags correct when we do the + * file system creation, so go figure them out + * now. + */ + VERIFY(nvlist_alloc(&zct.zct_zplprops, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + error = zfs_fill_zplprops(zc->zc_name, nvprops, + zct.zct_zplprops, &is_insensitive); + if (error != 0) { + nvlist_free(nvprops); + nvlist_free(zct.zct_zplprops); + return (error); + } + } + error = dmu_objset_create(zc->zc_name, type, NULL, + is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); + nvlist_free(zct.zct_zplprops); } /* * It would be nice to do this atomically. 
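/*
 * What follows is the compensating-action idiom: the create and the
 * property application are separate operations, so a failed property
 * application is "undone" by destroying the objset that was just
 * created. A minimal sketch, with hypothetical stand-ins for
 * dmu_objset_create(), zfs_set_prop_nvlist() and dmu_objset_destroy():
 */
static int
create_with_props(const char *name,
    int (*create_fn)(const char *),
    int (*setprops_fn)(const char *),
    int (*destroy_fn)(const char *))
{
	int error;

	if ((error = create_fn(name)) != 0)
		return (error);
	if ((error = setprops_fn(name)) != 0)
		(void) destroy_fn(name);	/* best-effort rollback */
	return (error);
}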
*/ if (error == 0) { - if ((error = zfs_set_prop_nvlist(zc->zc_name, - zc->zc_dev, (cred_t *)(uintptr_t)zc->zc_cred, - cbdata.zc_props)) != 0) + if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0) (void) dmu_objset_destroy(zc->zc_name); } - - nvlist_free(cbdata.zc_props); + nvlist_free(nvprops); return (error); } +struct snap_prop_arg { + nvlist_t *nvprops; + const char *snapname; +}; + +static int +set_snap_props(char *name, void *arg) +{ + struct snap_prop_arg *snpa = arg; + int len = strlen(name) + strlen(snpa->snapname) + 2; + char *buf = kmem_alloc(len, KM_SLEEP); + int err; + + (void) snprintf(buf, len, "%s@%s", name, snpa->snapname); + err = zfs_set_prop_nvlist(buf, snpa->nvprops); + if (err) + (void) dmu_objset_destroy(buf); + kmem_free(buf, len); + return (err); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value short name of snapshot + * zc_cookie recursive flag + * + * outputs: none + */ static int zfs_ioc_snapshot(zfs_cmd_t *zc) { + nvlist_t *nvprops = NULL; + int error; + boolean_t recursive = zc->zc_cookie; + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) return (EINVAL); - return (dmu_objset_snapshot(zc->zc_name, - zc->zc_value, zc->zc_cookie)); + + if (zc->zc_nvlist_src != 0 && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvprops)) != 0) + return (error); + + error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, recursive); + + /* + * It would be nice to do this atomically. + */ + if (error == 0) { + struct snap_prop_arg snpa; + snpa.nvprops = nvprops; + snpa.snapname = zc->zc_value; + if (recursive) { + error = dmu_objset_find(zc->zc_name, + set_snap_props, &snpa, DS_FIND_CHILDREN); + if (error) { + (void) dmu_snapshots_destroy(zc->zc_name, + zc->zc_value); + } + } else { + error = set_snap_props(zc->zc_name, &snpa); + } + } + nvlist_free(nvprops); + return (error); } int zfs_unmount_snap(char *name, void *arg) { - char *snapname = arg; - char *cp; vfs_t *vfsp = NULL; - /* - * Snapshots (which are under .zfs control) must be unmounted - * before they can be destroyed. - */ + if (arg) { + char *snapname = arg; + int len = strlen(name) + strlen(snapname) + 2; + char *buf = kmem_alloc(len, KM_SLEEP); - if (snapname) { - (void) strcat(name, "@"); - (void) strcat(name, snapname); - vfsp = zfs_get_vfs(name); - cp = strchr(name, '@'); - *cp = '\0'; + (void) strcpy(buf, name); + (void) strcat(buf, "@"); + (void) strcat(buf, snapname); + vfsp = zfs_get_vfs(buf); + kmem_free(buf, len); } else if (strchr(name, '@')) { vfsp = zfs_get_vfs(name); } @@ -1400,6 +2276,13 @@ zfs_unmount_snap(char *name, void *arg) return (0); } +/* + * inputs: + * zc_name name of filesystem + * zc_value short name of snapshot + * + * outputs: none + */ static int zfs_ioc_destroy_snaps(zfs_cmd_t *zc) { @@ -1414,6 +2297,13 @@ zfs_ioc_destroy_snaps(zfs_cmd_t *zc) return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value)); } +/* + * inputs: + * zc_name name of dataset to destroy + * zc_objset_type type of objset + * + * outputs: none + */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { @@ -1426,19 +2316,76 @@ zfs_ioc_destroy(zfs_cmd_t *zc) return (dmu_objset_destroy(zc->zc_name)); } +/* + * inputs: + * zc_name name of dataset to rollback (to most recent snapshot) + * + * outputs: none + */ static int zfs_ioc_rollback(zfs_cmd_t *zc) { - return (dmu_objset_rollback(zc->zc_name)); + objset_t *os; + int error; + zfsvfs_t *zfsvfs = NULL; + + /* + * Get the zfsvfs for the receiving objset. 
There + * won't be one if we're operating on a zvol, if the + * objset doesn't exist yet, or is not mounted. + */ + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) + return (error); + + if (dmu_objset_type(os) == DMU_OST_ZFS) { + mutex_enter(&os->os->os_user_ptr_lock); + zfsvfs = dmu_objset_get_user(os); + if (zfsvfs != NULL) + VFS_HOLD(zfsvfs->z_vfs); + mutex_exit(&os->os->os_user_ptr_lock); + } + + if (zfsvfs != NULL) { + char osname[MAXNAMELEN]; + int mode; + + error = zfs_suspend_fs(zfsvfs, osname, &mode); + if (error == 0) { + int resume_err; + + ASSERT(strcmp(osname, zc->zc_name) == 0); + error = dmu_objset_rollback(os); + resume_err = zfs_resume_fs(zfsvfs, osname, mode); + error = error ? error : resume_err; + } else { + dmu_objset_close(os); + } + VFS_RELE(zfsvfs->z_vfs); + } else { + error = dmu_objset_rollback(os); + } + /* Note, the dmu_objset_rollback() releases the objset for us. */ + + return (error); } +/* + * inputs: + * zc_name old name of dataset + * zc_value new name of dataset + * zc_cookie recursive flag (only valid for snapshots) + * + * outputs: none + */ static int zfs_ioc_rename(zfs_cmd_t *zc) { - int recursive = zc->zc_cookie & 1; + boolean_t recursive = zc->zc_cookie & 1; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; - if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || + strchr(zc->zc_value, '%')) return (EINVAL); /* @@ -1452,48 +2399,199 @@ zfs_ioc_rename(zfs_cmd_t *zc) if (err) return (err); } - return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive)); } +static void +clear_props(char *dataset, nvlist_t *props) +{ + zfs_cmd_t *zc; + nvpair_t *prop; + + if (props == NULL) + return; + zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); + (void) strcpy(zc->zc_name, dataset); + for (prop = nvlist_next_nvpair(props, NULL); prop; + prop = nvlist_next_nvpair(props, prop)) { + (void) strcpy(zc->zc_value, nvpair_name(prop)); + if (zfs_secpolicy_inherit(zc, CRED()) == 0) + (void) zfs_ioc_inherit_prop(zc); + } + kmem_free(zc, sizeof (zfs_cmd_t)); +} + +/* + * inputs: + * zc_name name of containing filesystem + * zc_nvlist_src{_size} nvlist of properties to apply + * zc_value name of snapshot to create + * zc_string name of clone origin (if DRR_FLAG_CLONE) + * zc_cookie file descriptor to recv from + * zc_begin_record the BEGIN record of the stream (not byteswapped) + * zc_guid force flag + * + * outputs: + * zc_cookie number of bytes read + */ static int -zfs_ioc_recvbackup(zfs_cmd_t *zc) +zfs_ioc_recv(zfs_cmd_t *zc) { - kthread_t *td = curthread; - struct file *fp; - int error; - offset_t new_off; + file_t *fp; + objset_t *os; + dmu_recv_cookie_t drc; + zfsvfs_t *zfsvfs = NULL; + boolean_t force = (boolean_t)zc->zc_guid; + int error, fd; + offset_t off; + nvlist_t *props = NULL; + nvlist_t *origprops = NULL; + objset_t *origin = NULL; + char *tosnap; + char tofs[ZFS_MAXNAMELEN]; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || - strchr(zc->zc_value, '@') == NULL) + strchr(zc->zc_value, '@') == NULL || + strchr(zc->zc_value, '%')) return (EINVAL); - error = fget_read(td, zc->zc_cookie, &fp); - if (error) + (void) strcpy(tofs, zc->zc_value); + tosnap = strchr(tofs, '@'); + *tosnap = '\0'; + tosnap++; + + if (zc->zc_nvlist_src != 0 && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &props)) != 0) return (error); - error = dmu_recvbackup(zc->zc_value, &zc->zc_begin_record, - &zc->zc_cookie, (boolean_t)zc->zc_guid, fp, - fp->f_offset); + fd = 
zc->zc_cookie; + fp = getf(fd, 0); + if (fp == NULL) { + nvlist_free(props); + return (EBADF); + } - new_off = fp->f_offset + zc->zc_cookie; - fp->f_offset = new_off; + if (dmu_objset_open(tofs, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + /* + * Try to get the zfsvfs for the receiving objset. + * There won't be one if we're operating on a zvol, + * if the objset doesn't exist yet, or is not mounted. + */ + mutex_enter(&os->os->os_user_ptr_lock); + if (zfsvfs = dmu_objset_get_user(os)) { + if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) { + mutex_exit(&os->os->os_user_ptr_lock); + dmu_objset_close(os); + zfsvfs = NULL; + error = EBUSY; + goto out; + } + VFS_HOLD(zfsvfs->z_vfs); + } + mutex_exit(&os->os->os_user_ptr_lock); + + /* + * If new properties are supplied, they are to completely + * replace the existing ones, so stash away the existing ones. + */ + if (props) + (void) dsl_prop_get_all(os, &origprops, TRUE); + + dmu_objset_close(os); + } + + if (zc->zc_string[0]) { + error = dmu_objset_open(zc->zc_string, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &origin); + if (error) + goto out; + } + + error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record, + force, origin, zfsvfs != NULL, &drc); + if (origin) + dmu_objset_close(origin); + if (error) + goto out; + + /* + * Reset properties. We do this before we receive the stream + * so that the properties are applied to the new data. + */ + if (props) { + clear_props(tofs, origprops); + /* + * XXX - Note, this is all-or-nothing; should be best-effort. + */ + (void) zfs_set_prop_nvlist(tofs, props); + } + + off = fp->f_offset; + error = dmu_recv_stream(&drc, fp, &off); + + if (error == 0 && zfsvfs) { + char osname[MAXNAMELEN]; + int mode; + + /* online recv */ + error = zfs_suspend_fs(zfsvfs, osname, &mode); + if (error == 0) { + int resume_err; + + error = dmu_recv_end(&drc); + resume_err = zfs_resume_fs(zfsvfs, osname, mode); + error = error ? error : resume_err; + } else { + dmu_recv_abort_cleanup(&drc); + } + } else if (error == 0) { + error = dmu_recv_end(&drc); + } - fdrop(fp, td); + zc->zc_cookie = off - fp->f_offset; + if (off >= 0 && off <= MAXOFFSET_T) + fp->f_offset = off; + + /* + * On error, restore the original props. 
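/*
 * The receive path replaces the dataset's local properties before the
 * stream is consumed and reinstates the stashed originals if anything
 * fails. A sketch of that stash/replace/restore sequence; body() stands
 * in for the dmu_recv_stream()/dmu_recv_end() work, and clear_fn() and
 * set_fn() for clear_props() and zfs_set_prop_nvlist():
 */
static int
with_replaced_props(const char *fs, nvlist_t *newprops, nvlist_t *origprops,
    int (*body)(const char *),
    void (*clear_fn)(const char *, nvlist_t *),
    int (*set_fn)(const char *, nvlist_t *))
{
	int error;

	clear_fn(fs, origprops);	/* drop the existing local props */
	(void) set_fn(fs, newprops);	/* all-or-nothing, as noted above */

	error = body(fs);

	if (error != 0) {
		clear_fn(fs, newprops);		/* back out our props */
		(void) set_fn(fs, origprops);	/* reinstate the originals */
	}
	return (error);
}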
+ */ + if (error && props) { + clear_props(tofs, props); + (void) zfs_set_prop_nvlist(tofs, origprops); + } +out: + if (zfsvfs) { + mutex_exit(&zfsvfs->z_online_recv_lock); + VFS_RELE(zfsvfs->z_vfs); + } + nvlist_free(props); + nvlist_free(origprops); + releasef(fp); return (error); } +/* + * inputs: + * zc_name name of snapshot to send + * zc_value short name of incremental fromsnap (may be empty) + * zc_cookie file descriptor to send stream to + * zc_obj fromorigin flag (mutually exclusive with zc_value) + * + * outputs: none + */ static int -zfs_ioc_sendbackup(zfs_cmd_t *zc) +zfs_ioc_send(zfs_cmd_t *zc) { - kthread_t *td = curthread; - struct file *fp; objset_t *fromsnap = NULL; objset_t *tosnap; - int error, fd; + file_t *fp; + int error; + offset_t off; error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap); + DS_MODE_USER | DS_MODE_READONLY, &tosnap); if (error) return (error); @@ -1507,25 +2605,27 @@ zfs_ioc_sendbackup(zfs_cmd_t *zc) *(cp+1) = 0; (void) strlcat(buf, zc->zc_value, sizeof (buf)); error = dmu_objset_open(buf, DMU_OST_ANY, - DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap); + DS_MODE_USER | DS_MODE_READONLY, &fromsnap); if (error) { dmu_objset_close(tosnap); return (error); } } - fd = zc->zc_cookie; - error = fget_write(td, fd, &fp); - if (error) { + fp = getf(zc->zc_cookie, 1); + if (fp == NULL) { dmu_objset_close(tosnap); if (fromsnap) dmu_objset_close(fromsnap); - return (error); + return (EBADF); } - error = dmu_sendbackup(tosnap, fromsnap, fp); + off = fp->f_offset; + error = dmu_sendbackup(tosnap, fromsnap, zc->zc_obj, fp, &off); - fdrop(fp, td); + if (off >= 0 && off <= MAXOFFSET_T) + fp->f_offset = off; + releasef(fp); if (fromsnap) dmu_objset_close(fromsnap); dmu_objset_close(tosnap); @@ -1595,28 +2695,58 @@ zfs_ioc_clear(zfs_cmd_t *zc) vdev_t *vd; int error; + /* + * On zpool clear we also fix up missing slogs + */ + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(zc->zc_name); + if (spa == NULL) { + mutex_exit(&spa_namespace_lock); + return (EIO); + } + if (spa->spa_log_state == SPA_LOG_MISSING) { + /* we need to let spa_open/spa_load clear the chains */ + spa->spa_log_state = SPA_LOG_CLEAR; + } + mutex_exit(&spa_namespace_lock); + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - spa_config_enter(spa, RW_WRITER, FTAG); + spa_vdev_state_enter(spa); if (zc->zc_guid == 0) { vd = NULL; - } else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) { - spa_config_exit(spa, FTAG); - spa_close(spa, FTAG); - return (ENODEV); + } else { + vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); + if (vd == NULL) { + (void) spa_vdev_state_exit(spa, NULL, ENODEV); + spa_close(spa, FTAG); + return (ENODEV); + } } vdev_clear(spa, vd); - spa_config_exit(spa, FTAG); + (void) spa_vdev_state_exit(spa, NULL, 0); + + /* + * Resume any suspended I/Os. + */ + zio_resume(spa); spa_close(spa, FTAG); return (0); } +/* + * inputs: + * zc_name name of filesystem + * zc_value name of origin snapshot + * + * outputs: none + */ static int zfs_ioc_promote(zfs_cmd_t *zc) { @@ -1634,68 +2764,221 @@ zfs_ioc_promote(zfs_cmd_t *zc) return (dsl_dataset_promote(zc->zc_name)); } +#ifdef TODO +/* + * We don't want to have a hard dependency + * against some special symbols in sharefs + * nfs, and smbsrv. Determine them if needed when + * the first file system is shared. + * Neither sharefs, nfs or smbsrv are unloadable modules. 
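/*
 * The share code binds to nfs, smbsrv and sharefs lazily, on first use,
 * so zfs carries no hard symbol dependency on them. A condensed sketch
 * of that double-checked lazy-binding idiom using the Solaris DDI
 * interfaces named above (the module path and symbol name are taken
 * from the surrounding code):
 */
static int (*znfsexport)(void *);
static ddi_modhandle_t nfs_hdl;
static int nfs_inited;

static int
bind_nfs_once(kmutex_t *lock)
{
	int error;

	if (nfs_inited)			/* fast path: already bound */
		return (0);
	mutex_enter(lock);
	if (nfs_hdl == NULL && (nfs_hdl = ddi_modopen("fs/nfs",
	    KRTLD_MODE_FIRST, &error)) == NULL) {
		mutex_exit(lock);
		return (ENOSYS);
	}
	if (znfsexport == NULL && (znfsexport = (int (*)(void *))
	    ddi_modsym(nfs_hdl, "nfs_export", &error)) == NULL) {
		mutex_exit(lock);
		return (ENOSYS);
	}
	nfs_inited = 1;
	mutex_exit(lock);
	return (0);
}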
+ */ +int (*znfsexport_fs)(void *arg); +int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); +int (*zsmbexport_fs)(void *arg, boolean_t add_share); + +int zfs_nfsshare_inited; +int zfs_smbshare_inited; + +ddi_modhandle_t nfs_mod; +ddi_modhandle_t sharefs_mod; +ddi_modhandle_t smbsrv_mod; +#endif +kmutex_t zfs_share_lock; + +#ifdef TODO +static int +zfs_init_sharefs() +{ + int error; + + ASSERT(MUTEX_HELD(&zfs_share_lock)); + /* Both NFS and SMB shares also require sharetab support. */ + if (sharefs_mod == NULL && ((sharefs_mod = + ddi_modopen("fs/sharefs", + KRTLD_MODE_FIRST, &error)) == NULL)) { + return (ENOSYS); + } + if (zshare_fs == NULL && ((zshare_fs = + (int (*)(enum sharefs_sys_op, share_t *, uint32_t)) + ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) { + return (ENOSYS); + } + return (0); +} +#endif + +static int +zfs_ioc_share(zfs_cmd_t *zc) +{ +#ifdef TODO + int error; + int opcode; + + switch (zc->zc_share.z_sharetype) { + case ZFS_SHARE_NFS: + case ZFS_UNSHARE_NFS: + if (zfs_nfsshare_inited == 0) { + mutex_enter(&zfs_share_lock); + if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs", + KRTLD_MODE_FIRST, &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + if (znfsexport_fs == NULL && + ((znfsexport_fs = (int (*)(void *)) + ddi_modsym(nfs_mod, + "nfs_export", &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + error = zfs_init_sharefs(); + if (error) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + zfs_nfsshare_inited = 1; + mutex_exit(&zfs_share_lock); + } + break; + case ZFS_SHARE_SMB: + case ZFS_UNSHARE_SMB: + if (zfs_smbshare_inited == 0) { + mutex_enter(&zfs_share_lock); + if (smbsrv_mod == NULL && ((smbsrv_mod = + ddi_modopen("drv/smbsrv", + KRTLD_MODE_FIRST, &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + if (zsmbexport_fs == NULL && ((zsmbexport_fs = + (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod, + "smb_server_share", &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + error = zfs_init_sharefs(); + if (error) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + zfs_smbshare_inited = 1; + mutex_exit(&zfs_share_lock); + } + break; + default: + return (EINVAL); + } + + switch (zc->zc_share.z_sharetype) { + case ZFS_SHARE_NFS: + case ZFS_UNSHARE_NFS: + if (error = + znfsexport_fs((void *) + (uintptr_t)zc->zc_share.z_exportdata)) + return (error); + break; + case ZFS_SHARE_SMB: + case ZFS_UNSHARE_SMB: + if (error = zsmbexport_fs((void *) + (uintptr_t)zc->zc_share.z_exportdata, + zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? + B_TRUE : B_FALSE)) { + return (error); + } + break; + } + + opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || + zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? + SHAREFS_ADD : SHAREFS_REMOVE; + + /* + * Add or remove share from sharetab + */ + error = zshare_fs(opcode, + (void *)(uintptr_t)zc->zc_share.z_sharedata, + zc->zc_share.z_sharemax); + + return (error); +#else + return (ENOSYS); +#endif +} + +/* + * pool create, destroy, and export don't log the history as part of + * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export + * do the logging of those commands. 
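/*
 * Each entry in the zfs_ioc_vec[] table below pairs a handler with a
 * security policy, a name-check class and a pool-history flag, and
 * zfsdev_ioctl() drives them in a fixed order. A reduced sketch of
 * that dispatch sequence, using the types from the surrounding code:
 */
static int
dispatch_ioc(const zfs_ioc_vec_t *vec, zfs_cmd_t *zc, cred_t *cr)
{
	int error;

	error = vec->zvec_secpolicy(zc, cr);	/* permission check first */

	if (error == 0 && vec->zvec_namecheck == POOL_NAME &&
	    pool_namecheck(zc->zc_name, NULL, NULL) != 0)
		error = EINVAL;
	else if (error == 0 && vec->zvec_namecheck == DATASET_NAME &&
	    dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
		error = EINVAL;

	if (error == 0)
		error = vec->zvec_func(zc);	/* the ioctl body */

	if (vec->zvec_his_log == B_TRUE)	/* record in pool history */
		zfs_log_history(zc);

	return (error);
}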
+ */ static int zfs_ioc_jail(zfs_cmd_t *zc) { - return (zone_dataset_attach((cred_t *)(uintptr_t)zc->zc_cred, - zc->zc_name, (int)zc->zc_jailid)); + return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_jailid)); } static int zfs_ioc_unjail(zfs_cmd_t *zc) { - return (zone_dataset_detach((cred_t *)(uintptr_t)zc->zc_cred, - zc->zc_name, (int)zc->zc_jailid)); + return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_jailid)); } static zfs_ioc_vec_t zfs_ioc_vec[] = { - { zfs_ioc_pool_create, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_import, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_export, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_configs, zfs_secpolicy_none, no_name }, - { zfs_ioc_pool_stats, zfs_secpolicy_read, pool_name }, - { zfs_ioc_pool_tryimport, zfs_secpolicy_config, no_name }, - { zfs_ioc_pool_scrub, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_freeze, zfs_secpolicy_config, no_name }, - { zfs_ioc_pool_upgrade, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_get_history, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_log_history, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_add, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_remove, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_online, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_offline, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_attach, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_detach, zfs_secpolicy_config, pool_name }, - { zfs_ioc_vdev_setpath, zfs_secpolicy_config, pool_name }, - { zfs_ioc_objset_stats, zfs_secpolicy_read, dataset_name }, - { zfs_ioc_dataset_list_next, zfs_secpolicy_read, dataset_name }, - { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, dataset_name }, - { zfs_ioc_set_prop, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_create_minor, zfs_secpolicy_config, dataset_name }, - { zfs_ioc_remove_minor, zfs_secpolicy_config, dataset_name }, - { zfs_ioc_create, zfs_secpolicy_parent, dataset_name }, - { zfs_ioc_destroy, zfs_secpolicy_parent, dataset_name }, - { zfs_ioc_rollback, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_rename, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_sendbackup, zfs_secpolicy_operator, dataset_name }, - { zfs_ioc_inject_fault, zfs_secpolicy_inject, no_name }, - { zfs_ioc_clear_fault, zfs_secpolicy_inject, no_name }, - { zfs_ioc_inject_list_next, zfs_secpolicy_inject, no_name }, - { zfs_ioc_error_log, zfs_secpolicy_inject, pool_name }, - { zfs_ioc_clear, zfs_secpolicy_config, pool_name }, - { zfs_ioc_promote, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_destroy_snaps, zfs_secpolicy_write, dataset_name }, - { zfs_ioc_snapshot, zfs_secpolicy_operator, dataset_name }, - { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, pool_name }, - { zfs_ioc_obj_to_path, zfs_secpolicy_config, no_name }, - { zfs_ioc_pool_set_props, zfs_secpolicy_config, pool_name }, - { zfs_ioc_pool_get_props, zfs_secpolicy_read, pool_name }, - { zfs_ioc_jail, zfs_secpolicy_config, dataset_name }, - { zfs_ioc_unjail, zfs_secpolicy_config, dataset_name } + { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_configs, zfs_secpolicy_none, 
NO_NAME, B_FALSE }, + { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE }, + { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE }, + { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, + { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, + { zfs_ioc_dataset_list_next, zfs_secpolicy_read, + DATASET_NAME, B_FALSE }, + { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, + DATASET_NAME, B_FALSE }, + { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE }, + { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, + { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, + { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE }, + { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, + { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE }, + { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE }, + { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE }, + { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE }, + { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, + { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, + { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE }, + { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE }, + { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE }, + { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, + { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE }, + { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE }, + { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE }, + { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE }, + { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, + { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, + DATASET_NAME, B_FALSE }, + { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE }, + { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE }, + { zfs_ioc_jail, zfs_secpolicy_config, DATASET_NAME, B_TRUE }, + { zfs_ioc_unjail, zfs_secpolicy_config, DATASET_NAME, B_TRUE } }; static int @@ -1711,9 +2994,7 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (EINVAL); - zc->zc_cred = (uintptr_t)td->td_ucred; - zc->zc_dev = (uintptr_t)dev; - error = zfs_ioc_vec[vec].zvec_secpolicy(zc->zc_name, td->td_ucred); + error = zfs_ioc_vec[vec].zvec_secpolicy(zc, td->td_ucred); /* * Ensure that all pool/dataset names are valid before we pass down to @@ -1722,17 +3003,17 @@ 
zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, if (error == 0) { zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (zfs_ioc_vec[vec].zvec_namecheck) { - case pool_name: + case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; break; - case dataset_name: + case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; break; - case no_name: + case NO_NAME: break; } } @@ -1740,6 +3021,9 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, if (error == 0) error = zfs_ioc_vec[vec].zvec_func(zc); + if (zfs_ioc_vec[vec].zvec_his_log == B_TRUE) + zfs_log_history(zc); + return (error); } @@ -1761,7 +3045,7 @@ static struct cdevsw zfs_cdevsw = { static void zfsdev_init(void) { - zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0660, + zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666, ZFS_DEV_NAME); } @@ -1775,6 +3059,10 @@ zfsdev_fini(void) static struct task zfs_start_task; static struct root_hold_token *zfs_root_token; + +uint_t zfs_fsyncer_key; +extern uint_t rrw_tsd_key; + static void zfs_start(void *context __unused, int pending __unused) { @@ -1783,7 +3071,11 @@ zfs_start(void *context __unused, int pending __unused) spa_init(FREAD | FWRITE); zfs_init(); zvol_init(); - printf("ZFS storage pool version " ZFS_VERSION_STRING "\n"); + + tsd_create(&zfs_fsyncer_key, NULL); + tsd_create(&rrw_tsd_key, NULL); + + printf("ZFS storage pool version " SPA_VERSION_STRING "\n"); root_mount_rel(zfs_root_token); } @@ -1800,6 +3092,7 @@ zfs_modevent(module_t mod, int type, void *unused __unused) "feature in FreeBSD.\n"); TASK_INIT(&zfs_start_task, 0, zfs_start, NULL); taskqueue_enqueue(taskqueue_thread, &zfs_start_task); + mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); error = 0; break; case MOD_UNLOAD: @@ -1812,6 +3105,9 @@ zfs_modevent(module_t mod, int type, void *unused __unused) zfs_fini(); spa_fini(); zfsdev_fini(); + tsd_destroy(&zfs_fsyncer_key); + tsd_destroy(&rrw_tsd_key); + mutex_destroy(&zfs_share_lock); error = 0; break; } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c index dde9ec1a335c..5f99780d7544 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/systm.h> @@ -36,49 +34,282 @@ #include <sys/zfs_znode.h> #include <sys/zfs_dir.h> #include <sys/zil.h> +#include <sys/zil_impl.h> #include <sys/byteorder.h> #include <sys/policy.h> #include <sys/stat.h> #include <sys/acl.h> #include <sys/dmu.h> #include <sys/spa.h> +#include <sys/zfs_fuid.h> /* * All the functions in this file are used to construct the log entries - * to record transactions. They allocate * a intent log transaction + * to record transactions. They allocate * an intent log transaction * structure (itx_t) and save within it all the information necessary to * possibly replay the transaction. The itx is then assigned a sequence * number and inserted in the in-memory list anchored in the zilog. 
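/*
 * Every zfs_log_*() routine below follows the same shape: size the
 * record, zil_itx_create(), fill in the fixed lr_*_t body plus any
 * variable-length tail, then zil_itx_assign() and remember the
 * sequence number on the znode. A schematic of that shape, using the
 * TX_REMOVE layout (fixed header plus one name) as the simplest case:
 */
static void
log_name_op(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
    znode_t *dzp, const char *name)
{
	itx_t *itx;
	lr_remove_t *lr;
	size_t namesize = strlen(name) + 1;

	if (zilog == NULL)		/* ZIL disabled: nothing to record */
		return;

	itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
	lr = (lr_remove_t *)&itx->itx_lr;
	lr->lr_doid = dzp->z_id;
	bcopy(name, (char *)(lr + 1), namesize);	/* variable tail */

	dzp->z_last_itx = zil_itx_assign(zilog, itx, tx);
}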
*/ +int +zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap) +{ + int isxvattr = (vap->va_mask & AT_XVATTR); + switch (type) { + case Z_FILE: + if (vsecp == NULL && !isxvattr) + return (TX_CREATE); + if (vsecp && isxvattr) + return (TX_CREATE_ACL_ATTR); + if (vsecp) + return (TX_CREATE_ACL); + else + return (TX_CREATE_ATTR); + /*NOTREACHED*/ + case Z_DIR: + if (vsecp == NULL && !isxvattr) + return (TX_MKDIR); + if (vsecp && isxvattr) + return (TX_MKDIR_ACL_ATTR); + if (vsecp) + return (TX_MKDIR_ACL); + else + return (TX_MKDIR_ATTR); + case Z_XATTRDIR: + return (TX_MKXATTR); + } + ASSERT(0); + return (TX_MAX_TYPE); +} + +/* + * build up the log data necessary for logging xvattr_t + * First lr_attr_t is initialized. following the lr_attr_t + * is the mapsize and attribute bitmap copied from the xvattr_t. + * Following the bitmap and bitmapsize two 64 bit words are reserved + * for the create time which may be set. Following the create time + * records a single 64 bit integer which has the bits to set on + * replay for the xvattr. + */ +static void +zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) +{ + uint32_t *bitmap; + uint64_t *attrs; + uint64_t *crtime; + xoptattr_t *xoap; + void *scanstamp; + int i; + + xoap = xva_getxoptattr(xvap); + ASSERT(xoap); + + lrattr->lr_attr_masksize = xvap->xva_mapsize; + bitmap = &lrattr->lr_attr_bitmap; + for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) { + *bitmap = xvap->xva_reqattrmap[i]; + } + + /* Now pack the attributes up in a single uint64_t */ + attrs = (uint64_t *)bitmap; + crtime = attrs + 1; + scanstamp = (caddr_t)(crtime + 2); + *attrs = 0; + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) + *attrs |= (xoap->xoa_readonly == 0) ? 0 : + XAT0_READONLY; + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) + *attrs |= (xoap->xoa_hidden == 0) ? 0 : + XAT0_HIDDEN; + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) + *attrs |= (xoap->xoa_system == 0) ? 0 : + XAT0_SYSTEM; + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) + *attrs |= (xoap->xoa_archive == 0) ? 0 : + XAT0_ARCHIVE; + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) + *attrs |= (xoap->xoa_immutable == 0) ? 0 : + XAT0_IMMUTABLE; + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) + *attrs |= (xoap->xoa_nounlink == 0) ? 0 : + XAT0_NOUNLINK; + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) + *attrs |= (xoap->xoa_appendonly == 0) ? 0 : + XAT0_APPENDONLY; + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) + *attrs |= (xoap->xoa_opaque == 0) ? 0 : + XAT0_APPENDONLY; + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) + *attrs |= (xoap->xoa_nodump == 0) ? 0 : + XAT0_NODUMP; + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) + *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : + XAT0_AV_QUARANTINED; + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) + *attrs |= (xoap->xoa_av_modified == 0) ? 
0 : + XAT0_AV_MODIFIED; + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); +} + +static void * +zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start) +{ + zfs_fuid_t *zfuid; + uint64_t *fuidloc = start; + + /* First copy in the ACE FUIDs */ + for (zfuid = list_head(&fuidp->z_fuids); zfuid; + zfuid = list_next(&fuidp->z_fuids, zfuid)) { + *fuidloc++ = zfuid->z_logfuid; + } + return (fuidloc); +} + + +static void * +zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) +{ + zfs_fuid_domain_t *zdomain; + + /* now copy in the domain info, if any */ + if (fuidp->z_domain_str_sz != 0) { + for (zdomain = list_head(&fuidp->z_domains); zdomain; + zdomain = list_next(&fuidp->z_domains, zdomain)) { + bcopy((void *)zdomain->z_domain, start, + strlen(zdomain->z_domain) + 1); + start = (caddr_t)start + + strlen(zdomain->z_domain) + 1; + } + } + return (start); +} + /* - * zfs_log_create() is used to handle TX_CREATE, TX_MKDIR and TX_MKXATTR + * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, + * TX_MKDIR_ATTR and TX_MKXATTR * transactions. + * + * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID + * domain information appended prior to the name. In this case the + * uid/gid in the log record will be a log centric FUID. + * + * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that + * may contain attributes, ACL and optional fuid information. + * + * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify + * and ACL and normal users/groups in the ACEs. + * + * There may be an optional xvattr attribute information similar + * to zfs_log_setattr. + * + * Also, after the file name "domain" strings may be appended. */ void -zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *dzp, znode_t *zp, char *name) +zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp, + zfs_fuid_info_t *fuidp, vattr_t *vap) { itx_t *itx; uint64_t seq; lr_create_t *lr; + lr_acl_create_t *lracl; + size_t aclsize; + size_t xvatsize = 0; + size_t txsize; + xvattr_t *xvap = (xvattr_t *)vap; + void *end; + size_t lrsize; size_t namesize = strlen(name) + 1; + size_t fuidsz = 0; if (zilog == NULL) return; - itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + /* + * If we have FUIDs present then add in space for + * domains and ACE fuid's if any. + */ + if (fuidp) { + fuidsz += fuidp->z_domain_str_sz; + fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t); + } + + if (vap->va_mask & AT_XVATTR) + xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize); + + if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR || + (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR || + (int)txtype == TX_MKXATTR) { + txsize = sizeof (*lr) + namesize + fuidsz + xvatsize; + lrsize = sizeof (*lr); + } else { + aclsize = (vsecp) ? 
vsecp->vsa_aclentsz : 0; + txsize = + sizeof (lr_acl_create_t) + namesize + fuidsz + + ZIL_ACE_LENGTH(aclsize) + xvatsize; + lrsize = sizeof (lr_acl_create_t); + } + + itx = zil_itx_create(txtype, txsize); + lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_foid = zp->z_id; lr->lr_mode = zp->z_phys->zp_mode; - lr->lr_uid = zp->z_phys->zp_uid; - lr->lr_gid = zp->z_phys->zp_gid; + if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) { + lr->lr_uid = (uint64_t)zp->z_phys->zp_uid; + } else { + lr->lr_uid = fuidp->z_fuid_owner; + } + if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) { + lr->lr_gid = (uint64_t)zp->z_phys->zp_gid; + } else { + lr->lr_gid = fuidp->z_fuid_group; + } lr->lr_gen = zp->z_phys->zp_gen; lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; lr->lr_rdev = zp->z_phys->zp_rdev; - bcopy(name, (char *)(lr + 1), namesize); + + /* + * Fill in xvattr info if any + */ + if (vap->va_mask & AT_XVATTR) { + zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap); + end = (caddr_t)lr + lrsize + xvatsize; + } else { + end = (caddr_t)lr + lrsize; + } + + /* Now fill in any ACL info */ + + if (vsecp) { + lracl = (lr_acl_create_t *)&itx->itx_lr; + lracl->lr_aclcnt = vsecp->vsa_aclcnt; + lracl->lr_acl_bytes = aclsize; + lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; + lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; + if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS) + lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; + else + lracl->lr_acl_flags = 0; + + bcopy(vsecp->vsa_aclentp, end, aclsize); + end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize); + } + + /* drop in FUID info */ + if (fuidp) { + end = zfs_log_fuid_ids(fuidp, end); + end = zfs_log_fuid_domains(fuidp, end); + } + /* + * Now place file name in log record + */ + bcopy(name, end, namesize); seq = zil_itx_assign(zilog, itx, tx); dzp->z_last_itx = seq; @@ -89,7 +320,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype, * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions. */ void -zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype, +zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, char *name) { itx_t *itx; @@ -113,7 +344,7 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype, * zfs_log_link() handles TX_LINK transactions. */ void -zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype, +zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name) { itx_t *itx; @@ -139,8 +370,8 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype, * zfs_log_symlink() handles TX_SYMLINK transactions. */ void -zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *dzp, znode_t *zp, char *name, char *link) +zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, char *link) { itx_t *itx; uint64_t seq; @@ -173,7 +404,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype, * zfs_log_rename() handles TX_RENAME transactions. 
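/*
 * A TX_RENAME record carries two names back to back after the fixed
 * lr_rename_t header: the source name, then the target name, each
 * NUL-terminated. A sketch of that packing, assuming the conventional
 * back-to-back layout (the copy itself is not shown in this hunk):
 */
static void
pack_rename_names(lr_rename_t *lr, const char *sname, const char *dname)
{
	size_t snamesize = strlen(sname) + 1;
	char *tail = (char *)(lr + 1);

	bcopy(sname, tail, snamesize);			   /* source name */
	bcopy(dname, tail + snamesize, strlen(dname) + 1); /* target name */
}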
*/ void -zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, +zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) { itx_t *itx; @@ -203,15 +434,16 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype, */ ssize_t zfs_immediate_write_sz = 32768; +#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \ + sizeof (lr_write_t)) + void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t len, int ioflag) + znode_t *zp, offset_t off, ssize_t resid, int ioflag) { - itx_t *itx; - uint64_t seq; - lr_write_t *lr; itx_wr_state_t write_state; - int err; + boolean_t slogging; + uintptr_t fsync_cnt; if (zilog == NULL || zp->z_unlinked) return; @@ -220,52 +452,84 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, * Writes are handled in three different ways: * * WR_INDIRECT: - * If the write is greater than zfs_immediate_write_sz then - * later *if* we need to log the write then dmu_sync() is used - * to immediately write the block and it's block pointer is put - * in the log record. + * In this mode, if we need to commit the write later, then the block + * is immediately written into the file system (using dmu_sync), + * and a pointer to the block is put into the log record. + * When the txg commits the block is linked in. + * This saves additionally writing the data into the log record. + * There are a few requirements for this to occur: + * - write is greater than zfs_immediate_write_sz + * - not using slogs (as slogs are assumed to always be faster + * than writing into the main pool) + * - the write occupies only one block * WR_COPIED: * If we know we'll immediately be committing the - * transaction (FDSYNC (O_DSYNC)), the we allocate a larger + * transaction (FSYNC or FDSYNC), the we allocate a larger * log record here for the data and copy the data in. * WR_NEED_COPY: * Otherwise we don't allocate a buffer, and *if* we need to * flush the write later then a buffer is allocated and * we retrieve the data using the dmu. */ - if (len > zfs_immediate_write_sz) + slogging = spa_has_slogs(zilog->zl_spa); + if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz) write_state = WR_INDIRECT; - else if (ioflag & FDSYNC) + else if (ioflag & (FSYNC | FDSYNC)) write_state = WR_COPIED; else write_state = WR_NEED_COPY; - itx = zil_itx_create(txtype, sizeof (*lr) + - (write_state == WR_COPIED ? len : 0)); - lr = (lr_write_t *)&itx->itx_lr; - if (write_state == WR_COPIED) { - err = dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1); - if (err) { + if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { + (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); + } + + while (resid) { + itx_t *itx; + lr_write_t *lr; + ssize_t len; + + /* + * If the write would overflow the largest block then split it. + */ + if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA) + len = SPA_MAXBLOCKSIZE >> 1; + else + len = resid; + + itx = zil_itx_create(txtype, sizeof (*lr) + + (write_state == WR_COPIED ? 
len : 0)); + lr = (lr_write_t *)&itx->itx_lr; + if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, + zp->z_id, off, len, lr + 1) != 0) { kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; write_state = WR_NEED_COPY; } - } - itx->itx_wr_state = write_state; - lr->lr_foid = zp->z_id; - lr->lr_offset = off; - lr->lr_length = len; - lr->lr_blkoff = 0; - BP_ZERO(&lr->lr_blkptr); + itx->itx_wr_state = write_state; + if (write_state == WR_NEED_COPY) + itx->itx_sod += len; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = len; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); - itx->itx_private = zp->z_zfsvfs; + itx->itx_private = zp->z_zfsvfs; - itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; + if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) || + (ioflag & (FSYNC | FDSYNC))) + itx->itx_sync = B_TRUE; + else + itx->itx_sync = B_FALSE; + + zp->z_last_itx = zil_itx_assign(zilog, itx, tx); + + off += len; + resid -= len; + } } /* @@ -298,25 +562,60 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, */ void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, vattr_t *vap, uint_t mask_applied) + znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) { - itx_t *itx; - uint64_t seq; - lr_setattr_t *lr; + itx_t *itx; + uint64_t seq; + lr_setattr_t *lr; + xvattr_t *xvap = (xvattr_t *)vap; + size_t recsize = sizeof (lr_setattr_t); + void *start; + if (zilog == NULL || zp->z_unlinked) return; - itx = zil_itx_create(txtype, sizeof (*lr)); + /* + * If XVATTR set, then log record size needs to allow + * for lr_attr_t + xvattr mask, mapsize and create time + * plus actual attribute values + */ + if (vap->va_mask & AT_XVATTR) + recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize); + + if (fuidp) + recsize += fuidp->z_domain_str_sz; + + itx = zil_itx_create(txtype, recsize); lr = (lr_setattr_t *)&itx->itx_lr; lr->lr_foid = zp->z_id; lr->lr_mask = (uint64_t)mask_applied; lr->lr_mode = (uint64_t)vap->va_mode; - lr->lr_uid = (uint64_t)vap->va_uid; - lr->lr_gid = (uint64_t)vap->va_gid; + if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid)) + lr->lr_uid = fuidp->z_fuid_owner; + else + lr->lr_uid = (uint64_t)vap->va_uid; + + if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid)) + lr->lr_gid = fuidp->z_fuid_group; + else + lr->lr_gid = (uint64_t)vap->va_gid; + lr->lr_size = (uint64_t)vap->va_size; ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); + start = (lr_setattr_t *)(lr + 1); + if (vap->va_mask & AT_XVATTR) { + zfs_log_xvattr((lr_attr_t *)start, xvap); + start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize); + } + + /* + * Now stick on domain information if any on end + */ + + if (fuidp) + (void) zfs_log_fuid_domains(fuidp, start); itx->itx_sync = (zp->z_sync_cnt != 0); seq = zil_itx_assign(zilog, itx, tx); @@ -327,21 +626,64 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, * zfs_log_acl() handles TX_ACL transactions. 
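/*
 * zfs_log_acl() below emits one of two record formats: pre-FUID
 * filesystems get the compact TX_ACL_V0 layout, and newer ones get
 * TX_ACL, whose ACL bytes are ACE-aligned and may be followed by FUID
 * ids and domain strings. A restatement of its sizing logic:
 */
static size_t
acl_record_size(int txtype, size_t aclbytes, zfs_fuid_info_t *fuidp)
{
	size_t sz = (txtype == TX_ACL) ?
	    sizeof (lr_acl_t) : sizeof (lr_acl_v0_t);

	sz += (txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes;
	if (fuidp != NULL) {
		sz += fuidp->z_domain_str_sz;		     /* domain strings */
		sz += fuidp->z_fuid_cnt * sizeof (uint64_t); /* FUID ids */
	}
	return (sz);
}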
*/ void -zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, int aclcnt, ace_t *z_ace) +zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, + vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) { itx_t *itx; uint64_t seq; + lr_acl_v0_t *lrv0; lr_acl_t *lr; + int txtype; + int lrsize; + size_t txsize; + size_t aclbytes = vsecp->vsa_aclentsz; if (zilog == NULL || zp->z_unlinked) return; - itx = zil_itx_create(txtype, sizeof (*lr) + aclcnt * sizeof (ace_t)); + txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ? + TX_ACL_V0 : TX_ACL; + + if (txtype == TX_ACL) + lrsize = sizeof (*lr); + else + lrsize = sizeof (*lrv0); + + txsize = lrsize + + ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) + + (fuidp ? fuidp->z_domain_str_sz : 0) + + sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0); + + itx = zil_itx_create(txtype, txsize); + lr = (lr_acl_t *)&itx->itx_lr; lr->lr_foid = zp->z_id; - lr->lr_aclcnt = (uint64_t)aclcnt; - bcopy(z_ace, (ace_t *)(lr + 1), aclcnt * sizeof (ace_t)); + if (txtype == TX_ACL) { + lr->lr_acl_bytes = aclbytes; + lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; + lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) + lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; + else + lr->lr_acl_flags = 0; + } + lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt; + + if (txtype == TX_ACL_V0) { + lrv0 = (lr_acl_v0_t *)lr; + bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes); + } else { + void *start = (ace_t *)(lr + 1); + + bcopy(vsecp->vsa_aclentp, start, aclbytes); + + start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes); + + if (fuidp) { + start = zfs_log_fuid_ids(fuidp, start); + (void) zfs_log_fuid_domains(fuidp, start); + } + } itx->itx_sync = (zp->z_sync_cnt != 0); seq = zil_itx_assign(zilog, itx, tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c index eb3215d79e62..573a82c98e19 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,6 +38,7 @@ #include <sys/zfs_znode.h> #include <sys/zfs_dir.h> #include <sys/zfs_acl.h> +#include <sys/zfs_fuid.h> #include <sys/spa.h> #include <sys/zil.h> #include <sys/byteorder.h> @@ -61,8 +62,8 @@ zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, vap->va_mask = (uint_t)mask; vap->va_type = IFTOVT(mode); vap->va_mode = mode & MODEMASK; - vap->va_uid = (uid_t)uid; - vap->va_gid = (gid_t)gid; + vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; + vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? 
-1 : gid; vap->va_rdev = zfs_cmpldev(rdev); vap->va_nodeid = nodeid; } @@ -74,24 +75,365 @@ zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap) return (ENOTSUP); } +static void +zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) +{ + xoptattr_t *xoap = NULL; + uint64_t *attrs; + uint64_t *crtime; + uint32_t *bitmap; + void *scanstamp; + int i; + + xvap->xva_vattr.va_mask |= AT_XVATTR; + if ((xoap = xva_getxoptattr(xvap)) == NULL) { + xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */ + return; + } + + ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize); + + bitmap = &lrattr->lr_attr_bitmap; + for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++) + xvap->xva_reqattrmap[i] = *bitmap; + + attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1); + crtime = attrs + 1; + scanstamp = (caddr_t)(crtime + 2); + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) + xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0); + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) + xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0); + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) + xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) + xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0); + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) + xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) + xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0); + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) + xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0); + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) + xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0); + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) + xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) + xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0); + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) + xoap->xoa_av_quarantined = + ((*attrs & XAT0_AV_QUARANTINED) != 0); + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); +} + +static int +zfs_replay_domain_cnt(uint64_t uid, uint64_t gid) +{ + uint64_t uid_idx; + uint64_t gid_idx; + int domcnt = 0; + + uid_idx = FUID_INDEX(uid); + gid_idx = FUID_INDEX(gid); + if (uid_idx) + domcnt++; + if (gid_idx > 0 && gid_idx != uid_idx) + domcnt++; + + return (domcnt); +} + +static void * +zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start, + int domcnt) +{ + int i; + + for (i = 0; i != domcnt; i++) { + fuid_infop->z_domain_table[i] = start; + start = (caddr_t)start + strlen(start) + 1; + } + + return (start); +} + +/* + * Set the uid/gid in the fuid_info structure. 
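zfs_replay_fuid_domain_common() above relies on the domain strings being packed back-to-back and NUL-terminated inside the log record; each table slot simply points into the record buffer and the cursor hops past each terminator. A self-contained sketch of that walk (the buffer contents are hypothetical):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* Two packed, NUL-terminated domain strings, as in a log record. */
	char buf[] = "EXAMPLE\0BUILTIN";
	const char *table[2];
	const char *start = buf;
	int i;

	/* The same walk zfs_replay_fuid_domain_common() performs. */
	for (i = 0; i != 2; i++) {
		table[i] = start;
		start += strlen(start) + 1;	/* hop over the NUL */
	}

	for (i = 0; i != 2; i++)
		printf("z_domain_table[%d] = %s\n", i, table[i]);
	return (0);
}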
+ */ +static void +zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid) +{ + /* + * If owner or group are log specific FUIDs then slurp up + * domain information and build zfs_fuid_info_t + */ + if (IS_EPHEMERAL(uid)) + fuid_infop->z_fuid_owner = uid; + + if (IS_EPHEMERAL(gid)) + fuid_infop->z_fuid_group = gid; +} + +/* + * Load fuid domains into fuid_info_t + */ +static zfs_fuid_info_t * +zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid) +{ + int domcnt; + + zfs_fuid_info_t *fuid_infop; + + fuid_infop = zfs_fuid_info_alloc(); + + domcnt = zfs_replay_domain_cnt(uid, gid); + + if (domcnt == 0) + return (fuid_infop); + + fuid_infop->z_domain_table = + kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); + + zfs_replay_fuid_ugid(fuid_infop, uid, gid); + + fuid_infop->z_domain_cnt = domcnt; + *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt); + return (fuid_infop); +} + +/* + * Load zfs_fuid_t's and fuid domains into fuid_info_t + */ +static zfs_fuid_info_t * +zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid, + uint64_t gid) +{ + uint64_t *log_fuid = (uint64_t *)start; + zfs_fuid_info_t *fuid_infop; + int i; + + fuid_infop = zfs_fuid_info_alloc(); + fuid_infop->z_domain_cnt = domcnt; + + fuid_infop->z_domain_table = + kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); + + for (i = 0; i != idcnt; i++) { + zfs_fuid_t *zfuid; + + zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); + zfuid->z_logfuid = *log_fuid; + zfuid->z_id = -1; + zfuid->z_domidx = 0; + list_insert_tail(&fuid_infop->z_fuids, zfuid); + log_fuid++; + } + + zfs_replay_fuid_ugid(fuid_infop, uid, gid); + + *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt); + return (fuid_infop); +} + +static void +zfs_replay_swap_attrs(lr_attr_t *lrattr) +{ + /* swap the lr_attr structure */ + byteswap_uint32_array(lrattr, sizeof (*lrattr)); + /* swap the bitmap */ + byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) * + sizeof (uint32_t)); + /* swap the attributes, create time + 64 bit word for attributes */ + byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) * + (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t)); +} + +/* + * Replay file create with optional ACL, xvattr information as well + * as optional FUID information.
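The FUID_INDEX() test in zfs_replay_domain_cnt() above assumes the packed FUID layout: domain-table index in the upper 32 bits, Windows RID in the lower 32, with index 0 meaning a plain POSIX id. A stand-alone sketch of that encoding; the macros are redefined locally so the example compiles on its own, while the real definitions are expected to live in sys/zfs_fuid.h:

#include <stdint.h>
#include <stdio.h>

/*
 * Local stand-ins for FUID_INDEX/FUID_RID/FUID_ENCODE: domain-table
 * index in the upper 32 bits, RID in the lower 32.
 */
#define	EX_FUID_INDEX(f)	((uint64_t)(f) >> 32)
#define	EX_FUID_RID(f)		((f) & 0xffffffffULL)
#define	EX_FUID_ENCODE(i, r)	(((uint64_t)(i) << 32) | (r))

int
main(void)
{
	uint64_t fuid = EX_FUID_ENCODE(1, 1105);	/* index 1, RID 1105 */

	/* Index 0 is a plain POSIX id: no domain table entry needed. */
	printf("index=%llu rid=%llu domain-entry=%s\n",
	    (unsigned long long)EX_FUID_INDEX(fuid),
	    (unsigned long long)EX_FUID_RID(fuid),
	    EX_FUID_INDEX(fuid) ? "yes" : "no");
	return (0);
}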
+ */ +static int +zfs_replay_create_acl(zfsvfs_t *zfsvfs, + lr_acl_create_t *lracl, boolean_t byteswap) +{ + char *name = NULL; /* location determined later */ + lr_create_t *lr = (lr_create_t *)lracl; + znode_t *dzp; + vnode_t *vp = NULL; + xvattr_t xva; + int vflg = 0; + vsecattr_t vsec = { 0 }; + lr_attr_t *lrattr; + void *aclstart; + void *fuidstart; + size_t xvatlen = 0; + uint64_t txtype; + int error; + + if (byteswap) { + byteswap_uint64_array(lracl, sizeof (*lracl)); + txtype = (int)lr->lr_common.lrc_txtype; + if (txtype == TX_CREATE_ACL_ATTR || + txtype == TX_MKDIR_ACL_ATTR) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + zfs_replay_swap_attrs(lrattr); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + } + + aclstart = (caddr_t)(lracl + 1) + xvatlen; + zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE); + /* swap fuids */ + if (lracl->lr_fuidcnt) { + byteswap_uint64_array((caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes), + lracl->lr_fuidcnt * sizeof (uint64_t)); + } + } + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); + + /* + * All forms of zfs create (create, mkdir, mkxattrdir, symlink) + * eventually end up in zfs_mknode(), which assigns the object's + * creation time and generation number. The generic VOP_CREATE() + * doesn't have either concept, so we smuggle the values inside + * the vattr's otherwise unused va_ctime and va_nblocks fields. + */ + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); + xva.xva_vattr.va_nblocks = lr->lr_gen; + + error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); + if (error != ENOENT) + goto bail; + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + switch ((int)lr->lr_common.lrc_txtype) { + case TX_CREATE_ACL: + aclstart = (caddr_t)(lracl + 1); + fuidstart = (caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + /*FALLTHROUGH*/ + case TX_CREATE_ACL_ATTR: + if (name == NULL) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + xva.xva_vattr.va_mask |= AT_XVATTR; + zfs_replay_xvattr(lrattr, &xva); + } + vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; + vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; + vsec.vsa_aclcnt = lracl->lr_aclcnt; + vsec.vsa_aclentsz = lracl->lr_acl_bytes; + vsec.vsa_aclflags = lracl->lr_acl_flags; + if (zfsvfs->z_fuid_replay == NULL) { + fuidstart = (caddr_t)(lracl + 1) + xvatlen + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + } + +#ifdef TODO + error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr, + 0, 0, &vp, kcred, vflg, NULL, &vsec); +#else + panic("%s:%u: unsupported condition", __func__, __LINE__); +#endif + break; + case TX_MKDIR_ACL: + aclstart = (caddr_t)(lracl + 1); + fuidstart = (caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + /*FALLTHROUGH*/ + case TX_MKDIR_ACL_ATTR: + if (name == NULL) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr(lrattr, &xva); + } + 
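A note on the (int)lr->lr_common.lrc_txtype casts used in the switch statements of this function and its callers: TX_CI rides in the high bit of the 64-bit transaction type, so narrowing strips the flag and leaves the bare record type for the case labels. A sketch of that encoding, assuming TX_CI is the top bit as its use with these casts suggests (the record-type value is hypothetical, and the narrowing behaves as truncation on the usual two's-complement ABIs):

#include <stdint.h>
#include <stdio.h>

#define	EX_TX_CI	((uint64_t)1 << 63)	/* case-insensitive flag */
#define	EX_TX_CREATE	5			/* hypothetical record type */

int
main(void)
{
	uint64_t txtype = EX_TX_CREATE | EX_TX_CI;
	int vflg = 0;

	if (txtype & EX_TX_CI)
		vflg |= 0x1;		/* stand-in for FIGNORECASE */

	/* Narrowing to int drops the flag, leaving the bare type. */
	switch ((int)txtype) {
	case EX_TX_CREATE:
		printf("create record, vflg=%#x\n", vflg);
		break;
	default:
		printf("unknown record\n");
	}
	return (0);
}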
vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; + vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; + vsec.vsa_aclcnt = lracl->lr_aclcnt; + vsec.vsa_aclentsz = lracl->lr_acl_bytes; + vsec.vsa_aclflags = lracl->lr_acl_flags; + if (zfsvfs->z_fuid_replay == NULL) { + fuidstart = (caddr_t)(lracl + 1) + xvatlen + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + } +#ifdef TODO + error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr, + &vp, kcred, NULL, vflg, &vsec); +#else + panic("%s:%u: unsupported condition", __func__, __LINE__); +#endif + break; + default: + error = ENOTSUP; + } + +bail: + if (error == 0 && vp != NULL) + VN_RELE(vp); + + VN_RELE(ZTOV(dzp)); + + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; + + return (error); +} + static int zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) { - char *name = (char *)(lr + 1); /* name follows lr_create_t */ + char *name = NULL; /* location determined later */ char *link; /* symlink content follows name */ znode_t *dzp; vnode_t *vp = NULL; - vattr_t va; + xvattr_t xva; + int vflg = 0; + size_t lrsize = sizeof (lr_create_t); + lr_attr_t *lrattr; + void *start; + size_t xvatlen; + uint64_t txtype; struct componentname cn; int error; - if (byteswap) + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); + txtype = (int)lr->lr_common.lrc_txtype; + if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) + zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); + } + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); - zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID, + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); /* @@ -101,34 +443,89 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) * doesn't have either concept, so we smuggle the values inside * the vattr's otherwise unused va_ctime and va_nblocks fields. */ - ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime); - va.va_nblocks = lr->lr_gen; + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); + xva.xva_vattr.va_nblocks = lr->lr_gen; + + error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); + if (error != ENOENT) + goto out; + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + /* + * Symlinks don't have fuid info, and CIFS never creates + * symlinks. + * + * The _ATTR versions will grab the fuid info in their subcases. 
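Both the TX_CREATE_ACL and TX_MKDIR_ACL paths above locate their trailing sections by the same arithmetic: the optional xvattr section first, then the ACL bytes rounded up by ZIL_ACE_LENGTH(), then the FUID array, then the domain strings and name. A sketch of that offset computation, assuming ZIL_ACE_LENGTH() rounds up to an 8-byte boundary (all sizes hypothetical):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed behavior of ZIL_ACE_LENGTH: round up to an 8-byte boundary. */
#define	EX_ACE_LENGTH(x)	(((x) + 7) & ~(size_t)7)

int
main(void)
{
	size_t hdr = 192;	/* hypothetical sizeof (lr_acl_create_t) */
	size_t xvatlen = 40;	/* optional xvattr section, 0 if absent */
	size_t aclbytes = 52;	/* raw ACE payload, not yet aligned */
	size_t fuidcnt = 2;	/* trailing 64-bit log FUIDs */

	size_t acl_off = hdr + xvatlen;
	size_t fuid_off = acl_off + EX_ACE_LENGTH(aclbytes);
	size_t dom_off = fuid_off + fuidcnt * sizeof (uint64_t);

	printf("aces@%zu fuids@%zu domains/name@%zu\n",
	    acl_off, fuid_off, dom_off);
	return (0);
}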
+ */ + if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && + (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && + (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { + start = (lr + 1); + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + } - cn.cn_nameptr = name; cn.cn_cred = kcred; cn.cn_thread = curthread; cn.cn_flags = SAVENAME; vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); switch ((int)lr->lr_common.lrc_txtype) { + case TX_CREATE_ATTR: + lrattr = (lr_attr_t *)(caddr_t)(lr + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); + start = (caddr_t)(lr + 1) + xvatlen; + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + name = (char *)start; + + /*FALLTHROUGH*/ case TX_CREATE: - error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &va); + if (name == NULL) + name = (char *)start; + + cn.cn_nameptr = name; + error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/); break; + case TX_MKDIR_ATTR: + lrattr = (lr_attr_t *)(caddr_t)(lr + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); + start = (caddr_t)(lr + 1) + xvatlen; + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + name = (char *)start; + + /*FALLTHROUGH*/ case TX_MKDIR: - error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &va); + if (name == NULL) + name = (char *)(lr + 1); + + cn.cn_nameptr = name; + error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/); break; case TX_MKXATTR: - error = zfs_make_xattrdir(dzp, &va, &vp, kcred); + name = (char *)(lr + 1); + error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred); break; case TX_SYMLINK: + name = (char *)(lr + 1); link = name + strlen(name) + 1; - error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &va, link); + cn.cn_nameptr = name; + error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &xva.xva_vattr, link /*,vflg*/); break; default: error = ENOTSUP; } VOP_UNLOCK(ZTOV(dzp), 0); +out: if (error == 0 && vp != NULL) { VOP_UNLOCK(vp, 0); VN_RELE(vp); @@ -136,6 +533,9 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) VN_RELE(ZTOV(dzp)); + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; return (error); } @@ -147,6 +547,7 @@ zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) struct componentname cn; vnode_t *vp; int error; + int vflg = 0; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -154,7 +555,8 @@ zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); - bzero(&cn, sizeof(cn)); + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; cn.cn_nameptr = name; cn.cn_namelen = strlen(name); cn.cn_nameiop = DELETE; @@ -171,10 +573,10 @@ zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) switch ((int)lr->lr_common.lrc_txtype) { case TX_REMOVE: - error = VOP_REMOVE(ZTOV(dzp), vp, &cn); + error = VOP_REMOVE(ZTOV(dzp), vp, &cn /*,vflg*/); break; case TX_RMDIR: - error = VOP_RMDIR(ZTOV(dzp), vp, &cn); + error = VOP_RMDIR(ZTOV(dzp), vp, &cn /*,vflg*/); break; default: error = ENOTSUP; @@ -194,6 +596,7 @@ zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) znode_t *dzp, *zp; struct componentname cn; int error; + int vflg = 0; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -206,6 +609,8 @@ 
zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) return (error); } + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; cn.cn_nameptr = name; cn.cn_cred = kcred; cn.cn_thread = curthread; @@ -213,7 +618,7 @@ zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); - error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn); + error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn /*,vflg*/); VOP_UNLOCK(ZTOV(zp), 0); VOP_UNLOCK(ZTOV(dzp), 0); @@ -233,6 +638,7 @@ zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) vnode_t *svp, *tvp; kthread_t *td = curthread; int error; + int vflg = 0; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -245,9 +651,10 @@ zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) return (error); } + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; svp = tvp = NULL; - bzero(&scn, sizeof(scn)); scn.cn_nameptr = sname; scn.cn_namelen = strlen(sname); scn.cn_nameiop = DELETE; @@ -262,7 +669,6 @@ zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) goto fail; VOP_UNLOCK(svp, 0); - bzero(&tcn, sizeof(tcn)); tcn.cn_nameptr = tname; tcn.cn_namelen = strlen(tname); tcn.cn_nameiop = RENAME; @@ -279,7 +685,7 @@ zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) goto fail; } - error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn); + error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn /*,vflg*/); return (error); fail: if (svp != NULL) @@ -334,13 +740,21 @@ static int zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) { znode_t *zp; - vattr_t va; + xvattr_t xva; + vattr_t *vap = &xva.xva_vattr; vnode_t *vp; int error; + void *start; - if (byteswap) + xva_init(&xva); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); + if ((lr->lr_mask & AT_XVATTR) && + zfsvfs->z_version >= ZPL_VERSION_INITIAL) + zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); + } + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { /* * As we can log setattrs out of order, it's possible the @@ -352,35 +766,112 @@ zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) return (error); } - zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode, + zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); - va.va_size = lr->lr_size; - ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime); - ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime); + vap->va_size = lr->lr_size; + ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime); + ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime); + + /* + * Fill in xvattr_t portions if necessary. 
+ */ + + start = (lr_setattr_t *)(lr + 1); + if (vap->va_mask & AT_XVATTR) { + zfs_replay_xvattr((lr_attr_t *)start, &xva); + start = (caddr_t)start + + ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize); + } else + xva.xva_vattr.va_mask &= ~AT_XVATTR; + + zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); vp = ZTOV(zp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - error = VOP_SETATTR(vp, &va, kcred); + error = VOP_SETATTR(vp, vap, kcred); VOP_UNLOCK(vp, 0); + + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; VN_RELE(vp); return (error); } static int -zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) +zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap) { ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ -#ifdef TODO vsecattr_t vsa; + znode_t *zp; + int error; + + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + zfs_oldace_byteswap(ace, lr->lr_aclcnt); + } + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log acls out of order, it's possible the + * file has been removed. In this case just drop the acl + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + bzero(&vsa, sizeof (vsa)); + vsa.vsa_mask = VSA_ACE | VSA_ACECNT; + vsa.vsa_aclcnt = lr->lr_aclcnt; + vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt; + vsa.vsa_aclflags = 0; + vsa.vsa_aclentp = ace; + +#ifdef TODO + error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL); +#else + panic("%s:%u: unsupported condition", __func__, __LINE__); #endif + + VN_RELE(ZTOV(zp)); + + return (error); +} + +/* + * Replaying ACLs is complicated by FUID support. + * The log record may contain some optional data + * to be used for replaying FUIDs. These pieces + * are the actual FUIDs that were created initially. + * The FUID table index may no longer be valid and + * during zfs_create() a new index may be assigned. + * Because of this the log will contain the original + * domain+rid in order to create a new FUID. + * + * The individual ACEs may contain an ephemeral uid/gid which is no + * longer valid and will need to be replaced with an actual FUID.
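Several of the replay handlers here share one more convention worth making explicit: because records can describe objects that were later removed, a failed zfs_zget() with ENOENT is folded into success rather than aborting replay. A compact stand-alone sketch of that error-folding shape (the lookup helper is hypothetical):

#include <errno.h>
#include <stdio.h>

/* Hypothetical lookup: even object ids exist, odd ones were removed. */
static int
ex_zget(unsigned long id)
{
	return ((id & 1) ? ENOENT : 0);
}

static int
ex_replay_one(unsigned long id)
{
	int error;

	if ((error = ex_zget(id)) != 0) {
		/* The record may postdate the file's removal. */
		if (error == ENOENT)
			error = 0;
		return (error);
	}
	/* ... apply the logged change to the object here ... */
	return (0);
}

int
main(void)
{
	printf("present: %d removed: %d\n",
	    ex_replay_one(2), ex_replay_one(3));
	return (0);
}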
+ * + */ +static int +zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) +{ + ace_t *ace = (ace_t *)(lr + 1); + vsecattr_t vsa; znode_t *zp; int error; if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); - zfs_ace_byteswap(ace, lr->lr_aclcnt); + zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); + if (lr->lr_fuidcnt) { + byteswap_uint64_array((caddr_t)ace + + ZIL_ACE_LENGTH(lr->lr_acl_bytes), + lr->lr_fuidcnt * sizeof (uint64_t)); + } } if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { @@ -396,15 +887,30 @@ zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) #ifdef TODO bzero(&vsa, sizeof (vsa)); - vsa.vsa_mask = VSA_ACE | VSA_ACECNT; + vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; vsa.vsa_aclcnt = lr->lr_aclcnt; vsa.vsa_aclentp = ace; + vsa.vsa_aclentsz = lr->lr_acl_bytes; + vsa.vsa_aclflags = lr->lr_acl_flags; + + if (lr->lr_fuidcnt) { + void *fuidstart = (caddr_t)ace + + ZIL_ACE_LENGTH(lr->lr_acl_bytes); + + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, &fuidstart, + lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); + } + + error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL); - error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred); + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); #else error = EOPNOTSUPP; #endif + zfsvfs->z_fuid_replay = NULL; VN_RELE(ZTOV(zp)); return (error); @@ -426,5 +932,12 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_write, /* TX_WRITE */ zfs_replay_truncate, /* TX_TRUNCATE */ zfs_replay_setattr, /* TX_SETATTR */ + zfs_replay_acl_v0, /* TX_ACL_V0 */ zfs_replay_acl, /* TX_ACL */ + zfs_replay_create_acl, /* TX_CREATE_ACL */ + zfs_replay_create, /* TX_CREATE_ATTR */ + zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */ + zfs_replay_create_acl, /* TX_MKDIR_ACL */ + zfs_replay_create, /* TX_MKDIR_ATTR */ + zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c index 07ec0f6b6e90..f0a75b5fa0d7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c @@ -472,10 +472,14 @@ zfs_range_unlock_reader(znode_t *zp, rl_t *remove) */ if (remove->r_cnt == 1) { avl_remove(tree, remove); - if (remove->r_write_wanted) + if (remove->r_write_wanted) { cv_broadcast(&remove->r_wr_cv); - if (remove->r_read_wanted) + cv_destroy(&remove->r_wr_cv); + } + if (remove->r_read_wanted) { cv_broadcast(&remove->r_rd_cv); + cv_destroy(&remove->r_rd_cv); + } } else { ASSERT3U(remove->r_cnt, ==, 0); ASSERT3U(remove->r_write_wanted, ==, 0); @@ -501,10 +505,14 @@ zfs_range_unlock_reader(znode_t *zp, rl_t *remove) rl->r_cnt--; if (rl->r_cnt == 0) { avl_remove(tree, rl); - if (rl->r_write_wanted) + if (rl->r_write_wanted) { cv_broadcast(&rl->r_wr_cv); - if (rl->r_read_wanted) + cv_destroy(&rl->r_wr_cv); + } + if (rl->r_read_wanted) { cv_broadcast(&rl->r_rd_cv); + cv_destroy(&rl->r_rd_cv); + } kmem_free(rl, sizeof (rl_t)); } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 28f3293ec435..5becdb46a9f1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/systm.h> @@ -44,6 +42,7 @@ #include <sys/dmu.h> #include <sys/dsl_prop.h> #include <sys/dsl_dataset.h> +#include <sys/dsl_deleg.h> #include <sys/spa.h> #include <sys/zap.h> #include <sys/varargs.h> @@ -51,17 +50,47 @@ #include <sys/atomic.h> #include <sys/zfs_ioctl.h> #include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> #include <sys/sunddi.h> #include <sys/dnlc.h> +#include <sys/dmu_objset.h> +#include <sys/spa_boot.h> +#include <sys/vdev_impl.h> /* VDEV_BOOT_VERSION */ struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); + SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); + +int zfs_super_owner = 0; +SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, + "File system owner can perform privileged operation on his file systems"); + int zfs_debug_level = 0; TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level); SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0, "Debug level"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); +static int zfs_version_acl = ZFS_ACL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, + "ZFS_ACL_VERSION"); +static int zfs_version_dmu_backup_header = DMU_BACKUP_HEADER_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_header, CTLFLAG_RD, + &zfs_version_dmu_backup_header, 0, "DMU_BACKUP_HEADER_VERSION"); +static int zfs_version_dmu_backup_stream = DMU_BACKUP_STREAM_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_stream, CTLFLAG_RD, + &zfs_version_dmu_backup_stream, 0, "DMU_BACKUP_STREAM_VERSION"); +static int zfs_version_spa = SPA_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, + "SPA_VERSION"); +static int zfs_version_vdev_boot = VDEV_BOOT_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, vdev_boot, CTLFLAG_RD, + &zfs_version_vdev_boot, 0, "VDEV_BOOT_VERSION"); +static int zfs_version_zpl = ZPL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, + "ZPL_VERSION"); + static int zfs_mount(vfs_t *vfsp, kthread_t *td); static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td); static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td); @@ -82,7 +111,7 @@ static struct vfsops zfs_vfsops = { .vfs_fhtovp = zfs_fhtovp, }; -VFS_SET(zfs_vfsops, zfs, VFCF_JAIL); +VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); /* * We need to keep a count of active fs's. @@ -235,6 +264,27 @@ exec_changed_cb(void *arg, uint64_t newval) } } +/* + * The nbmand mount option can be changed at mount time. 
+ * We can't allow it to be toggled on live file systems or incorrect + * behavior may be seen from cifs clients + * + * This property isn't registered via dsl_prop_register(), but this callback + * will be called when a file system is first mounted + */ +static void +nbmand_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == FALSE) { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); + } else { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); + } +} + static void snapdir_changed_cb(void *arg, uint64_t newval) { @@ -244,64 +294,27 @@ snapdir_changed_cb(void *arg, uint64_t newval) } static void -acl_mode_changed_cb(void *arg, uint64_t newval) +vscan_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; - zfsvfs->z_acl_mode = newval; + zfsvfs->z_vscan = newval; } static void -acl_inherit_changed_cb(void *arg, uint64_t newval) +acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; - zfsvfs->z_acl_inherit = newval; + zfsvfs->z_acl_mode = newval; } -static int -zfs_refresh_properties(vfs_t *vfsp) +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) { - zfsvfs_t *zfsvfs = vfsp->vfs_data; - - /* - * Remount operations default to "rw" unless "ro" is explicitly - * specified. - */ - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { - readonly_changed_cb(zfsvfs, B_TRUE); - } else { - if (!dmu_objset_is_snapshot(zfsvfs->z_os)) - readonly_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) - return (EROFS); - } - - if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { - setuid_changed_cb(zfsvfs, B_FALSE); - } else { - if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) - setuid_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) - setuid_changed_cb(zfsvfs, B_TRUE); - } - - if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) - exec_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) - exec_changed_cb(zfsvfs, B_TRUE); - - if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) - atime_changed_cb(zfsvfs, B_TRUE); - else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) - atime_changed_cb(zfsvfs, B_FALSE); - - if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) - xattr_changed_cb(zfsvfs, B_TRUE); - else if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) - xattr_changed_cb(zfsvfs, B_FALSE); + zfsvfs_t *zfsvfs = arg; - return (0); + zfsvfs->z_acl_inherit = newval; } static int @@ -310,10 +323,12 @@ zfs_register_callbacks(vfs_t *vfsp) struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; + uint64_t nbmand; int readonly, do_readonly = FALSE; int setuid, do_setuid = FALSE; int exec, do_exec = FALSE; int xattr, do_xattr = FALSE; + int atime, do_atime = FALSE; int error = 0; ASSERT(vfsp); @@ -360,6 +375,34 @@ zfs_register_callbacks(vfs_t *vfsp) xattr = B_TRUE; do_xattr = B_TRUE; } + if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { + atime = B_FALSE; + do_atime = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { + atime = B_TRUE; + do_atime = B_TRUE; + } + + /* + * nbmand is a special property. It can only be changed at + * mount time. + * + * This is weird, but it is documented to only be changeable + * at mount time. 
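The nbmand handling above follows the general precedence rule of zfs_register_callbacks(): an explicit mount option always wins, and the stored dataset property is consulted only when neither the positive nor the negative option was given. A minimal sketch of that resolution order (helper names hypothetical; dsl_prop_get_integer() is the real fallback used above):

#include <stdbool.h>
#include <stdio.h>

/*
 * Explicit "on"/"off" mount options take precedence; only when neither
 * was given is the stored dataset property used.
 */
static bool
ex_resolve(bool opt_on, bool opt_off, bool dataset_prop)
{
	if (opt_off)
		return (false);
	if (opt_on)
		return (true);
	return (dataset_prop);
}

int
main(void)
{
	/* Property says on, but the user mounted with the "no" option. */
	printf("nbmand=%d\n", ex_resolve(false, true, true));
	/* No option either way: the property value stands. */
	printf("nbmand=%d\n", ex_resolve(false, false, true));
	return (0);
}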
+ */ + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { + nbmand = B_FALSE; + } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { + nbmand = B_TRUE; + } else { + char osname[MAXNAMELEN]; + + dmu_objset_name(os, osname); + if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, + NULL)) { + return (error); + } + } /* * Register property callbacks. @@ -386,6 +429,8 @@ zfs_register_callbacks(vfs_t *vfsp) "aclmode", acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "vscan", vscan_changed_cb, zfsvfs); if (error) goto unregister; @@ -400,6 +445,10 @@ zfs_register_callbacks(vfs_t *vfsp) exec_changed_cb(zfsvfs, exec); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); + if (do_atime) + atime_changed_cb(zfsvfs, atime); + + nbmand_changed_cb(zfsvfs, nbmand); return (0); @@ -419,14 +468,73 @@ unregister: (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); return (error); } static int -zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td) +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + int error; + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + /* + * Set the objset user_ptr to track its zfsvfs. + */ + mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. + */ + if (mounting) { + boolean_t readonly; + + /* + * During replay we remove the read only flag to + * allow replays to succeed. + */ + readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + + /* + * Parse and replay the intent log. 
+ */ + zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, + zfs_replay_vector, zfs_unlinked_drain); + + zfs_unlinked_drain(zfsvfs); + zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ + } + + if (!zil_disable) + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + return (0); +} + +static void +zfs_freezfsvfs(zfsvfs_t *zfsvfs) +{ + mutex_destroy(&zfsvfs->z_znodes_lock); + mutex_destroy(&zfsvfs->z_online_recv_lock); + list_destroy(&zfsvfs->z_all_znodes); + rrw_destroy(&zfsvfs->z_teardown_lock); + rw_destroy(&zfsvfs->z_teardown_inactive_lock); + rw_destroy(&zfsvfs->z_fuid_lock); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} + +static int +zfs_domount(vfs_t *vfsp, char *osname) { - cred_t *cr = td->td_ucred; uint64_t recordsize, readonly; int error = 0; int mode; @@ -449,9 +557,12 @@ zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td) zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); - rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL); + rrw_init(&zfsvfs->z_teardown_lock); + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL)) @@ -466,14 +577,13 @@ zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td) if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) goto out; + mode = DS_MODE_OWNER; if (readonly) - mode = DS_MODE_PRIMARY | DS_MODE_READONLY; - else - mode = DS_MODE_PRIMARY; + mode |= DS_MODE_READONLY; error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); if (error == EROFS) { - mode = DS_MODE_PRIMARY | DS_MODE_READONLY; + mode = DS_MODE_OWNER | DS_MODE_READONLY; error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); } @@ -481,34 +591,40 @@ zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td) if (error) goto out; - if (error = zfs_init_fs(zfsvfs, &zp, cr)) + if (error = zfs_init_fs(zfsvfs, &zp)) goto out; + /* + * Set features for file system. + */ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_use_fuids) { + vfs_set_feature(vfsp, VFSFT_XVATTR); + vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS); + vfs_set_feature(vfsp, VFSFT_ACLONCREATE); + } + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); + } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + } + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { - uint64_t xattr; + uint64_t pval; ASSERT(mode & DS_MODE_READONLY); atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); - if (error = dsl_prop_get_integer(osname, "xattr", &xattr, NULL)) + if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) goto out; - xattr_changed_cb(zfsvfs, xattr); + xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; } else { - error = zfs_register_callbacks(vfsp); - if (error) - goto out; - - zfs_unlinked_drain(zfsvfs); - - /* - * Parse and replay the intent log. 
- */ - zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, - zfs_replay_vector); - - if (!zil_disable) - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + error = zfsvfs_setup(zfsvfs, B_TRUE); } vfs_mountedfrom(vfsp, osname); @@ -519,15 +635,12 @@ out: if (error) { if (zfsvfs->z_os) dmu_objset_close(zfsvfs->z_os); - rw_destroy(&zfsvfs->z_um_lock); - mutex_destroy(&zfsvfs->z_znodes_lock); - kmem_free(zfsvfs, sizeof (zfsvfs_t)); + zfs_freezfsvfs(zfsvfs); } else { atomic_add_32(&zfs_active_fs_count, 1); } return (error); - } void @@ -567,6 +680,9 @@ zfs_unregister_callbacks(zfsvfs_t *zfsvfs) VERIFY(dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs) == 0); + + VERIFY(dsl_prop_unregister(ds, "vscan", + vscan_changed_cb, zfsvfs) == 0); } } @@ -574,22 +690,94 @@ static int zfs_mount(vfs_t *vfsp, kthread_t *td) { - char *from; - int error; + vnode_t *mvp = vfsp->mnt_vnodecovered; + cred_t *cr = td->td_ucred; + char *osname; + int error = 0; + int canwrite; + + if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) + return (EINVAL); + + /* + * If full-owner-access is enabled and delegated administration is + * turned on, we must set nosuid. + */ + if (zfs_super_owner && + dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { + secpolicy_fs_mount_clearopts(cr, vfsp); + } + + /* + * Check for mount privilege? + * + * If we don't have privilege then see if + * we have local permission to allow it + */ + error = secpolicy_fs_mount(cr, mvp, vfsp); + if (error) { + error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr); + if (error == 0) { + vattr_t vattr; + + /* + * Make sure user is the owner of the mount point + * or has sufficient privileges. + */ + + vattr.va_mask = AT_UID; + + if (error = VOP_GETATTR(mvp, &vattr, cr)) { + goto out; + } + +#if 0 /* CHECK THIS! Is probably needed for zfs_suser. */ + if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && + VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { + error = EPERM; + goto out; + } +#else + if (error = secpolicy_vnode_owner(mvp, cr, vattr.va_uid)) { + goto out; + } + + if (error = VOP_ACCESS(mvp, VWRITE, cr, td)) { + goto out; + } +#endif + + secpolicy_fs_mount_clearopts(cr, vfsp); + } else { + goto out; + } + } + + /* + * Refuse to mount a filesystem if we are in a local zone and the + * dataset is not visible. + */ + if (!INGLOBALZONE(curthread) && + (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { + error = EPERM; + goto out; + } /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ - if (vfsp->vfs_flag & MS_REMOUNT) - return (zfs_refresh_properties(vfsp)); - - if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&from, NULL)) - return (EINVAL); + if (vfsp->vfs_flag & MS_REMOUNT) { + /* refresh mount options */ + zfs_unregister_callbacks(vfsp->vfs_data); + error = zfs_register_callbacks(vfsp); + goto out; + } DROP_GIANT(); - error = zfs_domount(vfsp, from, td); + error = zfs_domount(vfsp, osname); PICKUP_GIANT(); +out: return (error); } @@ -671,18 +859,131 @@ zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td) return (error); } +/* + * Tear down the zfsvfs::z_os. + * + * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' + * and 'z_teardown_inactive_lock' held.
+ */ +static int +zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) +{ + znode_t *zp; + + rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + + if (!unmounting) { + /* + * We purge the parent filesystem's vfsp as the parent + * filesystem and all of its snapshots have their vnode's + * v_vfsp set to the parent's filesystem's vfsp. Note, + * 'z_parent' is self referential for non-snapshots. + */ + (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); + } + + /* + * Close the zil. NB: Can't close the zil while zfs_inactive + * threads are blocked as zil_close can call zfs_inactive. + */ + if (zfsvfs->z_log) { + zil_close(zfsvfs->z_log); + zfsvfs->z_log = NULL; + } + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); + + /* + * If we are not unmounting (ie: online recv) and someone already + * unmounted this file system while we were doing the switcheroo, + * or a reopen of z_os failed then just bail out now. + */ + if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrw_exit(&zfsvfs->z_teardown_lock, FTAG); + return (EIO); + } + + /* + * At this point there are no vops active, and any new vops will + * fail with EIO since we have z_teardown_lock for writer (only + * relevant for forced unmount). + * + * Release all holds on dbufs. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; + zp = list_next(&zfsvfs->z_all_znodes, zp)) + if (zp->z_dbuf) { + ASSERT(ZTOV(zp)->v_count > 0); + zfs_znode_dmu_fini(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * If we are unmounting, set the unmounted flag and let new vops + * unblock. zfs_inactive will have the unmounted behavior, and all + * other vops will fail with EIO. + */ + if (unmounting) { + zfsvfs->z_unmounted = B_TRUE; + rrw_exit(&zfsvfs->z_teardown_lock, FTAG); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + } + + /* + * z_os will be NULL if there was an error in attempting to reopen + * zfsvfs, so just return as the properties had already been + * unregistered and cached data had been evicted before. + */ + if (zfsvfs->z_os == NULL) + return (0); + + /* + * Unregister properties. + */ + zfs_unregister_callbacks(zfsvfs); + + /* + * Evict cached data + */ + if (dmu_objset_evict_dbufs(zfsvfs->z_os)) { + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + (void) dmu_objset_evict_dbufs(zfsvfs->z_os); + } + + return (0); +} + /*ARGSUSED*/ static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td) { zfsvfs_t *zfsvfs = vfsp->vfs_data; + objset_t *os; cred_t *cr = td->td_ucred; int ret; - if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0) - return (ret); + if (fflag & MS_FORCE) { + /* TODO: Force unmount is not well implemented yet, so deny it. */ + ZFS_LOG(0, "Force unmount is not supported, removing FORCE flag."); + fflag &= ~MS_FORCE; + } - (void) dnlc_purge_vfsp(vfsp, 0); + ret = secpolicy_fs_unmount(cr, vfsp); + if (ret) { + ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), + ZFS_DELEG_PERM_MOUNT, cr); + if (ret) + return (ret); + } + /* + * We purge the parent filesystem's vfsp as the parent filesystem + * and all of its snapshots have their vnode's v_vfsp set to the + * parent's filesystem's vfsp. Note, 'z_parent' is self + * referential for non-snapshots.
+ */ + (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); /* * Unmount any snapshots mounted under .zfs before unmounting the @@ -714,33 +1015,63 @@ zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td) return (ret); } - if (fflag & MS_FORCE) { + if (!(fflag & MS_FORCE)) { + /* + * Check the number of active vnodes in the file system. + * Our count is maintained in the vfs structure, but the + * number is off by 1 to indicate a hold on the vfs + * structure itself. + * + * The '.zfs' directory maintains a reference of its + * own, and any active references underneath are + * reflected in the vnode count. + */ + if (zfsvfs->z_ctldir == NULL) { + if (vfsp->vfs_count > 1) + return (EBUSY); + } else { + if (vfsp->vfs_count > 2 || + zfsvfs->z_ctldir->v_count > 1) + return (EBUSY); + } + } else { MNT_ILOCK(vfsp); vfsp->mnt_kern_flag |= MNTK_UNMOUNTF; MNT_IUNLOCK(vfsp); - zfsvfs->z_unmounted1 = B_TRUE; + } + + VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); + os = zfsvfs->z_os; + + /* + * z_os will be NULL if there was an error in + * attempting to reopen zfsvfs. + */ + if (os != NULL) { + /* + * Unset the objset user_ptr. + */ + mutex_enter(&os->os->os_user_ptr_lock); + dmu_objset_set_user(os, NULL); + mutex_exit(&os->os->os_user_ptr_lock); /* - * Wait for all zfs threads to leave zfs. - * Grabbing a rwlock as reader in all vops and - * as writer here doesn't work because it too easy to get - * multiple reader enters as zfs can re-enter itself. - * This can lead to deadlock if there is an intervening - * rw_enter as writer. - * So a file system threads ref count (z_op_cnt) is used. - * A polling loop on z_op_cnt may seem inefficient, but - * - this saves all threads on exit from having to grab a - * mutex in order to cv_signal - * - only occurs on forced unmount in the rare case when - * there are outstanding threads within the file system. + * Finally release the objset */ - while (zfsvfs->z_op_cnt) { - delay(1); - } + dmu_objset_close(os); } - zfs_objset_close(zfsvfs); - VFS_RELE(vfsp); + /* + * We can now safely destroy the '.zfs' directory node. + */ + if (zfsvfs->z_ctldir != NULL) + zfsctl_destroy(zfsvfs); + if (zfsvfs->z_issnap) { + vnode_t *svp = vfsp->mnt_vnodecovered; + + ASSERT(svp->v_count == 2); + VN_RELE(svp); + } zfs_freevfs(vfsp); return (0); @@ -772,7 +1103,6 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) { - kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; uint64_t object = 0; @@ -824,7 +1154,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) ASSERT(*vpp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, - 0, NULL, NULL) == 0); + 0, NULL, NULL, NULL, NULL, NULL) == 0); } else { VN_HOLD(*vpp); } @@ -854,84 +1184,79 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) *vpp = ZTOV(zp); /* XXX: LK_RETRY? */ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - vnode_create_vobject(*vpp, zp->z_phys->zp_size, td); + vnode_create_vobject(*vpp, zp->z_phys->zp_size, curthread); ZFS_EXIT(zfsvfs); return (0); } -static void -zfs_objset_close(zfsvfs_t *zfsvfs) +/* + * Block out VOPs and close zfsvfs_t::z_os + * + * Note, if successful, then we return with the 'z_teardown_lock' and + * 'z_teardown_inactive_lock' write held. 
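The asymmetric locking described here is the whole point of the suspend/resume pair: zfs_suspend_fs() deliberately returns with the teardown locks write-held so that every vnode operation stays blocked until zfs_resume_fs() drops them. A loose user-space sketch of that discipline, with a plain POSIX rwlock standing in for the re-entrant rrw lock (compile with -lpthread; the rrw lock's re-entrancy and FTAG bookkeeping are not modeled):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t teardown = PTHREAD_RWLOCK_INITIALIZER;

static void
ex_vop(void)
{
	/* Every "VOP" takes the lock shared, as ZFS_ENTER() does. */
	pthread_rwlock_rdlock(&teardown);
	printf("vop ran\n");
	pthread_rwlock_unlock(&teardown);
}

static void
ex_suspend_fs(void)
{
	/* Returns with the lock held: callers stay blocked out. */
	pthread_rwlock_wrlock(&teardown);
	printf("suspended: objset closed, vops blocked\n");
}

static void
ex_resume_fs(void)
{
	/* Reopen the objset, re-establish znodes, then unblock. */
	printf("resumed: objset reopened\n");
	pthread_rwlock_unlock(&teardown);
}

int
main(void)
{
	ex_vop();
	ex_suspend_fs();	/* e.g. around an online recv */
	ex_resume_fs();
	ex_vop();
	return (0);
}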
+ */ +int +zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode) { - znode_t *zp, *nextzp; - objset_t *os = zfsvfs->z_os; + int error; - /* - * For forced unmount, at this point all vops except zfs_inactive - * are erroring EIO. We need to now suspend zfs_inactive threads - * while we are freeing dbufs before switching zfs_inactive - * to use behaviour without a objset. - */ - rw_enter(&zfsvfs->z_um_lock, RW_WRITER); + if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) + return (error); - /* - * Release all holds on dbufs - * Note, although we have stopped all other vop threads and - * zfs_inactive(), the dmu can callback via znode_pageout_func() - * which can zfs_znode_free() the znode. - * So we lock z_all_znodes; search the list for a held - * dbuf; drop the lock (we know zp can't disappear if we hold - * a dbuf lock; then regrab the lock and restart. - */ - mutex_enter(&zfsvfs->z_znodes_lock); - for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) { - nextzp = list_next(&zfsvfs->z_all_znodes, zp); - if (zp->z_dbuf_held) { - /* dbufs should only be held when force unmounting */ - zp->z_dbuf_held = 0; - mutex_exit(&zfsvfs->z_znodes_lock); - dmu_buf_rele(zp->z_dbuf, NULL); - /* Start again */ - mutex_enter(&zfsvfs->z_znodes_lock); - nextzp = list_head(&zfsvfs->z_all_znodes); - } - } - mutex_exit(&zfsvfs->z_znodes_lock); + *mode = zfsvfs->z_os->os_mode; + dmu_objset_name(zfsvfs->z_os, name); + dmu_objset_close(zfsvfs->z_os); - /* - * Unregister properties. - */ - if (!dmu_objset_is_snapshot(os)) - zfs_unregister_callbacks(zfsvfs); + return (0); +} - /* - * Switch zfs_inactive to behaviour without an objset. - * It just tosses cached pages and frees the znode & vnode. - * Then re-enable zfs_inactive threads in that new behaviour. - */ - zfsvfs->z_unmounted2 = B_TRUE; - rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */ +/* + * Reopen zfsvfs_t::z_os and release VOPs. + */ +int +zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) +{ + int err; - /* - * Close the zil. Can't close the zil while zfs_inactive - * threads are blocked as zil_close can call zfs_inactive. - */ - if (zfsvfs->z_log) { - zil_close(zfsvfs->z_log); - zfsvfs->z_log = NULL; - } + ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); + if (err) { + zfsvfs->z_os = NULL; + } else { + znode_t *zp; + + VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + + /* + * Attempt to re-establish all the active znodes with + * their dbufs. If a zfs_rezget() fails, then we'll let + * any potential callers discover that via ZFS_ENTER_VERIFY_VP + * when they try to use their znode. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + (void) zfs_rezget(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); - /* - * Evict all dbufs so that cached znodes will be freed - */ - if (dmu_objset_evict_dbufs(os, 1)) { - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - (void) dmu_objset_evict_dbufs(os, 0); } - /* - * Finally close the objset - */ - dmu_objset_close(os); + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrw_exit(&zfsvfs->z_teardown_lock, FTAG); + + if (err) { + /* + * Since we couldn't reopen zfsvfs::z_os, force + * unmount this file system. 
+ */ + if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) + (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); + } + return (err); } static void @@ -942,9 +1267,9 @@ zfs_freevfs(vfs_t *vfsp) for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); - rw_destroy(&zfsvfs->z_um_lock); - mutex_destroy(&zfsvfs->z_znodes_lock); - kmem_free(zfsvfs, sizeof (zfsvfs_t)); + + zfs_fuid_destroy(zfsvfs); + zfs_freezfsvfs(zfsvfs); atomic_add_32(&zfs_active_fs_count, -1); } @@ -957,7 +1282,7 @@ static void zfs_vnodes_adjust(void) { #ifdef __i386__ - int val; + int newdesiredvnodes; desiredvnodes_backup = desiredvnodes; @@ -966,10 +1291,11 @@ zfs_vnodes_adjust(void) * vntblinit(). If it is equal to desiredvnodes, it means that * it wasn't tuned by the administrator and we can tune it down. */ - val = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size / - (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); - if (desiredvnodes == val) - desiredvnodes = (3 * desiredvnodes) / 4; + newdesiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * + vm_kmem_size / (5 * (sizeof(struct vm_object) + + sizeof(struct vnode)))); + if (newdesiredvnodes == desiredvnodes) + desiredvnodes = (3 * newdesiredvnodes) / 4; #endif } @@ -986,20 +1312,20 @@ void zfs_init(void) { - printf("ZFS filesystem version " ZFS_VERSION_STRING "\n"); + printf("ZFS filesystem version " SPA_VERSION_STRING "\n"); /* - * Initialize .zfs directory structures + * Initialize znode cache, vnode ops, etc... */ - zfsctl_init(); + zfs_znode_init(); /* - * Initialize znode cache, vnode ops, etc... + * Initialize .zfs directory structures */ - zfs_znode_init(); + zfsctl_init(); /* * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ @@ -1019,3 +1345,95 @@ zfs_busy(void) { return (zfs_active_fs_count != 0); } + +int +zfs_set_version(const char *name, uint64_t newvers) +{ + int error; + objset_t *os; + dmu_tx_t *tx; + uint64_t curvers; + + /* + * XXX for now, require that the filesystem be unmounted. Would + * be nice to find the zfsvfs_t and just update that if + * possible. + */ + + if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) + return (EINVAL); + + error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os); + if (error) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &curvers); + if (error) + goto out; + if (newvers < curvers) { + error = EINVAL; + goto out; + } + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto out; + } + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, + &newvers, tx); + + spa_history_internal_log(LOG_DS_UPGRADE, + dmu_objset_spa(os), tx, CRED(), + "oldver=%llu newver=%llu dataset = %llu", curvers, newvers, + dmu_objset_id(os)); + dmu_tx_commit(tx); + +out: + dmu_objset_close(os); + return (error); +} + +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + const char *pname; + int error = ENOENT; + + /* + * Look up the file system's value for the property. For the + * version property, we look up a slightly different string.
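zfs_get_zplprop() below is a lookup-with-defaults pattern: try the stored ZAP value first, and on ENOENT substitute a per-property default instead of failing. A stand-alone sketch of the same shape (the lookup helper and default values are hypothetical; only ENOENT is translated, other errors propagate):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

enum { EX_PROP_VERSION, EX_PROP_CASE };

/* Stand-in for zap_lookup(): pretend no value was ever stored. */
static int
ex_lookup(int prop, uint64_t *value)
{
	(void)prop;
	(void)value;
	return (ENOENT);
}

static int
ex_get_zplprop(int prop, uint64_t *value)
{
	int error = ex_lookup(prop, value);

	if (error == ENOENT) {
		switch (prop) {
		case EX_PROP_VERSION:
			*value = 3;	/* hypothetical ZPL_VERSION */
			break;
		case EX_PROP_CASE:
			*value = 0;	/* case sensitive */
			break;
		default:
			return (error);	/* no default: stay ENOENT */
		}
		error = 0;
	}
	return (error);
}

int
main(void)
{
	uint64_t v;

	if (ex_get_zplprop(EX_PROP_VERSION, &v) == 0)
		printf("version defaults to %llu\n", (unsigned long long)v);
	return (0);
}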
+ */ + if (prop == ZFS_PROP_VERSION) + pname = ZPL_VERSION_STR; + else + pname = zfs_prop_to_name(prop); + + if (os != NULL) + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + default: + return (error); + } + error = 0; + } + return (error); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index 49ea690a977a..d37c90e981c3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -19,14 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Portions Copyright 2007 Jeremy Teo */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -46,7 +44,6 @@ #include <sys/cmn_err.h> #include <sys/errno.h> #include <sys/unistd.h> -#include <sys/zfs_vfsops.h> #include <sys/zfs_dir.h> #include <sys/zfs_acl.h> #include <sys/zfs_ioctl.h> @@ -61,8 +58,11 @@ #include <sys/sunddi.h> #include <sys/filio.h> #include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> #include <sys/dnlc.h> #include <sys/zfs_rlock.h> +#include <sys/extdirent.h> +#include <sys/kidmap.h> #include <sys/bio.h> #include <sys/buf.h> #include <sys/sf_buf.h> @@ -74,14 +74,16 @@ * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, - * and wait the the intent log to commit if it's is a synchronous operation. - * Morover, the vnode ops must work in both normal and log replay context. + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). - * A ZFS_EXIT(zfsvfs) is needed before all returns. + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). 
This is for 3 reasons: @@ -154,26 +156,41 @@ * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ + /* ARGSUSED */ static int -zfs_open(vnode_t **vpp, int flag, cred_t *cr) +zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(*vpp); + if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && + ((flag & FAPPEND) == 0)) { + return (EPERM); + } + + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && + zp->z_phys->zp_size > 0) + if (fs_vscan(*vpp, cr, 0) != 0) + return (EACCES); + /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); + return (0); } /* ARGSUSED */ static int -zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) +zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) { znode_t *zp = VTOZ(vp); /* Decrement the synchronous opens in the znode */ - if (flag & (FSYNC | FDSYNC)) + if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); /* @@ -182,6 +199,12 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) cleanlocks(vp, ddi_get_pid(), 0); cleanshares(vp, ddi_get_pid()); + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && + zp->z_phys->zp_size > 0) + VERIFY(fs_vscan(vp, cr, 1) == 0); + return (0); } @@ -231,31 +254,34 @@ zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, - int *rvalp) + int *rvalp, caller_context_t *ct) { offset_t off; int error; zfsvfs_t *zfsvfs; + znode_t *zp; switch (com) { - case _FIOFFS: + case _FIOFFS: return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ - case _FIOGDIO: - case _FIOSDIO: + case _FIOGDIO: + case _FIOSDIO: return (0); - case _FIO_SEEK_DATA: - case _FIO_SEEK_HOLE: + case _FIO_SEEK_DATA: + case _FIO_SEEK_HOLE: if (ddi_copyin((void *)data, &off, sizeof (off), flag)) return (EFAULT); - zfsvfs = VTOZ(vp)->z_zfsvfs; + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); @@ -474,6 +500,7 @@ offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ * and return buffer. * ioflag - SYNC flags; used to provide FRSYNC semantics. * cr - credentials of caller. + * ct - caller context * * OUT: uio - updated offset and range, buffer filled. * @@ -489,12 +516,19 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os = zfsvfs->z_os; + objset_t *os; ssize_t n, nbytes; int error; rl_t *rl; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + os = zfsvfs->z_os; + + if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { + ZFS_EXIT(zfsvfs); + return (EACCES); + } /* * Validate file offset @@ -554,8 +588,12 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) error = mappedread(vp, nbytes, uio); else error = dmu_read_uio(os, zp->z_id, uio, nbytes); - if (error) + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = EIO; break; + } n -= nbytes; } @@ -623,6 +661,7 @@ zfs_prefault_write(ssize_t n, struct uio *uio) * and data buffer. 
* ioflag - IO_APPEND flag set if in append mode. * cr - credentials of caller. + * ct - caller context (NFS/CIFS fem monitor only) * * OUT: uio - updated offset and range. * @@ -643,11 +682,12 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) uint64_t end_size; dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; offset_t woff; ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; + uint64_t pflags; int error; /* @@ -661,6 +701,20 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) limit = MAXOFFSET_T; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * If immutable or not appending then return EPERM + */ + pflags = zp->z_phys->zp_flags; + if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || + ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && + (uio->uio_loffset < zp->z_phys->zp_size))) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + zilog = zfsvfs->z_log; /* * Pre-fault the pages to ensure slow (eg NFS) pages @@ -808,15 +862,18 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * It would be nice to do this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. + * + * Note: we don't call zfs_fuid_map_id() here because + * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, + secpolicy_vnode_setid_retain(vp, cr, (zp->z_phys->zp_mode & S_ISUID) != 0 && zp->z_phys->zp_uid == 0) != 0) { - zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); + zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); } mutex_exit(&zp->z_acl_lock); @@ -872,7 +929,7 @@ zfs_get_done(dmu_buf_t *db, void *vzgd) dmu_buf_rele(db, vzgd); zfs_range_unlock(rl); VN_RELE(vp); - zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp))); + zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); VFS_UNLOCK_GIANT(vfslocked); } @@ -957,11 +1014,10 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) lr->lr_blkoff = off - boff; error = dmu_sync(zio, db, &lr->lr_blkptr, lr->lr_common.lrc_txg, zfs_get_done, zgd); - ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz); - if (error == 0) { - zil_add_vdev(zfsvfs->z_log, - DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr))); - } + ASSERT((error && error != EINPROGRESS) || + lr->lr_length <= zp->z_blksz); + if (error == 0) + zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); /* * If we get EINPROGRESS, then we need to wait for a * write IO initiated by dmu_sync() to complete before @@ -981,14 +1037,21 @@ out: /*ARGSUSED*/ static int -zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr) +zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, + caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); - error = zfs_zaccess_rwx(zp, mode, cr); + ZFS_VERIFY_ZP(zp); + + if (flag & V_ACE_MASK) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); + else + error = zfs_zaccess_rwx(zp, mode, flag, cr); + ZFS_EXIT(zfsvfs); return (error); } @@ -1003,6 +1066,9 @@ zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr) * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller.
+ * ct - caller context + * direntflags - directory lookup flags + * realpnp - returned pathname. * * OUT: vpp - vnode of located entry, NULL if not found. * @@ -1015,19 +1081,21 @@ zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr) /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, - int nameiop, cred_t *cr, kthread_t *td) + int nameiop, cred_t *cr, kthread_t *td, int flags) { - znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error; + int *direntflags = NULL; + void *realpnp = NULL; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zdp); *vpp = NULL; -#ifdef TODO if (flags & LOOKUP_XATTR) { +#ifdef TODO /* * If the xattr property is off, refuse the lookup request. */ @@ -1035,6 +1103,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, ZFS_EXIT(zfsvfs); return (EINVAL); } +#endif /* * We don't allow recursive attributes.. @@ -1054,14 +1123,15 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, * Do we have permission to get into attribute directory? */ - if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) { + if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, + B_FALSE, cr)) { VN_RELE(*vpp); + *vpp = NULL; } ZFS_EXIT(zfsvfs); return (error); } -#endif /* TODO */ if (dvp->v_type != VDIR) { ZFS_EXIT(zfsvfs); @@ -1072,13 +1142,19 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, * Check accessibility of directory. */ - if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) { + if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } - if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) { + if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); + if (error == 0) { /* * Convert device special files */ @@ -1162,6 +1238,8 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. + * ct - caller context + * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. 
* @@ -1172,22 +1250,52 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ + /* ARGSUSED */ static int zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, - vnode_t **vpp, cred_t *cr) + vnode_t **vpp, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - objset_t *os = zfsvfs->z_os; + zilog_t *zilog; + objset_t *os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; - uint64_t zoid; + zfs_acl_t *aclp = NULL; + zfs_fuid_info_t *fuidp = NULL; + void *vsecp = NULL; + int flag = 0; + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || (vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr)))) + return (EINVAL); ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } top: *vpp = NULL; @@ -1204,22 +1312,40 @@ top: error = 0; } else { /* possible VN_HOLD(zp) */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) { + int zflg = 0; + + if (flag & FIGNORECASE) + zflg |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL); + if (error) { if (strcmp(name, "..") == 0) error = EISDIR; ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); + return (error); + } + } + if (vsecp && aclp == NULL) { + error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); + if (error) { + ZFS_EXIT(zfsvfs); + if (dl) + zfs_dirent_unlock(dl); return (error); } } - - zoid = zp ? zp->z_id : -1ULL; if (zp == NULL) { + uint64_t txtype; + /* * Create a new file object and update the directory * to reference it. 
*/ - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { goto out; } @@ -1235,11 +1361,26 @@ top: tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || + IS_EPHEMERAL(crgetgid(cr))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, + FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); + } error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { zfs_dirent_unlock(dl); @@ -1251,14 +1392,23 @@ top: } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); return (error); } - zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); - ASSERT(zp->z_id == zoid); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); (void) zfs_link_create(dl, zp, tx, ZNEW); - zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + if (flag & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, fuidp, vap); + if (fuidp) + zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); } else { + int aflags = (flag & FAPPEND) ? V_APPEND : 0; + /* * A directory entry already exists for this name. */ @@ -1279,7 +1429,7 @@ top: /* * Verify requested access to file. */ - if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) { + if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { goto out; } @@ -1292,13 +1442,12 @@ top: */ if ((ZTOV(zp)->v_type == VREG) && (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { + /* we can't hold any locks when calling zfs_freesp() */ + zfs_dirent_unlock(dl); + dl = NULL; error = zfs_freesp(zp, 0, 0, mode, TRUE); - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { - /* NB: we already did dmu_tx_wait() */ - zfs_dirent_unlock(dl); - VN_RELE(ZTOV(zp)); - goto top; + if (error == 0) { + vnevent_create(ZTOV(zp), ct); } } } @@ -1325,6 +1474,8 @@ out: *vpp = svp; } } + if (aclp) + zfs_acl_free(aclp); ZFS_EXIT(zfsvfs); return (error); @@ -1336,6 +1487,8 @@ out: * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. 
+ * ct - caller context + * flags - case flags * * RETURN: 0 if success * error code if failure @@ -1344,28 +1497,45 @@ out: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ +/*ARGSUSED*/ static int -zfs_remove(vnode_t *dvp, char *name, cred_t *cr) +zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, + int flags) { znode_t *zp, *dzp = VTOZ(dvp); znode_t *xzp = NULL; vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; uint64_t acl_obj, xattr_obj; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; - boolean_t unlinked; + boolean_t unlinked, toobig = FALSE; + uint64_t txtype; + pathname_t *realnmp = NULL; + pathname_t realnm; int error; + int zflg = ZEXISTS; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) { + zflg |= ZCILOOK; + pn_alloc(&realnm); + realnmp = &realnm; + } top: /* * Attempt to lock directory; fail if entry doesn't exist. */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { + if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, realnmp)) { + if (realnmp) + pn_free(realnmp); ZFS_EXIT(zfsvfs); return (error); } @@ -1384,9 +1554,12 @@ top: goto out; } - vnevent_remove(vp); + vnevent_remove(vp, dvp, name, ct); - dnlc_remove(dvp, name); + if (realnmp) + dnlc_remove(dvp, realnmp->pn_buf); + else + dnlc_remove(dvp, name); may_delete_now = FALSE; @@ -1399,8 +1572,13 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); - if (may_delete_now) - dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + if (may_delete_now) { + toobig = + zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; + /* if the file is too big, only hold_free a token amount */ + dmu_tx_hold_free(tx, zp->z_id, 0, + (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); + } /* are there any extended attributes? */ if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { @@ -1425,6 +1603,8 @@ top: dmu_tx_abort(tx); goto top; } + if (realnmp) + pn_free(realnmp); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); @@ -1433,7 +1613,7 @@ top: /* * Remove the directory entry. */ - error = zfs_link_destroy(dl, zp, tx, 0, &unlinked); + error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); if (error) { dmu_tx_commit(tx); @@ -1442,7 +1622,7 @@ top: if (0 && unlinked) { VI_LOCK(vp); - delete_now = may_delete_now && + delete_now = may_delete_now && !toobig && vp->v_count == 1 && !vn_has_cached_data(vp) && zp->z_phys->zp_xattr == xattr_obj && zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; @@ -1469,21 +1649,26 @@ top: VI_UNLOCK(vp); mutex_exit(&zp->z_lock); zfs_znode_delete(zp, tx); - VFS_RELE(zfsvfs->z_vfs); } else if (unlinked) { zfs_unlinked_add(zp, tx); } - zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name); + txtype = TX_REMOVE; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name); dmu_tx_commit(tx); out: + if (realnmp) + pn_free(realnmp); + zfs_dirent_unlock(dl); if (!delete_now) { VN_RELE(vp); } else if (xzp) { - /* this rele delayed to prevent nesting transactions */ + /* this rele is delayed to prevent nesting transactions */ VN_RELE(ZTOV(xzp)); } @@ -1499,6 +1684,8 @@ out: * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. + * ct - caller context + * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. 
* @@ -1509,49 +1696,104 @@ out: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ +/*ARGSUSED*/ static int -zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) +zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, + caller_context_t *ct, int flags, vsecattr_t *vsecp) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; zfs_dirlock_t *dl; - uint64_t zoid = 0; + uint64_t txtype; dmu_tx_t *tx; int error; + zfs_acl_t *aclp = NULL; + zfs_fuid_info_t *fuidp = NULL; + int zf = ZNEW; ASSERT(vap->va_type == VDIR); + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))|| + IS_EPHEMERAL(crgetgid(cr)))) + return (EINVAL); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; if (dzp->z_phys->zp_flags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (EINVAL); } -top: - *vpp = NULL; + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + if (vap->va_mask & AT_XVATTR) + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } /* * First make sure the new directory doesn't exist. */ - if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) { +top: + *vpp = NULL; + + if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, + NULL, NULL)) { ZFS_EXIT(zfsvfs); return (error); } - if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) { + if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } + if (vsecp && aclp == NULL) { + error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); + if (error) { + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || + IS_EPHEMERAL(crgetgid(cr))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } + if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); error = dmu_tx_assign(tx, zfsvfs->z_assign); @@ -1564,13 +1806,18 @@ top: } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); return (error); } /* * Create new node. */ - zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); + + if (aclp) + zfs_acl_free(aclp); /* * Now put new name in parent dir. 
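
[Ed. note] The dmu_tx_hold_*() block above (and its twins in zfs_create() earlier, and in zfs_symlink() and zfs_setattr() later in this diff) is the same FUID reservation each time: if the filesystem has no FUID object yet, hold a new bonus buffer, a write of FUID_SIZE_ESTIMATE() bytes, and the master-node ZAP entry that will point at it; otherwise hold the existing object for rewrite. A hypothetical factoring, shown only to make the repeated pattern explicit; the commit itself keeps this inline, and every DMU call below appears verbatim in the diff.

    /* Hypothetical helper, not part of the commit. */
    static void
    zfs_fuid_tx_hold(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
    {
            if (zfsvfs->z_fuid_obj == 0) {
                    /* First ephemeral id: the FUID object doesn't exist yet. */
                    dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
                    dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
                        FUID_SIZE_ESTIMATE(zfsvfs));
                    dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
            } else {
                    /* Reserve room to rewrite the existing FUID table. */
                    dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
                    dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
                        FUID_SIZE_ESTIMATE(zfsvfs));
            }
    }
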
@@ -1579,7 +1826,13 @@ top: *vpp = ZTOV(zp); - zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname); + txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap); + + if (fuidp) + zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); zfs_dirent_unlock(dl); @@ -1597,6 +1850,8 @@ top: * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. + * ct - caller context + * flags - case flags * * RETURN: 0 if success * error code if failure @@ -1604,27 +1859,35 @@ top: * Timestamps: * dvp - ctime|mtime updated */ +/*ARGSUSED*/ static int -zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) +zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, + caller_context_t *ct, int flags) { znode_t *dzp = VTOZ(dvp); znode_t *zp; vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; + int zflg = ZEXISTS; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + if (flags & FIGNORECASE) + zflg |= ZCILOOK; top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { + if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL)) { ZFS_EXIT(zfsvfs); return (error); } @@ -1645,7 +1908,7 @@ top: goto out; } - vnevent_rmdir(vp); + vnevent_rmdir(vp, dvp, name, ct); /* * Grab a lock on the directory to make sure that no one is @@ -1683,10 +1946,14 @@ top: cache_purge(dvp); #endif - error = zfs_link_destroy(dl, zp, tx, 0, NULL); + error = zfs_link_destroy(dl, zp, tx, zflg, NULL); - if (error == 0) - zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name); + if (error == 0) { + uint64_t txtype = TX_RMDIR; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name); + } dmu_tx_commit(tx); @@ -1713,6 +1980,8 @@ out: * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. + * ct - caller context + * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * @@ -1734,6 +2003,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon { znode_t *zp = VTOZ(vp); iovec_t *iovp; + edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; @@ -1747,11 +2017,14 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon int outcount; int error; uint8_t prefetch; + boolean_t check_sysattrs; uint8_t type; int ncooks; u_long *cooks = NULL; + int flags = 0; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); /* * If we are not given an eof variable, @@ -1809,6 +2082,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon bufsize = bytes_wanted; odp = (struct dirent64 *)iovp->iov_base; } + eodp = (struct edirent *)odp; if (ncookies != NULL) { /* @@ -1819,6 +2093,19 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon *cookies = cooks; *ncookies = ncooks; } + /* + * If this VFS supports the system attribute view interface; and + * we're looking at an extended attribute directory; and we care + * about normalization conflicts on this vfs; then we must check + * for normalization conflicts with the sysattr name space.
+ */ +#ifdef TODO + check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && + (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && + (flags & V_RDDIR_ENTFLAGS); +#else + check_sysattrs = 0; +#endif /* * Transform to file-system independent format @@ -1827,20 +2114,24 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; + off64_t *next; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); + zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); + zap.za_normalization_conflict = 0; objnum = zp->z_phys->zp_parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); + zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { @@ -1870,8 +2161,21 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); + + if (check_sysattrs && !zap.za_normalization_conflict) { +#ifdef TODO + zap.za_normalization_conflict = + xattr_sysattr_casechk(zap.za_name); +#else + panic("%s:%u: TODO", __func__, __LINE__); +#endif + } } - reclen = DIRENT64_RECLEN(strlen(zap.za_name)); + + if (flags & V_RDDIR_ENTFLAGS) + reclen = EDIRENT_RECLEN(strlen(zap.za_name)); + else + reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? @@ -1886,16 +2190,31 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon } break; } - /* - * Add this entry: - */ - odp->d_ino = objnum; - odp->d_reclen = reclen; - odp->d_namlen = strlen(zap.za_name); - (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); - odp->d_type = type; + if (flags & V_RDDIR_ENTFLAGS) { + /* + * Add extended flag entry: + */ + eodp->ed_ino = objnum; + eodp->ed_reclen = reclen; + /* NOTE: ed_off is the offset for the *next* entry */ + next = &(eodp->ed_off); + eodp->ed_eflags = zap.za_normalization_conflict ? + ED_CASE_CONFLICT : 0; + (void) strncpy(eodp->ed_name, zap.za_name, + EDIRENT_NAMELEN(reclen)); + eodp = (edirent_t *)((intptr_t)eodp + reclen); + } else { + /* + * Add normal entry: + */ + odp->d_ino = objnum; + odp->d_reclen = reclen; + odp->d_namlen = strlen(zap.za_name); + (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); + odp->d_type = type; + odp = (dirent64_t *)((intptr_t)odp + reclen); + } outcount += reclen; - odp = (dirent64_t *)((intptr_t)odp + reclen); ASSERT(outcount <= bufsize); @@ -1956,26 +2275,34 @@ update: return (error); } +ulong_t zfs_fsync_sync_cnt = 4; + static int -zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr) +zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; + (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } + /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. - * flags - [UNUSED] + * If AT_XVATTR set, then optional attrs are requested + * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. + * ct - caller context * * OUT: vap - attribute values. 
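
[Ed. note] The zfs_getattr() hunk that follows speaks the xvattr request/return protocol: the caller marks a request bit per optional attribute (XVA_ISSET_REQ) and the filesystem sets the matching return bit (XVA_SET_RTN) only for attributes it actually filled in, so a caller can tell a legitimate zero value from an unanswered question. Below is a simplified, self-contained model of that protocol; the struct, bit values, and getattr_model() are invented for illustration and do not match the real xvattr_t layout.

    #include <stdint.h>
    #include <stdio.h>

    #define XAT_ARCHIVE     (1u << 0)       /* illustrative bit choices */
    #define ZFS_ARCHIVE     (1u << 0)       /* fake zp_flags bit */

    struct xva_model {
            uint32_t xva_reqmask;   /* caller: "please fetch these" */
            uint32_t xva_rtnmask;   /* callee: "these are now valid" */
            int      xoa_archive;
    };

    static void
    getattr_model(struct xva_model *x, uint64_t zp_flags)
    {
            if (x->xva_reqmask & XAT_ARCHIVE) {
                    x->xoa_archive = (zp_flags & ZFS_ARCHIVE) != 0;
                    x->xva_rtnmask |= XAT_ARCHIVE;
            }
            /* ...one such block per optional attribute, as in the diff... */
    }

    int
    main(void)
    {
            struct xva_model x = { .xva_reqmask = XAT_ARCHIVE };

            getattr_model(&x, ZFS_ARCHIVE);
            printf("archive=%d answered=%d\n", x.xoa_archive,
                (x.xva_rtnmask & XAT_ARCHIVE) != 0);
            return (0);
    }
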
* @@ -1983,54 +2310,170 @@ zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr) */ /* ARGSUSED */ static int -zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) +zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_phys_t *pzp = zp->z_phys; + znode_phys_t *pzp; + int error = 0; uint32_t blksize; u_longlong_t nblocks; - int error; + uint64_t links; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap = NULL; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + pzp = zp->z_phys; + + mutex_enter(&zp->z_lock); + + /* + * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. + * Also, if we are the owner don't bother, since owner should + * always be allowed to read basic attributes of file. + */ + if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) && + (pzp->zp_uid != crgetuid(cr))) { + if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, + skipaclchk, cr)) { + mutex_exit(&zp->z_lock); + ZFS_EXIT(zfsvfs); + return (error); + } + } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ - mutex_enter(&zp->z_lock); vap->va_type = IFTOVT(pzp->zp_mode); vap->va_mode = pzp->zp_mode & ~S_IFMT; - vap->va_uid = zp->z_phys->zp_uid; - vap->va_gid = zp->z_phys->zp_gid; + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); +// vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; vap->va_nodeid = zp->z_id; - vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX); /* nlink_t limit! */ + if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) + links = pzp->zp_links + 1; + else + links = pzp->zp_links; + vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ vap->va_size = pzp->zp_size; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_rdev = zfs_cmpldev(pzp->zp_rdev); vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ - ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); - ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); - ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); - ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime); - /* - * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. - * Also, if we are the owner don't bother, since owner should - * always be allowed to read basic attributes of file. + * Add in any requested optional attributes and the create time. + * Also set the corresponding bits in the returned attribute bitmap. 
*/ - if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) && - (zp->z_phys->zp_uid != crgetuid(cr))) { - if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) { - mutex_exit(&zp->z_lock); - ZFS_EXIT(zfsvfs); - return (error); + if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + xoap->xoa_archive = + ((pzp->zp_flags & ZFS_ARCHIVE) != 0); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + xoap->xoa_readonly = + ((pzp->zp_flags & ZFS_READONLY) != 0); + XVA_SET_RTN(xvap, XAT_READONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + xoap->xoa_system = + ((pzp->zp_flags & ZFS_SYSTEM) != 0); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + xoap->xoa_hidden = + ((pzp->zp_flags & ZFS_HIDDEN) != 0); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + xoap->xoa_nounlink = + ((pzp->zp_flags & ZFS_NOUNLINK) != 0); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + xoap->xoa_immutable = + ((pzp->zp_flags & ZFS_IMMUTABLE) != 0); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + xoap->xoa_appendonly = + ((pzp->zp_flags & ZFS_APPENDONLY) != 0); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + xoap->xoa_nodump = + ((pzp->zp_flags & ZFS_NODUMP) != 0); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + xoap->xoa_opaque = + ((pzp->zp_flags & ZFS_OPAQUE) != 0); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + xoap->xoa_av_quarantined = + ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + xoap->xoa_av_modified = + ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && + vp->v_type == VREG && + (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) { + size_t len; + dmu_object_info_t doi; + + /* + * Only VREG files have anti-virus scanstamps, so we + * won't conflict with symlinks in the bonus buffer. + */ + dmu_object_info_from_db(zp->z_dbuf, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + sizeof (znode_phys_t); + if (len <= doi.doi_bonus_size) { + /* + * pzp points to the start of the + * znode_phys_t. pzp + 1 points to the + * first byte after the znode_phys_t. + */ + (void) memcpy(xoap->xoa_av_scanstamp, + pzp + 1, + sizeof (xoap->xoa_av_scanstamp)); + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime); + XVA_SET_RTN(xvap, XAT_CREATETIME); } } + ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); + ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); + ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); + ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime); + mutex_exit(&zp->z_lock); dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks); @@ -2054,8 +2497,11 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) * * IN: vp - vnode of file to be modified. * vap - new attribute values. + * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. 
+ * ct - caller context * * RETURN: 0 if success * error code if failure @@ -2068,10 +2514,10 @@ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { - struct znode *zp = VTOZ(vp); - znode_phys_t *pzp = zp->z_phys; + znode_t *zp = VTOZ(vp); + znode_phys_t *pzp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; uint_t mask = vap->va_mask; @@ -2081,6 +2527,11 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, znode_t *attrzp; int need_policy = FALSE; int err; + zfs_fuid_info_t *fuidp = NULL; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap; + zfs_acl_t *aclp = NULL; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; if (mask == 0) return (0); @@ -2088,13 +2539,69 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, if (mask & AT_NOSET) return (EINVAL); - if (mask & AT_SIZE && vp->v_type == VDIR) + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + pzp = zp->z_phys; + zilog = zfsvfs->z_log; + + /* + * Make sure that if we have ephemeral uid/gid or xvattr specified + * that file system is at proper version level + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || + (mask & AT_XVATTR))) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + if (mask & AT_SIZE && vp->v_type == VDIR) { + ZFS_EXIT(zfsvfs); return (EISDIR); + } - if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) + if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { + ZFS_EXIT(zfsvfs); return (EINVAL); + } - ZFS_ENTER(zfsvfs); + /* + * If this is an xvattr_t, then get a pointer to the structure of + * optional attributes. If this is NULL, then we have a vattr_t. + */ + xoap = xva_getxoptattr(xvap); + + /* + * Immutable files can only alter immutable bit and atime + */ + if ((pzp->zp_flags & ZFS_IMMUTABLE) && + ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || + ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + /* + * Verify timestamps don't overflow 32 bits. + * ZFS can handle large timestamps, but 32bit syscalls can't + * handle times greater than 2039. This check should be removed + * once large timestamps are fully supported. + */ + if (mask & (AT_ATIME | AT_MTIME)) { + if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || + ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { + ZFS_EXIT(zfsvfs); + return (EOVERFLOW); + } + } top: attrzp = NULL; @@ -2109,7 +2616,7 @@ top: */ if (mask & AT_SIZE) { - err = zfs_zaccess(zp, ACE_WRITE_DATA, cr); + err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); @@ -2120,18 +2627,22 @@ top: * block if there are locks present... this * should be addressed in openat(). */ - do { - err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); - /* NB: we already did dmu_tx_wait() if necessary */ - } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT); + /* XXX - would it be OK to generate a log record here?
*/ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } } - if (mask & (AT_ATIME|AT_MTIME)) - need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr); + if (mask & (AT_ATIME|AT_MTIME) || + ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + skipaclchk, cr); if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); @@ -2151,7 +2662,8 @@ top: */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); - take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr); + take_group = (mask & AT_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and @@ -2165,11 +2677,12 @@ top: if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { - if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ - secpolicy_setid_clear(vap, cr); + secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; @@ -2181,12 +2694,38 @@ top: mutex_enter(&zp->z_lock); oldva.va_mode = pzp->zp_mode; - oldva.va_uid = zp->z_phys->zp_uid; - oldva.va_gid = zp->z_phys->zp_gid; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & AT_XVATTR) { + if ((need_policy == FALSE) && + (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) && + xoap->xoa_appendonly != + ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) && + xoap->xoa_nounlink != + ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) && + xoap->xoa_immutable != + ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_NODUMP) && + xoap->xoa_nodump != + ((pzp->zp_flags & ZFS_NODUMP) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) && + xoap->xoa_av_modified != + ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) || + ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) && + ((vp->v_type != VREG && xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) || + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || + (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + mutex_exit(&zp->z_lock); if (mask & AT_MODE) { - if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { @@ -2211,10 +2750,9 @@ top: if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; - } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, - (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp); + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); @@ -2232,25 +2770,58 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); + if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, 
zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } if (mask & AT_MODE) { uint64_t pmode = pzp->zp_mode; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - if (zp->z_phys->zp_acl.z_acl_extern_obj) - dmu_tx_hold_write(tx, - pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE); - else + if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (err); + } + if (pzp->zp_acl.z_acl_extern_obj) { + /* Are we upgrading ACL from old V0 format to new V1 */ + if (zfsvfs->z_version <= ZPL_VERSION_FUID && + pzp->zp_acl.z_acl_version == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, + pzp->zp_acl.z_acl_extern_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, + pzp->zp_acl.z_acl_extern_obj, 0, + aclp->z_acl_bytes); + } + } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, ZFS_ACL_SIZE(MAX_ACL_SIZE)); + 0, aclp->z_acl_bytes); + } } - if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) { - err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp); + if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) { + err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); if (err) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); return (err); } dmu_tx_hold_bonus(tx, attrzp->z_id); @@ -2260,6 +2831,12 @@ top: if (err) { if (attrzp) VN_RELE(ZTOV(attrzp)); + + if (aclp) { + zfs_acl_free(aclp); + aclp = NULL; + } + if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); @@ -2283,26 +2860,36 @@ top: mutex_enter(&zp->z_lock); if (mask & AT_MODE) { - err = zfs_acl_chmod_setattr(zp, new_mode, tx); + mutex_enter(&zp->z_acl_lock); + zp->z_phys->zp_mode = new_mode; + err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); ASSERT3U(err, ==, 0); + mutex_exit(&zp->z_acl_lock); } if (attrzp) mutex_enter(&attrzp->z_lock); if (mask & AT_UID) { - zp->z_phys->zp_uid = (uint64_t)vap->va_uid; + pzp->zp_uid = zfs_fuid_create(zfsvfs, + vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); if (attrzp) { - attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid; + attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs, + vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); } } if (mask & AT_GID) { - zp->z_phys->zp_gid = (uint64_t)vap->va_gid; + pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid, + cr, ZFS_GROUP, tx, &fuidp); if (attrzp) - attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid; + attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs, + vap->va_gid, cr, ZFS_GROUP, tx, &fuidp); } + if (aclp) + zfs_acl_free(aclp); + if (attrzp) mutex_exit(&attrzp->z_lock); @@ -2312,14 +2899,38 @@ top: if (mask & AT_MTIME) ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); + /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE) zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); else if (mask != 0) zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & AT_XVATTR)) { + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + size_t len; + dmu_object_info_t doi; + + ASSERT(vp->v_type == VREG); + + /* Grow the bonus buffer if necessary. 
*/ + dmu_object_info_from_db(zp->z_dbuf, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + sizeof (znode_phys_t); + if (len > doi.doi_bonus_size) + VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0); + } + zfs_xvattr_set(zp, xvap); + } if (mask != 0) - zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask); + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + if (fuidp) + zfs_fuid_info_free(fuidp); mutex_exit(&zp->z_lock); if (attrzp) @@ -2436,6 +3047,8 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) * tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. + * ct - caller context + * flags - case flags * * RETURN: 0 if success * error code if failure @@ -2443,25 +3056,31 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) * Timestamps: * sdvp,tdvp - ctime|mtime updated */ +/*ARGSUSED*/ static int -zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr) +zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, + caller_context_t *ct, int flags) { znode_t *tdzp, *szp, *tzp; znode_t *sdzp = VTOZ(sdvp); zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; vnode_t *realvp; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; - int cmp, serr, terr, error; + int cmp, serr, terr; + int error = 0; + int zflg = 0; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(sdzp); + zilog = zfsvfs->z_log; /* * Make sure we have the real vp for the target directory. */ - if (VOP_REALVP(tdvp, &realvp) == 0) + if (VOP_REALVP(tdvp, &realvp, ct) == 0) tdvp = realvp; if (tdvp->v_vfsp != sdvp->v_vfsp) { @@ -2470,6 +3089,16 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr) } tdzp = VTOZ(tdvp); + ZFS_VERIFY_ZP(tdzp); + if (zfsvfs->z_utf8 && u8_validate(tnm, + strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; + top: szp = NULL; tzp = NULL; @@ -2497,7 +3126,14 @@ top: } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { - cmp = strcmp(snm, tnm); + /* + * First compare the two name arguments without + * considering any case folding. + */ + int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); + + cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); + ASSERT(error == 0 || !zfsvfs->z_utf8); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument @@ -2508,13 +3144,49 @@ top: ZFS_EXIT(zfsvfs); return (0); } + /* + * If the file system is case-folding, then we may + * have some more checking to do. A case-folding file + * system is either supporting mixed case sensitivity + * access or is completely case-insensitive. Note + * that the file system is always case preserving. + * + * In mixed sensitivity mode case sensitive behavior + * is the default. FIGNORECASE must be used to + * explicitly request case insensitive behavior. + * + * If the source and target names provided differ only + * by case (e.g., a request to rename 'tim' to 'Tim'), + * we will treat this as a special case in the + * case-insensitive mode: as long as the source name + * is an exact match, we will allow this to proceed as + * a name-change request. 
+ */ + if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + (zfsvfs->z_case == ZFS_CASE_MIXED && + flags & FIGNORECASE)) && + u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, + &error) == 0) { + /* + * case preserving rename request, require exact + * name matches + */ + zflg |= ZCIEXACT; + zflg &= ~ZCILOOK; + } } + if (cmp < 0) { - serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); - terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); + serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, + ZEXISTS | zflg, NULL, NULL); + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); } else { - terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); - serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, zflg, NULL, NULL); + serr = zfs_dirent_lock(&sdl, + sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, + NULL, NULL); } if (serr) { @@ -2588,9 +3260,17 @@ top: } } - vnevent_rename_src(ZTOV(szp)); + vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); if (tzp) - vnevent_rename_dest(ZTOV(tzp)); + vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); + + /* + * notify the target directory if it is not the same + * as source directory. + */ + if (tdvp != sdvp) { + vnevent_rename_dest_dir(tdvp, ct); + } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ @@ -2622,15 +3302,22 @@ top: } if (tzp) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdl, tzp, tx, 0, NULL); + error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); if (error == 0) { error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error == 0) { + szp->z_phys->zp_flags |= ZFS_AV_MODIFIED; + error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); ASSERT(error == 0); - zfs_log_rename(zilog, tx, TX_RENAME, sdzp, - sdl->dl_name, tdzp, tdl->dl_name, szp); + + zfs_log_rename(zilog, tx, + TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0), + sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); + + /* Update path information for the target vnode */ + vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm)); } #ifdef FREEBSD_NAMECACHE if (error == 0) { @@ -2665,6 +3352,8 @@ out: * vap - Attributes of new entry. * target - Target path of new symlink. * cr - credentials of caller. + * ct - caller context + * flags - case flags * * RETURN: 0 if success * error code if failure @@ -2672,23 +3361,37 @@ out: * Timestamps: * dvp - ctime|mtime updated */ +/*ARGSUSED*/ static int -zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) +zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, + cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - uint64_t zoid; + zilog_t *zilog; int len = strlen(link); int error; + int zflg = ZNEW; + zfs_fuid_info_t *fuidp = NULL; + int flags = 0; ASSERT(vap->va_type == VLNK); ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + if (flags & FIGNORECASE) + zflg |= ZCILOOK; top: - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } @@ -2701,7 +3404,8 @@ top: /* * Attempt to lock directory; fail if entry already exists. 
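
[Ed. note] The rename case-folding rules spelled out in the comment above compress to a small decision: start from ZCILOOK when FIGNORECASE was passed, and when the two names are equal under folding on a case-folding filesystem, switch to an exact-case source lookup (ZCIEXACT) so that 'tim' -> 'Tim' behaves as a pure name change. A compact userland model follows; the flag values, the case-mode enum, and strcasecmp() (standing in for u8_strcmp() with the normalization flags) are illustrative assumptions.

    #include <stdio.h>
    #include <strings.h>

    enum { CASE_SENSITIVE, CASE_INSENSITIVE, CASE_MIXED };
    #define ZCILOOK         0x04    /* illustrative values */
    #define ZCIEXACT        0x08

    static int
    rename_zflg(int casemode, int ignorecase, const char *snm,
        const char *tnm)
    {
            int zflg = ignorecase ? ZCILOOK : 0;

            if ((casemode == CASE_INSENSITIVE ||
                (casemode == CASE_MIXED && ignorecase)) &&
                strcasecmp(snm, tnm) == 0) {
                    /* Equal under folding: require exact source match. */
                    zflg |= ZCIEXACT;
                    zflg &= ~ZCILOOK;
            }
            return (zflg);
    }

    int
    main(void)
    {
            /* 'tim' -> 'Tim' on a case-insensitive fs: 0x8 (ZCIEXACT). */
            printf("0x%x\n", rename_zflg(CASE_INSENSITIVE, 0, "tim", "Tim"));
            return (0);
    }
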
*/ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) { + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); + if (error) { ZFS_EXIT(zfsvfs); return (error); } @@ -2712,6 +3416,18 @@ top: dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); + if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { zfs_dirent_unlock(dl); @@ -2732,23 +3448,22 @@ top: * Put the link content into bonus buffer if it will fit; * otherwise, store it just like any other file data. */ - zoid = 0; if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { - zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp); if (len != 0) bcopy(link, zp->z_phys + 1, len); } else { dmu_buf_t *dbp; - zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); - + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp); /* * Nothing can access the znode yet so no locking needed * for growing the znode's blocksize. */ zfs_grow_blocksize(zp, len, tx); - VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp)); + VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, + zp->z_id, 0, FTAG, &dbp)); dmu_buf_will_dirty(dbp, tx); ASSERT3U(len, <=, dbp->db_size); @@ -2763,9 +3478,14 @@ top: (void) zfs_link_create(dl, zp, tx, ZNEW); out: if (error == 0) { - zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link); + uint64_t txtype = TX_SYMLINK; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *vpp = ZTOV(zp); } + if (fuidp) + zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); @@ -2782,6 +3502,7 @@ out: * IN: vp - vnode of symbolic link. * uio - structure to contain the link path. * cr - credentials of caller. + * ct - caller context * * OUT: uio - structure to contain the link path. * @@ -2793,7 +3514,7 @@ out: */ /* ARGSUSED */ static int -zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) +zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; @@ -2801,6 +3522,7 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) int error; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); bufsz = (size_t)zp->z_phys->zp_size; if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { @@ -2830,6 +3552,7 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller.
+ * ct - caller context * * RETURN: 0 if success * error code if failure @@ -2840,30 +3563,44 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) */ /* ARGSUSED */ static int -zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr) +zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, + caller_context_t *ct, int flags) { znode_t *dzp = VTOZ(tdvp); znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; vnode_t *realvp; int error; + int zf = ZNEW; + uid_t owner; ASSERT(tdvp->v_type == VDIR); ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; - if (VOP_REALVP(svp, &realvp) == 0) + if (VOP_REALVP(svp, &realvp, ct) == 0) svp = realvp; if (svp->v_vfsp != tdvp->v_vfsp) { ZFS_EXIT(zfsvfs); return (EXDEV); } - szp = VTOZ(svp); + ZFS_VERIFY_ZP(szp); + + if (zfsvfs->z_utf8 && u8_validate(name, + strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + top: /* * We do not support links between attributes and non-attributes @@ -2886,13 +3623,14 @@ top: return (EPERM); } - if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) && - secpolicy_basic_link(cr) != 0) { + owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER); + if (owner != crgetuid(cr) && + secpolicy_basic_link(svp, cr) != 0) { ZFS_EXIT(zfsvfs); return (EPERM); } - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } @@ -2900,7 +3638,8 @@ top: /* * Attempt to lock directory; fail if entry already exists. */ - if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) { + error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); + if (error) { ZFS_EXIT(zfsvfs); return (error); } @@ -2923,40 +3662,45 @@ top: error = zfs_link_create(dl, szp, tx, 0); - if (error == 0) - zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name); + if (error == 0) { + uint64_t txtype = TX_LINK; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_link(zilog, tx, txtype, dzp, szp, name); + } dmu_tx_commit(tx); zfs_dirent_unlock(dl); + if (error == 0) { + vnevent_link(svp, ct); + } + ZFS_EXIT(zfsvfs); return (error); } +/*ARGSUSED*/ void -zfs_inactive(vnode_t *vp, cred_t *cr) +zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; - rw_enter(&zfsvfs->z_um_lock, RW_READER); - if (zfsvfs->z_unmounted2) { - ASSERT(zp->z_dbuf_held == 0); - + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_dbuf == NULL) { + /* + * The fs has been unmounted, or we did a + * suspend/resume and this file no longer exists. 
+ */ mutex_enter(&zp->z_lock); VI_LOCK(vp); vp->v_count = 0; /* count arrives as 1 */ - VI_UNLOCK(vp); - if (zp->z_dbuf == NULL) { - mutex_exit(&zp->z_lock); - zfs_znode_free(zp); - } else { - mutex_exit(&zp->z_lock); - } - rw_exit(&zfsvfs->z_um_lock); - VFS_RELE(zfsvfs->z_vfs); + mutex_exit(&zp->z_lock); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + zfs_znode_free(zp); return; } @@ -2977,23 +3721,26 @@ zfs_inactive(vnode_t *vp, cred_t *cr) } zfs_zinactive(zp); - rw_exit(&zfsvfs->z_um_lock); + rw_exit(&zfsvfs->z_teardown_inactive_lock); } CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); +/*ARGSUSED*/ static int -zfs_fid(vnode_t *vp, fid_t *fidp) +zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint32_t gen = (uint32_t)zp->z_phys->zp_gen; + uint32_t gen; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + gen = (uint32_t)zp->z_gen; size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; fidp->fid_len = size; @@ -3030,7 +3777,8 @@ zfs_fid(vnode_t *vp, fid_t *fidp) } static int -zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) +zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) { znode_t *zp, *xzp; zfsvfs_t *zfsvfs; @@ -3051,9 +3799,10 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); *valp = 0; error = zfs_dirent_lock(&dl, zp, "", &xzp, - ZXATTR | ZEXISTS | ZSHARED); + ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); if (error == 0) { zfs_dirent_unlock(dl); if (!zfs_dirempty(xzp)) @@ -3086,14 +3835,17 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) #ifdef TODO /*ARGSUSED*/ static int -zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) +zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, + caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); - error = zfs_getacl(zp, vsecp, cr); + ZFS_VERIFY_ZP(zp); + error = zfs_getacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); @@ -3103,14 +3855,17 @@ zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) #ifdef TODO /*ARGSUSED*/ static int -zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) +zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, + caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); - error = zfs_setacl(zp, vsecp, cr); + ZFS_VERIFY_ZP(zp); + error = zfs_setacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); } @@ -3129,7 +3884,7 @@ zfs_freebsd_open(ap) znode_t *zp = VTOZ(vp); int error; - error = zfs_open(&vp, ap->a_mode, ap->a_cred); + error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); if (error == 0) vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td); return (error); @@ -3145,7 +3900,7 @@ zfs_freebsd_close(ap) } */ *ap; { - return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred)); + return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL)); } static int @@ -3161,7 +3916,7 @@ zfs_freebsd_ioctl(ap) { return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, - ap->a_fflag, ap->a_cred, NULL)); + ap->a_fflag, ap->a_cred, NULL, NULL)); } static int @@ -3194,13 +3949,13 @@ static int zfs_freebsd_access(ap) struct vop_access_args /* { struct vnode *a_vp; - accmode_t a_accmode; + int a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { - return (zfs_access(ap->a_vp, ap->a_accmode, 0, ap->a_cred)); + return (zfs_access(ap->a_vp, ap->a_accmode, 0, ap->a_cred, NULL)); } static int @@ -3218,7 +3973,7 @@ zfs_freebsd_lookup(ap) strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, - cnp->cn_cred, cnp->cn_thread)); + cnp->cn_cred, cnp->cn_thread, 0)); } static int @@ -3240,7 +3995,7 @@ zfs_freebsd_create(ap) mode = vap->va_mode & ALLPERMS; return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, - ap->a_vpp, cnp->cn_cred)); + ap->a_vpp, cnp->cn_cred, cnp->cn_thread)); } static int @@ -3255,7 +4010,7 @@ zfs_freebsd_remove(ap) ASSERT(ap->a_cnp->cn_flags & SAVENAME); return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr, - ap->a_cnp->cn_cred)); + ap->a_cnp->cn_cred, NULL, 0)); } static int @@ -3274,7 +4029,7 @@ zfs_freebsd_mkdir(ap) vattr_init_mask(vap); return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, - ap->a_cnp->cn_cred)); + ap->a_cnp->cn_cred, NULL, 0, NULL)); } static int @@ -3289,7 +4044,7 @@ zfs_freebsd_rmdir(ap) ASSERT(cnp->cn_flags & SAVENAME); - return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred)); + return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0)); } static int @@ -3318,7 +4073,7 @@ zfs_freebsd_fsync(ap) { vop_stdfsync(ap); - return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred)); + return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); } static int @@ -3327,10 +4082,45 @@ zfs_freebsd_getattr(ap) struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; + struct thread *a_td; } */ *ap; { + vattr_t *vap = ap->a_vap; + xvattr_t xvap; + u_long fflags = 0; + int error; + + xva_init(&xvap); + xvap.xva_vattr = *vap; + xvap.xva_vattr.va_mask |= AT_XVATTR; + + /* Convert chflags into ZFS-type flags. */ + /* XXX: what about SF_SETTABLE?. */ + XVA_SET_REQ(&xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&xvap, XAT_APPENDONLY); + XVA_SET_REQ(&xvap, XAT_NOUNLINK); + XVA_SET_REQ(&xvap, XAT_NODUMP); + error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); + if (error != 0) + return (error); - return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred)); + /* Convert ZFS xattr into chflags. 
+#define FLAG_CHECK(fflag, xflag, xfield) do { \ + if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ + fflags |= (fflag); \ +} while (0) + FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, + xvap.xva_xoptattrs.xoa_immutable); + FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, + xvap.xva_xoptattrs.xoa_appendonly); + FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, + xvap.xva_xoptattrs.xoa_nounlink); + FLAG_CHECK(UF_NODUMP, XAT_NODUMP, + xvap.xva_xoptattrs.xoa_nodump); +#undef FLAG_CHECK + *vap = xvap.xva_vattr; + vap->va_flags = fflags; + return (0); } static int
@@ -3339,18 +4129,46 @@ zfs_freebsd_setattr(ap) struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; + struct thread *a_td; } */ *ap; { vattr_t *vap = ap->a_vap; - - /* No support for FreeBSD's chflags(2). */ - if (vap->va_flags != VNOVAL) - return (EOPNOTSUPP); + xvattr_t xvap; + u_long fflags; + uint64_t zflags; vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; - return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL)); + xva_init(&xvap); + xvap.xva_vattr = *vap; + + if (vap->va_flags != VNOVAL) { + fflags = vap->va_flags; + if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0) + return (EOPNOTSUPP); + zflags = VTOZ(ap->a_vp)->z_phys->zp_flags; +
+#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ + if (((fflags & (fflag)) && !(zflags & (zflag))) || \ + ((zflags & (zflag)) && !(fflags & (fflag)))) { \ + XVA_SET_REQ(&xvap, (xflag)); \ + (xfield) = ((fflags & (fflag)) != 0); \ + } \ +} while (0) + /* Convert chflags into ZFS-type flags. */ + /* XXX: what about SF_SETTABLE? */ + FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, + xvap.xva_xoptattrs.xoa_immutable); + FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, + xvap.xva_xoptattrs.xoa_appendonly); + FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, + xvap.xva_xoptattrs.xoa_nounlink); + FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, + xvap.xva_xoptattrs.xoa_nodump); +#undef FLAG_CHANGE + } + return (zfs_setattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL)); } static int
@@ -3374,7 +4192,7 @@ zfs_freebsd_rename(ap) ASSERT(ap->a_tcnp->cn_flags & SAVENAME); error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp, - ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred); + ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0); if (tdvp == tvp) VN_RELE(tdvp);
@@ -3419,7 +4237,7 @@ zfs_freebsd_readlink(ap) } */ *ap; { - return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred)); + return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); } static int
@@ -3434,7 +4252,7 @@ zfs_freebsd_link(ap) ASSERT(cnp->cn_flags & SAVENAME); - return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); + return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); } static int
@@ -3446,10 +4264,23 @@ zfs_freebsd_inactive(ap) { vnode_t *vp = ap->a_vp; - zfs_inactive(vp, ap->a_td->td_ucred); + zfs_inactive(vp, ap->a_td->td_ucred, NULL); return (0); }
+static void +zfs_reclaim_complete(void *arg, int pending) +{ + znode_t *zp = arg; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_LOG(1, "zp=%p", zp); + ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); + zfs_znode_free(zp); +} +
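
zfs_reclaim_complete() above runs from a task queue when zfs_freebsd_reclaim() below cannot take the per-object mutex without risking deadlock and must defer znode destruction. A minimal sketch of that FreeBSD deferral pattern (the task and function names here are illustrative, not from the patch):

	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/taskqueue.h>

	static struct task example_task;

	static void
	example_task_fn(void *arg, int pending)
	{
		/* Runs later in the taskqueue_thread context, where sleeping is safe. */
	}

	static void
	example_defer(void *arg)
	{
		TASK_INIT(&example_task, 0, example_task_fn, arg);
		taskqueue_enqueue(taskqueue_thread, &example_task);
	}
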
static int zfs_freebsd_reclaim(ap) struct vop_reclaim_args /* {
@@ -3460,7 +4291,6 @@ zfs_freebsd_reclaim(ap) vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs; - int rele = 1; ASSERT(zp != NULL);
@@ -3471,24 +4301,34 @@ zfs_freebsd_reclaim(ap) mutex_enter(&zp->z_lock); ASSERT(zp->z_phys); - ASSERT(zp->z_dbuf_held); - zfsvfs = zp->z_zfsvfs; + ZTOV(zp) = NULL; if (!zp->z_unlinked) { - zp->z_dbuf_held = 0; - ZTOV(zp) = NULL; + int locked; + + zfsvfs = zp->z_zfsvfs; mutex_exit(&zp->z_lock); - dmu_buf_rele(zp->z_dbuf, NULL); + locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 : + ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id); + if (locked == 0) { + /* + * Lock can't be obtained due to deadlock possibility, + * so defer znode destruction. + */ + TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp); + taskqueue_enqueue(taskqueue_thread, &zp->z_task); + } else { + zfs_znode_dmu_fini(zp); + if (locked == 1) + ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); + zfs_znode_free(zp); + } } else { mutex_exit(&zp->z_lock); } VI_LOCK(vp); - if (vp->v_count > 0) - rele = 0; vp->v_data = NULL; ASSERT(vp->v_holdcnt >= 1); VI_UNLOCK(vp); - if (!zp->z_unlinked && rele) - VFS_RELE(zfsvfs->z_vfs); return (0); }
@@ -3500,7 +4340,7 @@ zfs_freebsd_fid(ap) } */ *ap; { - return (zfs_fid(ap->a_vp, (void *)ap->a_fid)); + return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); } static int
@@ -3514,7 +4354,7 @@ zfs_freebsd_pathconf(ap) ulong_t val; int error; - error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred); + error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); if (error == 0) *ap->a_retval = val; else if (error == EOPNOTSUPP)
@@ -3522,52 +4362,408 @@ zfs_freebsd_pathconf(ap) return (error); }
+/* + * FreeBSD's extended attribute namespaces determine the file name prefix + * used for the underlying ZFS extended attribute name: + * + * NAMESPACE PREFIX + * system freebsd:system: + * user (none; can be used to access ZFS fsattr(5) attributes + * created on Solaris) + */ +static int +zfs_create_attrname(int attrnamespace, const char *name, char *attrname, + size_t size) +{ + const char *namespace, *prefix, *suffix; + + /* We don't allow the '/' character in an attribute name. */ + if (strchr(name, '/') != NULL) + return (EINVAL); + /* We don't allow attribute names that start with the "freebsd:" prefix. */ + if (strncmp(name, "freebsd:", 8) == 0) + return (EINVAL); + + bzero(attrname, size); + + switch (attrnamespace) { + case EXTATTR_NAMESPACE_USER: +#if 0 + prefix = "freebsd:"; + namespace = EXTATTR_NAMESPACE_USER_STRING; + suffix = ":"; +#else + /* + * This is the default namespace by which we can access all + * attributes created on Solaris. + */ + prefix = namespace = suffix = ""; +#endif + break; + case EXTATTR_NAMESPACE_SYSTEM: + prefix = "freebsd:"; + namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; + suffix = ":"; + break; + case EXTATTR_NAMESPACE_EMPTY: + default: + return (EINVAL); + } + if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, + name) >= size) { + return (ENAMETOOLONG); + } + return (0); +} +
+/* + * Vnode operation to retrieve a named extended attribute.
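 *
 * To illustrate the mapping performed by zfs_create_attrname() above
 * (the attribute names "md5" and "myattr" are hypothetical):
 *
 *	zfs_create_attrname(EXTATTR_NAMESPACE_SYSTEM, "md5", buf, size)
 *		stores "freebsd:system:md5" in buf
 *	zfs_create_attrname(EXTATTR_NAMESPACE_USER, "myattr", buf, size)
 *		stores "myattr" in buf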
+ */ +static int +zfs_getextattr(struct vop_getextattr_args *ap) +/* +vop_getextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + struct vattr va; + vnode_t *xvp = NULL, *vp; + int error, flags; + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof(attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + flags = FREAD; + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname, + xvp, td); + error = vn_open_cred(&nd, &flags, 0, ap->a_cred, NULL); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (ap->a_size != NULL) { + error = VOP_GETATTR(vp, &va, ap->a_cred); + if (error == 0) + *ap->a_size = (size_t)va.va_size; + } else if (ap->a_uio != NULL) + error = VOP_READ(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred); + + VOP_UNLOCK(vp, 0); + vn_close(vp, flags, ap->a_cred, td); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Vnode operation to remove a named attribute. + */ +int +zfs_deleteextattr(struct vop_deleteextattr_args *ap) +/* +vop_deleteextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + struct vattr va; + vnode_t *xvp = NULL, *vp; + int error, flags; + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof(attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE, + UIO_SYSSPACE, attrname, xvp, td); + error = namei(&nd); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + VOP_LEASE(nd.ni_dvp, td, ap->a_cred, LEASE_WRITE); + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); + + vput(nd.ni_dvp); + if (vp == nd.ni_dvp) + vrele(vp); + else + vput(vp); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Vnode operation to set a named attribute. 
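 *
 * From userland this path is reached through the extattr(2) family; a
 * hypothetical call (file path, attribute name, and buffer are
 * illustrative only) would be:
 *
 *	extattr_set_file("/tank/file", EXTATTR_NAMESPACE_USER,
 *	    "myattr", buf, buflen);
 *
 * The attribute ends up stored as a regular file of that name inside
 * the file's hidden extended-attribute directory.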
+ */ +static int +zfs_setextattr(struct vop_setextattr_args *ap) +/* +vop_setextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + struct vattr va; + vnode_t *xvp = NULL, *vp; + int error, flags; + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof(attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + flags = FFLAGS(O_WRONLY | O_CREAT); + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname, + xvp, td); + error = vn_open_cred(&nd, &flags, 0600, ap->a_cred, NULL); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + VOP_LEASE(vp, td, ap->a_cred, LEASE_WRITE); + VATTR_NULL(&va); + va.va_size = 0; + error = VOP_SETATTR(vp, &va, ap->a_cred); + if (error == 0) + VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred); + + VOP_UNLOCK(vp, 0); + vn_close(vp, flags, ap->a_cred, td); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Vnode operation to retrieve extended attributes on a vnode. + */ +static int +zfs_listextattr(struct vop_listextattr_args *ap) +/* +vop_listextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrprefix[16]; + u_char dirbuf[sizeof(struct dirent)]; + struct dirent *dp; + struct iovec aiov; + struct uio auio, *uio = ap->a_uio; + size_t *sizep = ap->a_size; + size_t plen; + vnode_t *xvp = NULL, *vp; + int done, error, eof, pos; + + error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, + sizeof(attrprefix)); + if (error != 0) + return (error); + plen = strlen(attrprefix); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE, UIO_SYSSPACE, + ".", xvp, td); + error = namei(&nd); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_rw = UIO_READ; + auio.uio_offset = 0; + + if (sizep != NULL) + *sizep = 0; + + do { + u_char nlen; + + aiov.iov_base = (void *)dirbuf; + aiov.iov_len = sizeof(dirbuf); + auio.uio_resid = sizeof(dirbuf); + error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); + done = sizeof(dirbuf) - auio.uio_resid; + if (error != 0) + break; + for (pos = 0; pos < done;) { + dp = (struct dirent *)(dirbuf + pos); + pos += dp->d_reclen; + /* + * XXX: Temporarily we also accept DT_UNKNOWN, as this + * is what we get when attribute was created on Solaris. 
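 *
 * For reference, a consumer-side sketch of the list format emitted
 * below (one length byte, then the name, repeated; buf and len are
 * illustrative):
 *
 *	for (pos = 0; pos < len; ) {
 *		u_char nlen = buf[pos++];
 *		printf("%.*s\n", (int)nlen, buf + pos);
 *		pos += nlen;
 *	}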
+ */ + if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) + continue; + if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) + continue; + else if (strncmp(dp->d_name, attrprefix, plen) != 0) + continue; + nlen = dp->d_namlen - plen; + if (sizep != NULL) + *sizep += 1 + nlen; + else if (uio != NULL) { + /* + * Format of extattr name entry is one byte for + * length and the rest for name. + */ + error = uiomove(&nlen, 1, uio->uio_rw, uio); + if (error == 0) { + error = uiomove(dp->d_name + plen, nlen, + uio->uio_rw, uio); + } + if (error != 0) + break; + } + } + } while (!eof && error == 0); + + vput(vp); + ZFS_EXIT(zfsvfs); + + return (error); +} + struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_vnodeops = { - .vop_default = &default_vnodeops, - .vop_inactive = zfs_freebsd_inactive, - .vop_reclaim = zfs_freebsd_reclaim, - .vop_access = zfs_freebsd_access, + .vop_default = &default_vnodeops, + .vop_inactive = zfs_freebsd_inactive, + .vop_reclaim = zfs_freebsd_reclaim, + .vop_access = zfs_freebsd_access, #ifdef FREEBSD_NAMECACHE - .vop_lookup = vfs_cache_lookup, - .vop_cachedlookup = zfs_freebsd_lookup, + .vop_lookup = vfs_cache_lookup, + .vop_cachedlookup = zfs_freebsd_lookup, #else - .vop_lookup = zfs_freebsd_lookup, + .vop_lookup = zfs_freebsd_lookup, #endif - .vop_getattr = zfs_freebsd_getattr, - .vop_setattr = zfs_freebsd_setattr, - .vop_create = zfs_freebsd_create, - .vop_mknod = zfs_freebsd_create, - .vop_mkdir = zfs_freebsd_mkdir, - .vop_readdir = zfs_freebsd_readdir, - .vop_fsync = zfs_freebsd_fsync, - .vop_open = zfs_freebsd_open, - .vop_close = zfs_freebsd_close, - .vop_rmdir = zfs_freebsd_rmdir, - .vop_ioctl = zfs_freebsd_ioctl, - .vop_link = zfs_freebsd_link, - .vop_symlink = zfs_freebsd_symlink, - .vop_readlink = zfs_freebsd_readlink, - .vop_read = zfs_freebsd_read, - .vop_write = zfs_freebsd_write, - .vop_remove = zfs_freebsd_remove, - .vop_rename = zfs_freebsd_rename, - .vop_pathconf = zfs_freebsd_pathconf, - .vop_bmap = VOP_EOPNOTSUPP, - .vop_fid = zfs_freebsd_fid, + .vop_getattr = zfs_freebsd_getattr, + .vop_setattr = zfs_freebsd_setattr, + .vop_create = zfs_freebsd_create, + .vop_mknod = zfs_freebsd_create, + .vop_mkdir = zfs_freebsd_mkdir, + .vop_readdir = zfs_freebsd_readdir, + .vop_fsync = zfs_freebsd_fsync, + .vop_open = zfs_freebsd_open, + .vop_close = zfs_freebsd_close, + .vop_rmdir = zfs_freebsd_rmdir, + .vop_ioctl = zfs_freebsd_ioctl, + .vop_link = zfs_freebsd_link, + .vop_symlink = zfs_freebsd_symlink, + .vop_readlink = zfs_freebsd_readlink, + .vop_read = zfs_freebsd_read, + .vop_write = zfs_freebsd_write, + .vop_remove = zfs_freebsd_remove, + .vop_rename = zfs_freebsd_rename, + .vop_pathconf = zfs_freebsd_pathconf, + .vop_bmap = VOP_EOPNOTSUPP, + .vop_fid = zfs_freebsd_fid, + .vop_getextattr = zfs_getextattr, + .vop_deleteextattr = zfs_deleteextattr, + .vop_setextattr = zfs_setextattr, + .vop_listextattr = zfs_listextattr, }; struct vop_vector zfs_fifoops = { - .vop_default = &fifo_specops, - .vop_fsync = VOP_PANIC, - .vop_access = zfs_freebsd_access, - .vop_getattr = zfs_freebsd_getattr, - .vop_inactive = zfs_freebsd_inactive, - .vop_read = VOP_PANIC, - .vop_reclaim = zfs_freebsd_reclaim, - .vop_setattr = zfs_freebsd_setattr, - .vop_write = VOP_PANIC, - .vop_fid = zfs_freebsd_fid, + .vop_default = &fifo_specops, + .vop_fsync = VOP_PANIC, + .vop_access = zfs_freebsd_access, + .vop_getattr = zfs_freebsd_getattr, + .vop_inactive = zfs_freebsd_inactive, + .vop_read = VOP_PANIC, + .vop_reclaim = zfs_freebsd_reclaim, + 
.vop_setattr = zfs_freebsd_setattr, + .vop_write = VOP_PANIC, + .vop_fid = zfs_freebsd_fid, }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c index a964ec257f30..86838df837f2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -19,14 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Portions Copyright 2007 Jeremy Teo */ -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef _KERNEL #include <sys/types.h> #include <sys/param.h> @@ -35,11 +33,12 @@ #include <sys/sysmacros.h> #include <sys/resource.h> #include <sys/mntent.h> +#include <sys/u8_textprep.h> +#include <sys/dsl_dataset.h> #include <sys/vfs.h> #include <sys/vnode.h> #include <sys/file.h> #include <sys/kmem.h> -#include <sys/cmn_err.h> #include <sys/errno.h> #include <sys/unistd.h> #include <sys/atomic.h> @@ -47,7 +46,9 @@ #include <sys/zfs_acl.h> #include <sys/zfs_ioctl.h> #include <sys/zfs_rlock.h> +#include <sys/zfs_fuid.h> #include <sys/fs/zfs.h> +#include <sys/kidmap.h> #endif /* _KERNEL */ #include <sys/dmu.h> @@ -57,26 +58,53 @@ #include <sys/zfs_znode.h> #include <sys/refcount.h> +#include "zfs_prop.h" + /* Used by fstat(1). */ SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t), "sizeof(znode_t)"); /* + * Define ZNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. + */ +#ifdef DEBUG +#define ZNODE_STATS +#endif /* DEBUG */ + +#ifdef ZNODE_STATS +#define ZNODE_STAT_ADD(stat) ((stat)++) +#else +#define ZNODE_STAT_ADD(stat) /* nothing */ +#endif /* ZNODE_STATS */ + +#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) +#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) + +/* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL -struct kmem_cache *znode_cache = NULL; +static kmem_cache_t *znode_cache = NULL; /*ARGSUSED*/ static void -znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr) +znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) { +#if 1 /* XXXPJD: From OpenSolaris. */ + /* + * We should never drop all dbuf refs without first clearing + * the eviction callback. + */ + panic("evicting znode %p\n", user_ptr); +#else /* XXXPJD */ znode_t *zp = user_ptr; vnode_t *vp; mutex_enter(&zp->z_lock); + zp->z_dbuf = NULL; vp = ZTOV(zp); if (vp == NULL) { mutex_exit(&zp->z_lock); @@ -85,16 +113,15 @@ znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr) ZTOV(zp) = NULL; vhold(vp); mutex_exit(&zp->z_lock); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); vrecycle(vp, curthread); VOP_UNLOCK(vp, 0); vdrop(vp); zfs_znode_free(zp); } else { - /* signal force unmount that this znode can be freed */ - zp->z_dbuf = NULL; mutex_exit(&zp->z_lock); } +#endif } extern struct vop_vector zfs_vnodeops; @@ -107,24 +134,29 @@ extern struct vop_vector zfs_fifoops; * 'cdrarg' is defined at kmem_cache_create() time. 
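 *
 * For reference, the cache itself is created in zfs_znode_init() below;
 * a sketch of that registration (the NULL constructor there is
 * deliberate, since the constructor is currently invoked by hand from
 * zfs_znode_alloc() and zfs_create_fs()):
 *
 *	znode_cache = kmem_cache_create("zfs_znode_cache",
 *	    sizeof (znode_t), 0, NULL, zfs_znode_cache_destructor,
 *	    NULL, NULL, NULL, 0);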
*/ static int -zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags) +zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { znode_t *zp = buf; vnode_t *vp; - vfs_t *vfsp = cdrarg; + vfs_t *vfsp = arg; int error; - if (cdrarg != NULL) { - error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp); - ASSERT(error == 0); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - zp->z_vnode = vp; - vp->v_data = (caddr_t)zp; - VN_LOCK_AREC(vp); - VN_LOCK_ASHARE(vp); - } else { - zp->z_vnode = NULL; - } + POINTER_INVALIDATE(&zp->z_zfsvfs); + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + ASSERT(vfsp != NULL); + + error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp); + if (error != 0 && (kmflags & KM_NOSLEEP)) + return (-1); + ASSERT(error == 0); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + zp->z_vnode = vp; + vp->v_data = (caddr_t)zp; + VN_LOCK_AREC(vp); + VN_LOCK_ASHARE(vp); + + list_link_init(&zp->z_link_node); + mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); @@ -135,29 +167,189 @@ zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags) avl_create(&zp->z_range_avl, zfs_range_compare, sizeof (rl_t), offsetof(rl_t, r_node)); - zp->z_dbuf_held = 0; - zp->z_dirlocks = 0; + zp->z_dbuf = NULL; + zp->z_dirlocks = NULL; return (0); } /*ARGSUSED*/ static void -zfs_znode_cache_destructor(void *buf, void *cdarg) +zfs_znode_cache_destructor(void *buf, void *arg) { znode_t *zp = buf; - ASSERT(zp->z_dirlocks == 0); + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + ASSERT(ZTOV(zp) == NULL); + vn_free(ZTOV(zp)); + ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); rw_destroy(&zp->z_map_lock); rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); - mutex_destroy(&zp->z_range_lock); avl_destroy(&zp->z_range_avl); + mutex_destroy(&zp->z_range_lock); + + ASSERT(zp->z_dbuf == NULL); + ASSERT(zp->z_dirlocks == NULL); +} + +#ifdef ZNODE_STATS +static struct { + uint64_t zms_zfsvfs_invalid; + uint64_t zms_zfsvfs_unmounted; + uint64_t zms_zfsvfs_recheck_invalid; + uint64_t zms_obj_held; + uint64_t zms_vnode_locked; + uint64_t zms_not_only_dnlc; +} znode_move_stats; +#endif /* ZNODE_STATS */ + +#if defined(sun) +static void +zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) +{ + vnode_t *vp; + + /* Copy fields. */ + nzp->z_zfsvfs = ozp->z_zfsvfs; + + /* Swap vnodes. */ + vp = nzp->z_vnode; + nzp->z_vnode = ozp->z_vnode; + ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ + ZTOV(ozp)->v_data = ozp; + ZTOV(nzp)->v_data = nzp; + + nzp->z_id = ozp->z_id; + ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ + ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); + nzp->z_unlinked = ozp->z_unlinked; + nzp->z_atime_dirty = ozp->z_atime_dirty; + nzp->z_zn_prefetch = ozp->z_zn_prefetch; + nzp->z_blksz = ozp->z_blksz; + nzp->z_seq = ozp->z_seq; + nzp->z_mapcnt = ozp->z_mapcnt; + nzp->z_last_itx = ozp->z_last_itx; + nzp->z_gen = ozp->z_gen; + nzp->z_sync_cnt = ozp->z_sync_cnt; + nzp->z_phys = ozp->z_phys; + nzp->z_dbuf = ozp->z_dbuf; + + /* Update back pointers. */ + (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, + znode_evict_error); - ASSERT(zp->z_dbuf_held == 0); + /* + * Invalidate the original znode by clearing fields that provide a + * pointer back to the znode. Set the low bit of the vfs pointer to + * ensure that zfs_znode_move() recognizes the znode as invalid in any + * subsequent callback. 
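 *
 * Why the low-bit trick works, in sketch form: znodes are at least
 * 4-byte aligned, so POINTER_IS_VALID() (defined above as
 * !((uintptr_t)(p) & 0x3)) holds only for a live, published znode.
 * A freed buffer holds a kmem debug pattern such as 0xdeadbeef
 * (low two bits 11) or 0xbaddcafe (low two bits 10), and
 * POINTER_INVALIDATE() sets bit 0 explicitly, so all of those cases
 * fail the test.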
+ */ + ozp->z_dbuf = NULL; + POINTER_INVALIDATE(&ozp->z_zfsvfs); } +/* + * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise + * returns a non-zero error code. + */ +static int +zfs_enter(zfsvfs_t *zfsvfs) +{ + ZFS_ENTER(zfsvfs); + return (0); +} + +/*ARGSUSED*/ +static kmem_cbrc_t +zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) +{ + znode_t *ozp = buf, *nzp = newbuf; + zfsvfs_t *zfsvfs; + vnode_t *vp; + + /* + * The znode is on the file system's list of known znodes if the vfs + * pointer is valid. We set the low bit of the vfs pointer when freeing + * the znode to invalidate it, and the memory patterns written by kmem + * (baddcafe and deadbeef) set at least one of the two low bits. A newly + * created znode sets the vfs pointer last of all to indicate that the + * znode is known and in a valid state to be moved by this function. + */ + zfsvfs = ozp->z_zfsvfs; + if (!POINTER_IS_VALID(zfsvfs)) { + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * Ensure that the filesystem is not unmounted during the move. + */ + if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */ + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); + return (KMEM_CBRC_DONT_KNOW); + } + + mutex_enter(&zfsvfs->z_znodes_lock); + /* + * Recheck the vfs pointer in case the znode was removed just before + * acquiring the lock. + */ + if (zfsvfs != ozp->z_zfsvfs) { + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * At this point we know that as long as we hold z_znodes_lock, the + * znode cannot be freed and fields within the znode can be safely + * accessed. Now, prevent a race with zfs_zget(). + */ + if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); + return (KMEM_CBRC_LATER); + } + + vp = ZTOV(ozp); + if (mutex_tryenter(&vp->v_lock) == 0) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); + return (KMEM_CBRC_LATER); + } + + /* Only move znodes that are referenced _only_ by the DNLC. */ + if (vp->v_count != 1 || !vn_in_dnlc(vp)) { + mutex_exit(&vp->v_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); + return (KMEM_CBRC_LATER); + } + + /* + * The znode is known and in a valid state to move. We're holding the + * locks needed to execute the critical section. + */ + zfs_znode_move_impl(ozp, nzp); + mutex_exit(&vp->v_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); + + list_link_replace(&ozp->z_link_node, &nzp->z_link_node); + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + + return (KMEM_CBRC_YES); +} +#endif /* sun */ + void zfs_znode_init(void) { @@ -168,6 +360,9 @@ zfs_znode_init(void) znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL, zfs_znode_cache_destructor, NULL, NULL, NULL, 0); +#if defined(sun) + kmem_cache_set_move(znode_cache, zfs_znode_move); +#endif } void @@ -186,44 +381,43 @@ zfs_znode_fini(void) * incore "master" object. Verify version compatibility. 
*/ int -zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) +zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp) { objset_t *os = zfsvfs->z_os; - uint64_t version = ZPL_VERSION; int i, error; - dmu_object_info_t doi; uint64_t fsid_guid; + uint64_t zval; *zpp = NULL; - /* - * XXX - hack to auto-create the pool root filesystem at - * the first attempted mount. - */ - if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) { - dmu_tx_t *tx = dmu_tx_create(os); - - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */ - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */ - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */ - error = dmu_tx_assign(tx, TXG_WAIT); - ASSERT3U(error, ==, 0); - zfs_create_fs(os, cr, tx); - dmu_tx_commit(tx); - } - - error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1, - &version); + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error) { return (error); - } else if (version != ZPL_VERSION) { + } else if (zfsvfs->z_version > ZPL_VERSION) { (void) printf("Mismatched versions: File system " - "is version %lld on-disk format, which is " + "is version %llu on-disk format, which is " "incompatible with this software version %lld!", - (u_longlong_t)version, ZPL_VERSION); + (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); return (ENOTSUP); } + if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) + return (error); + zfsvfs->z_norm = (int)zval; + if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) + return (error); + zfsvfs->z_utf8 = (zval != 0); + if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) + return (error); + zfsvfs->z_case = (uint_t)zval; + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a @@ -244,9 +438,10 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) return (error); ASSERT(zfsvfs->z_root != 0); - /* - * Create the per mount vop tables. - */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error) + return (error); /* * Initialize zget mutex's @@ -255,14 +450,21 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); - if (error) + if (error) { + /* + * On error, we destroy the mutexes here since it's not + * possible for the caller to determine if the mutexes were + * initialized properly. 
+ */ + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); return (error); + } ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, - &zfsvfs->z_unlinkedobj); - if (error) - return (error); + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error == ENOENT) + error = 0; return (0); }
@@ -307,6 +509,50 @@ zfs_cmpldev(uint64_t dev) return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); }
+static void +zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db) +{ + znode_t *nzp; + + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); + + mutex_enter(&zp->z_lock); + + ASSERT(zp->z_dbuf == NULL); + zp->z_dbuf = db; + nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error); + + /* + * There should be no + * concurrent zgets on this object. + */ + if (nzp != NULL) + panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db); + + /* + * Slap on VROOT if we are the root znode + */ + if (zp->z_id == zfsvfs->z_root) + ZTOV(zp)->v_flag |= VROOT; + + mutex_exit(&zp->z_lock); + vn_exists(ZTOV(zp)); +} +
+void +zfs_znode_dmu_fini(znode_t *zp) +{ + dmu_buf_t *db = zp->z_dbuf; + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || + zp->z_unlinked || + RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); + ASSERT(zp->z_dbuf != NULL); + zp->z_dbuf = NULL; + VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL)); + dmu_buf_rele(db, NULL); +}
+ /* * Construct a new znode/vnode and initialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * -zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz) +zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) { znode_t *zp; vnode_t *vp; - int error; zp = kmem_cache_alloc(znode_cache, KM_SLEEP); - zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0); + zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0); ASSERT(zp->z_dirlocks == NULL); + ASSERT(zp->z_dbuf == NULL); + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); - zp->z_phys = db->db_data; - zp->z_zfsvfs = zfsvfs; + /* + * Defer setting z_zfsvfs until the znode is ready to be a candidate for + * the zfs_znode_move() callback.
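 *
 * The publication pattern used further down is, in sketch form:
 *
 *	zp->z_id = ...;			initialize every other field
 *	membar_producer();		order the stores
 *	zp->z_zfsvfs = zfsvfs;		publish; znode is now movable
 *
 * so any zfs_znode_move() callback that observes a valid z_zfsvfs is
 * guaranteed to observe a fully constructed znode as well.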
+ */ + zp->z_phys = NULL; zp->z_unlinked = 0; zp->z_atime_dirty = 0; - zp->z_dbuf_held = 0; zp->z_mapcnt = 0; zp->z_last_itx = 0; - zp->z_dbuf = db; - zp->z_id = obj_num; + zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; - mutex_enter(&zfsvfs->z_znodes_lock); - list_insert_tail(&zfsvfs->z_all_znodes, zp); - mutex_exit(&zfsvfs->z_znodes_lock); - vp = ZTOV(zp); +#ifdef TODO + vn_reinit(vp); +#endif + + zfs_znode_dmu_init(zfsvfs, zp, db); + + zp->z_gen = zp->z_phys->zp_gen; + +#if 0 if (vp == NULL) return (zp); - - vp->v_vflag |= VV_FORCEINSMQ; - error = insmntque(vp, zfsvfs->z_vfs); - vp->v_vflag &= ~VV_FORCEINSMQ; - KASSERT(error == 0, ("insmntque() failed: error %d", error)); +#endif vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); switch (vp->v_type) { @@ -362,37 +611,18 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz) break; } - return (zp); -} - -static void -zfs_znode_dmu_init(znode_t *zp) -{ - znode_t *nzp; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - dmu_buf_t *db = zp->z_dbuf; - - mutex_enter(&zp->z_lock); - - nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_pageout_func); - - /* - * there should be no - * concurrent zgets on this object. - */ - ASSERT3P(nzp, ==, NULL); - + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + membar_producer(); /* - * Slap on VROOT if we are the root znode + * Everything else must be valid before assigning z_zfsvfs makes the + * znode eligible for zfs_znode_move(). */ - if (zp->z_id == zfsvfs->z_root) { - ZTOV(zp)->v_flag |= VROOT; - } + zp->z_zfsvfs = zfsvfs; + mutex_exit(&zfsvfs->z_znodes_lock); - ASSERT(zp->z_dbuf_held == 0); - zp->z_dbuf_held = 1; VFS_HOLD(zfsvfs->z_vfs); - mutex_exit(&zp->z_lock); + return (zp); } /* @@ -406,31 +636,34 @@ zfs_znode_dmu_init(znode_t *zp) * IS_ROOT_NODE - new object will be root * IS_XATTR - new object is an attribute * IS_REPLAY - intent log replay + * bonuslen - length of bonus buffer + * setaclp - File/Dir initial ACL + * fuidp - Tracks fuid allocation. 
* - * OUT: oid - ID of created object + * OUT: zpp - allocated znode * */ void -zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, - uint_t flag, znode_t **zpp, int bonuslen) +zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp, + zfs_fuid_info_t **fuidp) { - dmu_buf_t *dbp; + dmu_buf_t *db; znode_phys_t *pzp; - znode_t *zp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; timestruc_t now; - uint64_t gen; + uint64_t gen, obj; int err; ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ - *oid = vap->va_nodeid; + obj = vap->va_nodeid; flag |= IS_REPLAY; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ } else { - *oid = 0; + obj = 0; gethrestime(&now); gen = dmu_tx_get_txg(tx); } @@ -446,44 +679,45 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, */ if (vap->va_type == VDIR) { if (flag & IS_REPLAY) { - err = zap_create_claim(zfsvfs->z_os, *oid, - DMU_OT_DIRECTORY_CONTENTS, + err = zap_create_claim_norm(zfsvfs->z_os, obj, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); ASSERT3U(err, ==, 0); } else { - *oid = zap_create(zfsvfs->z_os, - DMU_OT_DIRECTORY_CONTENTS, + obj = zap_create_norm(zfsvfs->z_os, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } else { if (flag & IS_REPLAY) { - err = dmu_object_claim(zfsvfs->z_os, *oid, + err = dmu_object_claim(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); ASSERT3U(err, ==, 0); } else { - *oid = dmu_object_alloc(zfsvfs->z_os, + obj = dmu_object_alloc(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } - VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp)); - dmu_buf_will_dirty(dbp, tx); + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db)); + dmu_buf_will_dirty(db, tx); /* * Initialize the znode physical data to zero. */ - ASSERT(dbp->db_size >= sizeof (znode_phys_t)); - bzero(dbp->db_data, dbp->db_size); - pzp = dbp->db_data; + ASSERT(db->db_size >= sizeof (znode_phys_t)); + bzero(db->db_data, db->db_size); + pzp = db->db_data; /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { + dzp->z_dbuf = db; dzp->z_phys = pzp; - dzp->z_id = *oid; + dzp->z_id = obj; } /* @@ -496,6 +730,9 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, pzp->zp_rdev = zfs_expldev(vap->va_rdev); } + if (zfsvfs->z_use_fuids) + pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + if (vap->va_type == VDIR) { pzp->zp_size = 2; /* contents ("." and "..") */ pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; @@ -523,25 +760,91 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, } pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); - zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0); - - zfs_perm_init(zp, dzp, flag, vap, tx, cr); + if (!(flag & IS_ROOT_NODE)) { + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + *zpp = zfs_znode_alloc(zfsvfs, db, 0); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); + } else { + /* + * If we are creating the root node, the "parent" we + * passed in is the znode for the root. 
+ */ + *zpp = dzp; + } + zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp); + if (!(flag & IS_ROOT_NODE)) { + vnode_t *vp; + + vp = ZTOV(*zpp); + vp->v_vflag |= VV_FORCEINSMQ; + err = insmntque(vp, zfsvfs->z_vfs); + vp->v_vflag &= ~VV_FORCEINSMQ; + KASSERT(err == 0, ("insmntque() failed: error %d", err)); + } +} - if (zpp) { - kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp); +void +zfs_xvattr_set(znode_t *zp, xvattr_t *xvap) +{ + xoptattr_t *xoap; - mutex_enter(hash_mtx); - zfs_znode_dmu_init(zp); - mutex_exit(hash_mtx); + xoap = xva_getxoptattr(xvap); + ASSERT(xoap); - *zpp = zp; - } else { - if (ZTOV(zp) != NULL) { - ZTOV(zp)->v_count = 0; - VOP_UNLOCK(ZTOV(zp), 0); - } - dmu_buf_rele(dbp, NULL); - zfs_znode_free(zp); + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly); + XVA_SET_RTN(xvap, XAT_READONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, + xoap->xoa_av_quarantined); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp)); + zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP; + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } } @@ -552,10 +855,10 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) dmu_buf_t *db; znode_t *zp; vnode_t *vp; - int err; + int err, first = 1; *zpp = NULL; - +again: ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); @@ -572,84 +875,118 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) return (EINVAL); } - ASSERT(db->db_object == obj_num); - ASSERT(db->db_offset == -1); - ASSERT(db->db_data != NULL); - zp = dmu_buf_get_user(db); - if (zp != NULL) { mutex_enter(&zp->z_lock); + /* + * Since we do immediate eviction of the z_dbuf, we + * should never find a dbuf with a znode that doesn't + * know about the dbuf. 
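 *
 * On FreeBSD there is one extra wrinkle handled below: the znode may
 * still be attached to the dbuf while its vnode is being destroyed (a
 * "dying" znode). The lookup then backs off and retries; the shape of
 * that loop, as a sketch of the code below, is:
 *
 *	again:
 *		take ZFS_OBJ_HOLD and z_lock, look up zp
 *		if (ZTOV(zp) == NULL) {
 *			drop the locks
 *			tsleep(zp, 0, "zcollide", 1);
 *			goto again;
 *		}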
+ */ + ASSERT3P(zp->z_dbuf, ==, db); ASSERT3U(zp->z_id, ==, obj_num); if (zp->z_unlinked) { - dmu_buf_rele(db, NULL); - mutex_exit(&zp->z_lock); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (ENOENT); - } else if (zp->z_dbuf_held) { - dmu_buf_rele(db, NULL); + err = ENOENT; } else { - zp->z_dbuf_held = 1; - VFS_HOLD(zfsvfs->z_vfs); - } - - if (ZTOV(zp) != NULL) - VN_HOLD(ZTOV(zp)); - else { - err = getnewvnode("zfs", zfsvfs->z_vfs, &zfs_vnodeops, - &zp->z_vnode); - ASSERT(err == 0); - vp = ZTOV(zp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - vp->v_data = (caddr_t)zp; - VN_LOCK_AREC(vp); - VN_LOCK_ASHARE(vp); - vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); - if (vp->v_type == VDIR) - zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ - vp->v_vflag |= VV_FORCEINSMQ; - err = insmntque(vp, zfsvfs->z_vfs); - vp->v_vflag &= ~VV_FORCEINSMQ; - KASSERT(err == 0, ("insmntque() failed: error %d", err)); - VOP_UNLOCK(vp, 0); + if (ZTOV(zp) != NULL) + VN_HOLD(ZTOV(zp)); + else { + if (first) { + ZFS_LOG(1, "dying znode detected (zp=%p)", zp); + first = 0; + } + /* + * znode is dying so we can't reuse it, we must + * wait until destruction is completed. + */ + dmu_buf_rele(db, NULL); + mutex_exit(&zp->z_lock); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + tsleep(zp, 0, "zcollide", 1); + goto again; + } + *zpp = zp; + err = 0; } + dmu_buf_rele(db, NULL); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - *zpp = zp; - return (0); + return (err); } /* * Not found create new znode/vnode */ - zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size); - ASSERT3U(zp->z_id, ==, obj_num); - zfs_znode_dmu_init(zp); + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); + + vp = ZTOV(zp); + vp->v_vflag |= VV_FORCEINSMQ; + err = insmntque(vp, zfsvfs->z_vfs); + vp->v_vflag &= ~VV_FORCEINSMQ; + KASSERT(err == 0, ("insmntque() failed: error %d", err)); + VOP_UNLOCK(vp, 0); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); *zpp = zp; - if ((vp = ZTOV(zp)) != NULL) - VOP_UNLOCK(vp, 0); return (0); } -void -zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +int +zfs_rezget(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; + dmu_object_info_t doi; + dmu_buf_t *db; + uint64_t obj_num = zp->z_id; + int err; + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); - ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); - if (zp->z_phys->zp_acl.z_acl_extern_obj) { - error = dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx); - ASSERT3U(error, ==, 0); + err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (err); } - error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx); - ASSERT3U(error, ==, 0); - zp->z_dbuf_held = 0; - ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); - dmu_buf_rele(zp->z_dbuf, NULL); + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_ZNODE || + doi.doi_bonus_size < sizeof (znode_phys_t)) { + dmu_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (EINVAL); + } + + if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) { + dmu_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (EIO); + } + + zfs_znode_dmu_init(zfsvfs, zp, db); + zp->z_unlinked = (zp->z_phys->zp_links == 0); + zp->z_blksz = doi.doi_data_block_size; + + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + + return (0); +} + +void +zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + uint64_t obj = zp->z_id; + uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; + + 
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + if (acl_obj) + VERIFY(0 == dmu_object_free(os, acl_obj, tx)); + VERIFY(0 == dmu_object_free(os, obj, tx)); + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); + zfs_znode_free(zp); } void
@@ -659,7 +996,7 @@ zfs_zinactive(znode_t *zp) zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t z_id = zp->z_id; - ASSERT(zp->z_dbuf_held && zp->z_phys); + ASSERT(zp->z_dbuf && zp->z_phys); /* * Don't allow a zfs_zget() while we're trying to release this znode
@@ -686,17 +1023,13 @@ * remove the file from the file system. */ if (zp->z_unlinked) { - ZTOV(zp) = NULL; mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); ASSERT(vp->v_count == 0); vrecycle(vp, curthread); zfs_rmnode(zp); - VFS_RELE(zfsvfs->z_vfs); return; } - ASSERT(zp->z_phys); - ASSERT(zp->z_dbuf_held); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); }
@@ -706,11 +1039,15 @@ zfs_znode_free(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; + ASSERT(ZTOV(zp) == NULL); mutex_enter(&zfsvfs->z_znodes_lock); + POINTER_INVALIDATE(&zp->z_zfsvfs); list_remove(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); kmem_cache_free(znode_cache, zp); + + VFS_RELE(zfsvfs->z_vfs); } void
@@ -733,11 +1070,17 @@ zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) if (flag & AT_ATIME) ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); - if (flag & AT_MTIME) + if (flag & AT_MTIME) { ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); + if (zp->z_zfsvfs->z_use_fuids) + zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); + } - if (flag & AT_CTIME) + if (flag & AT_CTIME) { ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); + if (zp->z_zfsvfs->z_use_fuids) + zp->z_phys->zp_flags |= ZFS_ARCHIVE; + } } /*
@@ -796,113 +1139,195 @@ zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) }
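
zfs_extend() and zfs_trunc() below (and the logging tail of zfs_freesp()) all share the same transaction-assignment retry idiom; condensed into a sketch, with every name taken from the surrounding code:

	top:
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, zp->z_id);
		error = dmu_tx_assign(tx, zfsvfs->z_assign);
		if (error) {
			if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
				/* txg was full: wait for the next one, then retry */
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			dmu_tx_abort(tx);
			return (error);
		}
		/* ... apply the changes covered by the holds ... */
		dmu_tx_commit(tx);
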
/* - * Free space in a file. + * Increase the file length * * IN: zp - znode of file to free data in. - * off - start of section to free. - * len - length of section to free (0 => to EOF). - * flag - current file open mode flags. + * end - new end-of-file * * RETURN: 0 if success * error code if failure */ -int -zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +static int +zfs_extend(znode_t *zp, uint64_t end) { - vnode_t *vp = ZTOV(zp); - dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; + dmu_tx_t *tx; rl_t *rl; - uint64_t end = off + len; - uint64_t size, new_blksz; + uint64_t newblksz; int error; - if (ZTOV(zp)->v_type == VFIFO) - return (0); - /* - * If we will change zp_size then lock the whole file, - * otherwise just lock the range being freed. + * We will change zp_size, lock the whole file. */ - if (len == 0 || off + len > zp->z_phys->zp_size) { - rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); - } else { - rl = zfs_range_lock(zp, off, len, RL_WRITER); - /* recheck, in case zp_size changed */ - if (off + len > zp->z_phys->zp_size) { - /* lost race: file size changed, lock whole file */ - zfs_range_unlock(rl); - rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); - } - } + rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ - size = zp->z_phys->zp_size; - if (len == 0 && size == off && off != 0) { + if (end <= zp->z_phys->zp_size) { zfs_range_unlock(rl); return (0); } - +top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - new_blksz = 0; - if (end > size && + if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end, SPA_MAXBLOCKSIZE); + newblksz = MIN(end, SPA_MAXBLOCKSIZE); } else { - new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz); + newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); } - dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz)); - } else if (off < size) { - /* - * If len == 0, we are truncating the file. - */ - dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END); + dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); + } else { + newblksz = 0; } error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } dmu_tx_abort(tx); zfs_range_unlock(rl); return (error); } + dmu_buf_will_dirty(zp->z_dbuf, tx); + + if (newblksz) + zfs_grow_blocksize(zp, newblksz, tx); - if (new_blksz) - zfs_grow_blocksize(zp, new_blksz, tx); + zp->z_phys->zp_size = end; - if (end > size || len == 0) - zp->z_phys->zp_size = end; + zfs_range_unlock(rl); - if (off < size) { - objset_t *os = zfsvfs->z_os; - uint64_t rlen = len; + dmu_tx_commit(tx); - if (len == 0) - rlen = -1; - else if (end > size) - rlen = size - off; - VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx)); + rw_enter(&zp->z_map_lock, RW_WRITER); + error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); + ASSERT(error == 0); + vnode_pager_setsize(ZTOV(zp), end); + rw_exit(&zp->z_map_lock); + + return (0); +} +
+/* + * Free space in a file. + * + * IN: zp - znode of file to free data in. + * off - start of section to free. + * len - length of section to free. + * + * RETURN: 0 if success + * error code if failure + */ +static int +zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + rl_t *rl; + int error; + + /* + * Lock the range being freed. + */ + rl = zfs_range_lock(zp, off, len, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (off >= zp->z_phys->zp_size) { + zfs_range_unlock(rl); + return (0); } - if (log) { - zfs_time_stamper(zp, CONTENT_MODIFIED, tx); - zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + if (off + len > zp->z_phys->zp_size) + len = zp->z_phys->zp_size - off; + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); + + if (error == 0) { + /* + * In FreeBSD we cannot free a block in the middle of a file, + * but only at the end of a file. + */ + rw_enter(&zp->z_map_lock, RW_WRITER); + error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); + ASSERT(error == 0); + vnode_pager_setsize(ZTOV(zp), off); + rw_exit(&zp->z_map_lock); } zfs_range_unlock(rl); + return (error); +} +
+/* + * Truncate a file + * + * IN: zp - znode of file to free data in. + * end - new end-of-file. + * + * RETURN: 0 if success + * error code if failure + */ +static int +zfs_trunc(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + rl_t *rl; + int error; + + /* + * We will change zp_size, lock the whole file.
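 *
 * The locking idiom here is the same as in zfs_extend() above; in
 * sketch form:
 *
 *	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
 *	...examine and update zp_phys->zp_size...
 *	zfs_range_unlock(rl);
 *
 * while zfs_free_range() above locks only the (off, len) range it
 * actually touches.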
+ */ + rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end >= zp->z_phys->zp_size) { + zfs_range_unlock(rl); + return (0); + } + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); + if (error) { + zfs_range_unlock(rl); + return (error); + } +top: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_range_unlock(rl); + return (error); + } + dmu_buf_will_dirty(zp->z_dbuf, tx); + + zp->z_phys->zp_size = end; + dmu_tx_commit(tx); + zfs_range_unlock(rl); + /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of @@ -910,30 +1335,90 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) * about to invalidate. */ rw_enter(&zp->z_map_lock, RW_WRITER); - if (end > size) - vnode_pager_setsize(vp, end); - else if (len == 0) { #if 0 - error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE); + error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE); #else - error = vinvalbuf(vp, V_SAVE, 0, 0); - vnode_pager_setsize(vp, end); + error = vinvalbuf(vp, V_SAVE, 0, 0); + ASSERT(error == 0); + vnode_pager_setsize(vp, end); #endif - } rw_exit(&zp->z_map_lock); return (0); } +/* + * Free space in a file + * + * IN: zp - znode of file to free data in. + * off - start of range + * len - end of range (0 => EOF) + * flag - current file open mode flags. + * log - TRUE if this action should be logged + * + * RETURN: 0 if success + * error code if failure + */ +int +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +{ + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + int error; + + if (off > zp->z_phys->zp_size) { + error = zfs_extend(zp, off+len); + if (error == 0 && log) + goto log; + else + return (error); + } + + if (len == 0) { + error = zfs_trunc(zp, off); + } else { + if ((error = zfs_free_range(zp, off, len)) == 0 && + off + len > zp->z_phys->zp_size) + error = zfs_extend(zp, off+len); + } + if (error || !log) + return (error); +log: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto log; + } + dmu_tx_abort(tx); + return (error); + } + + zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + + dmu_tx_commit(tx); + return (0); +} + void -zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) +zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { zfsvfs_t zfsvfs; - uint64_t moid, doid, roid = 0; - uint64_t version = ZPL_VERSION; + uint64_t moid, doid, version; + uint64_t sense = ZFS_CASE_SENSITIVE; + uint64_t norm = 0; + nvpair_t *elem; int error; znode_t *rootzp = NULL; + vnode_t *vp; vattr_t vattr; + znode_t *zp; /* * First attempt to create master node. @@ -950,9 +1435,35 @@ zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) /* * Set starting attributes. 
*/ - - error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx); - ASSERT(error == 0); + if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) + version = ZPL_VERSION; + else + version = ZPL_VERSION_FUID - 1; + error = zap_update(os, moid, ZPL_VERSION_STR, + 8, 1, &version, tx); + elem = NULL; + while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { + /* For the moment we expect all zpl props to be uint64_ts */ + uint64_t val; + char *name; + + ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); + VERIFY(nvpair_value_uint64(elem, &val) == 0); + name = nvpair_name(elem); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { + version = val; + error = zap_update(os, moid, ZPL_VERSION_STR, + 8, 1, &version, tx); + } else { + error = zap_update(os, moid, name, 8, 1, &val, tx); + } + ASSERT(error == 0); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) + norm = val; + else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) + sense = val; + } + ASSERT(version != 0); /* * Create a delete queue. @@ -966,39 +1477,62 @@ zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) * Create root znode. Create minimal znode/vnode/zfsvfs * to allow zfs_mknode to work. */ + VATTR_NULL(&vattr); vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0755; - vattr.va_uid = UID_ROOT; - vattr.va_gid = GID_WHEEL; + vattr.va_uid = crgetuid(cr); + vattr.va_gid = crgetgid(cr); rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); - zfs_znode_cache_constructor(rootzp, NULL, 0); - rootzp->z_zfsvfs = &zfsvfs; + zfs_znode_cache_constructor(rootzp, &zfsvfs, 0); rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; - rootzp->z_dbuf_held = 0; + + vp = ZTOV(rootzp); + vp->v_type = VDIR; bzero(&zfsvfs, sizeof (zfsvfs_t)); zfsvfs.z_os = os; zfsvfs.z_assign = TXG_NOWAIT; zfsvfs.z_parent = &zfsvfs; + zfsvfs.z_version = version; + zfsvfs.z_use_fuids = USE_FUIDS(version, os); + zfsvfs.z_norm = norm; + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) + zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER; mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); - zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0); - ASSERT3U(rootzp->z_id, ==, roid); - error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx); + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); + rootzp->z_zfsvfs = &zfsvfs; + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL); + ASSERT3P(zp, ==, rootzp); + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); + POINTER_INVALIDATE(&rootzp->z_zfsvfs); + VI_LOCK(vp); + ZTOV(rootzp)->v_data = NULL; + ZTOV(rootzp)->v_count = 0; + ZTOV(rootzp)->v_holdcnt = 0; + ZTOV(rootzp) = NULL; + VOP_UNLOCK(vp, 0); + vdestroy(vp); + dmu_buf_rele(rootzp->z_dbuf, NULL); + rootzp->z_dbuf = NULL; mutex_destroy(&zfsvfs.z_znodes_lock); kmem_cache_free(znode_cache, rootzp); } -#endif /* _KERNEL */ +#endif /* _KERNEL */ /* * Given an object number, return its parent object number and whether * or not the object is an extended attribute directory. 
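
Given the signature visible in the hunk below, a hypothetical in-kernel caller of zfs_obj_to_path() would look roughly like this (the buffer size and the printf are illustrative only):

	char path[MAXPATHLEN];
	int error;

	/* walks parent links upward from obj, assembling the path */
	error = zfs_obj_to_path(osp, obj, path, sizeof(path));
	if (error == 0)
		printf("object %ju: %s\n", (uintmax_t)obj, path);
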
@@ -1058,7 +1592,8 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) if (is_xattrdir) { (void) sprintf(component + 1, "<xattrdir>"); } else { - error = zap_value_search(osp, pobj, obj, component + 1); + error = zap_value_search(osp, pobj, obj, + ZFS_DIRENT_OBJ(-1ULL), component + 1); if (error != 0) break; } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c index 69ee509d50ed..1f6fa0db9460 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/dmu.h> @@ -174,7 +172,11 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp) *abufpp = NULL; - error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array, + /* + * We shouldn't be doing any scrubbing while we're doing log + * replay, so it's OK not to lock. + */ + error = arc_read_nolock(NULL, zilog->zl_spa, &blk, arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb); @@ -185,17 +187,20 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp) zio_cksum_t cksum = bp->blk_cksum; /* + * Validate the checksummed log block. + * * Sequence numbers should be... sequential. The checksum * verifier for the next block should be bp's checksum plus 1. + * + * Also check the log chain linkage and size used. */ cksum.zc_word[ZIL_ZC_SEQ]++; - if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum))) - error = ESTALE; - else if (BP_IS_HOLE(&ztp->zit_next_blk)) - error = ENOENT; - else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) - error = EOVERFLOW; + if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, + sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) || + (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))) { + error = ECKSUM; + } if (error) { VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1); @@ -290,7 +295,8 @@ zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) */ if (bp->blk_birth >= first_txg && zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) { - err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL)); + err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL, + ZIO_FLAG_MUSTSUCCEED)); ASSERT(err == 0); } } @@ -430,6 +436,16 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) mutex_enter(&zilog->zl_lock); + /* + * It is possible for the ZIL to get the previously mounted zilog + * structure of the same dataset if it is quickly remounted and the + * dbuf eviction has not completed. In this case we can see a + * non-empty lwb list and keep_first will be set. We fix this by + * clearing keep_first. This will be slower, but it is very rare. + */ + if (!list_is_empty(&zilog->zl_lwb_list) && keep_first) + keep_first = B_FALSE; + ASSERT3U(zilog->zl_destroy_txg, <, txg); zilog->zl_destroy_txg = txg; zilog->zl_keep_first = keep_first; @@ -453,12 +469,37 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); +} + +/* + * zil_rollback_destroy() is only called by the rollback code. + * We already have a syncing tx.
Rollback has exclusive access to the + * dataset, so we don't have to worry about concurrent zil access. + * The actual freeing of any log blocks occurs in zil_sync() later in + * this txg syncing phase. + */ +void +zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx) +{ + const zil_header_t *zh = zilog->zl_header; + uint64_t txg; - if (keep_first) /* no need to wait in this case */ + if (BP_IS_HOLE(&zh->zh_log)) return; - txg_wait_synced(zilog->zl_dmu_pool, txg); - ASSERT(BP_IS_HOLE(&zh->zh_log)); + txg = dmu_tx_get_txg(tx); + ASSERT3U(zilog->zl_destroy_txg, <, txg); + zilog->zl_destroy_txg = txg; + zilog->zl_keep_first = B_FALSE; + + /* + * Ensure there's no outstanding ZIL IO. Either no lwbs at all, or + * just the unused one allocated in advance, is OK. + */ + ASSERT(zilog->zl_lwb_list.list_head.list_next == + zilog->zl_lwb_list.list_head.list_prev); + (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, + tx, zh->zh_claim_txg); } int @@ -471,9 +512,9 @@ zil_claim(char *osname, void *txarg) objset_t *os; int error; - error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os); + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); if (error) { - cmn_err(CE_WARN, "can't process intent log for %s", osname); + cmn_err(CE_WARN, "can't open objset for %s", osname); return (0); } @@ -500,104 +541,164 @@ zil_claim(char *osname, void *txarg) return (0); } -void -zil_add_vdev(zilog_t *zilog, uint64_t vdev) +/* + * Check the log by walking the log chain. + * Checksum errors are OK, as they indicate the end of the chain. + * Any other error (no device or read failure) is returned. + */ +/* ARGSUSED */ +int +zil_check_log_chain(char *osname, void *txarg) { - zil_vdev_t *zv, *new; - uint64_t bmap_sz = sizeof (zilog->zl_vdev_bmap) << 3; - uchar_t *cp; + zilog_t *zilog; + zil_header_t *zh; + blkptr_t blk; + arc_buf_t *abuf; + objset_t *os; + char *lrbuf; + zil_trailer_t *ztp; + int error; - if (zfs_nocacheflush) - return; + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) { + cmn_err(CE_WARN, "can't open objset for %s", osname); + return (0); + } - if (vdev < bmap_sz) { - cp = zilog->zl_vdev_bmap + (vdev / 8); - atomic_or_8(cp, 1 << (vdev % 8)); - } else { - /* - * insert into ordered list - */ - mutex_enter(&zilog->zl_lock); - for (zv = list_head(&zilog->zl_vdev_list); zv != NULL; - zv = list_next(&zilog->zl_vdev_list, zv)) { - if (zv->vdev == vdev) { - /* duplicate found - just return */ - mutex_exit(&zilog->zl_lock); - return; - } - if (zv->vdev > vdev) { - /* insert before this entry */ - new = kmem_alloc(sizeof (zil_vdev_t), - KM_SLEEP); - new->vdev = vdev; - list_insert_before(&zilog->zl_vdev_list, - zv, new); - mutex_exit(&zilog->zl_lock); - return; - } - } - /* ran off end of list, insert at the end */ - ASSERT(zv == NULL); - new = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP); - new->vdev = vdev; - list_insert_tail(&zilog->zl_vdev_list, new); - mutex_exit(&zilog->zl_lock); + zilog = dmu_objset_zil(os); + zh = zil_header_in_syncing_context(zilog); + blk = zh->zh_log; + if (BP_IS_HOLE(&blk)) { + dmu_objset_close(os); + return (0); /* no chain */ + } + + for (;;) { + error = zil_read_log_block(zilog, &blk, &abuf); + if (error) + break; + lrbuf = abuf->b_data; + ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1; + blk = ztp->zit_next_blk; + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + } + dmu_objset_close(os); + if (error == ECKSUM) + return (0); /* normal end of chain */ + return (error); } + +/* + * Clear a log chain + */ +/* ARGSUSED 
*/ +int +zil_clear_log_chain(char *osname, void *txarg) +{ + zilog_t *zilog; + zil_header_t *zh; + objset_t *os; + dmu_tx_t *tx; + int error; + + error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + if (error) { + cmn_err(CE_WARN, "can't open objset for %s", osname); + return (0); } + + zilog = dmu_objset_zil(os); + tx = dmu_tx_create(zilog->zl_os); + (void) dmu_tx_assign(tx, TXG_WAIT); + zh = zil_header_in_syncing_context(zilog); + BP_ZERO(&zh->zh_log); + dsl_dataset_dirty(dmu_objset_ds(os), tx); + dmu_tx_commit(tx); + dmu_objset_close(os); + return (0); +} + +static int +zil_vdev_compare(const void *x1, const void *x2) +{ + uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; + uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; + + if (v1 < v2) + return (-1); + if (v1 > v2) + return (1); + + return (0); } -/* start an async flush of the write cache for this vdev */ void -zil_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio) +zil_add_block(zilog_t *zilog, blkptr_t *bp) { - vdev_t *vd; + avl_tree_t *t = &zilog->zl_vdev_tree; + avl_index_t where; + zil_vdev_node_t *zv, zvsearch; + int ndvas = BP_GET_NDVAS(bp); + int i; - if (*zio == NULL) - *zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + if (zfs_nocacheflush) + return; - vd = vdev_lookup_top(spa, vdev); - ASSERT(vd); + ASSERT(zilog->zl_writer); - (void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); + /* + * Even though we're zl_writer, we still need a lock because the + * zl_get_data() callbacks may have dmu_sync() done callbacks + * that will run concurrently. + */ + mutex_enter(&zilog->zl_vdev_lock); + for (i = 0; i < ndvas; i++) { + zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); + if (avl_find(t, &zvsearch, &where) == NULL) { + zv = kmem_alloc(sizeof (*zv), KM_SLEEP); + zv->zv_vdev = zvsearch.zv_vdev; + avl_insert(t, zv, where); + } + } + mutex_exit(&zilog->zl_vdev_lock); } void zil_flush_vdevs(zilog_t *zilog) { - zil_vdev_t *zv; - zio_t *zio = NULL; spa_t *spa = zilog->zl_spa; - uint64_t vdev; - uint8_t b; - int i, j; + avl_tree_t *t = &zilog->zl_vdev_tree; + void *cookie = NULL; + zil_vdev_node_t *zv; + zio_t *zio; ASSERT(zilog->zl_writer); - for (i = 0; i < sizeof (zilog->zl_vdev_bmap); i++) { - b = zilog->zl_vdev_bmap[i]; - if (b == 0) - continue; - for (j = 0; j < 8; j++) { - if (b & (1 << j)) { - vdev = (i << 3) + j; - zil_flush_vdev(spa, vdev, &zio); - } - } - zilog->zl_vdev_bmap[i] = 0; - } + /* + * We don't need zl_vdev_lock here because we're the zl_writer, + * and all zl_get_data() callbacks are done. + */ + if (avl_numnodes(t) == 0) + return; + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) { - zil_flush_vdev(spa, zv->vdev, &zio); - list_remove(&zilog->zl_vdev_list, zv); - kmem_free(zv, sizeof (zil_vdev_t)); + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { + vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); + if (vd != NULL) + zio_flush(zio, vd); + kmem_free(zv, sizeof (*zv)); } + /* * Wait for all the flushes to complete. Not all devices actually * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails. 
*/ - if (zio) - (void) zio_wait(zio); + (void) zio_wait(zio); + + spa_config_exit(spa, SCL_STATE, FTAG); } /* @@ -609,6 +710,15 @@ zil_lwb_write_done(zio_t *zio) lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; + ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); + ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG); + ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); + ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); + ASSERT(!BP_IS_GANG(zio->io_bp)); + ASSERT(!BP_IS_HOLE(zio->io_bp)); + ASSERT(zio->io_bp->blk_fill == 0); + /* * Now that we've written this log block, we have a stable pointer * to the next block in the chain, so it's OK to let the txg in @@ -619,19 +729,13 @@ zil_lwb_write_done(zio_t *zio) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_buf = NULL; - if (zio->io_error) { + if (zio->io_error) zilog->zl_log_error = B_TRUE; - mutex_exit(&zilog->zl_lock); - return; - } mutex_exit(&zilog->zl_lock); } /* * Initialize the io for a log block. - * - * Note, we should not initialize the IO until we are about - * to use it, since zio_rewrite() does a spa_config_enter(). */ static void zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) @@ -649,9 +753,9 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) } if (lwb->lwb_zio == NULL) { lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf, + 0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb, - ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_CANFAIL, &zb); } } @@ -751,8 +855,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) list_insert_tail(&zilog->zl_lwb_list, nlwb); mutex_exit(&zilog->zl_lock); - /* Record the vdev for later flushing */ - zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk)))); + /* Record the block for later vdev flushing */ + zil_add_block(zilog, &lwb->lwb_blk); /* * kick off the write for the old log block @@ -848,7 +952,7 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) } itx_t * -zil_itx_create(int txtype, size_t lrsize) +zil_itx_create(uint64_t txtype, size_t lrsize) { itx_t *itx; @@ -857,6 +961,7 @@ zil_itx_create(int txtype, size_t lrsize) itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; + itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */ itx->itx_lr.lrc_seq = 0; /* defensive */ return (itx); @@ -871,7 +976,7 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_itx_list, itx); - zilog->zl_itx_list_sz += itx->itx_lr.lrc_reclen; + zilog->zl_itx_list_sz += itx->itx_sod; itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq; mutex_exit(&zilog->zl_lock); @@ -907,7 +1012,7 @@ zil_itx_clean(zilog_t *zilog) while ((itx = list_head(&zilog->zl_itx_list)) != NULL && itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) { list_remove(&zilog->zl_itx_list, itx); - zilog->zl_itx_list_sz -= itx->itx_lr.lrc_reclen; + zilog->zl_itx_list_sz -= itx->itx_sod; list_insert_tail(&clean_list, itx); } cv_broadcast(&zilog->zl_cv_writer); @@ -941,18 +1046,17 @@ zil_clean(zilog_t *zilog) mutex_exit(&zilog->zl_lock); } -void +static void zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) { uint64_t txg; - uint64_t reclen; uint64_t commit_seq = 0; itx_t *itx, *itx_next = (itx_t *)-1; lwb_t 
*lwb; spa_t *spa; zilog->zl_writer = B_TRUE; - zilog->zl_root_zio = NULL; + ASSERT(zilog->zl_root_zio == NULL); spa = zilog->zl_spa; if (zilog->zl_suspend) { @@ -1009,10 +1113,9 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) if (itx == NULL) break; - reclen = itx->itx_lr.lrc_reclen; if ((itx->itx_lr.lrc_seq > seq) && ((lwb == NULL) || (lwb->lwb_nused == 0) || - (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)))) { + (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) { break; } @@ -1024,6 +1127,7 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) */ itx_next = list_next(&zilog->zl_itx_list, itx); list_remove(&zilog->zl_itx_list, itx); + zilog->zl_itx_list_sz -= itx->itx_sod; mutex_exit(&zilog->zl_lock); txg = itx->itx_lr.lrc_txg; ASSERT(txg); @@ -1034,7 +1138,6 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); mutex_enter(&zilog->zl_lock); - zilog->zl_itx_list_sz -= reclen; } DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); /* determine commit sequence number */ @@ -1058,9 +1161,9 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) if (zilog->zl_root_zio) { DTRACE_PROBE1(zil__cw3, zilog_t *, zilog); (void) zio_wait(zilog->zl_root_zio); + zilog->zl_root_zio = NULL; DTRACE_PROBE1(zil__cw4, zilog_t *, zilog); - if (!zfs_nocacheflush) - zil_flush_vdevs(zilog); + zil_flush_vdevs(zilog); } if (zilog->zl_log_error || lwb == NULL) { @@ -1195,8 +1298,6 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog->zl_destroy_txg = TXG_INITIAL - 1; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); - cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); list_create(&zilog->zl_itx_list, sizeof (itx_t), offsetof(itx_t, itx_node)); @@ -1204,8 +1305,13 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) list_create(&zilog->zl_lwb_list, sizeof (lwb_t), offsetof(lwb_t, lwb_node)); - list_create(&zilog->zl_vdev_list, sizeof (zil_vdev_t), - offsetof(zil_vdev_t, vdev_seq_node)); + mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&zilog->zl_vdev_tree, zil_vdev_compare, + sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); + + cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); + cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); return (zilog); } @@ -1214,7 +1320,6 @@ void zil_free(zilog_t *zilog) { lwb_t *lwb; - zil_vdev_t *zv; zilog->zl_stop_sync = 1; @@ -1226,38 +1331,36 @@ zil_free(zilog_t *zilog) } list_destroy(&zilog->zl_lwb_list); - while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) { - list_remove(&zilog->zl_vdev_list, zv); - kmem_free(zv, sizeof (zil_vdev_t)); - } - list_destroy(&zilog->zl_vdev_list); + avl_destroy(&zilog->zl_vdev_tree); + mutex_destroy(&zilog->zl_vdev_lock); ASSERT(list_head(&zilog->zl_itx_list) == NULL); list_destroy(&zilog->zl_itx_list); - cv_destroy(&zilog->zl_cv_suspend); - cv_destroy(&zilog->zl_cv_writer); mutex_destroy(&zilog->zl_lock); + cv_destroy(&zilog->zl_cv_writer); + cv_destroy(&zilog->zl_cv_suspend); + kmem_free(zilog, sizeof (zilog_t)); } /* * return true if the initial log block is not valid */ -static int +static boolean_t zil_empty(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; arc_buf_t *abuf = NULL; if (BP_IS_HOLE(&zh->zh_log)) - return (1); + return (B_TRUE); if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0) - return (1); + return (B_TRUE); VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); - return (0); + return 
(B_FALSE); } /* @@ -1326,7 +1429,6 @@ zil_suspend(zilog_t *zilog) */ while (zilog->zl_suspending) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); - ASSERT(BP_IS_HOLE(&zh->zh_log)); mutex_exit(&zilog->zl_lock); return (0); } @@ -1346,7 +1448,6 @@ zil_suspend(zilog_t *zilog) zil_destroy(zilog, B_FALSE); mutex_enter(&zilog->zl_lock); - ASSERT(BP_IS_HOLE(&zh->zh_log)); zilog->zl_suspending = B_FALSE; cv_broadcast(&zilog->zl_cv_suspend); mutex_exit(&zilog->zl_lock); @@ -1366,6 +1467,7 @@ zil_resume(zilog_t *zilog) typedef struct zil_replay_arg { objset_t *zr_os; zil_replay_func_t **zr_replay; + zil_replay_cleaner_t *zr_replay_cleaner; void *zr_arg; uint64_t *zr_txgp; boolean_t zr_byteswap; @@ -1391,6 +1493,9 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ return; + /* Strip case-insensitive bit, still present in log record */ + txtype &= ~TX_CI; + /* * Make a copy of the data so we can revise and extend it. */ @@ -1465,10 +1570,12 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) * On the first pass, arrange for the replay vector * to fail its dmu_tx_assign(). That's the only way * to ensure that those code paths remain well tested. + * + * Only byteswap (if needed) on the 1st pass. */ *zr->zr_txgp = replay_txg - (pass == 1); error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, - zr->zr_byteswap); + zr->zr_byteswap && pass == 1); *zr->zr_txgp = TXG_NOWAIT; } @@ -1491,6 +1598,8 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) * transaction. */ if (error != ERESTART && !sunk) { + if (zr->zr_replay_cleaner) + zr->zr_replay_cleaner(zr->zr_arg); txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); sunk = B_TRUE; continue; /* retry */ @@ -1510,8 +1619,9 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) name = kmem_alloc(MAXNAMELEN, KM_SLEEP); dmu_objset_name(zr->zr_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " - "dataset %s, seq 0x%llx, txtype %llu\n", - error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype); + "dataset %s, seq 0x%llx, txtype %llu %s\n", + error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype, + (lr->lrc_txtype & TX_CI) ? 
"CI" : ""); zilog->zl_stop_replay = 1; kmem_free(name, MAXNAMELEN); } @@ -1528,7 +1638,8 @@ zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) */ void zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE]) + zil_replay_func_t *replay_func[TX_MAX_TYPE], + zil_replay_cleaner_t *replay_cleaner) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; @@ -1542,6 +1653,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp, zr.zr_os = os; zr.zr_replay = replay_func; + zr.zr_replay_cleaner = replay_cleaner; zr.zr_arg = arg; zr.zr_txgp = txgp; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); @@ -1560,6 +1672,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp, kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE); zil_destroy(zilog, B_FALSE); + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); //printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index b5dd35f5599e..4650d42b7c2f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/fm/fs/zfs.h> #include <sys/spa.h> @@ -61,23 +59,9 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { char *zio_type_name[ZIO_TYPES] = { "null", "read", "write", "free", "claim", "ioctl" }; -/* At or above this size, force gang blocking - for testing */ -uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; - -/* Force an allocation failure when non-zero */ -uint16_t zio_zil_fail_shift = 0; - -typedef struct zio_sync_pass { - int zp_defer_free; /* defer frees after this pass */ - int zp_dontcompress; /* don't compress after this pass */ - int zp_rewrite; /* rewrite new bps after this pass */ -} zio_sync_pass_t; - -zio_sync_pass_t zio_sync_pass = { - 1, /* zp_defer_free */ - 4, /* zp_dontcompress */ - 1, /* zp_rewrite */ -}; +#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */ +#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ +#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */ /* * ========================================================================== @@ -94,6 +78,13 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; extern vmem_t *zio_alloc_arena; #endif +/* + * An allocating zio is one that either currently has the DVA allocate + * stage set or will have it later in its lifetime. 
+ */ +#define IO_IS_ALLOCATING(zio) \ + ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) + void zio_init(void) { @@ -107,7 +98,6 @@ zio_init(void) data_alloc_arena = zio_alloc_arena; #endif #endif - zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); @@ -144,9 +134,6 @@ zio_init(void) zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NODEBUG); - - dprintf("creating cache for size %5lx align %5lx\n", - size, align); } } @@ -212,7 +199,7 @@ zio_buf_alloc(size_t size) ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); + return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); #else return (kmem_alloc(size, KM_SLEEP)); #endif @@ -232,7 +219,7 @@ zio_data_buf_alloc(size_t size) ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP)); + return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); #else return (kmem_alloc(size, KM_SLEEP)); #endif @@ -272,13 +259,15 @@ zio_data_buf_free(void *buf, size_t size) * ========================================================================== */ static void -zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) +zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, + zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); - zt->zt_data = data; - zt->zt_size = size; + zt->zt_orig_data = zio->io_data; + zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; + zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; @@ -288,128 +277,233 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) } static void -zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) +zio_pop_transforms(zio_t *zio) { - zio_transform_t *zt = zio->io_transform_stack; + zio_transform_t *zt; + + while ((zt = zio->io_transform_stack) != NULL) { + if (zt->zt_transform != NULL) + zt->zt_transform(zio, + zt->zt_orig_data, zt->zt_orig_size); - *data = zt->zt_data; - *size = zt->zt_size; - *bufsize = zt->zt_bufsize; + zio_buf_free(zio->io_data, zt->zt_bufsize); - zio->io_transform_stack = zt->zt_next; - kmem_free(zt, sizeof (zio_transform_t)); + zio->io_data = zt->zt_orig_data; + zio->io_size = zt->zt_orig_size; + zio->io_transform_stack = zt->zt_next; - if ((zt = zio->io_transform_stack) != NULL) { - zio->io_data = zt->zt_data; - zio->io_size = zt->zt_size; + kmem_free(zt, sizeof (zio_transform_t)); } } +/* + * ========================================================================== + * I/O transform callbacks for subblocks and decompression + * ========================================================================== + */ +static void +zio_subblock(zio_t *zio, void *data, uint64_t size) +{ + ASSERT(zio->io_size > size); + + if (zio->io_type == ZIO_TYPE_READ) + bcopy(zio->io_data, data, size); +} + static void -zio_clear_transform_stack(zio_t *zio) +zio_decompress(zio_t *zio, void *data, uint64_t size) { - void *data; - uint64_t size, bufsize; + if (zio->io_error == 0 && + zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), + zio->io_data, zio->io_size, data, size) != 0) + zio->io_error = EIO; +} - ASSERT(zio->io_transform_stack != NULL); +/* + * ========================================================================== + * I/O parent/child relationships and pipeline interlocks + * 
========================================================================== + */ - zio_pop_transform(zio, &data, &size, &bufsize); - while (zio->io_transform_stack != NULL) { - zio_buf_free(data, bufsize); - zio_pop_transform(zio, &data, &size, &bufsize); +static void +zio_add_child(zio_t *pio, zio_t *zio) +{ + mutex_enter(&pio->io_lock); + if (zio->io_stage < ZIO_STAGE_READY) + pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; + if (zio->io_stage < ZIO_STAGE_DONE) + pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; + zio->io_sibling_prev = NULL; + zio->io_sibling_next = pio->io_child; + if (pio->io_child != NULL) + pio->io_child->io_sibling_prev = zio; + pio->io_child = zio; + zio->io_parent = pio; + mutex_exit(&pio->io_lock); +} + +static void +zio_remove_child(zio_t *pio, zio_t *zio) +{ + zio_t *next, *prev; + + ASSERT(zio->io_parent == pio); + + mutex_enter(&pio->io_lock); + next = zio->io_sibling_next; + prev = zio->io_sibling_prev; + if (next != NULL) + next->io_sibling_prev = prev; + if (prev != NULL) + prev->io_sibling_next = next; + if (pio->io_child == zio) + pio->io_child = next; + mutex_exit(&pio->io_lock); +} + +static boolean_t +zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) +{ + uint64_t *countp = &zio->io_children[child][wait]; + boolean_t waiting = B_FALSE; + + mutex_enter(&zio->io_lock); + ASSERT(zio->io_stall == NULL); + if (*countp != 0) { + zio->io_stage--; + zio->io_stall = countp; + waiting = B_TRUE; + } + mutex_exit(&zio->io_lock); + + return (waiting); +} + +static void +zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) +{ + uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; + int *errorp = &pio->io_child_error[zio->io_child_type]; + + mutex_enter(&pio->io_lock); + if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) + *errorp = zio_worst_error(*errorp, zio->io_error); + pio->io_reexecute |= zio->io_reexecute; + ASSERT3U(*countp, >, 0); + if (--*countp == 0 && pio->io_stall == countp) { + pio->io_stall = NULL; + mutex_exit(&pio->io_lock); + zio_execute(pio); + } else { + mutex_exit(&pio->io_lock); } } +static void +zio_inherit_child_errors(zio_t *zio, enum zio_child c) +{ + if (zio->io_child_error[c] != 0 && zio->io_error == 0) + zio->io_error = zio->io_child_error[c]; +} + /* * ========================================================================== - * Create the various types of I/O (read, write, free) + * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) + zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset, + const zbookmark_t *zb, uint8_t stage, uint32_t pipeline) { zio_t *zio; ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); + ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); + + ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); + ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); + ASSERT(vd || stage == ZIO_STAGE_OPEN); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); - zio->io_parent = pio; - zio->io_spa = spa; - zio->io_txg = txg; + + mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); + + if (vd != NULL) + zio->io_child_type = 
ZIO_CHILD_VDEV; + else if (flags & ZIO_FLAG_GANG_CHILD) + zio->io_child_type = ZIO_CHILD_GANG; + else + zio->io_child_type = ZIO_CHILD_LOGICAL; + if (bp != NULL) { zio->io_bp = bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; + if (type != ZIO_TYPE_WRITE) + zio->io_bp = &zio->io_bp_copy; /* so caller can free */ + if (zio->io_child_type == ZIO_CHILD_LOGICAL) { + if (BP_IS_GANG(bp)) + pipeline |= ZIO_GANG_STAGES; + zio->io_logical = zio; + } } + + zio->io_spa = spa; + zio->io_txg = txg; + zio->io_data = data; + zio->io_size = size; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; - zio->io_stage = stage; - zio->io_pipeline = pipeline; - zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; - zio->io_timestamp = lbolt64; - zio->io_flags = flags; - mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); - zio_push_transform(zio, data, size, size); + zio->io_vd = vd; + zio->io_offset = offset; + zio->io_orig_flags = zio->io_flags = flags; + zio->io_orig_stage = zio->io_stage = stage; + zio->io_orig_pipeline = zio->io_pipeline = pipeline; - /* - * Note on config lock: - * - * If CONFIG_HELD is set, then the caller already has the config - * lock, so we don't need it for this io. - * - * We set CONFIG_GRABBED to indicate that we have grabbed the - * config lock on behalf of this io, so it should be released - * in zio_done. - * - * Unless CONFIG_HELD is set, we will grab the config lock for - * any top-level (parent-less) io, *except* NULL top-level ios. - * The NULL top-level ios rarely have any children, so we delay - * grabbing the lock until the first child is added (but it is - * still grabbed on behalf of the top-level i/o, so additional - * children don't need to also grab it). This greatly reduces - * contention on the config lock. - */ - if (pio == NULL) { - if (type != ZIO_TYPE_NULL && - !(flags & ZIO_FLAG_CONFIG_HELD)) { - spa_config_enter(zio->io_spa, RW_READER, zio); - zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; - } - zio->io_root = zio; - } else { - zio->io_root = pio->io_root; - if (!(flags & ZIO_FLAG_NOBOOKMARK)) + if (zb != NULL) + zio->io_bookmark = *zb; + + if (pio != NULL) { + /* + * Logical I/Os can have logical, gang, or vdev children. + * Gang I/Os can have gang or vdev children. + * Vdev I/Os can only have vdev children. + * The following ASSERT captures all of these constraints. 
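+ * + * This works because the child types are ordered + * ZIO_CHILD_VDEV < ZIO_CHILD_GANG < ZIO_CHILD_LOGICAL, so a child's + * type can never exceed its parent's.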
+ */ + ASSERT(zio->io_child_type <= pio->io_child_type); + if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; - mutex_enter(&pio->io_lock); - if (pio->io_parent == NULL && - pio->io_type == ZIO_TYPE_NULL && - !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && - !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { - pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; - spa_config_enter(zio->io_spa, RW_READER, pio); - } - if (stage < ZIO_STAGE_READY) - pio->io_children_notready++; - pio->io_children_notdone++; - zio->io_sibling_next = pio->io_child; - zio->io_sibling_prev = NULL; - if (pio->io_child != NULL) - pio->io_child->io_sibling_prev = zio; - pio->io_child = zio; - zio->io_ndvas = pio->io_ndvas; - mutex_exit(&pio->io_lock); + zio_add_child(pio, zio); } return (zio); } +static void +zio_destroy(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + uint8_t async_root = zio->io_async_root; + + mutex_destroy(&zio->io_lock); + cv_destroy(&zio->io_cv); + kmem_cache_free(zio_cache, zio); + + if (async_root) { + mutex_enter(&spa->spa_async_root_lock); + if (--spa->spa_async_root_count == 0) + cv_broadcast(&spa->spa_async_root_cv); + mutex_exit(&spa->spa_async_root_lock); + } +} + zio_t * zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, int flags) @@ -417,8 +511,8 @@ zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, - ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, - ZIO_WAIT_FOR_CHILDREN_PIPELINE); + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, + ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } @@ -430,160 +524,89 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) } zio_t * -zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, - uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, zbookmark_t *zb) +zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, + void *data, uint64_t size, zio_done_func_t *done, void *private, + int priority, int flags, const zbookmark_t *zb) { zio_t *zio; - ASSERT3U(size, ==, BP_GET_LSIZE(bp)); - - zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, - ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, + zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp, + data, size, done, private, + ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); - zio->io_bookmark = *zb; - - zio->io_logical = zio; - - /* - * Work off our copy of the bp so the caller can free it. 
- */ - zio->io_bp = &zio->io_bp_copy; - - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { - uint64_t csize = BP_GET_PSIZE(bp); - void *cbuf = zio_buf_alloc(csize); - - zio_push_transform(zio, cbuf, csize, csize); - zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; - } - - if (BP_IS_GANG(bp)) { - uint64_t gsize = SPA_GANGBLOCKSIZE; - void *gbuf = zio_buf_alloc(gsize); - - zio_push_transform(zio, gbuf, gsize, gsize); - zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; - } return (zio); } zio_t * -zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, - uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb) +zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + void *data, uint64_t size, zio_prop_t *zp, + zio_done_func_t *ready, zio_done_func_t *done, void *private, + int priority, int flags, const zbookmark_t *zb) { zio_t *zio; - ASSERT(checksum >= ZIO_CHECKSUM_OFF && - checksum < ZIO_CHECKSUM_FUNCTIONS); - - ASSERT(compress >= ZIO_COMPRESS_OFF && - compress < ZIO_COMPRESS_FUNCTIONS); + ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && + zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && + zp->zp_compress >= ZIO_COMPRESS_OFF && + zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && + zp->zp_type < DMU_OT_NUMTYPES && + zp->zp_level < 32 && + zp->zp_ndvas > 0 && + zp->zp_ndvas <= spa_max_replication(spa)); + ASSERT(ready != NULL); zio = zio_create(pio, spa, txg, bp, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, + ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); zio->io_ready = ready; - - zio->io_bookmark = *zb; - - zio->io_logical = zio; - - zio->io_checksum = checksum; - zio->io_compress = compress; - zio->io_ndvas = ncopies; - - if (compress != ZIO_COMPRESS_OFF) - zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; - - if (bp->blk_birth != txg) { - /* XXX the bp usually (always?) 
gets re-zeroed later */ - BP_ZERO(bp); - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - } else { - /* Make sure someone doesn't change their mind on overwrites */ - ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), - spa_max_replication(spa)) == BP_GET_NDVAS(bp)); - } + zio->io_prop = *zp; return (zio); } zio_t * -zio_rewrite(zio_t *pio, spa_t *spa, int checksum, - uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags, - zbookmark_t *zb) +zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, + uint64_t size, zio_done_func_t *done, void *private, int priority, + int flags, zbookmark_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, + ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); - zio->io_bookmark = *zb; - zio->io_checksum = checksum; - zio->io_compress = ZIO_COMPRESS_OFF; - - if (pio != NULL) - ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); - - return (zio); -} - -static zio_t * -zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, - uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags) -{ - zio_t *zio; - - BP_ZERO(bp); - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - - zio = zio_create(pio, spa, txg, bp, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags, - ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); - - zio->io_checksum = checksum; - zio->io_compress = ZIO_COMPRESS_OFF; - return (zio); } zio_t * zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private) + zio_done_func_t *done, void *private, int flags) { zio_t *zio; ASSERT(!BP_IS_HOLE(bp)); + if (bp->blk_fill == BLK_FILL_ALREADY_FREED) + return (zio_null(pio, spa, NULL, NULL, flags)); + if (txg == spa->spa_syncing_txg && - spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { + spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) { bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); - return (zio_null(pio, spa, NULL, NULL, 0)); + return (zio_null(pio, spa, NULL, NULL, flags)); } - zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, - ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, - ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); - - zio->io_bp = &zio->io_bp_copy; + zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), + done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, + NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); return (zio); } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private) + zio_done_func_t *done, void *private, int flags) { zio_t *zio; @@ -601,11 +624,9 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); ASSERT3U(spa_first_txg(spa), <=, txg); - zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, - ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, - ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); - - zio->io_bp = &zio->io_bp_copy; + zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), + done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, + NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); return (zio); } @@ -619,10 +640,9 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, - ZIO_TYPE_IOCTL, priority, flags, + 
ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); - zio->io_vd = vd; zio->io_cmd = cmd; } else { zio = zio_null(pio, spa, NULL, NULL, flags); @@ -635,54 +655,23 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, return (zio); } -static void -zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, - int checksum) -{ - ASSERT(vd->vdev_children == 0); - - ASSERT(size <= SPA_MAXBLOCKSIZE); - ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); - ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); - - ASSERT(offset + size <= VDEV_LABEL_START_SIZE || - offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); - ASSERT3U(offset + size, <=, vd->vdev_psize); - - BP_ZERO(bp); - - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - - BP_SET_CHECKSUM(bp, checksum); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - if (checksum != ZIO_CHECKSUM_OFF) - ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); -} - zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, - int priority, int flags) + int priority, int flags, boolean_t labels) { zio_t *zio; - blkptr_t blk; - zio_phys_bp_init(vd, &blk, offset, size, checksum); + ASSERT(vd->vdev_children == 0); + ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT3U(offset + size, <=, vd->vdev_psize); - zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, - ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, + ZIO_TYPE_READ, priority, flags, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); - zio->io_vd = vd; - zio->io_offset = offset; - - /* - * Work off our copy of the bp so the caller can free it. - */ - zio->io_bp = &zio->io_bp_copy; + zio->io_prop.zp_checksum = checksum; return (zio); } @@ -690,53 +679,49 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, - int priority, int flags) + int priority, int flags, boolean_t labels) { - zio_block_tail_t *zbt; - void *wbuf; zio_t *zio; - blkptr_t blk; - zio_phys_bp_init(vd, &blk, offset, size, checksum); + ASSERT(vd->vdev_children == 0); + ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT3U(offset + size, <=, vd->vdev_psize); - zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, + ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); - zio->io_vd = vd; - zio->io_offset = offset; - - zio->io_bp = &zio->io_bp_copy; - zio->io_checksum = checksum; + zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_zbt) { /* * zbt checksums are necessarily destructive -- they modify - * one word of the write buffer to hold the verifier/checksum. + * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is - * being written to multiple places. + * being written to multiple places in parallel. 
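+ * (The copy is pushed as a transform with no transform callback, so + * zio_pop_transforms() will free it via zio_buf_free() when the I/O + * completes.)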
*/ - wbuf = zio_buf_alloc(size); + void *wbuf = zio_buf_alloc(size); bcopy(data, wbuf, size); - zio_push_transform(zio, wbuf, size, size); - - zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; - zbt->zbt_cksum = blk.blk_cksum; + zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* - * Create a child I/O to do some work for us. It has no associated bp. + * Create a child I/O to do some work for us. */ zio_t * -zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, +zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, int priority, int flags, zio_done_func_t *done, void *private) { uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; - zio_t *cio; + zio_t *zio; + + ASSERT(vd->vdev_parent == + (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); if (type == ZIO_TYPE_READ && bp != NULL) { /* @@ -746,517 +731,754 @@ zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, * eliminates redundant checksums in the interior nodes. */ pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; - zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); + pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); } - cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, + if (vd->vdev_children == 0) + offset += VDEV_LABEL_START_SIZE; + + zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, done, private, type, priority, - (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, + (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) | + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags, + vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START - 1, pipeline); - cio->io_vd = vd; - cio->io_offset = offset; - - return (cio); + return (zio); } -/* - * ========================================================================== - * Initiate I/O, either sync or async - * ========================================================================== - */ -int -zio_wait(zio_t *zio) +zio_t * +zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, + int type, int priority, int flags, zio_done_func_t *done, void *private) { - int error; - - ASSERT(zio->io_stage == ZIO_STAGE_OPEN); - - zio->io_waiter = curthread; + zio_t *zio; - zio_next_stage_async(zio); + ASSERT(vd->vdev_ops->vdev_op_leaf); - mutex_enter(&zio->io_lock); - while (zio->io_stalled != ZIO_STAGE_DONE) - cv_wait(&zio->io_cv, &zio->io_lock); - mutex_exit(&zio->io_lock); + zio = zio_create(NULL, vd->vdev_spa, 0, NULL, + data, size, done, private, type, priority, + flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, + vd, offset, NULL, + ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE); - error = zio->io_error; - cv_destroy(&zio->io_cv); - mutex_destroy(&zio->io_lock); - kmem_cache_free(zio_cache, zio); - - return (error); + return (zio); } void -zio_nowait(zio_t *zio) +zio_flush(zio_t *zio, vdev_t *vd) { - zio_next_stage_async(zio); + zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, + NULL, NULL, ZIO_PRIORITY_NOW, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } /* * ========================================================================== - * I/O pipeline interlocks: parent/child dependency scoreboarding + * Prepare to read and write logical blocks * ========================================================================== */ -static void -zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) + +static int +zio_read_bp_init(zio_t *zio) { - mutex_enter(&zio->io_lock); - if 
(*countp == 0) { - ASSERT(zio->io_stalled == 0); - mutex_exit(&zio->io_lock); - zio_next_stage(zio); - } else { - zio->io_stalled = stage; - mutex_exit(&zio->io_lock); + blkptr_t *bp = zio->io_bp; + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) { + uint64_t csize = BP_GET_PSIZE(bp); + void *cbuf = zio_buf_alloc(csize); + + zio_push_transform(zio, cbuf, csize, csize, zio_decompress); } + + if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) + zio->io_flags |= ZIO_FLAG_DONT_CACHE; + + return (ZIO_PIPELINE_CONTINUE); } -static void -zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) +static int +zio_write_bp_init(zio_t *zio) { - zio_t *pio = zio->io_parent; + zio_prop_t *zp = &zio->io_prop; + int compress = zp->zp_compress; + blkptr_t *bp = zio->io_bp; + void *cbuf; + uint64_t lsize = zio->io_size; + uint64_t csize = lsize; + uint64_t cbufsize = 0; + int pass = 1; - mutex_enter(&pio->io_lock); - if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) - pio->io_error = zio->io_error; - if (--*countp == 0 && pio->io_stalled == stage) { - pio->io_stalled = 0; - mutex_exit(&pio->io_lock); - zio_next_stage_async(pio); + /* + * If our children haven't all reached the ready stage, + * wait for them and then repeat this pipeline stage. + */ + if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || + zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) + return (ZIO_PIPELINE_STOP); + + if (!IO_IS_ALLOCATING(zio)) + return (ZIO_PIPELINE_CONTINUE); + + ASSERT(compress != ZIO_COMPRESS_INHERIT); + + if (bp->blk_birth == zio->io_txg) { + /* + * We're rewriting an existing block, which means we're + * working on behalf of spa_sync(). For spa_sync() to + * converge, it must eventually be the case that we don't + * have to allocate new blocks. But compression changes + * the blocksize, which forces a reallocate, and makes + * convergence take longer. Therefore, after the first + * few passes, stop compressing to ensure convergence. + */ + pass = spa_sync_pass(zio->io_spa); + ASSERT(pass > 1); + + if (pass > SYNC_PASS_DONT_COMPRESS) + compress = ZIO_COMPRESS_OFF; + + /* + * Only MOS (objset 0) data should need to be rewritten. + */ + ASSERT(zio->io_logical->io_bookmark.zb_objset == 0); + + /* Make sure someone doesn't change their mind on overwrites */ + ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp), + spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp)); + } + + if (compress != ZIO_COMPRESS_OFF) { + if (!zio_compress_data(compress, zio->io_data, zio->io_size, + &cbuf, &csize, &cbufsize)) { + compress = ZIO_COMPRESS_OFF; + } else if (csize != 0) { + zio_push_transform(zio, cbuf, csize, cbufsize, NULL); + } + } + + /* + * The final pass of spa_sync() must be all rewrites, but the first + * few passes offer a trade-off: allocating blocks defers convergence, + * but newly allocated blocks are sequential, so they can be written + * to disk faster. Therefore, we allow the first few passes of + * spa_sync() to allocate new blocks, but force rewrites after that. + * There should only be a handful of blocks after pass 1 in any case. 
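+ * + * Concretely, with the tunables defined above: SYNC_PASS_REWRITE == 1 + * means a block whose compressed size is unchanged is rewritten in + * place from pass 2 onward, and SYNC_PASS_DONT_COMPRESS == 4 disables + * compression from pass 5 onward, so block sizes stop changing.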
+ */ + if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && + pass > SYNC_PASS_REWRITE) { + ASSERT(csize != 0); + uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; + zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; + zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { - mutex_exit(&pio->io_lock); + BP_ZERO(bp); + zio->io_pipeline = ZIO_WRITE_PIPELINE; + } + + if (csize == 0) { + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + } else { + ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); + BP_SET_LSIZE(bp, lsize); + BP_SET_PSIZE(bp, csize); + BP_SET_COMPRESS(bp, compress); + BP_SET_CHECKSUM(bp, zp->zp_checksum); + BP_SET_TYPE(bp, zp->zp_type); + BP_SET_LEVEL(bp, zp->zp_level); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); } + + return (ZIO_PIPELINE_CONTINUE); } +/* + * ========================================================================== + * Execute the I/O pipeline + * ========================================================================== + */ + static void -zio_wait_children_ready(zio_t *zio) +zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) { - zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, - &zio->io_children_notready); -} + zio_type_t t = zio->io_type; -void -zio_wait_children_done(zio_t *zio) -{ - zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, - &zio->io_children_notdone); + /* + * If we're a config writer, the normal issue and interrupt threads + * may all be blocked waiting for the config lock. In this case, + * select the otherwise-unused taskq for ZIO_TYPE_NULL. + */ + if (zio->io_flags & ZIO_FLAG_CONFIG_WRITER) + t = ZIO_TYPE_NULL; + + /* + * A similar issue exists for the L2ARC write thread until L2ARC 2.0. + */ + if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) + t = ZIO_TYPE_NULL; + + (void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q], + (task_func_t *)zio_execute, zio, TQ_SLEEP); } -static void -zio_ready(zio_t *zio) +static boolean_t +zio_taskq_member(zio_t *zio, enum zio_taskq_type q) { - zio_t *pio = zio->io_parent; + kthread_t *executor = zio->io_executor; + spa_t *spa = zio->io_spa; - if (zio->io_ready) - zio->io_ready(zio); + for (zio_type_t t = 0; t < ZIO_TYPES; t++) + if (taskq_member(spa->spa_zio_taskq[t][q], executor)) + return (B_TRUE); - if (pio != NULL) - zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, - &pio->io_children_notready); + return (B_FALSE); +} - if (zio->io_bp) - zio->io_bp_copy = *zio->io_bp; +static int +zio_issue_async(zio_t *zio) +{ + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); - zio_next_stage(zio); + return (ZIO_PIPELINE_STOP); } -static void -zio_done(zio_t *zio) +void +zio_interrupt(zio_t *zio) { - zio_t *pio = zio->io_parent; - spa_t *spa = zio->io_spa; - blkptr_t *bp = zio->io_bp; - vdev_t *vd = zio->io_vd; + zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT); +} - ASSERT(zio->io_children_notready == 0); - ASSERT(zio->io_children_notdone == 0); +/* + * Execute the I/O pipeline until one of the following occurs: + * (1) the I/O completes; (2) the pipeline stalls waiting for + * dependent child I/Os; (3) the I/O issues, so we're waiting + * for an I/O completion interrupt; (4) the I/O is delegated by + * vdev-level caching or aggregation; (5) the I/O is deferred + * due to vdev-level queueing; (6) the I/O is handed off to + * another thread. In all cases, the pipeline stops whenever + * there's no CPU work; it never burns a thread in cv_wait(). 
+ * + * There's no locking on io_stage because there's no legitimate way + * for multiple threads to be attempting to process the same I/O. + */ +static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES]; - if (bp != NULL) { - ASSERT(bp->blk_pad[0] == 0); - ASSERT(bp->blk_pad[1] == 0); - ASSERT(bp->blk_pad[2] == 0); - ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); - if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && - !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { - ASSERT(!BP_SHOULD_BYTESWAP(bp)); - if (zio->io_ndvas != 0) - ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); - ASSERT(BP_COUNT_GANG(bp) == 0 || - (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); - } - } +void +zio_execute(zio_t *zio) +{ + zio->io_executor = curthread; - if (vd != NULL) - vdev_stat_update(zio); + while (zio->io_stage < ZIO_STAGE_DONE) { + uint32_t pipeline = zio->io_pipeline; + zio_stage_t stage = zio->io_stage; + int rv; - if (zio->io_error) { - /* - * If this I/O is attached to a particular vdev, - * generate an error message describing the I/O failure - * at the block level. We ignore these errors if the - * device is currently unavailable. - */ - if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) - zfs_ereport_post(FM_EREPORT_ZFS_IO, - zio->io_spa, vd, zio, 0, 0); + ASSERT(!MUTEX_HELD(&zio->io_lock)); - if ((zio->io_error == EIO || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && - zio->io_logical == zio) { - /* - * For root I/O requests, tell the SPA to log the error - * appropriately. Also, generate a logical data - * ereport. - */ - spa_log_error(zio->io_spa, zio); + while (((1U << ++stage) & pipeline) == 0) + continue; - zfs_ereport_post(FM_EREPORT_ZFS_DATA, - zio->io_spa, NULL, zio, 0, 0); - } + ASSERT(stage <= ZIO_STAGE_DONE); + ASSERT(zio->io_stall == NULL); /* - * For I/O requests that cannot fail, panic appropriately. + * If we are in interrupt context and this pipeline stage + * will grab a config lock that is held across I/O, + * issue async to avoid deadlock. */ - if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { - char *blkbuf; - - blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); - if (blkbuf) { - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, - bp ? bp : &zio->io_bp_copy); - } - panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " - "%d", zio->io_error == ECKSUM ? - "bad checksum" : "I/O failure", - zio_type_name[zio->io_type], - vdev_description(vd), - (u_longlong_t)zio->io_offset, - zio, blkbuf ? 
blkbuf : "", zio->io_error); + if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) && + zio->io_vd == NULL && + zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + return; } + + zio->io_stage = stage; + rv = zio_pipeline[stage](zio); + + if (rv == ZIO_PIPELINE_STOP) + return; + + ASSERT(rv == ZIO_PIPELINE_CONTINUE); } - zio_clear_transform_stack(zio); +} - if (zio->io_done) - zio->io_done(zio); +/* + * ========================================================================== + * Initiate I/O, either sync or async + * ========================================================================== + */ +int +zio_wait(zio_t *zio) +{ + int error; - ASSERT(zio->io_delegate_list == NULL); - ASSERT(zio->io_delegate_next == NULL); + ASSERT(zio->io_stage == ZIO_STAGE_OPEN); + ASSERT(zio->io_executor == NULL); - if (pio != NULL) { - zio_t *next, *prev; + zio->io_waiter = curthread; - mutex_enter(&pio->io_lock); - next = zio->io_sibling_next; - prev = zio->io_sibling_prev; - if (next != NULL) - next->io_sibling_prev = prev; - if (prev != NULL) - prev->io_sibling_next = next; - if (pio->io_child == zio) - pio->io_child = next; - mutex_exit(&pio->io_lock); + zio_execute(zio); - zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, - &pio->io_children_notdone); - } + mutex_enter(&zio->io_lock); + while (zio->io_executor != NULL) + cv_wait(&zio->io_cv, &zio->io_lock); + mutex_exit(&zio->io_lock); - /* - * Note: this I/O is now done, and will shortly be freed, so there is no - * need to clear this (or any other) flag. - */ - if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED) - spa_config_exit(spa, zio); + error = zio->io_error; + zio_destroy(zio); - if (zio->io_waiter != NULL) { - mutex_enter(&zio->io_lock); - ASSERT(zio->io_stage == ZIO_STAGE_DONE); - zio->io_stalled = zio->io_stage; - cv_broadcast(&zio->io_cv); - mutex_exit(&zio->io_lock); - } else { - cv_destroy(&zio->io_cv); - mutex_destroy(&zio->io_lock); - kmem_cache_free(zio_cache, zio); + return (error); +} + +void +zio_nowait(zio_t *zio) +{ + ASSERT(zio->io_executor == NULL); + + if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) { + /* + * This is a logical async I/O with no parent to wait for it. + * Attach it to the pool's global async root zio so that + * spa_unload() has a way of waiting for async I/O to finish. + */ + spa_t *spa = zio->io_spa; + zio->io_async_root = B_TRUE; + mutex_enter(&spa->spa_async_root_lock); + spa->spa_async_root_count++; + mutex_exit(&spa->spa_async_root_lock); } + + zio_execute(zio); } /* * ========================================================================== - * Compression support + * Reexecute or suspend/resume failed I/O * ========================================================================== */ + static void -zio_write_compress(zio_t *zio) +zio_reexecute(zio_t *pio) { - int compress = zio->io_compress; - blkptr_t *bp = zio->io_bp; - void *cbuf; - uint64_t lsize = zio->io_size; - uint64_t csize = lsize; - uint64_t cbufsize = 0; - int pass; + zio_t *zio, *zio_next; - if (bp->blk_birth == zio->io_txg) { + pio->io_flags = pio->io_orig_flags; + pio->io_stage = pio->io_orig_stage; + pio->io_pipeline = pio->io_orig_pipeline; + pio->io_reexecute = 0; + pio->io_error = 0; + for (int c = 0; c < ZIO_CHILD_TYPES; c++) + pio->io_child_error[c] = 0; + + if (IO_IS_ALLOCATING(pio)) { /* - * We're rewriting an existing block, which means we're - * working on behalf of spa_sync(). 
For spa_sync() to - * converge, it must eventually be the case that we don't - * have to allocate new blocks. But compression changes - * the blocksize, which forces a reallocate, and makes - * convergence take longer. Therefore, after the first - * few passes, stop compressing to ensure convergence. + * Remember the failed bp so that the io_ready() callback + * can update its accounting upon reexecution. The block + * was already freed in zio_done(); we indicate this with + * a fill count of -1 so that zio_free() knows to skip it. */ - pass = spa_sync_pass(zio->io_spa); - if (pass > zio_sync_pass.zp_dontcompress) - compress = ZIO_COMPRESS_OFF; - } else { - ASSERT(BP_IS_HOLE(bp)); - pass = 1; + blkptr_t *bp = pio->io_bp; + ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg); + bp->blk_fill = BLK_FILL_ALREADY_FREED; + pio->io_bp_orig = *bp; + BP_ZERO(bp); } - if (compress != ZIO_COMPRESS_OFF) - if (!zio_compress_data(compress, zio->io_data, zio->io_size, - &cbuf, &csize, &cbufsize)) - compress = ZIO_COMPRESS_OFF; - - if (compress != ZIO_COMPRESS_OFF && csize != 0) - zio_push_transform(zio, cbuf, csize, cbufsize); + /* + * As we reexecute pio's children, new children could be created. + * New children go to the head of the io_child list, however, + * so we will (correctly) not reexecute them. The key is that + * the remainder of the io_child list, from 'zio_next' onward, + * cannot be affected by any side effects of reexecuting 'zio'. + */ + for (zio = pio->io_child; zio != NULL; zio = zio_next) { + zio_next = zio->io_sibling_next; + mutex_enter(&pio->io_lock); + pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; + pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; + mutex_exit(&pio->io_lock); + zio_reexecute(zio); + } /* - * The final pass of spa_sync() must be all rewrites, but the first - * few passes offer a trade-off: allocating blocks defers convergence, - * but newly allocated blocks are sequential, so they can be written - * to disk faster. Therefore, we allow the first few passes of - * spa_sync() to reallocate new blocks, but force rewrites after that. - * There should only be a handful of blocks after pass 1 in any case. + * Now that all children have been reexecuted, execute the parent. 
*/ - if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && - pass > zio_sync_pass.zp_rewrite) { - ASSERT(csize != 0); - BP_SET_LSIZE(bp, lsize); - BP_SET_COMPRESS(bp, compress); - zio->io_pipeline = ZIO_REWRITE_PIPELINE; - } else { - if (bp->blk_birth == zio->io_txg) - BP_ZERO(bp); - if (csize == 0) { - BP_ZERO(bp); - zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; - } else { - ASSERT3U(BP_GET_NDVAS(bp), ==, 0); - BP_SET_LSIZE(bp, lsize); - BP_SET_PSIZE(bp, csize); - BP_SET_COMPRESS(bp, compress); - zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE; - } + zio_execute(pio); +} + +void +zio_suspend(spa_t *spa, zio_t *zio) +{ + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) + fm_panic("Pool '%s' has encountered an uncorrectable I/O " + "failure and the failure mode property for this pool " + "is set to panic.", spa_name(spa)); + + zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); + + mutex_enter(&spa->spa_suspend_lock); + + if (spa->spa_suspend_zio_root == NULL) + spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0); + + spa->spa_suspended = B_TRUE; + + if (zio != NULL) { + ASSERT(zio != spa->spa_suspend_zio_root); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(zio->io_parent == NULL); + ASSERT(zio->io_stage == ZIO_STAGE_DONE); + zio_add_child(spa->spa_suspend_zio_root, zio); } - zio_next_stage(zio); + mutex_exit(&spa->spa_suspend_lock); } -static void -zio_read_decompress(zio_t *zio) +void +zio_resume(spa_t *spa) { - blkptr_t *bp = zio->io_bp; - void *data; - uint64_t size; - uint64_t bufsize; - int compress = BP_GET_COMPRESS(bp); + zio_t *pio, *zio; - ASSERT(compress != ZIO_COMPRESS_OFF); + /* + * Reexecute all previously suspended i/o. + */ + mutex_enter(&spa->spa_suspend_lock); + spa->spa_suspended = B_FALSE; + cv_broadcast(&spa->spa_suspend_cv); + pio = spa->spa_suspend_zio_root; + spa->spa_suspend_zio_root = NULL; + mutex_exit(&spa->spa_suspend_lock); + + if (pio == NULL) + return; - zio_pop_transform(zio, &data, &size, &bufsize); + while ((zio = pio->io_child) != NULL) { + zio_remove_child(pio, zio); + zio->io_parent = NULL; + zio_reexecute(zio); + } - if (zio_decompress_data(compress, data, size, - zio->io_data, zio->io_size)) - zio->io_error = EIO; + ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0); - zio_buf_free(data, bufsize); + (void) zio_wait(pio); +} - zio_next_stage(zio); +void +zio_resume_wait(spa_t *spa) +{ + mutex_enter(&spa->spa_suspend_lock); + while (spa_suspended(spa)) + cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); + mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== - * Gang block support + * Gang blocks. + * + * A gang block is a collection of small blocks that looks to the DMU + * like one large block. When zio_dva_allocate() cannot find a block + * of the requested size, due to either severe fragmentation or the pool + * being nearly full, it calls zio_write_gang_block() to construct the + * block from smaller fragments. + * + * A gang block consists of a gang header (zio_gbh_phys_t) and up to + * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like + * an indirect block: it's an array of block pointers. It consumes + * only one sector and hence is allocatable regardless of fragmentation. + * The gang header's bps point to its gang members, which hold the data. + * + * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> + * as the verifier to ensure uniqueness of the SHA256 checksum. 
+ * Critically, the gang block bp's blk_cksum is the checksum of the data, + * not the gang header. This ensures that data block signatures (needed for + * deduplication) are independent of how the block is physically stored. + * + * Gang blocks can be nested: a gang member may itself be a gang block. + * Thus every gang block is a tree in which root and all interior nodes are + * gang headers, and the leaves are normal blocks that contain user data. + * The root of the gang tree is called the gang leader. + * + * To perform any operation (read, rewrite, free, claim) on a gang block, + * zio_gang_assemble() first assembles the gang tree (minus data leaves) + * in the io_gang_tree field of the original logical i/o by recursively + * reading the gang leader and all gang headers below it. This yields + * an in-core tree containing the contents of every gang header and the + * bps for every constituent of the gang block. + * + * With the gang tree now assembled, zio_gang_issue() just walks the gang tree + * and invokes a callback on each bp. To free a gang block, zio_gang_issue() + * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. + * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). + * zio_read_gang() is a wrapper around zio_read() that omits reading gang + * headers, since we already have those in io_gang_tree. zio_rewrite_gang() + * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() + * of the gang header plus zio_checksum_compute() of the data to update the + * gang header's blk_cksum as described above. + * + * The two-phase assemble/issue model solves the problem of partial failure -- + * what if you'd freed part of a gang block but then couldn't read the + * gang header for another part? Assembling the entire gang tree first + * ensures that all the necessary gang header I/O has succeeded before + * starting the actual work of free, claim, or write. Once the gang tree + * is assembled, free and claim are in-memory operations that cannot fail. + * + * In the event that a gang write fails, zio_dva_unallocate() walks the + * gang tree to immediately free (i.e. insert back into the space map) + * everything we've allocated. This ensures that we don't get ENOSPC + * errors during repeated suspend/resume cycles due to a flaky device. + * + * Gang rewrites only happen during sync-to-convergence. If we can't assemble + * the gang tree, we won't modify the block, so we can safely defer the free + * (knowing that the block is still intact). If we *can* assemble the gang + * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free + * each constituent bp and we can allocate a new block on the next sync pass. + * + * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ -static void -zio_gang_pipeline(zio_t *zio) + +static zio_t * +zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { - /* - * By default, the pipeline assumes that we're dealing with a gang - * block. If we're not, strip out any gang-specific stages. 
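The two-phase model described above can be shown in miniature. A hedged sketch, with invented names and an in-memory tree standing in for gang headers on disk: phase one walks the whole tree and fails without side effects if anything is unreadable; phase two, which runs only after phase one fully succeeded, applies the operation and therefore can never fail partway through.

	#define	NKIDS	3	/* mirrors SPA_GBH_NBLKPTRS */

	struct gang_node {
		int			unreadable;	/* header read failure */
		struct gang_node	*kid[NKIDS];
	};

	/* Phase 1: assemble -- prove every header is readable. */
	static int
	assemble(struct gang_node *gn)
	{
		if (gn == NULL)
			return (0);
		if (gn->unreadable)
			return (-1);	/* abort before any work is done */
		for (int g = 0; g < NKIDS; g++)
			if (assemble(gn->kid[g]) != 0)
				return (-1);
		return (0);
	}

	/* Phase 2: issue -- an in-memory walk that cannot fail midway. */
	static void
	issue(struct gang_node *gn, void (*func)(struct gang_node *))
	{
		if (gn == NULL)
			return;
		func(gn);
		for (int g = 0; g < NKIDS; g++)
			issue(gn->kid[g], func);
	}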
- */ - if (!BP_IS_GANG(zio->io_bp)) - zio->io_pipeline &= ~ZIO_GANG_STAGES; + if (gn != NULL) + return (pio); - zio_next_stage(zio); + return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), + NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark)); } -static void -zio_gang_byteswap(zio_t *zio) +zio_t * +zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { - ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); + zio_t *zio; - if (BP_SHOULD_BYTESWAP(zio->io_bp)) - byteswap_uint64_array(zio->io_data, zio->io_size); + if (gn != NULL) { + zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, + gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + /* + * As we rewrite each gang header, the pipeline will compute + * a new gang block header checksum for it; but no one will + * compute a new data checksum, so we do that here. The one + * exception is the gang leader: the pipeline already computed + * its data checksum because that stage precedes gang assembly. + * (Presently, nothing actually uses interior data checksums; + * this is just good hygiene.) + */ + if (gn != pio->io_logical->io_gang_tree) { + zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), + data, BP_GET_PSIZE(bp)); + } + } else { + zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, + data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + } + + return (zio); } -static void -zio_get_gang_header(zio_t *zio) +/* ARGSUSED */ +zio_t * +zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { - blkptr_t *bp = zio->io_bp; - uint64_t gsize = SPA_GANGBLOCKSIZE; - void *gbuf = zio_buf_alloc(gsize); + return (zio_free(pio, pio->io_spa, pio->io_txg, bp, + NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); +} - ASSERT(BP_IS_GANG(bp)); +/* ARGSUSED */ +zio_t * +zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +{ + return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, + NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); +} - zio_push_transform(zio, gbuf, gsize, gsize); +static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { + NULL, + zio_read_gang, + zio_rewrite_gang, + zio_free_gang, + zio_claim_gang, + NULL +}; - zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, - NULL, NULL, ZIO_TYPE_READ, zio->io_priority, - zio->io_flags & ZIO_FLAG_GANG_INHERIT, - ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); +static void zio_gang_tree_assemble_done(zio_t *zio); - zio_wait_children_done(zio); +static zio_gang_node_t * +zio_gang_node_alloc(zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn; + + ASSERT(*gnpp == NULL); + + gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); + gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); + *gnpp = gn; + + return (gn); } static void -zio_read_gang_members(zio_t *zio) +zio_gang_node_free(zio_gang_node_t **gnpp) { - zio_gbh_phys_t *gbh; - uint64_t gsize, gbufsize, loff, lsize; - int i; + zio_gang_node_t *gn = *gnpp; - ASSERT(BP_IS_GANG(zio->io_bp)); + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + ASSERT(gn->gn_child[g] == NULL); - zio_gang_byteswap(zio); - zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); + zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); + kmem_free(gn, sizeof (*gn)); + *gnpp = NULL; +} - for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; - lsize = BP_GET_PSIZE(gbp); +static void +zio_gang_tree_free(zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn = *gnpp; - 
ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); - ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); - ASSERT3U(loff + lsize, <=, zio->io_size); - ASSERT(i < SPA_GBH_NBLKPTRS); - ASSERT(!BP_IS_HOLE(gbp)); + if (gn == NULL) + return; - zio_nowait(zio_read(zio, zio->io_spa, gbp, - (char *)zio->io_data + loff, lsize, NULL, NULL, - zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, - &zio->io_bookmark)); - } + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + zio_gang_tree_free(&gn->gn_child[g]); - zio_buf_free(gbh, gbufsize); - zio_wait_children_done(zio); + zio_gang_node_free(gnpp); } static void -zio_rewrite_gang_members(zio_t *zio) +zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp) { - zio_gbh_phys_t *gbh; - uint64_t gsize, gbufsize, loff, lsize; - int i; + zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); - ASSERT(BP_IS_GANG(zio->io_bp)); - ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); + ASSERT(lio->io_logical == lio); + ASSERT(BP_IS_GANG(bp)); + + zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh, + SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, + lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark)); +} - zio_gang_byteswap(zio); - zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); +static void +zio_gang_tree_assemble_done(zio_t *zio) +{ + zio_t *lio = zio->io_logical; + zio_gang_node_t *gn = zio->io_private; + blkptr_t *bp = zio->io_bp; - ASSERT(gsize == gbufsize); + ASSERT(zio->io_parent == lio); + ASSERT(zio->io_child == NULL); - for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; - lsize = BP_GET_PSIZE(gbp); + if (zio->io_error) + return; - ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); - ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); - ASSERT3U(loff + lsize, <=, zio->io_size); - ASSERT(i < SPA_GBH_NBLKPTRS); - ASSERT(!BP_IS_HOLE(gbp)); + if (BP_SHOULD_BYTESWAP(bp)) + byteswap_uint64_array(zio->io_data, zio->io_size); - zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, - zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, - NULL, NULL, zio->io_priority, zio->io_flags, - &zio->io_bookmark)); - } + ASSERT(zio->io_data == gn->gn_gbh); + ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); + ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); - zio_push_transform(zio, gbh, gsize, gbufsize); - zio_wait_children_ready(zio); + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + if (!BP_IS_GANG(gbp)) + continue; + zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]); + } } static void -zio_free_gang_members(zio_t *zio) +zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) { - zio_gbh_phys_t *gbh; - uint64_t gsize, gbufsize; - int i; + zio_t *lio = pio->io_logical; + zio_t *zio; - ASSERT(BP_IS_GANG(zio->io_bp)); + ASSERT(BP_IS_GANG(bp) == !!gn); + ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp)); + ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree); - zio_gang_byteswap(zio); - zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); + /* + * If you're a gang header, your data is in gn->gn_gbh. + * If you're a gang member, your data is in 'data' and gn == NULL. 
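The gang-tree walk that follows hands each leaf its slice of one contiguous buffer. A standalone sketch of that cursor arithmetic, with invented types (the real code advances by BP_GET_PSIZE(gbp) rather than a stored size field):

	#include <stdio.h>

	#define	NKIDS	3

	struct gnode {
		size_t		psize;		/* leaf: bytes of user data */
		struct gnode	*kid[NKIDS];	/* any non-NULL => interior */
	};

	static char *
	issue(struct gnode *gn, char *data)
	{
		int g, interior = 0;

		for (g = 0; g < NKIDS; g++)
			if (gn->kid[g] != NULL)
				interior = 1;

		if (!interior) {
			/* Leaf: operate on data[0 .. psize), then advance. */
			printf("leaf: %zu bytes at %p\n", gn->psize,
			    (void *)data);
			return (data + gn->psize);
		}

		for (g = 0; g < NKIDS; g++)
			if (gn->kid[g] != NULL)
				data = issue(gn->kid[g], data);
		return (data);
	}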
+ */ + zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data); - for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; + if (gn != NULL) { + ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); - if (BP_IS_HOLE(gbp)) - continue; - zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, - gbp, NULL, NULL)); + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + if (BP_IS_HOLE(gbp)) + continue; + zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); + data = (char *)data + BP_GET_PSIZE(gbp); + } } - zio_buf_free(gbh, gbufsize); - zio_next_stage(zio); + if (gn == lio->io_gang_tree) + ASSERT3P((char *)lio->io_data + lio->io_size, ==, data); + + if (zio != pio) + zio_nowait(zio); } -static void -zio_claim_gang_members(zio_t *zio) +static int +zio_gang_assemble(zio_t *zio) { - zio_gbh_phys_t *gbh; - uint64_t gsize, gbufsize; - int i; + blkptr_t *bp = zio->io_bp; - ASSERT(BP_IS_GANG(zio->io_bp)); + ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical); - zio_gang_byteswap(zio); - zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); + zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); - for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; - if (BP_IS_HOLE(gbp)) - continue; - zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, - gbp, NULL, NULL)); - } + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_gang_issue(zio_t *zio) +{ + zio_t *lio = zio->io_logical; + blkptr_t *bp = zio->io_bp; + + if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) + return (ZIO_PIPELINE_STOP); + + ASSERT(BP_IS_GANG(bp) && zio == lio); + + if (zio->io_child_error[ZIO_CHILD_GANG] == 0) + zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data); + else + zio_gang_tree_free(&lio->io_gang_tree); + + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - zio_buf_free(gbh, gbufsize); - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } static void -zio_write_allocate_gang_member_done(zio_t *zio) +zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio->io_parent; + zio_t *lio = zio->io_logical; dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; - int d; - ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); + if (BP_IS_HOLE(zio->io_bp)) + return; + + ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); + + ASSERT(zio->io_child_type == ZIO_CHILD_GANG); + ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas); + ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); + ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); - ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); - ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); - for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { + for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); @@ -1265,97 +1487,77 @@ zio_write_allocate_gang_member_done(zio_t *zio) mutex_exit(&pio->io_lock); } -static void -zio_write_allocate_gang_members(zio_t *zio) +static int +zio_write_gang_block(zio_t *pio) { - blkptr_t *bp = zio->io_bp; - dva_t *dva = bp->blk_dva; - spa_t *spa = zio->io_spa; + spa_t *spa = pio->io_spa; + blkptr_t *bp = pio->io_bp; + zio_t *lio = pio->io_logical; + zio_t *zio; + zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; - uint64_t txg = zio->io_txg; - uint64_t resid = zio->io_size; - uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, 
SPA_MINBLOCKSIZE); - uint64_t gsize, loff, lsize; - uint32_t gbps_left; - int ndvas = zio->io_ndvas; + uint64_t txg = pio->io_txg; + uint64_t resid = pio->io_size; + uint64_t lsize; + int ndvas = lio->io_prop.zp_ndvas; int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); + zio_prop_t zp; int error; - int i, d; - - gsize = SPA_GANGBLOCKSIZE; - gbps_left = SPA_GBH_NBLKPTRS; - - error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE); - if (error == ENOSPC) - panic("can't allocate gang block header"); - ASSERT(error == 0); - - for (d = 0; d < gbh_ndvas; d++) - DVA_SET_GANG(&dva[d], 1); - - bp->blk_birth = txg; - - gbh = zio_buf_alloc(gsize); - bzero(gbh, gsize); - /* We need to test multi-level gang blocks */ - if (maxalloc >= zio_gang_bang && (LBOLT & 0x1) == 0) - maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE); + error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE, + bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp, + METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); + if (error) { + pio->io_error = error; + return (ZIO_PIPELINE_CONTINUE); + } - for (loff = 0, i = 0; loff != zio->io_size; - loff += lsize, resid -= lsize, gbps_left--, i++) { - blkptr_t *gbp = &gbh->zg_blkptr[i]; - dva = gbp->blk_dva; + if (pio == lio) { + gnpp = &lio->io_gang_tree; + } else { + gnpp = pio->io_private; + ASSERT(pio->io_ready == zio_write_gang_member_ready); + } - ASSERT(gbps_left != 0); - maxalloc = MIN(maxalloc, resid); + gn = zio_gang_node_alloc(gnpp); + gbh = gn->gn_gbh; + bzero(gbh, SPA_GANGBLOCKSIZE); - while (resid <= maxalloc * gbps_left) { - error = metaslab_alloc(spa, maxalloc, gbp, ndvas, - txg, bp, B_FALSE); - if (error == 0) - break; - ASSERT3U(error, ==, ENOSPC); - if (maxalloc == SPA_MINBLOCKSIZE) - panic("really out of space"); - maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); - } + /* + * Create the gang header. + */ + zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); - if (resid <= maxalloc * gbps_left) { - lsize = maxalloc; - BP_SET_LSIZE(gbp, lsize); - BP_SET_PSIZE(gbp, lsize); - BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); - gbp->blk_birth = txg; - zio_nowait(zio_rewrite(zio, spa, - zio->io_checksum, txg, gbp, - (char *)zio->io_data + loff, lsize, - zio_write_allocate_gang_member_done, NULL, - zio->io_priority, zio->io_flags, - &zio->io_bookmark)); - } else { - lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); - ASSERT(lsize != SPA_MINBLOCKSIZE); - zio_nowait(zio_write_allocate(zio, spa, - zio->io_checksum, txg, gbp, - (char *)zio->io_data + loff, lsize, - zio_write_allocate_gang_member_done, NULL, - zio->io_priority, zio->io_flags)); - } + /* + * Create and nowait the gang children. 
+ */ + for (int g = 0; resid != 0; resid -= lsize, g++) { + lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), + SPA_MINBLOCKSIZE); + ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); + + zp.zp_checksum = lio->io_prop.zp_checksum; + zp.zp_compress = ZIO_COMPRESS_OFF; + zp.zp_type = DMU_OT_NONE; + zp.zp_level = 0; + zp.zp_ndvas = lio->io_prop.zp_ndvas; + + zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], + (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, + zio_write_gang_member_ready, NULL, &gn->gn_child[g], + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark)); } - ASSERT(resid == 0 && loff == zio->io_size); - - zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; - - zio_push_transform(zio, gbh, gsize, gsize); /* - * As much as we'd like this to be zio_wait_children_ready(), - * updating our ASIZE doesn't happen until the io_done callback, - * so we have to wait for that to finish in order for our BP - * to be stable. + * Set pio's pipeline to just wait for zio to finish. */ - zio_wait_children_done(zio); + pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + zio_nowait(zio); + + return (ZIO_PIPELINE_CONTINUE); } /* @@ -1363,59 +1565,139 @@ zio_write_allocate_gang_members(zio_t *zio) * Allocate and free blocks * ========================================================================== */ -static void + +static int zio_dva_allocate(zio_t *zio) { + spa_t *spa = zio->io_spa; + metaslab_class_t *mc = spa->spa_normal_class; blkptr_t *bp = zio->io_bp; int error; ASSERT(BP_IS_HOLE(bp)); ASSERT3U(BP_GET_NDVAS(bp), ==, 0); - ASSERT3U(zio->io_ndvas, >, 0); - ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa)); - - /* For testing, make some blocks above a certain size be gang blocks */ - if (zio->io_size >= zio_gang_bang && (LBOLT & 0x3) == 0) { - zio_write_allocate_gang_members(zio); - return; - } - + ASSERT3U(zio->io_prop.zp_ndvas, >, 0); + ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas, - zio->io_txg, NULL, B_FALSE); + error = metaslab_alloc(spa, mc, zio->io_size, bp, + zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0); - if (error == 0) { - bp->blk_birth = zio->io_txg; - } else if (error == ENOSPC) { - if (zio->io_size == SPA_MINBLOCKSIZE) - panic("really, truly out of space"); - zio_write_allocate_gang_members(zio); - return; - } else { + if (error) { + if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) + return (zio_write_gang_block(zio)); zio->io_error = error; } - zio_next_stage(zio); + + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_dva_free(zio_t *zio) { - blkptr_t *bp = zio->io_bp; + metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); - metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); + return (ZIO_PIPELINE_CONTINUE); +} - BP_ZERO(bp); +static int +zio_dva_claim(zio_t *zio) +{ + int error; - zio_next_stage(zio); + error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); + if (error) + zio->io_error = error; + + return (ZIO_PIPELINE_CONTINUE); } +/* + * Undo an allocation. This is used by zio_done() when an I/O fails + * and we want to give back the block we just allocated. + * This handles both normal blocks and gang blocks. 
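Stepping back to the gang-children loop a few lines up: it divides whatever remains evenly among the block pointers still available, rounding each piece up to SPA_MINBLOCKSIZE. A small worked example (userland, illustrative; 512 is SPA_MINBLOCKSIZE, 3 is SPA_GBH_NBLKPTRS, and P2ROUNDUP is the usual power-of-two round-up):

	#include <stdio.h>
	#include <stdint.h>

	#define	MINBS		512ULL
	#define	NBLKPTRS	3
	#define	P2ROUNDUP(x, a)	(-(-(x) & -(a)))

	int
	main(void)
	{
		uint64_t resid = 100 * 1024;	/* a 100K write being ganged */
		uint64_t lsize;

		for (int g = 0; resid != 0; resid -= lsize, g++) {
			lsize = P2ROUNDUP(resid / (NBLKPTRS - g), MINBS);
			/* prints 34304, 34304, 33792 for this input */
			printf("member %d: %llu bytes\n", g,
			    (unsigned long long)lsize);
		}
		return (0);
	}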
+ */ static void -zio_dva_claim(zio_t *zio) +zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { - zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); + spa_t *spa = zio->io_spa; + boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE); + + ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); + + if (zio->io_bp == bp && !now) { + /* + * This is a rewrite for sync-to-convergence. + * We can't do a metaslab_free(NOW) because bp wasn't allocated + * during this sync pass, which means that metaslab_sync() + * already committed the allocation. + */ + ASSERT(DVA_EQUAL(BP_IDENTITY(bp), + BP_IDENTITY(&zio->io_bp_orig))); + ASSERT(spa_sync_pass(spa) > 1); - zio_next_stage(zio); + if (BP_IS_GANG(bp) && gn == NULL) { + /* + * This is a gang leader whose gang header(s) we + * couldn't read now, so defer the free until later. + * The block should still be intact because without + * the headers, we'd never even start the rewrite. + */ + bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); + return; + } + } + + if (!BP_IS_HOLE(bp)) + metaslab_free(spa, bp, bp->blk_birth, now); + + if (gn != NULL) { + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + zio_dva_unallocate(zio, gn->gn_child[g], + &gn->gn_gbh->zg_blkptr[g]); + } + } +} + +/* + * Try to allocate an intent log block. Return 0 on success, errno on failure. + */ +int +zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, + uint64_t txg) +{ + int error; + + error = metaslab_alloc(spa, spa->spa_log_class, size, + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); + + if (error) + error = metaslab_alloc(spa, spa->spa_normal_class, size, + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); + + if (error == 0) { + BP_SET_LSIZE(new_bp, size); + BP_SET_PSIZE(new_bp, size); + BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); + BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); + BP_SET_LEVEL(new_bp, 0); + BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); + } + + return (error); +} + +/* + * Free an intent log block. We know it can't be a gang block, so there's + * nothing to do except metaslab_free() it. + */ +void +zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) +{ + ASSERT(!BP_IS_GANG(bp)); + + metaslab_free(spa, bp, txg, B_FALSE); } /* @@ -1425,150 +1707,223 @@ zio_dva_claim(zio_t *zio) */ static void -zio_vdev_io_start(zio_t *zio) +zio_vdev_io_probe_done(zio_t *zio) +{ + zio_t *dio; + vdev_t *vd = zio->io_private; + + mutex_enter(&vd->vdev_probe_lock); + ASSERT(vd->vdev_probe_zio == zio); + vd->vdev_probe_zio = NULL; + mutex_exit(&vd->vdev_probe_lock); + + while ((dio = zio->io_delegate_list) != NULL) { + zio->io_delegate_list = dio->io_delegate_next; + dio->io_delegate_next = NULL; + if (!vdev_accessible(vd, dio)) + dio->io_error = ENXIO; + zio_execute(dio); + } +} + +/* + * Probe the device to determine whether I/O failure is specific to this + * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged). + */ +static int +zio_vdev_io_probe(zio_t *zio) { vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd ? vd->vdev_top : NULL; - blkptr_t *bp = zio->io_bp; - uint64_t align; + zio_t *pio = NULL; + boolean_t created_pio = B_FALSE; - if (vd == NULL) { - /* The mirror_ops handle multiple DVAs in a single BP */ - vdev_mirror_ops.vdev_op_io_start(zio); - return; + /* + * Don't probe the probe. + */ + if (zio->io_flags & ZIO_FLAG_PROBE) + return (ZIO_PIPELINE_CONTINUE); + + /* + * To prevent 'probe storms' when a device fails, we create + * just one probe i/o at a time. 
All zios that want to probe + * this vdev will join the probe zio's io_delegate_list. + */ + mutex_enter(&vd->vdev_probe_lock); + + if ((pio = vd->vdev_probe_zio) == NULL) { + vd->vdev_probe_zio = pio = zio_root(zio->io_spa, + zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL); + created_pio = B_TRUE; + vd->vdev_probe_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_PROBE); } - align = 1ULL << tvd->vdev_ashift; + zio->io_delegate_next = pio->io_delegate_list; + pio->io_delegate_list = zio; + + mutex_exit(&vd->vdev_probe_lock); - if (zio->io_retries == 0 && vd == tvd) - zio->io_flags |= ZIO_FLAG_FAILFAST; + if (created_pio) { + zio_nowait(vdev_probe(vd, pio)); + zio_nowait(pio); + } + + return (ZIO_PIPELINE_STOP); +} + +static int +zio_vdev_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t align; + spa_t *spa = zio->io_spa; + + ASSERT(zio->io_error == 0); + ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); + + if (vd == NULL) { + if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) + spa_config_enter(spa, SCL_ZIO, zio, RW_READER); - if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && - vd->vdev_children == 0) { - zio->io_flags |= ZIO_FLAG_PHYSICAL; - zio->io_offset += VDEV_LABEL_START_SIZE; + /* + * The mirror_ops handle multiple DVAs in a single BP. + */ + return (vdev_mirror_ops.vdev_op_io_start(zio)); } + align = 1ULL << vd->vdev_top->vdev_ashift; + if (P2PHASE(zio->io_size, align) != 0) { uint64_t asize = P2ROUNDUP(zio->io_size, align); char *abuf = zio_buf_alloc(asize); - ASSERT(vd == tvd); + ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { bcopy(zio->io_data, abuf, zio->io_size); bzero(abuf + zio->io_size, asize - zio->io_size); } - zio_push_transform(zio, abuf, asize, asize); - ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); - zio->io_flags |= ZIO_FLAG_SUBBLOCK; + zio_push_transform(zio, abuf, asize, asize, zio_subblock); } ASSERT(P2PHASE(zio->io_offset, align) == 0); ASSERT(P2PHASE(zio->io_size, align) == 0); - ASSERT(bp == NULL || - P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); - vdev_io_start(zio); + if (vd->vdev_ops->vdev_op_leaf && + (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { - /* zio_next_stage_async() gets called from io completion interrupt */ -} + if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) + return (ZIO_PIPELINE_STOP); -static void -zio_vdev_io_done(zio_t *zio) -{ - if (zio->io_vd == NULL) - /* The mirror_ops handle multiple DVAs in a single BP */ - vdev_mirror_ops.vdev_op_io_done(zio); - else - vdev_io_done(zio); + if ((zio = vdev_queue_io(zio)) == NULL) + return (ZIO_PIPELINE_STOP); + + if (!vdev_accessible(vd, zio)) { + zio->io_error = ENXIO; + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); + } + + } + + return (vd->vdev_ops->vdev_op_io_start(zio)); } -/* XXPOLICY */ -boolean_t -zio_should_retry(zio_t *zio) +static int +zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; + vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; + boolean_t unexpected_error = B_FALSE; - if (zio->io_error == 0) - return (B_FALSE); - if (zio->io_delegate_list != NULL) - return (B_FALSE); - if (vd && vd != vd->vdev_top) - return (B_FALSE); - if (zio->io_flags & ZIO_FLAG_DONT_RETRY) - return (B_FALSE); - if (zio->io_retries > 0) - return (B_FALSE); + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) + return (ZIO_PIPELINE_STOP); - return (B_TRUE); + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + + if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { + + vdev_queue_io_done(zio); + + if (zio->io_type == ZIO_TYPE_WRITE) + vdev_cache_write(zio); + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injection(vd, EIO); + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_label_injection(zio, EIO); + + if (zio->io_error) { + if (!vdev_accessible(vd, zio)) { + zio->io_error = ENXIO; + } else { + unexpected_error = B_TRUE; + } + } + } + + ops->vdev_op_io_done(zio); + + if (unexpected_error) + return (zio_vdev_io_probe(zio)); + + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd ? vd->vdev_top : NULL; - - ASSERT(zio->io_vsd == NULL); - - if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { - void *abuf; - uint64_t asize; - ASSERT(vd == tvd); - zio_pop_transform(zio, &abuf, &asize, &asize); - if (zio->io_type == ZIO_TYPE_READ) - bcopy(abuf, zio->io_data, zio->io_size); - zio_buf_free(abuf, asize); - zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; + + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) + return (ZIO_PIPELINE_STOP); + + if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) + spa_config_exit(zio->io_spa, SCL_ZIO, zio); + + if (zio->io_vsd != NULL) { + zio->io_vsd_free(zio); + zio->io_vsd = NULL; } - if (zio_injection_enabled && !zio->io_error) + if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); /* * If the I/O failed, determine whether we should attempt to retry it. */ - /* XXPOLICY */ - if (zio_should_retry(zio)) { - ASSERT(tvd == vd); - - zio->io_retries++; + if (zio->io_error && vd == NULL && + !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { + ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; - zio->io_flags &= ZIO_FLAG_VDEV_INHERIT | - ZIO_FLAG_CONFIG_GRABBED; - /* XXPOLICY */ - zio->io_flags &= ~ZIO_FLAG_FAILFAST; - zio->io_flags |= ZIO_FLAG_DONT_CACHE; + zio->io_flags |= ZIO_FLAG_IO_RETRY | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + return (ZIO_PIPELINE_STOP); + } - dprintf("retry #%d for %s to %s offset %llx\n", - zio->io_retries, zio_type_name[zio->io_type], - vdev_description(vd), zio->io_offset); + /* + * If we got an error on a leaf device, convert it to ENXIO + * if the device is not accessible at all. + */ + if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && + !vdev_accessible(vd, zio)) + zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; - } + /* + * If we can't write to an interior vdev (mirror or RAID-Z), + * set vdev_cant_write so that we stop trying to allocate from it. 
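The retry path in zio_vdev_io_assess() above can be summarized in a small sketch. All names here are invented stand-ins, not the kernel's: on a retryable failure the error is cleared, the request is flagged so it will not loop forever, the stage counter is rewound to just before the vdev I/O stage, and the request is handed back to a worker queue.

	enum { STAGE_VDEV_IO_START = 11 };	/* stand-in stage number */

	#define	RF_IO_RETRY	0x1
	#define	RF_DONT_RETRY	0x2

	struct req {
		int	error;
		int	flags;
		int	stage;
	};

	static void
	dispatch(struct req *r)
	{
		/* stand-in for zio_taskq_dispatch(): requeue the request */
		(void)r;
	}

	static int
	assess(struct req *r)
	{
		if (r->error != 0 &&
		    !(r->flags & (RF_DONT_RETRY | RF_IO_RETRY))) {
			r->error = 0;
			r->flags |= RF_IO_RETRY;
			r->stage = STAGE_VDEV_IO_START - 1;
			dispatch(r);
			return (1);	/* stop: the retry owns the request */
		}
		return (0);		/* continue the pipeline */
	}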
+ */ + if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && + vd != NULL && !vd->vdev_ops->vdev_op_leaf) + vd->vdev_cant_write = B_TRUE; - if (zio->io_error != 0 && zio->io_error != ECKSUM && - !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) { - /* - * Poor man's hotplug support. Even if we're done retrying this - * I/O, try to reopen the vdev to see if it's still attached. - * To avoid excessive thrashing, we only try it once a minute. - * This also has the effect of detecting when missing devices - * have come back, by polling the device once a minute. - * - * We need to do this asynchronously because we can't grab - * all the necessary locks way down here. - */ - if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) { - vd->vdev_last_try = gethrtime(); - tvd->vdev_reopen_wanted = 1; - spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN); - } - } + if (zio->io_error) + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } void @@ -1603,49 +1958,63 @@ zio_vdev_io_bypass(zio_t *zio) * Generate and verify checksums * ========================================================================== */ -static void +static int zio_checksum_generate(zio_t *zio) { - int checksum = zio->io_checksum; blkptr_t *bp = zio->io_bp; + enum zio_checksum checksum; - ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); + if (bp == NULL) { + /* + * This is zio_write_phys(). + * We're either generating a label checksum, or none at all. + */ + checksum = zio->io_prop.zp_checksum; - BP_SET_CHECKSUM(bp, checksum); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + if (checksum == ZIO_CHECKSUM_OFF) + return (ZIO_PIPELINE_CONTINUE); - zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); + ASSERT(checksum == ZIO_CHECKSUM_LABEL); + } else { + if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { + ASSERT(!IO_IS_ALLOCATING(zio)); + checksum = ZIO_CHECKSUM_GANG_HEADER; + } else { + checksum = BP_GET_CHECKSUM(bp); + } + } - zio_next_stage(zio); + zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); + + return (ZIO_PIPELINE_CONTINUE); } -static void -zio_gang_checksum_generate(zio_t *zio) +static int +zio_checksum_verify(zio_t *zio) { - zio_cksum_t zc; - zio_gbh_phys_t *gbh = zio->io_data; - - ASSERT(BP_IS_GANG(zio->io_bp)); - ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); - - zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); + blkptr_t *bp = zio->io_bp; + int error; - zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); + if (bp == NULL) { + /* + * This is zio_read_phys(). + * We're either verifying a label checksum, or nothing at all. + */ + if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) + return (ZIO_PIPELINE_CONTINUE); - zio_next_stage(zio); -} + ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); + } -static void -zio_checksum_verify(zio_t *zio) -{ - if (zio->io_bp != NULL) { - zio->io_error = zio_checksum_error(zio); - if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) + if ((error = zio_checksum_error(zio)) != 0) { + zio->io_error = error; + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, zio->io_spa, zio->io_vd, zio, 0, 0); + } } - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } /* @@ -1658,204 +2027,263 @@ zio_checksum_verified(zio_t *zio) } /* - * Set the external verifier for a gang block based on stuff in the bp + * ========================================================================== + * Error rank. 
Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
+ * An error of 0 indicates success. ENXIO indicates whole-device failure,
+ * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
+ * indicate errors that are specific to one I/O, and most likely permanent.
+ * Any other error is presumed to be worse because we weren't expecting it.
+ * ==========================================================================
  */
-void
-zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
+int
+zio_worst_error(int e1, int e2)
 {
-	blkptr_t *bp = zio->io_bp;
+	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
+	int r1, r2;
 
-	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
-	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
-	zcp->zc_word[2] = bp->blk_birth;
-	zcp->zc_word[3] = 0;
+	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
+		if (e1 == zio_error_rank[r1])
+			break;
+
+	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
+		if (e2 == zio_error_rank[r2])
+			break;
+
+	return (r1 > r2 ? e1 : e2);
 }
 
 /*
  * ==========================================================================
- * Define the pipeline
+ * I/O completion
  * ==========================================================================
  */
-typedef void zio_pipe_stage_t(zio_t *zio);
-
-static void
-zio_badop(zio_t *zio)
+static int
+zio_ready(zio_t *zio)
 {
-	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
-}
-
-zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
-	zio_badop,
-	zio_wait_children_ready,
-	zio_write_compress,
-	zio_checksum_generate,
-	zio_gang_pipeline,
-	zio_get_gang_header,
-	zio_rewrite_gang_members,
-	zio_free_gang_members,
-	zio_claim_gang_members,
-	zio_dva_allocate,
-	zio_dva_free,
-	zio_dva_claim,
-	zio_gang_checksum_generate,
-	zio_ready,
-	zio_vdev_io_start,
-	zio_vdev_io_done,
-	zio_vdev_io_assess,
-	zio_wait_children_done,
-	zio_checksum_verify,
-	zio_read_gang_members,
-	zio_read_decompress,
-	zio_done,
-	zio_badop
-};
+	blkptr_t *bp = zio->io_bp;
+	zio_t *pio = zio->io_parent;
 
-/*
- * Move an I/O to the next stage of the pipeline and execute that stage.
- * There's no locking on io_stage because there's no legitimate way for
- * multiple threads to be attempting to process the same I/O.
- */
-void
-zio_next_stage(zio_t *zio)
-{
-	uint32_t pipeline = zio->io_pipeline;
+	if (zio->io_ready) {
+		if (BP_IS_GANG(bp) &&
+		    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
+			return (ZIO_PIPELINE_STOP);
 
-	ASSERT(!MUTEX_HELD(&zio->io_lock));
+		ASSERT(IO_IS_ALLOCATING(zio));
+		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
+		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
 
-	if (zio->io_error) {
-		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
-		    zio, vdev_description(zio->io_vd),
-		    zio->io_offset, zio->io_stage, zio->io_error);
-		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
-			pipeline &= ZIO_ERROR_PIPELINE_MASK;
+		zio->io_ready(zio);
 	}
 
-	while (((1U << ++zio->io_stage) & pipeline) == 0)
-		continue;
+	if (bp != NULL && bp != &zio->io_bp_copy)
+		zio->io_bp_copy = *bp;
 
-	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
-	ASSERT(zio->io_stalled == 0);
+	if (zio->io_error)
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
-	/*
-	 * See the comment in zio_next_stage_async() about per-CPU taskqs.
-	 */
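As a quick usage example of the ranking just defined, here is a userland fold of several child errors into the single worst one. ECKSUM is Solaris-specific, so a hypothetical value is defined for the demo only; the function body mirrors zio_worst_error() above.

	#include <errno.h>
	#include <stdio.h>

	#ifndef ECKSUM
	#define	ECKSUM	200	/* hypothetical value, demo only */
	#endif

	static int
	worst_error(int e1, int e2)
	{
		static const int rank[] = { 0, ENXIO, ECKSUM, EIO };
		int r1, r2;

		for (r1 = 0; r1 < 4; r1++)
			if (e1 == rank[r1])
				break;
		for (r2 = 0; r2 < 4; r2++)
			if (e2 == rank[r2])
				break;

		return (r1 > r2 ? e1 : e2);	/* unknown errors rank worst */
	}

	int
	main(void)
	{
		int errs[] = { EIO, 0, ENXIO, ECKSUM };
		int worst = 0;

		for (int i = 0; i < 4; i++)
			worst = worst_error(worst, errs[i]);

		printf("worst = %d (EIO = %d)\n", worst, EIO); /* EIO wins */
		return (0);
	}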
-	if (((1U << zio->io_stage) & zio->io_async_stages) &&
-	    (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
-	    !(zio->io_flags & ZIO_FLAG_METADATA)) {
-		taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
-		(void) taskq_dispatch(tq,
-		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
-	} else {
-		zio_pipeline[zio->io_stage](zio);
-	}
+	if (pio != NULL)
+		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-void
-zio_next_stage_async(zio_t *zio)
+static int
+zio_done(zio_t *zio)
 {
-	taskq_t *tq;
-	uint32_t pipeline = zio->io_pipeline;
-
-	ASSERT(!MUTEX_HELD(&zio->io_lock));
+	spa_t *spa = zio->io_spa;
+	zio_t *pio = zio->io_parent;
+	zio_t *lio = zio->io_logical;
+	blkptr_t *bp = zio->io_bp;
+	vdev_t *vd = zio->io_vd;
+	uint64_t psize = zio->io_size;
 
-	if (zio->io_error) {
-		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
-		    zio, vdev_description(zio->io_vd),
-		    zio->io_offset, zio->io_stage, zio->io_error);
-		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
-			pipeline &= ZIO_ERROR_PIPELINE_MASK;
-	}
+	/*
+	 * If our children haven't all completed,
+	 * wait for them and then repeat this pipeline stage.
+	 */
+	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
+	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
+	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
+		return (ZIO_PIPELINE_STOP);
 
-	while (((1U << ++zio->io_stage) & pipeline) == 0)
-		continue;
+	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
+		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+			ASSERT(zio->io_children[c][w] == 0);
 
-	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
-	ASSERT(zio->io_stalled == 0);
+	if (bp != NULL) {
+		ASSERT(bp->blk_pad[0] == 0);
+		ASSERT(bp->blk_pad[1] == 0);
+		ASSERT(bp->blk_pad[2] == 0);
+		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
+		    (pio != NULL && bp == pio->io_bp));
+		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
+			ASSERT(!BP_SHOULD_BYTESWAP(bp));
+			ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp));
+			ASSERT(BP_COUNT_GANG(bp) == 0 ||
+			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
+		}
+	}
 
 	/*
-	 * For performance, we'll probably want two sets of task queues:
-	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
-	 * part is for read performance: since we have to make a pass over
-	 * the data to checksum it anyway, we want to do this on the same CPU
-	 * that issued the read, because (assuming CPU scheduling affinity)
-	 * that thread is probably still there.  Getting this optimization
-	 * right avoids performance-hostile cache-to-cache transfers.
-	 *
-	 * Note that having two sets of task queues is also necessary for
-	 * correctness: if all of the issue threads get bogged down waiting
-	 * for dependent reads (e.g. metaslab freelist) to complete, then
-	 * there won't be any threads available to service I/O completion
-	 * interrupts.
+	 * If there were child vdev or gang errors, they apply to us now.
*/ - if ((1U << zio->io_stage) & zio->io_async_stages) { - if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) - tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; - else - tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; - (void) taskq_dispatch(tq, - (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); - } else { - zio_pipeline[zio->io_stage](zio); - } -} + zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); + zio_inherit_child_errors(zio, ZIO_CHILD_GANG); -static boolean_t -zio_alloc_should_fail(void) -{ - static uint16_t allocs = 0; + zio_pop_transforms(zio); /* note: may set zio->io_error */ - return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0); -} + vdev_stat_update(zio, psize); -/* - * Try to allocate an intent log block. Return 0 on success, errno on failure. - */ -int -zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, - uint64_t txg) -{ - int error; + if (zio->io_error) { + /* + * If this I/O is attached to a particular vdev, + * generate an error message describing the I/O failure + * at the block level. We ignore these errors if the + * device is currently unavailable. + */ + if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) + zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); - spa_config_enter(spa, RW_READER, FTAG); + if ((zio->io_error == EIO || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) { + /* + * For logical I/O requests, tell the SPA to log the + * error and generate a logical data ereport. + */ + spa_log_error(spa, zio); + zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, + 0, 0); + } + } - if (zio_zil_fail_shift && zio_alloc_should_fail()) { - spa_config_exit(spa, FTAG); - return (ENOSPC); + if (zio->io_error && zio == lio) { + /* + * Determine whether zio should be reexecuted. This will + * propagate all the way to the root via zio_notify_parent(). + */ + ASSERT(vd == NULL && bp != NULL); + + if (IO_IS_ALLOCATING(zio)) + if (zio->io_error != ENOSPC) + zio->io_reexecute |= ZIO_REEXECUTE_NOW; + else + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + + if ((zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_FREE) && + zio->io_error == ENXIO && + spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + + if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } /* - * We were passed the previous log blocks dva_t in bp->blk_dva[0]. + * If there were logical child errors, they apply to us now. + * We defer this until now to avoid conflating logical child + * errors with errors that happened to the zio itself when + * updating vdev stats and reporting FMA events above. */ - error = metaslab_alloc(spa, size, new_bp, 1, txg, old_bp, B_TRUE); + zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); - if (error == 0) { - BP_SET_LSIZE(new_bp, size); - BP_SET_PSIZE(new_bp, size); - BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); - BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); - BP_SET_LEVEL(new_bp, 0); - BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); - new_bp->blk_birth = txg; + if (zio->io_reexecute) { + /* + * This is a logical I/O that wants to reexecute. + * + * Reexecute is top-down. When an i/o fails, if it's not + * the root, it simply notifies its parent and sticks around. + * The parent, seeing that it still has children in zio_done(), + * does the same. This percolates all the way up to the root. + * The root i/o will reexecute or suspend the entire tree. 
+ * + * This approach ensures that zio_reexecute() honors + * all the original i/o dependency relationships, e.g. + * parents not executing until children are ready. + */ + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (IO_IS_ALLOCATING(zio)) + zio_dva_unallocate(zio, zio->io_gang_tree, bp); + + zio_gang_tree_free(&zio->io_gang_tree); + + if (pio != NULL) { + /* + * We're not a root i/o, so there's nothing to do + * but notify our parent. Don't propagate errors + * upward since we haven't permanently failed yet. + */ + zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; + zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { + /* + * We'd fail again if we reexecuted now, so suspend + * until conditions improve (e.g. device comes online). + */ + zio_suspend(spa, zio); + } else { + /* + * Reexecution is potentially a huge amount of work. + * Hand it off to the otherwise-unused claim taskq. + */ + (void) taskq_dispatch( + spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], + (task_func_t *)zio_reexecute, zio, TQ_SLEEP); + } + return (ZIO_PIPELINE_STOP); } - spa_config_exit(spa, FTAG); + ASSERT(zio->io_child == NULL); + ASSERT(zio->io_reexecute == 0); + ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); - return (error); -} + if (zio->io_done) + zio->io_done(zio); -/* - * Free an intent log block. We know it can't be a gang block, so there's - * nothing to do except metaslab_free() it. - */ -void -zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) -{ - ASSERT(!BP_IS_GANG(bp)); + zio_gang_tree_free(&zio->io_gang_tree); - spa_config_enter(spa, RW_READER, FTAG); + ASSERT(zio->io_delegate_list == NULL); + ASSERT(zio->io_delegate_next == NULL); - metaslab_free(spa, bp, txg, B_FALSE); + if (pio != NULL) { + zio_remove_child(pio, zio); + zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + } + + if (zio->io_waiter != NULL) { + mutex_enter(&zio->io_lock); + zio->io_executor = NULL; + cv_broadcast(&zio->io_cv); + mutex_exit(&zio->io_lock); + } else { + zio_destroy(zio); + } - spa_config_exit(spa, FTAG); + return (ZIO_PIPELINE_STOP); } + +/* + * ========================================================================== + * I/O pipeline definition + * ========================================================================== + */ +static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = { + NULL, + zio_issue_async, + zio_read_bp_init, + zio_write_bp_init, + zio_checksum_generate, + zio_gang_assemble, + zio_gang_issue, + zio_dva_allocate, + zio_dva_free, + zio_dva_claim, + zio_ready, + zio_vdev_io_start, + zio_vdev_io_done, + zio_vdev_io_assess, + zio_checksum_verify, + zio_done +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c index f0d9a1463580..bf7fe733fe0c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/zio.h> @@ -96,25 +94,59 @@ zio_checksum_select(uint8_t child, uint8_t parent) } /* + * Set the external verifier for a gang block based on <vdev, offset, txg>, + * a tuple which is guaranteed to be unique for the life of the pool. 
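The verifier construction described here is simple enough to show directly; the old zio_set_gang_verifier() removed earlier in this diff packed exactly these words, and the new code does the same via ZIO_SET_CHECKSUM(). A standalone rendering, with a local cksum type standing in for zio_cksum_t:

	#include <stdint.h>

	typedef struct cksum {
		uint64_t	word[4];
	} cksum_t;

	static void
	set_gang_verifier(cksum_t *zcp, uint64_t vdev, uint64_t offset,
	    uint64_t txg)
	{
		zcp->word[0] = vdev;	/* DVA_GET_VDEV(BP_IDENTITY(bp)) */
		zcp->word[1] = offset;	/* DVA_GET_OFFSET(BP_IDENTITY(bp)) */
		zcp->word[2] = txg;	/* bp->blk_birth */
		zcp->word[3] = 0;
	}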
+ */ +static void +zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp) +{ + dva_t *dva = BP_IDENTITY(bp); + uint64_t txg = bp->blk_birth; + + ASSERT(BP_IS_GANG(bp)); + + ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); +} + +/* + * Set the external verifier for a label block based on its offset. + * The vdev is implicit, and the txg is unknowable at pool open time -- + * hence the logic in vdev_uberblock_load() to find the most recent copy. + */ +static void +zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) +{ + ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); +} + +/* * Generate the checksum. */ void -zio_checksum(uint_t checksum, zio_cksum_t *zcp, void *data, uint64_t size) +zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, + void *data, uint64_t size) { + blkptr_t *bp = zio->io_bp; + uint64_t offset = zio->io_offset; zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t zbt_cksum; - ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); + ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); ASSERT(ci->ci_func[0] != NULL); if (ci->ci_zbt) { - *zcp = zbt->zbt_cksum; + if (checksum == ZIO_CHECKSUM_GANG_HEADER) + zio_checksum_gang_verifier(&zbt->zbt_cksum, bp); + else if (checksum == ZIO_CHECKSUM_LABEL) + zio_checksum_label_verifier(&zbt->zbt_cksum, offset); + else + bp->blk_cksum = zbt->zbt_cksum; zbt->zbt_magic = ZBT_MAGIC; ci->ci_func[0](data, size, &zbt_cksum); zbt->zbt_cksum = zbt_cksum; } else { - ci->ci_func[0](data, size, zcp); + ci->ci_func[0](data, size, &bp->blk_cksum); } } @@ -122,47 +154,49 @@ int zio_checksum_error(zio_t *zio) { blkptr_t *bp = zio->io_bp; - zio_cksum_t zc = bp->blk_cksum; - uint_t checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : - BP_GET_CHECKSUM(bp); - int byteswap = BP_SHOULD_BYTESWAP(bp); + uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + int byteswap; void *data = zio->io_data; - uint64_t size = ZIO_GET_IOSIZE(zio); + uint64_t size = (bp == NULL ? zio->io_size : + (BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
+	uint64_t offset = zio->io_offset;
 	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
-	zio_cksum_t actual_cksum, expected_cksum;
+	zio_cksum_t actual_cksum, expected_cksum, verifier;
 
 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
 		return (EINVAL);
 
 	if (ci->ci_zbt) {
 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
-			zio_set_gang_verifier(zio, &zc);
+			zio_checksum_gang_verifier(&verifier, bp);
+		else if (checksum == ZIO_CHECKSUM_LABEL)
+			zio_checksum_label_verifier(&verifier, offset);
+		else
+			verifier = bp->blk_cksum;
+
+		byteswap = (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC));
 
-		if (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)) {
-			expected_cksum = zbt->zbt_cksum;
+		if (byteswap)
+			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
+
+		expected_cksum = zbt->zbt_cksum;
+		zbt->zbt_cksum = verifier;
+		ci->ci_func[byteswap](data, size, &actual_cksum);
+		zbt->zbt_cksum = expected_cksum;
+
+		if (byteswap)
 			byteswap_uint64_array(&expected_cksum,
 			    sizeof (zio_cksum_t));
-			zbt->zbt_cksum = zc;
-			byteswap_uint64_array(&zbt->zbt_cksum,
-			    sizeof (zio_cksum_t));
-			ci->ci_func[1](data, size, &actual_cksum);
-			zbt->zbt_cksum = expected_cksum;
-			byteswap_uint64_array(&zbt->zbt_cksum,
-			    sizeof (zio_cksum_t));
-		} else {
-			expected_cksum = zbt->zbt_cksum;
-			zbt->zbt_cksum = zc;
-			ci->ci_func[0](data, size, &actual_cksum);
-			zbt->zbt_cksum = expected_cksum;
-		}
-		zc = expected_cksum;
 	} else {
 		ASSERT(!BP_IS_GANG(bp));
+		byteswap = BP_SHOULD_BYTESWAP(bp);
+		expected_cksum = bp->blk_cksum;
 		ci->ci_func[byteswap](data, size, &actual_cksum);
 	}
 
-	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc))
+	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
 		return (ECKSUM);
 
 	if (zio_injection_enabled && !zio->io_error)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
index 4cada09d835c..b3469fdd5c24 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * ZFS fault injection
  *
@@ -47,6 +45,7 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
 
 uint32_t zio_injection_enabled;
 
@@ -145,6 +144,56 @@ zio_handle_fault_injection(zio_t *zio, int error)
 	return (ret);
 }
 
+/*
+ * Determine if the zio is part of a label update and has an injection
+ * handler associated with that portion of the label. Currently, we
+ * allow error injection in either the nvlist or the uberblock region
+ * of the vdev label.
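A hedged sketch of the label-offset mapping used by the injection code below: ZFS keeps four 256K labels, two at the front of the device and two at the end, and a relative offset within label l maps to an absolute device offset as follows (this mirrors the shape of vdev_label_offset(), including aligning psize down to a label multiple; names here are local stand-ins):

	#include <stdint.h>

	#define	LABEL_SIZE	(256ULL << 10)	/* 256K per label */
	#define	NLABELS		4
	#define	P2ALIGN(x, a)	((x) & -(a))

	static uint64_t
	label_offset(uint64_t psize, int l, uint64_t offset)
	{
		uint64_t osize = P2ALIGN(psize, LABEL_SIZE);

		return (offset + l * LABEL_SIZE + (l < NLABELS / 2 ? 0 :
		    osize - NLABELS * LABEL_SIZE));
	}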
+ */ +int +zio_handle_label_injection(zio_t *zio, int error) +{ + inject_handler_t *handler; + vdev_t *vd = zio->io_vd; + uint64_t offset = zio->io_offset; + int label; + int ret = 0; + + if (offset + zio->io_size > VDEV_LABEL_START_SIZE && + offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + uint64_t start = handler->zi_record.zi_start; + uint64_t end = handler->zi_record.zi_end; + + /* Ignore device only faults */ + if (handler->zi_record.zi_start == 0) + continue; + + /* + * The injection region is the relative offsets within a + * vdev label. We must determine the label which is being + * updated and adjust our region accordingly. + */ + label = vdev_label_number(vd->vdev_psize, offset); + start = vdev_label_offset(vd->vdev_psize, label, start); + end = vdev_label_offset(vd->vdev_psize, label, end); + + if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid && + (offset >= start && offset <= end)) { + ret = error; + break; + } + } + rw_exit(&inject_lock); + return (ret); +} + + int zio_handle_device_injection(vdev_t *vd, int error) { @@ -156,6 +205,10 @@ zio_handle_device_injection(vdev_t *vd, int error) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { + /* Ignore label specific faults */ + if (handler->zi_record.zi_start != 0) + continue; + if (vd->vdev_guid == handler->zi_record.zi_guid) { if (handler->zi_record.zi_error == error) { /* @@ -230,7 +283,7 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) * fault injection isn't a performance critical path. */ if (flags & ZINJECT_FLUSH_ARC) - arc_flush(); + arc_flush(NULL); return (0); } @@ -304,6 +357,7 @@ zio_clear_fault(int id) void zio_inject_init(void) { + rw_init(&inject_lock, NULL, RW_DEFAULT, NULL); list_create(&inject_handlers, sizeof (inject_handler_t), offsetof(inject_handler_t, zi_link)); } @@ -312,4 +366,5 @@ void zio_inject_fini(void) { list_destroy(&inject_handlers); + rw_destroy(&inject_lock); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c index fedae03e5107..db0ebf29b7ca 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -23,12 +23,10 @@ * All rights reserved. */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * ZFS volume emulation driver. * @@ -57,6 +55,9 @@ #include <sys/zap.h> #include <sys/spa.h> #include <sys/zio.h> +#include <sys/dmu_traverse.h> +#include <sys/dnode.h> +#include <sys/dsl_dataset.h> #include <sys/dsl_prop.h> #include <sys/dkio.h> #include <sys/byteorder.h> @@ -69,10 +70,14 @@ #include <sys/refcount.h> #include <sys/zfs_znode.h> #include <sys/zfs_rlock.h> +#include <sys/vdev_impl.h> +#include <sys/zvol.h> #include <geom/geom.h> #include "zfs_namecheck.h" +#define ZVOL_DUMPSIZE "dumpsize" + struct g_class zfs_zvol_class = { .name = "ZFS::ZVOL", .version = G_VERSION, @@ -80,11 +85,31 @@ struct g_class zfs_zvol_class = { DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); -#define ZVOL_OBJ 1ULL -#define ZVOL_ZAP_OBJ 2ULL - +/* + * This lock protects the zvol_state structure from being modified + * while it's being used, e.g. 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
index fedae03e5107..db0ebf29b7ca 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -23,12 +23,10 @@
  * All rights reserved.
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * ZFS volume emulation driver.
  *
@@ -57,6 +55,9 @@
 #include <sys/zap.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dnode.h>
+#include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dkio.h>
 #include <sys/byteorder.h>
@@ -69,10 +70,14 @@
 #include <sys/refcount.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_rlock.h>
+#include <sys/vdev_impl.h>
+#include <sys/zvol.h>
 #include <geom/geom.h>
 
 #include "zfs_namecheck.h"
 
+#define	ZVOL_DUMPSIZE	"dumpsize"
+
 struct g_class zfs_zvol_class = {
 	.name = "ZFS::ZVOL",
 	.version = G_VERSION,
@@ -80,11 +85,31 @@ struct g_class zfs_zvol_class = {
 
 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
 
-#define	ZVOL_OBJ	1ULL
-#define	ZVOL_ZAP_OBJ	2ULL
-
+/*
+ * This lock protects the zvol_state structure from being modified
+ * while it's being used, e.g. an open that comes in before a create
+ * finishes. It also protects temporary opens of the dataset so that,
+ * e.g., an open doesn't get a spurious EBUSY.
+ */
+static kmutex_t zvol_state_lock;
 static uint32_t zvol_minors;
 
+#define	NUM_EXTENTS	((SPA_MAXBLOCKSIZE) / sizeof (zvol_extent_t))
+
+typedef struct zvol_extent {
+	dva_t		ze_dva;		/* dva associated with this extent */
+	uint64_t	ze_stride;	/* extent stride */
+	uint64_t	ze_size;	/* number of blocks in extent */
+} zvol_extent_t;
+
+/*
+ * The list of extents associated with the dump device
+ */
+typedef struct zvol_ext_list {
+	zvol_extent_t	zl_extents[NUM_EXTENTS];
+	struct zvol_ext_list *zl_next;
+} zvol_ext_list_t;
+
 /*
  * The in-core state of each volume.
  */
@@ -94,11 +119,12 @@ typedef struct zvol_state {
 	uint64_t	zv_volblocksize; /* volume block size */
 	struct g_provider *zv_provider;	/* GEOM provider */
 	uint8_t		zv_min_bs;	/* minimum addressable block shift */
-	uint8_t		zv_readonly;	/* hard readonly; like write-protect */
+	uint8_t		zv_flags;	/* readonly; dumpified */
 	objset_t	*zv_objset;	/* objset handle */
 	uint32_t	zv_mode;	/* DS_MODE_* flags at open time */
 	uint32_t	zv_total_opens;	/* total open count */
 	zilog_t		*zv_zilog;	/* ZIL handle */
+	zvol_ext_list_t	*zv_list;	/* List of extents for dump */
 	uint64_t	zv_txg_assign;	/* txg to assign during ZIL replay */
 	znode_t		zv_znode;	/* for range locking */
 	int		zv_state;
@@ -107,11 +133,28 @@ typedef struct zvol_state {
 } zvol_state_t;
 
 /*
+ * zvol specific flags
+ */
+#define	ZVOL_RDONLY	0x1
+#define	ZVOL_DUMPIFIED	0x2
+#define	ZVOL_EXCL	0x4
+
+/*
  * zvol maximum transfer in one DMU tx.
  */
 int zvol_maxphys = DMU_MAX_ACCESS/2;
 
+extern int zfs_set_prop_nvlist(const char *, nvlist_t *);
 static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
+static int zvol_dumpify(zvol_state_t *zv);
+static int zvol_dump_fini(zvol_state_t *zv);
+static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
+
+static void
+zvol_size_changed(zvol_state_t *zv, major_t maj)
+{
+
+}
 
 int
 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
@@ -145,7 +188,10 @@ zvol_readonly_changed_cb(void *arg, uint64_t newval)
 {
 	zvol_state_t *zv = arg;
 
-	zv->zv_readonly = (uint8_t)newval;
+	if (newval)
+		zv->zv_flags |= ZVOL_RDONLY;
+	else
+		zv->zv_flags &= ~ZVOL_RDONLY;
 }
 
 int
@@ -179,6 +225,7 @@ zvol_minor_lookup(const char *name)
 	struct g_geom *gp;
 
 	g_topology_assert();
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
 
 	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
 		LIST_FOREACH(pp, &gp->provider, provider) {
@@ -196,21 +243,29 @@ zvol_access(struct g_provider *pp, int acr, int acw, int ace)
 	zvol_state_t *zv;
 
 	g_topology_assert();
+	mutex_enter(&zvol_state_lock);
 
 	zv = pp->private;
 	if (zv == NULL) {
 		if (acr <= 0 && acw <= 0 && ace <= 0)
 			return (0);
+		mutex_exit(&zvol_state_lock);
 		return (pp->error);
 	}
 
 	ASSERT(zv->zv_objset != NULL);
 
-	if (acw > 0 && (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)))
+	if (acw > 0 &&
+	    ((zv->zv_flags & ZVOL_RDONLY) ||
+	    (zv->zv_mode & DS_MODE_READONLY))) {
+		mutex_exit(&zvol_state_lock);
 		return (EROFS);
+	}
 
 	zv->zv_total_opens += acr + acw + ace;
 
+	mutex_exit(&zvol_state_lock);
+
 	return (0);
 }
 
@@ -324,8 +379,12 @@ zvol_serve_one(zvol_state_t *zv, struct bio *bp)
 				dmu_tx_commit(tx);
 			}
 		}
-		if (error)
+		if (error) {
+			/* convert checksum errors into IO errors */
+			if (error == ECKSUM)
+				error = EIO;
 			break;
+		}
 		off += size;
 		addr += size;
 		resid -= size;
@@ -368,7 +427,7 @@ zvol_worker(void *arg)
 			break;
 		}
 
-		if (bp->bio_cmd != BIO_READ && !zil_disable)
+		if (bp->bio_cmd == BIO_FLUSH && !zil_disable)
 			zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
 
 		g_io_deliver(bp, bp->bio_error);
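
zvol_serve_one() above moves each request in bounded chunks, wraps every write chunk in its own transaction, and flattens ECKSUM into EIO before completing the BIO. A minimal model of that loop, with a hypothetical chunk_write() standing in for the dmu_tx_create()/dmu_tx_hold_write()/dmu_write()/dmu_tx_commit() sequence, and a locally defined ECKSUM (the real value is private to ZFS):

#include <stddef.h>
#include <stdint.h>
#include <errno.h>

#define	ECKSUM	122	/* stand-in; ZFS uses a private checksum errno */

/* Hypothetical one-chunk transactional write (create/hold/assign/commit). */
int chunk_write(uint64_t off, size_t len, const char *src);

/*
 * Move a request in bounded chunks, one transaction per chunk, and
 * surface checksum failures as plain I/O errors, as the loop above does.
 */
static int
serve_write(uint64_t off, size_t resid, const char *addr, size_t maxphys)
{
	while (resid != 0) {
		size_t size = resid < maxphys ? resid : maxphys;
		int error = chunk_write(off, size, addr);

		if (error) {
			if (error == ECKSUM)
				error = EIO;
			return (error);
		}
		off += size;
		addr += size;
		resid -= size;
	}
	return (0);
}
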
@@ -376,25 +435,152 @@
 }
 
 void
-zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
+zvol_init_extent(zvol_extent_t *ze, blkptr_t *bp)
 {
-	zfs_create_data_t *zc = arg;
+	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
+	ze->ze_stride = 0;
+	ze->ze_size = 1;
+}
+
+/* extent mapping arg */
+struct maparg {
+	zvol_ext_list_t	*ma_list;
+	zvol_extent_t	*ma_extent;
+	int		ma_gang;
+};
+
+/*ARGSUSED*/
+static int
+zvol_map_block(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+	zbookmark_t *zb = &bc->bc_bookmark;
+	blkptr_t *bp = &bc->bc_blkptr;
+	void *data = bc->bc_data;
+	dnode_phys_t *dnp = bc->bc_dnode;
+	struct maparg *ma = (struct maparg *)arg;
+	uint64_t stride;
+
+	/* If there is an error, then keep trying to make progress */
+	if (bc->bc_errno)
+		return (ERESTART);
+
+#ifdef ZFS_DEBUG
+	if (zb->zb_level == -1) {
+		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
+		ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
+	} else {
+		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
+		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+	}
+
+	if (zb->zb_level > 0) {
+		uint64_t fill = 0;
+		blkptr_t *bpx, *bpend;
+
+		for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx);
+		    bpx < bpend; bpx++) {
+			if (bpx->blk_birth != 0) {
+				fill += bpx->blk_fill;
+			} else {
+				ASSERT(bpx->blk_fill == 0);
+			}
+		}
+		ASSERT3U(fill, ==, bp->blk_fill);
+	}
+
+	if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) {
+		uint64_t fill = 0;
+		dnode_phys_t *dnx, *dnend;
+
+		for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp) >> DNODE_SHIFT);
+		    dnx < dnend; dnx++) {
+			if (dnx->dn_type != DMU_OT_NONE)
+				fill++;
+		}
+		ASSERT3U(fill, ==, bp->blk_fill);
+	}
+#endif
+
+	if (zb->zb_level || dnp->dn_type == DMU_OT_DNODE)
+		return (0);
+
+	/* Abort immediately if we have encountered gang blocks */
+	if (BP_IS_GANG(bp)) {
+		ma->ma_gang++;
+		return (EINTR);
+	}
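
zvol_map_block() above (continued below) coalesces successive block pointers into (dva, stride, size) extents: the first block seeds the extent, the second block fixes the stride, and later blocks extend the extent only while they land exactly one stride apart on the same vdev. A self-contained sketch of that accumulation rule; extent_t and extent_add() are illustrative names, not the driver's types:

#include <stdint.h>

/* Simplified extent: first-block offset, stride between blocks, count. */
typedef struct extent {
	uint64_t e_vdev;
	uint64_t e_start;
	uint64_t e_stride;
	uint64_t e_count;
} extent_t;

/*
 * Fold the next allocated block (vdev, offset) into the current extent
 * if it continues the run; return 0 on success, or -1 when the caller
 * must start a new extent (as the list-growing code below does).
 */
static int
extent_add(extent_t *e, uint64_t vdev, uint64_t offset)
{
	uint64_t next;

	if (e->e_count == 0) {			/* first block */
		e->e_vdev = vdev;
		e->e_start = offset;
		e->e_stride = 0;
		e->e_count = 1;
		return (0);
	}
	if (vdev == e->e_vdev && e->e_stride == 0) {	/* second block */
		e->e_stride = offset - e->e_start;
		e->e_count++;
		return (0);
	}
	next = e->e_start + e->e_count * e->e_stride;
	if (vdev == e->e_vdev && offset == next) {	/* same stride */
		e->e_count++;
		return (0);
	}
	return (-1);
}
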
+	/* first time? */
+	if (ma->ma_extent->ze_size == 0) {
+		zvol_init_extent(ma->ma_extent, bp);
+		return (0);
+	}
+
+	stride = (DVA_GET_OFFSET(&bp->blk_dva[0])) -
+	    ((DVA_GET_OFFSET(&ma->ma_extent->ze_dva)) +
+	    (ma->ma_extent->ze_size - 1) * (ma->ma_extent->ze_stride));
+	if (DVA_GET_VDEV(BP_IDENTITY(bp)) ==
+	    DVA_GET_VDEV(&ma->ma_extent->ze_dva)) {
+		if (ma->ma_extent->ze_stride == 0) {
+			/* second block in this extent */
+			ma->ma_extent->ze_stride = stride;
+			ma->ma_extent->ze_size++;
+			return (0);
+		} else if (ma->ma_extent->ze_stride == stride) {
+			/*
+			 * the block we allocated has the same
+			 * stride
+			 */
+			ma->ma_extent->ze_size++;
+			return (0);
+		}
+	}
+
+	/*
+	 * dtrace -n 'zfs-dprintf
+	 * /stringof(arg0) == "zvol.c"/
+	 * {
+	 *	printf("%s: %s", stringof(arg1), stringof(arg3))
+	 * } '
+	 */
+	dprintf("ma_extent 0x%lx mrstride 0x%lx stride %lx\n",
+	    ma->ma_extent->ze_size, ma->ma_extent->ze_stride, stride);
+	dprintf_bp(bp, "%s", "next blkptr:");
+	/* start a new extent */
+	if (ma->ma_extent == &ma->ma_list->zl_extents[NUM_EXTENTS - 1]) {
+		ma->ma_list->zl_next = kmem_zalloc(sizeof (zvol_ext_list_t),
+		    KM_SLEEP);
+		ma->ma_list = ma->ma_list->zl_next;
+		ma->ma_extent = &ma->ma_list->zl_extents[0];
+	} else {
+		ma->ma_extent++;
+	}
+	zvol_init_extent(ma->ma_extent, bp);
+	return (0);
+}
+
+/* ARGSUSED */
+void
+zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+	zfs_creat_t *zct = arg;
+	nvlist_t *nvprops = zct->zct_props;
 	int error;
 	uint64_t volblocksize, volsize;
 
-	VERIFY(nvlist_lookup_uint64(zc->zc_props,
+	VERIFY(nvlist_lookup_uint64(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
-	if (nvlist_lookup_uint64(zc->zc_props,
+	if (nvlist_lookup_uint64(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
 		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
 
 	/*
-	 * These properites must be removed from the list so the generic
+	 * These properties must be removed from the list so the generic
 	 * property setting step won't apply to them.
 	 */
-	VERIFY(nvlist_remove_all(zc->zc_props,
+	VERIFY(nvlist_remove_all(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
-	(void) nvlist_remove_all(zc->zc_props,
+	(void) nvlist_remove_all(nvprops,
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
 
 	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
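
zvol_create_cb() above pulls volsize and volblocksize out of the caller's property list before the generic property-setting pass runs, because those two are consumed right here. The same extract-and-remove step, modeled on a flat array instead of an nvlist; prop_t and prop_extract() are illustrative, not libnvpair:

#include <stdint.h>
#include <string.h>

typedef struct prop {
	const char *p_name;
	uint64_t p_val;
	int p_valid;
} prop_t;

/*
 * Pull one property out of a list so a later generic "apply everything"
 * pass won't see it; returns nonzero if the property was present.
 */
static int
prop_extract(prop_t *props, int nprops, const char *name, uint64_t *valp)
{
	for (int i = 0; i < nprops; i++) {
		if (props[i].p_valid && strcmp(props[i].p_name, name) == 0) {
			*valp = props[i].p_val;
			props[i].p_valid = 0;
			return (1);
		}
	}
	return (0);
}
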
@@ -467,10 +653,110 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
 };
 
 /*
- * Create a minor node for the specified volume.
+ * reconstruct dva that gets us to the desired offset (offset
+ * is in bytes)
 */
 int
-zvol_create_minor(const char *name, dev_t dev)
+zvol_get_dva(zvol_state_t *zv, uint64_t offset, dva_t *dva)
+{
+	zvol_ext_list_t	*zl;
+	zvol_extent_t	*ze;
+	int		idx;
+	uint64_t	tmp;
+
+	if ((zl = zv->zv_list) == NULL)
+		return (EIO);
+	idx = 0;
+	ze = &zl->zl_extents[0];
+	while (offset >= ze->ze_size * zv->zv_volblocksize) {
+		offset -= ze->ze_size * zv->zv_volblocksize;
+
+		if (idx == NUM_EXTENTS - 1) {
+			/* we've reached the end of this array */
+			ASSERT(zl->zl_next != NULL);
+			if (zl->zl_next == NULL)
+				return (-1);
+			zl = zl->zl_next;
+			ze = &zl->zl_extents[0];
+			idx = 0;
+		} else {
+			ze++;
+			idx++;
+		}
+	}
+	DVA_SET_VDEV(dva, DVA_GET_VDEV(&ze->ze_dva));
+	tmp = DVA_GET_OFFSET((&ze->ze_dva));
+	tmp += (ze->ze_stride * (offset / zv->zv_volblocksize));
+	DVA_SET_OFFSET(dva, tmp);
+	return (0);
+}
+
+static void
+zvol_free_extents(zvol_state_t *zv)
+{
+	zvol_ext_list_t *zl;
+	zvol_ext_list_t *tmp;
+
+	if (zv->zv_list != NULL) {
+		zl = zv->zv_list;
+		while (zl != NULL) {
+			tmp = zl->zl_next;
+			kmem_free(zl, sizeof (zvol_ext_list_t));
+			zl = tmp;
+		}
+		zv->zv_list = NULL;
+	}
+}
+
+int
+zvol_get_lbas(zvol_state_t *zv)
+{
+	struct maparg	ma;
+	zvol_ext_list_t	*zl;
+	zvol_extent_t	*ze;
+	uint64_t	blocks = 0;
+	int		err;
+
+	ma.ma_list = zl = kmem_zalloc(sizeof (zvol_ext_list_t), KM_SLEEP);
+	ma.ma_extent = &ma.ma_list->zl_extents[0];
+	ma.ma_gang = 0;
+	zv->zv_list = ma.ma_list;
+
+	err = traverse_zvol(zv->zv_objset, ADVANCE_PRE, zvol_map_block, &ma);
+	if (err == EINTR && ma.ma_gang) {
+		/*
+		 * We currently don't support dump devices when the pool
+		 * is so fragmented that our allocation has resulted in
+		 * gang blocks.
+		 */
+		zvol_free_extents(zv);
+		return (EFRAGS);
+	}
+	ASSERT3U(err, ==, 0);
+
+	ze = &zl->zl_extents[0];
+	while (ze) {
+		blocks += ze->ze_size;
+		if (ze == &zl->zl_extents[NUM_EXTENTS - 1]) {
+			zl = zl->zl_next;
+			ze = &zl->zl_extents[0];
+		} else {
+			ze++;
+		}
+	}
+	if (blocks != (zv->zv_volsize / zv->zv_volblocksize)) {
+		zvol_free_extents(zv);
+		return (EIO);
+	}
+
+	return (0);
+}
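
zvol_get_dva() above resolves a logical byte offset by walking the chained fixed-size extent arrays and then applying the matching extent's stride. The sketch below reproduces the walk with simplified types; ext_t, resolve() and NUM_EXT are illustrative, and only the arithmetic mirrors the code above:

#include <stddef.h>
#include <stdint.h>

#define	NUM_EXT	16	/* extents per chunk; the driver derives its own */

typedef struct ext {
	uint64_t e_start;	/* device offset of the first block */
	uint64_t e_stride;	/* distance between consecutive blocks */
	uint64_t e_nblks;	/* blocks covered by this extent */
} ext_t;

typedef struct ext_list {
	ext_t el_ext[NUM_EXT];
	struct ext_list *el_next;
} ext_list_t;

/*
 * Resolve a logical byte offset to a device offset by walking the
 * chained extent arrays, then adding stride * (block index) within
 * the extent that covers it.
 */
static int
resolve(ext_list_t *el, uint64_t off, uint64_t blksz, uint64_t *devoff)
{
	int idx = 0;

	while (el != NULL && off >= el->el_ext[idx].e_nblks * blksz) {
		off -= el->el_ext[idx].e_nblks * blksz;
		if (++idx == NUM_EXT) {		/* spill into the next chunk */
			el = el->el_next;
			idx = 0;
		}
	}
	if (el == NULL)
		return (-1);
	*devoff = el->el_ext[idx].e_start +
	    (off / blksz) * el->el_ext[idx].e_stride;
	return (0);
}
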
+
+/*
+ * Create a minor node (plus a whole lot more) for the specified volume.
+ */
+int
+zvol_create_minor(const char *name, major_t maj)
 {
 	struct g_provider *pp;
 	struct g_geom *gp;
@@ -478,11 +764,12 @@ zvol_create_minor(const char *name, dev_t dev)
 	objset_t *os;
 	dmu_object_info_t doi;
 	uint64_t volsize;
-	int ds_mode = DS_MODE_PRIMARY;
+	int ds_mode = DS_MODE_OWNER;
 	int error;
 
 	DROP_GIANT();
 	g_topology_lock();
+	mutex_enter(&zvol_state_lock);
 
 	if ((zv = zvol_minor_lookup(name)) != NULL) {
 		error = EEXIST;
@@ -496,11 +783,7 @@ zvol_create_minor(const char *name, dev_t dev)
 	if (error)
 		goto end;
 
-	g_topology_unlock();
-	PICKUP_GIANT();
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
-	DROP_GIANT();
-	g_topology_lock();
 	if (error) {
 		dmu_objset_close(os);
 		goto end;
@@ -524,14 +807,12 @@ zvol_create_minor(const char *name, dev_t dev)
 	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
 	    sizeof (rl_t), offsetof(rl_t, r_node));
-
-	/* get and cache the blocksize */
 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
 	ASSERT(error == 0);
 	zv->zv_volblocksize = doi.doi_data_block_size;
 
-	zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector);
+	zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL);
 
 	/* XXX this should handle the possible i/o error */
 	VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
@@ -547,6 +828,7 @@ zvol_create_minor(const char *name, dev_t dev)
 	zvol_minors++;
 end:
+	mutex_exit(&zvol_state_lock);
 	g_topology_unlock();
 	PICKUP_GIANT();
 
@@ -565,6 +847,7 @@ zvol_remove_minor(const char *name)
 
 	DROP_GIANT();
 	g_topology_lock();
+	mutex_enter(&zvol_state_lock);
 
 	if ((zv = zvol_minor_lookup(name)) == NULL) {
 		error = ENXIO;
@@ -602,6 +885,7 @@ zvol_remove_minor(const char *name)
 	zvol_minors--;
 end:
+	mutex_exit(&zvol_state_lock);
 	g_topology_unlock();
 	PICKUP_GIANT();
 
@@ -609,55 +893,143 @@ end:
 }
 
 int
-zvol_set_volsize(const char *name, dev_t dev, uint64_t volsize)
+zvol_prealloc(zvol_state_t *zv)
+{
+	objset_t *os = zv->zv_objset;
+	dmu_tx_t *tx;
+	void *data;
+	uint64_t refd, avail, usedobjs, availobjs;
+	uint64_t resid = zv->zv_volsize;
+	uint64_t off = 0;
+
+	/* Check the space usage before attempting to allocate the space */
+	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
+	if (avail < zv->zv_volsize)
+		return (ENOSPC);
+
+	/* Free old extents if they exist */
+	zvol_free_extents(zv);
+
+	/* allocate the blocks by writing each one */
+	data = kmem_zalloc(SPA_MAXBLOCKSIZE, KM_SLEEP);
+
+	while (resid != 0) {
+		int error;
+		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
+
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			kmem_free(data, SPA_MAXBLOCKSIZE);
+			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
+			return (error);
+		}
+		dmu_write(os, ZVOL_OBJ, off, bytes, data, tx);
+		dmu_tx_commit(tx);
+		off += bytes;
+		resid -= bytes;
+	}
+	kmem_free(data, SPA_MAXBLOCKSIZE);
+	txg_wait_synced(dmu_objset_pool(os), 0);
+
+	return (0);
+}
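
zvol_prealloc() above forces every block of the volume to be allocated by writing zero-filled chunks across it, and rolls back by freeing the already-written range if any chunk fails. Its shape in miniature, with hypothetical backend_write()/backend_free() hooks in place of dmu_write() and dmu_free_long_range():

#include <stdint.h>
#include <stdlib.h>

#define	CHUNK	(128 * 1024)	/* arbitrary chunk size for the sketch */

/* Hypothetical backend hooks. */
int backend_write(uint64_t off, const void *buf, size_t len);
int backend_free(uint64_t off, uint64_t len);

/*
 * Force allocation by writing zeros across the whole volume, undoing
 * the partial work if any write fails.
 */
static int
prealloc(uint64_t volsize)
{
	void *zeros = calloc(1, CHUNK);
	uint64_t off = 0, resid = volsize;
	int error = 0;

	if (zeros == NULL)
		return (-1);
	while (resid != 0) {
		size_t bytes = resid < CHUNK ? resid : CHUNK;

		if ((error = backend_write(off, zeros, bytes)) != 0) {
			(void) backend_free(0, off);	/* roll back */
			break;
		}
		off += bytes;
		resid -= bytes;
	}
	free(zeros);
	return (error);
}
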
+
+int
+zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize)
 {
-	zvol_state_t *zv;
 	dmu_tx_t *tx;
 	int error;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+
+	tx = dmu_tx_create(zv->zv_objset);
+	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		return (error);
+	}
+
+	error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
+	    &volsize, tx);
+	dmu_tx_commit(tx);
+
+	if (error == 0)
+		error = dmu_free_long_range(zv->zv_objset,
+		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
+
+	/*
+	 * If we are using a faked-up state (zv_provider == NULL) then don't
+	 * try to update the in-core zvol state.
+	 */
+	if (error == 0 && zv->zv_provider) {
+		zv->zv_volsize = volsize;
+		zvol_size_changed(zv, maj);
+	}
+	return (error);
+}
+
+int
+zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
+{
+	zvol_state_t *zv;
+	int error;
 	dmu_object_info_t doi;
+	uint64_t old_volsize = 0ULL;
+	zvol_state_t state = { 0 };
 
 	DROP_GIANT();
 	g_topology_lock();
+	mutex_enter(&zvol_state_lock);
 
 	if ((zv = zvol_minor_lookup(name)) == NULL) {
-		error = ENXIO;
-		goto end;
+		/*
+		 * If we are doing a "zfs clone -o volsize=", then the
+		 * minor node won't exist yet.
+		 */
+		error = dmu_objset_open(name, DMU_OST_ZVOL, DS_MODE_OWNER,
+		    &state.zv_objset);
+		if (error != 0)
+			goto out;
+		zv = &state;
 	}
+	old_volsize = zv->zv_volsize;
 
 	if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
 	    (error = zvol_check_volsize(volsize,
-	    doi.doi_data_block_size)) != 0) {
-		goto end;
-	}
+	    doi.doi_data_block_size)) != 0)
+		goto out;
 
-	if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
+	if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) {
 		error = EROFS;
-		goto end;
+		goto out;
 	}
 
-	tx = dmu_tx_create(zv->zv_objset);
-	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
-	dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END);
-	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
-		dmu_tx_abort(tx);
-		goto end;
-	}
+	error = zvol_update_volsize(zv, maj, volsize);
 
-	error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
-	    &volsize, tx);
-	if (error == 0) {
-		error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, volsize,
-		    DMU_OBJECT_END, tx);
+#if 0
+	/*
+	 * Reinitialize the dump area to the new size.  If we
+	 * failed to resize the dump area then restore it back to
+	 * its original size.
+	 */
+	if (error == 0 && zv->zv_flags & ZVOL_DUMPIFIED) {
+		if ((error = zvol_dumpify(zv)) != 0 ||
+		    (error = dumpvp_resize()) != 0) {
+			(void) zvol_update_volsize(zv, maj, old_volsize);
+			error = zvol_dumpify(zv);
+		}
 	}
+#endif
 
-	dmu_tx_commit(tx);
+out:
+	if (state.zv_objset)
+		dmu_objset_close(state.zv_objset);
 
-	if (error == 0) {
-		zv->zv_volsize = volsize;
-		zv->zv_provider->mediasize = volsize;	/* XXX: Not supported. */
-	}
-end:
+	mutex_exit(&zvol_state_lock);
 	g_topology_unlock();
 	PICKUP_GIANT();
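
zvol_update_volsize() above orders a resize as: persist the new size first, then free everything past the new end (a shrink trims blocks, a grow frees an empty range and is a no-op on the data side). The same ordering in miniature; meta_set_size() and data_free_range() are hypothetical stand-ins for zap_update() and dmu_free_long_range():

#include <stdint.h>

/* Hypothetical metadata and data hooks. */
int meta_set_size(uint64_t newsize);
int data_free_range(uint64_t off, uint64_t len);

#define	OBJ_END	(~0ULL)		/* "to end of object", like DMU_OBJECT_END */

/*
 * Record the new size, then release any blocks past the new end.
 */
static int
volume_resize(uint64_t newsize)
{
	int error = meta_set_size(newsize);

	if (error == 0)
		error = data_free_range(newsize, OBJ_END);
	return (error);
}
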
*/ boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t); @@ -766,8 +1139,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) error = dmu_sync(zio, db, &lr->lr_blkptr, lr->lr_common.lrc_txg, zvol_get_done, zgd); if (error == 0) - zil_add_vdev(zv->zv_zilog, - DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr))); + zil_add_block(zv->zv_zilog, &lr->lr_blkptr); /* * If we get EINPROGRESS, then we need to wait for a * write IO initiated by dmu_sync() to complete before @@ -791,11 +1163,230 @@ zvol_busy(void) void zvol_init(void) { + mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); ZFS_LOG(1, "ZVOL Initialized."); } void zvol_fini(void) { + mutex_destroy(&zvol_state_lock); ZFS_LOG(1, "ZVOL Deinitialized."); } + +static boolean_t +zvol_is_swap(zvol_state_t *zv) +{ + vnode_t *vp; + boolean_t ret = B_FALSE; + char *devpath; + size_t devpathlen; + int error; + +#if 0 + devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(zv->zv_name) + 1; + devpath = kmem_alloc(devpathlen, KM_SLEEP); + (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name); + error = lookupname(devpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + kmem_free(devpath, devpathlen); + + ret = !error && IS_SWAPVP(common_specvp(vp)); + + if (vp != NULL) + VN_RELE(vp); +#endif + + return (ret); +} + +static int +zvol_dump_init(zvol_state_t *zv, boolean_t resize) +{ + dmu_tx_t *tx; + int error = 0; + objset_t *os = zv->zv_objset; + nvlist_t *nv = NULL; + uint64_t checksum, compress, refresrv; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + /* + * If we are resizing the dump device then we only need to + * update the refreservation to match the newly updated + * zvolsize. Otherwise, we save off the original state of the + * zvol so that we can restore them if the zvol is ever undumpified. + */ + if (resize) { + error = zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, + &zv->zv_volsize, tx); + } else { + error = dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL); + error = error ? error : dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL); + error = error ? error : dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL); + + error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, + &compress, tx); + error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx); + error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, + &refresrv, tx); + } + dmu_tx_commit(tx); + + /* Truncate the file */ + if (!error) + error = dmu_free_long_range(zv->zv_objset, + ZVOL_OBJ, 0, DMU_OBJECT_END); + + if (error) + return (error); + + /* + * We only need update the zvol's property if we are initializing + * the dump area for the first time. 
+ */ + if (!resize) { + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0); + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), + ZIO_COMPRESS_OFF) == 0); + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), + ZIO_CHECKSUM_OFF) == 0); + + error = zfs_set_prop_nvlist(zv->zv_name, nv); + nvlist_free(nv); + + if (error) + return (error); + } + + /* Allocate the space for the dump */ + error = zvol_prealloc(zv); + return (error); +} + +static int +zvol_dumpify(zvol_state_t *zv) +{ + int error = 0; + uint64_t dumpsize = 0; + dmu_tx_t *tx; + objset_t *os = zv->zv_objset; + + if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) + return (EROFS); + + /* + * We do not support swap devices acting as dump devices. + */ + if (zvol_is_swap(zv)) + return (ENOTSUP); + + if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, + 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) { + boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE; + + if ((error = zvol_dump_init(zv, resize)) != 0) { + (void) zvol_dump_fini(zv); + return (error); + } + } + + /* + * Build up our lba mapping. + */ + error = zvol_get_lbas(zv); + if (error) { + (void) zvol_dump_fini(zv); + return (error); + } + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + (void) zvol_dump_fini(zv); + return (error); + } + + zv->zv_flags |= ZVOL_DUMPIFIED; + error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1, + &zv->zv_volsize, tx); + dmu_tx_commit(tx); + + if (error) { + (void) zvol_dump_fini(zv); + return (error); + } + + txg_wait_synced(dmu_objset_pool(os), 0); + return (0); +} + +static int +zvol_dump_fini(zvol_state_t *zv) +{ + dmu_tx_t *tx; + objset_t *os = zv->zv_objset; + nvlist_t *nv; + int error = 0; + uint64_t checksum, compress, refresrv; + + /* + * Attempt to restore the zvol back to its pre-dumpified state. + * This is a best-effort attempt as it's possible that not all + * of these properties were initialized during the dumpify process + * (i.e. error during zvol_dump_init). + */ + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx); + dmu_tx_commit(tx); + + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum); + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress); + (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv); + + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum); + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress); + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv); + (void) zfs_set_prop_nvlist(zv->zv_name, nv); + nvlist_free(nv); + + zvol_free_extents(zv); + zv->zv_flags &= ~ZVOL_DUMPIFIED; + (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END); + + return (0); +} |