diff options
Diffstat (limited to 'sys/ufs')
-rw-r--r-- | sys/ufs/ffs/ffs_alloc.c | 7 | ||||
-rw-r--r-- | sys/ufs/ffs/ffs_balloc.c | 190 | ||||
-rw-r--r-- | sys/ufs/ffs/ffs_extern.h | 8 | ||||
-rw-r--r-- | sys/ufs/ffs/ffs_inode.c | 102 | ||||
-rw-r--r-- | sys/ufs/ffs/ffs_softdep.c | 521 | ||||
-rw-r--r-- | sys/ufs/ffs/ffs_softdep_stub.c | 14 | ||||
-rw-r--r-- | sys/ufs/ffs/softdep.h | 13 | ||||
-rw-r--r-- | sys/ufs/ufs/ufs_bmap.c | 19 | ||||
-rw-r--r-- | sys/ufs/ufs/ufs_extern.h | 18 | ||||
-rw-r--r-- | sys/ufs/ufs/ufs_inode.c | 3 | ||||
-rw-r--r-- | sys/ufs/ufs/ufs_lookup.c | 5 | ||||
-rw-r--r-- | sys/ufs/ufs/ufs_readwrite.c | 341 | ||||
-rw-r--r-- | sys/ufs/ufs/ufs_vnops.c | 13 |
13 files changed, 1043 insertions, 211 deletions
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index 710d6d14424b..1360ec8b18fe 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -188,9 +188,10 @@ nospace: * invoked to get an appropriate block. */ int -ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) +ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, cred, bpp) struct inode *ip; ufs2_daddr_t lbprev; + ufs2_daddr_t bprev; ufs2_daddr_t bpref; int osize, nsize; struct ucred *cred; @@ -200,7 +201,7 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) struct fs *fs; struct buf *bp; int cg, request, error, reclaimed; - ufs2_daddr_t bprev, bno; + ufs2_daddr_t bno; *bpp = 0; vp = ITOV(ip); @@ -224,7 +225,7 @@ retry: if (suser_cred(cred, PRISON_ROOT) && freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) goto nospace; - if ((bprev = DIP(ip, i_db[lbprev])) == 0) { + if (bprev == 0) { printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", devtoname(ip->i_dev), (long)fs->fs_bsize, (intmax_t)bprev, fs->fs_fsmnt); diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c index 9b1c3839b4cb..d9e8a08dbf5d 100644 --- a/sys/ufs/ffs/ffs_balloc.c +++ b/sys/ufs/ffs/ffs_balloc.c @@ -73,6 +73,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, struct ucred *cred, int flags, struct buf **bpp) { struct inode *ip; + struct ufs1_dinode *dp; ufs_lbn_t lbn, lastlbn; struct fs *fs; ufs1_daddr_t nb; @@ -86,12 +87,15 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, struct thread *td = curthread; /* XXX */ ip = VTOI(vp); + dp = ip->i_din1; fs = ip->i_fs; lbn = lblkno(fs, startoffset); size = blkoff(fs, startoffset) + size; if (size > fs->fs_bsize) panic("ffs_balloc_ufs1: blk too big"); *bpp = NULL; + if (flags & IO_EXT) + return (EOPNOTSUPP); if (lbn < 0) return (EFBIG); @@ -105,22 +109,20 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, nb = lastlbn; osize = blksize(fs, ip, nb); if (osize < fs->fs_bsize && osize > 0) { 
- error = ffs_realloccg(ip, nb, - ffs_blkpref_ufs1(ip, lastlbn, (int)nb, - &ip->i_din1->di_db[0]), - osize, (int)fs->fs_bsize, cred, &bp); + error = ffs_realloccg(ip, nb, dp->di_db[nb], + ffs_blkpref_ufs1(ip, lastlbn, (int)nb, + &dp->di_db[0]), osize, (int)fs->fs_bsize, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, nb, - dbtofsb(fs, bp->b_blkno), - ip->i_din1->di_db[nb], + dbtofsb(fs, bp->b_blkno), dp->di_db[nb], fs->fs_bsize, osize, bp); ip->i_size = smalllblktosize(fs, nb + 1); - ip->i_din1->di_size = ip->i_size; - ip->i_din1->di_db[nb] = dbtofsb(fs, bp->b_blkno); + dp->di_size = ip->i_size; + dp->di_db[nb] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; - if (flags & BA_SYNC) + if (flags & IO_SYNC) bwrite(bp); else bawrite(bp); @@ -132,7 +134,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, if (lbn < NDADDR) { if (flags & BA_METAONLY) panic("ffs_balloc_ufs1: BA_METAONLY for direct block"); - nb = ip->i_din1->di_db[lbn]; + nb = dp->di_db[lbn]; if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp); if (error) { @@ -157,10 +159,9 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, } bp->b_blkno = fsbtodb(fs, nb); } else { - error = ffs_realloccg(ip, lbn, + error = ffs_realloccg(ip, lbn, dp->di_db[lbn], ffs_blkpref_ufs1(ip, lbn, (int)lbn, - &ip->i_din1->di_db[0]), - osize, nsize, cred, &bp); + &dp->di_db[0]), osize, nsize, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) @@ -174,8 +175,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, else nsize = fs->fs_bsize; error = ffs_alloc(ip, lbn, - ffs_blkpref_ufs1(ip, lbn, (int)lbn, - &ip->i_din1->di_db[0]), + ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]), nsize, cred, &newb); if (error) return (error); @@ -187,7 +187,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, softdep_setup_allocdirect(ip, lbn, newb, 0, nsize, 0, 
bp); } - ip->i_din1->di_db[lbn] = dbtofsb(fs, bp->b_blkno); + dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; *bpp = bp; return (0); @@ -206,7 +206,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, * Fetch the first indirect block allocating if necessary. */ --num; - nb = ip->i_din1->di_ib[indirs[0].in_off]; + nb = dp->di_ib[indirs[0].in_off]; allocib = NULL; allocblk = allociblk; if (nb == 0) { @@ -233,7 +233,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, else if ((error = bwrite(bp)) != 0) goto fail; } - allocib = &ip->i_din1->di_ib[indirs[0].in_off]; + allocib = &dp->di_ib[indirs[0].in_off]; *allocib = nb; ip->i_flag |= IN_CHANGE | IN_UPDATE; } @@ -289,7 +289,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, * If required, write synchronously, otherwise use * delayed write. */ - if (flags & BA_SYNC) { + if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) @@ -329,7 +329,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, * If required, write synchronously, otherwise use * delayed write. 
*/ - if (flags & BA_SYNC) { + if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) @@ -382,7 +382,7 @@ fail: } else { bap = (ufs1_daddr_t *)bp->b_data; bap[indirs[unwindidx].in_off] = 0; - if (flags & BA_SYNC) { + if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) @@ -398,7 +398,7 @@ fail: */ (void) chkdq(ip, -btodb(deallocated), cred, FORCE); #endif - ip->i_din1->di_blocks -= btodb(deallocated); + dp->di_blocks -= btodb(deallocated); ip->i_flag |= IN_CHANGE | IN_UPDATE; } (void) VOP_FSYNC(vp, cred, MNT_WAIT, td); @@ -417,6 +417,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, struct ucred *cred, int flags, struct buf **bpp) { struct inode *ip; + struct ufs2_dinode *dp; ufs_lbn_t lbn, lastlbn; struct fs *fs; struct buf *bp, *nbp; @@ -428,6 +429,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, struct thread *td = curthread; /* XXX */ ip = VTOI(vp); + dp = ip->i_din2; fs = ip->i_fs; lbn = lblkno(fs, startoffset); size = blkoff(fs, startoffset) + size; @@ -438,6 +440,112 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, return (EFBIG); /* + * Check for allocating external data. + */ + if (flags & IO_EXT) { + if (lbn >= NXADDR) + return (EFBIG); + /* + * If the next write will extend the data into a new block, + * and the data is currently composed of a fragment + * this fragment has to be extended to be a full block. 
+ */ + lastlbn = lblkno(fs, dp->di_extsize); + if (lastlbn < lbn) { + nb = lastlbn; + osize = sblksize(fs, dp->di_extsize, nb); + if (osize < fs->fs_bsize && osize > 0) { + error = ffs_realloccg(ip, -1 - nb, + dp->di_extb[nb], + ffs_blkpref_ufs2(ip, lastlbn, (int)nb, + &dp->di_extb[0]), osize, + (int)fs->fs_bsize, cred, &bp); + if (error) + return (error); + if (DOINGSOFTDEP(vp)) + softdep_setup_allocext(ip, nb, + dbtofsb(fs, bp->b_blkno), + dp->di_extb[nb], + fs->fs_bsize, osize, bp); + dp->di_extsize = smalllblktosize(fs, nb + 1); + dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno); + bp->b_xflags |= BX_ALTDATA; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (flags & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + } + } + /* + * All blocks are direct blocks + */ + if (flags & BA_METAONLY) + panic("ffs_balloc_ufs2: BA_METAONLY for ext block"); + nb = dp->di_extb[lbn]; + if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) { + error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (error); + } + bp->b_blkno = fsbtodb(fs, nb); + bp->b_xflags |= BX_ALTDATA; + *bpp = bp; + return (0); + } + if (nb != 0) { + /* + * Consider need to reallocate a fragment. 
+ */ + osize = fragroundup(fs, blkoff(fs, dp->di_extsize)); + nsize = fragroundup(fs, size); + if (nsize <= osize) { + error = bread(vp, -1 - lbn, osize, NOCRED, &bp); + if (error) { + brelse(bp); + return (error); + } + bp->b_blkno = fsbtodb(fs, nb); + bp->b_xflags |= BX_ALTDATA; + } else { + error = ffs_realloccg(ip, -1 - lbn, + dp->di_extb[lbn], + ffs_blkpref_ufs2(ip, lbn, (int)lbn, + &dp->di_extb[0]), osize, nsize, cred, &bp); + if (error) + return (error); + bp->b_xflags |= BX_ALTDATA; + if (DOINGSOFTDEP(vp)) + softdep_setup_allocext(ip, lbn, + dbtofsb(fs, bp->b_blkno), nb, + nsize, osize, bp); + } + } else { + if (dp->di_extsize < smalllblktosize(fs, lbn + 1)) + nsize = fragroundup(fs, size); + else + nsize = fs->fs_bsize; + error = ffs_alloc(ip, lbn, + ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]), + nsize, cred, &newb); + if (error) + return (error); + bp = getblk(vp, -1 - lbn, nsize, 0, 0); + bp->b_blkno = fsbtodb(fs, newb); + bp->b_xflags |= BX_ALTDATA; + if (flags & BA_CLRBUF) + vfs_bio_clrbuf(bp); + if (DOINGSOFTDEP(vp)) + softdep_setup_allocext(ip, lbn, newb, 0, + nsize, 0, bp); + } + dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + *bpp = bp; + return (0); + } + /* * If the next write will extend the file into a new block, * and the file is currently composed of a fragment * this fragment has to be extended to be a full block. 
@@ -447,22 +555,22 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, nb = lastlbn; osize = blksize(fs, ip, nb); if (osize < fs->fs_bsize && osize > 0) { - error = ffs_realloccg(ip, nb, + error = ffs_realloccg(ip, nb, dp->di_db[nb], ffs_blkpref_ufs2(ip, lastlbn, (int)nb, - &ip->i_din2->di_db[0]), - osize, (int)fs->fs_bsize, cred, &bp); + &dp->di_db[0]), osize, (int)fs->fs_bsize, + cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, nb, dbtofsb(fs, bp->b_blkno), - ip->i_din2->di_db[nb], + dp->di_db[nb], fs->fs_bsize, osize, bp); ip->i_size = smalllblktosize(fs, nb + 1); - ip->i_din2->di_size = ip->i_size; - ip->i_din2->di_db[nb] = dbtofsb(fs, bp->b_blkno); + dp->di_size = ip->i_size; + dp->di_db[nb] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; - if (flags & BA_SYNC) + if (flags & IO_SYNC) bwrite(bp); else bawrite(bp); @@ -474,7 +582,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, if (lbn < NDADDR) { if (flags & BA_METAONLY) panic("ffs_balloc_ufs2: BA_METAONLY for direct block"); - nb = ip->i_din2->di_db[lbn]; + nb = dp->di_db[lbn]; if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp); if (error) { @@ -499,10 +607,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, } bp->b_blkno = fsbtodb(fs, nb); } else { - error = ffs_realloccg(ip, lbn, + error = ffs_realloccg(ip, lbn, dp->di_db[lbn], ffs_blkpref_ufs2(ip, lbn, (int)lbn, - &ip->i_din2->di_db[0]), - osize, nsize, cred, &bp); + &dp->di_db[0]), osize, nsize, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) @@ -517,8 +624,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, nsize = fs->fs_bsize; error = ffs_alloc(ip, lbn, ffs_blkpref_ufs2(ip, lbn, (int)lbn, - &ip->i_din2->di_db[0]), - nsize, cred, &newb); + &dp->di_db[0]), nsize, cred, &newb); if (error) return (error); bp = getblk(vp, lbn, nsize, 0, 0); @@ -529,7 +635,7 @@ 
ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, softdep_setup_allocdirect(ip, lbn, newb, 0, nsize, 0, bp); } - ip->i_din2->di_db[lbn] = dbtofsb(fs, bp->b_blkno); + dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; *bpp = bp; return (0); @@ -548,7 +654,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, * Fetch the first indirect block allocating if necessary. */ --num; - nb = ip->i_din2->di_ib[indirs[0].in_off]; + nb = dp->di_ib[indirs[0].in_off]; allocib = NULL; allocblk = allociblk; if (nb == 0) { @@ -575,7 +681,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, else if ((error = bwrite(bp)) != 0) goto fail; } - allocib = &ip->i_din2->di_ib[indirs[0].in_off]; + allocib = &dp->di_ib[indirs[0].in_off]; *allocib = nb; ip->i_flag |= IN_CHANGE | IN_UPDATE; } @@ -631,7 +737,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, * If required, write synchronously, otherwise use * delayed write. */ - if (flags & BA_SYNC) { + if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) @@ -671,7 +777,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, * If required, write synchronously, otherwise use * delayed write. 
*/ - if (flags & BA_SYNC) { + if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) @@ -724,7 +830,7 @@ fail: } else { bap = (ufs2_daddr_t *)bp->b_data; bap[indirs[unwindidx].in_off] = 0; - if (flags & BA_SYNC) { + if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) @@ -740,7 +846,7 @@ fail: */ (void) chkdq(ip, -btodb(deallocated), cred, FORCE); #endif - ip->i_din2->di_blocks -= btodb(deallocated); + dp->di_blocks -= btodb(deallocated); ip->i_flag |= IN_CHANGE | IN_UPDATE; } (void) VOP_FSYNC(vp, cred, MNT_WAIT, td); diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h index ae59ca380ffb..c2972c8f2305 100644 --- a/sys/ufs/ffs/ffs_extern.h +++ b/sys/ufs/ffs/ffs_extern.h @@ -79,8 +79,8 @@ int ffs_mountroot(void); int ffs_mount(struct mount *, char *, caddr_t, struct nameidata *, struct thread *); int ffs_reallocblks(struct vop_reallocblks_args *); -int ffs_realloccg(struct inode *, - ufs2_daddr_t, ufs2_daddr_t, int, int, struct ucred *, struct buf **); +int ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t, + ufs2_daddr_t, int, int, struct ucred *, struct buf **); void ffs_setblock(struct fs *, u_char *, ufs1_daddr_t); int ffs_snapblkfree(struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t); void ffs_snapremove(struct vnode *vp); @@ -115,11 +115,13 @@ void softdep_update_inodeblock(struct inode *, struct buf *, int); void softdep_load_inodeblock(struct inode *); void softdep_freefile(struct vnode *, ino_t, int); int softdep_request_cleanup(struct fs *, struct vnode *); -void softdep_setup_freeblocks(struct inode *, off_t); +void softdep_setup_freeblocks(struct inode *, off_t, int); void softdep_setup_inomapdep(struct buf *, struct inode *, ino_t); void softdep_setup_blkmapdep(struct buf *, struct fs *, ufs2_daddr_t); void softdep_setup_allocdirect(struct inode *, ufs_lbn_t, ufs2_daddr_t, ufs2_daddr_t, long, long, struct buf *); +void softdep_setup_allocext(struct inode *, ufs_lbn_t, 
ufs2_daddr_t, + ufs2_daddr_t, long, long, struct buf *); void softdep_setup_allocindir_meta(struct buf *, struct inode *, struct buf *, int, ufs2_daddr_t); void softdep_setup_allocindir_page(struct inode *, ufs_lbn_t, diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c index 08e5fddc3eff..83fa66ef22fd 100644 --- a/sys/ufs/ffs/ffs_inode.c +++ b/sys/ufs/ffs/ffs_inode.c @@ -146,22 +146,81 @@ ffs_truncate(vp, length, flags, cred, td) struct inode *oip; ufs2_daddr_t bn, lbn, lastblock, lastiblock[NIADDR], indir_lbn[NIADDR]; ufs2_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; - ufs2_daddr_t count, blocksreleased = 0; + ufs2_daddr_t count, blocksreleased = 0, datablocks; struct fs *fs; struct buf *bp; + int needextclean, softdepslowdown, extblocks; int offset, size, level, nblocks; - int i, aflags, error, allerror; + int i, error, allerror; off_t osize; oip = VTOI(ovp); fs = oip->i_fs; if (length < 0) return (EINVAL); + /* + * Historically clients did not have to specify which data + * they were truncating. So, if not specified, we assume + * traditional behavior, e.g., just the normal data. + */ + if ((flags & (IO_EXT | IO_NORMAL)) == 0) + flags |= IO_NORMAL; + /* + * If we are truncating the extended-attributes, and cannot + * do it with soft updates, then do it slowly here. If we are + * truncating both the extended attributes and the file contents + * (e.g., the file is being unlinked), then pick it off with + * soft updates below. 
+ */ + needextclean = 0; + softdepslowdown = softdep_slowdown(ovp); + extblocks = 0; + datablocks = DIP(oip, i_blocks); + if (fs->fs_magic == FS_UFS2_MAGIC && oip->i_din2->di_extsize > 0) { + extblocks = btodb(fragroundup(fs, oip->i_din2->di_extsize)); + datablocks -= extblocks; + } + if ((flags & IO_EXT) && extblocks > 0) { + if (DOINGSOFTDEP(ovp) && softdepslowdown == 0 && length == 0) { + if ((flags & IO_NORMAL) == 0) { + softdep_setup_freeblocks(oip, length, IO_EXT); + return (0); + } + needextclean = 1; + } else { + if (length != 0) + panic("ffs_truncate: partial trunc of extdata"); + if ((error = VOP_FSYNC(ovp, cred, MNT_WAIT, td)) != 0) + return (error); + osize = oip->i_din2->di_extsize; + oip->i_din2->di_blocks -= extblocks; +#ifdef QUOTA + (void) chkdq(oip, -extblocks, NOCRED, 0); +#endif + vinvalbuf(ovp, V_ALT, cred, td, 0, 0); + oip->i_din2->di_extsize = 0; + for (i = 0; i < NXADDR; i++) { + oldblks[i] = oip->i_din2->di_extb[i]; + oip->i_din2->di_extb[i] = 0; + } + oip->i_flag |= IN_CHANGE | IN_UPDATE; + if ((error = ffs_update(ovp, 1))) + return (error); + for (i = 0; i < NXADDR; i++) { + if (oldblks[i] == 0) + continue; + ffs_blkfree(fs, oip->i_devvp, oldblks[i], + sblksize(fs, osize, i), oip->i_number); + } + } + } + if ((flags & IO_NORMAL) == 0) + return (0); if (length > fs->fs_maxfilesize) return (EFBIG); if (ovp->v_type == VLNK && (oip->i_size < ovp->v_mount->mnt_maxsymlinklen || - DIP(oip, i_blocks) == 0)) { + datablocks == 0)) { #ifdef DIAGNOSTIC if (length != 0) panic("ffs_truncate: partial truncate of symlink"); @@ -170,10 +229,14 @@ ffs_truncate(vp, length, flags, cred, td) oip->i_size = 0; DIP(oip, i_size) = 0; oip->i_flag |= IN_CHANGE | IN_UPDATE; + if (needextclean) + softdep_setup_freeblocks(oip, length, IO_EXT); return (UFS_UPDATE(ovp, 1)); } if (oip->i_size == length) { oip->i_flag |= IN_CHANGE | IN_UPDATE; + if (needextclean) + softdep_setup_freeblocks(oip, length, IO_EXT); return (UFS_UPDATE(ovp, 0)); } if (fs->fs_ronly) @@ -187,7 
+250,7 @@ ffs_truncate(vp, length, flags, cred, td) ffs_snapremove(ovp); ovp->v_lasta = ovp->v_clen = ovp->v_cstart = ovp->v_lastw = 0; if (DOINGSOFTDEP(ovp)) { - if (length > 0 || softdep_slowdown(ovp)) { + if (length > 0 || softdepslowdown) { /* * If a file is only partially truncated, then * we have to clean up the data structures @@ -197,17 +260,18 @@ ffs_truncate(vp, length, flags, cred, td) * rarely, we solve the problem by syncing the file * so that it will have no data structures left. */ - if ((error = VOP_FSYNC(ovp, cred, MNT_WAIT, - td)) != 0) + if ((error = VOP_FSYNC(ovp, cred, MNT_WAIT, td)) != 0) return (error); if (oip->i_flag & IN_SPACECOUNTED) - fs->fs_pendingblocks -= DIP(oip, i_blocks); + fs->fs_pendingblocks -= datablocks; } else { #ifdef QUOTA - (void) chkdq(oip, -DIP(oip, i_blocks), NOCRED, 0); + (void) chkdq(oip, -datablocks, NOCRED, 0); #endif - softdep_setup_freeblocks(oip, length); - vinvalbuf(ovp, 0, cred, td, 0, 0); + softdep_setup_freeblocks(oip, length, needextclean ? + IO_EXT | IO_NORMAL : IO_NORMAL); + vinvalbuf(ovp, needextclean ? 
0 : V_NORMAL, + cred, td, 0, 0); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (ffs_update(ovp, 0)); } @@ -220,18 +284,15 @@ ffs_truncate(vp, length, flags, cred, td) */ if (osize < length) { vnode_pager_setsize(ovp, length); - aflags = BA_CLRBUF; - if (flags & IO_SYNC) - aflags |= BA_SYNC; - error = UFS_BALLOC(ovp, length - 1, 1, - cred, aflags, &bp); + flags |= BA_CLRBUF; + error = UFS_BALLOC(ovp, length - 1, 1, cred, flags, &bp); if (error) return (error); oip->i_size = length; DIP(oip, i_size) = length; if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; - if (aflags & BA_SYNC) + if (flags & IO_SYNC) bwrite(bp); else bawrite(bp); @@ -252,10 +313,8 @@ ffs_truncate(vp, length, flags, cred, td) DIP(oip, i_size) = length; } else { lbn = lblkno(fs, length); - aflags = BA_CLRBUF; - if (flags & IO_SYNC) - aflags |= BA_SYNC; - error = UFS_BALLOC(ovp, length - 1, 1, cred, aflags, &bp); + flags |= BA_CLRBUF; + error = UFS_BALLOC(ovp, length - 1, 1, cred, flags, &bp); if (error) { return (error); } @@ -281,7 +340,7 @@ ffs_truncate(vp, length, flags, cred, td) allocbuf(bp, size); if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; - if (aflags & BA_SYNC) + if (flags & IO_SYNC) bwrite(bp); else bawrite(bp); @@ -420,6 +479,7 @@ done: if (newblks[i] != DIP(oip, i_db[i])) panic("ffs_truncate2"); if (length == 0 && + (fs->fs_magic != FS_UFS2_MAGIC || oip->i_din2->di_extsize == 0) && (!TAILQ_EMPTY(&ovp->v_dirtyblkhd) || !TAILQ_EMPTY(&ovp->v_cleanblkhd))) panic("ffs_truncate3"); diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index f03615099af9..631c82a34855 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -157,6 +157,7 @@ static void clear_inodedeps(struct thread *); static int flush_pagedep_deps(struct vnode *, struct mount *, struct diraddhd *); static int flush_inodedep_deps(struct fs *, ino_t); +static int flush_deplist(struct allocdirectlst *, int, int *); static int handle_written_filepage(struct pagedep *, 
struct buf *); static void diradd_inode_written(struct diradd *, struct inodedep *); static int handle_written_inodeblock(struct inodedep *, struct buf *); @@ -181,7 +182,7 @@ static void free_allocdirect(struct allocdirectlst *, static int check_inode_unwritten(struct inodedep *); static int free_inodedep(struct inodedep *); static void handle_workitem_freeblocks(struct freeblks *, int); -static void merge_inode_lists(struct inodedep *); +static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); static void setup_allocindir_phase2(struct buf *, struct inode *, struct allocindir *); static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, @@ -1041,12 +1042,15 @@ top: inodedep->id_nlinkdelta = 0; inodedep->id_savedino1 = NULL; inodedep->id_savedsize = -1; + inodedep->id_savedextsize = -1; inodedep->id_buf = NULL; LIST_INIT(&inodedep->id_pendinghd); LIST_INIT(&inodedep->id_inowait); LIST_INIT(&inodedep->id_bufwait); TAILQ_INIT(&inodedep->id_inoupdt); TAILQ_INIT(&inodedep->id_newinoupdt); + TAILQ_INIT(&inodedep->id_extupdt); + TAILQ_INIT(&inodedep->id_newextupdt); ACQUIRE_LOCK(&lk); LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); sema_release(&inodedep_in_progress); @@ -1566,6 +1570,103 @@ handle_workitem_freefrag(freefrag) } /* + * Set up a dependency structure for an external attributes data block. + * This routine follows much of the structure of softdep_setup_allocdirect. + * See the description of softdep_setup_allocdirect above for details. 
+ */ +void +softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) + struct inode *ip; + ufs_lbn_t lbn; + ufs2_daddr_t newblkno; + ufs2_daddr_t oldblkno; + long newsize; + long oldsize; + struct buf *bp; +{ + struct allocdirect *adp, *oldadp; + struct allocdirectlst *adphead; + struct bmsafemap *bmsafemap; + struct inodedep *inodedep; + struct newblk *newblk; + + MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect), + M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO); + adp->ad_list.wk_type = D_ALLOCDIRECT; + adp->ad_lbn = lbn; + adp->ad_newblkno = newblkno; + adp->ad_oldblkno = oldblkno; + adp->ad_newsize = newsize; + adp->ad_oldsize = oldsize; + adp->ad_state = ATTACHED | EXTDATA; + LIST_INIT(&adp->ad_newdirblk); + if (newblkno == oldblkno) + adp->ad_freefrag = NULL; + else + adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); + + if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) + panic("softdep_setup_allocext: lost block"); + + ACQUIRE_LOCK(&lk); + inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep); + adp->ad_inodedep = inodedep; + + if (newblk->nb_state == DEPCOMPLETE) { + adp->ad_state |= DEPCOMPLETE; + adp->ad_buf = NULL; + } else { + bmsafemap = newblk->nb_bmsafemap; + adp->ad_buf = bmsafemap->sm_buf; + LIST_REMOVE(newblk, nb_deps); + LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); + } + LIST_REMOVE(newblk, nb_hash); + FREE(newblk, M_NEWBLK); + + WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); + if (lbn >= NXADDR) { + FREE_LOCK(&lk); + panic("softdep_setup_allocext: lbn %d > NXADDR", lbn); + } + /* + * The list of allocdirects must be kept in sorted and ascending + * order so that the rollback routines can quickly determine the + * first uncommitted block (the size of the file stored on disk + * ends at the end of the lowest committed fragment, or if there + * are no fragments, at the end of the highest committed block). 
+ * Since files generally grow, the typical case is that the new + * block is to be added at the end of the list. We speed this + * special case by checking against the last allocdirect in the + * list before laboriously traversing the list looking for the + * insertion point. + */ + adphead = &inodedep->id_newextupdt; + oldadp = TAILQ_LAST(adphead, allocdirectlst); + if (oldadp == NULL || oldadp->ad_lbn <= lbn) { + /* insert at end of list */ + TAILQ_INSERT_TAIL(adphead, adp, ad_next); + if (oldadp != NULL && oldadp->ad_lbn == lbn) + allocdirect_merge(adphead, adp, oldadp); + FREE_LOCK(&lk); + return; + } + TAILQ_FOREACH(oldadp, adphead, ad_next) { + if (oldadp->ad_lbn >= lbn) + break; + } + if (oldadp == NULL) { + FREE_LOCK(&lk); + panic("softdep_setup_allocext: lost entry"); + } + /* insert in middle of list */ + TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); + if (oldadp->ad_lbn == lbn) + allocdirect_merge(adphead, adp, oldadp); + FREE_LOCK(&lk); +} + +/* * Indirect block allocation dependencies. * * The same dependencies that exist for a direct block also exist when @@ -1769,7 +1870,8 @@ setup_allocindir_phase2(bp, ip, aip) LIST_INIT(&newindirdep->ir_deplisthd); LIST_INIT(&newindirdep->ir_donehd); if (bp->b_blkno == bp->b_lblkno) { - ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, NULL, NULL); + ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, + NULL, NULL); bp->b_blkno = blkno; } newindirdep->ir_savebp = @@ -1809,9 +1911,10 @@ setup_allocindir_phase2(bp, ip, aip) * can release it. 
*/ void -softdep_setup_freeblocks(ip, length) +softdep_setup_freeblocks(ip, length, flags) struct inode *ip; /* The inode whose length is to be reduced */ off_t length; /* The new length for the file */ + int flags; /* IO_EXT and/or IO_NORMAL */ { struct freeblks *freeblks; struct inodedep *inodedep; @@ -1819,6 +1922,7 @@ softdep_setup_freeblocks(ip, length) struct vnode *vp; struct buf *bp; struct fs *fs; + ufs2_daddr_t extblocks, datablocks; int i, delay, error; fs = ip->i_fs; @@ -1831,27 +1935,46 @@ softdep_setup_freeblocks(ip, length) freeblks->fb_previousinum = ip->i_number; freeblks->fb_devvp = ip->i_devvp; freeblks->fb_mnt = ITOV(ip)->v_mount; - freeblks->fb_oldsize = ip->i_size; - freeblks->fb_newsize = length; - freeblks->fb_chkcnt = DIP(ip, i_blocks); - for (i = 0; i < NDADDR; i++) { - freeblks->fb_dblks[i] = DIP(ip, i_db[i]); - DIP(ip, i_db[i]) = 0; - } - for (i = 0; i < NIADDR; i++) { - freeblks->fb_iblks[i] = DIP(ip, i_ib[i]); - DIP(ip, i_ib[i]) = 0; - } - DIP(ip, i_blocks) = 0; - ip->i_size = 0; - DIP(ip, i_size) = 0; - /* - * If the file was removed, then the space being freed was - * accounted for then (see softdep_filereleased()). If the - * file is merely being truncated, then we account for it now. 
- */ - if ((ip->i_flag & IN_SPACECOUNTED) == 0) - fs->fs_pendingblocks += freeblks->fb_chkcnt; + extblocks = 0; + if (fs->fs_magic == FS_UFS2_MAGIC) + extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); + datablocks = DIP(ip, i_blocks) - extblocks; + if ((flags & IO_NORMAL) == 0) { + freeblks->fb_oldsize = 0; + freeblks->fb_chkcnt = 0; + } else { + freeblks->fb_oldsize = ip->i_size; + ip->i_size = 0; + DIP(ip, i_size) = 0; + freeblks->fb_chkcnt = datablocks; + for (i = 0; i < NDADDR; i++) { + freeblks->fb_dblks[i] = DIP(ip, i_db[i]); + DIP(ip, i_db[i]) = 0; + } + for (i = 0; i < NIADDR; i++) { + freeblks->fb_iblks[i] = DIP(ip, i_ib[i]); + DIP(ip, i_ib[i]) = 0; + } + /* + * If the file was removed, then the space being freed was + * accounted for then (see softdep_filereleased()). If the + * file is merely being truncated, then we account for it now. + */ + if ((ip->i_flag & IN_SPACECOUNTED) == 0) + fs->fs_pendingblocks += datablocks; + } + if ((flags & IO_EXT) == 0) { + freeblks->fb_oldextsize = 0; + } else { + freeblks->fb_oldextsize = ip->i_din2->di_extsize; + ip->i_din2->di_extsize = 0; + freeblks->fb_chkcnt += extblocks; + for (i = 0; i < NXADDR; i++) { + freeblks->fb_eblks[i] = ip->i_din2->di_extb[i]; + ip->i_din2->di_extb[i] = 0; + } + } + DIP(ip, i_blocks) -= freeblks->fb_chkcnt; /* * Push the zero'ed inode to to its disk buffer so that we are free * to delete its dependencies below. Once the dependencies are gone @@ -1897,9 +2020,18 @@ softdep_setup_freeblocks(ip, length) * If we still have a bitmap dependency, then the inode has never * been written to disk, so we can free any fragments without delay. 
*/ - merge_inode_lists(inodedep); - while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) - free_allocdirect(&inodedep->id_inoupdt, adp, delay); + if (flags & IO_NORMAL) { + merge_inode_lists(&inodedep->id_newinoupdt, + &inodedep->id_inoupdt); + while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) + free_allocdirect(&inodedep->id_inoupdt, adp, delay); + } + if (flags & IO_EXT) { + merge_inode_lists(&inodedep->id_newextupdt, + &inodedep->id_extupdt); + while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) + free_allocdirect(&inodedep->id_extupdt, adp, delay); + } FREE_LOCK(&lk); bdwrite(bp); /* @@ -1911,14 +2043,21 @@ softdep_setup_freeblocks(ip, length) vp = ITOV(ip); ACQUIRE_LOCK(&lk); drain_output(vp, 1); - while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) { - bp = TAILQ_FIRST(&vp->v_dirtyblkhd); +restart: + TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { + if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || + ((flags & IO_NORMAL) == 0 && + (bp->b_xflags & BX_ALTDATA) == 0)) + continue; + if (getdirtybuf(&bp, MNT_WAIT) == 0) + goto restart; (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep); deallocate_dependencies(bp, inodedep); bp->b_flags |= B_INVAL | B_NOCACHE; FREE_LOCK(&lk); brelse(bp); ACQUIRE_LOCK(&lk); + goto restart; } if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0) (void) free_inodedep(inodedep); @@ -2216,6 +2355,8 @@ check_inode_unwritten(inodedep) LIST_FIRST(&inodedep->id_inowait) != NULL || TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL || + TAILQ_FIRST(&inodedep->id_extupdt) != NULL || + TAILQ_FIRST(&inodedep->id_newextupdt) != NULL || inodedep->id_nlinkdelta != 0) return (0); inodedep->id_state |= ALLCOMPLETE; @@ -2249,6 +2390,8 @@ free_inodedep(inodedep) LIST_FIRST(&inodedep->id_inowait) != NULL || TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL || + TAILQ_FIRST(&inodedep->id_extupdt) != NULL || + 
TAILQ_FIRST(&inodedep->id_newextupdt) != NULL || inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL) return (0); LIST_REMOVE(inodedep, id_hash); @@ -2288,30 +2431,48 @@ handle_workitem_freeblocks(freeblks, flags) nblocks = btodb(fs->fs_bsize); blocksreleased = 0; /* - * Indirect blocks first. + * Release all extended attribute blocks or frags. */ - for (level = (NIADDR - 1); level >= 0; level--) { - if ((bn = freeblks->fb_iblks[level]) == 0) - continue; - if ((error = indir_trunc(freeblks, fsbtodb(fs, bn), level, - baselbns[level], &blocksreleased)) == 0) - allerror = error; - ffs_blkfree(fs, freeblks->fb_devvp, bn, fs->fs_bsize, - freeblks->fb_previousinum); - fs->fs_pendingblocks -= nblocks; - blocksreleased += nblocks; + if (freeblks->fb_oldextsize > 0) { + for (i = (NXADDR - 1); i >= 0; i--) { + if ((bn = freeblks->fb_eblks[i]) == 0) + continue; + bsize = sblksize(fs, freeblks->fb_oldextsize, i); + ffs_blkfree(fs, freeblks->fb_devvp, bn, bsize, + freeblks->fb_previousinum); + blocksreleased += btodb(bsize); + } } /* - * All direct blocks or frags. + * Release all data blocks or frags. */ - for (i = (NDADDR - 1); i >= 0; i--) { - if ((bn = freeblks->fb_dblks[i]) == 0) - continue; - bsize = sblksize(fs, freeblks->fb_oldsize, i); - ffs_blkfree(fs, freeblks->fb_devvp, bn, bsize, - freeblks->fb_previousinum); - fs->fs_pendingblocks -= btodb(bsize); - blocksreleased += btodb(bsize); + if (freeblks->fb_oldsize > 0) { + /* + * Indirect blocks first. + */ + for (level = (NIADDR - 1); level >= 0; level--) { + if ((bn = freeblks->fb_iblks[level]) == 0) + continue; + if ((error = indir_trunc(freeblks, fsbtodb(fs, bn), + level, baselbns[level], &blocksreleased)) == 0) + allerror = error; + ffs_blkfree(fs, freeblks->fb_devvp, bn, fs->fs_bsize, + freeblks->fb_previousinum); + fs->fs_pendingblocks -= nblocks; + blocksreleased += nblocks; + } + /* + * All direct blocks or frags. 
+ */ + for (i = (NDADDR - 1); i >= 0; i--) { + if ((bn = freeblks->fb_dblks[i]) == 0) + continue; + bsize = sblksize(fs, freeblks->fb_oldsize, i); + ffs_blkfree(fs, freeblks->fb_devvp, bn, bsize, + freeblks->fb_previousinum); + fs->fs_pendingblocks -= btodb(bsize); + blocksreleased += btodb(bsize); + } } /* * If we still have not finished background cleanup, then check @@ -3049,6 +3210,8 @@ softdep_releasefile(ip) struct inode *ip; /* inode with the zero effective link count */ { struct inodedep *inodedep; + struct fs *fs; + int extblocks; if (ip->i_effnlink > 0) panic("softdep_filerelease: file still referenced"); @@ -3073,7 +3236,11 @@ softdep_releasefile(ip) if ((inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep))) inodedep->id_state |= SPACECOUNTED; FREE_LOCK(&lk); - ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks); + fs = ip->i_fs; + extblocks = 0; + if (fs->fs_magic == FS_UFS2_MAGIC) + extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); + ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks; ip->i_fs->fs_pendinginodes += 1; ip->i_flag |= IN_SPACECOUNTED; } @@ -3404,6 +3571,7 @@ initiate_write_inodeblock_ufs1(inodedep, bp) * If no dependencies, then there is nothing to roll back. */ inodedep->id_savedsize = dp->di_size; + inodedep->id_savedextsize = 0; if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL) return; /* @@ -3556,12 +3724,81 @@ initiate_write_inodeblock_ufs2(inodedep, bp) * If no dependencies, then there is nothing to roll back. */ inodedep->id_savedsize = dp->di_size; - if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL) + inodedep->id_savedextsize = dp->di_extsize; + if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL && + TAILQ_FIRST(&inodedep->id_extupdt) == NULL) return; /* - * Set the dependencies to busy. + * Set the ext data dependencies to busy. 
*/ ACQUIRE_LOCK(&lk); + for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; + adp = TAILQ_NEXT(adp, ad_next)) { +#ifdef DIAGNOSTIC + if (deplist != 0 && prevlbn >= adp->ad_lbn) { + FREE_LOCK(&lk); + panic("softdep_write_inodeblock: lbn order"); + } + prevlbn = adp->ad_lbn; + if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno) { + FREE_LOCK(&lk); + panic("%s: direct pointer #%jd mismatch %jd != %jd", + "softdep_write_inodeblock", + (intmax_t)adp->ad_lbn, + (intmax_t)dp->di_extb[adp->ad_lbn], + (intmax_t)adp->ad_newblkno); + } + deplist |= 1 << adp->ad_lbn; + if ((adp->ad_state & ATTACHED) == 0) { + FREE_LOCK(&lk); + panic("softdep_write_inodeblock: Unknown state 0x%x", + adp->ad_state); + } +#endif /* DIAGNOSTIC */ + adp->ad_state &= ~ATTACHED; + adp->ad_state |= UNDONE; + } + /* + * The on-disk inode cannot claim to be any larger than the last + * fragment that has been written. Otherwise, the on-disk inode + * might have fragments that were not the last block in the ext + * data which would corrupt the filesystem. + */ + for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; + lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { + dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno; + /* keep going until hitting a rollback to a frag */ + if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) + continue; + dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; + for (i = adp->ad_lbn + 1; i < NXADDR; i++) { +#ifdef DIAGNOSTIC + if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) { + FREE_LOCK(&lk); + panic("softdep_write_inodeblock: lost dep1"); + } +#endif /* DIAGNOSTIC */ + dp->di_extb[i] = 0; + } + lastadp = NULL; + break; + } + /* + * If we have zero'ed out the last allocated block of the ext + * data, roll back the size to the last currently allocated block. + * We know that this last allocated block is a full-sized as + * we already checked for fragments in the loop above. 
+ */ + if (lastadp != NULL && + dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { + for (i = lastadp->ad_lbn; i >= 0; i--) + if (dp->di_extb[i] != 0) + break; + dp->di_extsize = (i + 1) * fs->fs_bsize; + } + /* + * Set the file data dependencies to busy. + */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef DIAGNOSTIC @@ -3617,7 +3854,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp) #ifdef DIAGNOSTIC if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) { FREE_LOCK(&lk); - panic("softdep_write_inodeblock: lost dep1"); + panic("softdep_write_inodeblock: lost dep2"); } #endif /* DIAGNOSTIC */ dp->di_db[i] = 0; @@ -3627,7 +3864,7 @@ initiate_write_inodeblock_ufs2(inodedep, bp) if (dp->di_ib[i] != 0 && (deplist & ((1 << NDADDR) << i)) == 0) { FREE_LOCK(&lk); - panic("softdep_write_inodeblock: lost dep2"); + panic("softdep_write_inodeblock: lost dep3"); } #endif /* DIAGNOSTIC */ dp->di_ib[i] = 0; @@ -3805,6 +4042,7 @@ static void handle_allocdirect_partdone(adp) struct allocdirect *adp; /* the completed allocdirect */ { + struct allocdirectlst *listhead; struct allocdirect *listadp; struct inodedep *inodedep; long bsize, delay; @@ -3822,11 +4060,16 @@ handle_allocdirect_partdone(adp) * which would corrupt the filesystem. Thus, we cannot free any * allocdirects after one whose ad_oldblkno claims a fragment as * these blocks must be rolled back to zero before writing the inode. - * We check the currently active set of allocdirects in id_inoupdt. + * We check the currently active set of allocdirects in id_inoupdt + * or id_extupdt as appropriate. 
*/ inodedep = adp->ad_inodedep; bsize = inodedep->id_fs->fs_bsize; - TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) { + if (adp->ad_state & EXTDATA) + listhead = &inodedep->id_extupdt; + else + listhead = &inodedep->id_inoupdt; + TAILQ_FOREACH(listadp, listhead, ad_next) { /* found our block */ if (listadp == adp) break; @@ -3845,7 +4088,11 @@ handle_allocdirect_partdone(adp) */ if (listadp == NULL) { #ifdef DEBUG - TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next) + if (adp->ad_state & EXTDATA) + listhead = &inodedep->id_newextupdt; + else + listhead = &inodedep->id_newinoupdt; + TAILQ_FOREACH(listadp, listhead, ad_next) /* found our block */ if (listadp == adp) break; @@ -3868,7 +4115,7 @@ handle_allocdirect_partdone(adp) listadp = TAILQ_NEXT(adp, ad_next); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; - free_allocdirect(&inodedep->id_inoupdt, adp, delay); + free_allocdirect(listhead, adp, delay); } } @@ -4023,12 +4270,31 @@ handle_written_inodeblock(inodedep, bp) adp->ad_state |= ATTACHED; hadchanges = 1; } + for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { + nextadp = TAILQ_NEXT(adp, ad_next); + if (adp->ad_state & ATTACHED) { + lk.lkt_held = NOHOLDER; + panic("handle_written_inodeblock: new entry"); + } + if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno) { + lk.lkt_held = NOHOLDER; + panic("%s: direct pointers #%jd %s %jd != %jd", + "handle_written_inodeblock", + (intmax_t)adp->ad_lbn, "mismatch", + (intmax_t)dp2->di_extb[adp->ad_lbn], + (intmax_t)adp->ad_oldblkno); + } + dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno; + adp->ad_state &= ~UNDONE; + adp->ad_state |= ATTACHED; + hadchanges = 1; + } if (hadchanges && (bp->b_flags & B_DELWRI) == 0) stat_direct_blk_ptrs++; /* * Reset the file size to its most up-to-date value. 
*/ - if (inodedep->id_savedsize == -1) { + if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) { lk.lkt_held = NOHOLDER; panic("handle_written_inodeblock: bad size"); } @@ -4042,8 +4308,13 @@ handle_written_inodeblock(inodedep, bp) dp2->di_size = inodedep->id_savedsize; hadchanges = 1; } + if (dp2->di_extsize != inodedep->id_savedextsize) { + dp2->di_extsize = inodedep->id_savedextsize; + hadchanges = 1; + } } inodedep->id_savedsize = -1; + inodedep->id_savedextsize = -1; /* * If there were any rollbacks in the inode block, then it must be * marked dirty so that its will eventually get written back in @@ -4056,6 +4327,8 @@ handle_written_inodeblock(inodedep, bp) */ if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) handle_allocdirect_partdone(adp); + if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) + handle_allocdirect_partdone(adp); /* * Process deallocations that were held pending until the * inode had been written to disk. Freeing of the inode @@ -4119,7 +4392,9 @@ handle_written_inodeblock(inodedep, bp) /* * If no outstanding dependencies, free it. */ - if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0) + if (free_inodedep(inodedep) || + (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && + TAILQ_FIRST(&inodedep->id_extupdt) == 0)) return (0); return (hadchanges); } @@ -4358,9 +4633,12 @@ softdep_update_inodeblock(ip, bp, waitfor) * the in-memory copy of the inode. Once merged process any * allocdirects that are completed by the merger. 
*/ - merge_inode_lists(inodedep); + merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); + merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); + if (TAILQ_FIRST(&inodedep->id_extupdt) != NULL) + handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt)); /* * Now that the inode has been pushed into the buffer, the * operations dependent on the inode being written to disk @@ -4392,34 +4670,35 @@ softdep_update_inodeblock(ip, bp, waitfor) } /* - * Merge the new inode dependency list (id_newinoupdt) into the old - * inode dependency list (id_inoupdt). This routine must be called - * with splbio interrupts blocked. + * Merge a new inode dependency list (such as id_newinoupdt) into an + * old inode dependency list (such as id_inoupdt). This routine must be + * called with splbio interrupts blocked. */ static void -merge_inode_lists(inodedep) - struct inodedep *inodedep; +merge_inode_lists(newlisthead, oldlisthead) + struct allocdirectlst *newlisthead; + struct allocdirectlst *oldlisthead; { struct allocdirect *listadp, *newadp; - newadp = TAILQ_FIRST(&inodedep->id_newinoupdt); - for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) { + newadp = TAILQ_FIRST(newlisthead); + for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { if (listadp->ad_lbn < newadp->ad_lbn) { listadp = TAILQ_NEXT(listadp, ad_next); continue; } - TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next); + TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); if (listadp->ad_lbn == newadp->ad_lbn) { - allocdirect_merge(&inodedep->id_inoupdt, newadp, + allocdirect_merge(oldlisthead, newadp, listadp); listadp = newadp; } - newadp = TAILQ_FIRST(&inodedep->id_newinoupdt); + newadp = TAILQ_FIRST(newlisthead); } - while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) { -
TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next); - TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next); + while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { + TAILQ_REMOVE(newlisthead, newadp, ad_next); + TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); } } @@ -4454,6 +4733,8 @@ softdep_fsync(vp) } if (LIST_FIRST(&inodedep->id_inowait) != NULL || LIST_FIRST(&inodedep->id_bufwait) != NULL || + TAILQ_FIRST(&inodedep->id_extupdt) != NULL || + TAILQ_FIRST(&inodedep->id_newextupdt) != NULL || TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) { FREE_LOCK(&lk); @@ -4877,9 +5158,7 @@ flush_inodedep_deps(fs, ino) ino_t ino; { struct inodedep *inodedep; - struct allocdirect *adp; int error, waitfor; - struct buf *bp; /* * This work is done in two passes. The first pass grabs most @@ -4894,52 +5173,17 @@ flush_inodedep_deps(fs, ino) * We give a brief window at the top of the loop to allow * any pending I/O to complete. */ - for (waitfor = MNT_NOWAIT; ; ) { + for (error = 0, waitfor = MNT_NOWAIT; ; ) { + if (error) + return (error); FREE_LOCK(&lk); ACQUIRE_LOCK(&lk); if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) return (0); - TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) { - if (adp->ad_state & DEPCOMPLETE) - continue; - bp = adp->ad_buf; - if (getdirtybuf(&bp, waitfor) == 0) { - if (waitfor == MNT_NOWAIT) - continue; - break; - } - FREE_LOCK(&lk); - if (waitfor == MNT_NOWAIT) { - bawrite(bp); - } else if ((error = BUF_WRITE(bp)) != 0) { - ACQUIRE_LOCK(&lk); - return (error); - } - ACQUIRE_LOCK(&lk); - break; - } - if (adp != NULL) - continue; - TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) { - if (adp->ad_state & DEPCOMPLETE) - continue; - bp = adp->ad_buf; - if (getdirtybuf(&bp, waitfor) == 0) { - if (waitfor == MNT_NOWAIT) - continue; - break; - } - FREE_LOCK(&lk); - if (waitfor == MNT_NOWAIT) { - bawrite(bp); - } else if ((error = BUF_WRITE(bp)) != 0) { - ACQUIRE_LOCK(&lk); - return (error); - } - 
ACQUIRE_LOCK(&lk); - break; - } - if (adp != NULL) + if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || + flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || + flush_deplist(&inodedep->id_extupdt, waitfor, &error) || + flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) continue; /* * If pass2, we are done, otherwise do pass 2. @@ -4957,6 +5201,41 @@ flush_inodedep_deps(fs, ino) } /* + * Flush an inode dependency list. + * Called with splbio blocked. + */ +static int +flush_deplist(listhead, waitfor, errorp) + struct allocdirectlst *listhead; + int waitfor; + int *errorp; +{ + struct allocdirect *adp; + struct buf *bp; + + TAILQ_FOREACH(adp, listhead, ad_next) { + if (adp->ad_state & DEPCOMPLETE) + continue; + bp = adp->ad_buf; + if (getdirtybuf(&bp, waitfor) == 0) { + if (waitfor == MNT_NOWAIT) + continue; + return (1); + } + FREE_LOCK(&lk); + if (waitfor == MNT_NOWAIT) { + bawrite(bp); + } else if ((*errorp = BUF_WRITE(bp)) != 0) { + ACQUIRE_LOCK(&lk); + return (1); + } + ACQUIRE_LOCK(&lk); + return (1); + } + return (0); +} + +/* * Eliminate a pagedep dependency by flushing out all its diradd dependencies. * Called with splbio blocked. 
*/ @@ -5406,6 +5685,12 @@ softdep_count_dependencies(bp, wantcount) if (!wantcount) goto out; } + if (TAILQ_FIRST(&inodedep->id_extupdt)) { + /* direct block pointer dependency */ + retval += 1; + if (!wantcount) + goto out; + } continue; case D_INDIRDEP: diff --git a/sys/ufs/ffs/ffs_softdep_stub.c b/sys/ufs/ffs/ffs_softdep_stub.c index df084c7dddc8..c20b53c1e51c 100644 --- a/sys/ufs/ffs/ffs_softdep_stub.c +++ b/sys/ufs/ffs/ffs_softdep_stub.c @@ -123,6 +123,20 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) panic("softdep_setup_allocdirect called"); } +void +softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) + struct inode *ip; + ufs_lbn_t lbn; + ufs2_daddr_t newblkno; + ufs2_daddr_t oldblkno; + long newsize; + long oldsize; + struct buf *bp; +{ + + panic("softdep_setup_allocdirect called"); +} + void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; diff --git a/sys/ufs/ffs/softdep.h b/sys/ufs/ffs/softdep.h index cbee51b5a84f..f29e89f2622d 100644 --- a/sys/ufs/ffs/softdep.h +++ b/sys/ufs/ffs/softdep.h @@ -89,8 +89,10 @@ * dependencies are complete. The INPROGRESS flag marks worklist * structures that are still on the worklist, but are being considered * for action by some process. The UFS1FMT flag indicates that the - * inode being processed is a ufs1 format. The ONWORKLIST flag shows - * whether the structure is currently linked onto a worklist. + * inode being processed is a ufs1 format. The EXTDATA flag indicates + * that the allocdirect describes an extended-attributes dependency. + * The ONWORKLIST flag shows whether the structure is currently linked + * onto a worklist. 
*/ #define ATTACHED 0x0001 #define UNDONE 0x0002 @@ -106,6 +108,7 @@ #define NEWBLOCK 0x0800 /* pagedep only */ #define INPROGRESS 0x1000 /* dirrem, freeblks, freefrag, freefile only */ #define UFS1FMT 0x2000 /* indirdep only */ +#define EXTDATA 0x4000 /* allocdirect only */ #define ONWORKLIST 0x8000 #define ALLCOMPLETE (ATTACHED | COMPLETE | DEPCOMPLETE) @@ -251,12 +254,15 @@ struct inodedep { nlink_t id_nlinkdelta; /* saved effective link count */ LIST_ENTRY(inodedep) id_deps; /* bmsafemap's list of inodedep's */ struct buf *id_buf; /* related bmsafemap (if pending) */ + long id_savedextsize; /* ext size saved during rollback */ off_t id_savedsize; /* file size saved during rollback */ struct workhead id_pendinghd; /* entries awaiting directory write */ struct workhead id_bufwait; /* operations after inode written */ struct workhead id_inowait; /* operations waiting inode update */ struct allocdirectlst id_inoupdt; /* updates before inode written */ struct allocdirectlst id_newinoupdt; /* updates when inode written */ + struct allocdirectlst id_extupdt; /* extdata updates pre-inode write */ + struct allocdirectlst id_newextupdt; /* extdata updates at ino write */ union { struct ufs1_dinode *idu_savedino1; /* saved ufs1_dinode contents */ struct ufs2_dinode *idu_savedino2; /* saved ufs2_dinode contents */ @@ -427,11 +433,12 @@ struct freeblks { uid_t fb_uid; /* uid of previous owner of blocks */ struct vnode *fb_devvp; /* filesystem device vnode */ struct mount *fb_mnt; /* associated mount point */ + long fb_oldextsize; /* previous ext data size */ off_t fb_oldsize; /* previous file size */ - off_t fb_newsize; /* new file size */ ufs2_daddr_t fb_chkcnt; /* used to check cnt of blks released */ ufs2_daddr_t fb_dblks[NDADDR]; /* direct blk ptrs to deallocate */ ufs2_daddr_t fb_iblks[NIADDR]; /* indirect blk ptrs to deallocate */ + ufs2_daddr_t fb_eblks[NXADDR]; /* ext data blk ptrs to deallocate */ }; /* diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c
index abe2bea0cd87..731354e2a050 100644 --- a/sys/ufs/ufs/ufs_bmap.c +++ b/sys/ufs/ufs/ufs_bmap.c @@ -83,7 +83,7 @@ ufs_bmap(ap) if (ap->a_bnp == NULL) return (0); - error = ufs_bmaparray(ap->a_vp, ap->a_bn, &blkno, + error = ufs_bmaparray(ap->a_vp, ap->a_bn, &blkno, NULL, ap->a_runp, ap->a_runb); *ap->a_bnp = blkno; return (error); @@ -104,10 +104,11 @@ ufs_bmap(ap) */ int -ufs_bmaparray(vp, bn, bnp, runp, runb) +ufs_bmaparray(vp, bn, bnp, nbp, runp, runb) struct vnode *vp; ufs2_daddr_t bn; ufs2_daddr_t *bnp; + struct buf *nbp; int *runp; int *runb; { @@ -146,7 +147,19 @@ ufs_bmaparray(vp, bn, bnp, runp, runb) num = *nump; if (num == 0) { - *bnp = blkptrtodb(ump, DIP(ip, i_db[bn])); + if (bn >= 0 && bn < NDADDR) { + *bnp = blkptrtodb(ump, DIP(ip, i_db[bn])); + } else if (bn < 0 && bn >= -NXADDR) { + *bnp = blkptrtodb(ump, ip->i_din2->di_extb[-1 - bn]); + if (*bnp == 0) + *bnp = -1; + if (nbp == NULL) + panic("ufs_bmaparray: mapping ext data"); + nbp->b_xflags |= BX_ALTDATA; + return (0); + } else { + panic("ufs_bmaparray: blkno out of range"); + } /* * Since this is FFS independent code, we are out of * scope for the definitions of BLK_NOCOPY and diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h index 85b508ae88c7..d4e333c2de9a 100644 --- a/sys/ufs/ufs/ufs_extern.h +++ b/sys/ufs/ufs/ufs_extern.h @@ -60,13 +60,15 @@ int ufs_vnoperatefifo(struct vop_generic_args *); int ufs_vnoperatespec(struct vop_generic_args *); int ufs_bmap(struct vop_bmap_args *); -int ufs_bmaparray(struct vnode *, ufs2_daddr_t, ufs2_daddr_t *, int *, - int *); +int ufs_bmaparray(struct vnode *, ufs2_daddr_t, ufs2_daddr_t *, + struct buf *, int *, int *); int ufs_fhtovp(struct mount *, struct ufid *, struct vnode **); int ufs_checkpath(struct inode *, struct inode *, struct ucred *); void ufs_dirbad(struct inode *, doff_t, char *); int ufs_dirbadentry(struct vnode *, struct direct *, int); int ufs_dirempty(struct inode *, ino_t, struct ucred *); +int ufs_extread(struct 
vop_read_args *); +int ufs_extwrite(struct vop_write_args *); void ufs_makedirentry(struct inode *, struct componentname *, struct direct *); int ufs_direnter(struct vnode *, struct vnode *, struct direct *, @@ -107,10 +109,12 @@ void softdep_change_linkcnt(struct inode *); void softdep_releasefile(struct inode *); int softdep_slowdown(struct vnode *); -/* Flags to low-level allocation routines. */ -#define BA_CLRBUF 0x01 /* Request allocated buffer be cleared. */ -#define BA_SYNC 0x02 /* Do all allocations synchronously. */ -#define BA_METAONLY 0x04 /* Return indirect block buffer. */ -#define BA_NOWAIT 0x08 /* do not sleep to await lock */ +/* + * Flags to low-level allocation routines. + * The low 16-bits are reserved for IO_ flags from vnode.h. + */ +#define BA_CLRBUF 0x00010000 /* Request alloced buffer be cleared. */ +#define BA_METAONLY 0x00020000 /* Return indirect block buffer. */ +#define BA_NOWAIT 0x00040000 /* Do not sleep to await lock. */ #endif /* !_UFS_UFS_EXTERN_H_ */ diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c index c9ac36259dd4..3166fecc8c9b 100644 --- a/sys/ufs/ufs/ufs_inode.c +++ b/sys/ufs/ufs/ufs_inode.c @@ -95,7 +95,8 @@ ufs_inactive(ap) #ifdef UFS_EXTATTR ufs_extattr_vnode_inactive(ap->a_vp, ap->a_td); #endif - error = UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td); + error = UFS_TRUNCATE(vp, (off_t)0, IO_EXT | IO_NORMAL, + NOCRED, td); /* * Setting the mode to zero needs to wait for the inode * to be written just as does a change to the link count. 
diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c index 1df9146e04c6..4515b6db7fcf 100644 --- a/sys/ufs/ufs/ufs_lookup.c +++ b/sys/ufs/ufs/ufs_lookup.c @@ -752,7 +752,7 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp) panic("ufs_direnter: newblk"); flags = BA_CLRBUF; if (!DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp)) - flags |= BA_SYNC; + flags |= IO_SYNC; if ((error = UFS_BALLOC(dvp, (off_t)dp->i_offset, DIRBLKSIZ, cr, flags, &bp)) != 0) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) @@ -961,7 +961,8 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp) if (dp->i_dirhash != NULL) ufsdirhash_dirtrunc(dp, dp->i_endoff); #endif - (void) UFS_TRUNCATE(dvp, (off_t)dp->i_endoff, IO_SYNC, cr, td); + (void) UFS_TRUNCATE(dvp, (off_t)dp->i_endoff, + IO_NORMAL | IO_SYNC, cr, td); if (tvp != NULL) vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, td); } diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c index 406832ed6386..9db4e8794e35 100644 --- a/sys/ufs/ufs/ufs_readwrite.c +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -1,4 +1,13 @@ /*- + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Marshall + * Kirk McKusick and Network Associates Laboratories, the Security + * Research Division of Network Associates, Inc. under DARPA/SPAWAR + * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS + * research program + * * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. 
* @@ -77,6 +86,9 @@ READ(ap) int ioflag; vm_object_t object; + if (ap->a_ioflag & IO_EXT) + return (ufs_extread(ap)); + GIANT_REQUIRED; vp = ap->a_vp; @@ -400,6 +412,9 @@ WRITE(ap) int blkoffset, error, extended, flags, ioflag, resid, size, xfersize; vm_object_t object; + if (ap->a_ioflag & IO_EXT) + return (ufs_extwrite(ap)); + GIANT_REQUIRED; extended = 0; @@ -471,7 +486,7 @@ WRITE(ap) osize = ip->i_size; flags = 0; if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) - flags = BA_SYNC; + flags = IO_SYNC; #ifdef ENABLE_VFS_IOOPT if (object && (object->flags & OBJ_OPT)) { @@ -581,7 +596,8 @@ WRITE(ap) if (error) { if (ioflag & IO_UNIT) { (void)UFS_TRUNCATE(vp, osize, - ioflag & IO_SYNC, ap->a_cred, uio->uio_td); + IO_NORMAL | (ioflag & IO_SYNC), + ap->a_cred, uio->uio_td); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } @@ -595,7 +611,6 @@ WRITE(ap) return (error); } - /* * get page routine */ @@ -661,7 +676,7 @@ ffs_getpages(ap) poff = (foff % bsize) / PAGE_SIZE; dp = VTOI(vp)->i_devvp; - if (ufs_bmaparray(vp, reqlblkno, &reqblkno, &bforwards, &bbackwards) + if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards) || (reqblkno == -1)) { for(i = 0; i < pcount; i++) { if (i != ap->a_reqpage) @@ -730,3 +745,321 @@ ffs_getpages(ap) return (rtval); } + +/* + * Vnode op for extended attribute reading.
+ */ +/* ARGSUSED */ +int +ufs_extread(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp; + struct inode *ip; + struct ufs2_dinode *dp; + struct uio *uio; + struct fs *fs; + struct buf *bp; + ufs_lbn_t lbn, nextlbn; + off_t bytesinfile; + long size, xfersize, blkoffset; + int error, orig_resid; + mode_t mode; + int ioflag; + + GIANT_REQUIRED; + + vp = ap->a_vp; + ip = VTOI(vp); + fs = ip->i_fs; + dp = ip->i_din2; + mode = ip->i_mode; + uio = ap->a_uio; + ioflag = ap->a_ioflag; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) + panic("ufs_extread: mode"); + +#endif + orig_resid = uio->uio_resid; + if (orig_resid <= 0) + return (0); + + bytesinfile = dp->di_extsize - uio->uio_offset; + if (bytesinfile <= 0) { + if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) + ip->i_flag |= IN_ACCESS; + return 0; + } + + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) + break; + + lbn = lblkno(fs, uio->uio_offset); + nextlbn = lbn + 1; + + /* + * size of buffer. The buffer representing the + * end of the file is rounded up to the size of + * the block type ( fragment or full block, + * depending ). + */ + size = sblksize(fs, dp->di_extsize, lbn); + blkoffset = blkoff(fs, uio->uio_offset); + + /* + * The amount we want to transfer in this iteration is + * one FS block less the amount of the data before + * our startpoint (duh!) + */ + xfersize = fs->fs_bsize - blkoffset; + + /* + * But if we actually want less than the block, + * or the file doesn't have a whole block more of data, + * then use the lesser number. + */ + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + if (bytesinfile < xfersize) + xfersize = bytesinfile; + + if (lblktosize(fs, nextlbn) >= dp->di_extsize) { + /* + * Don't do readahead if this is the end of the info. 
+ */ + error = bread(vp, -1 - lbn, size, NOCRED, &bp); + } else { + /* + * If we have a second block, then + * fire off a request for a readahead + * as well as a read. Note that the 4th and 5th + * arguments point to arrays of the size specified in + * the 6th argument. + */ + int nextsize = sblksize(fs, dp->di_extsize, nextlbn); + + nextlbn = -1 - nextlbn; + error = breadn(vp, -1 - lbn, + size, &nextlbn, &nextsize, 1, NOCRED, &bp); + } + if (error) { + brelse(bp); + bp = NULL; + break; + } + + /* + * If IO_DIRECT then set B_DIRECT for the buffer. This + * will cause us to attempt to release the buffer later on + * and will cause the buffer cache to attempt to free the + * underlying pages. + */ + if (ioflag & IO_DIRECT) + bp->b_flags |= B_DIRECT; + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. + */ + size -= bp->b_resid; + if (size < xfersize) { + if (size == 0) + break; + xfersize = size; + } + + error = uiomove((char *)bp->b_data + blkoffset, + (int)xfersize, uio); + if (error) + break; + + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + /* + * If there are no dependencies, and it's VMIO, + * then we don't need the buf, mark it available + * for freeing. The VM has the data. + */ + bp->b_flags |= B_RELBUF; + brelse(bp); + } else { + /* + * Otherwise let whoever + * made the request take care of + * freeing it. We just queue + * it onto another list. + */ + bqrelse(bp); + } + } + + /* + * This can only happen in the case of an error + * because the loop above resets bp to NULL on each iteration + * and on normal completion has not set a new value into it. 
+ * so it must have come from a 'break' statement + */ + if (bp != NULL) { + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + bp->b_flags |= B_RELBUF; + brelse(bp); + } else { + bqrelse(bp); + } + } + + if ((error == 0 || uio->uio_resid != orig_resid) && + (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) + ip->i_flag |= IN_ACCESS; + return (error); +} + +/* + * Vnode op for extended attribute writing. + */ +int +ufs_extwrite(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp; + struct uio *uio; + struct inode *ip; + struct ufs2_dinode *dp; + struct fs *fs; + struct buf *bp; + ufs_lbn_t lbn; + off_t osize; + int blkoffset, error, flags, ioflag, resid, size, xfersize; + + GIANT_REQUIRED; + + vp = ap->a_vp; + ip = VTOI(vp); + fs = ip->i_fs; + dp = ip->i_din2; + uio = ap->a_uio; + ioflag = ap->a_ioflag; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC) + panic("ext_write: mode"); +#endif + + if (ioflag & IO_APPEND) + uio->uio_offset = dp->di_extsize; + + if (uio->uio_offset < 0 || + (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize) + return (EFBIG); + + resid = uio->uio_resid; + osize = dp->di_extsize; + flags = IO_EXT; + if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) + flags |= IO_SYNC; + + for (error = 0; uio->uio_resid > 0;) { + lbn = lblkno(fs, uio->uio_offset); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = fs->fs_bsize - blkoffset; + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + + /* + * We must perform a read-before-write if the transfer size + * does not cover the entire buffer.
+ */ + if (fs->fs_bsize > xfersize) + flags |= BA_CLRBUF; + else + flags &= ~BA_CLRBUF; + error = UFS_BALLOC(vp, uio->uio_offset, xfersize, + ap->a_cred, flags, &bp); + if (error != 0) + break; + /* + * If the buffer is not valid we have to clear out any + * garbage data from the pages instantiated for the buffer. + * If we do not, a failed uiomove() during a write can leave + * the prior contents of the pages exposed to a userland + * mmap(). XXX deal with uiomove() errors a better way. + */ + if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) + vfs_bio_clrbuf(bp); + if (ioflag & IO_DIRECT) + bp->b_flags |= B_DIRECT; + if (ioflag & IO_NOWDRAIN) + bp->b_flags |= B_NOWDRAIN; + + if (uio->uio_offset + xfersize > dp->di_extsize) + dp->di_extsize = uio->uio_offset + xfersize; + + size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid; + if (size < xfersize) + xfersize = size; + + error = + uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + bp->b_flags |= B_RELBUF; + } + + /* + * If IO_SYNC each buffer is written synchronously. Otherwise + * if we have a severe page deficiency write the buffer + * asynchronously. Otherwise try to cluster, and if that + * doesn't do it then either do an async write (if O_DIRECT), + * or a delayed write (if not). + */ + if (ioflag & IO_SYNC) { + (void)bwrite(bp); + } else if (vm_page_count_severe() || + buf_dirty_count_severe() || + xfersize + blkoffset == fs->fs_bsize || + (ioflag & (IO_ASYNC | IO_DIRECT))) + bawrite(bp); + else + bdwrite(bp); + if (error || xfersize == 0) + break; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + /* + * If we successfully wrote any data, and we are not the superuser + * we clear the setuid and setgid bits as a precaution against + * tampering. 
+ */ + if (resid > uio->uio_resid && ap->a_cred && + suser_cred(ap->a_cred, PRISON_ROOT)) { + ip->i_mode &= ~(ISUID | ISGID); + dp->di_mode = ip->i_mode; + } + if (error) { + if (ioflag & IO_UNIT) { + (void)UFS_TRUNCATE(vp, osize, + IO_EXT | (ioflag&IO_SYNC), ap->a_cred, uio->uio_td); + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } + } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) + error = UFS_UPDATE(vp, 1); + return (error); +} diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 0ef9ed2c3b87..66d8319280ff 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -555,7 +555,8 @@ ufs_setattr(ap) default: break; } - if ((error = UFS_TRUNCATE(vp, vap->va_size, 0, cred, td)) != 0) + if ((error = UFS_TRUNCATE(vp, vap->va_size, IO_NORMAL, + cred, td)) != 0) return (error); } if (vap->va_atime.tv_sec != VNOVAL || @@ -1268,7 +1269,9 @@ abortit: xp->i_nlink--; DIP(xp, i_nlink) = xp->i_nlink; xp->i_flag |= IN_CHANGE; - ioflag = DOINGASYNC(tvp) ? 0 : IO_SYNC; + ioflag = IO_NORMAL; + if (DOINGASYNC(tvp)) + ioflag |= IO_SYNC; if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag, tcnp->cn_cred, tcnp->cn_thread)) != 0) goto bad; @@ -1762,7 +1765,9 @@ ufs_rmdir(ap) ip->i_nlink--; DIP(ip, i_nlink) = ip->i_nlink; ip->i_flag |= IN_CHANGE; - ioflag = DOINGASYNC(vp) ? 0 : IO_SYNC; + ioflag = IO_NORMAL; + if (DOINGASYNC(vp)) + ioflag |= IO_SYNC; error = UFS_TRUNCATE(vp, (off_t)0, ioflag, cnp->cn_cred, cnp->cn_thread); } @@ -1980,7 +1985,7 @@ ufs_strategy(ap) if (vp->v_type == VBLK || vp->v_type == VCHR) panic("ufs_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { - error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, NULL); + error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, bp, NULL, NULL); bp->b_blkno = blkno; if (error) { bp->b_error = error; |