aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
authorKirk McKusick <mckusick@FreeBSD.org>2018-08-18 22:21:59 +0000
committerKirk McKusick <mckusick@FreeBSD.org>2018-08-18 22:21:59 +0000
commit7e038bc257e9c5f7563695b88b481e493a33576f (patch)
treed1186e77ac4b97483067bb633ce79107e226047e /sys
parentdb7c2a482248b83d7e325ffa36ce14d93ab78ad2 (diff)
downloadsrc-7e038bc257e9c5f7563695b88b481e493a33576f.tar.gz
src-7e038bc257e9c5f7563695b88b481e493a33576f.zip
Replace the TRIM consolidation framework originally added in -r337396
driven by problems found with the algorithms being tested for TRIM consolidation. Reported by: Peter Holm Suggested by: kib Reviewed by: kib Sponsored by: Netflix
Notes
Notes: svn path=/head/; revision=338031
Diffstat (limited to 'sys')
-rw-r--r--sys/ufs/ffs/ffs_alloc.c148
-rw-r--r--sys/ufs/ffs/ffs_balloc.c4
-rw-r--r--sys/ufs/ffs/ffs_extern.h20
-rw-r--r--sys/ufs/ffs/ffs_inode.c21
-rw-r--r--sys/ufs/ffs/ffs_snapshot.c6
-rw-r--r--sys/ufs/ffs/ffs_softdep.c84
-rw-r--r--sys/ufs/ffs/ffs_vfsops.c3
-rw-r--r--sys/ufs/ffs/softdep.h1
-rw-r--r--sys/ufs/ufs/ufsmount.h10
9 files changed, 232 insertions, 65 deletions
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
index 1e1b4f1350a1..1267de701dec 100644
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@@ -110,8 +110,6 @@ static ufs2_daddr_t
static void ffs_blkfree_cg(struct ufsmount *, struct fs *,
struct vnode *, ufs2_daddr_t, long, ino_t,
struct workhead *);
-static void ffs_blkfree_trim_completed(struct buf *);
-static void ffs_blkfree_trim_task(void *ctx, int pending __unused);
#ifdef INVARIANTS
static int ffs_checkblk(struct inode *, ufs2_daddr_t, long);
#endif
@@ -395,8 +393,24 @@ retry:
if (bno > 0) {
bp->b_blkno = fsbtodb(fs, bno);
if (!DOINGSOFTDEP(vp))
+ /*
+ * The usual case is that a smaller fragment that
+ * was just allocated has been replaced with a bigger
+ * fragment or a full-size block. If it is marked as
+ * B_DELWRI, the current contents have not been written
+ * to disk. It is possible that the block was written
+ * earlier, but very uncommon. If the block has never
+ * been written, there is no need to send a BIO_DELETE
+ * for it when it is freed. The gain from avoiding the
+ * TRIMs for the common case of unwritten blocks far
+ * exceeds the cost of the write amplification for the
+ * uncommon case of failing to send a TRIM for a block
+ * that had been written.
+ */
ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize,
- ip->i_number, vp->v_type, NULL);
+ ip->i_number, vp->v_type, NULL,
+ (bp->b_flags & B_DELWRI) != 0 ?
+ NOTRIM_KEY : SINGLETON_KEY);
delta = btodb(nsize - osize);
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
if (flags & IO_EXT)
@@ -521,7 +535,7 @@ ffs_reallocblks_ufs1(ap)
struct fs *fs;
struct inode *ip;
struct vnode *vp;
- struct buf *sbp, *ebp;
+ struct buf *sbp, *ebp, *bp;
ufs1_daddr_t *bap, *sbap, *ebap;
struct cluster_save *buflist;
struct ufsmount *ump;
@@ -730,14 +744,30 @@ ffs_reallocblks_ufs1(ap)
printf("\n\tnew:");
#endif
for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
+ bp = buflist->bs_children[i];
if (!DOINGSOFTDEP(vp))
+ /*
+ * The usual case is that a set of N-contiguous blocks
+ * that was just allocated has been replaced with a
+ * set of N+1-contiguous blocks. If they are marked as
+ * B_DELWRI, the current contents have not been written
+ * to disk. It is possible that the blocks were written
+ * earlier, but very uncommon. If the blocks have never
+ * been written, there is no need to send a BIO_DELETE
+ * for them when they are freed. The gain from avoiding
+ * the TRIMs for the common case of unwritten blocks
+ * far exceeds the cost of the write amplification for
+ * the uncommon case of failing to send a TRIM for the
+ * blocks that had been written.
+ */
ffs_blkfree(ump, fs, ump->um_devvp,
- dbtofsb(fs, buflist->bs_children[i]->b_blkno),
- fs->fs_bsize, ip->i_number, vp->v_type, NULL);
- buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
+ dbtofsb(fs, bp->b_blkno),
+ fs->fs_bsize, ip->i_number, vp->v_type, NULL,
+ (bp->b_flags & B_DELWRI) != 0 ?
+ NOTRIM_KEY : SINGLETON_KEY);
+ bp->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
- if (!ffs_checkblk(ip,
- dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
+ if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 3");
#endif
#ifdef DEBUG
@@ -771,7 +801,7 @@ ffs_reallocblks_ufs2(ap)
struct fs *fs;
struct inode *ip;
struct vnode *vp;
- struct buf *sbp, *ebp;
+ struct buf *sbp, *ebp, *bp;
ufs2_daddr_t *bap, *sbap, *ebap;
struct cluster_save *buflist;
struct ufsmount *ump;
@@ -978,14 +1008,30 @@ ffs_reallocblks_ufs2(ap)
printf("\n\tnew:");
#endif
for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
+ bp = buflist->bs_children[i];
if (!DOINGSOFTDEP(vp))
+ /*
+ * The usual case is that a set of N-contiguous blocks
+ * that was just allocated has been replaced with a
+ * set of N+1-contiguous blocks. If they are marked as
+ * B_DELWRI, the current contents have not been written
+ * to disk. It is possible that the blocks were written
+ * earlier, but very uncommon. If the blocks have never
+ * been written, there is no need to send a BIO_DELETE
+ * for them when they are freed. The gain from avoiding
+ * the TRIMs for the common case of unwritten blocks
+ * far exceeds the cost of the write amplification for
+ * the uncommon case of failing to send a TRIM for the
+ * blocks that had been written.
+ */
ffs_blkfree(ump, fs, ump->um_devvp,
- dbtofsb(fs, buflist->bs_children[i]->b_blkno),
- fs->fs_bsize, ip->i_number, vp->v_type, NULL);
- buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
+ dbtofsb(fs, bp->b_blkno),
+ fs->fs_bsize, ip->i_number, vp->v_type, NULL,
+ (bp->b_flags & B_DELWRI) != 0 ?
+ NOTRIM_KEY : SINGLETON_KEY);
+ bp->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
- if (!ffs_checkblk(ip,
- dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
+ if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
panic("ffs_reallocblks: unallocated block 3");
#endif
#ifdef DEBUG
@@ -1823,8 +1869,7 @@ gotit:
/* XXX Fixme. */
UFS_UNLOCK(ump);
if (DOINGSOFTDEP(ITOV(ip)))
- softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
- size, 0);
+ softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0);
UFS_LOCK(ump);
return (blkno);
}
@@ -2254,6 +2299,17 @@ ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd)
bdwrite(bp);
}
+/*
+ * Structures and routines associated with trim management.
+ */
+MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures");
+
+#define TRIMLIST_HASH(ump, key) \
+ (&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize])
+
+static void ffs_blkfree_trim_completed(struct buf *);
+static void ffs_blkfree_trim_task(void *ctx, int pending __unused);
+
struct ffs_blkfree_trim_params {
struct task task;
struct ufsmount *ump;
@@ -2277,7 +2333,7 @@ ffs_blkfree_trim_task(ctx, pending)
tp->inum, tp->pdephd);
vn_finished_secondary_write(UFSTOVFS(tp->ump));
atomic_add_int(&tp->ump->um_trim_inflight, -1);
- free(tp, M_TEMP);
+ free(tp, M_TRIM);
}
static void
@@ -2287,13 +2343,45 @@ ffs_blkfree_trim_completed(bp)
struct ffs_blkfree_trim_params *tp;
tp = bp->b_fsprivate1;
- free(bp, M_TEMP);
+ free(bp, M_TRIM);
TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp);
taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task);
}
+/*
+ * Allocate a new key to use to identify a range of blocks.
+ */
+u_long
+ffs_blkrelease_start(ump, devvp, inum)
+ struct ufsmount *ump;
+ struct vnode *devvp;
+ ino_t inum;
+{
+ static u_long masterkey;
+ u_long key;
+
+ if ((ump->um_flags & UM_CANDELETE) == 0)
+ return (SINGLETON_KEY);
+ do {
+ key = atomic_fetchadd_long(&masterkey, 1);
+ } while (key < FIRST_VALID_KEY);
+ return (key);
+}
+
+/*
+ * Deallocate a key that has been used to identify a range of blocks.
+ */
+void
+ffs_blkrelease_finish(ump, key)
+ struct ufsmount *ump;
+ u_long key;
+{
+
+ return;
+}
+
void
-ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
+ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd, key)
struct ufsmount *ump;
struct fs *fs;
struct vnode *devvp;
@@ -2302,6 +2390,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
ino_t inum;
enum vtype vtype;
struct workhead *dephd;
+ u_long key;
{
struct mount *mp;
struct buf *bp;
@@ -2319,10 +2408,11 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
return;
}
/*
- * Nothing to delay if TRIM is disabled, or the operation is
- * performed on the snapshot.
+ * Nothing to delay if TRIM is not required for this block or TRIM
+ * is disabled or the operation is performed on a snapshot.
*/
- if (((ump->um_flags) & UM_CANDELETE) == 0 || devvp->v_type == VREG) {
+ if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) ||
+ devvp->v_type == VREG) {
ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
return;
}
@@ -2334,7 +2424,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
* and write some new data into it.
*/
atomic_add_int(&ump->um_trim_inflight, 1);
- tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK);
+ tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK);
tp->ump = ump;
tp->devvp = devvp;
tp->bno = bno;
@@ -2347,7 +2437,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
} else
tp->pdephd = NULL;
- bp = malloc(sizeof(*bp), M_TEMP, M_WAITOK | M_ZERO);
+ bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO);
bp->b_iocmd = BIO_DELETE;
bp->b_iooffset = dbtob(fsbtodb(fs, bno));
bp->b_iodone = ffs_blkfree_trim_completed;
@@ -2822,6 +2912,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
struct fs *fs;
ufs2_daddr_t blkno;
long blkcnt, blksize;
+ u_long key;
struct file *fp, *vfp;
cap_rights_t rights;
int filetype, error;
@@ -2956,15 +3047,18 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
blkno = cmd.value;
blkcnt = cmd.size;
blksize = fs->fs_frag - (blkno % fs->fs_frag);
+ key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO);
while (blkcnt > 0) {
- if (blksize > blkcnt)
+ if (blkcnt < blksize)
blksize = blkcnt;
ffs_blkfree(ump, fs, ump->um_devvp, blkno,
- blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL);
+ blksize * fs->fs_fsize, UFS_ROOTINO,
+ VDIR, NULL, key);
blkno += blksize;
blkcnt -= blksize;
blksize = fs->fs_frag;
}
+ ffs_blkrelease_finish(ump, key);
break;
/*
diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c
index 6143b4fca8c0..ad41f316000a 100644
--- a/sys/ufs/ffs/ffs_balloc.c
+++ b/sys/ufs/ffs/ffs_balloc.c
@@ -553,7 +553,7 @@ fail:
lbns_remfree++;
#endif
ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
- ip->i_number, vp->v_type, NULL);
+ ip->i_number, vp->v_type, NULL, SINGLETON_KEY);
}
return (error);
}
@@ -1147,7 +1147,7 @@ fail:
lbns_remfree++;
#endif
ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
- ip->i_number, vp->v_type, NULL);
+ ip->i_number, vp->v_type, NULL, SINGLETON_KEY);
}
return (error);
}
diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h
index 2df48ec91de9..6b8e708f7371 100644
--- a/sys/ufs/ffs/ffs_extern.h
+++ b/sys/ufs/ffs/ffs_extern.h
@@ -63,9 +63,11 @@ int ffs_balloc_ufs2(struct vnode *a_vp, off_t a_startoffset, int a_size,
struct ucred *a_cred, int a_flags, struct buf **a_bpp);
int ffs_blkatoff(struct vnode *, off_t, char **, struct buf **);
void ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *,
- ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *);
+ ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *, u_long);
ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *);
ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
+void ffs_blkrelease_finish(struct ufsmount *, u_long);
+u_long ffs_blkrelease_start(struct ufsmount *, struct vnode *, ino_t);
int ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int);
@@ -111,11 +113,27 @@ vfs_vget_t ffs_vget;
int ffs_vgetf(struct mount *, ino_t, int, struct vnode **, int);
void process_deferred_inactive(struct mount *mp);
+/*
+ * Flags to ffs_vgetf
+ */
#define FFSV_FORCEINSMQ 0x0001
+/*
+ * Flags to ffs_reload
+ */
#define FFSR_FORCE 0x0001
#define FFSR_UNSUSPEND 0x0002
+/*
+ * Definitions for TRIM interface
+ *
+ * Special keys and recommended hash table size
+ */
+#define NOTRIM_KEY 1 /* never written, so don't call trim for it */
+#define SINGLETON_KEY 2 /* only block being freed, so trim it now */
+#define FIRST_VALID_KEY 3 /* first valid key describing a block range */
+#define MAXTRIMIO 1024 /* maximum expected outstanding trim requests */
+
extern struct vop_vector ffs_vnodeops1;
extern struct vop_vector ffs_fifoops1;
extern struct vop_vector ffs_vnodeops2;
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index 6a26ef97189a..0ec662179880 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -197,6 +197,7 @@ ffs_truncate(vp, length, flags, cred)
int needextclean, extblocks;
int offset, size, level, nblocks;
int i, error, allerror, indiroff, waitforupdate;
+ u_long key;
off_t osize;
ip = VTOI(vp);
@@ -275,7 +276,7 @@ ffs_truncate(vp, length, flags, cred)
continue;
ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i],
sblksize(fs, osize, i), ip->i_number,
- vp->v_type, NULL);
+ vp->v_type, NULL, SINGLETON_KEY);
}
}
}
@@ -523,7 +524,7 @@ ffs_truncate(vp, length, flags, cred)
DIP_SET(ip, i_ib[level], 0);
ffs_blkfree(ump, fs, ump->um_devvp, bn,
fs->fs_bsize, ip->i_number,
- vp->v_type, NULL);
+ vp->v_type, NULL, SINGLETON_KEY);
blocksreleased += nblocks;
}
}
@@ -534,6 +535,7 @@ ffs_truncate(vp, length, flags, cred)
/*
* All whole direct blocks or frags.
*/
+ key = ffs_blkrelease_start(ump, ump->um_devvp, ip->i_number);
for (i = UFS_NDADDR - 1; i > lastblock; i--) {
long bsize;
@@ -543,9 +545,10 @@ ffs_truncate(vp, length, flags, cred)
DIP_SET(ip, i_db[i], 0);
bsize = blksize(fs, ip, i);
ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number,
- vp->v_type, NULL);
+ vp->v_type, NULL, key);
blocksreleased += btodb(bsize);
}
+ ffs_blkrelease_finish(ump, key);
if (lastblock < 0)
goto done;
@@ -575,7 +578,8 @@ ffs_truncate(vp, length, flags, cred)
*/
bn += numfrags(fs, newspace);
ffs_blkfree(ump, fs, ump->um_devvp, bn,
- oldspace - newspace, ip->i_number, vp->v_type, NULL);
+ oldspace - newspace, ip->i_number, vp->v_type,
+ NULL, SINGLETON_KEY);
blocksreleased += btodb(oldspace - newspace);
}
}
@@ -634,8 +638,10 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
{
struct buf *bp;
struct fs *fs;
+ struct ufsmount *ump;
struct vnode *vp;
caddr_t copy = NULL;
+ u_long key;
int i, nblocks, error = 0, allerror = 0;
ufs2_daddr_t nb, nlbn, last;
ufs2_daddr_t blkcount, factor, blocksreleased = 0;
@@ -644,6 +650,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
#define BAP(ip, i) (I_IS_UFS1(ip) ? bap1[i] : bap2[i])
fs = ITOFS(ip);
+ ump = ITOUMP(ip);
/*
* Calculate index in current block of last
@@ -719,6 +726,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
/*
* Recursively free totally unused blocks.
*/
+ key = ffs_blkrelease_start(ump, ITODEVVP(ip), ip->i_number);
for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
i--, nlbn += factor) {
nb = BAP(ip, i);
@@ -730,10 +738,11 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
allerror = error;
blocksreleased += blkcount;
}
- ffs_blkfree(ITOUMP(ip), fs, ITODEVVP(ip), nb, fs->fs_bsize,
- ip->i_number, vp->v_type, NULL);
+ ffs_blkfree(ump, fs, ITODEVVP(ip), nb, fs->fs_bsize,
+ ip->i_number, vp->v_type, NULL, key);
blocksreleased += nblocks;
}
+ ffs_blkrelease_finish(ump, key);
/*
* Recursively free last partial block.
diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c
index fed0456b13cb..4453c59517df 100644
--- a/sys/ufs/ffs/ffs_snapshot.c
+++ b/sys/ufs/ffs/ffs_snapshot.c
@@ -583,7 +583,7 @@ loop:
if (len != 0 && len < fs->fs_bsize) {
ffs_blkfree(ump, copy_fs, vp,
DIP(xp, i_db[loc]), len, xp->i_number,
- xvp->v_type, NULL);
+ xvp->v_type, NULL, SINGLETON_KEY);
blkno = DIP(xp, i_db[loc]);
DIP_SET(xp, i_db[loc], 0);
}
@@ -1265,7 +1265,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
if (blkno == BLK_SNAP)
blkno = blkstofrags(fs, lblkno);
ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
- vp->v_type, NULL);
+ vp->v_type, NULL, SINGLETON_KEY);
}
return (0);
}
@@ -1549,7 +1549,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
if (blkno == BLK_SNAP)
blkno = blkstofrags(fs, lblkno);
ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
- vp->v_type, NULL);
+ vp->v_type, NULL, SINGLETON_KEY);
}
return (0);
}
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index 4943555198db..89d0b7382e4d 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -869,7 +869,7 @@ static void cancel_allocdirect(struct allocdirectlst *,
struct allocdirect *, struct freeblks *);
static int check_inode_unwritten(struct inodedep *);
static int free_inodedep(struct inodedep *);
-static void freework_freeblock(struct freework *);
+static void freework_freeblock(struct freework *, u_long);
static void freework_enqueue(struct freework *);
static int handle_workitem_freeblocks(struct freeblks *, int);
static int handle_complete_freeblocks(struct freeblks *, int);
@@ -884,7 +884,7 @@ static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
ufs2_daddr_t, ufs_lbn_t);
static void handle_workitem_freefrag(struct freefrag *);
static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
- ufs_lbn_t);
+ ufs_lbn_t, u_long);
static void allocdirect_merge(struct allocdirectlst *,
struct allocdirect *, struct allocdirect *);
static struct freefrag *allocindir_merge(struct allocindir *,
@@ -5289,7 +5289,22 @@ softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
KASSERT(MOUNTEDSOFTDEP(mp) != 0,
("softdep_setup_allocdirect called on non-softdep filesystem"));
if (oldblkno && oldblkno != newblkno)
- freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
+ /*
+ * The usual case is that a smaller fragment that
+ * was just allocated has been replaced with a bigger
+ * fragment or a full-size block. If it is marked as
+ * B_DELWRI, the current contents have not been written
+ * to disk. It is possible that the block was written
+ * earlier, but very uncommon. If the block has never
+ * been written, there is no need to send a BIO_DELETE
+ * for it when it is freed. The gain from avoiding the
+ * TRIMs for the common case of unwritten blocks far
+ * exceeds the cost of the write amplification for the
+ * uncommon case of failing to send a TRIM for a block
+ * that had been written.
+ */
+ freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
+ (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
else
freefrag = NULL;
@@ -5566,11 +5581,12 @@ newjfreefrag(freefrag, ip, blkno, size, lbn)
* Allocate a new freefrag structure.
*/
static struct freefrag *
-newfreefrag(ip, blkno, size, lbn)
+newfreefrag(ip, blkno, size, lbn, key)
struct inode *ip;
ufs2_daddr_t blkno;
long size;
ufs_lbn_t lbn;
+ u_long key;
{
struct freefrag *freefrag;
struct ufsmount *ump;
@@ -5591,6 +5607,7 @@ newfreefrag(ip, blkno, size, lbn)
freefrag->ff_vtype = ITOV(ip)->v_type;
freefrag->ff_blkno = blkno;
freefrag->ff_fragsize = size;
+ freefrag->ff_key = key;
if (MOUNTEDSUJ(UFSTOVFS(ump))) {
freefrag->ff_jdep = (struct worklist *)
@@ -5636,7 +5653,8 @@ handle_workitem_freefrag(freefrag)
}
FREE_LOCK(ump);
ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
- freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
+ freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype,
+ &wkhd, freefrag->ff_key);
ACQUIRE_LOCK(ump);
WORKITEM_FREE(freefrag, D_FREEFRAG);
FREE_LOCK(ump);
@@ -5676,7 +5694,22 @@ softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
lbn = bp->b_lblkno;
if (oldblkno && oldblkno != newblkno)
- freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
+ /*
+ * The usual case is that a smaller fragment that
+ * was just allocated has been replaced with a bigger
+ * fragment or a full-size block. If it is marked as
+ * B_DELWRI, the current contents have not been written
+ * to disk. It is possible that the block was written
+ * earlier, but very uncommon. If the block has never
+ * been written, there is no need to send a BIO_DELETE
+ * for it when it is freed. The gain from avoiding the
+ * TRIMs for the common case of unwritten blocks far
+ * exceeds the cost of the write amplification for the
+ * uncommon case of failing to send a TRIM for a block
+ * that had been written.
+ */
+ freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
+ (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
else
freefrag = NULL;
@@ -5789,7 +5822,8 @@ newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
struct jnewblk *jnewblk;
if (oldblkno)
- freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn);
+ freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn,
+ SINGLETON_KEY);
else
freefrag = NULL;
ACQUIRE_LOCK(ITOUMP(ip));
@@ -7724,8 +7758,9 @@ free_inodedep(inodedep)
* in memory immediately.
*/
static void
-freework_freeblock(freework)
+freework_freeblock(freework, key)
struct freework *freework;
+ u_long key;
{
struct freeblks *freeblks;
struct jnewblk *jnewblk;
@@ -7779,10 +7814,10 @@ freework_freeblock(freework)
FREE_LOCK(ump);
freeblks_free(ump, freeblks, btodb(bsize));
CTR4(KTR_SUJ,
- "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
+ "freework_freeblock: ino %jd blkno %jd lbn %jd size %d",
freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
- freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
+ freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key);
ACQUIRE_LOCK(ump);
/*
* The jnewblk will be discarded and the bits in the map never
@@ -7835,7 +7870,7 @@ handle_workitem_indirblk(freework)
return;
}
if (freework->fw_off == NINDIR(fs)) {
- freework_freeblock(freework);
+ freework_freeblock(freework, SINGLETON_KEY);
return;
}
freework->fw_state |= INPROGRESS;
@@ -7894,10 +7929,12 @@ handle_workitem_freeblocks(freeblks, flags)
struct allocindir *aip;
struct ufsmount *ump;
struct worklist *wk;
+ u_long key;
KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
("handle_workitem_freeblocks: Journal entries not written."));
ump = VFSTOUFS(freeblks->fb_list.wk_mp);
+ key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
ACQUIRE_LOCK(ump);
while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
WORKLIST_REMOVE(wk);
@@ -7935,7 +7972,7 @@ handle_workitem_freeblocks(freeblks, flags)
if (freework->fw_lbn <= -UFS_NDADDR)
handle_workitem_indirblk(freework);
else
- freework_freeblock(freework);
+ freework_freeblock(freework, key);
continue;
default:
panic("handle_workitem_freeblocks: Unknown type %s",
@@ -7948,6 +7985,7 @@ handle_workitem_freeblocks(freeblks, flags)
freeblks = NULL;
}
FREE_LOCK(ump);
+ ffs_blkrelease_finish(ump, key);
if (freeblks)
return handle_complete_freeblocks(freeblks, flags);
return (0);
@@ -8080,13 +8118,9 @@ indir_trunc(freework, dbn, lbn)
ufs1_daddr_t *bap1;
ufs2_daddr_t nb, nnb, *bap2;
ufs_lbn_t lbnadd, nlbn;
- int i, nblocks, ufs1fmt;
- int freedblocks;
- int goingaway;
- int freedeps;
- int needj;
- int level;
- int cnt;
+ u_long key;
+ int nblocks, ufs1fmt, freedblocks;
+ int goingaway, freedeps, needj, level, cnt, i;
freeblks = freework->fw_freeblks;
ump = VFSTOUFS(freeblks->fb_list.wk_mp);
@@ -8180,6 +8214,7 @@ indir_trunc(freework, dbn, lbn)
* arranges for the current level to be freed when subordinates
* are free when journaling.
*/
+ key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
if (i != NINDIR(fs) - 1) {
if (ufs1fmt)
@@ -8215,13 +8250,14 @@ indir_trunc(freework, dbn, lbn)
freedeps++;
}
CTR3(KTR_SUJ,
- "indir_trunc: ino %d blkno %jd size %ld",
+ "indir_trunc: ino %jd blkno %jd size %d",
freeblks->fb_inum, nb, fs->fs_bsize);
ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
fs->fs_bsize, freeblks->fb_inum,
- freeblks->fb_vtype, &wkhd);
+ freeblks->fb_vtype, &wkhd, key);
}
}
+ ffs_blkrelease_finish(ump, key);
if (goingaway) {
bp->b_flags |= B_INVAL | B_NOCACHE;
brelse(bp);
@@ -8244,7 +8280,7 @@ indir_trunc(freework, dbn, lbn)
if (level == 0)
freeblks->fb_cgwait += freedeps;
if (freework->fw_ref == 0)
- freework_freeblock(freework);
+ freework_freeblock(freework, SINGLETON_KEY);
FREE_LOCK(ump);
return;
}
@@ -8253,10 +8289,10 @@ indir_trunc(freework, dbn, lbn)
*/
dbn = dbtofsb(fs, dbn);
CTR3(KTR_SUJ,
- "indir_trunc 2: ino %d blkno %jd size %ld",
+ "indir_trunc 2: ino %jd blkno %jd size %d",
freeblks->fb_inum, dbn, fs->fs_bsize);
ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
- freeblks->fb_inum, freeblks->fb_vtype, NULL);
+ freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON_KEY);
/* Non SUJ softdep does single-threaded truncations. */
if (freework->fw_blkno == dbn) {
freework->fw_state |= ALLCOMPLETE;
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index 9ed5c58f7b0e..e3927327c79c 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -978,6 +978,8 @@ ffs_mountfs(devvp, mp, td)
taskqueue_thread_enqueue, &ump->um_trim_tq);
taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS,
"%s trim", mp->mnt_stat.f_mntonname);
+ ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM,
+ &ump->um_trimlisthashsize);
}
}
@@ -1256,6 +1258,7 @@ ffs_unmount(mp, mntflags)
pause("ufsutr", hz);
taskqueue_drain_all(ump->um_trim_tq);
taskqueue_free(ump->um_trim_tq);
+ free (ump->um_trimhash, M_TRIM);
}
g_topology_lock();
if (ump->um_fsckpid > 0) {
diff --git a/sys/ufs/ffs/softdep.h b/sys/ufs/ffs/softdep.h
index 707429fe68c6..1e2946ab9fb9 100644
--- a/sys/ufs/ffs/softdep.h
+++ b/sys/ufs/ffs/softdep.h
@@ -557,6 +557,7 @@ struct freefrag {
long ff_fragsize; /* size of fragment being deleted */
ino_t ff_inum; /* owning inode number */
enum vtype ff_vtype; /* owning inode's file type */
+ int ff_key; /* trim key when deleted */
};
/*
diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h
index 1958b02f6abb..0652e6a72930 100644
--- a/sys/ufs/ufs/ufsmount.h
+++ b/sys/ufs/ufs/ufsmount.h
@@ -47,6 +47,7 @@ struct ufs_args {
#ifdef MALLOC_DECLARE
MALLOC_DECLARE(M_UFSMNT);
+MALLOC_DECLARE(M_TRIM);
#endif
struct buf;
@@ -63,6 +64,7 @@ struct inodedep;
TAILQ_HEAD(inodedeplst, inodedep);
LIST_HEAD(bmsafemaphd, bmsafemap);
+LIST_HEAD(trimlist_hashhead, ffs_blkfree_trim_params);
/*
* This structure describes the UFS specific mount structure data.
@@ -70,7 +72,6 @@ LIST_HEAD(bmsafemaphd, bmsafemap);
* UFS (UFS1, UFS2, etc).
*
* Lock reference:
- * a - atomic operations
* c - set at allocation then constant until freed
* i - ufsmount interlock (UFS_LOCK / UFS_UNLOCK)
* q - associated quota file is locked
@@ -99,8 +100,13 @@ struct ufsmount {
char um_qflags[MAXQUOTAS]; /* (i) quota specific flags */
int64_t um_savedmaxfilesize; /* (c) track maxfilesize */
u_int um_flags; /* (i) filesystem flags */
- u_int um_trim_inflight; /* (a) outstanding trim count */
+ u_int um_trim_inflight; /* (i) outstanding trim count */
+ u_int um_trim_inflight_blks; /* (i) outstanding trim blks */
+ u_long um_trim_total; /* (i) total trim count */
+ u_long um_trim_total_blks; /* (i) total trim block count */
struct taskqueue *um_trim_tq; /* (c) trim request queue */
+ struct trimlist_hashhead *um_trimhash; /* (i) trimlist hash table */
+ u_long um_trimlisthashsize; /* (i) trim hash table size-1 */
/* (c) - below function ptrs */
int (*um_balloc)(struct vnode *, off_t, int, struct ucred *,
int, struct buf **);