aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
authorKonstantin Belousov <kib@FreeBSD.org>2007-01-23 10:01:19 +0000
committerKonstantin Belousov <kib@FreeBSD.org>2007-01-23 10:01:19 +0000
commit2cc7d26f7f9d897fb38b9cf621a73b89a3dd8365 (patch)
treeb0e8fba9d4f75f20a608a5e6e9db64ce7633b0f4 /sys
parent1f294570c0b513e3c7f73e495d5105ede3a74dda (diff)
downloadsrc-2cc7d26f7f9d897fb38b9cf621a73b89a3dd8365.tar.gz
src-2cc7d26f7f9d897fb38b9cf621a73b89a3dd8365.zip
Cylinder group bitmaps and blocks containing inode for a snapshot
file are after snaplock, while other ffs device buffers are before snaplock in global lock order. By itself, this could cause deadlock when bdwrite() tries to flush dirty buffers on snapshotted ffs. If, during the flush, COW activity for snapshot needs to allocate block and ffs_alloccg() selects the cylinder group that is being written by bdwrite(), then kernel would panic due to recursive buffer lock acquision. Avoid dealing with buffers in bdwrite() that are from other side of snaplock divisor in the lock order then the buffer being written. Add new BOP, bop_bdwrite(), to do dirty buffer flushing for same vnode in the bdwrite(). Default implementation, bufbdflush(), refactors the code from bdwrite(). For ffs device buffers, specialized implementation is used. Reviewed by: tegge, jeff, Russell Cattelan (cattelan xfs org, xfs changes) Tested by: Peter Holm X-MFC after: 3 weeks (if ever: it changes ABI)
Notes
Notes: svn path=/head/; revision=166193
Diffstat (limited to 'sys')
-rw-r--r--sys/geom/geom_vfs.c1
-rw-r--r--sys/gnu/fs/xfs/FreeBSD/xfs_mountops.c7
-rw-r--r--sys/kern/vfs_bio.c93
-rw-r--r--sys/nfs4client/nfs4_vnops.c1
-rw-r--r--sys/nfsclient/nfs_vnops.c1
-rw-r--r--sys/sys/buf.h4
-rw-r--r--sys/sys/bufobj.h4
-rw-r--r--sys/ufs/ffs/ffs_extern.h1
-rw-r--r--sys/ufs/ffs/ffs_snapshot.c114
-rw-r--r--sys/ufs/ffs/ffs_vfsops.c5
10 files changed, 189 insertions, 42 deletions
diff --git a/sys/geom/geom_vfs.c b/sys/geom/geom_vfs.c
index bff516b56124..4a33d2275a57 100644
--- a/sys/geom/geom_vfs.c
+++ b/sys/geom/geom_vfs.c
@@ -50,6 +50,7 @@ static struct buf_ops __g_vfs_bufops = {
.bop_write = bufwrite,
.bop_strategy = g_vfs_strategy,
.bop_sync = bufsync,
+ .bop_bdflush = bufbdflush
};
struct buf_ops *g_vfs_bufops = &__g_vfs_bufops;
diff --git a/sys/gnu/fs/xfs/FreeBSD/xfs_mountops.c b/sys/gnu/fs/xfs/FreeBSD/xfs_mountops.c
index cf4ae89c6328..da3650b3993b 100644
--- a/sys/gnu/fs/xfs/FreeBSD/xfs_mountops.c
+++ b/sys/gnu/fs/xfs/FreeBSD/xfs_mountops.c
@@ -497,9 +497,16 @@ xfs_geom_bufsync(struct bufobj *bo, int waitfor, struct thread *td)
return bufsync(bo,waitfor,td);
}
+static void
+xfs_geom_bufbdflush(struct bufobj *bo, struct buf *bp)
+{
+ bufbdflush(bo, bp);
+}
+
struct buf_ops xfs_bo_ops = {
.bop_name = "XFS",
.bop_write = xfs_geom_bufwrite,
.bop_strategy = xfs_geom_strategy,
.bop_sync = xfs_geom_bufsync,
+ .bop_bdflush = xfs_geom_bufbdflush,
};
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 6b3447ad7539..cb7d16f6f5b6 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -80,6 +80,7 @@ struct buf_ops buf_ops_bio = {
.bop_write = bufwrite,
.bop_strategy = bufstrategy,
.bop_sync = bufsync,
+ .bop_bdflush = bufbdflush,
};
/*
@@ -146,10 +147,13 @@ SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
static int hirunningspace;
SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
"Maximum amount of space to use for in-progress I/O");
-static int dirtybufferflushes;
+int dirtybufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
-static int altbufferflushes;
+int bdwriteskip;
+SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
+ 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
+int altbufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
0, "Number of fsync flushes to limit dirty buffers");
static int recursiveflushes;
@@ -164,7 +168,7 @@ SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
static int hidirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
"When the number of dirty buffers is considered severe");
-static int dirtybufthresh;
+int dirtybufthresh;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
static int numfreebuffers;
@@ -886,6 +890,47 @@ bufwrite(struct buf *bp)
return (0);
}
+void
+bufbdflush(struct bufobj *bo, struct buf *bp)
+{
+ struct buf *nbp;
+
+ if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
+ (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
+ altbufferflushes++;
+ } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
+ BO_LOCK(bo);
+ /*
+ * Try to find a buffer to flush.
+ */
+ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
+ if ((nbp->b_vflags & BV_BKGRDINPROG) ||
+ BUF_LOCK(nbp,
+ LK_EXCLUSIVE | LK_NOWAIT, NULL))
+ continue;
+ if (bp == nbp)
+ panic("bdwrite: found ourselves");
+ BO_UNLOCK(bo);
+ /* Don't countdeps with the bo lock held. */
+ if (buf_countdeps(nbp, 0)) {
+ BO_LOCK(bo);
+ BUF_UNLOCK(nbp);
+ continue;
+ }
+ if (nbp->b_flags & B_CLUSTEROK) {
+ vfs_bio_awrite(nbp);
+ } else {
+ bremfree(nbp);
+ bawrite(nbp);
+ }
+ dirtybufferflushes++;
+ break;
+ }
+ if (nbp == NULL)
+ BO_UNLOCK(bo);
+ }
+}
+
/*
* Delayed write. (Buffer is marked dirty). Do not bother writing
* anything if the buffer is marked invalid.
@@ -900,7 +945,6 @@ bdwrite(struct buf *bp)
{
struct thread *td = curthread;
struct vnode *vp;
- struct buf *nbp;
struct bufobj *bo;
CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
@@ -921,44 +965,9 @@ bdwrite(struct buf *bp)
*/
vp = bp->b_vp;
bo = bp->b_bufobj;
- if ((td->td_pflags & TDP_COWINPROGRESS) == 0) {
- BO_LOCK(bo);
- if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
- BO_UNLOCK(bo);
- (void) VOP_FSYNC(vp, MNT_NOWAIT, td);
- altbufferflushes++;
- } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
- /*
- * Try to find a buffer to flush.
- */
- TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
- if ((nbp->b_vflags & BV_BKGRDINPROG) ||
- BUF_LOCK(nbp,
- LK_EXCLUSIVE | LK_NOWAIT, NULL))
- continue;
- if (bp == nbp)
- panic("bdwrite: found ourselves");
- BO_UNLOCK(bo);
- /* Don't countdeps with the bo lock held. */
- if (buf_countdeps(nbp, 0)) {
- BO_LOCK(bo);
- BUF_UNLOCK(nbp);
- continue;
- }
- if (nbp->b_flags & B_CLUSTEROK) {
- vfs_bio_awrite(nbp);
- } else {
- bremfree(nbp);
- bawrite(nbp);
- }
- dirtybufferflushes++;
- break;
- }
- if (nbp == NULL)
- BO_UNLOCK(bo);
- } else
- BO_UNLOCK(bo);
- } else
+ if ((td->td_pflags & TDP_COWINPROGRESS) == 0)
+ BO_BDFLUSH(bo, bp);
+ else
recursiveflushes++;
bdirty(bp);
diff --git a/sys/nfs4client/nfs4_vnops.c b/sys/nfs4client/nfs4_vnops.c
index 5867471234bf..419fc8896e46 100644
--- a/sys/nfs4client/nfs4_vnops.c
+++ b/sys/nfs4client/nfs4_vnops.c
@@ -2874,4 +2874,5 @@ struct buf_ops buf_ops_nfs4 = {
.bop_write = nfs4_bwrite,
.bop_strategy = bufstrategy,
.bop_sync = bufsync,
+ .bop_bdflush = bufbdflush,
};
diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c
index 4308244627d9..c53c913564db 100644
--- a/sys/nfsclient/nfs_vnops.c
+++ b/sys/nfsclient/nfs_vnops.c
@@ -3275,4 +3275,5 @@ struct buf_ops buf_ops_nfs = {
.bop_write = nfs_bwrite,
.bop_strategy = bufstrategy,
.bop_sync = bufsync,
+ .bop_bdflush = bufbdflush,
};
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 359be84af143..a561aaa509cc 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -483,6 +483,10 @@ extern int maxswzone; /* Max KVA for swap structures */
extern int maxbcache; /* Max KVA for buffer cache */
extern int runningbufspace;
extern int hibufspace;
+extern int dirtybufthresh;
+extern int bdwriteskip;
+extern int dirtybufferflushes;
+extern int altbufferflushes;
extern int buf_maxio; /* nominal maximum I/O for buffer */
extern struct buf *buf; /* The buffer headers. */
extern char *buffers; /* The buffer contents. */
diff --git a/sys/sys/bufobj.h b/sys/sys/bufobj.h
index ea568e47cdcc..66507020ea89 100644
--- a/sys/sys/bufobj.h
+++ b/sys/sys/bufobj.h
@@ -70,17 +70,20 @@ struct bufv {
typedef void b_strategy_t(struct bufobj *, struct buf *);
typedef int b_write_t(struct buf *);
typedef int b_sync_t(struct bufobj *, int waitfor, struct thread *td);
+typedef void b_bdflush_t(struct bufobj *, struct buf *);
struct buf_ops {
char *bop_name;
b_write_t *bop_write;
b_strategy_t *bop_strategy;
b_sync_t *bop_sync;
+ b_bdflush_t *bop_bdflush;
};
#define BO_STRATEGY(bo, bp) ((bo)->bo_ops->bop_strategy((bo), (bp)))
#define BO_SYNC(bo, w, td) ((bo)->bo_ops->bop_sync((bo), (w), (td)))
#define BO_WRITE(bo, bp) ((bo)->bo_ops->bop_write((bp)))
+#define BO_BDFLUSH(bo, bp) ((bo)->bo_ops->bop_bdflush((bo), (bp)))
struct bufobj {
struct mtx *bo_mtx; /* Mutex which protects "i" things */
@@ -130,6 +133,7 @@ void bufobj_wrefl(struct bufobj *bo);
int bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo);
int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo);
int bufsync(struct bufobj *bo, int waitfor, struct thread *td);
+void bufbdflush(struct bufobj *bo, struct buf *bp);
#endif /* defined(_KERNEL) || defined(_KVM_VNODE) */
#endif /* _SYS_BUFOBJ_H_ */
diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h
index 98f0f919e6c7..67986c12a803 100644
--- a/sys/ufs/ffs/ffs_extern.h
+++ b/sys/ufs/ffs/ffs_extern.h
@@ -61,6 +61,7 @@ ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *);
ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
int ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
+void ffs_bdflush(struct bufobj *, struct buf *);
int ffs_copyonwrite(struct vnode *, struct buf *);
int ffs_flushfiles(struct mount *, int, struct thread *);
void ffs_fragacct(struct fs *, int, int32_t [], int);
diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c
index 14b231cda6eb..d94bd6d7d3ef 100644
--- a/sys/ufs/ffs/ffs_snapshot.c
+++ b/sys/ufs/ffs/ffs_snapshot.c
@@ -163,6 +163,7 @@ static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
static void process_deferred_inactive(struct mount *);
static void try_free_snapdata(struct vnode *devvp, struct thread *td);
+static int ffs_bp_snapblk(struct vnode *, struct buf *);
/*
* To ensure the consistency of snapshots across crashes, we must
@@ -2065,6 +2066,119 @@ ffs_snapshot_unmount(mp)
}
/*
+ * Check the buffer block to be belong to device buffer that shall be
+ * locked after snaplk. devvp shall be locked on entry, and will be
+ * leaved locked upon exit.
+ */
+static int
+ffs_bp_snapblk(devvp, bp)
+ struct vnode *devvp;
+ struct buf *bp;
+{
+ struct snapdata *sn;
+ struct fs *fs;
+ ufs2_daddr_t lbn, *snapblklist;
+ int lower, upper, mid;
+
+ ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
+ KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));
+ sn = devvp->v_rdev->si_snapdata;
+ if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL)
+ return (0);
+ fs = TAILQ_FIRST(&sn->sn_head)->i_fs;
+ lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
+ snapblklist = sn->sn_blklist;
+ upper = sn->sn_listsize - 1;
+ lower = 1;
+ while (lower <= upper) {
+ mid = (lower + upper) / 2;
+ if (snapblklist[mid] == lbn)
+ break;
+ if (snapblklist[mid] < lbn)
+ lower = mid + 1;
+ else
+ upper = mid - 1;
+ }
+ if (lower <= upper)
+ return (1);
+ return (0);
+}
+
+void
+ffs_bdflush(bo, bp)
+ struct bufobj *bo;
+ struct buf *bp;
+{
+ struct thread *td;
+ struct vnode *vp, *devvp;
+ struct buf *nbp;
+ int bp_bdskip;
+
+ if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
+ return;
+
+ td = curthread;
+ vp = bp->b_vp;
+ devvp = bo->__bo_vnode;
+ KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));
+
+ VI_LOCK(devvp);
+ bp_bdskip = ffs_bp_snapblk(devvp, bp);
+ if (bp_bdskip)
+ bdwriteskip++;
+ VI_UNLOCK(devvp);
+ if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
+ (void) VOP_FSYNC(vp, MNT_NOWAIT, td);
+ altbufferflushes++;
+ } else {
+ BO_LOCK(bo);
+ /*
+ * Try to find a buffer to flush.
+ */
+ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
+ if ((nbp->b_vflags & BV_BKGRDINPROG) ||
+ BUF_LOCK(nbp,
+ LK_EXCLUSIVE | LK_NOWAIT, NULL))
+ continue;
+ if (bp == nbp)
+ panic("bdwrite: found ourselves");
+ BO_UNLOCK(bo);
+ /*
+ * Don't countdeps with the bo lock
+ * held.
+ */
+ if (buf_countdeps(nbp, 0)) {
+ BO_LOCK(bo);
+ BUF_UNLOCK(nbp);
+ continue;
+ }
+ if (bp_bdskip) {
+ VI_LOCK(devvp);
+ if (!ffs_bp_snapblk(vp, nbp)) {
+ if (BO_MTX(bo) != VI_MTX(vp)) {
+ VI_UNLOCK(devvp);
+ BO_LOCK(bo);
+ }
+ BUF_UNLOCK(nbp);
+ continue;
+ }
+ VI_UNLOCK(devvp);
+ }
+ if (nbp->b_flags & B_CLUSTEROK) {
+ vfs_bio_awrite(nbp);
+ } else {
+ bremfree(nbp);
+ bawrite(nbp);
+ }
+ dirtybufferflushes++;
+ break;
+ }
+ if (nbp == NULL)
+ BO_UNLOCK(bo);
+ }
+}
+
+/*
* Check for need to copy block that is about to be written,
* copying the block if necessary.
*/
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index a2591bc88d41..225dd8a61ba2 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -117,6 +117,11 @@ static struct buf_ops ffs_ops = {
.bop_write = ffs_bufwrite,
.bop_strategy = ffs_geom_strategy,
.bop_sync = bufsync,
+#ifdef NO_FFS_SNAPSHOT
+ .bop_bdflush = bufbdflush,
+#else
+ .bop_bdflush = ffs_bdflush,
+#endif
};
static const char *ffs_opts[] = { "acls", "async", "atime", "clusterr",