src - FreeBSD source tree

diff options


context:
space:
mode:

author	Jeff Roberson <jeff@FreeBSD.org>	2007-12-30 01:42:15 +0000
committer	Jeff Roberson <jeff@FreeBSD.org>	2007-12-30 01:42:15 +0000
commit	397c19d1753d210247d77eb3ca33d1c7c1eb2fa9 (patch)
tree	0f2354bfc200294c2629e6ecfba76e364beda579 /sys/kern
parent	2a79fd39b4cf409d2c0bd7a449d7f3e91d7b9007 (diff)
download	src-397c19d1753d210247d77eb3ca33d1c7c1eb2fa9.tar.gz src-397c19d1753d210247d77eb3ca33d1c7c1eb2fa9.zip

Remove explicit locking of struct file.

- Introduce a finit() which is used to initialize the fields of struct file in such a way that the ops vector is only valid after the data, type, and flags are valid.
- Protect f_flag and f_count with atomic operations.
- Remove the global list of all files and associated accounting.
- Rewrite the unp garbage collection such that it no longer requires the global list of all files and instead uses a list of all unp sockets.
- Mark sockets in the accept queue so we don't incorrectly gc them.

Tested by: kris, pho

Notes

Notes: svn path=/head/; revision=174988

Diffstat (limited to 'sys/kern')

-rw-r--r--

sys/kern/kern_descrip.c

172

-rw-r--r--

sys/kern/kern_event.c

-rw-r--r--

sys/kern/sys_generic.c

-rw-r--r--

sys/kern/sys_pipe.c

-rw-r--r--

sys/kern/uipc_mqueue.c

-rw-r--r--

sys/kern/uipc_syscalls.c

-rw-r--r--

sys/kern/uipc_usrreq.c

412

-rw-r--r--

sys/kern/vfs_syscalls.c

-rw-r--r--

sys/kern/vfs_vnops.c

9 files changed, 299 insertions, 449 deletions

diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 274522f14c21..070fac7516df 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c

@@ -95,7 +95,6 @@ static int do_dup(struct thread *td, enum dup_type type, int old, int new,

static int fd_first_free(struct filedesc *, int, int);

static int fd_last_used(struct filedesc *, int, int);

static void fdgrowtable(struct filedesc *, int);

-static int fdrop_locked(struct file *fp, struct thread *td);

static void fdunused(struct filedesc *fdp, int fd);

static void fdused(struct filedesc *fdp, int fd);

@@ -137,9 +136,7 @@ struct filedesc0 {

* Descriptor management.

-struct filelist filehead; /* head of list of open files */

-int openfiles; /* actual number of open files */

-struct sx filelist_lock; /* sx to protect filelist */

+volatile int openfiles; /* actual number of open files */

struct mtx sigio_lock; /* mtx to protect pointers to sigio */

void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);

@@ -428,9 +425,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)

error = EBADF;

break;

}

- FILE_LOCK(fp);

td->td_retval[0] = OFLAGS(fp->f_flag);

- FILE_UNLOCK(fp);

FILEDESC_SUNLOCK(fdp);

break;

@@ -441,12 +436,13 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)

error = EBADF;

break;

}

- FILE_LOCK(fp);

- fhold_locked(fp);

- fp->f_flag &= ~FCNTLFLAGS;

- fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;

- FILE_UNLOCK(fp);

+ fhold(fp);

FILEDESC_SUNLOCK(fdp);

+ do {

+ tmp = flg = fp->f_flag;

+ tmp &= ~FCNTLFLAGS;

+ tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;

+ } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);

tmp = fp->f_flag & FNONBLOCK;

error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);

if (error) {

@@ -459,9 +455,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)

fdrop(fp, td);

break;

}

- FILE_LOCK(fp);

- fp->f_flag &= ~FNONBLOCK;

- FILE_UNLOCK(fp);

+ atomic_clear_int(&fp->f_flag, FNONBLOCK);

tmp = 0;

(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);

fdrop(fp, td);

@@ -1359,15 +1353,13 @@ int

falloc(struct thread *td, struct file **resultfp, int *resultfd)

{

struct proc *p = td->td_proc;

- struct file *fp, *fq;

+ struct file *fp;

int error, i;

int maxuserfiles = maxfiles - (maxfiles / 20);

static struct timeval lastfail;

static int curfail;

fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);

- sx_xlock(&filelist_lock);

if ((openfiles >= maxuserfiles &&

priv_check(td, PRIV_MAXFILES) != 0) ||

openfiles >= maxfiles) {

@@ -1375,18 +1367,16 @@ falloc(struct thread *td, struct file **resultfp, int *resultfd)

printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n",

td->td_ucred->cr_ruid);

}

- sx_xunlock(&filelist_lock);

uma_zfree(file_zone, fp);

return (ENFILE);

}

- openfiles++;

+ atomic_add_int(&openfiles, 1);

* If the process has file descriptor zero open, add the new file

* descriptor to the list of open files at that point, otherwise

* put it at the front of the list of open files.

- fp->f_mtxp = mtx_pool_alloc(mtxpool_sleep);

fp->f_count = 1;

if (resultfp)

fp->f_count++;

@@ -1395,12 +1385,6 @@ falloc(struct thread *td, struct file **resultfp, int *resultfd)

fp->f_data = NULL;

fp->f_vnode = NULL;

FILEDESC_XLOCK(p->p_fd);

- if ((fq = p->p_fd->fd_ofiles[0])) {

- LIST_INSERT_AFTER(fq, fp, f_list);

- } else {

- LIST_INSERT_HEAD(&filehead, fp, f_list);

- }

- sx_xunlock(&filelist_lock);

if ((error = fdalloc(td, 0, &i))) {

FILEDESC_XUNLOCK(p->p_fd);

fdrop(fp, td);

@@ -1962,6 +1946,23 @@ closef(struct file *fp, struct thread *td)

}

+ * Initialize the file pointer with the specified properties.

+ *

+ * The ops are set with release semantics to be certain that the flags, type,

+ * and data are visible when ops is. This is to prevent ops methods from being

+ * called with bad data.

+ */

+void

+finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)

+ fp->f_data = data;

+ fp->f_flag = flag;

+ fp->f_type = type;

+ atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);

+/*

* Extract the file pointer associated with the specified descriptor for the

* current user process.

@@ -2135,54 +2136,20 @@ fputsock(struct socket *so)

sorele(so);

}

-int

-fdrop(struct file *fp, struct thread *td)

- FILE_LOCK(fp);

- return (fdrop_locked(fp, td));

- * Drop reference on struct file passed in, may call closef if the

- * reference hits zero.

- * Expects struct file locked, and will unlock it.

+ * Handle the last reference to a file being closed.

-static int

-fdrop_locked(struct file *fp, struct thread *td)

+int

+_fdrop(struct file *fp, struct thread *td)

{

int error;

- FILE_LOCK_ASSERT(fp, MA_OWNED);

- if (--fp->f_count > 0) {

- FILE_UNLOCK(fp);

- return (0);

- }

- /*

- * We might have just dropped the last reference to a file

- * object that is for a UNIX domain socket whose message

- * buffers are being examined in unp_gc(). If that is the

- * case, FWAIT will be set in f_gcflag and we need to wait for

- * unp_gc() to finish its scan.

- */

- while (fp->f_gcflag & FWAIT)

- msleep(&fp->f_gcflag, fp->f_mtxp, 0, "fpdrop", 0);

- /* We have the last ref so we can proceed without the file lock. */

- FILE_UNLOCK(fp);

- if (fp->f_count < 0)

- panic("fdrop: count < 0");

+ error = 0;

+ if (fp->f_count != 0)

+ panic("fdrop: count %d", fp->f_count);

if (fp->f_ops != &badfileops)

error = fo_close(fp, td);

- else

- error = 0;

- sx_xlock(&filelist_lock);

- LIST_REMOVE(fp, f_list);

- openfiles--;

- sx_xunlock(&filelist_lock);

+ atomic_subtract_int(&openfiles, 1);

crfree(fp->f_cred);

uma_zfree(file_zone, fp);

@@ -2225,9 +2192,7 @@ flock(struct thread *td, struct flock_args *uap)

lf.l_len = 0;

if (uap->how & LOCK_UN) {

lf.l_type = F_UNLCK;

- FILE_LOCK(fp);

- fp->f_flag &= ~FHASLOCK;

- FILE_UNLOCK(fp);

+ atomic_clear_int(&fp->f_flag, FHASLOCK);

error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);

goto done2;

}

@@ -2239,9 +2204,7 @@ flock(struct thread *td, struct flock_args *uap)

error = EBADF;

goto done2;

}

- FILE_LOCK(fp);

- fp->f_flag |= FHASLOCK;

- FILE_UNLOCK(fp);

+ atomic_set_int(&fp->f_flag, FHASLOCK);

error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,

(uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);

done2:

@@ -2286,9 +2249,7 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode,

* Check that the mode the file is being opened for is a

* subset of the mode of the existing descriptor.

- FILE_LOCK(wfp);

if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {

- FILE_UNLOCK(wfp);

FILEDESC_XUNLOCK(fdp);

return (EACCES);

}

@@ -2297,8 +2258,7 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode,

fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];

if (fp == NULL)

fdused(fdp, indx);

- fhold_locked(wfp);

- FILE_UNLOCK(wfp);

+ fhold(wfp);

FILEDESC_XUNLOCK(fdp);

if (fp != NULL)

@@ -2419,29 +2379,23 @@ sysctl_kern_file(SYSCTL_HANDLER_ARGS)

struct proc *p;

int error, n;

- /*

- * Note: because the number of file descriptors is calculated

- * in different ways for sizing vs returning the data,

- * there is information leakage from the first loop. However,

- * it is of a similar order of magnitude to the leakage from

- * global system statistics such as kern.openfiles.

- */

error = sysctl_wire_old_buffer(req, 0);

if (error != 0)

return (error);

if (req->oldptr == NULL) {

- n = 16; /* A slight overestimate. */

- sx_slock(&filelist_lock);

- LIST_FOREACH(fp, &filehead, f_list) {

- /*

- * We should grab the lock, but this is an

- * estimate, so does it really matter?

- */

- /* mtx_lock(fp->f_mtxp); */

- n += fp->f_count;

- /* mtx_unlock(f->f_mtxp); */

+ n = 0;

+ sx_slock(&allproc_lock);

+ FOREACH_PROC_IN_SYSTEM(p) {

+ if (p->p_state == PRS_NEW)

+ continue;

+ fdp = fdhold(p);

+ if (fdp == NULL)

+ continue;

+ /* overestimates sparse tables. */

+ n += fdp->fd_lastfile;

+ fddrop(fdp);

}

- sx_sunlock(&filelist_lock);

+ sx_sunlock(&allproc_lock);

return (SYSCTL_OUT(req, 0, n * sizeof(xf)));

}

error = 0;

@@ -2472,7 +2426,7 @@ sysctl_kern_file(SYSCTL_HANDLER_ARGS)

xf.xf_vnode = fp->f_vnode;

xf.xf_type = fp->f_type;

xf.xf_count = fp->f_count;

- xf.xf_msgcount = fp->f_msgcount;

+ xf.xf_msgcount = 0;

xf.xf_offset = fp->f_offset;

xf.xf_flag = fp->f_flag;

error = SYSCTL_OUT(req, &xf, sizeof(xf));

@@ -2523,7 +2477,6 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)

continue;

bzero(kif, sizeof(*kif));

kif->kf_structsize = sizeof(*kif);

- FILE_LOCK(fp);

vp = NULL;

so = NULL;

kif->kf_fd = i;

@@ -2531,7 +2484,6 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)

case DTYPE_VNODE:

kif->kf_type = KF_TYPE_VNODE;

vp = fp->f_vnode;

- vref(vp);

break;

case DTYPE_SOCKET:

@@ -2583,8 +2535,8 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)

if (fp->f_flag & FHASLOCK)

kif->kf_flags |= KF_FLAG_HASLOCK;

kif->kf_offset = fp->f_offset;

- FILE_UNLOCK(fp);

if (vp != NULL) {

+ vref(vp);

switch (vp->v_type) {

case VNON:

kif->kf_vnode_type = KF_VTYPE_VNON;

@@ -2736,7 +2688,7 @@ db_print_file(struct file *fp, int header)

p = file_to_first_proc(fp);

db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,

file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,

- fp->f_gcflag, fp->f_count, fp->f_msgcount, fp->f_vnode,

+ 0, fp->f_count, 0, fp->f_vnode,

p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");

}

@@ -2754,13 +2706,24 @@ DB_SHOW_COMMAND(file, db_show_file)

DB_SHOW_COMMAND(files, db_show_files)

{

+ struct filedesc *fdp;

struct file *fp;

+ struct proc *p;

int header;

+ int n;

header = 1;

- LIST_FOREACH(fp, &filehead, f_list) {

- db_print_file(fp, header);

- header = 0;

+ FOREACH_PROC_IN_SYSTEM(p) {

+ if (p->p_state == PRS_NEW)

+ continue;

+ if ((fdp = p->p_fd) == NULL)

+ continue;

+ for (n = 0; n < fdp->fd_nfiles; ++n) {

+ if ((fp = fdp->fd_ofiles[n]) == NULL)

+ continue;

+ db_print_file(fp, header);

+ header = 0;

+ }

}

#endif

@@ -2772,7 +2735,7 @@ SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,

&maxfiles, 0, "Maximum number of files");

SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,

- &openfiles, 0, "System-wide number of open files");

+ __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");

/* ARGSUSED*/

static void

@@ -2781,7 +2744,6 @@ filelistinit(void *dummy)

file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,

NULL, NULL, UMA_ALIGN_PTR, 0);

- sx_init(&filelist_lock, "filelist lock");

mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);

mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);

}

diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index 4d75822dbff2..b5d01d07621c 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c

@@ -531,12 +531,7 @@ kqueue(struct thread *td, struct kqueue_args *uap)

SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);

FILEDESC_XUNLOCK(fdp);

- FILE_LOCK(fp);

- fp->f_flag = FREAD | FWRITE;

- fp->f_type = DTYPE_KQUEUE;

- fp->f_data = kq;

- fp->f_ops = &kqueueops;

- FILE_UNLOCK(fp);

+ finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);

fdrop(fp, td);

td->td_retval[0] = fd;

@@ -990,24 +985,17 @@ kqueue_acquire(struct file *fp, struct kqueue **kqp)

error = 0;

- FILE_LOCK(fp);

- do {

- kq = fp->f_data;

- if (fp->f_type != DTYPE_KQUEUE || kq == NULL) {

- error = EBADF;

- break;

- }

- *kqp = kq;

- KQ_LOCK(kq);

- if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {

- KQ_UNLOCK(kq);

- error = EBADF;

- break;

- }

- kq->kq_refcnt++;

+ kq = fp->f_data;

+ if (fp->f_type != DTYPE_KQUEUE || kq == NULL)

+ return (EBADF);

+ *kqp = kq;

+ KQ_LOCK(kq);

+ if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {

KQ_UNLOCK(kq);

- } while (0);

- FILE_UNLOCK(fp);

+ return (EBADF);

+ }

+ kq->kq_refcnt++;

+ KQ_UNLOCK(kq);

return error;

}

diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index 0800c08073e6..9c800f37d342 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c

@@ -646,21 +646,17 @@ kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)

FILEDESC_XUNLOCK(fdp);

goto out;

case FIONBIO:

- FILE_LOCK(fp);

if ((tmp = *(int *)data))

- fp->f_flag |= FNONBLOCK;

+ atomic_set_int(&fp->f_flag, FNONBLOCK);

else

- fp->f_flag &= ~FNONBLOCK;

- FILE_UNLOCK(fp);

+ atomic_clear_int(&fp->f_flag, FNONBLOCK);

data = (void *)&tmp;

break;

case FIOASYNC:

- FILE_LOCK(fp);

if ((tmp = *(int *)data))

- fp->f_flag |= FASYNC;

+ atomic_set_int(&fp->f_flag, FASYNC);

else

- fp->f_flag &= ~FASYNC;

- FILE_UNLOCK(fp);

+ atomic_clear_int(&fp->f_flag, FASYNC);

data = (void *)&tmp;

break;

}

diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c
index 262ef0c1fec8..27ecf80896da 100644
--- a/sys/kern/sys_pipe.c
+++ b/sys/kern/sys_pipe.c

@@ -363,12 +363,7 @@ pipe(td, uap)

* to avoid races against processes which manage to dup() the read

* side while we are blocked trying to allocate the write side.

- FILE_LOCK(rf);

- rf->f_flag = FREAD | FWRITE;

- rf->f_type = DTYPE_PIPE;

- rf->f_data = rpipe;

- rf->f_ops = &pipeops;

- FILE_UNLOCK(rf);

+ finit(rf, FREAD | FWRITE, DTYPE_PIPE, rpipe, &pipeops);

error = falloc(td, &wf, &fd);

if (error) {

fdclose(fdp, rf, td->td_retval[0], td);

@@ -378,12 +373,7 @@ pipe(td, uap)

return (error);

}

/* An extra reference on `wf' has been held for us by falloc(). */

- FILE_LOCK(wf);

- wf->f_flag = FREAD | FWRITE;

- wf->f_type = DTYPE_PIPE;

- wf->f_data = wpipe;

- wf->f_ops = &pipeops;

- FILE_UNLOCK(wf);

+ finit(wf, FREAD | FWRITE, DTYPE_PIPE, wpipe, &pipeops);

fdrop(wf, td);

td->td_retval[1] = fd;

fdrop(rf, td);

diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c
index 1c5cadbc3311..8fe34bcba0a9 100644
--- a/sys/kern/uipc_mqueue.c
+++ b/sys/kern/uipc_mqueue.c

@@ -1999,12 +1999,8 @@ kmq_open(struct thread *td, struct kmq_open_args *uap)

mqnode_addref(pn);

sx_xunlock(&mqfs_data.mi_lock);

- FILE_LOCK(fp);

- fp->f_flag = (flags & (FREAD | FWRITE | O_NONBLOCK));

- fp->f_type = DTYPE_MQUEUE;

- fp->f_data = pn;

- fp->f_ops = &mqueueops;

- FILE_UNLOCK(fp);

+ finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn,

+ &mqueueops);

FILEDESC_XLOCK(fdp);

if (fdp->fd_ofiles[fd] == fp)

@@ -2097,6 +2093,7 @@ kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)

struct mqueue *mq;

struct file *fp;

struct mq_attr attr, oattr;

+ u_int oflag, flag;

int error;

if (uap->attr) {

@@ -2112,13 +2109,15 @@ kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)

oattr.mq_maxmsg = mq->mq_maxmsg;

oattr.mq_msgsize = mq->mq_msgsize;

oattr.mq_curmsgs = mq->mq_curmsgs;

- FILE_LOCK(fp);

- oattr.mq_flags = (O_NONBLOCK & fp->f_flag);

if (uap->attr) {

- fp->f_flag &= ~O_NONBLOCK;

- fp->f_flag |= (attr.mq_flags & O_NONBLOCK);

- }

- FILE_UNLOCK(fp);

+ do {

+ oflag = flag = fp->f_flag;

+ flag &= ~O_NONBLOCK;

+ flag |= (attr.mq_flags & O_NONBLOCK);

+ } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0);

+ } else

+ oflag = fp->f_flag;

+ oattr.mq_flags = (O_NONBLOCK & oflag);

fdrop(fp, td);

if (uap->oattr)

error = copyout(&oattr, uap->oattr, sizeof(oattr));

diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index faf7f24a66cc..616afa0e5d65 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c

@@ -180,12 +180,7 @@ socket(td, uap)

if (error) {

fdclose(fdp, fp, fd, td);

} else {

- FILE_LOCK(fp);

- fp->f_data = so; /* already has ref count */

- fp->f_flag = FREAD|FWRITE;

- fp->f_type = DTYPE_SOCKET;

- fp->f_ops = &socketops;

- FILE_UNLOCK(fp);

+ finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops);

td->td_retval[0] = fd;

}

fdrop(fp, td);

@@ -423,12 +418,7 @@ kern_accept(struct thread *td, int s, struct sockaddr **name,

if (pgid != 0)

fsetown(pgid, &so->so_sigio);

- FILE_LOCK(nfp);

- nfp->f_data = so; /* nfp has ref count from falloc */

- nfp->f_flag = fflag;

- nfp->f_type = DTYPE_SOCKET;

- nfp->f_ops = &socketops;

- FILE_UNLOCK(nfp);

+ finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);

/* Sync socket nonblocking/async state with file flags */

tmp = fflag & FNONBLOCK;

(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);

@@ -640,16 +630,8 @@ socketpair(td, uap)

if (error)

goto free4;

}

- FILE_LOCK(fp1);

- fp1->f_flag = FREAD|FWRITE;

- fp1->f_type = DTYPE_SOCKET;

- fp1->f_ops = &socketops;

- FILE_UNLOCK(fp1);

- FILE_LOCK(fp2);

- fp2->f_flag = FREAD|FWRITE;

- fp2->f_type = DTYPE_SOCKET;

- fp2->f_ops = &socketops;

- FILE_UNLOCK(fp2);

+ finit(fp1, FREAD | FWRITE, DTYPE_SOCKET, fp1->f_data, &socketops);

+ finit(fp2, FREAD | FWRITE, DTYPE_SOCKET, fp2->f_data, &socketops);

so1 = so2 = NULL;

error = copyout(sv, uap->rsv, 2 * sizeof (int));

if (error)

@@ -2270,12 +2252,7 @@ sctp_peeloff(td, uap)

so->so_qstate &= ~SQ_COMP;

so->so_head = NULL;

ACCEPT_UNLOCK();

- FILE_LOCK(nfp);

- nfp->f_data = so;

- nfp->f_flag = fflag;

- nfp->f_type = DTYPE_SOCKET;

- nfp->f_ops = &socketops;

- FILE_UNLOCK(nfp);

+ finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);

error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);

if (error)

goto noconnection;

diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index 9fea71b4b626..1d6cc464da0f 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c

@@ -233,10 +233,11 @@ static void unp_shutdown(struct unpcb *);

static void unp_drop(struct unpcb *, int);

static void unp_gc(__unused void *, int);

static void unp_scan(struct mbuf *, void (*)(struct file *));

-static void unp_mark(struct file *);

static void unp_discard(struct file *);

static void unp_freerights(struct file **, int);

static int unp_internalize(struct mbuf **, struct thread *);

+static void unp_internalize_fp(struct file *);

+static void unp_externalize_fp(struct file *);

static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *);

@@ -586,9 +587,9 @@ uipc_detach(struct socket *so)

unp_drop(ref, ECONNRESET);

UNP_PCB_UNLOCK(ref);

}

+ local_unp_rights = unp_rights;

UNP_GLOBAL_WUNLOCK();

unp->unp_socket->so_pcb = NULL;

- local_unp_rights = unp_rights;

saved_unp_addr = unp->unp_addr;

unp->unp_addr = NULL;

unp->unp_refcount--;

@@ -1600,10 +1601,7 @@ unp_externalize(struct mbuf *control, struct mbuf **controlp)

panic("unp_externalize fdalloc failed");

fp = *rp++;

td->td_proc->p_fd->fd_ofiles[f] = fp;

- FILE_LOCK(fp);

- fp->f_msgcount--;

- FILE_UNLOCK(fp);

- unp_rights--;

+ unp_externalize_fp(fp);

*fdp++ = f;

}

FILEDESC_XUNLOCK(td->td_proc->p_fd);

@@ -1765,11 +1763,8 @@ unp_internalize(struct mbuf **controlp, struct thread *td)

for (i = 0; i < oldfds; i++) {

fp = fdescp->fd_ofiles[*fdp++];

*rp++ = fp;

- FILE_LOCK(fp);

- fp->f_count++;

- fp->f_msgcount++;

- FILE_UNLOCK(fp);

- unp_rights++;

+ fhold(fp);

+ unp_internalize_fp(fp);

}

FILEDESC_SUNLOCK(fdescp);

break;

@@ -1860,230 +1855,198 @@ unp_addsockcred(struct thread *td, struct mbuf *control)

return (m);

}

+static struct unpcb *

+fptounp(struct file *fp)

+ struct socket *so;

+ if (fp->f_type != DTYPE_SOCKET)

+ return (NULL);

+ if ((so = fp->f_data) == NULL)

+ return (NULL);

+ if (so->so_proto->pr_domain != &localdomain)

+ return (NULL);

+ return sotounpcb(so);

+static void

+unp_discard(struct file *fp)

+ unp_externalize_fp(fp);

+ (void) closef(fp, (struct thread *)NULL);

+static void

+unp_internalize_fp(struct file *fp)

+ struct unpcb *unp;

+ UNP_GLOBAL_WLOCK();

+ if ((unp = fptounp(fp)) != NULL) {

+ unp->unp_file = fp;

+ unp->unp_msgcount++;

+ }

+ unp_rights++;

+ UNP_GLOBAL_WUNLOCK();

+static void

+unp_externalize_fp(struct file *fp)

+ struct unpcb *unp;

+ UNP_GLOBAL_WLOCK();

+ if ((unp = fptounp(fp)) != NULL)

+ unp->unp_msgcount--;

+ unp_rights--;

+ UNP_GLOBAL_WUNLOCK();

* unp_defer indicates whether additional work has been defered for a future

* pass through unp_gc(). It is thread local and does not require explicit

* synchronization.

-static int unp_defer;

+static int unp_marked;

+static int unp_unreachable;

-static int unp_taskcount;

-SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, "");

+static void

+unp_accessable(struct file *fp)

+ struct unpcb *unp;

+ unp = fptounp(fp);

+ if (fp == NULL)

+ return;

+ if (unp->unp_gcflag & UNPGC_REF)

+ return;

+ unp->unp_gcflag &= ~UNPGC_DEAD;

+ unp->unp_gcflag |= UNPGC_REF;

+ unp_marked++;

+static void

+unp_gc_process(struct unpcb *unp)

+ struct socket *soa;

+ struct socket *so;

+ struct file *fp;

+ /* Already processed. */

+ if (unp->unp_gcflag & UNPGC_SCANNED)

+ return;

+ fp = unp->unp_file;

+ /*

+ * Check for a socket potentially in a cycle. It must be in a

+ * queue as indicated by msgcount, and this must equal the file

+ * reference count. Note that when msgcount is 0 the file is NULL.

+ */

+ if (unp->unp_msgcount != 0 && fp->f_count != 0 &&

+ fp->f_count == unp->unp_msgcount) {

+ unp->unp_gcflag |= UNPGC_DEAD;

+ unp_unreachable++;

+ return;

+ }

+ /*

+ * Mark all sockets we reference with RIGHTS.

+ */

+ so = unp->unp_socket;

+ SOCKBUF_LOCK(&so->so_rcv);

+ unp_scan(so->so_rcv.sb_mb, unp_accessable);

+ SOCKBUF_UNLOCK(&so->so_rcv);

+ /*

+ * Mark all sockets in our accept queue.

+ */

+ ACCEPT_LOCK();

+ TAILQ_FOREACH(soa, &so->so_comp, so_list) {

+ SOCKBUF_LOCK(&soa->so_rcv);

+ unp_scan(soa->so_rcv.sb_mb, unp_accessable);

+ SOCKBUF_UNLOCK(&soa->so_rcv);

+ }

+ ACCEPT_UNLOCK();

+ unp->unp_gcflag |= UNPGC_SCANNED;

static int unp_recycled;

SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, "");

+static int unp_taskcount;

+SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, "");

static void

unp_gc(__unused void *arg, int pending)

{

- struct file *fp, *nextfp;

- struct socket *so;

- struct file **extra_ref, **fpp;

- int nunref, i;

- int nfiles_snap;

- int nfiles_slack = 20;

+ struct unp_head *heads[] = { &unp_dhead, &unp_shead, NULL };

+ struct unp_head **head;

+ struct file **unref;

+ struct unpcb *unp;

+ int i;

unp_taskcount++;

- unp_defer = 0;

+ UNP_GLOBAL_RLOCK();

+ /*

+ * First clear all gc flags from previous runs.

+ */

+ for (head = heads; *head != NULL; head++)

+ LIST_FOREACH(unp, *head, unp_link)

+ unp->unp_gcflag &= ~(UNPGC_REF|UNPGC_DEAD);

- * Before going through all this, set all FDs to be NOT deferred and

- * NOT externally accessible.

+ * Scan marking all reachable sockets with UNPGC_REF. Once a socket

+ * is reachable all of the sockets it references are reachable.

+ * Stop the scan once we do a complete loop without discovering

+ * a new reachable socket.

- sx_slock(&filelist_lock);

- LIST_FOREACH(fp, &filehead, f_list)

- fp->f_gcflag &= ~(FMARK|FDEFER);

do {

- KASSERT(unp_defer >= 0, ("unp_gc: unp_defer %d", unp_defer));

- LIST_FOREACH(fp, &filehead, f_list) {

- FILE_LOCK(fp);

- /*

- * If the file is not open, skip it -- could be a

- * file in the process of being opened, or in the

- * process of being closed. If the file is

- * "closing", it may have been marked for deferred

- * consideration. Clear the flag now if so.

- */

- if (fp->f_count == 0) {

- if (fp->f_gcflag & FDEFER)

- unp_defer--;

- fp->f_gcflag &= ~(FMARK|FDEFER);

- FILE_UNLOCK(fp);

- continue;

- }

- /*

- * If we already marked it as 'defer' in a

- * previous pass, then try to process it this

- * time and un-mark it.

- */

- if (fp->f_gcflag & FDEFER) {

- fp->f_gcflag &= ~FDEFER;

- unp_defer--;

- } else {

- /*

- * If it's not deferred, then check if it's

- * already marked.. if so skip it

- */

- if (fp->f_gcflag & FMARK) {

- FILE_UNLOCK(fp);

- continue;

- }

- /*

- * If all references are from messages in

- * transit, then skip it. it's not externally

- * accessible.

- */

- if (fp->f_count == fp->f_msgcount) {

- FILE_UNLOCK(fp);

- continue;

- }

- /*

- * If it got this far then it must be

- * externally accessible.

- */

- fp->f_gcflag |= FMARK;

- }

- /*

- * Either it was deferred, or it is externally

- * accessible and not already marked so. Now check

- * if it is possibly one of OUR sockets.

- */

- if (fp->f_type != DTYPE_SOCKET ||

- (so = fp->f_data) == NULL) {

- FILE_UNLOCK(fp);

- continue;

- }

- if (so->so_proto->pr_domain != &localdomain ||

- (so->so_proto->pr_flags & PR_RIGHTS) == 0) {

- FILE_UNLOCK(fp);

- continue;

+ unp_unreachable = 0;

+ unp_marked = 0;

+ for (head = heads; *head != NULL; head++)

+ LIST_FOREACH(unp, *head, unp_link)

+ unp_gc_process(unp);

+ } while (unp_marked);

+ UNP_GLOBAL_RUNLOCK();

+ if (unp_unreachable == 0)

+ return;

+ /*

+ * Allocate space for a local list of dead unpcbs.

+ */

+ unref = malloc(unp_unreachable * sizeof(struct file *),

+ M_TEMP, M_WAITOK);

+ /*

+ * Iterate looking for sockets which have been specifically marked

+ * as as unreachable and store them locally.

+ */

+ UNP_GLOBAL_RLOCK();

+ for (i = 0, head = heads; *head != NULL; head++)

+ LIST_FOREACH(unp, *head, unp_link)

+ if (unp->unp_gcflag & UNPGC_DEAD) {

+ unref[i++] = unp->unp_file;

+ KASSERT(unp->unp_file != NULL,

+ ("unp_gc: Invalid unpcb."));

+ KASSERT(i <= unp_unreachable,

+ ("unp_gc: incorrect unreachable count."));

}

- /*

- * Tell any other threads that do a subsequent

- * fdrop() that we are scanning the message

- * buffers.

- */

- fp->f_gcflag |= FWAIT;

- FILE_UNLOCK(fp);

- /*

- * So, Ok, it's one of our sockets and it IS

- * externally accessible (or was deferred). Now we

- * look to see if we hold any file descriptors in its

- * message buffers. Follow those links and mark them

- * as accessible too.

- */

- SOCKBUF_LOCK(&so->so_rcv);

- unp_scan(so->so_rcv.sb_mb, unp_mark);

- SOCKBUF_UNLOCK(&so->so_rcv);

- /*

- * Wake up any threads waiting in fdrop().

- */

- FILE_LOCK(fp);

- fp->f_gcflag &= ~FWAIT;

- wakeup(&fp->f_gcflag);

- FILE_UNLOCK(fp);

- }

- } while (unp_defer);

- sx_sunlock(&filelist_lock);

+ UNP_GLOBAL_RUNLOCK();

- * XXXRW: The following comments need updating for a post-SMPng and

- * deferred unp_gc() world, but are still generally accurate.

- *

- * We grab an extra reference to each of the file table entries that

- * are not otherwise accessible and then free the rights that are

- * stored in messages on them.

- *

- * The bug in the orginal code is a little tricky, so I'll describe

- * what's wrong with it here.

- *

- * It is incorrect to simply unp_discard each entry for f_msgcount

- * times -- consider the case of sockets A and B that contain

- * references to each other. On a last close of some other socket,

- * we trigger a gc since the number of outstanding rights (unp_rights)

- * is non-zero. If during the sweep phase the gc code unp_discards,

- * we end up doing a (full) closef on the descriptor. A closef on A

- * results in the following chain. Closef calls soo_close, which

- * calls soclose. Soclose calls first (through the switch

- * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply

- * returns because the previous instance had set unp_gcing, and we

- * return all the way back to soclose, which marks the socket with

- * SS_NOFDREF, and then calls sofree. Sofree calls sorflush to free

- * up the rights that are queued in messages on the socket A, i.e.,

- * the reference on B. The sorflush calls via the dom_dispose switch

- * unp_dispose, which unp_scans with unp_discard. This second

- * instance of unp_discard just calls closef on B.

- *

- * Well, a similar chain occurs on B, resulting in a sorflush on B,

- * which results in another closef on A. Unfortunately, A is already

- * being closed, and the descriptor has already been marked with

- * SS_NOFDREF, and soclose panics at this point.

- *

- * Here, we first take an extra reference to each inaccessible

- * descriptor. Then, we call sorflush ourself, since we know it is a

- * Unix domain socket anyhow. After we destroy all the rights

- * carried in messages, we do a last closef to get rid of our extra

- * reference. This is the last close, and the unp_detach etc will

- * shut down the socket.

- *

- * 91/09/19, bsy@cs.cmu.edu

+ * All further operation is now done on a local list. We first ref

+ * all sockets to avoid closing them until all are flushed.

-again:

- nfiles_snap = openfiles + nfiles_slack; /* some slack */

- extra_ref = malloc(nfiles_snap * sizeof(struct file *), M_TEMP,

- M_WAITOK);

- sx_slock(&filelist_lock);

- if (nfiles_snap < openfiles) {

- sx_sunlock(&filelist_lock);

- free(extra_ref, M_TEMP);

- nfiles_slack += 20;

- goto again;

- }

- for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref;

- fp != NULL; fp = nextfp) {

- nextfp = LIST_NEXT(fp, f_list);

- FILE_LOCK(fp);

- /*

- * If it's not open, skip it

- */

- if (fp->f_count == 0) {

- FILE_UNLOCK(fp);

- continue;

- }

- /*

- * If all refs are from msgs, and it's not marked accessible

- * then it must be referenced from some unreachable cycle of

- * (shut-down) FDs, so include it in our list of FDs to

- * remove.

- */

- if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) {

- *fpp++ = fp;

- nunref++;

- fp->f_count++;

- }

- FILE_UNLOCK(fp);

- }

- sx_sunlock(&filelist_lock);

+ for (i = 0; i < unp_unreachable; i++)

+ fhold(unref[i]);

- * For each FD on our hit list, do the following two things:

+ * Now flush all sockets, free'ing rights. This will free the

+ * struct files associated with these sockets but leave each socket

+ * with one remaining ref.

- for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {

- struct file *tfp = *fpp;

- FILE_LOCK(tfp);

- if (tfp->f_type == DTYPE_SOCKET &&

- tfp->f_data != NULL) {

- FILE_UNLOCK(tfp);

- sorflush(tfp->f_data);

- } else {

- FILE_UNLOCK(tfp);

- }

- for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {

- closef(*fpp, (struct thread *) NULL);

- unp_recycled++;

- }

- free(extra_ref, M_TEMP);

+ for (i = 0; i < unp_unreachable; i++)

+ sorflush(unref[i]->f_data);

+ /*

+ * And finally release the sockets so they can be reclaimed.

+ */

+ for (i = 0; i < unp_unreachable; i++)

+ fdrop(unref[i], NULL);

+ unp_recycled += unp_unreachable;

+ free(unref, M_TEMP);

}

void

@@ -2143,31 +2106,6 @@ unp_scan(struct mbuf *m0, void (*op)(struct file *))

}

-static void

-unp_mark(struct file *fp)

- /* XXXRW: Should probably assert file list lock here. */

- if (fp->f_gcflag & FMARK)

- return;

- unp_defer++;

- fp->f_gcflag |= (FMARK|FDEFER);

-static void

-unp_discard(struct file *fp)

- UNP_GLOBAL_WLOCK();

- FILE_LOCK(fp);

- fp->f_msgcount--;

- unp_rights--;

- FILE_UNLOCK(fp);

- UNP_GLOBAL_WUNLOCK();

- (void) closef(fp, (struct thread *)NULL);

#ifdef DDB

static void

db_print_indent(int indent)

diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 20d722efb646..0e42ea393f3f 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c

@@ -1022,6 +1022,8 @@ kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,

return (error);

/* An extra reference on `nfp' has been held for us by falloc(). */

fp = nfp;

+ /* Set the flags early so the finit in devfs can pick them up. */

+ fp->f_flag = flags & FMASK;

cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;

NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg, path, td);

td->td_dupfd = -1; /* XXX check for fdopen */

@@ -1067,16 +1069,16 @@ kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,

NDFREE(&nd, NDF_ONLY_PNBUF);

vp = nd.ni_vp;

- FILE_LOCK(fp);

- fp->f_vnode = vp;

- if (fp->f_data == NULL)

- fp->f_data = vp;

- fp->f_flag = flags & FMASK;

- fp->f_seqcount = 1;

- fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE);

- if (fp->f_ops == &badfileops)

- fp->f_ops = &vnops;

- FILE_UNLOCK(fp);

+ fp->f_vnode = vp; /* XXX Does devfs need this? */

+ /*

+ * If the file wasn't claimed by devfs bind it to the normal

+ * vnode operations here.

+ */

+ if (fp->f_ops == &badfileops) {

+ KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));

+ fp->f_seqcount = 1;

+ finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops);

+ }

VOP_UNLOCK(vp, 0, td);

if (flags & (O_EXLOCK | O_SHLOCK)) {

@@ -1093,7 +1095,7 @@ kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,

if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,

type)) != 0)

goto bad;

- fp->f_flag |= FHASLOCK;

+ atomic_set_int(&fp->f_flag, FHASLOCK);

}

if (flags & O_TRUNC) {

if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)

@@ -4179,14 +4181,8 @@ fhopen(td, uap)

}

/* An extra reference on `nfp' has been held for us by falloc(). */

fp = nfp;

- FILE_LOCK(nfp);

nfp->f_vnode = vp;

- nfp->f_data = vp;

- nfp->f_flag = fmode & FMASK;

- nfp->f_type = DTYPE_VNODE;

- nfp->f_ops = &vnops;

- FILE_UNLOCK(nfp);

+ finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);

if (fmode & (O_EXLOCK | O_SHLOCK)) {

lf.l_whence = SEEK_SET;

lf.l_start = 0;

@@ -4215,7 +4211,7 @@ fhopen(td, uap)

goto out;

}

vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);

- fp->f_flag |= FHASLOCK;

+ atomic_set_int(&fp->f_flag, FHASLOCK);

}

VOP_UNLOCK(vp, 0, td);

diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 50835747e78f..c7df6adef59f 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c

@@ -488,10 +488,12 @@ vn_read(fp, uio, active_cred, flags, td)

{

struct vnode *vp;

int error, ioflag;

+ struct mtx *mtxp;

int vfslocked;

KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",

uio->uio_td, td));

+ mtxp = NULL;

vp = fp->f_vnode;

ioflag = 0;

if (fp->f_flag & FNONBLOCK)

@@ -505,13 +507,15 @@ vn_read(fp, uio, active_cred, flags, td)

* It is now protected by the FOFFSET_LOCKED flag.

if ((flags & FOF_OFFSET) == 0) {

- FILE_LOCK(fp);

+ mtxp = mtx_pool_find(mtxpool_sleep, fp);

+ mtx_lock(mtxp);

while(fp->f_vnread_flags & FOFFSET_LOCKED) {

fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;

- msleep(&fp->f_vnread_flags,fp->f_mtxp,PUSER -1,"vnread offlock",0);

+ msleep(&fp->f_vnread_flags, mtxp, PUSER -1,

+ "vnread offlock", 0);

}

fp->f_vnread_flags |= FOFFSET_LOCKED;

- FILE_UNLOCK(fp);

+ mtx_unlock(mtxp);

vn_lock(vp, LK_SHARED | LK_RETRY, td);

uio->uio_offset = fp->f_offset;

} else

@@ -526,11 +530,11 @@ vn_read(fp, uio, active_cred, flags, td)

error = VOP_READ(vp, uio, ioflag, fp->f_cred);

if ((flags & FOF_OFFSET) == 0) {

fp->f_offset = uio->uio_offset;

- FILE_LOCK(fp);

+ mtx_lock(mtxp);

if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)

wakeup(&fp->f_vnread_flags);

fp->f_vnread_flags = 0;

- FILE_UNLOCK(fp);

+ mtx_unlock(mtxp);

}

fp->f_nextoff = uio->uio_offset;

VOP_UNLOCK(vp, 0, td);