diff options
author | Jeff Roberson <jeff@FreeBSD.org> | 2007-12-30 01:42:15 +0000 |
---|---|---|
committer | Jeff Roberson <jeff@FreeBSD.org> | 2007-12-30 01:42:15 +0000 |
commit | 397c19d1753d210247d77eb3ca33d1c7c1eb2fa9 (patch) | |
tree | 0f2354bfc200294c2629e6ecfba76e364beda579 /sys/kern | |
parent | 2a79fd39b4cf409d2c0bd7a449d7f3e91d7b9007 (diff) | |
download | src-397c19d1753d210247d77eb3ca33d1c7c1eb2fa9.tar.gz src-397c19d1753d210247d77eb3ca33d1c7c1eb2fa9.zip |
Remove explicit locking of struct file.
- Introduce a finit() which is used to initialize the fields of struct file
in such a way that the ops vector is only valid after the data, type,
and flags are valid.
- Protect f_flag and f_count with atomic operations.
- Remove the global list of all files and associated accounting.
- Rewrite the unp garbage collection such that it no longer requires
the global list of all files and instead uses a list of all unp sockets.
- Mark sockets in the accept queue so we don't incorrectly gc them.
Tested by: kris, pho
Notes
Notes:
svn path=/head/; revision=174988
Diffstat (limited to 'sys/kern')
-rw-r--r-- | sys/kern/kern_descrip.c | 172 | ||||
-rw-r--r-- | sys/kern/kern_event.c | 34 | ||||
-rw-r--r-- | sys/kern/sys_generic.c | 12 | ||||
-rw-r--r-- | sys/kern/sys_pipe.c | 14 | ||||
-rw-r--r-- | sys/kern/uipc_mqueue.c | 23 | ||||
-rw-r--r-- | sys/kern/uipc_syscalls.c | 33 | ||||
-rw-r--r-- | sys/kern/uipc_usrreq.c | 412 | ||||
-rw-r--r-- | sys/kern/vfs_syscalls.c | 34 | ||||
-rw-r--r-- | sys/kern/vfs_vnops.c | 14 |
9 files changed, 299 insertions, 449 deletions
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 274522f14c21..070fac7516df 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -95,7 +95,6 @@ static int do_dup(struct thread *td, enum dup_type type, int old, int new, static int fd_first_free(struct filedesc *, int, int); static int fd_last_used(struct filedesc *, int, int); static void fdgrowtable(struct filedesc *, int); -static int fdrop_locked(struct file *fp, struct thread *td); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); @@ -137,9 +136,7 @@ struct filedesc0 { /* * Descriptor management. */ -struct filelist filehead; /* head of list of open files */ -int openfiles; /* actual number of open files */ -struct sx filelist_lock; /* sx to protect filelist */ +volatile int openfiles; /* actual number of open files */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); @@ -428,9 +425,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) error = EBADF; break; } - FILE_LOCK(fp); td->td_retval[0] = OFLAGS(fp->f_flag); - FILE_UNLOCK(fp); FILEDESC_SUNLOCK(fdp); break; @@ -441,12 +436,13 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) error = EBADF; break; } - FILE_LOCK(fp); - fhold_locked(fp); - fp->f_flag &= ~FCNTLFLAGS; - fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; - FILE_UNLOCK(fp); + fhold(fp); FILEDESC_SUNLOCK(fdp); + do { + tmp = flg = fp->f_flag; + tmp &= ~FCNTLFLAGS; + tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; + } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error) { @@ -459,9 +455,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) fdrop(fp, td); break; } - FILE_LOCK(fp); - fp->f_flag &= ~FNONBLOCK; - FILE_UNLOCK(fp); + atomic_clear_int(&fp->f_flag, FNONBLOCK); tmp = 0; (void)fo_ioctl(fp, FIONBIO, 
&tmp, td->td_ucred, td); fdrop(fp, td); @@ -1359,15 +1353,13 @@ int falloc(struct thread *td, struct file **resultfp, int *resultfd) { struct proc *p = td->td_proc; - struct file *fp, *fq; + struct file *fp; int error, i; int maxuserfiles = maxfiles - (maxfiles / 20); static struct timeval lastfail; static int curfail; fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); - sx_xlock(&filelist_lock); - if ((openfiles >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles >= maxfiles) { @@ -1375,18 +1367,16 @@ falloc(struct thread *td, struct file **resultfp, int *resultfd) printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n", td->td_ucred->cr_ruid); } - sx_xunlock(&filelist_lock); uma_zfree(file_zone, fp); return (ENFILE); } - openfiles++; + atomic_add_int(&openfiles, 1); /* * If the process has file descriptor zero open, add the new file * descriptor to the list of open files at that point, otherwise * put it at the front of the list of open files. */ - fp->f_mtxp = mtx_pool_alloc(mtxpool_sleep); fp->f_count = 1; if (resultfp) fp->f_count++; @@ -1395,12 +1385,6 @@ falloc(struct thread *td, struct file **resultfp, int *resultfd) fp->f_data = NULL; fp->f_vnode = NULL; FILEDESC_XLOCK(p->p_fd); - if ((fq = p->p_fd->fd_ofiles[0])) { - LIST_INSERT_AFTER(fq, fp, f_list); - } else { - LIST_INSERT_HEAD(&filehead, fp, f_list); - } - sx_xunlock(&filelist_lock); if ((error = fdalloc(td, 0, &i))) { FILEDESC_XUNLOCK(p->p_fd); fdrop(fp, td); @@ -1962,6 +1946,23 @@ closef(struct file *fp, struct thread *td) } /* + * Initialize the file pointer with the specified properties. + * + * The ops are set with release semantics to be certain that the flags, type, + * and data are visible when ops is. This is to prevent ops methods from being + * called with bad data. 
+ */ +void +finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) +{ + fp->f_data = data; + fp->f_flag = flag; + fp->f_type = type; + atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); +} + + +/* * Extract the file pointer associated with the specified descriptor for the * current user process. * @@ -2135,54 +2136,20 @@ fputsock(struct socket *so) sorele(so); } -int -fdrop(struct file *fp, struct thread *td) -{ - - FILE_LOCK(fp); - return (fdrop_locked(fp, td)); -} - /* - * Drop reference on struct file passed in, may call closef if the - * reference hits zero. - * Expects struct file locked, and will unlock it. + * Handle the last reference to a file being closed. */ -static int -fdrop_locked(struct file *fp, struct thread *td) +int +_fdrop(struct file *fp, struct thread *td) { int error; - FILE_LOCK_ASSERT(fp, MA_OWNED); - - if (--fp->f_count > 0) { - FILE_UNLOCK(fp); - return (0); - } - - /* - * We might have just dropped the last reference to a file - * object that is for a UNIX domain socket whose message - * buffers are being examined in unp_gc(). If that is the - * case, FWAIT will be set in f_gcflag and we need to wait for - * unp_gc() to finish its scan. - */ - while (fp->f_gcflag & FWAIT) - msleep(&fp->f_gcflag, fp->f_mtxp, 0, "fpdrop", 0); - - /* We have the last ref so we can proceed without the file lock. 
*/ - FILE_UNLOCK(fp); - if (fp->f_count < 0) - panic("fdrop: count < 0"); + error = 0; + if (fp->f_count != 0) + panic("fdrop: count %d", fp->f_count); if (fp->f_ops != &badfileops) error = fo_close(fp, td); - else - error = 0; - - sx_xlock(&filelist_lock); - LIST_REMOVE(fp, f_list); - openfiles--; - sx_xunlock(&filelist_lock); + atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); uma_zfree(file_zone, fp); @@ -2225,9 +2192,7 @@ flock(struct thread *td, struct flock_args *uap) lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; - FILE_LOCK(fp); - fp->f_flag &= ~FHASLOCK; - FILE_UNLOCK(fp); + atomic_clear_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done2; } @@ -2239,9 +2204,7 @@ flock(struct thread *td, struct flock_args *uap) error = EBADF; goto done2; } - FILE_LOCK(fp); - fp->f_flag |= FHASLOCK; - FILE_UNLOCK(fp); + atomic_set_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done2: @@ -2286,9 +2249,7 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ - FILE_LOCK(wfp); if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { - FILE_UNLOCK(wfp); FILEDESC_XUNLOCK(fdp); return (EACCES); } @@ -2297,8 +2258,7 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; if (fp == NULL) fdused(fdp, indx); - fhold_locked(wfp); - FILE_UNLOCK(wfp); + fhold(wfp); FILEDESC_XUNLOCK(fdp); if (fp != NULL) /* @@ -2419,29 +2379,23 @@ sysctl_kern_file(SYSCTL_HANDLER_ARGS) struct proc *p; int error, n; - /* - * Note: because the number of file descriptors is calculated - * in different ways for sizing vs returning the data, - * there is information leakage from the first loop. 
However, - * it is of a similar order of magnitude to the leakage from - * global system statistics such as kern.openfiles. - */ error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { - n = 16; /* A slight overestimate. */ - sx_slock(&filelist_lock); - LIST_FOREACH(fp, &filehead, f_list) { - /* - * We should grab the lock, but this is an - * estimate, so does it really matter? - */ - /* mtx_lock(fp->f_mtxp); */ - n += fp->f_count; - /* mtx_unlock(f->f_mtxp); */ + n = 0; + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + fdp = fdhold(p); + if (fdp == NULL) + continue; + /* overestimates sparse tables. */ + n += fdp->fd_lastfile; + fddrop(fdp); } - sx_sunlock(&filelist_lock); + sx_sunlock(&allproc_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; @@ -2472,7 +2426,7 @@ sysctl_kern_file(SYSCTL_HANDLER_ARGS) xf.xf_vnode = fp->f_vnode; xf.xf_type = fp->f_type; xf.xf_count = fp->f_count; - xf.xf_msgcount = fp->f_msgcount; + xf.xf_msgcount = 0; xf.xf_offset = fp->f_offset; xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); @@ -2523,7 +2477,6 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) continue; bzero(kif, sizeof(*kif)); kif->kf_structsize = sizeof(*kif); - FILE_LOCK(fp); vp = NULL; so = NULL; kif->kf_fd = i; @@ -2531,7 +2484,6 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) case DTYPE_VNODE: kif->kf_type = KF_TYPE_VNODE; vp = fp->f_vnode; - vref(vp); break; case DTYPE_SOCKET: @@ -2583,8 +2535,8 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) if (fp->f_flag & FHASLOCK) kif->kf_flags |= KF_FLAG_HASLOCK; kif->kf_offset = fp->f_offset; - FILE_UNLOCK(fp); if (vp != NULL) { + vref(vp); switch (vp->v_type) { case VNON: kif->kf_vnode_type = KF_VTYPE_VNON; @@ -2736,7 +2688,7 @@ db_print_file(struct file *fp, int header) p = file_to_first_proc(fp); db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp, file_type_to_name(fp->f_type), fp->f_data, 
fp->f_flag, - fp->f_gcflag, fp->f_count, fp->f_msgcount, fp->f_vnode, + 0, fp->f_count, 0, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); } @@ -2754,13 +2706,24 @@ DB_SHOW_COMMAND(file, db_show_file) DB_SHOW_COMMAND(files, db_show_files) { + struct filedesc *fdp; struct file *fp; + struct proc *p; int header; + int n; header = 1; - LIST_FOREACH(fp, &filehead, f_list) { - db_print_file(fp, header); - header = 0; + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + if ((fdp = p->p_fd) == NULL) + continue; + for (n = 0; n < fdp->fd_nfiles; ++n) { + if ((fp = fdp->fd_ofiles[n]) == NULL) + continue; + db_print_file(fp, header); + header = 0; + } } } #endif @@ -2772,7 +2735,7 @@ SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, - &openfiles, 0, "System-wide number of open files"); + __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files"); /* ARGSUSED*/ static void @@ -2781,7 +2744,6 @@ filelistinit(void *dummy) file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - sx_init(&filelist_lock, "filelist lock"); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF); } diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index 4d75822dbff2..b5d01d07621c 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -531,12 +531,7 @@ kqueue(struct thread *td, struct kqueue_args *uap) SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); FILEDESC_XUNLOCK(fdp); - FILE_LOCK(fp); - fp->f_flag = FREAD | FWRITE; - fp->f_type = DTYPE_KQUEUE; - fp->f_data = kq; - fp->f_ops = &kqueueops; - FILE_UNLOCK(fp); + finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); fdrop(fp, td); td->td_retval[0] = fd; @@ -990,24 +985,17 @@ kqueue_acquire(struct file *fp, struct kqueue **kqp) error = 0; - FILE_LOCK(fp); - do { - kq = fp->f_data; - if (fp->f_type 
!= DTYPE_KQUEUE || kq == NULL) { - error = EBADF; - break; - } - *kqp = kq; - KQ_LOCK(kq); - if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { - KQ_UNLOCK(kq); - error = EBADF; - break; - } - kq->kq_refcnt++; + kq = fp->f_data; + if (fp->f_type != DTYPE_KQUEUE || kq == NULL) + return (EBADF); + *kqp = kq; + KQ_LOCK(kq); + if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); - } while (0); - FILE_UNLOCK(fp); + return (EBADF); + } + kq->kq_refcnt++; + KQ_UNLOCK(kq); return error; } diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 0800c08073e6..9c800f37d342 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -646,21 +646,17 @@ kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) FILEDESC_XUNLOCK(fdp); goto out; case FIONBIO: - FILE_LOCK(fp); if ((tmp = *(int *)data)) - fp->f_flag |= FNONBLOCK; + atomic_set_int(&fp->f_flag, FNONBLOCK); else - fp->f_flag &= ~FNONBLOCK; - FILE_UNLOCK(fp); + atomic_clear_int(&fp->f_flag, FNONBLOCK); data = (void *)&tmp; break; case FIOASYNC: - FILE_LOCK(fp); if ((tmp = *(int *)data)) - fp->f_flag |= FASYNC; + atomic_set_int(&fp->f_flag, FASYNC); else - fp->f_flag &= ~FASYNC; - FILE_UNLOCK(fp); + atomic_clear_int(&fp->f_flag, FASYNC); data = (void *)&tmp; break; } diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index 262ef0c1fec8..27ecf80896da 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -363,12 +363,7 @@ pipe(td, uap) * to avoid races against processes which manage to dup() the read * side while we are blocked trying to allocate the write side. */ - FILE_LOCK(rf); - rf->f_flag = FREAD | FWRITE; - rf->f_type = DTYPE_PIPE; - rf->f_data = rpipe; - rf->f_ops = &pipeops; - FILE_UNLOCK(rf); + finit(rf, FREAD | FWRITE, DTYPE_PIPE, rpipe, &pipeops); error = falloc(td, &wf, &fd); if (error) { fdclose(fdp, rf, td->td_retval[0], td); @@ -378,12 +373,7 @@ pipe(td, uap) return (error); } /* An extra reference on `wf' has been held for us by falloc(). 
*/ - FILE_LOCK(wf); - wf->f_flag = FREAD | FWRITE; - wf->f_type = DTYPE_PIPE; - wf->f_data = wpipe; - wf->f_ops = &pipeops; - FILE_UNLOCK(wf); + finit(wf, FREAD | FWRITE, DTYPE_PIPE, wpipe, &pipeops); fdrop(wf, td); td->td_retval[1] = fd; fdrop(rf, td); diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c index 1c5cadbc3311..8fe34bcba0a9 100644 --- a/sys/kern/uipc_mqueue.c +++ b/sys/kern/uipc_mqueue.c @@ -1999,12 +1999,8 @@ kmq_open(struct thread *td, struct kmq_open_args *uap) mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); - FILE_LOCK(fp); - fp->f_flag = (flags & (FREAD | FWRITE | O_NONBLOCK)); - fp->f_type = DTYPE_MQUEUE; - fp->f_data = pn; - fp->f_ops = &mqueueops; - FILE_UNLOCK(fp); + finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, + &mqueueops); FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[fd] == fp) @@ -2097,6 +2093,7 @@ kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) struct mqueue *mq; struct file *fp; struct mq_attr attr, oattr; + u_int oflag, flag; int error; if (uap->attr) { @@ -2112,13 +2109,15 @@ kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) oattr.mq_maxmsg = mq->mq_maxmsg; oattr.mq_msgsize = mq->mq_msgsize; oattr.mq_curmsgs = mq->mq_curmsgs; - FILE_LOCK(fp); - oattr.mq_flags = (O_NONBLOCK & fp->f_flag); if (uap->attr) { - fp->f_flag &= ~O_NONBLOCK; - fp->f_flag |= (attr.mq_flags & O_NONBLOCK); - } - FILE_UNLOCK(fp); + do { + oflag = flag = fp->f_flag; + flag &= ~O_NONBLOCK; + flag |= (attr.mq_flags & O_NONBLOCK); + } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0); + } else + oflag = fp->f_flag; + oattr.mq_flags = (O_NONBLOCK & oflag); fdrop(fp, td); if (uap->oattr) error = copyout(&oattr, uap->oattr, sizeof(oattr)); diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index faf7f24a66cc..616afa0e5d65 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -180,12 +180,7 @@ socket(td, uap) if (error) { fdclose(fdp, fp, fd, td); } else { - FILE_LOCK(fp); - 
fp->f_data = so; /* already has ref count */ - fp->f_flag = FREAD|FWRITE; - fp->f_type = DTYPE_SOCKET; - fp->f_ops = &socketops; - FILE_UNLOCK(fp); + finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops); td->td_retval[0] = fd; } fdrop(fp, td); @@ -423,12 +418,7 @@ kern_accept(struct thread *td, int s, struct sockaddr **name, if (pgid != 0) fsetown(pgid, &so->so_sigio); - FILE_LOCK(nfp); - nfp->f_data = so; /* nfp has ref count from falloc */ - nfp->f_flag = fflag; - nfp->f_type = DTYPE_SOCKET; - nfp->f_ops = &socketops; - FILE_UNLOCK(nfp); + finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); @@ -640,16 +630,8 @@ socketpair(td, uap) if (error) goto free4; } - FILE_LOCK(fp1); - fp1->f_flag = FREAD|FWRITE; - fp1->f_type = DTYPE_SOCKET; - fp1->f_ops = &socketops; - FILE_UNLOCK(fp1); - FILE_LOCK(fp2); - fp2->f_flag = FREAD|FWRITE; - fp2->f_type = DTYPE_SOCKET; - fp2->f_ops = &socketops; - FILE_UNLOCK(fp2); + finit(fp1, FREAD | FWRITE, DTYPE_SOCKET, fp1->f_data, &socketops); + finit(fp2, FREAD | FWRITE, DTYPE_SOCKET, fp2->f_data, &socketops); so1 = so2 = NULL; error = copyout(sv, uap->rsv, 2 * sizeof (int)); if (error) @@ -2270,12 +2252,7 @@ sctp_peeloff(td, uap) so->so_qstate &= ~SQ_COMP; so->so_head = NULL; ACCEPT_UNLOCK(); - FILE_LOCK(nfp); - nfp->f_data = so; - nfp->f_flag = fflag; - nfp->f_type = DTYPE_SOCKET; - nfp->f_ops = &socketops; - FILE_UNLOCK(nfp); + finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); if (error) goto noconnection; diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 9fea71b4b626..1d6cc464da0f 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -233,10 +233,11 @@ static void unp_shutdown(struct unpcb *); static void unp_drop(struct unpcb *, int); static void unp_gc(__unused void *, int); static void unp_scan(struct mbuf 
*, void (*)(struct file *)); -static void unp_mark(struct file *); static void unp_discard(struct file *); static void unp_freerights(struct file **, int); static int unp_internalize(struct mbuf **, struct thread *); +static void unp_internalize_fp(struct file *); +static void unp_externalize_fp(struct file *); static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *); /* @@ -586,9 +587,9 @@ uipc_detach(struct socket *so) unp_drop(ref, ECONNRESET); UNP_PCB_UNLOCK(ref); } + local_unp_rights = unp_rights; UNP_GLOBAL_WUNLOCK(); unp->unp_socket->so_pcb = NULL; - local_unp_rights = unp_rights; saved_unp_addr = unp->unp_addr; unp->unp_addr = NULL; unp->unp_refcount--; @@ -1600,10 +1601,7 @@ unp_externalize(struct mbuf *control, struct mbuf **controlp) panic("unp_externalize fdalloc failed"); fp = *rp++; td->td_proc->p_fd->fd_ofiles[f] = fp; - FILE_LOCK(fp); - fp->f_msgcount--; - FILE_UNLOCK(fp); - unp_rights--; + unp_externalize_fp(fp); *fdp++ = f; } FILEDESC_XUNLOCK(td->td_proc->p_fd); @@ -1765,11 +1763,8 @@ unp_internalize(struct mbuf **controlp, struct thread *td) for (i = 0; i < oldfds; i++) { fp = fdescp->fd_ofiles[*fdp++]; *rp++ = fp; - FILE_LOCK(fp); - fp->f_count++; - fp->f_msgcount++; - FILE_UNLOCK(fp); - unp_rights++; + fhold(fp); + unp_internalize_fp(fp); } FILEDESC_SUNLOCK(fdescp); break; @@ -1860,230 +1855,198 @@ unp_addsockcred(struct thread *td, struct mbuf *control) return (m); } +static struct unpcb * +fptounp(struct file *fp) +{ + struct socket *so; + + if (fp->f_type != DTYPE_SOCKET) + return (NULL); + if ((so = fp->f_data) == NULL) + return (NULL); + if (so->so_proto->pr_domain != &localdomain) + return (NULL); + return sotounpcb(so); +} + +static void +unp_discard(struct file *fp) +{ + + unp_externalize_fp(fp); + (void) closef(fp, (struct thread *)NULL); +} + +static void +unp_internalize_fp(struct file *fp) +{ + struct unpcb *unp; + + UNP_GLOBAL_WLOCK(); + if ((unp = fptounp(fp)) != NULL) { + unp->unp_file = fp; + unp->unp_msgcount++; + } 
+ unp_rights++; + UNP_GLOBAL_WUNLOCK(); +} + +static void +unp_externalize_fp(struct file *fp) +{ + struct unpcb *unp; + + UNP_GLOBAL_WLOCK(); + if ((unp = fptounp(fp)) != NULL) + unp->unp_msgcount--; + unp_rights--; + UNP_GLOBAL_WUNLOCK(); +} + /* * unp_defer indicates whether additional work has been defered for a future * pass through unp_gc(). It is thread local and does not require explicit * synchronization. */ -static int unp_defer; +static int unp_marked; +static int unp_unreachable; -static int unp_taskcount; -SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, ""); +static void +unp_accessable(struct file *fp) +{ + struct unpcb *unp; + + unp = fptounp(fp); + if (fp == NULL) + return; + if (unp->unp_gcflag & UNPGC_REF) + return; + unp->unp_gcflag &= ~UNPGC_DEAD; + unp->unp_gcflag |= UNPGC_REF; + unp_marked++; +} + +static void +unp_gc_process(struct unpcb *unp) +{ + struct socket *soa; + struct socket *so; + struct file *fp; + + /* Already processed. */ + if (unp->unp_gcflag & UNPGC_SCANNED) + return; + fp = unp->unp_file; + /* + * Check for a socket potentially in a cycle. It must be in a + * queue as indicated by msgcount, and this must equal the file + * reference count. Note that when msgcount is 0 the file is NULL. + */ + if (unp->unp_msgcount != 0 && fp->f_count != 0 && + fp->f_count == unp->unp_msgcount) { + unp->unp_gcflag |= UNPGC_DEAD; + unp_unreachable++; + return; + } + /* + * Mark all sockets we reference with RIGHTS. + */ + so = unp->unp_socket; + SOCKBUF_LOCK(&so->so_rcv); + unp_scan(so->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&so->so_rcv); + /* + * Mark all sockets in our accept queue. 
+ */ + ACCEPT_LOCK(); + TAILQ_FOREACH(soa, &so->so_comp, so_list) { + SOCKBUF_LOCK(&soa->so_rcv); + unp_scan(soa->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&soa->so_rcv); + } + ACCEPT_UNLOCK(); + unp->unp_gcflag |= UNPGC_SCANNED; +} static int unp_recycled; SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, ""); +static int unp_taskcount; +SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, ""); + static void unp_gc(__unused void *arg, int pending) { - struct file *fp, *nextfp; - struct socket *so; - struct file **extra_ref, **fpp; - int nunref, i; - int nfiles_snap; - int nfiles_slack = 20; + struct unp_head *heads[] = { &unp_dhead, &unp_shead, NULL }; + struct unp_head **head; + struct file **unref; + struct unpcb *unp; + int i; unp_taskcount++; - unp_defer = 0; + UNP_GLOBAL_RLOCK(); + /* + * First clear all gc flags from previous runs. + */ + for (head = heads; *head != NULL; head++) + LIST_FOREACH(unp, *head, unp_link) + unp->unp_gcflag &= ~(UNPGC_REF|UNPGC_DEAD); /* - * Before going through all this, set all FDs to be NOT deferred and - * NOT externally accessible. + * Scan marking all reachable sockets with UNPGC_REF. Once a socket + * is reachable all of the sockets it references are reachable. + * Stop the scan once we do a complete loop without discovering + * a new reachable socket. */ - sx_slock(&filelist_lock); - LIST_FOREACH(fp, &filehead, f_list) - fp->f_gcflag &= ~(FMARK|FDEFER); do { - KASSERT(unp_defer >= 0, ("unp_gc: unp_defer %d", unp_defer)); - LIST_FOREACH(fp, &filehead, f_list) { - FILE_LOCK(fp); - /* - * If the file is not open, skip it -- could be a - * file in the process of being opened, or in the - * process of being closed. If the file is - * "closing", it may have been marked for deferred - * consideration. Clear the flag now if so. 
- */ - if (fp->f_count == 0) { - if (fp->f_gcflag & FDEFER) - unp_defer--; - fp->f_gcflag &= ~(FMARK|FDEFER); - FILE_UNLOCK(fp); - continue; - } - /* - * If we already marked it as 'defer' in a - * previous pass, then try to process it this - * time and un-mark it. - */ - if (fp->f_gcflag & FDEFER) { - fp->f_gcflag &= ~FDEFER; - unp_defer--; - } else { - /* - * If it's not deferred, then check if it's - * already marked.. if so skip it - */ - if (fp->f_gcflag & FMARK) { - FILE_UNLOCK(fp); - continue; - } - /* - * If all references are from messages in - * transit, then skip it. it's not externally - * accessible. - */ - if (fp->f_count == fp->f_msgcount) { - FILE_UNLOCK(fp); - continue; - } - /* - * If it got this far then it must be - * externally accessible. - */ - fp->f_gcflag |= FMARK; - } - /* - * Either it was deferred, or it is externally - * accessible and not already marked so. Now check - * if it is possibly one of OUR sockets. - */ - if (fp->f_type != DTYPE_SOCKET || - (so = fp->f_data) == NULL) { - FILE_UNLOCK(fp); - continue; - } - if (so->so_proto->pr_domain != &localdomain || - (so->so_proto->pr_flags & PR_RIGHTS) == 0) { - FILE_UNLOCK(fp); - continue; + unp_unreachable = 0; + unp_marked = 0; + for (head = heads; *head != NULL; head++) + LIST_FOREACH(unp, *head, unp_link) + unp_gc_process(unp); + } while (unp_marked); + UNP_GLOBAL_RUNLOCK(); + if (unp_unreachable == 0) + return; + /* + * Allocate space for a local list of dead unpcbs. + */ + unref = malloc(unp_unreachable * sizeof(struct file *), + M_TEMP, M_WAITOK); + /* + * Iterate looking for sockets which have been specifically marked + * as as unreachable and store them locally. 
+ */ + UNP_GLOBAL_RLOCK(); + for (i = 0, head = heads; *head != NULL; head++) + LIST_FOREACH(unp, *head, unp_link) + if (unp->unp_gcflag & UNPGC_DEAD) { + unref[i++] = unp->unp_file; + KASSERT(unp->unp_file != NULL, + ("unp_gc: Invalid unpcb.")); + KASSERT(i <= unp_unreachable, + ("unp_gc: incorrect unreachable count.")); } - - /* - * Tell any other threads that do a subsequent - * fdrop() that we are scanning the message - * buffers. - */ - fp->f_gcflag |= FWAIT; - FILE_UNLOCK(fp); - - /* - * So, Ok, it's one of our sockets and it IS - * externally accessible (or was deferred). Now we - * look to see if we hold any file descriptors in its - * message buffers. Follow those links and mark them - * as accessible too. - */ - SOCKBUF_LOCK(&so->so_rcv); - unp_scan(so->so_rcv.sb_mb, unp_mark); - SOCKBUF_UNLOCK(&so->so_rcv); - - /* - * Wake up any threads waiting in fdrop(). - */ - FILE_LOCK(fp); - fp->f_gcflag &= ~FWAIT; - wakeup(&fp->f_gcflag); - FILE_UNLOCK(fp); - } - } while (unp_defer); - sx_sunlock(&filelist_lock); + UNP_GLOBAL_RUNLOCK(); /* - * XXXRW: The following comments need updating for a post-SMPng and - * deferred unp_gc() world, but are still generally accurate. - * - * We grab an extra reference to each of the file table entries that - * are not otherwise accessible and then free the rights that are - * stored in messages on them. - * - * The bug in the orginal code is a little tricky, so I'll describe - * what's wrong with it here. - * - * It is incorrect to simply unp_discard each entry for f_msgcount - * times -- consider the case of sockets A and B that contain - * references to each other. On a last close of some other socket, - * we trigger a gc since the number of outstanding rights (unp_rights) - * is non-zero. If during the sweep phase the gc code unp_discards, - * we end up doing a (full) closef on the descriptor. A closef on A - * results in the following chain. Closef calls soo_close, which - * calls soclose. 
Soclose calls first (through the switch - * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply - * returns because the previous instance had set unp_gcing, and we - * return all the way back to soclose, which marks the socket with - * SS_NOFDREF, and then calls sofree. Sofree calls sorflush to free - * up the rights that are queued in messages on the socket A, i.e., - * the reference on B. The sorflush calls via the dom_dispose switch - * unp_dispose, which unp_scans with unp_discard. This second - * instance of unp_discard just calls closef on B. - * - * Well, a similar chain occurs on B, resulting in a sorflush on B, - * which results in another closef on A. Unfortunately, A is already - * being closed, and the descriptor has already been marked with - * SS_NOFDREF, and soclose panics at this point. - * - * Here, we first take an extra reference to each inaccessible - * descriptor. Then, we call sorflush ourself, since we know it is a - * Unix domain socket anyhow. After we destroy all the rights - * carried in messages, we do a last closef to get rid of our extra - * reference. This is the last close, and the unp_detach etc will - * shut down the socket. - * - * 91/09/19, bsy@cs.cmu.edu + * All further operation is now done on a local list. We first ref + * all sockets to avoid closing them until all are flushed. 
*/ -again: - nfiles_snap = openfiles + nfiles_slack; /* some slack */ - extra_ref = malloc(nfiles_snap * sizeof(struct file *), M_TEMP, - M_WAITOK); - sx_slock(&filelist_lock); - if (nfiles_snap < openfiles) { - sx_sunlock(&filelist_lock); - free(extra_ref, M_TEMP); - nfiles_slack += 20; - goto again; - } - for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; - fp != NULL; fp = nextfp) { - nextfp = LIST_NEXT(fp, f_list); - FILE_LOCK(fp); - /* - * If it's not open, skip it - */ - if (fp->f_count == 0) { - FILE_UNLOCK(fp); - continue; - } - /* - * If all refs are from msgs, and it's not marked accessible - * then it must be referenced from some unreachable cycle of - * (shut-down) FDs, so include it in our list of FDs to - * remove. - */ - if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) { - *fpp++ = fp; - nunref++; - fp->f_count++; - } - FILE_UNLOCK(fp); - } - sx_sunlock(&filelist_lock); + for (i = 0; i < unp_unreachable; i++) + fhold(unref[i]); /* - * For each FD on our hit list, do the following two things: + * Now flush all sockets, free'ing rights. This will free the + * struct files associated with these sockets but leave each socket + * with one remaining ref. */ - for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { - struct file *tfp = *fpp; - FILE_LOCK(tfp); - if (tfp->f_type == DTYPE_SOCKET && - tfp->f_data != NULL) { - FILE_UNLOCK(tfp); - sorflush(tfp->f_data); - } else { - FILE_UNLOCK(tfp); - } - } - for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { - closef(*fpp, (struct thread *) NULL); - unp_recycled++; - } - free(extra_ref, M_TEMP); + for (i = 0; i < unp_unreachable; i++) + sorflush(unref[i]->f_data); + /* + * And finally release the sockets so they can be reclaimed. 
+ */ + for (i = 0; i < unp_unreachable; i++) + fdrop(unref[i], NULL); + unp_recycled += unp_unreachable; + free(unref, M_TEMP); } void @@ -2143,31 +2106,6 @@ unp_scan(struct mbuf *m0, void (*op)(struct file *)) } } -static void -unp_mark(struct file *fp) -{ - - /* XXXRW: Should probably assert file list lock here. */ - - if (fp->f_gcflag & FMARK) - return; - unp_defer++; - fp->f_gcflag |= (FMARK|FDEFER); -} - -static void -unp_discard(struct file *fp) -{ - - UNP_GLOBAL_WLOCK(); - FILE_LOCK(fp); - fp->f_msgcount--; - unp_rights--; - FILE_UNLOCK(fp); - UNP_GLOBAL_WUNLOCK(); - (void) closef(fp, (struct thread *)NULL); -} - #ifdef DDB static void db_print_indent(int indent) diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 20d722efb646..0e42ea393f3f 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1022,6 +1022,8 @@ kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags, return (error); /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; + /* Set the flags early so the finit in devfs can pick them up. */ + fp->f_flag = flags & FMASK; cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg, path, td); td->td_dupfd = -1; /* XXX check for fdopen */ @@ -1067,16 +1069,16 @@ kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags, NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; - FILE_LOCK(fp); - fp->f_vnode = vp; - if (fp->f_data == NULL) - fp->f_data = vp; - fp->f_flag = flags & FMASK; - fp->f_seqcount = 1; - fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); - if (fp->f_ops == &badfileops) - fp->f_ops = &vnops; - FILE_UNLOCK(fp); + fp->f_vnode = vp; /* XXX Does devfs need this? */ + /* + * If the file wasn't claimed by devfs bind it to the normal + * vnode operations here. 
+ */ + if (fp->f_ops == &badfileops) { + KASSERT(vp->v_type != VFIFO, ("Unexpected fifo.")); + fp->f_seqcount = 1; + finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops); + } VOP_UNLOCK(vp, 0, td); if (flags & (O_EXLOCK | O_SHLOCK)) { @@ -1093,7 +1095,7 @@ kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags, if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) goto bad; - fp->f_flag |= FHASLOCK; + atomic_set_int(&fp->f_flag, FHASLOCK); } if (flags & O_TRUNC) { if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) @@ -4179,14 +4181,8 @@ fhopen(td, uap) } /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; - - FILE_LOCK(nfp); nfp->f_vnode = vp; - nfp->f_data = vp; - nfp->f_flag = fmode & FMASK; - nfp->f_type = DTYPE_VNODE; - nfp->f_ops = &vnops; - FILE_UNLOCK(nfp); + finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops); if (fmode & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; @@ -4215,7 +4211,7 @@ fhopen(td, uap) goto out; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); - fp->f_flag |= FHASLOCK; + atomic_set_int(&fp->f_flag, FHASLOCK); } VOP_UNLOCK(vp, 0, td); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 50835747e78f..c7df6adef59f 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -488,10 +488,12 @@ vn_read(fp, uio, active_cred, flags, td) { struct vnode *vp; int error, ioflag; + struct mtx *mtxp; int vfslocked; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); + mtxp = NULL; vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) @@ -505,13 +507,15 @@ vn_read(fp, uio, active_cred, flags, td) * It is now protected by the FOFFSET_LOCKED flag. 
*/ if ((flags & FOF_OFFSET) == 0) { - FILE_LOCK(fp); + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); while(fp->f_vnread_flags & FOFFSET_LOCKED) { fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; - msleep(&fp->f_vnread_flags,fp->f_mtxp,PUSER -1,"vnread offlock",0); + msleep(&fp->f_vnread_flags, mtxp, PUSER -1, + "vnread offlock", 0); } fp->f_vnread_flags |= FOFFSET_LOCKED; - FILE_UNLOCK(fp); + mtx_unlock(mtxp); vn_lock(vp, LK_SHARED | LK_RETRY, td); uio->uio_offset = fp->f_offset; } else @@ -526,11 +530,11 @@ vn_read(fp, uio, active_cred, flags, td) error = VOP_READ(vp, uio, ioflag, fp->f_cred); if ((flags & FOF_OFFSET) == 0) { fp->f_offset = uio->uio_offset; - FILE_LOCK(fp); + mtx_lock(mtxp); if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) wakeup(&fp->f_vnread_flags); fp->f_vnread_flags = 0; - FILE_UNLOCK(fp); + mtx_unlock(mtxp); } fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0, td); |