diff options
author | Gleb Smirnoff <glebius@FreeBSD.org> | 2017-06-08 21:30:34 +0000 |
---|---|---|
committer | Gleb Smirnoff <glebius@FreeBSD.org> | 2017-06-08 21:30:34 +0000 |
commit | 779f106aa169256b7010a1d8f963ff656b881e92 (patch) | |
tree | 01f96ac5827e2a95a385258086155b37b80ec6b9 /sys/kern/uipc_socket.c | |
parent | 4623e047a7c87148916e11e6c7b80409f6e45519 (diff) | |
download | src-779f106aa169256b7010a1d8f963ff656b881e92.tar.gz src-779f106aa169256b7010a1d8f963ff656b881e92.zip |
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
Notes
Notes:
svn path=/head/; revision=319722
Diffstat (limited to 'sys/kern/uipc_socket.c')
-rw-r--r-- | sys/kern/uipc_socket.c | 851 |
1 files changed, 599 insertions, 252 deletions
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index f66b32518bfc..7a8d2ea17a77 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -106,6 +106,7 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_compat.h" +#include "opt_sctp.h" #include <sys/param.h> #include <sys/systm.h> @@ -154,13 +155,21 @@ __FBSDID("$FreeBSD$"); static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); +static void so_rdknl_lock(void *); +static void so_rdknl_unlock(void *); +static void so_rdknl_assert_locked(void *); +static void so_rdknl_assert_unlocked(void *); +static void so_wrknl_lock(void *); +static void so_wrknl_unlock(void *); +static void so_wrknl_assert_locked(void *); +static void so_wrknl_assert_unlocked(void *); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); -static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); static int filt_soempty(struct knote *kn, long hint); +static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { @@ -393,8 +402,16 @@ soalloc(struct vnet *vnet) return (NULL); } + /* + * The socket locking protocol allows to lock 2 sockets at a time, + * however, the first one must be a listening socket. WITNESS lacks + * a feature to change class of an existing lock, so we use DUPOK. + */ + mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); + so->so_rcv.sb_sel = &so->so_rdsel; + so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd.sb_sx, "so_snd_sx"); sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); @@ -450,9 +467,6 @@ sodealloc(struct socket *so) if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); - /* remove accept filter if one is present. */ - if (so->so_accf != NULL) - accept_filt_setopt(so, NULL); #ifdef MAC mac_socket_destroy(so); #endif @@ -460,10 +474,16 @@ sodealloc(struct socket *so) crfree(so->so_cred); khelp_destroy_osd(&so->osd); - sx_destroy(&so->so_snd.sb_sx); - sx_destroy(&so->so_rcv.sb_sx); - SOCKBUF_LOCK_DESTROY(&so->so_snd); - SOCKBUF_LOCK_DESTROY(&so->so_rcv); + if (SOLISTENING(so)) { + if (so->sol_accept_filter != NULL) + accept_filt_setopt(so, NULL); + } else { + sx_destroy(&so->so_snd.sb_sx); + sx_destroy(&so->so_rcv.sb_sx); + SOCKBUF_LOCK_DESTROY(&so->so_snd); + SOCKBUF_LOCK_DESTROY(&so->so_rcv); + } + mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } @@ -506,8 +526,6 @@ socreate(int dom, struct socket **aso, int type, int proto, if (so == NULL) return (ENOBUFS); - TAILQ_INIT(&so->so_incomp); - TAILQ_INIT(&so->so_comp); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || @@ -520,9 +538,10 @@ socreate(int dom, struct socket **aso, int type, int proto, #ifdef MAC mac_socket_create(cred, so); #endif - knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); - knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); - so->so_count = 1; + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. @@ -531,12 +550,10 @@ socreate(int dom, struct socket **aso, int type, int proto, error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { - KASSERT(so->so_count == 1, ("socreate: so_count %d", - so->so_count)); - so->so_count = 0; sodealloc(so); return (error); } + soref(so); *aso = so; return (0); } @@ -564,11 +581,11 @@ sonewconn(struct socket *head, int connstatus) static int overcount; struct socket *so; - int over; + u_int over; - ACCEPT_LOCK(); - over = (head->so_qlen > 3 * head->so_qlimit / 2); - ACCEPT_UNLOCK(); + SOLISTEN_LOCK(head); + over = (head->sol_qlen > 3 * head->sol_qlimit / 2); + SOLISTEN_UNLOCK(head); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else @@ -580,15 +597,15 @@ sonewconn(struct socket *head, int connstatus) log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences)\n", - __func__, head->so_pcb, head->so_qlen, overcount); + __func__, head->so_pcb, head->sol_qlen, overcount); overcount = 0; } return (NULL); } - VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", - __func__, __LINE__, head)); + VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", + __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " @@ -596,11 +613,8 @@ sonewconn(struct socket *head, int connstatus) __func__, head->so_pcb); return (NULL); } - if ((head->so_options & SO_ACCEPTFILTER) != 0) - connstatus = 0; - so->so_head = head; + so->so_listen = head; so->so_type = head->so_type; - so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_fibnum = head->so_fibnum; @@ -609,10 +623,12 @@ sonewconn(struct socket *head, int connstatus) #ifdef MAC mac_socket_newconn(head, so); #endif - knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); - knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); VNET_SO_ASSERT(head); - if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); @@ -624,32 +640,24 @@ sonewconn(struct socket *head, int connstatus) __func__, head->so_pcb); return (NULL); } - so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; - so->so_snd.sb_lowat = head->so_snd.sb_lowat; - so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; - so->so_snd.sb_timeo = head->so_snd.sb_timeo; - so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; - so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; + so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; + so->so_snd.sb_lowat = head->sol_sbsnd_lowat; + so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; + so->so_snd.sb_timeo = head->sol_sbsnd_timeo; + so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; + + SOLISTEN_LOCK(head); + if (head->sol_accept_filter != NULL) + connstatus = 0; so->so_state |= connstatus; - ACCEPT_LOCK(); - /* - * The accept socket may be tearing down but we just - * won a race on the ACCEPT_LOCK. - * However, if sctp_peeloff() is called on a 1-to-many - * style socket, the SO_ACCEPTCONN doesn't need to be set. - */ - if (!(head->so_options & SO_ACCEPTCONN) && - ((head->so_proto->pr_protocol != IPPROTO_SCTP) || - (head->so_type != SOCK_SEQPACKET))) { - SOCK_LOCK(so); - so->so_head = NULL; - sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */ - return (NULL); - } + so->so_options = head->so_options & ~SO_ACCEPTCONN; + soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - so->so_qstate |= SQ_COMP; - head->so_qlen++; + TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); + so->so_qstate = SQ_COMP; + head->sol_qlen++; + solisten_wakeup(head); /* unlocks */ } else { /* * Keep removing sockets from the head until there's room for @@ -658,28 +666,86 @@ sonewconn(struct socket *head, int connstatus) * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ - while (head->so_incqlen > head->so_qlimit) { + while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; - sp = TAILQ_FIRST(&head->so_incomp); - TAILQ_REMOVE(&head->so_incomp, sp, so_list); - head->so_incqlen--; - sp->so_qstate &= ~SQ_INCOMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); + + sp = TAILQ_FIRST(&head->sol_incomp); + TAILQ_REMOVE(&head->sol_incomp, sp, so_list); + head->sol_incqlen--; + SOCK_LOCK(sp); + sp->so_qstate = SQ_NONE; + sp->so_listen = NULL; + SOCK_UNLOCK(sp); + sorele(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); - ACCEPT_LOCK(); + SOLISTEN_LOCK(head); } - TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); - so->so_qstate |= SQ_INCOMP; - head->so_incqlen++; + TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); + so->so_qstate = SQ_INCOMP; + head->sol_incqlen++; + SOLISTEN_UNLOCK(head); } - ACCEPT_UNLOCK(); - if (connstatus) { - sorwakeup(head); - wakeup_one(&head->so_timeo); + return (so); +} + +#ifdef SCTP +/* + * Socket part of sctp_peeloff(). Detach a new socket from an + * association. The new socket is returned with a reference. + */ +struct socket * +sopeeloff(struct socket *head) +{ + struct socket *so; + + VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", + __func__, __LINE__, head)); + so = soalloc(head->so_vnet); + if (so == NULL) { + log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " + "limit reached or out of memory\n", + __func__, head->so_pcb); + return (NULL); } + so->so_type = head->so_type; + so->so_options = head->so_options; + so->so_linger = head->so_linger; + so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; + so->so_fibnum = head->so_fibnum; + so->so_proto = head->so_proto; + so->so_cred = crhold(head->so_cred); +#ifdef MAC + mac_socket_newconn(head, so); +#endif + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); + VNET_SO_ASSERT(head); + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + sodealloc(so); + log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", + __func__, head->so_pcb); + return (NULL); + } + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sodealloc(so); + log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", + __func__, head->so_pcb); + return (NULL); + } + so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; + so->so_snd.sb_lowat = head->so_snd.sb_lowat; + so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; + so->so_snd.sb_timeo = head->so_snd.sb_timeo; + so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; + + soref(so); + return (so); } +#endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) @@ -741,13 +807,137 @@ solisten_proto_check(struct socket *so) void solisten_proto(struct socket *so, int backlog) { + int sbrcv_lowat, sbsnd_lowat; + u_int sbrcv_hiwat, sbsnd_hiwat; + short sbrcv_flags, sbsnd_flags; + sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); + if (SOLISTENING(so)) + goto listening; + + /* + * Change this socket to listening state. + */ + sbrcv_lowat = so->so_rcv.sb_lowat; + sbsnd_lowat = so->so_snd.sb_lowat; + sbrcv_hiwat = so->so_rcv.sb_hiwat; + sbsnd_hiwat = so->so_snd.sb_hiwat; + sbrcv_flags = so->so_rcv.sb_flags; + sbsnd_flags = so->so_snd.sb_flags; + sbrcv_timeo = so->so_rcv.sb_timeo; + sbsnd_timeo = so->so_snd.sb_timeo; + + sbdestroy(&so->so_snd, so); + sbdestroy(&so->so_rcv, so); + sx_destroy(&so->so_snd.sb_sx); + sx_destroy(&so->so_rcv.sb_sx); + SOCKBUF_LOCK_DESTROY(&so->so_snd); + SOCKBUF_LOCK_DESTROY(&so->so_rcv); + +#ifdef INVARIANTS + bzero(&so->so_rcv, + sizeof(struct socket) - offsetof(struct socket, so_rcv)); +#endif + + so->sol_sbrcv_lowat = sbrcv_lowat; + so->sol_sbsnd_lowat = sbsnd_lowat; + so->sol_sbrcv_hiwat = sbrcv_hiwat; + so->sol_sbsnd_hiwat = sbsnd_hiwat; + so->sol_sbrcv_flags = sbrcv_flags; + so->sol_sbsnd_flags = sbsnd_flags; + so->sol_sbrcv_timeo = sbrcv_timeo; + so->sol_sbsnd_timeo = sbsnd_timeo; + + so->sol_qlen = so->sol_incqlen = 0; + TAILQ_INIT(&so->sol_incomp); + TAILQ_INIT(&so->sol_comp); + + so->sol_accept_filter = NULL; + so->sol_accept_filter_arg = NULL; + so->sol_accept_filter_str = NULL; + + so->so_options |= SO_ACCEPTCONN; + +listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; - so->so_qlimit = backlog; - so->so_options |= SO_ACCEPTCONN; + so->sol_qlimit = backlog; +} + +/* + * Wakeup listeners/subsystems once we have a complete connection. + * Enters with lock, returns unlocked. + */ +void +solisten_wakeup(struct socket *sol) +{ + + if (sol->sol_upcall != NULL) + (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); + else { + selwakeuppri(&sol->so_rdsel, PSOCK); + KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); + } + SOLISTEN_UNLOCK(sol); + wakeup_one(&sol->sol_comp); +} + +/* + * Return single connection off a listening socket queue. Main consumer of + * the function is kern_accept4(). Some modules, that do their own accept + * management also use the function. + * + * Listening socket must be locked on entry and is returned unlocked on + * return. + * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. + */ +int +solisten_dequeue(struct socket *head, struct socket **ret, int flags) +{ + struct socket *so; + int error; + + SOLISTEN_LOCK_ASSERT(head); + + while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && + head->so_error == 0) { + error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH, + "accept", 0); + if (error != 0) { + SOLISTEN_UNLOCK(head); + return (error); + } + } + if (head->so_error) { + error = head->so_error; + head->so_error = 0; + SOLISTEN_UNLOCK(head); + return (error); + } + if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) { + SOLISTEN_UNLOCK(head); + return (EWOULDBLOCK); + } + so = TAILQ_FIRST(&head->sol_comp); + SOCK_LOCK(so); + KASSERT(so->so_qstate == SQ_COMP, + ("%s: so %p not SQ_COMP", __func__, so)); + soref(so); + head->sol_qlen--; + so->so_qstate = SQ_NONE; + so->so_listen = NULL; + TAILQ_REMOVE(&head->sol_comp, so, so_list); + if (flags & ACCEPT4_INHERIT) + so->so_state |= (head->so_state & SS_NBIO); + else + so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; + SOCK_UNLOCK(so); + sorele(head); + + *ret = so; + return (0); } /* @@ -774,44 +964,62 @@ void sofree(struct socket *so) { struct protosw *pr = so->so_proto; - struct socket *head; - ACCEPT_LOCK_ASSERT(); SOCK_LOCK_ASSERT(so); if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || - (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) { + (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) { SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); return; } - head = so->so_head; - if (head != NULL) { - KASSERT((so->so_qstate & SQ_COMP) != 0 || - (so->so_qstate & SQ_INCOMP) != 0, - ("sofree: so_head != NULL, but neither SQ_COMP nor " - "SQ_INCOMP")); - KASSERT((so->so_qstate & SQ_COMP) == 0 || - (so->so_qstate & SQ_INCOMP) == 0, - ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - so->so_qstate &= ~SQ_INCOMP; - so->so_head = NULL; - } - KASSERT((so->so_qstate & SQ_COMP) == 0 && - (so->so_qstate & SQ_INCOMP) == 0, - ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", - so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); - if (so->so_options & SO_ACCEPTCONN) { - KASSERT((TAILQ_EMPTY(&so->so_comp)), - ("sofree: so_comp populated")); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), - ("sofree: so_incomp populated")); + if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) { + struct socket *sol; + + sol = so->so_listen; + KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so)); + + /* + * To solve race between close of a listening socket and + * a socket on its incomplete queue, we need to lock both. + * The order is first listening socket, then regular. + * Since we don't have SS_NOFDREF neither SS_PROTOREF, this + * function and the listening socket are the only pointers + * to so. To preserve so and sol, we reference both and then + * relock. + * After relock the socket may not move to so_comp since it + * doesn't have PCB already, but it may be removed from + * so_incomp. If that happens, we share responsiblity on + * freeing the socket, but soclose() has already removed + * it from queue. + */ + soref(sol); + soref(so); + SOCK_UNLOCK(so); + SOLISTEN_LOCK(sol); + SOCK_LOCK(so); + if (so->so_qstate == SQ_INCOMP) { + KASSERT(so->so_listen == sol, + ("%s: so %p migrated out of sol %p", + __func__, so, sol)); + TAILQ_REMOVE(&sol->sol_incomp, so, so_list); + sol->sol_incqlen--; + /* This is guarenteed not to be the last. */ + refcount_release(&sol->so_count); + so->so_qstate = SQ_NONE; + so->so_listen = NULL; + } else + KASSERT(so->so_listen == NULL, + ("%s: so %p not on (in)comp with so_listen", + __func__, so)); + sorele(sol); + KASSERT(so->so_count == 1, + ("%s: so %p count %u", __func__, so, so->so_count)); + so->so_count = 0; } + if (SOLISTENING(so)) + so->so_error = ECONNABORTED; SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) @@ -833,12 +1041,14 @@ sofree(struct socket *so) * before calling pru_detach. This means that protocols shold not * assume they can perform socket wakeups, etc, in their detach code. */ - sbdestroy(&so->so_snd, so); - sbdestroy(&so->so_rcv, so); - seldrain(&so->so_snd.sb_sel); - seldrain(&so->so_rcv.sb_sel); - knlist_destroy(&so->so_rcv.sb_sel.si_note); - knlist_destroy(&so->so_snd.sb_sel.si_note); + if (!SOLISTENING(so)) { + sbdestroy(&so->so_snd, so); + sbdestroy(&so->so_rcv, so); + } + seldrain(&so->so_rdsel); + seldrain(&so->so_wrsel); + knlist_destroy(&so->so_rdsel.si_note); + knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } @@ -853,6 +1063,8 @@ sofree(struct socket *so) int soclose(struct socket *so) { + struct accept_queue lqueue; + bool listening; int error = 0; KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); @@ -885,41 +1097,42 @@ soclose(struct socket *so) drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); - ACCEPT_LOCK(); - if (so->so_options & SO_ACCEPTCONN) { + + SOCK_LOCK(so); + if ((listening = (so->so_options & SO_ACCEPTCONN))) { struct socket *sp; - /* - * Prevent new additions to the accept queues due - * to ACCEPT_LOCK races while we are draining them. - */ - so->so_options &= ~SO_ACCEPTCONN; - while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { - TAILQ_REMOVE(&so->so_incomp, sp, so_list); - so->so_incqlen--; - sp->so_qstate &= ~SQ_INCOMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); - soabort(sp); - ACCEPT_LOCK(); - } - while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { - TAILQ_REMOVE(&so->so_comp, sp, so_list); - so->so_qlen--; - sp->so_qstate &= ~SQ_COMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); - soabort(sp); - ACCEPT_LOCK(); + + TAILQ_INIT(&lqueue); + TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); + TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); + + so->sol_qlen = so->sol_incqlen = 0; + + TAILQ_FOREACH(sp, &lqueue, so_list) { + SOCK_LOCK(sp); + sp->so_qstate = SQ_NONE; + sp->so_listen = NULL; + SOCK_UNLOCK(sp); + /* Guaranteed not to be the last. */ + refcount_release(&so->so_count); } - KASSERT((TAILQ_EMPTY(&so->so_comp)), - ("%s: so_comp populated", __func__)); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), - ("%s: so_incomp populated", __func__)); } - SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; - sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */ + sorele(so); + if (listening) { + struct socket *sp; + + TAILQ_FOREACH(sp, &lqueue, so_list) { + SOCK_LOCK(sp); + if (sp->so_count == 0) { + SOCK_UNLOCK(sp); + soabort(sp); + } else + /* sp is now in sofree() */ + SOCK_UNLOCK(sp); + } + } CURVNET_RESTORE(); return (error); } @@ -951,13 +1164,11 @@ soabort(struct socket *so) KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); - KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP")); - KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP")); + KASSERT(so->so_qstate == SQ_NONE, ("soabort: !SQ_NONE")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); - ACCEPT_LOCK(); SOCK_LOCK(so); sofree(so); } @@ -2892,15 +3103,15 @@ integer: break; case SO_LISTENQLIMIT: - optval = so->so_qlimit; + optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: - optval = so->so_qlen; + optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: - optval = so->so_incqlen; + optval = SOLISTENING(so) ? so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: @@ -3047,7 +3258,7 @@ sohasoutofband(struct socket *so) if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); - selwakeuppri(&so->so_rcv.sb_sel, PSOCK); + selwakeuppri(&so->so_rdsel, PSOCK); } int @@ -3067,44 +3278,54 @@ int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { - int revents = 0; - - SOCKBUF_LOCK(&so->so_snd); - SOCKBUF_LOCK(&so->so_rcv); - if (events & (POLLIN | POLLRDNORM)) - if (soreadabledata(so)) - revents |= events & (POLLIN | POLLRDNORM); - - if (events & (POLLOUT | POLLWRNORM)) - if (sowriteable(so)) - revents |= events & (POLLOUT | POLLWRNORM); - - if (events & (POLLPRI | POLLRDBAND)) - if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) - revents |= events & (POLLPRI | POLLRDBAND); + int revents; - if ((events & POLLINIGNEOF) == 0) { - if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { - revents |= events & (POLLIN | POLLRDNORM); - if (so->so_snd.sb_state & SBS_CANTSENDMORE) - revents |= POLLHUP; + SOCK_LOCK(so); + if (SOLISTENING(so)) { + if (!(events & (POLLIN | POLLRDNORM))) + revents = 0; + else if (!TAILQ_EMPTY(&so->sol_comp)) + revents = events & (POLLIN | POLLRDNORM); + else { + selrecord(td, &so->so_rdsel); + revents = 0; } - } - - if (revents == 0) { - if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { - selrecord(td, &so->so_rcv.sb_sel); - so->so_rcv.sb_flags |= SB_SEL; + } else { + revents = 0; + SOCKBUF_LOCK(&so->so_snd); + SOCKBUF_LOCK(&so->so_rcv); + if (events & (POLLIN | POLLRDNORM)) + if (soreadabledata(so)) + revents |= events & (POLLIN | POLLRDNORM); + if (events & (POLLOUT | POLLWRNORM)) + if (sowriteable(so)) + revents |= events & (POLLOUT | POLLWRNORM); + if (events & (POLLPRI | POLLRDBAND)) + if (so->so_oobmark || + (so->so_rcv.sb_state & SBS_RCVATMARK)) + revents |= events & (POLLPRI | POLLRDBAND); + if ((events & POLLINIGNEOF) == 0) { + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + revents |= events & (POLLIN | POLLRDNORM); + if (so->so_snd.sb_state & SBS_CANTSENDMORE) + revents |= POLLHUP; + } } - - if (events & (POLLOUT | POLLWRNORM)) { - selrecord(td, &so->so_snd.sb_sel); - so->so_snd.sb_flags |= SB_SEL; + if (revents == 0) { + if (events & + (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { + selrecord(td, &so->so_rdsel); + so->so_rcv.sb_flags |= SB_SEL; + } + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(td, &so->so_wrsel); + so->so_snd.sb_flags |= SB_SEL; + } } + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_UNLOCK(&so->so_snd); } - - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_UNLOCK(so); return (revents); } @@ -3113,28 +3334,38 @@ soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; + struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; + knl = &so->so_rdsel.si_note; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; + knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; + knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; default: return (EINVAL); } - SOCKBUF_LOCK(sb); - knlist_add(&sb->sb_sel.si_note, kn, 1); - sb->sb_flags |= SB_KNOTE; - SOCKBUF_UNLOCK(sb); + SOCK_LOCK(so); + if (SOLISTENING(so)) { + knlist_add(knl, kn, 1); + } else { + SOCKBUF_LOCK(sb); + knlist_add(knl, kn, 1); + sb->sb_flags |= SB_KNOTE; + SOCKBUF_UNLOCK(sb); + } + SOCK_UNLOCK(so); return (0); } @@ -3313,11 +3544,11 @@ filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; - SOCKBUF_LOCK(&so->so_rcv); - knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); - if (knlist_empty(&so->so_rcv.sb_sel.si_note)) + so_rdknl_lock(so); + knlist_remove(&so->so_rdsel.si_note, kn, 1); + if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; - SOCKBUF_UNLOCK(&so->so_rcv); + so_rdknl_unlock(so); } /*ARGSUSED*/ @@ -3327,11 +3558,13 @@ filt_soread(struct knote *kn, long hint) struct socket *so; so = kn->kn_fp->f_data; - if (so->so_options & SO_ACCEPTCONN) { - kn->kn_data = so->so_qlen; - return (!TAILQ_EMPTY(&so->so_comp)); + if (SOLISTENING(so)) { + SOCK_LOCK_ASSERT(so); + kn->kn_data = so->sol_qlen; + return (!TAILQ_EMPTY(&so->sol_comp)); } + SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; @@ -3357,11 +3590,11 @@ filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; - SOCKBUF_LOCK(&so->so_snd); - knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); - if (knlist_empty(&so->so_snd.sb_sel.si_note)) + so_wrknl_lock(so); + knlist_remove(&so->so_wrsel.si_note, kn, 1); + if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; - SOCKBUF_UNLOCK(&so->so_snd); + so_wrknl_unlock(so); } /*ARGSUSED*/ @@ -3371,6 +3604,10 @@ filt_sowrite(struct knote *kn, long hint) struct socket *so; so = kn->kn_fp->f_data; + + if (SOLISTENING(so)) + return (0); + SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); @@ -3397,6 +3634,10 @@ filt_soempty(struct knote *kn, long hint) struct socket *so; so = kn->kn_fp->f_data; + + if (SOLISTENING(so)) + return (1); + SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbused(&so->so_snd); @@ -3465,42 +3706,52 @@ soisconnected(struct socket *so) struct socket *head; int ret; + /* + * XXXGL: this is the only place where we acquire socket locks + * in reverse order: first child, then listening socket. To + * avoid possible LOR, use try semantics. + */ restart: - ACCEPT_LOCK(); SOCK_LOCK(so); + if ((head = so->so_listen) != NULL && + __predict_false(SOLISTEN_TRYLOCK(head) == 0)) { + SOCK_UNLOCK(so); + goto restart; + } so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; - head = so->so_head; - if (head != NULL && (so->so_qstate & SQ_INCOMP)) { + if (head != NULL && (so->so_qstate == SQ_INCOMP)) { +again: if ((so->so_options & SO_ACCEPTFILTER) == 0) { + TAILQ_REMOVE(&head->sol_incomp, so, so_list); + head->sol_incqlen--; + TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); + head->sol_qlen++; + so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - so->so_qstate &= ~SQ_INCOMP; - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - head->so_qlen++; - so->so_qstate |= SQ_COMP; - ACCEPT_UNLOCK(); - sorwakeup(head); - wakeup_one(&head->so_timeo); + solisten_wakeup(head); /* unlocks */ } else { - ACCEPT_UNLOCK(); + SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, - head->so_accf->so_accept_filter->accf_callback, - head->so_accf->so_accept_filter_arg); + head->sol_accept_filter->accf_callback, + head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; - ret = head->so_accf->so_accept_filter->accf_callback(so, - head->so_accf->so_accept_filter_arg, M_NOWAIT); - if (ret == SU_ISCONNECTED) + ret = head->sol_accept_filter->accf_callback(so, + head->sol_accept_filter_arg, M_NOWAIT); + if (ret == SU_ISCONNECTED) { soupcall_clear(so, SO_RCV); + SOCKBUF_UNLOCK(&so->so_rcv); + goto again; + } + SOCKBUF_UNLOCK(&so->so_rcv); SOCK_UNLOCK(so); - if (ret == SU_ISCONNECTED) - goto restart; + SOLISTEN_UNLOCK(head); } return; } + if (head != NULL) + SOLISTEN_UNLOCK(head); SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); @@ -3510,16 +3761,17 @@ void soisdisconnecting(struct socket *so) { - /* - * Note: This code assumes that SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) are the same. - */ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; - socantrcvmore_locked(so); - SOCKBUF_LOCK(&so->so_snd); - socantsendmore_locked(so); + + if (!SOLISTENING(so)) { + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + SOCKBUF_LOCK(&so->so_snd); + socantsendmore_locked(so); + } + SOCK_UNLOCK(so); wakeup(&so->so_timeo); } @@ -3527,17 +3779,18 @@ void soisdisconnected(struct socket *so) { - /* - * Note: This code assumes that SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) are the same. - */ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; - socantrcvmore_locked(so); - SOCKBUF_LOCK(&so->so_snd); - sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); - socantsendmore_locked(so); + + if (!SOLISTENING(so)) { + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + SOCKBUF_LOCK(&so->so_snd); + sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); + socantsendmore_locked(so); + } + SOCK_UNLOCK(so); wakeup(&so->so_timeo); } @@ -3563,6 +3816,8 @@ soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg) { struct sockbuf *sb; + KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); + switch (which) { case SO_RCV: sb = &so->so_rcv; @@ -3584,6 +3839,8 @@ soupcall_clear(struct socket *so, int which) { struct sockbuf *sb; + KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); + switch (which) { case SO_RCV: sb = &so->so_rcv; @@ -3595,12 +3852,110 @@ soupcall_clear(struct socket *so, int which) panic("soupcall_clear: bad which"); } SOCKBUF_LOCK_ASSERT(sb); - KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear")); + KASSERT(sb->sb_upcall != NULL, + ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } +void +solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) +{ + + SOLISTEN_LOCK_ASSERT(so); + so->sol_upcall = func; + so->sol_upcallarg = arg; +} + +static void +so_rdknl_lock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK(so); + else + SOCKBUF_LOCK(&so->so_rcv); +} + +static void +so_rdknl_unlock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); +} + +static void +so_rdknl_assert_locked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK_ASSERT(so); + else + SOCKBUF_LOCK_ASSERT(&so->so_rcv); +} + +static void +so_rdknl_assert_unlocked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK_ASSERT(so); + else + SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); +} + +static void +so_wrknl_lock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK(so); + else + SOCKBUF_LOCK(&so->so_snd); +} + +static void +so_wrknl_unlock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK(so); + else + SOCKBUF_UNLOCK(&so->so_snd); +} + +static void +so_wrknl_assert_locked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK_ASSERT(so); + else + SOCKBUF_LOCK_ASSERT(&so->so_snd); +} + +static void +so_wrknl_assert_unlocked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK_ASSERT(so); + else + SOCKBUF_UNLOCK_ASSERT(&so->so_snd); +} + /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to @@ -3622,32 +3977,24 @@ sotoxsocket(struct socket *so, struct xsocket *xso) xso->so_pcb = so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; - xso->so_qlen = so->so_qlen; - xso->so_incqlen = so->so_incqlen; - xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; - xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; - xso->so_oobmark = so->so_oobmark; - sbtoxsockbuf(&so->so_snd, &xso->so_snd); - sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); xso->so_uid = so->so_cred->cr_uid; -} - - -/* - * Socket accessor functions to provide external consumers with - * a safe interface to socket state - * - */ - -void -so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), - void *arg) -{ - - TAILQ_FOREACH(so, &so->so_comp, so_list) - func(so, arg); + xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; + if (SOLISTENING(so)) { + xso->so_qlen = so->sol_qlen; + xso->so_incqlen = so->sol_incqlen; + xso->so_qlimit = so->sol_qlimit; + xso->so_oobmark = 0; + bzero(&xso->so_snd, sizeof(xso->so_snd)); + bzero(&xso->so_rcv, sizeof(xso->so_rcv)); + } else { + xso->so_state |= so->so_qstate; + xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; + xso->so_oobmark = so->so_oobmark; + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + } } struct sockbuf * |