diff options
Diffstat (limited to 'sys/netinet')
-rw-r--r-- | sys/netinet/in_pcb.c | 316 |
-rw-r--r-- | sys/netinet/in_pcb.h | 66 |
-rw-r--r-- | sys/netinet/in_pcbgroup.c | 566 |
-rw-r--r-- | sys/netinet/in_rss.c | 1 |
-rw-r--r-- | sys/netinet/tcp_syncache.c | 1 |
5 files changed, 4 insertions, 946 deletions
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 9dd2aee11bf0..f1ac46b28477 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -45,7 +45,6 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" -#include "opt_pcbgroup.h" #include "opt_route.h" #include "opt_rss.h" @@ -542,9 +541,6 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, &pcbinfo->ipi_porthashmask); pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_lbgrouphashmask); -#ifdef PCBGROUP - in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); -#endif pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); @@ -567,9 +563,6 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) pcbinfo->ipi_porthashmask); hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, pcbinfo->ipi_lbgrouphashmask); -#ifdef PCBGROUP - in_pcbgroup_destroy(pcbinfo); -#endif uma_zdestroy(pcbinfo->ipi_zone); INP_LIST_LOCK_DESTROY(pcbinfo); INP_HASH_LOCK_DESTROY(pcbinfo); @@ -1522,8 +1515,7 @@ in_pcbdetach(struct inpcb *inp) * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, - * but where the inpcb lock may already held, or when acquiring a reference - * via a pcbgroup. + * but where the inpcb lock may already held. 
* * in_pcbref() should be used only to provide brief memory stability, and * must always be followed by a call to INP_WLOCK() and in_pcbrele() to @@ -1783,9 +1775,6 @@ in_pcbdrop(struct inpcb *inp) } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; -#ifdef PCBGROUP - in_pcbgroup_remove(inp); -#endif } } @@ -2097,241 +2086,6 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, return (local_wild); } -#ifdef PCBGROUP -/* - * Lookup PCB in hash list, using pcbgroup tables. - */ -static struct inpcb * -in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, - struct in_addr faddr, u_int fport_arg, struct in_addr laddr, - u_int lport_arg, int lookupflags, struct ifnet *ifp) -{ - struct inpcbhead *head; - struct inpcb *inp, *tmpinp; - u_short fport = fport_arg, lport = lport_arg; - bool locked; - - /* - * First look for an exact match. - */ - tmpinp = NULL; - INP_GROUP_LOCK(pcbgroup); - head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, - pcbgroup->ipg_hashmask)]; - CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { -#ifdef INET6 - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV4) == 0) - continue; -#endif - if (inp->inp_faddr.s_addr == faddr.s_addr && - inp->inp_laddr.s_addr == laddr.s_addr && - inp->inp_fport == fport && - inp->inp_lport == lport) { - /* - * XXX We should be able to directly return - * the inp here, without any checks. - * Well unless both bound with SO_REUSEPORT? - */ - if (prison_flag(inp->inp_cred, PR_IP4)) - goto found; - if (tmpinp == NULL) - tmpinp = inp; - } - } - if (tmpinp != NULL) { - inp = tmpinp; - goto found; - } - -#ifdef RSS - /* - * For incoming connections, we may wish to do a wildcard - * match for an RSS-local socket. 
- */ - if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { - struct inpcb *local_wild = NULL, *local_exact = NULL; -#ifdef INET6 - struct inpcb *local_wild_mapped = NULL; -#endif - struct inpcb *jail_wild = NULL; - struct inpcbhead *head; - int injail; - - /* - * Order of socket selection - we always prefer jails. - * 1. jailed, non-wild. - * 2. jailed, wild. - * 3. non-jailed, non-wild. - * 4. non-jailed, wild. - */ - - head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, - lport, 0, pcbgroup->ipg_hashmask)]; - CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { -#ifdef INET6 - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV4) == 0) - continue; -#endif - if (inp->inp_faddr.s_addr != INADDR_ANY || - inp->inp_lport != lport) - continue; - - injail = prison_flag(inp->inp_cred, PR_IP4); - if (injail) { - if (prison_check_ip4(inp->inp_cred, - &laddr) != 0) - continue; - } else { - if (local_exact != NULL) - continue; - } - - if (inp->inp_laddr.s_addr == laddr.s_addr) { - if (injail) - goto found; - else - local_exact = inp; - } else if (inp->inp_laddr.s_addr == INADDR_ANY) { -#ifdef INET6 - /* XXX inp locking, NULL check */ - if (inp->inp_vflag & INP_IPV6PROTO) - local_wild_mapped = inp; - else -#endif - if (injail) - jail_wild = inp; - else - local_wild = inp; - } - } /* LIST_FOREACH */ - - inp = jail_wild; - if (inp == NULL) - inp = local_exact; - if (inp == NULL) - inp = local_wild; -#ifdef INET6 - if (inp == NULL) - inp = local_wild_mapped; -#endif - if (inp != NULL) - goto found; - } -#endif - - /* - * Then look for a wildcard match, if requested. - */ - if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { - struct inpcb *local_wild = NULL, *local_exact = NULL; -#ifdef INET6 - struct inpcb *local_wild_mapped = NULL; -#endif - struct inpcb *jail_wild = NULL; - struct inpcbhead *head; - int injail; - - /* - * Order of socket selection - we always prefer jails. - * 1. jailed, non-wild. - * 2. jailed, wild. - * 3. non-jailed, non-wild. - * 4. non-jailed, wild. 
- */ - head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, - 0, pcbinfo->ipi_wildmask)]; - CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) { -#ifdef INET6 - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV4) == 0) - continue; -#endif - if (inp->inp_faddr.s_addr != INADDR_ANY || - inp->inp_lport != lport) - continue; - - injail = prison_flag(inp->inp_cred, PR_IP4); - if (injail) { - if (prison_check_ip4(inp->inp_cred, - &laddr) != 0) - continue; - } else { - if (local_exact != NULL) - continue; - } - - if (inp->inp_laddr.s_addr == laddr.s_addr) { - if (injail) - goto found; - else - local_exact = inp; - } else if (inp->inp_laddr.s_addr == INADDR_ANY) { -#ifdef INET6 - /* XXX inp locking, NULL check */ - if (inp->inp_vflag & INP_IPV6PROTO) - local_wild_mapped = inp; - else -#endif - if (injail) - jail_wild = inp; - else - local_wild = inp; - } - } /* LIST_FOREACH */ - inp = jail_wild; - if (inp == NULL) - inp = local_exact; - if (inp == NULL) - inp = local_wild; -#ifdef INET6 - if (inp == NULL) - inp = local_wild_mapped; -#endif - if (inp != NULL) - goto found; - } /* if (lookupflags & INPLOOKUP_WILDCARD) */ - INP_GROUP_UNLOCK(pcbgroup); - return (NULL); - -found: - if (lookupflags & INPLOOKUP_WLOCKPCB) - locked = INP_TRY_WLOCK(inp); - else if (lookupflags & INPLOOKUP_RLOCKPCB) - locked = INP_TRY_RLOCK(inp); - else - panic("%s: locking bug", __func__); - if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) { - if (lookupflags & INPLOOKUP_WLOCKPCB) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); - return (NULL); - } else if (!locked) - in_pcbref(inp); - INP_GROUP_UNLOCK(pcbgroup); - if (!locked) { - if (lookupflags & INPLOOKUP_WLOCKPCB) { - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) - return (NULL); - } else { - INP_RLOCK(inp); - if (in_pcbrele_rlocked(inp)) - return (NULL); - } - } -#ifdef INVARIANTS - if (lookupflags & INPLOOKUP_WLOCKPCB) - INP_WLOCK_ASSERT(inp); - else - INP_RLOCK_ASSERT(inp); -#endif - return (inp); -} -#endif /* PCBGROUP 
*/ - /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes * that the caller has locked the hash list, and will not perform any further @@ -2497,40 +2251,17 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. - * - * Possibly more of this logic should be in in_pcbgroup.c. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { -#if defined(PCBGROUP) && !defined(RSS) - struct inpcbgroup *pcbgroup; -#endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); - /* - * When not using RSS, use connection groups in preference to the - * reservation table when looking up 4-tuples. When using RSS, just - * use the reservation table, due to the cost of the Toeplitz hash - * in software. - * - * XXXRW: This policy belongs in the pcbgroup code, as in principle - * we could be doing RSS with a non-Toeplitz hash that is affordable - * in software. 
- */ -#if defined(PCBGROUP) && !defined(RSS) - if (in_pcbgroup_enabled(pcbinfo)) { - pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, - fport); - return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, - laddr, lport, lookupflags, ifp)); - } -#endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp, M_NODOM)); } @@ -2540,39 +2271,12 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { -#ifdef PCBGROUP - struct inpcbgroup *pcbgroup; -#endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); -#ifdef PCBGROUP - /* - * If we can use a hardware-generated hash to look up the connection - * group, use that connection group to find the inpcb. Otherwise - * fall back on a software hash -- or the reservation table if we're - * using RSS. - * - * XXXRW: As above, that policy belongs in the pcbgroup code. 
- */ - if (in_pcbgroup_enabled(pcbinfo) && - !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { - pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), - m->m_pkthdr.flowid); - if (pcbgroup != NULL) - return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, - fport, laddr, lport, lookupflags, ifp)); -#ifndef RSS - pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, - fport); - return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, - laddr, lport, lookupflags, ifp)); -#endif - } -#endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp, m->m_pkthdr.numa_domain)); } @@ -2647,13 +2351,7 @@ in_pcbinshash_internal(struct inpcb *inp, struct mbuf *m) CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; -#ifdef PCBGROUP - if (m != NULL) { - in_pcbgroup_update_mbuf(inp, m); - } else { - in_pcbgroup_update(inp); - } -#endif + return (0); } @@ -2702,13 +2400,6 @@ in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) CK_LIST_REMOVE(inp, inp_hash); CK_LIST_INSERT_HEAD(head, inp, inp_hash); - -#ifdef PCBGROUP - if (m != NULL) - in_pcbgroup_update_mbuf(inp, m); - else - in_pcbgroup_update(inp); -#endif } void @@ -2749,9 +2440,6 @@ in_pcbremlists(struct inpcb *inp) } CK_LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; -#ifdef PCBGROUP - in_pcbgroup_remove(inp); -#endif } /* diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index d6a335236599..813c87559de3 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -156,7 +156,6 @@ struct in_conninfo { * (b) - Protected by the hpts lock. 
* (c) - Constant after initialization * (e) - Protected by the net_epoch_prempt epoch - * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb * (l) - Protected by the pcblist lock for the inpcb @@ -231,7 +230,6 @@ struct m_snd_tag; struct inpcb { /* Cache line #1 (amd64) */ CK_LIST_ENTRY(inpcb) inp_hash; /* [w](h/i) [r](e/i) hash list */ - CK_LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ struct rwlock inp_lock; /* Cache line #2 (amd64) */ #define inp_start_zero inp_hpts @@ -276,8 +274,6 @@ struct inpcb { uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */ TAILQ_ENTRY(inpcb) inp_input; /* pacing in queue next lock(b) */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ - struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ - CK_LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ u_char inp_vflag; /* (i) IP version flag (v4/v6) */ @@ -423,7 +419,6 @@ struct inpcbport { * ipi_lock (before) * inpcb locks (before) * ipi_list locks (before) - * {ipi_hash_lock, pcbgroup locks} * * Locking key: * @@ -432,7 +427,6 @@ struct inpcbport { * (g) Locked by ipi_lock * (l) Locked by ipi_list_lock * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock - * (p) Protected by one or more pcbgroup locks * (x) Synchronisation properties poorly defined */ struct inpcbinfo { @@ -466,16 +460,7 @@ struct inpcbinfo { struct uma_zone *ipi_zone; /* (c) */ /* - * Connection groups associated with this protocol. These fields are - * constant, but pcbgroup structures themselves are protected by - * per-pcbgroup locks. 
- */ - struct inpcbgroup *ipi_pcbgroups; /* (c) */ - u_int ipi_npcbgroups; /* (c) */ - u_int ipi_hashfields; /* (c) */ - - /* - * Global lock protecting modification non-pcbgroup hash lookup tables. + * Global lock protecting modification hash lookup tables. */ struct mtx ipi_hash_lock; @@ -493,14 +478,6 @@ struct inpcbinfo { u_long ipi_porthashmask; /* (h) */ /* - * List of wildcard inpcbs for use with pcbgroups. In the past, was - * per-pcbgroup but is now global. All pcbgroup locks must be held - * to modify the list, so any is sufficient to read it. - */ - struct inpcbhead *ipi_wildbase; /* (p) */ - u_long ipi_wildmask; /* (p) */ - - /* * Load balance groups used for the SO_REUSEPORT_LB option, * hashed by local port. */ @@ -525,31 +502,6 @@ struct inpcbinfo { #ifdef _KERNEL /* - * Connection groups hold sets of connections that have similar CPU/thread - * affinity. Each connection belongs to exactly one connection group. - */ -struct inpcbgroup { - /* - * Per-connection group hash of inpcbs, hashed by local and foreign - * addresses and port numbers. - */ - struct inpcbhead *ipg_hashbase; /* (c) */ - u_long ipg_hashmask; /* (c) */ - - /* - * Notional affinity of this pcbgroup. - */ - u_int ipg_cpu; /* (p) */ - - /* - * Per-connection group lock, not to be confused with ipi_lock. - * Protects the hash table hung off the group, but also the global - * wildcard list in inpcbinfo. - */ - struct mtx ipg_lock; -} __aligned(CACHE_LINE_SIZE); - -/* * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group * (or unique address:port combination) can be re-used at most * INPCBLBGROUP_SIZMAX (256) times. 
The inpcbs are stored in il_inp which @@ -728,7 +680,7 @@ int inp_so_options(const struct inpcb *inp); */ #define INP_MBUF_L_ACKS 0x00000001 /* We need large mbufs for ack compression */ #define INP_MBUF_ACKCMP 0x00000002 /* TCP mbuf ack compression ok */ -#define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */ +/* 0x00000004 */ #define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ #define INP_FREED 0x00000010 /* inp itself is not valid */ #define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */ @@ -809,20 +761,6 @@ void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *, int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi); -struct inpcbgroup * - in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t); -struct inpcbgroup * - in_pcbgroup_byinpcb(struct inpcb *); -struct inpcbgroup * - in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short, - struct in_addr, u_short); -void in_pcbgroup_destroy(struct inpcbinfo *); -int in_pcbgroup_enabled(struct inpcbinfo *); -void in_pcbgroup_init(struct inpcbinfo *, u_int, int); -void in_pcbgroup_remove(struct inpcb *); -void in_pcbgroup_update(struct inpcb *); -void in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *); - void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *); diff --git a/sys/netinet/in_pcbgroup.c b/sys/netinet/in_pcbgroup.c deleted file mode 100644 index 11ed75be1198..000000000000 --- a/sys/netinet/in_pcbgroup.c +++ /dev/null @@ -1,566 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2010-2011 Juniper Networks, Inc. - * All rights reserved. - * - * This software was developed by Robert N. M. Watson under contract - * to Juniper Networks, Inc. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> - -__FBSDID("$FreeBSD$"); - -#include "opt_inet6.h" -#include "opt_rss.h" - -#include <sys/param.h> -#include <sys/lock.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/mutex.h> -#include <sys/smp.h> -#include <sys/socket.h> -#include <sys/socketvar.h> - -#include <net/rss_config.h> - -#include <netinet/in.h> - -#include <netinet/in_pcb.h> -#include <netinet/in_rss.h> -#ifdef INET6 -#include <netinet6/in6_pcb.h> -#endif /* INET6 */ - -/* - * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's - * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization - * Strategies in Modern Operating Systems". 
This implementation differs - * significantly from that described in the paper, in that it attempts to - * introduce not just notions of affinity for connections and distribute work - * so as to reduce lock contention, but also align those notions with - * hardware work distribution strategies such as RSS. In this construction, - * connection groups supplement, rather than replace, existing reservation - * tables for protocol 4-tuples, offering CPU-affine lookup tables with - * minimal cache line migration and lock contention during steady state - * operation. - * - * Hardware-offloaded checksums are often inefficient in software -- for - * example, Toeplitz, specified by RSS, introduced a significant overhead if - * performed during per-packge processing. It is therefore desirable to fall - * back on traditional reservation table lookups without affinity where - * hardware-offloaded checksums aren't available, such as for traffic over - * non-RSS interfaces. - * - * Internet protocols, such as UDP and TCP, register to use connection groups - * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this - * indicates to the connection group code whether a 2-tuple or 4-tuple is - * used as an argument to hashes that assign a connection to a particular - * group. This must be aligned with any hardware offloaded distribution - * model, such as RSS or similar approaches taken in embedded network boards. - * Wildcard sockets require special handling, as in Willman 2006, and are - * shared between connection groups -- while being protected by group-local - * locks. This means that connection establishment and teardown can be - * signficantly more expensive than without connection groups, but that - * steady-state processing can be significantly faster. - * - * When RSS is used, certain connection group parameters, such as the number - * of groups, are provided by the RSS implementation, found in in_rss.c. 
- * Otherwise, in_pcbgroup.c selects possible sensible parameters - * corresponding to the degree of parallelism exposed by netisr. - * - * Most of the implementation of connection groups is in this file; however, - * connection group lookup is implemented in in_pcb.c alongside reservation - * table lookups -- see in_pcblookup_group(). - * - * TODO: - * - * Implement dynamic rebalancing of buckets with connection groups; when - * load is unevenly distributed, search for more optimal balancing on - * demand. This might require scaling up the number of connection groups - * by <<1. - * - * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection - * groups for ip_input and ip6_input, allowing non-offloaded work - * distribution. - * - * Expose effective CPU affinity of connections to userspace using socket - * options. - * - * Investigate per-connection affinity overrides based on socket options; an - * option could be set, certainly resulting in work being distributed - * differently in software, and possibly propagated to supporting hardware - * with TCAMs or hardware hash tables. This might require connections to - * exist in more than one connection group at a time. - * - * Hook netisr thread reconfiguration events, and propagate those to RSS so - * that rebalancing can occur when the thread pool grows or shrinks. - * - * Expose per-pcbgroup statistics to userspace monitoring tools such as - * netstat, in order to allow better debugging and profiling. - */ - -void -in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields, - int hash_nelements) -{ - struct inpcbgroup *pcbgroup; - u_int numpcbgroups, pgn; - - /* - * Only enable connection groups for a protocol if it has been - * specifically requested. - */ - if (hashfields == IPI_HASHFIELDS_NONE) - return; - - /* - * Connection groups are about multi-processor load distribution, - * lock contention, and connection CPU affinity. 
As such, no point - * in turning them on for a uniprocessor machine, it only wastes - * memory. - */ - if (mp_ncpus == 1) - return; - -#ifdef RSS - /* - * If we're using RSS, then RSS determines the number of connection - * groups to use: one connection group per RSS bucket. If for some - * reason RSS isn't able to provide a number of buckets, disable - * connection groups entirely. - * - * XXXRW: Can this ever happen? - */ - numpcbgroups = rss_getnumbuckets(); - if (numpcbgroups == 0) - return; -#else - /* - * Otherwise, we'll just use one per CPU for now. If we decide to - * do dynamic rebalancing a la RSS, we'll need similar logic here. - */ - numpcbgroups = mp_ncpus; -#endif - - pcbinfo->ipi_hashfields = hashfields; - pcbinfo->ipi_pcbgroups = malloc(numpcbgroups * - sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO); - pcbinfo->ipi_npcbgroups = numpcbgroups; - pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB, - &pcbinfo->ipi_wildmask); - for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { - pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; - pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB, - &pcbgroup->ipg_hashmask); - INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup"); - - /* - * Initialise notional affinity of the pcbgroup -- for RSS, - * we want the same notion of affinity as NICs to be used. In - * the non-RSS case, just round robin for the time being. - * - * XXXRW: The notion of a bucket to CPU mapping is common at - * both pcbgroup and RSS layers -- does that mean that we - * should migrate it all from RSS to here, and just leave RSS - * responsible only for providing hashing and mapping functions? 
- */ -#ifdef RSS - pcbgroup->ipg_cpu = rss_getcpu(pgn); -#else - pcbgroup->ipg_cpu = (pgn % mp_ncpus); -#endif - } -} - -void -in_pcbgroup_destroy(struct inpcbinfo *pcbinfo) -{ - struct inpcbgroup *pcbgroup; - u_int pgn; - - if (pcbinfo->ipi_npcbgroups == 0) - return; - - for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { - pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; - KASSERT(CK_LIST_EMPTY(pcbinfo->ipi_listhead), - ("in_pcbinfo_destroy: listhead not empty")); - INP_GROUP_LOCK_DESTROY(pcbgroup); - hashdestroy(pcbgroup->ipg_hashbase, M_PCB, - pcbgroup->ipg_hashmask); - } - hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask); - free(pcbinfo->ipi_pcbgroups, M_PCB); - pcbinfo->ipi_pcbgroups = NULL; - pcbinfo->ipi_npcbgroups = 0; - pcbinfo->ipi_hashfields = 0; -} - -/* - * Given a hash of whatever the covered tuple might be, return a pcbgroup - * index. Where RSS is supported, try to align bucket selection with RSS CPU - * affinity strategy. - */ -static __inline u_int -in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash) -{ - -#ifdef RSS - return (rss_getbucket(hash)); -#else - return (hash % pcbinfo->ipi_npcbgroups); -#endif -} - -/* - * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash - * information is insufficient to identify the pcbgroup. This might occur if - * a TCP packet turns up with a 2-tuple hash, or if an RSS hash is present but - * RSS is not compiled into the kernel. 
- */ -struct inpcbgroup * -in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash) -{ - -#ifdef RSS - if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE && - hashtype == M_HASHTYPE_RSS_TCP_IPV4) || - (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE && - hashtype == M_HASHTYPE_RSS_UDP_IPV4) || - (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE && - hashtype == M_HASHTYPE_RSS_IPV4)) - return (&pcbinfo->ipi_pcbgroups[ - in_pcbgroup_getbucket(pcbinfo, hash)]); -#endif - return (NULL); -} - -static struct inpcbgroup * -in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m) -{ - - return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), - m->m_pkthdr.flowid)); -} - -struct inpcbgroup * -in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr, - u_short lport, struct in_addr faddr, u_short fport) -{ - uint32_t hash; - - /* - * RSS note: we pass foreign addr/port as source, and local addr/port - * as destination, as we want to align with what the hardware is - * doing. - */ - switch (pcbinfo->ipi_hashfields) { - case IPI_HASHFIELDS_4TUPLE: -#ifdef RSS - hash = rss_hash_ip4_4tuple(faddr, fport, laddr, lport); -#else - hash = faddr.s_addr ^ fport; -#endif - break; - - case IPI_HASHFIELDS_2TUPLE: -#ifdef RSS - hash = rss_hash_ip4_2tuple(faddr, laddr); -#else - hash = faddr.s_addr ^ laddr.s_addr; -#endif - break; - - default: - hash = 0; - } - return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo, - hash)]); -} - -struct inpcbgroup * -in_pcbgroup_byinpcb(struct inpcb *inp) -{ -#ifdef RSS - /* - * Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined - * RSS bucket and thus we should use this pcbgroup, rather than - * using a tuple or hash. - * - * XXX should verify that there's actually pcbgroups and inp_rss_listen_bucket - * fits in that! 
- */ - if (inp->inp_flags2 & INP_RSS_BUCKET_SET) - return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]); -#endif - - return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr, - inp->inp_lport, inp->inp_faddr, inp->inp_fport)); -} - -static void -in_pcbwild_add(struct inpcb *inp) -{ - struct inpcbinfo *pcbinfo; - struct inpcbhead *head; - u_int pgn; - - INP_WLOCK_ASSERT(inp); - KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD), - ("%s: is wild",__func__)); - - pcbinfo = inp->inp_pcbinfo; - for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) - INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); - head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport, - 0, pcbinfo->ipi_wildmask)]; - CK_LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild); - inp->inp_flags2 |= INP_PCBGROUPWILD; - for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) - INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]); -} - -static void -in_pcbwild_remove(struct inpcb *inp) -{ - struct inpcbinfo *pcbinfo; - u_int pgn; - - INP_WLOCK_ASSERT(inp); - KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD), - ("%s: not wild", __func__)); - - pcbinfo = inp->inp_pcbinfo; - for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) - INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); - CK_LIST_REMOVE(inp, inp_pcbgroup_wild); - for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) - INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]); - inp->inp_flags2 &= ~INP_PCBGROUPWILD; -} - -static __inline int -in_pcbwild_needed(struct inpcb *inp) -{ -#ifdef RSS - /* - * If it's a listen socket and INP_RSS_BUCKET_SET is set, - * it's a wildcard socket _but_ it's in a specific pcbgroup. - * Thus we don't treat it as a pcbwild inp. 
- */ - if (inp->inp_flags2 & INP_RSS_BUCKET_SET) - return (0); -#endif - -#ifdef INET6 - if (inp->inp_vflag & INP_IPV6) - return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)); - else -#endif - return (inp->inp_faddr.s_addr == htonl(INADDR_ANY)); -} - -static void -in_pcbwild_update_internal(struct inpcb *inp) -{ - int wildcard_needed; - - wildcard_needed = in_pcbwild_needed(inp); - if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD)) - in_pcbwild_add(inp); - else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD)) - in_pcbwild_remove(inp); -} - -/* - * Update the pcbgroup of an inpcb, which might include removing an old - * pcbgroup reference and/or adding a new one. Wildcard processing is not - * performed here, although ideally we'll never install a pcbgroup for a - * wildcard inpcb (asserted below). - */ -static void -in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo, - struct inpcbgroup *newpcbgroup, struct inpcb *inp) -{ - struct inpcbgroup *oldpcbgroup; - struct inpcbhead *pcbhash; - uint32_t hashkey_faddr; - - INP_WLOCK_ASSERT(inp); - - oldpcbgroup = inp->inp_pcbgroup; - if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) { - INP_GROUP_LOCK(oldpcbgroup); - CK_LIST_REMOVE(inp, inp_pcbgrouphash); - inp->inp_pcbgroup = NULL; - INP_GROUP_UNLOCK(oldpcbgroup); - } - if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) { -#ifdef INET6 - if (inp->inp_vflag & INP_IPV6) - hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); - else -#endif - hashkey_faddr = inp->inp_faddr.s_addr; - INP_GROUP_LOCK(newpcbgroup); - /* - * If the inp is an RSS bucket wildcard entry, ensure - * that the PCB hash is calculated correctly. - * - * The wildcard hash calculation differs from the - * non-wildcard definition. The source address is - * INADDR_ANY and the far port is 0. 
- */ - if (inp->inp_flags2 & INP_RSS_BUCKET_SET) { - pcbhash = &newpcbgroup->ipg_hashbase[ - INP_PCBHASH(INADDR_ANY, inp->inp_lport, 0, - newpcbgroup->ipg_hashmask)]; - } else { - pcbhash = &newpcbgroup->ipg_hashbase[ - INP_PCBHASH(hashkey_faddr, inp->inp_lport, - inp->inp_fport, - newpcbgroup->ipg_hashmask)]; - } - CK_LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash); - inp->inp_pcbgroup = newpcbgroup; - INP_GROUP_UNLOCK(newpcbgroup); - } - - KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)), - ("%s: pcbgroup and wildcard!", __func__)); -} - -/* - * Two update paths: one in which the 4-tuple on an inpcb has been updated - * and therefore connection groups may need to change (or a wildcard entry - * may needed to be installed), and another in which the 4-tuple has been - * set as a result of a packet received, in which case we may be able to use - * the hash on the mbuf to avoid doing a software hash calculation for RSS. - * - * In each case: first, let the wildcard code have a go at placing it as a - * wildcard socket. If it was a wildcard, or if the connection has been - * dropped, then no pcbgroup is required (so potentially clear it); - * otherwise, calculate and update the pcbgroup for the inpcb. 
- */ -void -in_pcbgroup_update(struct inpcb *inp) -{ - struct inpcbinfo *pcbinfo; - struct inpcbgroup *newpcbgroup; - - INP_WLOCK_ASSERT(inp); - - pcbinfo = inp->inp_pcbinfo; - if (!in_pcbgroup_enabled(pcbinfo)) - return; - - in_pcbwild_update_internal(inp); - if (!(inp->inp_flags2 & INP_PCBGROUPWILD) && - !(inp->inp_flags & INP_DROPPED)) { -#ifdef INET6 - if (inp->inp_vflag & INP_IPV6) - newpcbgroup = in6_pcbgroup_byinpcb(inp); - else -#endif - newpcbgroup = in_pcbgroup_byinpcb(inp); - } else - newpcbgroup = NULL; - in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); -} - -void -in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m) -{ - struct inpcbinfo *pcbinfo; - struct inpcbgroup *newpcbgroup; - - INP_WLOCK_ASSERT(inp); - - pcbinfo = inp->inp_pcbinfo; - if (!in_pcbgroup_enabled(pcbinfo)) - return; - - /* - * Possibly should assert !INP_PCBGROUPWILD rather than testing for - * it; presumably this function should never be called for anything - * other than non-wildcard socket? - */ - in_pcbwild_update_internal(inp); - if (!(inp->inp_flags2 & INP_PCBGROUPWILD) && - !(inp->inp_flags & INP_DROPPED)) { - newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m); -#ifdef INET6 - if (inp->inp_vflag & INP_IPV6) { - if (newpcbgroup == NULL) - newpcbgroup = in6_pcbgroup_byinpcb(inp); - } else { -#endif - if (newpcbgroup == NULL) - newpcbgroup = in_pcbgroup_byinpcb(inp); -#ifdef INET6 - } -#endif - } else - newpcbgroup = NULL; - in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); -} - -/* - * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb. 
- */ -void -in_pcbgroup_remove(struct inpcb *inp) -{ - struct inpcbgroup *pcbgroup; - - INP_WLOCK_ASSERT(inp); - - if (!in_pcbgroup_enabled(inp->inp_pcbinfo)) - return; - - if (inp->inp_flags2 & INP_PCBGROUPWILD) - in_pcbwild_remove(inp); - - pcbgroup = inp->inp_pcbgroup; - if (pcbgroup != NULL) { - INP_GROUP_LOCK(pcbgroup); - CK_LIST_REMOVE(inp, inp_pcbgrouphash); - inp->inp_pcbgroup = NULL; - INP_GROUP_UNLOCK(pcbgroup); - } -} - -/* - * Query whether or not it is appropriate to use pcbgroups to look up inpcbs - * for a protocol. - */ -int -in_pcbgroup_enabled(struct inpcbinfo *pcbinfo) -{ - - return (pcbinfo->ipi_npcbgroups > 0); -} diff --git a/sys/netinet/in_rss.c b/sys/netinet/in_rss.c index 76438a330bb5..5dffbc5b5fd0 100644 --- a/sys/netinet/in_rss.c +++ b/sys/netinet/in_rss.c @@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$"); #include "opt_inet6.h" -#include "opt_pcbgroup.h" #include <sys/param.h> #include <sys/mbuf.h> diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 4be888f22a37..39ec65df7426 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -38,7 +38,6 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" -#include "opt_pcbgroup.h" #include <sys/param.h> #include <sys/systm.h> |