about summary refs log tree commit diff
path: root/sys/netinet
diff options
context:
space:
mode:
Diffstat (limited to 'sys/netinet')
-rw-r--r-- sys/netinet/in_pcb.c 316
-rw-r--r-- sys/netinet/in_pcb.h 66
-rw-r--r-- sys/netinet/in_pcbgroup.c 566
-rw-r--r-- sys/netinet/in_rss.c 1
-rw-r--r-- sys/netinet/tcp_syncache.c 1
5 files changed, 4 insertions, 946 deletions
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 9dd2aee11bf0..f1ac46b28477 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -45,7 +45,6 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ratelimit.h"
-#include "opt_pcbgroup.h"
#include "opt_route.h"
#include "opt_rss.h"
@@ -542,9 +541,6 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
&pcbinfo->ipi_porthashmask);
pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_lbgrouphashmask);
-#ifdef PCBGROUP
- in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
-#endif
pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0);
uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
@@ -567,9 +563,6 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
pcbinfo->ipi_porthashmask);
hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
pcbinfo->ipi_lbgrouphashmask);
-#ifdef PCBGROUP
- in_pcbgroup_destroy(pcbinfo);
-#endif
uma_zdestroy(pcbinfo->ipi_zone);
INP_LIST_LOCK_DESTROY(pcbinfo);
INP_HASH_LOCK_DESTROY(pcbinfo);
@@ -1522,8 +1515,7 @@ in_pcbdetach(struct inpcb *inp)
* in_pcbref() bumps the reference count on an inpcb in order to maintain
* stability of an inpcb pointer despite the inpcb lock being released. This
* is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
- * but where the inpcb lock may already held, or when acquiring a reference
- * via a pcbgroup.
+ * but where the inpcb lock may already be held.
*
* in_pcbref() should be used only to provide brief memory stability, and
* must always be followed by a call to INP_WLOCK() and in_pcbrele() to
@@ -1783,9 +1775,6 @@ in_pcbdrop(struct inpcb *inp)
}
INP_HASH_WUNLOCK(inp->inp_pcbinfo);
inp->inp_flags &= ~INP_INHASHLIST;
-#ifdef PCBGROUP
- in_pcbgroup_remove(inp);
-#endif
}
}
@@ -2097,241 +2086,6 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
return (local_wild);
}
-#ifdef PCBGROUP
-/*
- * Lookup PCB in hash list, using pcbgroup tables.
- */
-static struct inpcb *
-in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
- struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
- u_int lport_arg, int lookupflags, struct ifnet *ifp)
-{
- struct inpcbhead *head;
- struct inpcb *inp, *tmpinp;
- u_short fport = fport_arg, lport = lport_arg;
- bool locked;
-
- /*
- * First look for an exact match.
- */
- tmpinp = NULL;
- INP_GROUP_LOCK(pcbgroup);
- head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
- pcbgroup->ipg_hashmask)];
- CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
-#ifdef INET6
- /* XXX inp locking */
- if ((inp->inp_vflag & INP_IPV4) == 0)
- continue;
-#endif
- if (inp->inp_faddr.s_addr == faddr.s_addr &&
- inp->inp_laddr.s_addr == laddr.s_addr &&
- inp->inp_fport == fport &&
- inp->inp_lport == lport) {
- /*
- * XXX We should be able to directly return
- * the inp here, without any checks.
- * Well unless both bound with SO_REUSEPORT?
- */
- if (prison_flag(inp->inp_cred, PR_IP4))
- goto found;
- if (tmpinp == NULL)
- tmpinp = inp;
- }
- }
- if (tmpinp != NULL) {
- inp = tmpinp;
- goto found;
- }
-
-#ifdef RSS
- /*
- * For incoming connections, we may wish to do a wildcard
- * match for an RSS-local socket.
- */
- if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
- struct inpcb *local_wild = NULL, *local_exact = NULL;
-#ifdef INET6
- struct inpcb *local_wild_mapped = NULL;
-#endif
- struct inpcb *jail_wild = NULL;
- struct inpcbhead *head;
- int injail;
-
- /*
- * Order of socket selection - we always prefer jails.
- * 1. jailed, non-wild.
- * 2. jailed, wild.
- * 3. non-jailed, non-wild.
- * 4. non-jailed, wild.
- */
-
- head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
- lport, 0, pcbgroup->ipg_hashmask)];
- CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
-#ifdef INET6
- /* XXX inp locking */
- if ((inp->inp_vflag & INP_IPV4) == 0)
- continue;
-#endif
- if (inp->inp_faddr.s_addr != INADDR_ANY ||
- inp->inp_lport != lport)
- continue;
-
- injail = prison_flag(inp->inp_cred, PR_IP4);
- if (injail) {
- if (prison_check_ip4(inp->inp_cred,
- &laddr) != 0)
- continue;
- } else {
- if (local_exact != NULL)
- continue;
- }
-
- if (inp->inp_laddr.s_addr == laddr.s_addr) {
- if (injail)
- goto found;
- else
- local_exact = inp;
- } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
-#ifdef INET6
- /* XXX inp locking, NULL check */
- if (inp->inp_vflag & INP_IPV6PROTO)
- local_wild_mapped = inp;
- else
-#endif
- if (injail)
- jail_wild = inp;
- else
- local_wild = inp;
- }
- } /* LIST_FOREACH */
-
- inp = jail_wild;
- if (inp == NULL)
- inp = local_exact;
- if (inp == NULL)
- inp = local_wild;
-#ifdef INET6
- if (inp == NULL)
- inp = local_wild_mapped;
-#endif
- if (inp != NULL)
- goto found;
- }
-#endif
-
- /*
- * Then look for a wildcard match, if requested.
- */
- if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
- struct inpcb *local_wild = NULL, *local_exact = NULL;
-#ifdef INET6
- struct inpcb *local_wild_mapped = NULL;
-#endif
- struct inpcb *jail_wild = NULL;
- struct inpcbhead *head;
- int injail;
-
- /*
- * Order of socket selection - we always prefer jails.
- * 1. jailed, non-wild.
- * 2. jailed, wild.
- * 3. non-jailed, non-wild.
- * 4. non-jailed, wild.
- */
- head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
- 0, pcbinfo->ipi_wildmask)];
- CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
-#ifdef INET6
- /* XXX inp locking */
- if ((inp->inp_vflag & INP_IPV4) == 0)
- continue;
-#endif
- if (inp->inp_faddr.s_addr != INADDR_ANY ||
- inp->inp_lport != lport)
- continue;
-
- injail = prison_flag(inp->inp_cred, PR_IP4);
- if (injail) {
- if (prison_check_ip4(inp->inp_cred,
- &laddr) != 0)
- continue;
- } else {
- if (local_exact != NULL)
- continue;
- }
-
- if (inp->inp_laddr.s_addr == laddr.s_addr) {
- if (injail)
- goto found;
- else
- local_exact = inp;
- } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
-#ifdef INET6
- /* XXX inp locking, NULL check */
- if (inp->inp_vflag & INP_IPV6PROTO)
- local_wild_mapped = inp;
- else
-#endif
- if (injail)
- jail_wild = inp;
- else
- local_wild = inp;
- }
- } /* LIST_FOREACH */
- inp = jail_wild;
- if (inp == NULL)
- inp = local_exact;
- if (inp == NULL)
- inp = local_wild;
-#ifdef INET6
- if (inp == NULL)
- inp = local_wild_mapped;
-#endif
- if (inp != NULL)
- goto found;
- } /* if (lookupflags & INPLOOKUP_WILDCARD) */
- INP_GROUP_UNLOCK(pcbgroup);
- return (NULL);
-
-found:
- if (lookupflags & INPLOOKUP_WLOCKPCB)
- locked = INP_TRY_WLOCK(inp);
- else if (lookupflags & INPLOOKUP_RLOCKPCB)
- locked = INP_TRY_RLOCK(inp);
- else
- panic("%s: locking bug", __func__);
- if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) {
- if (lookupflags & INPLOOKUP_WLOCKPCB)
- INP_WUNLOCK(inp);
- else
- INP_RUNLOCK(inp);
- return (NULL);
- } else if (!locked)
- in_pcbref(inp);
- INP_GROUP_UNLOCK(pcbgroup);
- if (!locked) {
- if (lookupflags & INPLOOKUP_WLOCKPCB) {
- INP_WLOCK(inp);
- if (in_pcbrele_wlocked(inp))
- return (NULL);
- } else {
- INP_RLOCK(inp);
- if (in_pcbrele_rlocked(inp))
- return (NULL);
- }
- }
-#ifdef INVARIANTS
- if (lookupflags & INPLOOKUP_WLOCKPCB)
- INP_WLOCK_ASSERT(inp);
- else
- INP_RLOCK_ASSERT(inp);
-#endif
- return (inp);
-}
-#endif /* PCBGROUP */
-
/*
* Lookup PCB in hash list, using pcbinfo tables. This variation assumes
* that the caller has locked the hash list, and will not perform any further
@@ -2497,40 +2251,17 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
/*
* Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
* from which a pre-calculated hash value may be extracted.
- *
- * Possibly more of this logic should be in in_pcbgroup.c.
*/
struct inpcb *
in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
{
-#if defined(PCBGROUP) && !defined(RSS)
- struct inpcbgroup *pcbgroup;
-#endif
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
("%s: invalid lookup flags %d", __func__, lookupflags));
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
("%s: LOCKPCB not set", __func__));
- /*
- * When not using RSS, use connection groups in preference to the
- * reservation table when looking up 4-tuples. When using RSS, just
- * use the reservation table, due to the cost of the Toeplitz hash
- * in software.
- *
- * XXXRW: This policy belongs in the pcbgroup code, as in principle
- * we could be doing RSS with a non-Toeplitz hash that is affordable
- * in software.
- */
-#if defined(PCBGROUP) && !defined(RSS)
- if (in_pcbgroup_enabled(pcbinfo)) {
- pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
- fport);
- return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
- laddr, lport, lookupflags, ifp));
- }
-#endif
return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
lookupflags, ifp, M_NODOM));
}
@@ -2540,39 +2271,12 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
struct ifnet *ifp, struct mbuf *m)
{
-#ifdef PCBGROUP
- struct inpcbgroup *pcbgroup;
-#endif
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
("%s: invalid lookup flags %d", __func__, lookupflags));
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
("%s: LOCKPCB not set", __func__));
-#ifdef PCBGROUP
- /*
- * If we can use a hardware-generated hash to look up the connection
- * group, use that connection group to find the inpcb. Otherwise
- * fall back on a software hash -- or the reservation table if we're
- * using RSS.
- *
- * XXXRW: As above, that policy belongs in the pcbgroup code.
- */
- if (in_pcbgroup_enabled(pcbinfo) &&
- !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
- pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
- m->m_pkthdr.flowid);
- if (pcbgroup != NULL)
- return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
- fport, laddr, lport, lookupflags, ifp));
-#ifndef RSS
- pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
- fport);
- return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
- laddr, lport, lookupflags, ifp));
-#endif
- }
-#endif
return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
lookupflags, ifp, m->m_pkthdr.numa_domain));
}
@@ -2647,13 +2351,7 @@ in_pcbinshash_internal(struct inpcb *inp, struct mbuf *m)
CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
inp->inp_flags |= INP_INHASHLIST;
-#ifdef PCBGROUP
- if (m != NULL) {
- in_pcbgroup_update_mbuf(inp, m);
- } else {
- in_pcbgroup_update(inp);
- }
-#endif
+
return (0);
}
@@ -2702,13 +2400,6 @@ in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
CK_LIST_REMOVE(inp, inp_hash);
CK_LIST_INSERT_HEAD(head, inp, inp_hash);
-
-#ifdef PCBGROUP
- if (m != NULL)
- in_pcbgroup_update_mbuf(inp, m);
- else
- in_pcbgroup_update(inp);
-#endif
}
void
@@ -2749,9 +2440,6 @@ in_pcbremlists(struct inpcb *inp)
}
CK_LIST_REMOVE(inp, inp_list);
pcbinfo->ipi_count--;
-#ifdef PCBGROUP
- in_pcbgroup_remove(inp);
-#endif
}
/*
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index d6a335236599..813c87559de3 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -156,7 +156,6 @@ struct in_conninfo {
* (b) - Protected by the hpts lock.
* (c) - Constant after initialization
* (e) - Protected by the net_epoch_prempt epoch
- * (g) - Protected by the pcbgroup lock
* (i) - Protected by the inpcb lock
* (p) - Protected by the pcbinfo lock for the inpcb
* (l) - Protected by the pcblist lock for the inpcb
@@ -231,7 +230,6 @@ struct m_snd_tag;
struct inpcb {
/* Cache line #1 (amd64) */
CK_LIST_ENTRY(inpcb) inp_hash; /* [w](h/i) [r](e/i) hash list */
- CK_LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
struct rwlock inp_lock;
/* Cache line #2 (amd64) */
#define inp_start_zero inp_hpts
@@ -276,8 +274,6 @@ struct inpcb {
uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */
TAILQ_ENTRY(inpcb) inp_input; /* pacing in queue next lock(b) */
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
- struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
- CK_LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
struct ucred *inp_cred; /* (c) cache of socket cred */
u_int32_t inp_flow; /* (i) IPv6 flow information */
u_char inp_vflag; /* (i) IP version flag (v4/v6) */
@@ -423,7 +419,6 @@ struct inpcbport {
* ipi_lock (before)
* inpcb locks (before)
* ipi_list locks (before)
- * {ipi_hash_lock, pcbgroup locks}
*
* Locking key:
*
@@ -432,7 +427,6 @@ struct inpcbport {
* (g) Locked by ipi_lock
* (l) Locked by ipi_list_lock
* (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock
- * (p) Protected by one or more pcbgroup locks
* (x) Synchronisation properties poorly defined
*/
struct inpcbinfo {
@@ -466,16 +460,7 @@ struct inpcbinfo {
struct uma_zone *ipi_zone; /* (c) */
/*
- * Connection groups associated with this protocol. These fields are
- * constant, but pcbgroup structures themselves are protected by
- * per-pcbgroup locks.
- */
- struct inpcbgroup *ipi_pcbgroups; /* (c) */
- u_int ipi_npcbgroups; /* (c) */
- u_int ipi_hashfields; /* (c) */
-
- /*
- * Global lock protecting modification non-pcbgroup hash lookup tables.
+ * Global lock protecting modification of hash lookup tables.
*/
struct mtx ipi_hash_lock;
@@ -493,14 +478,6 @@ struct inpcbinfo {
u_long ipi_porthashmask; /* (h) */
/*
- * List of wildcard inpcbs for use with pcbgroups. In the past, was
- * per-pcbgroup but is now global. All pcbgroup locks must be held
- * to modify the list, so any is sufficient to read it.
- */
- struct inpcbhead *ipi_wildbase; /* (p) */
- u_long ipi_wildmask; /* (p) */
-
- /*
* Load balance groups used for the SO_REUSEPORT_LB option,
* hashed by local port.
*/
@@ -525,31 +502,6 @@ struct inpcbinfo {
#ifdef _KERNEL
/*
- * Connection groups hold sets of connections that have similar CPU/thread
- * affinity. Each connection belongs to exactly one connection group.
- */
-struct inpcbgroup {
- /*
- * Per-connection group hash of inpcbs, hashed by local and foreign
- * addresses and port numbers.
- */
- struct inpcbhead *ipg_hashbase; /* (c) */
- u_long ipg_hashmask; /* (c) */
-
- /*
- * Notional affinity of this pcbgroup.
- */
- u_int ipg_cpu; /* (p) */
-
- /*
- * Per-connection group lock, not to be confused with ipi_lock.
- * Protects the hash table hung off the group, but also the global
- * wildcard list in inpcbinfo.
- */
- struct mtx ipg_lock;
-} __aligned(CACHE_LINE_SIZE);
-
-/*
* Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
* (or unique address:port combination) can be re-used at most
* INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which
@@ -728,7 +680,7 @@ int inp_so_options(const struct inpcb *inp);
*/
#define INP_MBUF_L_ACKS 0x00000001 /* We need large mbufs for ack compression */
#define INP_MBUF_ACKCMP 0x00000002 /* TCP mbuf ack compression ok */
-#define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */
+/* 0x00000004 unused (formerly INP_PCBGROUPWILD) */
#define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */
#define INP_FREED 0x00000010 /* inp itself is not valid */
#define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */
@@ -809,20 +761,6 @@ void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *,
int in_pcbbind_check_bindmulti(const struct inpcb *ni,
const struct inpcb *oi);
-struct inpcbgroup *
- in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t);
-struct inpcbgroup *
- in_pcbgroup_byinpcb(struct inpcb *);
-struct inpcbgroup *
- in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short,
- struct in_addr, u_short);
-void in_pcbgroup_destroy(struct inpcbinfo *);
-int in_pcbgroup_enabled(struct inpcbinfo *);
-void in_pcbgroup_init(struct inpcbinfo *, u_int, int);
-void in_pcbgroup_remove(struct inpcb *);
-void in_pcbgroup_update(struct inpcb *);
-void in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *);
-
void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
int in_pcballoc(struct socket *, struct inpcbinfo *);
int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *);
diff --git a/sys/netinet/in_pcbgroup.c b/sys/netinet/in_pcbgroup.c
deleted file mode 100644
index 11ed75be1198..000000000000
--- a/sys/netinet/in_pcbgroup.c
+++ /dev/null
@@ -1,566 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2010-2011 Juniper Networks, Inc.
- * All rights reserved.
- *
- * This software was developed by Robert N. M. Watson under contract
- * to Juniper Networks, Inc.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-
-__FBSDID("$FreeBSD$");
-
-#include "opt_inet6.h"
-#include "opt_rss.h"
-
-#include <sys/param.h>
-#include <sys/lock.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/mutex.h>
-#include <sys/smp.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-
-#include <net/rss_config.h>
-
-#include <netinet/in.h>
-
-#include <netinet/in_pcb.h>
-#include <netinet/in_rss.h>
-#ifdef INET6
-#include <netinet6/in6_pcb.h>
-#endif /* INET6 */
-
-/*
- * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's
- * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization
- * Strategies in Modern Operating Systems". This implementation differs
- * significantly from that described in the paper, in that it attempts to
- * introduce not just notions of affinity for connections and distribute work
- * so as to reduce lock contention, but also align those notions with
- * hardware work distribution strategies such as RSS. In this construction,
- * connection groups supplement, rather than replace, existing reservation
- * tables for protocol 4-tuples, offering CPU-affine lookup tables with
- * minimal cache line migration and lock contention during steady state
- * operation.
- *
- * Hardware-offloaded checksums are often inefficient in software -- for
- * example, Toeplitz, specified by RSS, introduced a significant overhead if
- * performed during per-packge processing. It is therefore desirable to fall
- * back on traditional reservation table lookups without affinity where
- * hardware-offloaded checksums aren't available, such as for traffic over
- * non-RSS interfaces.
- *
- * Internet protocols, such as UDP and TCP, register to use connection groups
- * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this
- * indicates to the connection group code whether a 2-tuple or 4-tuple is
- * used as an argument to hashes that assign a connection to a particular
- * group. This must be aligned with any hardware offloaded distribution
- * model, such as RSS or similar approaches taken in embedded network boards.
- * Wildcard sockets require special handling, as in Willman 2006, and are
- * shared between connection groups -- while being protected by group-local
- * locks. This means that connection establishment and teardown can be
- * signficantly more expensive than without connection groups, but that
- * steady-state processing can be significantly faster.
- *
- * When RSS is used, certain connection group parameters, such as the number
- * of groups, are provided by the RSS implementation, found in in_rss.c.
- * Otherwise, in_pcbgroup.c selects possible sensible parameters
- * corresponding to the degree of parallelism exposed by netisr.
- *
- * Most of the implementation of connection groups is in this file; however,
- * connection group lookup is implemented in in_pcb.c alongside reservation
- * table lookups -- see in_pcblookup_group().
- *
- * TODO:
- *
- * Implement dynamic rebalancing of buckets with connection groups; when
- * load is unevenly distributed, search for more optimal balancing on
- * demand. This might require scaling up the number of connection groups
- * by <<1.
- *
- * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection
- * groups for ip_input and ip6_input, allowing non-offloaded work
- * distribution.
- *
- * Expose effective CPU affinity of connections to userspace using socket
- * options.
- *
- * Investigate per-connection affinity overrides based on socket options; an
- * option could be set, certainly resulting in work being distributed
- * differently in software, and possibly propagated to supporting hardware
- * with TCAMs or hardware hash tables. This might require connections to
- * exist in more than one connection group at a time.
- *
- * Hook netisr thread reconfiguration events, and propagate those to RSS so
- * that rebalancing can occur when the thread pool grows or shrinks.
- *
- * Expose per-pcbgroup statistics to userspace monitoring tools such as
- * netstat, in order to allow better debugging and profiling.
- */
-
-void
-in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields,
- int hash_nelements)
-{
- struct inpcbgroup *pcbgroup;
- u_int numpcbgroups, pgn;
-
- /*
- * Only enable connection groups for a protocol if it has been
- * specifically requested.
- */
- if (hashfields == IPI_HASHFIELDS_NONE)
- return;
-
- /*
- * Connection groups are about multi-processor load distribution,
- * lock contention, and connection CPU affinity. As such, no point
- * in turning them on for a uniprocessor machine, it only wastes
- * memory.
- */
- if (mp_ncpus == 1)
- return;
-
-#ifdef RSS
- /*
- * If we're using RSS, then RSS determines the number of connection
- * groups to use: one connection group per RSS bucket. If for some
- * reason RSS isn't able to provide a number of buckets, disable
- * connection groups entirely.
- *
- * XXXRW: Can this ever happen?
- */
- numpcbgroups = rss_getnumbuckets();
- if (numpcbgroups == 0)
- return;
-#else
- /*
- * Otherwise, we'll just use one per CPU for now. If we decide to
- * do dynamic rebalancing a la RSS, we'll need similar logic here.
- */
- numpcbgroups = mp_ncpus;
-#endif
-
- pcbinfo->ipi_hashfields = hashfields;
- pcbinfo->ipi_pcbgroups = malloc(numpcbgroups *
- sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO);
- pcbinfo->ipi_npcbgroups = numpcbgroups;
- pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB,
- &pcbinfo->ipi_wildmask);
- for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
- pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
- pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB,
- &pcbgroup->ipg_hashmask);
- INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup");
-
- /*
- * Initialise notional affinity of the pcbgroup -- for RSS,
- * we want the same notion of affinity as NICs to be used. In
- * the non-RSS case, just round robin for the time being.
- *
- * XXXRW: The notion of a bucket to CPU mapping is common at
- * both pcbgroup and RSS layers -- does that mean that we
- * should migrate it all from RSS to here, and just leave RSS
- * responsible only for providing hashing and mapping functions?
- */
-#ifdef RSS
- pcbgroup->ipg_cpu = rss_getcpu(pgn);
-#else
- pcbgroup->ipg_cpu = (pgn % mp_ncpus);
-#endif
- }
-}
-
-void
-in_pcbgroup_destroy(struct inpcbinfo *pcbinfo)
-{
- struct inpcbgroup *pcbgroup;
- u_int pgn;
-
- if (pcbinfo->ipi_npcbgroups == 0)
- return;
-
- for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
- pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
- KASSERT(CK_LIST_EMPTY(pcbinfo->ipi_listhead),
- ("in_pcbinfo_destroy: listhead not empty"));
- INP_GROUP_LOCK_DESTROY(pcbgroup);
- hashdestroy(pcbgroup->ipg_hashbase, M_PCB,
- pcbgroup->ipg_hashmask);
- }
- hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask);
- free(pcbinfo->ipi_pcbgroups, M_PCB);
- pcbinfo->ipi_pcbgroups = NULL;
- pcbinfo->ipi_npcbgroups = 0;
- pcbinfo->ipi_hashfields = 0;
-}
-
-/*
- * Given a hash of whatever the covered tuple might be, return a pcbgroup
- * index. Where RSS is supported, try to align bucket selection with RSS CPU
- * affinity strategy.
- */
-static __inline u_int
-in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash)
-{
-
-#ifdef RSS
- return (rss_getbucket(hash));
-#else
- return (hash % pcbinfo->ipi_npcbgroups);
-#endif
-}
-
-/*
- * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash
- * information is insufficient to identify the pcbgroup. This might occur if
- * a TCP packet turns up with a 2-tuple hash, or if an RSS hash is present but
- * RSS is not compiled into the kernel.
- */
-struct inpcbgroup *
-in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash)
-{
-
-#ifdef RSS
- if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE &&
- hashtype == M_HASHTYPE_RSS_TCP_IPV4) ||
- (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE &&
- hashtype == M_HASHTYPE_RSS_UDP_IPV4) ||
- (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE &&
- hashtype == M_HASHTYPE_RSS_IPV4))
- return (&pcbinfo->ipi_pcbgroups[
- in_pcbgroup_getbucket(pcbinfo, hash)]);
-#endif
- return (NULL);
-}
-
-static struct inpcbgroup *
-in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m)
-{
-
- return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
- m->m_pkthdr.flowid));
-}
-
-struct inpcbgroup *
-in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr,
- u_short lport, struct in_addr faddr, u_short fport)
-{
- uint32_t hash;
-
- /*
- * RSS note: we pass foreign addr/port as source, and local addr/port
- * as destination, as we want to align with what the hardware is
- * doing.
- */
- switch (pcbinfo->ipi_hashfields) {
- case IPI_HASHFIELDS_4TUPLE:
-#ifdef RSS
- hash = rss_hash_ip4_4tuple(faddr, fport, laddr, lport);
-#else
- hash = faddr.s_addr ^ fport;
-#endif
- break;
-
- case IPI_HASHFIELDS_2TUPLE:
-#ifdef RSS
- hash = rss_hash_ip4_2tuple(faddr, laddr);
-#else
- hash = faddr.s_addr ^ laddr.s_addr;
-#endif
- break;
-
- default:
- hash = 0;
- }
- return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo,
- hash)]);
-}
-
-struct inpcbgroup *
-in_pcbgroup_byinpcb(struct inpcb *inp)
-{
-#ifdef RSS
- /*
- * Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined
- * RSS bucket and thus we should use this pcbgroup, rather than
- * using a tuple or hash.
- *
- * XXX should verify that there's actually pcbgroups and inp_rss_listen_bucket
- * fits in that!
- */
- if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
- return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]);
-#endif
-
- return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr,
- inp->inp_lport, inp->inp_faddr, inp->inp_fport));
-}
-
-static void
-in_pcbwild_add(struct inpcb *inp)
-{
- struct inpcbinfo *pcbinfo;
- struct inpcbhead *head;
- u_int pgn;
-
- INP_WLOCK_ASSERT(inp);
- KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD),
- ("%s: is wild",__func__));
-
- pcbinfo = inp->inp_pcbinfo;
- for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
- INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
- head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport,
- 0, pcbinfo->ipi_wildmask)];
- CK_LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild);
- inp->inp_flags2 |= INP_PCBGROUPWILD;
- for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
- INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
-}
-
-static void
-in_pcbwild_remove(struct inpcb *inp)
-{
- struct inpcbinfo *pcbinfo;
- u_int pgn;
-
- INP_WLOCK_ASSERT(inp);
- KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD),
- ("%s: not wild", __func__));
-
- pcbinfo = inp->inp_pcbinfo;
- for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
- INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
- CK_LIST_REMOVE(inp, inp_pcbgroup_wild);
- for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
- INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
- inp->inp_flags2 &= ~INP_PCBGROUPWILD;
-}
-
-static __inline int
-in_pcbwild_needed(struct inpcb *inp)
-{
-#ifdef RSS
- /*
- * If it's a listen socket and INP_RSS_BUCKET_SET is set,
- * it's a wildcard socket _but_ it's in a specific pcbgroup.
- * Thus we don't treat it as a pcbwild inp.
- */
- if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
- return (0);
-#endif
-
-#ifdef INET6
- if (inp->inp_vflag & INP_IPV6)
- return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr));
- else
-#endif
- return (inp->inp_faddr.s_addr == htonl(INADDR_ANY));
-}
-
-static void
-in_pcbwild_update_internal(struct inpcb *inp)
-{
- int wildcard_needed;
-
- wildcard_needed = in_pcbwild_needed(inp);
- if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD))
- in_pcbwild_add(inp);
- else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD))
- in_pcbwild_remove(inp);
-}
-
-/*
- * Update the pcbgroup of an inpcb, which might include removing an old
- * pcbgroup reference and/or adding a new one. Wildcard processing is not
- * performed here, although ideally we'll never install a pcbgroup for a
- * wildcard inpcb (asserted below).
- */
-static void
-in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo,
- struct inpcbgroup *newpcbgroup, struct inpcb *inp)
-{
- struct inpcbgroup *oldpcbgroup;
- struct inpcbhead *pcbhash;
- uint32_t hashkey_faddr;
-
- INP_WLOCK_ASSERT(inp);
-
- oldpcbgroup = inp->inp_pcbgroup;
- if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
- INP_GROUP_LOCK(oldpcbgroup);
- CK_LIST_REMOVE(inp, inp_pcbgrouphash);
- inp->inp_pcbgroup = NULL;
- INP_GROUP_UNLOCK(oldpcbgroup);
- }
- if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
-#ifdef INET6
- if (inp->inp_vflag & INP_IPV6)
- hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
- else
-#endif
- hashkey_faddr = inp->inp_faddr.s_addr;
- INP_GROUP_LOCK(newpcbgroup);
- /*
- * If the inp is an RSS bucket wildcard entry, ensure
- * that the PCB hash is calculated correctly.
- *
- * The wildcard hash calculation differs from the
- * non-wildcard definition. The source address is
- * INADDR_ANY and the far port is 0.
- */
- if (inp->inp_flags2 & INP_RSS_BUCKET_SET) {
- pcbhash = &newpcbgroup->ipg_hashbase[
- INP_PCBHASH(INADDR_ANY, inp->inp_lport, 0,
- newpcbgroup->ipg_hashmask)];
- } else {
- pcbhash = &newpcbgroup->ipg_hashbase[
- INP_PCBHASH(hashkey_faddr, inp->inp_lport,
- inp->inp_fport,
- newpcbgroup->ipg_hashmask)];
- }
- CK_LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash);
- inp->inp_pcbgroup = newpcbgroup;
- INP_GROUP_UNLOCK(newpcbgroup);
- }
-
- KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)),
- ("%s: pcbgroup and wildcard!", __func__));
-}
-
-/*
- * Two update paths: one in which the 4-tuple on an inpcb has been updated
- * and therefore connection groups may need to change (or a wildcard entry
- * may needed to be installed), and another in which the 4-tuple has been
- * set as a result of a packet received, in which case we may be able to use
- * the hash on the mbuf to avoid doing a software hash calculation for RSS.
- *
- * In each case: first, let the wildcard code have a go at placing it as a
- * wildcard socket. If it was a wildcard, or if the connection has been
- * dropped, then no pcbgroup is required (so potentially clear it);
- * otherwise, calculate and update the pcbgroup for the inpcb.
- */
-void
-in_pcbgroup_update(struct inpcb *inp)
-{
- struct inpcbinfo *pcbinfo;
- struct inpcbgroup *newpcbgroup;
-
- INP_WLOCK_ASSERT(inp);
-
- pcbinfo = inp->inp_pcbinfo;
- if (!in_pcbgroup_enabled(pcbinfo))
- return;
-
- in_pcbwild_update_internal(inp);
- if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
- !(inp->inp_flags & INP_DROPPED)) {
-#ifdef INET6
- if (inp->inp_vflag & INP_IPV6)
- newpcbgroup = in6_pcbgroup_byinpcb(inp);
- else
-#endif
- newpcbgroup = in_pcbgroup_byinpcb(inp);
- } else
- newpcbgroup = NULL;
- in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
-}
-
-void
-in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m)
-{
- struct inpcbinfo *pcbinfo;
- struct inpcbgroup *newpcbgroup;
-
- INP_WLOCK_ASSERT(inp);
-
- pcbinfo = inp->inp_pcbinfo;
- if (!in_pcbgroup_enabled(pcbinfo))
- return;
-
- /*
- * Possibly should assert !INP_PCBGROUPWILD rather than testing for
- * it; presumably this function should never be called for anything
- * other than non-wildcard socket?
- */
- in_pcbwild_update_internal(inp);
- if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
- !(inp->inp_flags & INP_DROPPED)) {
- newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m);
-#ifdef INET6
- if (inp->inp_vflag & INP_IPV6) {
- if (newpcbgroup == NULL)
- newpcbgroup = in6_pcbgroup_byinpcb(inp);
- } else {
-#endif
- if (newpcbgroup == NULL)
- newpcbgroup = in_pcbgroup_byinpcb(inp);
-#ifdef INET6
- }
-#endif
- } else
- newpcbgroup = NULL;
- in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
-}
-
-/*
- * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb.
- */
-void
-in_pcbgroup_remove(struct inpcb *inp)
-{
- struct inpcbgroup *pcbgroup;
-
- INP_WLOCK_ASSERT(inp);
-
- if (!in_pcbgroup_enabled(inp->inp_pcbinfo))
- return;
-
- if (inp->inp_flags2 & INP_PCBGROUPWILD)
- in_pcbwild_remove(inp);
-
- pcbgroup = inp->inp_pcbgroup;
- if (pcbgroup != NULL) {
- INP_GROUP_LOCK(pcbgroup);
- CK_LIST_REMOVE(inp, inp_pcbgrouphash);
- inp->inp_pcbgroup = NULL;
- INP_GROUP_UNLOCK(pcbgroup);
- }
-}
-
-/*
- * Query whether or not it is appropriate to use pcbgroups to look up inpcbs
- * for a protocol.
- */
-int
-in_pcbgroup_enabled(struct inpcbinfo *pcbinfo)
-{
-
- return (pcbinfo->ipi_npcbgroups > 0);
-}
diff --git a/sys/netinet/in_rss.c b/sys/netinet/in_rss.c
index 76438a330bb5..5dffbc5b5fd0 100644
--- a/sys/netinet/in_rss.c
+++ b/sys/netinet/in_rss.c
@@ -32,7 +32,6 @@
__FBSDID("$FreeBSD$");
#include "opt_inet6.h"
-#include "opt_pcbgroup.h"
#include <sys/param.h>
#include <sys/mbuf.h>
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index 4be888f22a37..39ec65df7426 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -38,7 +38,6 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
-#include "opt_pcbgroup.h"
#include <sys/param.h>
#include <sys/systm.h>