aboutsummaryrefslogtreecommitdiff
path: root/sys/netinet/tcp_input.c
diff options
context:
space:
mode:
authorAndre Oppermann <andre@FreeBSD.org>2007-02-01 18:32:13 +0000
committerAndre Oppermann <andre@FreeBSD.org>2007-02-01 18:32:13 +0000
commit6741ecf5950f8d1ee962d0d2c18901914b79717b (patch)
tree7c81206310a4555c1c0c8da554782bf1b9af4a3e /sys/netinet/tcp_input.c
parent6a37f331d7c1d7acc39ad3f0d84062897c79c4b0 (diff)
downloadsrc-6741ecf5950f8d1ee962d0d2c18901914b79717b.tar.gz
src-6741ecf5950f8d1ee962d0d2c18901914b79717b.zip
Auto sizing TCP socket buffers.
Normally the socket buffers are static (either derived from global defaults or set with setsockopt) and do not adapt to real network conditions. Two things happen: a) your socket buffers are too small and you can't reach the full potential of the network between both hosts; b) your socket buffers are too big and you waste a lot of kernel memory for data just sitting around. With automatic TCP send and receive socket buffers we can start with a small buffer and quickly grow it in parallel with the TCP congestion window to match real network conditions. FreeBSD has a default 32K send socket buffer. This supports a maximal transfer rate of only slightly more than 2Mbit/s on a 100ms RTT trans-continental link. Or at 200ms just above 1Mbit/s. With TCP send buffer auto scaling and the default values below it supports 20Mbit/s at 100ms and 10Mbit/s at 200ms. That's an improvement of factor 10, or 1000%. For the receive side it looks slightly better with a default of 64K buffer size. New sysctls are: net.inet.tcp.sendbuf_auto=1 (enabled) net.inet.tcp.sendbuf_inc=8192 (8K, step size) net.inet.tcp.sendbuf_max=262144 (256K, growth limit) net.inet.tcp.recvbuf_auto=1 (enabled) net.inet.tcp.recvbuf_inc=16384 (16K, step size) net.inet.tcp.recvbuf_max=262144 (256K, growth limit) Tested by: many (on HEAD and RELENG_6) Approved by: re MFC after: 1 month
Notes
Notes: svn path=/head/; revision=166405
Diffstat (limited to 'sys/netinet/tcp_input.c')
-rw-r--r--sys/netinet/tcp_input.c84
1 files changed, 81 insertions, 3 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 79d5e8623c85..70462f125882 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -161,6 +161,18 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
&tcp_reass_overflows, 0,
"Global number of TCP Segment Reassembly Queue Overflows");
+int tcp_do_autorcvbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+ &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
+
+int tcp_autorcvbuf_inc = 16*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+ &tcp_autorcvbuf_inc, 0, "Incrementor step size of automatic receive buffer");
+
+int tcp_autorcvbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+ &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
+
struct inpcbhead tcb;
#define tcb6 tcb /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;
@@ -1295,6 +1307,8 @@ after_listen:
} else if (th->th_ack == tp->snd_una &&
LIST_EMPTY(&tp->t_segq) &&
tlen <= sbspace(&so->so_rcv)) {
+ int newsize = 0; /* automatic sockbuf scaling */
+
KASSERT(headlocked, ("headlocked"));
INP_INFO_WUNLOCK(&tcbinfo);
headlocked = 0;
@@ -1321,18 +1335,78 @@ after_listen:
tcpstat.tcps_rcvpack++;
tcpstat.tcps_rcvbyte += tlen;
ND6_HINT(tp); /* some progress has been done */
- /*
#ifdef TCPDEBUG
if (so->so_options & SO_DEBUG)
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
- * Add data to socket buffer.
- */
+ /*
+ * Automatic sizing of receive socket buffer. Often the send
+ * buffer size is not optimally adjusted to the actual network
+ * conditions at hand (delay bandwidth product). Setting the
+ * buffer size too small limits throughput on links with high
+ * bandwidth and high delay (eg. trans-continental/oceanic links).
+ *
+ * On the receive side the socket buffer memory is only rarely
+ * used to any significant extent. This allows us to be much
+ * more aggressive in scaling the receive socket buffer. For
+ * the case that the buffer space is actually used to a large
+ * extent and we run out of kernel memory we can simply drop
+ * the new segments; TCP on the sender will just retransmit it
+ * later. Setting the buffer size too big may only consume too
+ * much kernel memory if the application doesn't read() from
+ * the socket or packet loss or reordering makes use of the
+ * reassembly queue.
+ *
+ * The criteria to step up the receive buffer one notch are:
+ * 1. the number of bytes received during the time it takes
+ * one timestamp to be reflected back to us (the RTT);
+ * 2. received bytes per RTT is within seven eighth of the
+ * current socket buffer size;
+ * 3. receive buffer size has not hit maximal automatic size;
+ *
+ * This algorithm does one step per RTT at most and only if
+ * we receive a bulk stream w/o packet losses or reorderings.
+ * Shrinking the buffer during idle times is not necessary as
+ * it doesn't consume any memory when idle.
+ *
+ * TODO: Only step up if the application is actually serving
+ * the buffer to better manage the socket buffer resources.
+ */
+ if (tcp_do_autorcvbuf &&
+ to.to_tsecr &&
+ (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+ if (to.to_tsecr > tp->rfbuf_ts &&
+ to.to_tsecr - tp->rfbuf_ts < hz) {
+ if (tp->rfbuf_cnt >
+ (so->so_rcv.sb_hiwat / 8 * 7) &&
+ so->so_rcv.sb_hiwat <
+ tcp_autorcvbuf_max) {
+ newsize =
+ min(so->so_rcv.sb_hiwat +
+ tcp_autorcvbuf_inc,
+ tcp_autorcvbuf_max);
+ }
+ /* Start over with next RTT. */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+ } else
+ tp->rfbuf_cnt += tlen; /* add up */
+ }
+
+ /* Add data to socket buffer. */
SOCKBUF_LOCK(&so->so_rcv);
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
m_freem(m);
} else {
+ /*
+ * Set new socket buffer size.
+ * Give up when limit is reached.
+ */
+ if (newsize)
+ if (!sbreserve_locked(&so->so_rcv,
+ newsize, so, curthread))
+ so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
m_adj(m, drop_hdrlen); /* delayed header drop */
sbappendstream_locked(&so->so_rcv, m);
}
@@ -1361,6 +1435,10 @@ after_listen:
tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
}
+ /* Reset receive buffer auto scaling when not in bulk receive mode. */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+
switch (tp->t_state) {
/*