aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Hans Petter Selasky <hselasky@FreeBSD.org> 2014-09-13 08:26:09 +0000
committer Hans Petter Selasky <hselasky@FreeBSD.org> 2014-09-13 08:26:09 +0000
commit eb93b77ae4a354370a1116d33522893cd1fbc1b3 (patch)
tree d34b1dd02c73734a9f41558066a1d389f2f3c6f8
parent abafbab15fb7ecadd0cd655caa672329369c9142 (diff)
download src-eb93b77ae4a354370a1116d33522893cd1fbc1b3.tar.gz
src-eb93b77ae4a354370a1116d33522893cd1fbc1b3.zip
Improve the TCP segmentation offload (TSO) algorithm in general.
The current TSO limitation feature only takes the total number of bytes in an mbuf chain into account and does not limit by the number of mbufs in a chain. Some kinds of hardware are limited by two factors: the first is the fragment length and the second is the fragment count. Both of these limits need to be taken into account when doing TSO. Otherwise some kinds of hardware might have to drop completely valid mbuf chains because they cannot be loaded into the given hardware's DMA engine. The new way of doing TSO limitation has been made backwards compatible, following input from other FreeBSD developers, and will use defaults for values not set. MFC after: 1 week Sponsored by: Mellanox Technologies
Notes
Notes: svn path=/head/; revision=271504
-rw-r--r-- sys/dev/oce/oce_if.c 5
-rw-r--r-- sys/dev/oce/oce_if.h 1
-rw-r--r-- sys/dev/vmware/vmxnet3/if_vmx.c 6
-rw-r--r-- sys/dev/vmware/vmxnet3/if_vmxvar.h 5
-rw-r--r-- sys/dev/xen/netfront/netfront.c 6
-rw-r--r-- sys/net/if.c 60
-rw-r--r-- sys/net/if_lagg.c 13
-rw-r--r-- sys/net/if_var.h 44
-rw-r--r-- sys/net/if_vlan.c 4
-rw-r--r-- sys/netinet/tcp_output.c 77
-rw-r--r-- sys/ofed/drivers/net/mlx4/en_netdev.c 6
11 files changed, 193 insertions, 34 deletions
diff --git a/sys/dev/oce/oce_if.c b/sys/dev/oce/oce_if.c
index af57491e1fa2..68985b838bf5 100644
--- a/sys/dev/oce/oce_if.c
+++ b/sys/dev/oce/oce_if.c
@@ -1731,7 +1731,10 @@ oce_attach_ifp(POCE_SOFTC sc)
sc->ifp->if_baudrate = IF_Gbps(10);
#if __FreeBSD_version >= 1000000
- sc->ifp->if_hw_tsomax = OCE_MAX_TSO_SIZE;
+ sc->ifp->if_hw_tsomax = IF_HW_TSOMAX_BUILD_VALUE(
+ 65535 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) /* bytes */,
+ OCE_MAX_TX_ELEMENTS /* maximum frag count */,
+ 12 /* 4K frag size */);
#endif
ether_ifattach(sc->ifp, sc->macaddr.mac_addr);
diff --git a/sys/dev/oce/oce_if.h b/sys/dev/oce/oce_if.h
index b6db402e7766..bb788413faa0 100644
--- a/sys/dev/oce/oce_if.h
+++ b/sys/dev/oce/oce_if.h
@@ -152,7 +152,6 @@ extern int mp_ncpus; /* system's total active cpu cores */
#define OCE_MAX_TX_ELEMENTS 29
#define OCE_MAX_TX_DESC 1024
#define OCE_MAX_TX_SIZE 65535
-#define OCE_MAX_TSO_SIZE (65535 - ETHER_HDR_LEN)
#define OCE_MAX_RX_SIZE 4096
#define OCE_MAX_RQ_POSTS 255
#define OCE_DEFAULT_PROMISCUOUS 0
diff --git a/sys/dev/vmware/vmxnet3/if_vmx.c b/sys/dev/vmware/vmxnet3/if_vmx.c
index f3fde92e3f7a..dd0eb6c7fc8d 100644
--- a/sys/dev/vmware/vmxnet3/if_vmx.c
+++ b/sys/dev/vmware/vmxnet3/if_vmx.c
@@ -1722,7 +1722,11 @@ vmxnet3_setup_interface(struct vmxnet3_softc *sc)
ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
ifp->if_init = vmxnet3_init;
ifp->if_ioctl = vmxnet3_ioctl;
- ifp->if_hw_tsomax = VMXNET3_TSO_MAXSIZE;
+
+ ifp->if_hw_tsomax = IF_HW_TSOMAX_BUILD_VALUE(
+ 65535 - sizeof(struct ether_vlan_header) /* bytes */,
+ VMXNET3_TX_MAXSEGS /* maximum frag count */,
+ VMXNET3_TX_MAXSEGSHIFT /* frag size */);
#ifdef VMXNET3_LEGACY_TX
ifp->if_start = vmxnet3_start;
diff --git a/sys/dev/vmware/vmxnet3/if_vmxvar.h b/sys/dev/vmware/vmxnet3/if_vmxvar.h
index 6c797214b2ee..a6129f911fbe 100644
--- a/sys/dev/vmware/vmxnet3/if_vmxvar.h
+++ b/sys/dev/vmware/vmxnet3/if_vmxvar.h
@@ -277,14 +277,13 @@ struct vmxnet3_softc {
*/
#define VMXNET3_TX_MAXSEGS 32
#define VMXNET3_TX_MAXSIZE (VMXNET3_TX_MAXSEGS * MCLBYTES)
-#define VMXNET3_TSO_MAXSIZE \
- (VMXNET3_TX_MAXSIZE - sizeof(struct ether_vlan_header))
/*
* Maximum support Tx segments size. The length field in the
* Tx descriptor is 14 bits.
*/
-#define VMXNET3_TX_MAXSEGSIZE (1 << 14)
+#define VMXNET3_TX_MAXSEGSHIFT 14
+#define VMXNET3_TX_MAXSEGSIZE (1 << VMXNET3_TX_MAXSEGSHIFT)
/*
* The maximum number of Rx segments we accept. When LRO is enabled,
diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c
index 9ff872cdc745..1fadee46a9c0 100644
--- a/sys/dev/xen/netfront/netfront.c
+++ b/sys/dev/xen/netfront/netfront.c
@@ -134,7 +134,6 @@ static const int MODPARM_rx_flip = 0;
* to mirror the Linux MAX_SKB_FRAGS constant.
*/
#define MAX_TX_REQ_FRAGS (65536 / PAGE_SIZE + 2)
-#define NF_TSO_MAXBURST ((IP_MAXPACKET / PAGE_SIZE) * MCLBYTES)
#define RX_COPY_THRESHOLD 256
@@ -2102,7 +2101,10 @@ create_netdev(device_t dev)
ifp->if_hwassist = XN_CSUM_FEATURES;
ifp->if_capabilities = IFCAP_HWCSUM;
- ifp->if_hw_tsomax = NF_TSO_MAXBURST;
+ ifp->if_hw_tsomax = IF_HW_TSOMAX_BUILD_VALUE(
+ 65535 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) /* bytes */,
+ MAX_TX_REQ_FRAGS /* maximum frag count */,
+ PAGE_SHIFT /* PAGE_SIZE frag size */);
ether_ifattach(ifp, np->mac);
callout_init(&np->xn_stat_ch, CALLOUT_MPSAFE);
diff --git a/sys/net/if.c b/sys/net/if.c
index 017af336d2b6..5e9b77685449 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -423,6 +423,52 @@ if_grow(void)
}
/*
+ * Compute the common (field-wise minimum) limits of two "if_hw_tsomax" values:
+ */
+u_int
+if_hw_tsomax_common(u_int a, u_int b)
+{
+ u_int a_bytes = IF_HW_TSOMAX_GET_BYTES(a);
+ u_int a_frag_count = IF_HW_TSOMAX_GET_FRAG_COUNT(a);
+ u_int a_frag_size = IF_HW_TSOMAX_GET_FRAG_SIZE(a);
+ u_int b_bytes = IF_HW_TSOMAX_GET_BYTES(b);
+ u_int b_frag_count = IF_HW_TSOMAX_GET_FRAG_COUNT(b);
+ u_int b_frag_size = IF_HW_TSOMAX_GET_FRAG_SIZE(b);
+
+ return (IF_HW_TSOMAX_BUILD_VALUE(min(a_bytes, b_bytes),
+ min(a_frag_count, b_frag_count),
+ min(a_frag_size, b_frag_size)));
+}
+
+/*
+ * Range check the "if_hw_tsomax" value:
+ */
+u_int
+if_hw_tsomax_range_check(u_int a)
+{
+ u_int a_bytes = IF_HW_TSOMAX_GET_BYTES(a);
+ u_int a_frag_count = IF_HW_TSOMAX_GET_FRAG_COUNT(a);
+ u_int a_frag_size = IF_HW_TSOMAX_GET_FRAG_SIZE(a);
+
+ /* round down to nearest 4 bytes */
+ a_bytes &= 0xFFFC;
+
+ /* use default, if zero */
+ if (a_bytes == 0)
+ a_bytes = IF_HW_TSOMAX_DEFAULT_BYTES;
+
+ /* use default, if zero */
+ if (a_frag_count == 0)
+ a_frag_count = IF_HW_TSOMAX_DEFAULT_FRAG_COUNT;
+
+ /* use default, if zero */
+ if (a_frag_size == 0)
+ a_frag_size = IF_HW_TSOMAX_DEFAULT_FRAG_SIZE;
+
+ return (IF_HW_TSOMAX_BUILD_VALUE(a_bytes, a_frag_count, a_frag_size));
+}
+
+/*
* Allocate a struct ifnet and an index for an interface. A layer 2
* common structure will also be allocated if an allocation routine is
* registered for the passed type.
@@ -445,6 +491,7 @@ if_alloc(u_char type)
ifp->if_index = idx;
ifp->if_type = type;
ifp->if_alloctype = type;
+ ifp->if_hw_tsomax = IF_HW_TSOMAX_DEFAULT_VALUE();
if (if_com_alloc[type] != NULL) {
ifp->if_l2com = if_com_alloc[type](type, ifp);
if (ifp->if_l2com == NULL) {
@@ -657,16 +704,9 @@ if_attach_internal(struct ifnet *ifp, int vmove)
TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
/* Reliably crash if used uninitialized. */
ifp->if_broadcastaddr = NULL;
-
-#if defined(INET) || defined(INET6)
- /* Initialize to max value. */
- if (ifp->if_hw_tsomax == 0)
- ifp->if_hw_tsomax = min(IP_MAXPACKET, 32 * MCLBYTES -
- (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
- KASSERT(ifp->if_hw_tsomax <= IP_MAXPACKET &&
- ifp->if_hw_tsomax >= IP_MAXPACKET / 8,
- ("%s: tsomax outside of range", __func__));
-#endif
+ /* range check TSO value */
+ ifp->if_hw_tsomax =
+ if_hw_tsomax_range_check(ifp->if_hw_tsomax);
}
#ifdef VIMAGE
else {
diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c
index 8d53526c6a02..e7cfb3c1f753 100644
--- a/sys/net/if_lagg.c
+++ b/sys/net/if_lagg.c
@@ -445,11 +445,7 @@ lagg_capabilities(struct lagg_softc *sc)
struct lagg_port *lp;
int cap = ~0, ena = ~0;
u_long hwa = ~0UL;
-#if defined(INET) || defined(INET6)
- u_int hw_tsomax = IP_MAXPACKET; /* Initialize to the maximum value. */
-#else
- u_int hw_tsomax = ~0; /* if_hw_tsomax is only for INET/INET6, but.. */
-#endif
+ u_int hw_tsomax = IF_HW_TSOMAX_DEFAULT_VALUE();
LAGG_WLOCK_ASSERT(sc);
@@ -458,10 +454,9 @@ lagg_capabilities(struct lagg_softc *sc)
cap &= lp->lp_ifp->if_capabilities;
ena &= lp->lp_ifp->if_capenable;
hwa &= lp->lp_ifp->if_hwassist;
- /* Set to the minimum value of the lagg ports. */
- if (lp->lp_ifp->if_hw_tsomax < hw_tsomax &&
- lp->lp_ifp->if_hw_tsomax > 0)
- hw_tsomax = lp->lp_ifp->if_hw_tsomax;
+ /* Set to the common value of the lagg ports. */
+ hw_tsomax = if_hw_tsomax_common(hw_tsomax,
+ lp->lp_ifp->if_hw_tsomax);
}
cap = (cap == ~0 ? 0 : cap);
ena = (ena == ~0 ? 0 : ena);
diff --git a/sys/net/if_var.h b/sys/net/if_var.h
index 09f41b8097e9..af83713c24df 100644
--- a/sys/net/if_var.h
+++ b/sys/net/if_var.h
@@ -120,6 +120,43 @@ typedef int (*if_transmit_fn_t)(if_t, struct mbuf *);
typedef uint64_t (*if_get_counter_t)(if_t, ifnet_counter);
/*
+ * Macros defining how to decode the "if_hw_tsomax" field:
+ */
+#define IF_HW_TSOMAX_GET_BYTES(x) \
+ ((uint16_t)(x)) /* 32..65535 */
+
+#define IF_HW_TSOMAX_GET_FRAG_COUNT(x) \
+ ((uint8_t)((x) >> 16)) /* 1..255 */
+
+#define IF_HW_TSOMAX_GET_FRAG_SIZE(x) \
+ ((uint8_t)((x) >> 24)) /* 12..16 */
+
+/*
+ * The following macro defines how to build the "if_hw_tsomax"
+ * field. The "bytes" field has unit 1 bytes and declares the maximum
+ * number of bytes which can be transferred by a single transmit
+ * offload, TSO, job. The "bytes" field is rounded down to the nearest
+ * 4 bytes to avoid having the hardware do unaligned memory
+ * accesses. The "frag_count" field has unit 1 fragment and declares
+ * the maximum number of fragments a TSO job can contain. The
+ * "frag_size" field has unit logarithm in base 2 of the actual value
+ * in bytes and declares the maximum size of a fragment.
+ */
+#define IF_HW_TSOMAX_BUILD_VALUE(bytes, frag_count, frag_size) \
+ (((bytes) & 0xFFFC) | (((frag_count) & 0xFF) << 16) | \
+ (((frag_size) & 0xFF) << 24))
+
+#define IF_HW_TSOMAX_DEFAULT_BYTES (65536 - 4)
+#define IF_HW_TSOMAX_DEFAULT_FRAG_COUNT 255
+#define IF_HW_TSOMAX_DEFAULT_FRAG_SIZE 16
+
+#define IF_HW_TSOMAX_DEFAULT_VALUE() \
+ IF_HW_TSOMAX_BUILD_VALUE( \
+ IF_HW_TSOMAX_DEFAULT_BYTES, \
+ IF_HW_TSOMAX_DEFAULT_FRAG_COUNT, \
+ IF_HW_TSOMAX_DEFAULT_FRAG_SIZE)
+
+/*
* Structure defining a network interface.
*
* Size ILP32: 592 (approx)
@@ -222,8 +259,7 @@ struct ifnet {
if_get_counter_t if_get_counter; /* get counter values */
/* Stuff that's only temporary and doesn't belong here. */
- u_int if_hw_tsomax; /* tso burst length limit, the minimum
- * is (IP_MAXPACKET / 8).
+ u_int if_hw_tsomax; /* TSO burst length limits.
* XXXAO: Have to find a better place
* for it eventually. */
/*
@@ -608,6 +644,10 @@ void if_setioctlfn(if_t ifp, int (*)(if_t, u_long, caddr_t));
void if_setstartfn(if_t ifp, void (*)(if_t));
void if_settransmitfn(if_t ifp, if_transmit_fn_t);
void if_setqflushfn(if_t ifp, if_qflush_fn_t);
+
+/* "if_hw_tsomax" related functions */
+u_int if_hw_tsomax_common(u_int, u_int);
+u_int if_hw_tsomax_range_check(u_int);
/* Revisit the below. These are inline functions originally */
int drbr_inuse_drv(if_t ifp, struct buf_ring *br);
diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c
index f551ffd618fc..fe2560c6fa7f 100644
--- a/sys/net/if_vlan.c
+++ b/sys/net/if_vlan.c
@@ -1511,8 +1511,8 @@ vlan_capabilities(struct ifvlan *ifv)
* propagate the hardware-assisted flag. TSO on VLANs
* does not necessarily require hardware VLAN tagging.
*/
- if (p->if_hw_tsomax > 0)
- ifp->if_hw_tsomax = p->if_hw_tsomax;
+ ifp->if_hw_tsomax = if_hw_tsomax_common(ifp->if_hw_tsomax,
+ p->if_hw_tsomax);
if (p->if_capabilities & IFCAP_VLAN_HWTSO)
ifp->if_capabilities |= p->if_capabilities & IFCAP_TSO;
if (p->if_capenable & IFCAP_VLAN_HWTSO) {
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index 12d8e754ab5b..5537416e5326 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>
@@ -767,18 +768,88 @@ send:
flags &= ~TH_FIN;
if (tso) {
+ u_int if_hw_tsomax_bytes;
+ u_int if_hw_tsomax_frag_count;
+ u_int if_hw_tsomax_frag_size;
+ struct mbuf *mb;
+ u_int moff;
+ int max_len;
+
+ /* extract TSO information */
+ if_hw_tsomax_bytes =
+ IF_HW_TSOMAX_GET_BYTES(tp->t_tsomax);
+ if_hw_tsomax_frag_count =
+ IF_HW_TSOMAX_GET_FRAG_COUNT(tp->t_tsomax);
+ if_hw_tsomax_frag_size =
+ IF_HW_TSOMAX_GET_FRAG_SIZE(tp->t_tsomax);
+
+ /* compute maximum TSO length */
+ max_len = (if_hw_tsomax_bytes - hdrlen);
+
+ /* clamp maximum length value */
+ if (max_len > IP_MAXPACKET)
+ max_len = IP_MAXPACKET;
+ else if (max_len < 0)
+ max_len = 0;
+
+ /* get smallest length */
+ if (len > (u_int)max_len) {
+ if (max_len != 0)
+ sendalot = 1;
+ len = (u_int)max_len;
+ }
+
KASSERT(ipoptlen == 0,
("%s: TSO can't do IP options", __func__));
+ max_len = 0;
+ mb = sbsndptr(&so->so_snd, off, len, &moff);
+
+ /* now make sure the number of fragments fit too */
+ while (mb != NULL && (u_int)max_len < len) {
+ u_int cur_length;
+ u_int cur_frags;
+
+ /*
+ * Get length of mbuf fragment and how
+ * many hardware frags, rounded up, it
+ * would use:
+ */
+ cur_length = (mb->m_len - moff);
+ cur_frags = (cur_length +
+ (1 << if_hw_tsomax_frag_size) - 1) >>
+ if_hw_tsomax_frag_size;
+
+ /* Handle special case: Zero Length Mbuf */
+ if (cur_frags == 0)
+ cur_frags = 1;
+
+ /*
+ * Check if the fragment limit will be
+ * reached or exceeded:
+ */
+ if (cur_frags >= if_hw_tsomax_frag_count) {
+ max_len += min(cur_length,
+ if_hw_tsomax_frag_count <<
+ if_hw_tsomax_frag_size);
+ break;
+ }
+ max_len += cur_length;
+ if_hw_tsomax_frag_count -= cur_frags;
+ moff = 0;
+ mb = mb->m_next;
+ }
+
/*
* Limit a burst to t_tsomax minus IP,
* TCP and options length to keep ip->ip_len
* from overflowing or exceeding the maximum
* length allowed by the network interface.
*/
- if (len > tp->t_tsomax - hdrlen) {
- len = tp->t_tsomax - hdrlen;
- sendalot = 1;
+ if (len > (u_int)max_len) {
+ if (max_len != 0)
+ sendalot = 1;
+ len = (u_int)max_len;
}
/*
diff --git a/sys/ofed/drivers/net/mlx4/en_netdev.c b/sys/ofed/drivers/net/mlx4/en_netdev.c
index d8b015b1407d..a8eb5c83fed1 100644
--- a/sys/ofed/drivers/net/mlx4/en_netdev.c
+++ b/sys/ofed/drivers/net/mlx4/en_netdev.c
@@ -673,6 +673,12 @@ int mlx4_en_do_start_port(struct net_device *dev)
else
priv->rx_csum = 0;
+ /* set TSO limits so that we don't have to drop TX packets */
+ dev->if_hw_tsomax = IF_HW_TSOMAX_BUILD_VALUE(
+ 65535 - sizeof(struct ether_vlan_header) /* bytes */,
+ 16 /* maximum frag count */,
+ 16 /* can do up to 4GByte */);
+
err = mlx4_wol_read(priv->mdev->dev, &config, priv->port);
if (err) {
en_err(priv, "Failed to get WoL info, unable to modify\n");