author     Luigi Rizzo <luigi@FreeBSD.org>    2014-01-06 12:53:15 +0000
committer  Luigi Rizzo <luigi@FreeBSD.org>    2014-01-06 12:53:15 +0000
commit     17885a7bfde9d164e45a9833bb172215c55739f9 (patch)
tree       529a5d218d5f4d073c5ad30a4b484d1b412ea226 /sys
parent     0979970a1d4ffa9c13361e91760891d96864ceee (diff)
It is 2014 and we have a new version of netmap.
Most relevant features:
- netmap emulation on any NIC, even those without native netmap support.
On the ixgbe we have measured about 4Mpps/core/queue in this mode,
which is still a lot more than with sockets/bpf.
- seamless interconnection of VALE switch, NICs and host stack.
If you disable accelerations on your NIC (say em0)
ifconfig em0 -txcsum -rxcsum
you can use the VALE switch to connect the NIC and the host stack:
vale-ctl -h valeXX:em0
allowing the NIC to be shared with other netmap clients.
- THE USER API HAS SLIGHTLY CHANGED (head/cur/tail pointers
instead of pointers/count as before). This was unavoidable in order
to support, in the future, multiple threads operating on the same rings.
Netmap clients require very small source code changes to compile again.
On the plus side, the new API should be easier to understand
and the internals are a lot simpler (see the sketch below).
The manual page has been updated extensively to reflect the current
features and give some examples.
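For illustration, here is a minimal sketch of a userspace transmit loop
under the new head/cur/tail scheme. It assumes the ring was obtained by
mmap()ing /dev/netmap after NIOCREGIF; the helper names nm_ring_next()
and NETMAP_BUF() are taken from the updated sys/net/netmap_user.h.
Treat this as an illustration of the pointer discipline, not an excerpt
from the manual page.

    #include <sys/types.h>
    #include <stdint.h>
    #include <string.h>
    #include <net/netmap.h>
    #include <net/netmap_user.h>

    /* Push up to 'count' copies of one packet onto a netmap TX ring. */
    static u_int
    tx_batch(struct netmap_ring *ring, const char *pkt, uint16_t len, u_int count)
    {
            u_int head = ring->head;        /* first slot owned by userspace */
            u_int sent = 0;

            while (sent < count && head != ring->tail) {  /* tail marks the limit */
                    struct netmap_slot *slot = &ring->slot[head];
                    char *buf = NETMAP_BUF(ring, slot->buf_idx);

                    memcpy(buf, pkt, len);  /* fill the netmap-owned buffer */
                    slot->len = len;
                    head = nm_ring_next(ring, head);
                    sent++;
            }
            /* return slots to the kernel: head = released, cur = wakeup point */
            ring->head = ring->cur = head;
            return sent;    /* caller then issues NIOCTXSYNC or poll(POLLOUT) */
    }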
This is the result of work of several people including Giuseppe Lettieri,
Vincenzo Maffione, Michio Honda and myself, and has been financially
supported by EU projects CHANGE and OPENLAB, from NetApp University
Research Fund, NEC, and of course the Universita` di Pisa.
Notes:
svn path=/head/; revision=260368
Diffstat (limited to 'sys')
-rw-r--r--  sys/dev/e1000/if_em.c            |    2
-rw-r--r--  sys/dev/e1000/if_igb.c           |    4
-rw-r--r--  sys/dev/e1000/if_lem.c           |    2
-rw-r--r--  sys/dev/ixgbe/ixgbe.c            |    2
-rw-r--r--  sys/dev/netmap/if_em_netmap.h    |   52
-rw-r--r--  sys/dev/netmap/if_igb_netmap.h   |   52
-rw-r--r--  sys/dev/netmap/if_lem_netmap.h   |   57
-rw-r--r--  sys/dev/netmap/if_re_netmap.h    |   84
-rw-r--r--  sys/dev/netmap/ixgbe_netmap.h    |   74
-rw-r--r--  sys/dev/netmap/netmap.c          |  835
-rw-r--r--  sys/dev/netmap/netmap_freebsd.c  |   26
-rw-r--r--  sys/dev/netmap/netmap_generic.c  | 1008
-rw-r--r--  sys/dev/netmap/netmap_kern.h     |  490
-rw-r--r--  sys/dev/netmap/netmap_mbq.c      |   15
-rw-r--r--  sys/dev/netmap/netmap_mbq.h      |    2
-rw-r--r--  sys/dev/netmap/netmap_mem2.c     |   20
-rw-r--r--  sys/dev/netmap/netmap_mem2.h     |    2
-rw-r--r--  sys/dev/netmap/netmap_vale.c     |  437
-rw-r--r--  sys/net/netmap.h                 |  381
-rw-r--r--  sys/net/netmap_user.h            |  189
20 files changed, 2037 insertions, 1697 deletions
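A number of the driver hunks below replace direct reads of the old
kring->nr_hwavail counter with nm_kr_rxspace(kring), which derives the
equivalent quantity from the new hwcur/hwtail indexes. The helper itself
is added in sys/dev/netmap/netmap_kern.h (changed by this commit but not
reproduced in this excerpt); the following sketch reflects what the
callers appear to rely on:

    /* Sketch: rx slots between nr_hwcur and nr_hwtail, i.e. buffers
     * currently exposed to userspace and not yet returned to the NIC. */
    static inline uint32_t
    nm_kr_rxspace(struct netmap_kring *k)
    {
            int space = k->nr_hwtail - k->nr_hwcur;

            if (space < 0)
                    space += k->nkr_num_slots;
            return (space);
    }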
diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index 580407a529fd..428612a4a695 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -4352,7 +4352,7 @@ em_initialize_receive_unit(struct adapter *adapter) * preserve the rx buffers passed to userspace. */ if (ifp->if_capenable & IFCAP_NETMAP) - rdt -= NA(adapter->ifp)->rx_rings[i].nr_hwavail; + rdt -= nm_kr_rxspace(&NA(adapter->ifp)->rx_rings[i]); #endif /* DEV_NETMAP */ E1000_WRITE_REG(hw, E1000_RDT(i), rdt); } diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c index 57e4f893ab35..2134e29625cc 100644 --- a/sys/dev/e1000/if_igb.c +++ b/sys/dev/e1000/if_igb.c @@ -4630,13 +4630,13 @@ igb_initialize_receive_units(struct adapter *adapter) * an init() while a netmap client is active must * preserve the rx buffers passed to userspace. * In this driver it means we adjust RDT to - * somthing different from next_to_refresh + * something different from next_to_refresh * (which is not used in netmap mode). */ if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; - int t = rxr->next_to_refresh - kring->nr_hwavail; + int t = rxr->next_to_refresh - nm_kr_rxspace(kring); if (t >= adapter->num_rx_desc) t -= adapter->num_rx_desc; diff --git a/sys/dev/e1000/if_lem.c b/sys/dev/e1000/if_lem.c index a3da50c176ed..8014a0f9fde7 100644 --- a/sys/dev/e1000/if_lem.c +++ b/sys/dev/e1000/if_lem.c @@ -3367,7 +3367,7 @@ lem_initialize_receive_unit(struct adapter *adapter) #ifdef DEV_NETMAP /* preserve buffers already made available to clients */ if (ifp->if_capenable & IFCAP_NETMAP) - rctl -= NA(adapter->ifp)->rx_rings[0].nr_hwavail; + rctl -= nm_kr_rxspace(&NA(adapter->ifp)->rx_rings[0]); #endif /* DEV_NETMAP */ E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), rctl); diff --git a/sys/dev/ixgbe/ixgbe.c b/sys/dev/ixgbe/ixgbe.c index 740f7709e5b2..6dfec02cc8d9 100644 --- a/sys/dev/ixgbe/ixgbe.c +++ b/sys/dev/ixgbe/ixgbe.c @@ -1245,7 +1245,7 @@ ixgbe_init_locked(struct adapter *adapter) if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; - int t = na->num_rx_desc - 1 - kring->nr_hwavail; + int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring); IXGBE_WRITE_REG(hw, IXGBE_RDT(i), t); } else diff --git a/sys/dev/netmap/if_em_netmap.h b/sys/dev/netmap/if_em_netmap.h index dbbee4222407..17b4c4fd2e14 100644 --- a/sys/dev/netmap/if_em_netmap.h +++ b/sys/dev/netmap/if_em_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -120,9 +120,9 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, new_slots; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_txsync_prologue(kring, &new_slots); + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; @@ -130,9 +130,6 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct adapter *adapter = ifp->if_softc; struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - if (cur > lim) /* error checking in nm_txsync_prologue() */ - return netmap_ring_reinit(kring); - bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -141,9 +138,9 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { /* we have new packets to send */ + if (nm_i != head) { /* we have new packets to send */ nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; @@ -175,9 +172,7 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = cur; /* the saved ring->cur */ - /* decrease avail by # of packets sent minus previous ones */ - kring->nr_hwavail -= new_slots; + kring->nr_hwcur = head; /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, @@ -190,26 +185,20 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* * Second part: reclaim buffers for completed transmissions. */ - if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) { - int delta; - + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { /* record completed transmissions using TDH */ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); nic_i -= kring->nkr_num_slots; } - delta = nic_i - txr->next_to_clean; - if (delta) { - /* some completed, increment hwavail. 
*/ - if (delta < 0) - delta += kring->nkr_num_slots; + if (nic_i != txr->next_to_clean) { txr->next_to_clean = nic_i; - kring->nr_hwavail += delta; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } } - nm_txsync_finalize(kring, cur); + nm_txsync_finalize(kring); return 0; } @@ -226,16 +215,16 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, resvd; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; /* device-specific */ struct adapter *adapter = ifp->if_softc; struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - if (cur > lim) + if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ @@ -251,7 +240,7 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nic_i = rxr->next_to_check; nm_i = netmap_idx_n2k(kring, nic_i); - for (n = 0; ; n++) { + for (n = 0; ; n++) { // XXX no need to count struct e1000_rx_desc *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->status); @@ -268,7 +257,7 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) } if (n) { /* update the state variables */ rxr->next_to_check = nic_i; - kring->nr_hwavail += n; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } @@ -277,9 +266,9 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * Second part: skip past packets that userspace has released. */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { + if (nm_i != head) { nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); @@ -302,8 +291,7 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = cur; + kring->nr_hwcur = head; bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); @@ -311,12 +299,12 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * IMPORTANT: we must leave one free slot in the ring, * so move nic_i back by one unit */ - nic_i = (nic_i == 0) ? lim : nic_i - 1; + nic_i = nm_prev(nic_i, lim); E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i); } /* tell userspace that there might be new packets */ - ring->avail = kring->nr_hwavail - resvd; + nm_rxsync_finalize(kring); return 0; diff --git a/sys/dev/netmap/if_igb_netmap.h b/sys/dev/netmap/if_igb_netmap.h index b91d0baba06f..e1929f0918e2 100644 --- a/sys/dev/netmap/if_igb_netmap.h +++ b/sys/dev/netmap/if_igb_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Universita` di Pisa. All rights reserved. + * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -88,9 +88,9 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, new_slots; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_txsync_prologue(kring, &new_slots); + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; @@ -101,9 +101,6 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) u32 olinfo_status = (adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0; - if (cur > lim) /* error checking in nm_txsync_prologue() */ - return netmap_ring_reinit(kring); - bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -112,9 +109,9 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { /* we have new packets to send */ + if (nm_i != head) { /* we have new packets to send */ nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; @@ -155,9 +152,7 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = cur; /* the saved ring->cur */ - /* decrease avail by # of packets sent minus previous ones */ - kring->nr_hwavail -= new_slots; + kring->nr_hwcur = head; /* Set the watchdog XXX ? */ txr->queue_status = IGB_QUEUE_WORKING; @@ -174,26 +169,18 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* * Second part: reclaim buffers for completed transmissions. */ - if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) { - int delta; - + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { /* record completed transmissions using TDH */ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); nic_i -= kring->nkr_num_slots; } - delta = nic_i - txr->next_to_clean; - if (delta) { - /* some completed, increment hwavail. 
*/ - if (delta < 0) - delta += kring->nkr_num_slots; - txr->next_to_clean = nic_i; - kring->nr_hwavail += delta; - } + txr->next_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } - nm_txsync_finalize(kring, cur); + nm_txsync_finalize(kring); return 0; } @@ -210,16 +197,16 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, resvd; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; /* device-specific */ struct adapter *adapter = ifp->if_softc; struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - if (cur > lim) + if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ @@ -250,7 +237,7 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) } if (n) { /* update the state variables */ rxr->next_to_check = nic_i; - kring->nr_hwavail += n; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } @@ -259,9 +246,9 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * Second part: skip past packets that userspace has released. */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { + if (nm_i != head) { nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); @@ -284,8 +271,7 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = cur; + kring->nr_hwcur = head; bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); @@ -293,12 +279,12 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * IMPORTANT: we must leave one free slot in the ring, * so move nic_i back by one unit */ - nic_i = (nic_i == 0) ? lim : nic_i - 1; + nic_i = nm_prev(nic_i, lim); E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i); } /* tell userspace that there might be new packets */ - ring->avail = kring->nr_hwavail - resvd; + nm_rxsync_finalize(kring); return 0; diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h index 8ad3b7a2a352..4fce5c988d09 100644 --- a/sys/dev/netmap/if_lem_netmap.h +++ b/sys/dev/netmap/if_lem_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -91,18 +91,14 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, new_slots; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_txsync_prologue(kring, &new_slots); + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; /* device-specific */ struct adapter *adapter = ifp->if_softc; - if (cur > lim) /* error checking in nm_txsync_prologue() */ - return netmap_ring_reinit(kring); - bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -111,9 +107,9 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { /* we have new packets to send */ + if (nm_i != head) { /* we have new packets to send */ nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + while (nm_i != head) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; @@ -145,9 +141,7 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = cur; /* the saved ring->cur */ - /* decrease avail by # of packets sent minus previous ones */ - kring->nr_hwavail -= new_slots; + kring->nr_hwcur = head; /* synchronize the NIC ring */ bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, @@ -160,26 +154,19 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* * Second part: reclaim buffers for completed transmissions. */ - if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) { - int delta; - + if (ticks != kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + kring->last_reclaim = ticks; /* record completed transmissions using TDH */ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); nic_i -= kring->nkr_num_slots; } - delta = nic_i - adapter->next_tx_to_clean; - if (delta) { - /* some completed, increment hwavail. 
*/ - if (delta < 0) - delta += kring->nkr_num_slots; - adapter->next_tx_to_clean = nic_i; - kring->nr_hwavail += delta; - } + adapter->next_tx_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } - nm_txsync_finalize(kring, cur); + nm_txsync_finalize(kring); return 0; } @@ -196,15 +183,15 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, resvd; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; /* device-specific */ struct adapter *adapter = ifp->if_softc; - if (cur > lim) + if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ @@ -241,9 +228,14 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ + ND("%d new packets at nic %d nm %d tail %d", + n, + adapter->next_rx_desc_to_check, + netmap_idx_n2k(kring, adapter->next_rx_desc_to_check), + kring->nr_hwtail); adapter->next_rx_desc_to_check = nic_i; // ifp->if_ipackets += n; - kring->nr_hwavail += n; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } @@ -252,9 +244,9 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * Second part: skip past packets that userspace has released. */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { + if (nm_i != head) { nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); @@ -277,20 +269,19 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = cur; + kring->nr_hwcur = head; bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, * so move nic_i back by one unit */ - nic_i = (nic_i == 0) ? lim : nic_i - 1; + nic_i = nm_prev(nic_i, lim); E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), nic_i); } /* tell userspace that there might be new packets */ - ring->avail = kring->nr_hwavail - resvd; + nm_rxsync_finalize(kring); return 0; diff --git a/sys/dev/netmap/if_re_netmap.h b/sys/dev/netmap/if_re_netmap.h index 2c7ba060cffd..10abe4f49f83 100644 --- a/sys/dev/netmap/if_re_netmap.h +++ b/sys/dev/netmap/if_re_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Luigi Rizzo. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -72,17 +72,14 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, new_slots; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_txsync_prologue(kring, &new_slots); + u_int const head = kring->rhead; /* device-specific */ struct rl_softc *sc = ifp->if_softc; struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc; - if (cur > lim) /* error checking in nm_txsync_prologue() */ - return netmap_ring_reinit(kring); - bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, sc->rl_ldata.rl_tx_list_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); // XXX extra postwrite ? @@ -91,11 +88,11 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) * First part: process new packets to send. */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { /* we have new packets to send */ + if (nm_i != head) { /* we have new packets to send */ nic_i = sc->rl_ldata.rl_tx_prodidx; // XXX or netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; @@ -132,9 +129,7 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nic_i = nm_next(nic_i, lim); } sc->rl_ldata.rl_tx_prodidx = nic_i; - /* decrease avail by # of packets sent minus previous ones */ - kring->nr_hwcur = cur; /* the saved ring->cur */ - kring->nr_hwavail -= new_slots; + kring->nr_hwcur = head; /* synchronize the NIC ring */ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, @@ -148,7 +143,7 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* * Second part: reclaim buffers for completed transmissions. */ - if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) { + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { nic_i = sc->rl_ldata.rl_tx_considx; for (n = 0; nic_i != sc->rl_ldata.rl_tx_prodidx; n++, nic_i = RL_TX_DESC_NXT(sc, nic_i)) { @@ -160,11 +155,11 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) if (n > 0) { sc->rl_ldata.rl_tx_considx = nic_i; sc->rl_ldata.rl_tx_free += n; - kring->nr_hwavail += n; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } } - nm_txsync_finalize(kring, cur); + nm_txsync_finalize(kring); return 0; } @@ -181,16 +176,16 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, resvd; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; /* device-specific */ struct rl_softc *sc = ifp->if_softc; struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc; - if (cur > lim) + if (head > lim) return netmap_ring_reinit(kring); bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, @@ -202,16 +197,17 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * * This device uses all the buffers in the ring, so we need * another termination condition in addition to RL_RDESC_STAT_OWN - * cleared (all buffers could have it cleared. 
The easiest one - * is to limit the amount of data reported up to 'lim' + * cleared (all buffers could have it cleared). The easiest one + * is to stop right before nm_hwcur. */ if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + uint32_t stop_i = nm_prev(kring->nr_hwcur, lim); nic_i = sc->rl_ldata.rl_rx_prodidx; /* next pkt to check */ nm_i = netmap_idx_n2k(kring, nic_i); - for (n = kring->nr_hwavail; n < lim ; n++) { + while (nm_i != stop_i) { struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[nic_i]; uint32_t rxstat = le32toh(cur_rx->rl_cmdstat); uint32_t total_len; @@ -226,14 +222,12 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* sync was in re_newbuf() */ bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, rxd[nic_i].rx_dmamap, BUS_DMASYNC_POSTREAD); + // sc->rl_ifp->if_ipackets++; nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - if (n != kring->nr_hwavail) { - sc->rl_ldata.rl_rx_prodidx = nic_i; - sc->rl_ifp->if_ipackets += n - kring->nr_hwavail; - kring->nr_hwavail = n; - } + sc->rl_ldata.rl_rx_prodidx = nic_i; + kring->nr_hwtail = nm_i; kring->nr_kflags &= ~NKR_PENDINTR; } @@ -241,9 +235,9 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * Second part: skip past packets that userspace has released. */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { + if (nm_i != head) { nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); @@ -272,8 +266,7 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = cur; + kring->nr_hwcur = head; bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list_map, @@ -281,7 +274,7 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) } /* tell userspace that there might be new packets */ - ring->avail = kring->nr_hwavail - resvd; + nm_rxsync_finalize(kring); return 0; @@ -336,36 +329,35 @@ re_netmap_rx_init(struct rl_softc *sc) struct netmap_slot *slot = netmap_reset(na, NR_RX, 0, 0); struct rl_desc *desc = sc->rl_ldata.rl_rx_list; uint32_t cmdstat; - int i, n, max_avail; + uint32_t nic_i, max_avail; + uint32_t const n = sc->rl_ldata.rl_rx_desc_cnt; if (!slot) return; - n = sc->rl_ldata.rl_rx_desc_cnt; /* - * Userspace owned hwavail packets before the reset, - * so the NIC that last hwavail descriptors of the ring - * are still owned by the driver (and keep one empty). + * Do not release the slots owned by userspace, + * and also keep one empty. 
*/ - max_avail = n - 1 - na->rx_rings[0].nr_hwavail; - for (i = 0; i < n; i++) { + max_avail = n - 1 - nm_kr_rxspace(&na->rx_rings[0]); + for (nic_i = 0; nic_i < n; nic_i++) { void *addr; uint64_t paddr; - int l = netmap_idx_n2k(&na->rx_rings[0], i); + uint32_t nm_i = netmap_idx_n2k(&na->rx_rings[0], nic_i); - addr = PNMB(slot + l, &paddr); + addr = PNMB(slot + nm_i, &paddr); netmap_reload_map(sc->rl_ldata.rl_rx_mtag, - sc->rl_ldata.rl_rx_desc[i].rx_dmamap, addr); + sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, addr); bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, - sc->rl_ldata.rl_rx_desc[i].rx_dmamap, BUS_DMASYNC_PREREAD); - desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); - desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, BUS_DMASYNC_PREREAD); + desc[nic_i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc[nic_i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); cmdstat = NETMAP_BUF_SIZE; - if (i == n - 1) /* mark the end of ring */ + if (nic_i == n - 1) /* mark the end of ring */ cmdstat |= RL_RDESC_CMD_EOR; - if (i < max_avail) + if (nic_i < max_avail) cmdstat |= RL_RDESC_CMD_OWN; - desc[i].rl_cmdstat = htole32(cmdstat); + desc[nic_i].rl_cmdstat = htole32(cmdstat); } } diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h index 4dea6639d325..a617cc4c2429 100644 --- a/sys/dev/netmap/ixgbe_netmap.h +++ b/sys/dev/netmap/ixgbe_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -141,14 +141,13 @@ ixgbe_netmap_reg(struct netmap_adapter *na, int onoff) /* * Reconcile kernel and user view of the transmit ring. * - * Userspace wants to send packets up to the one before ring->cur, + * All information is in the kring. + * Userspace wants to send packets up to the one before kring->rhead, * kernel knows kring->nr_hwcur is the first unsent packet. * * Here we push packets out (as many as possible), and possibly * reclaim buffers from previously completed transmission. * - * ring->avail is not used on input, but it is updated on return. - * * The caller (netmap) guarantees that there is only one instance * running at any time. Any interference with other driver * methods should be handled by the individual drivers. 
@@ -161,9 +160,9 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, new_slots; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_txsync_prologue(kring, &new_slots); + u_int const head = kring->rhead; /* * interrupts on every tx packet are expensive so request * them every half ring, or where NS_REPORT is set @@ -175,9 +174,6 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct tx_ring *txr = &adapter->tx_rings[ring_nr]; int reclaim_tx; - if (cur > lim) /* error checking in nm_txsync_prologue() */ - return netmap_ring_reinit(kring); - bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -199,7 +195,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) */ /* - * If we have packets to send (kring->nr_hwcur != ring->cur) + * If we have packets to send (kring->nr_hwcur != kring->rhead) * iterate over the netmap ring, fetch length and update * the corresponding slot in the NIC ring. Some drivers also * need to update the buffer's physical address in the NIC slot @@ -217,13 +213,13 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { /* we have new packets to send */ + if (nm_i != head) { /* we have new packets to send */ nic_i = netmap_idx_k2n(kring, nm_i); __builtin_prefetch(&ring->slot[nm_i]); __builtin_prefetch(&txr->tx_buffers[nic_i]); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; @@ -262,9 +258,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = cur; /* the saved ring->cur */ - /* decrease avail by # of packets sent minus previous ones */ - kring->nr_hwavail -= new_slots; + kring->nr_hwcur = head; /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, @@ -281,7 +275,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) */ if (flags & NAF_FORCE_RECLAIM) { reclaim_tx = 1; /* forced reclaim */ - } else if (kring->nr_hwavail > 0) { + } else if (!nm_kr_txempty(kring)) { reclaim_tx = 0; /* have buffers, no reclaim */ } else { /* @@ -321,21 +315,13 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nic_i -= kring->nkr_num_slots; } if (nic_i != txr->next_to_clean) { - n = (nic_i + lim + 1) - txr->next_to_clean; - if (n > lim) - n -= lim + 1; /* some tx completed, increment avail */ txr->next_to_clean = nic_i; - kring->nr_hwavail += n; - if (kring->nr_hwavail > lim) { - RD(5, "bad hwavail %d", - kring->nr_hwavail); - return netmap_ring_reinit(kring); - } + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } } - nm_txsync_finalize(kring, cur); + nm_txsync_finalize(kring); return 0; } @@ -347,14 +333,9 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) * The caller guarantees a single invocations, but races against * the rest of the driver should be handled here. * - * When called, userspace has released buffers up to - * ring->cur - ring->reserved (last one excluded). - * - * The last interrupt reported kring->nr_hwavail slots available - * after kring->nr_hwcur. 
- * We must subtract the newly consumed slots (cur - nr_hwcur) - * from nr_hwavail, make the descriptors available for the next reads, - * and set kring->nr_hwcur = ring->cur and ring->avail = kring->nr_hwavail. + * On call, kring->rhead is the first packet that userspace wants + * to keep, and kring->rcur is the wakeup point. + * The kernel has previously reported packets up to kring->rtail. * * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective * of whether or not we received an interrupt. @@ -367,16 +348,16 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ - u_int n, resvd; + u_int n; u_int const lim = kring->nkr_num_slots - 1; - u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; /* device-specific */ struct adapter *adapter = ifp->if_softc; struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - if (cur > lim) + if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ @@ -391,8 +372,8 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * and they may differ in case if_init() has been called while * in netmap mode. For the receive ring we have * - * nm_i = (kring->nr_hwcur + kring->nr_hwavail) % ring_size * nic_i = rxr->next_to_check; + * nm_i = kring->nr_hwtail (previous) * and * nm_i == (nic_i + kring->nkr_hwofs) % ring_size * @@ -402,7 +383,7 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) int crclen = ix_crcstrip ? 0 : 4; uint16_t slot_flags = kring->nkr_slot_flags; - nic_i = rxr->next_to_check; + nic_i = rxr->next_to_check; // or also k2n(kring->nr_hwtail) nm_i = netmap_idx_n2k(kring, nic_i); for (n = 0; ; n++) { @@ -425,23 +406,23 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) ix_rx_miss_bufs += n; } rxr->next_to_check = nic_i; - kring->nr_hwavail += n; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } /* * Second part: skip past packets that userspace has released. - * (kring->nr_hwcur to ring->cur - ring->reserved excluded), + * (kring->nr_hwcur to kring->rhead excluded), * and make the buffers available for reception. * As usual nm_i is the index in the netmap ring, * nic_i is the index in the NIC ring, and * nm_i == (nic_i + kring->nkr_hwofs) % ring_size */ nm_i = kring->nr_hwcur; - if (nm_i != cur) { + if (nm_i != head) { nic_i = netmap_idx_k2n(kring, nm_i); - for (n = 0; nm_i != cur; n++) { + for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); @@ -464,8 +445,7 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = cur; + kring->nr_hwcur = head; bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); @@ -473,12 +453,12 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * IMPORTANT: we must leave one free slot in the ring, * so move nic_i back by one unit */ - nic_i = (nic_i == 0) ? 
lim : nic_i - 1; + nic_i = nm_prev(nic_i, lim); IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), nic_i); } /* tell userspace that there might be new packets */ - ring->avail = kring->nr_hwavail - resvd; + nm_rxsync_finalize(kring); return 0; diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index 478d9374937f..358d4693dcb3 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -151,7 +151,6 @@ ports attached to the switch) #include <machine/bus.h> /* bus_dmamap_* */ #include <sys/endian.h> #include <sys/refcount.h> -#include <sys/jail.h> /* reduce conditional code */ @@ -226,9 +225,6 @@ enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ NETMAP_ADMODE_NATIVE, /* either native or none */ NETMAP_ADMODE_GENERIC, /* force generic */ NETMAP_ADMODE_LAST }; -#define NETMAP_ADMODE_NATIVE 1 /* Force native netmap adapter. */ -#define NETMAP_ADMODE_GENERIC 2 /* Force generic netmap adapter. */ -#define NETMAP_ADMODE_BEST 0 /* Priority to native netmap adapter. */ static int netmap_admode = NETMAP_ADMODE_BEST; int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ @@ -252,6 +248,10 @@ nm_kr_get(struct netmap_kring *kr) } +/* + * mark the ring as stopped, and run through the locks + * to make sure other users get to see it. + */ void netmap_disable_ring(struct netmap_kring *kr) { @@ -380,7 +380,6 @@ nm_dump_buf(char *p, int len, int lim, char *dst) } - /* * Fetch configuration from the device, to cope with dynamic * reconfigurations after loading the module. @@ -432,6 +431,7 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail u_int i, len, ndesc; struct netmap_kring *kring; + // XXX additional space for extra rings ? len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); @@ -441,19 +441,23 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail } na->rx_rings = na->tx_rings + ntx; + /* + * All fields in krings are 0 except the one initialized below. + * but better be explicit on important kring fields. + */ ndesc = na->num_tx_desc; for (i = 0; i < ntx; i++) { /* Transmit rings */ kring = &na->tx_rings[i]; bzero(kring, sizeof(*kring)); kring->na = na; + kring->ring_id = i; kring->nkr_num_slots = ndesc; /* - * IMPORTANT: - * Always keep one slot empty, so we can detect new - * transmissions comparing cur and nr_hwcur (they are - * the same only if there are no new transmissions). + * IMPORTANT: Always keep one slot empty. 
*/ - kring->nr_hwavail = ndesc - 1; + kring->rhead = kring->rcur = kring->nr_hwcur = 0; + kring->rtail = kring->nr_hwtail = ndesc - 1; + snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i); mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF); init_waitqueue_head(&kring->si); } @@ -463,7 +467,11 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail kring = &na->rx_rings[i]; bzero(kring, sizeof(*kring)); kring->na = na; + kring->ring_id = i; kring->nkr_num_slots = ndesc; + kring->rhead = kring->rcur = kring->nr_hwcur = 0; + kring->rtail = kring->nr_hwtail = 0; + snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i); mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF); init_waitqueue_head(&kring->si); } @@ -473,10 +481,10 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail na->tailroom = na->rx_rings + nrx; return 0; - } +/* XXX check boundaries */ void netmap_krings_delete(struct netmap_adapter *na) { @@ -493,6 +501,23 @@ netmap_krings_delete(struct netmap_adapter *na) } +/* + * Destructor for NIC ports. They also have an mbuf queue + * on the rings connected to the host so we need to purge + * them first. + */ +static void +netmap_hw_krings_delete(struct netmap_adapter *na) +{ + struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue; + + ND("destroy sw mbq with len %d", mbq_len(q)); + mbq_purge(q); + mbq_safe_destroy(q); + netmap_krings_delete(na); +} + + static struct netmap_if* netmap_if_new(const char *ifname, struct netmap_adapter *na) { @@ -721,6 +746,7 @@ netmap_dtor(void *data) /* * pass a chain of buffers to the host stack as coming from 'dst' + * We do not need to lock because the queue is private. */ static void netmap_send_up(struct ifnet *dst, struct mbq *q) @@ -739,39 +765,30 @@ netmap_send_up(struct ifnet *dst, struct mbq *q) /* * put a copy of the buffers marked NS_FORWARD into an mbuf chain. - * Run from hwcur to cur - reserved + * Take packets from hwcur to ring->head marked NS_FORWARD (or forced) + * and pass them up. Drop remaining packets in the unlikely event + * of an mbuf shortage. */ static void netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) { - /* Take packets from hwcur to cur-reserved and pass them up. - * In case of no buffers we give up. At the end of the loop, - * the queue is drained in all cases. - * XXX handle reserved - */ - u_int lim = kring->nkr_num_slots - 1; - struct mbuf *m; - u_int k = kring->ring->cur, n = kring->ring->reserved; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->ring->head; + u_int n; struct netmap_adapter *na = kring->na; - /* compute the final position, ring->cur - ring->reserved */ - if (n > 0) { - if (k < n) - k += kring->nkr_num_slots; - k += n; - } - for (n = kring->nr_hwcur; n != k;) { + for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) { + struct mbuf *m; struct netmap_slot *slot = &kring->ring->slot[n]; - n = nm_next(n, lim); if ((slot->flags & NS_FORWARD) == 0 && !force) continue; if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { - D("bad pkt at %d len %d", n, slot->len); + RD(5, "bad pkt at %d len %d", n, slot->len); continue; } slot->flags &= ~NS_FORWARD; // XXX needed ? 
- /* XXX adapt to the case of a multisegment packet */ + /* XXX TODO: adapt to the case of a multisegment packet */ m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL); if (m == NULL) @@ -782,69 +799,54 @@ netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) /* - * The host ring has packets from nr_hwcur to (cur - reserved) - * to be sent down to the NIC. - * We need to use the queue lock on the source (host RX ring) - * to protect against netmap_transmit. - * If the user is well behaved we do not need to acquire locks - * on the destination(s), - * so we only need to make sure that there are no panics because - * of user errors. - * XXX verify - * - * We scan the tx rings, which have just been - * flushed so nr_hwcur == cur. Pushing packets down means - * increment cur and decrement avail. - * XXX to be verified + * Send to the NIC rings packets marked NS_FORWARD between + * kring->nr_hwcur and kring->rhead + * Called under kring->rx_queue.lock on the sw rx ring, */ -static void +static u_int netmap_sw_to_nic(struct netmap_adapter *na) { struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; - struct netmap_kring *k1 = &na->tx_rings[0]; - u_int i, howmany, src_lim, dst_lim; - - /* XXX we should also check that the carrier is on */ - if (kring->nkr_stopped) - return; + struct netmap_slot *rxslot = kring->ring->slot; + u_int i, rxcur = kring->nr_hwcur; + u_int const head = kring->rhead; + u_int const src_lim = kring->nkr_num_slots - 1; + u_int sent = 0; + + /* scan rings to find space, then fill as much as possible */ + for (i = 0; i < na->num_tx_rings; i++) { + struct netmap_kring *kdst = &na->tx_rings[i]; + struct netmap_ring *rdst = kdst->ring; + u_int const dst_lim = kdst->nkr_num_slots - 1; + + /* XXX do we trust ring or kring->rcur,rtail ? */ + for (; rxcur != head && !nm_ring_empty(rdst); + rxcur = nm_next(rxcur, src_lim) ) { + struct netmap_slot *src, *dst, tmp; + u_int dst_cur = rdst->cur; - mtx_lock(&kring->q_lock); + src = &rxslot[rxcur]; + if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd) + continue; - if (kring->nkr_stopped) - goto out; + sent++; - howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ + dst = &rdst->slot[dst_cur]; - src_lim = kring->nkr_num_slots - 1; - for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { - ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); - dst_lim = k1->nkr_num_slots - 1; - while (howmany > 0 && k1->ring->avail > 0) { - struct netmap_slot *src, *dst, tmp; - src = &kring->ring->slot[kring->nr_hwcur]; - dst = &k1->ring->slot[k1->ring->cur]; tmp = *src; + src->buf_idx = dst->buf_idx; src->flags = NS_BUF_CHANGED; dst->buf_idx = tmp.buf_idx; dst->len = tmp.len; dst->flags = NS_BUF_CHANGED; - ND("out len %d buf %d from %d to %d", - dst->len, dst->buf_idx, - kring->nr_hwcur, k1->ring->cur); - - kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim); - howmany--; - kring->nr_hwavail--; - k1->ring->cur = nm_next(k1->ring->cur, dst_lim); - k1->ring->avail--; + + rdst->cur = nm_next(dst_cur, dst_lim); } - kring->ring->cur = kring->nr_hwcur; // XXX - k1++; // XXX why? + /* if (sent) XXX txsync ? 
*/ } -out: - mtx_unlock(&kring->q_lock); + return sent; } @@ -859,7 +861,8 @@ netmap_txsync_to_host(struct netmap_adapter *na) { struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; struct netmap_ring *ring = kring->ring; - u_int k, lim = kring->nkr_num_slots - 1; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_txsync_prologue(kring); struct mbq q; int error; @@ -869,22 +872,27 @@ netmap_txsync_to_host(struct netmap_adapter *na) D("ring %p busy (user error)", kring); return; } - k = ring->cur; - if (k > lim) { + if (head > lim) { D("invalid ring index in stack TX kring %p", kring); netmap_ring_reinit(kring); nm_kr_put(kring); return; } - /* Take packets from hwcur to cur and pass them up. + /* Take packets from hwcur to head and pass them up. + * force head = cur since netmap_grab_packets() stops at head * In case of no buffers we give up. At the end of the loop, * the queue is drained in all cases. */ mbq_init(&q); - netmap_grab_packets(kring, &q, 1); - kring->nr_hwcur = k; - kring->nr_hwavail = ring->avail = lim; + ring->cur = head; + netmap_grab_packets(kring, &q, 1 /* force */); + ND("have %d pkts in queue", mbq_len(&q)); + kring->nr_hwcur = head; + kring->nr_hwtail = head + lim; + if (kring->nr_hwtail > lim) + kring->nr_hwtail -= lim + 1; + nm_txsync_finalize(kring); nm_kr_put(kring); netmap_send_up(na->ifp, &q); @@ -893,60 +901,89 @@ netmap_txsync_to_host(struct netmap_adapter *na) /* * rxsync backend for packets coming from the host stack. - * They have been put in the queue by netmap_transmit() so we - * need to protect access to the kring using a lock. + * They have been put in kring->rx_queue by netmap_transmit(). + * We protect access to the kring using kring->rx_queue.lock * * This routine also does the selrecord if called from the poll handler * (we know because td != NULL). * * NOTE: on linux, selrecord() is defined as a macro and uses pwait * as an additional hidden argument. + * returns the number of packets delivered to tx queues in + * transparent mode, or a negative value if error */ -static void +int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) { struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; struct netmap_ring *ring = kring->ring; - u_int j, n, lim = kring->nkr_num_slots; - u_int k = ring->cur, resvd = ring->reserved; + u_int nm_i, n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); + int ret = 0; + struct mbq *q = &kring->rx_queue; (void)pwait; /* disable unused warnings */ - if (kring->nkr_stopped) /* check a first time without lock */ - return; + if (head > lim) { + netmap_ring_reinit(kring); + return EINVAL; + } - mtx_lock(&kring->q_lock); + if (kring->nkr_stopped) /* check a first time without lock */ + return EBUSY; - if (kring->nkr_stopped) /* check again with lock held */ - goto unlock_out; + mtx_lock(&q->lock); - if (k >= lim) { - netmap_ring_reinit(kring); + if (kring->nkr_stopped) { /* check again with lock held */ + ret = EBUSY; goto unlock_out; } - /* new packets are already set in nr_hwavail */ - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... 
+ + /* First part: import newly received packets */ + n = mbq_len(q); + if (n) { /* grab packets from the queue */ + struct mbuf *m; + uint32_t stop_i; + + nm_i = kring->nr_hwtail; + stop_i = nm_prev(nm_i, lim); + while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) { + int len = MBUF_LEN(m); + struct netmap_slot *slot = &ring->slot[nm_i]; + + m_copydata(m, 0, len, BDG_NMB(na, slot)); + ND("nm %d len %d", nm_i, len); + if (netmap_verbose) + D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL)); + + slot->len = len; + slot->flags = kring->nkr_slot_flags; + nm_i = nm_next(nm_i, lim); } - k = (k >= resvd) ? k - resvd : k + lim - resvd; + kring->nr_hwtail = nm_i; } - if (j != k) { - n = k >= j ? k - j : k + lim - j; - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* something was released */ + if (netmap_fwd || kring->ring->flags & NR_FORWARD) + ret = netmap_sw_to_nic(na); + kring->nr_hwcur = head; } - k = ring->avail = kring->nr_hwavail - resvd; - if (k == 0 && td) + + nm_rxsync_finalize(kring); + + /* access copies of cur,tail in the kring */ + if (kring->rcur == kring->rtail && td) /* no bufs available */ selrecord(td, &kring->si); - if (k && (netmap_verbose & NM_VERB_HOST)) - D("%d pkts from stack", k); + unlock_out: - mtx_unlock(&kring->q_lock); + mtx_unlock(&q->lock); + return ret; } @@ -1042,7 +1079,7 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) // XXX add a refcount ? netmap_adapter_get(prev_na); } - D("Created generic NA %p (prev %p)", gna, gna->prev); + ND("Created generic NA %p (prev %p)", gna, gna->prev); return 0; } @@ -1113,154 +1150,167 @@ out: /* * validate parameters on entry for *_txsync() * Returns ring->cur if ok, or something >= kring->nkr_num_slots - * in case of error. The extra argument is a pointer to - * 'new_bufs'. XXX this may be deprecated at some point. + * in case of error. * - * Below is a correct configuration on input. ring->cur - * must be in the region covered by kring->hwavail, - * and ring->avail and kring->avail should end at the same slot. + * rhead, rcur and rtail=hwtail are stored from previous round. + * hwcur is the next packet to send to the ring. * - * +-hwcur - * | - * v<--hwres-->|<-----hwavail----> - * ------+------------------------------+-------- ring - * | - * |<---avail---> - * +--cur + * We want + * hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail * + * hwcur, rhead, rtail and hwtail are reliable */ u_int -nm_txsync_prologue(struct netmap_kring *kring, u_int *new_slots) +nm_txsync_prologue(struct netmap_kring *kring) { struct netmap_ring *ring = kring->ring; + u_int head = ring->head; /* read only once */ u_int cur = ring->cur; /* read only once */ - u_int avail = ring->avail; /* read only once */ u_int n = kring->nkr_num_slots; - u_int kstart, kend, a; -#if 1 /* kernel sanity checks */ - if (kring->nr_hwcur >= n || - kring->nr_hwreserved >= n || kring->nr_hwavail >= n || - kring->nr_hwreserved + kring->nr_hwavail >= n) + ND(5, "%s kcur %d ktail %d head %d cur %d tail %d", + kring->name, + kring->nr_hwcur, kring->nr_hwtail, + ring->head, ring->cur, ring->tail); +#if 1 /* kernel sanity checks; but we can trust the kring. 
*/ + if (kring->nr_hwcur >= n || kring->rhead >= n || + kring->rtail >= n || kring->nr_hwtail >= n) goto error; #endif /* kernel sanity checks */ - kstart = kring->nr_hwcur + kring->nr_hwreserved; - if (kstart >= n) - kstart -= n; - kend = kstart + kring->nr_hwavail; - /* user sanity checks. a is the expected avail */ - if (cur < kstart) { - /* too low, but maybe wraparound */ - if (cur + n > kend) + /* + * user sanity checks. We only use 'cur', + * A, B, ... are possible positions for cur: + * + * 0 A cur B tail C n-1 + * 0 D tail E cur F n-1 + * + * B, F, D are valid. A, C, E are wrong + */ + if (kring->rtail >= kring->rhead) { + /* want rhead <= head <= rtail */ + if (head < kring->rhead || head > kring->rtail) goto error; - *new_slots = cur + n - kstart; - a = kend - cur - n; - } else { - if (cur > kend) + /* and also head <= cur <= rtail */ + if (cur < head || cur > kring->rtail) + goto error; + } else { /* here rtail < rhead */ + /* we need head outside rtail .. rhead */ + if (head > kring->rtail && head < kring->rhead) goto error; - *new_slots = cur - kstart; - a = kend - cur; + + /* two cases now: head <= rtail or head >= rhead */ + if (head <= kring->rtail) { + /* want head <= cur <= rtail */ + if (cur < head || cur > kring->rtail) + goto error; + } else { /* head >= rhead */ + /* cur must be outside rtail..head */ + if (cur > kring->rtail && cur < head) + goto error; + } } - if (a != avail) { - RD(5, "wrong but fixable avail have %d need %d", - avail, a); - ring->avail = avail = a; + if (ring->tail != kring->rtail) { + RD(5, "tail overwritten was %d need %d", + ring->tail, kring->rtail); + ring->tail = kring->rtail; } - return cur; + kring->rhead = head; + kring->rcur = cur; + return head; error: - RD(5, "kring error: hwcur %d hwres %d hwavail %d cur %d av %d", + RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d", + kring->name, kring->nr_hwcur, - kring->nr_hwreserved, kring->nr_hwavail, - cur, avail); + kring->rcur, kring->nr_hwtail, + cur, ring->tail); return n; } /* * validate parameters on entry for *_rxsync() - * Returns ring->cur - ring->reserved if ok, - * or something >= kring->nkr_num_slots - * in case of error. The extra argument is a pointer to - * 'resvd'. XXX this may be deprecated at some point. + * Returns ring->head if ok, kring->nkr_num_slots on error. * - * Below is a correct configuration on input. ring->cur and - * ring->reserved must be in the region covered by kring->hwavail, - * and ring->avail and kring->avail should end at the same slot. + * For a valid configuration, + * hwcur <= head <= cur <= tail <= hwtail * - * +-hwcur - * | - * v<-------hwavail----------> - * ---------+--------------------------+-------- ring - * |<--res-->| - * |<---avail---> - * +--cur + * We only consider head and cur. + * hwcur and hwtail are reliable. * */ u_int -nm_rxsync_prologue(struct netmap_kring *kring, u_int *resvd) +nm_rxsync_prologue(struct netmap_kring *kring) { struct netmap_ring *ring = kring->ring; - u_int cur = ring->cur; /* read only once */ - u_int avail = ring->avail; /* read only once */ - u_int res = ring->reserved; /* read only once */ - u_int n = kring->nkr_num_slots; - u_int kend = kring->nr_hwcur + kring->nr_hwavail; - u_int a; + uint32_t const n = kring->nkr_num_slots; + uint32_t head, cur; + ND("%s kc %d kt %d h %d c %d t %d", + kring->name, + kring->nr_hwcur, kring->nr_hwtail, + ring->head, ring->cur, ring->tail); + /* + * Before storing the new values, we should check they do not + * move backwards. 
However: + * - head is not an issue because the previous value is hwcur; + * - cur could in principle go back, however it does not matter + * because we are processing a brand new rxsync() + */ + cur = kring->rcur = ring->cur; /* read only once */ + head = kring->rhead = ring->head; /* read only once */ #if 1 /* kernel sanity checks */ - if (kring->nr_hwcur >= n || kring->nr_hwavail >= n) + if (kring->nr_hwcur >= n || kring->nr_hwtail >= n) goto error; #endif /* kernel sanity checks */ /* user sanity checks */ - if (res >= n) - goto error; - /* check that cur is valid, a is the expected value of avail */ - if (cur < kring->nr_hwcur) { - /* too low, but maybe wraparound */ - if (cur + n > kend) + if (kring->nr_hwtail >= kring->nr_hwcur) { + /* want hwcur <= rhead <= hwtail */ + if (head < kring->nr_hwcur || head > kring->nr_hwtail) goto error; - a = kend - (cur + n); - } else { - if (cur > kend) + /* and also rhead <= rcur <= hwtail */ + if (cur < head || cur > kring->nr_hwtail) goto error; - a = kend - cur; - } - if (a != avail) { - RD(5, "wrong but fixable avail have %d need %d", - avail, a); - ring->avail = avail = a; - } - if (res != 0) { - /* then repeat the check for cur + res */ - cur = (cur >= res) ? cur - res : n + cur - res; - if (cur < kring->nr_hwcur) { - /* too low, but maybe wraparound */ - if (cur + n > kend) - goto error; - } else if (cur > kend) { + } else { + /* we need rhead outside hwtail..hwcur */ + if (head < kring->nr_hwcur && head > kring->nr_hwtail) goto error; + /* two cases now: head <= hwtail or head >= hwcur */ + if (head <= kring->nr_hwtail) { + /* want head <= cur <= hwtail */ + if (cur < head || cur > kring->nr_hwtail) + goto error; + } else { + /* cur must be outside hwtail..head */ + if (cur < head && cur > kring->nr_hwtail) + goto error; } } - *resvd = res; - return cur; + if (ring->tail != kring->rtail) { + RD(5, "%s tail overwritten was %d need %d", + kring->name, + ring->tail, kring->rtail); + ring->tail = kring->rtail; + } + return head; error: - RD(5, "kring error: hwcur %d hwres %d hwavail %d cur %d av %d res %d", + RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d", kring->nr_hwcur, - kring->nr_hwreserved, kring->nr_hwavail, - ring->cur, avail, res); + kring->rcur, kring->nr_hwtail, + kring->rhead, kring->rcur, ring->tail); return n; } + /* * Error routine called when txsync/rxsync detects an error. - * Can't do much more than resetting cur = hwcur, avail = hwavail. + * Can't do much more than resetting head =cur = hwcur, tail = hwtail * Return 1 on reinit. * * This routine is only called by the upper half of the kernel. * It only reads hwcur (which is changed only by the upper half, too) - * and hwavail (which may be changed by the lower half, but only on + * and hwtail (which may be changed by the lower half, but only on * a tx ring and only to increase it, so any error will be recovered * on the next call). For the above, we don't strictly need to call * it under lock. 
@@ -1274,36 +1324,38 @@ netmap_ring_reinit(struct netmap_kring *kring) // XXX KASSERT nm_kr_tryget RD(10, "called for %s", NM_IFPNAME(kring->na->ifp)); + // XXX probably wrong to trust userspace + kring->rhead = ring->head; + kring->rcur = ring->cur; + kring->rtail = ring->tail; + if (ring->cur > lim) errors++; + if (ring->head > lim) + errors++; + if (ring->tail > lim) + errors++; for (i = 0; i <= lim; i++) { u_int idx = ring->slot[i].buf_idx; u_int len = ring->slot[i].len; if (idx < 2 || idx >= netmap_total_buffers) { - if (!errors++) - D("bad buffer at slot %d idx %d len %d ", i, idx, len); + RD(5, "bad index at slot %d idx %d len %d ", i, idx, len); ring->slot[i].buf_idx = 0; ring->slot[i].len = 0; } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { ring->slot[i].len = 0; - if (!errors++) - D("bad len %d at slot %d idx %d", - len, i, idx); + RD(5, "bad len at slot %d idx %d len %d", i, idx, len); } } if (errors) { - int pos = kring - kring->na->tx_rings; - int n = kring->na->num_tx_rings + 1; - RD(10, "total %d errors", errors); - errors++; - RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", - NM_IFPNAME(kring->na->ifp), - pos < n ? "TX" : "RX", pos < n ? pos : pos - n, + RD(10, "%s reinit, cur %d -> %d tail %d -> %d", + kring->name, ring->cur, kring->nr_hwcur, - ring->avail, kring->nr_hwavail); - ring->cur = kring->nr_hwcur; - ring->avail = kring->nr_hwavail; + ring->tail, kring->nr_hwtail); + ring->head = kring->rhead = kring->nr_hwcur; + ring->cur = kring->rcur = kring->nr_hwcur; + ring->tail = kring->rtail = kring->nr_hwtail; } return (errors ? 1 : 0); } @@ -1436,7 +1488,6 @@ out: * - NIOCGINFO * - SIOCGIFADDR just for convenience * - NIOCREGIF - * - NIOCUNREGIF * - NIOCTXSYNC * - NIOCRXSYNC * @@ -1472,6 +1523,17 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, } while (0) #endif /* linux */ + if (cmd == NIOCGINFO || cmd == NIOCREGIF) { + /* truncate name */ + nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; + if (nmr->nr_version != NETMAP_API) { + D("API mismatch for %s got %d need %d", + nmr->nr_name, + nmr->nr_version, NETMAP_API); + nmr->nr_version = NETMAP_API; + return EINVAL; + } + } CURVNET_SET(TD_TO_VNET(td)); error = devfs_get_cdevpriv((void **)&priv); @@ -1482,16 +1544,8 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, return (error == ENOENT ? ENXIO : error); } - nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ switch (cmd) { case NIOCGINFO: /* return capabilities etc */ - if (nmr->nr_version != NETMAP_API) { - D("API mismatch got %d have %d", - nmr->nr_version, NETMAP_API); - nmr->nr_version = NETMAP_API; - error = EINVAL; - break; - } if (nmr->nr_cmd == NETMAP_BDG_LIST) { error = netmap_bdg_ctl(nmr, NULL); break; @@ -1531,11 +1585,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, break; case NIOCREGIF: - if (nmr->nr_version != NETMAP_API) { - nmr->nr_version = NETMAP_API; - error = EINVAL; - break; - } /* possibly attach/detach NIC and VALE switch */ i = nmr->nr_cmd; if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH @@ -1593,12 +1642,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, NMG_UNLOCK(); break; - case NIOCUNREGIF: - // XXX we have no data here ? 
- D("deprecated, data is %p", nmr); - error = EINVAL; - break; - case NIOCTXSYNC: case NIOCRXSYNC: nifp = priv->np_nifp; @@ -1649,7 +1692,11 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, D("pre txsync ring %d cur %d hwcur %d", i, kring->ring->cur, kring->nr_hwcur); - na->nm_txsync(na, i, NAF_FORCE_RECLAIM); + if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + netmap_ring_reinit(kring); + } else { + na->nm_txsync(na, i, NAF_FORCE_RECLAIM); + } if (netmap_verbose & NM_VERB_TXSYNC) D("post txsync ring %d cur %d hwcur %d", i, kring->ring->cur, @@ -1726,8 +1773,8 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) struct ifnet *ifp; struct netmap_kring *kring; u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; - u_int lim_tx, lim_rx, host_forwarded = 0; - struct mbq q; + u_int lim_tx, lim_rx; + struct mbq q; /* packets from hw queues to host stack */ void *pwait = dev; /* linux compatibility */ /* @@ -1735,7 +1782,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * txsync and rxsync if we decide to do a selrecord(). * retry_tx (and retry_rx, later) prevent looping forever. */ - int retry_tx = 1; + int retry_tx = 1, retry_rx = 1; (void)pwait; mbq_init(&q); @@ -1769,6 +1816,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) lim_rx = na->num_rx_rings; if (priv->np_qfirst == NETMAP_SW_RING) { + // XXX locking ? /* handle the host stack ring */ if (priv->np_txpoll || want_tx) { /* push any packets up, then we are always ready */ @@ -1777,29 +1825,15 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) } if (want_rx) { kring = &na->rx_rings[lim_rx]; - if (kring->ring->avail == 0) + /* XXX replace with rxprologue etc. */ + if (nm_ring_empty(kring->ring)) netmap_rxsync_from_host(na, td, dev); - if (kring->ring->avail > 0) { + if (!nm_ring_empty(kring->ring)) revents |= want_rx; - } } return (revents); } - /* - * If we are in transparent mode, check also the host rx ring - * XXX Transparent mode at the moment requires to bind all - * rings to a single file descriptor. - */ - kring = &na->rx_rings[lim_rx]; - if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all - && want_rx - && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) { - if (kring->ring->avail == 0) - netmap_rxsync_from_host(na, td, dev); - if (kring->ring->avail > 0) - revents |= want_rx; - } /* * check_all_{tx|rx} are set if the card has more than one queue AND @@ -1825,81 +1859,71 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * We start with a lock free round which is cheap if we have * slots available. If this fails, then lock and call the sync * routines. - * XXX rather than ring->avail >0 should check that - * ring->cur has not reached hwcur+hwavail */ for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { kring = &na->rx_rings[i]; - if (kring->ring->avail > 0) { + /* XXX compare ring->cur and kring->tail */ + if (!nm_ring_empty(kring->ring)) { revents |= want_rx; want_rx = 0; /* also breaks the loop */ } } for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { kring = &na->tx_rings[i]; - if (kring->ring->avail > 0) { + /* XXX compare ring->cur and kring->tail */ + if (!nm_ring_empty(kring->ring)) { revents |= want_tx; want_tx = 0; /* also breaks the loop */ } } /* - * If we to push packets out (priv->np_txpoll) or want_tx is - * still set, we do need to run the txsync calls (on all rings, - * to avoid that the tx rings stall). 
+ * If we want to push packets out (priv->np_txpoll) or + * want_tx is still set, we must issue txsync calls + * (on all rings, to avoid that the tx rings stall). * XXX should also check cur != hwcur on the tx rings. * Fortunately, normal tx mode has np_txpoll set. */ if (priv->np_txpoll || want_tx) { - /* If we really want to be woken up (want_tx), - * do a selrecord, either on the global or on - * the private structure. Then issue the txsync - * so there is no race in the selrecord/selwait + /* + * The first round checks if anyone is ready, if not + * do a selrecord and another round to handle races. + * want_tx goes to 0 if any space is found, and is + * used to skip rings with no pending transmissions. */ flush_tx: for (i = priv->np_qfirst; i < lim_tx; i++) { + int found = 0; + kring = &na->tx_rings[i]; - /* - * Skip this ring if want_tx == 0 - * (we have already done a successful sync on - * a previous ring) AND kring->cur == kring->hwcur - * (there are no pending transmissions for this ring). - */ if (!want_tx && kring->ring->cur == kring->nr_hwcur) continue; - /* make sure only one user thread is doing this */ + /* only one thread does txsync */ if (nm_kr_tryget(kring)) { - ND("ring %p busy is %d", - kring, (int)kring->nr_busy); - revents |= POLLERR; - goto out; + D("%p lost race on txring %d, ok", priv, i); + continue; } - - if (netmap_verbose & NM_VERB_TXSYNC) - D("send %d on %s %d", - kring->ring->cur, NM_IFPNAME(ifp), i); - if (na->nm_txsync(na, i, 0)) + if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + netmap_ring_reinit(kring); revents |= POLLERR; + } else { + if (na->nm_txsync(na, i, 0)) + revents |= POLLERR; + } - /* Check avail and call selrecord only if - * called with POLLOUT and run out of bufs. - * XXX Note, we cannot trust much ring->avail - * as it is exposed to userspace (even though - * just updated by txsync). We should really - * check kring->nr_hwavail or better have - * txsync set a flag telling if we need - * to do a selrecord(). + /* + * If we found new slots, notify potential + * listeners on the same ring. + * Since we just did a txsync, look at the copies + * of cur,tail in the kring. */ - if (want_tx) { - if (kring->ring->avail > 0) { - /* stop at the first ring. We don't risk - * starvation. - */ - revents |= want_tx; - want_tx = 0; - } - } + found = kring->rcur != kring->rtail; nm_kr_put(kring); + if (found) { /* notify other listeners */ + revents |= want_tx; + want_tx = 0; + na->nm_notify(na, i, NR_TX, NAF_GLOBAL_NOTIFY); + } } if (want_tx && retry_tx) { selrecord(td, check_all_tx ? @@ -1910,21 +1934,27 @@ flush_tx: } /* - * now if want_rx is still set we need to lock and rxsync. + * If want_rx is still set scan receive rings. * Do it on all rings because otherwise we starve. */ if (want_rx) { - int retry_rx = 1; + int send_down = 0; /* transparent mode */ + /* two rounds here to for race avoidance */ do_retry_rx: for (i = priv->np_qfirst; i < lim_rx; i++) { + int found = 0; + kring = &na->rx_rings[i]; if (nm_kr_tryget(kring)) { - revents |= POLLERR; - goto out; + D("%p lost race on rxring %d, ok", priv, i); + continue; } - /* XXX NR_FORWARD should only be read on + /* + * transparent mode support: collect packets + * from the rxring(s). 
+ * XXX NR_FORWARD should only be read on * physical or NIC ports */ if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { @@ -1939,49 +1969,65 @@ do_retry_rx: kring->ring->flags & NR_TIMESTAMP) { microtime(&kring->ring->ts); } - - if (kring->ring->avail > 0) { + /* after an rxsync we can use kring->rcur, rtail */ + found = kring->rcur != kring->rtail; + nm_kr_put(kring); + if (found) { revents |= want_rx; retry_rx = 0; + na->nm_notify(na, i, NR_RX, NAF_GLOBAL_NOTIFY); } - nm_kr_put(kring); } - if (retry_rx) { - retry_rx = 0; + + /* transparent mode XXX only during first pass ? */ + kring = &na->rx_rings[lim_rx]; + if (check_all_rx + && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { + /* XXX fix to use kring fields */ + if (nm_ring_empty(kring->ring)) + send_down = netmap_rxsync_from_host(na, td, dev); + if (!nm_ring_empty(kring->ring)) + revents |= want_rx; + } + + if (retry_rx) selrecord(td, check_all_rx ? &na->rx_si : &na->rx_rings[priv->np_qfirst].si); - goto do_retry_rx; + if (send_down > 0 || retry_rx) { + retry_rx = 0; + if (send_down) + goto flush_tx; /* and retry_rx */ + else + goto do_retry_rx; } } - /* forward host to the netmap ring. - * I am accessing nr_hwavail without lock, but netmap_transmit - * can only increment it, so the operation is safe. + /* + * Transparent mode: marked bufs on rx rings between + * kring->nr_hwcur and ring->head + * are passed to the other endpoint. + * + * In this mode we also scan the sw rxring, which in + * turn passes packets up. + * + * XXX Transparent mode at the moment requires to bind all + * rings to a single file descriptor. */ - kring = &na->rx_rings[lim_rx]; - if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all - && (netmap_fwd || kring->ring->flags & NR_FORWARD) - && kring->nr_hwavail > 0 && !host_forwarded) { - netmap_sw_to_nic(na); - host_forwarded = 1; /* prevent another pass */ - want_rx = 0; - goto flush_tx; - } if (q.head) netmap_send_up(na->ifp, &q); -out: - return (revents); } -/*------- driver support routines ------*/ + +/*-------------------- driver support routines -------------------*/ static int netmap_hw_krings_create(struct netmap_adapter *); static int -netmap_notify(struct netmap_adapter *na, u_int n_ring, enum txrx tx, int flags) +netmap_notify(struct netmap_adapter *na, u_int n_ring, + enum txrx tx, int flags) { struct netmap_kring *kring; @@ -2012,10 +2058,18 @@ netmap_attach_common(struct netmap_adapter *na) return EINVAL; } WNA(ifp) = na; + + /* the following is only needed for na that use the host port. + * XXX do we have something similar for linux ? + */ +#ifdef __FreeBSD__ + na->if_input = ifp->if_input; /* for netmap_send_up */ +#endif /* __FreeBSD__ */ + NETMAP_SET_CAPABLE(ifp); if (na->nm_krings_create == NULL) { na->nm_krings_create = netmap_hw_krings_create; - na->nm_krings_delete = netmap_krings_delete; + na->nm_krings_delete = netmap_hw_krings_delete; } if (na->nm_notify == NULL) na->nm_notify = netmap_notify; @@ -2051,12 +2105,8 @@ netmap_detach_common(struct netmap_adapter *na) * of hardware rings): * krings 0..N-1 are for the hardware queues. * kring N is for the host stack queue - * kring N+1 is only used for the selinfo for all queues. + * kring N+1 is only used for the selinfo for all queues. // XXX still true ? * Return 0 on success, ENOMEM otherwise. - * - * By default the receive and transmit adapter ring counts are both initialized - * to num_queues. na->num_tx_rings can be set for cards with different tx/rx - * setups. 
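
The kring layout described above (hardware krings 0..N-1, host-stack kring at index
N) is relied upon throughout the code that follows; for instance the host RX kring
is always reached as in this small illustrative helper (assuming the declarations
from dev/netmap/netmap_kern.h are in scope):

    /* kring N, just past the hardware rings; see netmap_transmit() below */
    static inline struct netmap_kring *
    host_rx_kring(struct netmap_adapter *na)
    {
            return (&na->rx_rings[na->num_rx_rings]);
    }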
*/ int netmap_attach(struct netmap_adapter *arg) @@ -2132,8 +2182,14 @@ NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) int netmap_hw_krings_create(struct netmap_adapter *na) { - return netmap_krings_create(na, + int ret = netmap_krings_create(na, na->num_tx_rings + 1, na->num_rx_rings + 1, 0); + if (ret == 0) { + /* initialize the mbq for the sw rx ring */ + mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue); + ND("initialized sw rx queue %d", na->num_rx_rings); + } + return ret; } @@ -2162,6 +2218,10 @@ netmap_detach(struct ifnet *ifp) /* * Intercept packets from the network stack and pass them * to netmap as incoming packets on the 'software' ring. + * + * We only store packets in a bounded mbq and then copy them + * in the relevant rxsync routine. + * * We rely on the OS to make sure that the ifp and na do not go * away (typically the caller checks for IFF_DRV_RUNNING or the like). * In nm_register() or whenever there is a reinitialization, @@ -2172,63 +2232,60 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m) { struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring; - u_int i, len = MBUF_LEN(m); - u_int error = EBUSY, lim; - struct netmap_slot *slot; + u_int len = MBUF_LEN(m); + u_int error = ENOBUFS; + struct mbq *q; + int space; // XXX [Linux] we do not need this lock // if we follow the down/configure/up protocol -gl // mtx_lock(&na->core_lock); + if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) { - /* interface not in netmap mode anymore */ + D("%s not in netmap mode anymore", NM_IFPNAME(ifp)); error = ENXIO; goto done; } kring = &na->rx_rings[na->num_rx_rings]; - lim = kring->nkr_num_slots - 1; - if (netmap_verbose & NM_VERB_HOST) - D("%s packet %d len %d from the stack", NM_IFPNAME(ifp), - kring->nr_hwcur + kring->nr_hwavail, len); + q = &kring->rx_queue; + // XXX reconsider long packets if we handle fragments if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */ D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp), len, NETMAP_BDG_BUF_SIZE(na->nm_mem)); goto done; } - /* protect against other instances of netmap_transmit, - * and userspace invocations of rxsync(). + + /* protect against rxsync_from_host(), netmap_sw_to_nic() + * and maybe other instances of netmap_transmit (the latter + * not possible on Linux). + * Also avoid overflowing the queue. 
*/ - // XXX [Linux] there can be no other instances of netmap_transmit - // on this same ring, but we still need this lock to protect - // concurrent access from netmap_sw_to_nic() -gl - mtx_lock(&kring->q_lock); - if (kring->nr_hwavail >= lim) { - if (netmap_verbose) - D("stack ring %s full\n", NM_IFPNAME(ifp)); + mtx_lock(&q->lock); + + space = kring->nr_hwtail - kring->nr_hwcur; + if (space < 0) + space += kring->nkr_num_slots; + if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX + RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p", + NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q), + len, m); } else { - /* compute the insert position */ - i = nm_kr_rxpos(kring); - slot = &kring->ring->slot[i]; - m_copydata(m, 0, (int)len, BDG_NMB(na, slot)); - slot->len = len; - slot->flags = kring->nkr_slot_flags; - kring->nr_hwavail++; - if (netmap_verbose & NM_VERB_HOST) - D("wake up host ring %s %d", NM_IFPNAME(na->ifp), na->num_rx_rings); - na->nm_notify(na, na->num_rx_rings, NR_RX, 0); + mbq_enqueue(q, m); + ND(10, "%s %d bufs in queue len %d m %p", + NM_IFPNAME(ifp), mbq_len(q), len, m); + /* notify outside the lock */ + m = NULL; error = 0; } - mtx_unlock(&kring->q_lock); + mtx_unlock(&q->lock); done: - // mtx_unlock(&na->core_lock); - - /* release the mbuf in either cases of success or failure. As an - * alternative, put the mbuf in a free list and free the list - * only when really necessary. - */ - m_freem(m); + if (m) + m_freem(m); + /* unconditionally wake up listeners */ + na->nm_notify(na, na->num_rx_rings, NR_RX, 0); return (error); } @@ -2267,27 +2324,32 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, if (n >= na->num_tx_rings) return NULL; kring = na->tx_rings + n; + // XXX check whether we should use hwcur or rcur new_hwofs = kring->nr_hwcur - new_cur; } else { if (n >= na->num_rx_rings) return NULL; kring = na->rx_rings + n; - new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur; + new_hwofs = kring->nr_hwtail - new_cur; } lim = kring->nkr_num_slots - 1; if (new_hwofs > lim) new_hwofs -= lim + 1; /* Always set the new offset value and realign the ring. */ - D("%s hwofs %d -> %d, hwavail %d -> %d", - tx == NR_TX ? "TX" : "RX", + if (netmap_verbose) + D("%s %s%d hwofs %d -> %d, hwtail %d -> %d", + NM_IFPNAME(na->ifp), + tx == NR_TX ? "TX" : "RX", n, kring->nkr_hwofs, new_hwofs, - kring->nr_hwavail, - tx == NR_TX ? lim : kring->nr_hwavail); + kring->nr_hwtail, + tx == NR_TX ? lim : kring->nr_hwtail); kring->nkr_hwofs = new_hwofs; - if (tx == NR_TX) - kring->nr_hwavail = lim; - kring->nr_hwreserved = 0; + if (tx == NR_TX) { + kring->nr_hwtail = kring->nr_hwcur + lim; + if (kring->nr_hwtail > lim) + kring->nr_hwtail -= lim + 1; + } #if 0 // def linux /* XXX check that the mappings are correct */ @@ -2351,6 +2413,7 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) } } + /* * Default functions to handle rx/tx interrupts from a physical device. * "work_done" is non-null on the RX path, NULL for the TX path. @@ -2397,6 +2460,7 @@ netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) static struct cdev *netmap_dev; /* /dev/netmap character device. 
*/ extern struct cdevsw netmap_cdevsw; + void netmap_fini(void) { @@ -2408,6 +2472,7 @@ netmap_fini(void) printf("netmap: unloaded module.\n"); } + int netmap_init(void) { diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index c2814146d2ef..6716168526dc 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Universita` di Pisa. All rights reserved. + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -86,21 +86,31 @@ netmap_catch_rx(struct netmap_adapter *na, int intercept) return 0; } + /* * Intercept the packet steering routine in the tx path, * so that we can decide which queue is used for an mbuf. * Second argument is non-zero to intercept, 0 to restore. * + * actually we also need to redirect the if_transmit ? + * * XXX see if FreeBSD has such a mechanism */ void -netmap_catch_packet_steering(struct netmap_generic_adapter *na, int enable) +netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) { + struct netmap_adapter *na = &gna->up.up; + struct ifnet *ifp = na->ifp; + if (enable) { + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_transmit; } else { + ifp->if_transmit = na->if_transmit; } } + /* Transmit routine used by generic_netmap_txsync(). Returns 0 on success * and non-zero on error (which may be packet drops or other errors). * addr and len identify the netmap buffer, m is the (preallocated) @@ -126,16 +136,16 @@ generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, // copy data to the mbuf m_copyback(m, 0, len, addr); - // inc refcount. We are alone, so we can skip the atomic atomic_fetchadd_int(m->m_ext.ref_cnt, 1); m->m_flags |= M_FLOWID; m->m_pkthdr.flowid = ring_nr; m->m_pkthdr.rcvif = ifp; /* used for tx notification */ - ret = ifp->if_transmit(ifp, m); + ret = NA(ifp)->if_transmit(ifp, m); return ret; } + /* * The following two functions are empty until we have a generic * way to extract the info from the ifp @@ -147,6 +157,7 @@ generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) return 0; } + void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) { @@ -155,6 +166,7 @@ generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) *rxq = 1; } + void netmap_mitigation_init(struct netmap_generic_adapter *na) { ND("called"); @@ -167,22 +179,26 @@ void netmap_mitigation_start(struct netmap_generic_adapter *na) ND("called"); } + void netmap_mitigation_restart(struct netmap_generic_adapter *na) { ND("called"); } + int netmap_mitigation_active(struct netmap_generic_adapter *na) { ND("called"); return 0; } + void netmap_mitigation_cleanup(struct netmap_generic_adapter *na) { ND("called"); } + /* * In order to track whether pages are still mapped, we hook into * the standard cdev_pager and intercept the constructor and @@ -194,6 +210,7 @@ struct netmap_vm_handle_t { struct netmap_priv_d *priv; }; + static int netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) @@ -218,6 +235,7 @@ netmap_dev_pager_dtor(void *handle) dev_rel(dev); } + static int netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres) diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c index 2c42db3f8862..109a734cac9f 100644 --- 
a/sys/dev/netmap/netmap_generic.c +++ b/sys/dev/netmap/netmap_generic.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Universita` di Pisa. All rights reserved. + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -82,7 +82,7 @@ __FBSDID("$FreeBSD$"); #include <dev/netmap/netmap_mem2.h> #define rtnl_lock() D("rtnl_lock called"); -#define rtnl_unlock() D("rtnl_lock called"); +#define rtnl_unlock() D("rtnl_unlock called"); #define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) #define smp_mb() @@ -101,9 +101,9 @@ __FBSDID("$FreeBSD$"); * (or reinstall the buffer ?) */ #define SET_MBUF_DESTRUCTOR(m, fn) do { \ - (m)->m_ext.ext_free = (void *)fn; \ - (m)->m_ext.ext_type = EXT_EXTREF; \ - } while (0) + (m)->m_ext.ext_free = (void *)fn; \ + (m)->m_ext.ext_type = EXT_EXTREF; \ +} while (0) #define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *(m)->m_ext.ref_cnt : -1) @@ -137,43 +137,43 @@ __FBSDID("$FreeBSD$"); #ifdef RATE #define IFRATE(x) x struct rate_stats { - unsigned long txpkt; - unsigned long txsync; - unsigned long txirq; - unsigned long rxpkt; - unsigned long rxirq; - unsigned long rxsync; + unsigned long txpkt; + unsigned long txsync; + unsigned long txirq; + unsigned long rxpkt; + unsigned long rxirq; + unsigned long rxsync; }; struct rate_context { - unsigned refcount; - struct timer_list timer; - struct rate_stats new; - struct rate_stats old; + unsigned refcount; + struct timer_list timer; + struct rate_stats new; + struct rate_stats old; }; #define RATE_PRINTK(_NAME_) \ - printk( #_NAME_ " = %lu Hz\n", (cur._NAME_ - ctx->old._NAME_)/RATE_PERIOD); + printk( #_NAME_ " = %lu Hz\n", (cur._NAME_ - ctx->old._NAME_)/RATE_PERIOD); #define RATE_PERIOD 2 static void rate_callback(unsigned long arg) { - struct rate_context * ctx = (struct rate_context *)arg; - struct rate_stats cur = ctx->new; - int r; - - RATE_PRINTK(txpkt); - RATE_PRINTK(txsync); - RATE_PRINTK(txirq); - RATE_PRINTK(rxpkt); - RATE_PRINTK(rxsync); - RATE_PRINTK(rxirq); - printk("\n"); - - ctx->old = cur; - r = mod_timer(&ctx->timer, jiffies + - msecs_to_jiffies(RATE_PERIOD * 1000)); - if (unlikely(r)) - D("[v1000] Error: mod_timer()"); + struct rate_context * ctx = (struct rate_context *)arg; + struct rate_stats cur = ctx->new; + int r; + + RATE_PRINTK(txpkt); + RATE_PRINTK(txsync); + RATE_PRINTK(txirq); + RATE_PRINTK(rxpkt); + RATE_PRINTK(rxsync); + RATE_PRINTK(rxirq); + printk("\n"); + + ctx->old = cur; + r = mod_timer(&ctx->timer, jiffies + + msecs_to_jiffies(RATE_PERIOD * 1000)); + if (unlikely(r)) + D("[v1000] Error: mod_timer()"); } static struct rate_context rate_ctx; @@ -197,150 +197,150 @@ netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done) if (unlikely(!(ifp->if_capenable & IFCAP_NETMAP))) return; - netmap_common_irq(ifp, q, work_done); + netmap_common_irq(ifp, q, work_done); } /* Enable/disable netmap mode for a generic network interface. 
*/ -int generic_netmap_register(struct netmap_adapter *na, int enable) +static int +generic_netmap_register(struct netmap_adapter *na, int enable) { - struct ifnet *ifp = na->ifp; - struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; - struct mbuf *m; - int error; - int i, r; + struct ifnet *ifp = na->ifp; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct mbuf *m; + int error; + int i, r; - if (!na) - return EINVAL; + if (!na) + return EINVAL; #ifdef REG_RESET - error = ifp->netdev_ops->ndo_stop(ifp); - if (error) { - return error; - } + error = ifp->netdev_ops->ndo_stop(ifp); + if (error) { + return error; + } #endif /* REG_RESET */ - if (enable) { /* Enable netmap mode. */ - /* Initialize the rx queue, as generic_rx_handler() can - * be called as soon as netmap_catch_rx() returns. - */ - for (r=0; r<na->num_rx_rings; r++) { - mbq_safe_init(&na->rx_rings[r].rx_queue); - na->rx_rings[r].nr_ntc = 0; - } - - /* Init the mitigation timer. */ - netmap_mitigation_init(gna); - - /* - * Preallocate packet buffers for the tx rings. - */ - for (r=0; r<na->num_tx_rings; r++) { - na->tx_rings[r].nr_ntc = 0; - na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *), - M_DEVBUF, M_NOWAIT | M_ZERO); - if (!na->tx_rings[r].tx_pool) { - D("tx_pool allocation failed"); - error = ENOMEM; - goto free_tx_pool; - } - for (i=0; i<na->num_tx_desc; i++) { - m = netmap_get_mbuf(GENERIC_BUF_SIZE); - if (!m) { - D("tx_pool[%d] allocation failed", i); - error = ENOMEM; - goto free_mbufs; - } - na->tx_rings[r].tx_pool[i] = m; - } - } - rtnl_lock(); - /* Prepare to intercept incoming traffic. */ - error = netmap_catch_rx(na, 1); - if (error) { - D("netdev_rx_handler_register() failed"); - goto register_handler; - } - ifp->if_capenable |= IFCAP_NETMAP; - - /* Make netmap control the packet steering. */ - netmap_catch_packet_steering(gna, 1); - - rtnl_unlock(); + if (enable) { /* Enable netmap mode. */ + /* Initialize the rx queue, as generic_rx_handler() can + * be called as soon as netmap_catch_rx() returns. + */ + for (r=0; r<na->num_rx_rings; r++) { + mbq_safe_init(&na->rx_rings[r].rx_queue); + } + + /* Init the mitigation timer. */ + netmap_mitigation_init(gna); + + /* + * Preallocate packet buffers for the tx rings. + */ + for (r=0; r<na->num_tx_rings; r++) + na->tx_rings[r].tx_pool = NULL; + for (r=0; r<na->num_tx_rings; r++) { + na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (!na->tx_rings[r].tx_pool) { + D("tx_pool allocation failed"); + error = ENOMEM; + goto free_tx_pools; + } + for (i=0; i<na->num_tx_desc; i++) + na->tx_rings[r].tx_pool[i] = NULL; + for (i=0; i<na->num_tx_desc; i++) { + m = netmap_get_mbuf(GENERIC_BUF_SIZE); + if (!m) { + D("tx_pool[%d] allocation failed", i); + error = ENOMEM; + goto free_tx_pools; + } + na->tx_rings[r].tx_pool[i] = m; + } + } + rtnl_lock(); + /* Prepare to intercept incoming traffic. */ + error = netmap_catch_rx(na, 1); + if (error) { + D("netdev_rx_handler_register() failed"); + goto register_handler; + } + ifp->if_capenable |= IFCAP_NETMAP; + + /* Make netmap control the packet steering. 
*/ + netmap_catch_tx(gna, 1); + + rtnl_unlock(); #ifdef RATE - if (rate_ctx.refcount == 0) { - D("setup_timer()"); - memset(&rate_ctx, 0, sizeof(rate_ctx)); - setup_timer(&rate_ctx.timer, &rate_callback, (unsigned long)&rate_ctx); - if (mod_timer(&rate_ctx.timer, jiffies + msecs_to_jiffies(1500))) { - D("Error: mod_timer()"); - } - } - rate_ctx.refcount++; + if (rate_ctx.refcount == 0) { + D("setup_timer()"); + memset(&rate_ctx, 0, sizeof(rate_ctx)); + setup_timer(&rate_ctx.timer, &rate_callback, (unsigned long)&rate_ctx); + if (mod_timer(&rate_ctx.timer, jiffies + msecs_to_jiffies(1500))) { + D("Error: mod_timer()"); + } + } + rate_ctx.refcount++; #endif /* RATE */ - } else { /* Disable netmap mode. */ - rtnl_lock(); + } else { /* Disable netmap mode. */ + rtnl_lock(); - ifp->if_capenable &= ~IFCAP_NETMAP; + ifp->if_capenable &= ~IFCAP_NETMAP; - /* Release packet steering control. */ - netmap_catch_packet_steering(gna, 0); + /* Release packet steering control. */ + netmap_catch_tx(gna, 0); - /* Do not intercept packets on the rx path. */ - netmap_catch_rx(na, 0); + /* Do not intercept packets on the rx path. */ + netmap_catch_rx(na, 0); - rtnl_unlock(); + rtnl_unlock(); - /* Free the mbufs going to the netmap rings */ - for (r=0; r<na->num_rx_rings; r++) { - mbq_safe_purge(&na->rx_rings[r].rx_queue); - mbq_safe_destroy(&na->rx_rings[r].rx_queue); - } + /* Free the mbufs going to the netmap rings */ + for (r=0; r<na->num_rx_rings; r++) { + mbq_safe_purge(&na->rx_rings[r].rx_queue); + mbq_safe_destroy(&na->rx_rings[r].rx_queue); + } - netmap_mitigation_cleanup(gna); + netmap_mitigation_cleanup(gna); - for (r=0; r<na->num_tx_rings; r++) { - for (i=0; i<na->num_tx_desc; i++) { - m_freem(na->tx_rings[r].tx_pool[i]); - } - free(na->tx_rings[r].tx_pool, M_DEVBUF); - } + for (r=0; r<na->num_tx_rings; r++) { + for (i=0; i<na->num_tx_desc; i++) { + m_freem(na->tx_rings[r].tx_pool[i]); + } + free(na->tx_rings[r].tx_pool, M_DEVBUF); + } #ifdef RATE - if (--rate_ctx.refcount == 0) { - D("del_timer()"); - del_timer(&rate_ctx.timer); - } + if (--rate_ctx.refcount == 0) { + D("del_timer()"); + del_timer(&rate_ctx.timer); + } #endif - } + } #ifdef REG_RESET - error = ifp->netdev_ops->ndo_open(ifp); - if (error) { - goto alloc_tx_pool; - } + error = ifp->netdev_ops->ndo_open(ifp); + if (error) { + goto alloc_tx_pool; + } #endif - return 0; + return 0; register_handler: - rtnl_unlock(); -free_tx_pool: - r--; - i = na->num_tx_desc; /* Useless, but just to stay safe. 
*/ -free_mbufs: - i--; - for (; r>=0; r--) { - for (; i>=0; i--) { - m_freem(na->tx_rings[r].tx_pool[i]); - } - free(na->tx_rings[r].tx_pool, M_DEVBUF); - i = na->num_tx_desc - 1; - } - - return error; + rtnl_unlock(); +free_tx_pools: + for (r=0; r<na->num_tx_rings; r++) { + if (na->tx_rings[r].tx_pool == NULL) + continue; + for (i=0; i<na->num_tx_desc; i++) + if (na->tx_rings[r].tx_pool[i]) + m_freem(na->tx_rings[r].tx_pool[i]); + free(na->tx_rings[r].tx_pool, M_DEVBUF); + } + + return error; } /* @@ -351,93 +351,88 @@ free_mbufs: static void generic_mbuf_destructor(struct mbuf *m) { - if (netmap_verbose) - D("Tx irq (%p) queue %d", m, MBUF_TXQ(m)); - netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL); + if (netmap_verbose) + D("Tx irq (%p) queue %d", m, MBUF_TXQ(m)); + netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL); #ifdef __FreeBSD__ - m->m_ext.ext_type = EXT_PACKET; - m->m_ext.ext_free = NULL; - if (*(m->m_ext.ref_cnt) == 0) - *(m->m_ext.ref_cnt) = 1; - uma_zfree(zone_pack, m); + m->m_ext.ext_type = EXT_PACKET; + m->m_ext.ext_free = NULL; + if (*(m->m_ext.ref_cnt) == 0) + *(m->m_ext.ref_cnt) = 1; + uma_zfree(zone_pack, m); #endif /* __FreeBSD__ */ - IFRATE(rate_ctx.new.txirq++); + IFRATE(rate_ctx.new.txirq++); } -/* Record completed transmissions and update hwavail. +/* Record completed transmissions and update hwtail. * - * nr_ntc is the oldest tx buffer not yet completed - * (same as nr_hwavail + nr_hwcur + 1), + * The oldest tx buffer not yet completed is at nr_hwtail + 1, * nr_hwcur is the first unsent buffer. - * When cleaning, we try to recover buffers between nr_ntc and nr_hwcur. */ -static int +static u_int generic_netmap_tx_clean(struct netmap_kring *kring) { - u_int num_slots = kring->nkr_num_slots; - u_int ntc = kring->nr_ntc; - u_int hwcur = kring->nr_hwcur; - u_int n = 0; - struct mbuf **tx_pool = kring->tx_pool; - - while (ntc != hwcur) { /* buffers not completed */ - struct mbuf *m = tx_pool[ntc]; - - if (unlikely(m == NULL)) { - /* try to replenish the entry */ - tx_pool[ntc] = m = netmap_get_mbuf(GENERIC_BUF_SIZE); - if (unlikely(m == NULL)) { - D("mbuf allocation failed, XXX error"); - // XXX how do we proceed ? break ? - return -ENOMEM; - } - } else if (GET_MBUF_REFCNT(m) != 1) { - break; /* This mbuf is still busy: its refcnt is 2. */ + u_int const lim = kring->nkr_num_slots - 1; + u_int nm_i = nm_next(kring->nr_hwtail, lim); + u_int hwcur = kring->nr_hwcur; + u_int n = 0; + struct mbuf **tx_pool = kring->tx_pool; + + while (nm_i != hwcur) { /* buffers not completed */ + struct mbuf *m = tx_pool[nm_i]; + + if (unlikely(m == NULL)) { + /* this is done, try to replenish the entry */ + tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE); + if (unlikely(m == NULL)) { + D("mbuf allocation failed, XXX error"); + // XXX how do we proceed ? break ? + return -ENOMEM; + } + } else if (GET_MBUF_REFCNT(m) != 1) { + break; /* This mbuf is still busy: its refcnt is 2. */ + } + n++; + nm_i = nm_next(nm_i, lim); } - if (unlikely(++ntc == num_slots)) { - ntc = 0; - } - n++; - } - kring->nr_ntc = ntc; - kring->nr_hwavail += n; - ND("tx completed [%d] -> hwavail %d", n, kring->nr_hwavail); - - return n; + kring->nr_hwtail = nm_prev(nm_i, lim); + ND("tx completed [%d] -> hwtail %d", n, kring->nr_hwtail); + + return n; } /* - * We have pending packets in the driver between nr_ntc and j. + * We have pending packets in the driver between nr_hwtail +1 and hwcur. * Compute a position in the middle, to be used to generate * a notification. 
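
(A worked example of this computation, with illustrative numbers: on a ring of
n = 256 slots with nr_hwtail = 199 and hwcur = 40, the oldest pending buffer is
ntc = nm_next(199, 255) = 200; since hwcur < ntc the pending region wraps, so
e = (40 + 256 + 200) / 2 = 248, i.e. the completion event is attached to
tx_pool[248], halfway through the 96 outstanding mbufs that run from slot 200
around to slot 39.)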
*/ static inline u_int generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur) { - u_int n = kring->nkr_num_slots; - u_int ntc = kring->nr_ntc; - u_int e; - - if (hwcur >= ntc) { - e = (hwcur + ntc) / 2; - } else { /* wrap around */ - e = (hwcur + n + ntc) / 2; - if (e >= n) { - e -= n; - } - } - - if (unlikely(e >= n)) { - D("This cannot happen"); - e = 0; - } - - return e; + u_int n = kring->nkr_num_slots; + u_int ntc = nm_next(kring->nr_hwtail, n-1); + u_int e; + + if (hwcur >= ntc) { + e = (hwcur + ntc) / 2; + } else { /* wrap around */ + e = (hwcur + n + ntc) / 2; + if (e >= n) { + e -= n; + } + } + + if (unlikely(e >= n)) { + D("This cannot happen"); + e = 0; + } + + return e; } /* - * We have pending packets in the driver between nr_ntc and hwcur. + * We have pending packets in the driver between nr_hwtail+1 and hwcur. * Schedule a notification approximately in the middle of the two. * There is a race but this is only called within txsync which does * a double check. @@ -445,28 +440,28 @@ generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur) static void generic_set_tx_event(struct netmap_kring *kring, u_int hwcur) { - struct mbuf *m; - u_int e; - - if (kring->nr_ntc == hwcur) { - return; - } - e = generic_tx_event_middle(kring, hwcur); - - m = kring->tx_pool[e]; - if (m == NULL) { - /* This can happen if there is already an event on the netmap - slot 'e': There is nothing to do. */ - return; - } - ND("Event at %d mbuf %p refcnt %d", e, m, GET_MBUF_REFCNT(m)); - kring->tx_pool[e] = NULL; - SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor); - - // XXX wmb() ? - /* Decrement the refcount an free it if we have the last one. */ - m_freem(m); - smp_mb(); + struct mbuf *m; + u_int e; + + if (nm_next(kring->nr_hwtail, kring->nkr_num_slots -1) == hwcur) { + return; /* all buffers are free */ + } + e = generic_tx_event_middle(kring, hwcur); + + m = kring->tx_pool[e]; + if (m == NULL) { + /* This can happen if there is already an event on the netmap + slot 'e': There is nothing to do. */ + return; + } + ND("Event at %d mbuf %p refcnt %d", e, m, GET_MBUF_REFCNT(m)); + kring->tx_pool[e] = NULL; + SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor); + + // XXX wmb() ? + /* Decrement the refcount an free it if we have the last one. */ + m_freem(m); + smp_mb(); } @@ -480,133 +475,108 @@ generic_set_tx_event(struct netmap_kring *kring, u_int hwcur) static int generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct ifnet *ifp = na->ifp; - struct netmap_kring *kring = &na->tx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, k, num_slots = kring->nkr_num_slots; - int new_slots, ntx; - - IFRATE(rate_ctx.new.txsync++); - - // TODO: handle the case of mbuf allocation failure - /* first, reclaim completed buffers */ - generic_netmap_tx_clean(kring); - - /* Take a copy of ring->cur now, and never read it again. */ - k = ring->cur; - if (unlikely(k >= num_slots)) { - return netmap_ring_reinit(kring); - } - - rmb(); - j = kring->nr_hwcur; - /* - * 'new_slots' counts how many new slots have been added: - * everything from hwcur to cur, excluding reserved ones, if any. - * nr_hwreserved start from hwcur and counts how many slots were - * not sent to the NIC from the previous round. - */ - new_slots = k - j - kring->nr_hwreserved; - if (new_slots < 0) { - new_slots += num_slots; - } - ntx = 0; - if (j != k) { - /* Process new packets to send: - * j is the current index in the netmap ring. 
+ struct ifnet *ifp = na->ifp; + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + u_int nm_i; /* index into the netmap ring */ // j + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; + + IFRATE(rate_ctx.new.txsync++); + + // TODO: handle the case of mbuf allocation failure + + rmb(); + + /* + * First part: process new packets to send. */ - while (j != k) { - struct netmap_slot *slot = &ring->slot[j]; /* Current slot in the netmap ring */ - void *addr = NMB(slot); - u_int len = slot->len; - struct mbuf *m; - int tx_ret; - - if (unlikely(addr == netmap_buffer_base || len > NETMAP_BUF_SIZE)) { - return netmap_ring_reinit(kring); - } - /* Tale a mbuf from the tx pool and copy in the user packet. */ - m = kring->tx_pool[j]; - if (unlikely(!m)) { - RD(5, "This should never happen"); - kring->tx_pool[j] = m = netmap_get_mbuf(GENERIC_BUF_SIZE); - if (unlikely(m == NULL)) { - D("mbuf allocation failed"); - break; - } - } - /* XXX we should ask notifications when NS_REPORT is set, - * or roughly every half frame. We can optimize this - * by lazily requesting notifications only when a - * transmission fails. Probably the best way is to - * break on failures and set notifications when - * ring->avail == 0 || j != k - */ - tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr); - if (unlikely(tx_ret)) { - RD(5, "start_xmit failed: err %d [%u,%u,%u,%u]", - tx_ret, kring->nr_ntc, j, k, kring->nr_hwavail); - /* - * No room for this mbuf in the device driver. - * Request a notification FOR A PREVIOUS MBUF, - * then call generic_netmap_tx_clean(kring) to do the - * double check and see if we can free more buffers. - * If there is space continue, else break; - * NOTE: the double check is necessary if the problem - * occurs in the txsync call after selrecord(). - * Also, we need some way to tell the caller that not - * all buffers were queued onto the device (this was - * not a problem with native netmap driver where space - * is preallocated). The bridge has a similar problem - * and we solve it there by dropping the excess packets. - */ - generic_set_tx_event(kring, j); - if (generic_netmap_tx_clean(kring)) { /* space now available */ - continue; - } else { - break; - } - } - slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); - if (unlikely(++j == num_slots)) - j = 0; - ntx++; - } - - /* Update hwcur to the next slot to transmit. */ - kring->nr_hwcur = j; - - /* - * Report all new slots as unavailable, even those not sent. - * We account for them with with hwreserved, so that - * nr_hwreserved =:= cur - nr_hwcur + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + while (nm_i != head) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; + void *addr = NMB(slot); + + /* device-specific */ + struct mbuf *m; + int tx_ret; + + NM_CHECK_ADDR_LEN(addr, len); + + /* Tale a mbuf from the tx pool and copy in the user packet. */ + m = kring->tx_pool[nm_i]; + if (unlikely(!m)) { + RD(5, "This should never happen"); + kring->tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE); + if (unlikely(m == NULL)) { + D("mbuf allocation failed"); + break; + } + } + /* XXX we should ask notifications when NS_REPORT is set, + * or roughly every half frame. We can optimize this + * by lazily requesting notifications only when a + * transmission fails. 
Probably the best way is to + * break on failures and set notifications when + * ring->cur == ring->tail || nm_i != cur + */ + tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr); + if (unlikely(tx_ret)) { + RD(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]", + tx_ret, nm_i, head, kring->nr_hwtail); + /* + * No room for this mbuf in the device driver. + * Request a notification FOR A PREVIOUS MBUF, + * then call generic_netmap_tx_clean(kring) to do the + * double check and see if we can free more buffers. + * If there is space continue, else break; + * NOTE: the double check is necessary if the problem + * occurs in the txsync call after selrecord(). + * Also, we need some way to tell the caller that not + * all buffers were queued onto the device (this was + * not a problem with native netmap driver where space + * is preallocated). The bridge has a similar problem + * and we solve it there by dropping the excess packets. + */ + generic_set_tx_event(kring, nm_i); + if (generic_netmap_tx_clean(kring)) { /* space now available */ + continue; + } else { + break; + } + } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + nm_i = nm_next(nm_i, lim); + } + + /* Update hwcur to the next slot to transmit. */ + kring->nr_hwcur = nm_i; /* not head, we could break early */ + + IFRATE(rate_ctx.new.txpkt += ntx); + } + + /* + * Second, reclaim completed buffers */ - kring->nr_hwavail -= new_slots; - kring->nr_hwreserved = k - j; - if (kring->nr_hwreserved < 0) { - kring->nr_hwreserved += num_slots; - } - - IFRATE(rate_ctx.new.txpkt += ntx); - - if (!kring->nr_hwavail) { - /* No more available slots? Set a notification event - * on a netmap slot that will be cleaned in the future. - * No doublecheck is performed, since txsync() will be - * called twice by netmap_poll(). - */ - generic_set_tx_event(kring, j); - } - ND("tx #%d, hwavail = %d", n, kring->nr_hwavail); - } - - /* Synchronize the user's view to the kernel view. */ - ring->avail = kring->nr_hwavail; - ring->reserved = kring->nr_hwreserved; - - return 0; + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + /* No more available slots? Set a notification event + * on a netmap slot that will be cleaned in the future. + * No doublecheck is performed, since txsync() will be + * called twice by netmap_poll(). + */ + generic_set_tx_event(kring, nm_i); + } + ND("tx #%d, hwtail = %d", n, kring->nr_hwtail); + + generic_netmap_tx_clean(kring); + + nm_txsync_finalize(kring); + + return 0; } + /* * This handler is registered (through netmap_catch_rx()) * within the attached network interface @@ -615,38 +585,38 @@ generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) * Stolen packets are put in a queue where the * generic_netmap_rxsync() callback can extract them. 
*/ -void generic_rx_handler(struct ifnet *ifp, struct mbuf *m) +void +generic_rx_handler(struct ifnet *ifp, struct mbuf *m) { - struct netmap_adapter *na = NA(ifp); - struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; - u_int work_done; - u_int rr = 0; // receive ring number - - ND("called"); - /* limit the size of the queue */ - if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) { - m_freem(m); - } else { - mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m); - } - - if (netmap_generic_mit < 32768) { - /* no rx mitigation, pass notification up */ - netmap_generic_irq(na->ifp, rr, &work_done); - IFRATE(rate_ctx.new.rxirq++); - } else { - /* same as send combining, filter notification if there is a - * pending timer, otherwise pass it up and start a timer. - */ - if (likely(netmap_mitigation_active(gna))) { - /* Record that there is some pending work. */ - gna->mit_pending = 1; - } else { - netmap_generic_irq(na->ifp, rr, &work_done); - IFRATE(rate_ctx.new.rxirq++); - netmap_mitigation_start(gna); - } - } + struct netmap_adapter *na = NA(ifp); + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + u_int work_done; + u_int rr = 0; // receive ring number + + /* limit the size of the queue */ + if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) { + m_freem(m); + } else { + mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m); + } + + if (netmap_generic_mit < 32768) { + /* no rx mitigation, pass notification up */ + netmap_generic_irq(na->ifp, rr, &work_done); + IFRATE(rate_ctx.new.rxirq++); + } else { + /* same as send combining, filter notification if there is a + * pending timer, otherwise pass it up and start a timer. + */ + if (likely(netmap_mitigation_active(gna))) { + /* Record that there is some pending work. */ + gna->mit_pending = 1; + } else { + netmap_generic_irq(na->ifp, rr, &work_done); + IFRATE(rate_ctx.new.rxirq++); + netmap_mitigation_start(gna); + } + } } /* @@ -658,105 +628,99 @@ void generic_rx_handler(struct ifnet *ifp, struct mbuf *m) static int generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct netmap_kring *kring = &na->rx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, n, lim = kring->nkr_num_slots - 1; - int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k, resvd = ring->reserved; - - if (ring->cur > lim) - return netmap_ring_reinit(kring); - - /* Import newly received packets into the netmap ring. */ - if (netmap_no_pendintr || force_update) { - uint16_t slot_flags = kring->nkr_slot_flags; - struct mbuf *m; - - n = 0; - j = kring->nr_ntc; /* first empty slot in the receive ring */ - /* extract buffers from the rx queue, stop at most one - * slot before nr_hwcur (index k) + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + u_int nm_i; /* index into the netmap ring */ //j, + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; + + if (head > lim) + return netmap_ring_reinit(kring); + + /* + * First part: import newly received packets. 
+ */ + if (netmap_no_pendintr || force_update) { + /* extract buffers from the rx queue, stop at most one + * slot before nr_hwcur (stop_i) + */ + uint16_t slot_flags = kring->nkr_slot_flags; + u_int stop_i = nm_prev(kring->nr_hwcur, lim); + + nm_i = kring->nr_hwtail; /* first empty slot in the receive ring */ + for (n = 0; nm_i != stop_i; n++) { + int len; + void *addr = NMB(&ring->slot[nm_i]); + struct mbuf *m; + + /* we only check the address here on generic rx rings */ + if (addr == netmap_buffer_base) { /* Bad buffer */ + return netmap_ring_reinit(kring); + } + /* + * Call the locked version of the function. + * XXX Ideally we could grab a batch of mbufs at once + * and save some locking overhead. + */ + m = mbq_safe_dequeue(&kring->rx_queue); + if (!m) /* no more data */ + break; + len = MBUF_LEN(m); + m_copydata(m, 0, len, addr); + ring->slot[nm_i].len = len; + ring->slot[nm_i].flags = slot_flags; + m_freem(m); + nm_i = nm_next(nm_i, lim); + n++; + } + if (n) { + kring->nr_hwtail = nm_i; + IFRATE(rate_ctx.new.rxpkt += n); + } + kring->nr_kflags &= ~NKR_PENDINTR; + } + + // XXX should we invert the order ? + /* + * Second part: skip past packets that userspace has released. */ - k = (kring->nr_hwcur) ? kring->nr_hwcur-1 : lim; - while (j != k) { - int len; - void *addr = NMB(&ring->slot[j]); - - if (addr == netmap_buffer_base) { /* Bad buffer */ - return netmap_ring_reinit(kring); - } - /* - * Call the locked version of the function. - * XXX Ideally we could grab a batch of mbufs at once, - * by changing rx_queue into a ring. - */ - m = mbq_safe_dequeue(&kring->rx_queue); - if (!m) - break; - len = MBUF_LEN(m); - m_copydata(m, 0, len, addr); - ring->slot[j].len = len; - ring->slot[j].flags = slot_flags; - m_freem(m); - if (unlikely(j++ == lim)) - j = 0; - n++; - } - if (n) { - kring->nr_ntc = j; - kring->nr_hwavail += n; - IFRATE(rate_ctx.new.rxpkt += n); - } - kring->nr_kflags &= ~NKR_PENDINTR; - } - - // XXX should we invert the order ? - /* Skip past packets that userspace has released */ - j = kring->nr_hwcur; - k = ring->cur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { - /* Userspace has released some packets. */ - for (n = 0; j != k; n++) { - struct netmap_slot *slot = &ring->slot[j]; - - slot->flags &= ~NS_BUF_CHANGED; - if (unlikely(j++ == lim)) - j = 0; - } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; - } - /* Tell userspace that there are new packets. */ - ring->avail = kring->nr_hwavail - resvd; - IFRATE(rate_ctx.new.rxsync++); - - return 0; + nm_i = kring->nr_hwcur; + if (nm_i != head) { + /* Userspace has released some packets. */ + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + + slot->flags &= ~NS_BUF_CHANGED; + nm_i = nm_next(nm_i, lim); + } + kring->nr_hwcur = head; + } + /* tell userspace that there might be new packets. 
*/ + nm_rxsync_finalize(kring); + IFRATE(rate_ctx.new.rxsync++); + + return 0; } static void generic_netmap_dtor(struct netmap_adapter *na) { - struct ifnet *ifp = na->ifp; - struct netmap_generic_adapter *gna = (struct netmap_generic_adapter*)na; - struct netmap_adapter *prev_na = gna->prev; - - if (prev_na != NULL) { - D("Released generic NA %p", gna); - if_rele(na->ifp); - netmap_adapter_put(prev_na); - } - if (ifp != NULL) { - WNA(ifp) = prev_na; - D("Restored native NA %p", prev_na); - na->ifp = NULL; - } + struct ifnet *ifp = na->ifp; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter*)na; + struct netmap_adapter *prev_na = gna->prev; + + if (prev_na != NULL) { + D("Released generic NA %p", gna); + if_rele(na->ifp); + netmap_adapter_put(prev_na); + } + if (ifp != NULL) { + WNA(ifp) = prev_na; + D("Restored native NA %p", prev_na); + na->ifp = NULL; + } } /* @@ -773,46 +737,46 @@ generic_netmap_dtor(struct netmap_adapter *na) int generic_netmap_attach(struct ifnet *ifp) { - struct netmap_adapter *na; - struct netmap_generic_adapter *gna; - int retval; - u_int num_tx_desc, num_rx_desc; - - num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */ - - generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); - ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc); - - gna = malloc(sizeof(*gna), M_DEVBUF, M_NOWAIT | M_ZERO); - if (gna == NULL) { - D("no memory on attach, give up"); - return ENOMEM; - } - na = (struct netmap_adapter *)gna; - na->ifp = ifp; - na->num_tx_desc = num_tx_desc; - na->num_rx_desc = num_rx_desc; - na->nm_register = &generic_netmap_register; - na->nm_txsync = &generic_netmap_txsync; - na->nm_rxsync = &generic_netmap_rxsync; - na->nm_dtor = &generic_netmap_dtor; - /* when using generic, IFCAP_NETMAP is set so we force - * NAF_SKIP_INTR to use the regular interrupt handler - */ - na->na_flags = NAF_SKIP_INTR; - - ND("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)", - ifp->num_tx_queues, ifp->real_num_tx_queues, - ifp->tx_queue_len); - ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)", - ifp->num_rx_queues, ifp->real_num_rx_queues); - - generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings); - - retval = netmap_attach_common(na); - if (retval) { - free(gna, M_DEVBUF); - } - - return retval; + struct netmap_adapter *na; + struct netmap_generic_adapter *gna; + int retval; + u_int num_tx_desc, num_rx_desc; + + num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */ + + generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); + ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc); + + gna = malloc(sizeof(*gna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (gna == NULL) { + D("no memory on attach, give up"); + return ENOMEM; + } + na = (struct netmap_adapter *)gna; + na->ifp = ifp; + na->num_tx_desc = num_tx_desc; + na->num_rx_desc = num_rx_desc; + na->nm_register = &generic_netmap_register; + na->nm_txsync = &generic_netmap_txsync; + na->nm_rxsync = &generic_netmap_rxsync; + na->nm_dtor = &generic_netmap_dtor; + /* when using generic, IFCAP_NETMAP is set so we force + * NAF_SKIP_INTR to use the regular interrupt handler + */ + na->na_flags = NAF_SKIP_INTR; + + ND("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)", + ifp->num_tx_queues, ifp->real_num_tx_queues, + ifp->tx_queue_len); + ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)", + ifp->num_rx_queues, ifp->real_num_rx_queues); + + generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings); + + retval = 
netmap_attach_common(na); + if (retval) { + free(gna, M_DEVBUF); + } + + return retval; } diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index 9381cd4cedd3..74a46297ff3d 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -1,6 +1,6 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. - * Copyright (C) 2013 Universita` di Pisa. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -53,7 +53,7 @@ #define NM_SELINFO_T struct selinfo #define MBUF_LEN(m) ((m)->m_pkthdr.len) #define MBUF_IFP(m) ((m)->m_pkthdr.rcvif) -#define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m) +#define NM_SEND_UP(ifp, m) ((NA(ifp))->if_input)(ifp, m) #define NM_ATOMIC_T volatile int // XXX ? /* atomic operations */ @@ -76,7 +76,11 @@ struct hrtimer { #define NM_SELINFO_T wait_queue_head_t #define MBUF_LEN(m) ((m)->len) #define MBUF_IFP(m) ((m)->dev) -#define NM_SEND_UP(ifp, m) netif_rx(m) +#define NM_SEND_UP(ifp, m) \ + do { \ + m->priority = NM_MAGIC_PRIORITY; \ + netif_rx(m); \ + } while (0) #define NM_ATOMIC_T volatile long unsigned int @@ -125,9 +129,9 @@ struct hrtimer { do { \ struct timeval __xxts; \ microtime(&__xxts); \ - printf("%03d.%06d %s [%d] " format "\n", \ + printf("%03d.%06d [%4d] %-25s " format "\n", \ (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ - __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + __LINE__, __FUNCTION__, ##__VA_ARGS__); \ } while (0) /* rate limited, lps indicates how many per second */ @@ -158,15 +162,23 @@ extern NMG_LOCK_T netmap_global_lock; * a ring across system calls. * * nr_hwcur index of the next buffer to refill. - * It corresponds to ring->cur - ring->reserved + * It corresponds to ring->head + * at the time the system call returns. * - * nr_hwavail the number of slots "owned" by userspace. - * nr_hwavail =:= ring->avail + ring->reserved + * nr_hwtail index of the first buffer owned by the kernel. + * On RX, hwcur->hwtail are receive buffers + * not yet released. hwcur is advanced following + * ring->head, hwtail is advanced on incoming packets, + * and a wakeup is generated when hwtail passes ring->cur + * On TX, hwcur->rcur have been filled by the sender + * but not sent yet to the NIC; rcur->hwtail are available + * for new transmissions, and hwtail->hwcur-1 are pending + * transmissions not yet acknowledged. * * The indexes in the NIC and netmap rings are offset by nkr_hwofs slots. * This is so that, on a reset, buffers owned by userspace are not * modified by the kernel. In particular: - * RX rings: the next empty buffer (hwcur + hwavail + hwofs) coincides with + * RX rings: the next empty buffer (hwtail + hwofs) coincides with * the next empty buffer as known by the hardware (next_to_check or so). * TX rings: hwcur + hwofs coincides with next_to_send * @@ -184,44 +196,76 @@ extern NMG_LOCK_T netmap_global_lock; * from nr_hwlease, advances it, then does the * copy outside the lock. 
* In RX rings (used for VALE ports), - * nkr_hwcur + nkr_hwavail <= nkr_hwlease < nkr_hwcur+N-1 + * nkr_hwtail <= nkr_hwlease < nkr_hwcur+N-1 * In TX rings (used for NIC or host stack ports) - * nkr_hwcur <= nkr_hwlease < nkr_hwcur+ nkr_hwavail + * nkr_hwcur <= nkr_hwlease < nkr_hwtail * nkr_leases array of nkr_num_slots where writers can report * completion of their block. NR_NOSLOT (~0) indicates * that the writer has not finished yet * nkr_lease_idx index of next free slot in nr_leases, to be assigned * * The kring is manipulated by txsync/rxsync and generic netmap function. - * q_lock is used to arbitrate access to the kring from within the netmap - * code, and this and other protections guarantee that there is never - * more than 1 concurrent call to txsync or rxsync. So we are free - * to manipulate the kring from within txsync/rxsync without any extra - * locks. + * + * Concurrent rxsync or txsync on the same ring are prevented through + * by nm_kr_lock() which in turn uses nr_busy. This is all we need + * for NIC rings, and for TX rings attached to the host stack. + * + * RX rings attached to the host stack use an mbq (rx_queue) on both + * rxsync_from_host() and netmap_transmit(). The mbq is protected + * by its internal lock. + * + * RX rings attached to the VALE switch are accessed by both sender + * and receiver. They are protected through the q_lock on the RX ring. */ struct netmap_kring { - struct netmap_ring *ring; - uint32_t nr_hwcur; - uint32_t nr_hwavail; - uint32_t nr_kflags; /* private driver flags */ - int32_t nr_hwreserved; -#define NKR_PENDINTR 0x1 // Pending interrupt. - uint32_t nkr_num_slots; - int32_t nkr_hwofs; /* offset between NIC and netmap ring */ + struct netmap_ring *ring; + + uint32_t nr_hwcur; + uint32_t nr_hwtail; + + /* + * Copies of values in user rings, so we do not need to look + * at the ring (which could be modified). These are set in the + * *sync_prologue()/finalize() routines. + */ + uint32_t rhead; + uint32_t rcur; + uint32_t rtail; + + uint32_t nr_kflags; /* private driver flags */ +#define NKR_PENDINTR 0x1 // Pending interrupt. + uint32_t nkr_num_slots; + + /* + * On a NIC reset, the NIC ring indexes may be reset but the + * indexes in the netmap rings remain the same. nkr_hwofs + * keeps track of the offset between the two. + */ + int32_t nkr_hwofs; uint16_t nkr_slot_flags; /* initial value for flags */ + + /* last_reclaim is opaque marker to help reduce the frequency + * of operations such as reclaiming tx buffers. A possible use + * is set it to ticks and do the reclaim only once per tick. + */ + uint64_t last_reclaim; + + + NM_SELINFO_T si; /* poll/select wait queue */ + NM_LOCK_T q_lock; /* protects kring and ring. */ + NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */ + struct netmap_adapter *na; - struct nm_bdg_fwd *nkr_ft; - uint32_t *nkr_leases; -#define NR_NOSLOT ((uint32_t)~0) - uint32_t nkr_hwlease; - uint32_t nkr_lease_idx; - NM_SELINFO_T si; /* poll/select wait queue */ - NM_LOCK_T q_lock; /* protects kring and ring. */ - NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */ + /* The folloiwing fields are for VALE switch support */ + struct nm_bdg_fwd *nkr_ft; + uint32_t *nkr_leases; +#define NR_NOSLOT ((uint32_t)~0) /* used in nkr_*lease* */ + uint32_t nkr_hwlease; + uint32_t nkr_lease_idx; - volatile int nkr_stopped; + volatile int nkr_stopped; // XXX what for ? /* support for adapters without native netmap support. 
* On tx rings we preallocate an array of tx buffers @@ -230,8 +274,11 @@ struct netmap_kring { * XXX who writes to the rx queue ? */ struct mbuf **tx_pool; - u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */ - struct mbq rx_queue; /* A queue for intercepted rx mbufs. */ + // u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */ + struct mbq rx_queue; /* intercepted rx mbufs. */ + + uint32_t ring_id; /* debugging */ + char name[64]; /* diagnostic */ } __attribute__((__aligned__(64))); @@ -243,6 +290,15 @@ nm_next(uint32_t i, uint32_t lim) return unlikely (i == lim) ? 0 : i + 1; } + +/* return the previous index, with wraparound */ +static inline uint32_t +nm_prev(uint32_t i, uint32_t lim) +{ + return unlikely (i == 0) ? lim : i - 1; +} + + /* * * Here is the layout for the Rx and Tx rings. @@ -253,36 +309,36 @@ nm_next(uint32_t i, uint32_t lim) | | | | |XXX free slot XXX| |XXX free slot XXX| +-----------------+ +-----------------+ - | |<-hwcur | |<-hwcur - | reserved h | | (ready | - +----------- w -+ | to be | - cur->| a | | sent) h | - | v | +---------- w | - | a | cur->| (being a | - | i | | prepared) v | - | avail l | | a | - +-----------------+ + a ------ i + - | | ... | v l |<-hwlease - | (being | ... | a | ... - | prepared) | ... | i | ... - +-----------------+ ... | l | ... - | |<-hwlease +-----------------+ +head->| owned by user |<-hwcur | not sent to nic |<-hwcur + | | | yet | + +-----------------+ | | + cur->| available to | | | + | user, not read | +-----------------+ + | yet | cur->| (being | + | | | prepared) | | | | | + +-----------------+ + ------ + +tail->| |<-hwtail | |<-hwlease + | (being | ... | | ... + | prepared) | ... | | ... + +-----------------+ ... | | ... + | |<-hwlease +-----------------+ + | | tail->| |<-hwtail | | | | | | | | | | | | +-----------------+ +-----------------+ - * The cur/avail (user view) and hwcur/hwavail (kernel view) + * The cur/tail (user view) and hwcur/hwtail (kernel view) * are used in the normal operation of the card. * * When a ring is the output of a switch port (Rx ring for * a VALE port, Tx ring for the host stack or NIC), slots * are reserved in blocks through 'hwlease' which points * to the next unused slot. - * On an Rx ring, hwlease is always after hwavail, - * and completions cause avail to advance. - * On a Tx ring, hwlease is always between cur and hwavail, + * On an Rx ring, hwlease is always after hwtail, + * and completions cause hwtail to advance. + * On a Tx ring, hwlease is always between cur and hwtail, * and completions cause cur to advance. * * nm_kr_space() returns the maximum number of slots that @@ -294,7 +350,6 @@ nm_next(uint32_t i, uint32_t lim) - enum txrx { NR_RX = 0, NR_TX = 1 }; /* @@ -349,6 +404,7 @@ struct netmap_adapter { */ struct netmap_kring *tx_rings; /* array of TX rings. */ struct netmap_kring *rx_rings; /* array of RX rings. */ + void *tailroom; /* space below the rings array */ /* (used for leases) */ @@ -360,11 +416,38 @@ struct netmap_adapter { */ int (*if_transmit)(struct ifnet *, struct mbuf *); + /* copy of if_input for netmap_send_up() */ + void (*if_input)(struct ifnet *, struct mbuf *); + /* references to the ifnet and device routines, used by * the generic netmap functions. */ struct ifnet *ifp; /* adapter is ifp->if_softc */ + /*---- callbacks for this netmap adapter -----*/ + /* + * nm_dtor() is the cleanup routine called when destroying + * the adapter. 
+ * + * nm_register() is called on NIOCREGIF and close() to enter + * or exit netmap mode on the NIC + * + * nm_txsync() pushes packets to the underlying hw/switch + * + * nm_rxsync() collects packets from the underlying hw/switch + * + * nm_config() returns configuration information from the OS + * + * nm_krings_create() XXX + * + * nm_krings_delete() XXX + * + * nm_notify() is used to act after data have become available. + * For hw devices this is typically a selwakeup(), + * but for NIC/host ports attached to a switch (or vice-versa) + * we also need to invoke the 'txsync' code downstream. + */ + /* private cleanup */ void (*nm_dtor)(struct netmap_adapter *); @@ -403,6 +486,7 @@ struct netmap_adapter { void *na_private; }; + /* * If the NIC is owned by the kernel * (i.e., bridge), neither another bridge nor user can use it; @@ -433,13 +517,15 @@ struct netmap_vp_adapter { /* VALE software port */ u_int offset; /* Offset of ethernet header for each packet. */ }; + struct netmap_hw_adapter { /* physical device */ struct netmap_adapter up; struct net_device_ops nm_ndo; // XXX linux only }; -struct netmap_generic_adapter { /* non-native device */ + +struct netmap_generic_adapter { /* emulated device */ struct netmap_hw_adapter up; /* Pointer to a previously used netmap adapter. */ @@ -455,16 +541,20 @@ struct netmap_generic_adapter { /* non-native device */ struct hrtimer mit_timer; int mit_pending; +#ifdef linux + netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *); +#endif }; #ifdef WITH_VALE -/* bridge wrapper for non VALE ports. It is used to connect real devices to the bridge. +/* + * Bridge wrapper for non VALE ports attached to a VALE switch. * - * The real device must already have its own netmap adapter (hwna). The - * bridge wrapper and the hwna adapter share the same set of netmap rings and - * buffers, but they have two separate sets of krings descriptors, with tx/rx - * meanings swapped: + * The real device must already have its own netmap adapter (hwna). + * The bridge wrapper and the hwna adapter share the same set of + * netmap rings and buffers, but they have two separate sets of + * krings descriptors, with tx/rx meanings swapped: * * netmap * bwrap krings rings krings hwna @@ -478,23 +568,28 @@ struct netmap_generic_adapter { /* non-native device */ * | | +------+ +-----+ +------+ | | * +------+ +------+ * - * - packets coming from the bridge go to the brwap rx rings, which are also the - * hwna tx rings. The bwrap notify callback will then complete the hwna tx - * (see netmap_bwrap_notify). - * - packets coming from the outside go to the hwna rx rings, which are also the - * bwrap tx rings. The (overwritten) hwna notify method will then complete - * the bridge tx (see netmap_bwrap_intr_notify). + * - packets coming from the bridge go to the brwap rx rings, + * which are also the hwna tx rings. The bwrap notify callback + * will then complete the hwna tx (see netmap_bwrap_notify). * - * The bridge wrapper may optionally connect the hwna 'host' rings to the - * bridge. This is done by using a second port in the bridge and connecting it - * to the 'host' netmap_vp_adapter contained in the netmap_bwrap_adapter. - * The brwap host adapter cross-links the hwna host rings in the same way as shown above. + * - packets coming from the outside go to the hwna rx rings, + * which are also the bwrap tx rings. The (overwritten) hwna + * notify method will then complete the bridge tx + * (see netmap_bwrap_intr_notify). 
* - * - packets coming from the bridge and directed to host stack are handled by the - * bwrap host notify callback (see netmap_bwrap_host_notify) - * - packets coming from the host stack are still handled by the overwritten - * hwna notify callback (netmap_bwrap_intr_notify), but are diverted to the - * host adapter depending on the ring number. + * The bridge wrapper may optionally connect the hwna 'host' rings + * to the bridge. This is done by using a second port in the + * bridge and connecting it to the 'host' netmap_vp_adapter + * contained in the netmap_bwrap_adapter. The brwap host adapter + * cross-links the hwna host rings in the same way as shown above. + * + * - packets coming from the bridge and directed to the host stack + * are handled by the bwrap host notify callback + * (see netmap_bwrap_host_notify) + * + * - packets coming from the host stack are still handled by the + * overwritten hwna notify callback (netmap_bwrap_intr_notify), + * but are diverted to the host adapter depending on the ring number. * */ struct netmap_bwrap_adapter { @@ -505,103 +600,39 @@ struct netmap_bwrap_adapter { /* backup of the hwna notify callback */ int (*save_notify)(struct netmap_adapter *, u_int ring, enum txrx, int flags); - /* When we attach a physical interface to the bridge, we + + /* + * When we attach a physical interface to the bridge, we * allow the controlling process to terminate, so we need * a place to store the netmap_priv_d data structure. - * This is only done when physical interfaces are attached to a bridge. + * This is only done when physical interfaces + * are attached to a bridge. */ struct netmap_priv_d *na_kpriv; }; -/* - * Available space in the ring. Only used in VALE code - */ -static inline uint32_t -nm_kr_space(struct netmap_kring *k, int is_rx) -{ - int space; - - if (is_rx) { - int busy = k->nkr_hwlease - k->nr_hwcur + k->nr_hwreserved; - if (busy < 0) - busy += k->nkr_num_slots; - space = k->nkr_num_slots - 1 - busy; - } else { - space = k->nr_hwcur + k->nr_hwavail - k->nkr_hwlease; - if (space < 0) - space += k->nkr_num_slots; - } -#if 0 - // sanity check - if (k->nkr_hwlease >= k->nkr_num_slots || - k->nr_hwcur >= k->nkr_num_slots || - k->nr_hwavail >= k->nkr_num_slots || - busy < 0 || - busy >= k->nkr_num_slots) { - D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, - k->nkr_lease_idx, k->nkr_num_slots); - } -#endif - return space; -} - - +#endif /* WITH_VALE */ -/* make a lease on the kring for N positions. 
return the - * lease index - */ +/* return slots reserved to rx clients; used in drivers */ static inline uint32_t -nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) +nm_kr_rxspace(struct netmap_kring *k) { - uint32_t lim = k->nkr_num_slots - 1; - uint32_t lease_idx = k->nkr_lease_idx; - - k->nkr_leases[lease_idx] = NR_NOSLOT; - k->nkr_lease_idx = nm_next(lease_idx, lim); + int space = k->nr_hwtail - k->nr_hwcur; + if (space < 0) + space += k->nkr_num_slots; + ND("preserving %d rx slots %d -> %d", space, k->nr_hwcur, k->nr_hwtail); - if (n > nm_kr_space(k, is_rx)) { - D("invalid request for %d slots", n); - panic("x"); - } - /* XXX verify that there are n slots */ - k->nkr_hwlease += n; - if (k->nkr_hwlease > lim) - k->nkr_hwlease -= lim + 1; - - if (k->nkr_hwlease >= k->nkr_num_slots || - k->nr_hwcur >= k->nkr_num_slots || - k->nr_hwavail >= k->nkr_num_slots || - k->nkr_lease_idx >= k->nkr_num_slots) { - D("invalid kring %s, cur %d avail %d lease %d lease_idx %d lim %d", - k->na->ifp->if_xname, - k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, - k->nkr_lease_idx, k->nkr_num_slots); - } - return lease_idx; + return space; } -#endif /* WITH_VALE */ -/* return update position */ -static inline uint32_t -nm_kr_rxpos(struct netmap_kring *k) +/* True if no space in the tx ring. only valid after txsync_prologue */ +static inline int +nm_kr_txempty(struct netmap_kring *kring) { - uint32_t pos = k->nr_hwcur + k->nr_hwavail; - if (pos >= k->nkr_num_slots) - pos -= k->nkr_num_slots; -#if 0 - if (pos >= k->nkr_num_slots || - k->nkr_hwlease >= k->nkr_num_slots || - k->nr_hwcur >= k->nkr_num_slots || - k->nr_hwavail >= k->nkr_num_slots || - k->nkr_lease_idx >= k->nkr_num_slots) { - D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, - k->nkr_lease_idx, k->nkr_num_slots); - } -#endif - return pos; + return kring->rcur == kring->nr_hwtail; } @@ -613,11 +644,13 @@ nm_kr_rxpos(struct netmap_kring *k) #define NM_KR_BUSY 1 #define NM_KR_STOPPED 2 + static __inline void nm_kr_put(struct netmap_kring *kr) { NM_ATOMIC_CLEAR(&kr->nr_busy); } + static __inline int nm_kr_tryget(struct netmap_kring *kr) { /* check a first time without taking the lock @@ -640,7 +673,7 @@ static __inline int nm_kr_tryget(struct netmap_kring *kr) /* - * The following are support routines used by individual drivers to + * The following functions are used by individual drivers to * support netmap operation. * * netmap_attach() initializes a struct netmap_adapter, allocating the @@ -666,7 +699,17 @@ struct netmap_slot *netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur); int netmap_ring_reinit(struct netmap_kring *); -/* set/clear native flags. XXX maybe also if_transmit ? 
*/ +/* default functions to handle rx/tx interrupts */ +int netmap_rx_irq(struct ifnet *, u_int, u_int *); +#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) +void netmap_common_irq(struct ifnet *, u_int, u_int *work_done); + +void netmap_disable_all_rings(struct ifnet *); +void netmap_enable_all_rings(struct ifnet *); +void netmap_disable_ring(struct netmap_kring *kr); + + +/* set/clear native flags and if_transmit/netdev_ops */ static inline void nm_set_native_flags(struct netmap_adapter *na) { @@ -685,6 +728,7 @@ nm_set_native_flags(struct netmap_adapter *na) #endif } + static inline void nm_clear_native_flags(struct netmap_adapter *na) { @@ -701,36 +745,58 @@ nm_clear_native_flags(struct netmap_adapter *na) #endif } + /* - * validates parameters in the ring/kring, returns a value for cur, - * and the 'new_slots' value in the argument. - * If any error, returns cur > lim to force a reinit. + * validates parameters in the ring/kring, returns a value for head + * If any error, returns ring_size to force a reinit. */ -u_int nm_txsync_prologue(struct netmap_kring *, u_int *); +uint32_t nm_txsync_prologue(struct netmap_kring *); + /* - * validates parameters in the ring/kring, returns a value for cur, + * validates parameters in the ring/kring, returns a value for head, * and the 'reserved' value in the argument. - * If any error, returns cur > lim to force a reinit. + * If any error, returns ring_size lim to force a reinit. + */ +uint32_t nm_rxsync_prologue(struct netmap_kring *); + + +/* + * update kring and ring at the end of txsync. */ -u_int nm_rxsync_prologue(struct netmap_kring *, u_int *); +static inline void +nm_txsync_finalize(struct netmap_kring *kring) +{ + /* update ring head/tail to what the kernel knows */ + kring->ring->tail = kring->rtail = kring->nr_hwtail; + kring->ring->head = kring->rhead = kring->nr_hwcur; + + /* note, head/rhead/hwcur might be behind cur/rcur + * if no carrier + */ + ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d", + kring->name, kring->nr_hwcur, kring->nr_hwtail, + kring->rhead, kring->rcur, kring->rtail); +} + /* - * update kring and ring at the end of txsync + * update kring and ring at the end of rxsync */ static inline void -nm_txsync_finalize(struct netmap_kring *kring, u_int cur) +nm_rxsync_finalize(struct netmap_kring *kring) { - /* recompute hwreserved */ - kring->nr_hwreserved = cur - kring->nr_hwcur; - if (kring->nr_hwreserved < 0) - kring->nr_hwreserved += kring->nkr_num_slots; - - /* update avail and reserved to what the kernel knows */ - kring->ring->avail = kring->nr_hwavail; - kring->ring->reserved = kring->nr_hwreserved; + /* tell userspace that there might be new packets */ + //struct netmap_ring *ring = kring->ring; + ND("head %d cur %d tail %d -> %d", ring->head, ring->cur, ring->tail, + kring->nr_hwtail); + kring->ring->tail = kring->rtail = kring->nr_hwtail; + /* make a copy of the state for next round */ + kring->rhead = kring->ring->head; + kring->rcur = kring->ring->cur; } + /* check/fix address and len in tx rings */ #if 1 /* debug version */ #define NM_CHECK_ADDR_LEN(_a, _l) do { \ @@ -755,6 +821,8 @@ nm_txsync_finalize(struct netmap_kring *kring, u_int cur) int netmap_update_config(struct netmap_adapter *na); int netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom); void netmap_krings_delete(struct netmap_adapter *na); +int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait); + struct netmap_if * netmap_do_regif(struct netmap_priv_d *priv, 
struct netmap_adapter *na, @@ -766,10 +834,13 @@ u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg); int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create); int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na); + #ifdef WITH_VALE /* - * The following bridge-related interfaces are used by other kernel modules - * In the version that only supports unicast or broadcast, the lookup + * The following bridge-related functions are used by other + * kernel modules. + * + * VALE only supports unicast or broadcast. The lookup * function can return 0 .. NM_BDG_MAXPORTS-1 for regular ports, * NM_BDG_MAXPORTS for broadcast, NM_BDG_MAXPORTS+1 for unknown. * XXX in practice "unknown" might be handled same as broadcast. @@ -799,8 +870,6 @@ int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func); /* Various prototypes */ int netmap_poll(struct cdev *dev, int events, struct thread *td); - - int netmap_init(void); void netmap_fini(void); int netmap_get_memory(struct netmap_priv_d* p); @@ -811,7 +880,8 @@ int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct t /* netmap_adapter creation/destruction */ #define NM_IFPNAME(ifp) ((ifp) ? (ifp)->if_xname : "zombie") -#define NM_DEBUG_PUTGET 1 + +// #define NM_DEBUG_PUTGET 1 #ifdef NM_DEBUG_PUTGET @@ -844,12 +914,15 @@ int netmap_adapter_put(struct netmap_adapter *na); #endif /* !NM_DEBUG_PUTGET */ +/* + * module variables + */ extern u_int netmap_buf_size; #define NETMAP_BUF_SIZE netmap_buf_size // XXX remove -extern int netmap_mitigate; +extern int netmap_mitigate; // XXX not really used extern int netmap_no_pendintr; -extern u_int netmap_total_buffers; -extern char *netmap_buffer_base; +extern u_int netmap_total_buffers; // global allocator +extern char *netmap_buffer_base; // global allocator extern int netmap_verbose; // XXX debugging enum { /* verbose flags */ NM_VERB_ON = 1, /* generic verbose */ @@ -908,7 +981,7 @@ extern int netmap_generic_ringsize; #ifdef __FreeBSD__ -/* Callback invoked by the dma machinery after a successfull dmamap_load */ +/* Callback invoked by the dma machinery after a successful dmamap_load */ static void netmap_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs, __unused int nseg, __unused int error) { @@ -1053,31 +1126,27 @@ BDG_NMB(struct netmap_adapter *na, struct netmap_slot *slot) lut[0].vaddr : lut[i].vaddr; } -/* default functions to handle rx/tx interrupts */ -int netmap_rx_irq(struct ifnet *, u_int, u_int *); -#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) -void netmap_common_irq(struct ifnet *, u_int, u_int *work_done); void netmap_txsync_to_host(struct netmap_adapter *na); -void netmap_disable_all_rings(struct ifnet *); -void netmap_enable_all_rings(struct ifnet *); -void netmap_disable_ring(struct netmap_kring *kr); -/* Structure associated to each thread which registered an interface. +/* + * Structure associated to each thread which registered an interface. * * The first 4 fields of this structure are written by NIOCREGIF and * read by poll() and NIOC?XSYNC. - * There is low contention among writers (actually, a correct user program - * should have no contention among writers) and among writers and readers, - * so we use a single global lock to protect the structure initialization. - * Since initialization involves the allocation of memory, we reuse the memory - * allocator lock. 
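nm_txsync_prologue() and nm_txsync_finalize() bracket every driver txsync under the new API. The following stand-alone toy model (all toy_* names are invented; only the roles of head, hwcur and hwtail follow the comments in this header) shows the shape of that flow: take the user's head, "transmit" the slots in [hwcur, head), then publish the new kernel state back into the user-visible ring.

#include <stdio.h>
#include <stdint.h>

struct toy_kring {
        uint32_t num_slots;
        uint32_t nr_hwcur, nr_hwtail;   /* kernel view */
        uint32_t head, cur, tail;       /* fields a user ring would expose */
};

static uint32_t
toy_next(uint32_t i, uint32_t lim) { return i == lim ? 0 : i + 1; }

static uint32_t
toy_prev(uint32_t i, uint32_t lim) { return i == 0 ? lim : i - 1; }

static void
toy_txsync(struct toy_kring *k)
{
        uint32_t lim = k->num_slots - 1;
        uint32_t head = k->head;        /* what a real prologue would validate */
        uint32_t i, sent = 0;

        /* first part: hand slots [nr_hwcur, head) to the "hardware" */
        for (i = k->nr_hwcur; i != head; i = toy_next(i, lim))
                sent++;                 /* a driver would fill a NIC descriptor here */
        k->nr_hwcur = head;

        /* second part: pretend every transmission completed, so all
         * slots but one are owned by the kernel's client again.
         */
        k->nr_hwtail = toy_prev(head, lim);

        /* finalize: publish kernel state into the user-visible ring */
        k->tail = k->nr_hwtail;
        k->head = k->nr_hwcur;

        printf("sent %u slots, new tail %u\n", sent, k->tail);
}

int
main(void)
{
        struct toy_kring k = { .num_slots = 8, .nr_hwcur = 5, .nr_hwtail = 4,
            .head = 1, .cur = 1, .tail = 4 };

        toy_txsync(&k);         /* sent 4 slots (5,6,7,0), new tail 0 */
        return 0;
}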
+ * + * There is low contention among writers (a correct user program + * should have none) and among writers and readers, so we use a + * single global lock to protect the structure initialization; + * since initialization involves the allocation of memory, + * we reuse the memory allocator lock. + * * Read access to the structure is lock free. Readers must check that * np_nifp is not NULL before using the other fields. - * If np_nifp is NULL initialization has not been performed, so they should - * return an error to userlevel. + * If np_nifp is NULL initialization has not been performed, + * so they should return an error to userspace. * * The ref_done field is used to regulate access to the refcount in the * memory allocator. The refcount must be incremented at most once for @@ -1091,38 +1160,29 @@ struct netmap_priv_d { struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ struct netmap_adapter *np_na; - int np_ringid; /* from the ioctl */ - u_int np_qfirst, np_qlast; /* range of rings to scan */ - uint16_t np_txpoll; + int np_ringid; /* from the ioctl */ + u_int np_qfirst, np_qlast; /* range of rings to scan */ + uint16_t np_txpoll; struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */ /* np_refcount is only used on FreeBSD */ - int np_refcount; /* use with NMG_LOCK held */ + int np_refcount; /* use with NMG_LOCK held */ }; /* * generic netmap emulation for devices that do not have * native netmap support. - * XXX generic_netmap_register() is only exported to implement - * nma_is_generic(). */ -int generic_netmap_register(struct netmap_adapter *na, int enable); int generic_netmap_attach(struct ifnet *ifp); int netmap_catch_rx(struct netmap_adapter *na, int intercept); void generic_rx_handler(struct ifnet *ifp, struct mbuf *m);; -void netmap_catch_packet_steering(struct netmap_generic_adapter *na, int enable); +void netmap_catch_tx(struct netmap_generic_adapter *na, int enable); int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, u_int ring_nr); int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx); void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); -static __inline int -nma_is_generic(struct netmap_adapter *na) -{ - return na->nm_register == generic_netmap_register; -} - /* * netmap_mitigation API. This is used by the generic adapter * to reduce the number of interrupt requests/selwakeup @@ -1134,6 +1194,4 @@ void netmap_mitigation_restart(struct netmap_generic_adapter *na); int netmap_mitigation_active(struct netmap_generic_adapter *na); void netmap_mitigation_cleanup(struct netmap_generic_adapter *na); -// int generic_timer_handler(struct hrtimer *t); - #endif /* _NET_NETMAP_KERN_H_ */ diff --git a/sys/dev/netmap/netmap_mbq.c b/sys/dev/netmap/netmap_mbq.c index c8e581b69fe5..2606b13d48dc 100644 --- a/sys/dev/netmap/netmap_mbq.c +++ b/sys/dev/netmap/netmap_mbq.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Vincenzo Maffione. All rights reserved. + * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -47,17 +47,20 @@ static inline void __mbq_init(struct mbq *q) q->count = 0; } + void mbq_safe_init(struct mbq *q) { mtx_init(&q->lock, "mbq", NULL, MTX_SPIN); __mbq_init(q); } + void mbq_init(struct mbq *q) { __mbq_init(q); } + static inline void __mbq_enqueue(struct mbq *q, struct mbuf *m) { m->m_nextpkt = NULL; @@ -70,6 +73,7 @@ static inline void __mbq_enqueue(struct mbq *q, struct mbuf *m) q->count++; } + void mbq_safe_enqueue(struct mbq *q, struct mbuf *m) { mtx_lock(&q->lock); @@ -77,11 +81,13 @@ void mbq_safe_enqueue(struct mbq *q, struct mbuf *m) mtx_unlock(&q->lock); } + void mbq_enqueue(struct mbq *q, struct mbuf *m) { __mbq_enqueue(q, m); } + static inline struct mbuf *__mbq_dequeue(struct mbq *q) { struct mbuf *ret = NULL; @@ -99,6 +105,7 @@ static inline struct mbuf *__mbq_dequeue(struct mbq *q) return ret; } + struct mbuf *mbq_safe_dequeue(struct mbq *q) { struct mbuf *ret; @@ -110,11 +117,13 @@ struct mbuf *mbq_safe_dequeue(struct mbq *q) return ret; } + struct mbuf *mbq_dequeue(struct mbq *q) { return __mbq_dequeue(q); } + /* XXX seems pointless to have a generic purge */ static void __mbq_purge(struct mbq *q, int safe) { @@ -130,16 +139,19 @@ static void __mbq_purge(struct mbq *q, int safe) } } + void mbq_purge(struct mbq *q) { __mbq_purge(q, 0); } + void mbq_safe_purge(struct mbq *q) { __mbq_purge(q, 1); } + void mbq_safe_destroy(struct mbq *q) { mtx_destroy(&q->lock); @@ -149,4 +161,3 @@ void mbq_safe_destroy(struct mbq *q) void mbq_destroy(struct mbq *q) { } - diff --git a/sys/dev/netmap/netmap_mbq.h b/sys/dev/netmap/netmap_mbq.h index ad023b617a5d..d273d8a8fa23 100644 --- a/sys/dev/netmap/netmap_mbq.h +++ b/sys/dev/netmap/netmap_mbq.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Vincenzo Maffione. All rights reserved. + * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c index f28f2c04751a..b25f79cef3a4 100644 --- a/sys/dev/netmap/netmap_mem2.c +++ b/sys/dev/netmap/netmap_mem2.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2013 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -506,7 +506,7 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj p->r_objsize = objsize; #define MAX_CLUSTSIZE (1<<17) -#define LINE_ROUND 64 +#define LINE_ROUND NM_CACHE_ALIGN // 64 if (objsize >= MAX_CLUSTSIZE) { /* we could do it but there is no point */ D("unsupported allocation for %d bytes", objsize); @@ -960,13 +960,15 @@ netmap_mem_rings_create(struct netmap_adapter *na) ND("txring[%d] at %p ofs %d", i, ring); kring->ring = ring; *(uint32_t *)(uintptr_t)&ring->num_slots = ndesc; - *(ssize_t *)(uintptr_t)&ring->buf_ofs = + *(int64_t *)(uintptr_t)&ring->buf_ofs = (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - netmap_ring_offset(na->nm_mem, ring); - ring->avail = kring->nr_hwavail; - ring->cur = kring->nr_hwcur; + /* copy values from kring */ + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; *(uint16_t *)(uintptr_t)&ring->nr_buf_size = NETMAP_BDG_BUF_SIZE(na->nm_mem); ND("initializing slots for txring"); @@ -989,13 +991,15 @@ netmap_mem_rings_create(struct netmap_adapter *na) kring->ring = ring; *(uint32_t *)(uintptr_t)&ring->num_slots = ndesc; - *(ssize_t *)(uintptr_t)&ring->buf_ofs = + *(int64_t *)(uintptr_t)&ring->buf_ofs = (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - netmap_ring_offset(na->nm_mem, ring); - ring->cur = kring->nr_hwcur; - ring->avail = kring->nr_hwavail; + /* copy values from kring */ + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; *(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BDG_BUF_SIZE(na->nm_mem); ND("initializing slots for rxring[%d]", i); diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h index f492f9814b79..8e6c58cbc4ee 100644 --- a/sys/dev/netmap/netmap_mem2.h +++ b/sys/dev/netmap/netmap_mem2.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2013 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c index 32d6422de120..f988b84e78b2 100644 --- a/sys/dev/netmap/netmap_vale.c +++ b/sys/dev/netmap/netmap_vale.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Universita` di Pisa. All rights reserved. + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -251,44 +251,6 @@ struct nm_bridge nm_bridges[NM_BRIDGES]; /* - * A few function to tell which kind of port are we using. - * XXX should we hold a lock ? 
- * - * nma_is_vp() virtual port - * nma_is_host() port connected to the host stack - * nma_is_hw() port connected to a NIC - * nma_is_generic() generic netmap adapter XXX stop this madness - */ -static __inline int -nma_is_vp(struct netmap_adapter *na) -{ - return na->nm_register == bdg_netmap_reg; -} - - -static __inline int -nma_is_host(struct netmap_adapter *na) -{ - return na->nm_register == NULL; -} - - -static __inline int -nma_is_hw(struct netmap_adapter *na) -{ - /* In case of sw adapter, nm_register is NULL */ - return !nma_is_vp(na) && !nma_is_host(na) && !nma_is_generic(na); -} - -static __inline int -nma_is_bwrap(struct netmap_adapter *na) -{ - return na->nm_register == netmap_bwrap_register; -} - - - -/* * this is a slightly optimized copy routine which rounds * to multiple of 64 bytes and is often faster than dealing * with other odd sizes. We assume there is enough room @@ -318,7 +280,6 @@ pkt_copy(void *_src, void *_dst, int l) } - /* * locate a bridge among the existing ones. * MUST BE CALLED WITH NMG_LOCK() @@ -393,8 +354,8 @@ nm_free_bdgfwd(struct netmap_adapter *na) struct netmap_kring *kring; NMG_LOCK_ASSERT(); - nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; - kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; + nrings = na->num_tx_rings; + kring = na->tx_rings; for (i = 0; i < nrings; i++) { if (kring[i].nkr_ft) { free(kring[i].nkr_ft, M_DEVBUF); @@ -502,6 +463,7 @@ netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw) } } + static void netmap_adapter_vp_dtor(struct netmap_adapter *na) { @@ -520,6 +482,16 @@ netmap_adapter_vp_dtor(struct netmap_adapter *na) na->ifp = NULL; } + +/* Try to get a reference to a netmap adapter attached to a VALE switch. + * If the adapter is found (or is created), this function returns 0, a + * non NULL pointer is returned into *na, and the caller holds a + * reference to the adapter. + * If an adapter is not found, then no reference is grabbed and the + * function returns an error code, or 0 if there is just a VALE prefix + * mismatch. Therefore the caller holds a reference when + * (*na != NULL && return == 0). + */ int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) { @@ -688,18 +660,12 @@ nm_bdg_attach(struct nmreq *nmr) return ENOMEM; NMG_LOCK(); /* XXX probably netmap_get_bdg_na() */ - error = netmap_get_na(nmr, &na, 1 /* create if not exists */); + error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */); if (error) /* no device, or another bridge or user owns the device */ goto unlock_exit; - /* netmap_get_na() sets na_bdg if this is a physical interface - * that we can attach to a switch. - */ - if (!nma_is_bwrap(na)) { - /* got reference to a virtual port or direct access to a NIC. - * perhaps specified no bridge prefix or wrong NIC name - */ + if (na == NULL) { /* VALE prefix missing */ error = EINVAL; - goto unref_exit; + goto unlock_exit; } if (na->active_fds > 0) { /* already registered */ @@ -727,6 +693,7 @@ unlock_exit: return error; } + static int nm_bdg_detach(struct nmreq *nmr) { @@ -736,17 +703,15 @@ nm_bdg_detach(struct nmreq *nmr) int last_instance; NMG_LOCK(); - error = netmap_get_na(nmr, &na, 0 /* don't create */); + error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */); if (error) { /* no device, or another bridge or user owns the device */ goto unlock_exit; } - if (!nma_is_bwrap(na)) { - /* got reference to a virtual port or direct access to a NIC. 
- * perhaps specified no bridge's prefix or wrong NIC's name - */ + if (na == NULL) { /* VALE prefix missing */ error = EINVAL; - goto unref_exit; + goto unlock_exit; } + bna = (struct netmap_bwrap_adapter *)na; if (na->active_fds == 0) { /* not registered */ @@ -890,12 +855,13 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) case NETMAP_BDG_OFFSET: NMG_LOCK(); error = netmap_get_bdg_na(nmr, &na, 0); - if (!error) { + if (na && !error) { vpna = (struct netmap_vp_adapter *)na; if (nmr->nr_arg1 > NETMAP_BDG_MAX_OFFSET) nmr->nr_arg1 = NETMAP_BDG_MAX_OFFSET; vpna->offset = nmr->nr_arg1; D("Using offset %d for %p", vpna->offset, vpna); + netmap_adapter_put(na); } NMG_UNLOCK(); break; @@ -947,6 +913,7 @@ netmap_vp_krings_create(struct netmap_adapter *na) return 0; } + static void netmap_vp_krings_delete(struct netmap_adapter *na) { @@ -1027,10 +994,6 @@ nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr, } -/* - *---- support for virtual bridge ----- - */ - /* ----- FreeBSD if_bridge hash function ------- */ /* @@ -1052,6 +1015,7 @@ do { \ c -= a; c -= b; c ^= (b >> 15); \ } while (/*CONSTCOND*/0) + static __inline uint32_t nm_bridge_rthash(const uint8_t *addr) { @@ -1144,6 +1108,77 @@ netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, /* + * Available space in the ring. Only used in VALE code + * and only with is_rx = 1 + */ +static inline uint32_t +nm_kr_space(struct netmap_kring *k, int is_rx) +{ + int space; + + if (is_rx) { + int busy = k->nkr_hwlease - k->nr_hwcur; + if (busy < 0) + busy += k->nkr_num_slots; + space = k->nkr_num_slots - 1 - busy; + } else { + /* XXX never used in this branch */ + space = k->nr_hwtail - k->nkr_hwlease; + if (space < 0) + space += k->nkr_num_slots; + } +#if 0 + // sanity check + if (k->nkr_hwlease >= k->nkr_num_slots || + k->nr_hwcur >= k->nkr_num_slots || + k->nr_tail >= k->nkr_num_slots || + busy < 0 || + busy >= k->nkr_num_slots) { + D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, + k->nkr_lease_idx, k->nkr_num_slots); + } +#endif + return space; +} + + + + +/* make a lease on the kring for N positions. return the + * lease index + * XXX only used in VALE code and with is_rx = 1 + */ +static inline uint32_t +nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) +{ + uint32_t lim = k->nkr_num_slots - 1; + uint32_t lease_idx = k->nkr_lease_idx; + + k->nkr_leases[lease_idx] = NR_NOSLOT; + k->nkr_lease_idx = nm_next(lease_idx, lim); + + if (n > nm_kr_space(k, is_rx)) { + D("invalid request for %d slots", n); + panic("x"); + } + /* XXX verify that there are n slots */ + k->nkr_hwlease += n; + if (k->nkr_hwlease > lim) + k->nkr_hwlease -= lim + 1; + + if (k->nkr_hwlease >= k->nkr_num_slots || + k->nr_hwcur >= k->nkr_num_slots || + k->nr_hwtail >= k->nkr_num_slots || + k->nkr_lease_idx >= k->nkr_num_slots) { + D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d", + k->na->ifp->if_xname, + k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, + k->nkr_lease_idx, k->nkr_num_slots); + } + return lease_idx; +} + +/* * This flush routine supports only unicast and broadcast but a large * number of ports, and lets us replace the learn and dispatch functions. 
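A compact stand-alone model of the reservation scheme above (the toy_* names are invented): a writer asks for n slots starting at hwlease, the index wraps at num_slots, and at most num_slots - 1 slots can ever be outstanding, which is why the space computation subtracts one.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct toy_lease_ring {
        uint32_t num_slots;
        uint32_t nr_hwcur;      /* oldest slot not yet released by readers */
        uint32_t nkr_hwlease;   /* next slot available to writers */
};

/* Free slots on an RX-style ring: total minus one minus busy. */
static uint32_t
toy_rx_space(const struct toy_lease_ring *k)
{
        int32_t busy = (int32_t)k->nkr_hwlease - (int32_t)k->nr_hwcur;

        if (busy < 0)
                busy += (int32_t)k->num_slots;
        return k->num_slots - 1 - (uint32_t)busy;
}

/* Reserve a block of n slots, returning the index of its first slot. */
static uint32_t
toy_lease(struct toy_lease_ring *k, uint32_t n)
{
        uint32_t start = k->nkr_hwlease;

        assert(n <= toy_rx_space(k));   /* caller must have checked space */
        k->nkr_hwlease += n;
        if (k->nkr_hwlease >= k->num_slots)
                k->nkr_hwlease -= k->num_slots;
        return start;
}

int
main(void)
{
        struct toy_lease_ring k = { .num_slots = 8, .nr_hwcur = 6, .nkr_hwlease = 6 };

        printf("space %u\n", toy_rx_space(&k));         /* 7 */
        printf("block starts at %u\n", toy_lease(&k, 4)); /* 6, hwlease wraps to 2 */
        printf("space %u\n", toy_rx_space(&k));         /* 3 */
        return 0;
}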
*/ @@ -1357,28 +1392,30 @@ retry: dst = BDG_NMB(&dst_na->up, slot); if (unlikely(fix_mismatch)) { - if (na->offset > dst_na->offset) { - src += na->offset - dst_na->offset; - copy_len -= na->offset - dst_na->offset; - dst_len = copy_len; - } else { - bzero(dst, dst_na->offset - na->offset); - dst_len += dst_na->offset - na->offset; - dst += dst_na->offset - na->offset; - } - /* fix the first fragment only */ - fix_mismatch = 0; - /* completely skip an header only fragment */ - if (copy_len == 0) { - ft_p++; - continue; - } + /* We are processing the first fragment + * and there is a mismatch between source + * and destination offsets. Create a zeroed + * header for the destination, independently + * of the source header length and content. + */ + src += na->offset; + copy_len -= na->offset; + bzero(dst, dst_na->offset); + dst += dst_na->offset; + dst_len = dst_na->offset + copy_len; + /* fix the first fragment only */ + fix_mismatch = 0; + /* Here it could be copy_len == dst_len == 0, + * and so a zero length fragment is passed. + */ } + + ND("send [%d] %d(%d) bytes at %s:%d", + i, (int)copy_len, (int)dst_len, + NM_IFPNAME(dst_ifp), j); /* round to a multiple of 64 */ copy_len = (copy_len + 63) & ~63; - ND("send %d %d bytes at %s:%d", - i, ft_p->ft_len, NM_IFPNAME(dst_ifp), j); if (ft_p->ft_flags & NS_INDIRECT) { if (copyin(src, dst, copy_len)) { // invalid user pointer, pretend len is 0 @@ -1426,7 +1463,7 @@ retry: } p[lease_idx] = j; /* report I am done */ - update_pos = nm_kr_rxpos(kring); + update_pos = kring->nr_hwtail; if (my_start == update_pos) { /* all slots before my_start have been reported, @@ -1443,15 +1480,7 @@ retry: * means there are new buffers to report */ if (likely(j != my_start)) { - uint32_t old_avail = kring->nr_hwavail; - - kring->nr_hwavail = (j >= kring->nr_hwcur) ? - j - kring->nr_hwcur : - j + lim + 1 - kring->nr_hwcur; - if (kring->nr_hwavail < old_avail) { - D("avail shrink %d -> %d", - old_avail, kring->nr_hwavail); - } + kring->nr_hwtail = j; dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); still_locked = 0; mtx_unlock(&kring->q_lock); @@ -1471,35 +1500,32 @@ cleanup: return 0; } + static int netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags) { struct netmap_kring *kring = &na->up.tx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, k, lim = kring->nkr_num_slots - 1; - - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); + u_int done; + u_int const lim = kring->nkr_num_slots - 1; + u_int const cur = kring->rcur; if (bridge_batch <= 0) { /* testing only */ - j = k; // used all + done = cur; // used all goto done; } if (bridge_batch > NM_BDG_BATCH) bridge_batch = NM_BDG_BATCH; - j = nm_bdg_preflush(na, ring_nr, kring, k); - if (j != k) - D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail); - /* k-j modulo ring size is the number of slots processed */ - if (k < j) - k += kring->nkr_num_slots; - kring->nr_hwavail = lim - (k - j); - + done = nm_bdg_preflush(na, ring_nr, kring, cur); done: - kring->nr_hwcur = j; - ring->avail = kring->nr_hwavail; + if (done != cur) + D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail); + /* + * packets between 'done' and 'cur' are left unsent. 
+ */ + kring->nr_hwcur = done; + kring->nr_hwtail = nm_prev(done, lim); + nm_txsync_finalize(kring); if (netmap_verbose) D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags); return 0; @@ -1518,46 +1544,30 @@ bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) return netmap_vp_txsync(vpna, ring_nr, flags); } - -/* - * user process reading from a VALE switch. - * Already protected against concurrent calls from userspace, - * but we must acquire the queue's lock to protect against - * writers on the same queue. - */ static int -bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, lim = kring->nkr_num_slots - 1; - u_int k = ring->cur, resvd = ring->reserved; + u_int nm_i, lim = kring->nkr_num_slots - 1; + u_int head = nm_rxsync_prologue(kring); int n; - mtx_lock(&kring->q_lock); - if (k > lim) { + if (head > lim) { D("ouch dangerous reset!!!"); n = netmap_ring_reinit(kring); goto done; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } + /* First part, import newly received packets. */ + /* actually nothing to do here, they are already in the kring */ - if (j != k) { /* userspace has released some packets. */ - n = k - j; - if (n < 0) - n += kring->nkr_num_slots; - ND("userspace releases %d packets", n); - for (n = 0; likely(j != k); n++) { - struct netmap_slot *slot = &ring->slot[j]; + /* Second part, skip past packets that userspace has released. */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + /* consistency check, but nothing really important here */ + for (n = 0; likely(nm_i != head); n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; void *addr = BDG_NMB(na, slot); if (addr == netmap_buffer_base) { /* bad buf */ @@ -1565,19 +1575,37 @@ bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) slot->buf_idx); } slot->flags &= ~NS_BUF_CHANGED; - j = nm_next(j, lim); + nm_i = nm_next(nm_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = head; } + /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; + nm_rxsync_finalize(kring); n = 0; done: + return n; +} + +/* + * user process reading from a VALE switch. + * Already protected against concurrent calls from userspace, + * but we must acquire the queue's lock to protect against + * writers on the same queue. + */ +static int +bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + int n; + + mtx_lock(&kring->q_lock); + n = netmap_vp_rxsync(na, ring_nr, flags); mtx_unlock(&kring->q_lock); return n; } + static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) { @@ -1627,6 +1655,7 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) return 0; } + static void netmap_bwrap_dtor(struct netmap_adapter *na) { @@ -1652,16 +1681,22 @@ netmap_bwrap_dtor(struct netmap_adapter *na) } + /* - * Pass packets from nic to the bridge. + * Intr callback for NICs connected to a bridge. + * Simply ignore tx interrupts (maybe we could try to recover space ?) 
+ * and pass received packets from nic to the bridge. + * * XXX TODO check locking: this is called from the interrupt * handler so we should make sure that the interface is not * disconnected while passing down an interrupt. * - * Note, no user process can access this NIC so we can ignore - * the info in the 'ring'. - */ -/* callback that overwrites the hwna notify callback. + * Note, no user process can access this NIC or the host stack. + * The only part of the ring that is significant are the slots, + * and head/cur/tail are set from the kring as needed + * (part as a receive ring, part as a transmit ring). + * + * callback that overwrites the hwna notify callback. * Packets come from the outside or from the host stack and are put on an hwna rx ring. * The bridge wrapper then sends the packets through the bridge. */ @@ -1677,21 +1712,24 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, struct netmap_vp_adapter *vpna = &bna->up; int error = 0; - ND("%s[%d] %s %x", NM_IFPNAME(ifp), ring_nr, (tx == NR_TX ? "TX" : "RX"), flags); + if (netmap_verbose) + D("%s %s%d 0x%x", NM_IFPNAME(ifp), + (tx == NR_TX ? "TX" : "RX"), ring_nr, flags); if (flags & NAF_DISABLE_NOTIFY) { kring = tx == NR_TX ? na->tx_rings : na->rx_rings; bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings; - if (kring->nkr_stopped) - netmap_disable_ring(bkring); + if (kring[ring_nr].nkr_stopped) + netmap_disable_ring(&bkring[ring_nr]); else - bkring->nkr_stopped = 0; + bkring[ring_nr].nkr_stopped = 0; return 0; } if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP)) return 0; + /* we only care about receive interrupts */ if (tx == NR_TX) return 0; @@ -1707,7 +1745,24 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, goto put_out; } + /* Here we expect ring->head = ring->cur = ring->tail + * because everything has been released from the previous round. + * However the ring is shared and we might have info from + * the wrong side (the tx ring). Hence we overwrite with + * the info from the rx kring. + */ + if (netmap_verbose) + D("%s head %d cur %d tail %d (kring %d %d %d)", NM_IFPNAME(ifp), + ring->head, ring->cur, ring->tail, + kring->rhead, kring->rcur, kring->rtail); + + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; + + /* simulate a user wakeup on the rx ring */ if (is_host_ring) { + netmap_rxsync_from_host(na, NULL, NULL); vpna = hostna; ring_nr = 0; } else { @@ -1718,23 +1773,46 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, if (error) goto put_out; } - if (kring->nr_hwavail == 0 && netmap_verbose) { + if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) { D("how strange, interrupt with no packets on %s", NM_IFPNAME(ifp)); goto put_out; } - /* XXX avail ? */ - ring->cur = nm_kr_rxpos(kring); + + /* new packets are ring->cur to ring->tail, and the bkring + * had hwcur == ring->cur. So advance ring->cur to ring->tail + * to push all packets out. + */ + ring->head = ring->cur = ring->tail; + + /* also set tail to what the bwrap expects */ + bkring = &vpna->up.tx_rings[ring_nr]; + ring->tail = bkring->nr_hwtail; // rtail too ? + + /* pass packets to the switch */ + nm_txsync_prologue(bkring); // XXX error checking ? 
netmap_vp_txsync(vpna, ring_nr, flags); - if (!is_host_ring) + /* mark all buffers as released on this ring */ + ring->head = ring->cur = kring->nr_hwtail; + ring->tail = kring->rtail; + /* another call to actually release the buffers */ + if (!is_host_ring) { error = na->nm_rxsync(na, ring_nr, 0); + } else { + /* mark all packets as released, as in the + * second part of netmap_rxsync_from_host() + */ + kring->nr_hwcur = kring->nr_hwtail; + nm_rxsync_finalize(kring); + } put_out: nm_kr_put(kring); return error; } + static int netmap_bwrap_register(struct netmap_adapter *na, int onoff) { @@ -1744,7 +1822,7 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) struct netmap_vp_adapter *hostna = &bna->host; int error; - ND("%s %d", NM_IFPNAME(ifp), onoff); + ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off"); if (onoff) { int i; @@ -1788,6 +1866,7 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) return 0; } + static int netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, u_int *rxr, u_int *rxd) @@ -1807,6 +1886,7 @@ netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, return 0; } + static int netmap_bwrap_krings_create(struct netmap_adapter *na) { @@ -1834,6 +1914,7 @@ netmap_bwrap_krings_create(struct netmap_adapter *na) return 0; } + static void netmap_bwrap_krings_delete(struct netmap_adapter *na) { @@ -1847,6 +1928,7 @@ netmap_bwrap_krings_delete(struct netmap_adapter *na) netmap_vp_krings_delete(na); } + /* notify method for the bridge-->hwna direction */ static int netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags) @@ -1856,7 +1938,7 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f struct netmap_adapter *hwna = bna->hwna; struct netmap_kring *kring, *hw_kring; struct netmap_ring *ring; - u_int lim, k; + u_int lim; int error = 0; if (tx == NR_TX) @@ -1865,35 +1947,49 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f kring = &na->rx_rings[ring_n]; hw_kring = &hwna->tx_rings[ring_n]; ring = kring->ring; - lim = kring->nkr_num_slots - 1; - k = nm_kr_rxpos(kring); if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP)) return 0; - ring->cur = k; - ND("%s[%d] PRE rx(%d, %d, %d, %d) ring(%d, %d, %d) tx(%d, %d)", + /* first step: simulate a user wakeup on the rx ring */ + netmap_vp_rxsync(na, ring_n, flags); + ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", NM_IFPNAME(na->ifp), ring_n, - kring->nr_hwcur, kring->nr_hwavail, kring->nkr_hwlease, kring->nr_hwreserved, - ring->cur, ring->avail, ring->reserved, - hw_kring->nr_hwcur, hw_kring->nr_hwavail); + kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, + ring->head, ring->cur, ring->tail, + hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_ring->rtail); + /* second step: the simulated user consumes all new packets */ + ring->head = ring->cur = ring->tail; + + /* third step: the new packets are sent on the tx ring + * (which is actually the same ring) + */ + /* set tail to what the hw expects */ + ring->tail = hw_kring->rtail; if (ring_n == na->num_rx_rings) { netmap_txsync_to_host(hwna); } else { + nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ? 
error = hwna->nm_txsync(hwna, ring_n, flags); } - kring->nr_hwcur = ring->cur; - kring->nr_hwavail = 0; - kring->nr_hwreserved = lim - ring->avail; - ND("%s[%d] PST rx(%d, %d, %d, %d) ring(%d, %d, %d) tx(%d, %d)", + + /* fourth step: now we are back the rx ring */ + /* claim ownership on all hw owned bufs */ + ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */ + ring->tail = kring->rtail; /* restore saved value of tail, for safety */ + + /* fifth step: the user goes to sleep again, causing another rxsync */ + netmap_vp_rxsync(na, ring_n, flags); + ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", NM_IFPNAME(na->ifp), ring_n, - kring->nr_hwcur, kring->nr_hwavail, kring->nkr_hwlease, kring->nr_hwreserved, - ring->cur, ring->avail, ring->reserved, - hw_kring->nr_hwcur, hw_kring->nr_hwavail); + kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, + ring->head, ring->cur, ring->tail, + hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); return error; } + static int netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags) { @@ -1904,6 +2000,7 @@ netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags); } + /* attach a bridge wrapper to the 'real' device */ static int netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) @@ -1957,7 +2054,8 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) hostna->nm_mem = na->nm_mem; hostna->na_private = bna; - D("%s<->%s txr %d txd %d rxr %d rxd %d", fake->if_xname, real->if_xname, + ND("%s<->%s txr %d txd %d rxr %d rxd %d", + fake->if_xname, real->if_xname, na->num_tx_rings, na->num_tx_desc, na->num_rx_rings, na->num_rx_desc); @@ -1970,6 +2068,7 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) return 0; } + void netmap_init_bridges(void) { diff --git a/sys/net/netmap.h b/sys/net/netmap.h index 50e230934dd0..a5ee9b55edc9 100644 --- a/sys/net/netmap.h +++ b/sys/net/netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -39,6 +39,16 @@ #ifndef _NET_NETMAP_H_ #define _NET_NETMAP_H_ +#define NETMAP_API 10 /* current API version */ + +/* + * Some fields should be cache-aligned to reduce contention. + * The alignment is architecture and OS dependent, but rather than + * digging into OS headers to find the exact value we use an estimate + * that should cover most architectures. 
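The 128-byte value is only a conservative upper bound on common cache-line sizes. A minimal, hypothetical illustration of the padding and alignment idea (none of these names are netmap's):

#include <stddef.h>
#include <stdio.h>
#include <stdint.h>

#define TOY_CACHE_ALIGN 128     /* mirrors the conservative estimate above */

/* Keep fields written by different CPUs on different cache lines
 * to avoid false sharing.
 */
struct toy_shared {
        uint32_t producer_idx;                          /* written by one CPU */
        uint8_t  pad[TOY_CACHE_ALIGN - sizeof(uint32_t)];
        uint32_t consumer_idx;                          /* written by another CPU */
} __attribute__((__aligned__(TOY_CACHE_ALIGN)));

int
main(void)
{
        printf("consumer_idx at offset %zu, sizeof = %zu\n",
            offsetof(struct toy_shared, consumer_idx),  /* 128 */
            sizeof(struct toy_shared));                 /* 256 */
        return 0;
}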
+ */ +#define NM_CACHE_ALIGN 128 + /* * --- Netmap data structures --- * @@ -52,23 +62,23 @@ ==================================================================== | USERSPACE | struct netmap_ring - +---->+--------------+ - / | cur | - struct netmap_if (nifp, 1 per fd) / | avail | - +---------------+ / | buf_ofs | - | ni_tx_rings | / +==============+ - | ni_rx_rings | / | buf_idx, len | slot[0] - | | / | flags, ptr | - | | / +--------------+ - +===============+ / | buf_idx, len | slot[1] - | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | - | txring_ofs[1] | +--------------+ - (ni_tx_rings+1 entries) (num_slots entries) - | txring_ofs[t] | | buf_idx, len | slot[n-1] - +---------------+ | flags, ptr | - | rxring_ofs[0] | +--------------+ + +---->+---------------+ + / | head,cur,tail | + struct netmap_if (nifp, 1 per fd) / | buf_ofs | + +---------------+ / | other fields | + | ni_tx_rings | / +===============+ + | ni_rx_rings | / | buf_idx, len | slot[0] + | | / | flags, ptr | + | | / +---------------+ + +===============+ / | buf_idx, len | slot[1] + | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | + | txring_ofs[1] | +---------------+ + (tx+1+extra_tx entries) (num_slots entries) + | txring_ofs[t] | | buf_idx, len | slot[n-1] + +---------------+ | flags, ptr | + | rxring_ofs[0] | +---------------+ | rxring_ofs[1] | - (ni_rx_rings+1 entries) + (rx+1+extra_rx entries) | rxring_ofs[r] | +---------------+ @@ -93,122 +103,115 @@ /* * struct netmap_slot is a buffer descriptor - * - * buf_idx the index of the buffer associated to the slot. - * len the length of the payload - * flags control operation on the slot, as defined below - * - * NS_BUF_CHANGED must be set whenever userspace wants - * to change buf_idx (it might be necessary to - * reprogram the NIC) - * - * NS_REPORT must be set if we want the NIC to generate an interrupt - * when this slot is used. Leaving it to 0 improves - * performance. - * - * NS_FORWARD if set on a receive ring, and the device is in - * transparent mode, buffers released with the flag set - * will be forwarded to the 'other' side (host stack - * or NIC, respectively) on the next select() or ioctl() - * - * NS_NO_LEARN on a VALE switch, do not 'learn' the source port for - * this packet. - * - * NS_INDIRECT (tx rings only) data is in a userspace buffer pointed - * by the ptr field in the slot. - * - * NS_MOREFRAG Part of a multi-segment frame. The last (or only) - * segment must not have this flag. - * Only supported on VALE ports. - * - * NS_PORT_MASK the high 8 bits of the flag, if not zero, indicate the - * destination port for the VALE switch, overriding - * the lookup table. */ - struct netmap_slot { uint32_t buf_idx; /* buffer index */ - uint16_t len; /* packet length */ + uint16_t len; /* length for this slot */ uint16_t flags; /* buf changed, etc. */ + uint64_t ptr; /* pointer for indirect buffers */ +}; + +/* + * The following flags control how the slot is used + */ + #define NS_BUF_CHANGED 0x0001 /* buf_idx changed */ -#define NS_REPORT 0x0002 /* ask the hardware to report results - * e.g. by generating an interrupt - */ -#define NS_FORWARD 0x0004 /* pass packet to the other endpoint - * (host stack or device) - */ -#define NS_NO_LEARN 0x0008 -#define NS_INDIRECT 0x0010 -#define NS_MOREFRAG 0x0020 + /* + * must be set whenever buf_idx is changed (as it might be + * necessary to recompute the physical address and mapping) + */ + +#define NS_REPORT 0x0002 /* ask the hardware to report results */ + /* + * Request notification when slot is used by the hardware. 
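NS_BUF_CHANGED is what makes the classic zero-copy idiom legal: instead of copying payload from an RX ring to a TX ring, an application exchanges the buf_idx values of the two slots and flags both of them, so the kernel knows it must recompute the buffer address and mapping. A minimal sketch (the helper name is ours; it assumes both slots belong to valid, open rings):

#include <stdint.h>
#include <net/netmap.h>

/* Forward a received buffer to a TX ring without copying the payload. */
static void
swap_slots(struct netmap_slot *rxs, struct netmap_slot *txs)
{
        uint32_t idx = txs->buf_idx;

        txs->buf_idx = rxs->buf_idx;    /* hand the received buffer to TX */
        rxs->buf_idx = idx;             /* give TX's old buffer back to RX */
        txs->len = rxs->len;
        txs->flags |= NS_BUF_CHANGED;   /* buf_idx changed: kernel must remap */
        rxs->flags |= NS_BUF_CHANGED;
}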
+ * Normally transmit completions are handled lazily and + * may be unreported. This flag lets us know when a slot + * has been sent (e.g. to terminate the sender). + */ + +#define NS_FORWARD 0x0004 /* pass packet 'forward' */ + /* + * (Only for physical ports, rx rings with NR_FORWARD set). + * Slot released to the kernel (i.e. before ring->head) with + * this flag set are passed to the peer ring (host/NIC), + * thus restoring the host-NIC connection for these slots. + * This supports efficient traffic monitoring or firewalling. + */ + +#define NS_NO_LEARN 0x0008 /* disable bridge learning */ + /* + * On a VALE switch, do not 'learn' the source port for + * this buffer. + */ + +#define NS_INDIRECT 0x0010 /* userspace buffer */ + /* + * (VALE tx rings only) data is in a userspace buffer, + * whose address is in the 'ptr' field in the slot. + */ + +#define NS_MOREFRAG 0x0020 /* packet has more fragments */ + /* + * (VALE ports only) + * Set on all but the last slot of a multi-segment packet. + * The 'len' field refers to the individual fragment. + */ + #define NS_PORT_SHIFT 8 #define NS_PORT_MASK (0xff << NS_PORT_SHIFT) - /* - * in rx rings, the high 8 bits - * are the number of fragments. - */ + /* + * The high 8 bits of the flag, if not zero, indicate the + * destination port for the VALE switch, overriding + * the lookup table. + */ + #define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff) - uint64_t ptr; /* pointer for indirect buffers */ -}; + /* + * (VALE rx rings only) the high 8 bits + * are the number of fragments. + */ + /* * struct netmap_ring * * Netmap representation of a TX or RX ring (also known as "queue"). * This is a queue implemented as a fixed-size circular array. - * At the software level, two fields are important: avail and cur. + * At the software level the important fields are: head, cur, tail. * * In TX rings: * - * avail tells how many slots are available for transmission. - * It is updated by the kernel in each netmap system call. - * It MUST BE decremented by the user when it - * adds a new packet to send. + * head first slot available for transmission. + * cur wakeup point. select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel * - * cur indicates the slot to use for the next packet - * to send (i.e. the "tail" of the queue). - * It MUST BE incremented by the user before - * netmap system calls to reflect the number of newly - * sent packets. - * It is checked by the kernel on netmap system calls - * (normally unmodified by the kernel unless invalid). + * [head .. tail-1] can be used for new packets to send; + * 'head' and 'cur' must be incremented as slots are filled + * with new packets to be sent; + * 'cur' can be moved further ahead if we need more space + * for new transmissions. * * In RX rings: * - * avail is the number of packets available (possibly 0). - * It is updated by the kernel in each netmap system call. - * It MUST BE decremented by the user when it - * consumes a packet. - * - * cur indicates the first slot that contains a packet not - * yet processed (the "head" of the queue). - * It MUST BE incremented by the user when it consumes - * a packet. - * - * reserved indicates the number of buffers before 'cur' - * that the user has not released yet. Normally 0, - * it MUST BE incremented by the user when it - * does not return the buffer immediately, and decremented - * when the buffer is finally freed. + * head first valid received packet + * cur wakeup point. 
select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel * + * [head .. tail-1] contain received packets; + * 'head' and 'cur' must be incremented as slots are consumed + * and can be returned to the kernel; + * 'cur' can be moved further ahead if we want to wait for + * new packets without returning the previous ones. * * DATA OWNERSHIP/LOCKING: - * The netmap_ring, all slots, and buffers in the range - * [reserved-cur , cur+avail[ are owned by the user program, - * and the kernel only touches them in the same thread context - * during a system call. - * Other buffers are reserved for use by the NIC's DMA engines. - * - * FLAGS - * NR_TIMESTAMP updates the 'ts' field on each syscall. This is - * a global timestamp for all packets. - * NR_RX_TSTMP if set, the last 64 byte in each buffer will - * contain a timestamp for the frame supplied by - * the hardware (if supported) - * NR_FORWARD if set, the NS_FORWARD flag in each slot of the - * RX ring is checked, and if set the packet is - * passed to the other side (host stack or device, - * respectively). This permits bpf-like behaviour - * or transparency for selected packets. + * The netmap_ring, and all slots and buffers in the range + * [head .. tail-1] are owned by the user program; + * the kernel only accesses them during a netmap system call + * and in the user thread context. + * + * Other slots and buffers are reserved for use by the kernel */ struct netmap_ring { /* @@ -216,19 +219,22 @@ struct netmap_ring { * It contains the offset of the buffer region from this * descriptor. */ - const ssize_t buf_ofs; + const int64_t buf_ofs; const uint32_t num_slots; /* number of slots in the ring. */ - uint32_t avail; /* number of usable slots */ - uint32_t cur; /* 'current' r/w position */ - uint32_t reserved; /* not refilled before current */ + const uint32_t nr_buf_size; + const uint16_t ringid; + const uint16_t dir; /* 0: tx, 1: rx */ - const uint16_t nr_buf_size; - uint16_t flags; -#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */ -#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */ -#define NR_RX_TSTMP 0x0008 /* set rx timestamp in slots */ + uint32_t head; /* (u) first user slot */ + uint32_t cur; /* (u) wakeup point */ + uint32_t tail; /* (k) first kernel slot */ - struct timeval ts; /* time of last *sync() */ + uint32_t flags; + + struct timeval ts; /* (k) time of last *sync() */ + + /* opaque room for a mutex or similar object */ + uint8_t sem[128] __attribute__((__aligned__(NM_CACHE_ALIGN))); /* the slots follow. This struct has variable size */ struct netmap_slot slot[0]; /* array of slots. */ @@ -236,6 +242,22 @@ struct netmap_ring { /* + * RING FLAGS + */ +#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */ + /* + * updates the 'ts' field on each netmap syscall. This saves + * saves a separate gettimeofday(), and is not much worse than + * software timestamps generated in the interrupt handler. + */ + +#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */ + /* + * Enables the NS_FORWARD slot flag for the ring. + */ + + +/* * Netmap representation of an interface and its queue(s). 
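To make the head/cur/tail ownership rule above concrete, the sketch below drains an RX ring and returns each slot to the kernel; consume() is a hypothetical callback, and a later poll() or NIOCRXSYNC is what lets the kernel advance 'tail' again:

    #include <net/netmap_user.h>    /* NETMAP_BUF, nm_ring_empty, nm_ring_next */

    /* process and release every packet currently in [head .. tail-1] */
    static void
    rx_drain(struct netmap_ring *ring,
             void (*consume)(const char *buf, uint16_t len))
    {
            while (!nm_ring_empty(ring)) {          /* i.e. cur != tail */
                    struct netmap_slot *slot = &ring->slot[ring->cur];

                    consume(NETMAP_BUF(ring, slot->buf_idx), slot->len);
                    /* advancing head as well returns the slot to the kernel */
                    ring->head = ring->cur = nm_ring_next(ring, ring->cur);
            }
    }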
* This is initialized by the kernel when binding a file * descriptor to a port, and should be considered as readonly @@ -252,81 +274,109 @@ struct netmap_if { const uint32_t ni_flags; /* properties */ #define NI_PRIV_MEM 0x1 /* private memory region */ - const uint32_t ni_rx_rings; /* number of rx rings */ - const uint32_t ni_tx_rings; /* number of tx rings */ + /* + * The number of packet rings available in netmap mode. + * Physical NICs can have different numbers of tx and rx rings. + * Physical NICs also have a 'host' ring pair. + * Additionally, clients can request additional ring pairs to + * be used for internal communication. + */ + const uint32_t ni_tx_rings; /* number of HW tx rings */ + const uint32_t ni_rx_rings; /* number of HW rx rings */ + + const uint32_t ni_extra_tx_rings; + const uint32_t ni_extra_rx_rings; /* * The following array contains the offset of each netmap ring - * from this structure. The first ni_tx_rings+1 entries refer - * to the tx rings, the next ni_rx_rings+1 refer to the rx rings - * (the last entry in each block refers to the host stack rings). + * from this structure, in the following order: + * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings; + * NIC rx rings (ni_rx_rings); host tx ring (1); extra rx rings. + * * The area is filled up by the kernel on NIOCREGIF, * and then only read by userspace code. */ const ssize_t ring_ofs[0]; }; + #ifndef NIOCREGIF /* * ioctl names and related fields * + * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, + * whose identity is set in NIOCREGIF through nr_ringid. + * These are non blocking and take no argument. + * * NIOCGINFO takes a struct ifreq, the interface name is the input, * the outputs are number of queues and number of descriptor * for each queue (useful to set number of threads etc.). * The info returned is only advisory and may change before * the interface is bound to a file descriptor. * - * NIOCREGIF takes an interface name within a struct ifreq, + * NIOCREGIF takes an interface name within a struct nmre, * and activates netmap mode on the interface (if possible). * - * nr_name is the name of the interface + * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we + * can pass it down to other NIC-related ioctls. * - * nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings - * indicate the configuration of the port on return. + * The actual argument (struct nmreq) has a number of options to request + * different functions. * - * On input, non-zero values for nr_tx_rings, nr_tx_slots and the - * rx counterparts may be used to reconfigure the port according - * to the requested values, but this is not guaranteed. - * The actual values are returned on completion of the ioctl(). + * nr_name (in) + * The name of the port (em0, valeXXX:YYY, etc.) + * limited to IFNAMSIZ for backward compatibility. * - * nr_ringid - * indicates how rings should be bound to the file descriptors. - * The default (0) means all physical rings of a NIC are bound. - * NETMAP_HW_RING plus a ring number lets you bind just - * a single ring pair. - * NETMAP_SW_RING binds only the host tx/rx rings - * NETMAP_NO_TX_POLL prevents select()/poll() from pushing - * out packets on the tx ring unless POLLOUT is specified. + * nr_version (in/out) + * Must match NETMAP_API as used in the kernel, error otherwise. + * Always returns the desired value on output. 
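With the ring_ofs[] ordering just described, the host-stack rings sit immediately after the hardware rings. A sketch of locating them, assuming nifp was computed with NETMAP_IF() after mmap() and no extra ring pairs were requested:

    #include <net/netmap_user.h>    /* NETMAP_TXRING, NETMAP_RXRING */

    /* host-stack rings follow the hardware rings in ring_ofs[] */
    static struct netmap_ring *
    host_tx_ring(struct netmap_if *nifp)
    {
            return NETMAP_TXRING(nifp, nifp->ni_tx_rings);
    }

    static struct netmap_ring *
    host_rx_ring(struct netmap_if *nifp)
    {
            return NETMAP_RXRING(nifp, nifp->ni_rx_rings);
    }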
* - * NETMAP_PRIV_MEM is a return value used to indicate that - * this ring is in a private memory region hence buffer - * swapping cannot be used + * nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings (in/out) + * On input, non-zero values may be used to reconfigure the port + * according to the requested values, but this is not guaranteed. + * On output the actual values in use are reported. * - * nr_cmd is used to configure NICs attached to a VALE switch, - * or to dump the configuration of a VALE switch. + * nr_ringid (in) + * Indicates how rings should be bound to the file descriptors. + * 0 (default) binds all physical rings + * NETMAP_HW_RING | ring number binds a single ring pair + * NETMAP_SW_RING binds only the host tx/rx rings * - * nr_cmd = NETMAP_BDG_ATTACH and nr_name = vale*:ifname - * attaches the NIC to the switch, with nr_ringid specifying - * which rings to use + * NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push + * packets on tx rings only if POLLOUT is set. + * The default is to push any pending packet. * - * nr_cmd = NETMAP_BDG_DETACH and nr_name = vale*:ifname - * disconnects a previously attached NIC + * NETMAP_PRIV_MEM is set on return for ports that use private + * memory regions and cannot use buffer swapping. * - * nr_cmd = NETMAP_BDG_LIST is used to list the configuration - * of VALE switches, with additional arguments. + * nr_cmd (in) if non-zero indicates a special command: + * NETMAP_BDG_ATTACH and nr_name = vale*:ifname + * attaches the NIC to the switch; nr_ringid specifies + * which rings to use. Used by vale-ctl -a ... + * nr_arg1 = NETMAP_BDG_HOST also attaches the host port + * as in vale-ctl -h ... * - * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, - * whose identity is set in NIOCREGIF through nr_ringid + * NETMAP_BDG_DETACH and nr_name = vale*:ifname + * disconnects a previously attached NIC. + * Used by vale-ctl -d ... + * + * NETMAP_BDG_LIST + * list the configuration of VALE switches. + * + * NETMAP_BDG_OFFSET XXX ? + * Set the offset of data in packets. Used with VALE + * switches where the clients use the vhost header. + * + * nr_arg1, nr_arg2 (in/out) command specific * - * NETMAP_API is the API version. */ + /* * struct nmreq overlays a struct ifreq */ struct nmreq { char nr_name[IFNAMSIZ]; uint32_t nr_version; /* API version */ -#define NETMAP_API 5 /* current version */ uint32_t nr_offset; /* nifp offset in the shared region */ uint32_t nr_memsize; /* size of the shared region */ uint32_t nr_tx_slots; /* slots in tx rings */ @@ -339,19 +389,23 @@ struct nmreq { #define NETMAP_SW_RING 0x2000 /* process the sw ring */ #define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */ #define NETMAP_RING_MASK 0xfff /* the ring number */ + uint16_t nr_cmd; #define NETMAP_BDG_ATTACH 1 /* attach the NIC */ #define NETMAP_BDG_DETACH 2 /* detach the NIC */ #define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */ #define NETMAP_BDG_LIST 4 /* get bridge's info */ #define NETMAP_BDG_OFFSET 5 /* set the port offset */ + uint16_t nr_arg1; #define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */ #define NETMAP_BDG_MAX_OFFSET 12 + uint16_t nr_arg2; uint32_t spare2[3]; }; + /* * FreeBSD uses the size value embedded in the _IOWR to determine * how much to copy in/out. 
So we need it to match the actual @@ -360,9 +414,22 @@ struct nmreq { */ #define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */ #define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */ -#define NIOCUNREGIF _IO('i', 147) /* deprecated. Was interface unregister */ #define NIOCTXSYNC _IO('i', 148) /* sync tx queues */ #define NIOCRXSYNC _IO('i', 149) /* sync rx queues */ #endif /* !NIOCREGIF */ + +/* + * Helper functions for kernel and userspace + */ + +/* + * check if space is available in the ring. + */ +static inline int +nm_ring_empty(struct netmap_ring *ring) +{ + return (ring->cur == ring->tail); +} + #endif /* _NET_NETMAP_H_ */ diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h index 3f2858304caf..bd6fe0db22ae 100644 --- a/sys/net/netmap_user.h +++ b/sys/net/netmap_user.h @@ -1,6 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. - * Copyright (C) 2013 Universita` di Pisa + * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -28,8 +27,8 @@ /* * $FreeBSD$ * - * This header contains the macros used to manipulate netmap structures - * and packets in userspace. See netmap(4) for more information. + * Functions and macros to manipulate netmap structures and packets + * in userspace. See netmap(4) for more information. * * The address of the struct netmap_if, say nifp, is computed from the * value returned from ioctl(.., NIOCREG, ...) and the mmap region: @@ -44,17 +43,20 @@ * we can access ring->nr_cur, ring->nr_avail, ring->nr_flags * * ring->slot[i] gives us the i-th slot (we can access - * directly plen, flags, bufindex) + * directly len, flags, buf_idx) * * char *buf = NETMAP_BUF(ring, x) returns a pointer to * the buffer numbered x * - * Since rings are circular, we have macros to compute the next index - * i = NETMAP_RING_NEXT(ring, i); + * All ring indexes (head, cur, tail) should always move forward. + * To compute the next index in a circular ring you can use + * i = nm_ring_next(ring, i); * * To ease porting apps from pcap to netmap we supply a few fuctions - * that can be called to open, close and read from netmap in a way - * similar to libpcap. + * that can be called to open, close, read and write on netmap in a way + * similar to libpcap. Note that the read/write function depend on + * an ioctl()/select()/poll() being issued to refill rings or push + * packets out. * * In order to use these, include #define NETMAP_WITH_LIBS * in the source file that invokes these functions. @@ -65,12 +67,19 @@ #include <stdint.h> #include <net/if.h> /* IFNAMSIZ */ + +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif /* likely and unlikely */ + #include <net/netmap.h> +/* helper macro */ #define _NETMAP_OFFSET(type, ptr, offset) \ ((type)(void *)((char *)(ptr) + (offset))) -#define NETMAP_IF(b, o) _NETMAP_OFFSET(struct netmap_if *, b, o) +#define NETMAP_IF(_base, _ofs) _NETMAP_OFFSET(struct netmap_if *, _base, _ofs) #define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \ nifp, (nifp)->ring_ofs[index] ) @@ -85,18 +94,34 @@ ( ((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs) ) / \ (ring)->nr_buf_size ) -#define NETMAP_RING_NEXT(r, i) \ - ((i)+1 == (r)->num_slots ? 0 : (i) + 1 ) -#define NETMAP_RING_FIRST_RESERVED(r) \ - ( (r)->cur < (r)->reserved ? 
\ - (r)->cur + (r)->num_slots - (r)->reserved : \ - (r)->cur - (r)->reserved ) +static inline uint32_t +nm_ring_next(struct netmap_ring *r, uint32_t i) +{ + return ( unlikely(i + 1 == r->num_slots) ? 0 : i + 1); +} + /* - * Return 1 if the given tx ring is empty. + * Return 1 if we have pending transmissions in the tx ring. + * When everything is complete ring->cur = ring->tail + 1 (modulo ring size) */ -#define NETMAP_TX_RING_EMPTY(r) ((r)->avail >= (r)->num_slots - 1) +static inline int +nm_tx_pending(struct netmap_ring *r) +{ + return nm_ring_next(r, r->tail) != r->cur; +} + + +static inline uint32_t +nm_ring_space(struct netmap_ring *ring) +{ + int ret = ring->tail - ring->cur; + if (ret < 0) + ret += ring->num_slots; + return ret; +} + #ifdef NETMAP_WITH_LIBS /* @@ -113,7 +138,12 @@ #include <sys/ioctl.h> #include <sys/errno.h> /* EINVAL */ #include <fcntl.h> /* O_RDWR */ -#include <malloc.h> +#include <unistd.h> /* close() */ +#ifdef __FreeBSD__ +#include <stdlib.h> +#else +#include <malloc.h> /* on FreeBSD it is stdlib.h */ +#endif struct nm_hdr_t { /* same as pcap_pkthdr */ struct timeval ts; @@ -139,30 +169,73 @@ struct nm_desc_t { #define IS_NETMAP_DESC(d) (P2NMD(d)->self == P2NMD(d)) #define NETMAP_FD(d) (P2NMD(d)->fd) + +/* + * this is a slightly optimized copy routine which rounds + * to multiple of 64 bytes and is often faster than dealing + * with other odd sizes. We assume there is enough room + * in the source and destination buffers. + * + * XXX only for multiples of 64 bytes, non overlapped. + */ +static inline void +pkt_copy(const void *_src, void *_dst, int l) +{ + const uint64_t *src = _src; + uint64_t *dst = _dst; + if (unlikely(l >= 1024)) { + memcpy(dst, src, l); + return; + } + for (; likely(l > 0); l-=64) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + } +} + + /* * The callback, invoked on each received packet. Same as libpcap */ typedef void (*nm_cb_t)(u_char *, const struct nm_hdr_t *, const u_char *d); /* - * The open routine accepts an ifname (netmap:foo or vale:foo) and - * optionally a second (string) argument indicating the ring number + *--- the pcap-like API --- + * + * nm_open() opens a file descriptor, binds to a port and maps memory. + * + * ifname (netmap:foo or vale:foo) is the port name + * flags can be NETMAP_SW_RING or NETMAP_HW_RING etc. + * ring_no only used if NETMAP_HW_RING is specified, is interpreted + * as a string or integer indicating the ring number + * ring_flags is stored in all ring flags (e.g. for transparent mode) * to open. If successful, t opens the fd and maps the memory. */ + static struct nm_desc_t *nm_open(const char *ifname, const char *ring_no, int flags, int ring_flags); /* - * nm_dispatch() is the same as pcap_dispatch() - * nm_next() is the same as pcap_next() + * nm_close() closes and restores the port to its previous state */ -static int nm_dispatch(struct nm_desc_t *, int, nm_cb_t, u_char *); -static u_char *nm_next(struct nm_desc_t *, struct nm_hdr_t *); + +static int nm_close(struct nm_desc_t *); /* - * unmap memory, close file descriptor and free the descriptor. 
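The helpers above (nm_ring_space(), nm_ring_next(), pkt_copy()) combine naturally into a batched transmit routine. A sketch, assuming the caller issues NIOCTXSYNC or poll() afterwards to push the frames, and keeping in mind that pkt_copy() may copy up to the next multiple of 64 bytes, which is harmless inside a netmap buffer:

    #include <sys/types.h>
    #define NETMAP_WITH_LIBS        /* needed for pkt_copy() */
    #include <net/netmap_user.h>

    /* queue up to n copies of one frame; returns the number of slots filled */
    static u_int
    tx_batch(struct netmap_ring *ring, const void *frame, uint16_t len, u_int n)
    {
            u_int i, cur = ring->cur;
            u_int space = nm_ring_space(ring);

            if (n > space)
                    n = space;
            for (i = 0; i < n; i++) {
                    struct netmap_slot *slot = &ring->slot[cur];

                    pkt_copy(frame, NETMAP_BUF(ring, slot->buf_idx), len);
                    slot->len = len;
                    cur = nm_ring_next(ring, cur);
            }
            ring->head = ring->cur = cur;   /* hand the filled slots to the kernel */
            return n;
    }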
+ * nm_inject() is the same as pcap_inject() + * nm_dispatch() is the same as pcap_dispatch() + * nm_nextpkt() is the same as pcap_next() */ -static int nm_close(struct nm_desc_t *); + +static int nm_inject(struct nm_desc_t *, const void *, size_t); +static int nm_dispatch(struct nm_desc_t *, int, nm_cb_t, u_char *); +static u_char *nm_nextpkt(struct nm_desc_t *, struct nm_hdr_t *); /* @@ -240,6 +313,12 @@ fail: static int nm_close(struct nm_desc_t *d) { + /* + * ugly trick to avoid unused warnings + */ + static void *__xxzt[] __attribute__ ((unused)) = + { nm_open, nm_inject, nm_dispatch, nm_nextpkt } ; + if (d == NULL || d->self != d) return EINVAL; if (d->mem) @@ -253,9 +332,45 @@ nm_close(struct nm_desc_t *d) /* + * Same prototype as pcap_inject(), only need to cast. + */ +static int +nm_inject(struct nm_desc_t *d, const void *buf, size_t size) +{ + u_int c, n = d->last_ring - d->first_ring + 1; + + if (0) fprintf(stderr, "%s rings %d %d %d\n", __FUNCTION__, + d->first_ring, d->cur_ring, d->last_ring); + for (c = 0; c < n ; c++) { + /* compute current ring to use */ + struct netmap_ring *ring; + uint32_t i, idx; + uint32_t ri = d->cur_ring + c; + + if (ri > d->last_ring) + ri = d->first_ring; + ring = NETMAP_TXRING(d->nifp, ri); + if (nm_ring_empty(ring)) { + if (0) fprintf(stderr, "%s ring %d cur %d tail %d\n", + __FUNCTION__, + ri, ring->cur, ring->tail); + continue; + } + i = ring->cur; + idx = ring->slot[i].buf_idx; + ring->slot[i].len = size; + pkt_copy(buf, NETMAP_BUF(ring, idx), size); + d->cur_ring = ri; + ring->head = ring->cur = nm_ring_next(ring, i); + return size; + } + return 0; /* fail */ +} + + +/* * Same prototype as pcap_dispatch(), only need to cast. */ -inline /* not really, but disable unused warnings */ static int nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg) { @@ -276,7 +391,7 @@ nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg) if (ri > d->last_ring) ri = d->first_ring; ring = NETMAP_RXRING(d->nifp, ri); - for ( ; ring->avail > 0 && cnt != got; got++) { + for ( ; !nm_ring_empty(ring) && cnt != got; got++) { u_int i = ring->cur; u_int idx = ring->slot[i].buf_idx; u_char *buf = (u_char *)NETMAP_BUF(ring, idx); @@ -285,24 +400,22 @@ nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg) d->hdr.len = d->hdr.caplen = ring->slot[i].len; d->hdr.ts = ring->ts; cb(arg, &d->hdr, buf); - ring->cur = NETMAP_RING_NEXT(ring, i); - ring->avail--; + ring->head = ring->cur = nm_ring_next(ring, i); } } d->cur_ring = ri; return got; } -inline /* not really, but disable unused warnings */ static u_char * -nm_next(struct nm_desc_t *d, struct nm_hdr_t *hdr) +nm_nextpkt(struct nm_desc_t *d, struct nm_hdr_t *hdr) { int ri = d->cur_ring; do { /* compute current ring to use */ struct netmap_ring *ring = NETMAP_RXRING(d->nifp, ri); - if (ring->avail > 0) { + if (!nm_ring_empty(ring)) { u_int i = ring->cur; u_int idx = ring->slot[i].buf_idx; u_char *buf = (u_char *)NETMAP_BUF(ring, idx); @@ -310,8 +423,12 @@ nm_next(struct nm_desc_t *d, struct nm_hdr_t *hdr) // prefetch(buf); hdr->ts = ring->ts; hdr->len = hdr->caplen = ring->slot[i].len; - ring->cur = NETMAP_RING_NEXT(ring, i); - ring->avail--; + ring->cur = nm_ring_next(ring, i); + /* we could postpone advancing head if we want + * to hold the buffer. This can be supported in + * the future. + */ + ring->head = ring->cur; d->cur_ring = ri; return buf; } |
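Putting the pcap-like wrappers together, a complete receive loop looks like the sketch below; handle_pkt is a hypothetical callback and "netmap:em0" is just an example port name:

    #include <sys/types.h>
    #include <stdio.h>
    #include <poll.h>
    #define NETMAP_WITH_LIBS
    #include <net/netmap_user.h>

    static void
    handle_pkt(u_char *arg, const struct nm_hdr_t *h, const u_char *buf)
    {
            (void)arg;
            (void)buf;
            printf("got a %u byte packet\n", (unsigned)h->len);
    }

    int
    main(void)
    {
            struct nm_desc_t *d = nm_open("netmap:em0", NULL, 0, 0);
            struct pollfd pfd;

            if (d == NULL)
                    return 1;
            pfd.fd = NETMAP_FD(d);
            pfd.events = POLLIN;
            for (;;) {
                    poll(&pfd, 1, 1000);                    /* lets the kernel refill the rx rings */
                    nm_dispatch(d, -1, handle_pkt, NULL);   /* -1: handle all pending packets */
            }
            /* not reached */
            nm_close(d);
            return 0;
    }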