author:    Luigi Rizzo <luigi@FreeBSD.org>  2013-12-15 08:37:24 +0000
committer: Luigi Rizzo <luigi@FreeBSD.org>  2013-12-15 08:37:24 +0000
commit:    f9790aeb8869bfcedf111517bace712b390e6cc5 (patch)
tree:      6735285cbd194faacde40e3b3efd37c63ec36d15 /sys/dev
parent:    c3e51c9ce1ca864e37fef30547c947384cc0955a (diff)
split netmap code according to functions:
- netmap.c base code
- netmap_freebsd.c FreeBSD-specific code
- netmap_generic.c emulate netmap over standard drivers
- netmap_mbq.c simple mbuf tailq
- netmap_mem2.c memory management
- netmap_vale.c VALE switch
simplify device-specific code
Notes:
    svn path=/head/; revision=259412
Diffstat (limited to 'sys/dev')
-rw-r--r--  sys/dev/netmap/if_em_netmap.h     236
-rw-r--r--  sys/dev/netmap/if_igb_netmap.h    277
-rw-r--r--  sys/dev/netmap/if_lem_netmap.h    269
-rw-r--r--  sys/dev/netmap/if_re_netmap.h     293
-rw-r--r--  sys/dev/netmap/ixgbe_netmap.h     463
-rw-r--r--  sys/dev/netmap/netmap.c          3193
-rw-r--r--  sys/dev/netmap/netmap_freebsd.c   410
-rw-r--r--  sys/dev/netmap/netmap_generic.c   818
-rw-r--r--  sys/dev/netmap/netmap_kern.h      596
-rw-r--r--  sys/dev/netmap/netmap_mbq.c       152
-rw-r--r--  sys/dev/netmap/netmap_mbq.h        78
-rw-r--r--  sys/dev/netmap/netmap_mem2.c      292
-rw-r--r--  sys/dev/netmap/netmap_mem2.h       15
-rw-r--r--  sys/dev/netmap/netmap_vale.c     1983
14 files changed, 5600 insertions, 3475 deletions
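
Besides moving code between files, the diff below converts every driver's netmap methods to a new internal API: nm_register(), nm_txsync() and nm_rxsync() now take a struct netmap_adapter * rather than a struct ifnet *, user-supplied ring indexes are validated centrally by nm_txsync_prologue()/nm_rxsync_prologue(), and each sync routine is reorganized into a "process new packets" part followed by a "reclaim/refill" part. The skeleton below is a reading aid distilled from the em/igb/lem hunks that follow, not code from the commit itself: the mydev_* name is hypothetical and the device-specific descriptor handling is reduced to comments.

static int
mydev_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->tx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	u_int nm_i;	/* index into the netmap ring */
	u_int nic_i;	/* index into the NIC ring */
	u_int n, new_slots;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const cur = nm_txsync_prologue(kring, &new_slots);

	if (cur > lim)	/* error flagged by the prologue */
		return netmap_ring_reinit(kring);

	/* First part: push out slots [nr_hwcur .. cur). */
	nm_i = kring->nr_hwcur;
	if (nm_i != cur) {
		nic_i = netmap_idx_k2n(kring, nm_i); /* netmap -> NIC index */
		for (n = 0; nm_i != cur; n++) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			u_int len = slot->len;
			uint64_t paddr;
			void *addr = PNMB(slot, &paddr);

			/* jumps to ring_reset on a bad buffer */
			NM_CHECK_ADDR_LEN(addr, len);

			/* device-specific: write the NIC descriptor for
			 * (paddr, len), reloading the dmamap if the slot
			 * carries NS_BUF_CHANGED */

			nm_i = nm_next(nm_i, lim);
			nic_i = nm_next(nic_i, lim);
		}
		kring->nr_hwcur = cur;		/* the saved ring->cur */
		kring->nr_hwavail -= new_slots;

		/* device-specific: sync the NIC ring, then (re)start the
		 * tx unit up to slot nic_i (excluded) */
	}

	/* Second part: reclaim buffers for completed transmissions. */
	if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) {
		/* device-specific: read the NIC's transmit head and
		 * credit completed slots back to kring->nr_hwavail */
	}

	nm_txsync_finalize(kring, cur);
	return 0;

ring_reset:
	return netmap_ring_reinit(kring);
}

The attach side changes accordingly: drivers now fill na.num_tx_rings/na.num_rx_rings themselves and call netmap_attach(&na) without the old ring-count argument, and the register callback reports failure by returning nonzero when IFF_DRV_RUNNING did not come back up.
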
diff --git a/sys/dev/netmap/if_em_netmap.h b/sys/dev/netmap/if_em_netmap.h index 1ea11238aaaf..dbbee4222407 100644 --- a/sys/dev/netmap/if_em_netmap.h +++ b/sys/dev/netmap/if_em_netmap.h @@ -26,7 +26,7 @@ /* * $FreeBSD$ * - * netmap support for em. + * netmap support for: em. * * For more details on netmap support please see ixgbe_netmap.h */ @@ -39,10 +39,6 @@ #include <dev/netmap/netmap_kern.h> -static void em_netmap_block_tasks(struct adapter *); -static void em_netmap_unblock_tasks(struct adapter *); - - // XXX do we need to block/unblock the tasks ? static void em_netmap_block_tasks(struct adapter *adapter) @@ -85,45 +81,31 @@ em_netmap_unblock_tasks(struct adapter *adapter) /* - * Register/unregister routine + * Register/unregister. We are already under netmap lock. */ static int -em_netmap_reg(struct ifnet *ifp, int onoff) +em_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - - if (na == NULL) - return EINVAL; /* no netmap support here */ + EM_CORE_LOCK(adapter); em_disable_intr(adapter); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); em_netmap_block_tasks(adapter); - + /* enable or disable flags and callbacks in na and ifp */ if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - em_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + nm_set_native_flags(na); } else { -fail: - /* return to non-netmap mode */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - em_init_locked(adapter); /* also enable intr */ + nm_clear_native_flags(na); } + em_init_locked(adapter); /* also enable intr */ em_netmap_unblock_tasks(adapter); - return (error); + EM_CORE_UNLOCK(adapter); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } @@ -131,93 +113,103 @@ fail: * Reconcile kernel and user view of the transmit ring. */ static int -em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n, new_slots; + u_int const lim = kring->nkr_num_slots - 1; + u_int const cur = nm_txsync_prologue(kring, &new_slots); /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; - k = ring->cur; - if (k > lim) + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + + if (cur > lim) /* error checking in nm_txsync_prologue() */ return netmap_ring_reinit(kring); bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); /* - * Process new packets to send. j is the current index in the - * netmap ring, l is the corresponding index in the NIC ring. + * First part: process new packets to send. 
*/ - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* slot is the current slot in the netmap ring */ - struct netmap_slot *slot = &ring->slot[j]; - /* curr is the current slot in the nic ring */ - struct e1000_tx_desc *curr = &txr->tx_base[l]; - struct em_buffer *txbuf = &txr->tx_buffers[l]; - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? - E1000_TXD_CMD_RS : 0; + + nm_i = kring->nr_hwcur; + if (nm_i != cur) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != cur; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; uint64_t paddr; void *addr = PNMB(slot, &paddr); - u_int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - return netmap_ring_reinit(kring); - } + /* device-specific */ + struct e1000_tx_desc *curr = &txr->tx_base[nic_i]; + struct em_buffer *txbuf = &txr->tx_buffers[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? + E1000_TXD_CMD_RS : 0; + + NM_CHECK_ADDR_LEN(addr, len); - slot->flags &= ~NS_REPORT; if (slot->flags & NS_BUF_CHANGED) { curr->buffer_addr = htole64(paddr); /* buffer has changed, reload map */ netmap_reload_map(txr->txtag, txbuf->map, addr); - slot->flags &= ~NS_BUF_CHANGED; } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ curr->upper.data = 0; curr->lower.data = htole32(adapter->txd_cmd | len | (E1000_TXD_CMD_EOP | flags) ); bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = k; /* the saved ring->cur */ - kring->nr_hwavail -= n; + kring->nr_hwcur = cur; /* the saved ring->cur */ + /* decrease avail by # of packets sent minus previous ones */ + kring->nr_hwavail -= new_slots; + /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), l); + /* (re)start the tx unit up to slot nic_i (excluded) */ + E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), nic_i); } - if (n == 0 || kring->nr_hwavail < 1) { + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) { int delta; /* record completed transmissions using TDH */ - l = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); - if (l >= kring->nkr_num_slots) { /* XXX can it happen ? */ - D("TDH wrap %d", l); - l -= kring->nkr_num_slots; + nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } - delta = l - txr->next_to_clean; + delta = nic_i - txr->next_to_clean; if (delta) { /* some completed, increment hwavail. */ if (delta < 0) delta += kring->nkr_num_slots; - txr->next_to_clean = l; + txr->next_to_clean = nic_i; kring->nr_hwavail += delta; } } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + + nm_txsync_finalize(kring, cur); return 0; } @@ -227,19 +219,23 @@ em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) * Reconcile kernel and user view of the receive ring. 
*/ static int -em_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n, resvd; + u_int const lim = kring->nkr_num_slots - 1; + u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - k = ring->cur; - if (k > lim) + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + + if (cur > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ @@ -247,84 +243,85 @@ em_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * Import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. */ - l = rxr->next_to_check; - j = netmap_idx_n2k(kring, l); if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + nic_i = rxr->next_to_check; + nm_i = netmap_idx_n2k(kring, nic_i); + for (n = 0; ; n++) { - struct e1000_rx_desc *curr = &rxr->rx_base[l]; + struct e1000_rx_desc *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->status); if ((staterr & E1000_RXD_STAT_DD) == 0) break; - ring->slot[j].len = le16toh(curr->length); - ring->slot[j].flags = slot_flags; - bus_dmamap_sync(rxr->rxtag, rxr->rx_buffers[l].map, + ring->slot[nm_i].len = le16toh(curr->length); + ring->slot[nm_i].flags = slot_flags; + bus_dmamap_sync(rxr->rxtag, rxr->rx_buffers[nic_i].map, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; + nm_i = nm_next(nm_i, lim); /* make sure next_to_refresh follows next_to_check */ - rxr->next_to_refresh = l; // XXX - l = (l == lim) ? 0 : l + 1; + rxr->next_to_refresh = nic_i; // XXX + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ - rxr->next_to_check = l; + rxr->next_to_check = nic_i; kring->nr_hwavail += n; } kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); /* NIC ring index */ - for (n = 0; j != k; n++) { - struct netmap_slot *slot = &ring->slot[j]; - struct e1000_rx_desc *curr = &rxr->rx_base[l]; - struct em_buffer *rxbuf = &rxr->rx_buffers[l]; + /* + * Second part: skip past packets that userspace has released. 
+ */ + nm_i = kring->nr_hwcur; + if (nm_i != cur) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != cur; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - return netmap_ring_reinit(kring); - } + struct e1000_rx_desc *curr = &rxr->rx_base[nic_i]; + struct em_buffer *rxbuf = &rxr->rx_buffers[nic_i]; + + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { - curr->buffer_addr = htole64(paddr); /* buffer has changed, reload map */ + curr->buffer_addr = htole64(paddr); netmap_reload_map(rxr->rxtag, rxbuf->map, addr); slot->flags &= ~NS_BUF_CHANGED; } curr->status = 0; bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = cur; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + * so move nic_i back by one unit */ - l = (l == 0) ? lim : l - 1; - E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), l); + nic_i = (nic_i == 0) ? lim : nic_i - 1; + E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i); } - /* tell userspace that there are new packets */ + + /* tell userspace that there might be new packets */ ring->avail = kring->nr_hwavail - resvd; + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } @@ -342,7 +339,8 @@ em_netmap_attach(struct adapter *adapter) na.nm_txsync = em_netmap_txsync; na.nm_rxsync = em_netmap_rxsync; na.nm_register = em_netmap_reg; - netmap_attach(&na, adapter->num_queues); + na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + netmap_attach(&na); } /* end of file */ diff --git a/sys/dev/netmap/if_igb_netmap.h b/sys/dev/netmap/if_igb_netmap.h index 10d94b5faa38..b91d0baba06f 100644 --- a/sys/dev/netmap/if_igb_netmap.h +++ b/sys/dev/netmap/if_igb_netmap.h @@ -37,44 +37,43 @@ #include <vm/pmap.h> /* vtophys ? */ #include <dev/netmap/netmap_kern.h> +/* + * Adaptation to different versions of the driver. + */ + +#ifndef IGB_MEDIA_RESET +/* at the same time as IGB_MEDIA_RESET was defined, the + * tx buffer descriptor was renamed, so use this to revert + * back to the old name. + */ +#define igb_tx_buf igb_tx_buffer +#endif + /* - * register-unregister routine + * Register/unregister. We are already under netmap lock. 
*/ static int -igb_netmap_reg(struct ifnet *ifp, int onoff) +igb_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - - if (na == NULL) - return EINVAL; /* no netmap support here */ + IGB_CORE_LOCK(adapter); igb_disable_intr(adapter); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + /* enable or disable flags and callbacks in na and ifp */ if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - igb_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + nm_set_native_flags(na); } else { -fail: - /* restore if_transmit */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - igb_init_locked(adapter); /* also enable intr */ + nm_clear_native_flags(na); } - return (error); + igb_init_locked(adapter); /* also enable intr */ + IGB_CORE_UNLOCK(adapter); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } @@ -82,68 +81,62 @@ fail: * Reconcile kernel and user view of the transmit ring. */ static int -igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n, new_slots; + u_int const lim = kring->nkr_num_slots - 1; + u_int const cur = nm_txsync_prologue(kring, &new_slots); /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; - k = ring->cur; - if (k > lim) + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + /* 82575 needs the queue index added */ + u32 olinfo_status = + (adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0; + + if (cur > lim) /* error checking in nm_txsync_prologue() */ return netmap_ring_reinit(kring); bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, - BUS_DMASYNC_POSTREAD); + BUS_DMASYNC_POSTREAD); - /* check for new packets to send. - * j indexes the netmap ring, l indexes the nic ring, and - * j = kring->nr_hwcur, l = E1000_TDT (not tracked), - * j == (l + kring->nkr_hwofs) % ring_size + /* + * First part: process new packets to send. */ - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - /* 82575 needs the queue index added */ - u32 olinfo_status = - (adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0; - - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* slot is the current slot in the netmap ring */ - struct netmap_slot *slot = &ring->slot[j]; - /* curr is the current slot in the nic ring */ - union e1000_adv_tx_desc *curr = - (union e1000_adv_tx_desc *)&txr->tx_base[l]; -#ifndef IGB_MEDIA_RESET -/* at the same time as IGB_MEDIA_RESET was defined, the - * tx buffer descriptor was renamed, so use this to revert - * back to the old name. 
- */ -#define igb_tx_buf igb_tx_buffer -#endif - struct igb_tx_buf *txbuf = &txr->tx_buffers[l]; - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? - E1000_ADVTXD_DCMD_RS : 0; + + nm_i = kring->nr_hwcur; + if (nm_i != cur) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != cur; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; uint64_t paddr; void *addr = PNMB(slot, &paddr); - u_int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - return netmap_ring_reinit(kring); - } + /* device-specific */ + union e1000_adv_tx_desc *curr = + (union e1000_adv_tx_desc *)&txr->tx_base[nic_i]; + struct igb_tx_buf *txbuf = &txr->tx_buffers[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? + E1000_ADVTXD_DCMD_RS : 0; + + NM_CHECK_ADDR_LEN(addr, len); - slot->flags &= ~NS_REPORT; if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ netmap_reload_map(txr->txtag, txbuf->map, addr); - slot->flags &= ~NS_BUF_CHANGED; } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ curr->read.buffer_addr = htole64(paddr); // XXX check olinfo and cmd_type_len curr->read.olinfo_status = @@ -151,48 +144,56 @@ igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) (len<< E1000_ADVTXD_PAYLEN_SHIFT)); curr->read.cmd_type_len = htole32(len | E1000_ADVTXD_DTYP_DATA | - E1000_ADVTXD_DCMD_IFCS | - E1000_ADVTXD_DCMD_DEXT | - E1000_ADVTXD_DCMD_EOP | flags); + E1000_ADVTXD_DCMD_IFCS | + E1000_ADVTXD_DCMD_DEXT | + E1000_ADVTXD_DCMD_EOP | flags); + /* make sure changes to the buffer are synced */ bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = k; /* the saved ring->cur */ - kring->nr_hwavail -= n; + kring->nr_hwcur = cur; /* the saved ring->cur */ + /* decrease avail by # of packets sent minus previous ones */ + kring->nr_hwavail -= new_slots; /* Set the watchdog XXX ? */ txr->queue_status = IGB_QUEUE_WORKING; txr->watchdog_time = ticks; + /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), l); + /* (re)start the tx unit up to slot nic_i (excluded) */ + E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), nic_i); } - if (n == 0 || kring->nr_hwavail < 1) { + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) { int delta; /* record completed transmissions using TDH */ - l = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); - if (l >= kring->nkr_num_slots) { /* XXX can it happen ? */ - D("TDH wrap %d", l); - l -= kring->nkr_num_slots; + nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } - delta = l - txr->next_to_clean; + delta = nic_i - txr->next_to_clean; if (delta) { /* some completed, increment hwavail. 
*/ if (delta < 0) delta += kring->nkr_num_slots; - txr->next_to_clean = l; + txr->next_to_clean = nic_i; kring->nr_hwavail += delta; } } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + + nm_txsync_finalize(kring, cur); return 0; } @@ -202,101 +203,107 @@ igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) * Reconcile kernel and user view of the receive ring. */ static int -igb_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n, resvd; + u_int const lim = kring->nkr_num_slots - 1; + u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - k = ring->cur; - if (k > lim) + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + + if (cur > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. */ - l = rxr->next_to_check; - j = netmap_idx_n2k(kring, l); if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + nic_i = rxr->next_to_check; + nm_i = netmap_idx_n2k(kring, nic_i); + for (n = 0; ; n++) { - union e1000_adv_rx_desc *curr = &rxr->rx_base[l]; + union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->wb.upper.status_error); if ((staterr & E1000_RXD_STAT_DD) == 0) break; - ring->slot[j].len = le16toh(curr->wb.upper.length); - ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = le16toh(curr->wb.upper.length); + ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(rxr->ptag, - rxr->rx_buffers[l].pmap, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + rxr->rx_buffers[nic_i].pmap, BUS_DMASYNC_POSTREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ - rxr->next_to_check = l; + rxr->next_to_check = nic_i; kring->nr_hwavail += n; } kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - struct netmap_slot *slot = ring->slot + j; - union e1000_adv_rx_desc *curr = &rxr->rx_base[l]; - struct igb_rx_buf *rxbuf = rxr->rx_buffers + l; + /* + * Second part: skip past packets that userspace has released. 
+ */ + nm_i = kring->nr_hwcur; + if (nm_i != cur) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != cur; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - return netmap_ring_reinit(kring); - } + union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i]; + struct igb_rx_buf *rxbuf = &rxr->rx_buffers[nic_i]; + + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, reload map */ netmap_reload_map(rxr->ptag, rxbuf->pmap, addr); slot->flags &= ~NS_BUF_CHANGED; } - curr->read.pkt_addr = htole64(paddr); curr->wb.upper.status_error = 0; + curr->read.pkt_addr = htole64(paddr); bus_dmamap_sync(rxr->ptag, rxbuf->pmap, - BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + BUS_DMASYNC_PREREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = cur; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + * so move nic_i back by one unit */ - l = (l == 0) ? lim : l - 1; - E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), l); + nic_i = (nic_i == 0) ? lim : nic_i - 1; + E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i); } - /* tell userspace that there are new packets */ + + /* tell userspace that there might be new packets */ ring->avail = kring->nr_hwavail - resvd; + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } @@ -314,6 +321,8 @@ igb_netmap_attach(struct adapter *adapter) na.nm_txsync = igb_netmap_txsync; na.nm_rxsync = igb_netmap_rxsync; na.nm_register = igb_netmap_reg; - netmap_attach(&na, adapter->num_queues); -} + na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + netmap_attach(&na); +} + /* end of file */ diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h index 25e5c7c27e3e..8ad3b7a2a352 100644 --- a/sys/dev/netmap/if_lem_netmap.h +++ b/sys/dev/netmap/if_lem_netmap.h @@ -27,11 +27,12 @@ /* * $FreeBSD$ * - * netmap support for "lem" + * netmap support for: lem * * For details on netmap support please see ixgbe_netmap.h */ + #include <net/netmap.h> #include <sys/selinfo.h> #include <vm/vm.h> @@ -40,17 +41,13 @@ /* - * Register/unregister + * Register/unregister. We are already under netmap lock. 
*/ static int -lem_netmap_reg(struct ifnet *ifp, int onoff) +lem_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - - if (na == NULL) - return EINVAL; EM_CORE_LOCK(adapter); @@ -64,24 +61,14 @@ lem_netmap_reg(struct ifnet *ifp, int onoff) taskqueue_drain(adapter->tq, &adapter->rxtx_task); taskqueue_drain(adapter->tq, &adapter->link_task); #endif /* !EM_LEGCY_IRQ */ - if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - lem_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + /* enable or disable flags and callbacks in na and ifp */ + if (onoff) { + nm_set_native_flags(na); } else { -fail: - /* return to non-netmap mode */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - lem_init_locked(adapter); /* also enable intr */ + nm_clear_native_flags(na); } + lem_init_locked(adapter); /* also enable intr */ #ifndef EM_LEGACY_IRQ taskqueue_unblock(adapter->tq); // XXX do we need this ? @@ -89,7 +76,7 @@ fail: EM_CORE_UNLOCK(adapter); - return (error); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } @@ -97,108 +84,102 @@ fail: * Reconcile kernel and user view of the transmit ring. */ static int -lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n, new_slots; + u_int const lim = kring->nkr_num_slots - 1; + u_int const cur = nm_txsync_prologue(kring, &new_slots); /* generate an interrupt approximately every half ring */ - int report_frequency = kring->nkr_num_slots >> 1; - - ND("%s: hwofs %d, hwcur %d hwavail %d lease %d cur %d avail %d", - ifp->if_xname, - kring->nkr_hwofs, kring->nr_hwcur, kring->nr_hwavail, - kring->nkr_hwlease, - ring->cur, ring->avail); - /* take a copy of ring->cur now, and never read it again */ - k = ring->cur; - if (k > lim) + u_int report_frequency = kring->nkr_num_slots >> 1; + + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + + if (cur > lim) /* error checking in nm_txsync_prologue() */ return netmap_ring_reinit(kring); bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_POSTREAD); + /* - * Process new packets to send. j is the current index in the - * netmap ring, l is the corresponding index in the NIC ring. + * First part: process new packets to send. */ - j = kring->nr_hwcur; - if (netmap_verbose > 255) - RD(5, "device %s send %d->%d", ifp->if_xname, j, k); - if (j != k) { /* we have new packets to send */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* slot is the current slot in the netmap ring */ - struct netmap_slot *slot = &ring->slot[j]; - /* curr is the current slot in the nic ring */ - struct e1000_tx_desc *curr = &adapter->tx_desc_base[l]; - struct em_buffer *txbuf = &adapter->tx_buffer_area[l]; - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? 
- E1000_TXD_CMD_RS : 0; + + nm_i = kring->nr_hwcur; + if (nm_i != cur) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != cur; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; uint64_t paddr; void *addr = PNMB(slot, &paddr); - u_int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - return netmap_ring_reinit(kring); - } - ND("slot %d NIC %d %s", j, l, nm_dump_buf(addr, len, 128, NULL)); + /* device-specific */ + struct e1000_tx_desc *curr = &adapter->tx_desc_base[nic_i]; + struct em_buffer *txbuf = &adapter->tx_buffer_area[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? + E1000_TXD_CMD_RS : 0; + + NM_CHECK_ADDR_LEN(addr, len); - slot->flags &= ~NS_REPORT; - if (1 || slot->flags & NS_BUF_CHANGED) { + if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ - netmap_reload_map(adapter->txtag, txbuf->map, addr); curr->buffer_addr = htole64(paddr); - slot->flags &= ~NS_BUF_CHANGED; + netmap_reload_map(adapter->txtag, txbuf->map, addr); } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ curr->upper.data = 0; - curr->lower.data = - htole32( adapter->txd_cmd | len | + curr->lower.data = htole32(adapter->txd_cmd | len | (E1000_TXD_CMD_EOP | flags) ); - - ND("len %d kring %d nic %d", len, j, l); bus_dmamap_sync(adapter->txtag, txbuf->map, - BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + BUS_DMASYNC_PREWRITE); + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - ND("sent %d packets from %d, TDT now %d", n, kring->nr_hwcur, l); - kring->nr_hwcur = k; /* the saved ring->cur */ - kring->nr_hwavail -= n; + kring->nr_hwcur = cur; /* the saved ring->cur */ + /* decrease avail by # of packets sent minus previous ones */ + kring->nr_hwavail -= new_slots; + /* synchronize the NIC ring */ bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), l); + /* (re)start the tx unit up to slot nic_i (excluded) */ + E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), nic_i); } - if (n == 0 || kring->nr_hwavail < 1) { + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) { int delta; /* record completed transmissions using TDH */ - l = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); - ND("tdh is now %d", l); - if (l >= kring->nkr_num_slots) { /* XXX can it happen ? */ - D("bad TDH %d", l); - l -= kring->nkr_num_slots; + nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } - delta = l - adapter->next_tx_to_clean; + delta = nic_i - adapter->next_tx_to_clean; if (delta) { - /* some tx completed, increment hwavail. */ + /* some completed, increment hwavail. 
*/ if (delta < 0) delta += kring->nkr_num_slots; - if (netmap_verbose > 255) - RD(5, "%s tx recover %d bufs", - ifp->if_xname, delta); - adapter->next_tx_to_clean = l; + adapter->next_tx_to_clean = nic_i; kring->nr_hwavail += delta; } } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + + nm_txsync_finalize(kring, cur); return 0; } @@ -208,39 +189,39 @@ lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) * Reconcile kernel and user view of the receive ring. */ static int -lem_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n, resvd; + u_int const lim = kring->nkr_num_slots - 1; + u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - if (k > lim) - return netmap_ring_reinit(kring); + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + if (cur > lim) + return netmap_ring_reinit(kring); /* XXX check sync modes */ bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * Import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. */ - l = adapter->next_rx_desc_to_check; - j = netmap_idx_n2k(kring, l); - ND("%s: next NIC %d kring %d (ofs %d), hwcur %d hwavail %d cur %d avail %d", - ifp->if_xname, - l, j, kring->nkr_hwofs, kring->nr_hwcur, kring->nr_hwavail, - ring->cur, ring->avail); if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + nic_i = adapter->next_rx_desc_to_check; + nm_i = netmap_idx_n2k(kring, nic_i); + for (n = 0; ; n++) { - struct e1000_rx_desc *curr = &adapter->rx_desc_base[l]; + struct e1000_rx_desc *curr = &adapter->rx_desc_base[nic_i]; uint32_t staterr = le32toh(curr->status); int len; @@ -248,76 +229,73 @@ lem_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) break; len = le16toh(curr->length) - 4; // CRC if (len < 0) { - D("bogus pkt size at %d", j); + D("bogus pkt size %d nic idx %d", len, nic_i); len = 0; } - ND("\n%s", nm_dump_buf(NMB(&ring->slot[j]), - len, 128, NULL)); - ring->slot[j].len = len; - ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = len; + ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(adapter->rxtag, - adapter->rx_buffer_area[l].map, - BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + adapter->rx_buffer_area[nic_i].map, + BUS_DMASYNC_POSTREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ - adapter->next_rx_desc_to_check = l; + adapter->next_rx_desc_to_check = nic_i; + // ifp->if_ipackets += n; kring->nr_hwavail += n; } kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... 
- } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); /* NIC ring index */ - for (n = 0; j != k; n++) { - struct netmap_slot *slot = &ring->slot[j]; - struct e1000_rx_desc *curr = &adapter->rx_desc_base[l]; - struct em_buffer *rxbuf = &adapter->rx_buffer_area[l]; + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != cur) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != cur; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - return netmap_ring_reinit(kring); - } + struct e1000_rx_desc *curr = &adapter->rx_desc_base[nic_i]; + struct em_buffer *rxbuf = &adapter->rx_buffer_area[nic_i]; + + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ - netmap_reload_map(adapter->rxtag, rxbuf->map, addr); curr->buffer_addr = htole64(paddr); + netmap_reload_map(adapter->rxtag, rxbuf->map, addr); slot->flags &= ~NS_BUF_CHANGED; } curr->status = 0; - bus_dmamap_sync(adapter->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); - - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = cur; bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + * so move nic_i back by one unit */ - l = (l == 0) ? lim : l - 1; - E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), l); + nic_i = (nic_i == 0) ? lim : nic_i - 1; + E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), nic_i); } - /* tell userspace that there are new packets */ + + /* tell userspace that there might be new packets */ ring->avail = kring->nr_hwavail - resvd; + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } @@ -335,7 +313,8 @@ lem_netmap_attach(struct adapter *adapter) na.nm_txsync = lem_netmap_txsync; na.nm_rxsync = lem_netmap_rxsync; na.nm_register = lem_netmap_reg; - netmap_attach(&na, 1); + na.num_tx_rings = na.num_rx_rings = 1; + netmap_attach(&na); } /* end of file */ diff --git a/sys/dev/netmap/if_re_netmap.h b/sys/dev/netmap/if_re_netmap.h index ac781ccb572e..2c7ba060cffd 100644 --- a/sys/dev/netmap/if_re_netmap.h +++ b/sys/dev/netmap/if_re_netmap.h @@ -26,8 +26,9 @@ /* * $FreeBSD$ * - * netmap support for "re" - * For details on netmap support please see ixgbe_netmap.h + * netmap support for: re + * + * For more details on netmap support please see ixgbe_netmap.h */ @@ -39,44 +40,24 @@ /* - * support for netmap register/unregisted. We are already under core lock. - * only called on the first register or the last unregister. + * Register/unregister. We are already under netmap lock. 
*/ static int -re_netmap_reg(struct ifnet *ifp, int onoff) +re_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct rl_softc *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - - if (na == NULL) - return EINVAL; - /* Tell the stack that the interface is no longer active */ - ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); - - re_stop(adapter); + RL_LOCK(adapter); + re_stop(adapter); /* also clears IFF_DRV_RUNNING */ if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - - /* save if_transmit to restore it later */ - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - re_init_locked(adapter); - - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + nm_set_native_flags(na); } else { -fail: - /* restore if_transmit */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - re_init_locked(adapter); /* also enables intr */ + nm_clear_native_flags(na); } - return (error); + re_init_locked(adapter); /* also enables intr */ + RL_UNLOCK(adapter); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } @@ -84,90 +65,107 @@ fail: * Reconcile kernel and user view of the transmit ring. */ static int -re_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct rl_softc *sc = ifp->if_softc; - struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc; - struct netmap_adapter *na = NA(sc->rl_ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - int j, k, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n, new_slots; + u_int const lim = kring->nkr_num_slots - 1; + u_int const cur = nm_txsync_prologue(kring, &new_slots); + + /* device-specific */ + struct rl_softc *sc = ifp->if_softc; + struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc; - k = ring->cur; - if (k > lim) + if (cur > lim) /* error checking in nm_txsync_prologue() */ return netmap_ring_reinit(kring); - /* Sync the TX descriptor list */ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, - sc->rl_ldata.rl_tx_list_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); - - /* XXX move after the transmissions */ - /* record completed transmissions */ - for (n = 0, l = sc->rl_ldata.rl_tx_considx; - l != sc->rl_ldata.rl_tx_prodidx; - n++, l = RL_TX_DESC_NXT(sc, l)) { - uint32_t cmdstat = - le32toh(sc->rl_ldata.rl_tx_list[l].rl_cmdstat); - if (cmdstat & RL_TDESC_STAT_OWN) - break; - } - if (n > 0) { - sc->rl_ldata.rl_tx_considx = l; - sc->rl_ldata.rl_tx_free += n; - kring->nr_hwavail += n; - } + sc->rl_ldata.rl_tx_list_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); // XXX extra postwrite ? - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + /* + * First part: process new packets to send. 
+ */ + nm_i = kring->nr_hwcur; + if (nm_i != cur) { /* we have new packets to send */ + nic_i = sc->rl_ldata.rl_tx_prodidx; + // XXX or netmap_idx_k2n(kring, nm_i); + + for (n = 0; nm_i != cur; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; + uint64_t paddr; + void *addr = PNMB(slot, &paddr); - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - l = sc->rl_ldata.rl_tx_prodidx; - for (n = 0; j != k; n++) { - struct netmap_slot *slot = &ring->slot[j]; - struct rl_desc *desc = &sc->rl_ldata.rl_tx_list[l]; + /* device-specific */ + struct rl_desc *desc = &sc->rl_ldata.rl_tx_list[nic_i]; int cmd = slot->len | RL_TDESC_CMD_EOF | RL_TDESC_CMD_OWN | RL_TDESC_CMD_SOF ; - uint64_t paddr; - void *addr = PNMB(slot, &paddr); - int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - // XXX what about prodidx ? - return netmap_ring_reinit(kring); - } + NM_CHECK_ADDR_LEN(addr, len); - if (l == lim) /* mark end of ring */ + if (nic_i == lim) /* mark end of ring */ cmd |= RL_TDESC_CMD_EOR; if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, reload map */ desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); - /* buffer has changed, unload and reload map */ netmap_reload_map(sc->rl_ldata.rl_tx_mtag, - txd[l].tx_dmamap, addr); - slot->flags &= ~NS_BUF_CHANGED; + txd[nic_i].tx_dmamap, addr); } - slot->flags &= ~NS_REPORT; + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ desc->rl_cmdstat = htole32(cmd); + + /* make sure changes to the buffer are synced */ bus_dmamap_sync(sc->rl_ldata.rl_tx_mtag, - txd[l].tx_dmamap, BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + txd[nic_i].tx_dmamap, + BUS_DMASYNC_PREWRITE); + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - sc->rl_ldata.rl_tx_prodidx = l; - kring->nr_hwcur = k; /* the saved ring->cur */ - ring->avail -= n; // XXX see others - kring->nr_hwavail = ring->avail; + sc->rl_ldata.rl_tx_prodidx = nic_i; + /* decrease avail by # of packets sent minus previous ones */ + kring->nr_hwcur = cur; /* the saved ring->cur */ + kring->nr_hwavail -= new_slots; + /* synchronize the NIC ring */ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, - sc->rl_ldata.rl_tx_list_map, - BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); + sc->rl_ldata.rl_tx_list_map, + BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE); /* start ? */ CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); } + + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (flags & NAF_FORCE_RECLAIM || kring->nr_hwavail < 1) { + nic_i = sc->rl_ldata.rl_tx_considx; + for (n = 0; nic_i != sc->rl_ldata.rl_tx_prodidx; + n++, nic_i = RL_TX_DESC_NXT(sc, nic_i)) { + uint32_t cmdstat = + le32toh(sc->rl_ldata.rl_tx_list[nic_i].rl_cmdstat); + if (cmdstat & RL_TDESC_STAT_OWN) + break; + } + if (n > 0) { + sc->rl_ldata.rl_tx_considx = nic_i; + sc->rl_ldata.rl_tx_free += n; + kring->nr_hwavail += n; + } + } + + nm_txsync_finalize(kring, cur); + return 0; } @@ -176,42 +174,45 @@ re_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) * Reconcile kernel and user view of the receive ring. 
*/ static int -re_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct rl_softc *sc = ifp->if_softc; - struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc; - struct netmap_adapter *na = NA(sc->rl_ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n, resvd; + u_int const lim = kring->nkr_num_slots - 1; + u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - k = ring->cur; - if (k > lim) + /* device-specific */ + struct rl_softc *sc = ifp->if_softc; + struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc; + + if (cur > lim) return netmap_ring_reinit(kring); - /* XXX check sync modes */ bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, - sc->rl_ldata.rl_rx_list_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + sc->rl_ldata.rl_rx_list_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * Import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. * - * The device uses all the buffers in the ring, so we need + * This device uses all the buffers in the ring, so we need * another termination condition in addition to RL_RDESC_STAT_OWN * cleared (all buffers could have it cleared. The easiest one * is to limit the amount of data reported up to 'lim' */ - l = sc->rl_ldata.rl_rx_prodidx; /* next pkt to check */ - j = netmap_idx_n2k(kring, l); /* the kring index */ if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + nic_i = sc->rl_ldata.rl_rx_prodidx; /* next pkt to check */ + nm_i = netmap_idx_n2k(kring, nic_i); + for (n = kring->nr_hwavail; n < lim ; n++) { - struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[l]; + struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[nic_i]; uint32_t rxstat = le32toh(cur_rx->rl_cmdstat); uint32_t total_len; @@ -220,74 +221,75 @@ re_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) total_len = rxstat & sc->rl_rxlenmask; /* XXX subtract crc */ total_len = (total_len < 4) ? 0 : total_len - 4; - kring->ring->slot[j].len = total_len; - kring->ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = total_len; + ring->slot[nm_i].flags = slot_flags; /* sync was in re_newbuf() */ bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, - rxd[l].rx_dmamap, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + rxd[nic_i].rx_dmamap, BUS_DMASYNC_POSTREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } if (n != kring->nr_hwavail) { - sc->rl_ldata.rl_rx_prodidx = l; + sc->rl_ldata.rl_rx_prodidx = nic_i; sc->rl_ifp->if_ipackets += n - kring->nr_hwavail; kring->nr_hwavail = n; } kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. 
*/ - l = netmap_idx_k2n(kring, j); /* the NIC index */ - for (n = 0; j != k; n++) { - struct netmap_slot *slot = ring->slot + j; - struct rl_desc *desc = &sc->rl_ldata.rl_rx_list[l]; - int cmd = NETMAP_BUF_SIZE | RL_RDESC_CMD_OWN; + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != cur) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != cur; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - return netmap_ring_reinit(kring); - } + struct rl_desc *desc = &sc->rl_ldata.rl_rx_list[nic_i]; + int cmd = NETMAP_BUF_SIZE | RL_RDESC_CMD_OWN; + + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; - if (l == lim) /* mark end of ring */ + if (nic_i == lim) /* mark end of ring */ cmd |= RL_RDESC_CMD_EOR; - slot->flags &= ~NS_REPORT; if (slot->flags & NS_BUF_CHANGED) { - netmap_reload_map(sc->rl_ldata.rl_rx_mtag, - rxd[l].rx_dmamap, addr); + /* buffer has changed, reload map */ desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + netmap_reload_map(sc->rl_ldata.rl_rx_mtag, + rxd[nic_i].rx_dmamap, addr); slot->flags &= ~NS_BUF_CHANGED; } desc->rl_cmdstat = htole32(cmd); bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, - rxd[l].rx_dmamap, BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + rxd[nic_i].rx_dmamap, + BUS_DMASYNC_PREREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } kring->nr_hwavail -= n; - kring->nr_hwcur = k; - /* Flush the RX DMA ring */ + kring->nr_hwcur = cur; bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list_map, - BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); } - /* tell userspace that there are new packets */ + + /* tell userspace that there might be new packets */ ring->avail = kring->nr_hwavail - resvd; + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } + /* * Additional routines to init the tx and rx rings. * In other drivers we do that inline in the main code. @@ -299,11 +301,16 @@ re_netmap_tx_init(struct rl_softc *sc) struct rl_desc *desc; int i, n; struct netmap_adapter *na = NA(sc->rl_ifp); - struct netmap_slot *slot = netmap_reset(na, NR_TX, 0, 0); + struct netmap_slot *slot; + if (!na || !(na->na_flags & NAF_NATIVE_ON)) { + return; + } + + slot = netmap_reset(na, NR_TX, 0, 0); /* slot is NULL if we are not in netmap mode */ if (!slot) - return; + return; // XXX cannot happen /* in netmap mode, overwrite addresses and maps */ txd = sc->rl_ldata.rl_tx_desc; desc = sc->rl_ldata.rl_tx_list; @@ -377,6 +384,8 @@ re_netmap_attach(struct rl_softc *sc) na.nm_txsync = re_netmap_txsync; na.nm_rxsync = re_netmap_rxsync; na.nm_register = re_netmap_reg; - netmap_attach(&na, 1); + na.num_tx_rings = na.num_rx_rings = 1; + netmap_attach(&na); } + /* end of file */ diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h index fca1cf1e0a90..7fd67d2b57ff 100644 --- a/sys/dev/netmap/ixgbe_netmap.h +++ b/sys/dev/netmap/ixgbe_netmap.h @@ -26,16 +26,16 @@ /* * $FreeBSD$ * - * netmap modifications for ixgbe + * netmap support for: ixgbe * * This file is meant to be a reference on how to implement * netmap support for a network driver. - * This file contains code but only static or inline functions - * that are used by a single driver. 
To avoid replication of - * code we just #include it near the beginning of the - * standard driver. + * This file contains code but only static or inline functions used + * by a single driver. To avoid replication of code we just #include + * it near the beginning of the standard driver. */ + #include <net/netmap.h> #include <sys/selinfo.h> /* @@ -48,7 +48,10 @@ */ #include <dev/netmap/netmap_kern.h> + /* + * device-specific sysctl variables: + * * ix_crcstrip: 0: keep CRC in rx frames (default), 1: strip it. * During regular operations the CRC is stripped, but on some * hardware reception of frames not multiple of 64 is slower, @@ -56,17 +59,11 @@ * * ix_rx_miss, ix_rx_miss_bufs: * count packets that might be missed due to lost interrupts. - * - * ix_use_dd - * use the dd bit for completed tx transmissions. - * This is tricky, much better to use TDH for now. */ SYSCTL_DECL(_dev_netmap); -static int ix_rx_miss, ix_rx_miss_bufs, ix_use_dd, ix_crcstrip; +static int ix_rx_miss, ix_rx_miss_bufs, ix_crcstrip; SYSCTL_INT(_dev_netmap, OID_AUTO, ix_crcstrip, CTLFLAG_RW, &ix_crcstrip, 0, "strip CRC on rx frames"); -SYSCTL_INT(_dev_netmap, OID_AUTO, ix_use_dd, - CTLFLAG_RW, &ix_use_dd, 0, "use dd instead of tdh to detect tx frames"); SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss, CTLFLAG_RW, &ix_rx_miss, 0, "potentially missed rx intr"); SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss_bufs, @@ -110,283 +107,235 @@ set_crcstrip(struct ixgbe_hw *hw, int onoff) IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rxc); } + /* - * Register/unregister. We are already under core lock. + * Register/unregister. We are already under netmap lock. * Only called on the first register or the last unregister. */ static int -ixgbe_netmap_reg(struct ifnet *ifp, int onoff) +ixgbe_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - - if (na == NULL) - return EINVAL; /* no netmap support here */ IXGBE_CORE_LOCK(adapter); - ixgbe_disable_intr(adapter); + ixgbe_disable_intr(adapter); // XXX maybe ixgbe_stop ? /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); set_crcstrip(&adapter->hw, onoff); - if (onoff) { /* enable netmap mode */ - ifp->if_capenable |= IFCAP_NETMAP; - - /* save if_transmit and replace with our routine */ - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - /* - * reinitialize the adapter, now with netmap flag set, - * so the rings will be set accordingly. - */ - ixgbe_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } - } else { /* reset normal mode (explicit request or netmap failed) */ -fail: - /* restore if_transmit */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - /* initialize the card, this time in standard mode */ - ixgbe_init_locked(adapter); /* also enables intr */ + /* enable or disable flags and callbacks in na and ifp */ + if (onoff) { + nm_set_native_flags(na); + } else { + nm_clear_native_flags(na); } - set_crcstrip(&adapter->hw, onoff); + ixgbe_init_locked(adapter); /* also enables intr */ + set_crcstrip(&adapter->hw, onoff); // XXX why twice ? IXGBE_CORE_UNLOCK(adapter); - return (error); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } /* * Reconcile kernel and user view of the transmit ring. - * This routine might be called frequently so it must be efficient. 
- * - * ring->cur holds the userspace view of the current ring index. Userspace - * has filled the tx slots from the previous call's ring->cur up to but not - * including ring->cur for this call. In this function the kernel updates - * kring->nr_hwcur to ring->cur, thus slots [kring->nr_hwcur, ring->cur) are - * now ready to transmit. At the last interrupt kring->nr_hwavail slots were - * available. * - * This function runs under lock (acquired from the caller or internally). - * It must first update ring->avail to what the kernel knows, - * subtract the newly used slots (ring->cur - kring->nr_hwcur) - * from both avail and nr_hwavail, and set ring->nr_hwcur = ring->cur - * issuing a dmamap_sync on all slots. + * Userspace wants to send packets up to the one before ring->cur, + * kernel knows kring->nr_hwcur is the first unsent packet. * - * Since ring comes from userspace, its content must be read only once, - * and validated before being used to update the kernel's structures. - * (this is also true for every use of ring in the kernel). + * Here we push packets out (as many as possible), and possibly + * reclaim buffers from previously completed transmission. * - * ring->avail is never used, only checked for bogus values. + * ring->avail is not used on input, but it is updated on return. * - * I flags & FORCE_RECLAIM, reclaim transmitted - * buffers irrespective of interrupt mitigation. + * The caller (netmap) guarantees that there is only one instance + * running at any time. Any interference with other driver + * methods should be handled by the individual drivers. */ static int -ixgbe_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - struct netmap_adapter *na = NA(adapter->ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n = 0; - u_int const k = ring->cur, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n, new_slots; + u_int const lim = kring->nkr_num_slots - 1; + u_int const cur = nm_txsync_prologue(kring, &new_slots); /* - * ixgbe can generate an interrupt on every tx packet, but it - * seems very expensive, so we interrupt once every half ring, - * or when requested with NS_REPORT + * interrupts on every tx packet are expensive so request + * them every half ring, or where NS_REPORT is set */ u_int report_frequency = kring->nkr_num_slots >> 1; - if (k > lim) + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + int reclaim_tx; + + if (cur > lim) /* error checking in nm_txsync_prologue() */ return netmap_ring_reinit(kring); bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); /* - * Process new packets to send. j is the current index in the - * netmap ring, l is the corresponding index in the NIC ring. + * First part: process new packets to send. + * nm_i is the current index in the netmap ring, + * nic_i is the corresponding index in the NIC ring. * The two numbers differ because upon a *_init() we reset * the NIC ring but leave the netmap ring unchanged. 
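 *
 * (Editorial sketch: netmap_idx_k2n(), used just below, inverts the
 * nm_i/nic_i relation spelled out in the next paragraph; one way to
 * write it, assuming the in-tree helper differs only in form, is
 *
 *	nic_i = (nm_i - kring->nkr_hwofs + ring_size) % ring_size;
 *
 * where the extra ring_size keeps the left side non-negative when
 * nkr_hwofs is negative, as it can be on some drivers.)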
 *	For the transmit ring, we have
 *
- *	j = kring->nr_hwcur
- *	l = IXGBE_TDT (not tracked in the driver)
+ *	nm_i = kring->nr_hwcur
+ *	nic_i = IXGBE_TDT (not tracked in the driver)
 *	and
- *	j == (l + kring->nkr_hwofs) % ring_size
+ *	nm_i == (nic_i + kring->nkr_hwofs) % ring_size
 *
 * In this driver kring->nkr_hwofs >= 0, but for other
 * drivers it might be negative as well.
 */
-	j = kring->nr_hwcur;
-	if (j != k) {	/* we have new packets to send */
-		prefetch(&ring->slot[j]);
-		l = netmap_idx_k2n(kring, j); /* NIC index */
-		prefetch(&txr->tx_buffers[l]);
-		for (n = 0; j != k; n++) {
-			/*
-			 * Collect per-slot info.
-			 * Note that txbuf and curr are indexed by l.
-			 *
-			 * In this driver we collect the buffer address
-			 * (using the PNMB() macro) because we always
-			 * need to rewrite it into the NIC ring.
-			 * Many other drivers preserve the address, so
-			 * we only need to access it if NS_BUF_CHANGED
-			 * is set.
-			 * XXX note, on this device the dmamap* calls are
-			 * not necessary because tag is 0, however just accessing
-			 * the per-packet tag kills 1Mpps at 900 MHz.
-			 */
-			struct netmap_slot *slot = &ring->slot[j];
-			union ixgbe_adv_tx_desc *curr = &txr->tx_base[l];
-			struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[l];
-			uint64_t paddr;
-			// XXX type for flags and len ?
-			int flags = ((slot->flags & NS_REPORT) ||
-				j == 0 || j == report_frequency) ?
-				IXGBE_TXD_CMD_RS : 0;
+
+	/*
+	 * If we have packets to send (kring->nr_hwcur != ring->cur)
+	 * iterate over the netmap ring, fetch length and update
+	 * the corresponding slot in the NIC ring. Some drivers also
+	 * need to update the buffer's physical address in the NIC slot
+	 * even if NS_BUF_CHANGED is not set (PNMB computes the addresses).
+	 *
+	 * The netmap_reload_map() call is especially expensive,
+	 * even when (as in this case) the tag is 0, so only do it
+	 * when the buffer has actually changed.
+	 *
+	 * If possible do not set the report/intr bit on all slots,
+	 * but only a few times per ring or when NS_REPORT is set.
+	 *
+	 * Finally, on 10G and faster drivers, it might be useful
+	 * to prefetch the next slot and txr entry.
+	 */
+
+	nm_i = kring->nr_hwcur;
+	if (nm_i != cur) {	/* we have new packets to send */
+		nic_i = netmap_idx_k2n(kring, nm_i);
+
+		prefetch(&ring->slot[nm_i]);
+		prefetch(&txr->tx_buffers[nic_i]);
+
+		for (n = 0; nm_i != cur; n++) {
+			struct netmap_slot *slot = &ring->slot[nm_i];
			u_int len = slot->len;
+			uint64_t paddr;
			void *addr = PNMB(slot, &paddr);

-			j = (j == lim) ? 0 : j + 1;
-			l = (l == lim) ? 0 : l + 1;
-			prefetch(&ring->slot[j]);
-			prefetch(&txr->tx_buffers[l]);
-
-			/*
-			 * Quick check for valid addr and len.
-			 * NMB() returns netmap_buffer_base for invalid
-			 * buffer indexes (but the address is still a
-			 * valid one to be used in a ring). slot->len is
-			 * unsigned so no need to check for negative values.
-			 */
-			if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) {
-ring_reset:
-				return netmap_ring_reinit(kring);
-			}
+			/* device-specific */
+			union ixgbe_adv_tx_desc *curr = &txr->tx_base[nic_i];
+			struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[nic_i];
+			int flags = (slot->flags & NS_REPORT ||
+				nic_i == 0 || nic_i == report_frequency) ?
+ IXGBE_TXD_CMD_RS : 0; + + /* prefetch for next round */ + prefetch(&ring->slot[nm_i + 1]); + prefetch(&txr->tx_buffers[nic_i + 1]); + + NM_CHECK_ADDR_LEN(addr, len); if (slot->flags & NS_BUF_CHANGED) { - /* buffer has changed, unload and reload map */ + /* buffer has changed, reload map */ netmap_reload_map(txr->txtag, txbuf->map, addr); - slot->flags &= ~NS_BUF_CHANGED; } - slot->flags &= ~NS_REPORT; - /* - * Fill the slot in the NIC ring. - * In this driver we need to rewrite the buffer - * address in the NIC ring. Other drivers do not - * need this. - * Use legacy descriptor, it is faster. - */ + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ + /* Use legacy descriptor, they are faster? */ curr->read.buffer_addr = htole64(paddr); curr->read.olinfo_status = 0; curr->read.cmd_type_len = htole32(len | flags | IXGBE_ADVTXD_DCMD_IFCS | IXGBE_TXD_CMD_EOP); /* make sure changes to the buffer are synced */ - bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_PREWRITE); + bus_dmamap_sync(txr->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = k; /* the saved ring->cur */ - /* decrease avail by number of packets sent */ - kring->nr_hwavail -= n; + kring->nr_hwcur = cur; /* the saved ring->cur */ + /* decrease avail by # of packets sent minus previous ones */ + kring->nr_hwavail -= new_slots; /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - /* (re)start the transmitter up to slot l (excluded) */ - IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), l); + + /* (re)start the tx unit up to slot nic_i (excluded) */ + IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), nic_i); } /* - * Reclaim buffers for completed transmissions. + * Second part: reclaim buffers for completed transmissions. * Because this is expensive (we read a NIC register etc.) * we only do it in specific cases (see below). - * In all cases kring->nr_kflags indicates which slot will be - * checked upon a tx interrupt (nkr_num_slots means none). */ if (flags & NAF_FORCE_RECLAIM) { - j = 1; /* forced reclaim, ignore interrupts */ - kring->nr_kflags = kring->nkr_num_slots; + reclaim_tx = 1; /* forced reclaim */ } else if (kring->nr_hwavail > 0) { - j = 0; /* buffers still available: no reclaim, ignore intr. */ - kring->nr_kflags = kring->nkr_num_slots; + reclaim_tx = 0; /* have buffers, no reclaim */ } else { /* - * no buffers available, locate a slot for which we request - * ReportStatus (approximately half ring after next_to_clean) - * and record it in kring->nr_kflags. - * If the slot has DD set, do the reclaim looking at TDH, - * otherwise we go to sleep (in netmap_poll()) and will be - * woken up when slot nr_kflags will be ready. + * No buffers available. Locate previous slot with + * REPORT_STATUS set. + * If the slot has DD set, we can reclaim space, + * otherwise wait for the next interrupt. + * This enables interrupt moderation on the tx + * side though it might reduce throughput. */ struct ixgbe_legacy_tx_desc *txd = (struct ixgbe_legacy_tx_desc *)txr->tx_base; - j = txr->next_to_clean + kring->nkr_num_slots/2; - if (j >= kring->nkr_num_slots) - j -= kring->nkr_num_slots; + nic_i = txr->next_to_clean + report_frequency; + if (nic_i > lim) + nic_i -= lim + 1; // round to the closest with dd set - j= (j < kring->nkr_num_slots / 4 || j >= kring->nkr_num_slots*3/4) ? 
+		nic_i = (nic_i < kring->nkr_num_slots / 4 ||
+		    nic_i >= kring->nkr_num_slots*3/4) ?
			0 : report_frequency;
-		kring->nr_kflags = j; /* the slot to check */
-		j = txd[j].upper.fields.status & IXGBE_TXD_STAT_DD;	// XXX cpu_to_le32 ?
+		reclaim_tx = txd[nic_i].upper.fields.status & IXGBE_TXD_STAT_DD;	// XXX cpu_to_le32 ?
	}
-	if (j) {
-		int delta;
-
+	if (reclaim_tx) {
		/*
		 * Record completed transmissions.
		 * We (re)use the driver's txr->next_to_clean to keep
		 * track of the most recently completed transmission.
		 *
-		 * The datasheet discourages the use of TDH to find out the
-		 * number of sent packets. We should rather check the DD
-		 * status bit in a packet descriptor. However, we only set
-		 * the "report status" bit for some descriptors (a kind of
-		 * interrupt mitigation), so we can only check on those.
-		 * For the time being we use TDH, as we do it infrequently
-		 * enough not to pose performance problems.
+		 * The datasheet discourages the use of TDH to find
+		 * out the number of sent packets, but we only set
+		 * REPORT_STATUS in a few slots so TDH is the only
+		 * good way.
		 */
-		if (ix_use_dd) {
-			struct ixgbe_legacy_tx_desc *txd =
-			    (struct ixgbe_legacy_tx_desc *)txr->tx_base;
-			u_int k1 = netmap_idx_k2n(kring, kring->nr_hwcur);
-			l = txr->next_to_clean;
-			delta = 0;
-			while (l != k1 &&
-			    txd[l].upper.fields.status & IXGBE_TXD_STAT_DD) {
-				delta++;
-				l = (l == lim) ? 0 : l + 1;
-			}
-		} else {
-			l = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr));
-			if (l >= kring->nkr_num_slots) { /* XXX can happen */
-				D("TDH wrap %d", l);
-				l -= kring->nkr_num_slots;
+		nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr));
+		if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
+			D("TDH wrap %d", nic_i);
+			nic_i -= kring->nkr_num_slots;
		}
-		delta = l - txr->next_to_clean;
-		}
-		if (delta) {
+		if (nic_i != txr->next_to_clean) {
+			n = (nic_i + lim + 1) - txr->next_to_clean;
+			if (n > lim)
+				n -= lim + 1;
			/* some tx completed, increment avail */
-			if (delta < 0)
-				delta += kring->nkr_num_slots;
-			txr->next_to_clean = l;
-			kring->nr_hwavail += delta;
-			if (kring->nr_hwavail > lim)
-				goto ring_reset;
+			txr->next_to_clean = nic_i;
+			kring->nr_hwavail += n;
+			if (kring->nr_hwavail > lim) {
+				RD(5, "bad hwavail %d",
+					kring->nr_hwavail);
+				return netmap_ring_reinit(kring);
+			}
		}
	}
-	/* update avail to what the kernel knows */
-	ring->avail = kring->nr_hwavail;
+
+	nm_txsync_finalize(kring, cur);

	return 0;
}


@@ -394,11 +343,12 @@ ring_reset:

/*
 * Reconcile kernel and user view of the receive ring.
- * Same as for the txsync, this routine must be efficient and
- * avoid races in accessing the shared regions.
+ * Same as for the txsync, this routine must be efficient.
+ * The caller guarantees a single invocation, but races against
+ * the rest of the driver should be handled here.
 *
- * When called, userspace has read data from slots kring->nr_hwcur
- * up to ring->cur (excluded).
+ * When called, userspace has released buffers up to
+ * ring->cur - ring->reserved (last one excluded).
 *
 * The last interrupt reported kring->nr_hwavail slots available
 * after kring->nr_hwcur.
@@ -410,18 +360,23 @@ ring_reset:
 * of whether or not we received an interrupt.
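 *
 * (Editorial outline of the flow implemented below:
 *
 *	1. import newly received packets: walk the NIC ring from
 *	   rxr->next_to_check while the DD status bit is set, copy
 *	   length and flags into the netmap slots, grow nr_hwavail;
 *	2. skip past packets that userspace has released: walk the
 *	   netmap ring from nr_hwcur to cur, reprogram and re-sync
 *	   the NIC descriptors, then write RDT one slot behind the
 *	   last buffer returned to the NIC.)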
*/ static int -ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - struct netmap_adapter *na = NA(adapter->ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n, resvd; + u_int const lim = kring->nkr_num_slots - 1; + u_int const cur = nm_rxsync_prologue(kring, &resvd); /* cur + res */ int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - if (k > lim) + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + + if (cur > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ @@ -429,17 +384,17 @@ ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * First part, import newly received packets into the netmap ring. + * First part: import newly received packets. * - * j is the index of the next free slot in the netmap ring, - * and l is the index of the next received packet in the NIC ring, + * nm_i is the index of the next free slot in the netmap ring, + * nic_i is the index of the next received packet in the NIC ring, * and they may differ in case if_init() has been called while * in netmap mode. For the receive ring we have * - * j = (kring->nr_hwcur + kring->nr_hwavail) % ring_size - * l = rxr->next_to_check; + * nm_i = (kring->nr_hwcur + kring->nr_hwavail) % ring_size + * nic_i = rxr->next_to_check; * and - * j == (l + kring->nkr_hwofs) % ring_size + * nm_i == (nic_i + kring->nkr_hwofs) % ring_size * * rxr->next_to_check is set to 0 on a ring reinit */ @@ -447,21 +402,21 @@ ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) int crclen = ix_crcstrip ? 0 : 4; uint16_t slot_flags = kring->nkr_slot_flags; - l = rxr->next_to_check; - j = netmap_idx_n2k(kring, l); + nic_i = rxr->next_to_check; + nm_i = netmap_idx_n2k(kring, nic_i); for (n = 0; ; n++) { - union ixgbe_adv_rx_desc *curr = &rxr->rx_base[l]; + union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->wb.upper.status_error); if ((staterr & IXGBE_RXD_STAT_DD) == 0) break; - ring->slot[j].len = le16toh(curr->wb.upper.length) - crclen; - ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = le16toh(curr->wb.upper.length) - crclen; + ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(rxr->ptag, - rxr->rx_buffers[l].pmap, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + rxr->rx_buffers[nic_i].pmap, BUS_DMASYNC_POSTREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ if (netmap_no_pendintr && !force_update) { @@ -469,48 +424,36 @@ ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) ix_rx_miss ++; ix_rx_miss_bufs += n; } - rxr->next_to_check = l; + rxr->next_to_check = nic_i; kring->nr_hwavail += n; } kring->nr_kflags &= ~NKR_PENDINTR; } /* - * Skip past packets that userspace has released - * (from kring->nr_hwcur to ring->cur - ring->reserved excluded), + * Second part: skip past packets that userspace has released. 
+ * (kring->nr_hwcur to ring->cur - ring->reserved excluded), * and make the buffers available for reception. - * As usual j is the index in the netmap ring, l is the index - * in the NIC ring, and j == (l + kring->nkr_hwofs) % ring_size + * As usual nm_i is the index in the netmap ring, + * nic_i is the index in the NIC ring, and + * nm_i == (nic_i + kring->nkr_hwofs) % ring_size */ - j = kring->nr_hwcur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* collect per-slot info, with similar validations - * and flag handling as in the txsync code. - * - * NOTE curr and rxbuf are indexed by l. - * Also, this driver needs to update the physical - * address in the NIC ring, but other drivers - * may not have this requirement. - */ - struct netmap_slot *slot = &ring->slot[j]; - union ixgbe_adv_rx_desc *curr = &rxr->rx_base[l]; - struct ixgbe_rx_buf *rxbuf = &rxr->rx_buffers[l]; + nm_i = kring->nr_hwcur; + if (nm_i != cur) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != cur; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); + union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i]; + struct ixgbe_rx_buf *rxbuf = &rxr->rx_buffers[nic_i]; + if (addr == netmap_buffer_base) /* bad buf */ goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, reload map */ netmap_reload_map(rxr->ptag, rxbuf->pmap, addr); slot->flags &= ~NS_BUF_CHANGED; } @@ -518,20 +461,23 @@ ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) curr->read.pkt_addr = htole64(paddr); bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = cur; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - /* IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + /* + * IMPORTANT: we must leave one free slot in the ring, + * so move nic_i back by one unit */ - l = (l == 0) ? lim : l - 1; - IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), l); + nic_i = (nic_i == 0) ? lim : nic_i - 1; + IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), nic_i); } - /* tell userspace that there are new packets */ + + /* tell userspace that there might be new packets */ ring->avail = kring->nr_hwavail - resvd; return 0; @@ -562,7 +508,8 @@ ixgbe_netmap_attach(struct adapter *adapter) na.nm_txsync = ixgbe_netmap_txsync; na.nm_rxsync = ixgbe_netmap_rxsync; na.nm_register = ixgbe_netmap_reg; - netmap_attach(&na, adapter->num_queues); -} + na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + netmap_attach(&na); +} /* end of file */ diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index 19be406f6dbc..033cd3059e17 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -8,7 +8,7 @@ * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
+ * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -25,6 +25,8 @@ /* + * $FreeBSD$ + * * This module supports memory mapped access to network devices, * see netmap(4). * @@ -130,131 +132,36 @@ ports attached to the switch) #if defined(__FreeBSD__) #include <sys/cdefs.h> /* prerequisite */ -__FBSDID("$FreeBSD$"); - #include <sys/types.h> -#include <sys/module.h> #include <sys/errno.h> #include <sys/param.h> /* defines used in kernel.h */ -#include <sys/jail.h> #include <sys/kernel.h> /* types used in module initialization */ -#include <sys/conf.h> /* cdevsw struct */ -#include <sys/uio.h> /* uio struct */ +#include <sys/conf.h> /* cdevsw struct, UID, GID */ #include <sys/sockio.h> #include <sys/socketvar.h> /* struct socket */ #include <sys/malloc.h> -#include <sys/mman.h> /* PROT_EXEC */ #include <sys/poll.h> -#include <sys/proc.h> #include <sys/rwlock.h> -#include <vm/vm.h> /* vtophys */ -#include <vm/pmap.h> /* vtophys */ -#include <vm/vm_param.h> -#include <vm/vm_object.h> -#include <vm/vm_page.h> -#include <vm/vm_pager.h> -#include <vm/uma.h> #include <sys/socket.h> /* sockaddrs */ #include <sys/selinfo.h> #include <sys/sysctl.h> #include <net/if.h> #include <net/if_var.h> #include <net/bpf.h> /* BIOCIMMEDIATE */ -#include <net/vnet.h> #include <machine/bus.h> /* bus_dmamap_* */ #include <sys/endian.h> #include <sys/refcount.h> -#define prefetch(x) __builtin_prefetch(x) - -#define BDG_RWLOCK_T struct rwlock // struct rwlock - -#define BDG_RWINIT(b) \ - rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS) -#define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) -#define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) -#define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) -#define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock) -#define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) -#define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock) - - -/* netmap global lock. 
- * normally called within the user thread (upon a system call) - * or when a file descriptor or process is terminated - * (last close or last munmap) - */ - -#define NMG_LOCK_T struct mtx -#define NMG_LOCK_INIT() mtx_init(&netmap_global_lock, "netmap global lock", NULL, MTX_DEF) -#define NMG_LOCK_DESTROY() mtx_destroy(&netmap_global_lock) -#define NMG_LOCK() mtx_lock(&netmap_global_lock) -#define NMG_UNLOCK() mtx_unlock(&netmap_global_lock) -#define NMG_LOCK_ASSERT() mtx_assert(&netmap_global_lock, MA_OWNED) +/* reduce conditional code */ +#define init_waitqueue_head(x) // only needed in linux -/* atomic operations */ -#include <machine/atomic.h> -#define NM_ATOMIC_TEST_AND_SET(p) (!atomic_cmpset_acq_int((p), 0, 1)) -#define NM_ATOMIC_CLEAR(p) atomic_store_rel_int((p), 0) #elif defined(linux) #include "bsd_glue.h" -static netdev_tx_t linux_netmap_start_xmit(struct sk_buff *, struct net_device *); - -static struct device_driver* -linux_netmap_find_driver(struct device *dev) -{ - struct device_driver *dd; - - while ( (dd = dev->driver) == NULL ) { - if ( (dev = dev->parent) == NULL ) - return NULL; - } - return dd; -} - -static struct net_device* -ifunit_ref(const char *name) -{ - struct net_device *ifp = dev_get_by_name(&init_net, name); - struct device_driver *dd; - - if (ifp == NULL) - return NULL; - - if ( (dd = linux_netmap_find_driver(&ifp->dev)) == NULL ) - goto error; - - if (!try_module_get(dd->owner)) - goto error; - - return ifp; -error: - dev_put(ifp); - return NULL; -} - -static void -if_rele(struct net_device *ifp) -{ - struct device_driver *dd; - dd = linux_netmap_find_driver(&ifp->dev); - dev_put(ifp); - if (dd) - module_put(dd->owner); -} - -// XXX a mtx would suffice here too 20130404 gl -#define NMG_LOCK_T struct semaphore -#define NMG_LOCK_INIT() sema_init(&netmap_global_lock, 1) -#define NMG_LOCK_DESTROY() -#define NMG_LOCK() down(&netmap_global_lock) -#define NMG_UNLOCK() up(&netmap_global_lock) -#define NMG_LOCK_ASSERT() // XXX to be completed #elif defined(__APPLE__) @@ -306,57 +213,46 @@ int netmap_txsync_retry = 2; SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); -int netmap_drop = 0; /* debugging */ int netmap_flags = 0; /* debug flags */ int netmap_fwd = 0; /* force transparent mode */ int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ -SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , ""); +/* + * netmap_admode selects the netmap mode to use. + * Invalid values are reset to NETMAP_ADMODE_BEST + */ +enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ + NETMAP_ADMODE_NATIVE, /* either native or none */ + NETMAP_ADMODE_GENERIC, /* force generic */ + NETMAP_ADMODE_LAST }; +#define NETMAP_ADMODE_NATIVE 1 /* Force native netmap adapter. */ +#define NETMAP_ADMODE_GENERIC 2 /* Force generic netmap adapter. */ +#define NETMAP_ADMODE_BEST 0 /* Priority to native netmap adapter. */ +static int netmap_admode = NETMAP_ADMODE_BEST; + +int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ +int netmap_generic_ringsize = 1024; /* Generic ringsize. 
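 * Both knobs are exported via the sysctls declared right below,
 * so they can be tuned at runtime; an illustrative (not tested
 * here) invocation would be:
 *
 *	sysctl dev.netmap.generic_ringsize=2048
 *	sysctl dev.netmap.generic_mit=250000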
*/ + SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); NMG_LOCK_T netmap_global_lock; -/* - * protect against multiple threads using the same ring. - * also check that the ring has not been stopped. - */ -#define NM_KR_BUSY 1 -#define NM_KR_STOPPED 2 -static void nm_kr_put(struct netmap_kring *kr); -static __inline int nm_kr_tryget(struct netmap_kring *kr) -{ - /* check a first time without taking the lock - * to avoid starvation for nm_kr_get() - */ - if (unlikely(kr->nkr_stopped)) { - ND("ring %p stopped (%d)", kr, kr->nkr_stopped); - return NM_KR_STOPPED; - } - if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))) - return NM_KR_BUSY; - /* check a second time with lock held */ - if (unlikely(kr->nkr_stopped)) { - ND("ring %p stopped (%d)", kr, kr->nkr_stopped); - nm_kr_put(kr); - return NM_KR_STOPPED; - } - return 0; -} - -static __inline void nm_kr_put(struct netmap_kring *kr) -{ - NM_ATOMIC_CLEAR(&kr->nr_busy); -} -static void nm_kr_get(struct netmap_kring *kr) +static void +nm_kr_get(struct netmap_kring *kr) { while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) tsleep(kr, 0, "NM_KR_GET", 4); } -static void nm_disable_ring(struct netmap_kring *kr) + +void +netmap_disable_ring(struct netmap_kring *kr) { kr->nkr_stopped = 1; nm_kr_get(kr); @@ -365,7 +261,9 @@ static void nm_disable_ring(struct netmap_kring *kr) nm_kr_put(kr); } -void netmap_disable_all_rings(struct ifnet *ifp) + +static void +netmap_set_all_rings(struct ifnet *ifp, int stopped) { struct netmap_adapter *na; int i; @@ -375,35 +273,37 @@ void netmap_disable_all_rings(struct ifnet *ifp) na = NA(ifp); - for (i = 0; i < na->num_tx_rings + 1; i++) { - nm_disable_ring(na->tx_rings + i); - selwakeuppri(&na->tx_rings[i].si, PI_NET); + for (i = 0; i <= na->num_tx_rings; i++) { + if (stopped) + netmap_disable_ring(na->tx_rings + i); + else + na->tx_rings[i].nkr_stopped = 0; + na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY | + (i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0)); } - for (i = 0; i < na->num_rx_rings + 1; i++) { - nm_disable_ring(na->rx_rings + i); - selwakeuppri(&na->rx_rings[i].si, PI_NET); + + for (i = 0; i <= na->num_rx_rings; i++) { + if (stopped) + netmap_disable_ring(na->rx_rings + i); + else + na->rx_rings[i].nkr_stopped = 0; + na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY | + (i == na->num_rx_rings ? 
NAF_GLOBAL_NOTIFY: 0)); } - selwakeuppri(&na->tx_si, PI_NET); - selwakeuppri(&na->rx_si, PI_NET); } -void netmap_enable_all_rings(struct ifnet *ifp) + +void +netmap_disable_all_rings(struct ifnet *ifp) { - struct netmap_adapter *na; - int i; + netmap_set_all_rings(ifp, 1 /* stopped */); +} - if (!(ifp->if_capenable & IFCAP_NETMAP)) - return; - na = NA(ifp); - for (i = 0; i < na->num_tx_rings + 1; i++) { - D("enabling %p", na->tx_rings + i); - na->tx_rings[i].nkr_stopped = 0; - } - for (i = 0; i < na->num_rx_rings + 1; i++) { - D("enabling %p", na->rx_rings + i); - na->rx_rings[i].nkr_stopped = 0; - } +void +netmap_enable_all_rings(struct ifnet *ifp) +{ + netmap_set_all_rings(ifp, 0 /* enabled */); } @@ -432,6 +332,7 @@ nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) return *v; } + /* * packet-dump function, user-supplied or static buffer. * The destination buffer must be at least 30+4*len @@ -440,7 +341,7 @@ const char * nm_dump_buf(char *p, int len, int lim, char *dst) { static char _dst[8192]; - int i, j, i0; + int i, j, i0; static char hex[] ="0123456789abcdef"; char *o; /* output position */ @@ -477,358 +378,13 @@ nm_dump_buf(char *p, int len, int lim, char *dst) return dst; } -/* - * system parameters (most of them in netmap_kern.h) - * NM_NAME prefix for switch port names, default "vale" - * NM_BDG_MAXPORTS number of ports - * NM_BRIDGES max number of switches in the system. - * XXX should become a sysctl or tunable - * - * Switch ports are named valeX:Y where X is the switch name and Y - * is the port. If Y matches a physical interface name, the port is - * connected to a physical device. - * - * Unlike physical interfaces, switch ports use their own memory region - * for rings and buffers. - * The virtual interfaces use per-queue lock instead of core lock. - * In the tx loop, we aggregate traffic in batches to make all operations - * faster. The batch size is bridge_batch. - */ -#define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ -#define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ -#define NM_BRIDGE_RINGSIZE 1024 /* in the device */ -#define NM_BDG_HASH 1024 /* forwarding table entries */ -#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ -#define NM_MULTISEG 64 /* max size of a chain of bufs */ -/* actual size of the tables */ -#define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) -/* NM_FT_NULL terminates a list of slots in the ft */ -#define NM_FT_NULL NM_BDG_BATCH_MAX -#define NM_BRIDGES 8 /* number of bridges */ - - -/* - * bridge_batch is set via sysctl to the max batch size to be - * used in the bridge. The actual value may be larger as the - * last packet in the block may overflow the size. - */ -int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ -SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); - - -/* - * These are used to handle reference counters for bridge ports. - */ -#define ADD_BDG_REF(ifp) refcount_acquire(&NA(ifp)->na_bdg_refcount) -#define DROP_BDG_REF(ifp) refcount_release(&NA(ifp)->na_bdg_refcount) - -/* The bridge references the buffers using the device specific look up table */ -static inline void * -BDG_NMB(struct netmap_mem_d *nmd, struct netmap_slot *slot) -{ - struct lut_entry *lut = nmd->pools[NETMAP_BUF_POOL].lut; - uint32_t i = slot->buf_idx; - return (unlikely(i >= nmd->pools[NETMAP_BUF_POOL].objtotal)) ? 
lut[0].vaddr : lut[i].vaddr; -} - -static int bdg_netmap_attach(struct netmap_adapter *); -static int bdg_netmap_reg(struct ifnet *ifp, int onoff); -int kern_netmap_regif(struct nmreq *nmr); - -/* - * Each transmit queue accumulates a batch of packets into - * a structure before forwarding. Packets to the same - * destination are put in a list using ft_next as a link field. - * ft_frags and ft_next are valid only on the first fragment. - */ -struct nm_bdg_fwd { /* forwarding entry for a bridge */ - void *ft_buf; /* netmap or indirect buffer */ - uint8_t ft_frags; /* how many fragments (only on 1st frag) */ - uint8_t _ft_port; /* dst port (unused) */ - uint16_t ft_flags; /* flags, e.g. indirect */ - uint16_t ft_len; /* src fragment len */ - uint16_t ft_next; /* next packet to same destination */ -}; - -/* - * For each output interface, nm_bdg_q is used to construct a list. - * bq_len is the number of output buffers (we can have coalescing - * during the copy). - */ -struct nm_bdg_q { - uint16_t bq_head; - uint16_t bq_tail; - uint32_t bq_len; /* number of buffers */ -}; - -/* XXX revise this */ -struct nm_hash_ent { - uint64_t mac; /* the top 2 bytes are the epoch */ - uint64_t ports; -}; - -/* - * nm_bridge is a descriptor for a VALE switch. - * Interfaces for a bridge are all in bdg_ports[]. - * The array has fixed size, an empty entry does not terminate - * the search, but lookups only occur on attach/detach so we - * don't mind if they are slow. - * - * The bridge is non blocking on the transmit ports: excess - * packets are dropped if there is no room on the output port. - * - * bdg_lock protects accesses to the bdg_ports array. - * This is a rw lock (or equivalent). - */ -struct nm_bridge { - /* XXX what is the proper alignment/layout ? */ - BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */ - int bdg_namelen; - uint32_t bdg_active_ports; /* 0 means free */ - char bdg_basename[IFNAMSIZ]; - - /* Indexes of active ports (up to active_ports) - * and all other remaining ports. - */ - uint8_t bdg_port_index[NM_BDG_MAXPORTS]; - - struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS]; - - - /* - * The function to decide the destination port. - * It returns either of an index of the destination port, - * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to - * forward this packet. ring_nr is the source ring index, and the - * function may overwrite this value to forward this packet to a - * different ring index. - * This function must be set by netmap_bdgctl(). - */ - bdg_lookup_fn_t nm_bdg_lookup; - - /* the forwarding table, MAC+ports. - * XXX should be changed to an argument to be passed to - * the lookup function, and allocated on attach - */ - struct nm_hash_ent ht[NM_BDG_HASH]; -}; - - -/* - * XXX in principle nm_bridges could be created dynamically - * Right now we have a static array and deletions are protected - * by an exclusive lock. - */ -struct nm_bridge nm_bridges[NM_BRIDGES]; - - -/* - * A few function to tell which kind of port are we using. - * XXX should we hold a lock ? 
- * - * nma_is_vp() virtual port - * nma_is_host() port connected to the host stack - * nma_is_hw() port connected to a NIC - */ -int nma_is_vp(struct netmap_adapter *na); -int -nma_is_vp(struct netmap_adapter *na) -{ - return na->nm_register == bdg_netmap_reg; -} - -static __inline int -nma_is_host(struct netmap_adapter *na) -{ - return na->nm_register == NULL; -} - -static __inline int -nma_is_hw(struct netmap_adapter *na) -{ - /* In case of sw adapter, nm_register is NULL */ - return !nma_is_vp(na) && !nma_is_host(na); -} - - -/* - * If the NIC is owned by the kernel - * (i.e., bridge), neither another bridge nor user can use it; - * if the NIC is owned by a user, only users can share it. - * Evaluation must be done under NMG_LOCK(). - */ -#define NETMAP_OWNED_BY_KERN(ifp) (!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg) -#define NETMAP_OWNED_BY_ANY(ifp) \ - (NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0)) - -/* - * NA(ifp)->bdg_port port index - */ - - -/* - * this is a slightly optimized copy routine which rounds - * to multiple of 64 bytes and is often faster than dealing - * with other odd sizes. We assume there is enough room - * in the source and destination buffers. - * - * XXX only for multiples of 64 bytes, non overlapped. - */ -static inline void -pkt_copy(void *_src, void *_dst, int l) -{ - uint64_t *src = _src; - uint64_t *dst = _dst; - if (unlikely(l >= 1024)) { - memcpy(dst, src, l); - return; - } - for (; likely(l > 0); l-=64) { - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - } -} - - -/* - * locate a bridge among the existing ones. - * MUST BE CALLED WITH NMG_LOCK() - * - * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. - * We assume that this is called with a name of at least NM_NAME chars. - */ -static struct nm_bridge * -nm_find_bridge(const char *name, int create) -{ - int i, l, namelen; - struct nm_bridge *b = NULL; - - NMG_LOCK_ASSERT(); - - namelen = strlen(NM_NAME); /* base length */ - l = name ? strlen(name) : 0; /* actual length */ - if (l < namelen) { - D("invalid bridge name %s", name ? name : NULL); - return NULL; - } - for (i = namelen + 1; i < l; i++) { - if (name[i] == ':') { - namelen = i; - break; - } - } - if (namelen >= IFNAMSIZ) - namelen = IFNAMSIZ; - ND("--- prefix is '%.*s' ---", namelen, name); - - /* lookup the name, remember empty slot if there is one */ - for (i = 0; i < NM_BRIDGES; i++) { - struct nm_bridge *x = nm_bridges + i; - - if (x->bdg_active_ports == 0) { - if (create && b == NULL) - b = x; /* record empty slot */ - } else if (x->bdg_namelen != namelen) { - continue; - } else if (strncmp(name, x->bdg_basename, namelen) == 0) { - ND("found '%.*s' at %d", namelen, name, i); - b = x; - break; - } - } - if (i == NM_BRIDGES && b) { /* name not found, can create entry */ - /* initialize the bridge */ - strncpy(b->bdg_basename, name, namelen); - ND("create new bridge %s with ports %d", b->bdg_basename, - b->bdg_active_ports); - b->bdg_namelen = namelen; - b->bdg_active_ports = 0; - for (i = 0; i < NM_BDG_MAXPORTS; i++) - b->bdg_port_index[i] = i; - /* set the default function */ - b->nm_bdg_lookup = netmap_bdg_learning; - /* reset the MAC address table */ - bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); - } - return b; -} - - -/* - * Free the forwarding tables for rings attached to switch ports. 
- */ -static void -nm_free_bdgfwd(struct netmap_adapter *na) -{ - int nrings, i; - struct netmap_kring *kring; - - NMG_LOCK_ASSERT(); - nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; - kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; - for (i = 0; i < nrings; i++) { - if (kring[i].nkr_ft) { - free(kring[i].nkr_ft, M_DEVBUF); - kring[i].nkr_ft = NULL; /* protect from freeing twice */ - } - } - if (nma_is_hw(na)) - nm_free_bdgfwd(SWNA(na->ifp)); -} - - -/* - * Allocate the forwarding tables for the rings attached to the bridge ports. - */ -static int -nm_alloc_bdgfwd(struct netmap_adapter *na) -{ - int nrings, l, i, num_dstq; - struct netmap_kring *kring; - - NMG_LOCK_ASSERT(); - /* all port:rings + broadcast */ - num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; - l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; - l += sizeof(struct nm_bdg_q) * num_dstq; - l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; - - nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; - kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; - for (i = 0; i < nrings; i++) { - struct nm_bdg_fwd *ft; - struct nm_bdg_q *dstq; - int j; - - ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); - if (!ft) { - nm_free_bdgfwd(na); - return ENOMEM; - } - dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); - for (j = 0; j < num_dstq; j++) { - dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; - dstq[j].bq_len = 0; - } - kring[i].nkr_ft = ft; - } - if (nma_is_hw(na)) - nm_alloc_bdgfwd(SWNA(na->ifp)); - return 0; -} /* * Fetch configuration from the device, to cope with dynamic * reconfigurations after loading the module. */ -static int +int netmap_update_config(struct netmap_adapter *na) { struct ifnet *ifp = na->ifp; @@ -836,7 +392,7 @@ netmap_update_config(struct netmap_adapter *na) txr = txd = rxr = rxd = 0; if (na->nm_config) { - na->nm_config(ifp, &txr, &txd, &rxr, &rxd); + na->nm_config(na, &txr, &txd, &rxr, &rxd); } else { /* take whatever we had at init time */ txr = na->num_tx_rings; @@ -848,15 +404,15 @@ netmap_update_config(struct netmap_adapter *na) if (na->num_tx_rings == txr && na->num_tx_desc == txd && na->num_rx_rings == rxr && na->num_rx_desc == rxd) return 0; /* nothing changed */ - if (netmap_verbose || na->refcount > 0) { + if (netmap_verbose || na->active_fds > 0) { D("stored config %s: txring %d x %d, rxring %d x %d", - ifp->if_xname, + NM_IFPNAME(ifp), na->num_tx_rings, na->num_tx_desc, na->num_rx_rings, na->num_rx_desc); D("new config %s: txring %d x %d, rxring %d x %d", - ifp->if_xname, txr, txd, rxr, rxd); + NM_IFPNAME(ifp), txr, txd, rxr, rxd); } - if (na->refcount == 0) { + if (na->active_fds == 0) { D("configuration changed (but fine)"); na->num_tx_rings = txr; na->num_tx_desc = txd; @@ -868,52 +424,111 @@ netmap_update_config(struct netmap_adapter *na) return 1; } -static struct netmap_if * + +int +netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom) +{ + u_int i, len, ndesc; + struct netmap_kring *kring; + + len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; + + na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); + if (na->tx_rings == NULL) { + D("Cannot allocate krings"); + return ENOMEM; + } + na->rx_rings = na->tx_rings + ntx; + + ndesc = na->num_tx_desc; + for (i = 0; i < ntx; i++) { /* Transmit rings */ + kring = &na->tx_rings[i]; + bzero(kring, sizeof(*kring)); + kring->na = na; + kring->nkr_num_slots = ndesc; + /* + * IMPORTANT: + * Always keep one slot empty, so we can detect new + * transmissions comparing cur and nr_hwcur 
(they are + * the same only if there are no new transmissions). + */ + kring->nr_hwavail = ndesc - 1; + mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF); + init_waitqueue_head(&kring->si); + } + + ndesc = na->num_rx_desc; + for (i = 0; i < nrx; i++) { /* Receive rings */ + kring = &na->rx_rings[i]; + bzero(kring, sizeof(*kring)); + kring->na = na; + kring->nkr_num_slots = ndesc; + mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF); + init_waitqueue_head(&kring->si); + } + init_waitqueue_head(&na->tx_si); + init_waitqueue_head(&na->rx_si); + + na->tailroom = na->rx_rings + nrx; + + return 0; + +} + + +void +netmap_krings_delete(struct netmap_adapter *na) +{ + int i; + + for (i = 0; i < na->num_tx_rings + 1; i++) { + mtx_destroy(&na->tx_rings[i].q_lock); + } + for (i = 0; i < na->num_rx_rings + 1; i++) { + mtx_destroy(&na->rx_rings[i].q_lock); + } + free(na->tx_rings, M_DEVBUF); + na->tx_rings = na->rx_rings = na->tailroom = NULL; +} + + +static struct netmap_if* netmap_if_new(const char *ifname, struct netmap_adapter *na) { + struct netmap_if *nifp; + if (netmap_update_config(na)) { /* configuration mismatch, report and fail */ return NULL; } - return netmap_mem_if_new(ifname, na); -} + if (na->active_fds) + goto final; -/* Structure associated to each thread which registered an interface. - * - * The first 4 fields of this structure are written by NIOCREGIF and - * read by poll() and NIOC?XSYNC. - * There is low contention among writers (actually, a correct user program - * should have no contention among writers) and among writers and readers, - * so we use a single global lock to protect the structure initialization. - * Since initialization involves the allocation of memory, we reuse the memory - * allocator lock. - * Read access to the structure is lock free. Readers must check that - * np_nifp is not NULL before using the other fields. - * If np_nifp is NULL initialization has not been performed, so they should - * return an error to userlevel. - * - * The ref_done field is used to regulate access to the refcount in the - * memory allocator. The refcount must be incremented at most once for - * each open("/dev/netmap"). The increment is performed by the first - * function that calls netmap_get_memory() (currently called by - * mmap(), NIOCGINFO and NIOCREGIF). - * If the refcount is incremented, it is then decremented when the - * private structure is destroyed. - */ -struct netmap_priv_d { - struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ + if (na->nm_krings_create(na)) + goto cleanup; - struct ifnet *np_ifp; /* device for which we hold a ref. */ - int np_ringid; /* from the ioctl */ - u_int np_qfirst, np_qlast; /* range of rings to scan */ - uint16_t np_txpoll; + if (netmap_mem_rings_create(na)) + goto cleanup; + +final: + + nifp = netmap_mem_if_new(ifname, na); + if (nifp == NULL) + goto cleanup; + + return (nifp); + +cleanup: + + if (na->active_fds == 0) { + netmap_mem_rings_delete(na); + na->nm_krings_delete(na); + } + + return NULL; +} - struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */ -#ifdef __FreeBSD__ - int np_refcount; /* use with NMG_LOCK held */ -#endif /* __FreeBSD__ */ -}; /* grab a reference to the memory allocator, if we don't have one already. The * reference is taken from the netmap_adapter registered with the priv. 
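 *
 * (Editorial note, with a worked example of the invariant set up in
 * netmap_krings_create() above: with num_tx_desc == 1024 each tx
 * kring starts with nr_hwavail = 1024 - 1 = 1023. One slot is kept
 * empty at all times, so cur == nr_hwcur unambiguously means "no
 * new packets to send" rather than "ring completely full".)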
@@ -925,7 +540,7 @@ netmap_get_memory_locked(struct netmap_priv_d* p) struct netmap_mem_d *nmd; int error = 0; - if (p->np_ifp == NULL) { + if (p->np_na == NULL) { if (!netmap_mmap_unreg) return ENODEV; /* for compatibility with older versions of the API @@ -934,7 +549,7 @@ netmap_get_memory_locked(struct netmap_priv_d* p) */ nmd = &nm_mem; } else { - nmd = NA(p->np_ifp)->nm_mem; + nmd = p->np_na->nm_mem; } if (p->np_mref == NULL) { error = netmap_mem_finalize(nmd); @@ -950,7 +565,8 @@ netmap_get_memory_locked(struct netmap_priv_d* p) return error; } -static int + +int netmap_get_memory(struct netmap_priv_d* p) { int error; @@ -960,12 +576,14 @@ netmap_get_memory(struct netmap_priv_d* p) return error; } + static int netmap_have_memory_locked(struct netmap_priv_d* p) { return p->np_mref != NULL; } + static void netmap_drop_memory_locked(struct netmap_priv_d* p) { @@ -975,11 +593,12 @@ netmap_drop_memory_locked(struct netmap_priv_d* p) } } + /* * File descriptor's private data destructor. * * Call nm_register(ifp,0) to stop netmap mode on the interface and - * revert to normal operation. We expect that np_ifp has not gone. + * revert to normal operation. We expect that np_na->ifp has not gone. * The second argument is the nifp to work on. In some cases it is * not attached yet to the netmap_priv_d so we need to pass it as * a separate argument. @@ -988,16 +607,15 @@ netmap_drop_memory_locked(struct netmap_priv_d* p) static void netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) { - struct ifnet *ifp = priv->np_ifp; - struct netmap_adapter *na = NA(ifp); + struct netmap_adapter *na = priv->np_na; + struct ifnet *ifp = na->ifp; NMG_LOCK_ASSERT(); - na->refcount--; - if (na->refcount <= 0) { /* last instance */ - u_int i; + na->active_fds--; + if (na->active_fds <= 0) { /* last instance */ if (netmap_verbose) - D("deleting last instance for %s", ifp->if_xname); + D("deleting last instance for %s", NM_IFPNAME(ifp)); /* * (TO CHECK) This function is only called * when the last reference to this file descriptor goes @@ -1012,140 +630,33 @@ netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) * happens if the close() occurs while a concurrent * syscall is running. */ - na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */ + if (ifp) + na->nm_register(na, 0); /* off, clear flags */ /* Wake up any sleeping threads. netmap_poll will * then return POLLERR * XXX The wake up now must happen during *_down(), when * we order all activities to stop. -gl */ - nm_free_bdgfwd(na); - for (i = 0; i < na->num_tx_rings + 1; i++) { - mtx_destroy(&na->tx_rings[i].q_lock); - } - for (i = 0; i < na->num_rx_rings + 1; i++) { - mtx_destroy(&na->rx_rings[i].q_lock); - } /* XXX kqueue(9) needed; these will mirror knlist_init. */ /* knlist_destroy(&na->tx_si.si_note); */ /* knlist_destroy(&na->rx_si.si_note); */ - if (nma_is_hw(na)) - SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL; - } - /* - * netmap_mem_if_delete() deletes the nifp, and if this is - * the last instance also buffers, rings and krings. - */ - netmap_mem_if_delete(na, nifp); -} - - -/* we assume netmap adapter exists - * Called with NMG_LOCK held - */ -static void -nm_if_rele(struct ifnet *ifp) -{ - int i, is_hw, hw, sw, lim; - struct nm_bridge *b; - struct netmap_adapter *na; - uint8_t tmp[NM_BDG_MAXPORTS]; - - NMG_LOCK_ASSERT(); - /* I can be called not only for get_ifp()-ed references where netmap's - * capability is guaranteed, but also for non-netmap-capable NICs. 
- */ - if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) { - if_rele(ifp); - return; - } - na = NA(ifp); - b = na->na_bdg; - is_hw = nma_is_hw(na); - - ND("%s has %d references", ifp->if_xname, NA(ifp)->na_bdg_refcount); - - if (!DROP_BDG_REF(ifp)) - return; - - /* - New algorithm: - make a copy of bdg_port_index; - lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port - in the array of bdg_port_index, replacing them with - entries from the bottom of the array; - decrement bdg_active_ports; - acquire BDG_WLOCK() and copy back the array. - */ - - hw = NA(ifp)->bdg_port; - sw = (is_hw && SWNA(ifp)->na_bdg) ? SWNA(ifp)->bdg_port : -1; - lim = b->bdg_active_ports; - - ND("detach %d and %d (lim %d)", hw, sw, lim); - /* make a copy of the list of active ports, update it, - * and then copy back within BDG_WLOCK(). - */ - memcpy(tmp, b->bdg_port_index, sizeof(tmp)); - for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { - if (hw >= 0 && tmp[i] == hw) { - ND("detach hw %d at %d", hw, i); - lim--; /* point to last active port */ - tmp[i] = tmp[lim]; /* swap with i */ - tmp[lim] = hw; /* now this is inactive */ - hw = -1; - } else if (sw >= 0 && tmp[i] == sw) { - ND("detach sw %d at %d", sw, i); - lim--; - tmp[i] = tmp[lim]; - tmp[lim] = sw; - sw = -1; - } else { - i++; - } - } - if (hw >= 0 || sw >= 0) { - D("XXX delete failed hw %d sw %d, should panic...", hw, sw); - } - hw = NA(ifp)->bdg_port; - sw = (is_hw && SWNA(ifp)->na_bdg) ? SWNA(ifp)->bdg_port : -1; - BDG_WLOCK(b); - b->bdg_ports[hw] = NULL; - na->na_bdg = NULL; - if (sw >= 0) { - b->bdg_ports[sw] = NULL; - SWNA(ifp)->na_bdg = NULL; - } - memcpy(b->bdg_port_index, tmp, sizeof(tmp)); - b->bdg_active_ports = lim; - BDG_WUNLOCK(b); - - ND("now %d active ports", lim); - if (lim == 0) { - ND("marking bridge %s as free", b->bdg_basename); - b->nm_bdg_lookup = NULL; - } - - if (is_hw) { - if_rele(ifp); - } else { - if (na->na_flags & NAF_MEM_OWNER) - netmap_mem_private_delete(na->nm_mem); - bzero(na, sizeof(*na)); - free(na, M_DEVBUF); - bzero(ifp, sizeof(*ifp)); - free(ifp, M_DEVBUF); + /* delete rings and buffers */ + netmap_mem_rings_delete(na); + na->nm_krings_delete(na); } + /* delete the nifp */ + netmap_mem_if_delete(na, nifp); } /* * returns 1 if this is the last instance and we can free priv */ -static int +int netmap_dtor_locked(struct netmap_priv_d *priv) { - struct ifnet *ifp = priv->np_ifp; + struct netmap_adapter *na = priv->np_na; #ifdef __FreeBSD__ /* @@ -1156,17 +667,21 @@ netmap_dtor_locked(struct netmap_priv_d *priv) return 0; } #endif /* __FreeBSD__ */ - if (ifp) { - netmap_do_unregif(priv, priv->np_nifp); + if (!na) { + return 1; //XXX is it correct? } + netmap_do_unregif(priv, priv->np_nifp); + priv->np_nifp = NULL; netmap_drop_memory_locked(priv); - if (ifp) { - nm_if_rele(ifp); /* might also destroy *na */ + if (priv->np_na) { + netmap_adapter_put(na); + priv->np_na = NULL; } return 1; } -static void + +void netmap_dtor(void *data) { struct netmap_priv_d *priv = data; @@ -1182,190 +697,6 @@ netmap_dtor(void *data) } -#ifdef __FreeBSD__ - -/* - * In order to track whether pages are still mapped, we hook into - * the standard cdev_pager and intercept the constructor and - * destructor. 
- */ - -struct netmap_vm_handle_t { - struct cdev *dev; - struct netmap_priv_d *priv; -}; - -static int -netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, - vm_ooffset_t foff, struct ucred *cred, u_short *color) -{ - struct netmap_vm_handle_t *vmh = handle; - D("handle %p size %jd prot %d foff %jd", - handle, (intmax_t)size, prot, (intmax_t)foff); - dev_ref(vmh->dev); - return 0; -} - - -static void -netmap_dev_pager_dtor(void *handle) -{ - struct netmap_vm_handle_t *vmh = handle; - struct cdev *dev = vmh->dev; - struct netmap_priv_d *priv = vmh->priv; - D("handle %p", handle); - netmap_dtor(priv); - free(vmh, M_DEVBUF); - dev_rel(dev); -} - -static int -netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, - int prot, vm_page_t *mres) -{ - struct netmap_vm_handle_t *vmh = object->handle; - struct netmap_priv_d *priv = vmh->priv; - vm_paddr_t paddr; - vm_page_t page; - vm_memattr_t memattr; - vm_pindex_t pidx; - - ND("object %p offset %jd prot %d mres %p", - object, (intmax_t)offset, prot, mres); - memattr = object->memattr; - pidx = OFF_TO_IDX(offset); - paddr = netmap_mem_ofstophys(priv->np_mref, offset); - if (paddr == 0) - return VM_PAGER_FAIL; - - if (((*mres)->flags & PG_FICTITIOUS) != 0) { - /* - * If the passed in result page is a fake page, update it with - * the new physical address. - */ - page = *mres; - vm_page_updatefake(page, paddr, memattr); - } else { - /* - * Replace the passed in reqpage page with our own fake page and - * free up the all of the original pages. - */ -#ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */ -#define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK -#define VM_OBJECT_WLOCK VM_OBJECT_LOCK -#endif /* VM_OBJECT_WUNLOCK */ - - VM_OBJECT_WUNLOCK(object); - page = vm_page_getfake(paddr, memattr); - VM_OBJECT_WLOCK(object); - vm_page_lock(*mres); - vm_page_free(*mres); - vm_page_unlock(*mres); - *mres = page; - vm_page_insert(page, object, pidx); - } - page->valid = VM_PAGE_BITS_ALL; - return (VM_PAGER_OK); -} - - -static struct cdev_pager_ops netmap_cdev_pager_ops = { - .cdev_pg_ctor = netmap_dev_pager_ctor, - .cdev_pg_dtor = netmap_dev_pager_dtor, - .cdev_pg_fault = netmap_dev_pager_fault, -}; - - -static int -netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, - vm_size_t objsize, vm_object_t *objp, int prot) -{ - int error; - struct netmap_vm_handle_t *vmh; - struct netmap_priv_d *priv; - vm_object_t obj; - - D("cdev %p foff %jd size %jd objp %p prot %d", cdev, - (intmax_t )*foff, (intmax_t )objsize, objp, prot); - - vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (vmh == NULL) - return ENOMEM; - vmh->dev = cdev; - - NMG_LOCK(); - error = devfs_get_cdevpriv((void**)&priv); - if (error) - goto err_unlock; - vmh->priv = priv; - priv->np_refcount++; - NMG_UNLOCK(); - - error = netmap_get_memory(priv); - if (error) - goto err_deref; - - obj = cdev_pager_allocate(vmh, OBJT_DEVICE, - &netmap_cdev_pager_ops, objsize, prot, - *foff, NULL); - if (obj == NULL) { - D("cdev_pager_allocate failed"); - error = EINVAL; - goto err_deref; - } - - *objp = obj; - return 0; - -err_deref: - NMG_LOCK(); - priv->np_refcount--; -err_unlock: - NMG_UNLOCK(); -// err: - free(vmh, M_DEVBUF); - return error; -} - - -// XXX can we remove this ? 
-static int -netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) -{ - if (netmap_verbose) - D("dev %p fflag 0x%x devtype %d td %p", - dev, fflag, devtype, td); - return 0; -} - - -static int -netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) -{ - struct netmap_priv_d *priv; - int error; - - (void)dev; - (void)oflags; - (void)devtype; - (void)td; - - // XXX wait or nowait ? - priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (priv == NULL) - return ENOMEM; - - error = devfs_set_cdevpriv(priv, netmap_dtor); - if (error) - return error; - - priv->np_refcount = 1; - - return 0; -} -#endif /* __FreeBSD__ */ /* @@ -1391,26 +722,19 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) * pass a chain of buffers to the host stack as coming from 'dst' */ static void -netmap_send_up(struct ifnet *dst, struct mbuf *head) +netmap_send_up(struct ifnet *dst, struct mbq *q) { struct mbuf *m; /* send packets up, outside the lock */ - while ((m = head) != NULL) { - head = head->m_nextpkt; - m->m_nextpkt = NULL; + while ((m = mbq_dequeue(q)) != NULL) { if (netmap_verbose & NM_VERB_HOST) D("sending up pkt %p size %d", m, MBUF_LEN(m)); NM_SEND_UP(dst, m); } + mbq_destroy(q); } -struct mbq { - struct mbuf *head; - struct mbuf *tail; - int count; -}; - /* * put a copy of the buffers marked NS_FORWARD into an mbuf chain. @@ -1425,9 +749,9 @@ netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) * XXX handle reserved */ u_int lim = kring->nkr_num_slots - 1; - struct mbuf *m, *tail = q->tail; + struct mbuf *m; u_int k = kring->ring->cur, n = kring->ring->reserved; - struct netmap_mem_d *nmd = kring->na->nm_mem; + struct netmap_adapter *na = kring->na; /* compute the final position, ring->cur - ring->reserved */ if (n > 0) { @@ -1441,25 +765,18 @@ netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) n = nm_next(n, lim); if ((slot->flags & NS_FORWARD) == 0 && !force) continue; - if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(nmd)) { + if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { D("bad pkt at %d len %d", n, slot->len); continue; } slot->flags &= ~NS_FORWARD; // XXX needed ? /* XXX adapt to the case of a multisegment packet */ - m = m_devget(BDG_NMB(nmd, slot), slot->len, 0, kring->na->ifp, NULL); + m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL); if (m == NULL) break; - if (tail) - tail->m_nextpkt = m; - else - q->head = m; - tail = m; - q->count++; - m->m_nextpkt = NULL; + mbq_enqueue(q, m); } - q->tail = tail; } @@ -1536,16 +853,19 @@ out: * can be among multiple user threads erroneously calling * this routine concurrently. */ -static void +void netmap_txsync_to_host(struct netmap_adapter *na) { struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; struct netmap_ring *ring = kring->ring; u_int k, lim = kring->nkr_num_slots - 1; - struct mbq q = { NULL, NULL, 0 }; + struct mbq q; + int error; - if (nm_kr_tryget(kring)) { - D("ring %p busy (user error)", kring); + error = nm_kr_tryget(kring); + if (error) { + if (error == NM_KR_BUSY) + D("ring %p busy (user error)", kring); return; } k = ring->cur; @@ -1560,29 +880,13 @@ netmap_txsync_to_host(struct netmap_adapter *na) * In case of no buffers we give up. At the end of the loop, * the queue is drained in all cases. 
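 *
 * (Editorial sketch: the mbq calls visible in this diff compose as
 * follows, replacing the hand-rolled m_nextpkt chain removed above:
 *
 *	struct mbq q;
 *
 *	mbq_init(&q);
 *	netmap_grab_packets(kring, &q, force);	- fills q via mbq_enqueue()
 *	netmap_send_up(na->ifp, &q);		- drains q via mbq_dequeue()
 *						  and ends with mbq_destroy()
 *
 * so the queue is always initialized by the producer and torn down
 * by the consumer.)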
*/ + mbq_init(&q); netmap_grab_packets(kring, &q, 1); kring->nr_hwcur = k; kring->nr_hwavail = ring->avail = lim; nm_kr_put(kring); - netmap_send_up(na->ifp, q.head); -} - - -/* - * This is the 'txsync' handler to send from a software ring to the - * host stack. - */ -/* SWNA(ifp)->txrings[0] is always NA(ifp)->txrings[NA(ifp)->num_txrings] */ -static int -netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int flags) -{ - (void)ring_nr; - (void)flags; - if (netmap_verbose > 255) - RD(5, "sync to host %s ring %d", ifp->if_xname, ring_nr); - netmap_txsync_to_host(NA(ifp)); - return 0; + netmap_send_up(na->ifp, &q); } @@ -1610,7 +914,6 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai if (kring->nkr_stopped) /* check a first time without lock */ return; - /* XXX as an optimization we could reuse na->core_lock */ mtx_lock(&kring->q_lock); if (kring->nkr_stopped) /* check again with lock held */ @@ -1629,7 +932,7 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai ring->reserved = resvd = 0; // XXX panic... } k = (k >= resvd) ? k - resvd : k + lim - resvd; - } + } if (j != k) { n = k >= j ? k - j : k + lim - j; kring->nr_hwavail -= n; @@ -1646,6 +949,104 @@ unlock_out: } +/* Get a netmap adapter for the port. + * + * If it is possible to satisfy the request, return 0 + * with *na containing the netmap adapter found. + * Otherwise return an error code, with *na containing NULL. + * + * When the port is attached to a bridge, we always return + * EBUSY. + * Otherwise, if the port is already bound to a file descriptor, + * then we unconditionally return the existing adapter into *na. + * In all the other cases, we return (into *na) either native, + * generic or NULL, according to the following table: + * + * native_support + * active_fds dev.netmap.admode YES NO + * ------------------------------------------------------- + * >0 * NA(ifp) NA(ifp) + * + * 0 NETMAP_ADMODE_BEST NATIVE GENERIC + * 0 NETMAP_ADMODE_NATIVE NATIVE NULL + * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC + * + */ + +int +netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) +{ + /* generic support */ + int i = netmap_admode; /* Take a snapshot. */ + int error = 0; + struct netmap_adapter *prev_na; + struct netmap_generic_adapter *gna; + + *na = NULL; /* default */ + + /* reset in case of invalid value */ + if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) + i = netmap_admode = NETMAP_ADMODE_BEST; + + if (NETMAP_CAPABLE(ifp)) { + /* If an adapter already exists, but is + * attached to a vale port, we report that the + * port is busy. + */ + if (NETMAP_OWNED_BY_KERN(NA(ifp))) + return EBUSY; + + /* If an adapter already exists, return it if + * there are active file descriptors or if + * netmap is not forced to use generic + * adapters. + */ + if (NA(ifp)->active_fds > 0 || + i != NETMAP_ADMODE_GENERIC) { + *na = NA(ifp); + return 0; + } + } + + /* If there isn't native support and netmap is not allowed + * to use generic adapters, we cannot satisfy the request. + */ + if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE) + return EINVAL; + + /* Otherwise, create a generic adapter and return it, + * saving the previously used netmap adapter, if any. + * + * Note that here 'prev_na', if not NULL, MUST be a + * native adapter, and CANNOT be a generic one. This is + * true because generic adapters are created on demand, and + * destroyed when not used anymore. 
Therefore, if the adapter + * currently attached to an interface 'ifp' is generic, it + * must be that + * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))). + * Consequently, if NA(ifp) is generic, we will enter one of + * the branches above. This ensures that we never override + * a generic adapter with another generic adapter. + */ + prev_na = NA(ifp); + error = generic_netmap_attach(ifp); + if (error) + return error; + + *na = NA(ifp); + gna = (struct netmap_generic_adapter*)NA(ifp); + gna->prev = prev_na; /* save old na */ + if (prev_na != NULL) { + ifunit_ref(ifp->if_xname); + // XXX add a refcount ? + netmap_adapter_get(prev_na); + } + D("Created generic NA %p (prev %p)", gna, gna->prev); + + return 0; +} + + /* * MUST BE CALLED UNDER NMG_LOCK() * @@ -1666,179 +1067,191 @@ unlock_out: * being detached from the bridge in error handling. But once refcount * is acquired by this function, it must be released using nm_if_rele(). */ -static int -get_ifp(struct nmreq *nmr, struct ifnet **ifp, int create) +int +netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) { - const char *name = nmr->nr_name; - int namelen = strlen(name); - struct ifnet *iter = NULL; - int no_prefix = 0; + struct ifnet *ifp; + int error = 0; + struct netmap_adapter *ret; - /* first try to see if this is a bridge port. */ - struct nm_bridge *b; - struct netmap_adapter *na; - int i, j, cand = -1, cand2 = -1; - int needed; + *na = NULL; /* default return value */ + /* first try to see if this is a bridge port. */ NMG_LOCK_ASSERT(); - *ifp = NULL; /* default */ - if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { - no_prefix = 1; /* no VALE prefix */ - goto no_bridge_port; - } - b = nm_find_bridge(name, create); - if (b == NULL) { - D("no bridges available for '%s'", name); - return (ENXIO); + error = netmap_get_bdg_na(nmr, na, create); + if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */ + return error; + + ifp = ifunit_ref(nmr->nr_name); + if (ifp == NULL) { + return ENXIO; } - /* Now we are sure that name starts with the bridge's name, - * lookup the port in the bridge. We need to scan the entire - * list. It is not important to hold a WLOCK on the bridge - * during the search because NMG_LOCK already guarantees - * that there are no other possible writers. - */ + error = netmap_get_hw_na(ifp, &ret); + if (error) + goto out; - /* lookup in the local list of ports */ - for (j = 0; j < b->bdg_active_ports; j++) { - i = b->bdg_port_index[j]; - na = b->bdg_ports[i]; - // KASSERT(na != NULL); - iter = na->ifp; - /* XXX make sure the name only contains one : */ - if (!strcmp(iter->if_xname, name) /* virtual port */ || - (namelen > b->bdg_namelen && !strcmp(iter->if_xname, - name + b->bdg_namelen + 1)) /* NIC */) { - ADD_BDG_REF(iter); - ND("found existing if %s refs %d", name, - NA(iter)->na_bdg_refcount); - *ifp = iter; - /* we are done, this is surely netmap capable */ - return 0; + if (ret != NULL) { + /* Users cannot use the NIC attached to a bridge directly */ + if (NETMAP_OWNED_BY_KERN(ret)) { + error = EINVAL; + goto out; } + error = 0; + *na = ret; + netmap_adapter_get(ret); } - /* not found, should we create it? 
*/ - if (!create) - return ENXIO; - /* yes we should, see if we have space to attach entries */ - needed = 2; /* in some cases we only need 1 */ - if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) { - D("bridge full %d, cannot create new port", b->bdg_active_ports); - return EINVAL; +out: + if_rele(ifp); + + return error; +} + + +/* + * validate parameters on entry for *_txsync() + * Returns ring->cur if ok, or something >= kring->nkr_num_slots + * in case of error. The extra argument is a pointer to + * 'new_bufs'. XXX this may be deprecated at some point. + * + * Below is a correct configuration on input. ring->cur + * must be in the region covered by kring->hwavail, + * and ring->avail and kring->avail should end at the same slot. + * + * +-hwcur + * | + * v<--hwres-->|<-----hwavail----> + * ------+------------------------------+-------- ring + * | + * |<---avail---> + * +--cur + * + */ +u_int +nm_txsync_prologue(struct netmap_kring *kring, u_int *new_slots) +{ + struct netmap_ring *ring = kring->ring; + u_int cur = ring->cur; /* read only once */ + u_int avail = ring->avail; /* read only once */ + u_int n = kring->nkr_num_slots; + u_int kstart, kend, a; + +#if 1 /* kernel sanity checks */ + if (kring->nr_hwcur >= n || + kring->nr_hwreserved >= n || kring->nr_hwavail >= n || + kring->nr_hwreserved + kring->nr_hwavail >= n) + goto error; +#endif /* kernel sanity checks */ + kstart = kring->nr_hwcur + kring->nr_hwreserved; + if (kstart >= n) + kstart -= n; + kend = kstart + kring->nr_hwavail; + /* user sanity checks. a is the expected avail */ + if (cur < kstart) { + /* too low, but maybe wraparound */ + if (cur + n > kend) + goto error; + *new_slots = cur + n - kstart; + a = kend - cur - n; + } else { + if (cur > kend) + goto error; + *new_slots = cur - kstart; + a = kend - cur; } - /* record the next two ports available, but do not allocate yet */ - cand = b->bdg_port_index[b->bdg_active_ports]; - cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; - ND("+++ bridge %s port %s used %d avail %d %d", - b->bdg_basename, name, b->bdg_active_ports, cand, cand2); + if (a != avail) { + RD(5, "wrong but fixable avail have %d need %d", + avail, a); + ring->avail = avail = a; + } + return cur; - /* - * try see if there is a matching NIC with this name - * (after the bridge's name) - */ - iter = ifunit_ref(name + b->bdg_namelen + 1); - if (!iter) { /* this is a virtual port */ - /* Create a temporary NA with arguments, then - * bdg_netmap_attach() will allocate the real one - * and attach it to the ifp - */ - struct netmap_adapter tmp_na; - int error; +error: + RD(5, "kring error: hwcur %d hwres %d hwavail %d cur %d av %d", + kring->nr_hwcur, + kring->nr_hwreserved, kring->nr_hwavail, + cur, avail); + return n; +} - if (nmr->nr_cmd) { - /* nr_cmd must be 0 for a virtual port */ - return EINVAL; - } - bzero(&tmp_na, sizeof(tmp_na)); - /* bound checking */ - tmp_na.num_tx_rings = nmr->nr_tx_rings; - nm_bound_var(&tmp_na.num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); - nmr->nr_tx_rings = tmp_na.num_tx_rings; // write back - tmp_na.num_rx_rings = nmr->nr_rx_rings; - nm_bound_var(&tmp_na.num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); - nmr->nr_rx_rings = tmp_na.num_rx_rings; // write back - nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, - 1, NM_BDG_MAXSLOTS, NULL); - tmp_na.num_tx_desc = nmr->nr_tx_slots; - nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, - 1, NM_BDG_MAXSLOTS, NULL); - tmp_na.num_rx_desc = nmr->nr_rx_slots; - - /* create a struct ifnet for the new port. 
- * need M_NOWAIT as we are under nma_lock - */ - iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO); - if (!iter) - return ENOMEM; - - strcpy(iter->if_xname, name); - tmp_na.ifp = iter; - /* bdg_netmap_attach creates a struct netmap_adapter */ - error = bdg_netmap_attach(&tmp_na); - if (error) { - D("error %d", error); - free(iter, M_DEVBUF); - return error; - } - cand2 = -1; /* only need one port */ - } else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */ - /* make sure the NIC is not already in use */ - if (NETMAP_OWNED_BY_ANY(iter)) { - D("NIC %s busy, cannot attach to bridge", - iter->if_xname); - if_rele(iter); /* don't detach from bridge */ - return EINVAL; + +/* + * validate parameters on entry for *_rxsync() + * Returns ring->cur - ring->reserved if ok, + * or something >= kring->nkr_num_slots + * in case of error. The extra argument is a pointer to + * 'resvd'. XXX this may be deprecated at some point. + * + * Below is a correct configuration on input. ring->cur and + * ring->reserved must be in the region covered by kring->hwavail, + * and ring->avail and kring->avail should end at the same slot. + * + * +-hwcur + * | + * v<-------hwavail----------> + * ---------+--------------------------+-------- ring + * |<--res-->| + * |<---avail---> + * +--cur + * + */ +u_int +nm_rxsync_prologue(struct netmap_kring *kring, u_int *resvd) +{ + struct netmap_ring *ring = kring->ring; + u_int cur = ring->cur; /* read only once */ + u_int avail = ring->avail; /* read only once */ + u_int res = ring->reserved; /* read only once */ + u_int n = kring->nkr_num_slots; + u_int kend = kring->nr_hwcur + kring->nr_hwavail; + u_int a; + +#if 1 /* kernel sanity checks */ + if (kring->nr_hwcur >= n || kring->nr_hwavail >= n) + goto error; +#endif /* kernel sanity checks */ + /* user sanity checks */ + if (res >= n) + goto error; + /* check that cur is valid, a is the expected value of avail */ + if (cur < kring->nr_hwcur) { + /* too low, but maybe wraparound */ + if (cur + n > kend) + goto error; + a = kend - (cur + n); + } else { + if (cur > kend) + goto error; + a = kend - cur; + } + if (a != avail) { + RD(5, "wrong but fixable avail have %d need %d", + avail, a); + ring->avail = avail = a; + } + if (res != 0) { + /* then repeat the check for cur + res */ + cur = (cur >= res) ? cur - res : n + cur - res; + if (cur < kring->nr_hwcur) { + /* too low, but maybe wraparound */ + if (cur + n > kend) + goto error; + } else if (cur > kend) { + goto error; } - if (nmr->nr_arg1 != NETMAP_BDG_HOST) - cand2 = -1; /* only need one port */ - } else { /* not a netmap-capable NIC */ - if_rele(iter); /* don't detach from bridge */ - return EINVAL; } - na = NA(iter); - - BDG_WLOCK(b); - na->bdg_port = cand; - ND("NIC %p to bridge port %d", NA(iter), cand); - /* bind the port to the bridge (virtual ports are not active) */ - b->bdg_ports[cand] = na; - na->na_bdg = b; - b->bdg_active_ports++; - if (cand2 >= 0) { - /* also bind the host stack to the bridge */ - b->bdg_ports[cand2] = SWNA(iter); - SWNA(iter)->bdg_port = cand2; - SWNA(iter)->na_bdg = b; - b->bdg_active_ports++; - ND("host %p to bridge port %d", SWNA(iter), cand2); - } - ADD_BDG_REF(iter); // XXX one or two ? - ND("if %s refs %d", name, NA(iter)->na_bdg_refcount); - BDG_WUNLOCK(b); - *ifp = iter; - return 0; - -no_bridge_port: - *ifp = iter; - if (! 
*ifp) - *ifp = ifunit_ref(name); - if (*ifp == NULL) - return (ENXIO); + *resvd = res; + return cur; - if (NETMAP_CAPABLE(*ifp)) { - /* Users cannot use the NIC attached to a bridge directly */ - if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) { - if_rele(*ifp); /* don't detach from bridge */ - return EINVAL; - } else - return 0; /* valid pointer, we hold the refcount */ - } - nm_if_rele(*ifp); - return EINVAL; // not NETMAP capable +error: + RD(5, "kring error: hwcur %d hwres %d hwavail %d cur %d av %d res %d", + kring->nr_hwcur, + kring->nr_hwreserved, kring->nr_hwavail, + ring->cur, avail, res); + return n; } - /* * Error routine called when txsync/rxsync detects an error. * Can't do much more than resetting cur = hwcur, avail = hwavail. @@ -1859,7 +1272,7 @@ netmap_ring_reinit(struct netmap_kring *kring) int errors = 0; // XXX KASSERT nm_kr_tryget - RD(10, "called for %s", kring->na->ifp->if_xname); + RD(10, "called for %s", NM_IFPNAME(kring->na->ifp)); if (ring->cur > lim) errors++; for (i = 0; i <= lim; i++) { @@ -1884,7 +1297,7 @@ netmap_ring_reinit(struct netmap_kring *kring) RD(10, "total %d errors", errors); errors++; RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", - kring->na->ifp->if_xname, + NM_IFPNAME(kring->na->ifp), pos < n ? "TX" : "RX", pos < n ? pos : pos - n, ring->cur, kring->nr_hwcur, ring->avail, kring->nr_hwavail); @@ -1902,8 +1315,8 @@ netmap_ring_reinit(struct netmap_kring *kring) static int netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) { - struct ifnet *ifp = priv->np_ifp; - struct netmap_adapter *na = NA(ifp); + struct netmap_adapter *na = priv->np_na; + struct ifnet *ifp = na->ifp; u_int i = ringid & NETMAP_RING_MASK; /* initially (np_qfirst == np_qlast) we don't want to lock */ u_int lim = na->num_rx_rings; @@ -1928,12 +1341,12 @@ netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; if (netmap_verbose) { if (ringid & NETMAP_SW_RING) - D("ringid %s set to SW RING", ifp->if_xname); + D("ringid %s set to SW RING", NM_IFPNAME(ifp)); else if (ringid & NETMAP_HW_RING) - D("ringid %s set to HW RING %d", ifp->if_xname, + D("ringid %s set to HW RING %d", NM_IFPNAME(ifp), priv->np_qfirst); else - D("ringid %s set to all %d HW RINGS", ifp->if_xname, lim); + D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim); } return 0; } @@ -1944,18 +1357,18 @@ netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) * If success it returns a pointer to netmap_if, otherwise NULL. * This must be called with NMG_LOCK held. 
*/ -static struct netmap_if * -netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp, +struct netmap_if * +netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, uint16_t ringid, int *err) { - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_if *nifp = NULL; - int error, need_mem; + int error, need_mem = 0; NMG_LOCK_ASSERT(); /* ring configuration may have changed, fetch from the card */ netmap_update_config(na); - priv->np_ifp = ifp; /* store the reference */ + priv->np_na = na; /* store the reference */ error = netmap_set_ringid(priv, ringid); if (error) goto out; @@ -1967,57 +1380,40 @@ netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp, if (error) goto out; } - nifp = netmap_if_new(ifp->if_xname, na); + nifp = netmap_if_new(NM_IFPNAME(ifp), na); if (nifp == NULL) { /* allocation failed */ /* we should drop the allocator, but only * if we were the ones who grabbed it */ - if (need_mem) - netmap_drop_memory_locked(priv); error = ENOMEM; goto out; } - na->refcount++; + na->active_fds++; if (ifp->if_capenable & IFCAP_NETMAP) { /* was already set */ } else { - u_int i; /* Otherwise set the card in netmap mode * and make it use the shared buffers. * - * If the interface is attached to a bridge, lock it. - */ - if (NETMAP_OWNED_BY_KERN(ifp)) - BDG_WLOCK(NA(ifp)->na_bdg); - for (i = 0 ; i < na->num_tx_rings + 1; i++) - mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock", - NULL, MTX_DEF); - for (i = 0 ; i < na->num_rx_rings + 1; i++) { - mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", - NULL, MTX_DEF); - } - if (nma_is_hw(na)) { - SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings]; - SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings]; - } - /* * do not core lock because the race is harmless here, * there cannot be any traffic to netmap_transmit() */ - error = na->nm_register(ifp, 1); /* mode on */ - // XXX do we need to nm_alloc_bdgfwd() in all cases ? - if (!error) - error = nm_alloc_bdgfwd(na); + na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut; + ND("%p->na_lut == %p", na, na->na_lut); + na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal; + error = na->nm_register(na, 1); /* mode on */ if (error) { netmap_do_unregif(priv, nifp); nifp = NULL; } - if (NETMAP_OWNED_BY_KERN(ifp)) - BDG_WUNLOCK(NA(ifp)->na_bdg); - } out: *err = error; + if (error) { + priv->np_na = NULL; + if (need_mem) + netmap_drop_memory_locked(priv); + } if (nifp != NULL) { /* * advertise that the interface is ready bt setting ni_nifp. @@ -2030,251 +1426,6 @@ out: return nifp; } -/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */ -static int -nm_bdg_attach(struct nmreq *nmr) -{ - struct ifnet *ifp; - struct netmap_if *nifp; - struct netmap_priv_d *npriv; - int error; - - npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); - if (npriv == NULL) - return ENOMEM; - NMG_LOCK(); - error = get_ifp(nmr, &ifp, 1 /* create if not exists */); - if (error) /* no device, or another bridge or user owns the device */ - goto unlock_exit; - /* get_ifp() sets na_bdg if this is a physical interface - * that we can attach to a switch. - */ - if (!NETMAP_OWNED_BY_KERN(ifp)) { - /* got reference to a virtual port or direct access to a NIC. 
- * perhaps specified no bridge prefix or wrong NIC name - */ - error = EINVAL; - goto unref_exit; - } - - if (NA(ifp)->refcount > 0) { /* already registered */ - error = EBUSY; - DROP_BDG_REF(ifp); - goto unlock_exit; - } - - nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error); - if (!nifp) { - goto unref_exit; - } - - NA(ifp)->na_kpriv = npriv; - NMG_UNLOCK(); - ND("registered %s to netmap-mode", ifp->if_xname); - return 0; - -unref_exit: - nm_if_rele(ifp); -unlock_exit: - NMG_UNLOCK(); - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); - return error; -} - -static int -nm_bdg_detach(struct nmreq *nmr) -{ - struct ifnet *ifp; - int error; - int last_instance; - - NMG_LOCK(); - error = get_ifp(nmr, &ifp, 0 /* don't create */); - if (error) { /* no device, or another bridge or user owns the device */ - goto unlock_exit; - } - /* XXX do we need to check this ? */ - if (!NETMAP_OWNED_BY_KERN(ifp)) { - /* got reference to a virtual port or direct access to a NIC. - * perhaps specified no bridge's prefix or wrong NIC's name - */ - error = EINVAL; - goto unref_exit; - } - - if (NA(ifp)->refcount == 0) { /* not registered */ - error = EINVAL; - goto unref_exit; - } - - DROP_BDG_REF(ifp); /* the one from get_ifp */ - last_instance = netmap_dtor_locked(NA(ifp)->na_kpriv); /* unregister */ - NMG_UNLOCK(); - if (!last_instance) { - D("--- error, trying to detach an entry with active mmaps"); - error = EINVAL; - } else { - struct netmap_priv_d *npriv = NA(ifp)->na_kpriv; - NA(ifp)->na_kpriv = NULL; - - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); - } - return error; - -unref_exit: - nm_if_rele(ifp); -unlock_exit: - NMG_UNLOCK(); - return error; -} - - -/* Initialize necessary fields of sw adapter located in right after hw's - * one. sw adapter attaches a pair of sw rings of the netmap-mode NIC. - * It is always activated and deactivated at the same tie with the hw's one. - * Thus we don't need refcounting on the sw adapter. - * Regardless of NIC's feature we use separate lock so that anybody can lock - * me independently from the hw adapter. - * Make sure nm_register is NULL to be handled as FALSE in nma_is_hw - */ -static void -netmap_attach_sw(struct ifnet *ifp) -{ - struct netmap_adapter *hw_na = NA(ifp); - struct netmap_adapter *na = SWNA(ifp); - - na->ifp = ifp; - na->num_rx_rings = na->num_tx_rings = 1; - na->num_tx_desc = hw_na->num_tx_desc; - na->num_rx_desc = hw_na->num_rx_desc; - na->nm_txsync = netmap_bdg_to_host; - /* we use the same memory allocator as the - * the hw adapter */ - na->nm_mem = hw_na->nm_mem; -} - - -/* exported to kernel callers, e.g. OVS ? - * Entry point. - * Called without NMG_LOCK. 
- */ -int -netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) -{ - struct nm_bridge *b; - struct netmap_adapter *na; - struct ifnet *iter; - char *name = nmr->nr_name; - int cmd = nmr->nr_cmd, namelen = strlen(name); - int error = 0, i, j; - - switch (cmd) { - case NETMAP_BDG_ATTACH: - error = nm_bdg_attach(nmr); - break; - - case NETMAP_BDG_DETACH: - error = nm_bdg_detach(nmr); - break; - - case NETMAP_BDG_LIST: - /* this is used to enumerate bridges and ports */ - if (namelen) { /* look up indexes of bridge and port */ - if (strncmp(name, NM_NAME, strlen(NM_NAME))) { - error = EINVAL; - break; - } - NMG_LOCK(); - b = nm_find_bridge(name, 0 /* don't create */); - if (!b) { - error = ENOENT; - NMG_UNLOCK(); - break; - } - - error = ENOENT; - for (j = 0; j < b->bdg_active_ports; j++) { - i = b->bdg_port_index[j]; - na = b->bdg_ports[i]; - if (na == NULL) { - D("---AAAAAAAAARGH-------"); - continue; - } - iter = na->ifp; - /* the former and the latter identify a - * virtual port and a NIC, respectively - */ - if (!strcmp(iter->if_xname, name) || - (namelen > b->bdg_namelen && - !strcmp(iter->if_xname, - name + b->bdg_namelen + 1))) { - /* bridge index */ - nmr->nr_arg1 = b - nm_bridges; - nmr->nr_arg2 = i; /* port index */ - error = 0; - break; - } - } - NMG_UNLOCK(); - } else { - /* return the first non-empty entry starting from - * bridge nr_arg1 and port nr_arg2. - * - * Users can detect the end of the same bridge by - * seeing the new and old value of nr_arg1, and can - * detect the end of all the bridge by error != 0 - */ - i = nmr->nr_arg1; - j = nmr->nr_arg2; - - NMG_LOCK(); - for (error = ENOENT; i < NM_BRIDGES; i++) { - b = nm_bridges + i; - if (j >= b->bdg_active_ports) { - j = 0; /* following bridges scan from 0 */ - continue; - } - nmr->nr_arg1 = i; - nmr->nr_arg2 = j; - j = b->bdg_port_index[j]; - na = b->bdg_ports[j]; - iter = na->ifp; - strncpy(name, iter->if_xname, (size_t)IFNAMSIZ); - error = 0; - break; - } - NMG_UNLOCK(); - } - break; - - case NETMAP_BDG_LOOKUP_REG: - /* register a lookup function to the given bridge. - * nmr->nr_name may be just bridge's name (including ':' - * if it is not just NM_NAME). - */ - if (!func) { - error = EINVAL; - break; - } - NMG_LOCK(); - b = nm_find_bridge(name, 0 /* don't create */); - if (!b) { - error = EINVAL; - } else { - b->nm_bdg_lookup = func; - } - NMG_UNLOCK(); - break; - - default: - D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); - error = EINVAL; - break; - } - return error; -} /* @@ -2290,7 +1441,7 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) * * Return 0 on success, errno otherwise. 
*/ -static int +int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { @@ -2353,13 +1504,12 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, if (nmr->nr_name[0] != '\0') { /* get a refcount */ - error = get_ifp(nmr, &ifp, 1 /* create */); + error = netmap_get_na(nmr, &na, 1 /* create */); if (error) break; - na = NA(ifp); /* retrieve the netmap adapter */ - nmd = na->nm_mem; /* and its memory allocator */ + nmd = na->nm_mem; /* get memory allocator */ } - + error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); if (error) break; @@ -2374,9 +1524,8 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, nmr->nr_tx_slots = na->num_tx_desc; if (memflags & NETMAP_MEM_PRIVATE) nmr->nr_ringid |= NETMAP_PRIV_MEM; + netmap_adapter_put(na); } while (0); - if (ifp) - nm_if_rele(ifp); /* return the refcount */ NMG_UNLOCK(); break; @@ -2388,7 +1537,8 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, } /* possibly attach/detach NIC and VALE switch */ i = nmr->nr_cmd; - if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) { + if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH + || i == NETMAP_BDG_OFFSET) { error = netmap_bdg_ctl(nmr, NULL); break; } else if (i != 0) { @@ -2402,36 +1552,35 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, do { u_int memflags; - if (priv->np_ifp != NULL) { /* thread already registered */ + if (priv->np_na != NULL) { /* thread already registered */ error = netmap_set_ringid(priv, nmr->nr_ringid); break; } /* find the interface and a reference */ - error = get_ifp(nmr, &ifp, 1 /* create */); /* keep reference */ + error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ if (error) break; - if (NETMAP_OWNED_BY_KERN(ifp)) { - nm_if_rele(ifp); + ifp = na->ifp; + if (NETMAP_OWNED_BY_KERN(na)) { + netmap_adapter_put(na); error = EBUSY; break; } - nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error); + nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error); if (!nifp) { /* reg. 
failed, release priv and ref */ - nm_if_rele(ifp); /* return the refcount */ - priv->np_ifp = NULL; + netmap_adapter_put(na); priv->np_nifp = NULL; break; } /* return the offset of the netmap_if object */ - na = NA(ifp); /* retrieve netmap adapter */ nmr->nr_rx_rings = na->num_rx_rings; nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); if (error) { - nm_if_rele(ifp); + netmap_adapter_put(na); break; } if (memflags & NETMAP_MEM_PRIVATE) { @@ -2459,15 +1608,21 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, } rmb(); /* make sure following reads are not from cache */ - ifp = priv->np_ifp; /* we have a reference */ + na = priv->np_na; /* we have a reference */ + if (na == NULL) { + D("Internal error: nifp != NULL && na == NULL"); + error = ENXIO; + break; + } + + ifp = na->ifp; if (ifp == NULL) { - D("Internal error: nifp != NULL && ifp == NULL"); + RD(1, "the ifp is gone"); error = ENXIO; break; } - na = NA(ifp); /* retrieve netmap adapter */ if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ if (cmd == NIOCTXSYNC) netmap_txsync_to_host(na); @@ -2493,13 +1648,13 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, D("pre txsync ring %d cur %d hwcur %d", i, kring->ring->cur, kring->nr_hwcur); - na->nm_txsync(ifp, i, NAF_FORCE_RECLAIM); + na->nm_txsync(na, i, NAF_FORCE_RECLAIM); if (netmap_verbose & NM_VERB_TXSYNC) D("post txsync ring %d cur %d hwcur %d", i, kring->ring->cur, kring->nr_hwcur); } else { - na->nm_rxsync(ifp, i, NAF_FORCE_READ); + na->nm_rxsync(na, i, NAF_FORCE_READ); microtime(&na->rx_rings[i].ring->ts); } nm_kr_put(kring); @@ -2521,15 +1676,17 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, bzero(&so, sizeof(so)); NMG_LOCK(); - error = get_ifp(nmr, &ifp, 0 /* don't create */); /* keep reference */ + error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */ if (error) { + netmap_adapter_put(na); NMG_UNLOCK(); break; } + ifp = na->ifp; so.so_vnet = ifp->if_vnet; // so->so_proto not null. error = ifioctl(&so, cmd, data, td); - nm_if_rele(ifp); + netmap_adapter_put(na); NMG_UNLOCK(); break; } @@ -2560,7 +1717,7 @@ out: * The first one is remapped to pwait as selrecord() uses the name as an * hidden argument. */ -static int +int netmap_poll(struct cdev *dev, int events, struct thread *td) { struct netmap_priv_d *priv = NULL; @@ -2569,12 +1726,18 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) struct netmap_kring *kring; u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; u_int lim_tx, lim_rx, host_forwarded = 0; - struct mbq q = { NULL, NULL, 0 }; + struct mbq q; void *pwait = dev; /* linux compatibility */ - int retry_tx = 1; + /* + * In order to avoid nested locks, we need to "double check" + * txsync and rxsync if we decide to do a selrecord(). + * retry_tx (and retry_rx, later) prevent looping forever. + */ + int retry_tx = 1; (void)pwait; + mbq_init(&q); if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) return POLLERR; @@ -2585,18 +1748,22 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) } rmb(); /* make sure following reads are not from cache */ - ifp = priv->np_ifp; - // XXX check for deleting() ? 
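(Editorial aside, not part of this patch: both netmap_ioctl() and netmap_poll() open with the same guard being rewritten here — np_nifp must be non-NULL before np_na is trusted, with a read barrier in between because np_nifp is published last at register time. A hypothetical helper capturing the pattern, for illustration only:)

/* Hypothetical sketch of the entry check shared by the ioctl and
 * poll paths; returns NULL if the fd has no completed NIOCREGIF. */
static struct netmap_adapter *
nm_priv_get_na(struct netmap_priv_d *priv)
{
	if (priv->np_nifp == NULL)	/* registration not finished */
		return NULL;
	rmb();				/* don't read np_na before np_nifp is seen */
	return priv->np_na;		/* caller must still check na->ifp */
}

(End of aside; the rewritten poll entry code follows.)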
+ na = priv->np_na; + ifp = na->ifp; + // check for deleted + if (ifp == NULL) { + RD(1, "the ifp is gone"); + return POLLERR; + } + if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) return POLLERR; if (netmap_verbose & 0x8000) - D("device %s events 0x%x", ifp->if_xname, events); + D("device %s events 0x%x", NM_IFPNAME(ifp), events); want_tx = events & (POLLOUT | POLLWRNORM); want_rx = events & (POLLIN | POLLRDNORM); - na = NA(ifp); /* retrieve netmap adapter */ - lim_tx = na->num_tx_rings; lim_rx = na->num_rx_rings; @@ -2618,7 +1785,11 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) return (revents); } - /* if we are in transparent mode, check also the host rx ring */ + /* + * If we are in transparent mode, check also the host rx ring + * XXX Transparent mode at the moment requires to bind all + * rings to a single file descriptor. + */ kring = &na->rx_rings[lim_rx]; if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all && want_rx @@ -2630,8 +1801,8 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) } /* - * check_all is set if the card has more than one queue AND - * the client is polling all of them. If true, we sleep on + * check_all_{tx|rx} are set if the card has more than one queue AND + * the file descriptor is bound to all of them. If so, we sleep on * the "global" selinfo, otherwise we sleep on individual selinfo * (FreeBSD only allows two selinfo's per file descriptor). * The interrupt routine in the driver wake one or the other @@ -2650,9 +1821,11 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) } /* - * We start with a lock free round which is good if we have - * data available. If this fails, then lock and call the sync + * We start with a lock free round which is cheap if we have + * slots available. If this fails, then lock and call the sync * routines. + * XXX rather than ring->avail >0 should check that + * ring->cur has not reached hwcur+hwavail */ for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { kring = &na->rx_rings[i]; @@ -2673,6 +1846,8 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * If we to push packets out (priv->np_txpoll) or want_tx is * still set, we do need to run the txsync calls (on all rings, * to avoid that the tx rings stall). + * XXX should also check cur != hwcur on the tx rings. + * Fortunately, normal tx mode has np_txpoll set. */ if (priv->np_txpoll || want_tx) { /* If we really want to be woken up (want_tx), @@ -2693,18 +1868,27 @@ flush_tx: continue; /* make sure only one user thread is doing this */ if (nm_kr_tryget(kring)) { - ND("ring %p busy is %d", kring, (int)kring->nr_busy); + ND("ring %p busy is %d", + kring, (int)kring->nr_busy); revents |= POLLERR; goto out; } if (netmap_verbose & NM_VERB_TXSYNC) D("send %d on %s %d", - kring->ring->cur, ifp->if_xname, i); - if (na->nm_txsync(ifp, i, 0)) + kring->ring->cur, NM_IFPNAME(ifp), i); + if (na->nm_txsync(na, i, 0)) revents |= POLLERR; - /* Check avail/call selrecord only if called with POLLOUT */ + /* Check avail and call selrecord only if + * called with POLLOUT and run out of bufs. + * XXX Note, we cannot trust much ring->avail + * as it is exposed to userspace (even though + * just updated by txsync). We should really + * check kring->nr_hwavail or better have + * txsync set a flag telling if we need + * to do a selrecord(). + */ if (want_tx) { if (kring->ring->avail > 0) { /* stop at the first ring. 
We don't risk @@ -2748,7 +1932,7 @@ do_retry_rx: netmap_grab_packets(kring, &q, netmap_fwd); } - if (na->nm_rxsync(ifp, i, 0)) + if (na->nm_rxsync(na, i, 0)) revents |= POLLERR; if (netmap_no_timestamp == 0 || kring->ring->flags & NR_TIMESTAMP) { @@ -2784,7 +1968,7 @@ do_retry_rx: } if (q.head) - netmap_send_up(na->ifp, q.head); + netmap_send_up(na->ifp, &q); out: @@ -2793,6 +1977,71 @@ out: /*------- driver support routines ------*/ +static int netmap_hw_krings_create(struct netmap_adapter *); + +static int +netmap_notify(struct netmap_adapter *na, u_int n_ring, enum txrx tx, int flags) +{ + struct netmap_kring *kring; + + if (tx == NR_TX) { + kring = na->tx_rings + n_ring; + selwakeuppri(&kring->si, PI_NET); + if (flags & NAF_GLOBAL_NOTIFY) + selwakeuppri(&na->tx_si, PI_NET); + } else { + kring = na->rx_rings + n_ring; + selwakeuppri(&kring->si, PI_NET); + if (flags & NAF_GLOBAL_NOTIFY) + selwakeuppri(&na->rx_si, PI_NET); + } + return 0; +} + + +// XXX check handling of failures +int +netmap_attach_common(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + + if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { + D("%s: invalid rings tx %d rx %d", + ifp->if_xname, na->num_tx_rings, na->num_rx_rings); + return EINVAL; + } + WNA(ifp) = na; + NETMAP_SET_CAPABLE(ifp); + if (na->nm_krings_create == NULL) { + na->nm_krings_create = netmap_hw_krings_create; + na->nm_krings_delete = netmap_krings_delete; + } + if (na->nm_notify == NULL) + na->nm_notify = netmap_notify; + na->active_fds = 0; + + if (na->nm_mem == NULL) + na->nm_mem = &nm_mem; + return 0; +} + + +void +netmap_detach_common(struct netmap_adapter *na) +{ + if (na->ifp) + WNA(na->ifp) = NULL; /* XXX do we need this? */ + + if (na->tx_rings) { /* XXX should not happen */ + D("freeing leftover tx_rings"); + na->nm_krings_delete(na); + } + if (na->na_flags & NAF_MEM_OWNER) + netmap_mem_private_delete(na->nm_mem); + bzero(na, sizeof(*na)); + free(na, M_DEVBUF); +} + /* * Initialize a ``netmap_adapter`` object created by driver on attach. @@ -2809,53 +2058,85 @@ out: * setups. */ int -netmap_attach(struct netmap_adapter *arg, u_int num_queues) +netmap_attach(struct netmap_adapter *arg) { - struct netmap_adapter *na = NULL; + struct netmap_hw_adapter *hwna = NULL; + // XXX when is arg == NULL ? struct ifnet *ifp = arg ? arg->ifp : NULL; - size_t len; if (arg == NULL || ifp == NULL) goto fail; - /* a VALE port uses two endpoints */ - len = nma_is_vp(arg) ? sizeof(*na) : sizeof(*na) * 2; - na = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); - if (na == NULL) + hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (hwna == NULL) goto fail; - WNA(ifp) = na; - *na = *arg; /* copy everything, trust the driver to not pass junk */ - NETMAP_SET_CAPABLE(ifp); - if (na->num_tx_rings == 0) - na->num_tx_rings = num_queues; - na->num_rx_rings = num_queues; - na->refcount = na->na_single = na->na_multi = 0; - /* Core lock initialized here, others after netmap_if_new. 
*/ - mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF); + hwna->up = *arg; + if (netmap_attach_common(&hwna->up)) { + free(hwna, M_DEVBUF); + goto fail; + } + netmap_adapter_get(&hwna->up); + #ifdef linux if (ifp->netdev_ops) { - ND("netdev_ops %p", ifp->netdev_ops); /* prepare a clone of the netdev ops */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) - na->nm_ndo.ndo_start_xmit = ifp->netdev_ops; + hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops; #else - na->nm_ndo = *ifp->netdev_ops; + hwna->nm_ndo = *ifp->netdev_ops; #endif } - na->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; + hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; #endif /* linux */ - na->nm_mem = arg->nm_mem ? arg->nm_mem : &nm_mem; - if (!nma_is_vp(arg)) - netmap_attach_sw(ifp); - D("success for %s", ifp->if_xname); + + D("success for %s", NM_IFPNAME(ifp)); return 0; fail: - D("fail, arg %p ifp %p na %p", arg, ifp, na); + D("fail, arg %p ifp %p na %p", arg, ifp, hwna); netmap_detach(ifp); - return (na ? EINVAL : ENOMEM); + return (hwna ? EINVAL : ENOMEM); +} + + +void +NM_DBG(netmap_adapter_get)(struct netmap_adapter *na) +{ + if (!na) { + return; + } + + refcount_acquire(&na->na_refcount); } +/* returns 1 iff the netmap_adapter is destroyed */ +int +NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) +{ + if (!na) + return 1; + + if (!refcount_release(&na->na_refcount)) + return 0; + + if (na->nm_dtor) + na->nm_dtor(na); + + netmap_detach_common(na); + + return 1; +} + + +int +netmap_hw_krings_create(struct netmap_adapter *na) +{ + return netmap_krings_create(na, + na->num_tx_rings + 1, na->num_rx_rings + 1, 0); +} + + + /* * Free the allocated memory linked to the given ``netmap_adapter`` * object. @@ -2868,33 +2149,22 @@ netmap_detach(struct ifnet *ifp) if (!na) return; - mtx_destroy(&na->core_lock); - - if (na->tx_rings) { /* XXX should not happen */ - D("freeing leftover tx_rings"); - free(na->tx_rings, M_DEVBUF); - } - if (na->na_flags & NAF_MEM_OWNER) - netmap_mem_private_delete(na->nm_mem); - bzero(na, sizeof(*na)); - WNA(ifp) = NULL; - free(na, M_DEVBUF); + NMG_LOCK(); + netmap_disable_all_rings(ifp); + netmap_adapter_put(na); + na->ifp = NULL; + netmap_enable_all_rings(ifp); + NMG_UNLOCK(); } -int -nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, - struct netmap_adapter *na, u_int ring_nr); - - /* * Intercept packets from the network stack and pass them * to netmap as incoming packets on the 'software' ring. * We rely on the OS to make sure that the ifp and na do not go * away (typically the caller checks for IFF_DRV_RUNNING or the like). * In nm_register() or whenever there is a reinitialization, - * we make sure to access the core lock and per-ring locks - * so that IFCAP_NETMAP is visible here. + * we make sure to make the mode change visible here. 
*/ int netmap_transmit(struct ifnet *ifp, struct mbuf *m) @@ -2917,44 +2187,16 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m) kring = &na->rx_rings[na->num_rx_rings]; lim = kring->nkr_num_slots - 1; if (netmap_verbose & NM_VERB_HOST) - D("%s packet %d len %d from the stack", ifp->if_xname, + D("%s packet %d len %d from the stack", NM_IFPNAME(ifp), kring->nr_hwcur + kring->nr_hwavail, len); // XXX reconsider long packets if we handle fragments if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */ - D("%s from_host, drop packet size %d > %d", ifp->if_xname, + D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp), len, NETMAP_BDG_BUF_SIZE(na->nm_mem)); goto done; } - if (SWNA(ifp)->na_bdg) { - struct nm_bdg_fwd *ft; - char *dst; - - na = SWNA(ifp); /* we operate on the host port */ - ft = na->rx_rings[0].nkr_ft; - dst = BDG_NMB(na->nm_mem, &na->rx_rings[0].ring->slot[0]); - - /* use slot 0 in the ft, there is nothing queued here */ - /* XXX we can save the copy calling m_copydata in nm_bdg_flush, - * need a special flag for this. - */ - m_copydata(m, 0, (int)len, dst); - ft->ft_flags = 0; - ft->ft_len = len; - ft->ft_buf = dst; - ft->ft_next = NM_FT_NULL; - ft->ft_frags = 1; - if (netmap_verbose & NM_VERB_HOST) - RD(5, "pkt %p size %d to bridge port %d", - dst, len, na->bdg_port); - nm_bdg_flush(ft, 1, na, 0); - na = NA(ifp); /* back to the regular object/lock */ - error = 0; - goto done; - } - /* protect against other instances of netmap_transmit, * and userspace invocations of rxsync(). - * XXX could reuse core_lock */ // XXX [Linux] there can be no other instances of netmap_transmit // on this same ring, but we still need this lock to protect @@ -2962,18 +2204,18 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m) mtx_lock(&kring->q_lock); if (kring->nr_hwavail >= lim) { if (netmap_verbose) - D("stack ring %s full\n", ifp->if_xname); + D("stack ring %s full\n", NM_IFPNAME(ifp)); } else { /* compute the insert position */ i = nm_kr_rxpos(kring); slot = &kring->ring->slot[i]; - m_copydata(m, 0, (int)len, BDG_NMB(na->nm_mem, slot)); + m_copydata(m, 0, (int)len, BDG_NMB(na, slot)); slot->len = len; slot->flags = kring->nkr_slot_flags; kring->nr_hwavail++; if (netmap_verbose & NM_VERB_HOST) - D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings); - selwakeuppri(&kring->si, PI_NET); + D("wake up host ring %s %d", NM_IFPNAME(na->ifp), na->num_rx_rings); + na->nm_notify(na, na->num_rx_rings, NR_RX, 0); error = 0; } mtx_unlock(&kring->q_lock); @@ -2994,7 +2236,7 @@ done: /* * netmap_reset() is called by the driver routines when reinitializing * a ring. The driver is in charge of locking to protect the kring. - * If netmap mode is not set just return NULL. + * If native netmap mode is not set just return NULL. */ struct netmap_slot * netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, @@ -3044,6 +2286,7 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, kring->nkr_hwofs = new_hwofs; if (tx == NR_TX) kring->nr_hwavail = lim; + kring->nr_hwreserved = 0; #if 0 // def linux /* XXX check that the mappings are correct */ @@ -3060,137 +2303,60 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, * We do the wakeup here, but the ring is not yet reconfigured. * However, we are under lock so there are no races. */ - selwakeuppri(&kring->si, PI_NET); - selwakeuppri(tx == NR_TX ? 
&na->tx_si : &na->rx_si, PI_NET); + na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY); return kring->ring->slot; } /* - * Grab packets from a kring, move them into the ft structure - * associated to the tx (input) port. Max one instance per port, - * filtered on input (ioctl, poll or XXX). - * Returns the next position in the ring. - */ -static int -nm_bdg_preflush(struct netmap_adapter *na, u_int ring_nr, - struct netmap_kring *kring, u_int end) -{ - struct netmap_ring *ring = kring->ring; - struct nm_bdg_fwd *ft; - u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; - u_int ft_i = 0; /* start from 0 */ - u_int frags = 1; /* how many frags ? */ - struct nm_bridge *b = na->na_bdg; - - /* To protect against modifications to the bridge we acquire a - * shared lock, waiting if we can sleep (if the source port is - * attached to a user process) or with a trylock otherwise (NICs). - */ - ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); - if (na->na_flags & NAF_BDG_MAYSLEEP) - BDG_RLOCK(b); - else if (!BDG_RTRYLOCK(b)) - return 0; - ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); - ft = kring->nkr_ft; - - for (; likely(j != end); j = nm_next(j, lim)) { - struct netmap_slot *slot = &ring->slot[j]; - char *buf; - - ft[ft_i].ft_len = slot->len; - ft[ft_i].ft_flags = slot->flags; - - ND("flags is 0x%x", slot->flags); - /* this slot goes into a list so initialize the link field */ - ft[ft_i].ft_next = NM_FT_NULL; - buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? - (void *)(uintptr_t)slot->ptr : BDG_NMB(na->nm_mem, slot); - prefetch(buf); - ++ft_i; - if (slot->flags & NS_MOREFRAG) { - frags++; - continue; - } - if (unlikely(netmap_verbose && frags > 1)) - RD(5, "%d frags at %d", frags, ft_i - frags); - ft[ft_i - frags].ft_frags = frags; - frags = 1; - if (unlikely((int)ft_i >= bridge_batch)) - ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); - } - if (frags > 1) { - D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); - // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG - ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG; - ft[ft_i - frags].ft_frags = frags - 1; - } - if (ft_i) - ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); - BDG_RUNLOCK(b); - return j; -} - - -/* - * Pass packets from nic to the bridge. - * XXX TODO check locking: this is called from the interrupt - * handler so we should make sure that the interface is not - * disconnected while passing down an interrupt. + * Dispatch rx/tx interrupts to the netmap rings. + * + * "work_done" is non-null on the RX path, NULL for the TX path. + * We rely on the OS to make sure that there is only one active + * instance per queue, and that there is appropriate locking. * - * Note, no user process can access this NIC so we can ignore - * the info in the 'ring'. + * The 'notify' routine depends on what the ring is attached to. + * - for a netmap file descriptor, do a selwakeup on the individual + * waitqueue, plus one on the global one if needed + * - for a switch, call the proper forwarding routine + * - XXX more ? */ -static void -netmap_nic_to_bdg(struct ifnet *ifp, u_int ring_nr) +void +netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) { struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring = &na->rx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, k; - - /* make sure that only one thread is ever in here, - * after which we can unlock. Probably unnecessary XXX. - */ - if (nm_kr_tryget(kring)) - return; - /* fetch packets that have arrived. 
- * XXX maybe do this in a loop ? - */ - if (na->nm_rxsync(ifp, ring_nr, 0)) - goto put_out; - if (kring->nr_hwavail == 0 && netmap_verbose) { - D("how strange, interrupt with no packets on %s", - ifp->if_xname); - goto put_out; - } - k = nm_kr_rxpos(kring); + struct netmap_kring *kring; - j = nm_bdg_preflush(na, ring_nr, kring, k); + q &= NETMAP_RING_MASK; - /* we consume everything, but we cannot update kring directly - * because the nic may have destroyed the info in the NIC ring. - * So we need to call rxsync again to restore it. - */ - ring->cur = j; - ring->avail = 0; - na->nm_rxsync(ifp, ring_nr, 0); + if (netmap_verbose) { + RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); + } -put_out: - nm_kr_put(kring); - return; + if (work_done) { /* RX path */ + if (q >= na->num_rx_rings) + return; // not a physical queue + kring = na->rx_rings + q; + kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? + na->nm_notify(na, q, NR_RX, + (na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0)); + *work_done = 1; /* do not fire napi again */ + } else { /* TX path */ + if (q >= na->num_tx_rings) + return; // not a physical queue + kring = na->tx_rings + q; + na->nm_notify(na, q, NR_TX, + (na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0)); + } } - /* * Default functions to handle rx/tx interrupts from a physical device. * "work_done" is non-null on the RX path, NULL for the TX path. - * We rely on the OS to make sure that there is only one active - * instance per queue, and that there is appropriate locking. * * If the card is not in netmap mode, simply return 0, * so that the caller proceeds with regular processing. + * Otherwise call netmap_common_irq() and return 1. * * If the card is connected to a netmap file descriptor, * do a selwakeup on the individual queue, plus one on the global one @@ -3203,871 +2369,64 @@ put_out: int netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) { - struct netmap_adapter *na; - struct netmap_kring *kring; - + // XXX could we check NAF_NATIVE_ON ? if (!(ifp->if_capenable & IFCAP_NETMAP)) return 0; - q &= NETMAP_RING_MASK; - - if (netmap_verbose) - RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); - na = NA(ifp); - if (na->na_flags & NAF_SKIP_INTR) { + if (NA(ifp)->na_flags & NAF_SKIP_INTR) { ND("use regular interrupt"); return 0; } - if (work_done) { /* RX path */ - if (q >= na->num_rx_rings) - return 0; // not a physical queue - kring = na->rx_rings + q; - kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? - if (na->na_bdg != NULL) { - netmap_nic_to_bdg(ifp, q); - } else { - selwakeuppri(&kring->si, PI_NET); - if (na->num_rx_rings > 1 /* or multiple listeners */ ) - selwakeuppri(&na->rx_si, PI_NET); - } - *work_done = 1; /* do not fire napi again */ - } else { /* TX path */ - if (q >= na->num_tx_rings) - return 0; // not a physical queue - kring = na->tx_rings + q; - selwakeuppri(&kring->si, PI_NET); - if (na->num_tx_rings > 1 /* or multiple listeners */ ) - selwakeuppri(&na->tx_si, PI_NET); - } + netmap_common_irq(ifp, q, work_done); return 1; } -#ifdef linux /* linux-specific routines */ - - -/* - * Remap linux arguments into the FreeBSD call. - * - pwait is the poll table, passed as 'dev'; - * If pwait == NULL someone else already woke up before. We can report - * events but they are filtered upstream. - * If pwait != NULL, then pwait->key contains the list of events. - * - events is computed from pwait as above. 
- * - file is passed as 'td'; - */ -static u_int -linux_netmap_poll(struct file * file, struct poll_table_struct *pwait) -{ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) - int events = POLLIN | POLLOUT; /* XXX maybe... */ -#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) - int events = pwait ? pwait->key : POLLIN | POLLOUT; -#else /* in 3.4.0 field 'key' was renamed to '_key' */ - int events = pwait ? pwait->_key : POLLIN | POLLOUT; -#endif - return netmap_poll((void *)pwait, events, (void *)file); -} - - -static int -linux_netmap_mmap(struct file *f, struct vm_area_struct *vma) -{ - int error = 0; - unsigned long off, va; - vm_ooffset_t pa; - struct netmap_priv_d *priv = f->private_data; - /* - * vma->vm_start: start of mapping user address space - * vma->vm_end: end of the mapping user address space - * vma->vm_pfoff: offset of first page in the device - */ - - // XXX security checks - - error = netmap_get_memory(priv); - ND("get_memory returned %d", error); - if (error) - return -error; - - if ((vma->vm_start & ~PAGE_MASK) || (vma->vm_end & ~PAGE_MASK)) { - ND("vm_start = %lx vm_end = %lx", vma->vm_start, vma->vm_end); - return -EINVAL; - } - - for (va = vma->vm_start, off = vma->vm_pgoff; - va < vma->vm_end; - va += PAGE_SIZE, off++) - { - pa = netmap_mem_ofstophys(priv->np_mref, off << PAGE_SHIFT); - if (pa == 0) - return -EINVAL; - - ND("va %lx pa %p", va, pa); - error = remap_pfn_range(vma, va, pa >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot); - if (error) - return error; - } - return 0; -} - - /* - * This one is probably already protected by the netif lock XXX - */ -static netdev_tx_t -linux_netmap_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - netmap_transmit(dev, skb); - return (NETDEV_TX_OK); -} - - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) // XXX was 37 -#define LIN_IOCTL_NAME .ioctl -int -linux_netmap_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long data /* arg */) -#else -#define LIN_IOCTL_NAME .unlocked_ioctl -long -linux_netmap_ioctl(struct file *file, u_int cmd, u_long data /* arg */) -#endif -{ - int ret; - struct nmreq nmr; - bzero(&nmr, sizeof(nmr)); - - if (cmd == NIOCTXSYNC || cmd == NIOCRXSYNC) { - data = 0; /* no argument required here */ - } - if (data && copy_from_user(&nmr, (void *)data, sizeof(nmr) ) != 0) - return -EFAULT; - ret = netmap_ioctl(NULL, cmd, (caddr_t)&nmr, 0, (void *)file); - if (data && copy_to_user((void*)data, &nmr, sizeof(nmr) ) != 0) - return -EFAULT; - return -ret; -} - - -static int -netmap_release(struct inode *inode, struct file *file) -{ - (void)inode; /* UNUSED */ - if (file->private_data) - netmap_dtor(file->private_data); - return (0); -} - - -static int -linux_netmap_open(struct inode *inode, struct file *file) -{ - struct netmap_priv_d *priv; - (void)inode; /* UNUSED */ - - priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (priv == NULL) - return -ENOMEM; - - file->private_data = priv; - - return (0); -} - - -static struct file_operations netmap_fops = { - .owner = THIS_MODULE, - .open = linux_netmap_open, - .mmap = linux_netmap_mmap, - LIN_IOCTL_NAME = linux_netmap_ioctl, - .poll = linux_netmap_poll, - .release = netmap_release, -}; - - -static struct miscdevice netmap_cdevsw = { /* same name as FreeBSD */ - MISC_DYNAMIC_MINOR, - "netmap", - &netmap_fops, -}; - -static int netmap_init(void); -static void netmap_fini(void); - - -/* Errors have negative values on linux */ -static int linux_netmap_init(void) -{ - return -netmap_init(); -} - 
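(Editorial aside, not part of this patch: as the comment above notes, errors are positive errno values inside the core netmap code, FreeBSD style, and every Linux entry point — linux_netmap_init(), linux_netmap_ioctl(), linux_netmap_mmap() — negates them on the way out. A hypothetical wrapper showing the convention:)

/* Hypothetical illustration of the sign convention used by the
 * Linux glue: the core returns 0 or a positive errno, while a
 * Linux caller expects 0 or -errno. */
static long
nm_linux_call(int (*core_fn)(void *), void *arg)
{
	int ret = core_fn(arg);		/* 0 on success, errno on failure */

	return -ret;			/* flip the sign for Linux callers */
}

(End of aside; the removed module registration and symbol exports follow.)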
-module_init(linux_netmap_init); -module_exit(netmap_fini); -/* export certain symbols to other modules */ -EXPORT_SYMBOL(netmap_attach); // driver attach routines -EXPORT_SYMBOL(netmap_detach); // driver detach routines -EXPORT_SYMBOL(netmap_ring_reinit); // ring init on error -EXPORT_SYMBOL(netmap_buffer_lut); -EXPORT_SYMBOL(netmap_total_buffers); // index check -EXPORT_SYMBOL(netmap_buffer_base); -EXPORT_SYMBOL(netmap_reset); // ring init routines -EXPORT_SYMBOL(netmap_buf_size); -EXPORT_SYMBOL(netmap_rx_irq); // default irq handler -EXPORT_SYMBOL(netmap_no_pendintr); // XXX mitigation - should go away -EXPORT_SYMBOL(netmap_bdg_ctl); // bridge configuration routine -EXPORT_SYMBOL(netmap_bdg_learning); // the default lookup function -EXPORT_SYMBOL(netmap_disable_all_rings); -EXPORT_SYMBOL(netmap_enable_all_rings); - - -MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/"); -MODULE_DESCRIPTION("The netmap packet I/O framework"); -MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */ - -#else /* __FreeBSD__ */ - - -static struct cdevsw netmap_cdevsw = { - .d_version = D_VERSION, - .d_name = "netmap", - .d_open = netmap_open, - .d_mmap_single = netmap_mmap_single, - .d_ioctl = netmap_ioctl, - .d_poll = netmap_poll, - .d_close = netmap_close, -}; -#endif /* __FreeBSD__ */ - -/* - *---- support for virtual bridge ----- - */ - -/* ----- FreeBSD if_bridge hash function ------- */ - -/* - * The following hash function is adapted from "Hash Functions" by Bob Jenkins - * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). + * Module loader and unloader + * + * netmap_init() creates the /dev/netmap device and initializes + * all global variables. Returns 0 on success, errno on failure + * (but there is no chance) * - * http://www.burtleburtle.net/bob/hash/spooky.html + * netmap_fini() destroys everything. */ -#define mix(a, b, c) \ -do { \ - a -= b; a -= c; a ^= (c >> 13); \ - b -= c; b -= a; b ^= (a << 8); \ - c -= a; c -= b; c ^= (b >> 13); \ - a -= b; a -= c; a ^= (c >> 12); \ - b -= c; b -= a; b ^= (a << 16); \ - c -= a; c -= b; c ^= (b >> 5); \ - a -= b; a -= c; a ^= (c >> 3); \ - b -= c; b -= a; b ^= (a << 10); \ - c -= a; c -= b; c ^= (b >> 15); \ -} while (/*CONSTCOND*/0) - -static __inline uint32_t -nm_bridge_rthash(const uint8_t *addr) -{ - uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key - - b += addr[5] << 8; - b += addr[4]; - a += addr[3] << 24; - a += addr[2] << 16; - a += addr[1] << 8; - a += addr[0]; - - mix(a, b, c); -#define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) - return (c & BRIDGE_RTHASH_MASK); -} - -#undef mix - - -static int -bdg_netmap_reg(struct ifnet *ifp, int onoff) -{ - /* the interface is already attached to the bridge, - * so we only need to toggle IFCAP_NETMAP. - */ - if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - } else { - ifp->if_capenable &= ~IFCAP_NETMAP; - } - return 0; -} +static struct cdev *netmap_dev; /* /dev/netmap character device. */ +extern struct cdevsw netmap_cdevsw; -/* - * Lookup function for a learning bridge. 
- * Update the hash table with the source address, - * and then returns the destination port index, and the - * ring in *dst_ring (at the moment, always use ring 0) - */ -u_int -netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, - struct netmap_adapter *na) +void +netmap_fini(void) { - struct nm_hash_ent *ht = na->na_bdg->ht; - uint32_t sh, dh; - u_int dst, mysrc = na->bdg_port; - uint64_t smac, dmac; - - if (buf_len < 14) { - D("invalid buf length %d", buf_len); - return NM_BDG_NOPORT; - } - dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; - smac = le64toh(*(uint64_t *)(buf + 4)); - smac >>= 16; - - /* - * The hash is somewhat expensive, there might be some - * worthwhile optimizations here. - */ - if ((buf[6] & 1) == 0) { /* valid src */ - uint8_t *s = buf+6; - sh = nm_bridge_rthash(s); // XXX hash of source - /* update source port forwarding entry */ - ht[sh].mac = smac; /* XXX expire ? */ - ht[sh].ports = mysrc; - if (netmap_verbose) - D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", - s[0], s[1], s[2], s[3], s[4], s[5], mysrc); - } - dst = NM_BDG_BROADCAST; - if ((buf[0] & 1) == 0) { /* unicast */ - dh = nm_bridge_rthash(buf); // XXX hash of dst - if (ht[dh].mac == dmac) { /* found dst */ - dst = ht[dh].ports; - } - /* XXX otherwise return NM_BDG_UNKNOWN ? */ - } - *dst_ring = 0; - return dst; + // XXX destroy_bridges() ? + if (netmap_dev) + destroy_dev(netmap_dev); + netmap_mem_fini(); + NMG_LOCK_DESTROY(); + printf("netmap: unloaded module.\n"); } - -/* - * This flush routine supports only unicast and broadcast but a large - * number of ports, and lets us replace the learn and dispatch functions. - */ int -nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_adapter *na, - u_int ring_nr) -{ - struct nm_bdg_q *dst_ents, *brddst; - uint16_t num_dsts = 0, *dsts; - struct nm_bridge *b = na->na_bdg; - u_int i, j, me = na->bdg_port; - - /* - * The work area (pointed by ft) is followed by an array of - * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS - * queues per port plus one for the broadcast traffic. - * Then we have an array of destination indexes. - */ - dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); - dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); - - /* first pass: find a destination for each packet in the batch */ - for (i = 0; likely(i < n); i += ft[i].ft_frags) { - uint8_t dst_ring = ring_nr; /* default, same ring as origin */ - uint16_t dst_port, d_i; - struct nm_bdg_q *d; - - ND("slot %d frags %d", i, ft[i].ft_frags); - dst_port = b->nm_bdg_lookup(ft[i].ft_buf, ft[i].ft_len, - &dst_ring, na); - if (netmap_verbose > 255) - RD(5, "slot %d port %d -> %d", i, me, dst_port); - if (dst_port == NM_BDG_NOPORT) - continue; /* this packet is identified to be dropped */ - else if (unlikely(dst_port > NM_BDG_MAXPORTS)) - continue; - else if (dst_port == NM_BDG_BROADCAST) - dst_ring = 0; /* broadcasts always go to ring 0 */ - else if (unlikely(dst_port == me || - !b->bdg_ports[dst_port])) - continue; - - /* get a position in the scratch pad */ - d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; - d = dst_ents + d_i; - - /* append the first fragment to the list */ - if (d->bq_head == NM_FT_NULL) { /* new destination */ - d->bq_head = d->bq_tail = i; - /* remember this position to be scanned later */ - if (dst_port != NM_BDG_BROADCAST) - dsts[num_dsts++] = d_i; - } else { - ft[d->bq_tail].ft_next = i; - d->bq_tail = i; - } - d->bq_len += ft[i].ft_frags; - } - - /* - * Broadcast traffic goes to ring 0 on all destinations. 
- * So we need to add these rings to the list of ports to scan. - * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is - * expensive. We should keep a compact list of active destinations - * so we could shorten this loop. - */ - brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; - if (brddst->bq_head != NM_FT_NULL) { - for (j = 0; likely(j < b->bdg_active_ports); j++) { - uint16_t d_i; - i = b->bdg_port_index[j]; - if (unlikely(i == me)) - continue; - d_i = i * NM_BDG_MAXRINGS; - if (dst_ents[d_i].bq_head == NM_FT_NULL) - dsts[num_dsts++] = d_i; - } - } - - ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts); - /* second pass: scan destinations (XXX will be modular somehow) */ - for (i = 0; i < num_dsts; i++) { - struct ifnet *dst_ifp; - struct netmap_adapter *dst_na; - struct netmap_kring *kring; - struct netmap_ring *ring; - u_int dst_nr, is_vp, lim, j, sent = 0, d_i, next, brd_next; - u_int needed, howmany; - int retry = netmap_txsync_retry; - struct nm_bdg_q *d; - uint32_t my_start = 0, lease_idx = 0; - int nrings; - - d_i = dsts[i]; - ND("second pass %d port %d", i, d_i); - d = dst_ents + d_i; - // XXX fix the division - dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; - /* protect from the lookup function returning an inactive - * destination port - */ - if (unlikely(dst_na == NULL)) - goto cleanup; - if (dst_na->na_flags & NAF_SW_ONLY) - goto cleanup; - dst_ifp = dst_na->ifp; - /* - * The interface may be in !netmap mode in two cases: - * - when na is attached but not activated yet; - * - when na is being deactivated but is still attached. - */ - if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) { - ND("not in netmap mode!"); - goto cleanup; - } - - /* there is at least one either unicast or broadcast packet */ - brd_next = brddst->bq_head; - next = d->bq_head; - /* we need to reserve this many slots. If fewer are - * available, some packets will be dropped. - * Packets may have multiple fragments, so we may not use - * there is a chance that we may not use all of the slots - * we have claimed, so we will need to handle the leftover - * ones when we regain the lock. - */ - needed = d->bq_len + brddst->bq_len; - - is_vp = nma_is_vp(dst_na); - ND(5, "pass 2 dst %d is %x %s", - i, d_i, is_vp ? "virtual" : "nic/host"); - dst_nr = d_i & (NM_BDG_MAXRINGS-1); - if (is_vp) { /* virtual port */ - nrings = dst_na->num_rx_rings; - } else { - nrings = dst_na->num_tx_rings; - } - if (dst_nr >= nrings) - dst_nr = dst_nr % nrings; - kring = is_vp ? &dst_na->rx_rings[dst_nr] : - &dst_na->tx_rings[dst_nr]; - ring = kring->ring; - lim = kring->nkr_num_slots - 1; - -retry: - - /* reserve the buffers in the queue and an entry - * to report completion, and drop lock. - * XXX this might become a helper function. - */ - mtx_lock(&kring->q_lock); - if (kring->nkr_stopped) { - mtx_unlock(&kring->q_lock); - goto cleanup; - } - /* on physical interfaces, do a txsync to recover - * slots for packets already transmitted. - * XXX maybe we could be optimistic and rely on a retry - * in case of failure. 
- */ - if (nma_is_hw(dst_na)) { - dst_na->nm_txsync(dst_ifp, dst_nr, 0); - } - my_start = j = kring->nkr_hwlease; - howmany = nm_kr_space(kring, is_vp); - if (needed < howmany) - howmany = needed; - lease_idx = nm_kr_lease(kring, howmany, is_vp); - mtx_unlock(&kring->q_lock); - - /* only retry if we need more than available slots */ - if (retry && needed <= howmany) - retry = 0; - - /* copy to the destination queue */ - while (howmany > 0) { - struct netmap_slot *slot; - struct nm_bdg_fwd *ft_p, *ft_end; - u_int cnt; - - /* find the queue from which we pick next packet. - * NM_FT_NULL is always higher than valid indexes - * so we never dereference it if the other list - * has packets (and if both are empty we never - * get here). - */ - if (next < brd_next) { - ft_p = ft + next; - next = ft_p->ft_next; - } else { /* insert broadcast */ - ft_p = ft + brd_next; - brd_next = ft_p->ft_next; - } - cnt = ft_p->ft_frags; // cnt > 0 - if (unlikely(cnt > howmany)) - break; /* no more space */ - howmany -= cnt; - if (netmap_verbose && cnt > 1) - RD(5, "rx %d frags to %d", cnt, j); - ft_end = ft_p + cnt; - do { - void *dst, *src = ft_p->ft_buf; - size_t len = (ft_p->ft_len + 63) & ~63; - - slot = &ring->slot[j]; - dst = BDG_NMB(dst_na->nm_mem, slot); - /* round to a multiple of 64 */ - - ND("send %d %d bytes at %s:%d", - i, ft_p->ft_len, dst_ifp->if_xname, j); - if (ft_p->ft_flags & NS_INDIRECT) { - if (copyin(src, dst, len)) { - // invalid user pointer, pretend len is 0 - ft_p->ft_len = 0; - } - } else { - //memcpy(dst, src, len); - pkt_copy(src, dst, (int)len); - } - slot->len = ft_p->ft_len; - slot->flags = (cnt << 8)| NS_MOREFRAG; - j = nm_next(j, lim); - ft_p++; - sent++; - } while (ft_p != ft_end); - slot->flags = (cnt << 8); /* clear flag on last entry */ - /* are we done ? */ - if (next == NM_FT_NULL && brd_next == NM_FT_NULL) - break; - } - { - /* current position */ - uint32_t *p = kring->nkr_leases; /* shorthand */ - uint32_t update_pos; - int still_locked = 1; - - mtx_lock(&kring->q_lock); - if (unlikely(howmany > 0)) { - /* not used all bufs. If i am the last one - * i can recover the slots, otherwise must - * fill them with 0 to mark empty packets. - */ - ND("leftover %d bufs", howmany); - if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { - /* yes i am the last one */ - ND("roll back nkr_hwlease to %d", j); - kring->nkr_hwlease = j; - } else { - while (howmany-- > 0) { - ring->slot[j].len = 0; - ring->slot[j].flags = 0; - j = nm_next(j, lim); - } - } - } - p[lease_idx] = j; /* report I am done */ - - update_pos = is_vp ? nm_kr_rxpos(kring) : ring->cur; - - if (my_start == update_pos) { - /* all slots before my_start have been reported, - * so scan subsequent leases to see if other ranges - * have been completed, and to a selwakeup or txsync. - */ - while (lease_idx != kring->nkr_lease_idx && - p[lease_idx] != NR_NOSLOT) { - j = p[lease_idx]; - p[lease_idx] = NR_NOSLOT; - lease_idx = nm_next(lease_idx, lim); - } - /* j is the new 'write' position. j != my_start - * means there are new buffers to report - */ - if (likely(j != my_start)) { - if (is_vp) { - uint32_t old_avail = kring->nr_hwavail; - - kring->nr_hwavail = (j >= kring->nr_hwcur) ? - j - kring->nr_hwcur : - j + lim + 1 - kring->nr_hwcur; - if (kring->nr_hwavail < old_avail) { - D("avail shrink %d -> %d", - old_avail, kring->nr_hwavail); - } - still_locked = 0; - mtx_unlock(&kring->q_lock); - selwakeuppri(&kring->si, PI_NET); - } else { - ring->cur = j; - /* XXX update avail ? 
*/ - still_locked = 0; - dst_na->nm_txsync(dst_ifp, dst_nr, 0); - mtx_unlock(&kring->q_lock); - - /* retry to send more packets */ - if (nma_is_hw(dst_na) && retry--) - goto retry; - } - } - } - if (still_locked) - mtx_unlock(&kring->q_lock); - } -cleanup: - d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ - d->bq_len = 0; - } - brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ - brddst->bq_len = 0; - return 0; -} - - -/* - * main dispatch routine for the bridge. - * We already know that only one thread is running this. - * we must run nm_bdg_preflush without lock. - */ -static int -bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) -{ - struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring = &na->tx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, k, lim = kring->nkr_num_slots - 1; - - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); - - if (bridge_batch <= 0) { /* testing only */ - j = k; // used all - goto done; - } - if (bridge_batch > NM_BDG_BATCH) - bridge_batch = NM_BDG_BATCH; - - j = nm_bdg_preflush(na, ring_nr, kring, k); - if (j != k) - D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail); - /* k-j modulo ring size is the number of slots processed */ - if (k < j) - k += kring->nkr_num_slots; - kring->nr_hwavail = lim - (k - j); - -done: - kring->nr_hwcur = j; - ring->avail = kring->nr_hwavail; - if (netmap_verbose) - D("%s ring %d flags %d", ifp->if_xname, ring_nr, flags); - return 0; -} - - -/* - * user process reading from a VALE switch. - * Already protected against concurrent calls from userspace, - * but we must acquire the queue's lock to protect against - * writers on the same queue. - */ -static int -bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) -{ - struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring = &na->rx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, lim = kring->nkr_num_slots - 1; - u_int k = ring->cur, resvd = ring->reserved; - int n; - - mtx_lock(&kring->q_lock); - if (k > lim) { - D("ouch dangerous reset!!!"); - n = netmap_ring_reinit(kring); - goto done; - } - - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - - if (j != k) { /* userspace has released some packets. 
*/ - n = k - j; - if (n < 0) - n += kring->nkr_num_slots; - ND("userspace releases %d packets", n); - for (n = 0; likely(j != k); n++) { - struct netmap_slot *slot = &ring->slot[j]; - void *addr = BDG_NMB(na->nm_mem, slot); - - if (addr == netmap_buffer_base) { /* bad buf */ - D("bad buffer index %d, ignore ?", - slot->buf_idx); - } - slot->flags &= ~NS_BUF_CHANGED; - j = nm_next(j, lim); - } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; - } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; - n = 0; -done: - mtx_unlock(&kring->q_lock); - return n; -} - - -static int -bdg_netmap_attach(struct netmap_adapter *arg) -{ - struct netmap_adapter na; - - ND("attaching virtual bridge"); - bzero(&na, sizeof(na)); - - na.ifp = arg->ifp; - na.na_flags = NAF_BDG_MAYSLEEP | NAF_MEM_OWNER; - na.num_tx_rings = arg->num_tx_rings; - na.num_rx_rings = arg->num_rx_rings; - na.num_tx_desc = arg->num_tx_desc; - na.num_rx_desc = arg->num_rx_desc; - na.nm_txsync = bdg_netmap_txsync; - na.nm_rxsync = bdg_netmap_rxsync; - na.nm_register = bdg_netmap_reg; - na.nm_mem = netmap_mem_private_new(arg->ifp->if_xname, - na.num_tx_rings, na.num_tx_desc, - na.num_rx_rings, na.num_rx_desc); - return netmap_attach(&na, na.num_tx_rings); -} - - -static struct cdev *netmap_dev; /* /dev/netmap character device. */ - - -/* - * Module loader. - * - * Create the /dev/netmap device and initialize all global - * variables. - * - * Return 0 on success, errno on failure. - */ -static int netmap_init(void) { - int i, error; + int error; NMG_LOCK_INIT(); error = netmap_mem_init(); - if (error != 0) { - printf("netmap: unable to initialize the memory allocator.\n"); - return (error); - } - printf("netmap: loaded module\n"); + if (error != 0) + goto fail; + /* XXX could use make_dev_credv() to get error number */ netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, "netmap"); + if (!netmap_dev) + goto fail; - bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */ - for (i = 0; i < NM_BRIDGES; i++) - BDG_RWINIT(&nm_bridges[i]); - return (error); -} - - -/* - * Module unloader. - * - * Free all the memory, and destroy the ``/dev/netmap`` device. - */ -static void -netmap_fini(void) -{ - destroy_dev(netmap_dev); - netmap_mem_fini(); - NMG_LOCK_DESTROY(); - printf("netmap: unloaded module.\n"); -} - - -#ifdef __FreeBSD__ -/* - * Kernel entry point. - * - * Initialize/finalize the module and return. - * - * Return 0 on success, errno on failure. - */ -static int -netmap_loader(__unused struct module *module, int event, __unused void *arg) -{ - int error = 0; - - switch (event) { - case MOD_LOAD: - error = netmap_init(); - break; - - case MOD_UNLOAD: - netmap_fini(); - break; - - default: - error = EOPNOTSUPP; - break; - } - - return (error); + netmap_init_bridges(); + printf("netmap: loaded module\n"); + return (0); +fail: + netmap_fini(); + return (EINVAL); /* may be incorrect */ } - - -DEV_MODULE(netmap, netmap_loader, NULL); -#endif /* __FreeBSD__ */ diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c new file mode 100644 index 000000000000..c2814146d2ef --- /dev/null +++ b/sys/dev/netmap/netmap_freebsd.c @@ -0,0 +1,410 @@ +/* + * Copyright (C) 2013 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* $FreeBSD$ */ + +#include <sys/types.h> +#include <sys/module.h> +#include <sys/errno.h> +#include <sys/param.h> /* defines used in kernel.h */ +#include <sys/kernel.h> /* types used in module initialization */ +#include <sys/conf.h> /* DEV_MODULE */ + +#include <sys/rwlock.h> + +#include <vm/vm.h> /* vtophys */ +#include <vm/pmap.h> /* vtophys */ +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/uma.h> + + +#include <sys/malloc.h> +#include <sys/socket.h> /* sockaddrs */ +#include <sys/selinfo.h> +#include <net/if.h> +#include <net/if_var.h> +#include <machine/bus.h> /* bus_dmamap_* */ + +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> +#include <dev/netmap/netmap_mem2.h> + + +/* ======================== FREEBSD-SPECIFIC ROUTINES ================== */ + +/* + * Intercept the rx routine in the standard device driver. + * Second argument is non-zero to intercept, 0 to restore + */ +int +netmap_catch_rx(struct netmap_adapter *na, int intercept) +{ + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct ifnet *ifp = na->ifp; + + if (intercept) { + if (gna->save_if_input) { + D("cannot intercept again"); + return EINVAL; /* already set */ + } + gna->save_if_input = ifp->if_input; + ifp->if_input = generic_rx_handler; + } else { + if (!gna->save_if_input){ + D("cannot restore"); + return EINVAL; /* not saved */ + } + ifp->if_input = gna->save_if_input; + gna->save_if_input = NULL; + } + + return 0; +} + +/* + * Intercept the packet steering routine in the tx path, + * so that we can decide which queue is used for an mbuf. + * Second argument is non-zero to intercept, 0 to restore. + * + * XXX see if FreeBSD has such a mechanism + */ +void +netmap_catch_packet_steering(struct netmap_generic_adapter *na, int enable) +{ + if (enable) { + } else { + } +} + +/* Transmit routine used by generic_netmap_txsync(). Returns 0 on success + * and non-zero on error (which may be packet drops or other errors). + * addr and len identify the netmap buffer, m is the (preallocated) + * mbuf to use for transmissions. + * + * We should add a reference to the mbuf so the m_freem() at the end + * of the transmission does not consume resources. 
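netmap_catch_rx() above is a plain save-and-swap of the driver's input hook. The same pattern in miniature; struct dev, handler_t and both handlers are made up for illustration:

#include <stdio.h>

typedef void (*handler_t)(const char *pkt);

struct dev {
	handler_t input;	/* like ifp->if_input */
	handler_t saved_input;	/* like gna->save_if_input */
};

static void stack_input(const char *pkt) { printf("stack: %s\n", pkt); }
static void netmap_input(const char *pkt) { printf("netmap: %s\n", pkt); }

/* mirror of netmap_catch_rx(): non-zero intercepts, 0 restores */
static int
catch_rx(struct dev *d, int intercept)
{
	if (intercept) {
		if (d->saved_input)
			return -1;	/* already intercepted */
		d->saved_input = d->input;
		d->input = netmap_input;
	} else {
		if (!d->saved_input)
			return -1;	/* nothing to restore */
		d->input = d->saved_input;
		d->saved_input = NULL;
	}
	return 0;
}

int
main(void)
{
	struct dev d = { stack_input, NULL };

	d.input("frame 1");	/* goes to the stack */
	catch_rx(&d, 1);
	d.input("frame 2");	/* stolen by netmap */
	catch_rx(&d, 0);
	d.input("frame 3");	/* back to the stack */
	return 0;
}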
+ * + * On FreeBSD, and on multiqueue cards, we can force the queue using + * if ((m->m_flags & M_FLOWID) != 0) + * i = m->m_pkthdr.flowid % adapter->num_queues; + * else + * i = curcpu % adapter->num_queues; + * + */ +int +generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, + void *addr, u_int len, u_int ring_nr) +{ + int ret; + + m->m_len = m->m_pkthdr.len = 0; + + // copy data to the mbuf + m_copyback(m, 0, len, addr); + + // inc refcount. We are alone, so we can skip the atomic + atomic_fetchadd_int(m->m_ext.ref_cnt, 1); + m->m_flags |= M_FLOWID; + m->m_pkthdr.flowid = ring_nr; + m->m_pkthdr.rcvif = ifp; /* used for tx notification */ + ret = ifp->if_transmit(ifp, m); + return ret; +} + +/* + * The following two functions are empty until we have a generic + * way to extract the info from the ifp + */ +int +generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) +{ + D("called"); + return 0; +} + +void +generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) +{ + D("called"); + *txq = 1; + *rxq = 1; +} + +void netmap_mitigation_init(struct netmap_generic_adapter *na) +{ + ND("called"); + na->mit_pending = 0; +} + + +void netmap_mitigation_start(struct netmap_generic_adapter *na) +{ + ND("called"); +} + +void netmap_mitigation_restart(struct netmap_generic_adapter *na) +{ + ND("called"); +} + +int netmap_mitigation_active(struct netmap_generic_adapter *na) +{ + ND("called"); + return 0; +} + +void netmap_mitigation_cleanup(struct netmap_generic_adapter *na) +{ + ND("called"); +} + +/* + * In order to track whether pages are still mapped, we hook into + * the standard cdev_pager and intercept the constructor and + * destructor. + */ + +struct netmap_vm_handle_t { + struct cdev *dev; + struct netmap_priv_d *priv; +}; + +static int +netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, + vm_ooffset_t foff, struct ucred *cred, u_short *color) +{ + struct netmap_vm_handle_t *vmh = handle; + D("handle %p size %jd prot %d foff %jd", + handle, (intmax_t)size, prot, (intmax_t)foff); + dev_ref(vmh->dev); + return 0; +} + + +static void +netmap_dev_pager_dtor(void *handle) +{ + struct netmap_vm_handle_t *vmh = handle; + struct cdev *dev = vmh->dev; + struct netmap_priv_d *priv = vmh->priv; + D("handle %p", handle); + netmap_dtor(priv); + free(vmh, M_DEVBUF); + dev_rel(dev); +} + +static int +netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, + int prot, vm_page_t *mres) +{ + struct netmap_vm_handle_t *vmh = object->handle; + struct netmap_priv_d *priv = vmh->priv; + vm_paddr_t paddr; + vm_page_t page; + vm_memattr_t memattr; + vm_pindex_t pidx; + + ND("object %p offset %jd prot %d mres %p", + object, (intmax_t)offset, prot, mres); + memattr = object->memattr; + pidx = OFF_TO_IDX(offset); + paddr = netmap_mem_ofstophys(priv->np_mref, offset); + if (paddr == 0) + return VM_PAGER_FAIL; + + if (((*mres)->flags & PG_FICTITIOUS) != 0) { + /* + * If the passed in result page is a fake page, update it with + * the new physical address. + */ + page = *mres; + vm_page_updatefake(page, paddr, memattr); + } else { + /* + * Replace the passed in reqpage page with our own fake page and + * free up the all of the original pages. 
+ */ +#ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */ +#define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK +#define VM_OBJECT_WLOCK VM_OBJECT_LOCK +#endif /* VM_OBJECT_WUNLOCK */ + + VM_OBJECT_WUNLOCK(object); + page = vm_page_getfake(paddr, memattr); + VM_OBJECT_WLOCK(object); + vm_page_lock(*mres); + vm_page_free(*mres); + vm_page_unlock(*mres); + *mres = page; + vm_page_insert(page, object, pidx); + } + page->valid = VM_PAGE_BITS_ALL; + return (VM_PAGER_OK); +} + + +static struct cdev_pager_ops netmap_cdev_pager_ops = { + .cdev_pg_ctor = netmap_dev_pager_ctor, + .cdev_pg_dtor = netmap_dev_pager_dtor, + .cdev_pg_fault = netmap_dev_pager_fault, +}; + + +static int +netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, + vm_size_t objsize, vm_object_t *objp, int prot) +{ + int error; + struct netmap_vm_handle_t *vmh; + struct netmap_priv_d *priv; + vm_object_t obj; + + D("cdev %p foff %jd size %jd objp %p prot %d", cdev, + (intmax_t )*foff, (intmax_t )objsize, objp, prot); + + vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (vmh == NULL) + return ENOMEM; + vmh->dev = cdev; + + NMG_LOCK(); + error = devfs_get_cdevpriv((void**)&priv); + if (error) + goto err_unlock; + vmh->priv = priv; + priv->np_refcount++; + NMG_UNLOCK(); + + error = netmap_get_memory(priv); + if (error) + goto err_deref; + + obj = cdev_pager_allocate(vmh, OBJT_DEVICE, + &netmap_cdev_pager_ops, objsize, prot, + *foff, NULL); + if (obj == NULL) { + D("cdev_pager_allocate failed"); + error = EINVAL; + goto err_deref; + } + + *objp = obj; + return 0; + +err_deref: + NMG_LOCK(); + priv->np_refcount--; +err_unlock: + NMG_UNLOCK(); +// err: + free(vmh, M_DEVBUF); + return error; +} + + +// XXX can we remove this ? +static int +netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) +{ + if (netmap_verbose) + D("dev %p fflag 0x%x devtype %d td %p", + dev, fflag, devtype, td); + return 0; +} + + +static int +netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +{ + struct netmap_priv_d *priv; + int error; + + (void)dev; + (void)oflags; + (void)devtype; + (void)td; + + // XXX wait or nowait ? + priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (priv == NULL) + return ENOMEM; + + error = devfs_set_cdevpriv(priv, netmap_dtor); + if (error) + return error; + + priv->np_refcount = 1; + + return 0; +} + + +struct cdevsw netmap_cdevsw = { + .d_version = D_VERSION, + .d_name = "netmap", + .d_open = netmap_open, + .d_mmap_single = netmap_mmap_single, + .d_ioctl = netmap_ioctl, + .d_poll = netmap_poll, + .d_close = netmap_close, +}; + + +/* + * Kernel entry point. + * + * Initialize/finalize the module and return. + * + * Return 0 on success, errno on failure. + */ +static int +netmap_loader(__unused struct module *module, int event, __unused void *arg) +{ + int error = 0; + + switch (event) { + case MOD_LOAD: + error = netmap_init(); + break; + + case MOD_UNLOAD: + netmap_fini(); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + + +DEV_MODULE(netmap, netmap_loader, NULL); diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c new file mode 100644 index 000000000000..2c42db3f8862 --- /dev/null +++ b/sys/dev/netmap/netmap_generic.c @@ -0,0 +1,818 @@ +/* + * Copyright (C) 2013 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This module implements netmap support on top of standard, + * unmodified device drivers. + * + * A NIOCREGIF request is handled here if the device does not + * have native support. TX and RX rings are emulated as follows: + * + * NIOCREGIF + * We preallocate a block of TX mbufs (roughly as many as + * tx descriptors; the number is not critical) to speed up + * operation during transmissions. The refcount on most of + * these buffers is artificially bumped up so we can recycle + * them more easily. Also, the destructor is intercepted + * so we use it as an interrupt notification to wake up + * processes blocked on a poll(). + * + * For each receive ring we allocate one "struct mbq" + * (an mbuf tailq plus a spinlock). We intercept packets + * (through if_input) + * on the receive path and put them in the mbq from which + * netmap receive routines can grab them. + * + * TX: + * in the generic_txsync() routine, netmap buffers are copied + * (or linked, in a future) to the preallocated mbufs + * and pushed to the transmit queue. Some of these mbufs + * (those with NS_REPORT, or otherwise every half ring) + * have the refcount=1, others have refcount=2. + * When the destructor is invoked, we take that as + * a notification that all mbufs up to that one in + * the specific ring have been completed, and generate + * the equivalent of a transmit interrupt. 
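The refcount trick described for the TX path can be modeled concisely. In this toy sketch a plain counter stands in for m_ext.ref_cnt and the recycle step stands in for the mbuf zone; the point is that the artificially bumped reference keeps the buffer available for reuse, and dropping the last reference doubles as the transmit-completion signal:

#include <stdio.h>

struct toy_mbuf {
	int refcnt;				/* stands in for *m_ext.ref_cnt */
	void (*dtor)(struct toy_mbuf *);	/* stands in for m_ext.ext_free */
};

static void
tx_complete(struct toy_mbuf *m)
{
	/* the real counterpart is generic_mbuf_destructor(), which
	 * wakes up pollers instead of printing */
	printf("tx completion fired for %p\n", (void *)m);
}

static void
toy_free(struct toy_mbuf *m)			/* stands in for m_freem() */
{
	if (--m->refcnt > 0)
		return;				/* someone still holds it */
	if (m->dtor)
		m->dtor(m);
	m->refcnt = 1;				/* recycle instead of destroying */
}

int
main(void)
{
	struct toy_mbuf m = { 1, NULL };

	m.refcnt++;		/* netmap's artificial extra reference */
	m.dtor = tx_complete;	/* SET_MBUF_DESTRUCTOR() equivalent */

	toy_free(&m);		/* netmap arms the event, drops its ref: 2 -> 1 */
	toy_free(&m);		/* driver completes: last ref, destructor fires */
	return 0;
}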
+ * + * RX: + * + */ + +#ifdef __FreeBSD__ + +#include <sys/cdefs.h> /* prerequisite */ +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/malloc.h> +#include <sys/lock.h> /* PROT_EXEC */ +#include <sys/rwlock.h> +#include <sys/socket.h> /* sockaddrs */ +#include <sys/selinfo.h> +#include <net/if.h> +#include <net/if_var.h> +#include <machine/bus.h> /* bus_dmamap_* in netmap_kern.h */ + +// XXX temporary - D() defined here +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> +#include <dev/netmap/netmap_mem2.h> + +#define rtnl_lock() D("rtnl_lock called"); +#define rtnl_unlock() D("rtnl_lock called"); +#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) +#define smp_mb() + +/* + * mbuf wrappers + */ + +/* + * we allocate an EXT_PACKET + */ +#define netmap_get_mbuf(len) m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR|M_NOFREE) + +/* mbuf destructor, also need to change the type to EXT_EXTREF, + * add an M_NOFREE flag, and then clear the flag and + * chain into uma_zfree(zone_pack, mf) + * (or reinstall the buffer ?) + */ +#define SET_MBUF_DESTRUCTOR(m, fn) do { \ + (m)->m_ext.ext_free = (void *)fn; \ + (m)->m_ext.ext_type = EXT_EXTREF; \ + } while (0) + + +#define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *(m)->m_ext.ref_cnt : -1) + + + +#else /* linux */ + +#include "bsd_glue.h" + +#include <linux/rtnetlink.h> /* rtnl_[un]lock() */ +#include <linux/ethtool.h> /* struct ethtool_ops, get_ringparam */ +#include <linux/hrtimer.h> + +//#define RATE /* Enables communication statistics. */ + +//#define REG_RESET + +#endif /* linux */ + + +/* Common headers. */ +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> +#include <dev/netmap/netmap_mem2.h> + + + +/* ======================== usage stats =========================== */ + +#ifdef RATE +#define IFRATE(x) x +struct rate_stats { + unsigned long txpkt; + unsigned long txsync; + unsigned long txirq; + unsigned long rxpkt; + unsigned long rxirq; + unsigned long rxsync; +}; + +struct rate_context { + unsigned refcount; + struct timer_list timer; + struct rate_stats new; + struct rate_stats old; +}; + +#define RATE_PRINTK(_NAME_) \ + printk( #_NAME_ " = %lu Hz\n", (cur._NAME_ - ctx->old._NAME_)/RATE_PERIOD); +#define RATE_PERIOD 2 +static void rate_callback(unsigned long arg) +{ + struct rate_context * ctx = (struct rate_context *)arg; + struct rate_stats cur = ctx->new; + int r; + + RATE_PRINTK(txpkt); + RATE_PRINTK(txsync); + RATE_PRINTK(txirq); + RATE_PRINTK(rxpkt); + RATE_PRINTK(rxsync); + RATE_PRINTK(rxirq); + printk("\n"); + + ctx->old = cur; + r = mod_timer(&ctx->timer, jiffies + + msecs_to_jiffies(RATE_PERIOD * 1000)); + if (unlikely(r)) + D("[v1000] Error: mod_timer()"); +} + +static struct rate_context rate_ctx; + +#else /* !RATE */ +#define IFRATE(x) +#endif /* !RATE */ + + +/* =============== GENERIC NETMAP ADAPTER SUPPORT ================= */ +#define GENERIC_BUF_SIZE netmap_buf_size /* Size of the mbufs in the Tx pool. */ + +/* + * Wrapper used by the generic adapter layer to notify + * the poller threads. Differently from netmap_rx_irq(), we check + * only IFCAP_NETMAP instead of NAF_NATIVE_ON to enable the irq. + */ +static void +netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done) +{ + if (unlikely(!(ifp->if_capenable & IFCAP_NETMAP))) + return; + + netmap_common_irq(ifp, q, work_done); +} + + +/* Enable/disable netmap mode for a generic network interface. 
*/ +int generic_netmap_register(struct netmap_adapter *na, int enable) +{ + struct ifnet *ifp = na->ifp; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct mbuf *m; + int error; + int i, r; + + if (!na) + return EINVAL; + +#ifdef REG_RESET + error = ifp->netdev_ops->ndo_stop(ifp); + if (error) { + return error; + } +#endif /* REG_RESET */ + + if (enable) { /* Enable netmap mode. */ + /* Initialize the rx queue, as generic_rx_handler() can + * be called as soon as netmap_catch_rx() returns. + */ + for (r=0; r<na->num_rx_rings; r++) { + mbq_safe_init(&na->rx_rings[r].rx_queue); + na->rx_rings[r].nr_ntc = 0; + } + + /* Init the mitigation timer. */ + netmap_mitigation_init(gna); + + /* + * Preallocate packet buffers for the tx rings. + */ + for (r=0; r<na->num_tx_rings; r++) { + na->tx_rings[r].nr_ntc = 0; + na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (!na->tx_rings[r].tx_pool) { + D("tx_pool allocation failed"); + error = ENOMEM; + goto free_tx_pool; + } + for (i=0; i<na->num_tx_desc; i++) { + m = netmap_get_mbuf(GENERIC_BUF_SIZE); + if (!m) { + D("tx_pool[%d] allocation failed", i); + error = ENOMEM; + goto free_mbufs; + } + na->tx_rings[r].tx_pool[i] = m; + } + } + rtnl_lock(); + /* Prepare to intercept incoming traffic. */ + error = netmap_catch_rx(na, 1); + if (error) { + D("netdev_rx_handler_register() failed"); + goto register_handler; + } + ifp->if_capenable |= IFCAP_NETMAP; + + /* Make netmap control the packet steering. */ + netmap_catch_packet_steering(gna, 1); + + rtnl_unlock(); + +#ifdef RATE + if (rate_ctx.refcount == 0) { + D("setup_timer()"); + memset(&rate_ctx, 0, sizeof(rate_ctx)); + setup_timer(&rate_ctx.timer, &rate_callback, (unsigned long)&rate_ctx); + if (mod_timer(&rate_ctx.timer, jiffies + msecs_to_jiffies(1500))) { + D("Error: mod_timer()"); + } + } + rate_ctx.refcount++; +#endif /* RATE */ + + } else { /* Disable netmap mode. */ + rtnl_lock(); + + ifp->if_capenable &= ~IFCAP_NETMAP; + + /* Release packet steering control. */ + netmap_catch_packet_steering(gna, 0); + + /* Do not intercept packets on the rx path. */ + netmap_catch_rx(na, 0); + + rtnl_unlock(); + + /* Free the mbufs going to the netmap rings */ + for (r=0; r<na->num_rx_rings; r++) { + mbq_safe_purge(&na->rx_rings[r].rx_queue); + mbq_safe_destroy(&na->rx_rings[r].rx_queue); + } + + netmap_mitigation_cleanup(gna); + + for (r=0; r<na->num_tx_rings; r++) { + for (i=0; i<na->num_tx_desc; i++) { + m_freem(na->tx_rings[r].tx_pool[i]); + } + free(na->tx_rings[r].tx_pool, M_DEVBUF); + } + +#ifdef RATE + if (--rate_ctx.refcount == 0) { + D("del_timer()"); + del_timer(&rate_ctx.timer); + } +#endif + } + +#ifdef REG_RESET + error = ifp->netdev_ops->ndo_open(ifp); + if (error) { + goto alloc_tx_pool; + } +#endif + + return 0; + +register_handler: + rtnl_unlock(); +free_tx_pool: + r--; + i = na->num_tx_desc; /* Useless, but just to stay safe. */ +free_mbufs: + i--; + for (; r>=0; r--) { + for (; i>=0; i--) { + m_freem(na->tx_rings[r].tx_pool[i]); + } + free(na->tx_rings[r].tx_pool, M_DEVBUF); + i = na->num_tx_desc - 1; + } + + return error; +} + +/* + * Callback invoked when the device driver frees an mbuf used + * by netmap to transmit a packet. This usually happens when + * the NIC notifies the driver that transmission is completed. 
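The error unwind at the bottom of generic_netmap_register() releases whatever part of the two-level tx_pool allocation succeeded. The same goto-unwind idiom in a self-contained form, using NULL-safe frees instead of index juggling; ring and buffer counts are arbitrary:

#include <stdlib.h>

/* allocate nrings arrays of nslots buffers, undoing everything on failure */
static void **
alloc_pools(int nrings, int nslots, size_t bufsz)
{
	void **pools;
	int r, i;

	pools = calloc(nrings, sizeof(void *));
	if (pools == NULL)
		return NULL;
	for (r = 0; r < nrings; r++) {
		void **pool = calloc(nslots, sizeof(void *));

		if (pool == NULL)
			goto fail;
		pools[r] = pool;
		for (i = 0; i < nslots; i++) {
			pool[i] = malloc(bufsz);
			if (pool[i] == NULL)
				goto fail;
		}
	}
	return pools;

fail:	/* free every ring allocated so far; free(NULL) is a no-op */
	for (r = 0; r < nrings && pools[r] != NULL; r++) {
		for (i = 0; i < nslots; i++)
			free(((void **)pools[r])[i]);
		free(pools[r]);
	}
	free(pools);
	return NULL;
}

int
main(void)
{
	void **p = alloc_pools(4, 256, 2048);	/* leaks on success; demo only */

	return (p == NULL);
}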
+ */
+static void
+generic_mbuf_destructor(struct mbuf *m)
+{
+	if (netmap_verbose)
+		D("Tx irq (%p) queue %d", m, MBUF_TXQ(m));
+	netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL);
+#ifdef __FreeBSD__
+	m->m_ext.ext_type = EXT_PACKET;
+	m->m_ext.ext_free = NULL;
+	if (*(m->m_ext.ref_cnt) == 0)
+		*(m->m_ext.ref_cnt) = 1;
+	uma_zfree(zone_pack, m);
+#endif /* __FreeBSD__ */
+	IFRATE(rate_ctx.new.txirq++);
+}
+
+/* Record completed transmissions and update hwavail.
+ *
+ * nr_ntc is the oldest tx buffer not yet completed
+ * (same as nr_hwavail + nr_hwcur + 1),
+ * nr_hwcur is the first unsent buffer.
+ * When cleaning, we try to recover buffers between nr_ntc and nr_hwcur.
+ */
+static int
+generic_netmap_tx_clean(struct netmap_kring *kring)
+{
+	u_int num_slots = kring->nkr_num_slots;
+	u_int ntc = kring->nr_ntc;
+	u_int hwcur = kring->nr_hwcur;
+	u_int n = 0;
+	struct mbuf **tx_pool = kring->tx_pool;
+
+	while (ntc != hwcur) { /* buffers not completed */
+		struct mbuf *m = tx_pool[ntc];
+
+		if (unlikely(m == NULL)) {
+			/* try to replenish the entry */
+			tx_pool[ntc] = m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+			if (unlikely(m == NULL)) {
+				D("mbuf allocation failed, XXX error");
+				// XXX how do we proceed ? break ?
+				return -ENOMEM;
+			}
+		} else if (GET_MBUF_REFCNT(m) != 1) {
+			break; /* This mbuf is still busy: its refcnt is 2. */
+		}
+		if (unlikely(++ntc == num_slots)) {
+			ntc = 0;
+		}
+		n++;
+	}
+	kring->nr_ntc = ntc;
+	kring->nr_hwavail += n;
+	ND("tx completed [%d] -> hwavail %d", n, kring->nr_hwavail);
+
+	return n;
+}
+
+
+/*
+ * We have pending packets in the driver between nr_ntc and j.
+ * Compute a position in the middle, to be used to generate
+ * a notification.
+ */
+static inline u_int
+generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur)
+{
+	u_int n = kring->nkr_num_slots;
+	u_int ntc = kring->nr_ntc;
+	u_int e;
+
+	if (hwcur >= ntc) {
+		e = (hwcur + ntc) / 2;
+	} else { /* wrap around */
+		e = (hwcur + n + ntc) / 2;
+		if (e >= n) {
+			e -= n;
+		}
+	}
+
+	if (unlikely(e >= n)) {
+		D("This cannot happen");
+		e = 0;
+	}
+
+	return e;
+}
+
+/*
+ * We have pending packets in the driver between nr_ntc and hwcur.
+ * Schedule a notification approximately in the middle of the two.
+ * There is a race but this is only called within txsync which does
+ * a double check.
+ */
+static void
+generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
+{
+	struct mbuf *m;
+	u_int e;
+
+	if (kring->nr_ntc == hwcur) {
+		return;
+	}
+	e = generic_tx_event_middle(kring, hwcur);
+
+	m = kring->tx_pool[e];
+	if (m == NULL) {
+		/* This can happen if there is already an event on the netmap
+		   slot 'e': there is nothing to do. */
+		return;
+	}
+	ND("Event at %d mbuf %p refcnt %d", e, m, GET_MBUF_REFCNT(m));
+	kring->tx_pool[e] = NULL;
+	SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor);
+
+	// XXX wmb() ?
+	/* Decrement the refcount and free it if we have the last one. */
+	m_freem(m);
+	smp_mb();
+}
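A quick worked case of generic_tx_event_middle(): with 256 slots, nr_ntc = 200 and hwcur = 40, the wrap-around branch gives e = (40 + 256 + 200) / 2 = 248, roughly halfway through the 96 pending slots. A throwaway check of the same arithmetic (ring size and indices arbitrary):

#include <stdio.h>

/* same arithmetic as generic_tx_event_middle() */
static unsigned
event_middle(unsigned nslots, unsigned ntc, unsigned hwcur)
{
	unsigned e;

	if (hwcur >= ntc) {
		e = (hwcur + ntc) / 2;
	} else {			/* wrap around */
		e = (hwcur + nslots + ntc) / 2;
		if (e >= nslots)
			e -= nslots;
	}
	return e;
}

int
main(void)
{
	printf("%u\n", event_middle(256, 200, 40));	/* 248 */
	printf("%u\n", event_middle(256, 10, 100));	/* 55 */
	return 0;
}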
+
+
+/*
+ * generic_netmap_txsync() transforms netmap buffers into mbufs
+ * and passes them to the standard device driver
+ * (ndo_start_xmit() or ifp->if_transmit() ).
+ * On linux this is not done directly, but using dev_queue_xmit(),
+ * since it implements the TX flow control (and takes some locks).
+ */
+static int
+generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+	struct ifnet *ifp = na->ifp;
+	struct netmap_kring *kring = &na->tx_rings[ring_nr];
+	struct netmap_ring *ring = kring->ring;
+	u_int j, k, num_slots = kring->nkr_num_slots;
+	int new_slots, ntx;
+
+	IFRATE(rate_ctx.new.txsync++);
+
+	// TODO: handle the case of mbuf allocation failure
+	/* first, reclaim completed buffers */
+	generic_netmap_tx_clean(kring);
+
+	/* Take a copy of ring->cur now, and never read it again. */
+	k = ring->cur;
+	if (unlikely(k >= num_slots)) {
+		return netmap_ring_reinit(kring);
+	}
+
+	rmb();
+	j = kring->nr_hwcur;
+	/*
+	 * 'new_slots' counts how many new slots have been added:
+	 * everything from hwcur to cur, excluding reserved ones, if any.
+	 * nr_hwreserved starts from hwcur and counts how many slots were
+	 * not sent to the NIC from the previous round.
+	 */
+	new_slots = k - j - kring->nr_hwreserved;
+	if (new_slots < 0) {
+		new_slots += num_slots;
+	}
+	ntx = 0;
+	if (j != k) {
+		/* Process new packets to send:
+		 * j is the current index in the netmap ring.
+		 */
+		while (j != k) {
+			struct netmap_slot *slot = &ring->slot[j]; /* Current slot in the netmap ring */
+			void *addr = NMB(slot);
+			u_int len = slot->len;
+			struct mbuf *m;
+			int tx_ret;
+
+			if (unlikely(addr == netmap_buffer_base || len > NETMAP_BUF_SIZE)) {
+				return netmap_ring_reinit(kring);
+			}
+			/* Take an mbuf from the tx pool and copy in the user packet. */
+			m = kring->tx_pool[j];
+			if (unlikely(!m)) {
+				RD(5, "This should never happen");
+				kring->tx_pool[j] = m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+				if (unlikely(m == NULL)) {
+					D("mbuf allocation failed");
+					break;
+				}
+			}
+			/* XXX we should ask notifications when NS_REPORT is set,
+			 * or roughly every half frame. We can optimize this
+			 * by lazily requesting notifications only when a
+			 * transmission fails. Probably the best way is to
+			 * break on failures and set notifications when
+			 * ring->avail == 0 || j != k
+			 */
+			tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr);
+			if (unlikely(tx_ret)) {
+				RD(5, "start_xmit failed: err %d [%u,%u,%u,%u]",
+						tx_ret, kring->nr_ntc, j, k, kring->nr_hwavail);
+				/*
+				 * No room for this mbuf in the device driver.
+				 * Request a notification FOR A PREVIOUS MBUF,
+				 * then call generic_netmap_tx_clean(kring) to do the
+				 * double check and see if we can free more buffers.
+				 * If there is space continue, else break;
+				 * NOTE: the double check is necessary if the problem
+				 * occurs in the txsync call after selrecord().
+				 * Also, we need some way to tell the caller that not
+				 * all buffers were queued onto the device (this was
+				 * not a problem with native netmap driver where space
+				 * is preallocated). The bridge has a similar problem
+				 * and we solve it there by dropping the excess packets.
+				 */
+				generic_set_tx_event(kring, j);
+				if (generic_netmap_tx_clean(kring)) { /* space now available */
+					continue;
+				} else {
+					break;
+				}
+			}
+			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
+			if (unlikely(++j == num_slots))
+				j = 0;
+			ntx++;
+		}
+
+		/* Update hwcur to the next slot to transmit. */
+		kring->nr_hwcur = j;
+
+		/*
+		 * Report all new slots as unavailable, even those not sent.
+		 * We account for them with hwreserved, so that
+		 * nr_hwreserved =:= cur - nr_hwcur
+		 */
+		kring->nr_hwavail -= new_slots;
+		kring->nr_hwreserved = k - j;
+		if (kring->nr_hwreserved < 0) {
+			kring->nr_hwreserved += num_slots;
+		}
+
+		IFRATE(rate_ctx.new.txpkt += ntx);
+
+		if (!kring->nr_hwavail) {
+			/* No more available slots? Set a notification event
+			 * on a netmap slot that will be cleaned in the future.
+			 * No doublecheck is performed, since txsync() will be
+			 * called twice by netmap_poll().
+			 */
+			generic_set_tx_event(kring, j);
+		}
+		ND("tx #%d, hwavail = %d", ntx, kring->nr_hwavail);
+	}
+
+	/* Synchronize the user's view to the kernel view. */
+	ring->avail = kring->nr_hwavail;
+	ring->reserved = kring->nr_hwreserved;
+
+	return 0;
+}
+
+/*
+ * This handler is registered (through netmap_catch_rx())
+ * within the attached network interface
+ * in the RX subsystem, so that every mbuf passed up by
+ * the driver can be stolen before it reaches the network stack.
+ * Stolen packets are put in a queue from which the
+ * generic_netmap_rxsync() callback can extract them.
+ */
+void generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
+{
+	struct netmap_adapter *na = NA(ifp);
+	struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
+	u_int work_done;
+	u_int rr = 0; // receive ring number
+
+	ND("called");
+	/* limit the size of the queue */
+	if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) {
+		m_freem(m);
+	} else {
+		mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m);
+	}
+
+	if (netmap_generic_mit < 32768) {
+		/* no rx mitigation, pass notification up */
+		netmap_generic_irq(na->ifp, rr, &work_done);
+		IFRATE(rate_ctx.new.rxirq++);
+	} else {
+		/* same as send combining, filter notification if there is a
+		 * pending timer, otherwise pass it up and start a timer.
+		 */
+		if (likely(netmap_mitigation_active(gna))) {
+			/* Record that there is some pending work. */
+			gna->mit_pending = 1;
+		} else {
+			netmap_generic_irq(na->ifp, rr, &work_done);
+			IFRATE(rate_ctx.new.rxirq++);
+			netmap_mitigation_start(gna);
+		}
+	}
+}
+
+/*
+ * generic_netmap_rxsync() extracts mbufs from the queue filled by
+ * generic_rx_handler() and puts their content in the netmap
+ * receive ring.
+ * Access must be protected because the rx handler is asynchronous.
+ */
+static int
+generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+	struct netmap_kring *kring = &na->rx_rings[ring_nr];
+	struct netmap_ring *ring = kring->ring;
+	u_int j, n, lim = kring->nkr_num_slots - 1;
+	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
+	u_int k, resvd = ring->reserved;
+
+	if (ring->cur > lim)
+		return netmap_ring_reinit(kring);
+
+	/* Import newly received packets into the netmap ring. */
+	if (netmap_no_pendintr || force_update) {
+		uint16_t slot_flags = kring->nkr_slot_flags;
+		struct mbuf *m;
+
+		n = 0;
+		j = kring->nr_ntc; /* first empty slot in the receive ring */
+		/* extract buffers from the rx queue, stop at most one
+		 * slot before nr_hwcur (index k)
+		 */
+		k = (kring->nr_hwcur) ? kring->nr_hwcur-1 : lim;
+		while (j != k) {
+			int len;
+			void *addr = NMB(&ring->slot[j]);
+
+			if (addr == netmap_buffer_base) { /* Bad buffer */
+				return netmap_ring_reinit(kring);
+			}
+			/*
+			 * Call the locked version of the function.
+			 * XXX Ideally we could grab a batch of mbufs at once,
+			 * by changing rx_queue into a ring.
+			 */
+			m = mbq_safe_dequeue(&kring->rx_queue);
+			if (!m)
+				break;
+			len = MBUF_LEN(m);
+			m_copydata(m, 0, len, addr);
+			ring->slot[j].len = len;
+			ring->slot[j].flags = slot_flags;
+			m_freem(m);
+			if (unlikely(j++ == lim))
+				j = 0;
+			n++;
+		}
+		if (n) {
+			kring->nr_ntc = j;
+			kring->nr_hwavail += n;
+			IFRATE(rate_ctx.new.rxpkt += n);
+		}
+		kring->nr_kflags &= ~NKR_PENDINTR;
+	}
+
+	// XXX should we invert the order ?
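The 'reserved' adjustment in the release phase below backs cur up by resvd slots modulo the ring size: with lim = 255, cur = 3 and reserved = 10, the else branch yields k = 3 + 256 - 10 = 249. The same arithmetic as a standalone check (sizes arbitrary):

#include <stdio.h>

/* back 'cur' up by 'resvd' slots on a ring of lim+1 slots,
 * as done before scanning released slots */
static unsigned
skip_reserved(unsigned cur, unsigned resvd, unsigned lim)
{
	return (cur >= resvd) ? cur - resvd : cur + lim + 1 - resvd;
}

int
main(void)
{
	printf("%u\n", skip_reserved(3, 10, 255));	/* 249 */
	printf("%u\n", skip_reserved(42, 10, 255));	/* 32 */
	return 0;
}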
+ /* Skip past packets that userspace has released */ + j = kring->nr_hwcur; + k = ring->cur; + if (resvd > 0) { + if (resvd + ring->avail >= lim + 1) { + D("XXX invalid reserve/avail %d %d", resvd, ring->avail); + ring->reserved = resvd = 0; // XXX panic... + } + k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; + } + if (j != k) { + /* Userspace has released some packets. */ + for (n = 0; j != k; n++) { + struct netmap_slot *slot = &ring->slot[j]; + + slot->flags &= ~NS_BUF_CHANGED; + if (unlikely(j++ == lim)) + j = 0; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = k; + } + /* Tell userspace that there are new packets. */ + ring->avail = kring->nr_hwavail - resvd; + IFRATE(rate_ctx.new.rxsync++); + + return 0; +} + +static void +generic_netmap_dtor(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter*)na; + struct netmap_adapter *prev_na = gna->prev; + + if (prev_na != NULL) { + D("Released generic NA %p", gna); + if_rele(na->ifp); + netmap_adapter_put(prev_na); + } + if (ifp != NULL) { + WNA(ifp) = prev_na; + D("Restored native NA %p", prev_na); + na->ifp = NULL; + } +} + +/* + * generic_netmap_attach() makes it possible to use netmap on + * a device without native netmap support. + * This is less performant than native support but potentially + * faster than raw sockets or similar schemes. + * + * In this "emulated" mode, netmap rings do not necessarily + * have the same size as those in the NIC. We use a default + * value and possibly override it if the OS has ways to fetch the + * actual configuration. + */ +int +generic_netmap_attach(struct ifnet *ifp) +{ + struct netmap_adapter *na; + struct netmap_generic_adapter *gna; + int retval; + u_int num_tx_desc, num_rx_desc; + + num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */ + + generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); + ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc); + + gna = malloc(sizeof(*gna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (gna == NULL) { + D("no memory on attach, give up"); + return ENOMEM; + } + na = (struct netmap_adapter *)gna; + na->ifp = ifp; + na->num_tx_desc = num_tx_desc; + na->num_rx_desc = num_rx_desc; + na->nm_register = &generic_netmap_register; + na->nm_txsync = &generic_netmap_txsync; + na->nm_rxsync = &generic_netmap_rxsync; + na->nm_dtor = &generic_netmap_dtor; + /* when using generic, IFCAP_NETMAP is set so we force + * NAF_SKIP_INTR to use the regular interrupt handler + */ + na->na_flags = NAF_SKIP_INTR; + + ND("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)", + ifp->num_tx_queues, ifp->real_num_tx_queues, + ifp->tx_queue_len); + ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)", + ifp->num_rx_queues, ifp->real_num_rx_queues); + + generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings); + + retval = netmap_attach_common(na); + if (retval) { + free(gna, M_DEVBUF); + } + + return retval; +} diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index 12bd882521b3..c009f5e62684 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2013 Universita` di Pisa. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -33,27 +34,61 @@ #ifndef _NET_NETMAP_KERN_H_ #define _NET_NETMAP_KERN_H_ +#define WITH_VALE // comment out to disable VALE support + #if defined(__FreeBSD__) #define likely(x) __builtin_expect((long)!!(x), 1L) #define unlikely(x) __builtin_expect((long)!!(x), 0L) #define NM_LOCK_T struct mtx +#define NMG_LOCK_T struct mtx +#define NMG_LOCK_INIT() mtx_init(&netmap_global_lock, \ + "netmap global lock", NULL, MTX_DEF) +#define NMG_LOCK_DESTROY() mtx_destroy(&netmap_global_lock) +#define NMG_LOCK() mtx_lock(&netmap_global_lock) +#define NMG_UNLOCK() mtx_unlock(&netmap_global_lock) +#define NMG_LOCK_ASSERT() mtx_assert(&netmap_global_lock, MA_OWNED) + #define NM_SELINFO_T struct selinfo #define MBUF_LEN(m) ((m)->m_pkthdr.len) +#define MBUF_IFP(m) ((m)->m_pkthdr.rcvif) #define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m) -#define NM_ATOMIC_T volatile int +#define NM_ATOMIC_T volatile int // XXX ? +/* atomic operations */ +#include <machine/atomic.h> +#define NM_ATOMIC_TEST_AND_SET(p) (!atomic_cmpset_acq_int((p), 0, 1)) +#define NM_ATOMIC_CLEAR(p) atomic_store_rel_int((p), 0) + +#define prefetch(x) __builtin_prefetch(x) + +MALLOC_DECLARE(M_NETMAP); + +// XXX linux struct, not used in FreeBSD +struct net_device_ops { +}; +struct hrtimer { +}; #elif defined (linux) #define NM_LOCK_T safe_spinlock_t // see bsd_glue.h #define NM_SELINFO_T wait_queue_head_t #define MBUF_LEN(m) ((m)->len) +#define MBUF_IFP(m) ((m)->dev) #define NM_SEND_UP(ifp, m) netif_rx(m) #define NM_ATOMIC_T volatile long unsigned int +// XXX a mtx would suffice here too 20130404 gl +#define NMG_LOCK_T struct semaphore +#define NMG_LOCK_INIT() sema_init(&netmap_global_lock, 1) +#define NMG_LOCK_DESTROY() +#define NMG_LOCK() down(&netmap_global_lock) +#define NMG_UNLOCK() up(&netmap_global_lock) +#define NMG_LOCK_ASSERT() // XXX to be completed + #ifndef DEV_NETMAP #define DEV_NETMAP #endif /* DEV_NETMAP */ @@ -115,6 +150,10 @@ struct netmap_priv_d; const char *nm_dump_buf(char *p, int len, int lim, char *dst); +#include "netmap_mbq.h" + +extern NMG_LOCK_T netmap_global_lock; + /* * private, kernel view of a ring. Keeps track of the status of * a ring across system calls. @@ -152,7 +191,7 @@ const char *nm_dump_buf(char *p, int len, int lim, char *dst); * nkr_leases array of nkr_num_slots where writers can report * completion of their block. NR_NOSLOT (~0) indicates * that the writer has not finished yet - * nkr_lease_idx index of next free slot in nr_leases, to be assigned + * nkr_lease_idx index of next free slot in nr_leases, to be assigned * * The kring is manipulated by txsync/rxsync and generic netmap function. * q_lock is used to arbitrate access to the kring from within the netmap @@ -166,6 +205,7 @@ struct netmap_kring { uint32_t nr_hwcur; uint32_t nr_hwavail; uint32_t nr_kflags; /* private driver flags */ + int32_t nr_hwreserved; #define NKR_PENDINTR 0x1 // Pending interrupt. uint32_t nkr_num_slots; int32_t nkr_hwofs; /* offset between NIC and netmap ring */ @@ -183,6 +223,17 @@ struct netmap_kring { NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */ volatile int nkr_stopped; + + /* support for adapters without native netmap support. + * On tx rings we preallocate an array of tx buffers + * (same size as the netmap ring), on rx rings we + * store incoming packets in a queue. + * XXX who writes to the rx queue ? 
+ */ + struct mbuf **tx_pool; + u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */ + struct mbq rx_queue; /* A queue for intercepted rx mbufs. */ + } __attribute__((__aligned__(64))); @@ -245,22 +296,26 @@ nm_next(uint32_t i, uint32_t lim) +enum txrx { NR_RX = 0, NR_TX = 1 }; /* - * This struct extends the 'struct adapter' (or - * equivalent) device descriptor. It contains all fields needed to - * support netmap operation. + * The "struct netmap_adapter" extends the "struct adapter" + * (or equivalent) device descriptor. + * It contains all base fields needed to support netmap operation. + * There are in fact different types of netmap adapters + * (native, generic, VALE switch...) so a netmap_adapter is + * just the first field in the derived type. */ struct netmap_adapter { /* * On linux we do not have a good way to tell if an interface - * is netmap-capable. So we use the following trick: + * is netmap-capable. So we always use the following trick: * NA(ifp) points here, and the first entry (which hopefully * always exists and is at least 32 bits) contains a magic * value which we can use to detect that the interface is good. */ uint32_t magic; - uint32_t na_flags; /* future place for IFCAP_NETMAP */ + uint32_t na_flags; /* enabled, and other flags */ #define NAF_SKIP_INTR 1 /* use the regular interrupt handler. * useful during initialization */ @@ -272,17 +327,16 @@ struct netmap_adapter { #define NAF_MEM_OWNER 8 /* the adapter is responsible for the * deallocation of the memory allocator */ - int refcount; /* number of user-space descriptors using this +#define NAF_NATIVE_ON 16 /* the adapter is native and the attached + * interface is in netmap mode + */ +#define NAF_NETMAP_ON 32 /* netmap is active (either native or + * emulated. Where possible (e.g. FreeBSD) + * IFCAP_NETMAP also mirrors this flag. + */ + int active_fds; /* number of user-space descriptors using this interface, which is equal to the number of struct netmap_if objs in the mapped region. */ - /* - * The selwakeup in the interrupt thread can use per-ring - * and/or global wait queues. We track how many clients - * of each type we have so we can optimize the drivers, - * and especially avoid huge contention on the locks. - */ - int na_single; /* threads attached to a single hw queue */ - int na_multi; /* threads attached to multiple hw queues */ u_int num_rx_rings; /* number of adapter receive rings */ u_int num_tx_rings; /* number of adapter transmit rings */ @@ -296,6 +350,9 @@ struct netmap_adapter { */ struct netmap_kring *tx_rings; /* array of TX rings. */ struct netmap_kring *rx_rings; /* array of RX rings. 
*/ + void *tailroom; /* space below the rings array */ + /* (used for leases) */ + NM_SELINFO_T tx_si, rx_si; /* global wait queues */ @@ -309,47 +366,157 @@ struct netmap_adapter { */ struct ifnet *ifp; /* adapter is ifp->if_softc */ - NM_LOCK_T core_lock; /* used if no device lock available */ + /* private cleanup */ + void (*nm_dtor)(struct netmap_adapter *); - int (*nm_register)(struct ifnet *, int onoff); + int (*nm_register)(struct netmap_adapter *, int onoff); - int (*nm_txsync)(struct ifnet *, u_int ring, int flags); - int (*nm_rxsync)(struct ifnet *, u_int ring, int flags); + int (*nm_txsync)(struct netmap_adapter *, u_int ring, int flags); + int (*nm_rxsync)(struct netmap_adapter *, u_int ring, int flags); #define NAF_FORCE_READ 1 #define NAF_FORCE_RECLAIM 2 /* return configuration information */ - int (*nm_config)(struct ifnet *, u_int *txr, u_int *txd, - u_int *rxr, u_int *rxd); + int (*nm_config)(struct netmap_adapter *, + u_int *txr, u_int *txd, u_int *rxr, u_int *rxd); + int (*nm_krings_create)(struct netmap_adapter *); + void (*nm_krings_delete)(struct netmap_adapter *); + int (*nm_notify)(struct netmap_adapter *, + u_int ring, enum txrx, int flags); +#define NAF_GLOBAL_NOTIFY 4 +#define NAF_DISABLE_NOTIFY 8 + + /* standard refcount to control the lifetime of the adapter + * (it should be equal to the lifetime of the corresponding ifp) + */ + int na_refcount; + + /* memory allocator (opaque) + * We also cache a pointer to the lut_entry for translating + * buffer addresses, and the total number of buffers. + */ + struct netmap_mem_d *nm_mem; + struct lut_entry *na_lut; + uint32_t na_lut_objtotal; /* max buffer index */ + + /* used internally. If non-null, the interface cannot be bound + * from userspace + */ + void *na_private; +}; + +/* + * If the NIC is owned by the kernel + * (i.e., bridge), neither another bridge nor user can use it; + * if the NIC is owned by a user, only users can share it. + * Evaluation must be done under NMG_LOCK(). + */ +#define NETMAP_OWNED_BY_KERN(na) (na->na_private) +#define NETMAP_OWNED_BY_ANY(na) \ + (NETMAP_OWNED_BY_KERN(na) || (na->active_fds > 0)) + + +/* + * derived netmap adapters for various types of ports + */ +struct netmap_vp_adapter { /* VALE software port */ + struct netmap_adapter up; /* * Bridge support: * * bdg_port is the port number used in the bridge; - * na_bdg_refcount is a refcount used for bridge ports, - * when it goes to 0 we can detach+free this port - * (a bridge port is always attached if it exists; - * it is not always registered) * na_bdg points to the bridge this NA is attached to. */ int bdg_port; - int na_bdg_refcount; struct nm_bridge *na_bdg; + int retry; + + u_int offset; /* Offset of ethernet header for each packet. */ +}; + +struct netmap_hw_adapter { /* physical device */ + struct netmap_adapter up; + + struct net_device_ops nm_ndo; // XXX linux only +}; + +struct netmap_generic_adapter { /* non-native device */ + struct netmap_hw_adapter up; + + /* Pointer to a previously used netmap adapter. */ + struct netmap_adapter *prev; + + /* generic netmap adapters support: + * a net_device_ops struct overrides ndo_select_queue(), + * save_if_input saves the if_input hook (FreeBSD), + * mit_timer and mit_pending implement rx interrupt mitigation, + */ + struct net_device_ops generic_ndo; + void (*save_if_input)(struct ifnet *, struct mbuf *); + + struct hrtimer mit_timer; + int mit_pending; +}; + +#ifdef WITH_VALE + +/* bridge wrapper for non VALE ports. It is used to connect real devices to the bridge. 
+ * + * The real device must already have its own netmap adapter (hwna). The + * bridge wrapper and the hwna adapter share the same set of netmap rings and + * buffers, but they have two separate sets of krings descriptors, with tx/rx + * meanings swapped: + * + * netmap + * bwrap krings rings krings hwna + * +------+ +------+ +-----+ +------+ +------+ + * |tx_rings->| |\ /| |----| |<-tx_rings| + * | | +------+ \ / +-----+ +------+ | | + * | | X | | + * | | / \ | | + * | | +------+/ \+-----+ +------+ | | + * |rx_rings->| | | |----| |<-rx_rings| + * | | +------+ +-----+ +------+ | | + * +------+ +------+ + * + * - packets coming from the bridge go to the brwap rx rings, which are also the + * hwna tx rings. The bwrap notify callback will then complete the hwna tx + * (see netmap_bwrap_notify). + * - packets coming from the outside go to the hwna rx rings, which are also the + * bwrap tx rings. The (overwritten) hwna notify method will then complete + * the bridge tx (see netmap_bwrap_intr_notify). + * + * The bridge wrapper may optionally connect the hwna 'host' rings to the + * bridge. This is done by using a second port in the bridge and connecting it + * to the 'host' netmap_vp_adapter contained in the netmap_bwrap_adapter. + * The brwap host adapter cross-links the hwna host rings in the same way as shown above. + * + * - packets coming from the bridge and directed to host stack are handled by the + * bwrap host notify callback (see netmap_bwrap_host_notify) + * - packets coming from the host stack are still handled by the overwritten + * hwna notify callback (netmap_bwrap_intr_notify), but are diverted to the + * host adapter depending on the ring number. + * + */ +struct netmap_bwrap_adapter { + struct netmap_vp_adapter up; + struct netmap_vp_adapter host; /* for host rings */ + struct netmap_adapter *hwna; /* the underlying device */ + + /* backup of the hwna notify callback */ + int (*save_notify)(struct netmap_adapter *, + u_int ring, enum txrx, int flags); /* When we attach a physical interface to the bridge, we * allow the controlling process to terminate, so we need * a place to store the netmap_priv_d data structure. * This is only done when physical interfaces are attached to a bridge. */ struct netmap_priv_d *na_kpriv; - - /* memory allocator */ - struct netmap_mem_d *nm_mem; -#ifdef linux - struct net_device_ops nm_ndo; -#endif /* linux */ }; + /* - * Available space in the ring. + * Available space in the ring. Only used in VALE code */ static inline uint32_t nm_kr_space(struct netmap_kring *k, int is_rx) @@ -357,7 +524,7 @@ nm_kr_space(struct netmap_kring *k, int is_rx) int space; if (is_rx) { - int busy = k->nkr_hwlease - k->nr_hwcur; + int busy = k->nkr_hwlease - k->nr_hwcur + k->nr_hwreserved; if (busy < 0) busy += k->nkr_num_slots; space = k->nkr_num_slots - 1 - busy; @@ -381,25 +548,6 @@ nm_kr_space(struct netmap_kring *k, int is_rx) } -/* return update position */ -static inline uint32_t -nm_kr_rxpos(struct netmap_kring *k) -{ - uint32_t pos = k->nr_hwcur + k->nr_hwavail; - if (pos >= k->nkr_num_slots) - pos -= k->nkr_num_slots; -#if 0 - if (pos >= k->nkr_num_slots || - k->nkr_hwlease >= k->nkr_num_slots || - k->nr_hwcur >= k->nkr_num_slots || - k->nr_hwavail >= k->nkr_num_slots || - k->nkr_lease_idx >= k->nkr_num_slots) { - D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, - k->nkr_lease_idx, k->nkr_num_slots); - } -#endif - return pos; -} /* make a lease on the kring for N positions. 
return the @@ -435,23 +583,61 @@ nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) return lease_idx; } +#endif /* WITH_VALE */ + +/* return update position */ +static inline uint32_t +nm_kr_rxpos(struct netmap_kring *k) +{ + uint32_t pos = k->nr_hwcur + k->nr_hwavail; + if (pos >= k->nkr_num_slots) + pos -= k->nkr_num_slots; +#if 0 + if (pos >= k->nkr_num_slots || + k->nkr_hwlease >= k->nkr_num_slots || + k->nr_hwcur >= k->nkr_num_slots || + k->nr_hwavail >= k->nkr_num_slots || + k->nkr_lease_idx >= k->nkr_num_slots) { + D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, + k->nkr_lease_idx, k->nkr_num_slots); + } +#endif + return pos; +} + /* - * XXX NETMAP_DELETING() is unused - * - * The combination of "enable" (ifp->if_capenable & IFCAP_NETMAP) - * and refcount gives the status of the interface, namely: - * - * enable refcount Status - * - * FALSE 0 normal operation - * FALSE != 0 -- (impossible) - * TRUE 1 netmap mode - * TRUE 0 being deleted. + * protect against multiple threads using the same ring. + * also check that the ring has not been stopped. + * We only care for 0 or !=0 as a return code. */ +#define NM_KR_BUSY 1 +#define NM_KR_STOPPED 2 -#define NETMAP_DELETING(_na) ( ((_na)->refcount == 0) && \ - ( (_na)->ifp->if_capenable & IFCAP_NETMAP) ) +static __inline void nm_kr_put(struct netmap_kring *kr) +{ + NM_ATOMIC_CLEAR(&kr->nr_busy); +} + +static __inline int nm_kr_tryget(struct netmap_kring *kr) +{ + /* check a first time without taking the lock + * to avoid starvation for nm_kr_get() + */ + if (unlikely(kr->nkr_stopped)) { + ND("ring %p stopped (%d)", kr, kr->nkr_stopped); + return NM_KR_STOPPED; + } + if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))) + return NM_KR_BUSY; + /* check a second time with lock held */ + if (unlikely(kr->nkr_stopped)) { + ND("ring %p stopped (%d)", kr, kr->nkr_stopped); + nm_kr_put(kr); + return NM_KR_STOPPED; + } + return 0; +} /* @@ -472,16 +658,116 @@ nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) * netmap_reset() is a helper routine to be called in the driver * when reinitializing a ring. */ -int netmap_attach(struct netmap_adapter *, u_int); +int netmap_attach(struct netmap_adapter *); +int netmap_attach_common(struct netmap_adapter *); +void netmap_detach_common(struct netmap_adapter *na); void netmap_detach(struct ifnet *); int netmap_transmit(struct ifnet *, struct mbuf *); -enum txrx { NR_RX = 0, NR_TX = 1 }; struct netmap_slot *netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur); int netmap_ring_reinit(struct netmap_kring *); +/* set/clear native flags. XXX maybe also if_transmit ? */ +static inline void +nm_set_native_flags(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + + na->na_flags |= (NAF_NATIVE_ON | NAF_NETMAP_ON); +#ifdef IFCAP_NETMAP /* or FreeBSD ? */ + ifp->if_capenable |= IFCAP_NETMAP; +#endif +#ifdef __FreeBSD__ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_transmit; +#else + na->if_transmit = (void *)ifp->netdev_ops; + ifp->netdev_ops = &((struct netmap_hw_adapter *)na)->nm_ndo; +#endif +} + +static inline void +nm_clear_native_flags(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + +#ifdef __FreeBSD__ + ifp->if_transmit = na->if_transmit; +#else + ifp->netdev_ops = (void *)na->if_transmit; +#endif + na->na_flags &= ~(NAF_NATIVE_ON | NAF_NETMAP_ON); +#ifdef IFCAP_NETMAP /* or FreeBSD ? 
*/ + ifp->if_capenable &= ~IFCAP_NETMAP; +#endif +} + +/* + * validates parameters in the ring/kring, returns a value for cur, + * and the 'new_slots' value in the argument. + * If any error, returns cur > lim to force a reinit. + */ +u_int nm_txsync_prologue(struct netmap_kring *, u_int *); + +/* + * validates parameters in the ring/kring, returns a value for cur, + * and the 'reserved' value in the argument. + * If any error, returns cur > lim to force a reinit. + */ +u_int nm_rxsync_prologue(struct netmap_kring *, u_int *); + +/* + * update kring and ring at the end of txsync + */ +static inline void +nm_txsync_finalize(struct netmap_kring *kring, u_int cur) +{ + /* recompute hwreserved */ + kring->nr_hwreserved = cur - kring->nr_hwcur; + if (kring->nr_hwreserved < 0) + kring->nr_hwreserved += kring->nkr_num_slots; + + /* update avail and reserved to what the kernel knows */ + kring->ring->avail = kring->nr_hwavail; + kring->ring->reserved = kring->nr_hwreserved; +} + +/* check/fix address and len in tx rings */ +#if 1 /* debug version */ +#define NM_CHECK_ADDR_LEN(_a, _l) do { \ + if (_a == netmap_buffer_base || _l > NETMAP_BUF_SIZE) { \ + RD(5, "bad addr/len ring %d slot %d idx %d len %d", \ + ring_nr, nm_i, slot->buf_idx, len); \ + if (_l > NETMAP_BUF_SIZE) \ + _l = NETMAP_BUF_SIZE; \ + } } while (0) +#else /* no debug version */ +#define NM_CHECK_ADDR_LEN(_a, _l) do { \ + if (_l > NETMAP_BUF_SIZE) \ + _l = NETMAP_BUF_SIZE; \ + } while (0) +#endif + + +/*---------------------------------------------------------------*/ +/* + * Support routines to be used with the VALE switch + */ +int netmap_update_config(struct netmap_adapter *na); +int netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom); +void netmap_krings_delete(struct netmap_adapter *na); + +struct netmap_if * +netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, + uint16_t ringid, int *err); + + + u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg); +int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na); +#ifdef WITH_VALE /* * The following bridge-related interfaces are used by other kernel modules * In the version that only supports unicast or broadcast, the lookup @@ -489,15 +775,76 @@ u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg); * NM_BDG_MAXPORTS for broadcast, NM_BDG_MAXPORTS+1 for unknown. * XXX in practice "unknown" might be handled same as broadcast. 
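 *
 * As a sketch (hypothetical module code, not part of this commit), a
 * lookup function that floods every frame would be:
 *
 *	static u_int
 *	my_flood_lookup(char *buf, u_int len, uint8_t *ring_nr,
 *		struct netmap_vp_adapter *vpna)
 *	{
 *		(void)buf; (void)len; (void)ring_nr; (void)vpna;
 *		return NM_BDG_BROADCAST;	// copy to all other ports
 *	}
 *
 * registered on a switch with netmap_bdg_ctl() and
 * nr_cmd = NETMAP_BDG_LOOKUP_REG.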
*/ -typedef u_int (*bdg_lookup_fn_t)(char *buf, u_int len, uint8_t *ring_nr, - struct netmap_adapter *); -int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func); -u_int netmap_bdg_learning(char *, u_int, uint8_t *, struct netmap_adapter *); -#define NM_NAME "vale" /* prefix for the bridge port name */ -#define NM_BDG_MAXPORTS 254 /* up to 32 for bitmap, 254 ok otherwise */ +typedef u_int (*bdg_lookup_fn_t)(char *buf, u_int len, + uint8_t *ring_nr, struct netmap_vp_adapter *); +u_int netmap_bdg_learning(char *, u_int, uint8_t *, + struct netmap_vp_adapter *); + +#define NM_BDG_MAXPORTS 254 /* up to 254 */ #define NM_BDG_BROADCAST NM_BDG_MAXPORTS #define NM_BDG_NOPORT (NM_BDG_MAXPORTS+1) +#define NM_NAME "vale" /* prefix for bridge port name */ + + +/* these are redefined in case of no VALE support */ +int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +void netmap_init_bridges(void); +int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func); + +#else /* !WITH_VALE */ +#define netmap_get_bdg_na(_1, _2, _3) 0 +#define netmap_init_bridges(_1) +#define netmap_bdg_ctl(_1, _2) EINVAL +#endif /* !WITH_VALE */ + +/* Various prototypes */ +int netmap_poll(struct cdev *dev, int events, struct thread *td); + + +int netmap_init(void); +void netmap_fini(void); +int netmap_get_memory(struct netmap_priv_d* p); +void netmap_dtor(void *data); +int netmap_dtor_locked(struct netmap_priv_d *priv); + +int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td); + +/* netmap_adapter creation/destruction */ +#define NM_IFPNAME(ifp) ((ifp) ? (ifp)->if_xname : "zombie") +#define NM_DEBUG_PUTGET 1 + +#ifdef NM_DEBUG_PUTGET + +#define NM_DBG(f) __##f + +void __netmap_adapter_get(struct netmap_adapter *na); + +#define netmap_adapter_get(na) \ + do { \ + struct netmap_adapter *__na = na; \ + D("getting %p:%s (%d)", __na, NM_IFPNAME(__na->ifp), __na->na_refcount); \ + __netmap_adapter_get(__na); \ + } while (0) + +int __netmap_adapter_put(struct netmap_adapter *na); + +#define netmap_adapter_put(na) \ + do { \ + struct netmap_adapter *__na = na; \ + D("putting %p:%s (%d)", __na, NM_IFPNAME(__na->ifp), __na->na_refcount); \ + __netmap_adapter_put(__na); \ + } while (0) + +#else /* !NM_DEBUG_PUTGET */ + +#define NM_DBG(f) f +void netmap_adapter_get(struct netmap_adapter *na); +int netmap_adapter_put(struct netmap_adapter *na); + +#endif /* !NM_DEBUG_PUTGET */ + + extern u_int netmap_buf_size; #define NETMAP_BUF_SIZE netmap_buf_size // XXX remove extern int netmap_mitigate; @@ -516,18 +863,18 @@ enum { /* verbose flags */ NM_VERB_NIC_TXSYNC = 0x2000, }; +extern int netmap_txsync_retry; +extern int netmap_generic_mit; +extern int netmap_generic_ringsize; + /* * NA returns a pointer to the struct netmap adapter from the ifp, * WNA is used to write it. - * SWNA() is used for the "host stack" endpoint associated - * to an interface. It is allocated together with the main NA(), - * as an array of two objects. */ #ifndef WNA #define WNA(_ifp) (_ifp)->if_pspare[0] #endif #define NA(_ifp) ((struct netmap_adapter *)WNA(_ifp)) -#define SWNA(_ifp) (NA(_ifp) + 1) /* * Macros to determine if an interface is netmap capable or netmap enabled. 
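 *
 * As a sketch (illustrative, not literal code from this commit),
 * attach-time code publishes the adapter once:
 *
 *	WNA(ifp) = na;				// write side, at attach
 *
 * and the data path then reads it back cheaply:
 *
 *	struct netmap_adapter *na = NA(ifp);	// read side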
@@ -561,6 +908,7 @@ enum { /* verbose flags */ #endif /* linux */ #ifdef __FreeBSD__ + /* Callback invoked by the dma machinery after a successfull dmamap_load */ static void netmap_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs, __unused int nseg, __unused int error) @@ -588,6 +936,7 @@ netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf) netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT); } } + #else /* linux */ /* @@ -695,16 +1044,97 @@ PNMB(struct netmap_slot *slot, uint64_t *pp) return ret; } +/* Generic version of NMB, which uses device-specific memory. */ +static inline void * +BDG_NMB(struct netmap_adapter *na, struct netmap_slot *slot) +{ + struct lut_entry *lut = na->na_lut; + uint32_t i = slot->buf_idx; + return (unlikely(i >= na->na_lut_objtotal)) ? + lut[0].vaddr : lut[i].vaddr; +} + /* default functions to handle rx/tx interrupts */ int netmap_rx_irq(struct ifnet *, u_int, u_int *); #define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) - -#ifdef __FreeBSD__ -MALLOC_DECLARE(M_NETMAP); -#endif /* __FreeBSD__ */ +void netmap_common_irq(struct ifnet *, u_int, u_int *work_done); +void netmap_txsync_to_host(struct netmap_adapter *na); void netmap_disable_all_rings(struct ifnet *); void netmap_enable_all_rings(struct ifnet *); +void netmap_disable_ring(struct netmap_kring *kr); + + +/* Structure associated to each thread which registered an interface. + * + * The first 4 fields of this structure are written by NIOCREGIF and + * read by poll() and NIOC?XSYNC. + * There is low contention among writers (actually, a correct user program + * should have no contention among writers) and among writers and readers, + * so we use a single global lock to protect the structure initialization. + * Since initialization involves the allocation of memory, we reuse the memory + * allocator lock. + * Read access to the structure is lock free. Readers must check that + * np_nifp is not NULL before using the other fields. + * If np_nifp is NULL initialization has not been performed, so they should + * return an error to userlevel. + * + * The ref_done field is used to regulate access to the refcount in the + * memory allocator. The refcount must be incremented at most once for + * each open("/dev/netmap"). The increment is performed by the first + * function that calls netmap_get_memory() (currently called by + * mmap(), NIOCGINFO and NIOCREGIF). + * If the refcount is incremented, it is then decremented when the + * private structure is destroyed. + */ +struct netmap_priv_d { + struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ + + struct netmap_adapter *np_na; + int np_ringid; /* from the ioctl */ + u_int np_qfirst, np_qlast; /* range of rings to scan */ + uint16_t np_txpoll; + + struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */ + /* np_refcount is only used on FreeBSD */ + int np_refcount; /* use with NMG_LOCK held */ +}; + + +/* + * generic netmap emulation for devices that do not have + * native netmap support. + * XXX generic_netmap_register() is only exported to implement + * nma_is_generic(). 
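+ *
+ * A sketch of the intended flow (an assumption based on the
+ * prototypes below, not a literal call chain): for a NIC without
+ * native support,
+ *
+ *	error = generic_netmap_attach(ifp);	// build emulated adapter
+ *
+ * after which nma_is_generic(NA(ifp)) is true and incoming traffic
+ * can be intercepted with netmap_catch_rx(na, 1).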
+ */
+int generic_netmap_register(struct netmap_adapter *na, int enable);
+int generic_netmap_attach(struct ifnet *ifp);
+
+int netmap_catch_rx(struct netmap_adapter *na, int intercept);
+void generic_rx_handler(struct ifnet *ifp, struct mbuf *m);
+void netmap_catch_packet_steering(struct netmap_generic_adapter *na, int enable);
+int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, u_int ring_nr);
+int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx);
+void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq);
+
+static __inline int
+nma_is_generic(struct netmap_adapter *na)
+{
+	return na->nm_register == generic_netmap_register;
+}
+
+/*
+ * netmap_mitigation API. This is used by the generic adapter
+ * to reduce the number of interrupt requests/selwakeup
+ * to clients on incoming packets.
+ */
+void netmap_mitigation_init(struct netmap_generic_adapter *na);
+void netmap_mitigation_start(struct netmap_generic_adapter *na);
+void netmap_mitigation_restart(struct netmap_generic_adapter *na);
+int netmap_mitigation_active(struct netmap_generic_adapter *na);
+void netmap_mitigation_cleanup(struct netmap_generic_adapter *na);
+
+// int generic_timer_handler(struct hrtimer *t);
 
 #endif /* _NET_NETMAP_KERN_H_ */
diff --git a/sys/dev/netmap/netmap_mbq.c b/sys/dev/netmap/netmap_mbq.c
new file mode 100644
index 000000000000..c8e581b69fe5
--- /dev/null
+++ b/sys/dev/netmap/netmap_mbq.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2013 Vincenzo Maffione. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + +/* + * $FreeBSD$ + */ + + +#ifdef linux +#include "bsd_glue.h" +#else /* __FreeBSD__ */ +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#endif /* __FreeBSD__ */ + +#include "netmap_mbq.h" + + +static inline void __mbq_init(struct mbq *q) +{ + q->head = q->tail = NULL; + q->count = 0; +} + +void mbq_safe_init(struct mbq *q) +{ + mtx_init(&q->lock, "mbq", NULL, MTX_SPIN); + __mbq_init(q); +} + +void mbq_init(struct mbq *q) +{ + __mbq_init(q); +} + +static inline void __mbq_enqueue(struct mbq *q, struct mbuf *m) +{ + m->m_nextpkt = NULL; + if (q->tail) { + q->tail->m_nextpkt = m; + q->tail = m; + } else { + q->head = q->tail = m; + } + q->count++; +} + +void mbq_safe_enqueue(struct mbq *q, struct mbuf *m) +{ + mtx_lock(&q->lock); + __mbq_enqueue(q, m); + mtx_unlock(&q->lock); +} + +void mbq_enqueue(struct mbq *q, struct mbuf *m) +{ + __mbq_enqueue(q, m); +} + +static inline struct mbuf *__mbq_dequeue(struct mbq *q) +{ + struct mbuf *ret = NULL; + + if (q->head) { + ret = q->head; + q->head = ret->m_nextpkt; + if (q->head == NULL) { + q->tail = NULL; + } + q->count--; + ret->m_nextpkt = NULL; + } + + return ret; +} + +struct mbuf *mbq_safe_dequeue(struct mbq *q) +{ + struct mbuf *ret; + + mtx_lock(&q->lock); + ret = __mbq_dequeue(q); + mtx_unlock(&q->lock); + + return ret; +} + +struct mbuf *mbq_dequeue(struct mbq *q) +{ + return __mbq_dequeue(q); +} + +/* XXX seems pointless to have a generic purge */ +static void __mbq_purge(struct mbq *q, int safe) +{ + struct mbuf *m; + + for (;;) { + m = safe ? mbq_safe_dequeue(q) : mbq_dequeue(q); + if (m) { + m_freem(m); + } else { + break; + } + } +} + +void mbq_purge(struct mbq *q) +{ + __mbq_purge(q, 0); +} + +void mbq_safe_purge(struct mbq *q) +{ + __mbq_purge(q, 1); +} + +void mbq_safe_destroy(struct mbq *q) +{ + mtx_destroy(&q->lock); +} + + +void mbq_destroy(struct mbq *q) +{ +} + diff --git a/sys/dev/netmap/netmap_mbq.h b/sys/dev/netmap/netmap_mbq.h new file mode 100644 index 000000000000..ad023b617a5d --- /dev/null +++ b/sys/dev/netmap/netmap_mbq.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2013 Vincenzo Maffione. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+
+#ifndef __NETMAP_MBQ_H__
+#define __NETMAP_MBQ_H__
+
+/*
+ * These functions implement an mbuf tailq with an optional lock.
+ * The base functions act ONLY ON THE QUEUE, whereas the "safe"
+ * variants (mbq_safe_*) also handle the lock.
+ */
+
+/* XXX probably rely on a previous definition of SPINLOCK_T */
+#ifdef linux
+#define SPINLOCK_T  safe_spinlock_t
+#else
+#define SPINLOCK_T  struct mtx
+#endif
+
+/* A FIFO queue of mbufs with an optional lock. */
+struct mbq {
+	struct mbuf *head;
+	struct mbuf *tail;
+	int count;
+	SPINLOCK_T lock;
+};
+
+/* XXX "destroy" does not match "init" as a name.
+ * We should also clarify whether init can be used while
+ * holding a lock, and whether mbq_safe_destroy() is a NOP.
+ */
+void mbq_init(struct mbq *q);
+void mbq_destroy(struct mbq *q);
+void mbq_enqueue(struct mbq *q, struct mbuf *m);
+struct mbuf *mbq_dequeue(struct mbq *q);
+void mbq_purge(struct mbq *q);
+
+/* XXX missing mbq_lock() and mbq_unlock */
+
+void mbq_safe_init(struct mbq *q);
+void mbq_safe_destroy(struct mbq *q);
+void mbq_safe_enqueue(struct mbq *q, struct mbuf *m);
+struct mbuf *mbq_safe_dequeue(struct mbq *q);
+void mbq_safe_purge(struct mbq *q);
+
+static inline unsigned int mbq_len(struct mbq *q)
+{
+	return q->count;
+}
+
+#endif /* __NETMAP_MBQ_H__ */
diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c
index a78904216057..f28f2c04751a 100644
--- a/sys/dev/netmap/netmap_mem2.c
+++ b/sys/dev/netmap/netmap_mem2.c
@@ -8,7 +8,7 @@
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
+ *    documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -167,12 +167,12 @@ const struct netmap_mem_d nm_blueprint = { #define DECLARE_SYSCTLS(id, name) \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_size, \ CTLFLAG_RW, &netmap_params[id].size, 0, "Requested size of netmap " STRINGIFY(name) "s"); \ - SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \ - CTLFLAG_RD, &nm_mem.pools[id]._objsize, 0, "Current size of netmap " STRINGIFY(name) "s"); \ - SYSCTL_INT(_dev_netmap, OID_AUTO, name##_num, \ - CTLFLAG_RW, &netmap_params[id].num, 0, "Requested number of netmap " STRINGIFY(name) "s"); \ - SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \ - CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s") + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \ + CTLFLAG_RD, &nm_mem.pools[id]._objsize, 0, "Current size of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_num, \ + CTLFLAG_RW, &netmap_params[id].num, 0, "Requested number of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \ + CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s") SYSCTL_DECL(_dev_netmap); DECLARE_SYSCTLS(NETMAP_IF_POOL, if); @@ -310,7 +310,7 @@ netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_ } if (p->objfree == 0) { - D("%s allocator: run out of memory", p->name); + D("no more %s objects", p->name); return NULL; } if (start) @@ -395,28 +395,22 @@ netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) /* Return nonzero on error */ static int -netmap_new_bufs(struct netmap_mem_d *nmd, struct netmap_if *nifp, - struct netmap_slot *slot, u_int n) +netmap_new_bufs(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n) { struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; u_int i = 0; /* slot counter */ uint32_t pos = 0; /* slot in p->bitmap */ uint32_t index = 0; /* buffer index */ - (void)nifp; /* UNUSED */ for (i = 0; i < n; i++) { void *vaddr = netmap_buf_malloc(nmd, &pos, &index); if (vaddr == NULL) { - D("unable to locate empty packet buffer"); + D("no more buffers after %d of %d", i, n); goto cleanup; } slot[i].buf_idx = index; slot[i].len = p->_objsize; - /* XXX setting flags=NS_BUF_CHANGED forces a pointer reload - * in the NIC ring. This is a hack that hides missing - * initializations in the drivers, and should go away. 
- */ - // slot[i].flags = NS_BUF_CHANGED; + slot[i].flags = 0; } ND("allocated %d buffers, %d available, first at %d", n, p->objfree, pos); @@ -433,11 +427,10 @@ cleanup: static void -netmap_free_buf(struct netmap_mem_d *nmd, struct netmap_if *nifp, uint32_t i) +netmap_free_buf(struct netmap_mem_d *nmd, uint32_t i) { struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; - (void)nifp; if (i < 2 || i >= p->objtotal) { D("Cannot free buf#%d: should be in [2, %d[", i, p->objtotal); return; @@ -760,7 +753,8 @@ netmap_mem_private_finalize(struct netmap_mem_d *nmd) } -static void netmap_mem_private_deref(struct netmap_mem_d *nmd) +static void +netmap_mem_private_deref(struct netmap_mem_d *nmd) { NMA_LOCK(nmd); if (--nmd->refcount <= 0) @@ -845,7 +839,7 @@ netmap_mem_global_config(struct netmap_mem_d *nmd) netmap_reset_obj_allocator(&nmd->pools[i]); } nmd->flags &= ~NETMAP_MEM_FINALIZED; - } + } for (i = 0; i < NETMAP_POOLS_NR; i++) { nmd->lasterr = netmap_config_obj_allocator(&nmd->pools[i], @@ -938,176 +932,156 @@ netmap_free_rings(struct netmap_adapter *na) na->rx_rings[i].ring = NULL; } } - free(na->tx_rings, M_DEVBUF); - na->tx_rings = na->rx_rings = NULL; } - - -/* call with NMA_LOCK held */ -/* - * Allocate the per-fd structure netmap_if. - * If this is the first instance, also allocate the krings, rings etc. +/* call with NMA_LOCK held * * - * We assume that the configuration stored in na - * (number of tx/rx rings and descs) does not change while - * the interface is in netmap mode. + * Allocate netmap rings and buffers for this card + * The rings are contiguous, but have variable size. */ -extern int nma_is_vp(struct netmap_adapter *na); -struct netmap_if * -netmap_mem_if_new(const char *ifname, struct netmap_adapter *na) +int +netmap_mem_rings_create(struct netmap_adapter *na) { - struct netmap_if *nifp; struct netmap_ring *ring; - ssize_t base; /* handy for relative offsets between rings and nifp */ - u_int i, len, ndesc, ntx, nrx; + u_int len, ndesc; struct netmap_kring *kring; - uint32_t *tx_leases = NULL, *rx_leases = NULL; - - /* - * verify whether virtual port need the stack ring - */ - ntx = na->num_tx_rings + 1; /* shorthand, include stack ring */ - nrx = na->num_rx_rings + 1; /* shorthand, include stack ring */ - /* - * the descriptor is followed inline by an array of offsets - * to the tx and rx rings in the shared memory region. - * For virtual rx rings we also allocate an array of - * pointers to assign to nkr_leases. - */ NMA_LOCK(na->nm_mem); - len = sizeof(struct netmap_if) + (nrx + ntx) * sizeof(ssize_t); - nifp = netmap_if_malloc(na->nm_mem, len); - if (nifp == NULL) { - NMA_UNLOCK(na->nm_mem); - return NULL; - } - - /* initialize base fields -- override const */ - *(u_int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings; - *(u_int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings; - strncpy(nifp->ni_name, ifname, (size_t)IFNAMSIZ); - - if (na->refcount) { /* already setup, we are done */ - goto final; - } - - len = (ntx + nrx) * sizeof(struct netmap_kring); - /* - * Leases are attached to TX rings on NIC/host ports, - * and to RX rings on VALE ports. 
- */ - if (nma_is_vp(na)) { - len += sizeof(uint32_t) * na->num_rx_desc * na->num_rx_rings; - } else { - len += sizeof(uint32_t) * na->num_tx_desc * ntx; - } - - na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); - if (na->tx_rings == NULL) { - D("Cannot allocate krings for %s", ifname); - goto cleanup; - } - na->rx_rings = na->tx_rings + ntx; - - if (nma_is_vp(na)) { - rx_leases = (uint32_t *)(na->rx_rings + nrx); - } else { - tx_leases = (uint32_t *)(na->rx_rings + nrx); - } - - /* - * First instance, allocate netmap rings and buffers for this card - * The rings are contiguous, but have variable size. - */ - for (i = 0; i < ntx; i++) { /* Transmit rings */ - kring = &na->tx_rings[i]; - ndesc = na->num_tx_desc; - bzero(kring, sizeof(*kring)); + for (kring = na->tx_rings; kring != na->rx_rings; kring++) { /* Transmit rings */ + ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); ring = netmap_ring_malloc(na->nm_mem, len); if (ring == NULL) { - D("Cannot allocate tx_ring[%d] for %s", i, ifname); + D("Cannot allocate tx_ring"); goto cleanup; } ND("txring[%d] at %p ofs %d", i, ring); - kring->na = na; kring->ring = ring; - if (tx_leases) { - kring->nkr_leases = tx_leases; - tx_leases += ndesc; - } - *(uint32_t *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc; + *(uint32_t *)(uintptr_t)&ring->num_slots = ndesc; *(ssize_t *)(uintptr_t)&ring->buf_ofs = (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - netmap_ring_offset(na->nm_mem, ring); - /* - * IMPORTANT: - * Always keep one slot empty, so we can detect new - * transmissions comparing cur and nr_hwcur (they are - * the same only if there are no new transmissions). - */ - ring->avail = kring->nr_hwavail = ndesc - 1; - ring->cur = kring->nr_hwcur = 0; + ring->avail = kring->nr_hwavail; + ring->cur = kring->nr_hwcur; *(uint16_t *)(uintptr_t)&ring->nr_buf_size = NETMAP_BDG_BUF_SIZE(na->nm_mem); - ND("initializing slots for txring[%d]", i); - if (netmap_new_bufs(na->nm_mem, nifp, ring->slot, ndesc)) { - D("Cannot allocate buffers for tx_ring[%d] for %s", i, ifname); + ND("initializing slots for txring"); + if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { + D("Cannot allocate buffers for tx_ring"); goto cleanup; } } - for (i = 0; i < nrx; i++) { /* Receive rings */ - kring = &na->rx_rings[i]; - ndesc = na->num_rx_desc; - bzero(kring, sizeof(*kring)); + for ( ; kring != na->tailroom; kring++) { /* Receive rings */ + ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); ring = netmap_ring_malloc(na->nm_mem, len); if (ring == NULL) { - D("Cannot allocate rx_ring[%d] for %s", i, ifname); + D("Cannot allocate rx_ring"); goto cleanup; } - ND("rxring[%d] at %p ofs %d", i, ring); + ND("rxring at %p ofs %d", ring); - kring->na = na; kring->ring = ring; - if (rx_leases && i < na->num_rx_rings) { - kring->nkr_leases = rx_leases; - rx_leases += ndesc; - } - *(uint32_t *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc; + *(uint32_t *)(uintptr_t)&ring->num_slots = ndesc; *(ssize_t *)(uintptr_t)&ring->buf_ofs = (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - netmap_ring_offset(na->nm_mem, ring); - ring->cur = kring->nr_hwcur = 0; - ring->avail = kring->nr_hwavail = 0; /* empty */ + ring->cur = kring->nr_hwcur; + ring->avail = kring->nr_hwavail; *(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BDG_BUF_SIZE(na->nm_mem); ND("initializing slots 
for rxring[%d]", i); - if (netmap_new_bufs(na->nm_mem, nifp, ring->slot, ndesc)) { - D("Cannot allocate buffers for rx_ring[%d] for %s", i, ifname); + if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { + D("Cannot allocate buffers for rx_ring"); goto cleanup; } } -#ifdef linux - // XXX initialize the selrecord structs. - for (i = 0; i < ntx; i++) - init_waitqueue_head(&na->tx_rings[i].si); - for (i = 0; i < nrx; i++) - init_waitqueue_head(&na->rx_rings[i].si); - init_waitqueue_head(&na->tx_si); - init_waitqueue_head(&na->rx_si); -#endif -final: + + NMA_UNLOCK(na->nm_mem); + + return 0; + +cleanup: + netmap_free_rings(na); + + NMA_UNLOCK(na->nm_mem); + + return ENOMEM; +} + +void +netmap_mem_rings_delete(struct netmap_adapter *na) +{ + /* last instance, release bufs and rings */ + u_int i, lim; + struct netmap_kring *kring; + struct netmap_ring *ring; + + NMA_LOCK(na->nm_mem); + + for (kring = na->tx_rings; kring != na->tailroom; kring++) { + ring = kring->ring; + if (ring == NULL) + continue; + lim = kring->nkr_num_slots; + for (i = 0; i < lim; i++) + netmap_free_buf(na->nm_mem, ring->slot[i].buf_idx); + } + netmap_free_rings(na); + + NMA_UNLOCK(na->nm_mem); +} + + +/* call with NMA_LOCK held */ +/* + * Allocate the per-fd structure netmap_if. + * + * We assume that the configuration stored in na + * (number of tx/rx rings and descs) does not change while + * the interface is in netmap mode. + */ +struct netmap_if * +netmap_mem_if_new(const char *ifname, struct netmap_adapter *na) +{ + struct netmap_if *nifp; + ssize_t base; /* handy for relative offsets between rings and nifp */ + u_int i, len, ntx, nrx; + + /* + * verify whether virtual port need the stack ring + */ + ntx = na->num_tx_rings + 1; /* shorthand, include stack ring */ + nrx = na->num_rx_rings + 1; /* shorthand, include stack ring */ + /* + * the descriptor is followed inline by an array of offsets + * to the tx and rx rings in the shared memory region. + * For virtual rx rings we also allocate an array of + * pointers to assign to nkr_leases. + */ + + NMA_LOCK(na->nm_mem); + + len = sizeof(struct netmap_if) + (nrx + ntx) * sizeof(ssize_t); + nifp = netmap_if_malloc(na->nm_mem, len); + if (nifp == NULL) { + NMA_UNLOCK(na->nm_mem); + return NULL; + } + + /* initialize base fields -- override const */ + *(u_int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings; + *(u_int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings; + strncpy(nifp->ni_name, ifname, (size_t)IFNAMSIZ); + /* * fill the slots for the rx and tx rings. 
They contain the offset * between the ring and nifp, so the information is usable in @@ -1126,13 +1100,6 @@ final: NMA_UNLOCK(na->nm_mem); return (nifp); -cleanup: - netmap_free_rings(na); - netmap_if_free(na->nm_mem, nifp); - - NMA_UNLOCK(na->nm_mem); - - return NULL; } void @@ -1143,25 +1110,6 @@ netmap_mem_if_delete(struct netmap_adapter *na, struct netmap_if *nifp) return; NMA_LOCK(na->nm_mem); - if (na->refcount <= 0) { - /* last instance, release bufs and rings */ - u_int i, j, lim; - struct netmap_ring *ring; - - for (i = 0; i < na->num_tx_rings + 1; i++) { - ring = na->tx_rings[i].ring; - lim = na->tx_rings[i].nkr_num_slots; - for (j = 0; j < lim; j++) - netmap_free_buf(na->nm_mem, nifp, ring->slot[j].buf_idx); - } - for (i = 0; i < na->num_rx_rings + 1; i++) { - ring = na->rx_rings[i].ring; - lim = na->rx_rings[i].nkr_num_slots; - for (j = 0; j < lim; j++) - netmap_free_buf(na->nm_mem, nifp, ring->slot[j].buf_idx); - } - netmap_free_rings(na); - } netmap_if_free(na->nm_mem, nifp); NMA_UNLOCK(na->nm_mem); @@ -1179,12 +1127,14 @@ netmap_mem_global_deref(struct netmap_mem_d *nmd) NMA_UNLOCK(nmd); } -int netmap_mem_finalize(struct netmap_mem_d *nmd) +int +netmap_mem_finalize(struct netmap_mem_d *nmd) { return nmd->finalize(nmd); } -void netmap_mem_deref(struct netmap_mem_d *nmd) +void +netmap_mem_deref(struct netmap_mem_d *nmd) { return nmd->deref(nmd); } diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h index 83f31d011c45..f492f9814b79 100644 --- a/sys/dev/netmap/netmap_mem2.h +++ b/sys/dev/netmap/netmap_mem2.h @@ -189,7 +189,7 @@ struct netmap_mem_d { /* the three allocators */ struct netmap_obj_pool pools[NETMAP_POOLS_NR]; - netmap_mem_config_t config; + netmap_mem_config_t config; netmap_mem_finalize_t finalize; netmap_mem_deref_t deref; }; @@ -200,14 +200,17 @@ vm_paddr_t netmap_mem_ofstophys(struct netmap_mem_d *, vm_ooffset_t); int netmap_mem_finalize(struct netmap_mem_d *); int netmap_mem_init(void); void netmap_mem_fini(void); -struct netmap_if * netmap_mem_if_new(const char *, struct netmap_adapter *); -void netmap_mem_if_delete(struct netmap_adapter *na, struct netmap_if *nifp); +struct netmap_if * + netmap_mem_if_new(const char *, struct netmap_adapter *); +void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *); +int netmap_mem_rings_create(struct netmap_adapter *); +void netmap_mem_rings_delete(struct netmap_adapter *); void netmap_mem_deref(struct netmap_mem_d *); -int netmap_mem_get_info(struct netmap_mem_d *nm_mem, u_int *size, u_int *memflags); -ssize_t netmap_mem_if_offset(struct netmap_mem_d *nm_mem, const void *vaddr); +int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags); +ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr); struct netmap_mem_d* netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int rxd); -void netmap_mem_private_delete(struct netmap_mem_d *nm_mem); +void netmap_mem_private_delete(struct netmap_mem_d *); #define NETMAP_BDG_BUF_SIZE(n) ((n)->pools[NETMAP_BUF_POOL]._objsize) diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c new file mode 100644 index 000000000000..e0ce94cccb7d --- /dev/null +++ b/sys/dev/netmap/netmap_vale.c @@ -0,0 +1,1983 @@ +/* + * Copyright (C) 2013 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+/*
+ * This module implements the VALE switch for netmap
+
+--- VALE SWITCH ---
+
+NMG_LOCK() serializes all modifications to switches and ports.
+A switch cannot be deleted until all ports are gone.
+
+For each switch, an SX lock (RWlock on linux) protects
+deletion of ports. When configuring or deleting a port, the
+lock is acquired in exclusive mode (after holding NMG_LOCK).
+When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
+The lock is held throughout the entire forwarding cycle,
+during which the thread may incur a page fault.
+Hence it is important that sleepable shared locks are used.
+
+On the rx ring, the per-port lock is grabbed initially to reserve
+a number of slots in the ring, then the lock is released,
+packets are copied from source to destination, and then
+the lock is acquired again and the receive ring is updated.
+(A similar thing is done on the tx ring for NIC and host stack
+ports attached to the switch)
+
+ */
+
+/*
+ * OS-specific code that is used only within this file.
+ * Other OS-specific code that must be accessed by drivers + * is present in netmap_kern.h + */ + +#if defined(__FreeBSD__) +#include <sys/cdefs.h> /* prerequisite */ +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> /* defines used in kernel.h */ +#include <sys/kernel.h> /* types used in module initialization */ +#include <sys/conf.h> /* cdevsw struct, UID, GID */ +#include <sys/sockio.h> +#include <sys/socketvar.h> /* struct socket */ +#include <sys/malloc.h> +#include <sys/poll.h> +#include <sys/rwlock.h> +#include <sys/socket.h> /* sockaddrs */ +#include <sys/selinfo.h> +#include <sys/sysctl.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/bpf.h> /* BIOCIMMEDIATE */ +#include <machine/bus.h> /* bus_dmamap_* */ +#include <sys/endian.h> +#include <sys/refcount.h> + +// #define prefetch(x) __builtin_prefetch(x) + + +#define BDG_RWLOCK_T struct rwlock // struct rwlock + +#define BDG_RWINIT(b) \ + rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS) +#define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) +#define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) +#define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) +#define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock) +#define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) +#define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock) + + +#elif defined(linux) + +#include "bsd_glue.h" + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +/* + * common headers + */ + +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> +#include <dev/netmap/netmap_mem2.h> + +#ifdef WITH_VALE + +/* + * system parameters (most of them in netmap_kern.h) + * NM_NAME prefix for switch port names, default "vale" + * NM_BDG_MAXPORTS number of ports + * NM_BRIDGES max number of switches in the system. + * XXX should become a sysctl or tunable + * + * Switch ports are named valeX:Y where X is the switch name and Y + * is the port. If Y matches a physical interface name, the port is + * connected to a physical device. + * + * Unlike physical interfaces, switch ports use their own memory region + * for rings and buffers. + * The virtual interfaces use per-queue lock instead of core lock. + * In the tx loop, we aggregate traffic in batches to make all operations + * faster. The batch size is bridge_batch. + */ +#define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ +#define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ +#define NM_BRIDGE_RINGSIZE 1024 /* in the device */ +#define NM_BDG_HASH 1024 /* forwarding table entries */ +#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ +#define NM_MULTISEG 64 /* max size of a chain of bufs */ +/* actual size of the tables */ +#define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) +/* NM_FT_NULL terminates a list of slots in the ft */ +#define NM_FT_NULL NM_BDG_BATCH_MAX +#define NM_BRIDGES 8 /* number of bridges */ + + +/* + * bridge_batch is set via sysctl to the max batch size to be + * used in the bridge. The actual value may be larger as the + * last packet in the block may overflow the size. 
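+ *
+ * For example (illustrative only), the batch size can be reduced
+ * at runtime with:
+ *
+ *	sysctl dev.netmap.bridge_batch=256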
+ */
+int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
+SYSCTL_DECL(_dev_netmap);
+SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
+
+
+static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp);
+static int bdg_netmap_reg(struct netmap_adapter *na, int onoff);
+static int netmap_bwrap_attach(struct ifnet *, struct ifnet *);
+static int netmap_bwrap_register(struct netmap_adapter *, int onoff);
+int kern_netmap_regif(struct nmreq *nmr);
+
+/*
+ * Each transmit queue accumulates a batch of packets into
+ * a structure before forwarding. Packets to the same
+ * destination are put in a list using ft_next as a link field.
+ * ft_frags and ft_next are valid only on the first fragment.
+ */
+struct nm_bdg_fwd {	/* forwarding entry for a bridge */
+	void *ft_buf;		/* netmap or indirect buffer */
+	uint8_t ft_frags;	/* how many fragments (only on 1st frag) */
+	uint8_t _ft_port;	/* dst port (unused) */
+	uint16_t ft_flags;	/* flags, e.g. indirect */
+	uint16_t ft_len;	/* src fragment len */
+	uint16_t ft_next;	/* next packet to same destination */
+};
+
+/*
+ * For each output interface, nm_bdg_q is used to construct a list.
+ * bq_len is the number of output buffers (we can have coalescing
+ * during the copy).
+ */
+struct nm_bdg_q {
+	uint16_t bq_head;
+	uint16_t bq_tail;
+	uint32_t bq_len;	/* number of buffers */
+};
+
+/* XXX revise this */
+struct nm_hash_ent {
+	uint64_t	mac;	/* the top 2 bytes are the epoch */
+	uint64_t	ports;
+};
+
+/*
+ * nm_bridge is a descriptor for a VALE switch.
+ * Interfaces for a bridge are all in bdg_ports[].
+ * The array has fixed size, an empty entry does not terminate
+ * the search, but lookups only occur on attach/detach so we
+ * don't mind if they are slow.
+ *
+ * The bridge is non-blocking on the transmit ports: excess
+ * packets are dropped if there is no room on the output port.
+ *
+ * bdg_lock protects accesses to the bdg_ports array.
+ * This is a rw lock (or equivalent).
+ */
+struct nm_bridge {
+	/* XXX what is the proper alignment/layout ? */
+	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
+	int		bdg_namelen;
+	uint32_t	bdg_active_ports; /* 0 means free */
+	char		bdg_basename[IFNAMSIZ];
+
+	/* Indexes of active ports (up to active_ports)
+	 * and all other remaining ports.
+	 */
+	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];
+
+	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
+
+
+	/*
+	 * The function to decide the destination port.
+	 * It returns either the index of the destination port,
+	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
+	 * forward this packet.  ring_nr is the source ring index, and the
+	 * function may overwrite this value to forward this packet to a
+	 * different ring index.
+	 * This function must be set by netmap_bdgctl().
+	 */
+	bdg_lookup_fn_t nm_bdg_lookup;
+
+	/* the forwarding table, MAC+ports.
+	 * XXX should be changed to an argument to be passed to
+	 * the lookup function, and allocated on attach
+	 */
+	struct nm_hash_ent ht[NM_BDG_HASH];
+};
+
+
+/*
+ * XXX in principle nm_bridges could be created dynamically
+ * Right now we have a static array and deletions are protected
+ * by an exclusive lock.
+ */
+struct nm_bridge nm_bridges[NM_BRIDGES];
+
+
+/*
+ * A few functions to tell which kind of port we are using.
+ * XXX should we hold a lock ?
+ * + * nma_is_vp() virtual port + * nma_is_host() port connected to the host stack + * nma_is_hw() port connected to a NIC + * nma_is_generic() generic netmap adapter XXX stop this madness + */ +static __inline int +nma_is_vp(struct netmap_adapter *na) +{ + return na->nm_register == bdg_netmap_reg; +} + + +static __inline int +nma_is_host(struct netmap_adapter *na) +{ + return na->nm_register == NULL; +} + + +static __inline int +nma_is_hw(struct netmap_adapter *na) +{ + /* In case of sw adapter, nm_register is NULL */ + return !nma_is_vp(na) && !nma_is_host(na) && !nma_is_generic(na); +} + +static __inline int +nma_is_bwrap(struct netmap_adapter *na) +{ + return na->nm_register == netmap_bwrap_register; +} + + + +/* + * this is a slightly optimized copy routine which rounds + * to multiple of 64 bytes and is often faster than dealing + * with other odd sizes. We assume there is enough room + * in the source and destination buffers. + * + * XXX only for multiples of 64 bytes, non overlapped. + */ +static inline void +pkt_copy(void *_src, void *_dst, int l) +{ + uint64_t *src = _src; + uint64_t *dst = _dst; + if (unlikely(l >= 1024)) { + memcpy(dst, src, l); + return; + } + for (; likely(l > 0); l-=64) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + } +} + + + +/* + * locate a bridge among the existing ones. + * MUST BE CALLED WITH NMG_LOCK() + * + * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. + * We assume that this is called with a name of at least NM_NAME chars. + */ +static struct nm_bridge * +nm_find_bridge(const char *name, int create) +{ + int i, l, namelen; + struct nm_bridge *b = NULL; + + NMG_LOCK_ASSERT(); + + namelen = strlen(NM_NAME); /* base length */ + l = name ? strlen(name) : 0; /* actual length */ + if (l < namelen) { + D("invalid bridge name %s", name ? name : NULL); + return NULL; + } + for (i = namelen + 1; i < l; i++) { + if (name[i] == ':') { + namelen = i; + break; + } + } + if (namelen >= IFNAMSIZ) + namelen = IFNAMSIZ; + ND("--- prefix is '%.*s' ---", namelen, name); + + /* lookup the name, remember empty slot if there is one */ + for (i = 0; i < NM_BRIDGES; i++) { + struct nm_bridge *x = nm_bridges + i; + + if (x->bdg_active_ports == 0) { + if (create && b == NULL) + b = x; /* record empty slot */ + } else if (x->bdg_namelen != namelen) { + continue; + } else if (strncmp(name, x->bdg_basename, namelen) == 0) { + ND("found '%.*s' at %d", namelen, name, i); + b = x; + break; + } + } + if (i == NM_BRIDGES && b) { /* name not found, can create entry */ + /* initialize the bridge */ + strncpy(b->bdg_basename, name, namelen); + ND("create new bridge %s with ports %d", b->bdg_basename, + b->bdg_active_ports); + b->bdg_namelen = namelen; + b->bdg_active_ports = 0; + for (i = 0; i < NM_BDG_MAXPORTS; i++) + b->bdg_port_index[i] = i; + /* set the default function */ + b->nm_bdg_lookup = netmap_bdg_learning; + /* reset the MAC address table */ + bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); + } + return b; +} + + +/* + * Free the forwarding tables for rings attached to switch ports. + */ +static void +nm_free_bdgfwd(struct netmap_adapter *na) +{ + int nrings, i; + struct netmap_kring *kring; + + NMG_LOCK_ASSERT(); + nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; + kring = nma_is_vp(na) ? 
na->tx_rings : na->rx_rings; + for (i = 0; i < nrings; i++) { + if (kring[i].nkr_ft) { + free(kring[i].nkr_ft, M_DEVBUF); + kring[i].nkr_ft = NULL; /* protect from freeing twice */ + } + } +} + + +/* + * Allocate the forwarding tables for the rings attached to the bridge ports. + */ +static int +nm_alloc_bdgfwd(struct netmap_adapter *na) +{ + int nrings, l, i, num_dstq; + struct netmap_kring *kring; + + NMG_LOCK_ASSERT(); + /* all port:rings + broadcast */ + num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; + l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; + l += sizeof(struct nm_bdg_q) * num_dstq; + l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; + + nrings = na->num_tx_rings + 1; + kring = na->tx_rings; + for (i = 0; i < nrings; i++) { + struct nm_bdg_fwd *ft; + struct nm_bdg_q *dstq; + int j; + + ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); + if (!ft) { + nm_free_bdgfwd(na); + return ENOMEM; + } + dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); + for (j = 0; j < num_dstq; j++) { + dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; + dstq[j].bq_len = 0; + } + kring[i].nkr_ft = ft; + } + return 0; +} + + +static void +netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw) +{ + int s_hw = hw, s_sw = sw; + int i, lim =b->bdg_active_ports; + uint8_t tmp[NM_BDG_MAXPORTS]; + + /* + New algorithm: + make a copy of bdg_port_index; + lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port + in the array of bdg_port_index, replacing them with + entries from the bottom of the array; + decrement bdg_active_ports; + acquire BDG_WLOCK() and copy back the array. + */ + + D("detach %d and %d (lim %d)", hw, sw, lim); + /* make a copy of the list of active ports, update it, + * and then copy back within BDG_WLOCK(). + */ + memcpy(tmp, b->bdg_port_index, sizeof(tmp)); + for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { + if (hw >= 0 && tmp[i] == hw) { + ND("detach hw %d at %d", hw, i); + lim--; /* point to last active port */ + tmp[i] = tmp[lim]; /* swap with i */ + tmp[lim] = hw; /* now this is inactive */ + hw = -1; + } else if (sw >= 0 && tmp[i] == sw) { + ND("detach sw %d at %d", sw, i); + lim--; + tmp[i] = tmp[lim]; + tmp[lim] = sw; + sw = -1; + } else { + i++; + } + } + if (hw >= 0 || sw >= 0) { + D("XXX delete failed hw %d sw %d, should panic...", hw, sw); + } + + BDG_WLOCK(b); + b->bdg_ports[s_hw] = NULL; + if (s_sw >= 0) { + b->bdg_ports[s_sw] = NULL; + } + memcpy(b->bdg_port_index, tmp, sizeof(tmp)); + b->bdg_active_ports = lim; + BDG_WUNLOCK(b); + + ND("now %d active ports", lim); + if (lim == 0) { + ND("marking bridge %s as free", b->bdg_basename); + b->nm_bdg_lookup = NULL; + } +} + +static void +netmap_adapter_vp_dtor(struct netmap_adapter *na) +{ + struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; + struct nm_bridge *b = vpna->na_bdg; + struct ifnet *ifp = na->ifp; + + ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount); + + if (b) { + netmap_bdg_detach_common(b, vpna->bdg_port, -1); + } + + bzero(ifp, sizeof(*ifp)); + free(ifp, M_DEVBUF); + na->ifp = NULL; +} + +int +netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) +{ + const char *name = nmr->nr_name; + struct ifnet *ifp; + int error = 0; + struct netmap_adapter *ret; + struct netmap_vp_adapter *vpna; + struct nm_bridge *b; + int i, j, cand = -1, cand2 = -1; + int needed; + + *na = NULL; /* default return value */ + + /* first try to see if this is a bridge port. 
*/ + NMG_LOCK_ASSERT(); + if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { + return 0; /* no error, but no VALE prefix */ + } + + b = nm_find_bridge(name, create); + if (b == NULL) { + D("no bridges available for '%s'", name); + return (ENXIO); + } + + /* Now we are sure that name starts with the bridge's name, + * lookup the port in the bridge. We need to scan the entire + * list. It is not important to hold a WLOCK on the bridge + * during the search because NMG_LOCK already guarantees + * that there are no other possible writers. + */ + + /* lookup in the local list of ports */ + for (j = 0; j < b->bdg_active_ports; j++) { + i = b->bdg_port_index[j]; + vpna = b->bdg_ports[i]; + // KASSERT(na != NULL); + ifp = vpna->up.ifp; + /* XXX make sure the name only contains one : */ + if (!strcmp(NM_IFPNAME(ifp), name)) { + netmap_adapter_get(&vpna->up); + ND("found existing if %s refs %d", name, + vpna->na_bdg_refcount); + *na = (struct netmap_adapter *)vpna; + return 0; + } + } + /* not found, should we create it? */ + if (!create) + return ENXIO; + /* yes we should, see if we have space to attach entries */ + needed = 2; /* in some cases we only need 1 */ + if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) { + D("bridge full %d, cannot create new port", b->bdg_active_ports); + return EINVAL; + } + /* record the next two ports available, but do not allocate yet */ + cand = b->bdg_port_index[b->bdg_active_ports]; + cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; + ND("+++ bridge %s port %s used %d avail %d %d", + b->bdg_basename, name, b->bdg_active_ports, cand, cand2); + + /* + * try see if there is a matching NIC with this name + * (after the bridge's name) + */ + ifp = ifunit_ref(name + b->bdg_namelen + 1); + if (!ifp) { /* this is a virtual port */ + if (nmr->nr_cmd) { + /* nr_cmd must be 0 for a virtual port */ + return EINVAL; + } + + /* create a struct ifnet for the new port. 
+ * need M_NOWAIT as we are under nma_lock + */ + ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!ifp) + return ENOMEM; + + strcpy(ifp->if_xname, name); + /* bdg_netmap_attach creates a struct netmap_adapter */ + error = bdg_netmap_attach(nmr, ifp); + if (error) { + D("error %d", error); + free(ifp, M_DEVBUF); + return error; + } + ret = NA(ifp); + cand2 = -1; /* only need one port */ + } else { /* this is a NIC */ + struct ifnet *fake_ifp; + + error = netmap_get_hw_na(ifp, &ret); + if (error || ret == NULL) + goto out; + + /* make sure the NIC is not already in use */ + if (NETMAP_OWNED_BY_ANY(ret)) { + D("NIC %s busy, cannot attach to bridge", + NM_IFPNAME(ifp)); + error = EINVAL; + goto out; + } + /* create a fake interface */ + fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!fake_ifp) { + error = ENOMEM; + goto out; + } + strcpy(fake_ifp->if_xname, name); + error = netmap_bwrap_attach(fake_ifp, ifp); + if (error) { + free(fake_ifp, M_DEVBUF); + goto out; + } + ret = NA(fake_ifp); + if (nmr->nr_arg1 != NETMAP_BDG_HOST) + cand2 = -1; /* only need one port */ + if_rele(ifp); + } + vpna = (struct netmap_vp_adapter *)ret; + + BDG_WLOCK(b); + vpna->bdg_port = cand; + ND("NIC %p to bridge port %d", vpna, cand); + /* bind the port to the bridge (virtual ports are not active) */ + b->bdg_ports[cand] = vpna; + vpna->na_bdg = b; + b->bdg_active_ports++; + if (cand2 >= 0) { + struct netmap_vp_adapter *hostna = vpna + 1; + /* also bind the host stack to the bridge */ + b->bdg_ports[cand2] = hostna; + hostna->bdg_port = cand2; + hostna->na_bdg = b; + b->bdg_active_ports++; + ND("host %p to bridge port %d", hostna, cand2); + } + ND("if %s refs %d", name, vpna->up.na_refcount); + BDG_WUNLOCK(b); + *na = ret; + netmap_adapter_get(ret); + return 0; + +out: + if_rele(ifp); + + return error; +} + + +/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */ +static int +nm_bdg_attach(struct nmreq *nmr) +{ + struct netmap_adapter *na; + struct netmap_if *nifp; + struct netmap_priv_d *npriv; + struct netmap_bwrap_adapter *bna; + int error; + + npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); + if (npriv == NULL) + return ENOMEM; + NMG_LOCK(); + /* XXX probably netmap_get_bdg_na() */ + error = netmap_get_na(nmr, &na, 1 /* create if not exists */); + if (error) /* no device, or another bridge or user owns the device */ + goto unlock_exit; + /* netmap_get_na() sets na_bdg if this is a physical interface + * that we can attach to a switch. + */ + if (!nma_is_bwrap(na)) { + /* got reference to a virtual port or direct access to a NIC. 
+ * perhaps specified no bridge prefix or wrong NIC name + */ + error = EINVAL; + goto unref_exit; + } + + if (na->active_fds > 0) { /* already registered */ + error = EBUSY; + goto unref_exit; + } + + nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, &error); + if (!nifp) { + goto unref_exit; + } + + bna = (struct netmap_bwrap_adapter*)na; + bna->na_kpriv = npriv; + NMG_UNLOCK(); + ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp)); + return 0; + +unref_exit: + netmap_adapter_put(na); +unlock_exit: + NMG_UNLOCK(); + bzero(npriv, sizeof(*npriv)); + free(npriv, M_DEVBUF); + return error; +} + +static int +nm_bdg_detach(struct nmreq *nmr) +{ + struct netmap_adapter *na; + int error; + struct netmap_bwrap_adapter *bna; + int last_instance; + + NMG_LOCK(); + error = netmap_get_na(nmr, &na, 0 /* don't create */); + if (error) { /* no device, or another bridge or user owns the device */ + goto unlock_exit; + } + if (!nma_is_bwrap(na)) { + /* got reference to a virtual port or direct access to a NIC. + * perhaps specified no bridge's prefix or wrong NIC's name + */ + error = EINVAL; + goto unref_exit; + } + bna = (struct netmap_bwrap_adapter *)na; + + if (na->active_fds == 0) { /* not registered */ + error = EINVAL; + goto unref_exit; + } + + last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */ + if (!last_instance) { + D("--- error, trying to detach an entry with active mmaps"); + error = EINVAL; + } else { + struct netmap_priv_d *npriv = bna->na_kpriv; + + bna->na_kpriv = NULL; + D("deleting priv"); + + bzero(npriv, sizeof(*npriv)); + free(npriv, M_DEVBUF); + } + +unref_exit: + netmap_adapter_put(na); +unlock_exit: + NMG_UNLOCK(); + return error; + +} + + +/* exported to kernel callers, e.g. OVS ? + * Entry point. + * Called without NMG_LOCK. + */ +int +netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) +{ + struct nm_bridge *b; + struct netmap_adapter *na; + struct netmap_vp_adapter *vpna; + struct ifnet *iter; + char *name = nmr->nr_name; + int cmd = nmr->nr_cmd, namelen = strlen(name); + int error = 0, i, j; + + switch (cmd) { + case NETMAP_BDG_ATTACH: + error = nm_bdg_attach(nmr); + break; + + case NETMAP_BDG_DETACH: + error = nm_bdg_detach(nmr); + break; + + case NETMAP_BDG_LIST: + /* this is used to enumerate bridges and ports */ + if (namelen) { /* look up indexes of bridge and port */ + if (strncmp(name, NM_NAME, strlen(NM_NAME))) { + error = EINVAL; + break; + } + NMG_LOCK(); + b = nm_find_bridge(name, 0 /* don't create */); + if (!b) { + error = ENOENT; + NMG_UNLOCK(); + break; + } + + error = ENOENT; + for (j = 0; j < b->bdg_active_ports; j++) { + i = b->bdg_port_index[j]; + vpna = b->bdg_ports[i]; + if (vpna == NULL) { + D("---AAAAAAAAARGH-------"); + continue; + } + iter = vpna->up.ifp; + /* the former and the latter identify a + * virtual port and a NIC, respectively + */ + if (!strcmp(iter->if_xname, name)) { + /* bridge index */ + nmr->nr_arg1 = b - nm_bridges; + nmr->nr_arg2 = i; /* port index */ + error = 0; + break; + } + } + NMG_UNLOCK(); + } else { + /* return the first non-empty entry starting from + * bridge nr_arg1 and port nr_arg2. 
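+			 *
+			 * A user-space sketch of the scan (hypothetical
+			 * helper do_bdg_list() issues this request with
+			 * nr_cmd = NETMAP_BDG_LIST):
+			 *
+			 *	struct nmreq r = { .nr_cmd = NETMAP_BDG_LIST };
+			 *	while (do_bdg_list(fd, &r) == 0) {
+			 *		printf("%s: bridge %u port %u\n",
+			 *		    r.nr_name, r.nr_arg1, r.nr_arg2);
+			 *		r.nr_name[0] = '\0';	// stay in scan mode
+			 *		r.nr_arg2++;		// resume past this port
+			 *	}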
+ *
+ * Users can detect the end of the same bridge by
+ * seeing the new and old value of nr_arg1, and can
+ * detect the end of all the bridges by error != 0
+ */
+ i = nmr->nr_arg1;
+ j = nmr->nr_arg2;
+
+ NMG_LOCK();
+ for (error = ENOENT; i < NM_BRIDGES; i++) {
+ b = nm_bridges + i;
+ if (j >= b->bdg_active_ports) {
+ j = 0; /* following bridges scan from 0 */
+ continue;
+ }
+ nmr->nr_arg1 = i;
+ nmr->nr_arg2 = j;
+ j = b->bdg_port_index[j];
+ vpna = b->bdg_ports[j];
+ iter = vpna->up.ifp;
+ strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
+ error = 0;
+ break;
+ }
+ NMG_UNLOCK();
+ }
+ break;
+
+ case NETMAP_BDG_LOOKUP_REG:
+ /* register a lookup function with the given bridge.
+ * nmr->nr_name may be just the bridge's name (including ':'
+ * if it is not just NM_NAME).
+ */
+ if (!func) {
+ error = EINVAL;
+ break;
+ }
+ NMG_LOCK();
+ b = nm_find_bridge(name, 0 /* don't create */);
+ if (!b) {
+ error = EINVAL;
+ } else {
+ b->nm_bdg_lookup = func;
+ }
+ NMG_UNLOCK();
+ break;
+
+ case NETMAP_BDG_OFFSET:
+ NMG_LOCK();
+ error = netmap_get_bdg_na(nmr, &na, 0);
+ if (!error) {
+ vpna = (struct netmap_vp_adapter *)na;
+ if (nmr->nr_arg1 > NETMAP_BDG_MAX_OFFSET)
+ nmr->nr_arg1 = NETMAP_BDG_MAX_OFFSET;
+ vpna->offset = nmr->nr_arg1;
+ D("Using offset %d for %p", vpna->offset, vpna);
+ }
+ NMG_UNLOCK();
+ break;
+
+ default:
+ D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
+ error = EINVAL;
+ break;
+ }
+ return error;
+}
+
+
+static int
+netmap_vp_krings_create(struct netmap_adapter *na)
+{
+ u_int ntx, nrx, tailroom;
+ int error, i;
+ uint32_t *leases;
+
+ /* XXX vps do not need host rings,
+ * but we crash if we don't have one
+ */
+ ntx = na->num_tx_rings + 1;
+ nrx = na->num_rx_rings + 1;
+
+ /*
+ * Leases are attached to RX rings on vale ports
+ */
+ tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
+
+ error = netmap_krings_create(na, ntx, nrx, tailroom);
+ if (error)
+ return error;
+
+ leases = na->tailroom;
+
+ for (i = 0; i < nrx; i++) { /* Receive rings */
+ na->rx_rings[i].nkr_leases = leases;
+ leases += na->num_rx_desc;
+ }
+
+ error = nm_alloc_bdgfwd(na);
+ if (error) {
+ netmap_krings_delete(na);
+ return error;
+ }
+
+ return 0;
+}
+
+static void
+netmap_vp_krings_delete(struct netmap_adapter *na)
+{
+ nm_free_bdgfwd(na);
+ netmap_krings_delete(na);
+}
+
+
+static int
+nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
+ struct netmap_vp_adapter *na, u_int ring_nr);
+
+
+/*
+ * Grab packets from a kring, move them into the ft structure
+ * associated to the tx (input) port. Max one instance per port,
+ * filtered on input (ioctl, poll or XXX).
+ * Returns the next position in the ring.
+ */
+static int
+nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
+ struct netmap_kring *kring, u_int end)
+{
+ struct netmap_ring *ring = kring->ring;
+ struct nm_bdg_fwd *ft;
+ u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
+ u_int ft_i = 0; /* start from 0 */
+ u_int frags = 1; /* how many frags ? */
+ struct nm_bridge *b = na->na_bdg;
+
+ /* To protect against modifications to the bridge we acquire a
+ * shared lock, waiting if we can sleep (if the source port is
+ * attached to a user process) or with a trylock otherwise (NICs).
+ */
+ ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
+ if (na->up.na_flags & NAF_BDG_MAYSLEEP)
+ BDG_RLOCK(b);
+ else if (!BDG_RTRYLOCK(b))
+ return 0;
+ ND(5, "rlock acquired for %d packets", ((j > end ?
lim+1 : 0) + end) - j);
+ ft = kring->nkr_ft;
+
+ for (; likely(j != end); j = nm_next(j, lim)) {
+ struct netmap_slot *slot = &ring->slot[j];
+ char *buf;
+
+ ft[ft_i].ft_len = slot->len;
+ ft[ft_i].ft_flags = slot->flags;
+
+ ND("flags is 0x%x", slot->flags);
+ /* this slot goes into a list so initialize the link field */
+ ft[ft_i].ft_next = NM_FT_NULL;
+ buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
+ (void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot);
+ prefetch(buf);
+ ++ft_i;
+ if (slot->flags & NS_MOREFRAG) {
+ frags++;
+ continue;
+ }
+ if (unlikely(netmap_verbose && frags > 1))
+ RD(5, "%d frags at %d", frags, ft_i - frags);
+ ft[ft_i - frags].ft_frags = frags;
+ frags = 1;
+ if (unlikely((int)ft_i >= bridge_batch))
+ ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
+ }
+ if (frags > 1) {
+ D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
+ // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG
+ ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG;
+ ft[ft_i - frags].ft_frags = frags - 1;
+ }
+ if (ft_i)
+ ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
+ BDG_RUNLOCK(b);
+ return j;
+}
+
+
+/*
+ *---- support for virtual bridge -----
+ */
+
+/* ----- FreeBSD if_bridge hash function ------- */
+
+/*
+ * The following hash function is adapted from "Hash Functions" by Bob Jenkins
+ * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
+ *
+ * http://www.burtleburtle.net/bob/hash/spooky.html
+ */
+#define mix(a, b, c) \
+do { \
+ a -= b; a -= c; a ^= (c >> 13); \
+ b -= c; b -= a; b ^= (a << 8); \
+ c -= a; c -= b; c ^= (b >> 13); \
+ a -= b; a -= c; a ^= (c >> 12); \
+ b -= c; b -= a; b ^= (a << 16); \
+ c -= a; c -= b; c ^= (b >> 5); \
+ a -= b; a -= c; a ^= (c >> 3); \
+ b -= c; b -= a; b ^= (a << 10); \
+ c -= a; c -= b; c ^= (b >> 15); \
+} while (/*CONSTCOND*/0)
+
+static __inline uint32_t
+nm_bridge_rthash(const uint8_t *addr)
+{
+ uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
+
+ b += addr[5] << 8;
+ b += addr[4];
+ a += addr[3] << 24;
+ a += addr[2] << 16;
+ a += addr[1] << 8;
+ a += addr[0];
+
+ mix(a, b, c);
+#define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1)
+ return (c & BRIDGE_RTHASH_MASK);
+}
+
+#undef mix
+
+
+static int
+bdg_netmap_reg(struct netmap_adapter *na, int onoff)
+{
+ struct netmap_vp_adapter *vpna =
+ (struct netmap_vp_adapter*)na;
+ struct ifnet *ifp = na->ifp;
+
+ /* the interface is already attached to the bridge,
+ * so we only need to toggle IFCAP_NETMAP.
+ */
+ BDG_WLOCK(vpna->na_bdg);
+ if (onoff) {
+ ifp->if_capenable |= IFCAP_NETMAP;
+ } else {
+ ifp->if_capenable &= ~IFCAP_NETMAP;
+ }
+ BDG_WUNLOCK(vpna->na_bdg);
+ return 0;
+}
+
+
+/*
+ * Lookup function for a learning bridge.
+ * Update the hash table with the source address,
+ * and then return the destination port index and the
+ * ring in *dst_ring (at the moment, always use ring 0)
+ */
+u_int
+netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
+ struct netmap_vp_adapter *na)
+{
+ struct nm_hash_ent *ht = na->na_bdg->ht;
+ uint32_t sh, dh;
+ u_int dst, mysrc = na->bdg_port;
+ uint64_t smac, dmac;
+
+ if (buf_len < 14) {
+ D("invalid buf length %d", buf_len);
+ return NM_BDG_NOPORT;
+ }
+ dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
+ smac = le64toh(*(uint64_t *)(buf + 4));
+ smac >>= 16;
+
+ /*
+ * The hash is somewhat expensive; there might be some
+ * worthwhile optimizations here.
+ */
+ if ((buf[6] & 1) == 0) { /* valid src */
+ uint8_t *s = buf+6;
+ sh = nm_bridge_rthash(s); // XXX hash of source
+ /* update source port forwarding entry */
+ ht[sh].mac = smac; /* XXX expire ? */
+ ht[sh].ports = mysrc;
+ if (netmap_verbose)
+ D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
+ s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
+ }
+ dst = NM_BDG_BROADCAST;
+ if ((buf[0] & 1) == 0) { /* unicast */
+ dh = nm_bridge_rthash(buf); // XXX hash of dst
+ if (ht[dh].mac == dmac) { /* found dst */
+ dst = ht[dh].ports;
+ }
+ /* XXX otherwise return NM_BDG_UNKNOWN ? */
+ }
+ *dst_ring = 0;
+ return dst;
+}
+
+
+/*
+ * This flush routine supports only unicast and broadcast but a large
+ * number of ports, and lets us replace the learn and dispatch functions.
+ */
+int
+nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
+ u_int ring_nr)
+{
+ struct nm_bdg_q *dst_ents, *brddst;
+ uint16_t num_dsts = 0, *dsts;
+ struct nm_bridge *b = na->na_bdg;
+ u_int i, j, me = na->bdg_port;
+
+ /*
+ * The work area (pointed to by ft) is followed by an array of
+ * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
+ * queues per port plus one for the broadcast traffic.
+ * Then we have an array of destination indexes.
+ */
+ dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
+ dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
+
+ /* first pass: find a destination for each packet in the batch */
+ for (i = 0; likely(i < n); i += ft[i].ft_frags) {
+ uint8_t dst_ring = ring_nr; /* default, same ring as origin */
+ uint16_t dst_port, d_i;
+ struct nm_bdg_q *d;
+ uint8_t *buf = ft[i].ft_buf;
+ u_int len = ft[i].ft_len;
+
+ ND("slot %d frags %d", i, ft[i].ft_frags);
+ /* Drop the packet if the offset falls neither within the first
+ fragment nor at the very beginning of the second. */
+ if (unlikely(na->offset > len))
+ continue;
+ if (len == na->offset) {
+ buf = ft[i+1].ft_buf;
+ len = ft[i+1].ft_len;
+ } else {
+ buf += na->offset;
+ len -= na->offset;
+ }
+ dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na);
+ if (netmap_verbose > 255)
+ RD(5, "slot %d port %d -> %d", i, me, dst_port);
+ if (dst_port == NM_BDG_NOPORT)
+ continue; /* this packet is to be dropped */
+ else if (unlikely(dst_port > NM_BDG_MAXPORTS))
+ continue;
+ else if (dst_port == NM_BDG_BROADCAST)
+ dst_ring = 0; /* broadcasts always go to ring 0 */
+ else if (unlikely(dst_port == me ||
+ !b->bdg_ports[dst_port]))
+ continue;
+
+ /* get a position in the scratch pad */
+ d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
+ d = dst_ents + d_i;
+
+ /* append the first fragment to the list */
+ if (d->bq_head == NM_FT_NULL) { /* new destination */
+ d->bq_head = d->bq_tail = i;
+ /* remember this position to be scanned later */
+ if (dst_port != NM_BDG_BROADCAST)
+ dsts[num_dsts++] = d_i;
+ } else {
+ ft[d->bq_tail].ft_next = i;
+ d->bq_tail = i;
+ }
+ d->bq_len += ft[i].ft_frags;
+ }
+
+ /*
+ * Broadcast traffic goes to ring 0 on all destinations.
+ * So we need to add these rings to the list of ports to scan.
+ * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
+ * expensive. We should keep a compact list of active destinations
+ * so we could shorten this loop.
+ */
+ brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
+ if (brddst->bq_head != NM_FT_NULL) {
+ for (j = 0; likely(j < b->bdg_active_ports); j++) {
+ uint16_t d_i;
+ i = b->bdg_port_index[j];
+ if (unlikely(i == me))
+ continue;
+ d_i = i * NM_BDG_MAXRINGS;
+ if (dst_ents[d_i].bq_head == NM_FT_NULL)
+ dsts[num_dsts++] = d_i;
+ }
+ }
+
+ ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
+ /* second pass: scan destinations (XXX will be modular somehow) */
+ for (i = 0; i < num_dsts; i++) {
+ struct ifnet *dst_ifp;
+ struct netmap_vp_adapter *dst_na;
+ struct netmap_kring *kring;
+ struct netmap_ring *ring;
+ u_int dst_nr, lim, j, sent = 0, d_i, next, brd_next;
+ u_int needed, howmany;
+ int retry = netmap_txsync_retry;
+ struct nm_bdg_q *d;
+ uint32_t my_start = 0, lease_idx = 0;
+ int nrings;
+ int offset_mismatch;
+
+ d_i = dsts[i];
+ ND("second pass %d port %d", i, d_i);
+ d = dst_ents + d_i;
+ // XXX fix the division
+ dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
+ /* protect from the lookup function returning an inactive
+ * destination port
+ */
+ if (unlikely(dst_na == NULL))
+ goto cleanup;
+ if (dst_na->up.na_flags & NAF_SW_ONLY)
+ goto cleanup;
+ dst_ifp = dst_na->up.ifp;
+ /*
+ * The interface may be in !netmap mode in two cases:
+ * - when na is attached but not activated yet;
+ * - when na is being deactivated but is still attached.
+ */
+ if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
+ ND("not in netmap mode!");
+ goto cleanup;
+ }
+
+ offset_mismatch = (dst_na->offset != na->offset);
+
+ /* there is at least one unicast or broadcast packet */
+ brd_next = brddst->bq_head;
+ next = d->bq_head;
+ /* we need to reserve this many slots. If fewer are
+ * available, some packets will be dropped.
+ * Packets may have multiple fragments, so there is
+ * a chance that we may not use all of the slots
+ * we have claimed, and we will need to handle the
+ * leftover ones when we regain the lock.
+ */
+ needed = d->bq_len + brddst->bq_len;
+
+ ND(5, "pass 2 dst %d is %x %s",
+ i, d_i, is_vp ? "virtual" : "nic/host");
+ dst_nr = d_i & (NM_BDG_MAXRINGS-1);
+ nrings = dst_na->up.num_rx_rings;
+ if (dst_nr >= nrings)
+ dst_nr = dst_nr % nrings;
+ kring = &dst_na->up.rx_rings[dst_nr];
+ ring = kring->ring;
+ lim = kring->nkr_num_slots - 1;
+
+retry:
+
+ /* reserve the buffers in the queue and an entry
+ * to report completion, and drop the lock.
+ * XXX this might become a helper function.
+ */
+ mtx_lock(&kring->q_lock);
+ if (kring->nkr_stopped) {
+ mtx_unlock(&kring->q_lock);
+ goto cleanup;
+ }
+ if (dst_na->retry) {
+ dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
+ }
+ my_start = j = kring->nkr_hwlease;
+ howmany = nm_kr_space(kring, 1);
+ if (needed < howmany)
+ howmany = needed;
+ lease_idx = nm_kr_lease(kring, howmany, 1);
+ mtx_unlock(&kring->q_lock);
+
+ /* only retry if we need more than available slots */
+ if (retry && needed <= howmany)
+ retry = 0;
+
+ /* copy to the destination queue */
+ while (howmany > 0) {
+ struct netmap_slot *slot;
+ struct nm_bdg_fwd *ft_p, *ft_end;
+ u_int cnt;
+ int fix_mismatch = offset_mismatch;
+
+ /* find the queue from which we pick the next packet.
+ * NM_FT_NULL is always higher than valid indexes
+ * so we never dereference it if the other list
+ * has packets (and if both are empty we never
+ * get here).
+ */
+ if (next < brd_next) {
+ ft_p = ft + next;
+ next = ft_p->ft_next;
+ } else { /* insert broadcast */
+ ft_p = ft + brd_next;
+ brd_next = ft_p->ft_next;
+ }
+ cnt = ft_p->ft_frags; // cnt > 0
+ if (unlikely(cnt > howmany))
+ break; /* no more space */
+ howmany -= cnt;
+ if (netmap_verbose && cnt > 1)
+ RD(5, "rx %d frags to %d", cnt, j);
+ ft_end = ft_p + cnt;
+ do {
+ char *dst, *src = ft_p->ft_buf;
+ size_t copy_len = ft_p->ft_len, dst_len = copy_len;
+
+ slot = &ring->slot[j];
+ dst = BDG_NMB(&dst_na->up, slot);
+
+ if (unlikely(fix_mismatch)) {
+ if (na->offset > dst_na->offset) {
+ src += na->offset - dst_na->offset;
+ copy_len -= na->offset - dst_na->offset;
+ dst_len = copy_len;
+ } else {
+ bzero(dst, dst_na->offset - na->offset);
+ dst_len += dst_na->offset - na->offset;
+ dst += dst_na->offset - na->offset;
+ }
+ /* fix the first fragment only */
+ fix_mismatch = 0;
+ /* completely skip a header-only fragment */
+ if (copy_len == 0) {
+ ft_p++;
+ continue;
+ }
+ }
+ /* round to a multiple of 64 */
+ copy_len = (copy_len + 63) & ~63;
+
+ ND("send %d %d bytes at %s:%d",
+ i, ft_p->ft_len, NM_IFPNAME(dst_ifp), j);
+ if (ft_p->ft_flags & NS_INDIRECT) {
+ if (copyin(src, dst, copy_len)) {
+ // invalid user pointer, pretend len is 0
+ dst_len = 0;
+ }
+ } else {
+ //memcpy(dst, src, copy_len);
+ pkt_copy(src, dst, (int)copy_len);
+ }
+ slot->len = dst_len;
+ slot->flags = (cnt << 8) | NS_MOREFRAG;
+ j = nm_next(j, lim);
+ ft_p++;
+ sent++;
+ } while (ft_p != ft_end);
+ slot->flags = (cnt << 8); /* clear flag on last entry */
+ /* are we done ? */
+ if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
+ break;
+ }
+ {
+ /* current position */
+ uint32_t *p = kring->nkr_leases; /* shorthand */
+ uint32_t update_pos;
+ int still_locked = 1;
+
+ mtx_lock(&kring->q_lock);
+ if (unlikely(howmany > 0)) {
+ /* we did not use all the buffers. If I am the
+ * last one, I can recover the slots; otherwise
+ * I must fill them with 0 to mark empty packets.
+ */
+ ND("leftover %d bufs", howmany);
+ if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
+ /* yes, I am the last one */
+ ND("roll back nkr_hwlease to %d", j);
+ kring->nkr_hwlease = j;
+ } else {
+ while (howmany-- > 0) {
+ ring->slot[j].len = 0;
+ ring->slot[j].flags = 0;
+ j = nm_next(j, lim);
+ }
+ }
+ }
+ p[lease_idx] = j; /* report I am done */
+
+ update_pos = nm_kr_rxpos(kring);
+
+ if (my_start == update_pos) {
+ /* all slots before my_start have been reported,
+ * so scan subsequent leases to see if other ranges
+ * have been completed, and do a selwakeup or txsync.
+ */
+ while (lease_idx != kring->nkr_lease_idx &&
+ p[lease_idx] != NR_NOSLOT) {
+ j = p[lease_idx];
+ p[lease_idx] = NR_NOSLOT;
+ lease_idx = nm_next(lease_idx, lim);
+ }
+ /* j is the new 'write' position. j != my_start
+ * means there are new buffers to report
+ */
+ if (likely(j != my_start)) {
+ uint32_t old_avail = kring->nr_hwavail;
+
+ kring->nr_hwavail = (j >= kring->nr_hwcur) ?
+ j - kring->nr_hwcur : + j + lim + 1 - kring->nr_hwcur; + if (kring->nr_hwavail < old_avail) { + D("avail shrink %d -> %d", + old_avail, kring->nr_hwavail); + } + dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); + still_locked = 0; + mtx_unlock(&kring->q_lock); + if (dst_na->retry && retry--) + goto retry; + } + } + if (still_locked) + mtx_unlock(&kring->q_lock); + } +cleanup: + d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ + d->bq_len = 0; + } + brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ + brddst->bq_len = 0; + return 0; +} + +static int +netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *kring = &na->up.tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + u_int j, k, lim = kring->nkr_num_slots - 1; + + k = ring->cur; + if (k > lim) + return netmap_ring_reinit(kring); + + if (bridge_batch <= 0) { /* testing only */ + j = k; // used all + goto done; + } + if (bridge_batch > NM_BDG_BATCH) + bridge_batch = NM_BDG_BATCH; + + j = nm_bdg_preflush(na, ring_nr, kring, k); + if (j != k) + D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail); + /* k-j modulo ring size is the number of slots processed */ + if (k < j) + k += kring->nkr_num_slots; + kring->nr_hwavail = lim - (k - j); + +done: + kring->nr_hwcur = j; + ring->avail = kring->nr_hwavail; + if (netmap_verbose) + D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags); + return 0; +} + + +/* + * main dispatch routine for the bridge. + * We already know that only one thread is running this. + * we must run nm_bdg_preflush without lock. + */ +static int +bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; + return netmap_vp_txsync(vpna, ring_nr, flags); +} + + +/* + * user process reading from a VALE switch. + * Already protected against concurrent calls from userspace, + * but we must acquire the queue's lock to protect against + * writers on the same queue. + */ +static int +bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + u_int j, lim = kring->nkr_num_slots - 1; + u_int k = ring->cur, resvd = ring->reserved; + int n; + + mtx_lock(&kring->q_lock); + if (k > lim) { + D("ouch dangerous reset!!!"); + n = netmap_ring_reinit(kring); + goto done; + } + + /* skip past packets that userspace has released */ + j = kring->nr_hwcur; /* netmap ring index */ + if (resvd > 0) { + if (resvd + ring->avail >= lim + 1) { + D("XXX invalid reserve/avail %d %d", resvd, ring->avail); + ring->reserved = resvd = 0; // XXX panic... + } + k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; + } + + if (j != k) { /* userspace has released some packets. 
*/ + n = k - j; + if (n < 0) + n += kring->nkr_num_slots; + ND("userspace releases %d packets", n); + for (n = 0; likely(j != k); n++) { + struct netmap_slot *slot = &ring->slot[j]; + void *addr = BDG_NMB(na, slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + D("bad buffer index %d, ignore ?", + slot->buf_idx); + } + slot->flags &= ~NS_BUF_CHANGED; + j = nm_next(j, lim); + } + kring->nr_hwavail -= n; + kring->nr_hwcur = k; + } + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail - resvd; + n = 0; +done: + mtx_unlock(&kring->q_lock); + return n; +} + +static int +bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) +{ + struct netmap_vp_adapter *vpna; + struct netmap_adapter *na; + int error; + + vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (vpna == NULL) + return ENOMEM; + + na = &vpna->up; + + na->ifp = ifp; + + /* bound checking */ + na->num_tx_rings = nmr->nr_tx_rings; + nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); + nmr->nr_tx_rings = na->num_tx_rings; // write back + na->num_rx_rings = nmr->nr_rx_rings; + nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); + nmr->nr_rx_rings = na->num_rx_rings; // write back + nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, + 1, NM_BDG_MAXSLOTS, NULL); + na->num_tx_desc = nmr->nr_tx_slots; + nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, + 1, NM_BDG_MAXSLOTS, NULL); + na->num_rx_desc = nmr->nr_rx_slots; + vpna->offset = 0; + + na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER; + na->nm_txsync = bdg_netmap_txsync; + na->nm_rxsync = bdg_netmap_rxsync; + na->nm_register = bdg_netmap_reg; + na->nm_dtor = netmap_adapter_vp_dtor; + na->nm_krings_create = netmap_vp_krings_create; + na->nm_krings_delete = netmap_vp_krings_delete; + na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp), + na->num_tx_rings, na->num_tx_desc, + na->num_rx_rings, na->num_rx_desc); + /* other nmd fields are set in the common routine */ + error = netmap_attach_common(na); + if (error) { + free(vpna, M_DEVBUF); + return error; + } + return 0; +} + +static void +netmap_bwrap_dtor(struct netmap_adapter *na) +{ + struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; + struct netmap_adapter *hwna = bna->hwna; + struct nm_bridge *b = bna->up.na_bdg, + *bh = bna->host.na_bdg; + struct ifnet *ifp = na->ifp; + + ND("na %p", na); + + if (b) { + netmap_bdg_detach_common(b, bna->up.bdg_port, + (bh ? bna->host.bdg_port : -1)); + } + + hwna->na_private = NULL; + netmap_adapter_put(hwna); + + bzero(ifp, sizeof(*ifp)); + free(ifp, M_DEVBUF); + na->ifp = NULL; + +} + +/* + * Pass packets from nic to the bridge. + * XXX TODO check locking: this is called from the interrupt + * handler so we should make sure that the interface is not + * disconnected while passing down an interrupt. + * + * Note, no user process can access this NIC so we can ignore + * the info in the 'ring'. + */ +/* callback that overwrites the hwna notify callback. + * Packets come from the outside or from the host stack and are put on an hwna rx ring. + * The bridge wrapper then sends the packets through the bridge. 
+ */
+static int
+netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags)
+{
+ struct ifnet *ifp = na->ifp;
+ struct netmap_bwrap_adapter *bna = na->na_private;
+ struct netmap_vp_adapter *hostna = &bna->host;
+ struct netmap_kring *kring, *bkring;
+ struct netmap_ring *ring;
+ int is_host_ring = ring_nr == na->num_rx_rings;
+ struct netmap_vp_adapter *vpna = &bna->up;
+ int error = 0;
+
+ ND("%s[%d] %s %x", NM_IFPNAME(ifp), ring_nr, (tx == NR_TX ? "TX" : "RX"), flags);
+
+ if (flags & NAF_DISABLE_NOTIFY) {
+ kring = tx == NR_TX ? na->tx_rings : na->rx_rings;
+ bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings;
+ if (kring->nkr_stopped)
+ netmap_disable_ring(bkring);
+ else
+ bkring->nkr_stopped = 0;
+ return 0;
+ }
+
+ if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP))
+ return 0;
+
+ if (tx == NR_TX)
+ return 0;
+
+ kring = &na->rx_rings[ring_nr];
+ ring = kring->ring;
+
+ /* make sure the ring is not disabled */
+ if (nm_kr_tryget(kring))
+ return 0;
+
+ if (is_host_ring && hostna->na_bdg == NULL) {
+ error = bna->save_notify(na, ring_nr, tx, flags);
+ goto put_out;
+ }
+
+ if (is_host_ring) {
+ vpna = hostna;
+ ring_nr = 0;
+ } else {
+ /* fetch packets that have arrived.
+ * XXX maybe do this in a loop ?
+ */
+ error = na->nm_rxsync(na, ring_nr, 0);
+ if (error)
+ goto put_out;
+ }
+ if (kring->nr_hwavail == 0 && netmap_verbose) {
+ D("how strange, interrupt with no packets on %s",
+ NM_IFPNAME(ifp));
+ goto put_out;
+ }
+ /* XXX avail ? */
+ ring->cur = nm_kr_rxpos(kring);
+ netmap_vp_txsync(vpna, ring_nr, flags);
+
+ if (!is_host_ring)
+ error = na->nm_rxsync(na, ring_nr, 0);
+
+put_out:
+ nm_kr_put(kring);
+ return error;
+}
+
+static int
+netmap_bwrap_register(struct netmap_adapter *na, int onoff)
+{
+ struct netmap_bwrap_adapter *bna =
+ (struct netmap_bwrap_adapter *)na;
+ struct netmap_adapter *hwna = bna->hwna;
+ struct netmap_vp_adapter *hostna = &bna->host;
+ int error;
+
+ ND("%s %d", NM_IFPNAME(ifp), onoff);
+
+ if (onoff) {
+ int i;
+
+ hwna->na_lut = na->na_lut;
+ hwna->na_lut_objtotal = na->na_lut_objtotal;
+
+ if (hostna->na_bdg) {
+ hostna->up.na_lut = na->na_lut;
+ hostna->up.na_lut_objtotal = na->na_lut_objtotal;
+ }
+
+ /* cross-link the netmap rings */
+ for (i = 0; i <= na->num_tx_rings; i++) {
+ hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
+ hwna->tx_rings[i].ring = na->rx_rings[i].ring;
+ }
+ for (i = 0; i <= na->num_rx_rings; i++) {
+ hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
+ hwna->rx_rings[i].ring = na->tx_rings[i].ring;
+ }
+ }
+
+ if (hwna->ifp) {
+ error = hwna->nm_register(hwna, onoff);
+ if (error)
+ return error;
+ }
+
+ bdg_netmap_reg(na, onoff);
+
+ if (onoff) {
+ bna->save_notify = hwna->nm_notify;
+ hwna->nm_notify = netmap_bwrap_intr_notify;
+ } else {
+ hwna->nm_notify = bna->save_notify;
+ hwna->na_lut = NULL;
+ hwna->na_lut_objtotal = 0;
+ }
+
+ return 0;
+}
+
+static int
+netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
+ u_int *rxr, u_int *rxd)
+{
+ struct netmap_bwrap_adapter *bna =
+ (struct netmap_bwrap_adapter *)na;
+ struct netmap_adapter *hwna = bna->hwna;
+
+ /* forward the request */
+ netmap_update_config(hwna);
+ /* swap the results */
+ *txr = hwna->num_rx_rings;
+ *txd = hwna->num_rx_desc;
+ *rxr = hwna->num_tx_rings;
+ *rxd = hwna->num_tx_desc;
+
+ return 0;
+}
+
+static int
+netmap_bwrap_krings_create(struct netmap_adapter *na)
+{
+ struct netmap_bwrap_adapter *bna =
+ (struct netmap_bwrap_adapter *)na;
+ struct 
netmap_adapter *hwna = bna->hwna; + struct netmap_adapter *hostna = &bna->host.up; + int error; + + ND("%s", NM_IFPNAME(na->ifp)); + + error = netmap_vp_krings_create(na); + if (error) + return error; + + error = hwna->nm_krings_create(hwna); + if (error) { + netmap_vp_krings_delete(na); + return error; + } + + hostna->tx_rings = na->tx_rings + na->num_tx_rings; + hostna->rx_rings = na->rx_rings + na->num_rx_rings; + + return 0; +} + +static void +netmap_bwrap_krings_delete(struct netmap_adapter *na) +{ + struct netmap_bwrap_adapter *bna = + (struct netmap_bwrap_adapter *)na; + struct netmap_adapter *hwna = bna->hwna; + + ND("%s", NM_IFPNAME(na->ifp)); + + hwna->nm_krings_delete(hwna); + netmap_vp_krings_delete(na); +} + +/* notify method for the bridge-->hwna direction */ +static int +netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags) +{ + struct netmap_bwrap_adapter *bna = + (struct netmap_bwrap_adapter *)na; + struct netmap_adapter *hwna = bna->hwna; + struct netmap_kring *kring, *hw_kring; + struct netmap_ring *ring; + u_int lim, k; + int error = 0; + + if (tx == NR_TX) + return ENXIO; + + kring = &na->rx_rings[ring_n]; + hw_kring = &hwna->tx_rings[ring_n]; + ring = kring->ring; + + lim = kring->nkr_num_slots - 1; + k = nm_kr_rxpos(kring); + + if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP)) + return 0; + ring->cur = k; + ND("%s[%d] PRE rx(%d, %d, %d, %d) ring(%d, %d, %d) tx(%d, %d)", + NM_IFPNAME(na->ifp), ring_n, + kring->nr_hwcur, kring->nr_hwavail, kring->nkr_hwlease, kring->nr_hwreserved, + ring->cur, ring->avail, ring->reserved, + hw_kring->nr_hwcur, hw_kring->nr_hwavail); + if (ring_n == na->num_rx_rings) { + netmap_txsync_to_host(hwna); + } else { + error = hwna->nm_txsync(hwna, ring_n, flags); + } + kring->nr_hwcur = ring->cur; + kring->nr_hwavail = 0; + kring->nr_hwreserved = lim - ring->avail; + ND("%s[%d] PST rx(%d, %d, %d, %d) ring(%d, %d, %d) tx(%d, %d)", + NM_IFPNAME(na->ifp), ring_n, + kring->nr_hwcur, kring->nr_hwavail, kring->nkr_hwlease, kring->nr_hwreserved, + ring->cur, ring->avail, ring->reserved, + hw_kring->nr_hwcur, hw_kring->nr_hwavail); + + return error; +} + +static int +netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags) +{ + struct netmap_bwrap_adapter *bna = na->na_private; + struct netmap_adapter *port_na = &bna->up.up; + if (tx == NR_TX || ring_n != 0) + return ENXIO; + return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags); +} + +/* attach a bridge wrapper to the 'real' device */ +static int +netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) +{ + struct netmap_bwrap_adapter *bna; + struct netmap_adapter *na; + struct netmap_adapter *hwna = NA(real); + struct netmap_adapter *hostna; + int error; + + + bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (bna == NULL) + return ENOMEM; + + na = &bna->up.up; + na->ifp = fake; + /* fill the ring data for the bwrap adapter with rx/tx meanings + * swapped. The real cross-linking will be done during register, + * when all the krings will have been created. 
+ */ + na->num_rx_rings = hwna->num_tx_rings; + na->num_tx_rings = hwna->num_rx_rings; + na->num_tx_desc = hwna->num_rx_desc; + na->num_rx_desc = hwna->num_tx_desc; + na->nm_dtor = netmap_bwrap_dtor; + na->nm_register = netmap_bwrap_register; + // na->nm_txsync = netmap_bwrap_txsync; + // na->nm_rxsync = netmap_bwrap_rxsync; + na->nm_config = netmap_bwrap_config; + na->nm_krings_create = netmap_bwrap_krings_create; + na->nm_krings_delete = netmap_bwrap_krings_delete; + na->nm_notify = netmap_bwrap_notify; + na->nm_mem = hwna->nm_mem; + na->na_private = na; /* prevent NIOCREGIF */ + bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ + + bna->hwna = hwna; + netmap_adapter_get(hwna); + hwna->na_private = bna; /* weak reference */ + + hostna = &bna->host.up; + hostna->ifp = hwna->ifp; + hostna->num_tx_rings = 1; + hostna->num_tx_desc = hwna->num_rx_desc; + hostna->num_rx_rings = 1; + hostna->num_rx_desc = hwna->num_tx_desc; + // hostna->nm_txsync = netmap_bwrap_host_txsync; + // hostna->nm_rxsync = netmap_bwrap_host_rxsync; + hostna->nm_notify = netmap_bwrap_host_notify; + hostna->nm_mem = na->nm_mem; + hostna->na_private = bna; + + D("%s<->%s txr %d txd %d rxr %d rxd %d", fake->if_xname, real->if_xname, + na->num_tx_rings, na->num_tx_desc, + na->num_rx_rings, na->num_rx_desc); + + error = netmap_attach_common(na); + if (error) { + netmap_adapter_put(hwna); + free(bna, M_DEVBUF); + return error; + } + return 0; +} + +void +netmap_init_bridges(void) +{ + int i; + bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */ + for (i = 0; i < NM_BRIDGES; i++) + BDG_RWINIT(&nm_bridges[i]); +} +#endif /* WITH_VALE */ |
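Editorial notes with standalone sketches of the mechanisms introduced above follow. They are illustrations under stated assumptions, not part of the commit.

The NETMAP_BDG_LIST branch in netmap_bdg_ctl() implements a cursor-style scan: userspace seeds nr_arg1/nr_arg2, the kernel writes back the first occupied (bridge, port) pair at or after that position together with the port name, and a non-zero error marks the end of the scan. A minimal userspace enumerator might look like the sketch below; it assumes the command is carried by NIOCGINFO on /dev/netmap, as the netmap ioctl path of this period does, and that nr_name must be cleared between calls so the positional (empty-name) branch is taken each time.

/* Sketch: walk all VALE bridges/ports via NETMAP_BDG_LIST.
 * Assumes NIOCGINFO dispatches nr_cmd to netmap_bdg_ctl();
 * error handling is minimal.
 */
#include <sys/ioctl.h>
#include <net/netmap.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct nmreq nmr;
	int fd = open("/dev/netmap", O_RDWR);

	if (fd < 0)
		return 1;
	memset(&nmr, 0, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	nmr.nr_cmd = NETMAP_BDG_LIST;
	/* empty nr_name: scan from bridge nr_arg1, port nr_arg2 */
	for (;;) {
		if (ioctl(fd, NIOCGINFO, &nmr) != 0)
			break;		/* error != 0: no more ports */
		printf("bridge %d port %d name %s\n",
		    nmr.nr_arg1, nmr.nr_arg2, nmr.nr_name);
		nmr.nr_arg2++;		/* advance the cursor */
		nmr.nr_name[0] = '\0';	/* keep scanning by position */
	}
	close(fd);
	return 0;
}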
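The learning-bridge path (nm_bridge_rthash() plus netmap_bdg_learning()) is self-contained enough to exercise in userspace. The sketch below lifts the mix() macro and the hash from the diff verbatim and pairs them with a minimal table, showing how a source MAC is learned and a destination MAC resolved to a port. The table size and the nm_hash_ent layout are assumptions made for the demo, not the kernel's definitions.

/* Userspace sketch of the VALE learning-bridge hash (illustrative only). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NM_BDG_HASH	1024	/* assumed table size, power of 2 */

struct nm_hash_ent {		/* assumed layout for the demo */
	uint64_t	mac;
	uint64_t	ports;
};

#define mix(a, b, c) \
do { \
	a -= b; a -= c; a ^= (c >> 13); \
	b -= c; b -= a; b ^= (a << 8); \
	c -= a; c -= b; c ^= (b >> 13); \
	a -= b; a -= c; a ^= (c >> 12); \
	b -= c; b -= a; b ^= (a << 16); \
	c -= a; c -= b; c ^= (b >> 5); \
	a -= b; a -= c; a ^= (c >> 3); \
	b -= c; b -= a; b ^= (a << 10); \
	c -= a; c -= b; c ^= (b >> 15); \
} while (0)

static uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

	b += addr[5] << 8;
	b += addr[4];
	a += addr[3] << 24;
	a += addr[2] << 16;
	a += addr[1] << 8;
	a += addr[0];

	mix(a, b, c);
	return (c & (NM_BDG_HASH - 1));
}

int
main(void)
{
	static struct nm_hash_ent ht[NM_BDG_HASH];
	const uint8_t mac[6] = { 0x00, 0x1b, 0x21, 0xaa, 0xbb, 0xcc };
	uint64_t key = 0;
	uint32_t sh, dh;
	int i;

	for (i = 5; i >= 0; i--)	/* 48-bit key, low byte first */
		key = (key << 8) | mac[i];

	sh = nm_bridge_rthash(mac);	/* learn: index by source MAC */
	ht[sh].mac = key;
	ht[sh].ports = 7;		/* frame arrived on port 7 */

	dh = nm_bridge_rthash(mac);	/* lookup: same MAC as destination */
	printf("bucket %u -> port %llu\n", dh,
	    ht[dh].mac == key ? (unsigned long long)ht[dh].ports : ~0ULL);
	return 0;
}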
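The first pass of nm_bdg_flush() sorts the batch into per-(port, ring) queues without moving any data: each queue is a singly linked list threaded through the ft[] entries via ft_next, with bq_head/bq_tail holding the list ends and the scratch index computed as dst_port * NM_BDG_MAXRINGS + dst_ring. The compact illustration below replays that linking with invented sizes and a fake classifier in place of nm_bdg_lookup().

/* Illustration of the pass-1 queueing in nm_bdg_flush(): link batch
 * entries into per-destination lists without copying. Sizes and the
 * lookup stub are invented for the demo.
 */
#include <stdint.h>
#include <stdio.h>

#define BATCH		8
#define MAXPORTS	4
#define MAXRINGS	2
#define FT_NULL		UINT16_MAX	/* list terminator, > any index */

struct fwd { uint16_t ft_next; };
struct q { uint16_t bq_head, bq_tail, bq_len; };

int
main(void)
{
	struct fwd ft[BATCH];
	struct q dst_ents[MAXPORTS * MAXRINGS];
	uint16_t i, d_i;

	for (d_i = 0; d_i < MAXPORTS * MAXRINGS; d_i++) {
		dst_ents[d_i].bq_head = dst_ents[d_i].bq_tail = FT_NULL;
		dst_ents[d_i].bq_len = 0;
	}

	for (i = 0; i < BATCH; i++) {
		uint16_t dst_port = i % 3;	/* fake lookup result */
		uint8_t dst_ring = 0;
		struct q *d;

		ft[i].ft_next = FT_NULL;
		d_i = dst_port * MAXRINGS + dst_ring;
		d = &dst_ents[d_i];
		if (d->bq_head == FT_NULL) {	/* new destination */
			d->bq_head = d->bq_tail = i;
		} else {			/* append to the list */
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len++;
	}
	/* walk the queue for port 1, ring 0: entries 1, 4, 7 */
	d_i = 1 * MAXRINGS + 0;
	for (i = dst_ents[d_i].bq_head; i != FT_NULL; i = ft[i].ft_next)
		printf("port 1 gets batch entry %u\n", i);
	return 0;
}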
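When source and destination ports use different offsets (see NETMAP_BDG_OFFSET above), the copy loop in nm_bdg_flush() adjusts only the first fragment: a larger source offset strips bytes from the front of the source, a smaller one zero-fills the gap in the destination. The standalone sketch below replays that arithmetic on ordinary buffers so the two cases can be checked in isolation; buffer contents and names are demo inventions, and the round-to-64 step is omitted.

/* Replays the offset-mismatch adjustment from nm_bdg_flush()'s copy
 * loop on plain buffers (illustrative only).
 */
#include <stdio.h>
#include <string.h>

/* copy 'len' bytes from a port with offset s_off to a port with
 * offset d_off; returns the length stored in the destination slot.
 */
static size_t
copy_with_offset(const char *src, size_t len, size_t s_off,
    char *dst, size_t d_off)
{
	size_t copy_len = len, dst_len = len;

	if (s_off > d_off) {		/* strip the extra header room */
		src += s_off - d_off;
		copy_len -= s_off - d_off;
		dst_len = copy_len;
	} else {			/* zero-fill the missing room */
		memset(dst, 0, d_off - s_off);
		dst_len += d_off - s_off;
		dst += d_off - s_off;
	}
	memcpy(dst, src, copy_len);
	return dst_len;
}

int
main(void)
{
	char src[32] = "HHHHpayload";	/* 4 bytes of header room */
	char dst[32];
	size_t n;

	n = copy_with_offset(src, 11, 4, dst, 0);	/* offset 4 -> 0 */
	printf("len %zu data %.7s\n", n, dst);		/* len 7, payload */
	n = copy_with_offset(src + 4, 7, 0, dst, 4);	/* offset 0 -> 4 */
	printf("len %zu data %.7s\n", n, dst + 4);	/* len 11, payload */
	return 0;
}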
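The lease logic around nkr_hwlease/nkr_leases is the subtlest part of nm_bdg_flush(): several senders reserve disjoint slot ranges under q_lock, copy concurrently without the lock, then report completion in the lease array; only the sender whose range starts at the current update position may advance the visible write pointer, folding in any later leases that have already completed. The toy model below reproduces just that completion protocol over a plain array. It is single-threaded and every name and size is invented for the demo; the real code keeps the leases in the kring tailroom and runs the bookkeeping under kring->q_lock.

/* Toy model of the lease-completion protocol used by nm_bdg_flush(). */
#include <stdint.h>
#include <stdio.h>

#define NSLOTS	8
#define NOSLOT	((uint32_t)~0U)

static uint32_t leases[NSLOTS];	/* completion reports, NOSLOT = pending */
static uint32_t lease_head;	/* oldest lease not yet folded in */
static uint32_t lease_tail;	/* next lease index to hand out */
static uint32_t hwlease;	/* next slot to reserve */
static uint32_t writeptr;	/* slots below this are visible to readers */

static uint32_t
lease_alloc(uint32_t want)	/* reserve 'want' slots, return lease idx */
{
	uint32_t idx = lease_tail;

	leases[idx] = NOSLOT;
	lease_tail = (lease_tail + 1) % NSLOTS;
	hwlease = (hwlease + want) % NSLOTS;
	return idx;
}

static void
lease_done(uint32_t idx, uint32_t end)	/* copy finished at slot 'end' */
{
	leases[idx] = end;
	if (idx != lease_head)
		return;		/* an earlier lease is still pending */
	/* fold in this lease and any later ones already completed */
	while (lease_head != lease_tail && leases[lease_head] != NOSLOT) {
		writeptr = leases[lease_head];
		leases[lease_head] = NOSLOT;
		lease_head = (lease_head + 1) % NSLOTS;
	}
}

int
main(void)
{
	uint32_t a = lease_alloc(2);	/* sender A: slots 0..1 */
	uint32_t b = lease_alloc(3);	/* sender B: slots 2..4 */

	lease_done(b, 5);	/* B finishes first: nothing visible yet */
	printf("after B: writeptr %u\n", writeptr);	/* prints 0 */
	lease_done(a, 2);	/* A finishes: both ranges become visible */
	printf("after A: writeptr %u\n", writeptr);	/* prints 5 */
	return 0;
}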