aboutsummaryrefslogblamecommitdiff
path: root/sys/net/debugnet.c
blob: cefdd4f582ce4d204f2b7e52115cf787ec08ba08 (plain) (tree)
1
2
   
                                        






























                                                                             
                    





                       
                             


                       


                       




                       





                         
                           
                     

                           

                       
                           
















                                      
                                                                        














                                                                            



                                                       














                                                           















                                                           








































































                                                                               
                                                   






                                                   
   




















                                                                              
































                                                                           


                                                    
























































































                                                                               
                                           
                           

                                                            








                            








                                                                         

                          

















                                                                  
 
                                    




                                                                  













                                                                                    




                                                                         






                                                                                 
                       

         



                                                                                  

 








                                                                               

































                                                                               
                             



















                                                         


                                                                             

                       







                                                                  


                                     
















                                                                          














                                                                           
                                                     






























                                                                            

                                                                               
























                                                                             












                                                 






                                                                               
                                               
 


                          










                                                     
                                  
                                                                 

                          





                                                                              









                                                        
                                                 















                                                                         
                                                      

                                      
                                                    
                                     




                                                



                                                                            
                                     
                                       








                                                          
                                                                             
                              

                                  
                                 
                                                                            




                                       
                                           

                                                   
                      
                                                           



                                                                               

                                                                       
 
                                    






                                                                    

         
                          











                                                                                   
                                                               















                                                                        
                                                                
                                     



                                                                             
                            




                                                       
                                       










                                                                       



                                                         
                                                                        
                                                     












































































                                                                               




                                                                               


                                



                                                                              





























                                                                               































































































































































                                                                                  
                                               





















                                                                           

                                                                          


                           
                                                   
                                                     





                                                                               

                                                                     






                         
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2019 Isilon Systems, LLC.
 * Copyright (c) 2005-2014 Sandvine Incorporated. All rights reserved.
 * Copyright (c) 2000 Darrell Anderson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_inet.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/endian.h>
#include <sys/errno.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

#ifdef DDB
#include <ddb/ddb.h>
#include <ddb/db_lex.h>
#endif

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <net/vnet.h>
#include <net/route.h>
#include <net/route/nhop.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>

#include <machine/in_cksum.h>
#include <machine/pcb.h>

#include <net/debugnet.h>
#define	DEBUGNET_INTERNAL
#include <net/debugnet_int.h>

/* Register "debugnet" as a kernel feature. */
FEATURE(debugnet, "Debugnet support");

/* Parent node for the net.debugnet.* knobs declared below. */
SYSCTL_NODE(_net, OID_AUTO, debugnet, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "debugnet parameters");

/* Verbosity; debugnet_connect() logs its parameters when this is non-zero. */
unsigned debugnet_debug;
SYSCTL_UINT(_net_debugnet, OID_AUTO, debug, CTLFLAG_RWTUN,
    &debugnet_debug, 0,
    "Debug message verbosity (0: off; 1: on; 2: verbose)");

/*
 * Number of polls (each followed by a 500us delay) debugnet_send() performs
 * while waiting for ACKs before it retransmits.
 */
int debugnet_npolls = 2000;
SYSCTL_INT(_net_debugnet, OID_AUTO, npolls, CTLFLAG_RWTUN,
    &debugnet_npolls, 0,
    "Number of times to poll before assuming packet loss (0.5ms per poll)");
/* Retransmit rounds debugnet_send() attempts before returning ETIMEDOUT. */
int debugnet_nretries = 10;
SYSCTL_INT(_net_debugnet, OID_AUTO, nretries, CTLFLAG_RWTUN,
    &debugnet_nretries, 0,
    "Number of retransmit attempts before giving up");
/* FIB used by debugnet_connect() when it has to look up a route. */
int debugnet_fib = RT_DEFAULT_FIB;
SYSCTL_INT(_net_debugnet, OID_AUTO, fib, CTLFLAG_RWTUN,
    &debugnet_fib, 0,
    "Fib to use when sending dump");

/*
 * There is exactly one, statically allocated, connection.
 * g_debugnet_pcb_inuse is set by debugnet_connect() and cleared by
 * debugnet_free(); a second debugnet_connect() while it is set fails with
 * EBUSY.
 */
static bool g_debugnet_pcb_inuse;
static struct debugnet_pcb g_dnet_pcb;

/*
 * Simple accessors for opaque PCB.
 */
/*
 * Return a pointer to the first byte of the gateway MAC address held in the
 * PCB.  Asserts that 'pcb' is the live global PCB and that its state has
 * reached DN_STATE_HAVE_GW_MAC.
 */
const unsigned char *
debugnet_get_gw_mac(const struct debugnet_pcb *pcb)
{
	const unsigned char *mac;

	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
	    pcb->dp_state >= DN_STATE_HAVE_GW_MAC);

	mac = &pcb->dp_gw_mac.octet[0];
	return (mac);
}

/*
 * Return a pointer to the server address stored in the PCB.  Asserts that
 * 'pcb' is the live global PCB and that its state has reached
 * DN_STATE_GOT_HERALD_PORT.
 */
const in_addr_t *
debugnet_get_server_addr(const struct debugnet_pcb *pcb)
{
	const in_addr_t *server;

	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
	    pcb->dp_state >= DN_STATE_GOT_HERALD_PORT);

	server = &pcb->dp_server;
	return (server);
}

/*
 * Return the server UDP port stored in the PCB.  Asserts that 'pcb' is the
 * live global PCB and that its state has reached DN_STATE_GOT_HERALD_PORT.
 */
const uint16_t
debugnet_get_server_port(const struct debugnet_pcb *pcb)
{
	uint16_t port;

	MPASS(g_debugnet_pcb_inuse && pcb == &g_dnet_pcb &&
	    pcb->dp_state >= DN_STATE_GOT_HERALD_PORT);

	port = pcb->dp_server_port;
	return (port);
}

/*
 * Start of network primitives, beginning with output primitives.
 */

/*
 * Prepend an ethernet header to an outgoing packet and hand the frame to the
 * interface's debugnet transmit method.
 *
 * Parameters:
 *	m	The mbuf holding the packet; it is freed here when the
 *		interface is not usable, otherwise it is passed on to the
 *		driver's dn_transmit method
 *	ifp	The interface to send on; its link-level address becomes the
 *		source address
 *	dst	The destination ethernet address
 *	etype	The ETHERTYPE_* value, in host byte order
 *
 * Returns:
 *	int	ENETDOWN if the interface is not up and running, ENOBUFS if
 *		the header could not be prepended, otherwise whatever
 *		dn_transmit returns
 */
int
debugnet_ether_output(struct mbuf *m, struct ifnet *ifp, struct ether_addr dst,
    u_short etype)
{
	struct ether_header *hdr;
	bool if_up, drv_running;

	/* IFF_UP must be set and IFF_MONITOR clear; the driver must run. */
	if_up = (ifp->if_flags & (IFF_MONITOR | IFF_UP)) == IFF_UP;
	drv_running =
	    (ifp->if_drv_flags & IFF_DRV_RUNNING) == IFF_DRV_RUNNING;
	if (!if_up || !drv_running) {
		if_printf(ifp, "%s: interface isn't up\n", __func__);
		m_freem(m);
		return (ENETDOWN);
	}

	/* Make room for, then fill in, the ethernet header. */
	M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
	if (m == NULL) {
		printf("%s: out of mbufs\n", __func__);
		return (ENOBUFS);
	}

	hdr = mtod(m, struct ether_header *);
	hdr->ether_type = htons(etype);
	memcpy(hdr->ether_dhost, dst.octet, ETHER_ADDR_LEN);
	memcpy(hdr->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);

	return (ifp->if_debugnet_methods->dn_transmit(ifp, m));
}

/*
 * Unreliable transmission of an mbuf chain to the debugnet server: prepend a
 * UDP header and pass the packet to debugnet_ip_output().
 * Note: can't handle fragmentation; fails if the packet is larger than
 *	 ifp->if_mtu after adding the UDP/IP headers
 *
 * Parameters:
 *	pcb	The debugnet context block; must already hold the gateway MAC
 *	m	mbuf chain carrying the UDP payload
 *
 * Returns:
 *	int	ENOBUFS if the header could not be prepended, otherwise the
 *		result of debugnet_ip_output()
 */
static int
debugnet_udp_output(struct debugnet_pcb *pcb, struct mbuf *m)
{
	struct udphdr *uh;

	MPASS(pcb->dp_state >= DN_STATE_HAVE_GW_MAC);

	M_PREPEND(m, sizeof(*uh), M_NOWAIT);
	if (m == NULL) {
		printf("%s: out of mbufs\n", __func__);
		return (ENOBUFS);
	}

	uh = mtod(m, void *);
	/* Use this src port so that the server can connect() the socket */
	uh->uh_sport = htons(pcb->dp_client_port);
	uh->uh_dport = htons(pcb->dp_server_port);
	/* The packet length now includes the UDP header itself. */
	uh->uh_ulen = htons(m->m_pkthdr.len);
	/* Computed later (protocol-dependent). */
	uh->uh_sum = 0;

	return (debugnet_ip_output(pcb, m));
}

/*
 * Send an acknowledgement for the given sequence number to the peer.
 *
 * Parameters:
 *	pcb	The debugnet context block
 *	seqno	Sequence number to acknowledge, already in network byte order
 *
 * Returns:
 *	int	ENOBUFS if no mbuf is available, otherwise the result of
 *		debugnet_udp_output()
 */
int
debugnet_ack_output(struct debugnet_pcb *pcb, uint32_t seqno /* net endian */)
{
	struct debugnet_ack *ack;
	struct mbuf *pkt;

	DNETDEBUG("Acking with seqno %u\n", ntohl(seqno));

	pkt = m_gethdr(M_NOWAIT, MT_DATA);
	if (pkt == NULL) {
		printf("%s: Out of mbufs\n", __func__);
		return (ENOBUFS);
	}

	/* The ACK structure is the entire payload. */
	pkt->m_pkthdr.len = sizeof(*ack);
	pkt->m_len = sizeof(*ack);
	MH_ALIGN(pkt, sizeof(*ack));

	ack = mtod(pkt, void *);
	ack->da_seqno = seqno;

	return (debugnet_udp_output(pcb, pkt));
}

/*
 * Dummy free function for debugnet clusters.
 *
 * debugnet_send() attaches the caller's data buffer to an mbuf as external
 * storage and registers this routine as its free callback; it intentionally
 * does nothing.
 */
static void
debugnet_mbuf_free(struct mbuf *m __unused)
{
}

/*
 * Construct and reliably send a debugnet packet.  May fail from a resource
 * shortage or extreme number of unacknowledged retransmissions.  Wait for
 * an acknowledgement before returning.  Splits packets into chunks small
 * enough to be sent without fragmentation (looks up the interface MTU)
 *
 * Every chunk gets its own sequence number (dp_seqno + chunk index) and a bit
 * in a 64-bit mask of outstanding ACKs; a message that would need more chunks
 * than the mask has bits cannot be tracked and is rejected.
 *
 * Parameters:
 *	pcb	The debugnet context block
 *	type	debugnet packet type (HERALD, FINISHED, ...)
 *	data	data
 *	datalen	data size (bytes); zero sends a single header-only packet
 *	auxdata	optional auxiliary information
 *
 * Returns:
 *	int see errno.h, 0 for success.  ECONNRESET if the remote closed the
 *	connection, ENOBUFS on mbuf shortage, EMSGSIZE if the message needs
 *	more packets than can be tracked, ETIMEDOUT once the retransmit budget
 *	is spent, or an error from debugnet_udp_output().
 */
int
debugnet_send(struct debugnet_pcb *pcb, uint32_t type, const void *data,
    uint32_t datalen, const struct debugnet_proto_aux *auxdata)
{
	struct debugnet_msg_hdr *dn_msg_hdr;
	struct mbuf *m, *m2;
	uint64_t want_acks;
	uint32_t i, pktlen, sent_so_far;
	int retries, polls, error;
	/* Number of packets the ACK bitmask can describe. */
	const uint32_t ack_bits = sizeof(want_acks) * 8;

	if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
		return (ECONNRESET);

	want_acks = 0;
	pcb->dp_rcvd_acks = 0;
	retries = 0;

retransmit:
	/* Chunks can be too big to fit in packets. */
	for (i = sent_so_far = 0; sent_so_far < datalen ||
	    (i == 0 && datalen == 0); i++) {
		/*
		 * Packet 'i' is tracked as bit 'i' of the ACK masks.  Shifting
		 * by the mask width or more is undefined, so refuse to send a
		 * packet that cannot be tracked.
		 */
		if (i >= ack_bits) {
			printf("%s: message does not fit in %u packets\n",
			    __func__, ack_bits);
			return (EMSGSIZE);
		}

		pktlen = datalen - sent_so_far;

		/* Bound: the interface MTU (assume no IP options). */
		pktlen = min(pktlen, pcb->dp_ifp->if_mtu -
		    sizeof(struct udpiphdr) - sizeof(struct debugnet_msg_hdr));

		/*
		 * Check if it is retransmitting and this has been ACKed
		 * already.
		 */
		if ((pcb->dp_rcvd_acks & ((uint64_t)1 << i)) != 0) {
			sent_so_far += pktlen;
			continue;
		}

		/*
		 * Get and fill a header mbuf, then chain data as an extended
		 * mbuf.
		 */
		m = m_gethdr(M_NOWAIT, MT_DATA);
		if (m == NULL) {
			printf("%s: Out of mbufs\n", __func__);
			return (ENOBUFS);
		}
		m->m_len = sizeof(struct debugnet_msg_hdr);
		m->m_pkthdr.len = sizeof(struct debugnet_msg_hdr);
		MH_ALIGN(m, sizeof(struct debugnet_msg_hdr));
		dn_msg_hdr = mtod(m, struct debugnet_msg_hdr *);
		dn_msg_hdr->mh_seqno = htonl(pcb->dp_seqno + i);
		dn_msg_hdr->mh_type = htonl(type);
		dn_msg_hdr->mh_len = htonl(pktlen);

		if (auxdata != NULL) {
			dn_msg_hdr->mh_offset =
			    htobe64(auxdata->dp_offset_start + sent_so_far);
			dn_msg_hdr->mh_aux2 = htobe32(auxdata->dp_aux2);
		} else {
			dn_msg_hdr->mh_offset = htobe64(sent_so_far);
			dn_msg_hdr->mh_aux2 = 0;
		}

		if (pktlen != 0) {
			m2 = m_get(M_NOWAIT, MT_DATA);
			if (m2 == NULL) {
				m_freem(m);
				printf("%s: Out of mbufs\n", __func__);
				return (ENOBUFS);
			}
			/*
			 * Reference the caller's buffer in place instead of
			 * copying it; debugnet_mbuf_free() is a no-op.
			 */
			MEXTADD(m2, __DECONST(char *, data) + sent_so_far,
			    pktlen, debugnet_mbuf_free, NULL, NULL, 0,
			    EXT_DISPOSABLE);
			m2->m_len = pktlen;

			m_cat(m, m2);
			m->m_pkthdr.len += pktlen;
		}
		error = debugnet_udp_output(pcb, m);
		if (error != 0)
			return (error);

		/* Note that we're waiting for this packet in the bitfield. */
		want_acks |= ((uint64_t)1 << i);
		sent_so_far += pktlen;
	}
	if (i >= DEBUGNET_MAX_IN_FLIGHT)
		printf("Warning: Sent more than %d packets (%u). "
		    "Acknowledgements will fail unless the size of "
		    "rcvd_acks/want_acks is increased.\n",
		    DEBUGNET_MAX_IN_FLIGHT, i);

	/*
	 * Wait for acks.  A *real* window would speed things up considerably.
	 */
	polls = 0;
	while (pcb->dp_rcvd_acks != want_acks) {
		if (polls++ > debugnet_npolls) {
			if (retries++ > debugnet_nretries)
				return (ETIMEDOUT);
			printf(". ");
			goto retransmit;
		}
		debugnet_network_poll(pcb);
		DELAY(500);
		if (pcb->dp_state == DN_STATE_REMOTE_CLOSED)
			return (ECONNRESET);
	}
	/* Everything was acknowledged; consume the sequence numbers. */
	pcb->dp_seqno += i;
	return (0);
}

/*
 * Network input primitives.
 */

/*
 * Just introspect the header enough to fire off a seqno ack and validate
 * length fits.
 *
 * DEBUGNET_DATA messages have their header stripped and are passed to the
 * PCB's rx handler; an ACK is sent only if the handler returns 0.  A
 * DEBUGNET_FINISHED message moves the PCB to DN_STATE_REMOTE_CLOSED and
 * invokes the finish handler, if one is set.  Any other type is ignored.
 *
 * Parameters:
 *	pcb	The debugnet context block
 *	mb	In/out pointer to the packet, positioned at the debugnet
 *		header; updated (possibly to NULL) if m_pullup() had to
 *		rearrange the chain
 */
static void
debugnet_handle_rx_msg(struct debugnet_pcb *pcb, struct mbuf **mb)
{
	const struct debugnet_msg_hdr *dnh;
	struct mbuf *m;
	uint32_t hdr_type;
	uint32_t seqno;
	int error;

	m = *mb;

	if (m->m_pkthdr.len < sizeof(*dnh)) {
		DNETDEBUG("ignoring small debugnet_msg packet\n");
		return;
	}

	/* Get ND header. */
	if (m->m_len < sizeof(*dnh)) {
		m = m_pullup(m, sizeof(*dnh));
		*mb = m;
		if (m == NULL) {
			DNETDEBUG("m_pullup failed\n");
			return;
		}
	}

	dnh = mtod(m, const void *);
	/*
	 * The packet is known to be at least sizeof(*dnh) long, so subtract
	 * on the right-hand side: adding the header size to the untrusted
	 * 32-bit mh_len could wrap where size_t is 32 bits wide.
	 */
	if (ntohl(dnh->mh_len) > m->m_pkthdr.len - sizeof(*dnh)) {
		DNETDEBUG("Dropping short packet.\n");
		return;
	}

	hdr_type = ntohl(dnh->mh_type);
	if (hdr_type != DEBUGNET_DATA) {
		if (hdr_type == DEBUGNET_FINISHED) {
			printf("Remote shut down the connection on us!\n");
			pcb->dp_state = DN_STATE_REMOTE_CLOSED;
			if (pcb->dp_finish_handler != NULL) {
				pcb->dp_finish_handler();
			}
		} else {
			DNETDEBUG("Got unexpected debugnet message %u\n", hdr_type);
		}
		return;
	}

	/*
	 * If the issue is transient (ENOBUFS), sender should resend.  If
	 * non-transient (like driver objecting to rx -> tx from the same
	 * thread), not much else we can do.
	 */
	seqno = dnh->mh_seqno; /* net endian */
	m_adj(m, sizeof(*dnh));
	/* The header has been trimmed off; don't touch it again. */
	dnh = NULL;
	error = pcb->dp_rx_handler(m);
	if (error != 0) {
		DNETDEBUG("RX handler was not able to accept message, error %d. "
		    "Skipping ack.\n", error);
		return;
	}

	error = debugnet_ack_output(pcb, seqno);
	if (error != 0) {
		DNETDEBUG("Couldn't ACK rx packet %u; %d\n", ntohl(seqno), error);
	}
}

/*
 * Process a received ACK: learn the server's port from the first ACK and
 * record the acknowledged sequence number in the PCB's ACK bitmask.
 *
 * Parameters:
 *	pcb	The debugnet context block
 *	mb	In/out pointer to the packet, positioned at the ACK payload;
 *		updated (possibly to NULL) if m_pullup() had to rearrange the
 *		chain
 *	sport	UDP source port of the packet, host byte order
 */
static void
debugnet_handle_ack(struct debugnet_pcb *pcb, struct mbuf **mb, uint16_t sport)
{
	const struct debugnet_ack *dn_ack;
	struct mbuf *m;
	uint32_t rcv_ackno;

	m = *mb;

	/* Get Ack. */
	if (m->m_len < sizeof(*dn_ack)) {
		m = m_pullup(m, sizeof(*dn_ack));
		*mb = m;
		if (m == NULL) {
			DNETDEBUG("m_pullup failed\n");
			return;
		}
	}
	dn_ack = mtod(m, const void *);

	/* Debugnet processing. */
	/*
	 * Packet is meant for us.  Extract the ack sequence number and the
	 * port number if necessary.
	 */
	rcv_ackno = ntohl(dn_ack->da_seqno);
	if (pcb->dp_state < DN_STATE_GOT_HERALD_PORT) {
		pcb->dp_server_port = sport;
		pcb->dp_state = DN_STATE_GOT_HERALD_PORT;
	}
	if (rcv_ackno >= pcb->dp_seqno + DEBUGNET_MAX_IN_FLIGHT)
		printf("%s: ACK %u too far in future!\n", __func__, rcv_ackno);
	else if (rcv_ackno >= pcb->dp_seqno) {
		/*
		 * We're interested in this ack. Record it.  The mask is
		 * compared against a 64-bit value in debugnet_send(), so the
		 * shift must be done in 64 bits: an int-typed "1 << n" is
		 * undefined for n >= 31.
		 */
		pcb->dp_rcvd_acks |=
		    (uint64_t)1 << (rcv_ackno - pcb->dp_seqno);
	}
}

/*
 * Validate the UDP header of a received packet and dispatch its payload: a
 * payload exactly the size of a debugnet ACK goes to debugnet_handle_ack(),
 * anything else goes to debugnet_handle_rx_msg() when an rx handler is
 * configured.
 *
 * Parameters:
 *	pcb	The debugnet context block
 *	mb	In/out pointer to the packet, positioned at the UDP header;
 *		updated (possibly to NULL) if m_pullup() had to rearrange the
 *		chain
 */
void
debugnet_handle_udp(struct debugnet_pcb *pcb, struct mbuf **mb)
{
	const struct udphdr *udp;
	struct mbuf *m;
	uint16_t sport, ulen;

	/* UDP processing. */

	m = *mb;
	if (m->m_pkthdr.len < sizeof(*udp)) {
		DNETDEBUG("ignoring small UDP packet\n");
		return;
	}

	/* Get UDP headers. */
	if (m->m_len < sizeof(*udp)) {
		m = m_pullup(m, sizeof(*udp));
		*mb = m;
		if (m == NULL) {
			DNETDEBUG("m_pullup failed\n");
			return;
		}
	}
	udp = mtod(m, const void *);

	/* We expect to receive UDP packets on the configured client port. */
	if (ntohs(udp->uh_dport) != pcb->dp_client_port) {
		DNETDEBUG("not on the expected port.\n");
		return;
	}

	ulen = ntohs(udp->uh_ulen);
	/*
	 * The advertised length covers the UDP header itself.  Reject
	 * anything smaller, otherwise the subtraction below would wrap.
	 */
	if (ulen < sizeof(*udp)) {
		DNETDEBUG("ignoring UDP packet with bogus length\n");
		return;
	}

	/* Check that ulen does not exceed actual size of data. */
	if (m->m_pkthdr.len < ulen) {
		DNETDEBUG("ignoring runt UDP packet\n");
		return;
	}

	sport = ntohs(udp->uh_sport);

	/* Strip the UDP header; ulen becomes the payload length. */
	m_adj(m, sizeof(*udp));
	ulen -= sizeof(*udp);

	if (ulen == sizeof(struct debugnet_ack)) {
		debugnet_handle_ack(pcb, mb, sport);
		return;
	}

	if (pcb->dp_rx_handler == NULL) {
		if (ulen < sizeof(struct debugnet_ack))
			DNETDEBUG("ignoring small ACK packet\n");
		else
			DNETDEBUG("ignoring unexpected non-ACK packet on "
			    "half-duplex connection.\n");
		return;
	}

	debugnet_handle_rx_msg(pcb, mb);
}

/*
 * Handler for incoming packets directly from the network adapter
 * Identifies the packet type (IP or ARP) and passes it along to one of the
 * helper functions debugnet_handle_ip or debugnet_handle_arp.
 *
 * It needs to partially replicate the behaviour of ether_input() and
 * ether_demux().
 *
 * Parameters:
 *	ifp	the interface the packet came from
 *	m	an mbuf containing the packet received
 */
static void
debugnet_input_one(struct ifnet *ifp, struct mbuf *m)
{
	struct ifreq ifr;
	struct ether_header *eh;
	u_short etype;

	/* Ethernet processing. */
	if ((m->m_flags & M_PKTHDR) == 0) {
		DNETDEBUG_IF(ifp, "discard frame without packet header\n");
		goto done;
	}
	if (m->m_len < ETHER_HDR_LEN) {
		DNETDEBUG_IF(ifp,
	    "discard frame without leading eth header (len %u pktlen %u)\n",
		    m->m_len, m->m_pkthdr.len);
		goto done;
	}
	if ((m->m_flags & M_HASFCS) != 0) {
		m_adj(m, -ETHER_CRC_LEN);
		m->m_flags &= ~M_HASFCS;
	}
	eh = mtod(m, struct ether_header *);
	etype = ntohs(eh->ether_type);
	if ((m->m_flags & M_VLANTAG) != 0 || etype == ETHERTYPE_VLAN) {
		DNETDEBUG_IF(ifp, "ignoring vlan packets\n");
		goto done;
	}
	if (if_gethwaddr(ifp, &ifr) != 0) {
		DNETDEBUG_IF(ifp, "failed to get hw addr for interface\n");
		goto done;
	}
	if (memcmp(ifr.ifr_addr.sa_data, eh->ether_dhost,
	    ETHER_ADDR_LEN) != 0 &&
	    (etype != ETHERTYPE_ARP || !ETHER_IS_BROADCAST(eh->ether_dhost))) {
		DNETDEBUG_IF(ifp,
		    "discard frame with incorrect destination addr\n");
		goto done;
	}

	MPASS(g_debugnet_pcb_inuse);

	/* Done ethernet processing. Strip off the ethernet header. */
	m_adj(m, ETHER_HDR_LEN);
	switch (etype) {
	case ETHERTYPE_ARP:
		debugnet_handle_arp(&g_dnet_pcb, &m);
		break;
	case ETHERTYPE_IP:
		debugnet_handle_ip(&g_dnet_pcb, &m);
		break;
	default:
		DNETDEBUG_IF(ifp, "dropping unknown ethertype %hu\n", etype);
		break;
	}
done:
	if (m != NULL)
		m_freem(m);
}

/*
 * Receive callback installed on the interface while debugnet is active.
 * Walks the m_nextpkt list, detaching each packet before handing it to
 * debugnet_input_one().  The list must contain at least one packet.
 */
static void
debugnet_input(struct ifnet *ifp, struct mbuf *m)
{
	struct mbuf *pkt, *rest;

	pkt = m;
	for (;;) {
		/* Unlink first: debugnet_input_one() disposes of the packet. */
		rest = pkt->m_nextpkt;
		pkt->m_nextpkt = NULL;
		debugnet_input_one(ifp, pkt);
		if (rest == NULL)
			break;
		pkt = rest;
	}
}

/*
 * Network polling primitive.
 *
 * Rather than relying on the regular network stack, call the dn_poll method
 * of the connection's interface directly to pick up packets.
 */
void
debugnet_network_poll(struct debugnet_pcb *pcb)
{
	struct ifnet *nic = pcb->dp_ifp;

	nic->if_debugnet_methods->dn_poll(nic, 1000);
}

/*
 * Start of consumer API surface.
 */
/*
 * Tear down the debugnet connection: give the interface back its original
 * input routine, signal DEBUGNET_END to the driver if DEBUGNET_START had been
 * signalled, release the debugnet mbuf state and mark the global PCB unused.
 *
 * Parameters:
 *	pcb	Must be the global PCB
 */
void
debugnet_free(struct debugnet_pcb *pcb)
{
	struct ifnet *nic;

	MPASS(pcb == &g_dnet_pcb);
	MPASS(pcb->dp_drv_input == NULL || g_debugnet_pcb_inuse);

	nic = pcb->dp_ifp;
	/* Undo the if_input override made by debugnet_connect(). */
	if (nic != NULL && pcb->dp_drv_input != NULL)
		nic->if_input = pcb->dp_drv_input;
	if (nic != NULL && pcb->dp_event_started)
		nic->if_debugnet_methods->dn_event(nic, DEBUGNET_END);

	debugnet_mbuf_finish();

	g_debugnet_pcb_inuse = false;
	/* Poison the PCB contents with a recognizable byte pattern. */
	memset(&g_dnet_pcb, 0xfd, sizeof(g_dnet_pcb));
}

/*
 * Establish the (single) debugnet connection described by 'dcp'.
 *
 * Initializes the global PCB, fills in any missing client address, gateway
 * or interface from a route lookup towards the server, takes over the
 * interface's input routine, resolves the gateway/server MAC address and
 * finally sends the HERALD message.
 *
 * Parameters:
 *	dcp	Connection parameters; dc_client, dc_gateway and dc_ifp may
 *		be left as INADDR_ANY / NULL to have them inferred
 *	pcb_out	Set to the connected PCB on success only
 *
 * Returns:
 *	int	0 on success.  EBUSY if a connection already exists, ENOENT if
 *		no route to the server was found, ENODEV if the interface
 *		lacks debugnet support, ENXIO if the interface is down, or the
 *		error from debugnet_arp_gw() / debugnet_send().  On every
 *		failure after the EBUSY check the PCB is released through
 *		debugnet_free().
 */
int
debugnet_connect(const struct debugnet_conn_params *dcp,
    struct debugnet_pcb **pcb_out)
{
	struct debugnet_proto_aux herald_auxdata;
	struct debugnet_pcb *pcb;
	struct ifnet *ifp;
	int error;

	if (g_debugnet_pcb_inuse) {
		printf("%s: Only one connection at a time.\n", __func__);
		return (EBUSY);
	}

	/* Fields not named here are zero-initialized by the literal. */
	pcb = &g_dnet_pcb;
	*pcb = (struct debugnet_pcb) {
		.dp_state = DN_STATE_INIT,
		.dp_client = dcp->dc_client,
		.dp_server = dcp->dc_server,
		.dp_gateway = dcp->dc_gateway,
		.dp_server_port = dcp->dc_herald_port,	/* Initially */
		.dp_client_port = dcp->dc_client_port,
		.dp_seqno = 1,
		.dp_ifp = dcp->dc_ifp,
		.dp_rx_handler = dcp->dc_rx_handler,
		.dp_drv_input = NULL,
	};

	/* Switch to the debugnet mbuf zones. */
	debugnet_mbuf_start();

	/* At least one needed parameter is missing; infer it. */
	if (pcb->dp_client == INADDR_ANY || pcb->dp_gateway == INADDR_ANY ||
	    pcb->dp_ifp == NULL) {
		struct sockaddr_in dest_sin, *gw_sin, *local_sin;
		struct ifnet *rt_ifp;
		struct nhop_object *nh;

		memset(&dest_sin, 0, sizeof(dest_sin));
		dest_sin = (struct sockaddr_in) {
			.sin_len = sizeof(dest_sin),
			.sin_family = AF_INET,
			.sin_addr.s_addr = pcb->dp_server,
		};

		/* Look the server up in the net.debugnet.fib table of vnet0. */
		CURVNET_SET(vnet0);
		nh = fib4_lookup_debugnet(debugnet_fib, dest_sin.sin_addr, 0,
		    NHR_NONE);
		CURVNET_RESTORE();

		if (nh == NULL) {
			printf("%s: Could not get route for that server.\n",
			    __func__);
			error = ENOENT;
			goto cleanup;
		}

		/* TODO support AF_INET6 */
		if (nh->gw_sa.sa_family == AF_INET)
			gw_sin = &nh->gw4_sa;
		else {
			if (nh->gw_sa.sa_family == AF_LINK)
				DNETDEBUG("Destination address is on link.\n");
			gw_sin = NULL;
		}

		MPASS(nh->nh_ifa->ifa_addr->sa_family == AF_INET);
		local_sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;

		rt_ifp = nh->nh_ifp;

		/* Only fill in what the caller left unspecified. */
		if (pcb->dp_client == INADDR_ANY)
			pcb->dp_client = local_sin->sin_addr.s_addr;
		if (pcb->dp_gateway == INADDR_ANY && gw_sin != NULL)
			pcb->dp_gateway = gw_sin->sin_addr.s_addr;
		if (pcb->dp_ifp == NULL)
			pcb->dp_ifp = rt_ifp;
	}

	ifp = pcb->dp_ifp;

	if (debugnet_debug > 0) {
		char serbuf[INET_ADDRSTRLEN], clibuf[INET_ADDRSTRLEN],
		    gwbuf[INET_ADDRSTRLEN];
		inet_ntop(AF_INET, &pcb->dp_server, serbuf, sizeof(serbuf));
		inet_ntop(AF_INET, &pcb->dp_client, clibuf, sizeof(clibuf));
		/* gwbuf is only formatted, and only printed, with a gateway. */
		if (pcb->dp_gateway != INADDR_ANY)
			inet_ntop(AF_INET, &pcb->dp_gateway, gwbuf, sizeof(gwbuf));
		DNETDEBUG("Connecting to %s:%d%s%s from %s:%d on %s\n",
		    serbuf, pcb->dp_server_port,
		    (pcb->dp_gateway == INADDR_ANY) ? "" : " via ",
		    (pcb->dp_gateway == INADDR_ANY) ? "" : gwbuf,
		    clibuf, pcb->dp_client_port, if_name(ifp));
	}

	/* Validate iface is online and supported. */
	if (!DEBUGNET_SUPPORTED_NIC(ifp)) {
		printf("%s: interface '%s' does not support debugnet\n",
		    __func__, if_name(ifp));
		error = ENODEV;
		goto cleanup;
	}
	if ((if_getflags(ifp) & IFF_UP) == 0) {
		printf("%s: interface '%s' link is down\n", __func__,
		    if_name(ifp));
		error = ENXIO;
		goto cleanup;
	}

	/* Remember that START was signalled so debugnet_free() sends END. */
	ifp->if_debugnet_methods->dn_event(ifp, DEBUGNET_START);
	pcb->dp_event_started = true;

	/*
	 * We maintain the invariant that g_debugnet_pcb_inuse is always true
	 * while the debugnet ifp's if_input is overridden with
	 * debugnet_input().
	 */
	g_debugnet_pcb_inuse = true;

	/* Make the card use *our* receive callback. */
	pcb->dp_drv_input = ifp->if_input;
	ifp->if_input = debugnet_input;

	/*
	 * NOTE(review): this message tests the caller-supplied dc_gateway, not
	 * pcb->dp_gateway, so it says "server" even when a gateway was
	 * inferred from the route lookup above -- confirm whether intended.
	 */
	printf("%s: searching for %s MAC...\n", __func__,
	    (dcp->dc_gateway == INADDR_ANY) ? "server" : "gateway");

	error = debugnet_arp_gw(pcb);
	if (error != 0) {
		printf("%s: failed to locate MAC address\n", __func__);
		goto cleanup;
	}
	MPASS(pcb->dp_state == DN_STATE_HAVE_GW_MAC);

	/* Announce ourselves to the server and wait for its ACK. */
	herald_auxdata = (struct debugnet_proto_aux) {
		.dp_offset_start = dcp->dc_herald_offset,
		.dp_aux2 = dcp->dc_herald_aux2,
	};
	error = debugnet_send(pcb, DEBUGNET_HERALD, dcp->dc_herald_data,
	    dcp->dc_herald_datalen, &herald_auxdata);
	if (error != 0) {
		printf("%s: failed to herald debugnet server\n", __func__);
		goto cleanup;
	}

	*pcb_out = pcb;
	return (0);

cleanup:
	/* Restores if_input, signals END if needed, and releases the PCB. */
	debugnet_free(pcb);
	return (error);
}

/*
 * Pre-allocated dump-time mbuf tracking.
 *
 * We just track the high water mark we've ever seen and allocate appropriately
 * for that iface/mtu combo.
 *
 * The three marks are only ever raised (see dn_maybe_reinit_mbufs()) and are
 * read and written under dn_hwm_lk.
 */
static struct {
	int nmbuf;	/* largest mbuf count requested so far */
	int ncl;	/* largest cluster count requested so far */
	int clsize;	/* largest cluster size requested so far */
} dn_hwm;
static struct mtx dn_hwm_lk;
MTX_SYSINIT(debugnet_hwm_lock, &dn_hwm_lk, "Debugnet HWM lock", MTX_DEF);

/*
 * Fold the requested mbuf count, cluster count and cluster size into the
 * recorded high water marks.  If any mark was raised, re-initialize the
 * debugnet mbuf state with the (per-field) maxima seen so far.
 */
static void
dn_maybe_reinit_mbufs(int nmbuf, int ncl, int clsize)
{
	bool raised;

	mtx_lock(&dn_hwm_lk);

	raised = nmbuf > dn_hwm.nmbuf || ncl > dn_hwm.ncl ||
	    clsize > dn_hwm.clsize;

	if (dn_hwm.nmbuf < nmbuf)
		dn_hwm.nmbuf = nmbuf;
	if (dn_hwm.ncl < ncl)
		dn_hwm.ncl = ncl;
	if (dn_hwm.clsize < clsize)
		dn_hwm.clsize = clsize;

	/* Snapshot the maxima while still holding the lock. */
	nmbuf = dn_hwm.nmbuf;
	ncl = dn_hwm.ncl;
	clsize = dn_hwm.clsize;

	mtx_unlock(&dn_hwm_lk);

	if (raised)
		debugnet_mbuf_reinit(nmbuf, ncl, clsize);
}

/*
 * Ask a debugnet-capable interface for its receive ring count, cluster count
 * and cluster size, and grow the pre-allocated mbuf state if this interface
 * needs more than any seen before.  Interfaces without debugnet support are
 * ignored.
 */
void
debugnet_any_ifnet_update(struct ifnet *ifp)
{
	int clsize, nmbuf, ncl, nrxr;

	if (!DEBUGNET_SUPPORTED_NIC(ifp))
		return;

	ifp->if_debugnet_methods->dn_init(ifp, &nrxr, &ncl, &clsize);
	KASSERT(nrxr > 0, ("invalid receive ring count %d", nrxr));

	/*
	 * We need two headers per message on the transmit side. Multiply by
	 * four to give us some breathing room.
	 */
	nmbuf = (nrxr + 4) * ncl;
	ncl = ncl * nrxr;

	if (nmbuf != 0 && ncl != 0 && clsize != 0) {
		dn_maybe_reinit_mbufs(nmbuf, ncl, clsize);
		return;
	}

	/*
	 * Bandaid for drivers that (incorrectly) advertise LinkUp before their
	 * dn_init method is available.
	 */
#ifndef INVARIANTS
	if (bootverbose)
#endif
	printf("%s: Bad dn_init result from %s (ifp %p), ignoring.\n",
	    __func__, if_name(ifp), ifp);
}

/*
 * Unfortunately, the ifnet_arrival_event eventhandler hook is mostly useless
 * for us because drivers tend to if_attach before invoking DEBUGNET_SET().
 *
 * On the other hand, hooking DEBUGNET_SET() itself may still be too early,
 * because the driver is still in attach.  Since we cannot use down interfaces,
 * maybe hooking ifnet_event:IFNET_EVENT_UP is sufficient?  ... Nope, at least
 * with vtnet and dhcpclient that event just never occurs.
 *
 * So that's how I've landed on the lower level ifnet_link_event.
 */

/*
 * ifnet_link_event handler: re-evaluate the mbuf pre-allocation whenever an
 * interface reports link up; all other link states are ignored.
 */
static void
dn_ifnet_event(void *arg __unused, struct ifnet *ifp, int link_state)
{
	if (link_state != LINK_STATE_UP)
		return;

	debugnet_any_ifnet_update(ifp);
}

/* Tag returned when dn_ifnet_event() is registered in dn_evh_init(). */
static eventhandler_tag dn_attach_cookie;
/*
 * Boot-time initializer (run through the SYSINIT below): hook
 * dn_ifnet_event() onto ifnet_link_event.
 */
static void
dn_evh_init(void *ctx __unused)
{
	dn_attach_cookie = EVENTHANDLER_REGISTER(ifnet_link_event,
	    dn_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
}
SYSINIT(dn_evh_init, SI_SUB_EVENTHANDLER + 1, SI_ORDER_ANY, dn_evh_init, NULL);

/*
 * DDB parsing helpers for debugnet(4) consumers.
 */
#ifdef DDB
/* Describes one IPv4-address option accepted on the DDB command line. */
struct my_inet_opt {
	bool has_opt;		/* set once the address was parsed */
	const char *printname;	/* option name used in diagnostics */
	in_addr_t *result;	/* where the parsed address is stored */
};

/*
 * Lex a dotted-quad IPv4 address ("a.b.c.d", decimal octets) from the DDB
 * input stream.
 *
 * On success the address is stored through opt->result in network byte
 * order, opt->has_opt is set, and 0 is returned.  On failure a diagnostic is
 * printed and neither field is modified: EINVAL is returned for an unexpected
 * token, EDOM for an octet larger than UINT8_MAX.
 */
static int
dn_parse_optarg_ipv4(struct my_inet_opt *opt)
{
	in_addr_t addr;
	unsigned octet;
	int tok;

	addr = 0;
	for (octet = 0; octet < 4; octet++) {
		tok = db_read_token_flags(DRT_WSPACE | DRT_DECIMAL);
		if (tok != tNUMBER) {
			db_printf("%s:%s: octet %u expected number; found %d\n",
			    __func__, opt->printname, octet, tok);
			return (EINVAL);
		}

		/*
		 * A leading '-' is lexed as its own token, so the number
		 * token can never be negative; assert that assumption.
		 */
		MPASS(db_tok_number >= 0);

		if (db_tok_number > UINT8_MAX) {
			db_printf("%s:%s: octet %u out of range: %jd\n",
			    __func__, opt->printname, octet,
			    (intmax_t)db_tok_number);
			return (EDOM);
		}

		/* Accumulate in host order; converted once at the end. */
		addr = (addr << 8) | db_tok_number;

		/* The final octet is not followed by a separator. */
		if (octet == 3)
			break;

		tok = db_read_token_flags(DRT_WSPACE);
		if (tok != tDOT) {
			db_printf("%s:%s: octet %u expected '.'; found %d\n",
			    __func__, opt->printname, octet, tok);
			return (EINVAL);
		}
	}

	opt->has_opt = true;
	*opt->result = htonl(addr);
	return (0);
}

/*
 * Parse the arguments of a debugnet-based DDB command into *result.
 *
 * Recognized single-letter flags, each followed by whitespace and an
 * argument:
 *   -s <server>     IPv4 address, stored in result->dd_server (required)
 *   -g <gateway>    IPv4 address, stored in result->dd_gateway
 *   -c <localip>    IPv4 address, stored in result->dd_client
 *   -i <interface>  interface name, looked up with ifunit() in vnet0
 * "-h" and any unrecognized flag print the usage message.
 *
 * 'cmd' is used only as a prefix in diagnostics and in the usage text.
 *
 * *result is zeroed on entry.  On success, dd_has_client and dd_has_gateway
 * report whether the corresponding options were given, and dd_ifp is the
 * interface found for -i (NULL when -i was not given; no reference is taken).
 * On failure *result may hold addresses from options parsed before the error.
 *
 * Returns 0 on success.  Returns EINVAL on a syntax error or a missing -s
 * (after printing usage), ENOENT when the named interface cannot be found,
 * or the error from dn_parse_optarg_ipv4().  On every error path
 * db_skip_to_eol() is called before returning.
 */
int
debugnet_parse_ddb_cmd(const char *cmd, struct debugnet_ddb_config *result)
{
	struct ifnet *ifp;
	int t, error;
	bool want_ifp;
	char ch;

	/*
	 * Each address option writes directly into its field of *result;
	 * has_opt starts out false through the designated initializers.
	 */
	struct my_inet_opt opt_client = {
		.printname = "client",
		.result = &result->dd_client,
	},
	opt_server = {
		.printname = "server",
		.result = &result->dd_server,
	},
	opt_gateway = {
		.printname = "gateway",
		.result = &result->dd_gateway,
	},
	*cur_inet_opt;

	ifp = NULL;
	memset(result, 0, sizeof(*result));

	/*
	 * command [space] [-] [opt] [[space] [optarg]] ...
	 *
	 * db_command has already lexed 'command' for us.
	 */
	t = db_read_token_flags(DRT_WSPACE);
	if (t == tWSPACE)
		t = db_read_token_flags(DRT_WSPACE);

	/* One iteration per "-<flag> <arg>" pair; 't' is the next token. */
	while (t != tEOL) {
		if (t != tMINUS) {
			db_printf("%s: Bad syntax; expected '-', got %d\n",
			    cmd, t);
			goto usage;
		}

		t = db_read_token_flags(DRT_WSPACE);
		if (t != tIDENT) {
			db_printf("%s: Bad syntax; expected tIDENT, got %d\n",
			    cmd, t);
			goto usage;
		}

		/* Only single-character flags are accepted. */
		if (strlen(db_tok_string) > 1) {
			db_printf("%s: Bad syntax; expected single option "
			    "flag, got '%s'\n", cmd, db_tok_string);
			goto usage;
		}

		/*
		 * Select what kind of argument the flag takes: exactly one of
		 * want_ifp or cur_inet_opt is set when the switch falls out.
		 */
		want_ifp = false;
		cur_inet_opt = NULL;
		switch ((ch = db_tok_string[0])) {
		default:
			DNETDEBUG("Unexpected: '%c'\n", ch);
			/* FALLTHROUGH */
		case 'h':
			goto usage;
		case 'c':
			cur_inet_opt = &opt_client;
			break;
		case 'g':
			cur_inet_opt = &opt_gateway;
			break;
		case 's':
			cur_inet_opt = &opt_server;
			break;
		case 'i':
			want_ifp = true;
			break;
		}

		/* Whitespace between the flag and its argument is mandatory. */
		t = db_read_token_flags(DRT_WSPACE);
		if (t != tWSPACE) {
			db_printf("%s: Bad syntax; expected space after "
			    "flag %c, got %d\n", cmd, ch, t);
			goto usage;
		}

		if (want_ifp) {
			t = db_read_token_flags(DRT_WSPACE);
			if (t != tIDENT) {
				db_printf("%s: Expected interface but got %d\n",
				    cmd, t);
				goto usage;
			}

			CURVNET_SET(vnet0);
			/*
			 * We *don't* take a ref here because the only current
			 * consumer, db_netdump_cmd, does not need it.  It
			 * (somewhat redundantly) extracts the if_name(),
			 * re-lookups the ifp, and takes its own reference.
			 */
			ifp = ifunit(db_tok_string);
			CURVNET_RESTORE();
			if (ifp == NULL) {
				db_printf("Could not locate interface %s\n",
				    db_tok_string);
				error = ENOENT;
				goto cleanup;
			}
		} else {
			MPASS(cur_inet_opt != NULL);
			/* Assume IPv4 for now. */
			error = dn_parse_optarg_ipv4(cur_inet_opt);
			if (error != 0)
				goto cleanup;
		}

		/* Skip (mandatory) whitespace after option, if not EOL. */
		t = db_read_token_flags(DRT_WSPACE);
		if (t == tEOL)
			break;
		if (t != tWSPACE) {
			db_printf("%s: Bad syntax; expected space after "
			    "flag %c option; got %d\n", cmd, ch, t);
			goto usage;
		}
		/* Fetch the token that starts the next option (or tEOL). */
		t = db_read_token_flags(DRT_WSPACE);
	}

	/* The server address is the only mandatory option. */
	if (!opt_server.has_opt) {
		db_printf("%s: need a destination server address\n", cmd);
		goto usage;
	}

	result->dd_has_client = opt_client.has_opt;
	result->dd_has_gateway = opt_gateway.has_opt;
	result->dd_ifp = ifp;

	/* We parsed the full line to tEOL already, or bailed with an error. */
	return (0);

	/* Syntax errors: print the usage text and fail with EINVAL. */
usage:
	db_printf("Usage: %s -s <server> [-g <gateway> -c <localip> "
	    "-i <interface>]\n", cmd);
	error = EINVAL;
	/* FALLTHROUGH */
	/* Common error exit; 'error' has been set by the jumping code. */
cleanup:
	db_skip_to_eol();
	return (error);
}
#endif /* DDB */