52 files changed, 30566 insertions, 0 deletions
diff --git a/sys/dev/hyperv/hvsock/hv_sock.c b/sys/dev/hyperv/hvsock/hv_sock.c
new file mode 100644
index 000000000000..6d5ad4fc6609
--- /dev/null
+++ b/sys/dev/hyperv/hvsock/hv_sock.c
@@ -0,0 +1,1762 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/domain.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/sockbuf.h>
+#include <sys/sx.h>
+#include <sys/uio.h>
+
+#include <net/vnet.h>
+
+#include <dev/hyperv/vmbus/vmbus_reg.h>
+
+#include "hv_sock.h"
+
+#define HVSOCK_DBG_NONE			0x0
+#define HVSOCK_DBG_INFO			0x1
+#define HVSOCK_DBG_ERR			0x2
+#define HVSOCK_DBG_VERBOSE		0x3
+
+
+SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket");
+
+static int hvs_dbg_level;
+SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level,
+    0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose");
+
+/* printf() the message when hvs_dbg_level is at least the given level. */
+#define HVSOCK_DBG(level, ...) do {					\
+	if (hvs_dbg_level >= (level))					\
+		printf(__VA_ARGS__);					\
+	} while (0)
+
+MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures");
+
+static int hvs_dom_probe(void);
+
+/* The MTU is 16KB per host side's design */
+#define HVSOCK_MTU_SIZE		(1024 * 16)
+#define HVSOCK_SEND_BUF_SZ	(PAGE_SIZE - sizeof(struct vmpipe_proto_header))
+
+#define HVSOCK_HEADER_LEN	(sizeof(struct hvs_pkt_header))
+
+#define HVSOCK_PKT_LEN(payload_len)	(HVSOCK_HEADER_LEN + \
+					 roundup2(payload_len, 8) + \
+					 sizeof(uint64_t))
+
+
+static struct domain		hv_socket_domain;
+
+/*
+ * HyperV Transport sockets: the user-request entry points of the
+ * AF_HYPERV stream protocol.  hv_socket_protosw points at this table.
+ */
+static struct pr_usrreqs	hvs_trans_usrreqs = {
+	.pru_attach =		hvs_trans_attach,
+	.pru_bind =		hvs_trans_bind,
+	.pru_listen =		hvs_trans_listen,
+	.pru_accept =		hvs_trans_accept,
+	.pru_connect =		hvs_trans_connect,
+	.pru_peeraddr =		hvs_trans_peeraddr,
+	.pru_sockaddr =		hvs_trans_sockaddr,
+	.pru_soreceive =	hvs_trans_soreceive,
+	.pru_sosend =		hvs_trans_sosend,
+	.pru_disconnect =	hvs_trans_disconnect,
+	.pru_close =		hvs_trans_close,
+	.pru_detach =		hvs_trans_detach,
+	.pru_shutdown =		hvs_trans_shutdown,
+	.pru_abort =		hvs_trans_abort,
+};
+
+/*
+ * Definitions of protocols supported in HyperV socket domain.
+ * There is a single entry: a SOCK_STREAM transport that requires a
+ * connection (PR_CONNREQUIRED).
+ */
+static struct protosw		hv_socket_protosw[] = {
+{
+	.pr_type =		SOCK_STREAM,
+	.pr_domain =		&hv_socket_domain,
+	.pr_protocol =		HYPERV_SOCK_PROTO_TRANS,
+	.pr_flags =		PR_CONNREQUIRED,
+	.pr_init =		hvs_trans_init,
+	.pr_usrreqs =		&hvs_trans_usrreqs,
+},
+};
+
+static struct domain		hv_socket_domain = {
+	.dom_family =		AF_HYPERV,
+	.dom_name =		"hyperv",
+	.dom_probe =		hvs_dom_probe,	/* ENXIO unless on HyperV */
+	.dom_protosw =		hv_socket_protosw,
+	.dom_protoswNPROTOSW =	&hv_socket_protosw[nitems(hv_socket_protosw)]
+};
+
+VNET_DOMAIN_SET(hv_socket_);
+
+#define MAX_PORT			((uint32_t)0xFFFFFFFF)
+#define MIN_PORT			((uint32_t)0x0)
+
+/*
+ * 00000000-facb-11e6-bd58-64006a7986d3
+ *
+ * Service GUID template.  The leading bytes are zero here;
+ * set_port_by_srv_id() overwrites them with a port number and
+ * is_valid_srv_id() compares everything after the first four bytes.
+ */
+static const struct hyperv_guid srv_id_template = {
+	.hv_guid = {
+	    0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11,
+	    0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 }
+};
+
+static int		hvsock_br_callback(void *, int, void *);
+static uint32_t		hvsock_canread_check(struct hvs_pcb *);
+static uint32_t		hvsock_canwrite_check(struct hvs_pcb *);
+static int		hvsock_send_data(struct vmbus_channel *chan,
+    struct uio *uio, uint32_t to_write, struct sockbuf *sb);
+
+
+
+/* Globals */
+static struct sx		hvs_trans_socks_sx;
+static struct mtx		hvs_trans_socks_mtx;
+static LIST_HEAD(, hvs_pcb)	hvs_trans_bound_socks;
+static LIST_HEAD(, hvs_pcb)	hvs_trans_connected_socks;
+static uint32_t			previous_auto_bound_port;
+
+/*
+ * Print a GUID at HVSOCK_DBG_INFO level: one unsigned int, two unsigned
+ * shorts, then the remaining eight bytes individually.
+ */
+static void
+hvsock_print_guid(struct hyperv_guid *guid)
+{
+	unsigned char *raw;
+
+	raw = (unsigned char *)guid;
+	HVSOCK_DBG(HVSOCK_DBG_INFO,
+	    "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n",
+	    *(unsigned int *)raw,
+	    *((unsigned short *)&raw[4]), *((unsigned short *)&raw[6]),
+	    raw[8], raw[9], raw[10], raw[11],
+	    raw[12], raw[13], raw[14], raw[15]);
+}
+
+/*
+ * Return true when every byte of `id` after the first four equals the
+ * corresponding byte of srv_id_template.
+ */
+static bool
+is_valid_srv_id(const struct hyperv_guid *id)
+{
+	const size_t skip = 4;
+
+	return (memcmp(&id->hv_guid[skip], &srv_id_template.hv_guid[skip],
+	    sizeof(struct hyperv_guid) - skip) == 0);
+}
+
+/*
+ * Return the port number held in the leading bytes of a service GUID.
+ *
+ * The value is copied out with memcpy() instead of being read through a
+ * cast pointer, so nothing is assumed about the alignment of the GUID.
+ */
+static unsigned int
+get_port_by_srv_id(const struct hyperv_guid *srv_id)
+{
+	unsigned int port;
+
+	memcpy(&port, srv_id, sizeof(port));
+	return (port);
+}
+
+/*
+ * Store `port` in the leading bytes of a service GUID.
+ *
+ * The value is copied in with memcpy() instead of being written through a
+ * cast pointer, so nothing is assumed about the alignment of the GUID.
+ */
+static void
+set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port)
+{
+	memcpy(srv_id, &port, sizeof(port));
+}
+
+
+/*
+ * Unlink `pcb` from the bound and/or connected list, as selected by the
+ * HVS_LIST_* bits in `list`.  Each list is searched first, so a pcb that
+ * is not on a list is left untouched.  A NULL pcb is ignored.
+ *
+ * The callers in this file hold hvs_trans_socks_mtx around this call.
+ */
+static void
+__hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list)
+{
+	struct hvs_pcb *p = NULL;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
+
+	if (!pcb)
+		return;
+
+	if (list & HVS_LIST_BOUND) {
+		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) {
+			if (p == pcb) {
+				LIST_REMOVE(p, bound_next);
+				/*
+				 * Stop here: LIST_FOREACH would otherwise
+				 * step through the entry just unlinked.
+				 */
+				break;
+			}
+		}
+	}
+
+	if (list & HVS_LIST_CONNECTED) {
+		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) {
+			if (p == pcb) {
+				LIST_REMOVE(p, connected_next);
+				/* Same reason as above. */
+				break;
+			}
+		}
+	}
+}
+
+/* Remove the pcb attached to `so` from the lists selected by `list`. */
+static void
+__hvs_remove_socket_from_list(struct socket *so, unsigned char list)
+{
+	struct hvs_pcb *pcb;
+
+	pcb = so2hvspcb(so);
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
+	__hvs_remove_pcb_from_list(pcb, list);
+}
+
+/*
+ * Put the pcb attached to `so` at the head of the bound and/or connected
+ * list, as selected by the HVS_LIST_* bits in `list`.
+ */
+static void
+__hvs_insert_socket_on_list(struct socket *so, unsigned char list)
+{
+	struct hvs_pcb *pcb;
+
+	pcb = so2hvspcb(so);
+
+	if ((list & HVS_LIST_BOUND) != 0) {
+		LIST_INSERT_HEAD(&hvs_trans_bound_socks, pcb, bound_next);
+	}
+	if ((list & HVS_LIST_CONNECTED) != 0) {
+		LIST_INSERT_HEAD(&hvs_trans_connected_socks, pcb,
+		    connected_next);
+	}
+}
+
+/*
+ * Locked wrapper around __hvs_remove_socket_from_list().  Does nothing
+ * when `so` is NULL or has no pcb attached.
+ */
+void
+hvs_remove_socket_from_list(struct socket *so, unsigned char list)
+{
+	if (so == NULL || so->so_pcb == NULL) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: socket or so_pcb is null\n", __func__);
+	} else {
+		mtx_lock(&hvs_trans_socks_mtx);
+		__hvs_remove_socket_from_list(so, list);
+		mtx_unlock(&hvs_trans_socks_mtx);
+	}
+}
+
+/*
+ * Locked wrapper around __hvs_insert_socket_on_list().  Does nothing
+ * when `so` is NULL or has no pcb attached.
+ */
+static void
+hvs_insert_socket_on_list(struct socket *so, unsigned char list)
+{
+	if (so == NULL || so->so_pcb == NULL) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: socket or so_pcb is null\n", __func__);
+	} else {
+		mtx_lock(&hvs_trans_socks_mtx);
+		__hvs_insert_socket_on_list(so, list);
+		mtx_unlock(&hvs_trans_socks_mtx);
+	}
+}
+
+/*
+ * Look up a socket by local port.  The bound list is searched first, then
+ * the connected list, each only if selected in `list`.  Entries whose
+ * socket pointer is NULL are skipped.  Returns NULL when nothing matches.
+ */
+static struct socket *
+__hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
+{
+	struct hvs_pcb *pcb;
+
+	if ((list & HVS_LIST_BOUND) != 0) {
+		LIST_FOREACH(pcb, &hvs_trans_bound_socks, bound_next) {
+			if (pcb->so == NULL)
+				continue;
+			if (pcb->local_addr.hvs_port == addr->hvs_port)
+				return (pcb->so);
+		}
+	}
+
+	if ((list & HVS_LIST_CONNECTED) != 0) {
+		LIST_FOREACH(pcb, &hvs_trans_connected_socks,
+		    connected_next) {
+			if (pcb->so == NULL)
+				continue;
+			if (pcb->local_addr.hvs_port == addr->hvs_port)
+				return (pcb->so);
+		}
+	}
+
+	return (NULL);
+}
+
+/* Locked wrapper around __hvs_find_socket_on_list(). */
+static struct socket *
+hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
+{
+	struct socket *found;
+
+	mtx_lock(&hvs_trans_socks_mtx);
+	found = __hvs_find_socket_on_list(addr, list);
+	mtx_unlock(&hvs_trans_socks_mtx);
+
+	return (found);
+}
+
+/* Zero `addr` and fill it in as an AF_HYPERV address for `port`. */
+static inline void
+hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port)
+{
+	memset(addr, 0, sizeof(*addr));
+	addr->sa_len = sizeof(*addr);
+	addr->sa_family = AF_HYPERV;
+	addr->hvs_port = port;
+}
+
+/* Initialize `addr` from the port encoded in the service GUID `svr_id`. */
+void
+hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id)
+{
+	unsigned int port;
+
+	port = get_port_by_srv_id(svr_id);
+	hvs_addr_set(addr, port);
+}
+/* Take hvs_trans_socks_sx exclusively.  Always returns 0. */
+int
+hvs_trans_lock(void)
+{
+	sx_xlock(&hvs_trans_socks_sx);
+	return (0);
+}
+/* Release the exclusive hold on hvs_trans_socks_sx. */
+void
+hvs_trans_unlock(void)
+{
+	sx_xunlock(&hvs_trans_socks_sx);
+}
+
+/* Domain probe hook: returns 0 on a HyperV guest and ENXIO elsewhere. */
+static int
+hvs_dom_probe(void)
+{
+	/* Don't even give us a chance to attach on non-HyperV. */
+	return ((vm_guest == VM_GUEST_HV) ? 0 : ENXIO);
+}
+
+/*
+ * Protocol init hook.  Sets up the file-scope locks, socket lists and the
+ * auto-bind port cursor; does nothing for non-default vnet instances.
+ */
+void
+hvs_trans_init(void)
+{
+	/* Only the default vnet initializes the globals. */
+	if (IS_DEFAULT_VNET(curvnet)) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: HyperV Socket hvs_trans_init called\n", __func__);
+
+		LIST_INIT(&hvs_trans_bound_socks);
+		LIST_INIT(&hvs_trans_connected_socks);
+		sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx");
+		mtx_init(&hvs_trans_socks_mtx, "hvs_trans_socks_mtx", NULL,
+		    MTX_DEF);
+		previous_auto_bound_port = MAX_PORT;
+	}
+}
+
+/*
+ * Allocate a zeroed pcb and attach it to the socket.
+ *
+ * Called in two cases:
+ * 1) When user calls socket();
+ * 2) When we accept new incoming connection and call sonewconn().
+ *
+ * Returns ESOCKTNOSUPPORT for a non-stream socket, EPROTONOSUPPORT for an
+ * unknown protocol, EISCONN if a pcb is already attached and ENOMEM when
+ * the allocation fails.
+ */
+int
+hvs_trans_attach(struct socket *so, int proto, struct thread *td)
+{
+	struct hvs_pcb *pcb;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_attach called\n", __func__);
+
+	if (so->so_type != SOCK_STREAM)
+		return (ESOCKTNOSUPPORT);
+	if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS)
+		return (EPROTONOSUPPORT);
+	if (so2hvspcb(so) != NULL)
+		return (EISCONN);
+
+	pcb = malloc(sizeof(*pcb), M_HVSOCK, M_NOWAIT | M_ZERO);
+	if (pcb == NULL)
+		return (ENOMEM);
+
+	/* Link the socket and the pcb to each other. */
+	so->so_pcb = (void *)pcb;
+	pcb->so = so;
+
+	return (0);
+}
+
+/*
+ * Detach the pcb from the socket under the transport lock.
+ *
+ * NOTE(review): the pcb is freed here only for listening sockets; for
+ * other sockets it is merely unlinked, presumably because it is released
+ * elsewhere -- confirm against the rest of the driver.
+ */
+void
+hvs_trans_detach(struct socket *so)
+{
+	struct hvs_pcb *pcb;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_detach called\n", __func__);
+
+	(void) hvs_trans_lock();
+	pcb = so2hvspcb(so);
+	if (pcb != NULL) {
+		if (SOLISTENING(so)) {
+			bzero(pcb, sizeof(*pcb));
+			free(pcb, M_HVSOCK);
+		}
+		so->so_pcb = NULL;
+	}
+	hvs_trans_unlock();
+}
+
+/*
+ * Bind `so` to the local port given in `addr`.
+ *
+ * Returns EINVAL when the address or pcb is missing or sa_len is wrong,
+ * EAFNOSUPPORT for a non-AF_HYPERV address, and EADDRINUSE when a socket
+ * on the bound or connected list already uses the port.
+ */
+int
+hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td)
+{
+	struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr;
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct socket *owner;
+	int error;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_bind called\n", __func__);
+
+	if (sa == NULL || pcb == NULL)
+		return (EINVAL);
+
+	if (sa->sa_family != AF_HYPERV) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: Not supported, sa_family is %u\n",
+		    __func__, sa->sa_family);
+		return (EAFNOSUPPORT);
+	}
+	if (sa->sa_len != sizeof(*sa)) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: Not supported, sa_len is %u\n",
+		    __func__, sa->sa_len);
+		return (EINVAL);
+	}
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: binding port = 0x%x\n", __func__, sa->hvs_port);
+
+	/* Lookup and insertion happen under one hold of the list mutex. */
+	mtx_lock(&hvs_trans_socks_mtx);
+	owner = __hvs_find_socket_on_list(sa,
+	    HVS_LIST_BOUND | HVS_LIST_CONNECTED);
+	if (owner == NULL) {
+		/* The port is free: record it and join the bound list. */
+		hvs_addr_set(&pcb->local_addr, sa->hvs_port);
+		hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY);
+		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
+		error = 0;
+	} else {
+		error = EADDRINUSE;
+	}
+	mtx_unlock(&hvs_trans_socks_mtx);
+
+	return (error);
+}
+
+/*
+ * Put a bound socket into the listening state.
+ *
+ * Returns EINVAL without a pcb, EADDRNOTAVAIL when the socket's local
+ * port is not on the bound list or is held by another socket, and
+ * otherwise the result of solisten_proto_check().
+ */
+int
+hvs_trans_listen(struct socket *so, int backlog, struct thread *td)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct socket *owner;
+	int error;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_listen called\n", __func__);
+
+	if (pcb == NULL)
+		return (EINVAL);
+
+	/* The local address must be bound, and bound by this socket. */
+	owner = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND);
+	if (owner == NULL || owner != so) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: Address not bound or not by us.\n", __func__);
+		return (EADDRNOTAVAIL);
+	}
+
+	SOCK_LOCK(so);
+	error = solisten_proto_check(so);
+	if (error == 0) {
+		solisten_proto(so, backlog);
+	}
+	SOCK_UNLOCK(so);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket listen error = %d\n", __func__, error);
+	return (error);
+}
+
+/*
+ * Hand back a copy of the pcb's remote address in *nam.
+ * Returns EINVAL without a pcb and ENOMEM when the copy cannot be made.
+ */
+int
+hvs_trans_accept(struct socket *so, struct sockaddr **nam)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct sockaddr *copy;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_accept called\n", __func__);
+
+	if (pcb == NULL)
+		return (EINVAL);
+
+	copy = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT);
+	*nam = copy;
+	if (copy == NULL)
+		return (ENOMEM);
+
+	return (0);
+}
+
+/*
+ * Start an outgoing connection.
+ *
+ * Validates the remote address, auto-binds the socket to a free local
+ * port (searching downward from the previously used one), derives the VM
+ * and host service GUIDs from the two ports, marks the socket as
+ * connecting and finally calls vmbus_req_tl_connect() with both GUIDs.
+ *
+ * Returns EINVAL or EAFNOSUPPORT for a missing pcb or bad address,
+ * EINPROGRESS if the socket is already connecting, connected or
+ * disconnecting, EADDRINUSE if no local port is free, and 0 otherwise.
+ */
+int
+hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam;
+	bool found_auto_bound_port = false;
+	int i, error = 0;
+
+	if (pcb == NULL)
+		return (EINVAL);
+
+	/* Verify the remote address */
+	if (raddr == NULL)
+		return (EINVAL);
+	if (raddr->sa_family != AF_HYPERV)
+		return (EAFNOSUPPORT);
+	if (raddr->sa_len != sizeof(*raddr))
+		return (EINVAL);
+
+	/* Log only now: raddr must not be dereferenced before the checks. */
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n",
+	    __func__, raddr->hvs_port);
+
+	mtx_lock(&hvs_trans_socks_mtx);
+	if (so->so_state &
+	    (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) {
+			HVSOCK_DBG(HVSOCK_DBG_ERR,
+			    "%s: socket connect in progress\n",
+			    __func__);
+			error = EINPROGRESS;
+			goto out;
+	}
+
+	/*
+	 * Find an available port for us to auto bind the local
+	 * address.
+	 */
+	hvs_addr_set(&pcb->local_addr, 0);
+
+	for (i = previous_auto_bound_port - 1;
+	    i != previous_auto_bound_port; i --) {
+		/* Port MIN_PORT is never handed out; wrap to MAX_PORT. */
+		if (i == MIN_PORT)
+			i = MAX_PORT;
+
+		pcb->local_addr.hvs_port = i;
+
+		if (__hvs_find_socket_on_list(&pcb->local_addr,
+		    HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) {
+			found_auto_bound_port = true;
+			previous_auto_bound_port = i;
+			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+			    "%s: found local bound port is %x\n",
+			    __func__, pcb->local_addr.hvs_port);
+			break;
+		}
+	}
+
+	if (found_auto_bound_port == true) {
+		/* Found available port for auto bound, put on list */
+		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
+		/* Set VM service ID */
+		pcb->vm_srv_id = srv_id_template;
+		set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port);
+		/* Set host service ID and remote port */
+		pcb->host_srv_id = srv_id_template;
+		set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port);
+		hvs_addr_set(&pcb->remote_addr, raddr->hvs_port);
+
+		/* Change the socket state to SS_ISCONNECTING */
+		soisconnecting(so);
+	} else {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: No local port available for auto bound\n",
+		    __func__);
+		error = EADDRINUSE;
+	}
+
+	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is ");
+	hvsock_print_guid(&pcb->vm_srv_id);
+	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is ");
+	hvsock_print_guid(&pcb->host_srv_id);
+
+out:
+	mtx_unlock(&hvs_trans_socks_mtx);
+
+	/* The connect request is issued after the list mutex is dropped. */
+	if (found_auto_bound_port == true)
+		 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id);
+
+	return (error);
+}
+
+/*
+ * Mark the socket as disconnecting unless it is already disconnected.
+ * Returns EINVAL when no pcb is attached, 0 otherwise.
+ */
+int
+hvs_trans_disconnect(struct socket *so)
+{
+	int error = 0;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_disconnect called\n", __func__);
+
+	(void) hvs_trans_lock();
+	if (so2hvspcb(so) == NULL) {
+		error = EINVAL;
+	} else if ((so->so_state & SS_ISDISCONNECTED) == 0) {
+		/* Not disconnected yet, so start the disconnect. */
+		soisdisconnecting(so);
+	}
+	hvs_trans_unlock();
+
+	return (error);
+}
+/* Argument bundle handed to hvsock_br_callback(). */
+struct hvs_callback_arg {
+	struct uio *uio;	/* passed to uiomove() by the callback */
+	struct sockbuf *sb;	/* if non-NULL, unlocked around uiomove() */
+};
+/*
+ * Receive stream data into `uio`, copying straight out of the channel via
+ * vmbus_chan_recv_peek_call().  paddr, mp0 and controlp are not used.
+ *
+ * Returns EINVAL for a non-stream socket, a missing pcb, an empty uio or a
+ * uio that is not UIO_READ; EOPNOTSUPP for MSG_PEEK; EPIPE when the
+ * receive side is shut and the socket is disconnected; EAGAIN when
+ * non-blocking and nothing was read; otherwise 0 or the error reported by
+ * the lock, copy or wait calls or by so_error.
+ */
+int
+hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr,
+    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct sockbuf *sb;
+	ssize_t orig_resid;
+	uint32_t canread, to_read;
+	int flags, error = 0;
+	struct hvs_callback_arg cbarg;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_soreceive called\n", __func__);
+
+	if (so->so_type != SOCK_STREAM)
+		return (EINVAL);
+	if (pcb == NULL)
+		return (EINVAL);
+
+	if (flagsp != NULL)
+		flags = *flagsp &~ MSG_EOR;
+	else
+		flags = 0;
+
+	if (flags & MSG_PEEK)
+		return (EOPNOTSUPP);
+
+	/* If no space to copy out anything */
+	if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ)
+		return (EINVAL);
+
+	orig_resid = uio->uio_resid;
+
+	/* Prevent other readers from entering the socket. */
+	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
+	if (error) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: soiolock returned error = %d\n", __func__, error);
+		return (error);
+	}
+
+	sb = &so->so_rcv;
+	SOCKBUF_LOCK(sb);
+
+	cbarg.uio = uio;
+	cbarg.sb = sb;
+	/*
+	 * If the socket is closing, there might still be some data
+	 * in rx br to read. However we need to make sure
+	 * the channel is still open.
+	 */
+	if ((sb->sb_state & SBS_CANTRCVMORE) &&
+	    (so->so_state & SS_ISDISCONNECTED)) {
+		/* Other thread already closed the channel */
+		error = EPIPE;
+		goto out;
+	}
+
+	while (true) {
+		while (uio->uio_resid > 0 &&
+		    (canread = hvsock_canread_check(pcb)) > 0) {
+			to_read = MIN(canread, uio->uio_resid);
+			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+			    "%s: to_read = %u, skip = %u\n", __func__, to_read,
+			    (unsigned int)(sizeof(struct hvs_pkt_header) +
+			    pcb->recv_data_off));
+
+			error = vmbus_chan_recv_peek_call(pcb->chan, to_read,
+			    sizeof(struct hvs_pkt_header) + pcb->recv_data_off,
+			    hvsock_br_callback, (void *)&cbarg);
+			/*
+			 * It is possible the socket got disconnected because
+			 * we released the lock in hvsock_br_callback. So we
+			 * need to check the state to make sure it is not
+			 * disconnected.
+			 */
+			if (error || so->so_state & SS_ISDISCONNECTED) {
+				break;
+			}
+
+			pcb->recv_data_len -= to_read;
+			pcb->recv_data_off += to_read;
+		}
+
+		if (error)
+			break;
+
+		/* Abort if socket has reported problems. */
+		if (so->so_error) {
+			if (so->so_error == ESHUTDOWN &&
+			    orig_resid > uio->uio_resid) {
+				/*
+				 * Although we got a FIN, we also received
+				 * some data in this round. Deliver it
+				 * to the user.
+				 */
+				error = 0;
+			} else {
+				if (so->so_error != ESHUTDOWN)
+					error = so->so_error;
+			}
+
+			break;
+		}
+
+		/* Cannot receive more. */
+		if (sb->sb_state & SBS_CANTRCVMORE)
+			break;
+
+		/* We are done if buffer has been filled */
+		if (uio->uio_resid == 0)
+			break;
+
+		if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid)
+			break;
+
+		/* Buffer ring is empty and we shall not block */
+		if ((so->so_state & SS_NBIO) ||
+		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
+			if (orig_resid == uio->uio_resid) {
+				/* We have not read anything */
+				error = EAGAIN;
+			}
+			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+			    "%s: non blocked read return, error %d.\n",
+			    __func__, error);
+			break;
+		}
+
+		/*
+		 * Wait and block until (more) data comes in.
+		 * Note: Drops the sockbuf lock during wait.
+		 */
+		error = sbwait(sb);
+
+		if (error)
+			break;
+
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: wake up from sbwait, read available is %u\n",
+		    __func__, vmbus_chan_read_available(pcb->chan));
+	}
+
+out:
+	SOCKBUF_UNLOCK(sb);
+	SOCK_IO_RECV_UNLOCK(so);
+
+	/* We received a FIN in this call */
+	if (so->so_error == ESHUTDOWN) {
+		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+			/* Send has already closed */
+			soisdisconnecting(so);
+		} else {
+			/* Just close the receive side */
+			socantrcvmore(so);
+		}
+	}
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: returning error = %d, so_error = %d\n",
+	    __func__, error, so->so_error);
+
+	return (error);
+}
+/*
+ * Send stream data from `uio` through hvsock_send_data(), at most
+ * HVSOCK_SEND_BUF_SZ bytes per packet.  addr, top, controlp and td are
+ * not used.
+ *
+ * Returns EINVAL for a non-stream socket, a missing pcb, an empty uio or a
+ * uio that is not UIO_WRITE; EPIPE when the send side is shut or so_error
+ * is ESHUTDOWN; EWOULDBLOCK when non-blocking and nothing could be sent.
+ * Once some data has been sent, a full ring ends the call with a short
+ * write instead of blocking.
+ */
+int
+hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
+    struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct sockbuf *sb;
+	ssize_t orig_resid;
+	uint32_t canwrite, to_write;
+	int error = 0;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n",
+	    __func__, uio->uio_resid);
+
+	if (so->so_type != SOCK_STREAM)
+		return (EINVAL);
+	if (pcb == NULL)
+		return (EINVAL);
+
+	/* If nothing to send */
+	if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE)
+		return (EINVAL);
+
+	orig_resid = uio->uio_resid;
+
+	/* Prevent other writers from entering the socket. */
+	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
+	if (error) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: soiolocak returned error = %d\n", __func__, error);
+		return (error);
+	}
+
+	sb = &so->so_snd;
+	SOCKBUF_LOCK(sb);
+
+	if ((sb->sb_state & SBS_CANTSENDMORE) ||
+	    so->so_error == ESHUTDOWN) {
+		error = EPIPE;
+		goto out;
+	}
+
+	while (uio->uio_resid > 0) {
+		canwrite = hvsock_canwrite_check(pcb);
+		if (canwrite == 0) {
+			/* We have sent some data */
+			if (orig_resid > uio->uio_resid)
+				break;
+			/*
+			 * We have not sent any data and it is
+			 * non-blocked io
+			 */
+			if (so->so_state & SS_NBIO ||
+			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
+				error = EWOULDBLOCK;
+				break;
+			} else {
+				/*
+				 * We are here because there is no space on
+				 * send buffer ring. Signal the other side
+				 * to read and free more space.
+				 * Sleep wait until space is available to send.
+				 * Note: Drops the sockbuf lock during wait.
+				 */
+				error = sbwait(sb);
+
+				if (error)
+					break;
+
+				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+				    "%s: wake up from sbwait, space avail on "
+				    "tx ring is %u\n",
+				    __func__,
+				    vmbus_chan_write_available(pcb->chan));
+
+				continue;
+			}
+		}
+		to_write = MIN(canwrite, uio->uio_resid);
+		to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ);
+
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: canwrite is %u, to_write = %u\n", __func__,
+		    canwrite, to_write);
+		error = hvsock_send_data(pcb->chan, uio, to_write, sb);
+
+		if (error)
+			break;
+	}
+
+out:
+	SOCKBUF_UNLOCK(sb);
+	SOCK_IO_SEND_UNLOCK(so);
+
+	return (error);
+}
+
+/*
+ * Hand back a copy of the pcb's remote address in *nam.
+ * Returns EINVAL without a pcb and ENOMEM when the copy cannot be made.
+ */
+int
+hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct sockaddr *copy;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__);
+
+	if (pcb == NULL)
+		return (EINVAL);
+
+	copy = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT);
+	*nam = copy;
+	if (copy == NULL)
+		return (ENOMEM);
+
+	return (0);
+}
+
+/*
+ * Hand back a copy of the pcb's local address in *nam.
+ * Returns EINVAL without a pcb and ENOMEM when the copy cannot be made.
+ */
+int
+hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct sockaddr *copy;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__);
+
+	if (pcb == NULL)
+		return (EINVAL);
+
+	copy = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT);
+	*nam = copy;
+	if (copy == NULL)
+		return (ENOMEM);
+
+	return (0);
+}
+
+/*
+ * Close the socket under the transport lock: send a FIN (a zero-length
+ * packet) if connected, mark the socket disconnected, clear the pcb's
+ * channel and socket pointers, and take a listening socket off the bound
+ * list.  Does nothing when no pcb is attached.
+ */
+void
+hvs_trans_close(struct socket *so)
+{
+	struct hvs_pcb *pcb;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_close called\n", __func__);
+
+	(void) hvs_trans_lock();
+	pcb = so2hvspcb(so);
+	if (pcb == NULL)
+		goto done;
+
+	if (so->so_state & SS_ISCONNECTED) {
+		/* Send a FIN to peer */
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: hvs_trans_close sending a FIN to host\n", __func__);
+		(void) hvsock_send_data(pcb->chan, NULL, 0, NULL);
+	}
+
+	if (so->so_state &
+	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) {
+		soisdisconnected(so);
+	}
+
+	/* Break the pcb's links to the channel and to the socket. */
+	pcb->chan = NULL;
+	pcb->so = NULL;
+
+	if (SOLISTENING(so)) {
+		/* Remove from bound list */
+		mtx_lock(&hvs_trans_socks_mtx);
+		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
+		mtx_unlock(&hvs_trans_socks_mtx);
+	}
+
+done:
+	hvs_trans_unlock();
+}
+
+/*
+ * Abort the socket under the transport lock: take a listening socket off
+ * the bound list and disconnect a connected one.  Does nothing when no
+ * pcb is attached.
+ */
+void
+hvs_trans_abort(struct socket *so)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_abort called\n", __func__);
+
+	(void) hvs_trans_lock();
+	if (pcb != NULL) {
+		if (SOLISTENING(so)) {
+			/* Remove from bound list */
+			mtx_lock(&hvs_trans_socks_mtx);
+			__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
+			mtx_unlock(&hvs_trans_socks_mtx);
+		}
+
+		if (so->so_state & SS_ISCONNECTED)
+			(void) sodisconnect(so);
+	}
+	hvs_trans_unlock();
+}
+
+/*
+ * Shut down the send side, or both sides, of the socket.
+ *
+ * Only called when the shutdown method is SHUT_WR or SHUT_RDWR.  For
+ * SHUT_RD or SHUT_RDWR the caller has already set SBS_CANTRCVMORE on the
+ * receive buffer, which is how the two cases are told apart here.
+ *
+ * Returns EINVAL when no pcb is attached, 0 otherwise.
+ */
+int
+hvs_trans_shutdown(struct socket *so)
+{
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	struct sockbuf *snd;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: HyperV Socket hvs_trans_shutdown called\n", __func__);
+
+	if (pcb == NULL)
+		return (EINVAL);
+
+	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+		/* SHUT_RDWR case */
+		if (so->so_state & SS_ISCONNECTED) {
+			/* Send a FIN to peer */
+			snd = &so->so_snd;
+			SOCKBUF_LOCK(snd);
+			(void) hvsock_send_data(pcb->chan, NULL, 0, snd);
+			SOCKBUF_UNLOCK(snd);
+
+			soisdisconnecting(so);
+		}
+	} else {
+		/*
+		 * SHUT_WR only case: the receive side is still open,
+		 * so just close the send side.
+		 */
+		socantsendmore(so);
+	}
+
+	return (0);
+}
+
+/* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is
+ * <port> (see struct sockaddr_hvs).
+ *
+ * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
+ * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-
+ * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with
+ * the below sockaddr:
+ *
+ * struct SOCKADDR_HV
+ * {
+ *    ADDRESS_FAMILY Family;
+ *    USHORT Reserved;
+ *    GUID VmId;
+ *    GUID ServiceId;
+ * };
+ * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via
+ * VMBus, because here it's obvious the host and the VM can easily identify
+ * each other. Though the VmID is useful on the host, especially in the case
+ * of Windows container, FreeBSD VM doesn't need it at all.
+ *
+ * To be compatible with similar infrastructure in Linux VMs, we have
+ * to limit the available GUID space of SOCKADDR_HV so that we can create
+ * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID.
+ * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
+ *
+ ****************************************************************************
+ * The only valid Service GUIDs, from the perspectives of both the host and *
+ * FreeBSD VM, that can be connected by the other end, must conform to this *
+ * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
+ ****************************************************************************
+ *
+ * When we write apps on the host to connect(), the GUID ServiceID is used.
+ * When we write apps in FreeBSD VM to connect(), we only need to specify the
+ * port and the driver will form the GUID and use that to request the host.
+ *
+ * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
+ * auto-generated remote port for a connect request initiated by the host's
+ * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
+ * FreeBSD guest.
+ */
+
+/*
+ * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
+ * restrict the HyperV socket ring buffer size to six 4K pages. Newer
+ * HyperV hosts don't have this limit.
+ */
+#define HVS_RINGBUF_RCV_SIZE	(PAGE_SIZE * 6)
+#define HVS_RINGBUF_SND_SIZE	(PAGE_SIZE * 6)
+#define HVS_RINGBUF_MAX_SIZE	(PAGE_SIZE * 64)
+/* NOTE(review): presumably the hvsock device softc; users not shown here. */
+struct hvsock_sc {
+	device_t		dev;
+	struct hvs_pcb		*pcb;
+	struct vmbus_channel	*channel;
+};
+
+/*
+ * Return true when the channel holds at least as many readable bytes as
+ * an hvsock packet with an empty payload, HVSOCK_PKT_LEN(0).
+ */
+static bool
+hvsock_chan_readable(struct vmbus_channel *chan)
+{
+	uint32_t avail;
+
+	avail = vmbus_chan_read_available(chan);
+	if (avail >= HVSOCK_PKT_LEN(0))
+		return (true);
+
+	return (false);
+}
+
+/*
+ * Channel callback.  With the transport lock held, wakes the socket's
+ * reader when the channel is readable and its writer when
+ * hvsock_canwrite_check() reports space.  `context` is the pcb.
+ */
+static void
+hvsock_chan_cb(struct vmbus_channel *chan, void *context)
+{
+	struct hvs_pcb *pcb = (struct hvs_pcb *) context;
+	struct socket *so;
+	uint32_t canwrite;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: host send us a wakeup on rb data, pcb = %p\n",
+	    __func__, pcb);
+
+	/*
+	 * Here we know channel is still open.  Make sure the socket is
+	 * still attached and has not been closed or freed.
+	 */
+	(void) hvs_trans_lock();
+	so = hsvpcb2so(pcb);
+	if (pcb->chan == NULL || so == NULL)
+		goto done;
+
+	/* Wake up reader if there are data to read. */
+	SOCKBUF_LOCK(&(so)->so_rcv);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: read available = %u\n", __func__,
+	    vmbus_chan_read_available(pcb->chan));
+
+	if (hvsock_chan_readable(pcb->chan)) {
+		sorwakeup_locked(so);
+	} else {
+		SOCKBUF_UNLOCK(&(so)->so_rcv);
+	}
+
+	/* Wake up sender if space becomes available to write. */
+	SOCKBUF_LOCK(&(so)->so_snd);
+	canwrite = hvsock_canwrite_check(pcb);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: canwrite = %u\n", __func__, canwrite);
+
+	if (canwrite > 0)
+		sowwakeup_locked(so);
+	else
+		SOCKBUF_UNLOCK(&(so)->so_snd);
+
+done:
+	hvs_trans_unlock();
+}
+
+/*
+ * Buffer-ring copy callback: move `cplen` bytes between `datap` and the
+ * uio carried in `cbarg` (a struct hvs_callback_arg) with uiomove().
+ *
+ * When the argument carries a sockbuf, its lock is dropped around
+ * uiomove() and retaken afterwards.
+ *
+ * Returns EINVAL when `cbarg` or `datap` is NULL, otherwise the result of
+ * uiomove().
+ */
+static int
+hvsock_br_callback(void *datap, int cplen, void *cbarg)
+{
+	struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg;
+	struct uio *uio;
+	struct sockbuf *sb;
+	int error = 0;
+
+	if (cbarg == NULL || datap == NULL)
+		return (EINVAL);
+
+	/* Only look inside the argument once it is known to be non-NULL. */
+	uio = arg->uio;
+	sb = arg->sb;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, "
+	    "datap = %p\n",
+	    __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br",
+	    uio->uio_resid, cplen, datap);
+
+	if (sb)
+		SOCKBUF_UNLOCK(sb);
+
+	error = uiomove(datap, cplen, uio);
+
+	if (sb)
+		SOCKBUF_LOCK(sb);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: after uiomove, uio_resid = %zd, error = %d\n",
+	    __func__, uio->uio_resid, error);
+
+	return (error);
+}
+
+/*
+ * Build one hvsock packet and queue it on the channel with
+ * vmbus_chan_iov_send().
+ *
+ * With a uio and to_write > 0 the payload is pulled from the uio through
+ * hvsock_br_callback().  With to_write == 0 only the header and padding
+ * are sent; callers in this file use that form as a FIN.  A NULL uio with
+ * to_write > 0 sends nothing and returns 0.
+ *
+ * Returns ENOTCONN when `chan` is NULL, otherwise the send result.
+ */
+static int
+hvsock_send_data(struct vmbus_channel *chan, struct uio *uio,
+    uint32_t to_write, struct sockbuf *sb)
+{
+	struct hvs_pkt_header hvs_pkt;
+	struct hvs_callback_arg cbarg;
+	struct iovec iov[3];
+	uint64_t pad = 0;
+	int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0;
+
+	if (chan == NULL)
+		return (ENOTCONN);
+
+	/* Header length, header + payload, and the padded total. */
+	hlen = sizeof(struct vmbus_chanpkt_hdr);
+	hvs_pkthlen = sizeof(struct hvs_pkt_header);
+	hvs_pktlen = hvs_pkthlen + to_write;
+	pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, "
+	    "pad_pktlen = %u, data_len = %u\n",
+	    __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write);
+
+	/* Channel packet header. */
+	hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND;
+	hvs_pkt.chan_pkt_hdr.cph_flags = 0;
+	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen);
+	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen);
+	hvs_pkt.chan_pkt_hdr.cph_xactid = 0;
+
+	/* vmpipe header. */
+	hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1;
+	hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write;
+
+	cbarg.uio = uio;
+	cbarg.sb = sb;
+
+	if (uio != NULL && to_write > 0) {
+		/* header | payload (filled by the callback) | padding */
+		iov[0].iov_base = &hvs_pkt;
+		iov[0].iov_len = hvs_pkthlen;
+		iov[1].iov_base = NULL;
+		iov[1].iov_len = to_write;
+		iov[2].iov_base = &pad;
+		iov[2].iov_len = pad_pktlen - hvs_pktlen;
+
+		error = vmbus_chan_iov_send(chan, iov, 3,
+		    hvsock_br_callback, &cbarg);
+	} else if (to_write == 0) {
+		/* header | padding */
+		iov[0].iov_base = &hvs_pkt;
+		iov[0].iov_len = hvs_pkthlen;
+		iov[1].iov_base = &pad;
+		iov[1].iov_len = pad_pktlen - hvs_pktlen;
+
+		error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL);
+	}
+
+	if (error) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: error = %d\n", __func__, error);
+	}
+
+	return (error);
+}
+
+/*
+ * Check if we have data on current ring buffer to read
+ * or not. If not, advance the ring buffer read index to
+ * next packet. Update the recv_data_len and recv_data_off
+ * to new value.
+ *
+ * Return the number of bytes that can be read.  0 is returned when
+ * there is nothing to read, and also on failure; in these cases
+ * pcb->so->so_error may have been set:
+ *   EIO        the pcb has no channel, or the packet header failed
+ *              the sanity checks;
+ *   ESHUTDOWN  a packet with zero payload (FIN) was received.
+ */
+static uint32_t
+hvsock_canread_check(struct hvs_pcb *pcb)
+{
+	uint32_t advance;
+	uint32_t tlen, hlen, dlen;
+	uint32_t bytes_canread = 0;
+	int error;
+
+	/* Without a pcb there is no socket to report an error on. */
+	if (pcb == NULL)
+		return (0);
+
+	if (pcb->chan == NULL) {
+		pcb->so->so_error = EIO;
+		return (0);
+	}
+
+	/* Still have data not read yet on current packet */
+	if (pcb->recv_data_len > 0)
+		return (pcb->recv_data_len);
+
+	/*
+	 * Once a packet header has been peeked (rb_init), the read
+	 * index still points at that packet; its total length is the
+	 * distance to the next one.
+	 */
+	if (pcb->rb_init)
+		advance =
+		    VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
+	else
+		advance = 0;
+
+	bytes_canread = vmbus_chan_read_available(pcb->chan);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: bytes_canread on br = %u, advance = %u\n",
+	    __func__, bytes_canread, advance);
+
+	if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) {
+		/*
+		 * Nothing to read. Need to advance the rindex before
+		 * calling sbwait, so host knows to wake us up when data
+		 * is available to read on rb.
+		 */
+		error = vmbus_chan_recv_idxadv(pcb->chan, advance);
+		if (error) {
+			HVSOCK_DBG(HVSOCK_DBG_ERR,
+			    "%s: after calling vmbus_chan_recv_idxadv, "
+			    "got error = %d\n",  __func__, error);
+			return (0);
+		} else {
+			pcb->rb_init = false;
+			pcb->recv_data_len = 0;
+			pcb->recv_data_off = 0;
+			bytes_canread = vmbus_chan_read_available(pcb->chan);
+
+			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+			    "%s: advanced %u bytes, "
+			    " bytes_canread on br now = %u\n",
+			    __func__, advance, bytes_canread);
+
+			if (bytes_canread == 0)
+				return (0);
+			else
+				advance = 0;
+		}
+	}
+
+	/* Not enough on the ring for another header past 'advance'. */
+	if (bytes_canread <
+	    advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t)))
+		return (0);
+
+	error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt,
+	    sizeof(struct hvs_pkt_header), advance);
+
+	/* Don't have anything to read */
+	if (error) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: after calling vmbus_chan_recv_peek, got error = %d\n",
+		    __func__, error);
+		return (0);
+	}
+
+	/*
+	 * We just read in a new packet header. Do some sanity checks.
+	 */
+	tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
+	hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen);
+	dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size;
+	if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) ||
+	    __predict_false(hlen > tlen) ||
+	    __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "invalid tlen(%u), hlen(%u) or dlen(%u)\n",
+		    tlen, hlen, dlen);
+		pcb->so->so_error = EIO;
+		return (0);
+	}
+	if (pcb->rb_init == false)
+		pcb->rb_init = true;
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n",
+	    tlen, hlen, dlen);
+
+	/* The other side has sent a close FIN */
+	if (dlen == 0) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: Received FIN from other side\n", __func__);
+		/* inform the caller by setting so_error to ESHUTDOWN */
+		pcb->so->so_error = ESHUTDOWN;
+	}
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: canread on receive ring is %u \n", __func__, dlen);
+
+	pcb->recv_data_len = dlen;
+	pcb->recv_data_off = 0;
+
+	return (pcb->recv_data_len);
+}
+
+/*
+ * Report how many payload bytes can currently be written to the
+ * channel's tx ring.
+ *
+ * Returns 0 if pcb or its channel is NULL, or if the ring cannot hold
+ * at least HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0) bytes.  Otherwise
+ * returns the writable size minus two HVSOCK_PKT_LEN(0), rounded down
+ * to a multiple of 8.
+ */
+static uint32_t
+hvsock_canwrite_check(struct hvs_pcb *pcb)
+{
+	uint32_t tx_free;
+	uint32_t room;
+
+	if (pcb == NULL || pcb->chan == NULL)
+		return (0);
+
+	tx_free = vmbus_chan_write_available(pcb->chan);
+
+	/*
+	 * We must always reserve a 0-length-payload packet for the FIN.
+	 */
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: writeable is %u, should be greater than %ju\n",
+	    __func__, tx_free,
+	    (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)));
+
+	/* The Tx ring seems full. */
+	if (tx_free < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0))
+		return (0);
+
+	room = tx_free - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0);
+	room = rounddown2(room, 8);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "%s: available size is %u\n", __func__, room);
+
+	return (room);
+}
+
+/*
+ * Set the channel's pending send size to the length of a packet
+ * carrying HVSOCK_SEND_BUF_SZ bytes of payload.
+ */
+static void
+hvsock_set_chan_pending_send_size(struct vmbus_channel *chan)
+{
+	const uint32_t pending_sz = HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ);
+
+	vmbus_chan_set_pending_send_size(chan, pending_sz);
+}
+
+/*
+ * Open the vmbus channel for a socket.
+ *
+ * Ring sizes: before VMBUS_VERSION_WIN10_V5 the fixed
+ * HVS_RINGBUF_{SND,RCV}_SIZE are used.  Otherwise each size is the
+ * socket buffer high-water mark, clamped to
+ * [HVS_RINGBUF_*_SIZE, HVS_RINGBUF_MAX_SIZE] and rounded down to a
+ * multiple of PAGE_SIZE.
+ *
+ * The socket's pcb is registered as the argument of hvsock_chan_cb.
+ * Returns the result of vmbus_chan_open() (0 on success).
+ */
+static int
+hvsock_open_channel(struct vmbus_channel *chan, struct socket *so)
+{
+	unsigned int rcvbuf, sndbuf;
+	struct hvs_pcb *pcb = so2hvspcb(so);
+	int ret;
+
+	if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) {
+		sndbuf = HVS_RINGBUF_SND_SIZE;
+		rcvbuf = HVS_RINGBUF_RCV_SIZE;
+	} else {
+		sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE);
+		sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE);
+		sndbuf = rounddown2(sndbuf, PAGE_SIZE);
+		rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE);
+		rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE);
+		rcvbuf = rounddown2(rcvbuf, PAGE_SIZE);
+	}
+
+	/*
+	 * Can only read whatever user provided size of data
+	 * from ring buffer. Turn off batched reading.
+	 */
+	vmbus_chan_set_readbatch(chan, false);
+
+	ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0,
+	    hvsock_chan_cb, pcb);
+
+	if (ret != 0) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: failed to open hvsock channel, sndbuf = %u, "
+		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
+	} else {
+		HVSOCK_DBG(HVSOCK_DBG_INFO,
+		    "%s: hvsock channel opened, sndbuf = %u, "
+		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
+		/*
+		 * Set the pending send size so that we receive wakeup
+		 * signals from the host when there is enough space on
+		 * the tx buffer ring to write.
+		 */
+		hvsock_set_chan_pending_send_size(chan);
+	}
+
+	return (ret);
+}
+
+/*
+ * Guest is listening passively on the socket. Open channel and
+ * create a new socket for the connection.
+ *
+ * chan	channel for the new connection
+ * so	the listening socket the connection arrived on
+ * sc	device softc; sc->pcb is pointed at the new socket's pcb
+ *	once the connection is set up
+ *
+ * Nothing is done if so is not listening or if no new socket can be
+ * created.  If the channel cannot be opened the error is recorded in
+ * the new socket's so_error and the function returns.
+ */
+static void
+hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so,
+    struct hvsock_sc *sc)
+{
+	struct socket *new_so;
+	struct hvs_pcb *new_pcb, *pcb;
+	int error;
+
+	/* Do nothing if socket is not listening */
+	if (!SOLISTENING(so)) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: socket is not a listening one\n", __func__);
+		return;
+	}
+
+	/*
+	 * Create a new socket. This will call pru_attach to complete
+	 * the socket initialization and put the new socket onto
+	 * listening socket's sol_incomp list, waiting to be promoted
+	 * to sol_comp list.
+	 * The new socket created has ref count 0. There is no other
+	 * thread that changes the state of this new one at the
+	 * moment, so we don't need to hold its lock while opening
+	 * channel and filling out its pcb information.
+	 */
+	new_so = sonewconn(so, 0);
+	if (new_so == NULL) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: creating new socket failed\n", __func__);
+		/*
+		 * Without a socket there is nothing to attach the
+		 * channel to; going on would dereference NULL in
+		 * hvsock_open_channel().
+		 */
+		return;
+	}
+
+	/*
+	 * Now open the vmbus channel. If it fails, the socket will be
+	 * on the listening socket's sol_incomp queue until it is
+	 * replaced and aborted.
+	 */
+	error = hvsock_open_channel(chan, new_so);
+	if (error) {
+		new_so->so_error = error;
+		return;
+	}
+
+	pcb = so->so_pcb;
+	new_pcb = new_so->so_pcb;
+
+	/* The new socket inherits the listener's local port. */
+	hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port);
+	/* Remote port is unknown to guest in this type of connection */
+	hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN);
+	new_pcb->chan = chan;
+	new_pcb->recv_data_len = 0;
+	new_pcb->recv_data_off = 0;
+	new_pcb->rb_init = false;
+
+	new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan);
+	new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan);
+
+	hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED);
+
+	sc->pcb = new_pcb;
+
+	/*
+	 * Change the socket state to SS_ISCONNECTED. This will promote
+	 * the socket to sol_comp queue and wake up the thread which
+	 * is accepting connection.
+	 */
+	soisconnected(new_so);
+}
+
+
+/*
+ * Guest is actively connecting to host.
+ *
+ * Opens the channel for so.  On failure the error is stored in
+ * so->so_error.  On success the pcb receive state is reset, the
+ * socket is moved from the bound list to the connected list, and the
+ * socket is marked connected.
+ */
+static void
+hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so)
+{
+	struct hvs_pcb *hpcb;
+	int rc;
+
+	rc = hvsock_open_channel(chan, so);
+	if (rc != 0) {
+		so->so_error = rc;
+		return;
+	}
+
+	/* Fresh connection: no packet has been peeked yet. */
+	hpcb = so->so_pcb;
+	hpcb->rb_init = false;
+	hpcb->recv_data_off = 0;
+	hpcb->recv_data_len = 0;
+	hpcb->chan = chan;
+
+	/* Move the socket between the lists under the list mutex. */
+	mtx_lock(&hvs_trans_socks_mtx);
+	__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
+	__hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED);
+	mtx_unlock(&hvs_trans_socks_mtx);
+
+	/*
+	 * Change the socket state to SS_ISCONNECTED. This will wake up
+	 * the thread sleeping in connect call.
+	 */
+	soisconnected(so);
+}
+
+/*
+ * Match a newly attached hvsock channel with its bound socket and
+ * complete the connection, passively (host initiated) or actively
+ * (guest initiated).  Returns without doing anything if the type GUID
+ * is not a valid service id or no bound socket is found.
+ */
+static void
+hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc)
+{
+	struct hyperv_guid *srv_guid, *conn_guid;
+	struct sockaddr_hvs bound_addr;
+	struct hvs_pcb *pcb;
+	struct socket *so;
+	bool from_host;
+
+	srv_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan);
+	conn_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan);
+	from_host = vmbus_chan_is_hvs_conn_from_host(chan);
+
+	HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is ");
+	hvsock_print_guid(srv_guid);
+	HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is ");
+	hvsock_print_guid(conn_guid);
+	HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n",
+	    from_host ? "from" : "to");
+
+	/*
+	 * The listening port should be in [0, MAX_LISTEN_PORT]
+	 */
+	if (!is_valid_srv_id(srv_guid))
+		return;
+
+	/*
+	 * There should be a bound socket already created no matter
+	 * it is a passive or active connection.
+	 * For host initiated connection (passive on guest side),
+	 * the type_guid contains the port which guest is bound and
+	 * listening.
+	 * For the guest initiated connection (active on guest side),
+	 * the inst_guid contains the port that guest has auto bound
+	 * to.
+	 */
+	hvs_addr_init(&bound_addr, from_host ? srv_guid : conn_guid);
+	so = hvs_find_socket_on_list(&bound_addr, HVS_LIST_BOUND);
+	if (so == NULL) {
+		HVSOCK_DBG(HVSOCK_DBG_ERR,
+		    "%s: no bound socket found for port %u\n",
+		    __func__, bound_addr.hvs_port);
+		return;
+	}
+
+	if (from_host) {
+		hvsock_open_conn_passive(chan, so, sc);
+		return;
+	}
+
+	/* Active open: the pcb is examined under the transport lock. */
+	(void) hvs_trans_lock();
+	pcb = so->so_pcb;
+	if (pcb != NULL && pcb->so != NULL) {
+		sc->pcb = so2hvspcb(so);
+		hvsock_open_conn_active(chan, so);
+	} else {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "%s: channel detached before open\n", __func__);
+	}
+	hvs_trans_unlock();
+}
+
+/*
+ * Device probe: claim the device only if its vmbus channel is an
+ * hvsock channel.
+ *
+ * Returns BUS_PROBE_DEFAULT for an hvsock channel, ENXIO otherwise
+ * (including when the device has no channel).
+ */
+static int
+hvsock_probe(device_t dev)
+{
+	struct vmbus_channel *channel = vmbus_get_channel(dev);
+
+	/*
+	 * Handle a missing channel separately so that the debug
+	 * message below never passes NULL to vmbus_chan_id().
+	 */
+	if (channel == NULL) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "hvsock_probe called without a channel\n");
+
+		return (ENXIO);
+	}
+
+	if (!vmbus_chan_is_hvs(channel)) {
+		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+		    "hvsock_probe called but not a hvsock channel id %u\n",
+		    vmbus_chan_id(channel));
+
+		return (ENXIO);
+	}
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+	    "hvsock_probe got a hvsock channel id %u\n",
+	    vmbus_chan_id(channel));
+
+	return (BUS_PROBE_DEFAULT);
+}
+
+/*
+ * Device attach: try to open the connection for the device's channel.
+ * The outcome of that attempt is not reflected in the return value.
+ */
+static int
+hvsock_attach(device_t dev)
+{
+	struct vmbus_channel *chan;
+	struct hvsock_sc *softc;
+
+	chan = vmbus_get_channel(dev);
+	softc = (struct hvsock_sc *)device_get_softc(dev);
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n");
+
+	hvsock_open_connection(chan, softc);
+
+	/*
+	 * Always return success. On error the host will rescind the device
+	 * in 30 seconds and we can do cleanup at that time in
+	 * vmbus_chan_msgproc_chrescind().
+	 */
+	return (0);
+}
+
+/*
+ * Device detach: tear down the connection tied to this device.
+ *
+ * If the softc still references a pcb, then under the transport lock:
+ * the socket (if any) is marked disconnected, the pcb is removed from
+ * the bound/connected lists, the socket's receive and send I/O locks
+ * are acquired so no reader or sender is using the rings, the pcb is
+ * zeroed and freed, and the socket's so_pcb is cleared.  The vmbus
+ * channel is closed last.  Always returns 0.
+ */
+static int
+hvsock_detach(device_t dev)
+{
+	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
+	struct socket *so;
+	int retry;
+
+	if (bootverbose)
+		device_printf(dev, "hvsock_detach called.\n");
+
+	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n");
+
+	if (sc->pcb != NULL) {
+		(void) hvs_trans_lock();
+
+		so = hsvpcb2so(sc->pcb);
+		if (so) {
+			/* Close the connection */
+			if (so->so_state &
+			    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
+				soisdisconnected(so);
+		}
+
+		/* Take the pcb off whichever list(s) it is still on. */
+		mtx_lock(&hvs_trans_socks_mtx);
+		__hvs_remove_pcb_from_list(sc->pcb,
+		    HVS_LIST_BOUND | HVS_LIST_CONNECTED);
+		mtx_unlock(&hvs_trans_socks_mtx);
+
+		/*
+		 * Close channel while no reader and sender are working
+		 * on the buffer rings.
+		 *
+		 * Each loop polls for the I/O lock, re-issuing
+		 * soisdisconnected() and busy-waiting 500us per try;
+		 * 'retry' only feeds the debug message.
+		 * NOTE(review): the loops are unbounded and run with
+		 * the transport lock held.
+		 */
+		if (so) {
+			retry = 0;
+			while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) {
+				/*
+				 * Someone is reading, rx br is busy
+				 */
+				soisdisconnected(so);
+				DELAY(500);
+				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+				    "waiting for rx reader to exit, "
+				    "retry = %d\n", retry++);
+			}
+			retry = 0;
+			while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) {
+				/*
+				 * Someone is sending, tx br is busy
+				 */
+				soisdisconnected(so);
+				DELAY(500);
+				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+				    "waiting for tx sender to exit, "
+				    "retry = %d\n", retry++);
+			}
+		}
+
+
+		/* Scrub and release the pcb; the softc forgets it. */
+		bzero(sc->pcb, sizeof(struct hvs_pcb));
+		free(sc->pcb, M_HVSOCK);
+		sc->pcb = NULL;
+
+		if (so) {
+			/* Drop the I/O locks taken above. */
+			SOCK_IO_RECV_UNLOCK(so);
+			SOCK_IO_SEND_UNLOCK(so);
+			/* The pcb is gone; do not leave a dangling pointer. */
+			so->so_pcb = NULL;
+		}
+
+		hvs_trans_unlock();
+	}
+
+	vmbus_chan_close(vmbus_get_channel(dev));
+
+	return (0);
+}
+
+/* Device method table: probe, attach and detach entry points. */
+static device_method_t hvsock_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe, hvsock_probe),
+	DEVMETHOD(device_attach, hvsock_attach),
+	DEVMETHOD(device_detach, hvsock_detach),
+	DEVMETHOD_END
+};
+
+/* Driver description: name, method table, and softc size. */
+static driver_t hvsock_driver = {
+	"hv_sock",
+	hvsock_methods,
+	sizeof(struct hvsock_sc)
+};
+
+static devclass_t hvsock_devclass;
+
+/* Register the driver on the vmbus bus; the module depends on vmbus. */
+DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, NULL);
+MODULE_VERSION(hvsock, 1);
+MODULE_DEPEND(hvsock, vmbus, 1, 1, 1);
diff --git a/sys/dev/hyperv/hvsock/hv_sock.h b/sys/dev/hyperv/hvsock/hv_sock.h
new file mode 100644
index 000000000000..877425968345
--- /dev/null
+++ b/sys/dev/hyperv/hvsock/hv_sock.h
@@ -0,0 +1,122 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HVSOCK_H
+#define _HVSOCK_H
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/queue.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+
+/*
+ * HyperV Socket Protocols
+ */
+#define	HYPERV_SOCK_PROTO_TRANS		1	/* Transport protocol */
+
+/*
+ * Special port values.  Both expand to the same value (-1U);
+ * HVADDR_PORT_UNKNOWN is what hv_sock.c stores as the remote port of
+ * a host-initiated connection.
+ */
+#define	HVADDR_PORT_ANY			-1U
+#define	HVADDR_PORT_UNKNOWN		-1U
+
+/*
+ * Bit flags naming the global socket lists (bound, connected); they
+ * may be OR-ed together, HVS_LIST_ALL being both.
+ */
+#define HVS_LIST_BOUND			0x01
+#define HVS_LIST_CONNECTED		0x02
+#define HVS_LIST_ALL			(HVS_LIST_BOUND | HVS_LIST_CONNECTED)
+
+/*
+ * Socket address for Hyper-V sockets: the usual sa_len/sa_family
+ * prefix followed by a port number.
+ *
+ * hvs_zero is sized as sizeof(struct sockaddr) minus the sizes of the
+ * other members.
+ * NOTE(review): that arithmetic does not account for any alignment
+ * padding the compiler inserts before hvs_port, so the structure may
+ * end up larger than struct sockaddr -- confirm whether that is
+ * intended.
+ */
+struct sockaddr_hvs {
+	unsigned char	sa_len;
+	sa_family_t	sa_family;
+	unsigned int	hvs_port;
+	unsigned char	hvs_zero[sizeof(struct sockaddr) -
+				 sizeof(sa_family_t) -
+				 sizeof(unsigned char) -
+				 sizeof(unsigned int)];
+};
+
+/*
+ * Packed vmpipe header that follows the vmbus channel packet header
+ * in every hvsock packet.
+ */
+struct vmpipe_proto_header {
+	/* hv_sock.c sets this to 1 on every packet it sends */
+	uint32_t			vmpipe_pkt_type;
+	/* payload length in bytes; 0 is treated as a FIN on receive */
+	uint32_t			vmpipe_data_size;
+} __packed;
+
+/*
+ * Complete hvsock packet header as laid out on the ring: the vmbus
+ * channel packet header immediately followed by the vmpipe header.
+ */
+struct hvs_pkt_header {
+	struct vmbus_chanpkt_hdr	chan_pkt_hdr;
+	struct vmpipe_proto_header	vmpipe_pkt_hdr;
+} __packed;
+
+/*
+ * Per-socket protocol control block for Hyper-V sockets.
+ */
+struct hvs_pcb {
+	struct socket			*so;		/* Pointer to socket */
+	struct sockaddr_hvs		local_addr;
+	struct sockaddr_hvs		remote_addr;
+
+	/* Channel type and instance GUIDs (set for passive connections) */
+	struct hyperv_guid		vm_srv_id;
+	struct hyperv_guid		host_srv_id;
+
+	/* vmbus channel carrying this connection; NULL until opened */
+	struct vmbus_channel		*chan;
+	/* Current packet header on rx ring */
+	struct hvs_pkt_header		hvs_pkt;
+	/* Available data in receive br in current packet */
+	uint32_t			recv_data_len;
+	/* offset in the packet */
+	uint32_t			recv_data_off;
+	/*
+	 * true once a packet header has been peeked into hvs_pkt and
+	 * validated; reset to false after the rx read index has been
+	 * advanced past that packet.
+	 */
+	bool				rb_init;
+	/* Link lists for global bound and connected sockets */
+	LIST_ENTRY(hvs_pcb)		bound_next;
+	LIST_ENTRY(hvs_pcb)		connected_next;
+};
+
+/*
+ * Conversions between a socket and its hvs_pcb: so2hvspcb() reads
+ * so->so_pcb, hsvpcb2so() reads pcb->so.  Neither checks its argument
+ * for NULL.
+ */
+#define so2hvspcb(so) \
+	((struct hvs_pcb *)((so)->so_pcb))
+#define hsvpcb2so(hvspcb) \
+	((struct socket *)((hvspcb)->so))
+
+/* Initialize a sockaddr_hvs from a Hyper-V GUID. */
+void	hvs_addr_init(struct sockaddr_hvs *, const struct hyperv_guid *);
+
+/*
+ * Transport entry points operating on a struct socket.
+ * NOTE(review): presumably these are installed as the protocol's
+ * user-request handlers; the table is not part of this header.
+ */
+void	hvs_trans_init(void);
+void	hvs_trans_close(struct socket *);
+void	hvs_trans_detach(struct socket *);
+void	hvs_trans_abort(struct socket *);
+int	hvs_trans_attach(struct socket *, int, struct thread *);
+int	hvs_trans_bind(struct socket *, struct sockaddr *, struct thread *);
+int	hvs_trans_listen(struct socket *, int, struct thread *);
+int	hvs_trans_accept(struct socket *, struct sockaddr **);
+int	hvs_trans_connect(struct socket *,
+	    struct sockaddr *, struct thread *);
+int	hvs_trans_peeraddr(struct socket *, struct sockaddr **);
+int	hvs_trans_sockaddr(struct socket *, struct sockaddr **);
+int	hvs_trans_soreceive(struct socket *, struct sockaddr **,
+	    struct uio *, struct mbuf **, struct mbuf **, int *);
+int	hvs_trans_sosend(struct socket *, struct sockaddr *, struct uio *,
+	     struct mbuf *, struct mbuf *, int, struct thread *);
+int	hvs_trans_disconnect(struct socket *);
+int	hvs_trans_shutdown(struct socket *);
+
+/*
+ * Transport-wide lock, taken by hv_sock.c around pcb setup and
+ * teardown.  Callers there ignore the return value of
+ * hvs_trans_lock().
+ */
+int	hvs_trans_lock(void);
+void	hvs_trans_unlock(void);
+
+/* Remove a socket from the list(s) selected by HVS_LIST_* flags. */
+void	hvs_remove_socket_from_list(struct socket *, unsigned char);
+#endif /* _HVSOCK_H */
diff --git a/sys/dev/hyperv/include/hyperv.h b/sys/dev/hyperv/include/hyperv.h
new file mode 100644
index 000000000000..8b985b2f31a7
--- /dev/null
+++ b/sys/dev/hyperv/include/hyperv.h
@@ -0,0 +1,104 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HYPERV_H_
+#define _HYPERV_H_
+
+#ifdef _KERNEL
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#define MSR_HV_TIME_REF_COUNT		0x40000020
+
+#define CPUID_HV_MSR_TIME_REFCNT	0x0002	/* MSR_HV_TIME_REF_COUNT */
+#define CPUID_HV_MSR_SYNIC		0x0004	/* MSRs for SynIC */
+#define CPUID_HV_MSR_SYNTIMER		0x0008	/* MSRs for SynTimer */
+#define CPUID_HV_MSR_APIC		0x0010	/* MSR_HV_{EOI,ICR,TPR} */
+#define CPUID_HV_MSR_HYPERCALL		0x0020	/* MSR_HV_GUEST_OS_ID
+						 * MSR_HV_HYPERCALL */
+#define CPUID_HV_MSR_VP_INDEX		0x0040	/* MSR_HV_VP_INDEX */
+#define CPUID_HV_MSR_REFERENCE_TSC	0x0200	/* MSR_HV_REFERENCE_TSC */
+#define CPUID_HV_MSR_GUEST_IDLE		0x0400	/* MSR_HV_GUEST_IDLE */
+
+/* Nanoseconds per second, unless already defined elsewhere. */
+#ifndef NANOSEC
+#define NANOSEC				1000000000ULL
+#endif
+/*
+ * HYPERV_TIMER_FREQ is NANOSEC / HYPERV_TIMER_NS_FACTOR, i.e.
+ * 10,000,000: one tick per 100 nanoseconds.
+ */
+#define HYPERV_TIMER_NS_FACTOR		100ULL
+#define HYPERV_TIMER_FREQ		(NANOSEC / HYPERV_TIMER_NS_FACTOR)
+
+#endif	/* _KERNEL */
+
+#define HYPERV_REFTSC_DEVNAME		"hv_tsc"
+
+/*
+ * Hyper-V Reference TSC
+ *
+ * All members are volatile.  The structure is packed and aligned to
+ * PAGE_SIZE, and (where CTASSERT is available) asserted at compile
+ * time to occupy exactly one page.
+ */
+struct hyperv_reftsc {
+	volatile uint32_t		tsc_seq;
+	volatile uint32_t		tsc_rsvd1;
+	volatile uint64_t		tsc_scale;
+	volatile int64_t		tsc_ofs;
+} __packed __aligned(PAGE_SIZE);
+#ifdef CTASSERT
+CTASSERT(sizeof(struct hyperv_reftsc) == PAGE_SIZE);
+#endif
+
+#ifdef _KERNEL
+
+/* A GUID stored as 16 raw bytes. */
+struct hyperv_guid {
+	uint8_t				hv_guid[16];
+} __packed;
+
+#define HYPERV_GUID_STRLEN		40
+
+typedef uint64_t			(*hyperv_tc64_t)(void);
+
+int			hyperv_guid2str(const struct hyperv_guid *, char *,
+			    size_t);
+
+/*
+ * hyperv_tc64 could be NULL, if there were no suitable Hyper-V
+ * specific timecounter.
+ */
+extern hyperv_tc64_t	hyperv_tc64;
+extern u_int		hyperv_features;	/* CPUID_HV_MSR_ */
+extern u_int		hyperv_ver_major;
+
+/*
+ * Vmbus version after negotiation with host.
+ */
+extern uint32_t		vmbus_current_version;
+
+#endif	/* _KERNEL */
+
+#endif  /* _HYPERV_H_ */
diff --git a/sys/dev/hyperv/include/hyperv_busdma.h b/sys/dev/hyperv/include/hyperv_busdma.h
new file mode 100644
index 000000000000..ff01b3e27a95
--- /dev/null
+++ b/sys/dev/hyperv/include/hyperv_busdma.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HYPERV_BUSDMA_H_
+#define _HYPERV_BUSDMA_H_
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <machine/bus.h>
+
+/*
+ * State for one DMA memory region handled by hyperv_dmamem_alloc()
+ * and hyperv_dmamem_free(): a bus address plus the busdma tag and map.
+ */
+struct hyperv_dma {
+	bus_addr_t	hv_paddr;
+	bus_dma_tag_t	hv_dtag;
+	bus_dmamap_t	hv_dmap;
+};
+
+void		hyperv_dma_map_paddr(void *arg, bus_dma_segment_t *segs,
+		    int nseg, int error);
+void		*hyperv_dmamem_alloc(bus_dma_tag_t parent_dtag,
+		    bus_size_t alignment, bus_addr_t boundary, bus_size_t size,
+		    struct hyperv_dma *dma, int flags);
+void		hyperv_dmamem_free(struct hyperv_dma *dma, void *ptr);
+
+#endif	/* !_HYPERV_BUSDMA_H_ */
diff --git a/sys/dev/hyperv/include/vmbus.h b/sys/dev/hyperv/include/vmbus.h
new file mode 100644
index 000000000000..76c1ad632765
--- /dev/null
+++ b/sys/dev/hyperv/include/vmbus.h
@@ -0,0 +1,261 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_H_
+#define _VMBUS_H_
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/_iovec.h>
+
+/*
+ * VMBUS version is 32 bit, upper 16 bit for major_number and lower
+ * 16 bit for minor_number.
+ *
+ * 0.13  --  Windows Server 2008
+ * 1.1   --  Windows 7
+ * 2.4   --  Windows 8
+ * 3.0   --  Windows 8.1
+ * 4.0   --  Windows 10
+ * 5.0   --  Newer Windows 10
+ */
+#define VMBUS_VERSION_WS2008		((0 << 16) | (13))
+#define VMBUS_VERSION_WIN7		((1 << 16) | (1))
+#define VMBUS_VERSION_WIN8		((2 << 16) | (4))
+#define VMBUS_VERSION_WIN8_1		((3 << 16) | (0))
+#define VMBUS_VERSION_WIN10		((4 << 16) | (0))
+#define VMBUS_VERSION_WIN10_V5		((5 << 16) | (0))
+
+#define VMBUS_VERSION_MAJOR(ver)	(((uint32_t)(ver)) >> 16)
+#define VMBUS_VERSION_MINOR(ver)	(((uint32_t)(ver)) & 0xffff)
+
+#define VMBUS_CHAN_POLLHZ_MIN		100	/* 10ms interval */
+#define VMBUS_CHAN_POLLHZ_MAX		1000000	/* 1us interval */
+
+/*
+ * GPA stuffs.
+ *
+ * vmbus_gpa_range is a packed header (length and offset) followed by
+ * a variable number of 64-bit page entries.
+ */
+struct vmbus_gpa_range {
+	uint32_t	gpa_len;
+	uint32_t	gpa_ofs;
+	uint64_t	gpa_page[0];
+} __packed;
+
+/*
+ * This is actually vmbus_gpa_range.gpa_page[1], i.e. the same layout
+ * with exactly one page entry.
+ */
+struct vmbus_gpa {
+	uint32_t	gpa_len;
+	uint32_t	gpa_ofs;
+	uint64_t	gpa_page;
+} __packed;
+
+/*
+ * Channel packet lengths are stored in units of 8 bytes (1 << 3).
+ * VMBUS_CHANPKT_GETLEN() converts a stored length to bytes; the
+ * result has type int.
+ */
+#define VMBUS_CHANPKT_SIZE_SHIFT	3
+
+#define VMBUS_CHANPKT_GETLEN(pktlen)	\
+	(((int)(pktlen)) << VMBUS_CHANPKT_SIZE_SHIFT)
+
+/*
+ * Header at the start of every vmbus channel packet.  Convert
+ * cph_hlen and cph_tlen to bytes with VMBUS_CHANPKT_GETLEN().
+ */
+struct vmbus_chanpkt_hdr {
+	uint16_t	cph_type;	/* VMBUS_CHANPKT_TYPE_ */
+	uint16_t	cph_hlen;	/* header len, in 8 bytes */
+	uint16_t	cph_tlen;	/* total len, in 8 bytes */
+	uint16_t	cph_flags;	/* VMBUS_CHANPKT_FLAG_ */
+	uint64_t	cph_xactid;
+} __packed;
+
+#define VMBUS_CHANPKT_TYPE_INBAND	0x0006
+#define VMBUS_CHANPKT_TYPE_RXBUF	0x0007
+#define VMBUS_CHANPKT_TYPE_GPA		0x0009
+#define VMBUS_CHANPKT_TYPE_COMP		0x000b
+
+#define VMBUS_CHANPKT_FLAG_NONE		0
+#define VMBUS_CHANPKT_FLAG_RC		0x0001	/* report completion */
+
+/*
+ * Pointer to the data following the header: pkt advanced by the
+ * header length in bytes.  pkt is a pointer to struct
+ * vmbus_chanpkt_hdr.
+ */
+#define VMBUS_CHANPKT_CONST_DATA(pkt)		\
+	(const void *)((const uint8_t *)(pkt) +	\
+	VMBUS_CHANPKT_GETLEN((pkt)->cph_hlen))
+
+/* Include padding */
+/* Data length in bytes: total length minus header length. */
+#define VMBUS_CHANPKT_DATALEN(pkt)		\
+	(VMBUS_CHANPKT_GETLEN((pkt)->cph_tlen) -\
+	 VMBUS_CHANPKT_GETLEN((pkt)->cph_hlen))
+
+/* One receive-buffer range: a length and an offset. */
+struct vmbus_rxbuf_desc {
+	uint32_t	rb_len;
+	uint32_t	rb_ofs;
+} __packed;
+
+/*
+ * Channel packet describing receive-buffer ranges: the common packet
+ * header, a buffer id, and a counted array of descriptors.
+ * NOTE(review): presumably cp_rxbuf_cnt is the number of cp_rxbuf[]
+ * entries and this layout goes with VMBUS_CHANPKT_TYPE_RXBUF --
+ * confirm against the users.
+ */
+struct vmbus_chanpkt_rxbuf {
+	struct vmbus_chanpkt_hdr cp_hdr;
+	uint16_t	cp_rxbuf_id;
+	uint16_t	cp_rsvd;
+	uint32_t	cp_rxbuf_cnt;
+	struct vmbus_rxbuf_desc cp_rxbuf[];
+} __packed;
+
+/*
+ * Caller-provided bufring memory handed to vmbus_chan_open_br():
+ * virtual address, bus address, and the tx and rx sizes.
+ */
+struct vmbus_chan_br {
+	void		*cbr;
+	bus_addr_t	cbr_paddr;
+	int		cbr_txsz;
+	int		cbr_rxsz;
+};
+
+struct vmbus_channel;
+struct vmbus_xact;
+struct vmbus_xact_ctx;
+struct hyperv_guid;
+struct task;
+struct taskqueue;
+
+typedef void	(*vmbus_chan_callback_t)(struct vmbus_channel *, void *);
+typedef int	(*vmbus_br_copy_callback_t)(void *, int, void *);
+
+/*
+ * Return the vmbus channel of a device, which is kept in the device's
+ * ivars.
+ */
+static __inline struct vmbus_channel *
+vmbus_get_channel(device_t dev)
+{
+	struct vmbus_channel *chan = device_get_ivars(dev);
+
+	return (chan);
+}
+
+/*
+ * vmbus_chan_open_br()
+ *
+ * Return values:
+ * 0			Succeeded.
+ * EISCONN		Failed, and the memory passed through 'br' is still
+ *			connected.  Callers must _not_ free the memory
+ *			passed through 'br', if this error happens.
+ * other values		Failed.  The memory passed through 'br' is no longer
+ *			connected.  Callers are free to do anything with the
+ *			memory passed through 'br'.
+ *
+ *
+ *
+ * vmbus_chan_close_direct()
+ *
+ * NOTE:
+ * Callers of this function _must_ make sure to close all sub-channels before
+ * closing the primary channel.
+ *
+ * Return values:
+ * 0			Succeeded.
+ * EISCONN		Failed, and the memory associated with the bufring
+ *			is still connected.  Callers must _not_ free the
+ *			memory associated with the bufring, if this error
+ *			happens.
+ * other values		Failed.  The memory associated with the bufring is
+ *			no longer connected.  Callers are free to do anything
+ *			with the memory associated with the bufring.
+ */
+int		vmbus_chan_open(struct vmbus_channel *chan,
+		    int txbr_size, int rxbr_size, const void *udata, int udlen,
+		    vmbus_chan_callback_t cb, void *cbarg);
+int		vmbus_chan_open_br(struct vmbus_channel *chan,
+		    const struct vmbus_chan_br *cbr, const void *udata,
+		    int udlen, vmbus_chan_callback_t cb, void *cbarg);
+void		vmbus_chan_close(struct vmbus_channel *chan);
+int		vmbus_chan_close_direct(struct vmbus_channel *chan);
+void		vmbus_chan_intr_drain(struct vmbus_channel *chan);
+void		vmbus_chan_run_task(struct vmbus_channel *chan,
+		    struct task *task);
+void		vmbus_chan_set_orphan(struct vmbus_channel *chan,
+		    struct vmbus_xact_ctx *);
+void		vmbus_chan_unset_orphan(struct vmbus_channel *chan);
+const void	*vmbus_chan_xact_wait(const struct vmbus_channel *chan,
+		    struct vmbus_xact *xact, size_t *resp_len, bool can_sleep);
+
+int		vmbus_chan_gpadl_connect(struct vmbus_channel *chan,
+		    bus_addr_t paddr, int size, uint32_t *gpadl);
+int		vmbus_chan_gpadl_disconnect(struct vmbus_channel *chan,
+		    uint32_t gpadl);
+
+void		vmbus_chan_cpu_set(struct vmbus_channel *chan, int cpu);
+void		vmbus_chan_cpu_rr(struct vmbus_channel *chan);
+void		vmbus_chan_set_readbatch(struct vmbus_channel *chan, bool on);
+
+struct vmbus_channel **
+		vmbus_subchan_get(struct vmbus_channel *pri_chan,
+		    int subchan_cnt);
+void		vmbus_subchan_rel(struct vmbus_channel **subchan,
+		    int subchan_cnt);
+void		vmbus_subchan_drain(struct vmbus_channel *pri_chan);
+
+int		vmbus_chan_recv(struct vmbus_channel *chan, void *data, int *dlen,
+		    uint64_t *xactid);
+int		vmbus_chan_recv_pkt(struct vmbus_channel *chan,
+		    struct vmbus_chanpkt_hdr *pkt, int *pktlen);
+
+int		vmbus_chan_recv_idxadv(struct vmbus_channel *chan,
+		    uint32_t advance);
+int		vmbus_chan_recv_peek(struct vmbus_channel *chan,
+		    void *data, int data_len, uint32_t advance);
+int		vmbus_chan_recv_peek_call(struct vmbus_channel *chan,
+		    int data_len, uint32_t skip,
+		    vmbus_br_copy_callback_t cb, void *cbarg);
+
+int		vmbus_chan_send(struct vmbus_channel *chan, uint16_t type,
+		    uint16_t flags, void *data, int dlen, uint64_t xactid);
+int		vmbus_chan_send_sglist(struct vmbus_channel *chan,
+		    struct vmbus_gpa sg[], int sglen, void *data, int dlen,
+		    uint64_t xactid);
+int		vmbus_chan_send_prplist(struct vmbus_channel *chan,
+		    struct vmbus_gpa_range *prp, int prp_cnt, void *data,
+		    int dlen, uint64_t xactid);
+int		vmbus_chan_iov_send(struct vmbus_channel *chan,
+		    const struct iovec iov[], int iovlen,
+		    vmbus_br_copy_callback_t cb, void *cbarg);
+uint32_t	vmbus_chan_write_available(struct vmbus_channel *chan);
+uint32_t	vmbus_chan_read_available(struct vmbus_channel *chan);
+bool		vmbus_chan_write_signal(struct vmbus_channel *chan,
+		    int32_t min_signal_size);
+void		vmbus_chan_set_pending_send_size(struct vmbus_channel *chan,
+		    uint32_t size);
+
+uint32_t	vmbus_chan_id(const struct vmbus_channel *chan);
+uint32_t	vmbus_chan_subidx(const struct vmbus_channel *chan);
+bool		vmbus_chan_is_primary(const struct vmbus_channel *chan);
+bool		vmbus_chan_is_revoked(const struct vmbus_channel *chan);
+bool		vmbus_chan_is_hvs(const struct vmbus_channel *chan);
+bool		vmbus_chan_is_hvs_conn_from_host(
+		    const struct vmbus_channel *chan);
+int		vmbus_req_tl_connect(struct hyperv_guid *,
+		    struct hyperv_guid *);
+
+struct hyperv_guid *
+		vmbus_chan_guid_type(struct vmbus_channel *chan);
+struct hyperv_guid *
+		vmbus_chan_guid_inst(struct vmbus_channel *chan);
+int		vmbus_chan_prplist_nelem(int br_size, int prpcnt_max,
+		    int dlen_max);
+bool		vmbus_chan_rx_empty(const struct vmbus_channel *chan);
+bool		vmbus_chan_tx_empty(const struct vmbus_channel *chan);
+struct taskqueue *
+		vmbus_chan_mgmt_tq(const struct vmbus_channel *chan);
+
+void		vmbus_chan_poll_enable(struct vmbus_channel *chan,
+		    u_int pollhz);
+void		vmbus_chan_poll_disable(struct vmbus_channel *chan);
+
+#endif	/* !_VMBUS_H_ */
diff --git a/sys/dev/hyperv/include/vmbus_xact.h b/sys/dev/hyperv/include/vmbus_xact.h
new file mode 100644
index 000000000000..90711a0be774
--- /dev/null
+++ b/sys/dev/hyperv/include/vmbus_xact.h
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_XACT_H_
+#define _VMBUS_XACT_H_
+
+#include <sys/param.h>
+#include <sys/bus.h>
+
+/*
+ * Both structures are only forward-declared here; every prototype below
+ * handles them through pointers.
+ */
+struct vmbus_xact;
+struct vmbus_xact_ctx;
+
+/* Context-level operations; creation takes a DMA tag and three sizes. */
+struct vmbus_xact_ctx	*vmbus_xact_ctx_create(bus_dma_tag_t dtag,
+			    size_t req_size, size_t resp_size,
+			    size_t priv_size);
+void			vmbus_xact_ctx_destroy(struct vmbus_xact_ctx *ctx);
+bool			vmbus_xact_ctx_orphan(struct vmbus_xact_ctx *ctx);
+
+/* Obtain a transaction from a context / hand it back. */
+struct vmbus_xact	*vmbus_xact_get(struct vmbus_xact_ctx *ctx,
+			    size_t req_len);
+void			vmbus_xact_put(struct vmbus_xact *xact);
+
+/* Accessors on a single transaction. */
+void			*vmbus_xact_req_data(const struct vmbus_xact *xact);
+bus_addr_t		vmbus_xact_req_paddr(const struct vmbus_xact *xact);
+void			*vmbus_xact_priv(const struct vmbus_xact *xact,
+			    size_t priv_len);
+void			vmbus_xact_activate(struct vmbus_xact *xact);
+void			vmbus_xact_deactivate(struct vmbus_xact *xact);
+/*
+ * The three variants below share one signature: they return a const
+ * pointer and take a size_t out-parameter (presumably the response and
+ * its length -- confirm against the implementation).
+ */
+const void		*vmbus_xact_wait(struct vmbus_xact *xact,
+			    size_t *resp_len);
+const void		*vmbus_xact_busywait(struct vmbus_xact *xact,
+			    size_t *resp_len);
+const void		*vmbus_xact_poll(struct vmbus_xact *xact,
+			    size_t *resp_len);
+/* Wakeup entry points, addressed by transaction or by context. */
+void			vmbus_xact_wakeup(struct vmbus_xact *xact,
+			    const void *data, size_t dlen);
+void			vmbus_xact_ctx_wakeup(struct vmbus_xact_ctx *ctx,
+			    const void *data, size_t dlen);
+
+#endif	/* !_VMBUS_XACT_H_ */
diff --git a/sys/dev/hyperv/input/hv_kbd.c b/sys/dev/hyperv/input/hv_kbd.c
new file mode 100644
index 000000000000..53aacda7fbcb
--- /dev/null
+++ b/sys/dev/hyperv/input/hv_kbd.c
@@ -0,0 +1,857 @@
+/*-
+ * Copyright (c) 2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_evdev.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/taskqueue.h>
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/kthread.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/sema.h>
+#include <sys/signal.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/mutex.h>
+#include <sys/callout.h>
+
+#include <sys/kbio.h>
+#include <dev/kbd/kbdreg.h>
+#include <dev/kbd/kbdtables.h>
+
+#ifdef EVDEV_SUPPORT
+#include <dev/evdev/evdev.h>
+#include <dev/evdev/input.h>
+#endif
+
+#include "dev/hyperv/input/hv_kbdc.h"
+
+/*
+ * Thin wrappers around mtx_lock(), mtx_unlock() and mtx_assert().  Each
+ * expands to a single statement through the do { } while (0) idiom.
+ */
+#define HVKBD_MTX_LOCK(_m) do {		\
+	mtx_lock(_m);			\
+} while (0)
+
+#define HVKBD_MTX_UNLOCK(_m) do {	\
+	mtx_unlock(_m);			\
+} while (0)
+
+#define HVKBD_MTX_ASSERT(_m, _t) do {	\
+	mtx_assert(_m, _t);		\
+} while (0)
+
+/* The keyboard layer in this file is serialized with the Giant mutex. */
+#define	HVKBD_LOCK()		HVKBD_MTX_LOCK(&Giant)
+#define	HVKBD_UNLOCK()		HVKBD_MTX_UNLOCK(&Giant)
+#define	HVKBD_LOCK_ASSERT()	HVKBD_MTX_ASSERT(&Giant, MA_OWNED)
+
+/* Bits kept in the softc's sc_flags field. */
+#define	HVKBD_FLAG_COMPOSE	0x00000001	/* compose char flag */
+#define HVKBD_FLAG_POLLING	0x00000002	/* sc_polling is non-zero */
+
+#ifdef EVDEV_SUPPORT
+/* Forward declaration; the definition follows the keyboard methods. */
+static evdev_event_t hvkbd_ev_event;
+
+/* evdev method table: only the event callback is provided. */
+static const struct evdev_methods hvkbd_evdev_methods = {
+	.ev_event = hvkbd_ev_event,
+};
+#endif
+
+/*
+ * Early keyboard probe, not supported.  This is the configure hook handed
+ * to KEYBOARD_DRIVER(); it ignores `flags' and always returns 0.
+ */
+static int
+hvkbd_configure(int flags)
+{
+	return (0);
+}
+
+/*
+ * Detect a keyboard, not used.  All arguments are ignored and ENXIO is
+ * always returned.
+ */
+static int
+hvkbd_probe(int unit, void *arg, int flags)
+{
+	return (ENXIO);
+}
+
+/*
+ * Reset and initialize the device, not used.  Emits a debug trace and
+ * always returns ENXIO; the keyboard is set up in hv_kbd_drv_attach()
+ * instead.
+ */
+static int
+hvkbd_init(int unit, keyboard_t **kbdp, void *arg, int flags)
+{
+	DEBUG_HVKBD(*kbdp, "%s\n", __func__);
+	return (ENXIO);
+}
+
+/*
+ * Test the interface to the device, not used.  Emits a debug trace and
+ * always returns 0.
+ */
+static int
+hvkbd_test_if(keyboard_t *kbd)
+{
+	DEBUG_HVKBD(kbd, "%s\n", __func__);
+	return (0);
+}
+
+/*
+ * Finish using this keyboard, not used.  Emits a debug trace and always
+ * returns ENXIO.
+ */
+static int
+hvkbd_term(keyboard_t *kbd)
+{
+	DEBUG_HVKBD(kbd, "%s\n", __func__);
+	return (ENXIO);
+}
+
+/*
+ * Keyboard interrupt routine of the keyboard switch, not used.  Input is
+ * delivered through hv_kbd_intr() instead.  Always returns 0.
+ */
+static int
+hvkbd_intr(keyboard_t *kbd, void *arg)
+{
+	DEBUG_HVKBD(kbd, "%s\n", __func__);
+	return (0);
+}
+
+/*
+ * Lock the access to the keyboard, not used.  The `lock' argument is
+ * ignored and 1 is always returned.
+ */
+static int
+hvkbd_lock(keyboard_t *kbd, int lock)
+{
+	DEBUG_HVKBD(kbd, "%s\n", __func__);
+	return (1);
+}
+
+/*
+ * Save the internal state, not used.  `buf' is never written; the result
+ * is 1 when `len' is zero and -1 for any other length.
+ */
+static int
+hvkbd_get_state(keyboard_t *kbd, void *buf, size_t len)
+{
+	DEBUG_HVKBD(kbd,"%s\n",  __func__);
+	return (len == 0) ? 1 : -1;
+}
+
+/*
+ * Set the internal state, not used.  `buf' and `len' are ignored and
+ * EINVAL is always returned.
+ */
+static int
+hvkbd_set_state(keyboard_t *kbd, void *buf, size_t len)
+{
+	DEBUG_HVKBD(kbd, "%s\n", __func__);
+	return (EINVAL);
+}
+
+/*
+ * Switch polled mode on or off.  Requests nest: every "on" bumps a
+ * counter and every "off" drops it (never below zero), which lets
+ * cngrab() recurse, e.g. during a panic.  HVKBD_FLAG_POLLING mirrors
+ * whether the counter is non-zero.  Always returns 0.
+ */
+static int
+hvkbd_poll(keyboard_t *kbd, int on)
+{
+	hv_kbd_sc *softc = kbd->kb_data;
+
+	HVKBD_LOCK();
+	if (on != 0) {
+		softc->sc_polling++;
+	} else if (softc->sc_polling > 0) {
+		softc->sc_polling--;
+	}
+
+	/* Keep the flag bit in step with the nesting counter. */
+	if (softc->sc_polling == 0)
+		softc->sc_flags &= ~HVKBD_FLAG_POLLING;
+	else
+		softc->sc_flags |= HVKBD_FLAG_POLLING;
+	HVKBD_UNLOCK();
+
+	return (0);
+}
+
+/*
+ * Enable the access to the device; until this function is called,
+ * the client cannot read from the keyboard.
+ *
+ * Marks the keyboard active with KBD_ACTIVATE() while holding the driver
+ * lock.  Always returns 0.
+ */
+static int
+hvkbd_enable(keyboard_t *kbd)
+{
+	HVKBD_LOCK();
+	KBD_ACTIVATE(kbd);
+	HVKBD_UNLOCK();
+	return (0);
+}
+
+/*
+ * Disallow the access to the device.  Marks the keyboard inactive with
+ * KBD_DEACTIVATE() while holding the driver lock.  Always returns 0.
+ */
+static int
+hvkbd_disable(keyboard_t *kbd)
+{
+	DEBUG_HVKBD(kbd, "%s\n", __func__);
+	HVKBD_LOCK();
+	KBD_DEACTIVATE(kbd);
+	HVKBD_UNLOCK();
+	return (0);
+}
+
+/*
+ * Pull packets off the channel by hand until a keystroke is queued.
+ * With `wait' equal to zero the channel is read at most once; otherwise
+ * the read is repeated until the keystroke queue is non-empty.
+ */
+static void
+hvkbd_do_poll(hv_kbd_sc *sc, uint8_t wait)
+{
+	for (;;) {
+		if (hv_kbd_prod_is_ready(sc))
+			return;
+		hv_kbd_read_channel(sc->hs_chan, sc);
+		if (wait == 0)
+			return;
+	}
+}
+
+/*
+ * Check if data is waiting.  Currently unused: it only emits a debug
+ * trace and always returns 0.
+ */
+static int
+hvkbd_check(keyboard_t *kbd)
+{
+	DEBUG_HVKBD(kbd, "%s\n", __func__);
+	return (0);
+}
+
+/*
+ * Report whether a character can be read right now.  The driver lock
+ * must be held.  Returns TRUE when a finished composed character is
+ * pending or the keystroke queue is non-empty, FALSE otherwise (and
+ * always FALSE while the keyboard is inactive).  In polled mode the
+ * channel is read once before the queue is examined.
+ */
+static int
+hvkbd_check_char_locked(keyboard_t *kbd)
+{
+	hv_kbd_sc *softc;
+
+	HVKBD_LOCK_ASSERT();
+	if (!KBD_IS_ACTIVE(kbd))
+		return (FALSE);
+
+	softc = kbd->kb_data;
+	/* A composed character is ready once composing has ended. */
+	if ((softc->sc_flags & HVKBD_FLAG_COMPOSE) == 0 &&
+	    softc->sc_composed_char != 0)
+		return (TRUE);
+
+	if ((softc->sc_flags & HVKBD_FLAG_POLLING) != 0)
+		hvkbd_do_poll(softc, 0);
+
+	return (hv_kbd_prod_is_ready(softc) ? TRUE : FALSE);
+}
+
+/*
+ * Locking wrapper around hvkbd_check_char_locked(): takes the driver
+ * lock, performs the check and hands back its result.
+ */
+static int
+hvkbd_check_char(keyboard_t *kbd)
+{
+	int pending;
+
+	HVKBD_LOCK();
+	pending = hvkbd_check_char_locked(kbd);
+	HVKBD_UNLOCK();
+	return (pending);
+}
+
+/*
+ * Read char from the keyboard.
+ *
+ * Consumes entries from the keystroke queue and translates them according
+ * to sc_mode: the raw scancode for K_RAW, a keycode with the release bit
+ * for K_CODE, otherwise the action computed by genkbd_keyaction().  The
+ * driver lock must be held.
+ *
+ * Returns NOKEY when the keyboard is inactive or no keystroke is queued,
+ * and ERRKEY when a composed character exceeds UCHAR_MAX or composing is
+ * interrupted by another key.  The `wait' argument is not referenced in
+ * the body.
+ */
+static uint32_t
+hvkbd_read_char_locked(keyboard_t *kbd, int wait)
+{
+	uint32_t scancode = NOKEY;
+	uint32_t action;
+	keystroke ks;
+	hv_kbd_sc *sc = kbd->kb_data;
+	int keycode;
+
+	HVKBD_LOCK_ASSERT();
+
+	if (!KBD_IS_ACTIVE(kbd) || !hv_kbd_prod_is_ready(sc))
+		return (NOKEY);
+
+	/* Re-entered whenever the current scancode yields nothing to return. */
+next_code:
+
+	/* do we have a composed char to return? */
+	if (!(sc->sc_flags & HVKBD_FLAG_COMPOSE) && sc->sc_composed_char > 0) {
+		action = sc->sc_composed_char;
+		sc->sc_composed_char = 0;
+		if (action > UCHAR_MAX) {
+			return (ERRKEY);
+		}
+		return (action);
+	}
+
+	/* Peek at the head of the queue; non-zero means it is empty. */
+	if (hv_kbd_fetch_top(sc, &ks)) {
+		return (NOKEY);
+	}
+	if ((ks.info & IS_E0) || (ks.info & IS_E1)) {
+		/**
+		 * Emulate the generation of E0 or E1 scancode,
+		 * the real scancode will be consumed next time.
+		 */
+		if (ks.info & IS_E0) {
+			scancode = XTKBD_EMUL0;
+			ks.info &= ~IS_E0;
+		} else if (ks.info & IS_E1) {
+			scancode = XTKBD_EMUL1;
+			ks.info &= ~IS_E1;
+		}
+		/**
+		 * Change the top item to avoid encountering
+		 * E0 or E1 twice.
+		 */
+		hv_kbd_modify_top(sc, &ks);
+	} else if (ks.info & IS_UNICODE) {
+		/**
+		 * XXX: Hyperv host send unicode to VM through
+		 * 'Type clipboard text', the mapping from
+		 * unicode to scancode depends on the keymap.
+		 * It is so complicated that we do not plan to
+		 * support it yet.
+		 */
+		if (bootverbose)
+			device_printf(sc->dev, "Unsupported unicode\n");
+		hv_kbd_remove_top(sc);
+		return (NOKEY);
+	} else {
+		/* Plain make/break code: consume the queue entry. */
+		scancode = ks.makecode;
+		if (ks.info & IS_BREAK) {
+			scancode |= XTKBD_RELEASE;
+		}
+		hv_kbd_remove_top(sc);
+	}
+#ifdef EVDEV_SUPPORT
+	/* push evdev event */
+	if (evdev_rcpt_mask & EVDEV_RCPT_HW_KBD &&
+	    sc->ks_evdev != NULL) {
+		keycode = evdev_scancode2key(&sc->ks_evdev_state,
+		    scancode);
+
+		if (keycode != KEY_RESERVED) {
+			/* Bit 0x80 set means release (value 0), clear means press. */
+			evdev_push_event(sc->ks_evdev, EV_KEY,
+			    (uint16_t)keycode, scancode & 0x80 ? 0 : 1);
+			evdev_sync(sc->ks_evdev);
+		}
+	}
+#endif
+	++kbd->kb_count;
+	DEBUG_HVKBD(kbd, "read scan: 0x%x\n", scancode);
+
+	/* return the byte as is for the K_RAW mode */
+	if (sc->sc_mode == K_RAW)
+		return scancode;
+
+	/* translate the scan code into a keycode */
+	keycode = scancode & 0x7F;
+	/* sc_prefix remembers an E0/E1 (or E1-1D) prefix seen on a prior pass. */
+	switch (sc->sc_prefix) {
+	case 0x00:      /* normal scancode */
+		switch(scancode) {
+		case 0xB8:      /* left alt (compose key) released */
+			if (sc->sc_flags & HVKBD_FLAG_COMPOSE) {
+				sc->sc_flags &= ~HVKBD_FLAG_COMPOSE;
+				if (sc->sc_composed_char > UCHAR_MAX)
+					sc->sc_composed_char = 0;
+			}
+			break;
+		case 0x38:      /* left alt (compose key) pressed */
+			if (!(sc->sc_flags & HVKBD_FLAG_COMPOSE)) {
+				sc->sc_flags |= HVKBD_FLAG_COMPOSE;
+				sc->sc_composed_char = 0;
+			}
+			break;
+		case 0xE0:
+		case 0xE1:
+			/* Record the prefix and fetch the code it applies to. */
+			sc->sc_prefix = scancode;
+			goto next_code;
+		}
+		break;
+	case 0xE0:		/* 0xE0 prefix */
+		sc->sc_prefix = 0;
+		switch (keycode) {
+		case 0x1C:	/* right enter key */
+			keycode = 0x59;
+			break;
+		case 0x1D:	/* right ctrl key */
+			keycode = 0x5A;
+			break;
+		case 0x35:	/* keypad divide key */
+			keycode = 0x5B;
+			break;
+		case 0x37:	/* print scrn key */
+			keycode = 0x5C;
+			break;
+		case 0x38:	/* right alt key (alt gr) */
+			keycode = 0x5D;
+			break;
+		case 0x46:	/* ctrl-pause/break on AT 101 (see below) */
+			keycode = 0x68;
+			break;
+		case 0x47:	/* grey home key */
+			keycode = 0x5E;
+			break;
+		case 0x48:	/* grey up arrow key */
+			keycode = 0x5F;
+			break;
+		case 0x49:	/* grey page up key */
+			keycode = 0x60;
+			break;
+		case 0x4B:	/* grey left arrow key */
+			keycode = 0x61;
+			break;
+		case 0x4D:	/* grey right arrow key */
+			keycode = 0x62;
+			break;
+		case 0x4F:	/* grey end key */
+			keycode = 0x63;
+			break;
+		case 0x50:	/* grey down arrow key */
+			keycode = 0x64;
+			break;
+		case 0x51:	/* grey page down key */
+			keycode = 0x65;
+			break;
+		case 0x52:	/* grey insert key */
+			keycode = 0x66;
+			break;
+		case 0x53:	/* grey delete key */
+			keycode = 0x67;
+			break;
+			/* the following 3 are only used on the MS "Natural" keyboard */
+		case 0x5b:	/* left Window key */
+			keycode = 0x69;
+			break;
+		case 0x5c:	/* right Window key */
+			keycode = 0x6a;
+			break;
+		case 0x5d:	/* menu key */
+			keycode = 0x6b;
+			break;
+		case 0x5e:	/* power key */
+			keycode = 0x6d;
+			break;
+		case 0x5f:	/* sleep key */
+			keycode = 0x6e;
+			break;
+		case 0x63:	/* wake key */
+			keycode = 0x6f;
+			break;
+		default:	/* ignore everything else */
+			goto next_code;
+		}
+		break;
+	case 0xE1:	/* 0xE1 prefix */
+		/*
+		 * The pause/break key on the 101 keyboard produces:
+		 * E1-1D-45 E1-9D-C5
+		 * Ctrl-pause/break produces:
+		 * E0-46 E0-C6 (See above.)
+		 */
+		sc->sc_prefix = 0;
+		if (keycode == 0x1D)
+			sc->sc_prefix = 0x1D;
+		goto next_code;
+		/* NOT REACHED */
+	case 0x1D:	/* pause / break */
+		sc->sc_prefix = 0;
+		if (keycode != 0x45)
+			goto next_code;
+		keycode = 0x68;
+		break;
+	}
+
+	/* XXX assume 101/102 keys AT keyboard */
+	/*
+	 * NOTE(review): ALTS and CTLS are tested against sc_flags, whose only
+	 * bits defined in this file are HVKBD_FLAG_*; confirm whether
+	 * sc_state was intended.
+	 */
+	switch (keycode) {
+	case 0x5c:      /* print screen */
+		if (sc->sc_flags & ALTS)
+			keycode = 0x54; /* sysrq */
+		break;
+	case 0x68:      /* pause/break */
+		if (sc->sc_flags & CTLS)
+			keycode = 0x6c; /* break */
+		break;
+	}
+
+	/* return the key code in the K_CODE mode */
+	if (sc->sc_mode == K_CODE)
+		return (keycode | (scancode & 0x80));
+
+	/* compose a character code */
+	if (sc->sc_flags &  HVKBD_FLAG_COMPOSE) {
+		/*
+		 * Each keypad press appends one decimal digit; the offsets
+		 * below map the keycode ranges onto 7-9, 4-6 and 1-3.
+		 */
+		switch (keycode | (scancode & 0x80)) {
+		/* key pressed, process it */
+		case 0x47: case 0x48: case 0x49:	/* keypad 7,8,9 */
+			sc->sc_composed_char *= 10;
+			sc->sc_composed_char += keycode - 0x40;
+			if (sc->sc_composed_char > UCHAR_MAX)
+				return ERRKEY;
+			goto next_code;
+		case 0x4B: case 0x4C: case 0x4D:	/* keypad 4,5,6 */
+			sc->sc_composed_char *= 10;
+			sc->sc_composed_char += keycode - 0x47;
+			if (sc->sc_composed_char > UCHAR_MAX)
+				return ERRKEY;
+			goto next_code;
+		case 0x4F: case 0x50: case 0x51:	/* keypad 1,2,3 */
+			sc->sc_composed_char *= 10;
+			sc->sc_composed_char += keycode - 0x4E;
+			if (sc->sc_composed_char > UCHAR_MAX)
+				return ERRKEY;
+			goto next_code;
+		case 0x52:				/* keypad 0 */
+			sc->sc_composed_char *= 10;
+			if (sc->sc_composed_char > UCHAR_MAX)
+				return ERRKEY;
+			goto next_code;
+
+		/* key released, no interest here */
+		case 0xC7: case 0xC8: case 0xC9:	/* keypad 7,8,9 */
+		case 0xCB: case 0xCC: case 0xCD:	/* keypad 4,5,6 */
+		case 0xCF: case 0xD0: case 0xD1:	/* keypad 1,2,3 */
+		case 0xD2:				/* keypad 0 */
+			goto next_code;
+
+		case 0x38:				/* left alt key */
+			break;
+
+		default:
+			/* Any other key aborts a composition in progress. */
+			if (sc->sc_composed_char > 0) {
+				sc->sc_flags &= ~HVKBD_FLAG_COMPOSE;
+				sc->sc_composed_char = 0;
+				return (ERRKEY);
+			}
+			break;
+		}
+	}
+
+	/* keycode to key action */
+	action = genkbd_keyaction(kbd, keycode, scancode & 0x80,
+				  &sc->sc_state, &sc->sc_accents);
+	if (action == NOKEY)
+		goto next_code;
+	else
+		return (action);
+}
+
+/*
+ * Locking wrapper around hvkbd_read_char_locked().  Currently wait is
+ * always false.
+ */
+static uint32_t
+hvkbd_read_char(keyboard_t *kbd, int wait)
+{
+	uint32_t ch;
+
+	HVKBD_LOCK();
+	ch = hvkbd_read_char_locked(kbd, wait);
+	HVKBD_UNLOCK();
+	return (ch);
+}
+
+/*
+ * Clear the internal state of the keyboard: drop any pending accent and
+ * composed character, clear the polling and compose flag bits, and keep
+ * only the lock-key bits of sc_state.
+ */
+static void
+hvkbd_clear_state(keyboard_t *kbd)
+{
+	hv_kbd_sc *softc = kbd->kb_data;
+
+	softc->sc_composed_char = 0;
+	softc->sc_accents = 0;
+	softc->sc_flags &= ~(HVKBD_FLAG_POLLING | HVKBD_FLAG_COMPOSE);
+	/* preserve locking key state */
+	softc->sc_state &= LOCK_MASK;
+}
+
+/*
+ * Keyboard ioctl handler.
+ *
+ * Handles the mode, lock-state and LED get/set requests locally and
+ * forwards everything else to genkbd_commonioctl().  Returns 0 on
+ * success, EINVAL for an unknown mode or for state/LED bits outside
+ * LOCK_MASK, or whatever genkbd_commonioctl() returns.
+ *
+ * When the COMPAT_* options are defined, the old by-value forms of the
+ * set requests copy their integer argument into `ival' and fall through
+ * to the pointer-based case.
+ */
+static int
+hvkbd_ioctl_locked(keyboard_t *kbd, u_long cmd, caddr_t arg)
+{
+	int i;
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+    defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+        int ival;
+#endif
+	hv_kbd_sc *sc = kbd->kb_data;
+	switch (cmd) {
+	case KDGKBMODE:		/* get keyboard mode */
+		*(int *)arg = sc->sc_mode;
+		break;
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+    defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+	case _IO('K', 7):
+		ival = IOCPARM_IVAL(arg);
+		arg = (caddr_t)&ival;
+		/* FALLTHROUGH */
+#endif
+	case KDSKBMODE:		/* set keyboard mode */
+		DEBUG_HVKBD(kbd, "expected mode: %x\n", *(int *)arg);
+		switch (*(int *)arg) {
+		case K_XLATE:
+			if (sc->sc_mode != K_XLATE) {
+				/* make lock key state and LED state match */
+				sc->sc_state &= ~LOCK_MASK;
+				sc->sc_state |= KBD_LED_VAL(kbd);
+			}
+			/* FALLTHROUGH */
+		case K_RAW:
+		case K_CODE:
+			if (sc->sc_mode != *(int *)arg) {
+				DEBUG_HVKBD(kbd, "mod changed to %x\n", *(int *)arg);
+				/* The state is left alone while polling is active. */
+				if ((sc->sc_flags & HVKBD_FLAG_POLLING) == 0)
+					hvkbd_clear_state(kbd);
+				sc->sc_mode = *(int *)arg;
+			}
+			break;
+		default:
+			return (EINVAL);
+		}
+		break;
+	case KDGKBSTATE:	/* get lock key state */
+		*(int *)arg = sc->sc_state & LOCK_MASK;
+		break;
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+    defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+	case _IO('K', 20):
+		ival = IOCPARM_IVAL(arg);
+		arg = (caddr_t)&ival;
+		/* FALLTHROUGH */
+#endif
+	case KDSKBSTATE:		/* set lock key state */
+		if (*(int *)arg & ~LOCK_MASK) {
+			return (EINVAL);
+		}
+		sc->sc_state &= ~LOCK_MASK;
+		sc->sc_state |= *(int *)arg;
+		/* Update the LEDs to match by re-entering as KDSETLED. */
+		return hvkbd_ioctl_locked(kbd, KDSETLED, arg);
+	case KDGETLED:			/* get keyboard LED */
+		*(int *)arg = KBD_LED_VAL(kbd);
+		break;
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+    defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+	case _IO('K', 66):
+		ival = IOCPARM_IVAL(arg);
+		arg = (caddr_t)&ival;
+		/* FALLTHROUGH */
+#endif
+	case KDSETLED:			/* set keyboard LED */
+		/* NOTE: lock key state in "sc_state" won't be changed */
+		if (*(int *)arg & ~LOCK_MASK)
+			return (EINVAL);
+
+		i = *(int *)arg;
+
+		/* replace CAPS LED with ALTGR LED for ALTGR keyboards */
+		if (sc->sc_mode == K_XLATE &&
+		    kbd->kb_keymap->n_keys > ALTGR_OFFSET) {
+			if (i & ALKED)
+				i |= CLKED;
+			else
+				i &= ~CLKED;
+		}
+		/*
+		 * NOTE: `i' is not read after this point; the unmodified
+		 * *arg is what gets pushed to evdev and stored below.
+		 */
+		if (KBD_HAS_DEVICE(kbd)) {
+			DEBUG_HVSC(sc, "setled 0x%x\n", *(int *)arg);
+		}
+
+#ifdef EVDEV_SUPPORT
+		/* push LED states to evdev */
+		if (sc->ks_evdev != NULL &&
+		    evdev_rcpt_mask & EVDEV_RCPT_HW_KBD)
+			evdev_push_leds(sc->ks_evdev, *(int *)arg);
+#endif
+		KBD_LED_VAL(kbd) = *(int *)arg;
+		break;
+	case PIO_KEYMAP:	/* set keyboard translation table */
+	case OPIO_KEYMAP:	/* set keyboard translation table (compat) */
+	case PIO_KEYMAPENT:	/* set keyboard translation table entry */
+	case PIO_DEADKEYMAP:	/* set accent key translation table */
+		/* Drop any pending accent before the tables are replaced. */
+		sc->sc_accents = 0;
+		/* FALLTHROUGH */
+	default:
+		return (genkbd_commonioctl(kbd, cmd, arg));
+	}
+	return (0);
+}
+
+/*
+ * Some useful control functions.  Locking wrapper around
+ * hvkbd_ioctl_locked() that also traces entry and exit.
+ */
+static int
+hvkbd_ioctl(keyboard_t *kbd, u_long cmd, caddr_t arg)
+{
+	int error;
+
+	DEBUG_HVKBD(kbd, "%s: %lx start\n", __func__, cmd);
+
+	HVKBD_LOCK();
+	error = hvkbd_ioctl_locked(kbd, cmd, arg);
+	HVKBD_UNLOCK();
+
+	DEBUG_HVKBD(kbd, "%s: %lx end %d\n", __func__, cmd, error);
+	return (error);
+}
+
+/*
+ * Read one byte from the keyboard if it's allowed.  Currently unused.
+ *
+ * Asserts that the driver lock is held rather than taking it.  Returns -1
+ * when the keyboard is inactive, otherwise the result of
+ * hvkbd_read_char_locked().
+ */
+static int
+hvkbd_read(keyboard_t *kbd, int wait)
+{
+	DEBUG_HVKBD(kbd, "%s\n", __func__);
+	HVKBD_LOCK_ASSERT();
+	if (!KBD_IS_ACTIVE(kbd))
+		return (-1);
+	return hvkbd_read_char_locked(kbd, wait);
+}
+
+#ifdef EVDEV_SUPPORT
+/*
+ * evdev event callback.  Only EV_LED and EV_REP events are forwarded to
+ * kbd_ev_event(), under Giant, and only while hardware keyboard events
+ * are selected in evdev_rcpt_mask; everything else is dropped.
+ */
+static void
+hvkbd_ev_event(struct evdev_dev *evdev, uint16_t type, uint16_t code,
+    int32_t value)
+{
+	keyboard_t *kbd = evdev_get_softc(evdev);
+
+	if ((evdev_rcpt_mask & EVDEV_RCPT_HW_KBD) == 0)
+		return;
+	if (type != EV_LED && type != EV_REP)
+		return;
+
+	mtx_lock(&Giant);
+	kbd_ev_event(kbd, type, code, value);
+	mtx_unlock(&Giant);
+}
+#endif
+
+/*
+ * Keyboard switch: the method table registered for this driver.  Entries
+ * marked "not used" point at the stub implementations above.
+ */
+static keyboard_switch_t hvkbdsw = {
+	.probe =	hvkbd_probe,		/* not used */
+	.init =		hvkbd_init,		/* stub, returns ENXIO */
+	.term =		hvkbd_term,		/* not used */
+	.intr =		hvkbd_intr,		/* not used */
+	.test_if =	hvkbd_test_if,		/* not used */
+	.enable =	hvkbd_enable,
+	.disable =	hvkbd_disable,
+	.read =		hvkbd_read,
+	.check =	hvkbd_check,
+	.read_char =	hvkbd_read_char,
+	.check_char =	hvkbd_check_char,
+	.ioctl =	hvkbd_ioctl,
+	.lock =		hvkbd_lock,		/* not used */
+	.clear_state =	hvkbd_clear_state,
+	.get_state =	hvkbd_get_state,	/* not used */
+	.set_state =	hvkbd_set_state,	/* not used */
+	.poll =		hvkbd_poll,
+};
+
+/* Declares the driver from the switch above and the configure hook. */
+KEYBOARD_DRIVER(hvkbd, hvkbdsw, hvkbd_configure);
+
+/*
+ * Deliver newly queued input.  Does nothing while the keyboard is being
+ * polled.  If the keyboard is active and has a client, the client's
+ * callback is invoked with KBDIO_KEYINPUT; otherwise the input is read
+ * and thrown away until NOKEY is returned.
+ */
+void
+hv_kbd_intr(hv_kbd_sc *sc)
+{
+	keyboard_t *kbd = &sc->sc_kbd;
+
+	if ((sc->sc_flags & HVKBD_FLAG_POLLING) != 0)
+		return;
+
+	if (KBD_IS_ACTIVE(kbd) && KBD_IS_BUSY(kbd)) {
+		/* let the callback function process the input */
+		(kbd->kb_callback.kc_func) (kbd, KBDIO_KEYINPUT,
+		    kbd->kb_callback.kc_arg);
+		return;
+	}
+
+	/* read and discard the input, no one is waiting for it */
+	while (hvkbd_read_char(kbd, 0) != NOKEY)
+		continue;
+}
+
+/*
+ * Module event handler: adds the keyboard driver on MOD_LOAD and removes
+ * it on MOD_UNLOAD.  Other events are ignored.  Always returns 0.
+ */
+int
+hvkbd_driver_load(module_t mod, int what, void *arg)
+{
+	if (what == MOD_LOAD)
+		kbd_add_driver(&hvkbd_kbd_driver);
+	else if (what == MOD_UNLOAD)
+		kbd_delete_driver(&hvkbd_kbd_driver);
+
+	return (0);
+}
+
+/*
+ * Set up the keyboard embedded in the softc and register it.
+ *
+ * Returns 0 on success.  Returns ENXIO when the keyboard switch cannot be
+ * found, or when kbd_register() (or kbd_attach(), with KBD_INSTALL_CDEV)
+ * fails; in the latter cases hv_kbd_drv_detach() is run first.
+ */
+int
+hv_kbd_drv_attach(device_t dev)
+{
+	hv_kbd_sc *sc = device_get_softc(dev);
+	int unit = device_get_unit(dev);
+	keyboard_t *kbd = &sc->sc_kbd;
+	keyboard_switch_t *sw;
+#ifdef EVDEV_SUPPORT
+	struct evdev_dev *evdev;
+#endif
+
+	sw = kbd_get_switch(HVKBD_DRIVER_NAME);
+	if (sw == NULL) {
+		return (ENXIO);
+	}
+
+	/* Initialize the keyboard structure and mark each stage as done. */
+	kbd_init_struct(kbd, HVKBD_DRIVER_NAME, KB_OTHER, unit, 0, 0, 0);
+	kbd->kb_data = (void *)sc;
+	kbd_set_maps(kbd, &key_map, &accent_map, fkey_tab, nitems(fkey_tab));
+	KBD_FOUND_DEVICE(kbd);
+	hvkbd_clear_state(kbd);
+	KBD_PROBE_DONE(kbd);
+	KBD_INIT_DONE(kbd);
+	sc->sc_mode = K_XLATE;
+	(*sw->enable)(kbd);
+
+#ifdef EVDEV_SUPPORT
+	evdev = evdev_alloc();
+	evdev_set_name(evdev, "Hyper-V keyboard");
+	evdev_set_phys(evdev, device_get_nameunit(dev));
+	evdev_set_id(evdev, BUS_VIRTUAL, 0, 0, 0);
+	evdev_set_methods(evdev, kbd, &hvkbd_evdev_methods);
+	evdev_support_event(evdev, EV_SYN);
+	evdev_support_event(evdev, EV_KEY);
+	evdev_support_event(evdev, EV_LED);
+	evdev_support_event(evdev, EV_REP);
+	evdev_support_all_known_keys(evdev);
+	evdev_support_led(evdev, LED_NUML);
+	evdev_support_led(evdev, LED_CAPSL);
+	evdev_support_led(evdev, LED_SCROLLL);
+	/* A failed evdev registration is not fatal; ks_evdev is just not set. */
+	if (evdev_register_mtx(evdev, &Giant))
+		evdev_free(evdev);
+	else
+		sc->ks_evdev = evdev;
+	sc->ks_evdev_state = 0;
+#endif
+
+	if (kbd_register(kbd) < 0) {
+		goto detach;
+	}
+	KBD_CONFIG_DONE(kbd);
+#ifdef KBD_INSTALL_CDEV
+        if (kbd_attach(kbd)) {
+		goto detach;
+	}
+#endif
+	if (bootverbose) {
+		kbdd_diag(kbd, bootverbose);
+	}
+	return (0);
+detach:
+	/* Undo the partial setup through the regular detach path. */
+	hv_kbd_drv_detach(dev);
+	return (ENXIO);
+}
+
+/*
+ * Tear down the keyboard set up by hv_kbd_drv_attach().
+ *
+ * Disables the keyboard, releases the evdev device (if any) and
+ * unregisters the keyboard when it was fully configured.  A failure of
+ * kbd_unregister() is only reported.  Returns the last error recorded,
+ * or 0.
+ */
+int
+hv_kbd_drv_detach(device_t dev)
+{
+	int error = 0;
+	hv_kbd_sc *sc = device_get_softc(dev);
+#ifdef EVDEV_SUPPORT
+	struct evdev_dev *evdev;
+#endif
+
+	hvkbd_disable(&sc->sc_kbd);
+#ifdef EVDEV_SUPPORT
+	/*
+	 * Clear the softc pointer before releasing the device.  The read
+	 * and ioctl paths test ks_evdev against NULL, so a stale pointer
+	 * would let them touch freed memory, and a repeated detach would
+	 * free it twice.
+	 */
+	evdev = sc->ks_evdev;
+	sc->ks_evdev = NULL;
+	evdev_free(evdev);
+#endif
+	if (KBD_IS_CONFIGURED(&sc->sc_kbd)) {
+		error = kbd_unregister(&sc->sc_kbd);
+		if (error) {
+			device_printf(dev, "WARNING: kbd_unregister() "
+			    "returned non-zero! (ignored)\n");
+		}
+	}
+#ifdef KBD_INSTALL_CDEV
+	error = kbd_detach(&sc->sc_kbd);
+#endif
+	return (error);
+}
+
diff --git a/sys/dev/hyperv/input/hv_kbdc.c b/sys/dev/hyperv/input/hv_kbdc.c
new file mode 100644
index 000000000000..7065ff3057a7
--- /dev/null
+++ b/sys/dev/hyperv/input/hv_kbdc.c
@@ -0,0 +1,530 @@
+/*-
+ * Copyright (c) 2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/lock.h>
+#include <sys/taskqueue.h>
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/mutex.h>
+
+#include <sys/kbio.h>
+#include <dev/kbd/kbdreg.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/utilities/hv_utilreg.h>
+#include <dev/hyperv/utilities/vmbus_icreg.h>
+#include <dev/hyperv/utilities/vmbus_icvar.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+
+#include "dev/hyperv/input/hv_kbdc.h"
+#include "vmbus_if.h"
+
+/* Protocol version requested from the host: major 1, minor 0. */
+#define HV_KBD_VER_MAJOR	(1)
+#define HV_KBD_VER_MINOR	(0)
+
+/* Version word: major in the upper 16 bits, minor in the lower bits. */
+#define HV_KBD_VER		(HV_KBD_VER_MINOR | (HV_KBD_VER_MAJOR) << 16)
+
+#define HV_KBD_PROTO_ACCEPTED	(1)
+
+/* Buffer and ring-buffer sizes, expressed in pages. */
+#define HV_BUFF_SIZE		(4*PAGE_SIZE)
+#define HV_KBD_RINGBUFF_SEND_SZ	(10*PAGE_SIZE)
+#define HV_KBD_RINGBUFF_RECV_SZ (10*PAGE_SIZE)
+
+/* Values carried in the `type' field of the message header. */
+enum hv_kbd_msg_type_t {
+	HV_KBD_PROTO_REQUEST        = 1,
+	HV_KBD_PROTO_RESPONSE       = 2,
+	HV_KBD_PROTO_EVENT          = 3,
+	HV_KBD_PROTO_LED_INDICATORS = 4,
+};
+
+/* Header that starts every message; `type' is an hv_kbd_msg_type_t value. */
+typedef struct hv_kbd_msg_hdr_t {
+	uint32_t type;
+} hv_kbd_msg_hdr;
+
+/* Generic view of a message: the header followed by its payload bytes. */
+typedef struct hv_kbd_msg_t {
+	hv_kbd_msg_hdr hdr;
+	char data[];
+} hv_kbd_msg;
+
+/* Protocol request; `ver' is filled with HV_KBD_VER. */
+typedef struct hv_kbd_proto_req_t {
+	hv_kbd_msg_hdr	hdr;
+	uint32_t	ver;
+} hv_kbd_proto_req;
+
+/* Protocol response carrying a status word. */
+typedef struct hv_kbd_proto_resp_t {
+	hv_kbd_msg_hdr  hdr;
+	uint32_t	status;
+} hv_kbd_proto_resp;
+
+#define HV_KBD_PROTO_REQ_SZ	(sizeof(hv_kbd_proto_req))
+#define HV_KBD_PROTO_RESP_SZ	(sizeof(hv_kbd_proto_resp))
+
+/**
+ * Keystroke event message: the common header followed by the keystroke.
+ *
+ * the struct in win host:
+ * typedef struct _HK_MESSAGE_KEYSTROKE
+ * {
+ *     HK_MESSAGE_HEADER Header;
+ *     UINT16 MakeCode;
+ *     UINT32 IsUnicode:1;
+ *     UINT32 IsBreak:1;
+ *     UINT32 IsE0:1;
+ *     UINT32 IsE1:1;
+ *     UINT32 Reserved:28;
+ * } HK_MESSAGE_KEYSTROKE
+ */
+typedef struct hv_kbd_keystroke_t {
+	hv_kbd_msg_hdr  hdr;
+	keystroke	ks;
+} hv_kbd_keystroke;
+
+/*
+ * Devices matched by hv_kbd_probe(): a single GUID with its description,
+ * terminated by VMBUS_IC_DESC_END.
+ */
+static const struct vmbus_ic_desc vmbus_kbd_descs[] = {
+	{
+		.ic_guid = { .hv_guid = {
+		    0x6d, 0xad, 0x12, 0xf9, 0x17, 0x2b, 0xea, 0x48,
+		    0xbd, 0x65, 0xf9, 0x27, 0xa6, 0x1c, 0x76,  0x84} },
+		.ic_desc = "Hyper-V KBD"
+	},
+	VMBUS_IC_DESC_END
+};
+
+/* Forward declarations of the device attach/detach methods. */
+static int hv_kbd_attach(device_t dev);
+static int hv_kbd_detach(device_t dev);
+
+/**
+ * Tell whether the producer has queued anything.
+ * return 1 if the keystroke queue is non-empty, 0 otherwise
+ */
+int
+hv_kbd_prod_is_ready(hv_kbd_sc *sc)
+{
+	int ready;
+
+	mtx_lock(&sc->ks_mtx);
+	ready = !STAILQ_EMPTY(&sc->ks_queue);
+	mtx_unlock(&sc->ks_mtx);
+
+	return (ready);
+}
+
+/**
+ * Append a copy of *ks to the keystroke queue, using an entry taken from
+ * the free list.
+ * return 0 on success, 1 if the free list is empty (keystroke dropped)
+ */
+int
+hv_kbd_produce_ks(hv_kbd_sc *sc, const keystroke *ks)
+{
+	keystroke_info *slot;
+
+	mtx_lock(&sc->ks_mtx);
+	if (LIST_EMPTY(&sc->ks_free_list)) {
+		DEBUG_HVSC(sc, "NO buffer!\n");
+		mtx_unlock(&sc->ks_mtx);
+		return (1);
+	}
+	slot = LIST_FIRST(&sc->ks_free_list);
+	LIST_REMOVE(slot, link);
+	slot->ks = *ks;
+	STAILQ_INSERT_TAIL(&sc->ks_queue, slot, slink);
+	mtx_unlock(&sc->ks_mtx);
+
+	return (0);
+}
+
+/**
+ * Copy the first queued keystroke into *result and leave it queued.
+ * return 0 on success, 1 if the queue is empty (*result is untouched)
+ */
+int
+hv_kbd_fetch_top(hv_kbd_sc *sc, keystroke *result)
+{
+	keystroke_info *head;
+
+	mtx_lock(&sc->ks_mtx);
+	if (STAILQ_EMPTY(&sc->ks_queue)) {
+		DEBUG_HVSC(sc, "Empty queue!\n");
+		mtx_unlock(&sc->ks_mtx);
+		return (1);
+	}
+	head = STAILQ_FIRST(&sc->ks_queue);
+	*result = head->ks;
+	mtx_unlock(&sc->ks_mtx);
+
+	return (0);
+}
+
+/**
+ * Drop the first queued keystroke and put its entry back on the free
+ * list.
+ * return 0 on success, 1 if the queue is empty
+ */
+int
+hv_kbd_remove_top(hv_kbd_sc *sc)
+{
+	keystroke_info *head;
+
+	mtx_lock(&sc->ks_mtx);
+	if (STAILQ_EMPTY(&sc->ks_queue)) {
+		DEBUG_HVSC(sc, "Empty queue!\n");
+		mtx_unlock(&sc->ks_mtx);
+		return (1);
+	}
+	head = STAILQ_FIRST(&sc->ks_queue);
+	STAILQ_REMOVE_HEAD(&sc->ks_queue, slink);
+	LIST_INSERT_HEAD(&sc->ks_free_list, head, link);
+	mtx_unlock(&sc->ks_mtx);
+
+	return (0);
+}
+
+/**
+ * Overwrite the first queued keystroke with *top.
+ * return 0 on success, 1 if the queue is empty
+ */
+int
+hv_kbd_modify_top(hv_kbd_sc *sc, keystroke *top)
+{
+	keystroke_info *head;
+
+	mtx_lock(&sc->ks_mtx);
+	if (STAILQ_EMPTY(&sc->ks_queue)) {
+		DEBUG_HVSC(sc, "Empty queue!\n");
+		mtx_unlock(&sc->ks_mtx);
+		return (1);
+	}
+	head = STAILQ_FIRST(&sc->ks_queue);
+	head->ks = *top;
+	mtx_unlock(&sc->ks_mtx);
+
+	return (0);
+}
+
+/*
+ * Device probe.  Walks the GUID table and, on the first entry that
+ * VMBUS_PROBE_GUID() accepts, sets the device description and returns
+ * BUS_PROBE_DEFAULT.  Returns ENXIO when the device is disabled through
+ * a resource hint or when nothing matches.
+ */
+static int
+hv_kbd_probe(device_t dev)
+{
+	device_t parent = device_get_parent(dev);
+	const struct vmbus_ic_desc *desc = vmbus_kbd_descs;
+
+	if (resource_disabled(device_get_name(dev), 0))
+		return (ENXIO);
+
+	while (desc->ic_desc != NULL) {
+		if (VMBUS_PROBE_GUID(parent, dev, &desc->ic_guid) != 0) {
+			++desc;
+			continue;
+		}
+		device_set_desc(dev, desc->ic_desc);
+		return (BUS_PROBE_DEFAULT);
+	}
+
+	return (ENXIO);
+}
+
+/*
+ * Hand a protocol response to the transaction context, if there is one,
+ * passing along the packet's data pointer and data length.
+ */
+static void
+hv_kbd_on_response(hv_kbd_sc *sc, struct vmbus_chanpkt_hdr *pkt)
+{
+	struct vmbus_xact_ctx *ctx = sc->hs_xact_ctx;
+
+	if (ctx == NULL)
+		return;
+
+	DEBUG_HVSC(sc, "hvkbd is ready\n");
+	vmbus_xact_ctx_wakeup(ctx, VMBUS_CHANPKT_CONST_DATA(pkt),
+	    VMBUS_CHANPKT_DATALEN(pkt));
+}
+
+/*
+ * Dispatch one inband packet by the message type in its header.
+ *
+ * Protocol responses are handed to hv_kbd_on_response().  Keystroke
+ * events are queued with hv_kbd_produce_ks() and then delivered through
+ * hv_kbd_intr().  Any other type is ignored.  Packets whose payload is
+ * not longer than the message header, and keystroke events too short to
+ * hold a full hv_kbd_keystroke, are reported and dropped.
+ */
+static void
+hv_kbd_on_received(hv_kbd_sc *sc, struct vmbus_chanpkt_hdr *pkt)
+{
+
+	const hv_kbd_msg *msg = VMBUS_CHANPKT_CONST_DATA(pkt);
+	const hv_kbd_proto_resp *resp =
+	    VMBUS_CHANPKT_CONST_DATA(pkt);
+	const hv_kbd_keystroke *keystroke =
+	    VMBUS_CHANPKT_CONST_DATA(pkt);
+	uint32_t msg_len = VMBUS_CHANPKT_DATALEN(pkt);
+	enum hv_kbd_msg_type_t msg_type;
+	uint32_t info;
+	uint16_t scan_code;
+
+	if (msg_len <= sizeof(hv_kbd_msg)) {
+		device_printf(sc->dev, "Illegal packet\n");
+		return;
+	}
+	msg_type = msg->hdr.type;
+	switch (msg_type) {
+	case HV_KBD_PROTO_RESPONSE:
+		hv_kbd_on_response(sc, pkt);
+		DEBUG_HVSC(sc, "keyboard resp: 0x%x\n",
+		    resp->status);
+		break;
+	case HV_KBD_PROTO_EVENT:
+		/*
+		 * The length comes from the host; never read the keystroke
+		 * fields unless the payload really contains all of them.
+		 */
+		if (msg_len < sizeof(*keystroke)) {
+			device_printf(sc->dev, "Illegal packet\n");
+			break;
+		}
+		info = keystroke->ks.info;
+		scan_code = keystroke->ks.makecode;
+		DEBUG_HVSC(sc, "keystroke info: 0x%x, scan: 0x%x\n",
+		    info, scan_code);
+		hv_kbd_produce_ks(sc, &keystroke->ks);
+		hv_kbd_intr(sc);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Drain the channel: receive packets into the softc's buffer until
+ * vmbus_chan_recv_pkt() reports EAGAIN, dispatching inband packets to
+ * hv_kbd_on_received().  `context' is the hv_kbd_sc of the device.
+ *
+ * When a packet does not fit (ENOBUFS), the buffer is replaced by a
+ * larger one and the receive is retried.
+ */
+void 
+hv_kbd_read_channel(struct vmbus_channel *channel, void *context)
+{
+	uint8_t *buf;
+	uint32_t buflen = 0;
+	int ret = 0;
+
+	hv_kbd_sc *sc = (hv_kbd_sc*)context;
+	buf = sc->buf;
+	buflen = sc->buflen;
+	for (;;) {
+		struct vmbus_chanpkt_hdr *pkt = (struct vmbus_chanpkt_hdr *)buf;
+		uint32_t rxed = buflen;
+
+		ret = vmbus_chan_recv_pkt(channel, pkt, &rxed);
+		if (__predict_false(ret == ENOBUFS)) {
+			/*
+			 * Double the size until it reaches `rxed' (presumably
+			 * the length the packet needs -- confirm against
+			 * vmbus_chan_recv_pkt()), then swap buffers.
+			 */
+			buflen = sc->buflen * 2;
+			while (buflen < rxed)
+				buflen *= 2;
+			buf = malloc(buflen, M_DEVBUF, M_WAITOK | M_ZERO);
+			device_printf(sc->dev, "expand recvbuf %d -> %d\n",
+			    sc->buflen, buflen);
+			free(sc->buf, M_DEVBUF);
+			sc->buf = buf;
+			sc->buflen = buflen;
+			continue;
+		} else if (__predict_false(ret == EAGAIN)) {
+			/* No more channel packets; done! */
+			break;
+		}
+		KASSERT(!ret, ("vmbus_chan_recv_pkt failed: %d", ret));
+
+		DEBUG_HVSC(sc, "event: 0x%x\n", pkt->cph_type);
+		switch (pkt->cph_type) {
+		case VMBUS_CHANPKT_TYPE_COMP:
+		case VMBUS_CHANPKT_TYPE_RXBUF:
+			/* Recognized packet types that this driver does not process. */
+			device_printf(sc->dev, "unhandled event: %d\n",
+			    pkt->cph_type);
+			break;
+		case VMBUS_CHANPKT_TYPE_INBAND:
+			hv_kbd_on_received(sc, pkt);
+			break;
+		default:
+			device_printf(sc->dev, "unknown event: %d\n",
+			    pkt->cph_type);
+			break;
+		}
+	}
+}
+
+/*
+ * Perform the protocol handshake with the host.
+ *
+ * A HV_KBD_PROTO_REQUEST carrying HV_KBD_VER is sent on the channel as a
+ * transaction, and the function waits for the response.
+ *
+ * Returns 0 when the response is at least HV_KBD_PROTO_RESP_SZ bytes long
+ * and has HV_KBD_PROTO_ACCEPTED set in its status.  Returns ENODEV when no
+ * transaction could be obtained, the response is too short, or the request
+ * was not accepted; returns the vmbus_chan_send() error when sending fails.
+ * The transaction is released on every path that obtained one.
+ */
+static int
+hv_kbd_connect_vsp(hv_kbd_sc *sc)
+{
+	int ret;
+	size_t resplen;
+	struct vmbus_xact *xact;
+	hv_kbd_proto_req *req;
+	const hv_kbd_proto_resp *resp;
+
+	xact = vmbus_xact_get(sc->hs_xact_ctx, sizeof(*req));
+	if (xact == NULL) {
+		device_printf(sc->dev, "no xact for kbd init\n");
+		return (ENODEV);
+	}
+	req = vmbus_xact_req_data(xact);
+	req->hdr.type = HV_KBD_PROTO_REQUEST;
+	req->ver = HV_KBD_VER;
+
+	vmbus_xact_activate(xact);
+	ret = vmbus_chan_send(sc->hs_chan,
+	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+	    req, sizeof(*req), (uint64_t)(uintptr_t)xact);
+	if (ret) {
+		device_printf(sc->dev, "fail to send\n");
+		vmbus_xact_deactivate(xact);
+		/* The transaction still has to be given back. */
+		goto clean;
+	}
+	resp = vmbus_chan_xact_wait(sc->hs_chan, xact, &resplen, true);
+	if (resplen < HV_KBD_PROTO_RESP_SZ) {
+		device_printf(sc->dev, "hv_kbd init communicate failed\n");
+		ret = ENODEV;
+		goto clean;
+	}
+
+	if (!(resp->status & HV_KBD_PROTO_ACCEPTED)) {
+		device_printf(sc->dev, "hv_kbd protocol request failed\n");
+		ret = ENODEV;
+	}
+clean:
+	vmbus_xact_put(xact);
+	DEBUG_HVSC(sc, "finish connect vsp\n");
+	return (ret);
+}
+
+/*
+ * Allocate the receive buffer and open the VMBUS channel.
+ *
+ * The buffer is HV_BUFF_SIZE bytes, read batching is turned off on the
+ * channel, and the channel is opened with cb as its callback and the softc
+ * as the callback argument.
+ *
+ * Returns the result of vmbus_chan_open().  On failure the receive buffer
+ * is released and sc->buf is cleared, so that a later hv_kbd_detach1() does
+ * not free it a second time.
+ */
+static int
+hv_kbd_attach1(device_t dev, vmbus_chan_callback_t cb)
+{
+	int ret;
+	hv_kbd_sc *sc;
+
+	sc = device_get_softc(dev);
+	sc->buflen = HV_BUFF_SIZE;
+	sc->buf = malloc(sc->buflen, M_DEVBUF, M_WAITOK | M_ZERO);
+	vmbus_chan_set_readbatch(sc->hs_chan, false);
+	ret = vmbus_chan_open(
+		sc->hs_chan,
+		HV_KBD_RINGBUFF_SEND_SZ,
+		HV_KBD_RINGBUFF_RECV_SZ,
+		NULL, 0,
+		cb,
+		sc);
+	if (ret != 0) {
+		free(sc->buf, M_DEVBUF);
+		sc->buf = NULL;
+	}
+	return (ret);
+}
+
+/*
+ * Close the device's VMBUS channel and release the receive buffer.
+ * sc->buf is cleared afterwards so no stale pointer is left in the softc.
+ * Always returns 0.
+ */
+static int
+hv_kbd_detach1(device_t dev)
+{
+	hv_kbd_sc *sc = device_get_softc(dev);
+
+	vmbus_chan_close(vmbus_get_channel(dev));
+	free(sc->buf, M_DEVBUF);
+	sc->buf = NULL;
+	return (0);
+}
+
+/*
+ * Set up the keystroke bookkeeping of the softc: initialize ks_mtx, the
+ * free list and the keystroke queue, then place 16 zeroed keystroke_info
+ * entries on the free list.
+ */
+static void
+hv_kbd_init(hv_kbd_sc *sc)
+{
+	const int max_list = 16;
+	keystroke_info *entry;
+	int remaining;
+
+	mtx_init(&sc->ks_mtx, "hv_kbdc mutex", NULL, MTX_DEF);
+	LIST_INIT(&sc->ks_free_list);
+	STAILQ_INIT(&sc->ks_queue);
+
+	for (remaining = max_list; remaining > 0; remaining--) {
+		entry = malloc(sizeof(*entry), M_DEVBUF, M_WAITOK | M_ZERO);
+		LIST_INSERT_HEAD(&sc->ks_free_list, entry, link);
+	}
+}
+
+/*
+ * Release every keystroke_info still sitting on the free list or on the
+ * keystroke queue, then destroy ks_mtx.
+ */
+static void
+hv_kbd_fini(hv_kbd_sc *sc)
+{
+	keystroke_info *entry;
+
+	while ((entry = LIST_FIRST(&sc->ks_free_list)) != NULL) {
+		LIST_REMOVE(entry, link);
+		free(entry, M_DEVBUF);
+	}
+	while ((entry = STAILQ_FIRST(&sc->ks_queue)) != NULL) {
+		STAILQ_REMOVE_HEAD(&sc->ks_queue, slink);
+		free(entry, M_DEVBUF);
+	}
+	mtx_destroy(&sc->ks_mtx);
+}
+
+/*
+ * Publish the softc's debug field as a read-write integer sysctl named
+ * "debug" under the device's sysctl tree.
+ */
+static void
+hv_kbd_sysctl(device_t dev)
+{
+	hv_kbd_sc *sc = device_get_softc(dev);
+	struct sysctl_ctx_list *sysctl_ctx = device_get_sysctl_ctx(dev);
+	struct sysctl_oid_list *children =
+	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+
+	SYSCTL_ADD_INT(sysctl_ctx, children, OID_AUTO, "debug", CTLFLAG_RW,
+	    &sc->debug, 0, "debug hyperv keyboard");
+}
+
+/*
+ * Device attach method.
+ *
+ * Records the channel and device in the softc, initializes the keystroke
+ * bookkeeping and creates the transaction context.  It then opens the
+ * channel, performs the protocol handshake, attaches the keyboard driver
+ * and registers the sysctl node.
+ *
+ * Returns 0 on success.  On any failure hv_kbd_detach() is run and the
+ * error is returned (ENOMEM when the transaction context cannot be
+ * created).
+ */
+static int
+hv_kbd_attach(device_t dev)
+{
+	hv_kbd_sc *sc = device_get_softc(dev);
+	int error;
+
+	sc->hs_chan = vmbus_get_channel(dev);
+	sc->dev = dev;
+	hv_kbd_init(sc);
+
+	sc->hs_xact_ctx = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
+	    HV_KBD_PROTO_REQ_SZ, HV_KBD_PROTO_RESP_SZ, 0);
+	if (sc->hs_xact_ctx == NULL) {
+		error = ENOMEM;
+		goto failed;
+	}
+
+	/* Each step only runs when all earlier ones succeeded. */
+	error = hv_kbd_attach1(dev, hv_kbd_read_channel);
+	if (error == 0)
+		error = hv_kbd_connect_vsp(sc);
+	if (error == 0)
+		error = hv_kbd_drv_attach(dev);
+	if (error != 0)
+		goto failed;
+
+	hv_kbd_sysctl(dev);
+	return (0);
+
+failed:
+	hv_kbd_detach(dev);
+	return (error);
+}
+
+/*
+ * Device detach method; also used by hv_kbd_attach() to unwind a failed
+ * attach.
+ *
+ * The channel is closed first so that the channel callback is not left
+ * installed while the transaction context, the keystroke lists and ks_mtx
+ * are being torn down.  A failure of hv_kbd_detach1() is only logged.
+ *
+ * Returns the result of hv_kbd_drv_detach().
+ */
+static int
+hv_kbd_detach(device_t dev)
+{
+	int ret;
+	hv_kbd_sc *sc = device_get_softc(dev);
+
+	ret = hv_kbd_detach1(dev);
+	if (ret)
+		device_printf(dev, "Fail to detach\n");
+	if (sc->hs_xact_ctx != NULL)
+		vmbus_xact_ctx_destroy(sc->hs_xact_ctx);
+	hv_kbd_fini(sc);
+	return hv_kbd_drv_detach(dev);
+}
+
+/* Device interface methods implemented by this driver. */
+static device_method_t kbd_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe, hv_kbd_probe),
+	DEVMETHOD(device_attach, hv_kbd_attach),
+	DEVMETHOD(device_detach, hv_kbd_detach),
+	DEVMETHOD_END
+};
+
+/* Driver description: name, method table and softc size. */
+static driver_t kbd_driver = {
+	HVKBD_DRIVER_NAME,
+	kbd_methods,
+	sizeof(hv_kbd_sc)
+};
+
+static devclass_t kbd_devclass;
+
+/* Register on the vmbus bus; hvkbd_driver_load is the module event handler. */
+DRIVER_MODULE(hv_kbd, vmbus, kbd_driver, kbd_devclass, hvkbd_driver_load, NULL);
+MODULE_VERSION(hv_kbd, 1);
+MODULE_DEPEND(hv_kbd, vmbus, 1, 1, 1);
diff --git a/sys/dev/hyperv/input/hv_kbdc.h b/sys/dev/hyperv/input/hv_kbdc.h
new file mode 100644
index 000000000000..f6f76035e8c3
--- /dev/null
+++ b/sys/dev/hyperv/input/hv_kbdc.h
@@ -0,0 +1,118 @@
+/*-
+ * Copyright (c) 2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HV_KBD_H
+#define _HV_KBD_H
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/systm.h>
+
+#include <dev/kbd/kbdreg.h>
+
+#include "opt_evdev.h"
+#ifdef EVDEV_SUPPORT
+#include <dev/evdev/evdev.h>
+#include <dev/evdev/input.h>
+#endif
+
+/* Name under which the driver is registered. */
+#define HVKBD_DRIVER_NAME	"hvkbd"
+/*
+ * NOTE(review): these look like bit flags carried in keystroke.info;
+ * confirm against the code that consumes keystrokes.
+ */
+#define IS_UNICODE		(1)
+#define IS_BREAK		(2)
+#define IS_E0			(4)
+#define IS_E1			(8)
+
+/*
+ * NOTE(review): judging by the names these are XT scan code prefix and
+ * key-release values; confirm against their users.
+ */
+#define XTKBD_EMUL0		(0xe0)
+#define XTKBD_EMUL1		(0xe1)
+#define XTKBD_RELEASE		(0x80)
+
+/*
+ * Print a message through device_printf() on the softc's device, but only
+ * while the softc's debug field is greater than zero.  sc may be any
+ * expression yielding a hv_kbd_sc pointer.
+ */
+#define DEBUG_HVSC(sc, ...) do {			\
+	if ((sc)->debug > 0) {				\
+		device_printf((sc)->dev, __VA_ARGS__);	\
+	}						\
+} while (0)
+/*
+ * Same as DEBUG_HVSC(), starting from a keyboard whose kb_data points at
+ * the softc.
+ */
+#define DEBUG_HVKBD(kbd, ...) do {			\
+	hv_kbd_sc *sc = (kbd)->kb_data;			\
+	DEBUG_HVSC(sc, __VA_ARGS__);				\
+} while (0)
+
+struct vmbus_channel;
+struct vmbus_xact_ctx;
+
+/*
+ * One keystroke as delivered by the host in a HV_KBD_PROTO_EVENT message.
+ */
+typedef struct keystroke_t {
+	uint16_t			makecode;	/* scan code */
+	/* NOTE(review): presumably a mask of the IS_* flags; confirm. */
+	uint32_t			info;
+} keystroke;
+
+/*
+ * List element wrapping a keystroke.  It carries linkage for both
+ * containers kept in the softc: ks_free_list (through link) and ks_queue
+ * (through slink).
+ */
+typedef struct keystroke_info {
+	LIST_ENTRY(keystroke_info)	link;	/* ks_free_list linkage */
+	STAILQ_ENTRY(keystroke_info)	slink;	/* ks_queue linkage */
+	keystroke			ks;
+} keystroke_info;
+
+/*
+ * Per-device software context of the Hyper-V keyboard driver.
+ */
+typedef struct hv_kbd_sc_t {
+	/* VMBUS channel of the device; set in hv_kbd_attach(). */
+	struct vmbus_channel		*hs_chan;
+	device_t			dev;
+	/* Transaction context used for the protocol handshake. */
+	struct vmbus_xact_ctx		*hs_xact_ctx;
+	/* Channel receive buffer and its size; grown on ENOBUFS. */
+	int32_t				buflen;
+	uint8_t				*buf;
+
+	/* Held while ks_queue is examined or modified. */
+	struct mtx			ks_mtx;
+	/* Entries pre-allocated by hv_kbd_init(). */
+	LIST_HEAD(, keystroke_info)	ks_free_list;
+	STAILQ_HEAD(, keystroke_info)	ks_queue;	/* keystroke info queue */
+
+	keyboard_t			sc_kbd;
+	int				sc_mode;
+	int				sc_state;
+	uint32_t			sc_accents;	/* accent key index (> 0) */
+	uint32_t			sc_composed_char; /* composed char code */
+	uint8_t				sc_prefix;	/* AT scan code prefix */
+	int				sc_polling;	/* polling recursion count */
+	uint32_t			sc_flags;
+	/* DEBUG_HVSC() prints only while this is > 0; exported via sysctl. */
+	int				debug;
+
+#ifdef EVDEV_SUPPORT
+	struct evdev_dev		*ks_evdev;
+	int				ks_evdev_state;
+#endif
+} hv_kbd_sc;
+
+/* Keystroke queue (ks_queue) helpers. */
+int	hv_kbd_produce_ks(hv_kbd_sc *sc, const keystroke *ks);
+int	hv_kbd_fetch_top(hv_kbd_sc *sc, keystroke *top);
+int	hv_kbd_modify_top(hv_kbd_sc *sc, keystroke *top);
+int	hv_kbd_remove_top(hv_kbd_sc *sc);
+int	hv_kbd_prod_is_ready(hv_kbd_sc *sc);
+/* VMBUS channel callback; the second argument is the softc. */
+void	hv_kbd_read_channel(struct vmbus_channel *, void *);
+
+/* Keyboard driver glue, called from the device attach/detach methods. */
+int	hv_kbd_drv_attach(device_t dev);
+int	hv_kbd_drv_detach(device_t dev);
+
+/* Module event handler passed to DRIVER_MODULE(). */
+int	hvkbd_driver_load(module_t, int, void *);
+/* Run after a received keystroke has been queued. */
+void	hv_kbd_intr(hv_kbd_sc *sc);
+#endif
diff --git a/sys/dev/hyperv/netvsc/hn_nvs.c b/sys/dev/hyperv/netvsc/hn_nvs.c
new file mode 100644
index 000000000000..4dbc28996617
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/hn_nvs.c
@@ -0,0 +1,751 @@
+/*-
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2010-2012 Citrix Inc.
+ * Copyright (c) 2012 NetApp Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Network Virtualization Service.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet6.h"
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/socket.h>
+#include <sys/systm.h>
+#include <sys/taskqueue.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_media.h>
+
+#include <netinet/in.h>
+#include <netinet/tcp_lro.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+
+#include <dev/hyperv/netvsc/ndis.h>
+#include <dev/hyperv/netvsc/if_hnreg.h>
+#include <dev/hyperv/netvsc/if_hnvar.h>
+#include <dev/hyperv/netvsc/hn_nvs.h>
+
+static int			hn_nvs_conn_chim(struct hn_softc *);
+static int			hn_nvs_conn_rxbuf(struct hn_softc *);
+static void			hn_nvs_disconn_chim(struct hn_softc *);
+static void			hn_nvs_disconn_rxbuf(struct hn_softc *);
+static int			hn_nvs_conf_ndis(struct hn_softc *, int);
+static int			hn_nvs_init_ndis(struct hn_softc *);
+static int			hn_nvs_doinit(struct hn_softc *, uint32_t);
+static int			hn_nvs_init(struct hn_softc *);
+static const void		*hn_nvs_xact_execute(struct hn_softc *,
+				    struct vmbus_xact *, void *, int,
+				    size_t *, uint32_t);
+static void			hn_nvs_sent_none(struct hn_nvs_sendctx *,
+				    struct hn_softc *, struct vmbus_channel *,
+				    const void *, int);
+
+/*
+ * Send context for NVS requests that need no completion handling: its
+ * callback is hn_nvs_sent_none(), which does nothing, and it has no
+ * callback argument.
+ */
+struct hn_nvs_sendctx		hn_nvs_sendctx_none =
+    HN_NVS_SENDCTX_INITIALIZER(hn_nvs_sent_none, NULL);
+
+/*
+ * NVS versions offered to the host.  hn_nvs_init() tries them in array
+ * order and keeps the first one that is accepted.
+ */
+static const uint32_t		hn_nvs_version[] = {
+	HN_NVS_VERSION_61,
+	HN_NVS_VERSION_6,
+	HN_NVS_VERSION_5,
+	HN_NVS_VERSION_4,
+	HN_NVS_VERSION_2,
+	HN_NVS_VERSION_1
+};
+
+/*
+ * Send the request of a transaction prepared by the caller on the primary
+ * channel and wait for the NVS response.
+ *
+ * On entry *resplen0 holds the minimum acceptable response length (at least
+ * sizeof(struct hn_nvs_hdr)); on success it is updated to the length
+ * actually received.  type is the NVS message type the response must carry.
+ *
+ * Returns the response, or NULL when sending fails, the response is too
+ * short, or it has an unexpected type.  The transaction is never released
+ * here; that remains the caller's job.
+ */
+static const void *
+hn_nvs_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact,
+    void *req, int reqlen, size_t *resplen0, uint32_t type)
+{
+	const size_t resplen_min = *resplen0;
+	const struct hn_nvs_hdr *resp_hdr;
+	struct hn_nvs_sendctx ctx;
+	size_t resplen_got;
+
+	KASSERT(resplen_min >= sizeof(*resp_hdr),
+	    ("invalid minimum response len %zu", resplen_min));
+
+	/* The completion of this send wakes up the transaction. */
+	hn_nvs_sendctx_init(&ctx, hn_nvs_sent_xact, xact);
+
+	vmbus_xact_activate(xact);
+	if (hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_RC, req, reqlen,
+	    &ctx) != 0) {
+		vmbus_xact_deactivate(xact);
+		return (NULL);
+	}
+	resp_hdr = vmbus_chan_xact_wait(sc->hn_prichan, xact, &resplen_got,
+	    HN_CAN_SLEEP(sc));
+
+	/* Validate the length before looking at the header. */
+	if (resplen_got < resplen_min) {
+		if_printf(sc->hn_ifp, "invalid NVS resp len %zu\n",
+		    resplen_got);
+		return (NULL);
+	}
+	if (resp_hdr->nvs_type != type) {
+		if_printf(sc->hn_ifp, "unexpected NVS resp 0x%08x, "
+		    "expect 0x%08x\n", resp_hdr->nvs_type, type);
+		return (NULL);
+	}
+
+	*resplen0 = resplen_got;
+	return (resp_hdr);
+}
+
+/*
+ * Send an NVS request on the primary channel with VMBUS_CHANPKT_FLAG_NONE
+ * and the do-nothing send context.  Returns the hn_nvs_send() result.
+ */
+static __inline int
+hn_nvs_req_send(struct hn_softc *sc, void *req, int reqlen)
+{
+	struct vmbus_channel *prichan = sc->hn_prichan;
+
+	return (hn_nvs_send(prichan, VMBUS_CHANPKT_FLAG_NONE, req, reqlen,
+	    &hn_nvs_sendctx_none));
+}
+
+/*
+ * Connect the receive buffer (RXBUF) to NVS.
+ *
+ * The RXBUF is first attached to the primary channel as a GPADL, then a
+ * HN_NVS_TYPE_RXBUF_CONN transaction carrying that GPADL is executed.
+ * HN_FLAG_RXBUF_CONNECTED is set once NVS answers with HN_NVS_STATUS_OK.
+ *
+ * Returns 0 on success.  On failure a still-held transaction is released,
+ * hn_nvs_disconn_rxbuf() is run to undo what was set up, and an errno
+ * value is returned.
+ */
+static int 
+hn_nvs_conn_rxbuf(struct hn_softc *sc)
+{
+	struct vmbus_xact *xact = NULL;
+	struct hn_nvs_rxbuf_conn *conn;
+	const struct hn_nvs_rxbuf_connresp *resp;
+	size_t resp_len;
+	uint32_t status;
+	int error, rxbuf_size;
+
+	/*
+	 * Limit RXBUF size for old NVS.
+	 */
+	if (sc->hn_nvs_ver <= HN_NVS_VERSION_2)
+		rxbuf_size = HN_RXBUF_SIZE_COMPAT;
+	else
+		rxbuf_size = HN_RXBUF_SIZE;
+
+	/*
+	 * Connect the RXBUF GPADL to the primary channel.
+	 *
+	 * NOTE:
+	 * Only primary channel has RXBUF connected to it.  Sub-channels
+	 * just share this RXBUF.
+	 */
+	error = vmbus_chan_gpadl_connect(sc->hn_prichan,
+	    sc->hn_rxbuf_dma.hv_paddr, rxbuf_size, &sc->hn_rxbuf_gpadl);
+	if (error) {
+		if_printf(sc->hn_ifp, "rxbuf gpadl conn failed: %d\n",
+		    error);
+		goto cleanup;
+	}
+
+	/*
+	 * Connect RXBUF to NVS.
+	 */
+
+	xact = vmbus_xact_get(sc->hn_xact, sizeof(*conn));
+	if (xact == NULL) {
+		if_printf(sc->hn_ifp, "no xact for nvs rxbuf conn\n");
+		error = ENXIO;
+		goto cleanup;
+	}
+	conn = vmbus_xact_req_data(xact);
+	conn->nvs_type = HN_NVS_TYPE_RXBUF_CONN;
+	conn->nvs_gpadl = sc->hn_rxbuf_gpadl;
+	conn->nvs_sig = HN_NVS_RXBUF_SIG;
+
+	resp_len = sizeof(*resp);
+	resp = hn_nvs_xact_execute(sc, xact, conn, sizeof(*conn), &resp_len,
+	    HN_NVS_TYPE_RXBUF_CONNRESP);
+	if (resp == NULL) {
+		if_printf(sc->hn_ifp, "exec nvs rxbuf conn failed\n");
+		error = EIO;
+		goto cleanup;
+	}
+
+	/* Copy the status out before the transaction is given back. */
+	status = resp->nvs_status;
+	vmbus_xact_put(xact);
+	/* Keep the cleanup path from releasing the transaction again. */
+	xact = NULL;
+
+	if (status != HN_NVS_STATUS_OK) {
+		if_printf(sc->hn_ifp, "nvs rxbuf conn failed: %x\n", status);
+		error = EIO;
+		goto cleanup;
+	}
+	sc->hn_flags |= HN_FLAG_RXBUF_CONNECTED;
+
+	return (0);
+
+cleanup:
+	if (xact != NULL)
+		vmbus_xact_put(xact);
+	hn_nvs_disconn_rxbuf(sc);
+	return (error);
+}
+
+/*
+ * Connect the chimney sending buffer to NVS.
+ *
+ * The buffer is attached to the primary channel as a GPADL, then a
+ * HN_NVS_TYPE_CHIM_CONN transaction is executed.  The section size
+ * reported by NVS determines hn_chim_szmax and hn_chim_cnt, and a zeroed
+ * bitmap of hn_chim_cnt / LONG_BIT words is allocated.  A section size
+ * that is zero or not a multiple of sizeof(uint32_t) leaves the buffer
+ * connected but unused (hn_chim_szmax and hn_chim_cnt are set to 0).
+ *
+ * Returns 0 on success.  On failure a still-held transaction is released,
+ * hn_nvs_disconn_chim() is run, and an errno value is returned.
+ */
+static int 
+hn_nvs_conn_chim(struct hn_softc *sc)
+{
+	struct vmbus_xact *xact = NULL;
+	struct hn_nvs_chim_conn *chim;
+	const struct hn_nvs_chim_connresp *resp;
+	size_t resp_len;
+	uint32_t status, sectsz;
+	int error;
+
+	/*
+	 * Connect chimney sending buffer GPADL to the primary channel.
+	 *
+	 * NOTE:
+	 * Only primary channel has chimney sending buffer connected to it.
+	 * Sub-channels just share this chimney sending buffer.
+	 */
+	error = vmbus_chan_gpadl_connect(sc->hn_prichan,
+  	    sc->hn_chim_dma.hv_paddr, HN_CHIM_SIZE, &sc->hn_chim_gpadl);
+	if (error) {
+		if_printf(sc->hn_ifp, "chim gpadl conn failed: %d\n", error);
+		goto cleanup;
+	}
+
+	/*
+	 * Connect chimney sending buffer to NVS
+	 */
+
+	xact = vmbus_xact_get(sc->hn_xact, sizeof(*chim));
+	if (xact == NULL) {
+		if_printf(sc->hn_ifp, "no xact for nvs chim conn\n");
+		error = ENXIO;
+		goto cleanup;
+	}
+	chim = vmbus_xact_req_data(xact);
+	chim->nvs_type = HN_NVS_TYPE_CHIM_CONN;
+	chim->nvs_gpadl = sc->hn_chim_gpadl;
+	chim->nvs_sig = HN_NVS_CHIM_SIG;
+
+	resp_len = sizeof(*resp);
+	resp = hn_nvs_xact_execute(sc, xact, chim, sizeof(*chim), &resp_len,
+	    HN_NVS_TYPE_CHIM_CONNRESP);
+	if (resp == NULL) {
+		if_printf(sc->hn_ifp, "exec nvs chim conn failed\n");
+		error = EIO;
+		goto cleanup;
+	}
+
+	/* Copy what is needed before the transaction is given back. */
+	status = resp->nvs_status;
+	sectsz = resp->nvs_sectsz;
+	vmbus_xact_put(xact);
+	/* Keep the cleanup path from releasing the transaction again. */
+	xact = NULL;
+
+	if (status != HN_NVS_STATUS_OK) {
+		if_printf(sc->hn_ifp, "nvs chim conn failed: %x\n", status);
+		error = EIO;
+		goto cleanup;
+	}
+	if (sectsz == 0 || sectsz % sizeof(uint32_t) != 0) {
+		/*
+		 * Can't use chimney sending buffer; done!
+		 */
+		if (sectsz == 0) {
+			if_printf(sc->hn_ifp, "zero chimney sending buffer "
+			    "section size\n");
+		} else {
+			if_printf(sc->hn_ifp, "misaligned chimney sending "
+			    "buffers, section size: %u\n", sectsz);
+		}
+		sc->hn_chim_szmax = 0;
+		sc->hn_chim_cnt = 0;
+		sc->hn_flags |= HN_FLAG_CHIM_CONNECTED;
+		return (0);
+	}
+
+	sc->hn_chim_szmax = sectsz;
+	sc->hn_chim_cnt = HN_CHIM_SIZE / sc->hn_chim_szmax;
+	if (HN_CHIM_SIZE % sc->hn_chim_szmax != 0) {
+		if_printf(sc->hn_ifp, "chimney sending sections are "
+		    "not properly aligned\n");
+	}
+	if (sc->hn_chim_cnt % LONG_BIT != 0) {
+		if_printf(sc->hn_ifp, "discard %d chimney sending sections\n",
+		    sc->hn_chim_cnt % LONG_BIT);
+	}
+
+	/* The bitmap only covers whole u_long words worth of sections. */
+	sc->hn_chim_bmap_cnt = sc->hn_chim_cnt / LONG_BIT;
+	sc->hn_chim_bmap = malloc(sc->hn_chim_bmap_cnt * sizeof(u_long),
+	    M_DEVBUF, M_WAITOK | M_ZERO);
+
+	/* Done! */
+	sc->hn_flags |= HN_FLAG_CHIM_CONNECTED;
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "chimney sending buffer %d/%d\n",
+		    sc->hn_chim_szmax, sc->hn_chim_cnt);
+	}
+	return (0);
+
+cleanup:
+	if (xact != NULL)
+		vmbus_xact_put(xact);
+	hn_nvs_disconn_chim(sc);
+	return (error);
+}
+
+/*
+ * Undo hn_nvs_conn_rxbuf().
+ *
+ * When HN_FLAG_RXBUF_CONNECTED is set, NVS is asked to drop the RXBUF and
+ * the function waits for the request to leave the TX bufring, then lingers
+ * for 200ms.  Afterwards, on VMBUS versions older than WIN10, the RXBUF
+ * GPADL (if any) is detached from the primary channel.
+ *
+ * HN_FLAG_RXBUF_REF is set when the NVS request cannot be sent on a
+ * channel that is not revoked, or when the GPADL disconnect fails.
+ */
+static void
+hn_nvs_disconn_rxbuf(struct hn_softc *sc)
+{
+	int error;
+
+	if (sc->hn_flags & HN_FLAG_RXBUF_CONNECTED) {
+		struct hn_nvs_rxbuf_disconn disconn;
+
+		/*
+		 * Disconnect RXBUF from NVS.
+		 */
+		memset(&disconn, 0, sizeof(disconn));
+		disconn.nvs_type = HN_NVS_TYPE_RXBUF_DISCONN;
+		disconn.nvs_sig = HN_NVS_RXBUF_SIG;
+
+		/* NOTE: No response. */
+		error = hn_nvs_req_send(sc, &disconn, sizeof(disconn));
+		if (error) {
+			if_printf(sc->hn_ifp,
+			    "send nvs rxbuf disconn failed: %d\n", error);
+			/*
+			 * Fine for a revoked channel, since the hypervisor
+			 * does not drain TX bufring for a revoked channel.
+			 */
+			if (!vmbus_chan_is_revoked(sc->hn_prichan))
+				sc->hn_flags |= HN_FLAG_RXBUF_REF;
+		}
+		sc->hn_flags &= ~HN_FLAG_RXBUF_CONNECTED;
+
+		/*
+		 * Wait for the hypervisor to receive this NVS request.
+		 *
+		 * NOTE:
+		 * The TX bufring will not be drained by the hypervisor,
+		 * if the primary channel is revoked.
+		 */
+		while (!vmbus_chan_tx_empty(sc->hn_prichan) &&
+		    !vmbus_chan_is_revoked(sc->hn_prichan))
+			pause("waittx", 1);
+		/*
+		 * Linger long enough for NVS to disconnect RXBUF.
+		 */
+		pause("lingtx", (200 * hz) / 1000);
+	}
+
+	if (vmbus_current_version < VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
+		/*
+		 * Disconnect RXBUF from primary channel.
+		 */
+		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
+		    sc->hn_rxbuf_gpadl);
+		if (error) {
+			if_printf(sc->hn_ifp,
+			    "rxbuf gpadl disconn failed: %d\n", error);
+			sc->hn_flags |= HN_FLAG_RXBUF_REF;
+		}
+		sc->hn_rxbuf_gpadl = 0;
+	}
+}
+
+/*
+ * Undo hn_nvs_conn_chim().
+ *
+ * When HN_FLAG_CHIM_CONNECTED is set, NVS is asked to drop the chimney
+ * sending buffer and the function waits for the request to leave the TX
+ * bufring, then lingers for 200ms.  Afterwards, on VMBUS versions older
+ * than WIN10, the chimney GPADL (if any) is detached from the primary
+ * channel.  Finally the section bitmap is freed.
+ *
+ * HN_FLAG_CHIM_REF is set when the NVS request cannot be sent on a channel
+ * that is not revoked, or when the GPADL disconnect fails.
+ */
+static void
+hn_nvs_disconn_chim(struct hn_softc *sc)
+{
+	int error;
+
+	if (sc->hn_flags & HN_FLAG_CHIM_CONNECTED) {
+		struct hn_nvs_chim_disconn disconn;
+
+		/*
+		 * Disconnect chimney sending buffer from NVS.
+		 */
+		memset(&disconn, 0, sizeof(disconn));
+		disconn.nvs_type = HN_NVS_TYPE_CHIM_DISCONN;
+		disconn.nvs_sig = HN_NVS_CHIM_SIG;
+
+		/* NOTE: No response. */
+		error = hn_nvs_req_send(sc, &disconn, sizeof(disconn));
+		if (error) {
+			if_printf(sc->hn_ifp,
+			    "send nvs chim disconn failed: %d\n", error);
+			/*
+			 * Fine for a revoked channel, since the hypervisor
+			 * does not drain TX bufring for a revoked channel.
+			 */
+			if (!vmbus_chan_is_revoked(sc->hn_prichan))
+				sc->hn_flags |= HN_FLAG_CHIM_REF;
+		}
+		sc->hn_flags &= ~HN_FLAG_CHIM_CONNECTED;
+
+		/*
+		 * Wait for the hypervisor to receive this NVS request.
+		 *
+		 * NOTE:
+		 * The TX bufring will not be drained by the hypervisor,
+		 * if the primary channel is revoked.
+		 */
+		while (!vmbus_chan_tx_empty(sc->hn_prichan) &&
+		    !vmbus_chan_is_revoked(sc->hn_prichan))
+			pause("waittx", 1);
+		/*
+		 * Linger long enough for NVS to disconnect chimney
+		 * sending buffer.
+		 */
+		pause("lingtx", (200 * hz) / 1000);
+	}
+
+	if (vmbus_current_version < VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
+		/*
+		 * Disconnect chimney sending buffer from primary channel.
+		 */
+		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
+		    sc->hn_chim_gpadl);
+		if (error) {
+			if_printf(sc->hn_ifp,
+			    "chim gpadl disconn failed: %d\n", error);
+			sc->hn_flags |= HN_FLAG_CHIM_REF;
+		}
+		sc->hn_chim_gpadl = 0;
+	}
+
+	if (sc->hn_chim_bmap != NULL) {
+		free(sc->hn_chim_bmap, M_DEVBUF);
+		sc->hn_chim_bmap = NULL;
+		sc->hn_chim_bmap_cnt = 0;
+	}
+}
+
+/*
+ * Offer exactly one NVS version (used as both minimum and maximum) to the
+ * host through a HN_NVS_TYPE_INIT transaction.
+ *
+ * Returns 0 when the host answers HN_NVS_STATUS_OK, ENXIO when no
+ * transaction is available, EIO when the transaction fails, and EINVAL
+ * when the host rejects the version.
+ */
+static int
+hn_nvs_doinit(struct hn_softc *sc, uint32_t nvs_ver)
+{
+	const struct hn_nvs_init_resp *resp;
+	struct hn_nvs_init *init;
+	struct vmbus_xact *xact;
+	size_t resp_len = sizeof(*resp);
+	uint32_t status;
+
+	xact = vmbus_xact_get(sc->hn_xact, sizeof(*init));
+	if (xact == NULL) {
+		if_printf(sc->hn_ifp, "no xact for nvs init\n");
+		return (ENXIO);
+	}
+
+	init = vmbus_xact_req_data(xact);
+	init->nvs_type = HN_NVS_TYPE_INIT;
+	init->nvs_ver_min = nvs_ver;
+	init->nvs_ver_max = nvs_ver;
+
+	resp = hn_nvs_xact_execute(sc, xact, init, sizeof(*init), &resp_len,
+	    HN_NVS_TYPE_INIT_RESP);
+	if (resp == NULL) {
+		if_printf(sc->hn_ifp, "exec init failed\n");
+		vmbus_xact_put(xact);
+		return (EIO);
+	}
+
+	/* Copy the status out before the transaction is given back. */
+	status = resp->nvs_status;
+	vmbus_xact_put(xact);
+
+	if (status == HN_NVS_STATUS_OK)
+		return (0);
+
+	/*
+	 * Caller may try another NVS version, and will log error if there
+	 * are no more NVS versions to try, so don't bark out loud here.
+	 */
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "nvs init failed for ver 0x%x\n",
+		    nvs_ver);
+	}
+	return (EINVAL);
+}
+
+/*
+ * Send the NDIS configuration to NVS: the MTU (plus ETHER_HDR_LEN) and the
+ * capability bits.  VLAN is always requested; SRIOV and RSC are added for
+ * NVS versions >= 5 and >= 6.1 respectively.  No response is expected.
+ *
+ * On success HN_CAP_MTU and HN_CAP_VLAN are recorded in hn_caps and 0 is
+ * returned; otherwise the send error is returned.
+ */
+static int
+hn_nvs_conf_ndis(struct hn_softc *sc, int mtu)
+{
+	struct hn_nvs_ndis_conf conf;
+	int error;
+
+	memset(&conf, 0, sizeof(conf));
+	conf.nvs_type = HN_NVS_TYPE_NDIS_CONF;
+	conf.nvs_mtu = mtu + ETHER_HDR_LEN;
+
+	conf.nvs_caps = HN_NVS_NDIS_CONF_VLAN;
+	if (sc->hn_nvs_ver >= HN_NVS_VERSION_5) {
+		conf.nvs_caps |= HN_NVS_NDIS_CONF_SRIOV;
+	}
+	if (sc->hn_nvs_ver >= HN_NVS_VERSION_61) {
+		conf.nvs_caps |= HN_NVS_NDIS_CONF_RSC;
+	}
+
+	/* NOTE: No response. */
+	error = hn_nvs_req_send(sc, &conf, sizeof(conf));
+	if (error != 0) {
+		if_printf(sc->hn_ifp, "send nvs ndis conf failed: %d\n", error);
+		return (error);
+	}
+
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "nvs ndis conf done\n");
+	}
+	sc->hn_caps |= HN_CAP_MTU | HN_CAP_VLAN;
+	return (0);
+}
+
+/*
+ * Tell NVS which NDIS version (major/minor taken from hn_ndis_ver) the
+ * driver uses.  No response is expected.  Returns the send result; a
+ * failure is logged.
+ */
+static int
+hn_nvs_init_ndis(struct hn_softc *sc)
+{
+	struct hn_nvs_ndis_init ndis;
+	int error;
+
+	memset(&ndis, 0, sizeof(ndis));
+	ndis.nvs_type = HN_NVS_TYPE_NDIS_INIT;
+	ndis.nvs_ndis_major = HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver);
+	ndis.nvs_ndis_minor = HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver);
+
+	/* NOTE: No response. */
+	error = hn_nvs_req_send(sc, &ndis, sizeof(ndis));
+	if (error == 0)
+		return (0);
+
+	if_printf(sc->hn_ifp, "send nvs ndis init failed: %d\n", error);
+	return (error);
+}
+
+/*
+ * Negotiate the NVS version with the host.
+ *
+ * For a device that is not attached yet, the entries of hn_nvs_version are
+ * offered in order; the first accepted one is stored in hn_nvs_ver and the
+ * NDIS version is derived from it.  For an already attached device the
+ * previously negotiated version is offered again.
+ *
+ * HN_CAP_HASHVAL is recorded for NVS versions >= 5.  Returns 0 on success,
+ * ENXIO when no version is accepted, or the hn_nvs_doinit() error when a
+ * re-initialization fails.
+ */
+static int
+hn_nvs_init(struct hn_softc *sc)
+{
+	int error, idx;
+
+	if (!device_is_attached(sc->hn_dev)) {
+		/*
+		 * Find the supported NVS version and set NDIS version
+		 * accordingly.
+		 */
+		for (idx = 0; idx < nitems(hn_nvs_version); ++idx) {
+			error = hn_nvs_doinit(sc, hn_nvs_version[idx]);
+			if (error)
+				continue;
+
+			sc->hn_nvs_ver = hn_nvs_version[idx];
+
+			/* Set NDIS version according to NVS version. */
+			if (sc->hn_nvs_ver <= HN_NVS_VERSION_4)
+				sc->hn_ndis_ver = HN_NDIS_VERSION_6_1;
+			else
+				sc->hn_ndis_ver = HN_NDIS_VERSION_6_30;
+
+			if (bootverbose) {
+				if_printf(sc->hn_ifp, "NVS version 0x%x, "
+				    "NDIS version %u.%u\n", sc->hn_nvs_ver,
+				    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
+				    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
+			}
+			goto done;
+		}
+		if_printf(sc->hn_ifp, "no NVS available\n");
+		return (ENXIO);
+	}
+
+	/*
+	 * Re-initialization: NVS version and NDIS version MUST NOT be
+	 * changed.
+	 */
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "reinit NVS version 0x%x, "
+		    "NDIS version %u.%u\n", sc->hn_nvs_ver,
+		    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
+		    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
+	}
+	error = hn_nvs_doinit(sc, sc->hn_nvs_ver);
+	if (error) {
+		if_printf(sc->hn_ifp, "reinit NVS version 0x%x "
+		    "failed: %d\n", sc->hn_nvs_ver, error);
+		return (error);
+	}
+
+done:
+	if (sc->hn_nvs_ver >= HN_NVS_VERSION_5)
+		sc->hn_caps |= HN_CAP_HASHVAL;
+	return (0);
+}
+
+/*
+ * Bring up NVS: negotiate the NVS version, configure NDIS (NVS version 2
+ * and later only), initialize NDIS, then connect the RXBUF and the chimney
+ * sending buffer.
+ *
+ * Returns 0 on success or the error of the first failing step.  When the
+ * chimney sending buffer cannot be connected, the already connected RXBUF
+ * is disconnected again.
+ */
+int
+hn_nvs_attach(struct hn_softc *sc, int mtu)
+{
+	int error;
+
+	if (hyperv_ver_major >= 10) {
+		/* UDP 4-tuple hash is enforced. */
+		sc->hn_caps |= HN_CAP_UDPHASH;
+	}
+
+	/* Initialize NVS. */
+	error = hn_nvs_init(sc);
+	if (error != 0)
+		return (error);
+
+	/* Configure NDIS before initializing it. */
+	if (sc->hn_nvs_ver >= HN_NVS_VERSION_2) {
+		error = hn_nvs_conf_ndis(sc, mtu);
+		if (error != 0)
+			return (error);
+	}
+
+	/* Initialize NDIS. */
+	error = hn_nvs_init_ndis(sc);
+	if (error != 0)
+		return (error);
+
+	/* Connect RXBUF. */
+	error = hn_nvs_conn_rxbuf(sc);
+	if (error != 0)
+		return (error);
+
+	/* Connect chimney sending buffer; undo the RXBUF on failure. */
+	error = hn_nvs_conn_chim(sc);
+	if (error != 0)
+		hn_nvs_disconn_rxbuf(sc);
+	return (error);
+}
+
+/*
+ * Tear down what hn_nvs_attach() connected: the RXBUF first, then the
+ * chimney sending buffer.
+ */
+void
+hn_nvs_detach(struct hn_softc *sc)
+{
+
+	/* NOTE: there are no requests to stop the NVS. */
+	hn_nvs_disconn_rxbuf(sc);
+	hn_nvs_disconn_chim(sc);
+}
+
+/*
+ * Send-completion callback for transaction based requests: the callback
+ * argument of the send context is the transaction, which is woken up with
+ * the received data.
+ */
+void
+hn_nvs_sent_xact(struct hn_nvs_sendctx *sndc,
+    struct hn_softc *sc __unused, struct vmbus_channel *chan __unused,
+    const void *data, int dlen)
+{
+	struct vmbus_xact *xact = sndc->hn_cbarg;
+
+	vmbus_xact_wakeup(xact, data, dlen);
+}
+
+/*
+ * Send-completion callback that ignores all of its arguments and does
+ * nothing; it backs hn_nvs_sendctx_none.
+ */
+static void
+hn_nvs_sent_none(struct hn_nvs_sendctx *sndc __unused,
+    struct hn_softc *sc __unused, struct vmbus_channel *chan __unused,
+    const void *data __unused, int dlen __unused)
+{
+	/* EMPTY */
+}
+
+/*
+ * Ask NVS to allocate sub-channels.
+ *
+ * On entry *nsubch0 is the number of sub-channels wanted (must be > 0).
+ * On success it is replaced by the number granted, capped at the number
+ * requested.
+ *
+ * Returns 0 on success, ENXIO when no transaction is available, and EIO
+ * when the transaction fails or NVS reports a status other than
+ * HN_NVS_STATUS_OK.
+ */
+int
+hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch0)
+{
+	const struct hn_nvs_subch_resp *resp;
+	struct hn_nvs_subch_req *req;
+	struct vmbus_xact *xact;
+	const int wanted = *nsubch0;
+	uint32_t granted;
+	size_t resp_len;
+	int error = EIO;
+
+	KASSERT(wanted > 0, ("invalid # of sub-channels %d", wanted));
+
+	xact = vmbus_xact_get(sc->hn_xact, sizeof(*req));
+	if (xact == NULL) {
+		if_printf(sc->hn_ifp, "no xact for nvs subch alloc\n");
+		return (ENXIO);
+	}
+
+	req = vmbus_xact_req_data(xact);
+	req->nvs_type = HN_NVS_TYPE_SUBCH_REQ;
+	req->nvs_op = HN_NVS_SUBCH_OP_ALLOC;
+	req->nvs_nsubch = wanted;
+
+	resp_len = sizeof(*resp);
+	resp = hn_nvs_xact_execute(sc, xact, req, sizeof(*req), &resp_len,
+	    HN_NVS_TYPE_SUBCH_RESP);
+	if (resp == NULL) {
+		if_printf(sc->hn_ifp, "exec nvs subch alloc failed\n");
+		goto done;
+	}
+	if (resp->nvs_status != HN_NVS_STATUS_OK) {
+		if_printf(sc->hn_ifp, "nvs subch alloc failed: %x\n",
+		    resp->nvs_status);
+		goto done;
+	}
+
+	/* Never report more sub-channels than were asked for. */
+	granted = resp->nvs_nsubch;
+	if (granted > wanted) {
+		if_printf(sc->hn_ifp, "%u subchans are allocated, "
+		    "requested %d\n", granted, wanted);
+		granted = wanted;
+	}
+	*nsubch0 = granted;
+	error = 0;
+
+done:
+	vmbus_xact_put(xact);
+	return (error);
+}
+
+/*
+ * Send an RNDIS control message, described by the gpa scatter/gather
+ * list, on the given channel.  Returns the send result.
+ */
+int
+hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan,
+    struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt)
+{
+	int error;
+
+	error = hn_nvs_send_rndis_sglist(chan, HN_NVS_RNDIS_MTYPE_CTRL,
+	    sndc, gpa, gpa_cnt);
+	return (error);
+}
+
+/*
+ * Tell NVS which data path is active.  No response is expected.  The
+ * function cannot report failure to its caller, so a send error is logged
+ * in the same way the other NVS senders in this file log theirs.
+ */
+void
+hn_nvs_set_datapath(struct hn_softc *sc, uint32_t path)
+{
+	struct hn_nvs_datapath dp;
+	int error;
+
+	memset(&dp, 0, sizeof(dp));
+	dp.nvs_type = HN_NVS_TYPE_SET_DATAPATH;
+	dp.nvs_active_path = path;
+
+	/* NOTE: No response. */
+	error = hn_nvs_req_send(sc, &dp, sizeof(dp));
+	if (error) {
+		if_printf(sc->hn_ifp, "send nvs set datapath failed: %d\n",
+		    error);
+	}
+}
diff --git a/sys/dev/hyperv/netvsc/hn_nvs.h b/sys/dev/hyperv/netvsc/hn_nvs.h
new file mode 100644
index 000000000000..a14d7b765590
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/hn_nvs.h
@@ -0,0 +1,107 @@
+/*-
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2010-2012 Citrix Inc.
+ * Copyright (c) 2012 NetApp Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HN_NVS_H_
+#define _HN_NVS_H_
+
+struct hn_nvs_sendctx;
+struct vmbus_channel;
+struct hn_softc;
+
+/*
+ * Send-completion callback.  Arguments: the send context, the softc, the
+ * channel, and a data pointer with its length.
+ */
+typedef void		(*hn_nvs_sent_t)
+			(struct hn_nvs_sendctx *, struct hn_softc *,
+			 struct vmbus_channel *, const void *, int);
+
+/*
+ * Context attached to a sent NVS message: a completion callback and the
+ * argument it is meant to work on.
+ */
+struct hn_nvs_sendctx {
+	hn_nvs_sent_t	hn_cb;
+	void		*hn_cbarg;
+};
+
+/* Static initializer for a struct hn_nvs_sendctx. */
+#define HN_NVS_SENDCTX_INITIALIZER(cb, cbarg)	\
+{						\
+	.hn_cb		= (cb),			\
+	.hn_cbarg	= (cbarg)		\
+}
+
+/* Fill in both members of a send context at run time. */
+static __inline void
+hn_nvs_sendctx_init(struct hn_nvs_sendctx *sndc, hn_nvs_sent_t cb, void *cbarg)
+{
+
+	*sndc = (struct hn_nvs_sendctx){
+		.hn_cb = cb,
+		.hn_cbarg = cbarg
+	};
+}
+
+/*
+ * Send an NVS message as an inband channel packet.  The address of the
+ * send context is passed as the last, 64-bit argument of
+ * vmbus_chan_send().  Returns the vmbus_chan_send() result.
+ */
+static __inline int
+hn_nvs_send(struct vmbus_channel *chan, uint16_t flags,
+    void *nvs_msg, int nvs_msglen, struct hn_nvs_sendctx *sndc)
+{
+	const uint64_t cookie = (uint64_t)(uintptr_t)sndc;
+
+	return (vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, flags,
+	    nvs_msg, nvs_msglen, cookie));
+}
+
+/*
+ * Send an NVS message together with a scatter/gather list.  The address
+ * of the send context is passed as the last, 64-bit argument of
+ * vmbus_chan_send_sglist().  Returns the vmbus_chan_send_sglist() result.
+ */
+static __inline int
+hn_nvs_send_sglist(struct vmbus_channel *chan, struct vmbus_gpa sg[], int sglen,
+    void *nvs_msg, int nvs_msglen, struct hn_nvs_sendctx *sndc)
+{
+	const uint64_t cookie = (uint64_t)(uintptr_t)sndc;
+
+	return (vmbus_chan_send_sglist(chan, sg, sglen, nvs_msg, nvs_msglen,
+	    cookie));
+}
+
+/*
+ * Send an RNDIS message of the given type whose payload is described by
+ * the gpa scatter/gather list.  The chimney sending buffer is not used
+ * (index HN_NVS_CHIM_IDX_INVALID, size 0).
+ *
+ * The NVS message is built on the stack; every member that is not set
+ * explicitly is zero-initialized so that no stale stack contents are
+ * handed to the host.  Returns the send result.
+ */
+static __inline int
+hn_nvs_send_rndis_sglist(struct vmbus_channel *chan, uint32_t rndis_mtype,
+    struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt)
+{
+	struct hn_nvs_rndis rndis = {
+		.nvs_type = HN_NVS_TYPE_RNDIS,
+		.nvs_rndis_mtype = rndis_mtype,
+		.nvs_chim_idx = HN_NVS_CHIM_IDX_INVALID,
+		.nvs_chim_sz = 0
+	};
+
+	return (hn_nvs_send_sglist(chan, gpa, gpa_cnt,
+	    &rndis, sizeof(rndis), sndc));
+}
+
+/* Bring NVS up / tear it down. */
+int		hn_nvs_attach(struct hn_softc *sc, int mtu);
+void		hn_nvs_detach(struct hn_softc *sc);
+/* *nsubch: in, number wanted; out, number granted. */
+int		hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch);
+/* Send-completion callback that wakes up the transaction in hn_cbarg. */
+void		hn_nvs_sent_xact(struct hn_nvs_sendctx *sndc,
+		    struct hn_softc *sc, struct vmbus_channel *chan,
+		    const void *data, int dlen);
+int		hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan,
+		    struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa,
+		    int gpa_cnt);
+void		hn_nvs_set_datapath(struct hn_softc *sc, uint32_t path);
+
+/* Send context whose completion callback does nothing. */
+extern struct hn_nvs_sendctx	hn_nvs_sendctx_none;
+
+#endif  /* !_HN_NVS_H_ */
diff --git a/sys/dev/hyperv/netvsc/hn_rndis.c b/sys/dev/hyperv/netvsc/hn_rndis.c
new file mode 100644
index 000000000000..108950aa3f9b
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/hn_rndis.c
@@ -0,0 +1,1061 @@
+/*-
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2010-2012 Citrix Inc.
+ * Copyright (c) 2012 NetApp Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet6.h"
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/systm.h>
+#include <sys/taskqueue.h>
+
+#include <machine/atomic.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_media.h>
+#include <net/rndis.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp_lro.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+
+#include <dev/hyperv/netvsc/ndis.h>
+#include <dev/hyperv/netvsc/if_hnreg.h>
+#include <dev/hyperv/netvsc/if_hnvar.h>
+#include <dev/hyperv/netvsc/hn_nvs.h>
+#include <dev/hyperv/netvsc/hn_rndis.h>
+
+/*
+ * Request ids at or below HN_RNDIS_RID_COMPAT_MAX (the low 16 bits) are
+ * rejected by the assertions in this file; hn_rndis_rid() only hands out
+ * ids that live in the upper 16 bits.
+ */
+#define HN_RNDIS_RID_COMPAT_MASK	0xffff
+#define HN_RNDIS_RID_COMPAT_MAX		HN_RNDIS_RID_COMPAT_MASK
+
+/* Value sent as rm_max_xfersz in the RNDIS initialize request. */
+#define HN_RNDIS_XFER_SIZE		2048
+
+/*
+ * Capability bit groups.  hn_rndis_conf_offload() enables the matching
+ * TX checksum offload / IPv6 LSOv2 only if _all_ bits of a group are set
+ * in the hardware capabilities reported by the host.
+ */
+#define HN_NDIS_TXCSUM_CAP_IP4		\
+	(NDIS_TXCSUM_CAP_IP4 | NDIS_TXCSUM_CAP_IP4OPT)
+#define HN_NDIS_TXCSUM_CAP_TCP4		\
+	(NDIS_TXCSUM_CAP_TCP4 | NDIS_TXCSUM_CAP_TCP4OPT)
+#define HN_NDIS_TXCSUM_CAP_TCP6		\
+	(NDIS_TXCSUM_CAP_TCP6 | NDIS_TXCSUM_CAP_TCP6OPT | \
+	 NDIS_TXCSUM_CAP_IP6EXT)
+#define HN_NDIS_TXCSUM_CAP_UDP6		\
+	(NDIS_TXCSUM_CAP_UDP6 | NDIS_TXCSUM_CAP_IP6EXT)
+#define HN_NDIS_LSOV2_CAP_IP6		\
+	(NDIS_LSOV2_CAP_IP6EXT | NDIS_LSOV2_CAP_TCP6OPT)
+
+static const void	*hn_rndis_xact_exec1(struct hn_softc *,
+			    struct vmbus_xact *, size_t,
+			    struct hn_nvs_sendctx *, size_t *);
+static const void	*hn_rndis_xact_execute(struct hn_softc *,
+			    struct vmbus_xact *, uint32_t, size_t, size_t *,
+			    uint32_t);
+static int		hn_rndis_query(struct hn_softc *, uint32_t,
+			    const void *, size_t, void *, size_t *);
+static int		hn_rndis_query2(struct hn_softc *, uint32_t,
+			    const void *, size_t, void *, size_t *, size_t);
+static int		hn_rndis_set(struct hn_softc *, uint32_t,
+			    const void *, size_t);
+static int		hn_rndis_init(struct hn_softc *);
+static int		hn_rndis_halt(struct hn_softc *);
+static int		hn_rndis_conf_offload(struct hn_softc *, int);
+static int		hn_rndis_query_hwcaps(struct hn_softc *,
+			    struct ndis_offload *);
+
+/*
+ * Allocate the next RNDIS request id for this softc.
+ *
+ * The id is derived from an atomically incremented counter; its low 16
+ * bits are moved into the upper 16 bits of the result, so the returned
+ * value is always a non-zero multiple of 0x10000.
+ */
+static __inline uint32_t
+hn_rndis_rid(struct hn_softc *sc)
+{
+	uint32_t rid;
+
+	/*
+	 * Only the low 16 bits of the counter survive below, so skip
+	 * _every_ counter value whose low 16 bits are zero (0, 0x10000,
+	 * 0x20000, ...) and not just 0.  Otherwise the request id would
+	 * come out as 0, which the rid > HN_RNDIS_RID_COMPAT_MAX
+	 * assertions in this file reject.
+	 */
+	do {
+		rid = atomic_fetchadd_int(&sc->hn_rndis_rid, 1);
+	} while ((rid & 0xffff) == 0);
+
+	/* Use upper 16 bits for non-compat RNDIS messages. */
+	return ((rid & 0xffff) << 16);
+}
+
+/*
+ * Handle a received RNDIS control message of dlen bytes.
+ *
+ * INITIALIZE/QUERY/SET/KEEPALIVE completions are passed on to
+ * vmbus_xact_ctx_wakeup() for the softc's transaction context.  RESET
+ * completions and unknown message types are only logged.
+ */
+void
+hn_rndis_rx_ctrl(struct hn_softc *sc, const void *data, int dlen)
+{
+	const struct rndis_msghdr *msg = data;
+	const struct rndis_comp_hdr *cmplt;
+
+	KASSERT(dlen >= sizeof(*msg), ("invalid RNDIS msg\n"));
+
+	if (msg->rm_type == REMOTE_NDIS_RESET_CMPLT) {
+		/*
+		 * Reset completed, no rid.
+		 *
+		 * NOTE:
+		 * RESET is not issued by hn(4), so this message should
+		 * _not_ be observed.
+		 */
+		if_printf(sc->hn_ifp, "RESET cmplt received\n");
+		return;
+	}
+
+	if (msg->rm_type != REMOTE_NDIS_INITIALIZE_CMPLT &&
+	    msg->rm_type != REMOTE_NDIS_QUERY_CMPLT &&
+	    msg->rm_type != REMOTE_NDIS_SET_CMPLT &&
+	    msg->rm_type != REMOTE_NDIS_KEEPALIVE_CMPLT /* unused */) {
+		if_printf(sc->hn_ifp, "unknown RNDIS msg 0x%x\n",
+		    msg->rm_type);
+		return;
+	}
+
+	/* A completion carrying a request id; it needs the full header. */
+	if (dlen < sizeof(*cmplt)) {
+		if_printf(sc->hn_ifp, "invalid RNDIS cmplt\n");
+		return;
+	}
+	cmplt = data;
+
+	KASSERT(cmplt->rm_rid > HN_RNDIS_RID_COMPAT_MAX,
+	    ("invalid RNDIS rid 0x%08x\n", cmplt->rm_rid));
+	vmbus_xact_ctx_wakeup(sc->hn_xact, cmplt, dlen);
+}
+
+/*
+ * Query OID_802_3_PERMANENT_ADDRESS into eaddr, which must have room for
+ * ETHER_ADDR_LEN bytes.
+ *
+ * Returns 0 on success, the query error, or EINVAL if the returned data
+ * is not exactly ETHER_ADDR_LEN bytes long.
+ */
+int
+hn_rndis_get_eaddr(struct hn_softc *sc, uint8_t *eaddr)
+{
+	size_t len = ETHER_ADDR_LEN;
+	int error;
+
+	error = hn_rndis_query(sc, OID_802_3_PERMANENT_ADDRESS, NULL, 0,
+	    eaddr, &len);
+	if (error == 0 && len != ETHER_ADDR_LEN) {
+		if_printf(sc->hn_ifp, "invalid eaddr len %zu\n", len);
+		error = EINVAL;
+	}
+	return (error);
+}
+
+/*
+ * Query OID_GEN_MEDIA_CONNECT_STATUS into *link_status.
+ *
+ * Returns 0 on success, the query error, or EINVAL if the returned data
+ * is not exactly 4 bytes long.
+ */
+int
+hn_rndis_get_linkstatus(struct hn_softc *sc, uint32_t *link_status)
+{
+	size_t len = sizeof(*link_status);
+	int error;
+
+	error = hn_rndis_query(sc, OID_GEN_MEDIA_CONNECT_STATUS, NULL, 0,
+	    link_status, &len);
+	if (error == 0 && len != sizeof(uint32_t)) {
+		if_printf(sc->hn_ifp, "invalid link status len %zu\n", len);
+		error = EINVAL;
+	}
+	return (error);
+}
+
+/*
+ * Query OID_GEN_MAXIMUM_FRAME_SIZE into *mtu.
+ *
+ * Returns 0 on success, the query error, or EINVAL if the returned data
+ * is not exactly 4 bytes long.
+ */
+int
+hn_rndis_get_mtu(struct hn_softc *sc, uint32_t *mtu)
+{
+	size_t len = sizeof(*mtu);
+	int error;
+
+	error = hn_rndis_query(sc, OID_GEN_MAXIMUM_FRAME_SIZE, NULL, 0,
+	    mtu, &len);
+	if (error == 0 && len != sizeof(uint32_t)) {
+		if_printf(sc->hn_ifp, "invalid mtu len %zu\n", len);
+		error = EINVAL;
+	}
+	return (error);
+}
+
+/*
+ * Send the RNDIS control request already stored in xact (reqlen bytes)
+ * on the primary channel and wait for the transaction to complete.
+ *
+ * The request buffer is described to the host as a list of page-sized
+ * segments starting at the xact's (page aligned) physical address.
+ *
+ * Returns the pointer handed back by vmbus_chan_xact_wait(), with its
+ * length stored in *comp_len, or NULL if the send failed.
+ */
+static const void *
+hn_rndis_xact_exec1(struct hn_softc *sc, struct vmbus_xact *xact, size_t reqlen,
+    struct hn_nvs_sendctx *sndc, size_t *comp_len)
+{
+	struct vmbus_gpa gpa[HN_XACT_REQ_PGCNT];
+	bus_addr_t paddr;
+	int gpa_cnt, error;
+
+	KASSERT(reqlen <= HN_XACT_REQ_SIZE && reqlen > 0,
+	    ("invalid request length %zu", reqlen));
+
+	paddr = vmbus_xact_req_paddr(xact);
+	KASSERT((paddr & PAGE_MASK) == 0,
+	    ("vmbus xact request is not page aligned 0x%jx", (uintmax_t)paddr));
+
+	/*
+	 * Setup the SG list: one entry per page, the last one possibly
+	 * shorter than a page.
+	 */
+	gpa_cnt = 0;
+	while (gpa_cnt < HN_XACT_REQ_PGCNT && reqlen != 0) {
+		struct vmbus_gpa *seg = &gpa[gpa_cnt];
+		int seglen = PAGE_SIZE;
+
+		if (reqlen < seglen)
+			seglen = reqlen;
+
+		seg->gpa_page = atop(paddr) + gpa_cnt;
+		seg->gpa_len = seglen;
+		seg->gpa_ofs = 0;
+
+		reqlen -= seglen;
+		gpa_cnt++;
+	}
+	KASSERT(reqlen == 0, ("still have %zu request data left", reqlen));
+
+	/*
+	 * The xact is activated before the send and deactivated again if
+	 * the send fails, in which case there is nothing to wait for.
+	 */
+	vmbus_xact_activate(xact);
+	error = hn_nvs_send_rndis_ctrl(sc->hn_prichan, sndc, gpa, gpa_cnt);
+	if (error != 0) {
+		vmbus_xact_deactivate(xact);
+		if_printf(sc->hn_ifp, "RNDIS ctrl send failed: %d\n", error);
+		return (NULL);
+	}
+
+	/* Wait for the completion of this RNDIS control message. */
+	return (vmbus_chan_xact_wait(sc->hn_prichan, xact, comp_len,
+	    HN_CAN_SLEEP(sc)));
+}
+
+/*
+ * Execute a RNDIS request that expects a RNDIS completion and validate
+ * the completion header.
+ *
+ * On entry *comp_len0 is the minimum acceptable completion length; on
+ * success it is updated to the length actually received.  The completion
+ * must be at least that long (both as received and according to its own
+ * rm_len), must be of type comp_type and must echo the request id rid.
+ *
+ * Returns the completion, or NULL on any failure (already logged).
+ */
+static const void *
+hn_rndis_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact, uint32_t rid,
+    size_t reqlen, size_t *comp_len0, uint32_t comp_type)
+{
+	const struct rndis_comp_hdr *hdr;
+	size_t rcvlen, minlen;
+
+	minlen = *comp_len0;
+
+	KASSERT(rid > HN_RNDIS_RID_COMPAT_MAX, ("invalid rid %u\n", rid));
+	KASSERT(minlen >= sizeof(*hdr),
+	    ("invalid minimum complete len %zu", minlen));
+
+	/* Run the xact prepared by the caller. */
+	hdr = hn_rndis_xact_exec1(sc, xact, reqlen, &hn_nvs_sendctx_none,
+	    &rcvlen);
+	if (hdr == NULL)
+		return (NULL);
+
+	/* Received length first; nothing else can be trusted before it. */
+	if (rcvlen < minlen) {
+		if (rcvlen < sizeof(*hdr)) {
+			if_printf(sc->hn_ifp, "invalid RNDIS comp len %zu\n",
+			    rcvlen);
+		} else {
+			/* rm_status field is valid */
+			if_printf(sc->hn_ifp, "invalid RNDIS comp len %zu, "
+			    "status 0x%08x\n", rcvlen, hdr->rm_status);
+		}
+		return (NULL);
+	}
+
+	/* Then the header fields: message length, type and request id. */
+	if (hdr->rm_len < minlen) {
+		if_printf(sc->hn_ifp, "invalid RNDIS comp msglen %u\n",
+		    hdr->rm_len);
+		return (NULL);
+	}
+	if (hdr->rm_type != comp_type) {
+		if_printf(sc->hn_ifp, "unexpected RNDIS comp 0x%08x, "
+		    "expect 0x%08x\n", hdr->rm_type, comp_type);
+		return (NULL);
+	}
+	if (hdr->rm_rid != rid) {
+		if_printf(sc->hn_ifp, "RNDIS comp rid mismatch %u, "
+		    "expect %u\n", hdr->rm_rid, rid);
+		return (NULL);
+	}
+
+	/* All pass! */
+	*comp_len0 = rcvlen;
+	return (hdr);
+}
+
+/*
+ * RNDIS query where the minimum acceptable output length equals the
+ * size of the output buffer passed in through *odlen0.  See
+ * hn_rndis_query2() for the remaining arguments and the return value.
+ */
+static int
+hn_rndis_query(struct hn_softc *sc, uint32_t oid,
+    const void *idata, size_t idlen, void *odata, size_t *odlen0)
+{
+	size_t min_odlen = *odlen0;
+
+	return (hn_rndis_query2(sc, oid, idata, idlen, odata, odlen0,
+	    min_odlen));
+}
+
+/*
+ * Issue a RNDIS query for oid and copy the answer into odata.
+ *
+ * idata/idlen:	optional input data appended to the query request.
+ * odata:	output buffer.
+ * odlen0:	in: size of odata; out: number of bytes stored in odata
+ *		(0 if the completion carried no data).
+ * min_odlen:	the completion must be at least
+ *		sizeof(struct rndis_query_comp) + min_odlen bytes long.
+ *
+ * Returns 0 on success, ENXIO if no xact is available, EIO if the
+ * exchange failed or the host reported a non-success status, and EINVAL
+ * if the completion's information buffer offset/length are bogus.
+ */
+static int
+hn_rndis_query2(struct hn_softc *sc, uint32_t oid,
+    const void *idata, size_t idlen, void *odata, size_t *odlen0,
+    size_t min_odlen)
+{
+	struct rndis_query_req *req;
+	const struct rndis_query_comp *comp;
+	struct vmbus_xact *xact;
+	size_t reqlen, odlen = *odlen0, comp_len, ofs;
+	int error;
+	uint32_t rid;
+
+	reqlen = sizeof(*req) + idlen;
+	xact = vmbus_xact_get(sc->hn_xact, reqlen);
+	if (xact == NULL) {
+		if_printf(sc->hn_ifp, "no xact for RNDIS query 0x%08x\n", oid);
+		return (ENXIO);
+	}
+	rid = hn_rndis_rid(sc);
+	req = vmbus_xact_req_data(xact);
+	req->rm_type = REMOTE_NDIS_QUERY_MSG;
+	req->rm_len = reqlen;
+	req->rm_rid = rid;
+	req->rm_oid = oid;
+	/*
+	 * XXX
+	 * This is _not_ RNDIS Spec conforming:
+	 * "This MUST be set to 0 when there is no input data
+	 *  associated with the OID."
+	 *
+	 * If this field was set to 0 according to the RNDIS Spec,
+	 * Hyper-V would set non-SUCCESS status in the query
+	 * completion.
+	 */
+	req->rm_infobufoffset = RNDIS_QUERY_REQ_INFOBUFOFFSET;
+
+	if (idlen > 0) {
+		req->rm_infobuflen = idlen;
+		/* Input data immediately follows RNDIS query. */
+		memcpy(req + 1, idata, idlen);
+	}
+
+	comp_len = sizeof(*comp) + min_odlen;
+	comp = hn_rndis_xact_execute(sc, xact, rid, reqlen, &comp_len,
+	    REMOTE_NDIS_QUERY_CMPLT);
+	if (comp == NULL) {
+		if_printf(sc->hn_ifp, "exec RNDIS query 0x%08x failed\n", oid);
+		error = EIO;
+		goto done;
+	}
+
+	if (comp->rm_status != RNDIS_STATUS_SUCCESS) {
+		if_printf(sc->hn_ifp, "RNDIS query 0x%08x failed: "
+		    "status 0x%08x\n", oid, comp->rm_status);
+		error = EIO;
+		goto done;
+	}
+	if (comp->rm_infobuflen == 0 || comp->rm_infobufoffset == 0) {
+		/* No output data! */
+		if_printf(sc->hn_ifp, "RNDIS query 0x%08x, no data\n", oid);
+		*odlen0 = 0;
+		error = 0;
+		goto done;
+	}
+
+	/*
+	 * Check output data length and offset.
+	 *
+	 * Both values come from the host.  The offset is kept in a size_t
+	 * and the length is compared against the room left after the
+	 * offset, so that neither a huge offset nor a huge length can wrap
+	 * the arithmetic and slip past the bounds check.
+	 */
+	/* ofs is the offset from the beginning of comp. */
+	ofs = RNDIS_QUERY_COMP_INFOBUFOFFSET_ABS(comp->rm_infobufoffset);
+	if (ofs < sizeof(*comp) || ofs > comp_len ||
+	    comp->rm_infobuflen > comp_len - ofs) {
+		if_printf(sc->hn_ifp, "RNDIS query invalid comp ib off/len, "
+		    "%u/%u\n", comp->rm_infobufoffset, comp->rm_infobuflen);
+		error = EINVAL;
+		goto done;
+	}
+
+	/*
+	 * Save output data; never more than the caller's buffer holds and
+	 * never more than the host actually returned.
+	 */
+	if (comp->rm_infobuflen < odlen)
+		odlen = comp->rm_infobuflen;
+	memcpy(odata, ((const uint8_t *)comp) + ofs, odlen);
+	*odlen0 = odlen;
+
+	error = 0;
+done:
+	vmbus_xact_put(xact);
+	return (error);
+}
+
+/*
+ * Query the RSS capabilities of the device and record them in the softc.
+ *
+ * On success sc->hn_rss_ind_size and sc->hn_rss_hcap are updated and
+ * *rxr_cnt0 is set to the usable number of RX rings (capped by the
+ * indirect table size).  *rxr_cnt0 is 0 on any failure.
+ *
+ * Returns 0 on success, EOPNOTSUPP if the NDIS version is older than
+ * 6.20 or the reported capabilities are unusable, EINVAL if the returned
+ * object is malformed, or the error from the query itself.
+ */
+int
+hn_rndis_query_rsscaps(struct hn_softc *sc, int *rxr_cnt0)
+{
+	struct ndis_rss_caps in, caps;
+	size_t caps_len;
+	int error, indsz, rxr_cnt, hash_fnidx;
+	uint32_t hash_func = 0, hash_types = 0;
+
+	*rxr_cnt0 = 0;
+
+	if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_20)
+		return (EOPNOTSUPP);
+
+	memset(&in, 0, sizeof(in));
+	in.ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_CAPS;
+	in.ndis_hdr.ndis_rev = NDIS_RSS_CAPS_REV_2;
+	in.ndis_hdr.ndis_size = NDIS_RSS_CAPS_SIZE;
+
+	/*
+	 * The query may succeed while storing fewer bytes than the object
+	 * header, or none at all.  Start from a zeroed object so that the
+	 * verification below rejects such answers instead of inspecting
+	 * uninitialized stack memory.
+	 */
+	memset(&caps, 0, sizeof(caps));
+
+	caps_len = NDIS_RSS_CAPS_SIZE;
+	error = hn_rndis_query2(sc, OID_GEN_RECEIVE_SCALE_CAPABILITIES,
+	    &in, NDIS_RSS_CAPS_SIZE, &caps, &caps_len, NDIS_RSS_CAPS_SIZE_6_0);
+	if (error)
+		return (error);
+
+	/*
+	 * Preliminary verification.
+	 */
+	if (caps.ndis_hdr.ndis_type != NDIS_OBJTYPE_RSS_CAPS) {
+		if_printf(sc->hn_ifp, "invalid NDIS objtype 0x%02x\n",
+		    caps.ndis_hdr.ndis_type);
+		return (EINVAL);
+	}
+	if (caps.ndis_hdr.ndis_rev < NDIS_RSS_CAPS_REV_1) {
+		if_printf(sc->hn_ifp, "invalid NDIS objrev 0x%02x\n",
+		    caps.ndis_hdr.ndis_rev);
+		return (EINVAL);
+	}
+	if (caps.ndis_hdr.ndis_size > caps_len) {
+		if_printf(sc->hn_ifp, "invalid NDIS objsize %u, "
+		    "data size %zu\n", caps.ndis_hdr.ndis_size, caps_len);
+		return (EINVAL);
+	} else if (caps.ndis_hdr.ndis_size < NDIS_RSS_CAPS_SIZE_6_0) {
+		if_printf(sc->hn_ifp, "invalid NDIS objsize %u\n",
+		    caps.ndis_hdr.ndis_size);
+		return (EINVAL);
+	}
+
+	/*
+	 * Save information for later RSS configuration.
+	 */
+	if (caps.ndis_nrxr == 0) {
+		if_printf(sc->hn_ifp, "0 RX rings!?\n");
+		return (EINVAL);
+	}
+	if (bootverbose)
+		if_printf(sc->hn_ifp, "%u RX rings\n", caps.ndis_nrxr);
+	rxr_cnt = caps.ndis_nrxr;
+
+	/*
+	 * The indirect table size is only taken from the answer when the
+	 * object has the full size and is at least revision 2; otherwise
+	 * NDIS_HASH_INDCNT is assumed.
+	 */
+	if (caps.ndis_hdr.ndis_size == NDIS_RSS_CAPS_SIZE &&
+	    caps.ndis_hdr.ndis_rev >= NDIS_RSS_CAPS_REV_2) {
+		if (caps.ndis_nind > NDIS_HASH_INDCNT) {
+			if_printf(sc->hn_ifp,
+			    "too many RSS indirect table entries %u\n",
+			    caps.ndis_nind);
+			return (EOPNOTSUPP);
+		}
+		if (!powerof2(caps.ndis_nind)) {
+			/* Only reported; the size is used as is. */
+			if_printf(sc->hn_ifp, "RSS indirect table size is not "
+			    "power-of-2 %u\n", caps.ndis_nind);
+		}
+
+		if (bootverbose) {
+			if_printf(sc->hn_ifp, "RSS indirect table size %u\n",
+			    caps.ndis_nind);
+		}
+		indsz = caps.ndis_nind;
+	} else {
+		indsz = NDIS_HASH_INDCNT;
+	}
+	if (indsz < rxr_cnt) {
+		if_printf(sc->hn_ifp, "# of RX rings (%d) > "
+		    "RSS indirect table size %d\n", rxr_cnt, indsz);
+		rxr_cnt = indsz;
+	}
+
+	/*
+	 * NOTE:
+	 * Toeplitz is at the lowest bit, and it is preferred; so ffs(),
+	 * instead of fls(), is used here.
+	 */
+	hash_fnidx = ffs(caps.ndis_caps & NDIS_RSS_CAP_HASHFUNC_MASK);
+	if (hash_fnidx == 0) {
+		if_printf(sc->hn_ifp, "no hash functions, caps 0x%08x\n",
+		    caps.ndis_caps);
+		return (EOPNOTSUPP);
+	}
+	hash_func = 1 << (hash_fnidx - 1); /* ffs is 1-based */
+
+	if (caps.ndis_caps & NDIS_RSS_CAP_IPV4)
+		hash_types |= NDIS_HASH_IPV4 | NDIS_HASH_TCP_IPV4;
+	if (caps.ndis_caps & NDIS_RSS_CAP_IPV6)
+		hash_types |= NDIS_HASH_IPV6 | NDIS_HASH_TCP_IPV6;
+	if (caps.ndis_caps & NDIS_RSS_CAP_IPV6_EX)
+		hash_types |= NDIS_HASH_IPV6_EX | NDIS_HASH_TCP_IPV6_EX;
+	if (hash_types == 0) {
+		if_printf(sc->hn_ifp, "no hash types, caps 0x%08x\n",
+		    caps.ndis_caps);
+		return (EOPNOTSUPP);
+	}
+	if (bootverbose)
+		if_printf(sc->hn_ifp, "RSS caps %#x\n", caps.ndis_caps);
+
+	/* Commit! */
+	sc->hn_rss_ind_size = indsz;
+	sc->hn_rss_hcap = hash_func | hash_types;
+	if (sc->hn_caps & HN_CAP_UDPHASH) {
+		/* UDP 4-tuple hash is unconditionally enabled. */
+		sc->hn_rss_hcap |= NDIS_HASH_UDP_IPV4_X;
+	}
+	*rxr_cnt0 = rxr_cnt;
+	return (0);
+}
+
+/*
+ * Issue a RNDIS set request for oid with dlen (> 0) bytes of data.
+ *
+ * Returns 0 on success, ENXIO if no xact is available, and EIO if the
+ * exchange failed or the host reported a non-success status.
+ */
+static int
+hn_rndis_set(struct hn_softc *sc, uint32_t oid, const void *data, size_t dlen)
+{
+	const struct rndis_set_comp *cmplt;
+	struct rndis_set_req *msg;
+	struct vmbus_xact *xact;
+	size_t msglen, cmplt_len;
+	uint32_t rid;
+	int error = EIO;
+
+	KASSERT(dlen > 0, ("invalid dlen %zu", dlen));
+
+	msglen = sizeof(*msg) + dlen;
+	xact = vmbus_xact_get(sc->hn_xact, msglen);
+	if (xact == NULL) {
+		if_printf(sc->hn_ifp, "no xact for RNDIS set 0x%08x\n", oid);
+		return (ENXIO);
+	}
+
+	/* Build the request; the data immediately follows RNDIS set. */
+	rid = hn_rndis_rid(sc);
+	msg = vmbus_xact_req_data(xact);
+	msg->rm_type = REMOTE_NDIS_SET_MSG;
+	msg->rm_len = msglen;
+	msg->rm_rid = rid;
+	msg->rm_oid = oid;
+	msg->rm_infobuflen = dlen;
+	msg->rm_infobufoffset = RNDIS_SET_REQ_INFOBUFOFFSET;
+	memcpy(msg + 1, data, dlen);
+
+	/* Only the fixed-size set completion is expected back. */
+	cmplt_len = sizeof(*cmplt);
+	cmplt = hn_rndis_xact_execute(sc, xact, rid, msglen, &cmplt_len,
+	    REMOTE_NDIS_SET_CMPLT);
+	if (cmplt == NULL) {
+		if_printf(sc->hn_ifp, "exec RNDIS set 0x%08x failed\n", oid);
+	} else if (cmplt->rm_status != RNDIS_STATUS_SUCCESS) {
+		if_printf(sc->hn_ifp, "RNDIS set 0x%08x failed: "
+		    "status 0x%08x\n", oid, cmplt->rm_status);
+	} else {
+		error = 0;
+	}
+
+	vmbus_xact_put(xact);
+	return (error);
+}
+
+/*
+ * Query the hardware offload capabilities and configure the NDIS offload
+ * parameters (TSO, TX/RX checksum, RSC) accordingly.
+ *
+ * mtu is used to sanity check the TSO limits reported by the host.
+ *
+ * On success the HN_CAP_ flags of the enabled TX offloads are or'ed into
+ * sc->hn_caps.  sc->hn_ndis_tso_szmax and sc->hn_ndis_tso_sgmin are
+ * rewritten as soon as the capability query succeeded (0 if TSO is not
+ * enabled), even if the set request fails later on.
+ *
+ * Returns 0 on success, or the error from the capability query or from
+ * the OID_TCP_OFFLOAD_PARAMETERS set request.
+ */
+static int
+hn_rndis_conf_offload(struct hn_softc *sc, int mtu)
+{
+	struct ndis_offload hwcaps;
+	struct ndis_offload_params params;
+	uint32_t caps = 0;
+	size_t paramsz;
+	int error, tso_maxsz, tso_minsg;
+
+	error = hn_rndis_query_hwcaps(sc, &hwcaps);
+	if (error) {
+		if_printf(sc->hn_ifp, "hwcaps query failed: %d\n", error);
+		return (error);
+	}
+
+	/* NOTE: 0 means "no change" */
+	memset(&params, 0, sizeof(params));
+
+	/* The revision and size of the parameters depend on the NDIS version. */
+	params.ndis_hdr.ndis_type = NDIS_OBJTYPE_DEFAULT;
+	if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_30) {
+		params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_2;
+		paramsz = NDIS_OFFLOAD_PARAMS_SIZE_6_1;
+	} else {
+		params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_3;
+		paramsz = NDIS_OFFLOAD_PARAMS_SIZE;
+	}
+	params.ndis_hdr.ndis_size = paramsz;
+
+	/*
+	 * TSO4/TSO6 setup.
+	 *
+	 * tso_maxsz ends up as the smallest maximum size and tso_minsg as
+	 * the largest minimum segment count among the enabled variants,
+	 * starting from IP_MAXPACKET and 2 respectively.
+	 */
+	tso_maxsz = IP_MAXPACKET;
+	tso_minsg = 2;
+	if (hwcaps.ndis_lsov2.ndis_ip4_encap & NDIS_OFFLOAD_ENCAP_8023) {
+		caps |= HN_CAP_TSO4;
+		params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_ON;
+
+		if (hwcaps.ndis_lsov2.ndis_ip4_maxsz < tso_maxsz)
+			tso_maxsz = hwcaps.ndis_lsov2.ndis_ip4_maxsz;
+		if (hwcaps.ndis_lsov2.ndis_ip4_minsg > tso_minsg)
+			tso_minsg = hwcaps.ndis_lsov2.ndis_ip4_minsg;
+	}
+	/* TSO6 additionally requires all of the HN_NDIS_LSOV2_CAP_IP6 options. */
+	if ((hwcaps.ndis_lsov2.ndis_ip6_encap & NDIS_OFFLOAD_ENCAP_8023) &&
+	    (hwcaps.ndis_lsov2.ndis_ip6_opts & HN_NDIS_LSOV2_CAP_IP6) ==
+	    HN_NDIS_LSOV2_CAP_IP6) {
+		caps |= HN_CAP_TSO6;
+		params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_ON;
+
+		if (hwcaps.ndis_lsov2.ndis_ip6_maxsz < tso_maxsz)
+			tso_maxsz = hwcaps.ndis_lsov2.ndis_ip6_maxsz;
+		if (hwcaps.ndis_lsov2.ndis_ip6_minsg > tso_minsg)
+			tso_minsg = hwcaps.ndis_lsov2.ndis_ip6_minsg;
+	}
+	sc->hn_ndis_tso_szmax = 0;
+	sc->hn_ndis_tso_sgmin = 0;
+	if (caps & (HN_CAP_TSO4 | HN_CAP_TSO6)) {
+		KASSERT(tso_maxsz <= IP_MAXPACKET,
+		    ("invalid NDIS TSO maxsz %d", tso_maxsz));
+		KASSERT(tso_minsg >= 2,
+		    ("invalid NDIS TSO minsg %d", tso_minsg));
+		/*
+		 * The maximum TSO size must hold at least tso_minsg
+		 * MTU-sized segments; otherwise both TSO variants are
+		 * explicitly turned off again.
+		 */
+		if (tso_maxsz < tso_minsg * mtu) {
+			if_printf(sc->hn_ifp, "invalid NDIS TSO config: "
+			    "maxsz %d, minsg %d, mtu %d; "
+			    "disable TSO4 and TSO6\n",
+			    tso_maxsz, tso_minsg, mtu);
+			caps &= ~(HN_CAP_TSO4 | HN_CAP_TSO6);
+			params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_OFF;
+			params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_OFF;
+		} else {
+			sc->hn_ndis_tso_szmax = tso_maxsz;
+			sc->hn_ndis_tso_sgmin = tso_minsg;
+			if (bootverbose) {
+				if_printf(sc->hn_ifp, "NDIS TSO "
+				    "szmax %d sgmin %d\n",
+				    sc->hn_ndis_tso_szmax,
+				    sc->hn_ndis_tso_sgmin);
+			}
+		}
+	}
+
+	/*
+	 * Checksum offloads.  For each protocol the TX side is decided
+	 * first; the RX check that follows reads the value just written
+	 * to pick between RX-only and TX+RX.  Only TX offloads are
+	 * recorded in caps.
+	 */
+
+	/* IPv4 checksum */
+	if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HN_NDIS_TXCSUM_CAP_IP4) ==
+	    HN_NDIS_TXCSUM_CAP_IP4) {
+		caps |= HN_CAP_IPCS;
+		params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TX;
+	}
+	if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_IP4) {
+		if (params.ndis_ip4csum == NDIS_OFFLOAD_PARAM_TX)
+			params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TXRX;
+		else
+			params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_RX;
+	}
+
+	/* TCP4 checksum */
+	if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HN_NDIS_TXCSUM_CAP_TCP4) ==
+	    HN_NDIS_TXCSUM_CAP_TCP4) {
+		caps |= HN_CAP_TCP4CS;
+		params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TX;
+	}
+	if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_TCP4) {
+		if (params.ndis_tcp4csum == NDIS_OFFLOAD_PARAM_TX)
+			params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TXRX;
+		else
+			params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_RX;
+	}
+
+	/* UDP4 checksum */
+	if (hwcaps.ndis_csum.ndis_ip4_txcsum & NDIS_TXCSUM_CAP_UDP4) {
+		caps |= HN_CAP_UDP4CS;
+		params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TX;
+	}
+	if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_UDP4) {
+		if (params.ndis_udp4csum == NDIS_OFFLOAD_PARAM_TX)
+			params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TXRX;
+		else
+			params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_RX;
+	}
+
+	/* TCP6 checksum */
+	if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HN_NDIS_TXCSUM_CAP_TCP6) ==
+	    HN_NDIS_TXCSUM_CAP_TCP6) {
+		caps |= HN_CAP_TCP6CS;
+		params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TX;
+	}
+	if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_TCP6) {
+		if (params.ndis_tcp6csum == NDIS_OFFLOAD_PARAM_TX)
+			params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TXRX;
+		else
+			params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_RX;
+	}
+
+	/* UDP6 checksum */
+	if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HN_NDIS_TXCSUM_CAP_UDP6) ==
+	    HN_NDIS_TXCSUM_CAP_UDP6) {
+		caps |= HN_CAP_UDP6CS;
+		params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TX;
+	}
+	if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_UDP6) {
+		if (params.ndis_udp6csum == NDIS_OFFLOAD_PARAM_TX)
+			params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TXRX;
+		else
+			params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_RX;
+	}
+
+	/*
+	 * RSC offload: switched on only if both IPv4 and IPv6 RSC are
+	 * reported, otherwise explicitly switched off.
+	 *
+	 * NOTE(review): the revision of the hwcaps object is compared
+	 * against NDIS_OFFLOAD_PARAMS_REV_3 here, while
+	 * hn_rndis_query_hwcaps() uses NDIS_OFFLOAD_REV_3 for the same
+	 * field -- presumably both constants have the same value; confirm.
+	 */
+	/* RSC offload */
+	if (hwcaps.ndis_hdr.ndis_rev >= NDIS_OFFLOAD_PARAMS_REV_3) {
+		if (hwcaps.ndis_rsc.ndis_ip4 && hwcaps.ndis_rsc.ndis_ip6) {
+			params.ndis_rsc_ip4 = NDIS_OFFLOAD_RSC_ON;
+			params.ndis_rsc_ip6 = NDIS_OFFLOAD_RSC_ON;
+		} else {
+			params.ndis_rsc_ip4 = NDIS_OFFLOAD_RSC_OFF;
+			params.ndis_rsc_ip6 = NDIS_OFFLOAD_RSC_OFF;
+		}
+	}
+
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "offload csum: "
+		    "ip4 %u, tcp4 %u, udp4 %u, tcp6 %u, udp6 %u\n",
+		    params.ndis_ip4csum,
+		    params.ndis_tcp4csum,
+		    params.ndis_udp4csum,
+		    params.ndis_tcp6csum,
+		    params.ndis_udp6csum);
+		if_printf(sc->hn_ifp, "offload lsov2: ip4 %u, ip6 %u\n",
+		    params.ndis_lsov2_ip4,
+		    params.ndis_lsov2_ip6);
+		if (hwcaps.ndis_hdr.ndis_rev >= NDIS_OFFLOAD_PARAMS_REV_3)
+			if_printf(sc->hn_ifp, "offload rsc: ip4 %u, ip6 %u\n",
+			    params.ndis_rsc_ip4,
+			    params.ndis_rsc_ip6);
+	}
+
+	/* Only the first paramsz bytes of params are sent to the host. */
+	error = hn_rndis_set(sc, OID_TCP_OFFLOAD_PARAMETERS, &params, paramsz);
+	if (error) {
+		if_printf(sc->hn_ifp, "offload config failed: %d\n", error);
+		return (error);
+	}
+
+	if (bootverbose)
+		if_printf(sc->hn_ifp, "offload config done\n");
+	/* The capabilities are committed only after the host accepted them. */
+	sc->hn_caps |= caps;
+	return (0);
+}
+
+/*
+ * Send the RSS (Toeplitz) parameters stored in sc->hn_rss to the host.
+ *
+ * The parameter header inside sc->hn_rss is rebuilt here from
+ * sc->hn_rss_hash, sc->hn_rss_ind_size and flags; the key and the
+ * indirect table must already have been filled in by the caller.
+ *
+ * Returns 0 on success, or the error from the set request.
+ */
+int
+hn_rndis_conf_rss(struct hn_softc *sc, uint16_t flags)
+{
+	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
+	struct ndis_rss_params *prm = &rss->rss_params;
+	int error, rss_size;
+
+	/*
+	 * Only NDIS 6.20+ is supported:
+	 * We only support 4bytes element in indirect table, which has been
+	 * adopted since NDIS 6.20.
+	 */
+	KASSERT(sc->hn_ndis_ver >= HN_NDIS_VERSION_6_20,
+	    ("NDIS 6.20+ is required, NDIS version 0x%08x", sc->hn_ndis_ver));
+
+	/* XXX only one can be specified though, popcnt? */
+	KASSERT((sc->hn_rss_hash & NDIS_HASH_FUNCTION_MASK),
+	    ("no hash func %08x", sc->hn_rss_hash));
+	KASSERT((sc->hn_rss_hash & NDIS_HASH_STD),
+	    ("no standard hash types %08x", sc->hn_rss_hash));
+	KASSERT(sc->hn_rss_ind_size > 0, ("no indirect table size"));
+
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "RSS indirect table size %d, "
+		    "hash 0x%08x\n", sc->hn_rss_ind_size, sc->hn_rss_hash);
+	}
+
+	rss_size = NDIS_RSSPRM_TOEPLITZ_SIZE(sc->hn_rss_ind_size);
+
+	/*
+	 * NOTE:
+	 * Only the parameter header is cleared and rebuilt; rss_key and
+	 * rss_ind, which are setup by the caller, MUST NOT be whacked.
+	 */
+	memset(prm, 0, sizeof(*prm));
+
+	/* Object header. */
+	prm->ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_PARAMS;
+	prm->ndis_hdr.ndis_rev = NDIS_RSS_PARAMS_REV_2;
+	prm->ndis_hdr.ndis_size = rss_size;
+
+	/* Hash function and standard hash types; nothing else is passed. */
+	prm->ndis_flags = flags;
+	prm->ndis_hash = sc->hn_rss_hash &
+	    (NDIS_HASH_FUNCTION_MASK | NDIS_HASH_STD);
+
+	/* Location and size of the key and of the indirect table. */
+	prm->ndis_keyoffset =
+	    __offsetof(struct ndis_rssprm_toeplitz, rss_key[0]);
+	prm->ndis_keysize = sizeof(rss->rss_key);
+	prm->ndis_indoffset =
+	    __offsetof(struct ndis_rssprm_toeplitz, rss_ind[0]);
+	prm->ndis_indsize = sizeof(rss->rss_ind[0]) * sc->hn_rss_ind_size;
+
+	error = hn_rndis_set(sc, OID_GEN_RECEIVE_SCALE_PARAMETERS,
+	    rss, rss_size);
+	if (error != 0) {
+		if_printf(sc->hn_ifp, "RSS config failed: %d\n", error);
+		return (error);
+	}
+
+	if (bootverbose)
+		if_printf(sc->hn_ifp, "RSS config done\n");
+	return (0);
+}
+
+/*
+ * Set the current packet filter (OID_GEN_CURRENT_PACKET_FILTER).
+ *
+ * Returns 0 on success, or the error from the set request.
+ */
+int
+hn_rndis_set_rxfilter(struct hn_softc *sc, uint32_t filter)
+{
+	int error;
+
+	error = hn_rndis_set(sc, OID_GEN_CURRENT_PACKET_FILTER,
+	    &filter, sizeof(filter));
+	if (error != 0) {
+		if_printf(sc->hn_ifp, "set RX filter 0x%08x failed: %d\n",
+		    filter, error);
+		return (error);
+	}
+
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "set RX filter 0x%08x done\n",
+		    filter);
+	}
+	return (0);
+}
+
+/*
+ * Send the RNDIS initialize request and record the packet aggregation
+ * limits (size, packet count, alignment) from the completion in the
+ * softc.
+ *
+ * Returns 0 on success, ENXIO if no xact is available, and EIO if the
+ * exchange failed or the host reported a non-success status.
+ */
+static int
+hn_rndis_init(struct hn_softc *sc)
+{
+	struct rndis_init_req *req;
+	const struct rndis_init_comp *comp;
+	struct vmbus_xact *xact;
+	size_t comp_len;
+	uint32_t rid;
+	int error;
+
+	xact = vmbus_xact_get(sc->hn_xact, sizeof(*req));
+	if (xact == NULL) {
+		if_printf(sc->hn_ifp, "no xact for RNDIS init\n");
+		return (ENXIO);
+	}
+	rid = hn_rndis_rid(sc);
+	req = vmbus_xact_req_data(xact);
+	req->rm_type = REMOTE_NDIS_INITIALIZE_MSG;
+	req->rm_len = sizeof(*req);
+	req->rm_rid = rid;
+	req->rm_ver_major = RNDIS_VERSION_MAJOR;
+	req->rm_ver_minor = RNDIS_VERSION_MINOR;
+	req->rm_max_xfersz = HN_RNDIS_XFER_SIZE;
+
+	comp_len = RNDIS_INIT_COMP_SIZE_MIN;
+	comp = hn_rndis_xact_execute(sc, xact, rid, sizeof(*req), &comp_len,
+	    REMOTE_NDIS_INITIALIZE_CMPLT);
+	if (comp == NULL) {
+		if_printf(sc->hn_ifp, "exec RNDIS init failed\n");
+		error = EIO;
+		goto done;
+	}
+
+	if (comp->rm_status != RNDIS_STATUS_SUCCESS) {
+		if_printf(sc->hn_ifp, "RNDIS init failed: status 0x%08x\n",
+		    comp->rm_status);
+		error = EIO;
+		goto done;
+	}
+	sc->hn_rndis_agg_size = comp->rm_pktmaxsz;
+	sc->hn_rndis_agg_pkts = comp->rm_pktmaxcnt;
+
+	/*
+	 * rm_align is a shift count supplied by the remote side.  Shifting
+	 * a 32 bits value by 32 or more is undefined, so such a count is
+	 * never fed to the shift; the minimum alignment is used instead.
+	 */
+	if (comp->rm_align < 32) {
+		sc->hn_rndis_agg_align = 1U << comp->rm_align;
+	} else {
+		if_printf(sc->hn_ifp, "invalid RNDIS aggpkt align shift %u, "
+		    "use %zu\n", comp->rm_align, sizeof(uint32_t));
+		sc->hn_rndis_agg_align = sizeof(uint32_t);
+	}
+
+	if (sc->hn_rndis_agg_align < sizeof(uint32_t)) {
+		/*
+		 * The RNDIS packet message encap assumes that the RNDIS
+		 * packet message is at least 4 bytes aligned.  Fix up the
+		 * alignment here, if the remote side sets the alignment
+		 * too low.
+		 */
+		if_printf(sc->hn_ifp, "fixup RNDIS aggpkt align: %u -> %zu\n",
+		    sc->hn_rndis_agg_align, sizeof(uint32_t));
+		sc->hn_rndis_agg_align = sizeof(uint32_t);
+	}
+
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "RNDIS ver %u.%u, "
+		    "aggpkt size %u, aggpkt cnt %u, aggpkt align %u\n",
+		    comp->rm_ver_major, comp->rm_ver_minor,
+		    sc->hn_rndis_agg_size, sc->hn_rndis_agg_pkts,
+		    sc->hn_rndis_agg_align);
+	}
+	error = 0;
+done:
+	vmbus_xact_put(xact);
+	return (error);
+}
+
+/*
+ * Send the RNDIS halt message.
+ *
+ * There is no RNDIS completion for halt, so the xact is hooked up to the
+ * NVS send context (hn_nvs_sent_xact callback) instead.  The outcome of
+ * the exchange is not checked.
+ *
+ * Returns ENXIO if no xact is available, 0 otherwise.
+ */
+static int
+hn_rndis_halt(struct hn_softc *sc)
+{
+	struct hn_nvs_sendctx sndc;
+	struct rndis_halt_req *req;
+	struct vmbus_xact *xact;
+	size_t unused_len;
+
+	xact = vmbus_xact_get(sc->hn_xact, sizeof(*req));
+	if (xact == NULL) {
+		if_printf(sc->hn_ifp, "no xact for RNDIS halt\n");
+		return (ENXIO);
+	}
+
+	req = vmbus_xact_req_data(xact);
+	req->rm_type = REMOTE_NDIS_HALT_MSG;
+	req->rm_len = sizeof(*req);
+	req->rm_rid = hn_rndis_rid(sc);
+
+	/* No RNDIS completion; rely on NVS message send completion */
+	hn_nvs_sendctx_init(&sndc, hn_nvs_sent_xact, xact);
+	hn_rndis_xact_exec1(sc, xact, sizeof(*req), &sndc, &unused_len);
+	vmbus_xact_put(xact);
+
+	if (bootverbose)
+		if_printf(sc->hn_ifp, "RNDIS halt done\n");
+	return (0);
+}
+
+/*
+ * Query the hardware offload capabilities
+ * (OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES) into *caps and verify the
+ * returned object header.
+ *
+ * The revision and size of the request follow the NDIS version of the
+ * softc.  *caps is cleared first, so members the host did not return
+ * read as 0.
+ *
+ * Returns 0 on success, EINVAL if the returned object is malformed, or
+ * the error from the query itself.
+ */
+static int
+hn_rndis_query_hwcaps(struct hn_softc *sc, struct ndis_offload *caps)
+{
+	struct ndis_offload in;
+	size_t caps_len, size;
+	int error;
+
+	memset(&in, 0, sizeof(in));
+	in.ndis_hdr.ndis_type = NDIS_OBJTYPE_OFFLOAD;
+	if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_30) {
+		in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_3;
+		size = NDIS_OFFLOAD_SIZE;
+	} else if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_1) {
+		in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_2;
+		size = NDIS_OFFLOAD_SIZE_6_1;
+	} else {
+		in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_1;
+		size = NDIS_OFFLOAD_SIZE_6_0;
+	}
+	in.ndis_hdr.ndis_size = size;
+
+	/*
+	 * The query may succeed while storing fewer bytes than the object
+	 * header, or none at all.  Start from a zeroed object so that the
+	 * verification below rejects such answers instead of inspecting
+	 * whatever the caller's buffer happened to contain.
+	 */
+	memset(caps, 0, sizeof(*caps));
+
+	caps_len = NDIS_OFFLOAD_SIZE;
+	error = hn_rndis_query2(sc, OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES,
+	    &in, size, caps, &caps_len, NDIS_OFFLOAD_SIZE_6_0);
+	if (error)
+		return (error);
+
+	/*
+	 * Preliminary verification.
+	 */
+	if (caps->ndis_hdr.ndis_type != NDIS_OBJTYPE_OFFLOAD) {
+		if_printf(sc->hn_ifp, "invalid NDIS objtype 0x%02x\n",
+		    caps->ndis_hdr.ndis_type);
+		return (EINVAL);
+	}
+	if (caps->ndis_hdr.ndis_rev < NDIS_OFFLOAD_REV_1) {
+		if_printf(sc->hn_ifp, "invalid NDIS objrev 0x%02x\n",
+		    caps->ndis_hdr.ndis_rev);
+		return (EINVAL);
+	}
+	if (caps->ndis_hdr.ndis_size > caps_len) {
+		if_printf(sc->hn_ifp, "invalid NDIS objsize %u, "
+		    "data size %zu\n", caps->ndis_hdr.ndis_size, caps_len);
+		return (EINVAL);
+	} else if (caps->ndis_hdr.ndis_size < NDIS_OFFLOAD_SIZE_6_0) {
+		if_printf(sc->hn_ifp, "invalid NDIS objsize %u\n",
+		    caps->ndis_hdr.ndis_size);
+		return (EINVAL);
+	} else if (caps->ndis_hdr.ndis_rev >= NDIS_OFFLOAD_REV_3 &&
+		   caps->ndis_hdr.ndis_size < NDIS_OFFLOAD_SIZE) {
+		/* A rev3 object must come with the full-size layout. */
+		if_printf(sc->hn_ifp, "invalid NDIS rev3 objsize %u\n",
+		    caps->ndis_hdr.ndis_size);
+		return (EINVAL);
+	}
+
+	if (bootverbose) {
+		/*
+		 * NOTE:
+		 * caps->ndis_hdr.ndis_size MUST be checked before accessing
+		 * NDIS 6.1+ specific fields.
+		 */
+		if_printf(sc->hn_ifp, "hwcaps rev %u\n",
+		    caps->ndis_hdr.ndis_rev);
+
+		if_printf(sc->hn_ifp, "hwcaps csum: "
+		    "ip4 tx 0x%x/0x%x rx 0x%x/0x%x, "
+		    "ip6 tx 0x%x/0x%x rx 0x%x/0x%x\n",
+		    caps->ndis_csum.ndis_ip4_txcsum,
+		    caps->ndis_csum.ndis_ip4_txenc,
+		    caps->ndis_csum.ndis_ip4_rxcsum,
+		    caps->ndis_csum.ndis_ip4_rxenc,
+		    caps->ndis_csum.ndis_ip6_txcsum,
+		    caps->ndis_csum.ndis_ip6_txenc,
+		    caps->ndis_csum.ndis_ip6_rxcsum,
+		    caps->ndis_csum.ndis_ip6_rxenc);
+		if_printf(sc->hn_ifp, "hwcaps lsov2: "
+		    "ip4 maxsz %u minsg %u encap 0x%x, "
+		    "ip6 maxsz %u minsg %u encap 0x%x opts 0x%x\n",
+		    caps->ndis_lsov2.ndis_ip4_maxsz,
+		    caps->ndis_lsov2.ndis_ip4_minsg,
+		    caps->ndis_lsov2.ndis_ip4_encap,
+		    caps->ndis_lsov2.ndis_ip6_maxsz,
+		    caps->ndis_lsov2.ndis_ip6_minsg,
+		    caps->ndis_lsov2.ndis_ip6_encap,
+		    caps->ndis_lsov2.ndis_ip6_opts);
+		if (caps->ndis_hdr.ndis_rev >= NDIS_OFFLOAD_REV_3)
+			if_printf(sc->hn_ifp, "hwcaps rsc: "
+			    "ip4 %u ip6 %u\n",
+			    caps->ndis_rsc.ndis_ip4,
+			    caps->ndis_rsc.ndis_ip6);
+	}
+	return (0);
+}
+
+/*
+ * Bring up RNDIS: initialize it and then configure the NDIS offload
+ * settings.
+ *
+ * *init_done is set to 1 once the RNDIS initialization succeeded, 0
+ * otherwise.  A failure to configure the offload settings is not
+ * reported to the caller.
+ *
+ * Returns 0 on success, or the error from the RNDIS initialization.
+ */
+int
+hn_rndis_attach(struct hn_softc *sc, int mtu, int *init_done)
+{
+	int error;
+
+	*init_done = 0;
+
+	/* Initialize RNDIS. */
+	error = hn_rndis_init(sc);
+	if (error == 0) {
+		*init_done = 1;
+
+		/* Configure NDIS offload settings. */
+		hn_rndis_conf_offload(sc, mtu);
+	}
+	return (error);
+}
+
+/*
+ * Tear down RNDIS by sending the halt message; the result of the halt is
+ * not checked.
+ */
+void
+hn_rndis_detach(struct hn_softc *sc)
+{
+
+	(void)hn_rndis_halt(sc);
+}
diff --git a/sys/dev/hyperv/netvsc/hn_rndis.h b/sys/dev/hyperv/netvsc/hn_rndis.h
new file mode 100644
index 000000000000..4610d5a10526
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/hn_rndis.h
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2010-2012 Citrix Inc.
+ * Copyright (c) 2012 NetApp Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HN_RNDIS_H_
+#define _HN_RNDIS_H_
+
+/* Only pointers to the softc appear below; a forward declaration suffices. */
+struct hn_softc;
+
+/*
+ * RNDIS entry points.  hn_rndis_attach() returns 0 or an error value;
+ * the other int-returning functions presumably follow the same
+ * convention -- confirm against hn_rndis.c.
+ */
+/* init_done: cleared on entry, set to 1 once RNDIS has been initialized. */
+int		hn_rndis_attach(struct hn_softc *sc, int mtu, int *init_done);
+/* Halts RNDIS. */
+void		hn_rndis_detach(struct hn_softc *sc);
+int		hn_rndis_conf_rss(struct hn_softc *sc, uint16_t flags);
+int		hn_rndis_query_rsscaps(struct hn_softc *sc, int *rxr_cnt);
+int		hn_rndis_get_eaddr(struct hn_softc *sc, uint8_t *eaddr);
+/* link_status: NDIS_MEDIA_STATE_ */
+int		hn_rndis_get_linkstatus(struct hn_softc *sc,
+		    uint32_t *link_status);
+int		hn_rndis_get_mtu(struct hn_softc *sc, uint32_t *mtu);
+/* filter: NDIS_PACKET_TYPE_. */
+int		hn_rndis_set_rxfilter(struct hn_softc *sc, uint32_t filter);
+void		hn_rndis_rx_ctrl(struct hn_softc *sc, const void *data,
+		    int dlen);
+
+#endif  /* !_HN_RNDIS_H_ */
diff --git a/sys/dev/hyperv/netvsc/if_hn.c b/sys/dev/hyperv/netvsc/if_hn.c
new file mode 100644
index 000000000000..d562a937ecad
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/if_hn.c
@@ -0,0 +1,7717 @@
+/*-
+ * Copyright (c) 2010-2012 Citrix Inc.
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 2004-2006 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_hn.h"
+#include "opt_inet6.h"
+#include "opt_inet.h"
+#include "opt_rss.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/counter.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/proc.h>
+#include <sys/rmlock.h>
+#include <sys/sbuf.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/buf_ring.h>
+#include <sys/eventhandler.h>
+#include <sys/epoch.h>
+
+#include <machine/atomic.h>
+#include <machine/in_cksum.h>
+
+#include <net/bpf.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_media.h>
+#include <net/if_types.h>
+#include <net/if_var.h>
+#include <net/rndis.h>
+#ifdef RSS
+#include <net/rss_config.h>
+#endif
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_lro.h>
+#include <netinet/udp.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+
+#include <dev/hyperv/netvsc/ndis.h>
+#include <dev/hyperv/netvsc/if_hnreg.h>
+#include <dev/hyperv/netvsc/if_hnvar.h>
+#include <dev/hyperv/netvsc/hn_nvs.h>
+#include <dev/hyperv/netvsc/hn_rndis.h>
+
+#include "vmbus_if.h"
+
+#define HN_IFSTART_SUPPORT
+
+#define HN_RING_CNT_DEF_MAX		8
+
+#define HN_VFMAP_SIZE_DEF		8
+
+#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
+
+/* YYY should get it from the underlying channel */
+#define HN_TX_DESC_CNT			512
+
+#define HN_RNDIS_PKT_LEN					\
+	(sizeof(struct rndis_packet_msg) +			\
+	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
+	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
+	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
+	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
+#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
+#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
+
+#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
+#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
+#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
+/* -1 for RNDIS packet message */
+#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
+
+#define HN_DIRECT_TX_SIZE_DEF		128
+
+#define HN_EARLY_TXEOF_THRESH		8
+
+#define HN_PKTBUF_LEN_DEF		(16 * 1024)
+
+#define HN_LROENT_CNT_DEF		128
+
+#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
+#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
+/* YYY 2*MTU is a bit rough, but should be good enough. */
+#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
+
+#define HN_LRO_ACKCNT_DEF		1
+
+/*
+ * Per-softc lock: an sx lock stored in hn_lock and named after the
+ * device's name/unit string.
+ */
+#define HN_LOCK_INIT(sc)		\
+	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
+#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
+/* Assert that the softc lock is held exclusively. */
+#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
+/*
+ * Take the softc lock exclusively.  Instead of sleeping in the lock,
+ * this polls with sx_try_xlock(); after every failed attempt it gives
+ * up the CPU with sched_relinquish() and calls DELAY(1000) before
+ * trying again.
+ */
+#define HN_LOCK(sc)					\
+do {							\
+	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
+		/* Relinquish cpu to avoid deadlock */	\
+		sched_relinquish(curthread);		\
+		DELAY(1000);				\
+	}						\
+} while (0)
+/* Drop the exclusively held softc lock. */
+#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
+
+#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
+#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
+#define HN_CSUM_IP_HWASSIST(sc)		\
+	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
+#define HN_CSUM_IP6_HWASSIST(sc)	\
+	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
+
+#define HN_PKTSIZE_MIN(align)		\
+	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
+	    HN_RNDIS_PKT_LEN, (align))
+#define HN_PKTSIZE(m, align)		\
+	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
+
+#ifdef RSS
+#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
+#else
+#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
+#endif
+
+/*
+ * TX descriptor: the state kept for one packet handed to a TX ring.
+ */
+struct hn_txdesc {
+#ifndef HN_USE_TXDESC_BUFRING
+	/* List linkage; only present when txdescs are not kept in a buf_ring. */
+	SLIST_ENTRY(hn_txdesc)		link;
+#endif
+	/* Linkage for an agg_list -- presumably the aggregating txdesc's. */
+	STAILQ_ENTRY(hn_txdesc)		agg_link;
+
+	/* Aggregated txdescs, in sending order. */
+	STAILQ_HEAD(, hn_txdesc)	agg_list;
+
+	/* The oldest packet, if transmission aggregation happens. */
+	struct mbuf			*m;
+	/* TX ring this descriptor is associated with. */
+	struct hn_tx_ring		*txr;
+	int				refs;
+	uint32_t			flags;	/* HN_TXD_FLAG_ */
+	/* Send context passed along with the NVS send request. */
+	struct hn_nvs_sendctx		send_ctx;
+	/*
+	 * Chimney sending buffer index and size.  hn_txpkt_chim() requires
+	 * a valid index and a size > 0; hn_txpkt_sglist() requires
+	 * HN_NVS_CHIM_IDX_INVALID and a size of 0.
+	 */
+	uint32_t			chim_index;
+	int				chim_size;
+
+	bus_dmamap_t			data_dmap;
+
+	/*
+	 * RNDIS packet message: bus address, kernel pointer and DMA map
+	 * (NOTE(review): presumably all three describe the same buffer --
+	 * confirm against the txdesc setup code).
+	 */
+	bus_addr_t			rndis_pkt_paddr;
+	struct rndis_packet_msg		*rndis_pkt;
+	bus_dmamap_t			rndis_pkt_dmap;
+};
+
+#define HN_TXD_FLAG_ONLIST		0x0001
+#define HN_TXD_FLAG_DMAMAP		0x0002
+#define HN_TXD_FLAG_ONAGG		0x0004
+
+/*
+ * HN_NDIS_PKTINFO_ bits (NOTE(review): presumably the values carried in
+ * packet_info_id.flag -- confirm against the RX path).
+ */
+#define	HN_NDIS_PKTINFO_SUBALLOC	0x01
+#define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
+#define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
+
+/* Packet info id record: version, flag byte and a 16-bit packet id. */
+struct packet_info_id {
+	uint8_t				ver;
+	uint8_t				flag;
+	uint16_t			pkt_id;
+};
+
+/* Size in bytes of struct packet_info_id. */
+#define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
+
+
+/*
+ * Read-only pointers to the pieces of per-packet information of a
+ * received packet (NOTE(review): presumably filled in by
+ * hn_rndis_rxinfo(), one member per HN_RXINFO_ bit -- confirm there).
+ */
+struct hn_rxinfo {
+	const uint32_t			*vlan_info;
+	const uint32_t			*csum_info;
+	const uint32_t			*hash_info;
+	const uint32_t			*hash_value;
+	const struct packet_info_id	*pktinfo_id;
+};
+
+/*
+ * Pairs an RX ring with a VF ifnet (NOTE(review): presumably the
+ * argument handed to hn_rxvf_set_task() -- confirm at the call site).
+ */
+struct hn_rxvf_setarg {
+	struct hn_rx_ring	*rxr;
+	struct ifnet		*vf_ifp;
+};
+
+#define HN_RXINFO_VLAN			0x0001
+#define HN_RXINFO_CSUM			0x0002
+#define HN_RXINFO_HASHINF		0x0004
+#define HN_RXINFO_HASHVAL		0x0008
+#define HN_RXINFO_PKTINFO_ID		0x0010
+#define HN_RXINFO_ALL			\
+	(HN_RXINFO_VLAN |		\
+	 HN_RXINFO_CSUM |		\
+	 HN_RXINFO_HASHINF |		\
+	 HN_RXINFO_HASHVAL |		\
+	 HN_RXINFO_PKTINFO_ID)
+
+static int			hn_probe(device_t);
+static int			hn_attach(device_t);
+static int			hn_detach(device_t);
+static int			hn_shutdown(device_t);
+static void			hn_chan_callback(struct vmbus_channel *,
+				    void *);
+
+static void			hn_init(void *);
+static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
+#ifdef HN_IFSTART_SUPPORT
+static void			hn_start(struct ifnet *);
+#endif
+static int			hn_transmit(struct ifnet *, struct mbuf *);
+static void			hn_xmit_qflush(struct ifnet *);
+static int			hn_ifmedia_upd(struct ifnet *);
+static void			hn_ifmedia_sts(struct ifnet *,
+				    struct ifmediareq *);
+
+static void			hn_ifnet_event(void *, struct ifnet *, int);
+static void			hn_ifaddr_event(void *, struct ifnet *);
+static void			hn_ifnet_attevent(void *, struct ifnet *);
+static void			hn_ifnet_detevent(void *, struct ifnet *);
+static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
+
+static bool			hn_ismyvf(const struct hn_softc *,
+				    const struct ifnet *);
+static void			hn_rxvf_change(struct hn_softc *,
+				    struct ifnet *, bool);
+static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
+static void			hn_rxvf_set_task(void *, int);
+static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
+static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
+static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
+				    struct ifreq *);
+static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
+static bool			hn_xpnt_vf_isready(struct hn_softc *);
+static void			hn_xpnt_vf_setready(struct hn_softc *);
+static void			hn_xpnt_vf_init_taskfunc(void *, int);
+static void			hn_xpnt_vf_init(struct hn_softc *);
+static void			hn_xpnt_vf_setenable(struct hn_softc *);
+static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
+static void			hn_vf_rss_fixup(struct hn_softc *, bool);
+static void			hn_vf_rss_restore(struct hn_softc *);
+
+static int			hn_rndis_rxinfo(const void *, int,
+				    struct hn_rxinfo *);
+static void			hn_rndis_rx_data(struct hn_rx_ring *,
+				    const void *, int);
+static void			hn_rndis_rx_status(struct hn_softc *,
+				    const void *, int);
+static void			hn_rndis_init_fixat(struct hn_softc *, int);
+
+static void			hn_nvs_handle_notify(struct hn_softc *,
+				    const struct vmbus_chanpkt_hdr *);
+static void			hn_nvs_handle_comp(struct hn_softc *,
+				    struct vmbus_channel *,
+				    const struct vmbus_chanpkt_hdr *);
+static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
+				    struct vmbus_channel *,
+				    const struct vmbus_chanpkt_hdr *);
+static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
+				    struct vmbus_channel *, uint64_t);
+
+#if __FreeBSD_version >= 1100099
+static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
+#endif
+static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
+#if __FreeBSD_version < 1100095
+static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
+#else
+static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
+#endif
+static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
+#ifndef RSS
+static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
+#endif
+static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
+static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
+
+static void			hn_stop(struct hn_softc *, bool);
+static void			hn_init_locked(struct hn_softc *);
+static int			hn_chan_attach(struct hn_softc *,
+				    struct vmbus_channel *);
+static void			hn_chan_detach(struct hn_softc *,
+				    struct vmbus_channel *);
+static int			hn_attach_subchans(struct hn_softc *);
+static void			hn_detach_allchans(struct hn_softc *);
+static void			hn_chan_rollup(struct hn_rx_ring *,
+				    struct hn_tx_ring *);
+static void			hn_set_ring_inuse(struct hn_softc *, int);
+static int			hn_synth_attach(struct hn_softc *, int);
+static void			hn_synth_detach(struct hn_softc *);
+static int			hn_synth_alloc_subchans(struct hn_softc *,
+				    int *);
+static bool			hn_synth_attachable(const struct hn_softc *);
+static void			hn_suspend(struct hn_softc *);
+static void			hn_suspend_data(struct hn_softc *);
+static void			hn_suspend_mgmt(struct hn_softc *);
+static void			hn_resume(struct hn_softc *);
+static void			hn_resume_data(struct hn_softc *);
+static void			hn_resume_mgmt(struct hn_softc *);
+static void			hn_suspend_mgmt_taskfunc(void *, int);
+static void			hn_chan_drain(struct hn_softc *,
+				    struct vmbus_channel *);
+static void			hn_disable_rx(struct hn_softc *);
+static void			hn_drain_rxtx(struct hn_softc *, int);
+static void			hn_polling(struct hn_softc *, u_int);
+static void			hn_chan_polling(struct vmbus_channel *, u_int);
+static void			hn_mtu_change_fixup(struct hn_softc *);
+
+static void			hn_update_link_status(struct hn_softc *);
+static void			hn_change_network(struct hn_softc *);
+static void			hn_link_taskfunc(void *, int);
+static void			hn_netchg_init_taskfunc(void *, int);
+static void			hn_netchg_status_taskfunc(void *, int);
+static void			hn_link_status(struct hn_softc *);
+
+static int			hn_create_rx_data(struct hn_softc *, int);
+static void			hn_destroy_rx_data(struct hn_softc *);
+static int			hn_check_iplen(const struct mbuf *, int);
+static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
+static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
+static int			hn_rxfilter_config(struct hn_softc *);
+static int			hn_rss_reconfig(struct hn_softc *);
+static void			hn_rss_ind_fixup(struct hn_softc *);
+static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
+static int			hn_rxpkt(struct hn_rx_ring *);
+static uint32_t			hn_rss_type_fromndis(uint32_t);
+static uint32_t			hn_rss_type_tondis(uint32_t);
+
+static int			hn_tx_ring_create(struct hn_softc *, int);
+static void			hn_tx_ring_destroy(struct hn_tx_ring *);
+static int			hn_create_tx_data(struct hn_softc *, int);
+static void			hn_fixup_tx_data(struct hn_softc *);
+static void			hn_fixup_rx_data(struct hn_softc *);
+static void			hn_destroy_tx_data(struct hn_softc *);
+static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
+static void			hn_txdesc_gc(struct hn_tx_ring *,
+				    struct hn_txdesc *);
+static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
+				    struct hn_txdesc *, struct mbuf **);
+static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
+				    struct hn_txdesc *);
+static void			hn_set_chim_size(struct hn_softc *, int);
+static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
+static bool			hn_tx_ring_pending(struct hn_tx_ring *);
+static void			hn_tx_ring_qflush(struct hn_tx_ring *);
+static void			hn_resume_tx(struct hn_softc *, int);
+static void			hn_set_txagg(struct hn_softc *);
+static void			*hn_try_txagg(struct ifnet *,
+				    struct hn_tx_ring *, struct hn_txdesc *,
+				    int);
+static int			hn_get_txswq_depth(const struct hn_tx_ring *);
+static void			hn_txpkt_done(struct hn_nvs_sendctx *,
+				    struct hn_softc *, struct vmbus_channel *,
+				    const void *, int);
+static int			hn_txpkt_sglist(struct hn_tx_ring *,
+				    struct hn_txdesc *);
+static int			hn_txpkt_chim(struct hn_tx_ring *,
+				    struct hn_txdesc *);
+static int			hn_xmit(struct hn_tx_ring *, int);
+static void			hn_xmit_taskfunc(void *, int);
+static void			hn_xmit_txeof(struct hn_tx_ring *);
+static void			hn_xmit_txeof_taskfunc(void *, int);
+#ifdef HN_IFSTART_SUPPORT
+static int			hn_start_locked(struct hn_tx_ring *, int);
+static void			hn_start_taskfunc(void *, int);
+static void			hn_start_txeof(struct hn_tx_ring *);
+static void			hn_start_txeof_taskfunc(void *, int);
+#endif
+
+SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
+    "Hyper-V network interface");
+
+/* Trust tcp segment verification on host side. */
+static int			hn_trust_hosttcp = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
+    &hn_trust_hosttcp, 0,
+    "Trust tcp segment verification on host side, "
+    "when csum info is missing (global setting)");
+
+/* Trust udp datagrams verification on host side. */
+static int			hn_trust_hostudp = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
+    &hn_trust_hostudp, 0,
+    "Trust udp datagram verification on host side, "
+    "when csum info is missing (global setting)");
+
+/* Trust ip packets verification on host side. */
+static int			hn_trust_hostip = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
+    &hn_trust_hostip, 0,
+    "Trust ip packet verification on host side, "
+    "when csum info is missing (global setting)");
+
+/*
+ * Offload UDP/IPv4 checksum.
+ */
+static int			hn_enable_udp4cs = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
+    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
+
+/*
+ * Offload UDP/IPv6 checksum.
+ */
+static int			hn_enable_udp6cs = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
+    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
+
+/* Stats. */
+static counter_u64_t		hn_udpcs_fixup;
+SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
+    &hn_udpcs_fixup, "# of UDP checksum fixup");
+
+/*
+ * See hn_set_hlen().
+ *
+ * This value is for Azure.  For Hyper-V, set this above
+ * 65536 to disable UDP datagram checksum fixup.
+ */
+static int			hn_udpcs_fixup_mtu = 1420;
+SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
+    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
+
+/* Limit TSO burst size */
+static int			hn_tso_maxlen = IP_MAXPACKET;
+SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
+    &hn_tso_maxlen, 0, "TSO burst limit");
+
+/* Limit chimney send size */
+static int			hn_tx_chimney_size = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
+    &hn_tx_chimney_size, 0, "Chimney send packet size limit");
+
+/* Limit the size of packet for direct transmission */
+static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
+SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
+    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
+
+/* # of LRO entries per RX ring */
+#if defined(INET) || defined(INET6)
+#if __FreeBSD_version >= 1100095
+static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
+SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
+    &hn_lro_entry_count, 0, "LRO entry count");
+#endif
+#endif
+
+static int			hn_tx_taskq_cnt = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
+    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
+
+#define HN_TX_TASKQ_M_INDEP	0
+#define HN_TX_TASKQ_M_GLOBAL	1
+#define HN_TX_TASKQ_M_EVTTQ	2
+
+static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
+    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
+    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
+
+#ifndef HN_USE_TXDESC_BUFRING
+static int			hn_use_txdesc_bufring = 0;
+#else
+static int			hn_use_txdesc_bufring = 1;
+#endif
+SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
+    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
+
+#ifdef HN_IFSTART_SUPPORT
+/* Use ifnet.if_start instead of ifnet.if_transmit */
+static int			hn_use_if_start = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
+    &hn_use_if_start, 0, "Use if_start TX method");
+#endif
+
+/* # of channels to use */
+static int			hn_chan_cnt = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
+    &hn_chan_cnt, 0,
+    "# of channels to use; each channel has one RX ring and one TX ring");
+
+/* # of transmit rings to use */
+static int			hn_tx_ring_cnt = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
+    &hn_tx_ring_cnt, 0, "# of TX rings to use");
+
+/* Software TX ring depth */
+static int			hn_tx_swq_depth = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
+    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
+
+/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
+#if __FreeBSD_version >= 1100095
+static u_int			hn_lro_mbufq_depth = 0;
+SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
+    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
+#endif
+
+/* Packet transmission aggregation size limit */
+static int			hn_tx_agg_size = -1;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
+    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
+
+/* Packet transmission aggregation count limit */
+static int			hn_tx_agg_pkts = -1;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
+    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
+
+/* VF list */
+SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
+    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
+    hn_vflist_sysctl, "A",
+    "VF list");
+
+/* VF mapping */
+SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
+    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
+    hn_vfmap_sysctl, "A",
+    "VF mapping");
+
+/* Transparent VF mode; boot-time tunable hw.hn.vf_transparent, on by default. */
+static int			hn_xpnt_vf = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
+    &hn_xpnt_vf, 0, "Transparent VF mode");
+
+/* Accurate BPF support for Transparent VF */
+static int			hn_xpnt_vf_accbpf = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
+    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
+
+/* Extra wait for transparent VF attach routing; unit seconds. */
+static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
+SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
+    &hn_xpnt_vf_attwait, 0,
+    "Extra wait for transparent VF attach routing; unit: seconds");
+
+static u_int			hn_cpu_index;	/* next CPU for channel */
+static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
+
+static struct rmlock		hn_vfmap_lock;
+static int			hn_vfmap_size;
+static struct ifnet		**hn_vfmap;
+
+#ifndef RSS
+/*
+ * Built-in Toeplitz hash key of NDIS_HASH_KEYSIZE_TOEPLITZ bytes.  It is
+ * only compiled in when the kernel RSS option is not enabled.
+ */
+static const uint8_t
+hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
+};
+#endif	/* !RSS */
+
+/*
+ * GUID for this driver (NOTE(review): presumably the VMBus device type
+ * matched in hn_probe() -- confirm there).
+ */
+static const struct hyperv_guid	hn_guid = {
+	.hv_guid = {
+	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
+	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
+};
+
+/*
+ * Device method table: routes probe, attach, detach and shutdown to the
+ * hn_* handlers declared above.
+ */
+static device_method_t hn_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe,		hn_probe),
+	DEVMETHOD(device_attach,	hn_attach),
+	DEVMETHOD(device_detach,	hn_detach),
+	DEVMETHOD(device_shutdown,	hn_shutdown),
+	DEVMETHOD_END
+};
+
+/* Driver description: name "hn", its method table and the softc size. */
+static driver_t hn_driver = {
+	"hn",
+	hn_methods,
+	sizeof(struct hn_softc)
+};
+
+static devclass_t hn_devclass;
+
+/*
+ * Register hn as a driver on the vmbus bus, declare module version 1
+ * and a dependency on vmbus version 1.
+ */
+DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
+MODULE_VERSION(hn, 1);
+MODULE_DEPEND(hn, vmbus, 1, 1, 1);
+
+#if __FreeBSD_version >= 1100099
+/*
+ * Store 'lenlim' as the LRO length limit (lro_length_lim) of every RX
+ * ring of 'sc'.
+ */
+static void
+hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
+{
+	int ring = 0;
+
+	while (ring < sc->hn_rx_ring_cnt) {
+		sc->hn_rx_ring[ring].hn_lro.lro_length_lim = lenlim;
+		ring++;
+	}
+}
+#endif
+
+/*
+ * Send 'txd' as an RNDIS data message described by the TX ring's GPA
+ * list (hn_gpa / hn_gpa_cnt).  Returns whatever
+ * hn_nvs_send_rndis_sglist() returns.
+ */
+static int
+hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
+{
+	int error;
+
+	/* A sglist txdesc must not hold a chimney sending buffer. */
+	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
+	    txd->chim_size == 0, ("invalid rndis sglist txd"));
+
+	error = hn_nvs_send_rndis_sglist(txr->hn_chan,
+	    HN_NVS_RNDIS_MTYPE_DATA, &txd->send_ctx, txr->hn_gpa,
+	    txr->hn_gpa_cnt);
+	return (error);
+}
+
+/*
+ * Send 'txd' as an RNDIS data message whose payload already sits in a
+ * chimney sending buffer; only the buffer index and size travel in the
+ * NVS message.  Returns whatever hn_nvs_send() returns.
+ */
+static int
+hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
+{
+	struct hn_nvs_rndis rndis;
+
+	/* A chimney txdesc must hold a valid, non-empty sending buffer. */
+	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
+	    txd->chim_size > 0, ("invalid rndis chim txd"));
+
+	/*
+	 * The whole structure is sent (sizeof(rndis)), so clear it first:
+	 * any member not assigned below, and any padding, must not carry
+	 * leftover kernel stack contents out of the guest.
+	 */
+	memset(&rndis, 0, sizeof(rndis));
+	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
+	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
+	rndis.nvs_chim_idx = txd->chim_index;
+	rndis.nvs_chim_sz = txd->chim_size;
+
+	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
+	    &rndis, sizeof(rndis), &txd->send_ctx));
+}
+
+/*
+ * Claim a free chimney sending buffer.
+ *
+ * Each word of the chimney bitmap is scanned for its lowest clear bit,
+ * which is then claimed with an atomic test-and-set.  If that bit turns
+ * out to be set already, the scan moves on to the next word.
+ *
+ * Returns the claimed buffer index, or HN_NVS_CHIM_IDX_INVALID when no
+ * buffer could be claimed.
+ */
+static __inline uint32_t
+hn_chim_alloc(struct hn_softc *sc)
+{
+	int nwords = sc->hn_chim_bmap_cnt;
+	u_long *words = sc->hn_chim_bmap;
+	int w;
+
+	for (w = 0; w < nwords; ++w) {
+		/* ffsl() is 1-based; 0 means the word has no clear bit. */
+		int bit = ffsl(~words[w]);
+
+		if (bit != 0) {
+			bit -= 1;
+			KASSERT(w * LONG_BIT + bit < sc->hn_chim_cnt,
+			    ("invalid i %d and idx %d", w, bit));
+
+			if (!atomic_testandset_long(&words[w], bit))
+				return (w * LONG_BIT + bit);
+		}
+	}
+	return (HN_NVS_CHIM_IDX_INVALID);
+}
+
+/*
+ * Release chimney sending buffer 'chim_idx' by atomically clearing its
+ * bit in the chimney bitmap.  The index must be within the bitmap and
+ * its bit must currently be set (both are asserted).
+ */
+static __inline void
+hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
+{
+	u_long mask;
+	uint32_t idx;
+
+	/* Bitmap word that holds this buffer's bit. */
+	idx = chim_idx / LONG_BIT;
+	KASSERT(idx < sc->hn_chim_bmap_cnt,
+	    ("invalid chimney index 0x%x", chim_idx));
+
+	/* Bit within that word; idx is unsigned, hence %u below. */
+	mask = 1UL << (chim_idx % LONG_BIT);
+	KASSERT(sc->hn_chim_bmap[idx] & mask,
+	    ("index bitmap 0x%lx, chimney index %u, "
+	     "bitmap idx %u, bitmask 0x%lx",
+	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
+
+	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
+}
+
+#if defined(INET6) || defined(INET)
+
+/*
+ * Make sure the leading mbuf of chain 'm' holds at least 'len' bytes,
+ * calling m_pullup() only when m_len is too short.  'm' is reassigned to
+ * m_pullup()'s result, so the head of the chain may change and pointers
+ * taken into the old head should be reloaded afterwards.  If m_pullup()
+ * returns NULL, the *enclosing function* returns NULL.
+ */
+#define PULLUP_HDR(m, len)				\
+do {							\
+	if (__predict_false((m)->m_len < (len))) {	\
+		(m) = m_pullup((m), (len));		\
+		if ((m) == NULL)			\
+			return (NULL);			\
+	}						\
+} while (0)
+
+/*
+ * Prepare the headers of a TSO packet.
+ *
+ * Records the Ethernet and IP header lengths in the packet header
+ * (l2hlen, l3hlen), zeroes the IP length field (ip_len plus ip_sum for
+ * IPv4, ip6_plen for IPv6) and seeds th_sum with the pseudo-header
+ * checksum.  An IPv6 packet whose next header is not TCP is dropped.
+ *
+ * Returns the (possibly different) head of the chain, or NULL.
+ *
+ * NOTE: If this function failed, the m_head would be freed.
+ */
+static __inline struct mbuf *
+hn_tso_fixup(struct mbuf *m_head)
+{
+	struct ether_vlan_header *evl;
+	struct tcphdr *th;
+	int ehlen;
+
+	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
+
+	PULLUP_HDR(m_head, sizeof(*evl));
+	evl = mtod(m_head, struct ether_vlan_header *);
+	/* evl_encap_proto is in network byte order. */
+	if (evl->evl_encap_proto == htons(ETHERTYPE_VLAN))
+		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
+	else
+		ehlen = ETHER_HDR_LEN;
+	m_head->m_pkthdr.l2hlen = ehlen;
+
+#ifdef INET
+	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
+		struct ip *ip;
+		int iphlen;
+
+		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
+		ip = mtodo(m_head, ehlen);
+		iphlen = ip->ip_hl << 2;
+		m_head->m_pkthdr.l3hlen = iphlen;
+
+		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
+		/*
+		 * m_pullup() may have handed back a different leading
+		 * mbuf, so reload ip before writing through it.
+		 */
+		ip = mtodo(m_head, ehlen);
+		th = mtodo(m_head, ehlen + iphlen);
+
+		ip->ip_len = 0;
+		ip->ip_sum = 0;
+		th->th_sum = in_pseudo(ip->ip_src.s_addr,
+		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
+	}
+#endif
+#if defined(INET6) && defined(INET)
+	else
+#endif
+#ifdef INET6
+	{
+		struct ip6_hdr *ip6;
+
+		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
+		ip6 = mtodo(m_head, ehlen);
+		if (ip6->ip6_nxt != IPPROTO_TCP) {
+			m_freem(m_head);
+			return (NULL);
+		}
+		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
+
+		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
+		/*
+		 * m_pullup() may have handed back a different leading
+		 * mbuf, so reload ip6 before writing through it.
+		 */
+		ip6 = mtodo(m_head, ehlen);
+		th = mtodo(m_head, ehlen + sizeof(*ip6));
+
+		ip6->ip6_plen = 0;
+		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
+	}
+#endif
+	return (m_head);
+}
+
+/*
+ * Record the Ethernet and IP header lengths of a checksum-offload
+ * packet in its packet header (l2hlen, l3hlen).
+ *
+ * For IPv4 UDP datagrams that cannot use checksum offload (see the
+ * comment in the body) the UDP checksum is completed in software and
+ * CSUM_IP_UDP is cleared.  An IPv6 packet whose next header is neither
+ * TCP nor UDP is dropped.
+ *
+ * Returns the (possibly different) head of the chain, or NULL.
+ *
+ * NOTE: If this function failed, the m_head would be freed.
+ */
+static __inline struct mbuf *
+hn_set_hlen(struct mbuf *m_head)
+{
+	const struct ether_vlan_header *evl;
+	int ehlen;
+
+	PULLUP_HDR(m_head, sizeof(*evl));
+	evl = mtod(m_head, const struct ether_vlan_header *);
+	/* evl_encap_proto is in network byte order. */
+	if (evl->evl_encap_proto == htons(ETHERTYPE_VLAN))
+		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
+	else
+		ehlen = ETHER_HDR_LEN;
+	m_head->m_pkthdr.l2hlen = ehlen;
+
+#ifdef INET
+	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
+		const struct ip *ip;
+		int iphlen;
+
+		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
+		ip = mtodo(m_head, ehlen);
+		iphlen = ip->ip_hl << 2;
+		m_head->m_pkthdr.l3hlen = iphlen;
+
+		/*
+		 * UDP checksum offload does not work in Azure, if the
+		 * following conditions meet:
+		 * - sizeof(IP hdr + UDP hdr + payload) exceeds
+		 *   hn_udpcs_fixup_mtu (1420 by default).
+		 * - IP_DF is not set in the IP hdr.
+		 *
+		 * Fallback to software checksum for these UDP datagrams.
+		 *
+		 * NOTE: ip is not used after the pullup below, which may
+		 * replace the leading mbuf.
+		 */
+		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
+		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
+		    (ntohs(ip->ip_off) & IP_DF) == 0) {
+			uint16_t off = ehlen + iphlen;
+			uint16_t csum;
+
+			counter_u64_add(hn_udpcs_fixup, 1);
+			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
+			csum = in_cksum_skip(m_head, m_head->m_pkthdr.len,
+			    off);
+			/*
+			 * A UDP checksum field of zero means "no checksum",
+			 * so a computed zero is transmitted as all ones
+			 * (RFC 768).
+			 */
+			if (csum == 0)
+				csum = 0xffff;
+			*(uint16_t *)(m_head->m_data + off +
+			    m_head->m_pkthdr.csum_data) = csum;
+			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
+		}
+	}
+#endif
+#if defined(INET6) && defined(INET)
+	else
+#endif
+#ifdef INET6
+	{
+		const struct ip6_hdr *ip6;
+
+		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
+		ip6 = mtodo(m_head, ehlen);
+		if (ip6->ip6_nxt != IPPROTO_TCP &&
+		    ip6->ip6_nxt != IPPROTO_UDP) {
+			m_freem(m_head);
+			return (NULL);
+		}
+		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
+	}
+#endif
+	return (m_head);
+}
+
+/*
+ * Report through *tcpsyn whether the packet's TCP header has TH_SYN set
+ * (1) or not (0).  The TCP header is located with the l2hlen/l3hlen
+ * values already stored in the packet header.
+ *
+ * NOTE: If this function failed, the m_head would be freed.
+ */
+static __inline struct mbuf *
+hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
+{
+	const struct tcphdr *tcp;
+	int l2len, l3len;
+
+	l2len = m_head->m_pkthdr.l2hlen;
+	l3len = m_head->m_pkthdr.l3hlen;
+
+	/* Default answer, also left in place if the pullup bails out. */
+	*tcpsyn = 0;
+
+	PULLUP_HDR(m_head, l2len + l3len + sizeof(*tcp));
+	tcp = mtodo(m_head, l2len + l3len);
+	if ((tcp->th_flags & TH_SYN) != 0)
+		*tcpsyn = 1;
+
+	return (m_head);
+}
+
+#undef PULLUP_HDR
+
+#endif	/* INET6 || INET */
+
+/*
+ * Program the RX filter through hn_rndis_set_rxfilter() unless the
+ * requested filter equals the cached one.  The cached value
+ * (sc->hn_rx_filter) is updated only when the update succeeds.
+ *
+ * Returns 0, or the error from hn_rndis_set_rxfilter().
+ */
+static int
+hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
+{
+	int rc;
+
+	HN_LOCK_ASSERT(sc);
+
+	/* Nothing to do if the filter is unchanged. */
+	if (sc->hn_rx_filter == filter)
+		return (0);
+
+	rc = hn_rndis_set_rxfilter(sc, filter);
+	if (rc == 0)
+		sc->hn_rx_filter = filter;
+	return (rc);
+}
+
+/*
+ * Derive the NDIS RX filter from the interface flags and the RXVF state,
+ * then apply it with hn_set_rxfilter().  Returns that call's result.
+ */
+static int
+hn_rxfilter_config(struct hn_softc *sc)
+{
+	struct ifnet *ifp = sc->hn_ifp;
+	uint32_t rxfilter;
+
+	HN_LOCK_ASSERT(sc);
+
+	/*
+	 * If the non-transparent mode VF is activated, we don't know how
+	 * its RX filter is configured, so stick the synthetic device in
+	 * the promiscuous mode.
+	 */
+	if ((ifp->if_flags & IFF_PROMISC) != 0 ||
+	    (sc->hn_flags & HN_FLAG_RXVF) != 0)
+		return (hn_set_rxfilter(sc, NDIS_PACKET_TYPE_PROMISCUOUS));
+
+	rxfilter = NDIS_PACKET_TYPE_DIRECTED;
+	if ((ifp->if_flags & IFF_BROADCAST) != 0)
+		rxfilter |= NDIS_PACKET_TYPE_BROADCAST;
+
+	/* TODO: support multicast list */
+	if ((ifp->if_flags & IFF_ALLMULTI) != 0 ||
+	    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
+		rxfilter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
+
+	return (hn_set_rxfilter(sc, rxfilter));
+}
+
+/*
+ * Compute the TX aggregation limits (size, packet count, alignment) from
+ * the settings stored in the softc and store them into every TX ring
+ * while holding that ring's hn_tx_lock.  A size and packet count of 0
+ * disables aggregation.
+ */
+static void
+hn_set_txagg(struct hn_softc *sc)
+{
+	uint32_t agg_size, agg_pkts;
+	int ring;
+
+	/*
+	 * Aggregation size: a negative setting selects the largest value,
+	 * which is then clamped by the RNDIS limit.
+	 */
+	agg_size = (sc->hn_agg_size < 0) ? UINT32_MAX : sc->hn_agg_size;
+	if (agg_size > sc->hn_rndis_agg_size)
+		agg_size = sc->hn_rndis_agg_size;
+
+	/* NOTE: We only aggregate packets using chimney sending buffers. */
+	if (agg_size > (uint32_t)sc->hn_chim_szmax)
+		agg_size = sc->hn_chim_szmax;
+
+	/* Too small to be useful for aggregation. */
+	if (agg_size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align))
+		goto disable;
+
+	/* NOTE: Type of the per TX ring setting is 'int'. */
+	if (agg_size > INT_MAX)
+		agg_size = INT_MAX;
+
+	/*
+	 * Aggregation packet count, derived the same way as the size.
+	 */
+	agg_pkts = (sc->hn_agg_pkts < 0) ? UINT32_MAX : sc->hn_agg_pkts;
+	if (agg_pkts > sc->hn_rndis_agg_pkts)
+		agg_pkts = sc->hn_rndis_agg_pkts;
+
+	/* Aggregating a single packet makes no sense. */
+	if (agg_pkts <= 1)
+		goto disable;
+
+	/* NOTE: Type of the per TX ring setting is 'short'. */
+	if (agg_pkts > SHRT_MAX)
+		agg_pkts = SHRT_MAX;
+	goto commit;
+
+disable:
+	agg_size = 0;
+	agg_pkts = 0;
+
+commit:
+	/* NOTE: Type of the per TX ring setting is 'short'. */
+	if (sc->hn_rndis_agg_align > SHRT_MAX) {
+		/* The alignment does not fit; disable aggregation. */
+		agg_size = 0;
+		agg_pkts = 0;
+	}
+
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
+		    agg_size, agg_pkts, sc->hn_rndis_agg_align);
+	}
+
+	for (ring = 0; ring < sc->hn_tx_ring_cnt; ++ring) {
+		struct hn_tx_ring *txr = &sc->hn_tx_ring[ring];
+
+		mtx_lock(&txr->hn_tx_lock);
+		txr->hn_agg_szmax = agg_size;
+		txr->hn_agg_pktmax = agg_pkts;
+		txr->hn_agg_align = sc->hn_rndis_agg_align;
+		mtx_unlock(&txr->hn_tx_lock);
+	}
+}
+
+/*
+ * Return the larger of hn_tx_swq_depth and the ring's TX descriptor
+ * count.
+ */
+static int
+hn_get_txswq_depth(const struct hn_tx_ring *txr)
+{
+
+	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
+	return (hn_tx_swq_depth < txr->hn_txdesc_cnt ?
+	    txr->hn_txdesc_cnt : hn_tx_swq_depth);
+}
+
+/*
+ * Reconfigure RSS by disabling it and then enabling it again.
+ *
+ * Returns ENXIO if the synthetic parts are not attached, the error of
+ * the failing hn_rndis_conf_rss() call, or 0 on success.
+ */
+static int
+hn_rss_reconfig(struct hn_softc *sc)
+{
+	int rc;
+
+	HN_LOCK_ASSERT(sc);
+
+	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
+		return (ENXIO);
+
+	/*
+	 * Disable RSS first.
+	 *
+	 * NOTE:
+	 * Direct reconfiguration by setting the UNCHG flags does
+	 * _not_ work properly.
+	 */
+	if (bootverbose)
+		if_printf(sc->hn_ifp, "disable RSS\n");
+	rc = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
+	if (rc != 0) {
+		if_printf(sc->hn_ifp, "RSS disable failed\n");
+		return (rc);
+	}
+
+	/*
+	 * Reenable the RSS w/ the updated RSS key or indirect
+	 * table.
+	 */
+	if (bootverbose)
+		if_printf(sc->hn_ifp, "reconfig RSS\n");
+	rc = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
+	if (rc != 0)
+		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
+	return (rc);
+}
+
+/*
+ * Clamp every entry of the RSS indirect table that refers to a channel
+ * index at or beyond the number of RX rings in use to the last usable
+ * channel, logging each change.
+ */
+static void
+hn_rss_ind_fixup(struct hn_softc *sc)
+{
+	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
+	int idx, nchan;
+
+	nchan = sc->hn_rx_ring_inuse;
+	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
+
+	/*
+	 * Check indirect table to make sure that all channels in it
+	 * can be used.
+	 */
+	for (idx = 0; idx < NDIS_HASH_INDCNT; ++idx) {
+		if (rss->rss_ind[idx] < nchan)
+			continue;
+
+		if_printf(sc->hn_ifp,
+		    "RSS indirect table %d fixup: %u -> %d\n",
+		    idx, rss->rss_ind[idx], nchan - 1);
+		rss->rss_ind[idx] = nchan - 1;
+	}
+}
+
+/*
+ * Media change callback: changing the media is not supported, so
+ * EOPNOTSUPP is always returned.
+ */
+static int
+hn_ifmedia_upd(struct ifnet *ifp __unused)
+{
+
+	return (EOPNOTSUPP);
+}
+
+/*
+ * Media status callback: report a valid ethernet media, marked as an
+ * active 10G-T full-duplex link when HN_LINK_FLAG_LINKUP is set and as
+ * IFM_NONE otherwise.
+ */
+static void
+hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
+{
+	struct hn_softc *sc = ifp->if_softc;
+
+	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) != 0) {
+		ifmr->ifm_status = IFM_AVALID | IFM_ACTIVE;
+		ifmr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX;
+	} else {
+		ifmr->ifm_status = IFM_AVALID;
+		ifmr->ifm_active = IFM_ETHER | IFM_NONE;
+	}
+}
+
+/*
+ * Task body used by hn_rxvf_set(): store the VF ifnet carried in the
+ * argument into the RX ring carried in the same argument.
+ */
+static void
+hn_rxvf_set_task(void *xarg, int pending __unused)
+{
+	struct hn_rxvf_setarg *setarg = xarg;
+	struct hn_rx_ring *rxr = setarg->rxr;
+
+	rxr->hn_rxvf_ifp = setarg->vf_ifp;
+}
+
+/*
+ * Set hn_rxvf_ifp of every RX ring to vf_ifp (which may be NULL).
+ * Rings in use are updated through hn_rxvf_set_task() run on the
+ * ring's channel; the remaining rings are updated directly.
+ */
+static void
+hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
+{
+	struct hn_rxvf_setarg setarg;
+	struct task set_task;
+	int ring;
+
+	HN_LOCK_ASSERT(sc);
+
+	TASK_INIT(&set_task, 0, hn_rxvf_set_task, &setarg);
+
+	for (ring = 0; ring < sc->hn_rx_ring_cnt; ++ring) {
+		struct hn_rx_ring *rxr = &sc->hn_rx_ring[ring];
+
+		if (ring >= sc->hn_rx_ring_inuse) {
+			/* Unused ring; no channel to run the task on. */
+			rxr->hn_rxvf_ifp = vf_ifp;
+			continue;
+		}
+
+		setarg.rxr = rxr;
+		setarg.vf_ifp = vf_ifp;
+		vmbus_chan_run_task(rxr->hn_chan, &set_task);
+	}
+}
+
+/*
+ * Tell whether ifp is the VF of this hn(4) device: a different ethernet
+ * interface, not driven by "lagg" or "vlan", whose link-level address
+ * equals that of the hn(4) interface.
+ */
+static bool
+hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
+{
+	const struct ifnet *hn_ifp = sc->hn_ifp;
+
+	if (ifp == hn_ifp || ifp->if_alloctype != IFT_ETHER)
+		return (false);
+
+	/* Ignore lagg/vlan interfaces */
+	if (strcmp(ifp->if_dname, "lagg") == 0 ||
+	    strcmp(ifp->if_dname, "vlan") == 0)
+		return (false);
+
+	/*
+	 * During detach events ifp->if_addr might be NULL.
+	 * Make sure the bcmp() below doesn't panic on that:
+	 */
+	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
+		return (false);
+
+	return (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) == 0);
+}
+
+/*
+ * Switch the datapath to (rxvf == true) or away from (rxvf == false)
+ * the VF interface ifp.
+ *
+ * Nothing is done if the synthetic parts are not attached, if ifp is
+ * not this device's VF according to hn_ismyvf(), or if HN_FLAG_RXVF
+ * already matches the requested state.  The whole operation runs with
+ * the softc lock held.
+ */
+static void
+hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
+{
+	struct ifnet *hn_ifp;
+
+	HN_LOCK(sc);
+
+	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
+		goto out;
+
+	if (!hn_ismyvf(sc, ifp))
+		goto out;
+	hn_ifp = sc->hn_ifp;
+
+	if (rxvf) {
+		/* Already receiving through the VF. */
+		if (sc->hn_flags & HN_FLAG_RXVF)
+			goto out;
+
+		sc->hn_flags |= HN_FLAG_RXVF;
+		hn_rxfilter_config(sc);
+	} else {
+		/* Not receiving through the VF; nothing to undo. */
+		if (!(sc->hn_flags & HN_FLAG_RXVF))
+			goto out;
+
+		sc->hn_flags &= ~HN_FLAG_RXVF;
+		/* Only keep a real RX filter while the interface is running. */
+		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
+			hn_rxfilter_config(sc);
+		else
+			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
+	}
+
+	hn_nvs_set_datapath(sc,
+	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
+
+	/* Publish (or clear) the VF ifnet on all RX rings. */
+	hn_rxvf_set(sc, rxvf ? ifp : NULL);
+
+	if (rxvf) {
+		hn_vf_rss_fixup(sc, true);
+		hn_suspend_mgmt(sc);
+		/* Report the synthetic link as down while the VF is used. */
+		sc->hn_link_flags &=
+		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
+		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
+	} else {
+		hn_vf_rss_restore(sc);
+		hn_resume_mgmt(sc);
+	}
+
+	/* Notify devctl listeners about the VF state change. */
+	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
+	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
+
+	if (bootverbose) {
+		if_printf(hn_ifp, "datapath is switched %s %s\n",
+		    rxvf ? "to" : "from", ifp->if_xname);
+	}
+out:
+	HN_UNLOCK(sc);
+}
+
+/*
+ * Interface event handler: forward IFNET_EVENT_UP and IFNET_EVENT_DOWN
+ * to hn_rxvf_change(); every other event is ignored.
+ */
+static void
+hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
+{
+
+	if (event == IFNET_EVENT_UP || event == IFNET_EVENT_DOWN)
+		hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
+}
+
+/*
+ * Interface address event handler: call hn_rxvf_change() with the
+ * current IFF_UP state of ifp.
+ */
+static void
+hn_ifaddr_event(void *arg, struct ifnet *ifp)
+{
+	bool is_up = ifp->if_flags & IFF_UP;
+
+	hn_rxvf_change(arg, ifp, is_up);
+}
+
+/*
+ * Pass SIOCSIFCAP to the VF with the requested capabilities limited to
+ * those hn(4) supports, then merge the VF's enabled capabilities and
+ * the matching hwassist bits into hn(4)'s ifnet.
+ *
+ * Returns the result of the VF's SIOCSIFCAP ioctl.
+ */
+static int
+hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
+{
+	struct ifnet *ifp, *vf_ifp;
+	uint64_t assist;
+	int rc;
+
+	HN_LOCK_ASSERT(sc);
+	ifp = sc->hn_ifp;
+	vf_ifp = sc->hn_vf_ifp;
+
+	/*
+	 * Fix up requested capabilities w/ supported capabilities,
+	 * since the supported capabilities could have been changed.
+	 */
+	ifr->ifr_reqcap &= ifp->if_capabilities;
+	/* Pass SIOCSIFCAP to VF. */
+	rc = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
+
+	/*
+	 * NOTE:
+	 * The error will be propagated to the callers, however, it
+	 * is _not_ useful here.
+	 */
+
+	/*
+	 * Merge VF's enabled capabilities.
+	 */
+	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
+
+	/* IPv4 TX checksum offload. */
+	assist = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
+	if ((ifp->if_capenable & IFCAP_TXCSUM) == 0)
+		ifp->if_hwassist &= ~assist;
+	else
+		ifp->if_hwassist |= assist;
+
+	/* IPv6 TX checksum offload. */
+	assist = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
+	if ((ifp->if_capenable & IFCAP_TXCSUM_IPV6) == 0)
+		ifp->if_hwassist &= ~assist;
+	else
+		ifp->if_hwassist |= assist;
+
+	/* IPv4 TSO. */
+	assist = vf_ifp->if_hwassist & CSUM_IP_TSO;
+	if ((ifp->if_capenable & IFCAP_TSO4) == 0)
+		ifp->if_hwassist &= ~assist;
+	else
+		ifp->if_hwassist |= assist;
+
+	/* IPv6 TSO. */
+	assist = vf_ifp->if_hwassist & CSUM_IP6_TSO;
+	if ((ifp->if_capenable & IFCAP_TSO6) == 0)
+		ifp->if_hwassist &= ~assist;
+	else
+		ifp->if_hwassist |= assist;
+
+	return (rc);
+}
+
+/*
+ * Issue SIOCSIFFLAGS on the VF with the VF's own current if_flags, split
+ * into the low and high 16-bit halves of the request.
+ *
+ * Returns the result of the VF's ioctl.
+ */
+static int
+hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
+{
+	struct ifnet *vf_ifp;
+	struct ifreq req;
+	int rc;
+
+	HN_LOCK_ASSERT(sc);
+	vf_ifp = sc->hn_vf_ifp;
+
+	memset(&req, 0, sizeof(req));
+	strlcpy(req.ifr_name, vf_ifp->if_xname, sizeof(req.ifr_name));
+	req.ifr_flags = vf_ifp->if_flags & 0xffff;
+	req.ifr_flagshigh = vf_ifp->if_flags >> 16;
+
+	rc = vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&req);
+	return (rc);
+}
+
+/*
+ * Copy hn(4)'s if_flags to the VF, adding IFF_ALLMULTI when hn(4) has
+ * any multicast address configured.
+ */
+static void
+hn_xpnt_vf_saveifflags(struct hn_softc *sc)
+{
+	struct ifnet *ifp = sc->hn_ifp;
+
+	HN_LOCK_ASSERT(sc);
+
+	/*
+	 * XXX vlan(4) style mcast addr maintenance
+	 *
+	 * Always set the VF's if_flags
+	 */
+	sc->hn_vf_ifp->if_flags = ifp->if_flags |
+	    (CK_STAILQ_EMPTY(&ifp->if_multiaddrs) ? 0 : IFF_ALLMULTI);
+}
+
+/*
+ * Input routine installed on the VF by hn_ifnet_attevent(): hand the
+ * packets received by the VF to the hn(4) interface mapped to the VF's
+ * if_index in hn_vfmap.  If no mapping exists the whole packet chain is
+ * freed.
+ */
+static void
+hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
+{
+	struct rm_priotracker pt;
+	struct ifnet *hn_ifp = NULL;
+	struct mbuf *mn;
+
+	/*
+	 * XXX racy, if hn(4) ever detached.
+	 */
+	/* Look up the hn(4) ifnet for this VF under the map's read lock. */
+	rm_rlock(&hn_vfmap_lock, &pt);
+	if (vf_ifp->if_index < hn_vfmap_size)
+		hn_ifp = hn_vfmap[vf_ifp->if_index];
+	rm_runlock(&hn_vfmap_lock, &pt);
+
+	if (hn_ifp != NULL) {
+		/* Walk every packet linked through m_nextpkt. */
+		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
+			/*
+			 * Allow tapping on the VF.
+			 */
+			ETHER_BPF_MTAP(vf_ifp, mn);
+
+			/*
+			 * Update VF stats.
+			 */
+			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
+				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
+				    mn->m_pkthdr.len);
+			}
+			/*
+			 * XXX IFCOUNTER_IMCAST
+			 * This stat updating is kinda invasive, since it
+			 * requires two checks on the mbuf: the length check
+			 * and the ethernet header check.  As of this write,
+			 * all multicast packets go directly to hn(4), which
+			 * makes imcast stat updating in the VF a try in vain.
+			 */
+
+			/*
+			 * Fix up rcvif and increase hn(4)'s ipackets.
+			 */
+			mn->m_pkthdr.rcvif = hn_ifp;
+			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
+		}
+		/*
+		 * Go through hn(4)'s if_input.
+		 */
+		hn_ifp->if_input(hn_ifp, m);
+	} else {
+		/*
+		 * In the middle of the transition; free this
+		 * mbuf chain.
+		 */
+		while (m != NULL) {
+			/* Detach the packet from the list before freeing it. */
+			mn = m->m_nextpkt;
+			m->m_nextpkt = NULL;
+			m_freem(m);
+			m = mn;
+		}
+	}
+}
+
+/*
+ * Re-derive the settings that depend on the interface MTU: the TSO
+ * maximum size and, on new enough FreeBSD versions, the LRO length
+ * limit when it is below the minimum for the interface.
+ */
+static void
+hn_mtu_change_fixup(struct hn_softc *sc)
+{
+	struct ifnet *ifp = sc->hn_ifp;
+
+	HN_LOCK_ASSERT(sc);
+
+	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
+#if __FreeBSD_version >= 1100099
+	/* Raise the LRO length limit if it is below the minimum. */
+	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
+		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
+#endif
+}
+
+/*
+ * Translate a set of NDIS_HASH_* bits into the corresponding RSS_TYPE_*
+ * bits.  Bits without a mapping below are dropped.
+ */
+static uint32_t
+hn_rss_type_fromndis(uint32_t rss_hash)
+{
+	uint32_t rss_types = 0;
+
+	rss_types |= (rss_hash & NDIS_HASH_IPV4) ? RSS_TYPE_IPV4 : 0;
+	rss_types |= (rss_hash & NDIS_HASH_TCP_IPV4) ? RSS_TYPE_TCP_IPV4 : 0;
+	rss_types |= (rss_hash & NDIS_HASH_IPV6) ? RSS_TYPE_IPV6 : 0;
+	rss_types |= (rss_hash & NDIS_HASH_IPV6_EX) ? RSS_TYPE_IPV6_EX : 0;
+	rss_types |= (rss_hash & NDIS_HASH_TCP_IPV6) ? RSS_TYPE_TCP_IPV6 : 0;
+	rss_types |= (rss_hash & NDIS_HASH_TCP_IPV6_EX) ?
+	    RSS_TYPE_TCP_IPV6_EX : 0;
+	rss_types |= (rss_hash & NDIS_HASH_UDP_IPV4_X) ? RSS_TYPE_UDP_IPV4 : 0;
+
+	return (rss_types);
+}
+
+/*
+ * Translate a set of RSS_TYPE_* bits into the corresponding NDIS_HASH_*
+ * bits.  The UDP over IPv6 types must not be set by the caller.
+ */
+static uint32_t
+hn_rss_type_tondis(uint32_t types)
+{
+	uint32_t ndis_hash = 0;
+
+	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
+	    ("UDP6 and UDP6EX are not supported"));
+
+	ndis_hash |= (types & RSS_TYPE_IPV4) ? NDIS_HASH_IPV4 : 0;
+	ndis_hash |= (types & RSS_TYPE_TCP_IPV4) ? NDIS_HASH_TCP_IPV4 : 0;
+	ndis_hash |= (types & RSS_TYPE_IPV6) ? NDIS_HASH_IPV6 : 0;
+	ndis_hash |= (types & RSS_TYPE_IPV6_EX) ? NDIS_HASH_IPV6_EX : 0;
+	ndis_hash |= (types & RSS_TYPE_TCP_IPV6) ? NDIS_HASH_TCP_IPV6 : 0;
+	ndis_hash |= (types & RSS_TYPE_TCP_IPV6_EX) ?
+	    NDIS_HASH_TCP_IPV6_EX : 0;
+	ndis_hash |= (types & RSS_TYPE_UDP_IPV4) ? NDIS_HASH_UDP_IPV4_X : 0;
+
+	return (ndis_hash);
+}
+
+/*
+ * Store mbuf_hash into hn_mbuf_hash of every RX ring.
+ */
+static void
+hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
+{
+	int ring;
+
+	HN_LOCK_ASSERT(sc);
+
+	for (ring = 0; ring < sc->hn_rx_ring_cnt; ++ring) {
+		struct hn_rx_ring *rxr = &sc->hn_rx_ring[ring];
+
+		rxr->hn_mbuf_hash = mbuf_hash;
+	}
+}
+
+/*
+ * Align the synthetic parts' RSS configuration with the VF's.
+ *
+ * The VF's RSS key and hash types are queried through SIOCGIFRSSKEY and
+ * SIOCGIFRSSHASH.  When both use Toeplitz, the key has the expected
+ * length and the hash types intersect with ours, the VF's key and the
+ * common hash types are stored in the softc and, if reconf is true, RSS
+ * is reconfigured.
+ *
+ * Unless the function returns early because RSS is not in use or
+ * Toeplitz is not supported, the set of hash types delivered through
+ * mbufs on the RX path is updated at the end; it stays empty when any
+ * of the checks above failed.
+ */
+static void
+hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
+{
+	struct ifnet *ifp, *vf_ifp;
+	struct ifrsshash ifrh;
+	struct ifrsskey ifrk;
+	int error;
+	uint32_t my_types, diff_types, mbuf_types = 0;
+
+	HN_LOCK_ASSERT(sc);
+	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
+	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
+
+	if (sc->hn_rx_ring_inuse == 1) {
+		/* No RSS on synthetic parts; done. */
+		return;
+	}
+	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
+		/* Synthetic parts do not support Toeplitz; done. */
+		return;
+	}
+
+	ifp = sc->hn_ifp;
+	vf_ifp = sc->hn_vf_ifp;
+
+	/*
+	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
+	 * supported.
+	 */
+	memset(&ifrk, 0, sizeof(ifrk));
+	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
+	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
+	if (error) {
+		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
+		    vf_ifp->if_xname, error);
+		goto done;
+	}
+	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
+		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
+		    vf_ifp->if_xname, ifrk.ifrk_func);
+		goto done;
+	}
+	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
+		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
+		    vf_ifp->if_xname, ifrk.ifrk_keylen);
+		goto done;
+	}
+
+	/*
+	 * Extract VF's RSS hash.  Only Toeplitz is supported.
+	 */
+	memset(&ifrh, 0, sizeof(ifrh));
+	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
+	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
+	if (error) {
+		/* Name the ioctl that was actually issued. */
+		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
+		    vf_ifp->if_xname, error);
+		goto done;
+	}
+	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
+		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
+		    vf_ifp->if_xname, ifrh.ifrh_func);
+		goto done;
+	}
+
+	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
+	if ((ifrh.ifrh_types & my_types) == 0) {
+		/* This disables RSS; ignore it then */
+		if_printf(ifp, "%s intersection of RSS types failed.  "
+		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
+		    ifrh.ifrh_types, my_types);
+		goto done;
+	}
+
+	/*
+	 * diff_types: types supported by only one side.
+	 * my_types:   types supported by both sides from here on.
+	 */
+	diff_types = my_types ^ ifrh.ifrh_types;
+	my_types &= ifrh.ifrh_types;
+	mbuf_types = my_types;
+
+	/*
+	 * Detect RSS hash value/type conflict.
+	 *
+	 * NOTE:
+	 * We don't disable the hash type, but stop delivering the hash
+	 * value/type through mbufs on RX path.
+	 *
+	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
+	 * hash is delivered with type of TCP_IPV4.  This means if
+	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
+	 * least to hn_mbuf_hash.  However, given that _all_ of the
+	 * NICs implement TCP_IPV4, this will _not_ impose any issues
+	 * here.
+	 */
+	if ((my_types & RSS_TYPE_IPV4) &&
+	    (diff_types & ifrh.ifrh_types &
+	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
+		/* Conflict; disable IPV4 hash type/value delivery. */
+		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
+		mbuf_types &= ~RSS_TYPE_IPV4;
+	}
+	if ((my_types & RSS_TYPE_IPV6) &&
+	    (diff_types & ifrh.ifrh_types &
+	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
+	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
+	      RSS_TYPE_IPV6_EX))) {
+		/* Conflict; disable IPV6 hash type/value delivery. */
+		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
+		mbuf_types &= ~RSS_TYPE_IPV6;
+	}
+	if ((my_types & RSS_TYPE_IPV6_EX) &&
+	    (diff_types & ifrh.ifrh_types &
+	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
+	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
+	      RSS_TYPE_IPV6))) {
+		/* Conflict; disable IPV6_EX hash type/value delivery. */
+		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
+		mbuf_types &= ~RSS_TYPE_IPV6_EX;
+	}
+	if ((my_types & RSS_TYPE_TCP_IPV6) &&
+	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
+		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
+		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
+		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
+	}
+	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
+	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
+		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
+		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
+		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
+	}
+	if ((my_types & RSS_TYPE_UDP_IPV6) &&
+	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
+		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
+		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
+		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
+	}
+	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
+	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
+		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
+		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
+		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
+	}
+
+	/*
+	 * Indirect table does not matter.
+	 */
+
+	/* Adopt the common hash types and the VF's key. */
+	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
+	    hn_rss_type_tondis(my_types);
+	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
+	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
+
+	if (reconf) {
+		error = hn_rss_reconfig(sc);
+		if (error) {
+			/* XXX roll-back? */
+			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
+			/* XXX keep going. */
+		}
+	}
+done:
+	/* Hash deliverability for mbufs. */
+	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
+}
+
+/*
+ * Restore the synthetic parts' own RSS hash types (reconfiguring RSS if
+ * they differ from the current ones and more than one RX ring is in
+ * use) and allow all hash types to be delivered through mbufs again.
+ */
+static void
+hn_vf_rss_restore(struct hn_softc *sc)
+{
+	int rc;
+
+	HN_LOCK_ASSERT(sc);
+	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
+	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
+
+	/*
+	 * Restore hash types.  Key does _not_ matter.
+	 */
+	if (sc->hn_rx_ring_inuse != 1 &&
+	    sc->hn_rss_hash != sc->hn_rss_hcap) {
+		sc->hn_rss_hash = sc->hn_rss_hcap;
+		rc = hn_rss_reconfig(sc);
+		if (rc != 0) {
+			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
+			    rc);
+			/* XXX keep going. */
+		}
+	}
+
+	/* Hash deliverability for mbufs. */
+	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
+}
+
+/*
+ * Mark the transparent mode VF as ready and adapt hn(4) to it:
+ * - save hn(4)'s capabilities and TSO limits for later restoration,
+ * - reduce them to what both hn(4) and the VF support,
+ * - push the enabled capabilities to the VF,
+ * - if hn(4)'s MTU is not ETHERMTU, set the same MTU on the VF.
+ */
+static void
+hn_xpnt_vf_setready(struct hn_softc *sc)
+{
+	struct ifnet *ifp, *vf_ifp;
+	struct ifreq ifr;
+
+	HN_LOCK_ASSERT(sc);
+	ifp = sc->hn_ifp;
+	vf_ifp = sc->hn_vf_ifp;
+
+	/*
+	 * Mark the VF ready.
+	 */
+	sc->hn_vf_rdytick = 0;
+
+	/*
+	 * Save information for restoration.
+	 */
+	sc->hn_saved_caps = ifp->if_capabilities;
+	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
+	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
+	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
+
+	/*
+	 * Intersect supported/enabled capabilities.
+	 *
+	 * NOTE:
+	 * if_hwassist is not changed here.
+	 */
+	ifp->if_capabilities &= vf_ifp->if_capabilities;
+	ifp->if_capenable &= ifp->if_capabilities;
+
+	/*
+	 * Fix TSO settings.
+	 */
+	/* Each limit becomes the smaller of hn(4)'s and the VF's value. */
+	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
+		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
+	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
+		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
+	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
+		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
+
+	/*
+	 * Change VF's enabled capabilities.
+	 */
+	memset(&ifr, 0, sizeof(ifr));
+	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
+	ifr.ifr_reqcap = ifp->if_capenable;
+	/* The result of the capability ioctl is ignored here. */
+	hn_xpnt_vf_iocsetcaps(sc, &ifr);
+
+	if (ifp->if_mtu != ETHERMTU) {
+		int error;
+
+		/*
+		 * Change VF's MTU.
+		 */
+		memset(&ifr, 0, sizeof(ifr));
+		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
+		ifr.ifr_mtu = ifp->if_mtu;
+		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
+		if (error) {
+			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
+			    vf_ifp->if_xname, ifp->if_mtu);
+			/* Fall back to ETHERMTU only when the MTU was larger. */
+			if (ifp->if_mtu > ETHERMTU) {
+				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
+
+				/*
+				 * XXX
+				 * No need to adjust the synthetic parts' MTU;
+				 * failure of the adjustment will cause us
+				 * infinite headache.
+				 */
+				ifp->if_mtu = ETHERMTU;
+				hn_mtu_change_fixup(sc);
+			}
+		}
+	}
+}
+
+/*
+ * Tell whether the transparent mode VF is ready for use.
+ *
+ * A VF whose ready tick has been reached but which was not marked ready
+ * yet is marked ready here through hn_xpnt_vf_setready().
+ */
+static bool
+hn_xpnt_vf_isready(struct hn_softc *sc)
+{
+
+	HN_LOCK_ASSERT(sc);
+
+	/* Transparent mode is off or no VF is attached. */
+	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
+		return (false);
+
+	/* A ready tick of 0 means the VF was already marked ready. */
+	if (sc->hn_vf_rdytick != 0) {
+		if (sc->hn_vf_rdytick > ticks)
+			return (false);
+
+		/* Mark VF as ready. */
+		hn_xpnt_vf_setready(sc);
+	}
+	return (true);
+}
+
+/*
+ * Mark the transparent mode VF as enabled: set HN_XVFFLAG_ENABLED under
+ * hn_vf_lock and HN_RX_FLAG_XPNT_VF on every RX ring.
+ */
+static void
+hn_xpnt_vf_setenable(struct hn_softc *sc)
+{
+	int ring;
+
+	HN_LOCK_ASSERT(sc);
+
+	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
+	rm_wlock(&sc->hn_vf_lock);
+	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
+	rm_wunlock(&sc->hn_vf_lock);
+
+	for (ring = 0; ring < sc->hn_rx_ring_cnt; ++ring) {
+		struct hn_rx_ring *rxr = &sc->hn_rx_ring[ring];
+
+		rxr->hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
+	}
+}
+
+/*
+ * Mark the transparent mode VF as disabled: clear HN_XVFFLAG_ENABLED
+ * (and, if clear_vf is true, hn_vf_ifp) under hn_vf_lock, then clear
+ * HN_RX_FLAG_XPNT_VF on every RX ring.
+ */
+static void
+hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
+{
+	int ring;
+
+	HN_LOCK_ASSERT(sc);
+
+	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
+	rm_wlock(&sc->hn_vf_lock);
+	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
+	if (clear_vf)
+		sc->hn_vf_ifp = NULL;
+	rm_wunlock(&sc->hn_vf_lock);
+
+	for (ring = 0; ring < sc->hn_rx_ring_cnt; ++ring) {
+		struct hn_rx_ring *rxr = &sc->hn_rx_ring[ring];
+
+		rxr->hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
+	}
+}
+
+/*
+ * Bring the transparent mode VF up and switch over to it: copy hn(4)'s
+ * interface flags to the VF with IFF_UP set, switch the datapath to the
+ * VF, fix up RSS and finally mark the VF as enabled.
+ *
+ * If bringing the VF up fails, the function returns without switching
+ * the datapath or marking the VF enabled.
+ */
+static void
+hn_xpnt_vf_init(struct hn_softc *sc)
+{
+	int error;
+
+	HN_LOCK_ASSERT(sc);
+
+	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
+	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
+
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "try bringing up %s\n",
+		    sc->hn_vf_ifp->if_xname);
+	}
+
+	/*
+	 * Bring the VF up.
+	 */
+	hn_xpnt_vf_saveifflags(sc);
+	sc->hn_vf_ifp->if_flags |= IFF_UP;
+	error = hn_xpnt_vf_iocsetflags(sc);
+	if (error) {
+		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
+		    sc->hn_vf_ifp->if_xname, error);
+		return;
+	}
+
+	/*
+	 * NOTE:
+	 * Datapath setting must happen _after_ bringing the VF up.
+	 */
+	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
+
+	/*
+	 * NOTE:
+	 * Fixup RSS related bits _after_ the VF is brought up, since
+	 * many VFs generate RSS key during its initialization.
+	 */
+	hn_vf_rss_fixup(sc, true);
+
+	/* Mark transparent mode VF as enabled. */
+	hn_xpnt_vf_setenable(sc);
+}
+
+/*
+ * Task function for the delayed initialization of the transparent mode
+ * VF.  It only acts when the synthetic parts are attached, a VF is
+ * present and the VF is not enabled yet: the VF is marked ready if
+ * needed and, when hn(4) is running, initialized.
+ */
+static void
+hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
+{
+	struct hn_softc *sc = xsc;
+
+	HN_LOCK(sc);
+
+	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) != 0 &&
+	    sc->hn_vf_ifp != NULL &&
+	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0) {
+		/* Mark VF as ready. */
+		if (sc->hn_vf_rdytick != 0)
+			hn_xpnt_vf_setready(sc);
+
+		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
+			/*
+			 * Delayed VF initialization.
+			 */
+			if (bootverbose) {
+				if_printf(sc->hn_ifp,
+				    "delayed initialize %s\n",
+				    sc->hn_vf_ifp->if_xname);
+			}
+			hn_xpnt_vf_init(sc);
+		}
+	}
+
+	HN_UNLOCK(sc);
+}
+
+/*
+ * Handle the appearance of interface ifp: if it is this device's VF,
+ * record it in hn_vfmap (growing the map when needed) and in
+ * sc->hn_vf_ifp.  In transparent VF mode the VF's if_input is also
+ * replaced by hn_xpnt_vf_input(), link status management is suspended
+ * and the delayed VF initialization task is scheduled.
+ *
+ * Nothing is done if the synthetic parts are not attached, ifp is not
+ * our VF, a VF is already recorded, or (transparent mode only) the VF
+ * uses if_start.
+ */
+static void
+hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
+{
+	struct hn_softc *sc = xsc;
+
+	HN_LOCK(sc);
+
+	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
+		goto done;
+
+	if (!hn_ismyvf(sc, ifp))
+		goto done;
+
+	if (sc->hn_vf_ifp != NULL) {
+		if_printf(sc->hn_ifp, "%s was attached as VF\n",
+		    sc->hn_vf_ifp->if_xname);
+		goto done;
+	}
+
+	if (hn_xpnt_vf && ifp->if_start != NULL) {
+		/*
+		 * ifnet.if_start is _not_ supported by transparent
+		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
+		 */
+		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
+		    "in transparent VF mode.\n", ifp->if_xname);
+		goto done;
+	}
+
+	rm_wlock(&hn_vfmap_lock);
+
+	/* Grow the VF map if the VF's if_index does not fit. */
+	if (ifp->if_index >= hn_vfmap_size) {
+		struct ifnet **newmap;
+		int newsize;
+
+		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
+		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
+		    M_WAITOK | M_ZERO);
+
+		/* Carry the existing entries over to the new map. */
+		memcpy(newmap, hn_vfmap,
+		    sizeof(struct ifnet *) * hn_vfmap_size);
+		free(hn_vfmap, M_DEVBUF);
+		hn_vfmap = newmap;
+		hn_vfmap_size = newsize;
+	}
+	KASSERT(hn_vfmap[ifp->if_index] == NULL,
+	    ("%s: ifindex %d was mapped to %s",
+	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
+	/* Map the VF's if_index to our hn(4) ifnet. */
+	hn_vfmap[ifp->if_index] = sc->hn_ifp;
+
+	rm_wunlock(&hn_vfmap_lock);
+
+	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
+	rm_wlock(&sc->hn_vf_lock);
+	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
+	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
+	sc->hn_vf_ifp = ifp;
+	rm_wunlock(&sc->hn_vf_lock);
+
+	if (hn_xpnt_vf) {
+		int wait_ticks;
+
+		/*
+		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
+		 * Save vf_ifp's current if_input for later restoration.
+		 */
+		sc->hn_vf_input = ifp->if_input;
+		ifp->if_input = hn_xpnt_vf_input;
+
+		/*
+		 * Stop link status management; use the VF's.
+		 */
+		hn_suspend_mgmt(sc);
+
+		/*
+		 * Give VF sometime to complete its attach routine.
+		 */
+		wait_ticks = hn_xpnt_vf_attwait * hz;
+		sc->hn_vf_rdytick = ticks + wait_ticks;
+
+		/* Run the delayed VF initialization after the wait. */
+		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
+		    wait_ticks);
+	}
+done:
+	HN_UNLOCK(sc);
+}
+
+/*
+ * Handle the departure of interface ifp: if it is this device's VF,
+ * undo what hn_ifnet_attevent() set up.  In transparent VF mode the
+ * VF's original if_input is restored, the datapath is switched back to
+ * the synthetic parts if the VF was enabled, and saved capabilities,
+ * TSO limits, RSS settings and link status management are restored.
+ * Finally the VF is marked disabled, sc->hn_vf_ifp is cleared and the
+ * VF's entry is removed from hn_vfmap.
+ */
+static void
+hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
+{
+	struct hn_softc *sc = xsc;
+
+	HN_LOCK(sc);
+
+	if (sc->hn_vf_ifp == NULL)
+		goto done;
+
+	if (!hn_ismyvf(sc, ifp))
+		goto done;
+
+	if (hn_xpnt_vf) {
+		/*
+		 * Make sure that the delayed initialization is not running.
+		 *
+		 * NOTE:
+		 * - This lock _must_ be released, since the hn_vf_init task
+		 *   will try holding this lock.
+		 * - It is safe to release this lock here, since the
+		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
+		 *
+		 * XXX racy, if hn(4) ever detached.
+		 */
+		HN_UNLOCK(sc);
+		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
+		HN_LOCK(sc);
+
+		/* Give the VF its original input routine back. */
+		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
+		    sc->hn_ifp->if_xname));
+		ifp->if_input = sc->hn_vf_input;
+		sc->hn_vf_input = NULL;
+
+		/* Switch back to the synthetic datapath if the VF was in use. */
+		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
+		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
+			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
+
+		if (sc->hn_vf_rdytick == 0) {
+			/*
+			 * The VF was ready; restore some settings.
+			 */
+			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
+			/*
+			 * NOTE:
+			 * There is _no_ need to fixup if_capenable and
+			 * if_hwassist, since the if_capabilities before
+			 * restoration was an intersection of the VF's
+			 * if_capabilities and the synthetic device's
+			 * if_capabilities.
+			 */
+			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
+			sc->hn_ifp->if_hw_tsomaxsegcount =
+			    sc->hn_saved_tsosegcnt;
+			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
+		}
+
+		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
+			/*
+			 * Restore RSS settings.
+			 */
+			hn_vf_rss_restore(sc);
+
+			/*
+			 * Resume link status management, which was suspended
+			 * by hn_ifnet_attevent().
+			 */
+			hn_resume_mgmt(sc);
+		}
+	}
+
+	/* Mark transparent mode VF as disabled. */
+	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
+
+	rm_wlock(&hn_vfmap_lock);
+
+	/* Drop the VF's if_index -> hn(4) mapping, if one exists. */
+	KASSERT(ifp->if_index < hn_vfmap_size,
+	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
+	if (hn_vfmap[ifp->if_index] != NULL) {
+		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
+		    ("%s: ifindex %d was mapped to %s",
+		     ifp->if_xname, ifp->if_index,
+		     hn_vfmap[ifp->if_index]->if_xname));
+		hn_vfmap[ifp->if_index] = NULL;
+	}
+
+	rm_wunlock(&hn_vfmap_lock);
+done:
+	HN_UNLOCK(sc);
+}
+
+/*
+ * Link event handler: propagate a link state change of our VF to the
+ * hn(4) interface.  Events of other interfaces are ignored.
+ */
+static void
+hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
+{
+	struct hn_softc *sc = xsc;
+
+	if (sc->hn_vf_ifp != ifp)
+		return;
+	if_link_state_change(sc->hn_ifp, link_state);
+}
+
+/*
+ * Device probe: claim the device with BUS_PROBE_DEFAULT when its GUID
+ * matches hn_guid, otherwise return ENXIO.
+ */
+static int
+hn_probe(device_t dev)
+{
+
+	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) != 0)
+		return (ENXIO);
+
+	device_set_desc(dev, "Hyper-V Network Interface");
+	return (BUS_PROBE_DEFAULT);
+}
+
+static int
+hn_attach(device_t dev)
+{
+	struct hn_softc *sc = device_get_softc(dev);
+	struct sysctl_oid_list *child;
+	struct sysctl_ctx_list *ctx;
+	uint8_t eaddr[ETHER_ADDR_LEN];
+	struct ifnet *ifp = NULL;
+	int error, ring_cnt, tx_ring_cnt;
+	uint32_t mtu;
+
+	sc->hn_dev = dev;
+	sc->hn_prichan = vmbus_get_channel(dev);
+	HN_LOCK_INIT(sc);
+	rm_init(&sc->hn_vf_lock, "hnvf");
+	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
+		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
+
+	/*
+	 * Initialize these tunables once.
+	 */
+	sc->hn_agg_size = hn_tx_agg_size;
+	sc->hn_agg_pkts = hn_tx_agg_pkts;
+
+	/*
+	 * Setup taskqueue for transmission.
+	 */
+	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
+		int i;
+
+		sc->hn_tx_taskqs =
+		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
+		    M_DEVBUF, M_WAITOK);
+		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
+			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
+			    M_WAITOK, taskqueue_thread_enqueue,
+			    &sc->hn_tx_taskqs[i]);
+			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
+			    "%s tx%d", device_get_nameunit(dev), i);
+		}
+	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
+		sc->hn_tx_taskqs = hn_tx_taskque;
+	}
+
+	/*
+	 * Setup taskqueue for mangement tasks, e.g. link status.
+	 */
+	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
+	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
+	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
+	    device_get_nameunit(dev));
+	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
+	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
+	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
+	    hn_netchg_status_taskfunc, sc);
+
+	if (hn_xpnt_vf) {
+		/*
+		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
+		 */
+		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
+		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
+		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
+		    device_get_nameunit(dev));
+		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
+		    hn_xpnt_vf_init_taskfunc, sc);
+	}
+
+	/*
+	 * Allocate ifnet and setup its name earlier, so that if_printf
+	 * can be used by functions, which will be called after
+	 * ether_ifattach().
+	 */
+	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
+	ifp->if_softc = sc;
+	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
+
+	/*
+	 * Initialize ifmedia earlier so that it can be unconditionally
+	 * destroyed, if error happened later on.
+	 */
+	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
+
+	/*
+	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
+	 * to use (tx_ring_cnt).
+	 *
+	 * NOTE:
+	 * The # of RX rings to use is same as the # of channels to use.
+	 */
+	ring_cnt = hn_chan_cnt;
+	if (ring_cnt <= 0) {
+		/* Default */
+		ring_cnt = mp_ncpus;
+		if (ring_cnt > HN_RING_CNT_DEF_MAX)
+			ring_cnt = HN_RING_CNT_DEF_MAX;
+	} else if (ring_cnt > mp_ncpus) {
+		ring_cnt = mp_ncpus;
+	}
+#ifdef RSS
+	if (ring_cnt > rss_getnumbuckets())
+		ring_cnt = rss_getnumbuckets();
+#endif
+
+	tx_ring_cnt = hn_tx_ring_cnt;
+	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
+		tx_ring_cnt = ring_cnt;
+#ifdef HN_IFSTART_SUPPORT
+	if (hn_use_if_start) {
+		/* ifnet.if_start only needs one TX ring. */
+		tx_ring_cnt = 1;
+	}
+#endif
+
+	/*
+	 * Set the leader CPU for channels.
+	 */
+	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
+
+	/*
+	 * Create enough TX/RX rings, even if only limited number of
+	 * channels can be allocated.
+	 */
+	error = hn_create_tx_data(sc, tx_ring_cnt);
+	if (error)
+		goto failed;
+	error = hn_create_rx_data(sc, ring_cnt);
+	if (error)
+		goto failed;
+
+	/*
+	 * Create transaction context for NVS and RNDIS transactions.
+	 */
+	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
+	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
+	if (sc->hn_xact == NULL) {
+		error = ENXIO;
+		goto failed;
+	}
+
+	/*
+	 * Install orphan handler for the revocation of this device's
+	 * primary channel.
+	 *
+	 * NOTE:
+	 * The processing order is critical here:
+	 * Install the orphan handler, _before_ testing whether this
+	 * device's primary channel has been revoked or not.
+	 */
+	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
+	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
+		error = ENXIO;
+		goto failed;
+	}
+
+	/*
+	 * Attach the synthetic parts, i.e. NVS and RNDIS.
+	 */
+	error = hn_synth_attach(sc, ETHERMTU);
+	if (error)
+		goto failed;
+
+	error = hn_rndis_get_eaddr(sc, eaddr);
+	if (error)
+		goto failed;
+
+	error = hn_rndis_get_mtu(sc, &mtu);
+	if (error)
+		mtu = ETHERMTU;
+	else if (bootverbose)
+		device_printf(dev, "RNDIS mtu %u\n", mtu);
+
+#if __FreeBSD_version >= 1100099
+	if (sc->hn_rx_ring_inuse > 1) {
+		/*
+		 * Reduce TCP segment aggregation limit for multiple
+		 * RX rings to increase ACK timeliness.
+		 */
+		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
+	}
+#endif
+
+	/*
+	 * Fixup TX/RX stuffs after synthetic parts are attached.
+	 */
+	hn_fixup_tx_data(sc);
+	hn_fixup_rx_data(sc);
+
+	ctx = device_get_sysctl_ctx(dev);
+	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
+	    &sc->hn_nvs_ver, 0, "NVS version");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
+	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+	    hn_ndis_version_sysctl, "A", "NDIS version");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
+	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+	    hn_caps_sysctl, "A", "capabilities");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
+	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+	    hn_hwassist_sysctl, "A", "hwassist");
+	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
+	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
+	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
+	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
+	    "max # of TSO segments");
+	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
+	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
+	    "max size of TSO segment");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
+	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+	    hn_rxfilter_sysctl, "A", "rxfilter");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
+	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+	    hn_rss_hash_sysctl, "A", "RSS hash");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
+	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
+	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
+	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
+#ifndef RSS
+	/*
+	 * Don't allow RSS key/indirect table changes, if RSS is defined.
+	 */
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
+	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+	    hn_rss_key_sysctl, "IU", "RSS key");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
+	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
+#endif
+	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
+	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
+	    "RNDIS offered packet transmission aggregation size limit");
+	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
+	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
+	    "RNDIS offered packet transmission aggregation count limit");
+	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
+	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
+	    "RNDIS packet transmission aggregation alignment");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+	    hn_txagg_size_sysctl, "I",
+	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+	    hn_txagg_pkts_sysctl, "I",
+	    "Packet transmission aggregation packets, "
+	    "0 -- disable, -1 -- auto");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
+	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+	    hn_polling_sysctl, "I",
+	    "Polling frequency: [100,1000000], 0 disable polling");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
+	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+	    hn_vf_sysctl, "A", "Virtual Function's name");
+	if (!hn_xpnt_vf) {
+		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
+		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
+	} else {
+		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
+		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+		    hn_xpnt_vf_enabled_sysctl, "I",
+		    "Transparent VF enabled");
+		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
+		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+		    hn_xpnt_vf_accbpf_sysctl, "I",
+		    "Accurate BPF for transparent VF");
+	}
+
+	/*
+	 * Setup the ifmedia, which has been initialized earlier.
+	 */
+	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
+	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
+	/* XXX ifmedia_set really should do this for us */
+	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
+
+	/*
+	 * Setup the ifnet for this interface.
+	 */
+
+	ifp->if_baudrate = IF_Gbps(10);
+	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+	ifp->if_ioctl = hn_ioctl;
+	ifp->if_init = hn_init;
+#ifdef HN_IFSTART_SUPPORT
+	if (hn_use_if_start) {
+		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
+
+		ifp->if_start = hn_start;
+		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
+		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
+		IFQ_SET_READY(&ifp->if_snd);
+	} else
+#endif
+	{
+		ifp->if_transmit = hn_transmit;
+		ifp->if_qflush = hn_xmit_qflush;
+	}
+
+	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
+#ifdef foo
+	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
+	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
+#endif
+	if (sc->hn_caps & HN_CAP_VLAN) {
+		/* XXX not sure about VLAN_MTU. */
+		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
+	}
+
+	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
+	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
+		ifp->if_capabilities |= IFCAP_TXCSUM;
+	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
+		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
+	if (sc->hn_caps & HN_CAP_TSO4) {
+		ifp->if_capabilities |= IFCAP_TSO4;
+		ifp->if_hwassist |= CSUM_IP_TSO;
+	}
+	if (sc->hn_caps & HN_CAP_TSO6) {
+		ifp->if_capabilities |= IFCAP_TSO6;
+		ifp->if_hwassist |= CSUM_IP6_TSO;
+	}
+
+	/* Enable all available capabilities by default. */
+	ifp->if_capenable = ifp->if_capabilities;
+
+	/*
+	 * Disable IPv6 TSO and TXCSUM by default, they still can
+	 * be enabled through SIOCSIFCAP.
+	 */
+	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
+	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
+
+	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
+		/*
+		 * Lock hn_set_tso_maxsize() to simplify its
+		 * internal logic.
+		 */
+		HN_LOCK(sc);
+		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
+		HN_UNLOCK(sc);
+		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
+		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
+	}
+
+	ether_ifattach(ifp, eaddr);
+
+	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
+		if_printf(ifp, "TSO segcnt %u segsz %u\n",
+		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
+	}
+	if (mtu < ETHERMTU) {
+		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
+		ifp->if_mtu = mtu;
+	}
+
+	/* Inform the upper layer about the long frame support. */
+	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
+
+	/*
+	 * Kick off link status check.
+	 */
+	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
+	hn_update_link_status(sc);
+
+	if (!hn_xpnt_vf) {
+		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
+		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
+		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
+		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
+	} else {
+		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
+		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
+	}
+
+	/*
+	 * NOTE:
+	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
+	 * since interface's LLADDR is needed; interface LLADDR is not
+	 * available when ifnet_arrival event is triggered.
+	 */
+	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
+	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
+	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
+	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
+
+	return (0);
+failed:
+	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
+		hn_synth_detach(sc);
+	hn_detach(dev);
+	return (error);
+}
+
+/*
+ * Device detach method.  hn_attach() also routes its failure path through
+ * this function, so it has to cope with a partially initialized softc.
+ * Always returns 0.
+ */
+static int
+hn_detach(device_t dev)
+{
+	struct hn_softc *sc = device_get_softc(dev);
+	struct ifnet *ifp = sc->hn_ifp;
+	struct ifnet *vf;
+
+	/*
+	 * The vmbus may have missed the orphan handler installation;
+	 * if the primary channel is already revoked, orphan the
+	 * transaction context by hand.
+	 */
+	if (sc->hn_xact != NULL) {
+		if (vmbus_chan_is_revoked(sc->hn_prichan))
+			vmbus_xact_ctx_orphan(sc->hn_xact);
+	}
+
+	/* Deregister whichever event handlers have been registered. */
+	if (sc->hn_ifaddr_evthand != NULL) {
+		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
+	}
+	if (sc->hn_ifnet_evthand != NULL) {
+		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
+	}
+	if (sc->hn_ifnet_atthand != NULL) {
+		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
+		    sc->hn_ifnet_atthand);
+	}
+	if (sc->hn_ifnet_dethand != NULL) {
+		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
+		    sc->hn_ifnet_dethand);
+	}
+	if (sc->hn_ifnet_lnkhand != NULL) {
+		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
+	}
+
+	/* Take one snapshot of the VF ifnet and detach from it, if any. */
+	vf = sc->hn_vf_ifp;
+	__compiler_membar();
+	if (vf != NULL)
+		hn_ifnet_detevent(sc, vf);
+
+	if (device_is_attached(dev)) {
+		HN_LOCK(sc);
+		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) != 0) {
+			if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0)
+				hn_stop(sc, true);
+			/*
+			 * NOTE:
+			 * hn_stop() only suspends data, so management
+			 * stuffs have to be suspended manually here.
+			 */
+			hn_suspend_mgmt(sc);
+			hn_synth_detach(sc);
+		}
+		HN_UNLOCK(sc);
+		ether_ifdetach(ifp);
+	}
+
+	ifmedia_removeall(&sc->hn_media);
+	hn_destroy_rx_data(sc);
+	hn_destroy_tx_data(sc);
+
+	/* Only per-device TX taskqueues are freed; the global ones are not. */
+	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
+		int idx;
+
+		for (idx = 0; idx < hn_tx_taskq_cnt; ++idx)
+			taskqueue_free(sc->hn_tx_taskqs[idx]);
+		free(sc->hn_tx_taskqs, M_DEVBUF);
+	}
+	taskqueue_free(sc->hn_mgmt_taskq0);
+	if (sc->hn_vf_taskq != NULL)
+		taskqueue_free(sc->hn_vf_taskq);
+
+	if (sc->hn_xact != NULL) {
+		/*
+		 * The orphan handler must be uninstalled _before_ the
+		 * xact is destructed.
+		 */
+		vmbus_chan_unset_orphan(sc->hn_prichan);
+		vmbus_xact_ctx_destroy(sc->hn_xact);
+	}
+
+	if_free(ifp);
+
+	HN_LOCK_DESTROY(sc);
+	rm_destroy(&sc->hn_vf_lock);
+	return (0);
+}
+
+/*
+ * Device shutdown method: there is nothing to do, success is always
+ * reported.
+ */
+static int
+hn_shutdown(device_t dev __unused)
+{
+
+	return (0);
+}
+
+/*
+ * Query the link status through RNDIS, mirror it into HN_LINK_FLAG_LINKUP
+ * and report the resulting state to the network stack.  Nothing is changed
+ * if the query fails.
+ */
+static void
+hn_link_status(struct hn_softc *sc)
+{
+	uint32_t media_state;
+	int link_state;
+
+	if (hn_rndis_get_linkstatus(sc, &media_state)) {
+		/* XXX what to do? */
+		return;
+	}
+
+	if (media_state != NDIS_MEDIA_STATE_CONNECTED)
+		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
+	else
+		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
+
+	if (sc->hn_link_flags & HN_LINK_FLAG_LINKUP)
+		link_state = LINK_STATE_UP;
+	else
+		link_state = LINK_STATE_DOWN;
+	if_link_state_change(sc->hn_ifp, link_state);
+}
+
+/*
+ * Task handler for hn_link_task: refresh the link status, unless link
+ * status checks are currently blocked by HN_LINK_FLAG_NETCHG.
+ */
+static void
+hn_link_taskfunc(void *xsc, int pending __unused)
+{
+	struct hn_softc *softc = xsc;
+
+	if ((softc->hn_link_flags & HN_LINK_FLAG_NETCHG) == 0)
+		hn_link_status(softc);
+}
+
+/*
+ * Task handler for hn_netchg_init: start the handling of a network change
+ * by reporting link down and scheduling the delayed status re-check.
+ */
+static void
+hn_netchg_init_taskfunc(void *xsc, int pending __unused)
+{
+	struct hn_softc *softc = xsc;
+
+	/* Block link status checks until the delayed re-check has run. */
+	softc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
+
+	/*
+	 * Simulate a [link down --> link up] transition.  The link up
+	 * half is delayed by 5 seconds, which closely simulates miibus
+	 * reaction upon link down event.
+	 */
+	softc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
+	if_link_state_change(softc->hn_ifp, LINK_STATE_DOWN);
+	taskqueue_enqueue_timeout(softc->hn_mgmt_taskq0,
+	    &softc->hn_netchg_status, 5 * hz);
+}
+
+/*
+ * Delayed task handler for hn_netchg_status: finish the handling of a
+ * network change by unblocking link status checks and running one.
+ */
+static void
+hn_netchg_status_taskfunc(void *xsc, int pending __unused)
+{
+	struct hn_softc *softc = xsc;
+
+	/* Link status checks are allowed again from here on. */
+	softc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
+	hn_link_status(softc);
+}
+
+/*
+ * Schedule a link status check on the management taskqueue; a no-op
+ * while no management taskqueue is set.
+ */
+static void
+hn_update_link_status(struct hn_softc *sc)
+{
+
+	if (sc->hn_mgmt_taskq == NULL)
+		return;
+	taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
+}
+
+/*
+ * Schedule the network change handling on the management taskqueue; a
+ * no-op while no management taskqueue is set.
+ */
+static void
+hn_change_network(struct hn_softc *sc)
+{
+
+	if (sc->hn_mgmt_taskq == NULL)
+		return;
+	taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
+}
+
+/*
+ * Load the mbuf chain *m_head into txd's data DMA map, filling in segs
+ * and *nsegs.  If the chain has too many segments (EFBIG), it is collapsed
+ * once and the load is retried; *m_head is updated to the collapsed chain.
+ *
+ * Returns 0 on success, ENOBUFS if the collapse fails, or the error from
+ * the DMA load.
+ */
+static __inline int
+hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
+    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
+{
+	struct mbuf *chain = *m_head;
+	int rc;
+
+	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
+
+	rc = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
+	    chain, segs, nsegs, BUS_DMA_NOWAIT);
+	if (rc == EFBIG) {
+		struct mbuf *collapsed;
+
+		collapsed = m_collapse(chain, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
+		if (collapsed == NULL)
+			return ENOBUFS;
+		chain = collapsed;
+		*m_head = chain;
+		txr->hn_tx_collapsed++;
+
+		rc = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
+		    txd->data_dmap, chain, segs, nsegs, BUS_DMA_NOWAIT);
+	}
+	if (rc)
+		return rc;
+
+	/* Loaded; make the data visible to the device and mark the txd. */
+	bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
+	    BUS_DMASYNC_PREWRITE);
+	txd->flags |= HN_TXD_FLAG_DMAMAP;
+	return 0;
+}
+
+/*
+ * Drop one reference on txd.
+ *
+ * Returns 0 if other references remain.  When the last reference goes
+ * away, all txdescs aggregated onto txd are released, the chimney sending
+ * buffer or the DMA map is given back, the attached mbuf (if any) is
+ * freed, the txd is put back onto the ring's free list and 1 is returned.
+ */
+static __inline int
+hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
+{
+	struct hn_txdesc *agged;
+
+	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
+	    ("put an onlist txd %#x", txd->flags));
+	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
+	    ("put an onagg txd %#x", txd->flags));
+
+	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
+	if (atomic_fetchadd_int(&txd->refs, -1) != 1) {
+		/* Not the last reference. */
+		return 0;
+	}
+
+	/* Release the txdescs that were aggregated onto this one. */
+	while ((agged = STAILQ_FIRST(&txd->agg_list)) != NULL) {
+		int freed;
+
+		KASSERT(STAILQ_EMPTY(&agged->agg_list),
+		    ("resursive aggregation on aggregated txdesc"));
+		KASSERT((agged->flags & HN_TXD_FLAG_ONAGG),
+		    ("not aggregated txdesc"));
+		KASSERT((agged->flags & HN_TXD_FLAG_DMAMAP) == 0,
+		    ("aggregated txdesc uses dmamap"));
+		KASSERT(agged->chim_index == HN_NVS_CHIM_IDX_INVALID,
+		    ("aggregated txdesc consumes "
+		     "chimney sending buffer"));
+		KASSERT(agged->chim_size == 0,
+		    ("aggregated txdesc has non-zero "
+		     "chimney sending size"));
+
+		STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
+		agged->flags &= ~HN_TXD_FLAG_ONAGG;
+		freed = hn_txdesc_put(txr, agged);
+		KASSERT(freed, ("failed to free aggregated txdesc"));
+	}
+
+	/* Give back the transmission resource: chimney buffer or DMA map. */
+	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
+		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
+		    ("chim txd uses dmamap"));
+		hn_chim_free(txr->hn_sc, txd->chim_index);
+		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
+		txd->chim_size = 0;
+	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
+		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
+		    BUS_DMASYNC_POSTWRITE);
+		bus_dmamap_unload(txr->hn_tx_data_dtag, txd->data_dmap);
+		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
+	}
+
+	if (txd->m != NULL) {
+		m_freem(txd->m);
+		txd->m = NULL;
+	}
+
+	/* Back onto the free list. */
+	txd->flags |= HN_TXD_FLAG_ONLIST;
+#ifndef HN_USE_TXDESC_BUFRING
+	mtx_lock_spin(&txr->hn_txlist_spin);
+	KASSERT(txr->hn_txdesc_avail >= 0 &&
+	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
+	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
+	txr->hn_txdesc_avail++;
+	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
+	mtx_unlock_spin(&txr->hn_txlist_spin);
+#else	/* HN_USE_TXDESC_BUFRING */
+#ifdef HN_DEBUG
+	atomic_add_int(&txr->hn_txdesc_avail, 1);
+#endif
+	buf_ring_enqueue(txr->hn_txdesc_br, txd);
+#endif	/* !HN_USE_TXDESC_BUFRING */
+
+	return 1;
+}
+
+/*
+ * Take a txdesc off the ring's free list and hand it out with a single
+ * reference.  Returns NULL if the free list is empty.
+ */
+static __inline struct hn_txdesc *
+hn_txdesc_get(struct hn_tx_ring *txr)
+{
+	struct hn_txdesc *txd;
+
+#ifndef HN_USE_TXDESC_BUFRING
+	mtx_lock_spin(&txr->hn_txlist_spin);
+	txd = SLIST_FIRST(&txr->hn_txlist);
+	if (txd != NULL) {
+		KASSERT(txr->hn_txdesc_avail > 0,
+		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
+		txr->hn_txdesc_avail--;
+		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
+	}
+	mtx_unlock_spin(&txr->hn_txlist_spin);
+#else
+	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
+#endif
+
+	if (txd == NULL)
+		return NULL;
+
+#if defined(HN_USE_TXDESC_BUFRING) && defined(HN_DEBUG)
+	atomic_subtract_int(&txr->hn_txdesc_avail, 1);
+#endif
+	/* A txdesc coming off the free list must be completely idle. */
+	KASSERT(txd->m == NULL && txd->refs == 0 &&
+	    STAILQ_EMPTY(&txd->agg_list) &&
+	    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
+	    txd->chim_size == 0 &&
+	    (txd->flags & HN_TXD_FLAG_ONLIST) &&
+	    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
+	    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
+	txd->flags &= ~HN_TXD_FLAG_ONLIST;
+	txd->refs = 1;
+	return txd;
+}
+
+/*
+ * Take an additional reference on a txdesc that is already referenced.
+ */
+static __inline void
+hn_txdesc_hold(struct hn_txdesc *txd)
+{
+
+	/* The caller must own a reference; a 0->1 transition is invalid. */
+	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
+	atomic_add_int(&txd->refs, 1);
+}
+
+/*
+ * Link txd onto the tail of agg_txd's aggregation list and mark it as
+ * aggregated.  Neither txdesc may already take part in an aggregation
+ * as an aggregated member, and txd may not aggregate others itself.
+ */
+static __inline void
+hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
+{
+
+	/* The parent must not be aggregated onto another txdesc. */
+	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
+	    ("recursive aggregation on aggregating txdesc"));
+
+	/* The child must be a plain, not yet aggregated txdesc. */
+	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
+	    ("already aggregated"));
+	KASSERT(STAILQ_EMPTY(&txd->agg_list),
+	    ("recursive aggregation on to-be-aggregated txdesc"));
+
+	txd->flags |= HN_TXD_FLAG_ONAGG;
+	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
+}
+
+/*
+ * Report whether any txdesc of this TX ring is currently handed out,
+ * i.e. whether the ring's free list is not full.
+ */
+static bool
+hn_tx_ring_pending(struct hn_tx_ring *txr)
+{
+	bool busy;
+
+#ifndef HN_USE_TXDESC_BUFRING
+	mtx_lock_spin(&txr->hn_txlist_spin);
+	busy = (txr->hn_txdesc_avail != txr->hn_txdesc_cnt);
+	mtx_unlock_spin(&txr->hn_txlist_spin);
+#else
+	busy = !buf_ring_full(txr->hn_txdesc_br);
+#endif
+	return (busy);
+}
+
+/*
+ * Clear the ring's pending-txeof mark and run its txeof method.
+ */
+static __inline void
+hn_txeof(struct hn_tx_ring *txr)
+{
+
+	txr->hn_has_txeof = 0;
+	(*txr->hn_txeof)(txr);
+}
+
+/*
+ * Send completion callback, installed by hn_encap() through
+ * hn_nvs_sendctx_init(): release the completed txdesc and, once enough
+ * completions have piled up on an oactive ring, run txeof early.
+ */
+static void
+hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
+    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
+{
+	struct hn_txdesc *txd = sndc->hn_cbarg;
+	struct hn_tx_ring *txr = txd->txr;
+
+	KASSERT(txr->hn_chan == chan,
+	    ("channel mismatch, on chan%u, should be chan%u",
+	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
+
+	txr->hn_has_txeof = 1;
+	hn_txdesc_put(txr, txd);
+
+	if (++txr->hn_txdone_cnt < HN_EARLY_TXEOF_THRESH)
+		return;
+
+	/* Threshold reached: restart the count, kick txeof if oactive. */
+	txr->hn_txdone_cnt = 0;
+	if (txr->hn_oactive)
+		hn_txeof(txr);
+}
+
+/*
+ * Per-channel rollup: flush the RX ring's LRO state and run the TX
+ * ring's txeof, if one is pending.
+ */
+static void
+hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
+{
+#if defined(INET) || defined(INET6)
+	struct epoch_tracker et;
+
+	NET_EPOCH_ENTER(et);
+	tcp_lro_flush_all(&rxr->hn_lro);
+	NET_EPOCH_EXIT(et);
+#endif
+
+	/*
+	 * NOTE:
+	 * There may be no TX ring for this channel ('txr' is NULL),
+	 * if multiple channels and ifnet.if_start method are enabled.
+	 */
+	if (txr != NULL && txr->hn_has_txeof) {
+		txr->hn_txdone_cnt = 0;
+		hn_txeof(txr);
+	}
+}
+
+/*
+ * Rebase an offset counted from the beginning of rndis_packet_msg so
+ * that it counts from the rm_dataoffset field instead.
+ */
+static __inline uint32_t
+hn_rndis_pktmsg_offset(uint32_t ofs)
+{
+	const uint32_t base =
+	    __offsetof(struct rndis_packet_msg, rm_dataoffset);
+
+	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
+	    ("invalid RNDIS packet msg offset %u", ofs));
+	return (ofs - base);
+}
+
+/*
+ * Append one per-packet-info record of type pi_type, carrying pi_dlen
+ * bytes of data, to the RNDIS packet message pkt, whose buffer is
+ * pktsize bytes.  Returns a pointer to the record's data area, which the
+ * caller has to fill in.
+ */
+static __inline void *
+hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
+    size_t pi_dlen, uint32_t pi_type)
+{
+	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
+	uint8_t *msg_base = (uint8_t *)pkt;
+	struct rndis_pktinfo *info;
+
+	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
+	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
+
+	/*
+	 * The per-packet-info area never moves, it only grows at its
+	 * tail.
+	 *
+	 * NOTE:
+	 * At this stage rm_pktinfooffset is still counted from the
+	 * beginning of rndis_packet_msg.
+	 */
+	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
+	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
+	info = (struct rndis_pktinfo *)(msg_base + pkt->rm_pktinfooffset +
+	    pkt->rm_pktinfolen);
+	pkt->rm_pktinfolen += pi_size;
+
+	/* Fill in the header of the new record. */
+	info->rm_size = pi_size;
+	info->rm_type = pi_type;
+	info->rm_internal = 0;
+	info->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
+
+	return (info->rm_data);
+}
+
+/*
+ * Send the ring's current aggregating txdesc and reset all aggregation
+ * states.  On send failure the aggregating txdesc's mbuf is freed and
+ * oerrors is bumped by the number of packets that were pending.
+ * Returns the result of hn_txpkt().
+ */
+static __inline int
+hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
+{
+	struct hn_txdesc *agg_txd = txr->hn_agg_txd;
+	struct mbuf *saved_m;
+	int rc, npkts;
+
+	KASSERT(agg_txd != NULL, ("no aggregate txdesc"));
+
+	/*
+	 * hn_txpkt() resets the temporary packet stat and, if it fails,
+	 * frees the txd but _not_ the txd's mbuf.  Save both beforehand,
+	 * so that the failure can be accounted and cleaned up properly.
+	 */
+	npkts = txr->hn_stat_pkts;
+	saved_m = agg_txd->m;
+
+	rc = hn_txpkt(ifp, txr, agg_txd);
+	if (__predict_false(rc)) {
+		/* The txd is gone by now, the mbuf is still ours to free. */
+		m_freem(saved_m);
+
+		txr->hn_flush_failed++;
+		if_inc_counter(ifp, IFCOUNTER_OERRORS, npkts);
+	}
+
+	/* Aggregation starts from scratch, no matter what happened. */
+	txr->hn_agg_txd = NULL;
+	txr->hn_agg_szleft = 0;
+	txr->hn_agg_pktleft = 0;
+	txr->hn_agg_prevpkt = NULL;
+
+	return (rc);
+}
+
+/*
+ * Find room in a chimney sending buffer for an RNDIS packet of pktsize
+ * bytes.
+ *
+ * If the ring has an aggregating txdesc with enough room left, txd is
+ * linked onto it and the packet is placed behind the previous one in the
+ * aggregating txdesc's buffer.  Otherwise any pending aggregation is
+ * flushed and a fresh chimney sending buffer is allocated for txd, which
+ * may in turn become the new aggregating txdesc.
+ *
+ * Returns where the RNDIS packet message should be built, or NULL if no
+ * chimney sending buffer could be allocated.
+ */
+static void *
+hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
+    int pktsize)
+{
+	void *slot;
+
+	if (txr->hn_agg_txd != NULL) {
+		if (txr->hn_agg_pktleft < 1 || txr->hn_agg_szleft <= pktsize) {
+			/* No room for this packet; send what is pending. */
+			hn_flush_txagg(ifp, txr);
+		} else {
+			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
+			struct rndis_packet_msg *prev = txr->hn_agg_prevpkt;
+			int olen;
+
+			/*
+			 * The previous RNDIS packet's total length may
+			 * grow, due to the mandatory alignment padding
+			 * for this RNDIS packet; the aggregating
+			 * txdesc's chimney sending buffer size has to
+			 * grow accordingly.
+			 *
+			 * XXX
+			 * Zero-out the padding, as required by the RNDIS spec.
+			 */
+			olen = prev->rm_len;
+			prev->rm_len = roundup2(olen, txr->hn_agg_align);
+			agg_txd->chim_size += prev->rm_len - olen;
+
+			/* Hang this txdesc off the aggregating one. */
+			hn_txdesc_agg(agg_txd, txd);
+
+			/* This packet starts right behind the previous one. */
+			slot = (uint8_t *)prev + prev->rm_len;
+			/* Remember it, its length may need fixup later. */
+			txr->hn_agg_prevpkt = slot;
+
+			txr->hn_agg_pktleft--;
+			txr->hn_agg_szleft -= pktsize;
+			if (txr->hn_agg_szleft <=
+			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
+				/*
+				 * Most likely no further packet will fit,
+				 * so make sure that the next attempt
+				 * flushes this aggregating txdesc.
+				 */
+				txr->hn_agg_pktleft = 0;
+			}
+			return (slot);
+		}
+	}
+	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
+
+	/* Start over with a chimney sending buffer of txd's own. */
+	txr->hn_tx_chimney_tried++;
+	txd->chim_index = hn_chim_alloc(txr->hn_sc);
+	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
+		return (NULL);
+	txr->hn_tx_chimney++;
+
+	slot = txr->hn_sc->hn_chim +
+	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
+
+	/* Make txd the aggregating txdesc, if more packets could follow. */
+	if (txr->hn_agg_pktmax > 1 &&
+	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
+		txr->hn_agg_txd = txd;
+		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
+		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
+		txr->hn_agg_prevpkt = slot;
+	}
+	return (slot);
+}
+
+/*
+ * Build the RNDIS packet message for the mbuf chain *m_head0 and set up
+ * txd and txr for sending it.
+ *
+ * If hn_try_txagg() hands out a chimney sending buffer (txd's own one or
+ * the one of the ring's aggregating txdesc), then the RNDIS packet message
+ * and the packet data are copied into it and txr->hn_sendpkt is set to
+ * hn_txpkt_chim.  Otherwise the mbuf chain is loaded into txd's DMA map,
+ * txr->hn_gpa[] is filled with the RNDIS packet message followed by the
+ * data segments, and txr->hn_sendpkt is set to hn_txpkt_sglist; in this
+ * case *m_head0 may be replaced by a collapsed mbuf chain.
+ *
+ * On success the mbuf chain is attached to txd, the send completion
+ * routine is armed, the ring's temporary stats are updated and 0 is
+ * returned.
+ *
+ * NOTE:
+ * If this function fails, then both txd and m_head0 will be freed.
+ */
+static int
+hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
+    struct mbuf **m_head0)
+{
+	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
+	int error, nsegs, i;
+	struct mbuf *m_head = *m_head0;
+	struct rndis_packet_msg *pkt;
+	uint32_t *pi_data;
+	void *chim = NULL;
+	int pkt_hlen, pkt_size;
+
+	pkt = txd->rndis_pkt;	/* default; replaced if chimney is used */
+	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
+	if (pkt_size < txr->hn_chim_size) {
+		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
+		if (chim != NULL)
+			pkt = chim;
+	} else {
+		if (txr->hn_agg_txd != NULL)
+			hn_flush_txagg(ifp, txr);
+	}
+
+	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
+	pkt->rm_len = m_head->m_pkthdr.len;	/* header added below */
+	pkt->rm_dataoffset = 0;
+	pkt->rm_datalen = m_head->m_pkthdr.len;
+	pkt->rm_oobdataoffset = 0;
+	pkt->rm_oobdatalen = 0;
+	pkt->rm_oobdataelements = 0;
+	pkt->rm_pktinfooffset = sizeof(*pkt);
+	pkt->rm_pktinfolen = 0;
+	pkt->rm_vchandle = 0;
+	pkt->rm_reserved = 0;
+
+	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
+		/*
+		 * Set the hash value for this packet.
+		 */
+		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
+		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
+
+		if (M_HASHTYPE_ISHASH(m_head))
+			/*
+			 * The flowid field contains the hash value host
+			 * set in the rx queue if it is a ip forwarding pkt.
+			 * Set the same hash value so host can send on the
+			 * cpu it was received.
+			 */
+			*pi_data = m_head->m_pkthdr.flowid;
+		else
+			/*
+			 * Otherwise just put the tx queue index.
+			 */
+			*pi_data = txr->hn_tx_idx;
+	}
+
+	if (m_head->m_flags & M_VLANTAG) {
+		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
+		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
+		*pi_data = NDIS_VLAN_INFO_MAKE(
+		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
+		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
+		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
+	}
+
+	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
+#if defined(INET6) || defined(INET)
+		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
+		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
+#ifdef INET
+		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
+			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
+			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
+			    m_head->m_pkthdr.tso_segsz);
+		}
+#endif
+#if defined(INET6) && defined(INET)
+		else
+#endif
+#ifdef INET6
+		{
+			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
+			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
+			    m_head->m_pkthdr.tso_segsz);
+		}
+#endif
+#endif	/* INET6 || INET */
+	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
+		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
+		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
+		if (m_head->m_pkthdr.csum_flags &
+		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
+			*pi_data = NDIS_TXCSUM_INFO_IPV6;
+		} else {
+			*pi_data = NDIS_TXCSUM_INFO_IPV4;
+			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
+				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
+		}
+
+		if (m_head->m_pkthdr.csum_flags &
+		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
+			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
+			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
+		} else if (m_head->m_pkthdr.csum_flags &
+		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
+			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
+			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
+		}
+	}
+
+	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
+	/*
+	 * Fixup RNDIS packet message total length; so far it only covers
+	 * the packet data, so add the message header and the pktinfo.
+	 */
+	pkt->rm_len += pkt_hlen;
+	/*
+	 * Convert RNDIS packet message offsets; up to this point they
+	 * were counted from the beginning of the RNDIS packet message.
+	 */
+	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
+	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
+
+	/*
+	 * Fast path: Chimney sending.
+	 *
+	 * The packet data is copied behind the RNDIS packet message in
+	 * the chimney sending buffer; no DMA map is loaded.  If an
+	 * aggregating txdesc exists, the chimney sending buffer size is
+	 * accounted on it instead of on txd.
+	 */
+	if (chim != NULL) {
+		struct hn_txdesc *tgt_txd = txd;
+
+		if (txr->hn_agg_txd != NULL) {
+			tgt_txd = txr->hn_agg_txd;
+#ifdef INVARIANTS
+			*m_head0 = NULL;
+#endif
+		}
+
+		KASSERT(pkt == chim,
+		    ("RNDIS pkt not in chimney sending buffer"));
+		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
+		    ("chimney sending buffer is not used"));
+		tgt_txd->chim_size += pkt->rm_len;
+
+		m_copydata(m_head, 0, m_head->m_pkthdr.len,
+		    ((uint8_t *)chim) + pkt_hlen);
+
+		txr->hn_gpa_cnt = 0;
+		txr->hn_sendpkt = hn_txpkt_chim;
+		goto done;
+	}
+
+	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
+	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
+	    ("chimney buffer is used"));
+	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
+
+	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
+	if (__predict_false(error)) {
+		int freed;
+
+		/*
+		 * This mbuf is not linked w/ the txd yet, so free it now.
+		 */
+		m_freem(m_head);
+		*m_head0 = NULL;
+
+		freed = hn_txdesc_put(txr, txd);
+		KASSERT(freed != 0,
+		    ("fail to free txd upon txdma error"));
+
+		txr->hn_txdma_failed++;
+		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+		return error;
+	}
+	*m_head0 = m_head;	/* the chain may have been collapsed */
+
+	/* +1 RNDIS packet message */
+	txr->hn_gpa_cnt = nsegs + 1;
+
+	/* send packet with page buffer */
+	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
+	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
+	txr->hn_gpa[0].gpa_len = pkt_hlen;
+
+	/*
+	 * Fill the page buffers with mbuf info after the page
+	 * buffer for RNDIS packet message.
+	 */
+	for (i = 0; i < nsegs; ++i) {
+		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
+
+		gpa->gpa_page = atop(segs[i].ds_addr);
+		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
+		gpa->gpa_len = segs[i].ds_len;
+	}
+
+	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
+	txd->chim_size = 0;
+	txr->hn_sendpkt = hn_txpkt_sglist;
+done:
+	txd->m = m_head;	/* freed by hn_txdesc_put() along w/ txd */
+
+	/* Set the completion routine */
+	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
+
+	/*
+	 * Update temporary stats for later use; hn_txpkt() folds them
+	 * into the ifnet/ring counters and resets them.
+	 */
+	txr->hn_stat_pkts++;
+	txr->hn_stat_size += m_head->m_pkthdr.len;
+	if (m_head->m_flags & M_MCAST)
+		txr->hn_stat_mcasts++;
+
+	return 0;
+}
+
+/*
+ * Send txd through txr->hn_sendpkt.
+ *
+ * On success the mbuf of txd and the mbufs of all txdescs aggregated
+ * onto it are tapped to BPF, if there are BPF peers, and the ring's
+ * temporary stats are folded into the ifnet and ring counters.  A failed
+ * send is retried once before giving up.  The temporary stats are reset
+ * in either case.
+ *
+ * Returns 0 on success, or the error of the last sending attempt.
+ *
+ * NOTE:
+ * If this function fails, then txd will be freed, but the mbuf
+ * associated w/ the txd will _not_ be freed.
+ */
+static int
+hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
+{
+	int error, send_failed = 0, has_bpf;
+
+again:
+	has_bpf = bpf_peers_present(ifp->if_bpf);
+	if (has_bpf) {
+		/*
+		 * Make sure that this txd and any aggregated txds are not
+		 * freed before ETHER_BPF_MTAP.
+		 */
+		hn_txdesc_hold(txd);
+	}
+	error = txr->hn_sendpkt(txr, txd);
+	if (!error) {
+		if (has_bpf) {
+			const struct hn_txdesc *tmp_txd;
+
+			ETHER_BPF_MTAP(ifp, txd->m);
+			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
+				ETHER_BPF_MTAP(ifp, tmp_txd->m);
+		}
+
+		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
+#ifdef HN_IFSTART_SUPPORT
+		if (!hn_use_if_start)
+#endif
+		{
+			if_inc_counter(ifp, IFCOUNTER_OBYTES,
+			    txr->hn_stat_size);
+			if (txr->hn_stat_mcasts != 0) {
+				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
+				    txr->hn_stat_mcasts);
+			}
+		}
+		txr->hn_pkts += txr->hn_stat_pkts;
+		txr->hn_sends++;
+	}
+	if (has_bpf)
+		hn_txdesc_put(txr, txd);	/* drop the hold taken above */
+
+	if (__predict_false(error)) {
+		int freed;
+
+		/*
+		 * This should "really rarely" happen.
+		 *
+		 * XXX Too many RX to be acked or too many sideband
+		 * commands to run?  Ask netvsc_channel_rollup()
+		 * to kick start later.
+		 */
+		txr->hn_has_txeof = 1;
+		if (!send_failed) {
+			txr->hn_send_failed++;
+			send_failed = 1;
+			/*
+			 * Try sending again after set hn_has_txeof;
+			 * in case that we missed the last
+			 * netvsc_channel_rollup().
+			 */
+			goto again;
+		}
+		if_printf(ifp, "send failed\n");
+
+		/*
+		 * Caller will perform further processing on the
+		 * associated mbuf, so don't free it in hn_txdesc_put();
+		 * only unload it from the DMA map in hn_txdesc_put(),
+		 * if it was loaded.
+		 */
+		txd->m = NULL;
+		freed = hn_txdesc_put(txr, txd);
+		KASSERT(freed != 0,
+		    ("fail to free txd upon send error"));
+
+		txr->hn_send_failed++;	/* the failed retry counts too */
+	}
+
+	/* Reset temporary stats, after this sending is done. */
+	txr->hn_stat_size = 0;
+	txr->hn_stat_pkts = 0;
+	txr->hn_stat_mcasts = 0;
+
+	return (error);
+}
+
+/*
+ * Append len bytes starting at cp to the mbuf chain headed by m0.
+ *
+ * The data first fills whatever trailing space the last mbuf of the
+ * chain has; anything left over goes into newly allocated mbufs, each
+ * backed by an MJUMPAGESIZE cluster and linked to the end of the chain.
+ *
+ * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c;
+ * it differs only in the cluster size used for the additional mbufs.
+ *
+ * Returns the last mbuf of the chain, or NULL if a new mbuf could not
+ * be allocated.  On failure the mbufs linked so far remain attached to
+ * m0, and m_pkthdr.len is never touched here; both are the caller's
+ * responsibility.
+ */
+static struct mbuf *
+hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
+{
+	struct mbuf *tail, *nm;
+	int left = len, room;
+
+	/* Walk to the end of the existing chain. */
+	tail = m0;
+	while (tail->m_next != NULL)
+		tail = tail->m_next;
+
+	/* Use up the free space at the end of the last mbuf first. */
+	room = M_TRAILINGSPACE(tail);
+	if (room > 0) {
+		if (room > left)
+			room = left;
+		bcopy(cp, mtod(tail, caddr_t) + tail->m_len, room);
+		tail->m_len += room;
+		cp += room;
+		left -= room;
+	}
+
+	/* Spill the rest into page-sized clusters, one mbuf at a time. */
+	for (; left > 0; tail = nm) {
+		nm = m_getjcl(M_NOWAIT, tail->m_type, 0, MJUMPAGESIZE);
+		if (nm == NULL)
+			return (NULL);
+		nm->m_len = min(MJUMPAGESIZE, left);
+		bcopy(cp, mtod(nm, caddr_t), nm->m_len);
+		cp += nm->m_len;
+		left -= nm->m_len;
+		tail->m_next = nm;
+	}
+
+	return (tail);
+}
+
+#if defined(INET) || defined(INET6)
+/*
+ * Feed one received mbuf to LRO.
+ *
+ * When mbuf queueing is configured (hn_lro_mbufq_depth is non-zero, on
+ * kernels that provide tcp_lro_queue_mbuf()), the mbuf is queued and 0
+ * is returned; otherwise the result of tcp_lro_rx() is returned.
+ */
+static __inline int
+hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
+{
+#if __FreeBSD_version >= 1100095
+	if (hn_lro_mbufq_depth != 0) {
+		tcp_lro_queue_mbuf(lc, m);
+		return (0);
+	}
+#endif
+	return (tcp_lro_rx(lc, m, 0));
+}
+#endif
+
+/*
+ * Build an mbuf (chain) from the fragments described by rxr->rsc, fill in
+ * the checksum, VLAN and hash metadata, and pass the packet to LRO or to
+ * the receiving interface's if_input.
+ *
+ * The packet is dropped if the receiving interface is not running (no
+ * accounting), if it is shorter than an Ethernet header (IERRORS on
+ * hn(4)), or if an mbuf cannot be allocated (IQDROPS on hn(4)).
+ *
+ * Always returns 0.
+ */
+static int
+hn_rxpkt(struct hn_rx_ring *rxr)
+{
+	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
+	struct mbuf *m_new, *n;
+	int size, do_lro = 0, do_csum = 1, is_vf = 0;
+	int hash_type = M_HASHTYPE_NONE;
+	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
+	int i;
+
+	ifp = hn_ifp;
+	if (rxr->hn_rxvf_ifp != NULL) {
+		/*
+		 * Non-transparent mode VF; pretend this packet is from
+		 * the VF.
+		 */
+		ifp = rxr->hn_rxvf_ifp;
+		is_vf = 1;
+	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
+		/* Transparent mode VF. */
+		is_vf = 1;
+	}
+
+	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
+		/*
+		 * NOTE:
+		 * See the NOTE of hn_rndis_init_fixat().  This
+		 * function can be reached, immediately after the
+		 * RNDIS is initialized but before the ifnet is
+		 * setup on the hn_attach() path; drop the unexpected
+		 * packets.
+		 */
+		return (0);
+	}
+
+	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
+		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
+		return (0);
+	}
+
+	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
+		m_new = m_gethdr(M_NOWAIT, MT_DATA);
+		if (m_new == NULL) {
+			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
+			return (0);
+		}
+		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
+		    rxr->rsc.frag_len[0]);
+		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
+	} else {
+		/*
+		 * Get an mbuf with a cluster.  For packets 2K or less,
+		 * get a standard 2K cluster.  For anything larger, get a
+		 * 4K cluster.  Any buffers larger than 4K can cause problems
+		 * if looped around to the Hyper-V TX channel, so avoid them.
+		 */
+		size = MCLBYTES;
+		if (rxr->rsc.pktlen > MCLBYTES) {
+			/* 4096 */
+			size = MJUMPAGESIZE;
+		}
+
+		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
+		if (m_new == NULL) {
+			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
+			return (0);
+		}
+
+		n = m_new;
+		for (i = 0; i < rxr->rsc.cnt; i++) {
+			n = hv_m_append(n, rxr->rsc.frag_len[i],
+			    rxr->rsc.frag_data[i]);
+			if (n == NULL) {
+				/*
+				 * hv_m_append() leaves whatever it managed
+				 * to link attached to m_new; free the whole
+				 * chain instead of leaking it.
+				 */
+				m_freem(m_new);
+				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
+				return (0);
+			}
+			m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
+		}
+	}
+	if (rxr->rsc.pktlen <= MHLEN)
+		rxr->hn_small_pkts++;
+
+	m_new->m_pkthdr.rcvif = ifp;
+
+	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
+		do_csum = 0;
+
+	/* receive side checksum offload */
+	if (rxr->rsc.csum_info != NULL) {
+		/* IP csum offload */
+		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
+			m_new->m_pkthdr.csum_flags |=
+			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
+			rxr->hn_csum_ip++;
+		}
+
+		/* TCP/UDP csum offload */
+		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
+		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
+			m_new->m_pkthdr.csum_flags |=
+			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+			m_new->m_pkthdr.csum_data = 0xffff;
+			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
+				rxr->hn_csum_tcp++;
+			else
+				rxr->hn_csum_udp++;
+		}
+
+		/*
+		 * XXX
+		 * As of this write (Oct 28th, 2016), host side will turn
+		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
+		 * the do_lro setting here is actually _not_ accurate.  We
+		 * depend on the RSS hash type check to reset do_lro.
+		 */
+		if ((*(rxr->rsc.csum_info) &
+		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
+		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
+			do_lro = 1;
+	} else {
+		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
+		if (l3proto == ETHERTYPE_IP) {
+			if (l4proto == IPPROTO_TCP) {
+				if (do_csum &&
+				    (rxr->hn_trust_hcsum &
+				     HN_TRUST_HCSUM_TCP)) {
+					rxr->hn_csum_trusted++;
+					m_new->m_pkthdr.csum_flags |=
+					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
+					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+					m_new->m_pkthdr.csum_data = 0xffff;
+				}
+				do_lro = 1;
+			} else if (l4proto == IPPROTO_UDP) {
+				if (do_csum &&
+				    (rxr->hn_trust_hcsum &
+				     HN_TRUST_HCSUM_UDP)) {
+					rxr->hn_csum_trusted++;
+					m_new->m_pkthdr.csum_flags |=
+					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
+					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+					m_new->m_pkthdr.csum_data = 0xffff;
+				}
+			} else if (l4proto != IPPROTO_DONE && do_csum &&
+			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
+				rxr->hn_csum_trusted++;
+				m_new->m_pkthdr.csum_flags |=
+				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
+			}
+		}
+	}
+
+	if (rxr->rsc.vlan_info != NULL) {
+		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
+		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
+		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
+		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
+		m_new->m_flags |= M_VLANTAG;
+	}
+
+	/*
+	 * If VF is activated (transparent/non-transparent mode does not
+	 * matter here).
+	 *
+	 * - Disable LRO
+	 *
+	 *   hn(4) will only receive broadcast packets, multicast packets,
+	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
+	 *   packet types.
+	 *
+	 *   For non-transparent, we definitely _cannot_ enable LRO at
+	 *   all, since the LRO flush will use hn(4) as the receiving
+	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
+	 */
+	if (is_vf)
+		do_lro = 0;
+
+	/*
+	 * If VF is activated (transparent/non-transparent mode does not
+	 * matter here), do _not_ mess with unsupported hash types or
+	 * functions.
+	 */
+	if (rxr->rsc.hash_info != NULL) {
+		rxr->hn_rss_pkts++;
+		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
+		if (!is_vf)
+			hash_type = M_HASHTYPE_OPAQUE_HASH;
+		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
+		    NDIS_HASH_FUNCTION_TOEPLITZ) {
+			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
+			    rxr->hn_mbuf_hash);
+
+			/*
+			 * NOTE:
+			 * do_lro is reset, if the hash types are not TCP
+			 * related.  See the comment in the above csum_flags
+			 * setup section.
+			 */
+			switch (type) {
+			case NDIS_HASH_IPV4:
+				hash_type = M_HASHTYPE_RSS_IPV4;
+				do_lro = 0;
+				break;
+
+			case NDIS_HASH_TCP_IPV4:
+				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
+				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
+					int def_htype = M_HASHTYPE_OPAQUE_HASH;
+
+					if (is_vf)
+						def_htype = M_HASHTYPE_NONE;
+
+					/*
+					 * UDP 4-tuple hash is delivered as
+					 * TCP 4-tuple hash.
+					 */
+					if (l3proto == ETHERTYPE_MAX) {
+						hn_rxpkt_proto(m_new,
+						    &l3proto, &l4proto);
+					}
+					if (l3proto == ETHERTYPE_IP) {
+						if (l4proto == IPPROTO_UDP &&
+						    (rxr->hn_mbuf_hash &
+						     NDIS_HASH_UDP_IPV4_X)) {
+							hash_type =
+							M_HASHTYPE_RSS_UDP_IPV4;
+							do_lro = 0;
+						} else if (l4proto !=
+						    IPPROTO_TCP) {
+							hash_type = def_htype;
+							do_lro = 0;
+						}
+					} else {
+						hash_type = def_htype;
+						do_lro = 0;
+					}
+				}
+				break;
+
+			case NDIS_HASH_IPV6:
+				hash_type = M_HASHTYPE_RSS_IPV6;
+				do_lro = 0;
+				break;
+
+			case NDIS_HASH_IPV6_EX:
+				hash_type = M_HASHTYPE_RSS_IPV6_EX;
+				do_lro = 0;
+				break;
+
+			case NDIS_HASH_TCP_IPV6:
+				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
+				break;
+
+			case NDIS_HASH_TCP_IPV6_EX:
+				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
+				break;
+			}
+		}
+	} else if (!is_vf) {
+		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
+		hash_type = M_HASHTYPE_OPAQUE;
+	}
+	M_HASHTYPE_SET(m_new, hash_type);
+
+	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
+	if (hn_ifp != ifp) {
+		const struct ether_header *eh;
+
+		/*
+		 * Non-transparent mode VF is activated.
+		 */
+
+		/*
+		 * Allow tapping on hn(4).
+		 */
+		ETHER_BPF_MTAP(hn_ifp, m_new);
+
+		/*
+		 * Update hn(4)'s stats.
+		 */
+		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
+		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
+		/* Checked at the beginning of this function. */
+		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
+		eh = mtod(m_new, struct ether_header *);
+		if (ETHER_IS_MULTICAST(eh->ether_dhost))
+			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
+	}
+	rxr->hn_pkts++;
+
+	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
+#if defined(INET) || defined(INET6)
+		struct lro_ctrl *lro = &rxr->hn_lro;
+
+		if (lro->lro_cnt) {
+			rxr->hn_lro_tried++;
+			if (hn_lro_rx(lro, m_new) == 0) {
+				/* DONE! */
+				return 0;
+			}
+		}
+#endif
+	}
+	ifp->if_input(ifp, m_new);
+
+	return (0);
+}
+
+/*
+ * Interface ioctl handler for hn(4).
+ *
+ * Handles the MTU, interface flags, capability, multicast, media and RSS
+ * query requests below; when a transparent mode VF is ready, several of
+ * them are forwarded to, or mirrored on, the VF.  Every other request is
+ * passed to ether_ioctl().
+ *
+ * Returns 0 on success or an errno value.
+ */
+static int
+hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+	struct hn_softc *sc = ifp->if_softc;
+	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
+	struct ifnet *vf_ifp;
+	int mask, error = 0;
+	struct ifrsskey *ifrk;
+	struct ifrsshash *ifrh;
+	uint32_t mtu;
+
+	switch (cmd) {
+	case SIOCSIFMTU:
+		/* NOTE(review): only the upper bound is checked here. */
+		if (ifr->ifr_mtu > HN_MTU_MAX) {
+			error = EINVAL;
+			break;
+		}
+
+		HN_LOCK(sc);
+
+		/* Nothing to do (and no error) without the synthetic parts. */
+		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
+			HN_UNLOCK(sc);
+			break;
+		}
+
+		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
+			/* Can't change MTU */
+			HN_UNLOCK(sc);
+			error = EOPNOTSUPP;
+			break;
+		}
+
+		if (ifp->if_mtu == ifr->ifr_mtu) {
+			HN_UNLOCK(sc);
+			break;
+		}
+
+		/* Change the VF's MTU first; give up if the VF refuses. */
+		if (hn_xpnt_vf_isready(sc)) {
+			vf_ifp = sc->hn_vf_ifp;
+			ifr_vf = *ifr;
+			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
+			    sizeof(ifr_vf.ifr_name));
+			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
+			    (caddr_t)&ifr_vf);
+			if (error) {
+				HN_UNLOCK(sc);
+				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
+				    vf_ifp->if_xname, ifr->ifr_mtu, error);
+				break;
+			}
+		}
+
+		/*
+		 * Suspend this interface before the synthetic parts
+		 * are ripped.
+		 */
+		hn_suspend(sc);
+
+		/*
+		 * Detach the synthetics parts, i.e. NVS and RNDIS.
+		 */
+		hn_synth_detach(sc);
+
+		/*
+		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
+		 * with the new MTU setting.
+		 */
+		error = hn_synth_attach(sc, ifr->ifr_mtu);
+		if (error) {
+			HN_UNLOCK(sc);
+			break;
+		}
+
+		/* Fall back to the requested MTU if RNDIS cannot report one. */
+		error = hn_rndis_get_mtu(sc, &mtu);
+		if (error)
+			mtu = ifr->ifr_mtu;
+		else if (bootverbose)
+			if_printf(ifp, "RNDIS mtu %u\n", mtu);
+
+		/*
+		 * Commit the requested MTU, after the synthetic parts
+		 * have been successfully attached.
+		 */
+		if (mtu >= ifr->ifr_mtu) {
+			mtu = ifr->ifr_mtu;
+		} else {
+			if_printf(ifp, "fixup mtu %d -> %u\n",
+			    ifr->ifr_mtu, mtu);
+		}
+		ifp->if_mtu = mtu;
+
+		/*
+		 * Synthetic parts' reattach may change the chimney
+		 * sending size; update it.
+		 */
+		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
+			hn_set_chim_size(sc, sc->hn_chim_szmax);
+
+		/*
+		 * Make sure that various parameters based on MTU are
+		 * still valid, after the MTU change.
+		 */
+		hn_mtu_change_fixup(sc);
+
+		/*
+		 * All done!  Resume the interface now.
+		 */
+		hn_resume(sc);
+
+		if ((sc->hn_flags & HN_FLAG_RXVF) ||
+		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
+			/*
+			 * Since we have reattached the NVS part,
+			 * change the datapath to VF again; in case
+			 * that it is lost, after the NVS was detached.
+			 */
+			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
+		}
+
+		HN_UNLOCK(sc);
+		break;
+
+	case SIOCSIFFLAGS:
+		HN_LOCK(sc);
+
+		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
+			HN_UNLOCK(sc);
+			break;
+		}
+
+		if (hn_xpnt_vf_isready(sc))
+			hn_xpnt_vf_saveifflags(sc);
+
+		if (ifp->if_flags & IFF_UP) {
+			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+				/*
+				 * Caller might hold mutex, e.g.
+				 * bpf; use busy-wait for the RNDIS
+				 * reply.
+				 */
+				HN_NO_SLEEPING(sc);
+				hn_rxfilter_config(sc);
+				HN_SLEEPING_OK(sc);
+
+				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
+					error = hn_xpnt_vf_iocsetflags(sc);
+			} else {
+				hn_init_locked(sc);
+			}
+		} else {
+			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+				hn_stop(sc, false);
+		}
+		sc->hn_if_flags = ifp->if_flags;
+
+		HN_UNLOCK(sc);
+		break;
+
+	case SIOCSIFCAP:
+		HN_LOCK(sc);
+
+		/* With a ready transparent VF, the VF path handles the request. */
+		if (hn_xpnt_vf_isready(sc)) {
+			ifr_vf = *ifr;
+			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
+			    sizeof(ifr_vf.ifr_name));
+			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
+			HN_UNLOCK(sc);
+			break;
+		}
+
+		/*
+		 * Fix up requested capabilities w/ supported capabilities,
+		 * since the supported capabilities could have been changed.
+		 */
+		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
+		    ifp->if_capenable;
+
+		if (mask & IFCAP_TXCSUM) {
+			ifp->if_capenable ^= IFCAP_TXCSUM;
+			if (ifp->if_capenable & IFCAP_TXCSUM)
+				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
+			else
+				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
+		}
+		if (mask & IFCAP_TXCSUM_IPV6) {
+			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
+			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
+				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
+			else
+				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
+		}
+
+		/* TODO: flip RNDIS offload parameters for RXCSUM. */
+		if (mask & IFCAP_RXCSUM)
+			ifp->if_capenable ^= IFCAP_RXCSUM;
+#ifdef foo
+		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
+		if (mask & IFCAP_RXCSUM_IPV6)
+			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
+#endif
+
+		if (mask & IFCAP_LRO)
+			ifp->if_capenable ^= IFCAP_LRO;
+
+		if (mask & IFCAP_TSO4) {
+			ifp->if_capenable ^= IFCAP_TSO4;
+			if (ifp->if_capenable & IFCAP_TSO4)
+				ifp->if_hwassist |= CSUM_IP_TSO;
+			else
+				ifp->if_hwassist &= ~CSUM_IP_TSO;
+		}
+		if (mask & IFCAP_TSO6) {
+			ifp->if_capenable ^= IFCAP_TSO6;
+			if (ifp->if_capenable & IFCAP_TSO6)
+				ifp->if_hwassist |= CSUM_IP6_TSO;
+			else
+				ifp->if_hwassist &= ~CSUM_IP6_TSO;
+		}
+
+		HN_UNLOCK(sc);
+		break;
+
+	case SIOCADDMULTI:
+	case SIOCDELMULTI:
+		HN_LOCK(sc);
+
+		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
+			HN_UNLOCK(sc);
+			break;
+		}
+		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+			/*
+			 * Multicast uses mutex; use busy-wait for
+			 * the RNDIS reply.
+			 */
+			HN_NO_SLEEPING(sc);
+			hn_rxfilter_config(sc);
+			HN_SLEEPING_OK(sc);
+		}
+
+		/* XXX vlan(4) style mcast addr maintenance */
+		if (hn_xpnt_vf_isready(sc)) {
+			int old_if_flags;
+
+			old_if_flags = sc->hn_vf_ifp->if_flags;
+			hn_xpnt_vf_saveifflags(sc);
+
+			/* Only push flags to the VF if IFF_ALLMULTI changed. */
+			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
+			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
+			     IFF_ALLMULTI))
+				error = hn_xpnt_vf_iocsetflags(sc);
+		}
+
+		HN_UNLOCK(sc);
+		break;
+
+	case SIOCSIFMEDIA:
+	case SIOCGIFMEDIA:
+		HN_LOCK(sc);
+		if (hn_xpnt_vf_isready(sc)) {
+			/*
+			 * SIOCGIFMEDIA expects ifmediareq, so don't
+			 * create and pass ifr_vf to the VF here; just
+			 * replace the ifr_name.
+			 */
+			vf_ifp = sc->hn_vf_ifp;
+			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
+			    sizeof(ifr->ifr_name));
+			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
+			/* Restore the ifr_name. */
+			strlcpy(ifr->ifr_name, ifp->if_xname,
+			    sizeof(ifr->ifr_name));
+			HN_UNLOCK(sc);
+			break;
+		}
+		HN_UNLOCK(sc);
+		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
+		break;
+
+	case SIOCGIFRSSHASH:
+		ifrh = (struct ifrsshash *)data;
+		HN_LOCK(sc);
+		/* A single RX ring in use is reported as "no RSS". */
+		if (sc->hn_rx_ring_inuse == 1) {
+			HN_UNLOCK(sc);
+			ifrh->ifrh_func = RSS_FUNC_NONE;
+			ifrh->ifrh_types = 0;
+			break;
+		}
+
+		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
+			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
+		else
+			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
+		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
+		HN_UNLOCK(sc);
+		break;
+
+	case SIOCGIFRSSKEY:
+		ifrk = (struct ifrsskey *)data;
+		HN_LOCK(sc);
+		/* A single RX ring in use is reported as "no RSS". */
+		if (sc->hn_rx_ring_inuse == 1) {
+			HN_UNLOCK(sc);
+			ifrk->ifrk_func = RSS_FUNC_NONE;
+			ifrk->ifrk_keylen = 0;
+			break;
+		}
+		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
+			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
+		else
+			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
+		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
+		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
+		    NDIS_HASH_KEYSIZE_TOEPLITZ);
+		HN_UNLOCK(sc);
+		break;
+
+	default:
+		error = ether_ioctl(ifp, cmd, data);
+		break;
+	}
+	return (error);
+}
+
+/*
+ * Stop the interface: clear IFF_DRV_RUNNING, disable polling, take down
+ * an enabled transparent mode VF, suspend data transfers and clear the
+ * OACTIVE state of the interface and of every TX ring in use.
+ *
+ * Must be called with the softc lock held and with the synthetic parts
+ * attached (both are asserted).  When 'detaching' is false and the
+ * non-transparent mode VF is active, the RX filter is reconfigured at
+ * the end.
+ */
+static void
+hn_stop(struct hn_softc *sc, bool detaching)
+{
+	struct ifnet *ifp = sc->hn_ifp;
+	int i;
+
+	HN_LOCK_ASSERT(sc);
+
+	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
+	    ("synthetic parts were not attached"));
+
+	/* Clear RUNNING bit ASAP. */
+	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
+
+	/* Disable polling. */
+	hn_polling(sc, 0);
+
+	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
+		KASSERT(sc->hn_vf_ifp != NULL,
+		    ("%s: VF is not attached", ifp->if_xname));
+
+		/* Mark transparent mode VF as disabled. */
+		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
+
+		/*
+		 * NOTE:
+		 * Datapath setting must happen _before_ bringing
+		 * the VF down.
+		 */
+		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
+
+		/*
+		 * Bring the VF down.
+		 */
+		hn_xpnt_vf_saveifflags(sc);
+		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
+		/* The result of pushing the flags to the VF is ignored here. */
+		hn_xpnt_vf_iocsetflags(sc);
+	}
+
+	/* Suspend data transfers. */
+	hn_suspend_data(sc);
+
+	/* Clear OACTIVE bit. */
+	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
+		sc->hn_tx_ring[i].hn_oactive = 0;
+
+	/*
+	 * If the non-transparent mode VF is active, make sure
+	 * that the RX filter still allows packet reception.
+	 */
+	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
+		hn_rxfilter_config(sc);
+}
+
+/*
+ * Bring the interface up; the softc lock must be held (asserted).
+ *
+ * Does nothing if the synthetic parts are not attached or the interface
+ * is already running.  Otherwise the RX filter is configured, the OACTIVE
+ * state is cleared, TX is resumed, a ready transparent mode VF is
+ * initialized, and only then is IFF_DRV_RUNNING set.  Polling is
+ * re-enabled afterwards if a polling rate is configured.
+ */
+static void
+hn_init_locked(struct hn_softc *sc)
+{
+	struct ifnet *ifp = sc->hn_ifp;
+	int i;
+
+	HN_LOCK_ASSERT(sc);
+
+	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
+		return;
+
+	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+		return;
+
+	/* Configure RX filter */
+	hn_rxfilter_config(sc);
+
+	/* Clear OACTIVE bit. */
+	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
+		sc->hn_tx_ring[i].hn_oactive = 0;
+
+	/* Clear TX 'suspended' bit. */
+	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
+
+	if (hn_xpnt_vf_isready(sc)) {
+		/* Initialize transparent VF. */
+		hn_xpnt_vf_init(sc);
+	}
+
+	/* Everything is ready; unleash! */
+	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
+
+	/* Re-enable polling if requested. */
+	if (sc->hn_pollhz > 0)
+		hn_polling(sc, sc->hn_pollhz);
+}
+
+/*
+ * Locking wrapper around hn_init_locked(); xsc is the hn_softc.
+ */
+static void
+hn_init(void *xsc)
+{
+	struct hn_softc *softc;
+
+	softc = xsc;
+
+	HN_LOCK(softc);
+	hn_init_locked(softc);
+	HN_UNLOCK(softc);
+}
+
+#if __FreeBSD_version >= 1100099
+
+/*
+ * Sysctl handler for the LRO length limit.
+ *
+ * Reads report the limit of RX ring 0.  A new value must lie within
+ * [HN_LRO_LENLIM_MIN(ifp), TCP_LRO_LENGTH_MAX], otherwise EINVAL is
+ * returned; an accepted value is applied through hn_set_lro_lenlim().
+ */
+static int
+hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	unsigned int limit = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
+	int error;
+
+	error = sysctl_handle_int(oidp, &limit, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/* The range check depends on the ifnet, so do it under the lock. */
+	HN_LOCK(sc);
+	if (limit >= HN_LRO_LENLIM_MIN(sc->hn_ifp) &&
+	    limit <= TCP_LRO_LENGTH_MAX)
+		hn_set_lro_lenlim(sc, limit);
+	else
+		error = EINVAL;
+	HN_UNLOCK(sc);
+
+	return (error);
+}
+
+/*
+ * Sysctl handler for the LRO ACK aggregation limit.
+ *
+ * The value exposed to userland is an aggregation limit, i.e. the
+ * stored lro_ackcnt_lim (an append count limit) plus one.  Accepted
+ * values are 2 .. TCP_LRO_ACKCNT_MAX + 1; anything else yields EINVAL.
+ * A new value is applied to every RX ring.
+ */
+static int
+hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int agg_lim, error, ring;
+
+	/* Append count limit -> aggregation limit. */
+	agg_lim = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
+	error = sysctl_handle_int(oidp, &agg_lim, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	if (agg_lim < 2 || agg_lim > (TCP_LRO_ACKCNT_MAX + 1))
+		return (EINVAL);
+
+	/* Aggregation limit -> append count limit, for all rings. */
+	HN_LOCK(sc);
+	for (ring = 0; ring < sc->hn_rx_ring_cnt; ++ring)
+		sc->hn_rx_ring[ring].hn_lro.lro_ackcnt_lim = agg_lim - 1;
+	HN_UNLOCK(sc);
+
+	return (0);
+}
+
+#endif
+
+/*
+ * Sysctl handler toggling the "trust host checksum" bit(s) given in arg2.
+ *
+ * Reads report 1 if any of the bits is set on RX ring 0, else 0.  A
+ * non-zero write sets the bits on every RX ring; a zero write clears
+ * them.
+ */
+static int
+hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int hcsum = arg2;
+	int enabled, error, ring;
+
+	enabled = (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) ? 1 : 0;
+
+	error = sysctl_handle_int(oidp, &enabled, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	HN_LOCK(sc);
+	for (ring = 0; ring < sc->hn_rx_ring_cnt; ++ring) {
+		if (enabled)
+			sc->hn_rx_ring[ring].hn_trust_hcsum |= hcsum;
+		else
+			sc->hn_rx_ring[ring].hn_trust_hcsum &= ~hcsum;
+	}
+	HN_UNLOCK(sc);
+
+	return (0);
+}
+
+/*
+ * Sysctl handler for the chimney sending size.
+ *
+ * Reads report the size used by TX ring 0.  A new value must be
+ * positive and no larger than sc->hn_chim_szmax, otherwise EINVAL is
+ * returned; an accepted value is applied through hn_set_chim_size().
+ */
+static int
+hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int error, new_size;
+
+	new_size = sc->hn_tx_ring[0].hn_chim_size;
+	error = sysctl_handle_int(oidp, &new_size, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	if (new_size <= 0 || new_size > sc->hn_chim_szmax)
+		return (EINVAL);
+
+	HN_LOCK(sc);
+	hn_set_chim_size(sc, new_size);
+	HN_UNLOCK(sc);
+
+	return (0);
+}
+
+#if __FreeBSD_version < 1100095
+/*
+ * Sysctl handler for an int-typed per-RX-ring statistic that lives at
+ * byte offset arg2 inside struct hn_rx_ring.
+ *
+ * Reads report the sum over all RX rings as a 64-bit value.  Any
+ * successful write resets the statistic to zero on every ring; the
+ * written value itself is not used.
+ */
+static int
+hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int field_ofs = arg2;
+	uint64_t total = 0;
+	int *fieldp;
+	int error, ring;
+
+	for (ring = 0; ring < sc->hn_rx_ring_cnt; ++ring) {
+		fieldp = (int *)((uint8_t *)&sc->hn_rx_ring[ring] +
+		    field_ofs);
+		total += *fieldp;
+	}
+
+	error = sysctl_handle_64(oidp, &total, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/* Zero out this stat. */
+	for (ring = 0; ring < sc->hn_rx_ring_cnt; ++ring) {
+		fieldp = (int *)((uint8_t *)&sc->hn_rx_ring[ring] +
+		    field_ofs);
+		*fieldp = 0;
+	}
+	return (0);
+}
+#else
+/*
+ * Sysctl handler for a uint64_t per-RX-ring statistic that lives at
+ * byte offset arg2 inside struct hn_rx_ring.
+ *
+ * Reads report the sum over all RX rings.  Any successful write resets
+ * the statistic to zero on every ring; the written value itself is not
+ * used.
+ */
+static int
+hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int field_ofs = arg2;
+	uint64_t total = 0;
+	uint64_t *fieldp;
+	int error, ring;
+
+	for (ring = 0; ring < sc->hn_rx_ring_cnt; ++ring) {
+		fieldp = (uint64_t *)((uint8_t *)&sc->hn_rx_ring[ring] +
+		    field_ofs);
+		total += *fieldp;
+	}
+
+	error = sysctl_handle_64(oidp, &total, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/* Zero out this stat. */
+	for (ring = 0; ring < sc->hn_rx_ring_cnt; ++ring) {
+		fieldp = (uint64_t *)((uint8_t *)&sc->hn_rx_ring[ring] +
+		    field_ofs);
+		*fieldp = 0;
+	}
+	return (0);
+}
+
+#endif
+
+/*
+ * Sysctl handler for a u_long per-RX-ring statistic that lives at byte
+ * offset arg2 inside struct hn_rx_ring.
+ *
+ * Reads report the sum over all RX rings.  Any successful write resets
+ * the statistic to zero on every ring; the written value itself is not
+ * used.
+ */
+static int
+hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int field_ofs = arg2;
+	u_long total = 0;
+	u_long *fieldp;
+	int error, ring;
+
+	for (ring = 0; ring < sc->hn_rx_ring_cnt; ++ring) {
+		fieldp = (u_long *)((uint8_t *)&sc->hn_rx_ring[ring] +
+		    field_ofs);
+		total += *fieldp;
+	}
+
+	error = sysctl_handle_long(oidp, &total, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/* Zero out this stat. */
+	for (ring = 0; ring < sc->hn_rx_ring_cnt; ++ring) {
+		fieldp = (u_long *)((uint8_t *)&sc->hn_rx_ring[ring] +
+		    field_ofs);
+		*fieldp = 0;
+	}
+	return (0);
+}
+
+/*
+ * Sysctl handler for a u_long per-TX-ring statistic that lives at byte
+ * offset arg2 inside struct hn_tx_ring.
+ *
+ * Reads report the sum over all TX rings.  Any successful write resets
+ * the statistic to zero on every ring; the written value itself is not
+ * used.
+ */
+static int
+hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int field_ofs = arg2;
+	u_long total = 0;
+	u_long *fieldp;
+	int error, ring;
+
+	for (ring = 0; ring < sc->hn_tx_ring_cnt; ++ring) {
+		fieldp = (u_long *)((uint8_t *)&sc->hn_tx_ring[ring] +
+		    field_ofs);
+		total += *fieldp;
+	}
+
+	error = sysctl_handle_long(oidp, &total, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	/* Zero out this stat. */
+	for (ring = 0; ring < sc->hn_tx_ring_cnt; ++ring) {
+		fieldp = (u_long *)((uint8_t *)&sc->hn_tx_ring[ring] +
+		    field_ofs);
+		*fieldp = 0;
+	}
+	return (0);
+}
+
+/*
+ * Sysctl handler for an int-typed per-TX-ring setting that lives at
+ * byte offset arg2 inside struct hn_tx_ring.
+ *
+ * Reads report the value of TX ring 0; a write stores the new value
+ * into every TX ring.  No range checking is done here.
+ */
+static int
+hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int field_ofs = arg2;
+	int error, ring, value;
+
+	value = *(int *)((uint8_t *)&sc->hn_tx_ring[0] + field_ofs);
+
+	error = sysctl_handle_int(oidp, &value, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	HN_LOCK(sc);
+	for (ring = 0; ring < sc->hn_tx_ring_cnt; ++ring) {
+		*(int *)((uint8_t *)&sc->hn_tx_ring[ring] + field_ofs) =
+		    value;
+	}
+	HN_UNLOCK(sc);
+
+	return (0);
+}
+
+/*
+ * Sysctl handler for the TX aggregation size (sc->hn_agg_size).
+ *
+ * A write stores the new value and lets hn_set_txagg() recompute the
+ * aggregation settings; the value is not range-checked here.
+ */
+static int
+hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int agg_size, error;
+
+	agg_size = sc->hn_agg_size;
+	error = sysctl_handle_int(oidp, &agg_size, 0, req);
+	if (error == 0 && req->newptr != NULL) {
+		HN_LOCK(sc);
+		sc->hn_agg_size = agg_size;
+		hn_set_txagg(sc);
+		HN_UNLOCK(sc);
+	}
+
+	return (error);
+}
+
+/*
+ * Sysctl handler for the TX aggregation packet count (sc->hn_agg_pkts).
+ *
+ * A write stores the new value and lets hn_set_txagg() recompute the
+ * aggregation settings; the value is not range-checked here.
+ */
+static int
+hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int agg_pkts, error;
+
+	agg_pkts = sc->hn_agg_pkts;
+	error = sysctl_handle_int(oidp, &agg_pkts, 0, req);
+	if (error == 0 && req->newptr != NULL) {
+		HN_LOCK(sc);
+		sc->hn_agg_pkts = agg_pkts;
+		hn_set_txagg(sc);
+		HN_UNLOCK(sc);
+	}
+
+	return (error);
+}
+
+/*
+ * Sysctl handler reporting hn_agg_pktmax of TX ring 0.  The value is
+ * handed to sysctl_handle_int() through a local copy, so nothing in the
+ * softc is modified by this handler.
+ */
+static int
+hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int pktmax = sc->hn_tx_ring[0].hn_agg_pktmax;
+
+	return (sysctl_handle_int(oidp, &pktmax, 0, req));
+}
+
+/*
+ * Sysctl handler reporting hn_agg_align of TX ring 0.  The value is
+ * handed to sysctl_handle_int() through a local copy, so nothing in the
+ * softc is modified by this handler.
+ */
+static int
+hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int agg_align = sc->hn_tx_ring[0].hn_agg_align;
+
+	return (sysctl_handle_int(oidp, &agg_align, 0, req));
+}
+
+/*
+ * Enable polling on the channel at pollhz, or disable polling when
+ * pollhz is 0.
+ */
+static void
+hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
+{
+	if (pollhz != 0)
+		vmbus_chan_poll_enable(chan, pollhz);
+	else
+		vmbus_chan_poll_disable(chan);
+}
+
+/*
+ * Apply the polling rate pollhz (0 disables polling) to every channel
+ * in use: the sub-channels first, then the primary channel.  The softc
+ * lock must be held (asserted).
+ */
+static void
+hn_polling(struct hn_softc *sc, u_int pollhz)
+{
+	struct vmbus_channel **subchans;
+	int idx, subchan_cnt;
+
+	/* One RX ring is served by the primary channel. */
+	subchan_cnt = sc->hn_rx_ring_inuse - 1;
+
+	HN_LOCK_ASSERT(sc);
+
+	if (subchan_cnt > 0) {
+		subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
+		for (idx = 0; idx < subchan_cnt; ++idx)
+			hn_chan_polling(subchans[idx], pollhz);
+		vmbus_subchan_rel(subchans, subchan_cnt);
+	}
+	hn_chan_polling(sc->hn_prichan, pollhz);
+}
+
+/*
+ * Sysctl handler for the channel polling rate (sc->hn_pollhz).
+ *
+ * 0 means "no polling"; any other value must lie within
+ * [VMBUS_CHAN_POLLHZ_MIN, VMBUS_CHAN_POLLHZ_MAX] or EINVAL is returned.
+ * A changed value is always saved, but it is only pushed to the
+ * channels right away if the interface is running and the synthetic
+ * parts are attached.
+ */
+static int
+hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int error, hz_req;
+
+	hz_req = sc->hn_pollhz;
+	error = sysctl_handle_int(oidp, &hz_req, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	if (hz_req != 0) {
+		if (hz_req < VMBUS_CHAN_POLLHZ_MIN ||
+		    hz_req > VMBUS_CHAN_POLLHZ_MAX)
+			return (EINVAL);
+	}
+
+	HN_LOCK(sc);
+	if (sc->hn_pollhz != hz_req) {
+		sc->hn_pollhz = hz_req;
+		if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
+			if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
+				hn_polling(sc, sc->hn_pollhz);
+		}
+	}
+	HN_UNLOCK(sc);
+
+	return (0);
+}
+
+/*
+ * Sysctl handler reporting the NDIS version as a "major.minor" string
+ * built from sc->hn_ndis_ver.
+ */
+static int
+hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	char buf[16];
+
+	snprintf(buf, sizeof(buf), "%u.%u",
+	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
+	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
+
+	return (sysctl_handle_string(oidp, buf, sizeof(buf), req));
+}
+
+/*
+ * Sysctl handler reporting sc->hn_caps as a string, formatted with the
+ * kernel's "%b" conversion and the HN_CAP_BITS description.  The value
+ * is sampled under the softc lock.
+ */
+static int
+hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	char buf[128];
+	uint32_t bits;
+
+	HN_LOCK(sc);
+	bits = sc->hn_caps;
+	HN_UNLOCK(sc);
+
+	snprintf(buf, sizeof(buf), "%b", bits, HN_CAP_BITS);
+	return (sysctl_handle_string(oidp, buf, sizeof(buf), req));
+}
+
+/*
+ * Sysctl handler reporting the interface's if_hwassist as a string,
+ * formatted with the kernel's "%b" conversion and the CSUM_BITS
+ * description.  The value is sampled under the softc lock.
+ */
+static int
+hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	char buf[128];
+	uint32_t bits;
+
+	HN_LOCK(sc);
+	bits = sc->hn_ifp->if_hwassist;
+	HN_UNLOCK(sc);
+
+	snprintf(buf, sizeof(buf), "%b", bits, CSUM_BITS);
+	return (sysctl_handle_string(oidp, buf, sizeof(buf), req));
+}
+
+/*
+ * Sysctl handler reporting sc->hn_rx_filter as a string, formatted with
+ * the kernel's "%b" conversion and the NDIS_PACKET_TYPES description.
+ * The value is sampled under the softc lock.
+ */
+static int
+hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	char buf[128];
+	uint32_t bits;
+
+	HN_LOCK(sc);
+	bits = sc->hn_rx_filter;
+	HN_UNLOCK(sc);
+
+	snprintf(buf, sizeof(buf), "%b", bits, NDIS_PACKET_TYPES);
+	return (sysctl_handle_string(oidp, buf, sizeof(buf), req));
+}
+
+#ifndef RSS
+
+/*
+ * Sysctl handler for the RSS key (sc->hn_rss.rss_key).
+ *
+ * Reads copy out the current key.  Writes are refused with EBUSY while
+ * a VF is involved (HN_FLAG_RXVF set, or transparent VF mode with a VF
+ * attached).  Otherwise the new key is copied in, HN_FLAG_HAS_RSSKEY is
+ * set, and RSS is reconfigured if more than one RX ring is in use; with
+ * a single ring the key is only saved.
+ */
+static int
+hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int error;
+
+	HN_LOCK(sc);
+
+	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
+	if (error != 0 || req->newptr == NULL)
+		goto done;
+
+	/*
+	 * RSS key is synchronized w/ VF's, don't allow users
+	 * to change it.
+	 */
+	if ((sc->hn_flags & HN_FLAG_RXVF) ||
+	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
+		error = EBUSY;
+		goto done;
+	}
+
+	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
+	if (error != 0)
+		goto done;
+	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
+
+	/*
+	 * Not RSS capable with a single ring, at least for now; the key
+	 * has been saved above and error is still 0 in that case.
+	 */
+	if (sc->hn_rx_ring_inuse > 1)
+		error = hn_rss_reconfig(sc);
+done:
+	HN_UNLOCK(sc);
+	return (error);
+}
+
+/*
+ * Sysctl handler for the RSS indirect table (sc->hn_rss.rss_ind).
+ *
+ * Reads copy out the current table.  Writes are refused with
+ * EOPNOTSUPP while only one RX ring is in use.  Otherwise the new table
+ * is copied in, HN_FLAG_HAS_RSSIND is set, the table is fixed up by
+ * hn_rss_ind_fixup() and RSS is reconfigured.
+ */
+static int
+hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int error;
+
+	HN_LOCK(sc);
+
+	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
+	if (error != 0 || req->newptr == NULL)
+		goto done;
+
+	/*
+	 * Don't allow RSS indirect table change, if this interface is not
+	 * RSS capable currently.
+	 */
+	if (sc->hn_rx_ring_inuse == 1) {
+		error = EOPNOTSUPP;
+		goto done;
+	}
+
+	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
+	if (error == 0) {
+		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
+
+		hn_rss_ind_fixup(sc);
+		error = hn_rss_reconfig(sc);
+	}
+done:
+	HN_UNLOCK(sc);
+	return (error);
+}
+
+#endif	/* !RSS */
+
+/*
+ * Sysctl handler: export sc->hn_rss_hash as a "%b" formatted string
+ * decoded with NDIS_HASH_BITS.
+ */
+static int
+hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	uint32_t hash_bits;
+	char buf[128];
+
+	/* Read the value under the softc lock; format it afterwards. */
+	HN_LOCK(sc);
+	hash_bits = sc->hn_rss_hash;
+	HN_UNLOCK(sc);
+
+	snprintf(buf, sizeof(buf), "%b", hash_bits, NDIS_HASH_BITS);
+	return (sysctl_handle_string(oidp, buf, sizeof(buf), req));
+}
+
+/*
+ * Sysctl handler: export sc->hn_rss_hcap as a "%b" formatted string
+ * decoded with NDIS_HASH_BITS.
+ */
+static int
+hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	uint32_t hcap_bits;
+	char buf[128];
+
+	/* Read the value under the softc lock; format it afterwards. */
+	HN_LOCK(sc);
+	hcap_bits = sc->hn_rss_hcap;
+	HN_UNLOCK(sc);
+
+	snprintf(buf, sizeof(buf), "%b", hcap_bits, NDIS_HASH_BITS);
+	return (sysctl_handle_string(oidp, buf, sizeof(buf), req));
+}
+
+/*
+ * Sysctl handler: export the hn_mbuf_hash value of RX ring 0 as a "%b"
+ * formatted string decoded with NDIS_HASH_BITS.
+ */
+static int
+hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	uint32_t mbuf_hash;
+	char buf[128];
+
+	/* Only the first RX ring is consulted. */
+	HN_LOCK(sc);
+	mbuf_hash = sc->hn_rx_ring[0].hn_mbuf_hash;
+	HN_UNLOCK(sc);
+
+	snprintf(buf, sizeof(buf), "%b", mbuf_hash, NDIS_HASH_BITS);
+	return (sysctl_handle_string(oidp, buf, sizeof(buf), req));
+}
+
+/*
+ * Sysctl handler: export the name of the interface recorded in
+ * sc->hn_vf_ifp, or an empty string when none is set.
+ */
+static int
+hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	struct ifnet *vf;
+	char name[IFNAMSIZ + 1];
+
+	/* Copy the name while the softc lock is held. */
+	HN_LOCK(sc);
+	vf = sc->hn_vf_ifp;
+	if (vf == NULL)
+		name[0] = '\0';
+	else
+		snprintf(name, sizeof(name), "%s", vf->if_xname);
+	HN_UNLOCK(sc);
+
+	return (sysctl_handle_string(oidp, name, sizeof(name), req));
+}
+
+/*
+ * Sysctl handler: export the name of the interface recorded in RX ring
+ * 0's hn_rxvf_ifp, or an empty string when none is set.
+ */
+static int
+hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	struct ifnet *rxvf;
+	char name[IFNAMSIZ + 1];
+
+	/* Copy the name while the softc lock is held. */
+	HN_LOCK(sc);
+	rxvf = sc->hn_rx_ring[0].hn_rxvf_ifp;
+	if (rxvf == NULL)
+		name[0] = '\0';
+	else
+		snprintf(name, sizeof(name), "%s", rxvf->if_xname);
+	HN_UNLOCK(sc);
+
+	return (sysctl_handle_string(oidp, name, sizeof(name), req));
+}
+
+/*
+ * Sysctl handler: emit a space separated list of the interface names
+ * whose ifindex has a non-NULL entry in hn_vfmap[].
+ *
+ * The old buffer is wired before hn_vfmap_lock is read-locked and the
+ * sbuf is filled.  Returns 0 on success, ENOMEM if the sbuf cannot be
+ * created, or the error from sysctl_wire_old_buffer()/sbuf_finish().
+ */
+static int
+hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct rm_priotracker tracker;
+	struct sbuf *sb;
+	bool first = true;
+	int error, idx;
+
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		return (error);
+
+	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
+	if (sb == NULL)
+		return (ENOMEM);
+
+	rm_rlock(&hn_vfmap_lock, &tracker);
+	for (idx = 0; idx < hn_vfmap_size; ++idx) {
+		struct ifnet *ifp;
+
+		if (hn_vfmap[idx] == NULL)
+			continue;
+
+		/* The map is indexed by ifindex; skip vanished interfaces. */
+		ifp = ifnet_byindex(idx);
+		if (ifp == NULL)
+			continue;
+
+		if (first) {
+			sbuf_printf(sb, "%s", ifp->if_xname);
+			first = false;
+		} else {
+			sbuf_printf(sb, " %s", ifp->if_xname);
+		}
+	}
+	rm_runlock(&hn_vfmap_lock, &tracker);
+
+	error = sbuf_finish(sb);
+	sbuf_delete(sb);
+	return (error);
+}
+
+/*
+ * Sysctl handler: emit a space separated list of "NAME:HN_NAME" pairs,
+ * one per non-NULL hn_vfmap[] entry, where NAME is the interface with
+ * that ifindex and HN_NAME is the interface stored in the map entry.
+ *
+ * The old buffer is wired before hn_vfmap_lock is read-locked and the
+ * sbuf is filled.  Returns 0 on success, ENOMEM if the sbuf cannot be
+ * created, or the error from sysctl_wire_old_buffer()/sbuf_finish().
+ */
+static int
+hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct rm_priotracker tracker;
+	struct sbuf *sb;
+	bool first = true;
+	int error, idx;
+
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		return (error);
+
+	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
+	if (sb == NULL)
+		return (ENOMEM);
+
+	rm_rlock(&hn_vfmap_lock, &tracker);
+	for (idx = 0; idx < hn_vfmap_size; ++idx) {
+		struct ifnet *ifp, *mapped_ifp;
+
+		mapped_ifp = hn_vfmap[idx];
+		if (mapped_ifp == NULL)
+			continue;
+
+		/* The map is indexed by ifindex; skip vanished interfaces. */
+		ifp = ifnet_byindex(idx);
+		if (ifp == NULL)
+			continue;
+
+		if (first) {
+			sbuf_printf(sb, "%s:%s", ifp->if_xname,
+			    mapped_ifp->if_xname);
+			first = false;
+		} else {
+			sbuf_printf(sb, " %s:%s", ifp->if_xname,
+			    mapped_ifp->if_xname);
+		}
+	}
+	rm_runlock(&hn_vfmap_lock, &tracker);
+
+	error = sbuf_finish(sb);
+	sbuf_delete(sb);
+	return (error);
+}
+
+/*
+ * Sysctl handler for the HN_XVFFLAG_ACCBPF bit of sc->hn_xvf_flags.
+ *
+ * Reports 1 when the bit is set and 0 otherwise.  On write, a non-zero
+ * value sets the bit and zero clears it; the update is done with both
+ * the softc lock and the hn_vf_lock write lock held.
+ */
+static int
+hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int error, onoff;
+
+	onoff = (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) ? 1 : 0;
+	error = sysctl_handle_int(oidp, &onoff, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	HN_LOCK(sc);
+	/* NOTE: hn_vf_lock for hn_transmit() */
+	rm_wlock(&sc->hn_vf_lock);
+	if (onoff != 0)
+		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
+	else
+		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
+	rm_wunlock(&sc->hn_vf_lock);
+	HN_UNLOCK(sc);
+
+	return (0);
+}
+
+/*
+ * Sysctl handler: report 1 when HN_XVFFLAG_ENABLED is set in
+ * sc->hn_xvf_flags and 0 otherwise.  A value written by the caller only
+ * lands in the local variable and is discarded.
+ */
+static int
+hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int enabled;
+
+	enabled = (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) ? 1 : 0;
+	return (sysctl_handle_int(oidp, &enabled, 0, req));
+}
+
+/*
+ * Validate the IPv4 header located at offset hoff of mbuf m and, for
+ * TCP and UDP, the L4 header that follows it.
+ *
+ * Returns the IP protocol number (ip_p) when all length checks pass.
+ * Returns IPPROTO_DONE when the packet is too short, when the headers
+ * are not completely contained in the first mbuf, or when the packet is
+ * an IP fragment.
+ */
+static int
+hn_check_iplen(const struct mbuf *m, int hoff)
+{
+	const struct tcphdr *tcp;
+	const struct ip *iph;
+	int minlen, ip_hlen, ip_totlen;
+	int tcp_hlen;				/* TCP data offset */
+
+	minlen = hoff + sizeof(struct ip);
+
+	/*
+	 * The packet must be at least the size of an IP header, and the
+	 * fixed IP header must reside completely in the first mbuf.
+	 */
+	if (m->m_pkthdr.len < minlen || m->m_len < minlen)
+		return (IPPROTO_DONE);
+
+	iph = mtodo(m, hoff);
+
+	/* Bound check the packet's stated IP header length. */
+	ip_hlen = iph->ip_hl << 2;
+	if (ip_hlen < sizeof(struct ip))	/* minimum header length */
+		return (IPPROTO_DONE);
+
+	/* The full IP header must reside completely in the one mbuf. */
+	if (m->m_len < hoff + ip_hlen)
+		return (IPPROTO_DONE);
+
+	/*
+	 * Check that the amount of data in the buffers is at least
+	 * as much as the IP header would have us expect.
+	 */
+	ip_totlen = ntohs(iph->ip_len);
+	if (m->m_pkthdr.len < hoff + ip_totlen)
+		return (IPPROTO_DONE);
+
+	/* Ignore IP fragments. */
+	if ((ntohs(iph->ip_off) & (IP_OFFMASK | IP_MF)) != 0)
+		return (IPPROTO_DONE);
+
+	/*
+	 * The TCP/IP or UDP/IP header must be entirely contained within
+	 * the first fragment of a packet.
+	 */
+	switch (iph->ip_p) {
+	case IPPROTO_TCP:
+		if (ip_totlen < ip_hlen + sizeof(struct tcphdr) ||
+		    m->m_len < hoff + ip_hlen + sizeof(struct tcphdr))
+			return (IPPROTO_DONE);
+
+		/* Now the fixed TCP header is readable; check its options. */
+		tcp = (const struct tcphdr *)((const uint8_t *)iph + ip_hlen);
+		tcp_hlen = tcp->th_off << 2;
+		if (tcp_hlen < sizeof(struct tcphdr) ||
+		    tcp_hlen + ip_hlen > ip_totlen)
+			return (IPPROTO_DONE);
+		if (m->m_len < hoff + ip_hlen + tcp_hlen)
+			return (IPPROTO_DONE);
+		break;
+
+	case IPPROTO_UDP:
+		if (ip_totlen < ip_hlen + sizeof(struct udphdr) ||
+		    m->m_len < hoff + ip_hlen + sizeof(struct udphdr))
+			return (IPPROTO_DONE);
+		break;
+
+	default:
+		if (ip_totlen < ip_hlen)
+			return (IPPROTO_DONE);
+		break;
+	}
+
+	return (iph->ip_p);
+}
+
+/*
+ * Determine the L3 ethertype and the L4 protocol of a received frame.
+ *
+ * *l3proto receives the ethertype (the encapsulated one for a frame
+ * whose ethertype is ETHERTYPE_VLAN).  *l4proto receives the result of
+ * hn_check_iplen() for ETHERTYPE_IP and IPPROTO_DONE for anything else.
+ * If a VLAN frame's first mbuf is shorter than the VLAN header, the
+ * function returns without writing either output.
+ */
+static void
+hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
+{
+	const struct ether_header *eh;
+	uint16_t proto;
+	int l2len;
+
+	l2len = sizeof(*eh);
+	/* NOTE(review): presumably the caller verified this length; confirm. */
+	KASSERT(m_new->m_len >= l2len, ("not ethernet frame"));
+
+	eh = mtod(m_new, const struct ether_header *);
+	proto = ntohs(eh->ether_type);
+	if (proto == ETHERTYPE_VLAN) {
+		const struct ether_vlan_header *evl;
+
+		/* The L3 header starts after the longer VLAN header. */
+		l2len = sizeof(*evl);
+		if (m_new->m_len < l2len)
+			return;
+		evl = mtod(m_new, const struct ether_vlan_header *);
+		proto = ntohs(evl->evl_proto);
+	}
+
+	*l3proto = proto;
+	*l4proto = (proto == ETHERTYPE_IP) ?
+	    hn_check_iplen(m_new, l2len) : IPPROTO_DONE;
+}
+
+/*
+ * Allocate and initialize the RX side of the softc for ring_cnt rings.
+ *
+ * Allocates the RXBUF, the hn_rx_ring array and, for each ring, its
+ * bufring memory, packet buffer and LRO state.  It also creates the "rx"
+ * sysctl node with one child node per ring, and registers the RX related
+ * sysctl nodes directly under the device's sysctl tree.
+ *
+ * Returns 0 on success, or ENOMEM if the RXBUF or a bufring allocation
+ * fails.
+ * NOTE(review): the failure paths return without releasing what was
+ * already allocated; presumably the caller tears everything down through
+ * hn_destroy_rx_data() -- confirm.
+ */
+static int
+hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
+{
+	struct sysctl_oid_list *child;
+	struct sysctl_ctx_list *ctx;
+	device_t dev = sc->hn_dev;
+#if defined(INET) || defined(INET6)
+#if __FreeBSD_version >= 1100095
+	int lroent_cnt;
+#endif
+#endif
+	int i;
+
+	/*
+	 * Create RXBUF for reception.
+	 *
+	 * NOTE:
+	 * - It is shared by all channels.
+	 * - A large enough buffer is allocated, certain version of NVSes
+	 *   may further limit the usable space.
+	 */
+	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
+	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
+	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
+	if (sc->hn_rxbuf == NULL) {
+		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
+		return (ENOMEM);
+	}
+
+	/* All created rings start out as in use. */
+	sc->hn_rx_ring_cnt = ring_cnt;
+	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
+
+	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
+	    M_DEVBUF, M_WAITOK | M_ZERO);
+
+#if defined(INET) || defined(INET6)
+#if __FreeBSD_version >= 1100095
+	/* Use the tunable LRO entry count, but never below TCP_LRO_ENTRIES. */
+	lroent_cnt = hn_lro_entry_count;
+	if (lroent_cnt < TCP_LRO_ENTRIES)
+		lroent_cnt = TCP_LRO_ENTRIES;
+	if (bootverbose)
+		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
+#endif
+#endif	/* INET || INET6 */
+
+	ctx = device_get_sysctl_ctx(dev);
+	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+
+	/* Create dev.hn.UNIT.rx sysctl tree */
+	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
+	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+
+	/* Per-ring setup. */
+	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
+
+		/* One allocation sized for both the TX and the RX bufring. */
+		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
+		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
+		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
+		if (rxr->hn_br == NULL) {
+			device_printf(dev, "allocate bufring failed\n");
+			return (ENOMEM);
+		}
+
+		/* Seed the per-ring trust flags from the global settings. */
+		if (hn_trust_hosttcp)
+			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
+		if (hn_trust_hostudp)
+			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
+		if (hn_trust_hostip)
+			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
+		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
+		rxr->hn_ifp = sc->hn_ifp;
+		/* Pair with the TX ring of the same index, if there is one. */
+		if (i < sc->hn_tx_ring_cnt)
+			rxr->hn_txr = &sc->hn_tx_ring[i];
+		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
+		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
+		rxr->hn_rx_idx = i;
+		rxr->hn_rxbuf = sc->hn_rxbuf;
+
+		/*
+		 * Initialize LRO.
+		 */
+#if defined(INET) || defined(INET6)
+#if __FreeBSD_version >= 1100095
+		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
+		    hn_lro_mbufq_depth);
+#else
+		tcp_lro_init(&rxr->hn_lro);
+		rxr->hn_lro.ifp = sc->hn_ifp;
+#endif
+#if __FreeBSD_version >= 1100099
+		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
+		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
+#endif
+#endif	/* INET || INET6 */
+
+		/* Per-ring sysctls are skipped if the "rx" node is missing. */
+		if (sc->hn_rx_sysctl_tree != NULL) {
+			char name[16];
+
+			/*
+			 * Create per RX ring sysctl tree:
+			 * dev.hn.UNIT.rx.RINGID
+			 */
+			snprintf(name, sizeof(name), "%d", i);
+			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
+			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
+			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+
+			if (rxr->hn_rx_sysctl_tree != NULL) {
+				SYSCTL_ADD_ULONG(ctx,
+				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
+				    OID_AUTO, "packets", CTLFLAG_RW,
+				    &rxr->hn_pkts, "# of packets received");
+				SYSCTL_ADD_ULONG(ctx,
+				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
+				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
+				    &rxr->hn_rss_pkts,
+				    "# of packets w/ RSS info received");
+				SYSCTL_ADD_ULONG(ctx,
+				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
+				    OID_AUTO, "rsc_pkts", CTLFLAG_RW,
+				    &rxr->hn_rsc_pkts,
+				    "# of RSC packets received");
+				SYSCTL_ADD_ULONG(ctx,
+				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
+				    OID_AUTO, "rsc_drop", CTLFLAG_RW,
+				    &rxr->hn_rsc_drop,
+				    "# of RSC fragments dropped");
+				SYSCTL_ADD_INT(ctx,
+				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
+				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
+				    &rxr->hn_pktbuf_len, 0,
+				    "Temporary channel packet buffer length");
+			}
+		}
+	}
+
+	/*
+	 * Device level RX sysctls.  Most handlers get the softc as arg1 and
+	 * the offset of a struct hn_rx_ring field as arg2.
+	 * NOTE(review): presumably the handlers apply that offset to every
+	 * ring -- confirm against the handler implementations.
+	 */
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
+	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
+#if __FreeBSD_version < 1100095
+	    hn_rx_stat_int_sysctl,
+#else
+	    hn_rx_stat_u64_sysctl,
+#endif
+	    "LU", "LRO queued");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
+	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
+#if __FreeBSD_version < 1100095
+	    hn_rx_stat_int_sysctl,
+#else
+	    hn_rx_stat_u64_sysctl,
+#endif
+	    "LU", "LRO flushed");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_rx_ring, hn_lro_tried),
+	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
+#if __FreeBSD_version >= 1100099
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
+	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+	    hn_lro_lenlim_sysctl, "IU",
+	    "Max # of data bytes to be aggregated by LRO");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+	    hn_lro_ackcnt_sysctl, "I",
+	    "Max # of ACKs to be aggregated by LRO");
+#endif
+	/* For the trust_host* nodes arg2 is the HN_TRUST_HCSUM_* bit. */
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
+	    hn_trust_hcsum_sysctl, "I",
+	    "Trust tcp segment verification on host side, "
+	    "when csum info is missing");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
+	    hn_trust_hcsum_sysctl, "I",
+	    "Trust udp datagram verification on host side, "
+	    "when csum info is missing");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
+	    hn_trust_hcsum_sysctl, "I",
+	    "Trust ip packet verification on host side, "
+	    "when csum info is missing");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_rx_ring, hn_csum_ip),
+	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
+	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_rx_ring, hn_csum_udp),
+	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
+	    hn_rx_stat_ulong_sysctl, "LU",
+	    "# of packets that we trust host's csum verification");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_rx_ring, hn_small_pkts),
+	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_rx_ring, hn_ack_failed),
+	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
+	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
+	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
+
+	return (0);
+}
+
+/*
+ * Release the RX side resources of the softc.
+ *
+ * The RXBUF and each ring's bufring are freed only when they are not
+ * flagged as still referenced (HN_FLAG_RXBUF_REF / HN_RX_FLAG_BR_REF);
+ * otherwise a message is logged and the memory is left allocated.  For
+ * every ring that has a bufring, the LRO state and the packet buffer
+ * are released as well.  Finally the ring array is freed and the ring
+ * counters are reset to 0.
+ */
+static void
+hn_destroy_rx_data(struct hn_softc *sc)
+{
+	int i;
+
+	if (sc->hn_rxbuf != NULL) {
+		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
+			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
+		else
+			device_printf(sc->hn_dev, "RXBUF is referenced\n");
+		sc->hn_rxbuf = NULL;
+	}
+
+	/* No ring array to walk or free. */
+	if (sc->hn_rx_ring_cnt == 0)
+		return;
+
+	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
+
+		/* Rings without a bufring are skipped entirely. */
+		if (rxr->hn_br == NULL)
+			continue;
+		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
+			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
+		} else {
+			/* Terminate the line like the other messages do. */
+			device_printf(sc->hn_dev,
+			    "%dth channel bufring is referenced\n", i);
+		}
+		rxr->hn_br = NULL;
+
+#if defined(INET) || defined(INET6)
+		tcp_lro_free(&rxr->hn_lro);
+#endif
+		free(rxr->hn_pktbuf, M_DEVBUF);
+	}
+	free(sc->hn_rx_ring, M_DEVBUF);
+	sc->hn_rx_ring = NULL;
+
+	sc->hn_rx_ring_cnt = 0;
+	sc->hn_rx_ring_inuse = 0;
+}
+
+/*
+ * Initialize TX ring 'id' of the softc: its locks, the txdesc array with
+ * its free list (or buf_ring), the TX taskqueue and tasks, the busdma
+ * tags, each txdesc's RNDIS packet message memory and data DMA map, and
+ * the per-ring sysctl nodes.
+ *
+ * Returns 0 on success, or the error of the failing busdma call.
+ * NOTE(review): on failure only the resources of the txdesc being set up
+ * are released here; everything initialized earlier (locks, txdesc array,
+ * tags, previous txdescs) is left in place.  Presumably the caller cleans
+ * up via hn_tx_ring_destroy() -- confirm.
+ */
+static int
+hn_tx_ring_create(struct hn_softc *sc, int id)
+{
+	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
+	device_t dev = sc->hn_dev;
+	bus_dma_tag_t parent_dtag;
+	int error, i;
+
+	txr->hn_sc = sc;
+	txr->hn_tx_idx = id;
+
+#ifndef HN_USE_TXDESC_BUFRING
+	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
+#endif
+	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
+
+	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
+	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
+	    M_DEVBUF, M_WAITOK | M_ZERO);
+#ifndef HN_USE_TXDESC_BUFRING
+	SLIST_INIT(&txr->hn_txlist);
+#else
+	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
+	    M_WAITOK, &txr->hn_tx_lock);
+#endif
+
+	/*
+	 * Pick the taskqueue: either the vmbus event taskqueue of the CPU
+	 * this ring maps to, or one of the driver's TX taskqueues selected
+	 * round-robin by ring id.
+	 */
+	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
+		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
+		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
+	} else {
+		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
+	}
+
+	/* Install the if_start or the if_transmit flavor of the TX hooks. */
+#ifdef HN_IFSTART_SUPPORT
+	if (hn_use_if_start) {
+		txr->hn_txeof = hn_start_txeof;
+		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
+		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
+	} else
+#endif
+	{
+		int br_depth;
+
+		txr->hn_txeof = hn_xmit_txeof;
+		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
+		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
+
+		/* Only this flavor uses the mbuf buf_ring. */
+		br_depth = hn_get_txswq_depth(txr);
+		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
+		    M_WAITOK, &txr->hn_tx_lock);
+	}
+
+	txr->hn_direct_tx_size = hn_direct_tx_size;
+
+	/*
+	 * Always schedule transmission instead of trying to do direct
+	 * transmission.  This one gives the best performance so far.
+	 */
+	txr->hn_sched_tx = 1;
+
+	parent_dtag = bus_get_dma_tag(dev);
+
+	/* DMA tag for RNDIS packet messages. */
+	error = bus_dma_tag_create(parent_dtag, /* parent */
+	    HN_RNDIS_PKT_ALIGN,		/* alignment */
+	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
+	    BUS_SPACE_MAXADDR,		/* lowaddr */
+	    BUS_SPACE_MAXADDR,		/* highaddr */
+	    NULL, NULL,			/* filter, filterarg */
+	    HN_RNDIS_PKT_LEN,		/* maxsize */
+	    1,				/* nsegments */
+	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
+	    0,				/* flags */
+	    NULL,			/* lockfunc */
+	    NULL,			/* lockfuncarg */
+	    &txr->hn_tx_rndis_dtag);
+	if (error) {
+		device_printf(dev, "failed to create rndis dmatag\n");
+		return error;
+	}
+
+	/* DMA tag for data. */
+	error = bus_dma_tag_create(parent_dtag, /* parent */
+	    1,				/* alignment */
+	    HN_TX_DATA_BOUNDARY,	/* boundary */
+	    BUS_SPACE_MAXADDR,		/* lowaddr */
+	    BUS_SPACE_MAXADDR,		/* highaddr */
+	    NULL, NULL,			/* filter, filterarg */
+	    HN_TX_DATA_MAXSIZE,		/* maxsize */
+	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
+	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
+	    0,				/* flags */
+	    NULL,			/* lockfunc */
+	    NULL,			/* lockfuncarg */
+	    &txr->hn_tx_data_dtag);
+	if (error) {
+		device_printf(dev, "failed to create data dmatag\n");
+		return error;
+	}
+
+	/* Set up every txdesc and put it on the free list/ring. */
+	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
+		struct hn_txdesc *txd = &txr->hn_txdesc[i];
+
+		txd->txr = txr;
+		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
+		STAILQ_INIT(&txd->agg_list);
+
+		/*
+		 * Allocate and load RNDIS packet message.
+		 */
+        	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
+		    (void **)&txd->rndis_pkt,
+		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
+		    &txd->rndis_pkt_dmap);
+		if (error) {
+			device_printf(dev,
+			    "failed to allocate rndis_packet_msg, %d\n", i);
+			return error;
+		}
+
+		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
+		    txd->rndis_pkt_dmap,
+		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
+		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
+		    BUS_DMA_NOWAIT);
+		if (error) {
+			device_printf(dev,
+			    "failed to load rndis_packet_msg, %d\n", i);
+			/* Undo this txdesc's allocation before bailing out. */
+			bus_dmamem_free(txr->hn_tx_rndis_dtag,
+			    txd->rndis_pkt, txd->rndis_pkt_dmap);
+			return error;
+		}
+
+		/* DMA map for TX data. */
+		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
+		    &txd->data_dmap);
+		if (error) {
+			device_printf(dev,
+			    "failed to allocate tx data dmamap\n");
+			/* Undo this txdesc's load and allocation. */
+			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
+			    txd->rndis_pkt_dmap);
+			bus_dmamem_free(txr->hn_tx_rndis_dtag,
+			    txd->rndis_pkt, txd->rndis_pkt_dmap);
+			return error;
+		}
+
+		/* All set, put it to list */
+		txd->flags |= HN_TXD_FLAG_ONLIST;
+#ifndef HN_USE_TXDESC_BUFRING
+		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
+#else
+		buf_ring_enqueue(txr->hn_txdesc_br, txd);
+#endif
+	}
+	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
+
+	/* Per-ring sysctls are skipped if the parent "tx" node is missing. */
+	if (sc->hn_tx_sysctl_tree != NULL) {
+		struct sysctl_oid_list *child;
+		struct sysctl_ctx_list *ctx;
+		char name[16];
+
+		/*
+		 * Create per TX ring sysctl tree:
+		 * dev.hn.UNIT.tx.RINGID
+		 */
+		ctx = device_get_sysctl_ctx(dev);
+		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
+
+		snprintf(name, sizeof(name), "%d", id);
+		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
+		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+
+		if (txr->hn_tx_sysctl_tree != NULL) {
+			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
+
+#ifdef HN_DEBUG
+			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
+			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
+			    "# of available TX descs");
+#endif
+			/* "oactive" is exported only when if_start is not used. */
+#ifdef HN_IFSTART_SUPPORT
+			if (!hn_use_if_start)
+#endif
+			{
+				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
+				    CTLFLAG_RD, &txr->hn_oactive, 0,
+				    "over active");
+			}
+			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
+			    CTLFLAG_RW, &txr->hn_pkts,
+			    "# of packets transmitted");
+			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
+			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Release the busdma resources of one txdesc: unload and free its RNDIS
+ * packet message memory, then destroy its TX data DMA map.  The txdesc
+ * must have no mbuf attached and must not be DMA mapped.
+ */
+static void
+hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
+{
+	struct hn_tx_ring *ring;
+
+	KASSERT(txd->m == NULL, ("still has mbuf installed"));
+	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
+
+	ring = txd->txr;
+
+	/* RNDIS packet message: unload the map before freeing the memory. */
+	bus_dmamap_unload(ring->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
+	bus_dmamem_free(ring->hn_tx_rndis_dtag, txd->rndis_pkt,
+	    txd->rndis_pkt_dmap);
+
+	/* TX data map. */
+	bus_dmamap_destroy(ring->hn_tx_data_dtag, txd->data_dmap);
+}
+
+/*
+ * Drop the reference still held on a txdesc, if any.
+ *
+ * Nothing is done for a txdesc with no reference, nor for one flagged
+ * HN_TXD_FLAG_ONAGG: aggregated txds will be freed by their aggregating
+ * txd.
+ */
+static void
+hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
+{
+	int freed;
+
+	KASSERT(txd->refs == 0 || txd->refs == 1,
+	    ("invalid txd refs %d", txd->refs));
+
+	if (txd->refs <= 0)
+		return;
+	if ((txd->flags & HN_TXD_FLAG_ONAGG) != 0)
+		return;
+
+	freed = hn_txdesc_put(txr, txd);
+	KASSERT(freed, ("can't free txdesc"));
+}
+
+/*
+ * Tear down one TX ring: garbage collect pending txdescs, release their
+ * busdma resources, destroy the DMA tags, free the txdesc array and the
+ * buf_rings, and destroy the ring's mutexes.  A ring whose txdesc array
+ * is NULL is left untouched.
+ */
+static void
+hn_tx_ring_destroy(struct hn_tx_ring *txr)
+{
+	int idx;
+
+	if (txr->hn_txdesc == NULL)
+		return;
+
+	/*
+	 * NOTE:
+	 * Because the freeing of aggregated txds will be deferred
+	 * to the aggregating txd, two passes are used here:
+	 * - The first pass GCes any pending txds.  This GC is necessary,
+	 *   since if the channels are revoked, hypervisor will not
+	 *   deliver send-done for all pending txds.
+	 * - The second pass frees the busdma stuffs, i.e. after all txds
+	 *   were freed.
+	 */
+	for (idx = 0; idx < txr->hn_txdesc_cnt; ++idx) {
+		hn_txdesc_gc(txr, &txr->hn_txdesc[idx]);
+	}
+	for (idx = 0; idx < txr->hn_txdesc_cnt; ++idx) {
+		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[idx]);
+	}
+
+	/* The tags go after the maps and memory created from them. */
+	if (txr->hn_tx_data_dtag != NULL) {
+		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
+	}
+	if (txr->hn_tx_rndis_dtag != NULL) {
+		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
+	}
+
+#ifdef HN_USE_TXDESC_BUFRING
+	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
+#endif
+
+	free(txr->hn_txdesc, M_DEVBUF);
+	txr->hn_txdesc = NULL;
+
+	if (txr->hn_mbuf_br != NULL) {
+		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
+	}
+
+#ifndef HN_USE_TXDESC_BUFRING
+	mtx_destroy(&txr->hn_txlist_spin);
+#endif
+	mtx_destroy(&txr->hn_tx_lock);
+}
+
+/*
+ * Allocate and initialize the TX side of the softc for ring_cnt rings.
+ *
+ * Allocates the chimney sending buffer and the hn_tx_ring array, creates
+ * the "tx" sysctl node, initializes every ring via hn_tx_ring_create()
+ * and registers the TX related sysctl nodes directly under the device's
+ * sysctl tree.
+ *
+ * Returns 0 on success, ENOMEM if the chimney buffer allocation fails,
+ * or the error returned by hn_tx_ring_create().
+ * NOTE(review): the failure paths return without releasing what was
+ * already allocated; presumably the caller tears everything down through
+ * hn_destroy_tx_data() -- confirm.
+ */
+static int
+hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
+{
+	struct sysctl_oid_list *child;
+	struct sysctl_ctx_list *ctx;
+	int i;
+
+	/*
+	 * Create TXBUF for chimney sending.
+	 *
+	 * NOTE: It is shared by all channels.
+	 */
+	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
+	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
+	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
+	if (sc->hn_chim == NULL) {
+		device_printf(sc->hn_dev, "allocate txbuf failed\n");
+		return (ENOMEM);
+	}
+
+	/* All created rings start out as in use. */
+	sc->hn_tx_ring_cnt = ring_cnt;
+	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
+
+	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
+	    M_DEVBUF, M_WAITOK | M_ZERO);
+
+	ctx = device_get_sysctl_ctx(sc->hn_dev);
+	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
+
+	/* Create dev.hn.UNIT.tx sysctl tree */
+	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
+	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+
+	/* The first ring that fails to initialize aborts the whole setup. */
+	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+		int error;
+
+		error = hn_tx_ring_create(sc, i);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Device level TX sysctls.  Most handlers get the softc as arg1 and
+	 * the offset of a struct hn_tx_ring field as arg2.
+	 * NOTE(review): presumably the handlers apply that offset to every
+	 * ring -- confirm against the handler implementations.
+	 */
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
+	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_tx_ring, hn_send_failed),
+	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
+	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_tx_ring, hn_flush_failed),
+	    hn_tx_stat_ulong_sysctl, "LU",
+	    "# of packet transmission aggregation flush failure");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
+	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
+	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
+	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
+	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
+	/* Exported straight from ring 0's field. */
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
+	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
+	    "# of total TX descs");
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
+	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
+	    "Chimney send packet size upper boundary");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
+	    hn_tx_conf_int_sysctl, "I",
+	    "Size of the packet for direct transmission");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
+	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+	    __offsetof(struct hn_tx_ring, hn_sched_tx),
+	    hn_tx_conf_int_sysctl, "I",
+	    "Always schedule transmission "
+	    "instead of doing direct transmission");
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
+	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
+	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
+	/* Exported straight from ring 0's field. */
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
+	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
+	    "Applied packet transmission aggregation size");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
+	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+	    hn_txagg_pktmax_sysctl, "I",
+	    "Applied packet transmission aggregation packets");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
+	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+	    hn_txagg_align_sysctl, "I",
+	    "Applied packet transmission aggregation alignment");
+
+	return 0;
+}
+
+/*
+ * Store chim_size into the hn_chim_size field of every created TX ring.
+ */
+static void
+hn_set_chim_size(struct hn_softc *sc, int chim_size)
+{
+	int ring;
+
+	for (ring = 0; ring < sc->hn_tx_ring_cnt; ++ring) {
+		sc->hn_tx_ring[ring].hn_chim_size = chim_size;
+	}
+}
+
+/*
+ * Compute and install the interface's if_hw_tsomax.
+ *
+ * tso_maxlen is clamped to [hn_ndis_tso_sgmin * mtu, IP_MAXPACKET] and
+ * then to hn_ndis_tso_szmax; the Ethernet and VLAN encapsulation header
+ * lengths are subtracted, and the result is further limited by the VF's
+ * if_hw_tsomax when hn_xpnt_vf_isready() reports true.  Nothing is done
+ * if the interface has neither IFCAP_TSO4 nor IFCAP_TSO6.  The softc
+ * lock must be held.
+ */
+static void
+hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
+{
+	struct ifnet *ifp = sc->hn_ifp;
+	int min_len;
+	u_int tsomax;
+
+	HN_LOCK_ASSERT(sc);
+
+	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
+		return;
+
+	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
+	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
+	min_len = sc->hn_ndis_tso_sgmin * mtu;
+
+	KASSERT(sc->hn_ndis_tso_szmax >= min_len &&
+	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
+	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
+
+	/* Clamp the requested length, then apply the NDIS limit. */
+	if (tso_maxlen < min_len) {
+		tso_maxlen = min_len;
+	} else if (tso_maxlen > IP_MAXPACKET) {
+		tso_maxlen = IP_MAXPACKET;
+	}
+	if (tso_maxlen > sc->hn_ndis_tso_szmax) {
+		tso_maxlen = sc->hn_ndis_tso_szmax;
+	}
+
+	/* Leave room for the Ethernet header plus a VLAN tag. */
+	tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
+
+	/* Never advertise more than a ready VF does. */
+	if (hn_xpnt_vf_isready(sc) && tsomax > sc->hn_vf_ifp->if_hw_tsomax)
+		tsomax = sc->hn_vf_ifp->if_hw_tsomax;
+
+	ifp->if_hw_tsomax = tsomax;
+	if (bootverbose)
+		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
+}
+
+/*
+ * Apply settings to all TX rings: the chimney size (the device maximum,
+ * or the hn_tx_chimney_size tunable when it is positive and smaller),
+ * the checksum assist bits derived from sc->hn_caps, and the
+ * HN_TX_FLAG_HASHVAL flag when HN_CAP_HASHVAL is set.
+ */
+static void
+hn_fixup_tx_data(struct hn_softc *sc)
+{
+	uint64_t assist = 0;
+	int ring;
+
+	hn_set_chim_size(sc, sc->hn_chim_szmax);
+	if (hn_tx_chimney_size > 0 &&
+	    hn_tx_chimney_size < sc->hn_chim_szmax) {
+		hn_set_chim_size(sc, hn_tx_chimney_size);
+	}
+
+	/* Translate capability bits into CSUM_* assist bits. */
+	if ((sc->hn_caps & HN_CAP_IPCS) != 0)
+		assist |= CSUM_IP;
+	if ((sc->hn_caps & HN_CAP_TCP4CS) != 0)
+		assist |= CSUM_IP_TCP;
+	if ((sc->hn_caps & HN_CAP_UDP4CS) != 0 && hn_enable_udp4cs)
+		assist |= CSUM_IP_UDP;
+	if ((sc->hn_caps & HN_CAP_TCP6CS) != 0)
+		assist |= CSUM_IP6_TCP;
+	if ((sc->hn_caps & HN_CAP_UDP6CS) != 0 && hn_enable_udp6cs)
+		assist |= CSUM_IP6_UDP;
+
+	for (ring = 0; ring < sc->hn_tx_ring_cnt; ++ring) {
+		sc->hn_tx_ring[ring].hn_csum_assist = assist;
+	}
+
+	if ((sc->hn_caps & HN_CAP_HASHVAL) == 0)
+		return;
+
+	/*
+	 * Support HASHVAL pktinfo on TX path.
+	 */
+	if (bootverbose)
+		if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
+	for (ring = 0; ring < sc->hn_tx_ring_cnt; ++ring) {
+		sc->hn_tx_ring[ring].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
+	}
+}
+
+/*
+ * Set HN_RX_FLAG_UDP_HASH on every created RX ring when the device
+ * capabilities include HN_CAP_UDPHASH; otherwise do nothing.
+ */
+static void
+hn_fixup_rx_data(struct hn_softc *sc)
+{
+	int ring;
+
+	if ((sc->hn_caps & HN_CAP_UDPHASH) == 0)
+		return;
+
+	for (ring = 0; ring < sc->hn_rx_ring_cnt; ++ring) {
+		sc->hn_rx_ring[ring].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
+	}
+}
+
+/*
+ * Release the TX side resources of the softc.
+ *
+ * The chimney sending buffer is freed only when HN_FLAG_CHIM_REF is not
+ * set; otherwise a message is logged and the memory is left allocated.
+ * Every created TX ring is then destroyed, the ring array is freed and
+ * the ring counters are reset to 0.
+ */
+static void
+hn_destroy_tx_data(struct hn_softc *sc)
+{
+	int i;
+
+	if (sc->hn_chim != NULL) {
+		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
+			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
+		} else {
+			/* Terminate the line so later messages start fresh. */
+			device_printf(sc->hn_dev,
+			    "chimney sending buffer is referenced\n");
+		}
+		sc->hn_chim = NULL;
+	}
+
+	/* No ring array to walk or free. */
+	if (sc->hn_tx_ring_cnt == 0)
+		return;
+
+	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
+		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
+
+	free(sc->hn_tx_ring, M_DEVBUF);
+	sc->hn_tx_ring = NULL;
+
+	sc->hn_tx_ring_cnt = 0;
+	sc->hn_tx_ring_inuse = 0;
+}
+
+#ifdef HN_IFSTART_SUPPORT
+
+/*
+ * Task function: run hn_start_locked() on the given TX ring with a
+ * length argument of 0, holding the ring's hn_tx_lock around the call.
+ */
+static void
+hn_start_taskfunc(void *xtxr, int pending __unused)
+{
+	struct hn_tx_ring *ring;
+
+	ring = xtxr;
+
+	mtx_lock(&ring->hn_tx_lock);
+	hn_start_locked(ring, 0);
+	mtx_unlock(&ring->hn_tx_lock);
+}
+
+/*
+ * Drain the interface send queue (ifp->if_snd) through the given TX ring.
+ *
+ * The caller must hold txr->hn_tx_lock; txr must be the first TX ring and
+ * if_start must be in use (all asserted below).
+ *
+ * If len > 0 and the next packet is longer than len, the packet is put
+ * back at the head of the send queue and a non-zero value is returned, so
+ * that the caller can dispatch the sending to the TX taskqueue.  In all
+ * other cases 0 is returned.
+ */
+static int
+hn_start_locked(struct hn_tx_ring *txr, int len)
+{
+	struct hn_softc *sc = txr->hn_sc;
+	struct ifnet *ifp = sc->hn_ifp;
+	int sched = 0;
+
+	KASSERT(hn_use_if_start,
+	    ("hn_start_locked is called, when if_start is disabled"));
+	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
+	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
+	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
+
+	if (__predict_false(txr->hn_suspended))
+		return (0);
+
+	/* Proceed only if the interface is running and not marked OACTIVE. */
+	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
+	    IFF_DRV_RUNNING)
+		return (0);
+
+	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
+		struct hn_txdesc *txd;
+		struct mbuf *m_head;
+		int error;
+
+		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
+		if (m_head == NULL)
+			break;
+
+		if (len > 0 && m_head->m_pkthdr.len > len) {
+			/*
+			 * This sending could be time consuming; let callers
+			 * dispatch this packet sending (and sending of any
+			 * following up packets) to tx taskqueue.
+			 */
+			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
+			sched = 1;
+			break;
+		}
+
+#if defined(INET6) || defined(INET)
+		/*
+		 * A NULL return from the fixup helpers is counted as an
+		 * output error and the loop moves on to the next packet.
+		 */
+		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
+			m_head = hn_tso_fixup(m_head);
+			if (__predict_false(m_head == NULL)) {
+				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+				continue;
+			}
+		} else if (m_head->m_pkthdr.csum_flags &
+		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
+			m_head = hn_set_hlen(m_head);
+			if (__predict_false(m_head == NULL)) {
+				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+				continue;
+			}
+		}
+#endif
+
+		txd = hn_txdesc_get(txr);
+		if (txd == NULL) {
+			/*
+			 * Out of TX descriptors: requeue the packet and mark
+			 * the interface OACTIVE.
+			 */
+			txr->hn_no_txdescs++;
+			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
+			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+			break;
+		}
+
+		error = hn_encap(ifp, txr, txd, &m_head);
+		if (error) {
+			/* Both txd and m_head are freed */
+			KASSERT(txr->hn_agg_txd == NULL,
+			    ("encap failed w/ pending aggregating txdesc"));
+			continue;
+		}
+
+		/*
+		 * hn_agg_pktleft == 0: nothing more is to be aggregated, so
+		 * either flush the aggregating txdesc or send this txdesc.
+		 */
+		if (txr->hn_agg_pktleft == 0) {
+			if (txr->hn_agg_txd != NULL) {
+				KASSERT(m_head == NULL,
+				    ("pending mbuf for aggregating txdesc"));
+				error = hn_flush_txagg(ifp, txr);
+				if (__predict_false(error)) {
+					atomic_set_int(&ifp->if_drv_flags,
+					    IFF_DRV_OACTIVE);
+					break;
+				}
+			} else {
+				KASSERT(m_head != NULL, ("mbuf was freed"));
+				error = hn_txpkt(ifp, txr, txd);
+				if (__predict_false(error)) {
+					/* txd is freed, but m_head is not */
+					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
+					atomic_set_int(&ifp->if_drv_flags,
+					    IFF_DRV_OACTIVE);
+					break;
+				}
+			}
+		}
+#ifdef INVARIANTS
+		else {
+			KASSERT(txr->hn_agg_txd != NULL,
+			    ("no aggregating txdesc"));
+			KASSERT(m_head == NULL,
+			    ("pending mbuf for aggregating txdesc"));
+		}
+#endif
+	}
+
+	/* Flush pending aggregated transmission. */
+	if (txr->hn_agg_txd != NULL)
+		hn_flush_txagg(ifp, txr);
+	return (sched);
+}
+
+/*
+ * Start transmission on the first TX ring.  The packets are sent directly
+ * if the TX lock can be taken without blocking and no deferral was
+ * requested; otherwise the TX task is enqueued.
+ */
+static void
+hn_start(struct ifnet *ifp)
+{
+	struct hn_softc *sc = ifp->if_softc;
+	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
+	int resched;
+
+	if (!txr->hn_sched_tx && mtx_trylock(&txr->hn_tx_lock)) {
+		resched = hn_start_locked(txr, txr->hn_direct_tx_size);
+		mtx_unlock(&txr->hn_tx_lock);
+		if (!resched)
+			return;
+	}
+
+	/*
+	 * Either hn_sched_tx is set, the TX lock is contended, or
+	 * hn_start_locked() asked for the sending to be deferred.
+	 */
+	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
+}
+
+/*
+ * Task handler: with the TX lock held, clear IFF_DRV_OACTIVE on the
+ * interface and then run the if_start transmit path on the given TX ring.
+ */
+static void
+hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
+{
+	struct hn_tx_ring *ring = (struct hn_tx_ring *)xtxr;
+	struct ifnet *ifp;
+
+	mtx_lock(&ring->hn_tx_lock);
+	ifp = ring->hn_sc->hn_ifp;
+	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+	(void)hn_start_locked(ring, 0);
+	mtx_unlock(&ring->hn_tx_lock);
+}
+
+/*
+ * Restart transmission on the first TX ring: clear IFF_DRV_OACTIVE and
+ * either send directly (TX lock taken without blocking) or hand the work
+ * over to the TX taskqueue.
+ */
+static void
+hn_start_txeof(struct hn_tx_ring *txr)
+{
+	struct hn_softc *sc = txr->hn_sc;
+	struct ifnet *ifp = sc->hn_ifp;
+	int resched;
+
+	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
+
+	if (!txr->hn_sched_tx && mtx_trylock(&txr->hn_tx_lock)) {
+		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+		resched = hn_start_locked(txr, txr->hn_direct_tx_size);
+		mtx_unlock(&txr->hn_tx_lock);
+		if (resched) {
+			taskqueue_enqueue(txr->hn_tx_taskq,
+			    &txr->hn_tx_task);
+		}
+		return;
+	}
+
+	/*
+	 * Drop OACTIVE right away, hoping that other senders can make
+	 * progress in the meantime.  The txeof task clears the flag once
+	 * more while holding hn_tx_lock, to avoid possible races.
+	 */
+	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
+}
+
+#endif	/* HN_IFSTART_SUPPORT */
+
+/*
+ * Drain the buf_ring (txr->hn_mbuf_br) of the given TX ring.
+ *
+ * The caller must hold txr->hn_tx_lock (asserted below).
+ *
+ * If len > 0 and the next packet is longer than len, the packet is left on
+ * the buf_ring and a non-zero value is returned, so that the caller can
+ * dispatch the sending to the TX taskqueue.  In all other cases 0 is
+ * returned.
+ */
+static int
+hn_xmit(struct hn_tx_ring *txr, int len)
+{
+	struct hn_softc *sc = txr->hn_sc;
+	struct ifnet *ifp = sc->hn_ifp;
+	struct mbuf *m_head;
+	int sched = 0;
+
+	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
+#ifdef HN_IFSTART_SUPPORT
+	KASSERT(hn_use_if_start == 0,
+	    ("hn_xmit is called, when if_start is enabled"));
+#endif
+	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
+
+	if (__predict_false(txr->hn_suspended))
+		return (0);
+
+	/* Proceed only if the interface is running and the ring is not oactive. */
+	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
+		return (0);
+
+	/*
+	 * Each peeked mbuf must be followed by either drbr_putback() (the
+	 * mbuf stays queued) or drbr_advance() (the mbuf is consumed).
+	 */
+	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
+		struct hn_txdesc *txd;
+		int error;
+
+		if (len > 0 && m_head->m_pkthdr.len > len) {
+			/*
+			 * This sending could be time consuming; let callers
+			 * dispatch this packet sending (and sending of any
+			 * following up packets) to tx taskqueue.
+			 */
+			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
+			sched = 1;
+			break;
+		}
+
+		txd = hn_txdesc_get(txr);
+		if (txd == NULL) {
+			/*
+			 * Out of TX descriptors: keep the packet queued and
+			 * mark this ring oactive.
+			 */
+			txr->hn_no_txdescs++;
+			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
+			txr->hn_oactive = 1;
+			break;
+		}
+
+		error = hn_encap(ifp, txr, txd, &m_head);
+		if (error) {
+			/* Both txd and m_head are freed; discard */
+			KASSERT(txr->hn_agg_txd == NULL,
+			    ("encap failed w/ pending aggregating txdesc"));
+			drbr_advance(ifp, txr->hn_mbuf_br);
+			continue;
+		}
+
+		/*
+		 * hn_agg_pktleft == 0: nothing more is to be aggregated, so
+		 * either flush the aggregating txdesc or send this txdesc.
+		 */
+		if (txr->hn_agg_pktleft == 0) {
+			if (txr->hn_agg_txd != NULL) {
+				KASSERT(m_head == NULL,
+				    ("pending mbuf for aggregating txdesc"));
+				error = hn_flush_txagg(ifp, txr);
+				if (__predict_false(error)) {
+					txr->hn_oactive = 1;
+					break;
+				}
+			} else {
+				KASSERT(m_head != NULL, ("mbuf was freed"));
+				error = hn_txpkt(ifp, txr, txd);
+				if (__predict_false(error)) {
+					/* txd is freed, but m_head is not */
+					drbr_putback(ifp, txr->hn_mbuf_br,
+					    m_head);
+					txr->hn_oactive = 1;
+					break;
+				}
+			}
+		}
+#ifdef INVARIANTS
+		else {
+			KASSERT(txr->hn_agg_txd != NULL,
+			    ("no aggregating txdesc"));
+			KASSERT(m_head == NULL,
+			    ("pending mbuf for aggregating txdesc"));
+		}
+#endif
+
+		/* Sent */
+		drbr_advance(ifp, txr->hn_mbuf_br);
+	}
+
+	/* Flush pending aggregated transmission. */
+	if (txr->hn_agg_txd != NULL)
+		hn_flush_txagg(ifp, txr);
+	return (sched);
+}
+
+/*
+ * Transmit the mbuf chain m on ifp.
+ *
+ * - If HN_XVFFLAG_ENABLED is set, the packet is handed to the if_transmit
+ *   method of the VF interface (sc->hn_vf_ifp); BPF tapping and the output
+ *   statistics are accounted on ifp.  The VF's return value is returned.
+ * - Otherwise, the packet headers are fixed up, a TX ring is selected
+ *   according to the packet's flowid, the packet is enqueued onto that
+ *   ring's buf_ring, and it is either sent directly or dispatched to the
+ *   TX taskqueue.
+ *
+ * Returns 0 on success; EIO if a header fixup helper returned NULL; or the
+ * error from drbr_enqueue().
+ */
+static int
+hn_transmit(struct ifnet *ifp, struct mbuf *m)
+{
+	struct hn_softc *sc = ifp->if_softc;
+	struct hn_tx_ring *txr;
+	int error, idx = 0;
+
+	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
+		struct rm_priotracker pt;
+
+		rm_rlock(&sc->hn_vf_lock, &pt);
+		/*
+		 * The flag is tested again with hn_vf_lock read-held; the
+		 * test above was done without the lock.
+		 */
+		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
+			struct mbuf *m_bpf = NULL;
+			int obytes, omcast;
+
+			/*
+			 * Save the statistics now; m is handed to the VF
+			 * below and is not touched here afterwards.
+			 */
+			obytes = m->m_pkthdr.len;
+			omcast = (m->m_flags & M_MCAST) != 0;
+
+			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
+				if (bpf_peers_present(ifp->if_bpf)) {
+					m_bpf = m_copypacket(m, M_NOWAIT);
+					if (m_bpf == NULL) {
+						/*
+						 * Failed to grab a shallow
+						 * copy; tap now.
+						 */
+						ETHER_BPF_MTAP(ifp, m);
+					}
+				}
+			} else {
+				ETHER_BPF_MTAP(ifp, m);
+			}
+
+			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
+			rm_runlock(&sc->hn_vf_lock, &pt);
+
+			/* Tap the saved copy only if the VF accepted the packet. */
+			if (m_bpf != NULL) {
+				if (!error)
+					ETHER_BPF_MTAP(ifp, m_bpf);
+				m_freem(m_bpf);
+			}
+
+			if (error == ENOBUFS) {
+				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
+			} else if (error) {
+				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+			} else {
+				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
+				if (omcast) {
+					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
+					    omcast);
+				}
+			}
+			return (error);
+		}
+		rm_runlock(&sc->hn_vf_lock, &pt);
+	}
+
+#if defined(INET6) || defined(INET)
+	/*
+	 * Perform TSO packet header fixup or get l2/l3 header length now,
+	 * since packet headers should be cache-hot.
+	 */
+	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
+		m = hn_tso_fixup(m);
+		if (__predict_false(m == NULL)) {
+			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+			return EIO;
+		}
+	} else if (m->m_pkthdr.csum_flags &
+	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
+		m = hn_set_hlen(m);
+		if (__predict_false(m == NULL)) {
+			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+			return EIO;
+		}
+	}
+#endif
+
+	/*
+	 * Select the TX ring based on flowid
+	 *
+	 * Packets without a hash type stay on ring 0 (idx's initial value).
+	 */
+	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
+#ifdef RSS
+		uint32_t bid;
+
+		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
+		    &bid) == 0)
+			idx = bid % sc->hn_tx_ring_inuse;
+		else
+#endif
+		{
+#if defined(INET6) || defined(INET)
+			int tcpsyn = 0;
+
+			/*
+			 * Only short (< 128 bytes), non-TSO TCP packets are
+			 * checked for SYN.
+			 */
+			if (m->m_pkthdr.len < 128 &&
+			    (m->m_pkthdr.csum_flags &
+			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
+			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
+				m = hn_check_tcpsyn(m, &tcpsyn);
+				if (__predict_false(m == NULL)) {
+					if_inc_counter(ifp,
+					    IFCOUNTER_OERRORS, 1);
+					return (EIO);
+				}
+			}
+#else
+			const int tcpsyn = 0;
+#endif
+			/* TCP SYN packets always go out through ring 0. */
+			if (tcpsyn)
+				idx = 0;
+			else
+				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
+		}
+	}
+	txr = &sc->hn_tx_ring[idx];
+
+	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
+	if (error) {
+		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
+		return error;
+	}
+
+	/* The packet stays queued while the ring is oactive. */
+	if (txr->hn_oactive)
+		return 0;
+
+	if (txr->hn_sched_tx)
+		goto do_sched;
+
+	/* Send directly only if the TX lock can be taken without blocking. */
+	if (mtx_trylock(&txr->hn_tx_lock)) {
+		int sched;
+
+		sched = hn_xmit(txr, txr->hn_direct_tx_size);
+		mtx_unlock(&txr->hn_tx_lock);
+		if (!sched)
+			return 0;
+	}
+do_sched:
+	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
+	return 0;
+}
+
+/*
+ * Free every mbuf queued on the TX ring's buf_ring, with the TX lock held.
+ */
+static void
+hn_tx_ring_qflush(struct hn_tx_ring *txr)
+{
+	struct mbuf *pkt;
+
+	mtx_lock(&txr->hn_tx_lock);
+	for (;;) {
+		pkt = buf_ring_dequeue_sc(txr->hn_mbuf_br);
+		if (pkt == NULL)
+			break;
+		m_freem(pkt);
+	}
+	mtx_unlock(&txr->hn_tx_lock);
+}
+
+/*
+ * Flush the queued packets: the buf_rings of all TX rings in use, the
+ * interface queue (if_qflush()), and, if HN_XVFFLAG_ENABLED is set, the
+ * queue of the VF interface.
+ */
+static void
+hn_xmit_qflush(struct ifnet *ifp)
+{
+	struct hn_softc *sc = ifp->if_softc;
+	struct rm_priotracker tracker;
+	int ring = 0;
+
+	while (ring < sc->hn_tx_ring_inuse) {
+		hn_tx_ring_qflush(&sc->hn_tx_ring[ring]);
+		++ring;
+	}
+	if_qflush(ifp);
+
+	/* The VF interface is only accessed with hn_vf_lock read-held. */
+	rm_rlock(&sc->hn_vf_lock, &tracker);
+	if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) != 0)
+		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
+	rm_runlock(&sc->hn_vf_lock, &tracker);
+}
+
+/*
+ * Restart transmission on the TX ring: clear hn_oactive and either send
+ * directly (TX lock taken without blocking) or hand the work over to the
+ * TX taskqueue.
+ */
+static void
+hn_xmit_txeof(struct hn_tx_ring *txr)
+{
+	int resched;
+
+	if (!txr->hn_sched_tx && mtx_trylock(&txr->hn_tx_lock)) {
+		txr->hn_oactive = 0;
+		resched = hn_xmit(txr, txr->hn_direct_tx_size);
+		mtx_unlock(&txr->hn_tx_lock);
+		if (resched) {
+			taskqueue_enqueue(txr->hn_tx_taskq,
+			    &txr->hn_tx_task);
+		}
+		return;
+	}
+
+	/*
+	 * Drop the oactive mark right away, hoping that other senders can
+	 * make progress in the meantime.  The txeof task clears it once
+	 * more while holding hn_tx_lock, to avoid possible races.
+	 */
+	txr->hn_oactive = 0;
+	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
+}
+
+/*
+ * Task handler: drain the TX ring's buf_ring with the TX lock held.
+ */
+static void
+hn_xmit_taskfunc(void *xtxr, int pending __unused)
+{
+	struct hn_tx_ring *ring = (struct hn_tx_ring *)xtxr;
+
+	mtx_lock(&ring->hn_tx_lock);
+	/* len == 0: no packet is deferred because of its length. */
+	(void)hn_xmit(ring, 0);
+	mtx_unlock(&ring->hn_tx_lock);
+}
+
+/*
+ * Task handler: with the TX lock held, clear the ring's oactive mark and
+ * then drain the TX ring's buf_ring.
+ */
+static void
+hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
+{
+	struct hn_tx_ring *ring = (struct hn_tx_ring *)xtxr;
+
+	mtx_lock(&ring->hn_tx_lock);
+	ring->hn_oactive = 0;
+	(void)hn_xmit(ring, 0);
+	mtx_unlock(&ring->hn_tx_lock);
+}
+
+/*
+ * Attach a vmbus channel: link it to the RX ring (and to the TX ring, if
+ * one with the same index is in use) selected by the channel's sub-index,
+ * bind the channel to a CPU, and open it on the RX ring's bufring with
+ * hn_chan_callback as the channel callback.
+ *
+ * Returns 0 on success, or the error from vmbus_chan_open_br().  The
+ * rings are left marked as attached on failure.  On EISCONN,
+ * HN_RX_FLAG_BR_REF is additionally set on the RX ring.
+ */
+static int
+hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
+{
+	struct vmbus_chan_br cbr;
+	struct hn_rx_ring *rxr;
+	int idx, error;
+
+	idx = vmbus_chan_subidx(chan);
+
+	/*
+	 * Link this channel to RX/TX ring.
+	 */
+	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
+	    ("invalid channel index %d, should >= 0 && < %d",
+	     idx, sc->hn_rx_ring_inuse));
+	rxr = &sc->hn_rx_ring[idx];
+	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
+	    ("RX ring %d already attached", idx));
+	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
+	rxr->hn_chan = chan;
+
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
+		    idx, vmbus_chan_id(chan));
+	}
+
+	/* There may be fewer TX rings in use than RX rings. */
+	if (idx < sc->hn_tx_ring_inuse) {
+		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
+
+		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
+		    ("TX ring %d already attached", idx));
+		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
+
+		txr->hn_chan = chan;
+		if (bootverbose) {
+			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
+			    idx, vmbus_chan_id(chan));
+		}
+	}
+
+	/* Bind this channel to a proper CPU. */
+	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
+
+	/*
+	 * Open this channel
+	 */
+	cbr.cbr = rxr->hn_br;
+	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
+	cbr.cbr_txsz = HN_TXBR_SIZE;
+	cbr.cbr_rxsz = HN_RXBR_SIZE;
+	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
+	if (error) {
+		if (error == EISCONN) {
+			/* Record that the bufring is still connected. */
+			if_printf(sc->hn_ifp, "bufring is connected after "
+			    "chan%u open failure\n", vmbus_chan_id(chan));
+			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
+		} else {
+			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
+			    vmbus_chan_id(chan), error);
+		}
+	}
+	return (error);
+}
+
+/*
+ * Detach a vmbus channel: clear the attached flag of the RX ring (and of
+ * the TX ring, if one with the same index is in use) selected by the
+ * channel's sub-index, then close the channel.
+ *
+ * If closing fails with EISCONN, HN_RX_FLAG_BR_REF is set on the RX ring;
+ * other close errors are only logged.
+ */
+static void
+hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
+{
+	struct hn_rx_ring *rxr;
+	int idx, error;
+
+	idx = vmbus_chan_subidx(chan);
+
+	/*
+	 * Unlink this channel from RX/TX ring.
+	 */
+	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
+	    ("invalid channel index %d, should >= 0 && < %d",
+	     idx, sc->hn_rx_ring_inuse));
+	rxr = &sc->hn_rx_ring[idx];
+	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
+	    ("RX ring %d is not attached", idx));
+	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
+
+	/* There may be fewer TX rings in use than RX rings. */
+	if (idx < sc->hn_tx_ring_inuse) {
+		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
+
+		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
+		    ("TX ring %d is not attached", idx));
+		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
+	}
+
+	/*
+	 * Close this channel.
+	 *
+	 * NOTE:
+	 * Channel closing does _not_ destroy the target channel.
+	 */
+	error = vmbus_chan_close_direct(chan);
+	if (error == EISCONN) {
+		/* Record that the bufring is still connected. */
+		if_printf(sc->hn_ifp, "chan%u bufring is connected "
+		    "after being closed\n", vmbus_chan_id(chan));
+		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
+	} else if (error) {
+		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
+		    vmbus_chan_id(chan), error);
+	}
+}
+
+/*
+ * Attach all of the sub-channels, i.e. hn_rx_ring_inuse - 1 channels.
+ *
+ * All sub-channels are tried even if some of them fail to attach; the
+ * error of the last failed attach is returned, or 0 if all succeeded.
+ */
+static int
+hn_attach_subchans(struct hn_softc *sc)
+{
+	struct vmbus_channel **subchans;
+	int nsub = sc->hn_rx_ring_inuse - 1;
+	int idx, last_err = 0;
+
+	KASSERT(nsub > 0, ("no sub-channels"));
+
+	subchans = vmbus_subchan_get(sc->hn_prichan, nsub);
+	for (idx = 0; idx < nsub; ++idx) {
+		int rc = hn_chan_attach(sc, subchans[idx]);
+
+		/* Keep going; all channels will be detached later. */
+		if (rc != 0)
+			last_err = rc;
+	}
+	vmbus_subchan_rel(subchans, nsub);
+
+	if (last_err != 0) {
+		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", last_err);
+		return (last_err);
+	}
+
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "%d sub-channels attached\n",
+		    nsub);
+	}
+	return (0);
+}
+
+/*
+ * Detach all channels: the sub-channels first (if any), then the primary
+ * channel, and finally wait for the sub-channels to be drained.
+ */
+static void
+hn_detach_allchans(struct hn_softc *sc)
+{
+	struct vmbus_channel **subchans;
+	int nsub = sc->hn_rx_ring_inuse - 1;
+	int i;
+
+	if (nsub != 0) {
+		/* Detach the sub-channels. */
+		subchans = vmbus_subchan_get(sc->hn_prichan, nsub);
+		for (i = 0; i < nsub; ++i)
+			hn_chan_detach(sc, subchans[i]);
+		vmbus_subchan_rel(subchans, nsub);
+	}
+
+	/*
+	 * The primary channel is detached only _after_ all of the
+	 * sub-channels have been detached.
+	 */
+	hn_chan_detach(sc, sc->hn_prichan);
+
+	/* Wait for sub-channels to be destroyed, if any. */
+	vmbus_subchan_drain(sc->hn_prichan);
+
+#ifdef INVARIANTS
+	/* No ring may be left marked as attached at this point. */
+	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
+		    HN_RX_FLAG_ATTACHED) == 0,
+		    ("%dth RX ring is still attached", i));
+	}
+	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
+		    HN_TX_FLAG_ATTACHED) == 0,
+		    ("%dth TX ring is still attached", i));
+	}
+#endif
+}
+
+/*
+ * Allocate sub-channels for multiple RX/TX rings.
+ *
+ * On entry *nsubch is the # of sub-channels requested; on return it is the
+ * # of sub-channels that can be used, which is 0 if multiple rings were
+ * not requested, RSS capabilities could not be queried, only one channel
+ * is supported, or the sub-channel allocation failed.
+ *
+ * This function always returns 0.
+ */
+static int
+hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
+{
+	struct vmbus_channel **subchans;
+	int nchan, rxr_cnt, error;
+
+	nchan = *nsubch + 1;
+	if (nchan == 1) {
+		/* Multiple RX/TX rings are not requested. */
+		goto nosubch;
+	}
+
+	/*
+	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
+	 * table entries.
+	 */
+	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
+	if (error) {
+		/* No RSS; this is benign. */
+		goto nosubch;
+	}
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
+		    rxr_cnt, nchan);
+	}
+
+	/* Never use more channels than the # of RX rings offered. */
+	if (nchan > rxr_cnt)
+		nchan = rxr_cnt;
+	if (nchan == 1) {
+		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
+		goto nosubch;
+	}
+
+	/*
+	 * Allocate sub-channels from NVS.
+	 */
+	*nsubch = nchan - 1;
+	error = hn_nvs_alloc_subchans(sc, nsubch);
+	if (error || *nsubch == 0) {
+		/* Failed to allocate sub-channels. */
+		goto nosubch;
+	}
+
+	/*
+	 * Wait for all sub-channels to become ready before moving on.
+	 */
+	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
+	vmbus_subchan_rel(subchans, *nsubch);
+	return (0);
+
+nosubch:
+	/* Only the primary channel will be used. */
+	*nsubch = 0;
+	return (0);
+}
+
+/*
+ * Check whether the synthetic parts can be attached: HN_FLAG_ERRORS must
+ * not be set on the softc, and none of the RX rings may have
+ * HN_RX_FLAG_BR_REF set.
+ */
+static bool
+hn_synth_attachable(const struct hn_softc *sc)
+{
+	const struct hn_rx_ring *rxr;
+	int ring;
+
+	if ((sc->hn_flags & HN_FLAG_ERRORS) != 0)
+		return (false);
+
+	for (ring = 0; ring < sc->hn_rx_ring_cnt; ++ring) {
+		rxr = &sc->hn_rx_ring[ring];
+		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) != 0)
+			return (false);
+	}
+	return (true);
+}
+
+/*
+ * Force the RX filter to zero after a successful RNDIS initialization.
+ *
+ * NOTE:
+ * On certain versions of Hyper-V, under certain conditions, the
+ * hypervisor side RNDIS rxfilter is _not_ zero once the RNDIS
+ * initialization has succeeded.  That violates the RNDIS API contract
+ * and breaks the assumptions of the code that follows.  So the RNDIS
+ * rxfilter is cleared explicitly here, and then the packets that
+ * sneaked through, as well as the interrupt taskqueues scheduled for
+ * them, are drained on all nchan channels.
+ */
+static void
+hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
+{
+
+	/* Clear the rxfilter first, */
+	hn_disable_rx(sc);
+	/* then drain whatever got through. */
+	hn_drain_rxtx(sc, nchan);
+}
+
+/*
+ * Attach the synthetic parts: the primary channel, NVS, RNDIS, and, if
+ * multiple rings are usable, the sub-channels and the RSS configuration.
+ *
+ * On success, HN_FLAG_SYNTH_ATTACHED is set and 0 is returned.  On
+ * failure, whatever was attached is detached again and an errno is
+ * returned; ENXIO is returned if the synthetic parts are not attachable
+ * at all, or if the capabilities changed on an already attached device.
+ */
+static int
+hn_synth_attach(struct hn_softc *sc, int mtu)
+{
+/* Bits recorded in 'attached', used by the partial rollback below. */
+#define ATTACHED_NVS		0x0002
+#define ATTACHED_RNDIS		0x0004
+
+	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
+	int error, nsubch, nchan = 1, i, rndis_inited;
+	uint32_t old_caps, attached = 0;
+
+	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
+	    ("synthetic parts were attached"));
+
+	if (!hn_synth_attachable(sc))
+		return (ENXIO);
+
+	/* Save capabilities for later verification. */
+	old_caps = sc->hn_caps;
+	sc->hn_caps = 0;
+
+	/* Clear RSS stuffs. */
+	sc->hn_rss_ind_size = 0;
+	sc->hn_rss_hash = 0;
+	sc->hn_rss_hcap = 0;
+
+	/*
+	 * Attach the primary channel _before_ attaching NVS and RNDIS.
+	 */
+	error = hn_chan_attach(sc, sc->hn_prichan);
+	if (error)
+		goto failed;
+
+	/*
+	 * Attach NVS.
+	 */
+	error = hn_nvs_attach(sc, mtu);
+	if (error)
+		goto failed;
+	attached |= ATTACHED_NVS;
+
+	/*
+	 * Attach RNDIS _after_ NVS is attached.
+	 *
+	 * NOTE(review): rndis_inited is read even if hn_rndis_attach()
+	 * fails; presumably hn_rndis_attach() always sets it -- confirm.
+	 */
+	error = hn_rndis_attach(sc, mtu, &rndis_inited);
+	if (rndis_inited)
+		attached |= ATTACHED_RNDIS;
+	if (error)
+		goto failed;
+
+	/*
+	 * Make sure capabilities are not changed.
+	 */
+	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
+		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
+		    old_caps, sc->hn_caps);
+		error = ENXIO;
+		goto failed;
+	}
+
+	/*
+	 * Allocate sub-channels for multi-TX/RX rings.
+	 *
+	 * NOTE:
+	 * The # of RX rings that can be used is equivalent to the # of
+	 * channels to be requested.
+	 */
+	nsubch = sc->hn_rx_ring_cnt - 1;
+	error = hn_synth_alloc_subchans(sc, &nsubch);
+	if (error)
+		goto failed;
+	/* NOTE: _Full_ synthetic parts detach is required now. */
+	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
+
+	/*
+	 * Set the # of TX/RX rings that could be used according to
+	 * the # of channels that NVS offered.
+	 */
+	nchan = nsubch + 1;
+	hn_set_ring_inuse(sc, nchan);
+	if (nchan == 1) {
+		/* Only the primary channel can be used; done */
+		goto back;
+	}
+
+	/*
+	 * Attach the sub-channels.
+	 *
+	 * NOTE: hn_set_ring_inuse() _must_ have been called.
+	 */
+	error = hn_attach_subchans(sc);
+	if (error)
+		goto failed;
+
+	/*
+	 * Configure RSS key and indirect table _after_ all sub-channels
+	 * are attached.
+	 */
+	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
+		/*
+		 * RSS key is not set yet; set it to the default RSS key.
+		 */
+		if (bootverbose)
+			if_printf(sc->hn_ifp, "setup default RSS key\n");
+#ifdef RSS
+		rss_getkey(rss->rss_key);
+#else
+		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
+#endif
+		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
+	}
+
+	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
+		/*
+		 * RSS indirect table is not set yet; set it up in round-
+		 * robin fashion.
+		 */
+		if (bootverbose) {
+			if_printf(sc->hn_ifp, "setup default RSS indirect "
+			    "table\n");
+		}
+		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
+			uint32_t subidx;
+
+#ifdef RSS
+			subidx = rss_get_indirection_to_bucket(i);
+#else
+			subidx = i;
+#endif
+			/* Keep every entry within the usable channels. */
+			rss->rss_ind[i] = subidx % nchan;
+		}
+		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
+	} else {
+		/*
+		 * # of usable channels may be changed, so we have to
+		 * make sure that all entries in RSS indirect table
+		 * are valid.
+		 *
+		 * NOTE: hn_set_ring_inuse() _must_ have been called.
+		 */
+		hn_rss_ind_fixup(sc);
+	}
+
+	sc->hn_rss_hash = sc->hn_rss_hcap;
+	if ((sc->hn_flags & HN_FLAG_RXVF) ||
+	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
+		/* NOTE: Don't reconfigure RSS; will do immediately. */
+		hn_vf_rss_fixup(sc, false);
+	}
+	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
+	if (error)
+		goto failed;
+back:
+	/*
+	 * Fixup transmission aggregation setup.
+	 */
+	hn_set_txagg(sc);
+	hn_rndis_init_fixat(sc, nchan);
+	return (0);
+
+failed:
+	/*
+	 * Once HN_FLAG_SYNTH_ATTACHED is set, a full detach is done;
+	 * otherwise only the parts recorded in 'attached' are undone,
+	 * in the reverse order of their attachment.
+	 */
+	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
+		hn_rndis_init_fixat(sc, nchan);
+		hn_synth_detach(sc);
+	} else {
+		if (attached & ATTACHED_RNDIS) {
+			hn_rndis_init_fixat(sc, nchan);
+			hn_rndis_detach(sc);
+		}
+		if (attached & ATTACHED_NVS)
+			hn_nvs_detach(sc);
+		hn_chan_detach(sc, sc->hn_prichan);
+		/* Restore old capabilities. */
+		sc->hn_caps = old_caps;
+	}
+	return (error);
+
+#undef ATTACHED_RNDIS
+#undef ATTACHED_NVS
+}
+
+/*
+ * Detach the synthetic parts: RNDIS, NVS and all of the channels; then,
+ * if the vmbus version is at least VMBUS_VERSION_WIN10, disconnect the
+ * RXBUF and the chimney sending buffer GPADLs from the primary channel.
+ *
+ * NOTE:
+ * The interface must have been suspended through hn_suspend(), before
+ * this function gets called.
+ */
+static void
+hn_synth_detach(struct hn_softc *sc)
+{
+	int error;
+
+	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
+	    ("synthetic parts were not attached"));
+
+	/* RNDIS goes first, then NVS, then all of the channels. */
+	hn_rndis_detach(sc);
+	hn_nvs_detach(sc);
+	hn_detach_allchans(sc);
+
+	if (vmbus_current_version >= VMBUS_VERSION_WIN10) {
+		/*
+		 * With this vmbus version, the GPADLs that are still
+		 * recorded are disconnected from the primary channel here.
+		 * A failed disconnection is recorded as a *_REF flag.
+		 */
+		if (sc->hn_rxbuf_gpadl != 0) {
+			error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
+			    sc->hn_rxbuf_gpadl);
+			if (error) {
+				if_printf(sc->hn_ifp,
+				    "rxbuf gpadl disconn failed: %d\n", error);
+				sc->hn_flags |= HN_FLAG_RXBUF_REF;
+			}
+			sc->hn_rxbuf_gpadl = 0;
+		}
+
+		if (sc->hn_chim_gpadl != 0) {
+			error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
+			    sc->hn_chim_gpadl);
+			if (error) {
+				if_printf(sc->hn_ifp,
+				    "chim gpadl disconn failed: %d\n", error);
+				sc->hn_flags |= HN_FLAG_CHIM_REF;
+			}
+			sc->hn_chim_gpadl = 0;
+		}
+	}
+	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
+}
+
+/*
+ * Set the # of RX/TX rings in use.  The # of RX rings in use becomes
+ * ring_cnt; the # of TX rings in use becomes the smaller one of ring_cnt
+ * and the # of TX rings.
+ */
+static void
+hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
+{
+	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
+	    ("invalid ring count %d", ring_cnt));
+
+	sc->hn_rx_ring_inuse = ring_cnt;
+	sc->hn_tx_ring_inuse = (sc->hn_tx_ring_cnt > ring_cnt) ?
+	    ring_cnt : sc->hn_tx_ring_cnt;
+
+#ifdef RSS
+	/* A mismatch with the # of RSS buckets is only reported. */
+	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
+		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
+		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
+		    rss_getnumbuckets());
+	}
+#endif
+
+	if (bootverbose) {
+		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
+		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
+	}
+}
+
+/*
+ * Wait until the channel's RX bufring is empty and, unless the primary
+ * channel is revoked, its TX bufring is empty too; then drain the
+ * channel's interrupt processing.
+ */
+static void
+hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
+{
+
+	for (;;) {
+		if (vmbus_chan_rx_empty(chan)) {
+			/*
+			 * NOTE:
+			 * The hypervisor will not drain the TX bufring,
+			 * if the primary channel is revoked; don't wait
+			 * for it in that case.
+			 */
+			if (vmbus_chan_is_revoked(sc->hn_prichan) ||
+			    vmbus_chan_tx_empty(chan))
+				break;
+		}
+		pause("waitch", 1);
+	}
+	vmbus_chan_intr_drain(chan);
+}
+
+/*
+ * Disable RX: force the RNDIS RX filter to NDIS_PACKET_TYPE_NONE, and
+ * then sleep for 200ms worth of ticks.
+ */
+static void
+hn_disable_rx(struct hn_softc *sc)
+{
+	int wait_ticks = (200 * hz) / 1000;
+
+	/*
+	 * Clear the RX filter forcefully; the result of setting the
+	 * filter is deliberately ignored.
+	 */
+	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
+	(void)hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
+
+	/*
+	 * Give RNDIS enough time to flush all pending data packets.
+	 */
+	pause("waitrx", wait_ticks);
+}
+
+/*
+ * Drain the RX/TX bufrings and interrupts of nchan channels: the
+ * nchan - 1 sub-channels first, then the primary channel.
+ *
+ * NOTE:
+ * RX/TX _must_ have been suspended/disabled, before this function
+ * is called.
+ */
+static void
+hn_drain_rxtx(struct hn_softc *sc, int nchan)
+{
+	struct vmbus_channel **subch;
+	int nsubch = nchan - 1;
+	int idx;
+
+	/* The sub-channels are looked up only if there are any. */
+	subch = (nsubch > 0) ?
+	    vmbus_subchan_get(sc->hn_prichan, nsubch) : NULL;
+
+	if (subch != NULL) {
+		for (idx = 0; idx < nsubch; ++idx)
+			hn_chan_drain(sc, subch[idx]);
+	}
+	hn_chan_drain(sc, sc->hn_prichan);
+
+	if (subch != NULL)
+		vmbus_subchan_rel(subch, nsubch);
+}
+
+/*
+ * Suspend the data path: mark all TX rings in use as suspended and wait
+ * for their pending sends, disable RX, drain the RX/TX bufrings, and
+ * finally drain the TX tasks.
+ */
+static void
+hn_suspend_data(struct hn_softc *sc)
+{
+	struct hn_tx_ring *ring;
+	int idx;
+
+	HN_LOCK_ASSERT(sc);
+
+	/*
+	 * Step 1: suspend TX.
+	 */
+	for (idx = 0; idx < sc->hn_tx_ring_inuse; ++idx) {
+		ring = &sc->hn_tx_ring[idx];
+
+		/* Once this is set, nobody is able to send more packets. */
+		mtx_lock(&ring->hn_tx_lock);
+		ring->hn_suspended = 1;
+		mtx_unlock(&ring->hn_tx_lock);
+
+		/*
+		 * Wait for all pending sends to finish.
+		 *
+		 * NOTE:
+		 * Not all of the pending send-done will be received, if
+		 * the primary channel is revoked; stop waiting then.
+		 */
+		for (;;) {
+			if (!hn_tx_ring_pending(ring) ||
+			    vmbus_chan_is_revoked(sc->hn_prichan))
+				break;
+			pause("hnwtx", 1 /* 1 tick */);
+		}
+	}
+
+	/*
+	 * Step 2: disable RX.
+	 */
+	hn_disable_rx(sc);
+
+	/*
+	 * Step 3: drain RX/TX.
+	 */
+	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
+
+	/*
+	 * Step 4: drain any pending TX tasks.
+	 *
+	 * NOTE:
+	 * hn_drain_rxtx() can dispatch TX tasks, so the TX tasks have to
+	 * be drained _after_ it.
+	 */
+	for (idx = 0; idx < sc->hn_tx_ring_inuse; ++idx) {
+		ring = &sc->hn_tx_ring[idx];
+
+		taskqueue_drain(ring->hn_tx_taskq, &ring->hn_tx_task);
+		taskqueue_drain(ring->hn_tx_taskq, &ring->hn_txeof_task);
+	}
+}
+
+/*
+ * Task handler: clear the softc's hn_mgmt_taskq pointer.
+ */
+static void
+hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
+{
+	struct hn_softc *sc = xsc;
+
+	sc->hn_mgmt_taskq = NULL;
+}
+
+/*
+ * Suspend the management tasks: clear hn_mgmt_taskq through a task run on
+ * the primary channel, and then drain hn_mgmt_taskq0.
+ */
+static void
+hn_suspend_mgmt(struct hn_softc *sc)
+{
+	struct task clr_task;
+
+	HN_LOCK_ASSERT(sc);
+
+	/*
+	 * After this task has run, hn_mgmt_taskq0 can no longer be
+	 * reached through hn_mgmt_taskq.
+	 */
+	TASK_INIT(&clr_task, 0, hn_suspend_mgmt_taskfunc, sc);
+	vmbus_chan_run_task(sc->hn_prichan, &clr_task);
+
+	/*
+	 * Wait for all of the pending management tasks to complete.
+	 */
+	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
+	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
+	taskqueue_drain_all(sc->hn_mgmt_taskq0);
+}
+
+/*
+ * Suspend the interface: disable polling, suspend the data path if it is
+ * active, and suspend the management tasks.
+ */
+static void
+hn_suspend(struct hn_softc *sc)
+{
+	bool data_active;
+
+	/* Disable polling. */
+	hn_polling(sc, 0);
+
+	/*
+	 * The data path is active if the interface is running.  It is
+	 * also active if the non-transparent mode VF is activated, since
+	 * the synthetic device is receiving packets then.
+	 */
+	data_active = (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) != 0 ||
+	    (sc->hn_flags & HN_FLAG_RXVF) != 0;
+	if (data_active)
+		hn_suspend_data(sc);
+	hn_suspend_mgmt(sc);
+}
+
+/*
+ * Clear the suspended mark on the first tx_ring_cnt TX rings, each under
+ * its own TX lock.
+ */
+static void
+hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
+{
+	struct hn_tx_ring *ring;
+	int idx = 0;
+
+	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
+	    ("invalid TX ring count %d", tx_ring_cnt));
+
+	while (idx < tx_ring_cnt) {
+		ring = &sc->hn_tx_ring[idx];
+
+		mtx_lock(&ring->hn_tx_lock);
+		ring->hn_suspended = 0;
+		mtx_unlock(&ring->hn_tx_lock);
+		++idx;
+	}
+}
+
+/*
+ * Resume the data path: reconfigure the RX filter, clear the suspended
+ * mark on all TX rings, flush the buf_rings of the TX rings that are not
+ * in use, and kick start TX on the rings in use.
+ */
+static void
+hn_resume_data(struct hn_softc *sc)
+{
+	int ring;
+
+	HN_LOCK_ASSERT(sc);
+
+	/*
+	 * Re-enable RX.
+	 */
+	hn_rxfilter_config(sc);
+
+	/*
+	 * The suspend status is cleared on "all" of the TX rings, because
+	 * hn_tx_ring_inuse can be changed after hn_suspend_data().
+	 */
+	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
+
+#ifdef HN_IFSTART_SUPPORT
+	if (!hn_use_if_start)
+#endif
+	{
+		/*
+		 * hn_tx_ring_inuse may have been reduced; flush the
+		 * drbrs that are no longer used.
+		 */
+		ring = sc->hn_tx_ring_inuse;
+		while (ring < sc->hn_tx_ring_cnt) {
+			hn_tx_ring_qflush(&sc->hn_tx_ring[ring]);
+			++ring;
+		}
+	}
+
+	/*
+	 * Kick start TX.
+	 */
+	for (ring = 0; ring < sc->hn_tx_ring_inuse; ++ring) {
+		struct hn_tx_ring *txr = &sc->hn_tx_ring[ring];
+
+		/*
+		 * The txeof task is used here, so that any pending
+		 * oactive gets cleared properly.
+		 */
+		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
+	}
+}
+
+/*
+ * Resume the management tasks: restore hn_mgmt_taskq and kick off either
+ * the network change detection or the link status check.
+ */
+static void
+hn_resume_mgmt(struct hn_softc *sc)
+{
+
+	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
+
+	if ((sc->hn_link_flags & HN_LINK_FLAG_NETCHG) == 0) {
+		/*
+		 * No network change was pending; start the link status
+		 * check, which is more lightweight than the network
+		 * change detection.
+		 */
+		hn_update_link_status(sc);
+		return;
+	}
+
+	/* A network change was pending; kick off its detection. */
+	hn_change_network(sc);
+}
+
+/*
+ * Resume the interface: resume the data path if it should be active,
+ * resume the management tasks unless a VF is attached/activated, and
+ * re-enable polling if it was requested and the interface is running.
+ */
+static void
+hn_resume(struct hn_softc *sc)
+{
+
+	/*
+	 * The data path is resumed if the interface is running.  It is
+	 * also resumed if the non-transparent mode VF is activated, since
+	 * the synthetic device has to receive packets then.
+	 */
+	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) != 0 ||
+	    (sc->hn_flags & HN_FLAG_RXVF) != 0)
+		hn_resume_data(sc);
+
+	/*
+	 * Link status change is not resumed if VF is attached/activated:
+	 * - In the non-transparent VF mode, the synthetic device marks
+	 *   link down until the VF is deactivated; i.e. VF is down.
+	 * - In transparent VF mode, VF's media status is used until
+	 *   the VF is detached.
+	 */
+	if (!((sc->hn_flags & HN_FLAG_RXVF) != 0 ||
+	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)))
+		hn_resume_mgmt(sc);
+
+	/*
+	 * Polling is re-enabled only if this interface is running and
+	 * polling was requested.
+	 */
+	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) != 0 &&
+	    sc->hn_pollhz > 0)
+		hn_polling(sc, sc->hn_pollhz);
+}
+
+/*
+ * Handle a received RNDIS status message.
+ *
+ * data points to the message, which is dlen bytes long.  Media
+ * connect/disconnect statuses trigger a link status update; a network
+ * change status triggers the network change processing; offload config
+ * and link speed change statuses are ignored; anything else is logged.
+ */
+static void
+hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
+{
+	const struct rndis_status_msg *msg;
+	int ofs;
+
+	/*
+	 * Compare as int, so that a negative dlen is rejected instead of
+	 * being converted to a huge unsigned value.
+	 */
+	if (dlen < (int)sizeof(*msg)) {
+		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
+		return;
+	}
+	msg = data;
+
+	switch (msg->rm_status) {
+	case RNDIS_STATUS_MEDIA_CONNECT:
+	case RNDIS_STATUS_MEDIA_DISCONNECT:
+		hn_update_link_status(sc);
+		break;
+
+	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
+	case RNDIS_STATUS_LINK_SPEED_CHANGE:
+		/* Not really useful; ignore. */
+		break;
+
+	case RNDIS_STATUS_NETWORK_CHANGE:
+		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
+		/*
+		 * Validate the status buffer against the message length
+		 * without computing ofs + rm_stbuflen: both values come
+		 * from the message, and the sum could wrap around and
+		 * pass the check with an out-of-bounds offset.
+		 */
+		if (ofs < 0 || ofs > dlen ||
+		    msg->rm_stbuflen > (uint32_t)(dlen - ofs) ||
+		    msg->rm_stbuflen < sizeof(uint32_t)) {
+			if_printf(sc->hn_ifp, "network changed\n");
+		} else {
+			uint32_t change;
+
+			/* The status buffer may be unaligned; copy it out. */
+			memcpy(&change, ((const uint8_t *)msg) + ofs,
+			    sizeof(change));
+			if_printf(sc->hn_ifp, "network changed, change %u\n",
+			    change);
+		}
+		hn_change_network(sc);
+		break;
+
+	default:
+		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
+		    msg->rm_status);
+		break;
+	}
+}
+
+/*
+ * Walk the per-packet-info elements stored in the 'info_dlen' bytes at
+ * 'info_data' and record pointers to the payloads the RX path uses in
+ * 'info'.
+ *
+ * Returns 0 on success and EINVAL if an element is malformed.  Elements
+ * of a type that is not handled here are skipped.  If no hash value
+ * element was found, info->hash_info is reset to NULL.
+ */
+static int
+hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
+{
+	const struct rndis_pktinfo *pi = info_data;
+	uint32_t mask = 0;
+
+	while (info_dlen != 0) {
+		const void *data;
+		uint32_t dlen;
+
+		if (__predict_false(info_dlen < sizeof(*pi)))
+			return (EINVAL);
+		if (__predict_false(info_dlen < pi->rm_size))
+			return (EINVAL);
+		info_dlen -= pi->rm_size;
+
+		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
+			return (EINVAL);
+		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
+			return (EINVAL);
+		/*
+		 * The payload is read from rm_data, so the payload offset
+		 * must not point before it; otherwise 'dlen' would count
+		 * bytes that lie beyond the end of this element.  Together
+		 * with the check above this also makes rm_size non-zero,
+		 * without which an element of an unhandled type would never
+		 * advance 'pi' and this loop would spin forever.
+		 */
+		if (__predict_false(pi->rm_pktinfooffset <
+		    __offsetof(struct rndis_pktinfo, rm_data)))
+			return (EINVAL);
+		dlen = pi->rm_size - pi->rm_pktinfooffset;
+		data = pi->rm_data;
+
+		if (pi->rm_internal == 1) {
+			switch (pi->rm_type) {
+			case NDIS_PKTINFO_IT_PKTINFO_ID:
+				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
+					return (EINVAL);
+				info->pktinfo_id =
+				    (const struct packet_info_id *)data;
+				mask |= HN_RXINFO_PKTINFO_ID;
+				break;
+
+			default:
+				goto next;
+			}
+		} else {
+			switch (pi->rm_type) {
+			case NDIS_PKTINFO_TYPE_VLAN:
+				if (__predict_false(dlen
+				    < NDIS_VLAN_INFO_SIZE))
+					return (EINVAL);
+				info->vlan_info = (const uint32_t *)data;
+				mask |= HN_RXINFO_VLAN;
+				break;
+
+			case NDIS_PKTINFO_TYPE_CSUM:
+				if (__predict_false(dlen
+				    < NDIS_RXCSUM_INFO_SIZE))
+					return (EINVAL);
+				info->csum_info = (const uint32_t *)data;
+				mask |= HN_RXINFO_CSUM;
+				break;
+
+			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
+				if (__predict_false(dlen
+				    < HN_NDIS_HASH_VALUE_SIZE))
+					return (EINVAL);
+				info->hash_value = (const uint32_t *)data;
+				mask |= HN_RXINFO_HASHVAL;
+				break;
+
+			case HN_NDIS_PKTINFO_TYPE_HASHINF:
+				if (__predict_false(dlen
+				    < HN_NDIS_HASH_INFO_SIZE))
+					return (EINVAL);
+				info->hash_info = (const uint32_t *)data;
+				mask |= HN_RXINFO_HASHINF;
+				break;
+
+			default:
+				goto next;
+			}
+		}
+
+		if (mask == HN_RXINFO_ALL) {
+			/* All found; done */
+			break;
+		}
+next:
+		/* rm_size was validated above; step to the next element. */
+		pi = (const struct rndis_pktinfo *)
+		    ((const uint8_t *)pi + pi->rm_size);
+	}
+
+	/*
+	 * Final fixup.
+	 * - If there is no hash value, invalidate the hash info.
+	 */
+	if ((mask & HN_RXINFO_HASHVAL) == 0)
+		info->hash_info = NULL;
+	return (0);
+}
+
+/*
+ * Report whether the range [off, off + len) intersects the range
+ * [check_off, check_off + check_len).  Two ranges that start at the
+ * same offset are always reported as overlapping.
+ */
+static __inline bool
+hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
+{
+	if (off == check_off)
+		return (true);
+	/* The range that starts first must end before the other begins. */
+	if (off < check_off)
+		return (!__predict_true(off + len <= check_off));
+	return (!__predict_true(check_off + check_len <= off));
+}
+
+/*
+ * Append one fragment ('data', 'len') to the RSC state of 'rxr'.
+ *
+ * The first fragment (rsc.cnt == 0) also latches the per-packet-info
+ * pointers from 'info' and starts the packet length; later fragments
+ * only add to the packet length.  rsc.cnt is not range checked here.
+ */
+static __inline void
+hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
+		uint32_t len, struct hn_rxinfo *info)
+{
+	struct hn_rx_rsc *rsc = &rxr->rsc;
+	const uint32_t idx = rsc->cnt;
+
+	if (idx == 0) {
+		rsc->vlan_info = info->vlan_info;
+		rsc->csum_info = info->csum_info;
+		rsc->hash_info = info->hash_info;
+		rsc->hash_value = info->hash_value;
+		rsc->pktlen = len;
+	} else {
+		rsc->pktlen += len;
+	}
+
+	rsc->frag_data[idx] = data;
+	rsc->frag_len[idx] = len;
+	rsc->cnt = idx + 1;
+}
+
+/*
+ * Validate one REMOTE_NDIS_PACKET_MSG and feed its data to the RX path.
+ *
+ * The message length, the data/OOB/per-packet-info offsets and the
+ * mutual coverage of those three areas are checked first; a message
+ * failing any of these checks is logged and ignored.  The data is then
+ * stored as an RSC fragment on 'rxr', and hn_rxpkt() is called unless
+ * more fragments of the same packet are expected.  Fragments that do
+ * not form a valid sequence are counted in rxr->hn_rsc_drop.
+ */
+static void
+hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
+{
+	const struct rndis_packet_msg *pkt;
+	struct hn_rxinfo info;
+	int data_off, pktinfo_off, data_len, pktinfo_len;
+	bool rsc_more = false;
+
+	/*
+	 * Check length.
+	 */
+	if (__predict_false(dlen < sizeof(*pkt))) {
+		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
+		return;
+	}
+	pkt = data;
+
+	if (__predict_false(dlen < pkt->rm_len)) {
+		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
+		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
+		return;
+	}
+	/*
+	 * NOTE:
+	 * The three lengths are summed in 64 bits; a 32-bit sum could wrap
+	 * around and let an oversized length pass this check.
+	 */
+	if (__predict_false(pkt->rm_len < (uint64_t)pkt->rm_datalen +
+	    pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
+		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
+		    "msglen %u, data %u, oob %u, pktinfo %u\n",
+		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
+		    pkt->rm_pktinfolen);
+		return;
+	}
+	if (__predict_false(pkt->rm_datalen == 0)) {
+		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
+		return;
+	}
+
+	/*
+	 * Check offsets.
+	 */
+#define IS_OFFSET_INVALID(ofs)			\
+	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
+	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
+
+	/* XXX Hyper-V does not meet data offset alignment requirement */
+	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
+		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+		    "data offset %u\n", pkt->rm_dataoffset);
+		return;
+	}
+	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
+	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
+		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+		    "oob offset %u\n", pkt->rm_oobdataoffset);
+		return;
+	}
+	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
+	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
+		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
+		return;
+	}
+
+#undef IS_OFFSET_INVALID
+
+	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
+	data_len = pkt->rm_datalen;
+	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
+	pktinfo_len = pkt->rm_pktinfolen;
+
+	/*
+	 * Check OOB coverage.
+	 */
+	if (__predict_false(pkt->rm_oobdatalen != 0)) {
+		int oob_off, oob_len;
+
+		if_printf(rxr->hn_ifp, "got oobdata\n");
+		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
+		oob_len = pkt->rm_oobdatalen;
+
+		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
+			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+			    "oob overflow, msglen %u, oob abs %d len %d\n",
+			    pkt->rm_len, oob_off, oob_len);
+			return;
+		}
+
+		/*
+		 * Check against data.
+		 */
+		if (hn_rndis_check_overlap(oob_off, oob_len,
+		    data_off, data_len)) {
+			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+			    "oob overlaps data, oob abs %d len %d, "
+			    "data abs %d len %d\n",
+			    oob_off, oob_len, data_off, data_len);
+			return;
+		}
+
+		/*
+		 * Check against pktinfo.
+		 */
+		if (pktinfo_len != 0 &&
+		    hn_rndis_check_overlap(oob_off, oob_len,
+		    pktinfo_off, pktinfo_len)) {
+			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+			    "oob overlaps pktinfo, oob abs %d len %d, "
+			    "pktinfo abs %d len %d\n",
+			    oob_off, oob_len, pktinfo_off, pktinfo_len);
+			return;
+		}
+	}
+
+	/*
+	 * Check per-packet-info coverage and find useful per-packet-info.
+	 *
+	 * NOTE:
+	 * hn_rsc_add_data() copies every pointer of 'info', so all of them,
+	 * hash_value included, must start out as NULL.
+	 */
+	info.vlan_info = NULL;
+	info.csum_info = NULL;
+	info.hash_info = NULL;
+	info.hash_value = NULL;
+	info.pktinfo_id = NULL;
+
+	if (__predict_true(pktinfo_len != 0)) {
+		bool overlap;
+		int error;
+
+		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
+			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+			    "pktinfo overflow, msglen %u, "
+			    "pktinfo abs %d len %d\n",
+			    pkt->rm_len, pktinfo_off, pktinfo_len);
+			return;
+		}
+
+		/*
+		 * Check packet info coverage.
+		 */
+		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
+		    data_off, data_len);
+		if (__predict_false(overlap)) {
+			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+			    "pktinfo overlap data, pktinfo abs %d len %d, "
+			    "data abs %d len %d\n",
+			    pktinfo_off, pktinfo_len, data_off, data_len);
+			return;
+		}
+
+		/*
+		 * Find useful per-packet-info.
+		 */
+		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
+		    pktinfo_len, &info);
+		if (__predict_false(error)) {
+			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
+			    "pktinfo\n");
+			return;
+		}
+	}
+
+	if (__predict_false(data_off + data_len > pkt->rm_len)) {
+		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+		    "data overflow, msglen %u, data abs %d len %d\n",
+		    pkt->rm_len, data_off, data_len);
+		return;
+	}
+
+	/* Identify RSC fragments, drop invalid packets */
+	if ((info.pktinfo_id != NULL) &&
+	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
+		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
+			/* A first fragment restarts the fragment list. */
+			rxr->rsc.cnt = 0;
+			rxr->hn_rsc_pkts++;
+		} else if (rxr->rsc.cnt == 0)
+			goto drop;	/* continuation w/o a first fragment */
+
+		rsc_more = true;
+
+		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
+			rsc_more = false;
+
+		/* More fragments expected, but rsc.is_last is already set. */
+		if (rsc_more && rxr->rsc.is_last)
+			goto drop;
+	} else {
+		rxr->rsc.cnt = 0;
+	}
+
+	/* hn_rsc_add_data() does not range check rsc.cnt itself. */
+	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
+		goto drop;
+
+	/* Store data in per rx ring structure */
+	hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off,
+	    data_len, &info);
+
+	if (rsc_more)
+		return;
+
+	hn_rxpkt(rxr);
+	rxr->rsc.cnt = 0;
+	return;
+drop:
+	rxr->hn_rsc_drop++;
+	return;
+}
+
+/*
+ * Dispatch one RNDIS message by its type: packet messages take the data
+ * path on 'rxr', status indications and all remaining messages are
+ * handed over together with the interface's softc.
+ */
+static __inline void
+hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
+{
+	const struct rndis_msghdr *hdr = data;
+
+	/* 'hdr' is not dereferenced before the length has been checked. */
+	if (__predict_false(dlen < sizeof(*hdr))) {
+		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
+		return;
+	}
+
+	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
+		/* Hot data path. */
+		hn_rndis_rx_data(rxr, data, dlen);
+	} else if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) {
+		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
+	} else {
+		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
+	}
+}
+
+/*
+ * Handle an inband NVS notification.  HN_NVS_TYPE_TXTBL_NOTE is ignored
+ * silently; every other type is only logged.
+ */
+static void
+hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
+{
+	const struct hn_nvs_hdr *nvs;
+
+	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*nvs)) {
+		if_printf(sc->hn_ifp, "invalid nvs notify\n");
+		return;
+	}
+
+	nvs = VMBUS_CHANPKT_CONST_DATA(pkt);
+	if (nvs->nvs_type != HN_NVS_TYPE_TXTBL_NOTE) {
+		if_printf(sc->hn_ifp, "got notify, nvs type %u\n",
+		    nvs->nvs_type);
+	}
+}
+
+/*
+ * Handle a completion channel packet: the transaction id of the packet
+ * is the address of the send context, whose callback is run on the
+ * packet's data.
+ */
+static void
+hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
+    const struct vmbus_chanpkt_hdr *pkt)
+{
+	struct hn_nvs_sendctx *sndc =
+	    (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
+
+	/*
+	 * NOTE:
+	 * The callback may free 'sndc'; it MUST NOT be touched after
+	 * this call returns.
+	 */
+	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
+	    VMBUS_CHANPKT_DATALEN(pkt));
+}
+
+/*
+ * Handle an RXBUF channel packet.
+ *
+ * The packet must carry an NVS header of type HN_NVS_TYPE_RNDIS and a
+ * vmbus_chanpkt_rxbuf header with the HN_NVS_RXBUF_SIG id; otherwise it
+ * is logged and ignored (and not acked).  Each range listed in the
+ * packet is passed to hn_rndis_rxpkt() inside a network epoch section,
+ * and the RXBUF is acked once all ranges have been processed.
+ */
+static void
+hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
+    const struct vmbus_chanpkt_hdr *pkthdr)
+{
+	struct epoch_tracker et;
+	const struct vmbus_chanpkt_rxbuf *pkt;
+	const struct hn_nvs_hdr *nvs_hdr;
+	int count, i, hlen;
+
+	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
+		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
+		return;
+	}
+	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
+
+	/* Make sure that this is a RNDIS message. */
+	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
+		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
+		    nvs_hdr->nvs_type);
+		return;
+	}
+
+	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
+	if (__predict_false(hlen < sizeof(*pkt))) {
+		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
+		return;
+	}
+	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
+
+	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
+		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
+		    pkt->cp_rxbuf_id);
+		return;
+	}
+
+	/* The header must be large enough to hold all 'count' ranges. */
+	count = pkt->cp_rxbuf_cnt;
+	if (__predict_false(hlen <
+	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
+		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
+		return;
+	}
+
+	NET_EPOCH_ENTER(et);
+	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
+	for (i = 0; i < count; ++i) {
+		uint32_t ofs, len;
+
+		ofs = pkt->cp_rxbuf[i].rb_ofs;
+		len = pkt->cp_rxbuf[i].rb_len;
+		/*
+		 * Both values are taken from the channel packet.  Keep
+		 * them unsigned and add them in 64 bits, so that neither
+		 * a value with the top bit set nor a wrapped sum can pass
+		 * the range check.
+		 */
+		if (__predict_false((uint64_t)ofs + len > HN_RXBUF_SIZE)) {
+			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
+			    "ofs %u, len %u\n", i, ofs, len);
+			continue;
+		}
+
+		/* Tell the RSC logic whether this is the final range. */
+		rxr->rsc.is_last = (i == (count - 1));
+		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
+	}
+	NET_EPOCH_EXIT(et);
+
+	/*
+	 * Ack the consumed RXBUF associated w/ this channel packet,
+	 * so that this RXBUF can be recycled by the hypervisor.
+	 */
+	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
+}
+
+/*
+ * Send the HN_NVS_TYPE_RNDIS_ACK completion for transaction 'tid' on
+ * 'chan'.  The send is attempted up to 10 times, 100us apart, as long
+ * as it fails with EAGAIN; each such failure is counted in
+ * rxr->hn_ack_failed.  Any other result ends the attempt silently.
+ */
+static void
+hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
+    uint64_t tid)
+{
+	struct hn_nvs_rndis_ack ack;
+	int retries, error;
+
+	/*
+	 * Clear the whole message first: all sizeof(ack) bytes are sent,
+	 * and the reserved/pad bytes would otherwise carry whatever was
+	 * left on the stack.
+	 */
+	memset(&ack, 0, sizeof(ack));
+	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
+	ack.nvs_status = HN_NVS_STATUS_OK;
+
+	for (retries = 0; retries < 10; retries++) {
+		if (retries > 0)
+			DELAY(100);
+
+		error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
+		    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
+		if (__predict_true(error != EAGAIN))
+			return;
+
+		/*
+		 * NOTE:
+		 * This should _not_ happen in real world, since the
+		 * consumption of the TX bufring from the TX path is
+		 * controlled.
+		 */
+		if (rxr->hn_ack_failed == 0)
+			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
+		rxr->hn_ack_failed++;
+	}
+
+	/* RXBUF leaks! */
+	if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
+}
+
+/*
+ * Channel callback: receive channel packets from 'chan' until none is
+ * left, dispatching each by its type, then call hn_chan_rollup() for
+ * this RX ring and its TX ring.  The ring's packet buffer is replaced by
+ * a larger one whenever the receive fails with ENOBUFS.
+ */
+static void
+hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
+{
+	struct hn_rx_ring *rxr = xrxr;
+	struct hn_softc *sc = rxr->hn_ifp->if_softc;
+
+	for (;;) {
+		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
+		int pktlen = rxr->hn_pktbuf_len;
+		int error;
+
+		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
+		if (__predict_false(error == EAGAIN)) {
+			/* No more channel packets; done! */
+			break;
+		}
+		if (__predict_false(error == ENOBUFS)) {
+			void *grown;
+			int grown_len;
+
+			/*
+			 * Expand channel packet buffer: keep doubling its
+			 * size until it is no less than 'pktlen'.
+			 *
+			 * XXX
+			 * Use M_WAITOK here, since allocation failure
+			 * is fatal.
+			 */
+			grown_len = rxr->hn_pktbuf_len * 2;
+			while (grown_len < pktlen)
+				grown_len *= 2;
+			grown = malloc(grown_len, M_DEVBUF, M_WAITOK);
+
+			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
+			    rxr->hn_pktbuf_len, grown_len);
+
+			free(rxr->hn_pktbuf, M_DEVBUF);
+			rxr->hn_pktbuf = grown;
+			rxr->hn_pktbuf_len = grown_len;
+			/* Retry with the new buffer. */
+			continue;
+		}
+		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
+
+		switch (pkt->cph_type) {
+		case VMBUS_CHANPKT_TYPE_COMP:
+			hn_nvs_handle_comp(sc, chan, pkt);
+			break;
+
+		case VMBUS_CHANPKT_TYPE_RXBUF:
+			hn_nvs_handle_rxbuf(rxr, chan, pkt);
+			break;
+
+		case VMBUS_CHANPKT_TYPE_INBAND:
+			hn_nvs_handle_notify(sc, pkt);
+			break;
+
+		default:
+			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
+			    pkt->cph_type);
+			break;
+		}
+	}
+	hn_chan_rollup(rxr, rxr->hn_txr);
+}
+
+/*
+ * One-time driver setup: allocate the hn_udpcs_fixup counter, sanitize
+ * the tunables, set up the VF map and its lock, and, on a Hyper-V guest
+ * using the global TX taskqueue mode, create and start the shared TX
+ * taskqueues.
+ */
+static void
+hn_sysinit(void *arg __unused)
+{
+	int idx;
+
+	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
+
+#ifdef HN_IFSTART_SUPPORT
+	/*
+	 * Don't use ifnet.if_start if transparent VF mode is requested;
+	 * mainly due to the IFF_DRV_OACTIVE flag.
+	 */
+	if (hn_xpnt_vf && hn_use_if_start) {
+		hn_use_if_start = 0;
+		printf("hn: tranparent VF mode, if_transmit will be used, "
+		    "instead of if_start\n");
+	}
+#endif
+	/* Enforce the lower bound of the VF attach wait timeout. */
+	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
+		printf("hn: invalid transparent VF attach routing "
+		    "wait timeout %d, reset to %d\n",
+		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
+		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
+	}
+
+	/*
+	 * Initialize VF map.
+	 */
+	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
+	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
+	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
+	    M_WAITOK | M_ZERO);
+
+	/*
+	 * Fix the # of TX taskqueues: at least 1, at most mp_ncpus.
+	 */
+	if (hn_tx_taskq_cnt <= 0)
+		hn_tx_taskq_cnt = 1;
+	else if (hn_tx_taskq_cnt > mp_ncpus)
+		hn_tx_taskq_cnt = mp_ncpus;
+
+	/*
+	 * Fix the TX taskqueue mode: anything but a known mode falls
+	 * back to the independent mode.
+	 */
+	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_INDEP &&
+	    hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL &&
+	    hn_tx_taskq_mode != HN_TX_TASKQ_M_EVTTQ)
+		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
+
+	/* Shared TX taskqueues are for Hyper-V guests in global mode only. */
+	if (vm_guest != VM_GUEST_HV ||
+	    hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
+		return;
+
+	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
+	    M_DEVBUF, M_WAITOK);
+	for (idx = 0; idx < hn_tx_taskq_cnt; ++idx) {
+		hn_tx_taskque[idx] = taskqueue_create("hn_tx", M_WAITOK,
+		    taskqueue_thread_enqueue, &hn_tx_taskque[idx]);
+		taskqueue_start_threads(&hn_tx_taskque[idx], 1, PI_NET,
+		    "hn tx%d", idx);
+	}
+}
+SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
+
+/*
+ * Undo hn_sysinit(): free the shared TX taskqueues (if they were
+ * created), the VF map and its lock, and the hn_udpcs_fixup counter.
+ */
+static void
+hn_sysuninit(void *arg __unused)
+{
+	int idx;
+
+	if (hn_tx_taskque != NULL) {
+		for (idx = 0; idx < hn_tx_taskq_cnt; ++idx)
+			taskqueue_free(hn_tx_taskque[idx]);
+		free(hn_tx_taskque, M_DEVBUF);
+	}
+
+	if (hn_vfmap != NULL)
+		free(hn_vfmap, M_DEVBUF);
+	rm_destroy(&hn_vfmap_lock);
+
+	counter_u64_free(hn_udpcs_fixup);
+}
+SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
diff --git a/sys/dev/hyperv/netvsc/if_hnreg.h b/sys/dev/hyperv/netvsc/if_hnreg.h
new file mode 100644
index 000000000000..54db556cc56d
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/if_hnreg.h
@@ -0,0 +1,270 @@
+/*-
+ * Copyright (c) 2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IF_HNREG_H_
+#define _IF_HNREG_H_
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+/*
+ * NDIS protocol version numbers
+ */
+#define HN_NDIS_VERSION_6_1		0x00060001
+#define HN_NDIS_VERSION_6_20		0x00060014
+#define HN_NDIS_VERSION_6_30		0x0006001e
+#define HN_NDIS_VERSION_MAJOR(ver)	(((ver) & 0xffff0000) >> 16)
+#define HN_NDIS_VERSION_MINOR(ver)	((ver) & 0xffff)
+
+/*
+ * NVS versions.
+ */
+#define HN_NVS_VERSION_1		0x00002
+#define HN_NVS_VERSION_2		0x30002
+#define HN_NVS_VERSION_4		0x40000
+#define HN_NVS_VERSION_5		0x50000
+#define HN_NVS_VERSION_6		0x60000
+#define HN_NVS_VERSION_61		0x60001
+
+#define HN_NVS_RXBUF_SIG		0xcafe
+#define HN_NVS_CHIM_SIG			0xface
+
+#define HN_NVS_CHIM_IDX_INVALID		0xffffffff
+
+#define HN_NVS_RNDIS_MTYPE_DATA		0
+#define HN_NVS_RNDIS_MTYPE_CTRL		1
+
+/*
+ * NVS message transaction status codes.
+ */
+#define HN_NVS_STATUS_OK		1
+#define HN_NVS_STATUS_FAILED		2
+
+/*
+ * NVS request/response message types.
+ */
+#define HN_NVS_TYPE_INIT		1
+#define HN_NVS_TYPE_INIT_RESP		2
+#define HN_NVS_TYPE_NDIS_INIT		100
+#define HN_NVS_TYPE_RXBUF_CONN		101
+#define HN_NVS_TYPE_RXBUF_CONNRESP	102
+#define HN_NVS_TYPE_RXBUF_DISCONN	103
+#define HN_NVS_TYPE_CHIM_CONN		104
+#define HN_NVS_TYPE_CHIM_CONNRESP	105
+#define HN_NVS_TYPE_CHIM_DISCONN	106
+#define HN_NVS_TYPE_RNDIS		107
+#define HN_NVS_TYPE_RNDIS_ACK		108
+#define HN_NVS_TYPE_NDIS_CONF		125
+#define HN_NVS_TYPE_VFASSOC_NOTE	128	/* notification */
+#define HN_NVS_TYPE_SET_DATAPATH	129
+#define HN_NVS_TYPE_SUBCH_REQ		133
+#define HN_NVS_TYPE_SUBCH_RESP		133	/* same as SUBCH_REQ */
+#define HN_NVS_TYPE_TXTBL_NOTE		134	/* notification */
+
+/*
+ * Any size less than this one will _not_ work, e.g. hn_nvs_init
+ * only has 12B valid data, however, if only 12B data were sent,
+ * Hypervisor would never reply.
+ */
+#define HN_NVS_REQSIZE_MIN		32
+
+/* NVS message common header */
+struct hn_nvs_hdr {
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_ */
+} __packed;
+
+struct hn_nvs_init {			/* NVS init request */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_INIT */
+	uint32_t	nvs_ver_min;	/* presumably HN_NVS_VERSION_; confirm */
+	uint32_t	nvs_ver_max;	/* presumably HN_NVS_VERSION_; confirm */
+	uint8_t		nvs_rsvd[20];
+	uint8_t		nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_init) >= HN_NVS_REQSIZE_MIN);
+
+struct hn_nvs_init_resp {		/* NVS init response */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_INIT_RESP */
+	uint32_t	nvs_ver;	/* deprecated */
+	uint32_t	nvs_rsvd;
+	uint32_t	nvs_status;	/* HN_NVS_STATUS_ */
+} __packed;
+
+/* No response */
+struct hn_nvs_ndis_conf {		/* NDIS configuration message */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_NDIS_CONF */
+	uint32_t	nvs_mtu;
+	uint32_t	nvs_rsvd;
+	uint64_t	nvs_caps;	/* HN_NVS_NDIS_CONF_ */
+	uint8_t		nvs_rsvd1[12];
+	uint8_t		nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_ndis_conf) >= HN_NVS_REQSIZE_MIN);
+
+#define HN_NVS_NDIS_CONF_SRIOV		0x0004
+#define HN_NVS_NDIS_CONF_VLAN		0x0008
+#define HN_NVS_NDIS_CONF_RSC		0x0080
+
+/* No response */
+struct hn_nvs_ndis_init {		/* NDIS version message */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_NDIS_INIT */
+	uint32_t	nvs_ndis_major;	/* NDIS_VERSION_MAJOR_ */
+	uint32_t	nvs_ndis_minor;	/* NDIS_VERSION_MINOR_ */
+	uint8_t		nvs_rsvd[20];
+	uint8_t		nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_ndis_init) >= HN_NVS_REQSIZE_MIN);
+
+#define HN_NVS_DATAPATH_SYNTH		0
+#define HN_NVS_DATAPATH_VF		1
+
+/* No response */
+struct hn_nvs_datapath {		/* datapath selection message */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_SET_DATAPATH */
+	uint32_t	nvs_active_path;/* HN_NVS_DATAPATH_* */
+	uint32_t	nvs_rsvd[6];
+	uint8_t		nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_datapath) >= HN_NVS_REQSIZE_MIN);
+
+struct hn_nvs_rxbuf_conn {		/* RXBUF connect request */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_RXBUF_CONN */
+	uint32_t	nvs_gpadl;	/* RXBUF vmbus GPADL */
+	uint16_t	nvs_sig;	/* HN_NVS_RXBUF_SIG */
+	uint8_t		nvs_rsvd[22];
+	uint8_t		nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_rxbuf_conn) >= HN_NVS_REQSIZE_MIN);
+
+struct hn_nvs_rxbuf_sect {		/* element of connresp nvs_sect[] */
+	uint32_t	nvs_start;
+	uint32_t	nvs_slotsz;	/* presumably size of one slot; confirm */
+	uint32_t	nvs_slotcnt;	/* presumably # of slots; confirm */
+	uint32_t	nvs_end;
+} __packed;
+
+struct hn_nvs_rxbuf_connresp {		/* RXBUF connect response */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_RXBUF_CONNRESP */
+	uint32_t	nvs_status;	/* HN_NVS_STATUS_ */
+	uint32_t	nvs_nsect;	/* # of elem in nvs_sect */
+	struct hn_nvs_rxbuf_sect nvs_sect[];	/* variable-length tail */
+} __packed;
+
+/* No response */
+struct hn_nvs_rxbuf_disconn {		/* RXBUF disconnect message */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_RXBUF_DISCONN */
+	uint16_t	nvs_sig;	/* HN_NVS_RXBUF_SIG */
+	uint8_t		nvs_rsvd[26];
+	uint8_t		nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_rxbuf_disconn) >= HN_NVS_REQSIZE_MIN);
+
+struct hn_nvs_chim_conn {		/* chimney buf connect request */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_CHIM_CONN */
+	uint32_t	nvs_gpadl;	/* chimney buf vmbus GPADL */
+	uint16_t	nvs_sig;	/* HN_NVS_CHIM_SIG */
+	uint8_t		nvs_rsvd[22];
+	uint8_t		nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_chim_conn) >= HN_NVS_REQSIZE_MIN);
+
+struct hn_nvs_chim_connresp {		/* chimney buf connect response */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_CHIM_CONNRESP */
+	uint32_t	nvs_status;	/* HN_NVS_STATUS_ */
+	uint32_t	nvs_sectsz;	/* section size */
+} __packed;
+
+/* No response */
+struct hn_nvs_chim_disconn {		/* chimney buf disconnect message */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_CHIM_DISCONN */
+	uint16_t	nvs_sig;	/* HN_NVS_CHIM_SIG */
+	uint8_t		nvs_rsvd[26];
+	uint8_t		nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_chim_disconn) >= HN_NVS_REQSIZE_MIN);
+
+#define HN_NVS_SUBCH_OP_ALLOC		1
+
+struct hn_nvs_subch_req {		/* sub-channel request */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_SUBCH_REQ */
+	uint32_t	nvs_op;		/* HN_NVS_SUBCH_OP_ */
+	uint32_t	nvs_nsubch;	/* presumably # of sub-channels; confirm */
+	uint8_t		nvs_rsvd[20];
+	uint8_t		nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_subch_req) >= HN_NVS_REQSIZE_MIN);
+
+struct hn_nvs_subch_resp {		/* sub-channel response */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_SUBCH_RESP (== _REQ) */
+	uint32_t	nvs_status;	/* HN_NVS_STATUS_ */
+	uint32_t	nvs_nsubch;	/* presumably # of sub-channels; confirm */
+} __packed;
+
+struct hn_nvs_rndis {			/* NVS message carrying RNDIS */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_RNDIS */
+	uint32_t	nvs_rndis_mtype;/* HN_NVS_RNDIS_MTYPE_ */
+	/*
+	 * Chimney sending buffer index and size.
+	 *
+	 * NOTE:
+	 * If nvs_chim_idx is set to HN_NVS_CHIM_IDX_INVALID
+	 * and nvs_chim_sz is set to 0, then chimney sending
+	 * buffer is _not_ used by this RNDIS message.
+	 */
+	uint32_t	nvs_chim_idx;
+	uint32_t	nvs_chim_sz;
+	uint8_t		nvs_rsvd[16];
+	uint8_t		nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_rndis) >= HN_NVS_REQSIZE_MIN);
+
+struct hn_nvs_rndis_ack {		/* sent as the RXBUF completion */
+	uint32_t	nvs_type;	/* HN_NVS_TYPE_RNDIS_ACK */
+	uint32_t	nvs_status;	/* HN_NVS_STATUS_ */
+	uint8_t		nvs_rsvd[24];
+	uint8_t		nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_rndis_ack) >= HN_NVS_REQSIZE_MIN);
+
+/*
+ * RNDIS extension
+ */
+
+/* Per-packet hash info */
+#define HN_NDIS_HASH_INFO_SIZE		sizeof(uint32_t)
+#define HN_NDIS_PKTINFO_TYPE_HASHINF	NDIS_PKTINFO_TYPE_ORIG_NBLIST
+/* NDIS_HASH_ */
+
+/* Per-packet hash value */
+#define HN_NDIS_HASH_VALUE_SIZE		sizeof(uint32_t)
+#define HN_NDIS_PKTINFO_TYPE_HASHVAL	NDIS_PKTINFO_TYPE_PKT_CANCELID
+
+/* Per-packet-info size */
+#define HN_RNDIS_PKTINFO_SIZE(dlen)	\
+	__offsetof(struct rndis_pktinfo, rm_data[dlen])
+
+#endif	/* !_IF_HNREG_H_ */
diff --git a/sys/dev/hyperv/netvsc/if_hnvar.h b/sys/dev/hyperv/netvsc/if_hnvar.h
new file mode 100644
index 000000000000..27d93db5395e
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/if_hnvar.h
@@ -0,0 +1,335 @@
+/*-
+ * Copyright (c) 2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IF_HNVAR_H_
+#define _IF_HNVAR_H_
+
+#define HN_USE_TXDESC_BUFRING
+
+#define HN_CHIM_SIZE			(15 * 1024 * 1024)
+
+#define HN_RXBUF_SIZE			(31 * 1024 * 1024)
+#define HN_RXBUF_SIZE_COMPAT		(15 * 1024 * 1024)
+
+#define HN_MTU_MAX			(65535 - ETHER_ADDR_LEN)
+
+#define HN_TXBR_SIZE			(128 * PAGE_SIZE)
+#define HN_RXBR_SIZE			(128 * PAGE_SIZE)
+
+#define HN_XACT_REQ_PGCNT		2
+#define HN_XACT_RESP_PGCNT		2
+#define HN_XACT_REQ_SIZE		(HN_XACT_REQ_PGCNT * PAGE_SIZE)
+#define HN_XACT_RESP_SIZE		(HN_XACT_RESP_PGCNT * PAGE_SIZE)
+
+#define HN_GPACNT_MAX			32
+
+struct hn_txdesc;
+#ifndef HN_USE_TXDESC_BUFRING
+SLIST_HEAD(hn_txdesc_list, hn_txdesc);
+#else
+struct buf_ring;
+#endif
+struct hn_tx_ring;
+
+#define	HN_NVS_RSC_MAX		562	/* Max RSC frags in one vmbus packet */
+
+struct hn_rx_rsc {			/* per RX ring RSC fragment state */
+	const uint32_t		*vlan_info;	/* from the 1st fragment */
+	const uint32_t		*csum_info;	/* from the 1st fragment */
+	const uint32_t		*hash_info;	/* from the 1st fragment */
+	const uint32_t		*hash_value;	/* from the 1st fragment */
+	uint32_t		cnt;		/* fragment count */
+	uint32_t		pktlen;		/* sum of frag_len[0..cnt-1] */
+	uint8_t			is_last;	/* last range of the chan pkt */
+	const void		*frag_data[HN_NVS_RSC_MAX]; /* frag. data */
+	uint32_t		frag_len[HN_NVS_RSC_MAX]; /* frag. lengths */
+};
+
+struct hn_rx_ring {
+	struct ifnet	*hn_ifp;
+	struct ifnet	*hn_rxvf_ifp;	/* SR-IOV VF for RX */
+	struct hn_tx_ring *hn_txr;	/* passed to hn_chan_rollup() */
+	void		*hn_pktbuf;	/* channel packet buffer */
+	int		hn_pktbuf_len;	/* size of hn_pktbuf; may grow */
+	int		hn_rx_flags;	/* HN_RX_FLAG_ */
+	uint32_t	hn_mbuf_hash;	/* NDIS_HASH_ */
+	uint8_t		*hn_rxbuf;	/* shadow sc->hn_rxbuf */
+	int		hn_rx_idx;
+	struct hn_rx_rsc rsc;		/* RSC fragments being collected */
+
+	/* Trust csum verification on host side */
+	int		hn_trust_hcsum;	/* HN_TRUST_HCSUM_ */
+	struct lro_ctrl	hn_lro;
+
+	u_long		hn_csum_ip;
+	u_long		hn_csum_tcp;
+	u_long		hn_csum_udp;
+	u_long		hn_csum_trusted;
+	u_long		hn_lro_tried;
+	u_long		hn_small_pkts;
+	u_long		hn_pkts;
+	u_long		hn_rss_pkts;
+	u_long		hn_ack_failed;	/* # of RXBUF acks hitting EAGAIN */
+	u_long		hn_rsc_pkts;	/* # of RSC 1st fragments seen */
+	u_long		hn_rsc_drop;	/* # of RSC fragments dropped */
+
+	/* Rarely used stuff */
+	struct sysctl_oid *hn_rx_sysctl_tree;
+
+	void		*hn_br;		/* TX/RX bufring */
+	struct hyperv_dma hn_br_dma;
+
+	struct vmbus_channel *hn_chan;
+} __aligned(CACHE_LINE_SIZE);
+
+#define HN_TRUST_HCSUM_IP	0x0001
+#define HN_TRUST_HCSUM_TCP	0x0002
+#define HN_TRUST_HCSUM_UDP	0x0004
+
+#define HN_RX_FLAG_ATTACHED	0x0001
+#define HN_RX_FLAG_BR_REF	0x0002
+#define HN_RX_FLAG_XPNT_VF	0x0004
+#define HN_RX_FLAG_UDP_HASH	0x0008
+
+struct hn_tx_ring {
+#ifndef HN_USE_TXDESC_BUFRING
+	struct mtx	hn_txlist_spin;
+	struct hn_txdesc_list hn_txlist;
+#else
+	struct buf_ring	*hn_txdesc_br;
+#endif
+	int		hn_txdesc_cnt;
+	int		hn_txdesc_avail;
+	u_short		hn_has_txeof;
+	u_short		hn_txdone_cnt;
+
+	int		hn_sched_tx;
+	void		(*hn_txeof)(struct hn_tx_ring *);
+	struct taskqueue *hn_tx_taskq;
+	struct task	hn_tx_task;
+	struct task	hn_txeof_task;
+
+	struct buf_ring	*hn_mbuf_br;
+	int		hn_oactive;
+	int		hn_tx_idx;
+	int		hn_tx_flags;	/* presumably HN_TX_FLAG_; confirm */
+
+	struct mtx	hn_tx_lock;
+	struct hn_softc	*hn_sc;
+	struct vmbus_channel *hn_chan;
+
+	int		hn_direct_tx_size;
+	int		hn_chim_size;
+	bus_dma_tag_t	hn_tx_data_dtag;
+	uint64_t	hn_csum_assist;
+
+	/* Applied packet transmission aggregation limits. */
+	int		hn_agg_szmax;
+	short		hn_agg_pktmax;
+	short		hn_agg_align;
+
+	/* Packet transmission aggregation states. */
+	struct hn_txdesc *hn_agg_txd;
+	int		hn_agg_szleft;
+	short		hn_agg_pktleft;
+	struct rndis_packet_msg *hn_agg_prevpkt;
+
+	/* Temporary stats for each send. */
+	int		hn_stat_size;
+	short		hn_stat_pkts;
+	short		hn_stat_mcasts;
+
+	int		(*hn_sendpkt)(struct hn_tx_ring *, struct hn_txdesc *);
+	int		hn_suspended;
+	int		hn_gpa_cnt;
+	struct vmbus_gpa hn_gpa[HN_GPACNT_MAX];
+
+	u_long		hn_no_txdescs;
+	u_long		hn_send_failed;
+	u_long		hn_txdma_failed;
+	u_long		hn_tx_collapsed;
+	u_long		hn_tx_chimney_tried;
+	u_long		hn_tx_chimney;
+	u_long		hn_pkts;
+	u_long		hn_sends;
+	u_long		hn_flush_failed;
+
+	/* Rarely used stuff */
+	struct hn_txdesc *hn_txdesc;
+	bus_dma_tag_t	hn_tx_rndis_dtag;
+	struct sysctl_oid *hn_tx_sysctl_tree;
+} __aligned(CACHE_LINE_SIZE);
+
+#define HN_TX_FLAG_ATTACHED	0x0001
+#define HN_TX_FLAG_HASHVAL	0x0002	/* support HASHVAL pktinfo */
+
+/*
+ * Device-specific softc structure
+ */
+struct hn_softc {
+	struct ifnet    *hn_ifp;
+	struct ifmedia	hn_media;
+	device_t        hn_dev;
+	int             hn_if_flags;
+	struct sx	hn_lock;
+	struct vmbus_channel *hn_prichan;
+
+	int		hn_rx_ring_cnt;
+	int		hn_rx_ring_inuse;
+	struct hn_rx_ring *hn_rx_ring;
+
+	struct rmlock	hn_vf_lock;
+	struct ifnet	*hn_vf_ifp;	/* SR-IOV VF */
+	uint32_t	hn_xvf_flags;	/* transparent VF flags */
+
+	int		hn_tx_ring_cnt;
+	int		hn_tx_ring_inuse;
+	struct hn_tx_ring *hn_tx_ring;
+
+	uint8_t		*hn_chim;
+	u_long		*hn_chim_bmap;
+	int		hn_chim_bmap_cnt;
+	int		hn_chim_cnt;
+	int		hn_chim_szmax;
+
+	int		hn_cpu;
+	struct taskqueue **hn_tx_taskqs;
+	struct sysctl_oid *hn_tx_sysctl_tree;
+	struct sysctl_oid *hn_rx_sysctl_tree;
+	struct vmbus_xact_ctx *hn_xact;
+	uint32_t	hn_nvs_ver;
+	uint32_t	hn_rx_filter;
+
+	/* Packet transmission aggregation user settings. */
+	int			hn_agg_size;
+	int			hn_agg_pkts;
+
+	struct taskqueue	*hn_mgmt_taskq;
+	struct taskqueue	*hn_mgmt_taskq0;
+	struct task		hn_link_task;
+	struct task		hn_netchg_init;
+	struct timeout_task	hn_netchg_status;
+	uint32_t		hn_link_flags;	/* HN_LINK_FLAG_ */
+
+	uint32_t		hn_caps;	/* HN_CAP_ */
+	uint32_t		hn_flags;	/* HN_FLAG_ */
+	u_int			hn_pollhz;	/* > 0: polling is requested */
+
+	void			*hn_rxbuf;	/* shadowed by hn_rx_ring */
+	uint32_t		hn_rxbuf_gpadl;
+	struct hyperv_dma	hn_rxbuf_dma;
+
+	uint32_t		hn_chim_gpadl;
+	struct hyperv_dma	hn_chim_dma;
+
+	uint32_t		hn_rndis_rid;
+	uint32_t		hn_ndis_ver;
+	int			hn_ndis_tso_szmax;
+	int			hn_ndis_tso_sgmin;
+	uint32_t		hn_rndis_agg_size;
+	uint32_t		hn_rndis_agg_pkts;
+	uint32_t		hn_rndis_agg_align;
+
+	int			hn_rss_ind_size;
+	uint32_t		hn_rss_hash;	/* setting, NDIS_HASH_ */
+	uint32_t		hn_rss_hcap;	/* caps, NDIS_HASH_ */
+	struct ndis_rssprm_toeplitz hn_rss;
+
+	eventhandler_tag	hn_ifaddr_evthand;
+	eventhandler_tag	hn_ifnet_evthand;
+	eventhandler_tag	hn_ifnet_atthand;
+	eventhandler_tag	hn_ifnet_dethand;
+	eventhandler_tag	hn_ifnet_lnkhand;
+
+	/*
+	 * Transparent VF delayed initialization.
+	 */
+	int			hn_vf_rdytick;	/* ticks, 0 == ready */
+	struct taskqueue	*hn_vf_taskq;
+	struct timeout_task	hn_vf_init;
+
+	/*
+	 * Saved information for VF under transparent mode.
+	 */
+	void			(*hn_vf_input)
+				(struct ifnet *, struct mbuf *);
+	int			hn_saved_caps;
+	u_int			hn_saved_tsomax;
+	u_int			hn_saved_tsosegcnt;
+	u_int			hn_saved_tsosegsz;
+};
+
+#define HN_FLAG_RXBUF_CONNECTED		0x0001
+#define HN_FLAG_CHIM_CONNECTED		0x0002
+#define HN_FLAG_HAS_RSSKEY		0x0004
+#define HN_FLAG_HAS_RSSIND		0x0008
+#define HN_FLAG_SYNTH_ATTACHED		0x0010
+#define HN_FLAG_NO_SLEEPING		0x0020
+#define HN_FLAG_RXBUF_REF		0x0040
+#define HN_FLAG_CHIM_REF		0x0080
+#define HN_FLAG_RXVF			0x0100
+
+#define HN_FLAG_ERRORS			(HN_FLAG_RXBUF_REF | HN_FLAG_CHIM_REF)
+
+#define HN_XVFFLAG_ENABLED		0x0001
+#define HN_XVFFLAG_ACCBPF		0x0002
+
+#define HN_NO_SLEEPING(sc)			\
+do {						\
+	(sc)->hn_flags |= HN_FLAG_NO_SLEEPING;	\
+} while (0)
+
+#define HN_SLEEPING_OK(sc)			\
+do {						\
+	(sc)->hn_flags &= ~HN_FLAG_NO_SLEEPING;	\
+} while (0)
+
+#define HN_CAN_SLEEP(sc)		\
+	(((sc)->hn_flags & HN_FLAG_NO_SLEEPING) == 0)
+
+#define HN_CAP_VLAN			0x0001
+#define HN_CAP_MTU			0x0002
+#define HN_CAP_IPCS			0x0004
+#define HN_CAP_TCP4CS			0x0008
+#define HN_CAP_TCP6CS			0x0010
+#define HN_CAP_UDP4CS			0x0020
+#define HN_CAP_UDP6CS			0x0040
+#define HN_CAP_TSO4			0x0080
+#define HN_CAP_TSO6			0x0100
+#define HN_CAP_HASHVAL			0x0200
+#define HN_CAP_UDPHASH			0x0400
+
+/* Capability description for use with printf(9) %b identifier. */
+#define HN_CAP_BITS				\
+	"\020\1VLAN\2MTU\3IPCS\4TCP4CS\5TCP6CS"	\
+	"\6UDP4CS\7UDP6CS\10TSO4\11TSO6\12HASHVAL\13UDPHASH"
+
+#define HN_LINK_FLAG_LINKUP		0x0001
+#define HN_LINK_FLAG_NETCHG		0x0002
+
+#endif	/* !_IF_HNVAR_H_ */
diff --git a/sys/dev/hyperv/netvsc/ndis.h b/sys/dev/hyperv/netvsc/ndis.h
new file mode 100644
index 000000000000..c69da7807a63
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/ndis.h
@@ -0,0 +1,422 @@
+/*-
+ * Copyright (c) 2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_NDIS_H_
+#define _NET_NDIS_H_
+
+#define	NDIS_MEDIA_STATE_CONNECTED	0
+#define	NDIS_MEDIA_STATE_DISCONNECTED	1
+
+#define	NDIS_NETCHANGE_TYPE_POSSIBLE	1
+#define	NDIS_NETCHANGE_TYPE_DEFINITE	2
+#define	NDIS_NETCHANGE_TYPE_FROMMEDIA	3
+
+#define	NDIS_OFFLOAD_SET_NOCHG		0
+#define	NDIS_OFFLOAD_SET_ON		1
+#define	NDIS_OFFLOAD_SET_OFF		2
+
+/* a.k.a GRE MAC */
+#define	NDIS_ENCAP_TYPE_NVGRE		0x00000001
+
+#define	NDIS_HASH_FUNCTION_MASK		0x000000FF	/* see hash function */
+#define	NDIS_HASH_TYPE_MASK		0x00FFFF00	/* see hash type */
+
+/* hash function */
+#define	NDIS_HASH_FUNCTION_TOEPLITZ	0x00000001
+
+/* hash type */
+#define	NDIS_HASH_IPV4			0x00000100
+#define	NDIS_HASH_TCP_IPV4		0x00000200
+#define	NDIS_HASH_IPV6			0x00000400
+#define	NDIS_HASH_IPV6_EX		0x00000800
+#define	NDIS_HASH_TCP_IPV6		0x00001000
+#define	NDIS_HASH_TCP_IPV6_EX		0x00002000
+#define	NDIS_HASH_UDP_IPV4_X		0x00004000	/* XXX non-standard */
+
+#define	NDIS_HASH_ALL			(NDIS_HASH_IPV4 |	\
+					 NDIS_HASH_TCP_IPV4 |	\
+					 NDIS_HASH_IPV6 |	\
+					 NDIS_HASH_IPV6_EX |	\
+					 NDIS_HASH_TCP_IPV6 |	\
+					 NDIS_HASH_TCP_IPV6_EX |\
+					 NDIS_HASH_UDP_IPV4_X)
+
+#define	NDIS_HASH_STD			(NDIS_HASH_IPV4 |	\
+					 NDIS_HASH_TCP_IPV4 |	\
+					 NDIS_HASH_IPV6 |	\
+					 NDIS_HASH_IPV6_EX |	\
+					 NDIS_HASH_TCP_IPV6 |	\
+					 NDIS_HASH_TCP_IPV6_EX)
+
+/* Hash description for use with printf(9) %b identifier. */
+#define	NDIS_HASH_BITS			\
+	"\20\1TOEPLITZ\11IP4\12TCP4\13IP6\14IP6EX\15TCP6\16TCP6EX\17UDP4_X"
+
+#define	NDIS_HASH_KEYSIZE_TOEPLITZ	40
+#define	NDIS_HASH_INDCNT		128
+
+#define	NDIS_OBJTYPE_DEFAULT		0x80
+#define	NDIS_OBJTYPE_RSS_CAPS		0x88
+#define	NDIS_OBJTYPE_RSS_PARAMS		0x89
+#define	NDIS_OBJTYPE_OFFLOAD		0xa7
+
+struct ndis_object_hdr {
+	uint8_t			ndis_type;	/* NDIS_OBJTYPE_ */
+	uint8_t			ndis_rev;	/* type specific */
+	uint16_t		ndis_size;	/* incl. this hdr */
+};
+
+/*
+ * OID_TCP_OFFLOAD_PARAMETERS
+ * ndis_type: NDIS_OBJTYPE_DEFAULT
+ */
+struct ndis_offload_params {
+	struct ndis_object_hdr	ndis_hdr;
+	uint8_t			ndis_ip4csum;	/* NDIS_OFFLOAD_PARAM_ */
+	uint8_t			ndis_tcp4csum;	/* NDIS_OFFLOAD_PARAM_ */
+	uint8_t			ndis_udp4csum;	/* NDIS_OFFLOAD_PARAM_ */
+	uint8_t			ndis_tcp6csum;	/* NDIS_OFFLOAD_PARAM_ */
+	uint8_t			ndis_udp6csum;	/* NDIS_OFFLOAD_PARAM_ */
+	uint8_t			ndis_lsov1;	/* NDIS_OFFLOAD_PARAM_ */
+	uint8_t			ndis_ipsecv1;	/* NDIS_OFFLOAD_IPSECV1_ */
+	uint8_t			ndis_lsov2_ip4;	/* NDIS_OFFLOAD_LSOV2_ */
+	uint8_t			ndis_lsov2_ip6;	/* NDIS_OFFLOAD_LSOV2_ */
+	uint8_t			ndis_tcp4conn;	/* 0 */
+	uint8_t			ndis_tcp6conn;	/* 0 */
+	uint32_t		ndis_flags;	/* 0 */
+	/* NDIS >= 6.1 */
+	uint8_t			ndis_ipsecv2;	/* NDIS_OFFLOAD_IPSECV2_ */
+	uint8_t			ndis_ipsecv2_ip4;/* NDIS_OFFLOAD_IPSECV2_ */
+	/* NDIS >= 6.30 */
+	uint8_t			ndis_rsc_ip4;	/* NDIS_OFFLOAD_RSC_ */
+	uint8_t			ndis_rsc_ip6;	/* NDIS_OFFLOAD_RSC_ */
+	uint8_t			ndis_encap;	/* NDIS_OFFLOAD_SET_ */
+	uint8_t			ndis_encap_types;/* NDIS_ENCAP_TYPE_ */
+};
+
+#define	NDIS_OFFLOAD_PARAMS_SIZE	sizeof(struct ndis_offload_params)
+#define	NDIS_OFFLOAD_PARAMS_SIZE_6_1	\
+	__offsetof(struct ndis_offload_params, ndis_rsc_ip4)
+
+#define	NDIS_OFFLOAD_PARAMS_REV_2	2	/* NDIS 6.1 */
+#define	NDIS_OFFLOAD_PARAMS_REV_3	3	/* NDIS 6.30 */
+
+#define	NDIS_OFFLOAD_PARAM_NOCHG	0	/* common */
+#define	NDIS_OFFLOAD_PARAM_OFF		1
+#define	NDIS_OFFLOAD_PARAM_TX		2
+#define	NDIS_OFFLOAD_PARAM_RX		3
+#define	NDIS_OFFLOAD_PARAM_TXRX		4
+
+/* NDIS_OFFLOAD_PARAM_NOCHG */
+#define	NDIS_OFFLOAD_LSOV1_OFF		1
+#define	NDIS_OFFLOAD_LSOV1_ON		2
+
+/* NDIS_OFFLOAD_PARAM_NOCHG */
+#define	NDIS_OFFLOAD_IPSECV1_OFF	1
+#define	NDIS_OFFLOAD_IPSECV1_AH		2
+#define	NDIS_OFFLOAD_IPSECV1_ESP	3
+#define	NDIS_OFFLOAD_IPSECV1_AH_ESP	4
+
+/* NDIS_OFFLOAD_PARAM_NOCHG */
+#define	NDIS_OFFLOAD_LSOV2_OFF		1
+#define	NDIS_OFFLOAD_LSOV2_ON		2
+
+/* NDIS_OFFLOAD_PARAM_NOCHG */
+#define	NDIS_OFFLOAD_IPSECV2_OFF	1
+#define	NDIS_OFFLOAD_IPSECV2_AH		2
+#define	NDIS_OFFLOAD_IPSECV2_ESP	3
+#define	NDIS_OFFLOAD_IPSECV2_AH_ESP	4
+
+/* NDIS_OFFLOAD_PARAM_NOCHG */
+#define	NDIS_OFFLOAD_RSC_OFF		1
+#define	NDIS_OFFLOAD_RSC_ON		2
+
+/*
+ * OID_GEN_RECEIVE_SCALE_CAPABILITIES
+ * ndis_type: NDIS_OBJTYPE_RSS_CAPS
+ */
+struct ndis_rss_caps {
+	struct ndis_object_hdr		ndis_hdr;
+	uint32_t			ndis_caps;	/* NDIS_RSS_CAP_ */
+	uint32_t			ndis_nmsi;	/* # of MSIs */
+	uint32_t			ndis_nrxr;	/* # of RX rings */
+	/* NDIS >= 6.30 */
+	uint16_t			ndis_nind;	/* # of indtbl ent. */
+	uint16_t			ndis_pad;
+};
+
+#define	NDIS_RSS_CAPS_SIZE		\
+	__offsetof(struct ndis_rss_caps, ndis_pad)
+#define	NDIS_RSS_CAPS_SIZE_6_0		\
+	__offsetof(struct ndis_rss_caps, ndis_nind)
+
+#define	NDIS_RSS_CAPS_REV_1		1	/* NDIS 6.{0,1,20} */
+#define	NDIS_RSS_CAPS_REV_2		2	/* NDIS 6.30 */
+
+#define	NDIS_RSS_CAP_MSI		0x01000000
+#define	NDIS_RSS_CAP_CLASSIFY_ISR	0x02000000
+#define	NDIS_RSS_CAP_CLASSIFY_DPC	0x04000000
+#define	NDIS_RSS_CAP_MSIX		0x08000000
+#define	NDIS_RSS_CAP_IPV4		0x00000100
+#define	NDIS_RSS_CAP_IPV6		0x00000200
+#define	NDIS_RSS_CAP_IPV6_EX		0x00000400
+#define	NDIS_RSS_CAP_HASH_TOEPLITZ	NDIS_HASH_FUNCTION_TOEPLITZ
+#define	NDIS_RSS_CAP_HASHFUNC_MASK	NDIS_HASH_FUNCTION_MASK
+
+/*
+ * OID_GEN_RECEIVE_SCALE_PARAMETERS
+ * ndis_type: NDIS_OBJTYPE_RSS_PARAMS
+ */
+struct ndis_rss_params {
+	struct ndis_object_hdr		ndis_hdr;
+	uint16_t			ndis_flags;	/* NDIS_RSS_FLAG_ */
+	uint16_t			ndis_bcpu;	/* base cpu 0 */
+	uint32_t			ndis_hash;	/* NDIS_HASH_ */
+	uint16_t			ndis_indsize;	/* indirect table */
+	uint32_t			ndis_indoffset;
+	uint16_t			ndis_keysize;	/* hash key */
+	uint32_t			ndis_keyoffset;
+	/* NDIS >= 6.20 */
+	uint32_t			ndis_cpumaskoffset;
+	uint32_t			ndis_cpumaskcnt;
+	uint32_t			ndis_cpumaskentsz;
+};
+
+#define	NDIS_RSS_PARAMS_SIZE		sizeof(struct ndis_rss_params)
+#define	NDIS_RSS_PARAMS_SIZE_6_0	\
+	__offsetof(struct ndis_rss_params, ndis_cpumaskoffset)
+
+#define	NDIS_RSS_PARAMS_REV_1		1	/* NDIS 6.0 */
+#define	NDIS_RSS_PARAMS_REV_2		2	/* NDIS 6.20 */
+
+#define	NDIS_RSS_FLAG_NONE		0x0000
+#define	NDIS_RSS_FLAG_BCPU_UNCHG	0x0001
+#define	NDIS_RSS_FLAG_HASH_UNCHG	0x0002
+#define	NDIS_RSS_FLAG_IND_UNCHG		0x0004
+#define	NDIS_RSS_FLAG_KEY_UNCHG		0x0008
+#define	NDIS_RSS_FLAG_DISABLE		0x0010
+
+/* Non-standard convenience struct: RSS params with key and table inline. */
+struct ndis_rssprm_toeplitz {
+	struct ndis_rss_params		rss_params;
+	/* Toeplitz hash key */
+	uint8_t				rss_key[NDIS_HASH_KEYSIZE_TOEPLITZ];
+	/* Indirect table */
+	uint32_t			rss_ind[NDIS_HASH_INDCNT];
+};
+
+#define	NDIS_RSSPRM_TOEPLITZ_SIZE(nind)	\
+	__offsetof(struct ndis_rssprm_toeplitz, rss_ind[nind])
+
+/*
+ * OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES
+ * ndis_type: NDIS_OBJTYPE_OFFLOAD
+ */
+
+#define	NDIS_OFFLOAD_ENCAP_NONE		0x0000
+#define	NDIS_OFFLOAD_ENCAP_NULL		0x0001
+#define	NDIS_OFFLOAD_ENCAP_8023		0x0002
+#define	NDIS_OFFLOAD_ENCAP_8023PQ	0x0004
+#define	NDIS_OFFLOAD_ENCAP_8023PQ_OOB	0x0008
+#define	NDIS_OFFLOAD_ENCAP_RFC1483	0x0010
+
+struct ndis_csum_offload {
+	uint32_t			ndis_ip4_txenc;	/*NDIS_OFFLOAD_ENCAP_*/
+	uint32_t			ndis_ip4_txcsum;
+#define	NDIS_TXCSUM_CAP_IP4OPT		0x001
+#define	NDIS_TXCSUM_CAP_TCP4OPT		0x004
+#define	NDIS_TXCSUM_CAP_TCP4		0x010
+#define	NDIS_TXCSUM_CAP_UDP4		0x040
+#define	NDIS_TXCSUM_CAP_IP4		0x100
+	uint32_t			ndis_ip4_rxenc;	/*NDIS_OFFLOAD_ENCAP_*/
+	uint32_t			ndis_ip4_rxcsum;
+#define	NDIS_RXCSUM_CAP_IP4OPT		0x001
+#define	NDIS_RXCSUM_CAP_TCP4OPT		0x004
+#define	NDIS_RXCSUM_CAP_TCP4		0x010
+#define	NDIS_RXCSUM_CAP_UDP4		0x040
+#define	NDIS_RXCSUM_CAP_IP4		0x100
+	uint32_t			ndis_ip6_txenc;	/*NDIS_OFFLOAD_ENCAP_*/
+	uint32_t			ndis_ip6_txcsum;
+#define	NDIS_TXCSUM_CAP_IP6EXT		0x001
+#define	NDIS_TXCSUM_CAP_TCP6OPT		0x004
+#define	NDIS_TXCSUM_CAP_TCP6		0x010
+#define	NDIS_TXCSUM_CAP_UDP6		0x040
+	uint32_t			ndis_ip6_rxenc;	/*NDIS_OFFLOAD_ENCAP_*/
+	uint32_t			ndis_ip6_rxcsum;
+#define	NDIS_RXCSUM_CAP_IP6EXT		0x001
+#define	NDIS_RXCSUM_CAP_TCP6OPT		0x004
+#define	NDIS_RXCSUM_CAP_TCP6		0x010
+#define	NDIS_RXCSUM_CAP_UDP6		0x040
+};
+
+struct ndis_lsov1_offload {
+	uint32_t			ndis_encap;	/*NDIS_OFFLOAD_ENCAP_*/
+	uint32_t			ndis_maxsize;
+	uint32_t			ndis_minsegs;
+	uint32_t			ndis_opts;
+};
+
+struct ndis_ipsecv1_offload {
+	uint32_t			ndis_encap;	/*NDIS_OFFLOAD_ENCAP_*/
+	uint32_t			ndis_ah_esp;
+	uint32_t			ndis_xport_tun;
+	uint32_t			ndis_ip4_opts;
+	uint32_t			ndis_flags;
+	uint32_t			ndis_ip4_ah;
+	uint32_t			ndis_ip4_esp;
+};
+
+struct ndis_lsov2_offload {
+	uint32_t			ndis_ip4_encap;	/*NDIS_OFFLOAD_ENCAP_*/
+	uint32_t			ndis_ip4_maxsz;
+	uint32_t			ndis_ip4_minsg;
+	uint32_t			ndis_ip6_encap;	/*NDIS_OFFLOAD_ENCAP_*/
+	uint32_t			ndis_ip6_maxsz;
+	uint32_t			ndis_ip6_minsg;
+	uint32_t			ndis_ip6_opts;
+#define	NDIS_LSOV2_CAP_IP6EXT		0x001
+#define	NDIS_LSOV2_CAP_TCP6OPT		0x004
+};
+
+struct ndis_ipsecv2_offload {
+	uint32_t			ndis_encap;	/*NDIS_OFFLOAD_ENCAP_*/
+	uint8_t			ndis_ip6;
+	uint8_t			ndis_ip4opt;
+	uint8_t			ndis_ip6ext;
+	uint8_t			ndis_ah;
+	uint8_t			ndis_esp;
+	uint8_t			ndis_ah_esp;
+	uint8_t			ndis_xport;
+	uint8_t			ndis_tun;
+	uint8_t			ndis_xport_tun;
+	uint8_t			ndis_lso;
+	uint8_t			ndis_extseq;
+	uint32_t			ndis_udp_esp;
+	uint32_t			ndis_auth;
+	uint32_t			ndis_crypto;
+	uint32_t			ndis_sa_caps;
+};
+
+struct ndis_rsc_offload {
+	uint8_t			ndis_ip4;
+	uint8_t			ndis_ip6;
+};
+
+struct ndis_encap_offload {
+	uint32_t			ndis_flags;
+	uint32_t			ndis_maxhdr;
+};
+
+struct ndis_offload {
+	struct ndis_object_hdr		ndis_hdr;
+	struct ndis_csum_offload	ndis_csum;
+	struct ndis_lsov1_offload	ndis_lsov1;
+	struct ndis_ipsecv1_offload	ndis_ipsecv1;
+	struct ndis_lsov2_offload	ndis_lsov2;
+	uint32_t			ndis_flags;
+	/* NDIS >= 6.1 */
+	struct ndis_ipsecv2_offload	ndis_ipsecv2;
+	/* NDIS >= 6.30 */
+	struct ndis_rsc_offload		ndis_rsc;
+	struct ndis_encap_offload	ndis_encap_gre;
+};
+
+#define	NDIS_OFFLOAD_SIZE		sizeof(struct ndis_offload)
+#define	NDIS_OFFLOAD_SIZE_6_0		\
+	__offsetof(struct ndis_offload, ndis_ipsecv2)
+#define	NDIS_OFFLOAD_SIZE_6_1		\
+	__offsetof(struct ndis_offload, ndis_rsc)
+
+#define	NDIS_OFFLOAD_REV_1		1	/* NDIS 6.0 */
+#define	NDIS_OFFLOAD_REV_2		2	/* NDIS 6.1 */
+#define	NDIS_OFFLOAD_REV_3		3	/* NDIS 6.30 */
+
+/*
+ * Per-packet-info
+ */
+
+/* VLAN */
+#define	NDIS_VLAN_INFO_SIZE		sizeof(uint32_t)
+#define	NDIS_VLAN_INFO_PRI_MASK		0x0007
+#define	NDIS_VLAN_INFO_CFI_MASK		0x0008
+#define	NDIS_VLAN_INFO_ID_MASK		0xfff0
+#define	NDIS_VLAN_INFO_MAKE(id, pri, cfi)	\
+        (((pri) & NDIS_VLAN_INFO_PRI_MASK) |	\
+	 (((cfi) & 0x1) << 3) | (((id) & 0xfff) << 4))
+#define	NDIS_VLAN_INFO_ID(inf)		(((inf) & NDIS_VLAN_INFO_ID_MASK) >> 4)
+#define	NDIS_VLAN_INFO_CFI(inf)		(((inf) & NDIS_VLAN_INFO_CFI_MASK) >> 3)
+#define	NDIS_VLAN_INFO_PRI(inf)		((inf) & NDIS_VLAN_INFO_PRI_MASK)
+
+/* Reception checksum */
+#define	NDIS_RXCSUM_INFO_SIZE		sizeof(uint32_t)
+#define	NDIS_RXCSUM_INFO_TCPCS_FAILED	0x0001
+#define	NDIS_RXCSUM_INFO_UDPCS_FAILED	0x0002
+#define	NDIS_RXCSUM_INFO_IPCS_FAILED	0x0004
+#define	NDIS_RXCSUM_INFO_TCPCS_OK	0x0008
+#define	NDIS_RXCSUM_INFO_UDPCS_OK	0x0010
+#define	NDIS_RXCSUM_INFO_IPCS_OK	0x0020
+#define	NDIS_RXCSUM_INFO_LOOPBACK	0x0040
+#define	NDIS_RXCSUM_INFO_TCPCS_INVAL	0x0080
+#define	NDIS_RXCSUM_INFO_IPCS_INVAL	0x0100
+
+/* LSOv2 */
+#define	NDIS_LSO2_INFO_SIZE		sizeof(uint32_t)
+#define	NDIS_LSO2_INFO_MSS_MASK		0x000fffff
+#define	NDIS_LSO2_INFO_THOFF_MASK	0x3ff00000
+#define	NDIS_LSO2_INFO_ISLSO2		0x40000000
+#define	NDIS_LSO2_INFO_ISIPV6		0x80000000
+
+#define	NDIS_LSO2_INFO_MAKE(thoff, mss)				\
+	((((uint32_t)(mss)) & NDIS_LSO2_INFO_MSS_MASK) |	\
+	 ((((uint32_t)(thoff)) & 0x3ff) << 20) |		\
+	 NDIS_LSO2_INFO_ISLSO2)
+
+#define	NDIS_LSO2_INFO_MAKEIPV4(thoff, mss)			\
+	NDIS_LSO2_INFO_MAKE((thoff), (mss))
+
+#define	NDIS_LSO2_INFO_MAKEIPV6(thoff, mss)			\
+	(NDIS_LSO2_INFO_MAKE((thoff), (mss)) | NDIS_LSO2_INFO_ISIPV6)
+
+/* Transmission checksum */
+#define	NDIS_TXCSUM_INFO_SIZE		sizeof(uint32_t)
+#define	NDIS_TXCSUM_INFO_IPV4		0x00000001
+#define	NDIS_TXCSUM_INFO_IPV6		0x00000002
+#define	NDIS_TXCSUM_INFO_TCPCS		0x00000004
+#define	NDIS_TXCSUM_INFO_UDPCS		0x00000008
+#define	NDIS_TXCSUM_INFO_IPCS		0x00000010
+#define	NDIS_TXCSUM_INFO_THOFF		0x03ff0000
+/* Build an L4 checksum request; thoff is masked to the THOFF field. */
+#define	NDIS_TXCSUM_INFO_MKL4CS(thoff, flag)			\
+	(((((uint32_t)(thoff)) << 16) & NDIS_TXCSUM_INFO_THOFF) | (flag))
+
+#define	NDIS_TXCSUM_INFO_MKTCPCS(thoff)				\
+	NDIS_TXCSUM_INFO_MKL4CS((thoff), NDIS_TXCSUM_INFO_TCPCS)
+
+#define	NDIS_TXCSUM_INFO_MKUDPCS(thoff)				\
+	NDIS_TXCSUM_INFO_MKL4CS((thoff), NDIS_TXCSUM_INFO_UDPCS)
+#endif	/* !_NET_NDIS_H_ */
diff --git a/sys/dev/hyperv/pcib/vmbus_pcib.c b/sys/dev/hyperv/pcib/vmbus_pcib.c
new file mode 100644
index 000000000000..c7df32044678
--- /dev/null
+++ b/sys/dev/hyperv/pcib/vmbus_pcib.c
@@ -0,0 +1,1897 @@
+/*-
+ * Copyright (c) 2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef NEW_PCIB
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/sx.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+#include <sys/mutex.h>
+#include <sys/errno.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/pmap.h>
+
+#include <machine/atomic.h>
+#include <machine/bus.h>
+#include <machine/frame.h>
+#include <machine/pci_cfgreg.h>
+#include <machine/resource.h>
+
+#include <sys/pciio.h>
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pci_private.h>
+#include <dev/pci/pcib_private.h>
+#include "pcib_if.h"
+
+#include <machine/intr_machdep.h>
+#include <x86/apicreg.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+#include <dev/hyperv/vmbus/vmbus_reg.h>
+#include <dev/hyperv/vmbus/vmbus_chanvar.h>
+
+#include "vmbus_if.h"
+
+#if __FreeBSD_version < 1100000
+typedef u_long rman_res_t;
+#define RM_MAX_END	(~(rman_res_t)0)
+#endif
+
+struct completion {
+	unsigned int done;
+	struct mtx lock;
+};
+
+static void
+init_completion(struct completion *c)
+{
+	/* Zeroing the whole object also leaves done == 0 (nothing pending). */
+	memset(c, 0, sizeof(*c));
+	mtx_init(&c->lock, "hvcmpl", NULL, MTX_DEF);
+}
+
+static void
+free_completion(struct completion *c)
+{
+	mtx_destroy(&c->lock);
+}
+
+static void
+complete(struct completion *c)
+{
+	mtx_lock(&c->lock);
+	c->done++;
+	mtx_unlock(&c->lock);
+	wakeup(c);
+}
+
+static void
+wait_for_completion(struct completion *c)
+{
+	mtx_lock(&c->lock);
+	while (c->done == 0)
+		mtx_sleep(c, &c->lock, 0, "hvwfc", 0);
+	c->done--;
+	mtx_unlock(&c->lock);
+}
+
+/*
+ * Return: 0 if completed, a non-zero value if timed out.
+ */
+static int
+wait_for_completion_timeout(struct completion *c, int timeout)
+{
+	int timedout = 1;
+
+	mtx_lock(&c->lock);
+	/*
+	 * Sleep at most once.  Why the sleep ended does not matter: the
+	 * counter is simply examined again once mtx_sleep() has returned.
+	 */
+	if (c->done == 0)
+		mtx_sleep(c, &c->lock, 0, "hvwfc", timeout);
+
+	if (c->done != 0) {
+		c->done--;
+		timedout = 0;
+	}
+	mtx_unlock(&c->lock);
+
+	return (timedout);
+}
+
+#define PCI_MAKE_VERSION(major, minor) ((uint32_t)(((major) << 16) | (minor)))
+
+enum {	/* protocol versions: major in bits 31..16, minor in bits 15..0 */
+	PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),
+	PCI_PROTOCOL_VERSION_CURRENT = PCI_PROTOCOL_VERSION_1_1
+};
+
+#define PCI_CONFIG_MMIO_LENGTH	0x2000
+#define CFG_PAGE_OFFSET 0x1000
+#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
+
+/*
+ * Message Types
+ */
+
+enum pci_message_type {
+	/*
+	 * Version 1.1
+	 */
+	PCI_MESSAGE_BASE                = 0x42490000,
+	PCI_BUS_RELATIONS               = PCI_MESSAGE_BASE + 0,
+	PCI_QUERY_BUS_RELATIONS         = PCI_MESSAGE_BASE + 1,
+	PCI_POWER_STATE_CHANGE          = PCI_MESSAGE_BASE + 4,
+	PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
+	PCI_QUERY_RESOURCE_RESOURCES    = PCI_MESSAGE_BASE + 6,
+	PCI_BUS_D0ENTRY                 = PCI_MESSAGE_BASE + 7,
+	PCI_BUS_D0EXIT                  = PCI_MESSAGE_BASE + 8,
+	PCI_READ_BLOCK                  = PCI_MESSAGE_BASE + 9,
+	PCI_WRITE_BLOCK                 = PCI_MESSAGE_BASE + 0xA,
+	PCI_EJECT                       = PCI_MESSAGE_BASE + 0xB,
+	PCI_QUERY_STOP                  = PCI_MESSAGE_BASE + 0xC,
+	PCI_REENABLE                    = PCI_MESSAGE_BASE + 0xD,
+	PCI_QUERY_STOP_FAILED           = PCI_MESSAGE_BASE + 0xE,
+	PCI_EJECTION_COMPLETE           = PCI_MESSAGE_BASE + 0xF,
+	PCI_RESOURCES_ASSIGNED          = PCI_MESSAGE_BASE + 0x10,
+	PCI_RESOURCES_RELEASED          = PCI_MESSAGE_BASE + 0x11,
+	PCI_INVALIDATE_BLOCK            = PCI_MESSAGE_BASE + 0x12,
+	PCI_QUERY_PROTOCOL_VERSION      = PCI_MESSAGE_BASE + 0x13,
+	PCI_CREATE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x14,
+	PCI_DELETE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x15,
+	PCI_MESSAGE_MAXIMUM
+};
+
+/*
+ * Structures defining the virtual PCI Express protocol.
+ */
+
+union pci_version {
+	struct {
+		uint16_t minor_version;
+		uint16_t major_version;
+	} parts;
+	uint32_t version;
+} __packed;
+
+/*
+ * This representation is the one used in Windows, which is
+ * what is expected when sending this back and forth with
+ * the Hyper-V parent partition.
+ */
+union win_slot_encoding {
+	struct {
+		uint32_t	slot:5;
+		uint32_t	func:3;
+		uint32_t	reserved:24;
+	} bits;
+	uint32_t val;
+} __packed;
+
+struct pci_func_desc {
+	uint16_t	v_id;	/* vendor ID */
+	uint16_t	d_id;	/* device ID */
+	uint8_t		rev;
+	uint8_t		prog_intf;
+	uint8_t		subclass;
+	uint8_t		base_class;
+	uint32_t	subsystem_id;
+	union win_slot_encoding wslot;
+	uint32_t	ser;	/* serial number */
+} __packed;
+
+struct hv_msi_desc {
+	uint8_t		vector;
+	uint8_t		delivery_mode;
+	uint16_t	vector_count;
+	uint32_t	reserved;
+	uint64_t	cpu_mask;
+} __packed;
+
+struct tran_int_desc {
+	uint16_t	reserved;
+	uint16_t	vector_count;
+	uint32_t	data;
+	uint64_t	address;
+} __packed;
+
+struct pci_message {
+	uint32_t type;
+} __packed;
+
+struct pci_child_message {
+	struct pci_message message_type;
+	union win_slot_encoding wslot;
+} __packed;
+
+struct pci_incoming_message {
+	struct vmbus_chanpkt_hdr hdr;
+	struct pci_message message_type;
+} __packed;
+
+struct pci_response {
+	struct vmbus_chanpkt_hdr hdr;
+	int32_t status;	/* negative values are failures */
+} __packed;
+
+struct pci_packet {
+	void (*completion_func)(void *context, struct pci_response *resp,
+	    int resp_packet_size);
+	void *compl_ctxt;
+
+	struct pci_message message[0];
+};
+
+/*
+ * Specific message types supporting the PCI protocol.
+ */
+
+struct pci_version_request {
+	struct pci_message message_type;
+	uint32_t protocol_version;
+	uint32_t is_last_attempt:1;
+	uint32_t reservedz:31;
+} __packed;
+
+struct pci_bus_d0_entry {
+	struct pci_message message_type;
+	uint32_t reserved;
+	uint64_t mmio_base;
+} __packed;
+
+struct pci_bus_relations {
+	struct pci_incoming_message incoming;
+	uint32_t device_count;
+	struct pci_func_desc func[0];
+} __packed;
+
+#define MAX_NUM_BARS	(PCIR_MAX_BAR_0 + 1)
+struct pci_q_res_req_response {
+	struct vmbus_chanpkt_hdr hdr;
+	int32_t status; /* negative values are failures */
+	uint32_t probed_bar[MAX_NUM_BARS];
+} __packed;
+
+struct pci_resources_assigned {
+	struct pci_message message_type;
+	union win_slot_encoding wslot;
+	uint8_t memory_range[0x14][MAX_NUM_BARS]; /* unused here */
+	uint32_t msi_descriptors;
+	uint32_t reserved[4];
+} __packed;
+
+struct pci_create_interrupt {
+	struct pci_message message_type;
+	union win_slot_encoding wslot;
+	struct hv_msi_desc int_desc;
+} __packed;
+
+struct pci_create_int_response {
+	struct pci_response response;
+	uint32_t reserved;
+	struct tran_int_desc int_desc;
+} __packed;
+
+struct pci_delete_interrupt {
+	struct pci_message message_type;
+	union win_slot_encoding wslot;
+	struct tran_int_desc int_desc;
+} __packed;
+
+struct pci_dev_incoming {
+	struct pci_incoming_message incoming;
+	union win_slot_encoding wslot;
+} __packed;
+
+struct pci_eject_response {
+	struct pci_message message_type;
+	union win_slot_encoding wslot;
+	uint32_t status;
+} __packed;
+
+/*
+ * Driver specific state.
+ */
+
+enum hv_pcibus_state {
+	hv_pcibus_init = 0,
+	hv_pcibus_installed,
+};
+
+struct hv_pcibus {
+	device_t pcib;
+	device_t pci_bus;
+	struct vmbus_pcib_softc *sc;
+
+	uint16_t pci_domain;
+
+	enum hv_pcibus_state state;
+
+	struct resource *cfg_res;
+
+	struct completion query_completion, *query_comp;
+
+	struct mtx config_lock; /* Avoid two threads writing index page */
+	struct mtx device_list_lock;    /* Protect lists below */
+	TAILQ_HEAD(, hv_pci_dev) children;
+	TAILQ_HEAD(, hv_dr_state) dr_list;
+
+	volatile int detaching;
+};
+
+struct hv_pci_dev {
+	TAILQ_ENTRY(hv_pci_dev) link;
+
+	struct pci_func_desc desc;
+
+	bool reported_missing;
+
+	struct hv_pcibus *hbus;
+	struct task eject_task;
+
+	TAILQ_HEAD(, hv_irq_desc) irq_desc_list;
+
+	/*
+	 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
+	 * read it back, for each of the BAR offsets within config space.
+	 */
+	uint32_t probed_bar[MAX_NUM_BARS];
+};
+
+/*
+ * Tracks "Device Relations" messages from the host, which must be
+ * processed in order.
+ */
+struct hv_dr_work {
+	struct task task;
+	struct hv_pcibus *bus;
+};
+
+struct hv_dr_state {
+	TAILQ_ENTRY(hv_dr_state) link;
+	uint32_t device_count;
+	struct pci_func_desc func[0];
+};
+
+struct hv_irq_desc {
+	TAILQ_ENTRY(hv_irq_desc) link;
+	struct tran_int_desc desc;
+	int irq;
+};
+
+#define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
+#define PCI_SLOT(devfn)         (((devfn) >> 3) & 0x1f)
+#define PCI_FUNC(devfn)         ((devfn) & 0x07)
+
+static uint32_t
+devfn_to_wslot(unsigned int devfn)
+{
+	/* Start from val == 0 so the reserved bits are returned as zero. */
+	union win_slot_encoding enc = { .val = 0 };
+
+	enc.bits.func = PCI_FUNC(devfn);
+	enc.bits.slot = PCI_SLOT(devfn);
+
+	return (enc.val);
+}
+
+static unsigned int
+wslot_to_devfn(uint32_t wslot)
+{
+	union win_slot_encoding enc;
+
+	/*
+	 * Overlay the raw value on the bit-field view to pick out the
+	 * slot and function numbers, then repack them as a devfn.
+	 */
+	enc.val = wslot;
+
+	return (PCI_DEVFN((unsigned int)enc.bits.slot,
+	    (unsigned int)enc.bits.func));
+}
+
+struct vmbus_pcib_softc {
+	struct vmbus_channel	*chan;
+	void *rx_buf;
+
+	struct taskqueue	*taskq;
+
+	struct hv_pcibus	*hbus;
+};
+
+/* {44C4F61D-4444-4400-9D52-802E27EDE19F} */
+static const struct hyperv_guid g_pass_through_dev_type = {
+	.hv_guid = {0x1D, 0xF6, 0xC4, 0x44, 0x44, 0x44, 0x00, 0x44,
+	    0x9D, 0x52, 0x80, 0x2E, 0x27, 0xED, 0xE1, 0x9F}
+};
+
+struct hv_pci_compl {
+	struct completion host_event;
+	int32_t completion_status;
+};
+
+struct q_res_req_compl {
+	struct completion host_event;
+	struct hv_pci_dev *hpdev;
+};
+
+struct compose_comp_ctxt {
+	struct hv_pci_compl comp_pkt;
+	struct tran_int_desc int_desc;
+};
+
+/*
+ * It is possible the device is revoked during initialization.
+ * Check if this happens during wait.
+ * Return: 0 if response arrived, ENODEV if device revoked.
+ */
+static int
+wait_for_response(struct hv_pcibus *hbus, struct completion *c)
+{
+	/* Re-check for revocation each time a hz / 10 tick wait expires. */
+	for (;;) {
+		if (vmbus_chan_is_revoked(hbus->sc->chan)) {
+			device_printf(hbus->pcib, "The device is revoked.\n");
+			return (ENODEV);
+		}
+		if (wait_for_completion_timeout(c, hz / 10) == 0)
+			return (0);
+	}
+}
+
+static void
+hv_pci_generic_compl(void *context, struct pci_response *resp,
+    int resp_packet_size)
+{
+	struct hv_pci_compl *cp = context;
+	int32_t status = -1;	/* reported when the response is truncated */
+
+	/* Only trust resp->status if the packet is long enough to hold it. */
+	if (resp_packet_size >= sizeof(struct pci_response))
+		status = resp->status;
+	cp->completion_status = status;
+	complete(&cp->host_event);
+}
+
+static void
+q_resource_requirements(void *context, struct pci_response *resp,
+    int resp_packet_size)
+{
+	struct q_res_req_compl *completion = context;
+	struct pci_q_res_req_response *q_res_req =
+	    (struct pci_q_res_req_response *)resp;
+	int i;
+
+	/* A short or failed response carries no usable BAR data. */
+	if (resp_packet_size < (int)sizeof(*q_res_req) || resp->status < 0) {
+		printf("vmbus_pcib: failed to query resource requirements\n");
+	} else {
+		for (i = 0; i < MAX_NUM_BARS; i++)
+			completion->hpdev->probed_bar[i] =
+			    q_res_req->probed_bar[i];
+	}
+	complete(&completion->host_event);
+}
+
+static void
+hv_pci_compose_compl(void *context, struct pci_response *resp,
+    int resp_packet_size)	/* NOTE(review): unchecked -- confirm */
+{
+	struct compose_comp_ctxt *comp_pkt = context;
+	struct pci_create_int_response *int_resp =
+	    (struct pci_create_int_response *)resp;
+
+	comp_pkt->comp_pkt.completion_status = resp->status;
+	comp_pkt->int_desc = int_resp->int_desc; /* even if status < 0 */
+	complete(&comp_pkt->comp_pkt.host_event); /* wake the waiter */
+}
+
+static void
+hv_int_desc_free(struct hv_pci_dev *hpdev, struct hv_irq_desc *hid)
+{
+	struct pci_delete_interrupt *int_pkt;
+	struct {
+		struct pci_packet pkt;
+		uint8_t buffer[sizeof(struct pci_delete_interrupt)];
+	} ctxt;
+
+	memset(&ctxt, 0, sizeof(ctxt));
+	int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
+	int_pkt->message_type.type = PCI_DELETE_INTERRUPT_MESSAGE;
+	int_pkt->wslot.val = hpdev->desc.wslot.val;
+	int_pkt->int_desc = hid->desc;
+
+	vmbus_chan_send(hpdev->hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
+	    int_pkt, sizeof(*int_pkt), 0);
+
+	free(hid, M_DEVBUF);
+}
+
+static void
+hv_pci_delete_device(struct hv_pci_dev *hpdev)
+{
+	struct hv_pcibus *hbus = hpdev->hbus;
+	struct hv_irq_desc *hid, *tmp_hid;
+	device_t pci_dev;
+	int devfn;
+
+	devfn = wslot_to_devfn(hpdev->desc.wslot.val);
+
+	mtx_lock(&Giant);
+
+	pci_dev = pci_find_dbsf(hbus->pci_domain,
+	    0, PCI_SLOT(devfn), PCI_FUNC(devfn));
+	if (pci_dev)
+		device_delete_child(hbus->pci_bus, pci_dev);
+
+	mtx_unlock(&Giant);
+
+	mtx_lock(&hbus->device_list_lock);
+	TAILQ_REMOVE(&hbus->children, hpdev, link);
+	mtx_unlock(&hbus->device_list_lock);
+
+	TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid)
+		hv_int_desc_free(hpdev, hid);
+
+	free(hpdev, M_DEVBUF);
+}
+
+static struct hv_pci_dev *	/* new child, or NULL on failure */
+new_pcichild_device(struct hv_pcibus *hbus, struct pci_func_desc *desc)
+{
+	struct hv_pci_dev *hpdev;
+	struct pci_child_message *res_req;
+	struct q_res_req_compl comp_pkt;
+	struct {
+		struct pci_packet pkt;
+		uint8_t buffer[sizeof(struct pci_child_message)];
+	} ctxt;	/* pkt.message[] is zero-length; buffer supplies its room */
+	int ret;
+
+	hpdev = malloc(sizeof(*hpdev), M_DEVBUF, M_WAITOK | M_ZERO);
+	hpdev->hbus = hbus;
+
+	TAILQ_INIT(&hpdev->irq_desc_list);
+
+	init_completion(&comp_pkt.host_event);
+	comp_pkt.hpdev = hpdev;	/* callback stores probed_bar[] here */
+
+	ctxt.pkt.compl_ctxt = &comp_pkt;
+	ctxt.pkt.completion_func = q_resource_requirements;
+	/* Build the resource-requirements query for this function's slot. */
+	res_req = (struct pci_child_message *)&ctxt.pkt.message;
+	res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
+	res_req->wslot.val = desc->wslot.val;
+	/* Last arg: presumably a transaction id echoed by the reply. */
+	ret = vmbus_chan_send(hbus->sc->chan,
+	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+	    res_req, sizeof(*res_req), (uint64_t)(uintptr_t)&ctxt.pkt);
+	if (ret)
+		goto err;
+	/* NOTE(review): ctxt/comp_pkt are on-stack; confirm no late reply. */
+	if (wait_for_response(hbus, &comp_pkt.host_event))
+		goto err;
+
+	free_completion(&comp_pkt.host_event);
+
+	hpdev->desc = *desc;
+
+	mtx_lock(&hbus->device_list_lock);
+	if (TAILQ_EMPTY(&hbus->children))	/* first child sets the domain */
+		hbus->pci_domain = desc->ser & 0xFFFF;
+	TAILQ_INSERT_TAIL(&hbus->children, hpdev, link);
+	mtx_unlock(&hbus->device_list_lock);
+	return (hpdev);
+err:
+	free_completion(&comp_pkt.host_event);
+	free(hpdev, M_DEVBUF);
+	return (NULL);
+}
+
+#if __FreeBSD_version < 1100000
+
+/* Old versions don't have BUS_RESCAN(). Let's copy it from FreeBSD 11. */
+
+static struct pci_devinfo *
+pci_identify_function(device_t pcib, device_t dev, int domain, int busno,
+    int slot, int func, size_t dinfo_size)
+{
+	struct pci_devinfo *dinfo;
+
+	dinfo = pci_read_device(pcib, domain, busno, slot, func, dinfo_size);
+	if (dinfo != NULL)
+		pci_add_child(dev, dinfo);
+
+	return (dinfo);
+}
+
+/*
+ * Rescan the PCI bus "dev" for functions that appeared or disappeared.
+ *
+ * Every slot/function reachable through the parent bridge is probed via
+ * its vendor ID register.  Functions that already have a child device_t
+ * are remembered as unchanged, functions without one are handed to
+ * pci_identify_function(), children that were not seen again are
+ * deleted, and finally every child that is not in the unchanged set is
+ * probed and attached.
+ *
+ * Returns 0 on success, ENOMEM if the bookkeeping array cannot be
+ * allocated, or the error reported by device_get_children().
+ */
+static int
+pci_rescan(device_t dev)
+{
+	/* Config-space read of register n (w bytes) at the current (s, f). */
+#define	REG(n, w)	PCIB_READ_CONFIG(pcib, busno, s, f, n, w)
+	device_t pcib = device_get_parent(dev);
+	struct pci_softc *sc;
+	device_t child, *devlist, *unchanged;
+	int devcount, error, i, j, maxslots, oldcount;
+	int busno, domain, s, f, pcifunchigh;
+	uint8_t hdrtype;
+
+	/* No need to check for ARI on a rescan. */
+	error = device_get_children(dev, &devlist, &devcount);
+	if (error)
+		return (error);
+	/*
+	 * unchanged[i] is set to devlist[i] once child i has been seen
+	 * again during the scan; entries left NULL mark children to delete.
+	 */
+	if (devcount != 0) {
+		unchanged = malloc(devcount * sizeof(device_t), M_TEMP,
+		    M_NOWAIT | M_ZERO);
+		if (unchanged == NULL) {
+			free(devlist, M_TEMP);
+			return (ENOMEM);
+		}
+	} else
+		/* With no children, the loops over unchanged[] run zero times. */
+		unchanged = NULL;
+
+	/* NOTE(review): sc is assigned here but not read in this function. */
+	sc = device_get_softc(dev);
+	domain = pcib_get_domain(dev);
+	busno = pcib_get_bus(dev);
+	maxslots = PCIB_MAXSLOTS(pcib);
+	for (s = 0; s <= maxslots; s++) {
+		/* If function 0 is not present, skip to the next slot. */
+		f = 0;
+		if (REG(PCIR_VENDOR, 2) == 0xffff)
+			continue;
+		pcifunchigh = 0;
+		hdrtype = REG(PCIR_HDRTYPE, 1);
+		/* Skip slots whose header type is out of the known range. */
+		if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE)
+			continue;
+		/* Multi-function device: scan every function number. */
+		if (hdrtype & PCIM_MFDEV)
+			pcifunchigh = PCIB_MAXFUNCS(pcib);
+		for (f = 0; f <= pcifunchigh; f++) {
+			if (REG(PCIR_VENDOR, 2) == 0xffff)
+				continue;
+
+			/*
+			 * Found a valid function.  Check if a
+			 * device_t for this device already exists.
+			 */
+			for (i = 0; i < devcount; i++) {
+				child = devlist[i];
+				if (child == NULL)
+					continue;
+				if (pci_get_slot(child) == s &&
+				    pci_get_function(child) == f) {
+					unchanged[i] = child;
+					goto next_func;
+				}
+			}
+
+			/* No existing child matches: create one. */
+			pci_identify_function(pcib, dev, domain, busno, s, f,
+			    sizeof(struct pci_devinfo));
+		next_func:;
+		}
+	}
+
+	/* Remove devices that are no longer present. */
+	for (i = 0; i < devcount; i++) {
+		if (unchanged[i] != NULL)
+			continue;
+		device_delete_child(dev, devlist[i]);
+	}
+
+	free(devlist, M_TEMP);
+	/* unchanged[] still has the old length; remember it for the loop below. */
+	oldcount = devcount;
+
+	/* Try to attach the devices just added. */
+	error = device_get_children(dev, &devlist, &devcount);
+	if (error) {
+		free(unchanged, M_TEMP);
+		return (error);
+	}
+
+	for (i = 0; i < devcount; i++) {
+		/* Children recorded in unchanged[] predate this rescan. */
+		for (j = 0; j < oldcount; j++) {
+			if (devlist[i] == unchanged[j])
+				goto next_device;
+		}
+
+		device_probe_and_attach(devlist[i]);
+	next_device:;
+	}
+
+	free(unchanged, M_TEMP);
+	free(devlist, M_TEMP);
+	return (0);
+#undef REG
+}
+
+#else
+
+/*
+ * Variant used when the local rescan implementation is compiled out:
+ * defer to BUS_RESCAN() and pass its result back to the caller.
+ */
+static int
+pci_rescan(device_t dev)
+{
+	int error;
+
+	error = BUS_RESCAN(dev);
+	return (error);
+}
+
+#endif
+
+/*
+ * Taskqueue handler that applies the most recent bus-relations report.
+ *
+ * "arg" is the hv_dr_work item queued by hv_pci_devices_present(); it is
+ * freed here.  All but the newest report on hbus->dr_list are discarded.
+ * Children missing from the newest report are deleted, unknown functions
+ * are added, the PCI bus is rescanned if anything was added after the
+ * bus was installed, and a pending query completion is signalled.
+ */
+static void
+pci_devices_present_work(void *arg, int pending __unused)
+{
+	struct hv_dr_work *work = arg;
+	struct hv_pcibus *hbus = work->bus;
+	struct hv_dr_state *dr = NULL;
+	struct hv_pci_dev *hpdev, *next_hpdev;
+	struct pci_func_desc *desc;
+	struct completion *waiter;
+	bool need_rescan = false;
+	bool known;
+	uint32_t idx;
+
+	free(work, M_DEVBUF);
+
+	/* Empty the report queue, keeping only its final entry in "dr". */
+	mtx_lock(&hbus->device_list_lock);
+	while (!TAILQ_EMPTY(&hbus->dr_list)) {
+		dr = TAILQ_FIRST(&hbus->dr_list);
+		TAILQ_REMOVE(&hbus->dr_list, dr, link);
+		if (TAILQ_EMPTY(&hbus->dr_list))
+			break;
+
+		/* A newer report is still queued, so this one is stale. */
+		free(dr, M_DEVBUF);
+	}
+	mtx_unlock(&hbus->device_list_lock);
+
+	/* Another invocation already consumed the queue. */
+	if (dr == NULL)
+		return;
+
+	/* Assume every known child is gone until the report says otherwise. */
+	mtx_lock(&hbus->device_list_lock);
+	TAILQ_FOREACH(hpdev, &hbus->children, link)
+		hpdev->reported_missing = true;
+	mtx_unlock(&hbus->device_list_lock);
+
+	/* Walk the report: keep children it mentions, add the ones it adds. */
+	for (idx = 0; idx < dr->device_count; idx++) {
+		desc = &dr->func[idx];
+		known = false;
+
+		mtx_lock(&hbus->device_list_lock);
+		TAILQ_FOREACH(hpdev, &hbus->children, link) {
+			if (hpdev->desc.wslot.val != desc->wslot.val ||
+			    hpdev->desc.v_id != desc->v_id ||
+			    hpdev->desc.d_id != desc->d_id ||
+			    hpdev->desc.ser != desc->ser)
+				continue;
+
+			hpdev->reported_missing = false;
+			known = true;
+			break;
+		}
+		mtx_unlock(&hbus->device_list_lock);
+
+		if (known)
+			continue;
+
+		need_rescan = true;
+		if (new_pcichild_device(hbus, desc) == NULL)
+			printf("vmbus_pcib: failed to add a child\n");
+	}
+
+	/* Delete every child the report did not mention. */
+	TAILQ_FOREACH_SAFE(hpdev, &hbus->children, link, next_hpdev) {
+		if (hpdev->reported_missing)
+			hv_pci_delete_device(hpdev);
+	}
+
+	/* New functions only need a rescan once the bus is installed. */
+	if (need_rescan && hbus->state == hv_pcibus_installed)
+		pci_rescan(hbus->pci_bus);
+
+	/* Signal the query completion once, if one is still registered. */
+	waiter = hbus->query_comp;
+	if (waiter != NULL) {
+		hbus->query_comp = NULL;
+		complete(waiter);
+	}
+
+	free(dr, M_DEVBUF);
+}
+
+/*
+ * Find the child of "hbus" whose descriptor carries the slot encoding
+ * "wslot".  The child list is walked under device_list_lock, which is
+ * released again before returning.
+ *
+ * Returns the matching child, or NULL when no child matches.
+ */
+static struct hv_pci_dev *
+get_pcichild_wslot(struct hv_pcibus *hbus, uint32_t wslot)
+{
+	struct hv_pci_dev *match;
+
+	mtx_lock(&hbus->device_list_lock);
+	/* TAILQ_FOREACH leaves "match" NULL when the list is exhausted. */
+	TAILQ_FOREACH(match, &hbus->children, link) {
+		if (match->desc.wslot.val == wslot)
+			break;
+	}
+	mtx_unlock(&hbus->device_list_lock);
+
+	return (match);
+}
+
+/*
+ * Queue a bus-relations report for asynchronous processing.
+ *
+ * The function descriptors in "relations" are copied into a freshly
+ * allocated hv_dr_state, which is appended to hbus->dr_list, and a work
+ * item running pci_devices_present_work() is enqueued on the driver
+ * taskqueue.  While the bus is detaching only an empty report
+ * (device_count == 0) is accepted.
+ */
+static void
+hv_pci_devices_present(struct hv_pcibus *hbus,
+    struct pci_bus_relations *relations)
+{
+	struct hv_dr_state *state;
+	struct hv_dr_work *work;
+	unsigned long state_size;
+
+	if (hbus->detaching && relations->device_count > 0)
+		return;
+
+	/* Header up to func[] plus one descriptor per reported function. */
+	state_size = offsetof(struct hv_dr_state, func) +
+	    (sizeof(struct pci_func_desc) * relations->device_count);
+	state = malloc(state_size, M_DEVBUF, M_WAITOK | M_ZERO);
+
+	state->device_count = relations->device_count;
+	if (state->device_count != 0) {
+		memcpy(state->func, relations->func,
+		    sizeof(struct pci_func_desc) * state->device_count);
+	}
+
+	mtx_lock(&hbus->device_list_lock);
+	TAILQ_INSERT_TAIL(&hbus->dr_list, state, link);
+	mtx_unlock(&hbus->device_list_lock);
+
+	/* The work item is freed by pci_devices_present_work(). */
+	work = malloc(sizeof(*work), M_DEVBUF, M_WAITOK | M_ZERO);
+	work->bus = hbus;
+	TASK_INIT(&work->task, 0, pci_devices_present_work, work);
+	taskqueue_enqueue(hbus->sc->taskq, &work->task);
+}
+
+/*
+ * Task handler for a host-requested ejection of the child passed in
+ * "arg": delete the child, then send PCI_EJECTION_COMPLETE for its slot
+ * to the host.
+ */
+static void
+hv_eject_device_work(void *arg, int pending __unused)
+{
+	struct {
+		struct pci_packet pkt;
+		uint8_t buffer[sizeof(struct pci_eject_response)];
+	} ctxt;
+	struct pci_eject_response *resp;
+	struct hv_pci_dev *hpdev;
+	struct hv_pcibus *hbus;
+	union win_slot_encoding wslot;
+
+	/*
+	 * Copy out everything needed from the child first; it is not
+	 * dereferenced again once hv_pci_delete_device() has been called.
+	 */
+	hpdev = arg;
+	hbus = hpdev->hbus;
+	wslot = hpdev->desc.wslot;
+
+	hv_pci_delete_device(hpdev);
+
+	memset(&ctxt, 0, sizeof(ctxt));
+	resp = (struct pci_eject_response *)&ctxt.pkt.message;
+	resp->message_type.type = PCI_EJECTION_COMPLETE;
+	resp->wslot.val = wslot.val;
+	vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
+	    resp, sizeof(*resp), 0);
+}
+
+/*
+ * Schedule the ejection of "hpdev".  Nothing is scheduled once the bus
+ * has started detaching.
+ */
+static void
+hv_pci_eject_device(struct hv_pci_dev *hpdev)
+{
+	struct hv_pcibus *hbus;
+
+	hbus = hpdev->hbus;
+	if (hbus->detaching)
+		return;
+
+	/*
+	 * The task goes onto the channel's management taskqueue, the same
+	 * queue vmbus_pcib_attach() runs on, so that the ejection cannot
+	 * execute concurrently with vmbus_pcib_attach().
+	 */
+	TASK_INIT(&hpdev->eject_task, 0, hv_eject_device_work, hpdev);
+	taskqueue_enqueue(vmbus_chan_mgmt_tq(hbus->sc->chan),
+	    &hpdev->eject_task);
+}
+
+/* Size of the preallocated receive buffer (sc->rx_buf) used by the callback. */
+#define PCIB_PACKET_SIZE	0x100
+
+/*
+ * Channel callback: receive and dispatch every packet pending on "chan".
+ *
+ * Packets are received into sc->rx_buf.  When vmbus_chan_recv_pkt()
+ * reports ENOBUFS, a temporary buffer of bytes_rxed bytes is allocated
+ * and the receive is retried.  Completion packets are routed to the
+ * completion_func of the pci_packet recovered from the transaction id;
+ * in-band PCI_BUS_RELATIONS and PCI_EJECT messages are passed to
+ * hv_pci_devices_present() and hv_pci_eject_device() respectively.
+ *
+ * The loop ends on the first receive error other than ENOBUFS.
+ */
+static void
+vmbus_pcib_on_channel_callback(struct vmbus_channel *chan, void *arg)
+{
+	struct vmbus_pcib_softc *sc = arg;
+	struct hv_pcibus *hbus = sc->hbus;
+
+	void *buffer;
+	int bufferlen = PCIB_PACKET_SIZE;
+
+	struct pci_packet *comp_packet;
+	struct pci_response *response;
+	struct pci_incoming_message *new_msg;
+	struct pci_bus_relations *bus_rel;
+	struct pci_dev_incoming *dev_msg;
+	struct hv_pci_dev *hpdev;
+
+	buffer = sc->rx_buf;
+	do {
+		struct vmbus_chanpkt_hdr *pkt = buffer;
+		uint32_t bytes_rxed;
+		int ret;
+
+		/* In: capacity of the buffer.  Out: size reported by the receive. */
+		bytes_rxed = bufferlen;
+		ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed);
+
+		if (ret == ENOBUFS) {
+			/* Handle large packet */
+			/* Only a previously malloc'ed buffer is freed, never sc->rx_buf. */
+			if (bufferlen > PCIB_PACKET_SIZE) {
+				free(buffer, M_DEVBUF);
+				buffer = NULL;
+			}
+
+			/* alloc new buffer */
+			buffer = malloc(bytes_rxed, M_DEVBUF, M_WAITOK | M_ZERO);
+			bufferlen = bytes_rxed;
+
+			continue;
+		}
+
+		if (ret != 0) {
+			/* ignore EIO or EAGAIN */
+			break;
+		}
+
+		/* Packets no larger than a bare pci_response are ignored. */
+		if (bytes_rxed <= sizeof(struct pci_response))
+			continue;
+
+		switch (pkt->cph_type) {
+		case VMBUS_CHANPKT_TYPE_COMP:
+			/* The transaction id is the pci_packet pointer given at send time. */
+			comp_packet =
+			    (struct pci_packet *)(uintptr_t)pkt->cph_xactid;
+			response = (struct pci_response *)pkt;
+			comp_packet->completion_func(comp_packet->compl_ctxt,
+			    response, bytes_rxed);
+			break;
+		case VMBUS_CHANPKT_TYPE_INBAND:
+			new_msg = (struct pci_incoming_message *)buffer;
+
+			switch (new_msg->message_type.type) {
+			case PCI_BUS_RELATIONS:
+				bus_rel = (struct pci_bus_relations *)buffer;
+
+				/* An empty relations report is not forwarded. */
+				if (bus_rel->device_count == 0)
+					break;
+
+				/*
+				 * Drop the report if the packet is too short
+				 * to hold device_count function descriptors.
+				 */
+				if (bytes_rxed <
+				    offsetof(struct pci_bus_relations, func) +
+				        (sizeof(struct pci_func_desc) *
+				            (bus_rel->device_count)))
+					break;
+
+				hv_pci_devices_present(hbus, bus_rel);
+				break;
+
+			case PCI_EJECT:
+				dev_msg = (struct pci_dev_incoming *)buffer;
+				hpdev = get_pcichild_wslot(hbus,
+				    dev_msg->wslot.val);
+
+				/* Ejection requests for unknown slots are ignored. */
+				if (hpdev)
+					hv_pci_eject_device(hpdev);
+
+				break;
+			default:
+				printf("vmbus_pcib: Unknown msg type 0x%x\n",
+				    new_msg->message_type.type);
+				break;
+			}
+			break;
+		default:
+			printf("vmbus_pcib: Unknown VMBus msg type %hd\n",
+			    pkt->cph_type);
+			break;
+		}
+	} while (1);
+
+	/* Release the temporary large-packet buffer, if one was allocated. */
+	if (bufferlen > PCIB_PACKET_SIZE)
+		free(buffer, M_DEVBUF);
+}
+
+/*
+ * Negotiate the vPCI protocol version with the host.
+ *
+ * Sends a PCI_QUERY_PROTOCOL_VERSION request offering
+ * PCI_PROTOCOL_VERSION_CURRENT, flagged as the last attempt, and waits
+ * for the host's completion.
+ *
+ * Returns 0 on success, the error from vmbus_chan_send() or
+ * wait_for_response(), or EPROTO if the host reports a negative
+ * completion status.
+ */
+static int
+hv_pci_protocol_negotiation(struct hv_pcibus *hbus)
+{
+	struct pci_version_request *version_req;
+	struct hv_pci_compl comp_pkt;
+	struct {
+		struct pci_packet pkt;
+		uint8_t buffer[sizeof(struct pci_version_request)];
+	} ctxt;
+	int ret;
+
+	init_completion(&comp_pkt.host_event);
+
+	/*
+	 * Zero the on-stack request before filling it in, so that any
+	 * field or padding not set explicitly below reaches the host as
+	 * zero instead of stale stack contents.  The other request
+	 * builders in this file (D0 entry, eject response, MSI mapping)
+	 * already clear their buffers the same way.
+	 */
+	memset(&ctxt, 0, sizeof(ctxt));
+	ctxt.pkt.completion_func = hv_pci_generic_compl;
+	ctxt.pkt.compl_ctxt = &comp_pkt;
+	version_req = (struct pci_version_request *)&ctxt.pkt.message;
+	version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
+	version_req->protocol_version = PCI_PROTOCOL_VERSION_CURRENT;
+	version_req->is_last_attempt = 1;
+
+	/* The packet address is the transaction id used to route the reply. */
+	ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND,
+	    VMBUS_CHANPKT_FLAG_RC, version_req, sizeof(*version_req),
+	    (uint64_t)(uintptr_t)&ctxt.pkt);
+	if (!ret)
+		ret = wait_for_response(hbus, &comp_pkt.host_event);
+
+	if (ret) {
+		device_printf(hbus->pcib,
+		    "vmbus_pcib failed to request version: %d\n",
+		    ret);
+		goto out;
+	}
+
+	if (comp_pkt.completion_status < 0) {
+		device_printf(hbus->pcib,
+		    "vmbus_pcib version negotiation failed: %x\n",
+		    comp_pkt.completion_status);
+		ret = EPROTO;
+	} else {
+		ret = 0;
+	}
+out:
+	free_completion(&comp_pkt.host_event);
+	return (ret);
+}
+
+/*
+ * Ask the host to send along the list of child devices.
+ *
+ * Only the request is sent here; the result of vmbus_chan_send() is
+ * returned unchanged.
+ */
+static int
+hv_pci_query_relations(struct hv_pcibus *hbus)
+{
+	struct pci_message query;
+
+	query.type = PCI_QUERY_BUS_RELATIONS;
+	return (vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
+	    &query, sizeof(query), 0));
+}
+
+/*
+ * Tell the host that the bus is ready to use and has moved into the
+ * powered-on (D0) state.  The message also tells the host which region
+ * of memory-mapped I/O space was chosen for configuration space access.
+ *
+ * Returns 0 on success, the error from vmbus_chan_send() or
+ * wait_for_response(), or EPROTO if the host reports a negative
+ * completion status.
+ */
+static int
+hv_pci_enter_d0(struct hv_pcibus *hbus)
+{
+	struct {
+		struct pci_packet pkt;
+		uint8_t buffer[sizeof(struct pci_bus_d0_entry)];
+	} ctxt;
+	struct hv_pci_compl comp_pkt;
+	struct pci_bus_d0_entry *entry;
+	int error;
+
+	init_completion(&comp_pkt.host_event);
+
+	ctxt.pkt.completion_func = hv_pci_generic_compl;
+	ctxt.pkt.compl_ctxt = &comp_pkt;
+
+	entry = (struct pci_bus_d0_entry *)&ctxt.pkt.message;
+	memset(entry, 0, sizeof(*entry));
+	entry->message_type.type = PCI_BUS_D0ENTRY;
+	entry->mmio_base = rman_get_start(hbus->cfg_res);
+
+	error = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND,
+	    VMBUS_CHANPKT_FLAG_RC, entry, sizeof(*entry),
+	    (uint64_t)(uintptr_t)&ctxt.pkt);
+	if (error == 0)
+		error = wait_for_response(hbus, &comp_pkt.host_event);
+
+	/* The host's status is only meaningful if a response arrived. */
+	if (error == 0 && comp_pkt.completion_status < 0) {
+		device_printf(hbus->pcib, "vmbus_pcib failed to enable D0\n");
+		error = EPROTO;
+	}
+
+	free_completion(&comp_pkt.host_event);
+	return (error);
+}
+
+/*
+ * Send PCI_RESOURCES_ASSIGNED to the host for every known child.
+ *
+ * This looks to be needed only by Windows VMs, but the message is sent
+ * anyway just to make the host happy.
+ *
+ * Returns 0 on success, the first error from vmbus_chan_send() or
+ * wait_for_response(), or EPROTO if the host reports a negative
+ * completion status; the loop stops at the first failure.
+ */
+static int
+hv_send_resources_allocated(struct hv_pcibus *hbus)
+{
+	struct pci_resources_assigned *assigned;
+	struct pci_packet *pkt;
+	struct hv_pci_compl comp_pkt;
+	struct hv_pci_dev *hpdev;
+	const size_t pkt_size = sizeof(*pkt) + sizeof(*assigned);
+	uint32_t wslot;
+	int error = 0;
+
+	/* One packet buffer is reused for every child. */
+	pkt = malloc(pkt_size, M_DEVBUF, M_WAITOK | M_ZERO);
+
+	for (wslot = 0; wslot < 256; wslot++) {
+		hpdev = get_pcichild_wslot(hbus, wslot);
+		if (hpdev == NULL)
+			continue;
+
+		init_completion(&comp_pkt.host_event);
+
+		memset(pkt, 0, pkt_size);
+		pkt->completion_func = hv_pci_generic_compl;
+		pkt->compl_ctxt = &comp_pkt;
+
+		assigned = (struct pci_resources_assigned *)&pkt->message;
+		assigned->message_type.type = PCI_RESOURCES_ASSIGNED;
+		assigned->wslot.val = hpdev->desc.wslot.val;
+
+		error = vmbus_chan_send(hbus->sc->chan,
+		    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+		    &pkt->message, sizeof(*assigned),
+		    (uint64_t)(uintptr_t)pkt);
+		if (error == 0)
+			error = wait_for_response(hbus, &comp_pkt.host_event);
+
+		free_completion(&comp_pkt.host_event);
+
+		if (error != 0)
+			break;
+
+		if (comp_pkt.completion_status < 0) {
+			error = EPROTO;
+			device_printf(hbus->pcib,
+			    "failed to send PCI_RESOURCES_ASSIGNED\n");
+			break;
+		}
+	}
+
+	free(pkt, M_DEVBUF);
+	return (error);
+}
+
+/*
+ * Send PCI_RESOURCES_RELEASED to the host for every known child.  No
+ * completion is requested for these messages.
+ *
+ * Returns 0 on success, or the first error from vmbus_chan_send(), at
+ * which point the remaining slots are not visited.
+ */
+static int
+hv_send_resources_released(struct hv_pcibus *hbus)
+{
+	struct pci_child_message msg;
+	struct hv_pci_dev *hpdev;
+	uint32_t wslot;
+	int error = 0;
+
+	for (wslot = 0; wslot < 256; wslot++) {
+		hpdev = get_pcichild_wslot(hbus, wslot);
+		if (hpdev == NULL)
+			continue;
+
+		msg.message_type.type = PCI_RESOURCES_RELEASED;
+		msg.wslot.val = hpdev->desc.wslot.val;
+
+		error = vmbus_chan_send(hbus->sc->chan,
+		    VMBUS_CHANPKT_TYPE_INBAND, 0, &msg, sizeof(msg), 0);
+		if (error != 0)
+			break;
+	}
+
+	return (error);
+}
+
+/*
+ * Generate hv_cfg_read_<s>(): read a uint<x>_t at "offset" within the
+ * bus's config-space resource (bus->cfg_res) using bus_read_<s>().
+ */
+#define hv_cfg_read(x, s)						\
+static inline uint##x##_t hv_cfg_read_##s(struct hv_pcibus *bus,	\
+    bus_size_t offset)							\
+{									\
+	return (bus_read_##s(bus->cfg_res, offset));			\
+}
+
+/*
+ * Generate hv_cfg_write_<s>(): write the uint<x>_t "val" at "offset"
+ * within the bus's config-space resource using bus_write_<s>().
+ *
+ * The generated function returns void, so the bus_write_<s>() call is a
+ * plain statement: ISO C does not allow "return" with an expression in a
+ * function whose return type is void.
+ */
+#define hv_cfg_write(x, s)						\
+static inline void hv_cfg_write_##s(struct hv_pcibus *bus,		\
+    bus_size_t offset, uint##x##_t val)					\
+{									\
+	bus_write_##s(bus->cfg_res, offset, val);			\
+}
+
+/* 8-, 16- and 32-bit config-window accessors. */
+hv_cfg_read(8, 1)
+hv_cfg_read(16, 2)
+hv_cfg_read(32, 4)
+
+hv_cfg_write(8, 1)
+hv_cfg_write(16, 2)
+hv_cfg_write(32, 4)
+
+/*
+ * Read "size" bytes at config-space offset "where" of child "hpdev" into
+ * *val.
+ *
+ * Vendor/device IDs, the revision/class bytes and the subsystem IDs are
+ * served from the cached descriptor hpdev->desc; the ROM BAR and the
+ * interrupt line/pin registers read as zero.  Any other access that
+ * ends within CFG_PAGE_SIZE is read from the config window under
+ * config_lock, after selecting the function by writing its wslot value
+ * at offset 0 of the window.
+ *
+ * For the descriptor-backed and 1/2-byte window reads only the first
+ * "size" bytes of *val are written, so callers that want a clean 32-bit
+ * result must zero *val beforehand.
+ */
+static void
+_hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, int size,
+    uint32_t *val)
+{
+	struct hv_pcibus *hbus = hpdev->hbus;
+	bus_size_t addr = CFG_PAGE_OFFSET + where;
+
+	/*
+	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
+	 */
+	if (where + size <= PCIR_COMMAND) {
+		/* Registers below PCIR_COMMAND: copied from desc.v_id onwards. */
+		memcpy(val, ((uint8_t *)&hpdev->desc.v_id) + where, size);
+	} else if (where >= PCIR_REVID && where + size <=
+		   PCIR_CACHELNSZ) {
+		/* PCIR_REVID up to PCIR_CACHELNSZ: copied from desc.rev onwards. */
+		memcpy(val, ((uint8_t *)&hpdev->desc.rev) + where -
+		       PCIR_REVID, size);
+	} else if (where >= PCIR_SUBVEND_0 && where + size <=
+		   PCIR_BIOS) {
+		/* Subsystem IDs: copied from desc.subsystem_id onwards. */
+		memcpy(val, (uint8_t *)&hpdev->desc.subsystem_id + where -
+		       PCIR_SUBVEND_0, size);
+	} else if (where >= PCIR_BIOS && where + size <=
+		   PCIR_CAP_PTR) {
+		/* ROM BARs are unimplemented */
+		*val = 0;
+	} else if ((where >= PCIR_INTLINE && where + size <=
+		   PCIR_INTPIN) ||(where == PCIR_INTPIN && size == 1)) {
+		/*
+		 * Interrupt Line and Interrupt PIN are hard-wired to zero
+		 * because this front-end only supports message-signaled
+		 * interrupts.
+		 */
+		*val = 0;
+	} else if (where + size <= CFG_PAGE_SIZE) {
+		mtx_lock(&hbus->config_lock);
+
+		/* Choose the function to be read. */
+		hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val);
+
+		/* Make sure the function was chosen before we start reading.*/
+		mb();
+
+		/* Read from that function's config space. */
+		switch (size) {
+		case 1:
+			*((uint8_t *)val) = hv_cfg_read_1(hbus, addr);
+			break;
+		case 2:
+			*((uint16_t *)val) = hv_cfg_read_2(hbus, addr);
+			break;
+		default:
+			/* Any size other than 1 or 2 is read as 32 bits. */
+			*((uint32_t *)val) = hv_cfg_read_4(hbus, addr);
+			break;
+		}
+		/*
+		 * Make sure the read was done before we release the lock,
+		 * allowing consecutive reads/writes.
+		 */
+		mb();
+
+		mtx_unlock(&hbus->config_lock);
+	} else {
+		/* Invalid config read: it's unlikely to reach here. */
+		memset(val, 0, size);
+	}
+}
+
+/*
+ * Write the low "size" bytes of "val" at config-space offset "where" of
+ * child "hpdev".
+ *
+ * Writes to the subsystem IDs and the ROM BAR are silently dropped, as
+ * are writes that start below PCIR_COMMAND or end beyond CFG_PAGE_SIZE.
+ * Everything else goes to the config window under config_lock, after
+ * selecting the function by writing its wslot value at offset 0.
+ */
+static void
+_hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, int size,
+    uint32_t val)
+{
+	struct hv_pcibus *hbus = hpdev->hbus;
+	bus_size_t addr = CFG_PAGE_OFFSET + where;
+
+	/* SSIDs and ROM BARs are read-only */
+	if (where >= PCIR_SUBVEND_0 && where + size <= PCIR_CAP_PTR)
+		return;
+
+	/* Invalid config write: it's unlikely to reach here. */
+	if (where < PCIR_COMMAND || where + size > CFG_PAGE_SIZE)
+		return;
+
+	mtx_lock(&hbus->config_lock);
+
+	/* Choose the function to be written. */
+	hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val);
+
+	/* Make sure the function was chosen before we start writing.*/
+	wmb();
+
+	/* Write to that function's config space. */
+	if (size == 1)
+		hv_cfg_write_1(hbus, addr, (uint8_t)val);
+	else if (size == 2)
+		hv_cfg_write_2(hbus, addr, (uint16_t)val);
+	else
+		hv_cfg_write_4(hbus, addr, (uint32_t)val);
+
+	/*
+	 * Make sure the write was done before we release the lock,
+	 * allowing consecutive reads/writes.
+	 */
+	mb();
+
+	mtx_unlock(&hbus->config_lock);
+}
+
+/*
+ * The vPCI implementation in some Hyper-V releases does not initialize
+ * the last 4 bits of the BAR registers.  This can result in weird
+ * problems that make the PCI code fail to configure the BARs correctly.
+ *
+ * For every BAR whose probed value is non-zero and differs from what the
+ * register currently reads back, write all 1's to the register.  This
+ * seems to make the Hyper-V vPCI and pci_write_bar() cooperate correctly.
+ */
+static void
+vmbus_pcib_prepopulate_bars(struct hv_pcibus *hbus)
+{
+	struct hv_pci_dev *hpdev;
+	uint32_t current;
+	int bar;
+
+	mtx_lock(&hbus->device_list_lock);
+	TAILQ_FOREACH(hpdev, &hbus->children, link) {
+		for (bar = 0; bar < 6; bar++) {
+			/* Ignore empty bar */
+			if (hpdev->probed_bar[bar] == 0)
+				continue;
+
+			/* Start from zero for every register that is read. */
+			current = 0;
+			_hv_pcifront_read_config(hpdev, PCIR_BAR(bar),
+			    4, &current);
+
+			/* The register already matches the probed value. */
+			if (hpdev->probed_bar[bar] == current)
+				continue;
+
+			if (bootverbose)
+				printf("vmbus_pcib: initialize bar %d "
+				    "by writing all 1s\n", bar);
+
+			_hv_pcifront_write_config(hpdev, PCIR_BAR(bar),
+			    4, 0xffffffff);
+		}
+	}
+	mtx_unlock(&hbus->device_list_lock);
+}
+
+/*
+ * Task handler that marks the bus passed in "arg" as detaching by
+ * atomically setting hbus->detaching.
+ */
+static void
+vmbus_pcib_set_detaching(void *arg, int pending __unused)
+{
+	struct hv_pcibus *hbus;
+
+	hbus = arg;
+	atomic_set_int(&hbus->detaching, 1);
+}
+
+/*
+ * Quiesce the bus before teardown: set the detaching flag through a
+ * task run via vmbus_chan_run_task(), then drain the driver taskqueue.
+ */
+static void
+vmbus_pcib_pre_detach(struct hv_pcibus *hbus)
+{
+	struct task detach_task;
+
+	TASK_INIT(&detach_task, 0, vmbus_pcib_set_detaching, hbus);
+
+	/*
+	 * After this the channel callback must not push any new
+	 * PCI_BUS_RELATIONS or PCI_EJECT tasks to sc->taskq.
+	 */
+	vmbus_chan_run_task(hbus->sc->chan, &detach_task);
+
+	/* Wait for the tasks that were queued before the flag was set. */
+	taskqueue_drain_all(hbus->sc->taskq);
+}
+
+
+/*
+ * Standard probe entry point.
+ *
+ * Claims the device, with priority BUS_PROBE_DEFAULT, when the parent
+ * matches it against the pass-through device type GUID; returns ENXIO
+ * otherwise.
+ */
+static int
+vmbus_pcib_probe(device_t dev)
+{
+	device_t parent;
+
+	parent = device_get_parent(dev);
+	if (VMBUS_PROBE_GUID(parent, dev, &g_pass_through_dev_type) != 0)
+		return (ENXIO);
+
+	device_set_desc(dev, "Hyper-V PCI Express Pass Through");
+	return (BUS_PROBE_DEFAULT);
+}
+
+/*
+ * Standard attach entry point.
+ *
+ * Allocates the per-bus state, reserves the config-space MMIO window,
+ * creates the driver taskqueue, opens the VMBus channel and then runs
+ * the host handshake: protocol negotiation, bus-relations query, D0
+ * entry and the resources-assigned notification.  On success a "pci"
+ * child bus is added and attached and the bus is marked installed.
+ *
+ * Returns 0 on success or an errno value; on failure the cleanup labels
+ * at the end of the function undo the steps listed there.
+ */
+static int
+vmbus_pcib_attach(device_t dev)
+{
+	const int pci_ring_size = (4 * PAGE_SIZE);
+	const struct hyperv_guid *inst_guid;
+	struct vmbus_channel *channel;
+	struct vmbus_pcib_softc *sc;
+	struct hv_pcibus *hbus;
+	int rid = 0;
+	int ret;
+
+	hbus = malloc(sizeof(*hbus), M_DEVBUF, M_WAITOK | M_ZERO);
+	hbus->pcib = dev;
+
+	/* The PCI domain is built from bytes 8 and 9 of the instance GUID. */
+	channel = vmbus_get_channel(dev);
+	inst_guid = vmbus_chan_guid_inst(channel);
+	hbus->pci_domain = inst_guid->hv_guid[9] |
+			  (inst_guid->hv_guid[8] << 8);
+
+	mtx_init(&hbus->config_lock, "hbcfg", NULL, MTX_DEF);
+	mtx_init(&hbus->device_list_lock, "hbdl", NULL, MTX_DEF);
+	TAILQ_INIT(&hbus->children);
+	TAILQ_INIT(&hbus->dr_list);
+
+	/* Page-aligned MMIO window used for config-space access. */
+	hbus->cfg_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid,
+	    0, RM_MAX_END, PCI_CONFIG_MMIO_LENGTH,
+	    RF_ACTIVE | rman_make_alignment_flags(PAGE_SIZE));
+
+	if (!hbus->cfg_res) {
+		device_printf(dev, "failed to get resource for cfg window\n");
+		ret = ENXIO;
+		goto free_bus;
+	}
+
+	sc = device_get_softc(dev);
+	sc->chan = channel;
+	sc->rx_buf = malloc(PCIB_PACKET_SIZE, M_DEVBUF, M_WAITOK | M_ZERO);
+	sc->hbus = hbus;
+
+	/*
+	 * The taskq is used to handle PCI_BUS_RELATIONS and PCI_EJECT
+	 * messages. NB: we can't handle the messages in the channel callback
+	 * directly, because the message handlers need to send new messages
+	 * to the host and waits for the host's completion messages, which
+	 * must also be handled by the channel callback.
+	 */
+	sc->taskq = taskqueue_create("vmbus_pcib_tq", M_WAITOK,
+	    taskqueue_thread_enqueue, &sc->taskq);
+	taskqueue_start_threads(&sc->taskq, 1, PI_NET, "vmbus_pcib_tq");
+
+	hbus->sc = sc;
+
+	/* Completed by pci_devices_present_work() after a relations report. */
+	init_completion(&hbus->query_completion);
+	hbus->query_comp = &hbus->query_completion;
+
+	ret = vmbus_chan_open(sc->chan, pci_ring_size, pci_ring_size,
+		NULL, 0, vmbus_pcib_on_channel_callback, sc);
+	if (ret)
+		goto free_res;
+
+	ret = hv_pci_protocol_negotiation(hbus);
+	if (ret)
+		goto vmbus_close;
+
+	/* Ask for the child list and wait until a report has been processed. */
+	ret = hv_pci_query_relations(hbus);
+	if (!ret)
+		ret = wait_for_response(hbus, hbus->query_comp);
+
+	if (ret)
+		goto vmbus_close;
+
+	ret = hv_pci_enter_d0(hbus);
+	if (ret)
+		goto vmbus_close;
+
+	ret = hv_send_resources_allocated(hbus);
+	if (ret)
+		goto vmbus_close;
+
+	vmbus_pcib_prepopulate_bars(hbus);
+
+	hbus->pci_bus = device_add_child(dev, "pci", -1);
+	if (!hbus->pci_bus) {
+		device_printf(dev, "failed to create pci bus\n");
+		ret = ENXIO;
+		goto vmbus_close;
+	}
+
+	/* The return value of bus_generic_attach() is not checked. */
+	bus_generic_attach(dev);
+
+	hbus->state = hv_pcibus_installed;
+
+	return (0);
+
+	/*
+	 * NOTE(review): the failure path frees hbus but leaves sc->hbus
+	 * pointing at it, and it does not walk hbus->children; confirm that
+	 * neither can be reached after a failed attach.
+	 */
+vmbus_close:
+	vmbus_pcib_pre_detach(hbus);
+	vmbus_chan_close(sc->chan);
+free_res:
+	taskqueue_free(sc->taskq);
+	free_completion(&hbus->query_completion);
+	free(sc->rx_buf, M_DEVBUF);
+	bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res);
+free_bus:
+	mtx_destroy(&hbus->device_list_lock);
+	mtx_destroy(&hbus->config_lock);
+	free(hbus, M_DEVBUF);
+	return (ret);
+}
+
+/*
+ * Standard detach entry point.
+ *
+ * Quiesces the bus, detaches the child PCI bus if it was installed,
+ * queues an empty relations report so the remaining children are
+ * deleted, notifies the host (PCI_RESOURCES_RELEASED per child, then
+ * PCI_BUS_D0EXIT), and finally releases the channel, the taskqueue and
+ * all per-bus state.  Failures to send the host notifications are
+ * logged but do not abort the detach; the function always returns 0.
+ */
+static int
+vmbus_pcib_detach(device_t dev)
+{
+	struct vmbus_pcib_softc *sc = device_get_softc(dev);
+	struct hv_pcibus *hbus = sc->hbus;
+	struct pci_message teardown_packet;
+	struct pci_bus_relations relations;
+	int ret;
+
+	/* Sets hbus->detaching and drains the tasks already queued. */
+	vmbus_pcib_pre_detach(hbus);
+
+	if (hbus->state == hv_pcibus_installed)
+		bus_generic_detach(dev);
+
+	/* Delete any children which might still exist. */
+	/* A zero device_count is still accepted while detaching. */
+	memset(&relations, 0, sizeof(relations));
+	hv_pci_devices_present(hbus, &relations);
+
+	ret = hv_send_resources_released(hbus);
+	if (ret)
+		device_printf(dev, "failed to send PCI_RESOURCES_RELEASED\n");
+
+	teardown_packet.type = PCI_BUS_D0EXIT;
+	ret = vmbus_chan_send(sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
+	    &teardown_packet, sizeof(struct pci_message), 0);
+	if (ret)
+		device_printf(dev, "failed to send PCI_BUS_D0EXIT\n");
+
+	/* Let the empty-report task queued above finish before teardown. */
+	taskqueue_drain_all(hbus->sc->taskq);
+	vmbus_chan_close(sc->chan);
+	taskqueue_free(sc->taskq);
+
+	free_completion(&hbus->query_completion);
+	free(sc->rx_buf, M_DEVBUF);
+	bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res);
+
+	mtx_destroy(&hbus->device_list_lock);
+	mtx_destroy(&hbus->config_lock);
+	free(hbus, M_DEVBUF);
+
+	return (0);
+}
+
+/*
+ * Bus read_ivar method.  Supplies the PCI domain stored in the bus state
+ * and the bus number, which is always 0.  Any other ivar yields ENOENT.
+ */
+static int
+vmbus_pcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *val)
+{
+	struct vmbus_pcib_softc *sc;
+
+	sc = device_get_softc(dev);
+
+	if (which == PCIB_IVAR_DOMAIN) {
+		*val = sc->hbus->pci_domain;
+		return (0);
+	}
+
+	if (which == PCIB_IVAR_BUS) {
+		/* There is only bus 0. */
+		*val = 0;
+		return (0);
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Bus write_ivar method.  No ivar is writable through this bridge, so
+ * every request is rejected with ENOENT.
+ */
+static int
+vmbus_pcib_write_ivar(device_t dev, device_t child, int which, uintptr_t val)
+{
+	return (ENOENT);
+}
+
+/*
+ * Bus alloc_resource method.
+ *
+ * PCI_RES_BUS requests are served by pci_domain_alloc_bus() for this
+ * bridge's domain, and port I/O requests are refused.  For memory
+ * requests the child's host-side record is looked up and, unless the
+ * probed BAR has PCIM_BAR_MEM_64 set, "end" is clamped to 0xFFFFFFFF
+ * before the request is passed to bus_generic_alloc_resource().
+ *
+ * Returns the allocated resource, or NULL on failure (including an
+ * unknown child or a BAR index of MAX_NUM_BARS or more).
+ */
+static struct resource *
+vmbus_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
+	rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
+{
+	unsigned int bar_no;
+	struct hv_pci_dev *hpdev;
+	struct vmbus_pcib_softc *sc = device_get_softc(dev);
+	struct resource *res;
+	unsigned int devfn;
+
+	if (type == PCI_RES_BUS)
+		return (pci_domain_alloc_bus(sc->hbus->pci_domain, child, rid,
+		    start, end, count, flags));
+
+	/* Devices with port I/O BAR are not supported. */
+	if (type == SYS_RES_IOPORT)
+		return (NULL);
+
+	if (type == SYS_RES_MEMORY) {
+		/* Map the child's slot/function to its host-side record. */
+		devfn = PCI_DEVFN(pci_get_slot(child),
+		    pci_get_function(child));
+		hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
+		if (!hpdev)
+			return (NULL);
+
+		bar_no = PCI_RID2BAR(*rid);
+		if (bar_no >= MAX_NUM_BARS)
+			return (NULL);
+
+		/* Make sure a 32-bit BAR gets a 32-bit address */
+		if (!(hpdev->probed_bar[bar_no] & PCIM_BAR_MEM_64))
+			end = ulmin(end, 0xFFFFFFFF);
+	}
+
+	res = bus_generic_alloc_resource(dev, child, type, rid,
+		start, end, count, flags);
+	/*
+	 * If this is a request for a specific range, assume it is
+	 * correct and pass it up to the parent.
+	 */
+	/*
+	 * NOTE(review): this retry repeats the call above with identical
+	 * arguments; confirm whether a different allocator was intended for
+	 * one of the two calls.
+	 */
+	if (res == NULL && start + count - 1 == end)
+		res = bus_generic_alloc_resource(dev, child, type, rid,
+		    start, end, count, flags);
+	return (res);
+}
+
+/*
+ * Bus release_resource method, the counterpart of
+ * vmbus_pcib_alloc_resource(): bus numbers are returned to the PCI
+ * domain, port I/O is rejected with EINVAL, and everything else is
+ * handed to bus_generic_release_resource().
+ */
+static int
+vmbus_pcib_release_resource(device_t dev, device_t child, int type, int rid,
+    struct resource *r)
+{
+	struct vmbus_pcib_softc *sc;
+
+	sc = device_get_softc(dev);
+
+	if (type == PCI_RES_BUS) {
+		return (pci_domain_release_bus(sc->hbus->pci_domain, child,
+		    rid, r));
+	}
+
+	/* Port I/O is never allocated by this bridge. */
+	if (type == SYS_RES_IOPORT)
+		return (EINVAL);
+
+	return (bus_generic_release_resource(dev, child, type, rid, r));
+}
+
+#if __FreeBSD_version >= 1100000
+/*
+ * Bus get_cpus method: answer the query by calling bus_get_cpus() on the
+ * bridge itself ("pcib"); the requesting device "dev" is not consulted.
+ */
+static int
+vmbus_pcib_get_cpus(device_t pcib, device_t dev, enum cpu_sets op,
+    size_t setsize, cpuset_t *cpuset)
+{
+	return (bus_get_cpus(pcib, op, setsize, cpuset));
+}
+#endif
+
+/*
+ * pcib read_config method.  Reads "bytes" bytes of register "reg" from
+ * the function at (slot, func) on bus 0.
+ *
+ * Returns the value read, or all ones when no child is known at that
+ * slot/function.
+ */
+static uint32_t
+vmbus_pcib_read_config(device_t dev, u_int bus, u_int slot, u_int func,
+    u_int reg, int bytes)
+{
+	struct vmbus_pcib_softc *sc;
+	struct hv_pci_dev *hpdev;
+	unsigned int devfn;
+	uint32_t value;
+
+	sc = device_get_softc(dev);
+	devfn = PCI_DEVFN(slot, func);
+
+	KASSERT(bus == 0, ("bus should be 0, but is %u", bus));
+
+	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
+	if (hpdev == NULL)
+		return (~0);
+
+	/* Narrow reads fill only the low bytes, so start from zero. */
+	value = 0;
+	_hv_pcifront_read_config(hpdev, reg, bytes, &value);
+
+	return (value);
+}
+
+/*
+ * pcib write_config method.  Writes "bytes" bytes of "data" to register
+ * "reg" of the function at (slot, func) on bus 0.  The write is dropped
+ * silently when no child is known at that slot/function.
+ */
+static void
+vmbus_pcib_write_config(device_t dev, u_int bus, u_int slot, u_int func,
+    u_int reg, uint32_t data, int bytes)
+{
+	struct vmbus_pcib_softc *sc;
+	struct hv_pci_dev *hpdev;
+	unsigned int devfn;
+
+	sc = device_get_softc(dev);
+	devfn = PCI_DEVFN(slot, func);
+
+	KASSERT(bus == 0, ("bus should be 0, but is %u", bus));
+
+	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
+	if (hpdev != NULL)
+		_hv_pcifront_write_config(hpdev, reg, bytes, data);
+}
+
+/*
+ * pcib route_interrupt method.  Always reports PCI_INVALID_IRQ,
+ * whatever "pin" is requested.
+ */
+static int
+vmbus_pcib_route_intr(device_t pcib, device_t dev, int pin)
+{
+	/* We only support MSI/MSI-X and don't support INTx interrupt. */
+	return (PCI_INVALID_IRQ);
+}
+
+/*
+ * pcib alloc_msi method: forward the MSI allocation request to the
+ * bridge's parent and return its result.
+ */
+static int
+vmbus_pcib_alloc_msi(device_t pcib, device_t dev, int count,
+    int maxcount, int *irqs)
+{
+	device_t parent;
+
+	parent = device_get_parent(pcib);
+	return (PCIB_ALLOC_MSI(parent, dev, count, maxcount, irqs));
+}
+
+/*
+ * pcib release_msi method: forward the MSI release request to the
+ * bridge's parent and return its result.
+ */
+static int
+vmbus_pcib_release_msi(device_t pcib, device_t dev, int count, int *irqs)
+{
+	device_t parent;
+
+	parent = device_get_parent(pcib);
+	return (PCIB_RELEASE_MSI(parent, dev, count, irqs));
+}
+
+/*
+ * pcib alloc_msix method: forward the MSI-X allocation request to the
+ * bridge's parent and return its result.
+ */
+static int
+vmbus_pcib_alloc_msix(device_t pcib, device_t dev, int *irq)
+{
+	device_t parent;
+
+	parent = device_get_parent(pcib);
+	return (PCIB_ALLOC_MSIX(parent, dev, irq));
+}
+
+/*
+ * pcib release_msix method: forward the MSI-X release request to the
+ * bridge's parent and return its result.
+ */
+static int
+vmbus_pcib_release_msix(device_t pcib, device_t dev, int irq)
+{
+	device_t parent;
+
+	parent = device_get_parent(pcib);
+	return (PCIB_RELEASE_MSIX(parent, dev, irq));
+}
+
+/* Field of the parent-supplied MSI address that vmbus_pcib_map_msi() reads the CPU id from. */
+#define	MSI_INTEL_ADDR_DEST	0x000ff000
+#define	MSI_INTEL_DATA_INTVEC	IOART_INTVEC	/* Interrupt vector. */
+#define	MSI_INTEL_DATA_DELFIXED	IOART_DELFIXED
+
+/*
+ * pcib map_msi method.
+ *
+ * Obtains the address/data pair for "irq" from the parent bridge, takes
+ * the target CPU and vector out of it, and asks the host to create a
+ * matching interrupt for the child with PCI_CREATE_INTERRUPT_MESSAGE.
+ * The address/data pair returned by the host is what the caller gets in
+ * *addr and *data; the host's descriptor is also recorded on the child's
+ * irq_desc_list, replacing any earlier descriptor for the same irq.
+ *
+ * Returns 0 on success, ENOENT if the child is unknown, EPROTO if the
+ * host reports a negative completion status, or the error from the
+ * parent's map_msi method or from vmbus_chan_send().
+ */
+static int
+vmbus_pcib_map_msi(device_t pcib, device_t child, int irq,
+    uint64_t *addr, uint32_t *data)
+{
+	unsigned int devfn;
+	struct hv_pci_dev *hpdev;
+
+	uint64_t v_addr;
+	uint32_t v_data;
+	struct hv_irq_desc *hid, *tmp_hid;
+	unsigned int cpu, vcpu_id;
+	unsigned int vector;
+
+	struct vmbus_pcib_softc *sc = device_get_softc(pcib);
+	struct pci_create_interrupt *int_pkt;
+	struct compose_comp_ctxt comp;
+	struct {
+		struct pci_packet pkt;
+		uint8_t buffer[sizeof(struct pci_create_interrupt)];
+	} ctxt;
+
+	int ret;
+
+	devfn = PCI_DEVFN(pci_get_slot(child), pci_get_function(child));
+	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
+	if (!hpdev)
+		return (ENOENT);
+
+	ret = PCIB_MAP_MSI(device_get_parent(pcib), child, irq,
+	    &v_addr, &v_data);
+	if (ret)
+		return (ret);
+
+	/* Drop the descriptor recorded for this irq by an earlier mapping. */
+	TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid) {
+		if (hid->irq == irq) {
+			TAILQ_REMOVE(&hpdev->irq_desc_list, hid, link);
+			hv_int_desc_free(hpdev, hid);
+			break;
+		}
+	}
+
+	/* Pull the CPU id and vector out of the parent's MSI address/data. */
+	cpu = (v_addr & MSI_INTEL_ADDR_DEST) >> 12;
+	vcpu_id = VMBUS_GET_VCPU_ID(device_get_parent(pcib), pcib, cpu);
+	vector = v_data & MSI_INTEL_DATA_INTVEC;
+
+	init_completion(&comp.comp_pkt.host_event);
+
+	memset(&ctxt, 0, sizeof(ctxt));
+	ctxt.pkt.completion_func = hv_pci_compose_compl;
+	ctxt.pkt.compl_ctxt = &comp;
+
+	int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message;
+	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
+	int_pkt->wslot.val = hpdev->desc.wslot.val;
+	int_pkt->int_desc.vector = vector;
+	int_pkt->int_desc.vector_count = 1;
+	int_pkt->int_desc.delivery_mode = MSI_INTEL_DATA_DELFIXED;
+	/*
+	 * NOTE(review): the shift is only defined for vcpu_id < 64; confirm
+	 * that VMBUS_GET_VCPU_ID() cannot return a larger id here.
+	 */
+	int_pkt->int_desc.cpu_mask = 1ULL << vcpu_id;
+
+	ret = vmbus_chan_send(sc->chan,	VMBUS_CHANPKT_TYPE_INBAND,
+	    VMBUS_CHANPKT_FLAG_RC, int_pkt, sizeof(*int_pkt),
+	    (uint64_t)(uintptr_t)&ctxt.pkt);
+	if (ret) {
+		free_completion(&comp.comp_pkt.host_event);
+		return (ret);
+	}
+
+	/* Unlike the attach-time requests, this wait has no timeout path. */
+	wait_for_completion(&comp.comp_pkt.host_event);
+	free_completion(&comp.comp_pkt.host_event);
+
+	if (comp.comp_pkt.completion_status < 0)
+		return (EPROTO);
+
+	*addr = comp.int_desc.address;
+	*data = comp.int_desc.data;
+
+	/* Remember the host's descriptor for this irq on the child. */
+	hid = malloc(sizeof(struct hv_irq_desc), M_DEVBUF, M_WAITOK | M_ZERO);
+	hid->irq = irq;
+	hid->desc = comp.int_desc;
+	TAILQ_INSERT_TAIL(&hpdev->irq_desc_list, hid, link);
+
+	return (0);
+}
+
+/*
+ * Method table for the vmbus_pcib driver.  Device and bus methods not
+ * implemented in this file use the bus_generic_* defaults; the pcib
+ * methods route config-space access and MSI/MSI-X handling through the
+ * vmbus_pcib_* functions above.
+ */
+static device_method_t vmbus_pcib_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe,         vmbus_pcib_probe),
+	DEVMETHOD(device_attach,        vmbus_pcib_attach),
+	DEVMETHOD(device_detach,        vmbus_pcib_detach),
+	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
+	DEVMETHOD(device_suspend,	bus_generic_suspend),
+	DEVMETHOD(device_resume,	bus_generic_resume),
+
+	/* Bus interface */
+	DEVMETHOD(bus_read_ivar,		vmbus_pcib_read_ivar),
+	DEVMETHOD(bus_write_ivar,		vmbus_pcib_write_ivar),
+	DEVMETHOD(bus_alloc_resource,		vmbus_pcib_alloc_resource),
+	DEVMETHOD(bus_release_resource,		vmbus_pcib_release_resource),
+	DEVMETHOD(bus_activate_resource,   bus_generic_activate_resource),
+	DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
+	DEVMETHOD(bus_setup_intr,	   bus_generic_setup_intr),
+	DEVMETHOD(bus_teardown_intr,	   bus_generic_teardown_intr),
+#if __FreeBSD_version >= 1100000
+	DEVMETHOD(bus_get_cpus,			vmbus_pcib_get_cpus),
+#endif
+
+	/* pcib interface */
+	DEVMETHOD(pcib_maxslots,		pcib_maxslots),
+	DEVMETHOD(pcib_read_config,		vmbus_pcib_read_config),
+	DEVMETHOD(pcib_write_config,		vmbus_pcib_write_config),
+	DEVMETHOD(pcib_route_interrupt,		vmbus_pcib_route_intr),
+	DEVMETHOD(pcib_alloc_msi,		vmbus_pcib_alloc_msi),
+	DEVMETHOD(pcib_release_msi,		vmbus_pcib_release_msi),
+	DEVMETHOD(pcib_alloc_msix,		vmbus_pcib_alloc_msix),
+	DEVMETHOD(pcib_release_msix,		vmbus_pcib_release_msix),
+	DEVMETHOD(pcib_map_msi,			vmbus_pcib_map_msi),
+	DEVMETHOD(pcib_request_feature,		pcib_request_feature_allow),
+
+	DEVMETHOD_END
+};
+
+static devclass_t pcib_devclass;
+
+/*
+ * Register the driver under the "pcib" class name as a child of vmbus;
+ * the module depends on both the vmbus and pci modules.
+ */
+DEFINE_CLASS_0(pcib, vmbus_pcib_driver, vmbus_pcib_methods,
+		sizeof(struct vmbus_pcib_softc));
+DRIVER_MODULE(vmbus_pcib, vmbus, vmbus_pcib_driver, pcib_devclass, 0, 0);
+MODULE_DEPEND(vmbus_pcib, vmbus, 1, 1, 1);
+MODULE_DEPEND(vmbus_pcib, pci, 1, 1, 1);
+
+#endif /* NEW_PCIB */
diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
new file mode 100644
index 000000000000..702308e26a1d
--- /dev/null
+++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
@@ -0,0 +1,2515 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * StorVSC driver for Hyper-V.  This driver presents a SCSI HBA interface
+ * to the Common Access Method (CAM) layer.  CAM control blocks (CCBs) are
+ * converted into VSCSI protocol messages which are delivered to the parent
+ * partition StorVSP driver over the Hyper-V VMBUS.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/condvar.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/sockio.h>
+#include <sys/mbuf.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+#include <sys/bus.h>
+#include <sys/mutex.h>
+#include <sys/callout.h>
+#include <sys/smp.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/uma.h>
+#include <sys/lock.h>
+#include <sys/sema.h>
+#include <sys/sglist.h>
+#include <sys/eventhandler.h>
+#include <machine/bus.h>
+
+#include <cam/cam.h>
+#include <cam/cam_ccb.h>
+#include <cam/cam_periph.h>
+#include <cam/cam_sim.h>
+#include <cam/cam_xpt_sim.h>
+#include <cam/cam_xpt_internal.h>
+#include <cam/cam_debug.h>
+#include <cam/scsi/scsi_all.h>
+#include <cam/scsi/scsi_message.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+#include "hv_vstorage.h"
+#include "vmbus_if.h"
+
+/*
+ * Static per-driver limits.  The *_PER_TARGET and *_IO_REQUESTS values
+ * seed g_drv_props_table; the I/O request limit is twice the LUN limit.
+ */
+#define STORVSC_MAX_LUNS_PER_TARGET	(64)
+#define STORVSC_MAX_IO_REQUESTS		(STORVSC_MAX_LUNS_PER_TARGET * 2)
+#define BLKVSC_MAX_IDE_DISKS_PER_TARGET	(1)
+#define BLKVSC_MAX_IO_REQUESTS		STORVSC_MAX_IO_REQUESTS
+#define STORVSC_MAX_TARGETS		(2)
+
+/*
+ * Number of bytes of a vstor_packet actually put on the wire.  This is a
+ * run-time value: vmscsi_size_delta depends on the protocol version
+ * negotiated with the host.
+ */
+#define VSTOR_PKT_SIZE	(sizeof(struct vstor_packet) - vmscsi_size_delta)
+
+/*
+ * 33 segments are needed to allow 128KB maxio, in case the data
+ * in the first page is _not_ PAGE_SIZE aligned, e.g.
+ *
+ *     |<----------- 128KB ----------->|
+ *     |                               |
+ *  0  2K 4K    8K   16K   124K  128K  130K
+ *  |  |  |     |     |       |     |  |
+ *  +--+--+-----+-----+.......+-----+--+--+
+ *  |  |  |     |     |       |     |  |  | DATA
+ *  |  |  |     |     |       |     |  |  |
+ *  +--+--+-----+-----+.......------+--+--+
+ *     |  |                         |  |
+ *     | 1|            31           | 1| ...... # of segments
+ */
+/* Maximum number of data segments per request, and the size of each. */
+#define STORVSC_DATA_SEGCNT_MAX		33
+#define STORVSC_DATA_SEGSZ_MAX		PAGE_SIZE
+/*
+ * Largest transfer accepted: one segment fewer than the segment limit,
+ * leaving room for a transfer that does not start on a page boundary.
+ */
+#define STORVSC_DATA_SIZE_MAX		\
+	((STORVSC_DATA_SEGCNT_MAX - 1) * STORVSC_DATA_SEGSZ_MAX)
+
+struct storvsc_softc;
+
+/*
+ * One entry of the global bounce-buffer pool: a list linkage plus an
+ * sglist.  storvsc_attach() allocates each sglist with
+ * STORVSC_DATA_SEGCNT_MAX segments and stores the kernel virtual address
+ * of a PAGE_SIZE buffer (cast to vm_paddr_t) in every segment's ss_paddr.
+ */
+struct hv_sgl_node {
+	LIST_ENTRY(hv_sgl_node) link;
+	struct sglist *sgl_data;
+};
+
+/*
+ * Global pool of pre-allocated sglists, shared by all storvsc instances.
+ * It is populated once, by the first storvsc_attach() that finds is_init
+ * FALSE; new nodes start on free_sgl_list.
+ * NOTE(review): g_hv_sgl_page_pool has external linkage; confirm whether
+ * anything outside this file needs it before making it static.
+ */
+struct hv_sgl_page_pool{
+	LIST_HEAD(, hv_sgl_node) in_use_sgl_list;
+	LIST_HEAD(, hv_sgl_node) free_sgl_list;
+	boolean_t                is_init;
+} g_hv_sgl_page_pool;
+
+/*
+ * Classification of a request as write, read or unknown.
+ * NOTE(review): the users of this enum are outside this part of the
+ * file; presumably it describes the data direction -- confirm.
+ */
+enum storvsc_request_type {
+	WRITE_TYPE,
+	READ_TYPE,
+	UNKNOWN_TYPE
+};
+
+/*
+ * hw.storvsc sysctl tree.  Entries flagged CTLFLAG_RDTUN are read-only at
+ * run time and can be preset as loader tunables; use_win8ext_flags is
+ * writable at run time.
+ */
+SYSCTL_NODE(_hw, OID_AUTO, storvsc, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
+	"Hyper-V storage interface");
+
+static u_int hv_storvsc_use_win8ext_flags = 1;
+SYSCTL_UINT(_hw_storvsc, OID_AUTO, use_win8ext_flags, CTLFLAG_RW,
+	&hv_storvsc_use_win8ext_flags, 0,
+	"Use win8 extension flags or not");
+
+static u_int hv_storvsc_use_pim_unmapped = 1;
+SYSCTL_UINT(_hw_storvsc, OID_AUTO, use_pim_unmapped, CTLFLAG_RDTUN,
+	&hv_storvsc_use_pim_unmapped, 0,
+	"Optimize storvsc by using unmapped I/O");
+
+/* Copied into the driver properties by storvsc_attach(). */
+static u_int hv_storvsc_ringbuffer_size = (64 * PAGE_SIZE);
+SYSCTL_UINT(_hw_storvsc, OID_AUTO, ringbuffer_size, CTLFLAG_RDTUN,
+	&hv_storvsc_ringbuffer_size, 0, "Hyper-V storage ringbuffer size");
+
+/* Upper bound on requests; storvsc_attach() takes MIN() with STORVSC_MAX_IO. */
+static u_int hv_storvsc_max_io = 512;
+SYSCTL_UINT(_hw_storvsc, OID_AUTO, max_io, CTLFLAG_RDTUN,
+	&hv_storvsc_max_io, 0, "Hyper-V storage max io limit");
+
+/*
+ * Total channel count to use (primary included).  Values <= 0 leave the
+ * host-advertised maximum untouched; see hv_storvsc_channel_init().
+ */
+static int hv_storvsc_chan_cnt = 0;
+SYSCTL_INT(_hw_storvsc, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
+	&hv_storvsc_chan_cnt, 0, "# of channels to use");
+#ifdef DIAGNOSTIC
+/*
+ * Diagnostic knob: srb_status value to inject; defaults to -1.
+ *
+ * The sysctl is hw.storvsc.srb_status.  CTLFLAG_RWTUN lets it be preset
+ * from the loader under that same dotted name.  The historical tunable
+ * was spelled "hw_storvsc.srb_status" (underscore instead of a dot) and
+ * therefore never matched the sysctl name; it is kept so existing
+ * loader.conf entries continue to work.
+ */
+static int hv_storvsc_srb_status = -1;
+SYSCTL_INT(_hw_storvsc, OID_AUTO, srb_status, CTLFLAG_RWTUN,
+	&hv_storvsc_srb_status, 0, "srb_status to inject");
+/* Legacy spelling, kept for backward compatibility. */
+TUNABLE_INT("hw_storvsc.srb_status", &hv_storvsc_srb_status);
+#endif /* DIAGNOSTIC */
+
+/*
+ * Request limit derived from the configured ring buffer size, the
+ * per-request segment limit and the current on-wire packet size.  It is
+ * evaluated at run time because both hv_storvsc_ringbuffer_size and
+ * VSTOR_PKT_SIZE are variables.
+ */
+#define STORVSC_MAX_IO						\
+	vmbus_chan_prplist_nelem(hv_storvsc_ringbuffer_size,	\
+	   STORVSC_DATA_SEGCNT_MAX, VSTOR_PKT_SIZE)
+
+/*
+ * Per-device statistics exported by storvsc_sysctl().  chan_send_cnt[] is
+ * indexed by the channel-selection index used in hv_storvsc_io_request()
+ * and counts successfully sent requests.
+ */
+struct hv_storvsc_sysctl {
+	u_long		data_bio_cnt;
+	u_long		data_vaddr_cnt;
+	u_long		data_sg_cnt;
+	u_long		chan_send_cnt[MAXCPU];
+};
+
+/*
+ * A vmbus_gpa_range header followed by room for STORVSC_DATA_SEGCNT_MAX
+ * 64-bit page entries.  hv_storvsc_io_request() hands &gpa_range to
+ * vmbus_chan_send_prplist().
+ * NOTE(review): presumably gpa_page[] supplies the storage for the
+ * header's trailing page array -- confirm against vmbus.h.
+ */
+struct storvsc_gpa_range {
+	struct vmbus_gpa_range	gpa_range;
+	uint64_t		gpa_page[STORVSC_DATA_SEGCNT_MAX];
+} __packed;
+
+/*
+ * State of one request exchanged with the host.  The address of this
+ * structure is used as the vmbus transaction id, so the channel callback
+ * can recover the request from a completion packet.
+ */
+struct hv_storvsc_request {
+	/* Linkage on storvsc_softc.hs_free_list. */
+	LIST_ENTRY(hv_storvsc_request)	link;
+	/* Packet sent to the host; selected response fields are copied back. */
+	struct vstor_packet		vstor_packet;
+	/* Page count and page list passed to vmbus_chan_send_prplist(). */
+	int				prp_cnt;
+	struct storvsc_gpa_range	prp_list;
+	/* Destination and capacity for autosense data from the host. */
+	void				*sense_data;
+	uint8_t				sense_info_len;
+	uint8_t				retries;
+	union ccb			*ccb;
+	/* Owning device instance. */
+	struct storvsc_softc		*softc;
+	struct callout			callout;
+	/* Posted by the channel callback for the init and reset requests. */
+	struct sema			synch_sema; /* Synchronize the request/response if needed */
+	struct sglist			*bounce_sgl;
+	unsigned int			bounce_sgl_count;
+	uint64_t			not_aligned_seg_bits;
+	/* DMA map created from storvsc_softc.storvsc_req_dtag. */
+	bus_dmamap_t			data_dmap;
+};
+
+/*
+ * Per-device (per virtual HBA) driver state.
+ */
+struct storvsc_softc {
+	/* Primary vmbus channel, obtained with vmbus_get_channel(). */
+	struct vmbus_channel		*hs_chan;
+	/* Pre-allocated requests, filled by storvsc_init_requests(). */
+	LIST_HEAD(, hv_storvsc_request)	hs_free_list;
+	struct mtx			hs_lock;
+	/* Points into g_drv_props_table. */
+	struct storvsc_driver_props	*hs_drv_props;
+	int 				hs_unit;
+	uint32_t			hs_frozen;
+	struct cam_sim			*hs_sim;
+	struct cam_path 		*hs_path;
+	/* Requests sent to the host and not yet completed. */
+	uint32_t			hs_num_out_reqs;
+	boolean_t			hs_destroy;
+	/*
+	 * When hs_drain_notify is set, the completion path posts
+	 * hs_drain_sema once hs_num_out_reqs drops to zero.
+	 */
+	boolean_t			hs_drain_notify;
+	struct sema 			hs_drain_sema;	
+	/* Requests used by the init handshake and by host reset. */
+	struct hv_storvsc_request	hs_init_req;
+	struct hv_storvsc_request	hs_reset_req;
+	device_t			hs_dev;
+	/* DMA tag for the per-request data maps. */
+	bus_dma_tag_t			storvsc_req_dtag;
+	struct hv_storvsc_sysctl	sysctl_data;
+	/* Channel count including the primary; starts at 1. */
+	uint32_t			hs_nchan;
+	/* [0] is the primary channel, the rest are sub-channels. */
+	struct vmbus_channel		*hs_sel_chan[MAXCPU];
+};
+
+/*
+ * Event handler registration tag.
+ * NOTE(review): the registration site is outside this part of the file.
+ */
+static eventhandler_tag storvsc_handler_tag;
+/*
+ * The size of the vmscsi_request has changed in win8. The
+ * additional size is for the newly added elements in the
+ * structure. These elements are valid only when we are talking
+ * to a win8 host.
+ * Track the correct size we need to apply.  The initial value assumes a
+ * pre-win8 host; hv_storvsc_channel_init() replaces it with the value
+ * for the negotiated protocol version.
+ */
+static int vmscsi_size_delta = sizeof(struct vmscsi_win8_extension);
+
+/**
+ * HyperV storvsc timeout testing cases:
+ * a. IO returned after first timeout;
+ * b. IO returned after second timeout and queue freeze;
+ * c. IO returned while timer handler is running
+ * The first can be tested by "sg_senddiag -vv /dev/daX",
+ * and the second and third can be done by
+ * "sg_wr_mode -v -p 08 -c 0,1a -m 0,ff /dev/daX".
+ */
+#define HVS_TIMEOUT_TEST 0
+
+/*
+ * Bus/adapter reset functionality on the Hyper-V host is
+ * buggy and it will be disabled until
+ * it can be further tested.
+ * While this is 0, hv_storvsc_host_reset() is compiled out.
+ */
+#define HVS_HOST_RESET 0
+
+/*
+ * Properties of one driver flavour (see g_drv_props_table).  The ring
+ * buffer size is used for both directions when a channel is opened;
+ * drv_max_ios_per_target bounds the number of pre-allocated requests.
+ */
+struct storvsc_driver_props {
+	char		*drv_name;
+	char		*drv_desc;
+	uint8_t		drv_max_luns_per_target;
+	uint32_t	drv_max_ios_per_target;
+	uint32_t	drv_ringbuffer_size;
+};
+
+/*
+ * Device flavour.  DRIVER_BLKVSC and DRIVER_STORVSC double as indices
+ * into g_drv_props_table, so their order must match that table.
+ */
+enum hv_storage_type {
+	DRIVER_BLKVSC,
+	DRIVER_STORVSC,
+	DRIVER_UNKNOWN
+};
+
+#define HS_MAX_ADAPTERS 10
+
+/*
+ * Bit in the channel properties flags returned by the host; tested in
+ * hv_storvsc_channel_init() before sub-channels are requested.
+ */
+#define HV_STORAGE_SUPPORTS_MULTI_CHANNEL 0x1
+
+/*
+ * Device type GUIDs.  The byte arrays hold the first three GUID fields
+ * byte-swapped relative to the textual form given in the comments.
+ */
+/* {ba6163d9-04a1-4d29-b605-72e2ffb1dc7f} */
+static const struct hyperv_guid gStorVscDeviceType={
+	.hv_guid = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d,
+		 0xb6, 0x05, 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f}
+};
+
+/* {32412632-86cb-44a2-9b5c-50d1417354f5} */
+static const struct hyperv_guid gBlkVscDeviceType={
+	.hv_guid = {0x32, 0x26, 0x41, 0x32, 0xcb, 0x86, 0xa2, 0x44,
+		 0x9b, 0x5c, 0x50, 0xd1, 0x41, 0x73, 0x54, 0xf5}
+};
+
+/*
+ * Driver properties, indexed by enum hv_storage_type.  The table is not
+ * const: storvsc_attach() overwrites drv_ringbuffer_size and
+ * drv_max_ios_per_target of the selected entry from the tunables, so the
+ * values below are only defaults.
+ */
+static struct storvsc_driver_props g_drv_props_table[] = {
+	{"blkvsc", "Hyper-V IDE",
+	 BLKVSC_MAX_IDE_DISKS_PER_TARGET, BLKVSC_MAX_IO_REQUESTS,
+	 20*PAGE_SIZE},
+	{"storvsc", "Hyper-V SCSI",
+	 STORVSC_MAX_LUNS_PER_TARGET, STORVSC_MAX_IO_REQUESTS,
+	 20*PAGE_SIZE}
+};
+
+/*
+ * Sense buffer size changed in win8; have a run-time
+ * variable to track the size we should use.  Updated by
+ * hv_storvsc_channel_init() once a protocol version is accepted.
+ */
+static int sense_buffer_size = PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE;
+
+/*
+ * The storage protocol version is determined during the
+ * initial exchange with the host.  It will indicate which
+ * storage functionality is available in the host.
+ */
+static int vmstor_proto_version;
+
+/*
+ * Settings that go with one storage protocol version: the sense buffer
+ * size to advertise and the number of bytes to trim from the on-wire
+ * packet (see VSTOR_PKT_SIZE).
+ */
+struct vmstor_proto {
+        int proto_version;
+        int sense_buffer_size;
+        int vmscsi_size_delta;
+};
+
+/*
+ * Protocol versions offered to the host.  hv_storvsc_channel_init() walks
+ * this list from the first entry and stops at the first version the host
+ * accepts, so entries are listed in order of preference.
+ */
+static const struct vmstor_proto vmstor_proto_list[] = {
+        {
+                VMSTOR_PROTOCOL_VERSION_WIN10,
+                POST_WIN7_STORVSC_SENSE_BUFFER_SIZE,
+                0
+        },
+        {
+                VMSTOR_PROTOCOL_VERSION_WIN8_1,
+                POST_WIN7_STORVSC_SENSE_BUFFER_SIZE,
+                0
+        },
+        {
+                VMSTOR_PROTOCOL_VERSION_WIN8,
+                POST_WIN7_STORVSC_SENSE_BUFFER_SIZE,
+                0
+        },
+        {
+                VMSTOR_PROTOCOL_VERSION_WIN7,
+                PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE,
+                sizeof(struct vmscsi_win8_extension),
+        },
+        {
+                VMSTOR_PROTOCOL_VERSION_WIN6,
+                PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE,
+                sizeof(struct vmscsi_win8_extension),
+        }
+};
+
+/*
+ * Forward declarations for functions that are referenced before their
+ * definitions.
+ */
+/* static functions */
+static int storvsc_probe(device_t dev);
+static int storvsc_attach(device_t dev);
+static int storvsc_detach(device_t dev);
+static void storvsc_poll(struct cam_sim * sim);
+static void storvsc_action(struct cam_sim * sim, union ccb * ccb);
+static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp);
+static void storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp);
+static enum hv_storage_type storvsc_get_storage_type(device_t dev);
+static void hv_storvsc_rescan_target(struct storvsc_softc *sc);
+static void hv_storvsc_on_channel_callback(struct vmbus_channel *chan, void *xsc);
+static void hv_storvsc_on_iocompletion( struct storvsc_softc *sc,
+					struct vstor_packet *vstor_packet,
+					struct hv_storvsc_request *request);
+static int hv_storvsc_connect_vsp(struct storvsc_softc *);
+static void storvsc_io_done(struct hv_storvsc_request *reqp);
+static void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl,
+				bus_dma_segment_t *orig_sgl,
+				unsigned int orig_sgl_count,
+				uint64_t seg_bits);
+/*
+ * NOTE(review): unlike its siblings this prototype is not static, so the
+ * function has external linkage; confirm whether that is intentional.
+ */
+void storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl,
+				unsigned int dest_sgl_count,
+				struct sglist* src_sgl,
+				uint64_t seg_bits);
+
+/*
+ * newbus glue: probe/attach/detach are implemented in this file,
+ * shutdown uses the generic bus implementation.
+ */
+static device_method_t storvsc_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe,		storvsc_probe),
+	DEVMETHOD(device_attach,	storvsc_attach),
+	DEVMETHOD(device_detach,	storvsc_detach),
+	DEVMETHOD(device_shutdown,      bus_generic_shutdown),
+	DEVMETHOD_END
+};
+
+/* Driver description; the softc is a struct storvsc_softc. */
+static driver_t storvsc_driver = {
+	"storvsc", storvsc_methods, sizeof(struct storvsc_softc),
+};
+
+/* Register the driver on the vmbus bus; it depends on vmbus version 1. */
+static devclass_t storvsc_devclass;
+DRIVER_MODULE(storvsc, vmbus, storvsc_driver, storvsc_devclass, 0, 0);
+MODULE_VERSION(storvsc, 1);
+MODULE_DEPEND(storvsc, vmbus, 1, 1, 1);
+
+/*
+ * Open one sub-channel of the device.
+ *
+ * The sub-channel is assigned a CPU round-robin and opened with the same
+ * ring buffer size (both directions) and the same callback as the
+ * primary channel.  The function returns void, so an open failure cannot
+ * be propagated; it is logged instead of being silently dropped.
+ *
+ * @param sc           storvsc softc, passed to the channel callback
+ * @param new_channel  the sub-channel to open
+ */
+static void
+storvsc_subchan_attach(struct storvsc_softc *sc,
+    struct vmbus_channel *new_channel)
+{
+	struct vmstor_chan_props props;
+	int ret;
+
+	memset(&props, 0, sizeof(props));
+
+	vmbus_chan_cpu_rr(new_channel);
+	ret = vmbus_chan_open(new_channel,
+	    sc->hs_drv_props->drv_ringbuffer_size,
+	    sc->hs_drv_props->drv_ringbuffer_size,
+	    (void *)&props,
+	    sizeof(struct vmstor_chan_props),
+	    hv_storvsc_on_channel_callback, sc);
+	if (ret != 0) {
+		device_printf(sc->hs_dev,
+		    "failed to open sub-channel %u: %d\n",
+		    vmbus_chan_id(new_channel), ret);
+	}
+}
+
+/**
+ * @brief Send multi-channel creation request to host
+ *
+ * Asks the host for MIN(max_subch, mp_ncpus - 1) sub-channels using
+ * sc->hs_init_req, waits for the host's answer and, on success, records
+ * the new channel count in sc->hs_nchan and opens every sub-channel.
+ * All failures are non-fatal: they are logged and the device keeps
+ * running on the primary channel only.
+ *
+ * NOTE(review): the request is cleared and its semaphore initialised
+ * again here although hv_storvsc_channel_init() already initialised the
+ * semaphore of the same request; the caller destroys it once on return.
+ *
+ * @param sc  storvsc softc; the request is sent on sc->hs_chan
+ * @param max_subch  the maximum number of sub-channels to request
+ */
+static void
+storvsc_send_multichannel_request(struct storvsc_softc *sc, int max_subch)
+{
+	struct vmbus_channel **subchan;
+	struct hv_storvsc_request *request;
+	struct vstor_packet *vstor_packet;
+	int request_subch;
+	int ret, i;
+
+	/* get sub-channel count that need to create */
+	request_subch = MIN(max_subch, mp_ncpus - 1);
+	if (request_subch <= 0) {
+		/* Nothing to create, e.g. on a single-CPU guest. */
+		return;
+	}
+
+	request = &sc->hs_init_req;
+
+	/* request the host to create multi-channel */
+	memset(request, 0, sizeof(struct hv_storvsc_request));
+
+	sema_init(&request->synch_sema, 0, ("stor_synch_sema"));
+
+	vstor_packet = &request->vstor_packet;
+
+	vstor_packet->operation = VSTOR_OPERATION_CREATE_MULTI_CHANNELS;
+	vstor_packet->flags = REQUEST_COMPLETION_FLAG;
+	vstor_packet->u.multi_channels_cnt = request_subch;
+
+	ret = vmbus_chan_send(sc->hs_chan,
+	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+	    vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
+	if (ret != 0) {
+		/*
+		 * Nothing was sent, so no completion will ever post the
+		 * semaphore; waiting on it would hang forever.
+		 */
+		printf("Storvsc_error: create multi-channel request "
+		    "send failed (%d)\n", ret);
+		return;
+	}
+
+	sema_wait(&request->synch_sema);
+
+	if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
+	    vstor_packet->status != 0) {
+		printf("Storvsc_error: create multi-channel invalid operation "
+		    "(%d) or status (%u)\n",
+		    vstor_packet->operation, vstor_packet->status);
+		return;
+	}
+
+	/* Update channel count */
+	sc->hs_nchan = request_subch + 1;
+
+	/* Wait for sub-channels setup to complete. */
+	subchan = vmbus_subchan_get(sc->hs_chan, request_subch);
+
+	/* Attach the sub-channels. */
+	for (i = 0; i < request_subch; ++i)
+		storvsc_subchan_attach(sc, subchan[i]);
+
+	/* Release the sub-channels. */
+	vmbus_subchan_rel(subchan, request_subch);
+
+	if (bootverbose)
+		printf("Storvsc create multi-channel success!\n");
+}
+
+/**
+ * @brief initialize channel connection to parent partition
+ *
+ * Runs the vsc/vsp handshake on the already opened primary channel using
+ * sc->hs_init_req: BEGININITIALIZATION, QUERYPROTOCOLVERSION (offering
+ * each entry of vmstor_proto_list in order until the host accepts one),
+ * QUERYPROPERTIES and ENDINITIALIZATION.  Every step sends one packet and
+ * sleeps on the request semaphore until the channel callback has copied
+ * the host's answer into the request and posted the semaphore.
+ *
+ * On success vmstor_proto_version, sense_buffer_size and
+ * vmscsi_size_delta hold the values of the accepted protocol version
+ * and, when the host and the vmbus version allow it, sub-channels are
+ * requested (a failure there is not fatal).
+ *
+ * A step the host answers with an unexpected operation or a non-zero
+ * status fails the whole handshake with EINVAL.
+ *
+ * @param sc  storvsc softc whose primary channel (sc->hs_chan) is open
+ * @returns  0 on success, non-zero error on failure
+ */
+static int
+hv_storvsc_channel_init(struct storvsc_softc *sc)
+{
+	int ret = 0, i;
+	struct hv_storvsc_request *request;
+	struct vstor_packet *vstor_packet;
+	uint16_t max_subch;
+	boolean_t support_multichannel;
+	uint32_t version;
+
+	max_subch = 0;
+	support_multichannel = FALSE;
+
+	request = &sc->hs_init_req;
+	memset(request, 0, sizeof(struct hv_storvsc_request));
+	vstor_packet = &request->vstor_packet;
+	request->softc = sc;
+
+	/**
+	 * Initiate the vsc/vsp initialization protocol on the open channel
+	 */
+	sema_init(&request->synch_sema, 0, ("stor_synch_sema"));
+
+	vstor_packet->operation = VSTOR_OPERATION_BEGININITIALIZATION;
+	vstor_packet->flags = REQUEST_COMPLETION_FLAG;
+
+	ret = vmbus_chan_send(sc->hs_chan,
+	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+	    vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
+
+	if (ret != 0)
+		goto cleanup;
+
+	sema_wait(&request->synch_sema);
+
+	if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
+	    vstor_packet->status != 0) {
+		/* The host rejected the request; do not report success. */
+		ret = EINVAL;
+		goto cleanup;
+	}
+
+	for (i = 0; i < nitems(vmstor_proto_list); i++) {
+		/* reuse the packet for version range supported */
+
+		memset(vstor_packet, 0, sizeof(struct vstor_packet));
+		vstor_packet->operation = VSTOR_OPERATION_QUERYPROTOCOLVERSION;
+		vstor_packet->flags = REQUEST_COMPLETION_FLAG;
+
+		vstor_packet->u.version.major_minor =
+			vmstor_proto_list[i].proto_version;
+
+		/* revision is only significant for Windows guests */
+		vstor_packet->u.version.revision = 0;
+
+		ret = vmbus_chan_send(sc->hs_chan,
+		    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+		    vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
+
+		if (ret != 0)
+			goto cleanup;
+
+		sema_wait(&request->synch_sema);
+
+		if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO) {
+			ret = EINVAL;
+			goto cleanup;
+		}
+		if (vstor_packet->status == 0) {
+			/* First accepted version wins. */
+			vmstor_proto_version =
+				vmstor_proto_list[i].proto_version;
+			sense_buffer_size =
+				vmstor_proto_list[i].sense_buffer_size;
+			vmscsi_size_delta =
+				vmstor_proto_list[i].vmscsi_size_delta;
+			break;
+		}
+	}
+
+	/* Still non-zero here means the host refused every version. */
+	if (vstor_packet->status != 0) {
+		ret = EINVAL;
+		goto cleanup;
+	}
+	/**
+	 * Query channel properties
+	 */
+	memset(vstor_packet, 0, sizeof(struct vstor_packet));
+	vstor_packet->operation = VSTOR_OPERATION_QUERYPROPERTIES;
+	vstor_packet->flags = REQUEST_COMPLETION_FLAG;
+
+	ret = vmbus_chan_send(sc->hs_chan,
+	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+	    vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
+
+	if (ret != 0)
+		goto cleanup;
+
+	sema_wait(&request->synch_sema);
+
+	/* TODO: Check returned version */
+	if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
+	    vstor_packet->status != 0) {
+		ret = EINVAL;
+		goto cleanup;
+	}
+
+	/*
+	 * The host reports the number of sub-channels it supports; the
+	 * chan_cnt tunable (total channels, primary included) may lower it.
+	 */
+	max_subch = vstor_packet->u.chan_props.max_channel_cnt;
+	if (hv_storvsc_chan_cnt > 0 && hv_storvsc_chan_cnt < (max_subch + 1))
+		max_subch = hv_storvsc_chan_cnt - 1;
+
+	/* multi-channels feature is supported by WIN8 and above version */
+	version = VMBUS_GET_VERSION(device_get_parent(sc->hs_dev), sc->hs_dev);
+	if (version != VMBUS_VERSION_WIN7 && version != VMBUS_VERSION_WS2008 &&
+	    (vstor_packet->u.chan_props.flags &
+	     HV_STORAGE_SUPPORTS_MULTI_CHANNEL)) {
+		support_multichannel = TRUE;
+	}
+	if (bootverbose) {
+		device_printf(sc->hs_dev, "max chans %d%s\n", max_subch + 1,
+		    support_multichannel ? ", multi-chan capable" : "");
+	}
+
+	memset(vstor_packet, 0, sizeof(struct vstor_packet));
+	vstor_packet->operation = VSTOR_OPERATION_ENDINITIALIZATION;
+	vstor_packet->flags = REQUEST_COMPLETION_FLAG;
+
+	ret = vmbus_chan_send(sc->hs_chan,
+	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+	    vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
+
+	if (ret != 0) {
+		goto cleanup;
+	}
+
+	sema_wait(&request->synch_sema);
+
+	if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
+	    vstor_packet->status != 0) {
+		ret = EINVAL;
+		goto cleanup;
+	}
+
+	/*
+	 * If multi-channel is supported, send multichannel create
+	 * request to host.
+	 */
+	if (support_multichannel && max_subch > 0)
+		storvsc_send_multichannel_request(sc, max_subch);
+cleanup:
+	sema_destroy(&request->synch_sema);
+	return (ret);
+}
+
+/**
+ * @brief Open channel connection to parent partition StorVSP driver
+ *
+ * Assigns the primary channel a CPU, opens it with the configured ring
+ * buffer size for both directions and then runs the initialization
+ * handshake with the host.
+ *
+ * @param sc  storvsc softc whose primary channel is to be opened
+ * @returns 0 on success, non-zero error on failure
+ */
+static int
+hv_storvsc_connect_vsp(struct storvsc_softc *sc)
+{
+	struct vmstor_chan_props chan_props;
+	int error;
+
+	memset(&chan_props, 0, sizeof(chan_props));
+
+	/* Pick a CPU for the channel, then open it. */
+	vmbus_chan_cpu_rr(sc->hs_chan);
+	error = vmbus_chan_open(sc->hs_chan,
+	    sc->hs_drv_props->drv_ringbuffer_size,
+	    sc->hs_drv_props->drv_ringbuffer_size,
+	    (void *)&chan_props, sizeof(chan_props),
+	    hv_storvsc_on_channel_callback, sc);
+	if (error != 0)
+		return (error);
+
+	/* The channel is up; negotiate with the host. */
+	return (hv_storvsc_channel_init(sc));
+}
+
+#if HVS_HOST_RESET
+/*
+ * Ask the host to reset the bus and wait for its answer.
+ *
+ * Only built when HVS_HOST_RESET is non-zero.  The request is sent on
+ * the primary channel (sc->hs_chan) using sc->hs_reset_req; the channel
+ * callback posts the request semaphore when the host responds.
+ *
+ * @param sc  storvsc softc of the adapter to reset
+ * @returns 0 on success, the vmbus_chan_send() error otherwise
+ */
+static int
+hv_storvsc_host_reset(struct storvsc_softc *sc)
+{
+	int ret = 0;
+
+	struct hv_storvsc_request *request;
+	struct vstor_packet *vstor_packet;
+
+	request = &sc->hs_reset_req;
+	request->softc = sc;
+	vstor_packet = &request->vstor_packet;
+
+	sema_init(&request->synch_sema, 0, "stor synch sema");
+
+	vstor_packet->operation = VSTOR_OPERATION_RESETBUS;
+	vstor_packet->flags = REQUEST_COMPLETION_FLAG;
+
+	/* There is no 'dev' in scope here; the channel lives in the softc. */
+	ret = vmbus_chan_send(sc->hs_chan,
+	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+	    vstor_packet, VSTOR_PKT_SIZE,
+	    (uint64_t)(uintptr_t)&sc->hs_reset_req);
+
+	if (ret != 0) {
+		goto cleanup;
+	}
+
+	sema_wait(&request->synch_sema);
+
+	/*
+	 * At this point, all outstanding requests in the adapter
+	 * should have been flushed out and return to us
+	 */
+
+cleanup:
+	sema_destroy(&request->synch_sema);
+	return (ret);
+}
+#endif /* HVS_HOST_RESET */
+
+/**
+ * @brief Function to initiate an I/O request
+ *
+ * Completes the SRB header of the request's packet, picks an outgoing
+ * channel from the LUN and the current CPU, and sends the packet, with
+ * the page list when the request carries data.  The softc lock is
+ * dropped around the send and re-acquired afterwards, so the caller
+ * must hold it on entry.  On success the outstanding request counter is
+ * incremented.
+ *
+ * @param sc  storvsc softc of the device
+ * @param request pointer to a request structure
+ * @returns 0 on success, non-zero error on failure
+ */
+static int
+hv_storvsc_io_request(struct storvsc_softc *sc,
+					  struct hv_storvsc_request *request)
+{
+	struct vstor_packet *vstor_packet = &request->vstor_packet;
+	struct vmbus_channel* outgoing_channel = NULL;
+	int ret = 0, ch_sel;
+
+	vstor_packet->flags |= REQUEST_COMPLETION_FLAG;
+
+	vstor_packet->u.vm_srb.length =
+	    sizeof(struct vmscsi_req) - vmscsi_size_delta;
+
+	vstor_packet->u.vm_srb.sense_info_len = sense_buffer_size;
+
+	vstor_packet->u.vm_srb.transfer_len =
+	    request->prp_list.gpa_range.gpa_len;
+
+	vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB;
+
+	ch_sel = (vstor_packet->u.vm_srb.lun + curcpu) % sc->hs_nchan;
+	/*
+	 * If we are panic'ing, then we are dumping core. Since storvsc_polls
+	 * always uses sc->hs_chan, then we must send to that channel or a poll
+	 * timeout will occur.
+	 */
+	if (panicstr) {
+		outgoing_channel = sc->hs_chan;
+	} else {
+		outgoing_channel = sc->hs_sel_chan[ch_sel];
+	}
+
+	mtx_unlock(&request->softc->hs_lock);
+	if (request->prp_list.gpa_range.gpa_len) {
+		ret = vmbus_chan_send_prplist(outgoing_channel,
+		    &request->prp_list.gpa_range, request->prp_cnt,
+		    vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
+	} else {
+		ret = vmbus_chan_send(outgoing_channel,
+		    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+		    vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
+	}
+	/* statistic for successful request sending on each channel */
+	if (!ret) {
+		sc->sysctl_data.chan_send_cnt[ch_sel]++;
+	}
+	mtx_lock(&request->softc->hs_lock);
+
+	if (ret != 0) {
+		/* Terminate the line so later console output is not glued on. */
+		printf("Unable to send packet %p ret %d\n", vstor_packet, ret);
+	} else {
+		atomic_add_int(&sc->hs_num_out_reqs, 1);
+	}
+
+	return (ret);
+}
+
+
+/**
+ * Process IO_COMPLETION_OPERATION and ready
+ * the result to be completed for upper layer
+ * processing by the CAM layer.
+ *
+ * The SCSI status, SRB status and transfer length of the host's response
+ * are copied into the request, autosense data (if flagged valid) is
+ * copied into the request's sense buffer, and the request is handed to
+ * storvsc_io_done().  Afterwards the outstanding request counter is
+ * decremented and a pending drain is woken up once it reaches zero.
+ *
+ * @param sc            storvsc softc of the device
+ * @param vstor_packet  the completion packet received from the host
+ * @param request       the request the completion belongs to
+ */
+static void
+hv_storvsc_on_iocompletion(struct storvsc_softc *sc,
+			   struct vstor_packet *vstor_packet,
+			   struct hv_storvsc_request *request)
+{
+	struct vmscsi_req *vm_srb;
+	unsigned int sense_len;
+
+	vm_srb = &vstor_packet->u.vm_srb;
+
+	/*
+	 * Copy some fields of the host's response into the request structure,
+	 * because the fields will be used later in storvsc_io_done().
+	 */
+	request->vstor_packet.u.vm_srb.scsi_status = vm_srb->scsi_status;
+	request->vstor_packet.u.vm_srb.srb_status = vm_srb->srb_status;
+	request->vstor_packet.u.vm_srb.transfer_len = vm_srb->transfer_len;
+
+	if (((vm_srb->scsi_status & 0xFF) == SCSI_STATUS_CHECK_COND) &&
+			(vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID)) {
+		/* Autosense data available */
+
+		KASSERT(vm_srb->sense_info_len <= request->sense_info_len,
+				("vm_srb->sense_info_len <= "
+				 "request->sense_info_len"));
+
+		/*
+		 * The length comes from the host.  KASSERT compiles away
+		 * without INVARIANTS, so clamp it to the capacity recorded
+		 * in the request rather than overrun the sense buffer.
+		 */
+		sense_len = vm_srb->sense_info_len;
+		if (sense_len > request->sense_info_len)
+			sense_len = request->sense_info_len;
+
+		memcpy(request->sense_data, vm_srb->u.sense_data, sense_len);
+
+		request->sense_info_len = sense_len;
+	}
+
+	/* Complete request by passing to the CAM layer */
+	storvsc_io_done(request);
+	atomic_subtract_int(&sc->hs_num_out_reqs, 1);
+	if (sc->hs_drain_notify && (sc->hs_num_out_reqs == 0)) {
+		sema_post(&sc->hs_drain_sema);
+	}
+}
+
+/*
+ * Schedule a CAM rescan of the SIM's bus.
+ *
+ * A CCB is allocated without sleeping and given a wildcard target/LUN
+ * path on the SIM's path id, then passed to xpt_rescan().  If either the
+ * CCB or the path cannot be obtained a message is printed and no rescan
+ * happens.
+ */
+static void
+hv_storvsc_rescan_target(struct storvsc_softc *sc)
+{
+	const path_id_t bus_id = cam_sim_path(sc->hs_sim);
+	const target_id_t tgt_id = CAM_TARGET_WILDCARD;
+	union ccb *scan_ccb;
+
+	/* Must not sleep here, hence the nowait allocation. */
+	scan_ccb = xpt_alloc_ccb_nowait();
+	if (scan_ccb == NULL) {
+		printf("unable to alloc CCB for rescan\n");
+		return;
+	}
+
+	if (xpt_create_path(&scan_ccb->ccb_h.path, NULL, bus_id, tgt_id,
+	    CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
+		printf("unable to create path for rescan, pathid: %u,"
+		    "targetid: %u\n", bus_id, tgt_id);
+		xpt_free_ccb(scan_ccb);
+		return;
+	}
+
+	/* A wildcard target means the whole bus is scanned. */
+	scan_ccb->ccb_h.func_code = (tgt_id == CAM_TARGET_WILDCARD) ?
+	    XPT_SCAN_BUS : XPT_SCAN_TGT;
+
+	xpt_rescan(scan_ccb);
+}
+
+/*
+ * Channel callback: drain and dispatch every packet pending on 'channel'.
+ *
+ * The transaction id of each packet is the address of the request it
+ * answers.  For the softc's init and reset requests the packet is copied
+ * into the request and the waiter is woken through the request
+ * semaphore.  Any other packet is dispatched on its operation code:
+ * I/O completions go to hv_storvsc_on_iocompletion(), bus enumeration
+ * triggers a rescan, everything else is ignored.
+ *
+ * @param channel  the channel that has data pending
+ * @param xsc      the storvsc softc registered with vmbus_chan_open()
+ */
+static void
+hv_storvsc_on_channel_callback(struct vmbus_channel *channel, void *xsc)
+{
+	int ret = 0;
+	struct storvsc_softc *sc = xsc;
+	uint32_t bytes_recvd;
+	uint64_t request_id;
+	uint8_t packet[roundup2(sizeof(struct vstor_packet), 8)];
+	struct hv_storvsc_request *request;
+	struct vstor_packet *vstor_packet;
+
+	for (;;) {
+		/* In: capacity offered to the receiver; out: bytes received. */
+		bytes_recvd = roundup2(VSTOR_PKT_SIZE, 8);
+		ret = vmbus_chan_recv(channel, packet, &bytes_recvd,
+		    &request_id);
+		KASSERT(ret != ENOBUFS,
+		    ("storvsc recvbuf is not large enough"));
+		/*
+		 * XXX check bytes_recvd to make sure that it contains
+		 * enough data
+		 */
+		if (ret != 0 || bytes_recvd == 0)
+			break;
+
+		request = (struct hv_storvsc_request *)(uintptr_t)request_id;
+
+		if ((request == &sc->hs_init_req) ||
+			(request == &sc->hs_reset_req)) {
+			/* Synchronous request: hand over the answer and wake. */
+			memcpy(&request->vstor_packet, packet,
+				   sizeof(struct vstor_packet));
+			sema_post(&request->synch_sema);
+			continue;
+		}
+
+		vstor_packet = (struct vstor_packet *)packet;
+		switch(vstor_packet->operation) {
+		case VSTOR_OPERATION_COMPLETEIO:
+			if (request == NULL)
+				panic("VMBUS: storvsc received a "
+				    "packet with NULL request id in "
+				    "COMPLETEIO operation.");
+
+			hv_storvsc_on_iocompletion(sc,
+						vstor_packet, request);
+			break;
+		case VSTOR_OPERATION_REMOVEDEVICE:
+			printf("VMBUS: storvsc operation %d not "
+			    "implemented.\n", vstor_packet->operation);
+			/* TODO: implement */
+			break;
+		case VSTOR_OPERATION_ENUMERATE_BUS:
+			hv_storvsc_rescan_target(sc);
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+/**
+ * @brief StorVSC probe function
+ *
+ * Device probe function.  Claims the device with BUS_PROBE_DEFAULT when
+ * it is a StorVSC (SCSI) or BlkVSC (paravirtual IDE) device and sets the
+ * matching description; any other device is rejected with ENXIO.
+ *
+ * @param dev  the device being probed
+ * @returns BUS_PROBE_DEFAULT on a match, ENXIO if not a matching StorVSC
+ *          device
+ */
+static int
+storvsc_probe(device_t dev)
+{
+	enum hv_storage_type kind;
+
+	kind = storvsc_get_storage_type(dev);
+	if (kind != DRIVER_BLKVSC && kind != DRIVER_STORVSC)
+		return (ENXIO);
+
+	if (kind == DRIVER_BLKVSC) {
+		if (bootverbose)
+			device_printf(dev,
+			    "Enlightened ATA/IDE detected\n");
+	} else {
+		if (bootverbose)
+			device_printf(dev,
+			    "Enlightened SCSI device detected\n");
+	}
+
+	/* The storage type doubles as the index into the properties table. */
+	device_set_desc(dev, g_drv_props_table[kind].drv_desc);
+	return (BUS_PROBE_DEFAULT);
+}
+
+/*
+ * Fill sc->hs_sel_chan[], the table hv_storvsc_io_request() selects the
+ * outgoing channel from: slot 0 is the primary channel, slots
+ * 1..hs_nchan-1 are the sub-channels in the order vmbus returns them.
+ */
+static void
+storvsc_create_chan_sel(struct storvsc_softc *sc)
+{
+	struct vmbus_channel **subchans;
+	int slot, subcnt;
+
+	sc->hs_sel_chan[0] = sc->hs_chan;
+
+	subcnt = sc->hs_nchan - 1;
+	if (subcnt != 0) {
+		subchans = vmbus_subchan_get(sc->hs_chan, subcnt);
+		for (slot = 1; slot <= subcnt; slot++)
+			sc->hs_sel_chan[slot] = subchans[slot - 1];
+		vmbus_subchan_rel(subchans, subcnt);
+	}
+}
+
+/*
+ * Create the DMA tag for request data and pre-allocate the request pool.
+ *
+ * drv_max_ios_per_target requests are allocated, each with its own DMA
+ * map, and placed on sc->hs_free_list.  If a DMA map cannot be created,
+ * every request allocated so far -- including the one that just failed
+ * -- is released again.
+ *
+ * NOTE(review): on that failure path sc->storvsc_req_dtag stays
+ * allocated; confirm whether the caller is expected to destroy it.
+ *
+ * @param dev  the storvsc device
+ * @returns 0 on success, the bus_dma error otherwise
+ */
+static int
+storvsc_init_requests(device_t dev)
+{
+	struct storvsc_softc *sc = device_get_softc(dev);
+	struct hv_storvsc_request *reqp;
+	int error, i;
+
+	LIST_INIT(&sc->hs_free_list);
+
+	error = bus_dma_tag_create(
+		bus_get_dma_tag(dev),		/* parent */
+		1,				/* alignment */
+		PAGE_SIZE,			/* boundary */
+		BUS_SPACE_MAXADDR,		/* lowaddr */
+		BUS_SPACE_MAXADDR,		/* highaddr */
+		NULL, NULL,			/* filter, filterarg */
+		STORVSC_DATA_SIZE_MAX,		/* maxsize */
+		STORVSC_DATA_SEGCNT_MAX,	/* nsegments */
+		STORVSC_DATA_SEGSZ_MAX,		/* maxsegsize */
+		0,				/* flags */
+		NULL,				/* lockfunc */
+		NULL,				/* lockfuncarg */
+		&sc->storvsc_req_dtag);
+	if (error) {
+		device_printf(dev, "failed to create storvsc dma tag\n");
+		return (error);
+	}
+
+	for (i = 0; i < sc->hs_drv_props->drv_max_ios_per_target; ++i) {
+		reqp = malloc(sizeof(struct hv_storvsc_request),
+				 M_DEVBUF, M_WAITOK|M_ZERO);
+		reqp->softc = sc;
+		error = bus_dmamap_create(sc->storvsc_req_dtag, 0,
+				&reqp->data_dmap);
+		if (error) {
+			device_printf(dev, "failed to allocate storvsc "
+			    "data dmamap\n");
+			/*
+			 * This request is not on the free list yet, so the
+			 * cleanup loop below would never see it.
+			 */
+			free(reqp, M_DEVBUF);
+			goto cleanup;
+		}
+		LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link);
+	}
+	return (0);
+
+cleanup:
+	while ((reqp = LIST_FIRST(&sc->hs_free_list)) != NULL) {
+		LIST_REMOVE(reqp, link);
+		bus_dmamap_destroy(sc->storvsc_req_dtag, reqp->data_dmap);
+		free(reqp, M_DEVBUF);
+	}
+	return (error);
+}
+
+/*
+ * Publish the per-device statistics under the device's sysctl tree:
+ * three data-block counters directly below the device node, and one
+ * "channel.<channel id>.send_req" counter per channel in use.
+ *
+ * Node creation failures end the function early; whatever was added up
+ * to that point stays registered in the device's sysctl context.
+ *
+ * @param dev  the storvsc device
+ */
+static void
+storvsc_sysctl(device_t dev)
+{
+	struct sysctl_oid_list *child;
+	struct sysctl_ctx_list *ctx;
+	struct sysctl_oid *ch_tree, *chid_tree;
+	struct storvsc_softc *sc;
+	char name[16];
+	int i;
+
+	sc = device_get_softc(dev);
+	ctx = device_get_sysctl_ctx(dev);
+	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+
+	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_bio_cnt", CTLFLAG_RW,
+		&sc->sysctl_data.data_bio_cnt, "# of bio data block");
+	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_vaddr_cnt", CTLFLAG_RW,
+		&sc->sysctl_data.data_vaddr_cnt, "# of vaddr data block");
+	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_sg_cnt", CTLFLAG_RW,
+		&sc->sysctl_data.data_sg_cnt, "# of sg data block");
+
+	/* dev.storvsc.UNIT.channel */
+	ch_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "channel",
+		CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+	if (ch_tree == NULL)
+		return;
+
+	for (i = 0; i < sc->hs_nchan; i++) {
+		uint32_t ch_id;
+
+		ch_id = vmbus_chan_id(sc->hs_sel_chan[i]);
+		/* ch_id is unsigned, so format it as such. */
+		snprintf(name, sizeof(name), "%u", ch_id);
+		/* dev.storvsc.UNIT.channel.CHID */
+		chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree),
+			OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+		if (chid_tree == NULL)
+			return;
+		/* dev.storvsc.UNIT.channel.CHID.send_req */
+		SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
+			"send_req", CTLFLAG_RD, &sc->sysctl_data.chan_send_cnt[i],
+			"# of request sending from this channel");
+	}
+}
+
+/**
+ * @brief StorVSC attach function
+ *
+ * Function responsible for allocating per-device structures,
+ * setting up CAM interfaces and scanning for available LUNs to
+ * be used for SCSI device peripherals.
+ *
+ * The visible sequence is: take a root mount hold, determine the storage
+ * type, fill in the driver properties, initialize the softc lock,
+ * pre-allocate requests, build the global bounce SG-list pool (first
+ * attach only), connect to the VSP, build the channel selection table,
+ * allocate the CAM devq and SIM, register the bus, create a wildcard
+ * path and finally publish the sysctl nodes.
+ *
+ * @param dev a device
+ * @returns 0 on success or an error on failure
+ */
+static int
+storvsc_attach(device_t dev)
+{
+	enum hv_storage_type stor_type;
+	struct storvsc_softc *sc;
+	struct cam_devq *devq;
+	int ret, i, j;
+	struct hv_storvsc_request *reqp;
+	struct root_hold_token *root_mount_token = NULL;
+	struct hv_sgl_node *sgl_node = NULL;
+	void *tmp_buff = NULL;
+
+	/*
+	 * We need to serialize storvsc attach calls.
+	 */
+	root_mount_token = root_mount_hold("storvsc");
+
+	sc = device_get_softc(dev);
+	/* Start with a single channel; hs_chan is the device's own channel. */
+	sc->hs_nchan = 1;
+	sc->hs_chan = vmbus_get_channel(dev);
+
+	stor_type = storvsc_get_storage_type(dev);
+
+	if (stor_type == DRIVER_UNKNOWN) {
+		/*
+		 * NOTE(review): this jumps to cleanup before
+		 * storvsc_init_requests() has run, so the LIST_EMPTY() test
+		 * on hs_free_list there presumably relies on the softc being
+		 * zero-filled -- confirm.
+		 */
+		ret = ENODEV;
+		goto cleanup;
+	}
+
+	/* fill in driver specific properties */
+	/*
+	 * The properties entry is shared through g_drv_props_table, so the
+	 * two tunable-derived fields below are written into the table itself.
+	 */
+	sc->hs_drv_props = &g_drv_props_table[stor_type];
+	sc->hs_drv_props->drv_ringbuffer_size = hv_storvsc_ringbuffer_size;
+	sc->hs_drv_props->drv_max_ios_per_target =
+		MIN(STORVSC_MAX_IO, hv_storvsc_max_io);
+	if (bootverbose) {
+		printf("storvsc ringbuffer size: %d, max_io: %d\n",
+			sc->hs_drv_props->drv_ringbuffer_size,
+			sc->hs_drv_props->drv_max_ios_per_target);
+	}
+	/* fill in device specific properties */
+	sc->hs_unit	= device_get_unit(dev);
+	sc->hs_dev	= dev;
+
+	/* This mutex is later handed to cam_sim_alloc() as the SIM lock. */
+	mtx_init(&sc->hs_lock, "hvslck", NULL, MTX_DEF);
+
+	ret = storvsc_init_requests(dev);
+	if (ret != 0)
+		goto cleanup;
+
+	/* create sg-list page pool */
+	/* The pool is global: only the first attach that gets here builds it. */
+	if (FALSE == g_hv_sgl_page_pool.is_init) {
+		g_hv_sgl_page_pool.is_init = TRUE;
+		LIST_INIT(&g_hv_sgl_page_pool.in_use_sgl_list);
+		LIST_INIT(&g_hv_sgl_page_pool.free_sgl_list);
+
+		/*
+		 * Pre-create SG list, each SG list with
+		 * STORVSC_DATA_SEGCNT_MAX segments, each
+		 * segment has one page buffer
+		 */
+		for (i = 0; i < sc->hs_drv_props->drv_max_ios_per_target; i++) {
+	        	sgl_node = malloc(sizeof(struct hv_sgl_node),
+			    M_DEVBUF, M_WAITOK|M_ZERO);
+
+			sgl_node->sgl_data =
+			    sglist_alloc(STORVSC_DATA_SEGCNT_MAX,
+			    M_WAITOK|M_ZERO);
+
+			for (j = 0; j < STORVSC_DATA_SEGCNT_MAX; j++) {
+				tmp_buff = malloc(PAGE_SIZE,
+				    M_DEVBUF, M_WAITOK|M_ZERO);
+
+				/*
+				 * ss_paddr holds the malloc()ed pointer
+				 * itself (cast to vm_paddr_t), not a
+				 * translated physical address.
+				 */
+				sgl_node->sgl_data->sg_segs[j].ss_paddr =
+				    (vm_paddr_t)tmp_buff;
+			}
+
+			LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list,
+			    sgl_node, link);
+		}
+	}
+
+	/* Reset the teardown/drain state before the channel goes live. */
+	sc->hs_destroy = FALSE;
+	sc->hs_drain_notify = FALSE;
+	sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema");
+
+	ret = hv_storvsc_connect_vsp(sc);
+	if (ret != 0) {
+		goto cleanup;
+	}
+
+	/* Construct cpu to channel mapping */
+	storvsc_create_chan_sel(sc);
+
+	/*
+	 * Create the device queue.
+	 * Hyper-V maps each target to one SCSI HBA
+	 */
+	devq = cam_simq_alloc(sc->hs_drv_props->drv_max_ios_per_target);
+	if (devq == NULL) {
+		device_printf(dev, "Failed to alloc device queue\n");
+		ret = ENOMEM;
+		goto cleanup;
+	}
+
+	sc->hs_sim = cam_sim_alloc(storvsc_action,
+				storvsc_poll,
+				sc->hs_drv_props->drv_name,
+				sc,
+				sc->hs_unit,
+				&sc->hs_lock, 1,
+				sc->hs_drv_props->drv_max_ios_per_target,
+				devq);
+
+	if (sc->hs_sim == NULL) {
+		device_printf(dev, "Failed to alloc sim\n");
+		/* The devq is still ours here, so it is released by hand. */
+		cam_simq_free(devq);
+		ret = ENOMEM;
+		goto cleanup;
+	}
+
+	/* Bus registration and path creation are done with hs_lock held. */
+	mtx_lock(&sc->hs_lock);
+	/* bus_id is set to 0, need to get it from VMBUS channel query? */
+	if (xpt_bus_register(sc->hs_sim, dev, 0) != CAM_SUCCESS) {
+		cam_sim_free(sc->hs_sim, /*free_devq*/TRUE);
+		mtx_unlock(&sc->hs_lock);
+		device_printf(dev, "Unable to register SCSI bus\n");
+		ret = ENXIO;
+		goto cleanup;
+	}
+
+	if (xpt_create_path(&sc->hs_path, /*periph*/NULL,
+		 cam_sim_path(sc->hs_sim),
+		CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
+		/* Undo the bus registration before dropping the SIM. */
+		xpt_bus_deregister(cam_sim_path(sc->hs_sim));
+		cam_sim_free(sc->hs_sim, /*free_devq*/TRUE);
+		mtx_unlock(&sc->hs_lock);
+		device_printf(dev, "Unable to create path\n");
+		ret = ENXIO;
+		goto cleanup;
+	}
+
+	mtx_unlock(&sc->hs_lock);
+
+	storvsc_sysctl(dev);
+
+	root_mount_rel(root_mount_token);
+	return (0);
+
+
+cleanup:
+	/*
+	 * Common failure exit: release the root mount hold, then free every
+	 * request on this softc's free list and every node on the global
+	 * free SG-list pool.
+	 *
+	 * NOTE(review): hs_lock and hs_drain_sema are not destroyed on this
+	 * path, the VSP connection is not torn down, and
+	 * g_hv_sgl_page_pool.is_init stays TRUE after the global pool has
+	 * been emptied -- confirm whether that is intended when several
+	 * storvsc devices exist.
+	 */
+	root_mount_rel(root_mount_token);
+	while (!LIST_EMPTY(&sc->hs_free_list)) {
+		reqp = LIST_FIRST(&sc->hs_free_list);
+		LIST_REMOVE(reqp, link);
+		bus_dmamap_destroy(sc->storvsc_req_dtag, reqp->data_dmap);
+		free(reqp, M_DEVBUF);
+	}
+
+	while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) {
+		sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list);
+		LIST_REMOVE(sgl_node, link);
+		for (j = 0; j < STORVSC_DATA_SEGCNT_MAX; j++) {
+			if (NULL !=
+			    (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) {
+				free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF);
+			}
+		}
+		sglist_free(sgl_node->sgl_data);
+		free(sgl_node, M_DEVBUF);
+	}
+
+	return (ret);
+}
+
+/**
+ * @brief StorVSC device detach function
+ *
+ * Marks the softc as being destroyed, waits on the drain semaphore,
+ * closes the VMBus channel and then releases the pre-allocated
+ * requests of this device as well as every node that currently sits
+ * on the global free SG-list pool.
+ *
+ * @param dev a device
+ * @returns 0 on success
+ */
+static int
+storvsc_detach(device_t dev)
+{
+	struct storvsc_softc *softc;
+	struct hv_storvsc_request *req;
+	struct hv_sgl_node *node;
+	void *page;
+	int seg;
+
+	softc = device_get_softc(dev);
+	softc->hs_destroy = TRUE;
+
+	/*
+	 * From here on no new outbound traffic is expected; only inbound
+	 * responses are allowed through so that outstanding requests can
+	 * complete.  Ask to be notified and sleep until the drain is
+	 * signalled.
+	 */
+	softc->hs_drain_notify = TRUE;
+	sema_wait(&softc->hs_drain_sema);
+	softc->hs_drain_notify = FALSE;
+
+	/*
+	 * Draining is done, so no busy wait is required.  Closing the
+	 * channel resets the callback under the protection of the
+	 * incoming channel lock.
+	 */
+	vmbus_chan_close(softc->hs_chan);
+
+	/* Release every idle request together with its DMA map. */
+	mtx_lock(&softc->hs_lock);
+	while ((req = LIST_FIRST(&softc->hs_free_list)) != NULL) {
+		LIST_REMOVE(req, link);
+		bus_dmamap_destroy(softc->storvsc_req_dtag, req->data_dmap);
+		free(req, M_DEVBUF);
+	}
+	mtx_unlock(&softc->hs_lock);
+
+	/* Release the bounce pages and SG lists of the global free pool. */
+	while ((node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list)) != NULL) {
+		LIST_REMOVE(node, link);
+		for (seg = 0; seg < STORVSC_DATA_SEGCNT_MAX; seg++) {
+			page = (void *)node->sgl_data->sg_segs[seg].ss_paddr;
+			if (page != NULL)
+				free(page, M_DEVBUF);
+		}
+		sglist_free(node->sgl_data);
+		free(node, M_DEVBUF);
+	}
+
+	return (0);
+}
+
+#if HVS_TIMEOUT_TEST
+/**
+ * @brief unit test for timed out operations
+ *
+ * This function provides unit testing capability to simulate
+ * timed out operations.  Recompilation with HVS_TIMEOUT_TEST=1
+ * is required.
+ *
+ * The request is only sent when its first CDB byte equals @p opcode.
+ * With @p wait set, the caller then sleeps on the request's condition
+ * variable for up to 60 seconds, spins for another 100ms and prints
+ * the resulting queue/ccb state.
+ *
+ * NOTE(review): the CDB is read through vstor_packet.vm_srb.cdb[] here,
+ * while create_storvsc_request() fills vstor_packet.u.vm_srb.u.cdb --
+ * this test code may be stale; confirm it still builds when enabled.
+ *
+ * @param reqp pointer to a request structure
+ * @param opcode SCSI operation being performed
+ * @param wait if 1, wait for I/O to complete
+ */
+static void
+storvsc_timeout_test(struct hv_storvsc_request *reqp,
+		uint8_t opcode, int wait)
+{
+	int ret;
+	union ccb *ccb = reqp->ccb;
+	struct storvsc_softc *sc = reqp->softc;
+
+	/* Only the opcode under test is handled here. */
+	if (reqp->vstor_packet.vm_srb.cdb[0] != opcode) {
+		return;
+	}
+
+	/* Take the event mutex before sending so the wakeup cannot be missed. */
+	if (wait) {
+		mtx_lock(&reqp->event.mtx);
+	}
+	ret = hv_storvsc_io_request(sc, reqp);
+	if (ret != 0) {
+		if (wait) {
+			mtx_unlock(&reqp->event.mtx);
+		}
+		printf("%s: io_request failed with %d.\n",
+				__func__, ret);
+		/* Fail the ccb and give the request back under hs_lock. */
+		ccb->ccb_h.status = CAM_PROVIDE_FAIL;
+		mtx_lock(&sc->hs_lock);
+		storvsc_free_request(sc, reqp);
+		xpt_done(ccb);
+		mtx_unlock(&sc->hs_lock);
+		return;
+	}
+
+	if (wait) {
+		xpt_print(ccb->ccb_h.path,
+				"%u: %s: waiting for IO return.\n",
+				ticks, __func__);
+		/* A return value of 0 means the cv was signalled in time. */
+		ret = cv_timedwait(&reqp->event.cv, &reqp->event.mtx, 60*hz);
+		mtx_unlock(&reqp->event.mtx);
+		xpt_print(ccb->ccb_h.path, "%u: %s: %s.\n",
+				ticks, __func__, (ret == 0)?
+				"IO return detected" :
+				"IO return not detected");
+		/*
+		 * Now both the timer handler and io done are running
+		 * simultaneously. We want to confirm the io done always
+		 * finishes after the timer handler exits. So reqp used by
+		 * timer handler is not freed or stale. Do busy loop for
+		 * another 1/10 second to make sure io done does
+		 * wait for the timer handler to complete.
+		 */
+		DELAY(100*1000);
+		mtx_lock(&sc->hs_lock);
+		xpt_print(ccb->ccb_h.path,
+				"%u: %s: finishing, queue frozen %d, "
+				"ccb status 0x%x scsi_status 0x%x.\n",
+				ticks, __func__, sc->hs_frozen,
+				ccb->ccb_h.status,
+				ccb->csio.scsi_status);
+		mtx_unlock(&sc->hs_lock);
+	}
+}
+#endif /* HVS_TIMEOUT_TEST */
+
+#ifdef notyet
+/**
+ * @brief timeout handler for requests
+ *
+ * This function is called as a result of a callout expiring.
+ *
+ * On the first expiry (retries == 0) it logs the timeout, bumps the
+ * retry counter and re-arms the callout for another ccb timeout period.
+ * On a later expiry it logs again and freezes the SIM queue unless the
+ * softc already records it as frozen.
+ *
+ * @param arg pointer to a request
+ */
+static void
+storvsc_timeout(void *arg)
+{
+	struct hv_storvsc_request *reqp = arg;
+	struct storvsc_softc *sc = reqp->softc;
+	union ccb *ccb = reqp->ccb;
+
+	if (reqp->retries == 0) {
+		/* First expiry: report it and give the I/O one more period. */
+		mtx_lock(&sc->hs_lock);
+		xpt_print(ccb->ccb_h.path,
+		    "%u: IO timed out (req=0x%p), wait for another %u secs.\n",
+		    ticks, reqp, ccb->ccb_h.timeout / 1000);
+		cam_error_print(ccb, CAM_ESF_ALL, CAM_EPF_ALL);
+		mtx_unlock(&sc->hs_lock);
+
+		reqp->retries++;
+		/* ccb_h.timeout is scaled by SBT_1MS, i.e. treated as milliseconds. */
+		callout_reset_sbt(&reqp->callout, SBT_1MS * ccb->ccb_h.timeout,
+		    0, storvsc_timeout, reqp, 0);
+#if HVS_TIMEOUT_TEST
+		storvsc_timeout_test(reqp, SEND_DIAGNOSTIC, 0);
+#endif
+		return;
+	}
+
+	/* Later expiry: report the total wait and freeze the queue once. */
+	mtx_lock(&sc->hs_lock);
+	xpt_print(ccb->ccb_h.path,
+		"%u: IO (reqp = 0x%p) did not return for %u seconds, %s.\n",
+		ticks, reqp, ccb->ccb_h.timeout * (reqp->retries+1) / 1000,
+		(sc->hs_frozen == 0)?
+		"freezing the queue" : "the queue is already frozen");
+	if (sc->hs_frozen == 0) {
+		sc->hs_frozen = 1;
+		xpt_freeze_simq(xpt_path_sim(ccb->ccb_h.path), 1);
+	}
+	mtx_unlock(&sc->hs_lock);
+	
+#if HVS_TIMEOUT_TEST
+	storvsc_timeout_test(reqp, MODE_SELECT_10, 1);
+#endif
+}
+#endif
+
+/**
+ * @brief StorVSC device poll function
+ *
+ * Services the channel by hand for the case where interrupts are
+ * disabled (i.e. when we are dumping core).  The function must be
+ * entered with the softc lock held; the lock is released while the
+ * channel callback runs and is held again when the function returns.
+ *
+ * @param sim a pointer to a CAM SCSI interface module
+ */
+static void
+storvsc_poll(struct cam_sim *sim)
+{
+	struct storvsc_softc *softc;
+
+	softc = cam_sim_softc(sim);
+	mtx_assert(&softc->hs_lock, MA_OWNED);
+
+	/* Run the channel callback without the softc lock. */
+	mtx_unlock(&softc->hs_lock);
+	hv_storvsc_on_channel_callback(softc->hs_chan, softc);
+	mtx_lock(&softc->hs_lock);
+}
+
+/**
+ * @brief StorVSC device action function
+ *
+ * This function is responsible for handling SCSI operations which
+ * are passed from the CAM layer.  The requests are in the form of
+ * CAM control blocks which indicate the action being performed.
+ * Not all actions require converting the request to a VSCSI protocol
+ * message - these actions can be responded to by this driver.
+ * Requests which are destined for a backend storage device are converted
+ * to a VSCSI protocol message and sent on the channel connection associated
+ * with this device.
+ *
+ * Must be called with the softc lock (hs_lock) held.  Every path either
+ * completes the ccb with xpt_done() or hands it to the host via
+ * hv_storvsc_io_request().  A request taken from hs_free_list is given
+ * back with storvsc_free_request() on every failure path, so a ccb that
+ * cannot be converted does not shrink the request pool.
+ *
+ * @param sim pointer to a CAM SCSI interface module
+ * @param ccb pointer to a CAM control block
+ */
+static void
+storvsc_action(struct cam_sim *sim, union ccb *ccb)
+{
+	struct storvsc_softc *sc = cam_sim_softc(sim);
+	int res;
+
+	mtx_assert(&sc->hs_lock, MA_OWNED);
+	switch (ccb->ccb_h.func_code) {
+	case XPT_PATH_INQ: {
+		/* Describe this virtual HBA to CAM. */
+		struct ccb_pathinq *cpi = &ccb->cpi;
+
+		cpi->version_num = 1;
+		cpi->hba_inquiry = PI_TAG_ABLE|PI_SDTR_ABLE;
+		cpi->target_sprt = 0;
+		cpi->hba_misc = PIM_NOBUSRESET;
+		if (hv_storvsc_use_pim_unmapped)
+			cpi->hba_misc |= PIM_UNMAPPED;
+		cpi->maxio = STORVSC_DATA_SIZE_MAX;
+		cpi->hba_eng_cnt = 0;
+		cpi->max_target = STORVSC_MAX_TARGETS;
+		cpi->max_lun = sc->hs_drv_props->drv_max_luns_per_target;
+		cpi->initiator_id = cpi->max_target;
+		cpi->bus_id = cam_sim_bus(sim);
+		cpi->base_transfer_speed = 300000;
+		cpi->transport = XPORT_SAS;
+		cpi->transport_version = 0;
+		cpi->protocol = PROTO_SCSI;
+		cpi->protocol_version = SCSI_REV_SPC2;
+		strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
+		strlcpy(cpi->hba_vid, sc->hs_drv_props->drv_name, HBA_IDLEN);
+		strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
+		cpi->unit_number = cam_sim_unit(sim);
+
+		ccb->ccb_h.status = CAM_REQ_CMP;
+		xpt_done(ccb);
+		return;
+	}
+	case XPT_GET_TRAN_SETTINGS: {
+		struct  ccb_trans_settings *cts = &ccb->cts;
+
+		cts->transport = XPORT_SAS;
+		cts->transport_version = 0;
+		cts->protocol = PROTO_SCSI;
+		cts->protocol_version = SCSI_REV_SPC2;
+
+		/* enable tag queuing and disconnected mode */
+		cts->proto_specific.valid = CTS_SCSI_VALID_TQ;
+		cts->proto_specific.scsi.valid = CTS_SCSI_VALID_TQ;
+		cts->proto_specific.scsi.flags = CTS_SCSI_FLAGS_TAG_ENB;
+		cts->xport_specific.valid = CTS_SPI_VALID_DISC;
+		cts->xport_specific.spi.flags = CTS_SPI_FLAGS_DISC_ENB;
+
+		ccb->ccb_h.status = CAM_REQ_CMP;
+		xpt_done(ccb);
+		return;
+	}
+	case XPT_SET_TRAN_SETTINGS:	{
+		/* Nothing to program; report success. */
+		ccb->ccb_h.status = CAM_REQ_CMP;
+		xpt_done(ccb);
+		return;
+	}
+	case XPT_CALC_GEOMETRY:{
+		cam_calc_geometry(&ccb->ccg, 1);
+		xpt_done(ccb);
+		return;
+	}
+	case  XPT_RESET_BUS:
+	case  XPT_RESET_DEV:{
+#if HVS_HOST_RESET
+		if ((res = hv_storvsc_host_reset(sc)) != 0) {
+			xpt_print(ccb->ccb_h.path,
+				"hv_storvsc_host_reset failed with %d\n", res);
+			ccb->ccb_h.status = CAM_PROVIDE_FAIL;
+			xpt_done(ccb);
+			return;
+		}
+		ccb->ccb_h.status = CAM_REQ_CMP;
+		xpt_done(ccb);
+		return;
+#else
+		xpt_print(ccb->ccb_h.path,
+				  "%s reset not supported.\n",
+				  (ccb->ccb_h.func_code == XPT_RESET_BUS)?
+				  "bus" : "dev");
+		ccb->ccb_h.status = CAM_REQ_INVALID;
+		xpt_done(ccb);
+		return;
+#endif	/* HVS_HOST_RESET */
+	}
+	case XPT_SCSI_IO:
+	case XPT_IMMED_NOTIFY: {
+		struct hv_storvsc_request *reqp = NULL;
+		bus_dmamap_t dmap_saved;
+
+		if (ccb->csio.cdb_len == 0) {
+			panic("cdl_len is 0\n");
+		}
+
+		/*
+		 * No free request: ask CAM to requeue the ccb and freeze
+		 * the SIM queue (once) until a request is available again.
+		 */
+		if (LIST_EMPTY(&sc->hs_free_list)) {
+			ccb->ccb_h.status = CAM_REQUEUE_REQ;
+			if (sc->hs_frozen == 0) {
+				sc->hs_frozen = 1;
+				xpt_freeze_simq(sim, /* count*/1);
+			}
+			xpt_done(ccb);
+			return;
+		}
+
+		reqp = LIST_FIRST(&sc->hs_free_list);
+		LIST_REMOVE(reqp, link);
+
+		/* Save the data_dmap before reset request */
+		dmap_saved = reqp->data_dmap;
+
+		/* XXX this is ugly */
+		bzero(reqp, sizeof(struct hv_storvsc_request));
+
+		/* Restore necessary bits */
+		reqp->data_dmap = dmap_saved;
+		reqp->softc = sc;
+
+		ccb->ccb_h.status |= CAM_SIM_QUEUED;
+		if ((res = create_storvsc_request(ccb, reqp)) != 0) {
+			ccb->ccb_h.status = CAM_REQ_INVALID;
+			/*
+			 * The request was already unlinked from the free
+			 * list above; give it back, otherwise every
+			 * rejected ccb permanently costs one request.
+			 */
+			storvsc_free_request(sc, reqp);
+			xpt_done(ccb);
+			return;
+		}
+
+#ifdef notyet
+		if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) {
+			callout_init(&reqp->callout, 1);
+			callout_reset_sbt(&reqp->callout,
+			    SBT_1MS * ccb->ccb_h.timeout, 0,
+			    storvsc_timeout, reqp, 0);
+#if HVS_TIMEOUT_TEST
+			cv_init(&reqp->event.cv, "storvsc timeout cv");
+			mtx_init(&reqp->event.mtx, "storvsc timeout mutex",
+					NULL, MTX_DEF);
+			switch (reqp->vstor_packet.vm_srb.cdb[0]) {
+				case MODE_SELECT_10:
+				case SEND_DIAGNOSTIC:
+					/* To have timer send the request. */
+					return;
+				default:
+					break;
+			}
+#endif /* HVS_TIMEOUT_TEST */
+		}
+#endif
+
+		if ((res = hv_storvsc_io_request(sc, reqp)) != 0) {
+			xpt_print(ccb->ccb_h.path,
+				"hv_storvsc_io_request failed with %d\n", res);
+			ccb->ccb_h.status = CAM_PROVIDE_FAIL;
+			storvsc_free_request(sc, reqp);
+			xpt_done(ccb);
+			return;
+		}
+		/* Sent: the ccb is completed later from the I/O done path. */
+		return;
+	}
+
+	default:
+		ccb->ccb_h.status = CAM_REQ_INVALID;
+		xpt_done(ccb);
+		return;
+	}
+}
+
+/**
+ * @brief destroy bounce buffer
+ *
+ * Gives back a Scatter/Gather list that was handed out by
+ * storvsc_create_bounce_buffer().  The first node of the in-use list
+ * is moved to the free list and made to carry @p sgl; if the in-use
+ * list is empty an error is printed and nothing else happens.
+ *
+ * @param sgl the Scatter/Gather list to give back
+ *
+ */
+static void
+storvsc_destroy_bounce_buffer(struct sglist *sgl)
+{
+	struct hv_sgl_node *node;
+
+	node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list);
+	if (node == NULL) {
+		printf("storvsc error: not enough in use sgl\n");
+		return;
+	}
+
+	/* Any in-use node will do as the carrier for the returned list. */
+	LIST_REMOVE(node, link);
+	node->sgl_data = sgl;
+	LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, node, link);
+}
+
+/**
+ * @brief create bounce buffer
+ *
+ * Takes a pre-built Scatter/Gather list from the global free pool,
+ * moves its node to the in-use list and prepares the first
+ * @p seg_count segments for a transfer.
+ *
+ * @param seg_count number of SG-list segments to prepare
+ * @param write if WRITE_TYPE, the list starts with zero used segments
+ * and every segment length is 0; otherwise all @p seg_count segments
+ * are marked used with a length of PAGE_SIZE.
+ *
+ * @returns the SG list, or NULL if the free pool is empty
+ */
+static struct sglist *
+storvsc_create_bounce_buffer(uint16_t seg_count, int write)
+{
+	struct hv_sgl_node *node;
+	struct sglist *sgl;
+	unsigned int seg_len;
+	int seg;
+
+	/* get struct sglist from free_sgl_list */
+	node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list);
+	if (node == NULL) {
+		printf("storvsc error: not enough free sgl\n");
+		return NULL;
+	}
+	LIST_REMOVE(node, link);
+	sgl = node->sgl_data;
+	LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, node, link);
+
+	sgl->sg_maxseg = seg_count;
+
+	if (write == WRITE_TYPE) {
+		sgl->sg_nseg = 0;
+		seg_len = 0;
+	} else {
+		sgl->sg_nseg = seg_count;
+		seg_len = PAGE_SIZE;
+	}
+
+	for (seg = 0; seg < seg_count; seg++)
+		sgl->sg_segs[seg].ss_len = seg_len;
+
+	return sgl;
+}
+
+/**
+ * @brief copy data from SG list to bounce buffer
+ *
+ * Copies the data of selected segments of a source SG list into the
+ * segment with the same index of the bounce SG list and records the
+ * copied length in that bounce segment.
+ *
+ * @param bounce_sgl - the destination SG list
+ * @param orig_sgl - the segments of the source SG list.
+ * @param orig_sgl_count - the count of source segments.
+ * @param seg_bits - bit mask indicating which segments need the bounce
+ *  buffer; bit N set means segment N is copied.
+ *
+ */
+static void
+storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl,
+			       bus_dma_segment_t *orig_sgl,
+			       unsigned int orig_sgl_count,
+			       uint64_t seg_bits)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < orig_sgl_count; idx++) {
+		/*
+		 * The mask is 64 bits wide and is built with 1ULL << i, so
+		 * it must be tested with a 64-bit shift as well; an int
+		 * shift is wrong for segment 31 and above.
+		 */
+		if ((seg_bits & (1ULL << idx)) == 0)
+			continue;
+
+		memcpy((void*)bounce_sgl->sg_segs[idx].ss_paddr,
+		    (void*)orig_sgl[idx].ds_addr,
+		    orig_sgl[idx].ds_len);
+
+		bounce_sgl->sg_segs[idx].ss_len = orig_sgl[idx].ds_len;
+	}
+}
+
+/**
+ * @brief copy data from SG list which used as bounce to another SG list
+ *
+ * Copies the data of selected bounce segments back into the segment
+ * with the same index of the destination SG list.  The number of bytes
+ * copied per segment is the length stored in the bounce segment.
+ *
+ * @param dest_sgl - the destination SG list's segments
+ * @param dest_sgl_count - the count of destination SG list's segment.
+ * @param src_sgl - the source (bounce) SG list.
+ * @param seg_bits - bit mask indicating which segments used the bounce
+ *  buffer; bit N set means segment N is copied back.
+ *
+ */
+void
+storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl,
+				    unsigned int dest_sgl_count,
+				    struct sglist* src_sgl,
+				    uint64_t seg_bits)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < dest_sgl_count; idx++) {
+		/*
+		 * Test the 64-bit mask with a 64-bit shift; an int shift is
+		 * wrong for segment 31 and above.
+		 */
+		if ((seg_bits & (1ULL << idx)) == 0)
+			continue;
+
+		memcpy((void*)(dest_sgl[idx].ds_addr),
+		    (void*)(src_sgl->sg_segs[idx].ss_paddr),
+		    src_sgl->sg_segs[idx].ss_len);
+	}
+}
+
+/**
+ * @brief check SG list with bounce buffer or not
+ *
+ * Decides whether an SG list needs bounce buffers.  A "hole" is found
+ * when a page-aligned segment follows an unaligned one, when an
+ * unaligned segment follows an aligned one, or when an unaligned
+ * segment does not start at the physical address where the previous
+ * unaligned segment ended.
+ *
+ * @param sgl - the SG list's segments
+ * @param sg_count - the count of SG list's segment.
+ * @param bits - out: bit N is set for every segment N whose start is
+ *  not page aligned; always written, and left 0 unless 0 is returned
+ *
+ * @returns 0 if bounce buffers are needed, -1 if the list can be used
+ *  as it is (also for lists with fewer than two segments)
+ */
+static int
+storvsc_check_bounce_buffer_sgl(bus_dma_segment_t *sgl,
+				unsigned int sg_count,
+				uint64_t *bits)
+{
+	unsigned int i;
+	int offset = 0;
+	uint64_t phys_addr = 0;
+	uint64_t tmp_bits = 0;
+	boolean_t found_hole = FALSE;
+	boolean_t pre_aligned = TRUE;
+
+	/* Define the output on every path, including the early return. */
+	*bits = 0;
+
+	if (sg_count < 2) {
+		return (-1);
+	}
+
+	phys_addr = vtophys(sgl[0].ds_addr);
+	offset = phys_addr - trunc_page(phys_addr);
+
+	if (offset != 0) {
+		pre_aligned = FALSE;
+		tmp_bits |= 1;
+	}
+
+	for (i = 1; i < sg_count; i++) {
+		phys_addr = vtophys(sgl[i].ds_addr);
+		offset = phys_addr - trunc_page(phys_addr);
+
+		if (offset == 0) {
+			if (FALSE == pre_aligned) {
+				/*
+				 * This segment is aligned, if the previous
+				 * one is not aligned, find a hole
+				 */
+				found_hole = TRUE;
+			}
+			pre_aligned = TRUE;
+		} else {
+			tmp_bits |= 1ULL << i;
+			if (!pre_aligned) {
+				if (phys_addr != vtophys(sgl[i-1].ds_addr +
+				    sgl[i-1].ds_len)) {
+					/*
+					 * Check whether connect to previous
+					 * segment,if not, find the hole
+					 */
+					found_hole = TRUE;
+				}
+			} else {
+				/* Unaligned start right after an aligned segment. */
+				found_hole = TRUE;
+			}
+			pre_aligned = FALSE;
+		}
+	}
+
+	if (!found_hole) {
+		return (-1);
+	}
+
+	*bits = tmp_bits;
+	return (0);
+}
+
+/**
+ * Copy bus_dma segments to multiple page buffer, which requires
+ * the pages are compact composed except for the 1st and last pages.
+ *
+ * This is the callback passed to bus_dmamap_load_ccb().  It fills the
+ * request's PRP list (total length, offset into the first page and one
+ * page frame number per segment) and records the segment count.
+ *
+ * @param arg the request (struct hv_storvsc_request *) being prepared
+ * @param segs DMA segments of the loaded buffer
+ * @param nsegs number of entries in @p segs
+ * @param error load status; when non-zero the segments are not valid
+ *  and the request is left untouched
+ */
+static void
+storvsc_xferbuf_prepare(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
+{
+	struct hv_storvsc_request *reqp = arg;
+	union ccb *ccb = reqp->ccb;
+	struct ccb_scsiio *csio = &ccb->csio;
+	struct storvsc_gpa_range *prplist;
+	int i;
+
+	/*
+	 * A failed load does not describe any memory; do not read
+	 * segs[0].  The caller sees the same error as the return value of
+	 * bus_dmamap_load_ccb() and fails the request.
+	 */
+	if (error != 0)
+		return;
+
+	prplist = &reqp->prp_list;
+	prplist->gpa_range.gpa_len = csio->dxfer_len;
+	prplist->gpa_range.gpa_ofs = segs[0].ds_addr & PAGE_MASK;
+
+	for (i = 0; i < nsegs; i++) {
+#ifdef INVARIANTS
+		/*
+		 * With more than one segment, the first must run to the end
+		 * of its page, the last must start on a page boundary and
+		 * every one in between must be exactly one aligned page.
+		 */
+		if (nsegs > 1) {
+			if (i == 0) {
+				KASSERT((segs[i].ds_addr & PAGE_MASK) +
+				    segs[i].ds_len == PAGE_SIZE,
+				    ("invalid 1st page, ofs 0x%jx, len %zu",
+				     (uintmax_t)segs[i].ds_addr,
+				     segs[i].ds_len));
+			} else if (i == nsegs - 1) {
+				KASSERT((segs[i].ds_addr & PAGE_MASK) == 0,
+				    ("invalid last page, ofs 0x%jx",
+				     (uintmax_t)segs[i].ds_addr));
+			} else {
+				KASSERT((segs[i].ds_addr & PAGE_MASK) == 0 &&
+				    segs[i].ds_len == PAGE_SIZE,
+				    ("not a full page, ofs 0x%jx, len %zu",
+				     (uintmax_t)segs[i].ds_addr,
+				     segs[i].ds_len));
+			}
+		}
+#endif
+		prplist->gpa_page[i] = atop(segs[i].ds_addr);
+	}
+	reqp->prp_cnt = nsegs;
+}
+
+/**
+ * @brief Fill in a request structure based on a CAM control block
+ *
+ * Fills in a request structure based on the contents of a CAM control
+ * block.  The request structure holds the payload information for
+ * VSCSI protocol request.
+ *
+ * Addressing, CDB, data direction and sense buffer are copied first.
+ * If the ccb carries data, the PRP (page) list is then built: through
+ * bus_dmamap_load_ccb() for CAM_DATA_BIO / CAM_DATA_VADDR, or by hand
+ * for CAM_DATA_SG, where segments that are not page aligned are
+ * redirected to pages of a bounce SG list.
+ *
+ * @param ccb pointer to a CAM control block
+ * @param reqp pointer to a request structure; reqp->softc and
+ *  reqp->data_dmap must already be set
+ * @returns 0 on success; EINVAL for an unexpected data direction, an
+ *  unknown data type or too many SG segments; ENOMEM if no bounce
+ *  buffer is available; otherwise the error from bus_dmamap_load_ccb()
+ */
+static int
+create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp)
+{
+	struct ccb_scsiio *csio = &ccb->csio;
+	uint64_t phys_addr;
+	uint32_t pfn;
+	uint64_t not_aligned_seg_bits = 0;
+	int error;
+
+	/* refer to struct vmscsi_req for meanings of these two fields */
+	reqp->vstor_packet.u.vm_srb.port =
+		cam_sim_unit(xpt_path_sim(ccb->ccb_h.path));
+	reqp->vstor_packet.u.vm_srb.path_id =
+		cam_sim_bus(xpt_path_sim(ccb->ccb_h.path));
+
+	reqp->vstor_packet.u.vm_srb.target_id = ccb->ccb_h.target_id;
+	reqp->vstor_packet.u.vm_srb.lun = ccb->ccb_h.target_lun;
+
+	/* The CDB is either referenced by pointer or embedded in the ccb. */
+	reqp->vstor_packet.u.vm_srb.cdb_len = csio->cdb_len;
+	if(ccb->ccb_h.flags & CAM_CDB_POINTER) {
+		memcpy(&reqp->vstor_packet.u.vm_srb.u.cdb, csio->cdb_io.cdb_ptr,
+			csio->cdb_len);
+	} else {
+		memcpy(&reqp->vstor_packet.u.vm_srb.u.cdb, csio->cdb_io.cdb_bytes,
+			csio->cdb_len);
+	}
+
+	if (hv_storvsc_use_win8ext_flags) {
+		reqp->vstor_packet.u.vm_srb.win8_extension.time_out_value = 60;
+		reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |=
+			SRB_FLAGS_DISABLE_SYNCH_TRANSFER;
+	}
+	switch (ccb->ccb_h.flags & CAM_DIR_MASK) {
+	case CAM_DIR_OUT:
+		reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE;
+		if (hv_storvsc_use_win8ext_flags) {
+			reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |=
+				SRB_FLAGS_DATA_OUT;
+		}
+		break;
+	case CAM_DIR_IN:
+		reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE;
+		if (hv_storvsc_use_win8ext_flags) {
+			reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |=
+				SRB_FLAGS_DATA_IN;
+		}
+		break;
+	case CAM_DIR_NONE:
+		reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE;
+		if (hv_storvsc_use_win8ext_flags) {
+			reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |=
+				SRB_FLAGS_NO_DATA_TRANSFER;
+		}
+		break;
+	default:
+		printf("Error: unexpected data direction: 0x%x\n",
+			ccb->ccb_h.flags & CAM_DIR_MASK);
+		return (EINVAL);
+	}
+
+	reqp->sense_data     = &csio->sense_data;
+	reqp->sense_info_len = csio->sense_len;
+
+	/* Link request and ccb to each other. */
+	reqp->ccb = ccb;
+	ccb->ccb_h.spriv_ptr0 = reqp;
+
+	/* No data phase: nothing to map. */
+	if (0 == csio->dxfer_len) {
+		return (0);
+	}
+
+	switch (ccb->ccb_h.flags & CAM_DATA_MASK) {
+	case CAM_DATA_BIO:
+	case CAM_DATA_VADDR:
+		/* storvsc_xferbuf_prepare() fills the PRP list from the map. */
+		error = bus_dmamap_load_ccb(reqp->softc->storvsc_req_dtag,
+		    reqp->data_dmap, ccb, storvsc_xferbuf_prepare, reqp,
+		    BUS_DMA_NOWAIT);
+		if (error) {
+			xpt_print(ccb->ccb_h.path,
+			    "bus_dmamap_load_ccb failed: %d\n", error);
+			return (error);
+		}
+		if ((ccb->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO)
+			reqp->softc->sysctl_data.data_bio_cnt++;
+		else
+			reqp->softc->sysctl_data.data_vaddr_cnt++;
+		break;
+
+	case CAM_DATA_SG:
+	{
+		struct storvsc_gpa_range *prplist;
+		int i = 0;
+		int offset = 0;
+		int ret;
+
+		bus_dma_segment_t *storvsc_sglist =
+		    (bus_dma_segment_t *)ccb->csio.data_ptr;
+		u_int16_t storvsc_sg_count = ccb->csio.sglist_cnt;
+
+		prplist = &reqp->prp_list;
+		prplist->gpa_range.gpa_len = csio->dxfer_len;
+
+		printf("Storvsc: get SG I/O operation, %d\n",
+		    reqp->vstor_packet.u.vm_srb.data_in);
+
+		if (storvsc_sg_count > STORVSC_DATA_SEGCNT_MAX){
+			printf("Storvsc: %d segments is too much, "
+			    "only support %d segments\n",
+			    storvsc_sg_count, STORVSC_DATA_SEGCNT_MAX);
+			return (EINVAL);
+		}
+
+		/*
+		 * We create our own bounce buffer function currently. Ideally
+		 * we should use BUS_DMA(9) framework. But with current BUS_DMA
+		 * code there is no callback API to check the page alignment of
+		 * middle segments before busdma can decide if a bounce buffer
+		 * is needed for particular segment. There is callback,
+		 * "bus_dma_filter_t *filter", but the parameters are not
+		 * sufficient for storvsc driver.
+		 * TODO:
+		 *	Add page alignment check in BUS_DMA(9) callback. Once
+		 *	this is complete, switch the following code to use
+		 *	BUS_DMA(9) for storvsc bounce buffer support.
+		 */
+		/* check if we need to create bounce buffer */
+		ret = storvsc_check_bounce_buffer_sgl(storvsc_sglist,
+		    storvsc_sg_count, &not_aligned_seg_bits);
+		if (ret != -1) {
+			reqp->bounce_sgl =
+			    storvsc_create_bounce_buffer(storvsc_sg_count,
+			    reqp->vstor_packet.u.vm_srb.data_in);
+			if (NULL == reqp->bounce_sgl) {
+				printf("Storvsc_error: "
+				    "create bounce buffer failed.\n");
+				return (ENOMEM);
+			}
+
+			reqp->bounce_sgl_count = storvsc_sg_count;
+			reqp->not_aligned_seg_bits = not_aligned_seg_bits;
+
+			/*
+			 * if it is write, we need copy the original data
+			 * to bounce buffer
+			 */
+			if (WRITE_TYPE == reqp->vstor_packet.u.vm_srb.data_in) {
+				storvsc_copy_sgl_to_bounce_buf(
+				    reqp->bounce_sgl,
+				    storvsc_sglist,
+				    storvsc_sg_count,
+				    reqp->not_aligned_seg_bits);
+			}
+
+			/* transfer virtual address to physical frame number */
+			if (reqp->not_aligned_seg_bits & 0x1){
+				phys_addr =
+				    vtophys(reqp->bounce_sgl->sg_segs[0].ss_paddr);
+			}else{
+				phys_addr =
+					vtophys(storvsc_sglist[0].ds_addr);
+			}
+			prplist->gpa_range.gpa_ofs = phys_addr & PAGE_MASK;
+
+			pfn = phys_addr >> PAGE_SHIFT;
+			prplist->gpa_page[0] = pfn;
+
+			for (i = 1; i < storvsc_sg_count; i++) {
+				/*
+				 * The mask is 64 bits wide and was built
+				 * with 1ULL << i; an int shift here would
+				 * be wrong for segment 31 and above.
+				 */
+				if (reqp->not_aligned_seg_bits & (1ULL << i)) {
+					phys_addr =
+					    vtophys(reqp->bounce_sgl->sg_segs[i].ss_paddr);
+				} else {
+					phys_addr =
+					    vtophys(storvsc_sglist[i].ds_addr);
+				}
+
+				pfn = phys_addr >> PAGE_SHIFT;
+				prplist->gpa_page[i] = pfn;
+			}
+			reqp->prp_cnt = i;
+		} else {
+			/* No bounce buffer: describe the caller's pages. */
+			phys_addr = vtophys(storvsc_sglist[0].ds_addr);
+
+			prplist->gpa_range.gpa_ofs = phys_addr & PAGE_MASK;
+
+			for (i = 0; i < storvsc_sg_count; i++) {
+				phys_addr = vtophys(storvsc_sglist[i].ds_addr);
+				pfn = phys_addr >> PAGE_SHIFT;
+				prplist->gpa_page[i] = pfn;
+			}
+			reqp->prp_cnt = i;
+
+			/* check the last segment cross boundary or not */
+			offset = phys_addr & PAGE_MASK;
+			if (offset) {
+				/*
+				 * Add one more PRP entry
+				 * NOTE(review): this writes gpa_page[i] with
+				 * i == storvsc_sg_count; confirm the array
+				 * has room for STORVSC_DATA_SEGCNT_MAX + 1
+				 * entries.
+				 */
+				phys_addr =
+				    vtophys(storvsc_sglist[i-1].ds_addr +
+				    PAGE_SIZE - offset);
+				pfn = phys_addr >> PAGE_SHIFT;
+				prplist->gpa_page[i] = pfn;
+				reqp->prp_cnt++;
+			}
+
+			reqp->bounce_sgl_count = 0;
+		}
+		reqp->softc->sysctl_data.data_sg_cnt++;
+		break;
+	}
+	default:
+		printf("Unknow flags: %d\n", ccb->ccb_h.flags);
+		return(EINVAL);
+	}
+
+	return(0);
+}
+
+/*
+ * Tell whether INQUIRY data describes a usable device.
+ *
+ * Returns 0 when SID_TYPE() reports T_NODEVICE or SID_QUAL() reports
+ * SID_QUAL_BAD_LU, and 1 otherwise.
+ */
+static uint32_t
+is_scsi_valid(const struct scsi_inquiry_data *inq_data)
+{
+	const u_int8_t dev_type = SID_TYPE(inq_data);
+	const int bad_lu = (SID_QUAL(inq_data) == SID_QUAL_BAD_LU);
+
+	return ((dev_type != T_NODEVICE && !bad_lu) ? 1 : 0);
+}
+
+/**
+ * @brief completion function before returning to CAM
+ *
+ * I/O process has been completed and the result needs
+ * to be passed to the CAM layer.
+ * Free resources related to this request.
+ *
+ * @param reqp pointer to a request structure
+ */
+static void
+storvsc_io_done(struct hv_storvsc_request *reqp)
+{
+	union ccb *ccb = reqp->ccb;
+	struct ccb_scsiio *csio = &ccb->csio;
+	struct storvsc_softc *sc = reqp->softc;
+	struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb;
+	bus_dma_segment_t *ori_sglist = NULL;
+	int ori_sg_count = 0;
+	const struct scsi_generic *cmd;
+
+	/* destroy bounce buffer if it is used */
+	if (reqp->bounce_sgl_count) {
+		ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr;
+		ori_sg_count = ccb->csio.sglist_cnt;
+
+		/*
+		 * If it is READ operation, we should copy back the data
+		 * to original SG list.
+		 */
+		if (READ_TYPE == reqp->vstor_packet.u.vm_srb.data_in) {
+			storvsc_copy_from_bounce_buf_to_sgl(ori_sglist,
+			    ori_sg_count,
+			    reqp->bounce_sgl,
+			    reqp->not_aligned_seg_bits);
+		}
+
+		storvsc_destroy_bounce_buffer(reqp->bounce_sgl);
+		reqp->bounce_sgl_count = 0;
+	}
+		
+	if (reqp->retries > 0) {
+		mtx_lock(&sc->hs_lock);
+#if HVS_TIMEOUT_TEST
+		xpt_print(ccb->ccb_h.path,
+			"%u: IO returned after timeout, "
+			"waking up timer handler if any.\n", ticks);
+		mtx_lock(&reqp->event.mtx);
+		cv_signal(&reqp->event.cv);
+		mtx_unlock(&reqp->event.mtx);
+#endif
+		reqp->retries = 0;
+		xpt_print(ccb->ccb_h.path,
+			"%u: IO returned after timeout, "
+			"stopping timer if any.\n", ticks);
+		mtx_unlock(&sc->hs_lock);
+	}
+
+#ifdef notyet
+	/*
+	 * callout_drain() will wait for the timer handler to finish
+	 * if it is running. So we don't need any lock to synchronize
+	 * between this routine and the timer handler.
+	 * Note that we need to make sure reqp is not freed when timer
+	 * handler is using or will use it.
+	 */
+	if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) {
+		callout_drain(&reqp->callout);
+	}
+#endif
+	cmd = (const struct scsi_generic *)
+	    ((ccb->ccb_h.flags & CAM_CDB_POINTER) ?
+	     csio->cdb_io.cdb_ptr : csio->cdb_io.cdb_bytes);
+
+	ccb->ccb_h.status &= ~CAM_SIM_QUEUED;
+	ccb->ccb_h.status &= ~CAM_STATUS_MASK;
+	int srb_status = SRB_STATUS(vm_srb->srb_status);
+#ifdef DIAGNOSTIC
+	if (hv_storvsc_srb_status != -1) {
+		srb_status = SRB_STATUS(hv_storvsc_srb_status & 0x3f);
+		hv_storvsc_srb_status = -1;
+	}
+#endif /* DIAGNOSTIC */
+	if (vm_srb->scsi_status == SCSI_STATUS_OK) {
+		if (srb_status != SRB_STATUS_SUCCESS) {
+			bool log_error = true;
+			switch (srb_status) {
+				case SRB_STATUS_PENDING:
+					/* We should never get this */
+					panic("storvsc_io_done: SRB_STATUS_PENDING");
+					break;
+				case SRB_STATUS_ABORTED:
+					/*
+					 * storvsc doesn't support aborts yet
+					 * but if we ever get this status
+					 * the I/O is complete - treat it as a
+					 * timeout
+					 */
+					ccb->ccb_h.status |= CAM_CMD_TIMEOUT;
+					break;
+				case SRB_STATUS_ABORT_FAILED:
+					/* We should never get this */
+					panic("storvsc_io_done: SRB_STATUS_ABORT_FAILED");
+					break;
+				case SRB_STATUS_ERROR:
+					/*
+					 * We should never get this.
+					 * Treat it as a CAM_UNREC_HBA_ERROR.
+					 * It will be retried
+					 */
+					ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+					break;
+				case SRB_STATUS_BUSY:
+					/* Host is busy. Delay and retry */
+					ccb->ccb_h.status |= CAM_BUSY;
+					break;
+				case SRB_STATUS_INVALID_REQUEST:
+				case SRB_STATUS_INVALID_PATH_ID:
+				case SRB_STATUS_NO_DEVICE:
+				case SRB_STATUS_INVALID_TARGET_ID:
+					/*
+					 * These indicate an invalid address
+					 * and really should never be seen.
+					 * A CAM_PATH_INVALID could be
+					 * used here but I want to run
+					 * down retries.  Do a CAM_BUSY
+					 * since the host might be having issues.
+					 */
+					ccb->ccb_h.status |= CAM_BUSY;
+					break;
+				case SRB_STATUS_TIMEOUT:
+				case SRB_STATUS_COMMAND_TIMEOUT:
+					/* The backend has timed this out */
+					ccb->ccb_h.status |= CAM_BUSY;
+					break;
+				/* Some old pSCSI errors below */
+				case SRB_STATUS_SELECTION_TIMEOUT:
+				case SRB_STATUS_MESSAGE_REJECTED:
+				case SRB_STATUS_PARITY_ERROR:
+				case SRB_STATUS_NO_HBA:
+				case SRB_STATUS_DATA_OVERRUN:
+				case SRB_STATUS_UNEXPECTED_BUS_FREE:
+				case SRB_STATUS_PHASE_SEQUENCE_FAILURE:
+					/*
+					 * Old pSCSI responses, should never get.
+					 * If we do treat as a CAM_UNREC_HBA_ERROR
+					 * which will be retried
+					 */
+					ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+					break;
+				case SRB_STATUS_BUS_RESET:
+					ccb->ccb_h.status |= CAM_SCSI_BUS_RESET;
+					break;
+				case SRB_STATUS_BAD_SRB_BLOCK_LENGTH:
+					/*
+					 * The request block is malformed and
+					 * I doubt it is from the guest. Just retry.
+					 */
+					ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+					break;
+				/* Not used statuses just retry */
+				case SRB_STATUS_REQUEST_FLUSHED:
+				case SRB_STATUS_BAD_FUNCTION:
+				case SRB_STATUS_NOT_POWERED:
+					ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+					break;
+				case SRB_STATUS_INVALID_LUN:
+					/*
+					 * Don't log an EMS for this response since
+					 * there is no device at this LUN. This is a
+					 * normal and expected response when a device
+					 * is detached.
+					 */
+					ccb->ccb_h.status |= CAM_DEV_NOT_THERE;
+					log_error = false;
+					break;
+				case SRB_STATUS_ERROR_RECOVERY:
+				case SRB_STATUS_LINK_DOWN:
+					/*
+					 * I don't ever expect these from
+					 * the host but if we ever get
+					 * retry after a delay
+					 */
+					ccb->ccb_h.status |= CAM_BUSY;
+					break;
+				default:
+					/*
+					 * An undefined response assert on
+					 * on debug builds else retry
+					 */
+					ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+					KASSERT(srb_status <= SRB_STATUS_LINK_DOWN,
+					    ("storvsc: %s, unexpected srb_status of 0x%x",
+					    __func__, srb_status));
+					break;
+			}
+			if (log_error) {
+				xpt_print(ccb->ccb_h.path, "The hypervisor's I/O adapter "
+					"driver received an unexpected response code 0x%x "
+					"for operation: %s. If this continues to occur, "
+					"report the condition to your hypervisor vendor so "
+					"they can rectify the issue.\n", srb_status,
+					scsi_op_desc(cmd->opcode, NULL));
+			}
+		} else {
+			ccb->ccb_h.status |= CAM_REQ_CMP;
+		}
+
+		if (cmd->opcode == INQUIRY &&
+		    srb_status == SRB_STATUS_SUCCESS) {
+			int resp_xfer_len, resp_buf_len, data_len;
+			uint8_t *resp_buf = (uint8_t *)csio->data_ptr;
+			struct scsi_inquiry_data *inq_data =
+			    (struct scsi_inquiry_data *)csio->data_ptr;
+
+			/* Get the buffer length reported by host */
+			resp_xfer_len = vm_srb->transfer_len;
+
+			/* Get the available buffer length */
+			resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0;
+			data_len = (resp_buf_len < resp_xfer_len) ?
+			    resp_buf_len : resp_xfer_len;
+			if (bootverbose && data_len >= 5) {
+				xpt_print(ccb->ccb_h.path, "storvsc inquiry "
+				    "(%d) [%x %x %x %x %x ... ]\n", data_len,
+				    resp_buf[0], resp_buf[1], resp_buf[2],
+				    resp_buf[3], resp_buf[4]);
+			}
+			/*
+			 * XXX: Hyper-V (since win2012r2) responses inquiry with
+			 * unknown version (0) for GEN-2 DVD device.
+			 * Manually set the version number to SPC3 in order to
+			 * ask CAM to continue probing with "PROBE_REPORT_LUNS".
+			 * see probedone() in scsi_xpt.c
+			 */
+			if (SID_TYPE(inq_data) == T_CDROM &&
+			    inq_data->version == 0 &&
+			    (vmstor_proto_version >= VMSTOR_PROTOCOL_VERSION_WIN8)) {
+				inq_data->version = SCSI_REV_SPC3;
+				if (bootverbose) {
+					xpt_print(ccb->ccb_h.path,
+					    "set version from 0 to %d\n",
+					    inq_data->version);
+				}
+			}
+			/*
+			 * XXX: Manually fix the wrong response returned from WS2012
+			 */
+			if (!is_scsi_valid(inq_data) &&
+			    (vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8_1 ||
+			    vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8 ||
+			    vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN7)) {
+				if (data_len >= 4 &&
+				    (resp_buf[2] == 0 || resp_buf[3] == 0)) {
+					resp_buf[2] = SCSI_REV_SPC3;
+					resp_buf[3] = 2; // resp fmt must be 2
+					if (bootverbose)
+						xpt_print(ccb->ccb_h.path,
+						    "fix version and resp fmt for 0x%x\n",
+						    vmstor_proto_version);
+				}
+			} else if (data_len >= SHORT_INQUIRY_LENGTH) {
+				char vendor[16];
+
+				cam_strvis(vendor, inq_data->vendor,
+				    sizeof(inq_data->vendor), sizeof(vendor));
+				/*
+				 * XXX: Upgrade SPC2 to SPC3 if host is WIN8 or
+				 * WIN2012 R2 in order to support UNMAP feature.
+				 */
+				if (!strncmp(vendor, "Msft", 4) &&
+				    SID_ANSI_REV(inq_data) == SCSI_REV_SPC2 &&
+				    (vmstor_proto_version ==
+				     VMSTOR_PROTOCOL_VERSION_WIN8_1 ||
+				     vmstor_proto_version ==
+				     VMSTOR_PROTOCOL_VERSION_WIN8)) {
+					inq_data->version = SCSI_REV_SPC3;
+					if (bootverbose) {
+						xpt_print(ccb->ccb_h.path,
+						    "storvsc upgrades "
+						    "SPC2 to SPC3\n");
+					}
+				}
+			}
+		}
+	} else {
+		/**
+		 * On Some Windows hosts TEST_UNIT_READY command can return
+		 * SRB_STATUS_ERROR and sense data, for example, asc=0x3a,1
+		 * "(Medium not present - tray closed)". This error can be
+		 * ignored since it will be sent to host periodically.
+		 */
+		boolean_t unit_not_ready = \
+		    vm_srb->scsi_status == SCSI_STATUS_CHECK_COND &&
+		    cmd->opcode == TEST_UNIT_READY &&
+		    srb_status == SRB_STATUS_ERROR;
+		if (!unit_not_ready && bootverbose) {
+			mtx_lock(&sc->hs_lock);
+			xpt_print(ccb->ccb_h.path,
+				"storvsc scsi_status = %d, srb_status = %d\n",
+				vm_srb->scsi_status, srb_status);
+			mtx_unlock(&sc->hs_lock);
+		}
+		ccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR;
+	}
+
+	ccb->csio.scsi_status = (vm_srb->scsi_status & 0xFF);
+	if (srb_status == SRB_STATUS_SUCCESS ||
+	    srb_status == SRB_STATUS_DATA_OVERRUN)
+		ccb->csio.resid = ccb->csio.dxfer_len - vm_srb->transfer_len;
+	else
+		ccb->csio.resid = ccb->csio.dxfer_len;
+
+	if ((vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID) != 0 &&
+	    reqp->sense_info_len != 0) {
+		csio->sense_resid = csio->sense_len - reqp->sense_info_len;
+		ccb->ccb_h.status |= CAM_AUTOSNS_VALID;
+	}
+
+	mtx_lock(&sc->hs_lock);
+	if (reqp->softc->hs_frozen == 1) {
+		xpt_print(ccb->ccb_h.path,
+			"%u: storvsc unfreezing softc 0x%p.\n",
+			ticks, reqp->softc);
+		ccb->ccb_h.status |= CAM_RELEASE_SIMQ;
+		reqp->softc->hs_frozen = 0;
+	}
+	storvsc_free_request(sc, reqp);
+	mtx_unlock(&sc->hs_lock);
+
+	xpt_done_direct(ccb);
+}
+
+/**
+ * @brief Return a request structure to the free list
+ *
+ * Links reqp at the head of sc->hs_free_list so it can be reused.  No lock
+ * is taken here; the completion path above calls this with hs_lock held.
+ *
+ * @param sc pointer to the softc that owns the free list
+ * @param reqp pointer to the request structure being released
+ */
+static void
+storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp)
+{
+	LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link);
+}
+
+/**
+ * @brief Classify a storage device by its type GUID
+ *
+ * Asks the parent bus to match the device against the BlkVSC (paravirtual
+ * IDE) GUID first and the StorVSC (paravirtual SCSI) GUID second.
+ * @param dev a device
+ * @returns DRIVER_BLKVSC, DRIVER_STORVSC or DRIVER_UNKNOWN (no match)
+ */
+static enum hv_storage_type
+storvsc_get_storage_type(device_t dev)
+{
+	device_t bus = device_get_parent(dev);
+	enum hv_storage_type kind = DRIVER_UNKNOWN;
+
+	if (VMBUS_PROBE_GUID(bus, dev, &gBlkVscDeviceType) == 0)
+		kind = DRIVER_BLKVSC;
+	else if (VMBUS_PROBE_GUID(bus, dev, &gStorVscDeviceType) == 0)
+		kind = DRIVER_STORVSC;
+	return (kind);
+}
+
+#define	PCI_VENDOR_INTEL	0x8086
+#define	PCI_PRODUCT_PIIX4	0x7111
+
+/*
+ * ada_probe_veto handler.  The ATA disks are shared with the controllers
+ * managed by this driver, so veto the attachment of ATA-protocol devices
+ * whose HBA is the Intel PIIX4 controller; those disks will be attached
+ * as SCSI disks once this driver has attached.
+ */
+static void
+storvsc_ada_probe_veto(void *arg __unused, struct cam_path *path,
+    struct ata_params *ident_buf __unused, int *veto)
+{
+	struct ccb_pathinq cpi;
+
+	if (path->device->protocol != PROTO_ATA)
+		return;
+
+	xpt_path_inq(&cpi, path);
+	if (cpi.ccb_h.status != CAM_REQ_CMP ||
+	    cpi.hba_vendor != PCI_VENDOR_INTEL ||
+	    cpi.hba_device != PCI_PRODUCT_PIIX4)
+		return;
+
+	(*veto)++;
+	if (bootverbose) {
+		xpt_print(path, "Disable ATA disks on "
+		    "simulated ATA controller (0x%04x%04x)\n",
+		    cpi.hba_device, cpi.hba_vendor);
+	}
+}
+
+static void
+storvsc_sysinit(void *arg __unused)
+{
+	/* The ada_probe_veto hook is only installed on a Hyper-V guest. */
+	if (vm_guest == VM_GUEST_HV)
+		storvsc_handler_tag = EVENTHANDLER_REGISTER(ada_probe_veto,
+		    storvsc_ada_probe_veto, NULL, EVENTHANDLER_PRI_ANY);
+}
+SYSINIT(storvsc_sys_init, SI_SUB_DRIVERS, SI_ORDER_SECOND, storvsc_sysinit,
+    NULL);
+
+static void
+storvsc_sysuninit(void *arg __unused)
+{
+	if (storvsc_handler_tag != NULL)	/* skip if no tag was stored */
+		EVENTHANDLER_DEREGISTER(ada_probe_veto, storvsc_handler_tag);
+}
+SYSUNINIT(storvsc_sys_uninit, SI_SUB_DRIVERS, SI_ORDER_SECOND,
+    storvsc_sysuninit, NULL);
diff --git a/sys/dev/hyperv/storvsc/hv_vstorage.h b/sys/dev/hyperv/storvsc/hv_vstorage.h
new file mode 100644
index 000000000000..f1d4c1dfd2e2
--- /dev/null
+++ b/sys/dev/hyperv/storvsc/hv_vstorage.h
@@ -0,0 +1,311 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2009-2012,2017 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __HV_VSTORAGE_H__
+#define __HV_VSTORAGE_H__
+
+/*
+ * Major/minor macros.  Major is bits 15:8 and minor is bits 7:0, so earlier
+ * flat version numbers are interpreted as "0.x" (i.e., 1 becomes 0.1).
+ */
+
+#define VMSTOR_PROTOCOL_MAJOR(VERSION_)         (((VERSION_) >> 8) & 0xff)
+#define VMSTOR_PROTOCOL_MINOR(VERSION_)         (((VERSION_)     ) & 0xff)
+#define VMSTOR_PROTOCOL_VERSION(MAJOR_, MINOR_) ((((MAJOR_) & 0xff) << 8) | \
+                                                 (((MINOR_) & 0xff)     ))
+
+#define VMSTOR_PROTOCOL_VERSION_WIN6       VMSTOR_PROTOCOL_VERSION(2, 0)
+#define VMSTOR_PROTOCOL_VERSION_WIN7       VMSTOR_PROTOCOL_VERSION(4, 2)
+#define VMSTOR_PROTOCOL_VERSION_WIN8       VMSTOR_PROTOCOL_VERSION(5, 1)
+#define VMSTOR_PROTOCOL_VERSION_WIN8_1     VMSTOR_PROTOCOL_VERSION(6, 0)
+#define VMSTOR_PROTOCOL_VERSION_WIN10      VMSTOR_PROTOCOL_VERSION(6, 2)
+/*
+ * Invalid version; parenthesized so it expands safely inside any expression.
+ */
+#define VMSTOR_INVALID_PROTOCOL_VERSION  (-1)
+
+/*
+ * Version history:
+ * V1 Beta                    0.1
+ * V1 RC < 2008/1/31          1.0
+ * V1 RC > 2008/1/31          2.0
+ * Win7: 4.2
+ * Win8: 5.1
+ */
+
+#define VMSTOR_PROTOCOL_VERSION_CURRENT	VMSTOR_PROTOCOL_VERSION(5, 1)
+
+/**
+ *  Operation codes carried in the 'operation' field of struct vstor_packet.
+ */
+enum vstor_packet_ops {
+	VSTOR_OPERATION_COMPLETEIO            = 1,
+	VSTOR_OPERATION_REMOVEDEVICE          = 2,
+	VSTOR_OPERATION_EXECUTESRB            = 3,
+	VSTOR_OPERATION_RESETLUN              = 4,
+	VSTOR_OPERATION_RESETADAPTER          = 5,
+	VSTOR_OPERATION_RESETBUS              = 6,
+	VSTOR_OPERATION_BEGININITIALIZATION   = 7,
+	VSTOR_OPERATION_ENDINITIALIZATION     = 8,
+	VSTOR_OPERATION_QUERYPROTOCOLVERSION  = 9,
+	VSTOR_OPERATION_QUERYPROPERTIES       = 10,
+	VSTOR_OPERATION_ENUMERATE_BUS         = 11,
+	VSTOR_OPERATION_FCHBA_DATA            = 12,
+	VSTOR_OPERATION_CREATE_MULTI_CHANNELS = 13,
+	VSTOR_OPERATION_MAXIMUM               = 13	/* same value as the last op */
+};
+
+
+/*
+ *  Platform neutral description of a scsi request -
+ *  this remains the same across the wire regardless of 32/64 bit
+ *  note: it's patterned off the Windows DDK SCSI_PASS_THROUGH structure
+ */
+
+#define CDB16GENERIC_LENGTH			0x10
+#define SENSE_BUFFER_SIZE			0x14
+#define MAX_DATA_BUFFER_LENGTH_WITH_PADDING	0x14
+
+#define POST_WIN7_STORVSC_SENSE_BUFFER_SIZE	0x14
+#define PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE	0x12
+
+
+struct vmscsi_win8_extension {
+	/*
+	 * The following were added in Windows 8; struct vmscsi_req embeds them.
+	 */
+	uint16_t reserve;
+	uint8_t  queue_tag;
+	uint8_t  queue_action;
+	uint32_t srb_flags;	/* presumably SRB_FLAGS_* bits -- TODO confirm */
+	uint32_t time_out_value;
+	uint32_t queue_sort_ey;	/* sic; presumably "queue_sort_key" */
+} __packed;
+
+struct vmscsi_req {
+	uint16_t length;
+	uint8_t  srb_status;	/* SRB_STATUS_* code plus SRB status mask bits */
+	uint8_t  scsi_status;	/* SCSI status byte of the command */
+
+	/* HBA number, set to the order number detected by initiator. */
+	uint8_t  port;
+	/* SCSI bus number or bus_id, different from CAM's path_id. */
+	uint8_t  path_id;
+
+	uint8_t  target_id;
+	uint8_t  lun;
+
+	uint8_t  cdb_len;
+	uint8_t  sense_info_len;
+	uint8_t  data_in;
+	uint8_t  reserved;
+
+	uint32_t transfer_len;
+
+	/* The three arrays below overlay each other (same storage). */
+	union {
+	    uint8_t cdb[CDB16GENERIC_LENGTH];
+
+	    uint8_t sense_data[SENSE_BUFFER_SIZE];
+
+	    uint8_t reserved_array[MAX_DATA_BUFFER_LENGTH_WITH_PADDING];
+	} u;
+
+	/*
+	 * The following was added in win8.
+	 */
+	struct vmscsi_win8_extension win8_extension;
+
+} __packed;
+
+/**
+ *  This structure is sent during the initialization phase to get the different
+ *  properties of the channel (the chan_props member of struct vstor_packet).
+ */
+
+struct vmstor_chan_props {
+	uint16_t proto_ver;
+	uint8_t  path_id;
+	uint8_t  target_id;
+
+	uint16_t max_channel_cnt;
+
+	/**
+	 * Note: port number is only really known on the client side
+	 */
+	uint16_t port;
+	uint32_t flags;	/* presumably STORAGE_CHANNEL_*_FLAG bits -- confirm */
+	uint32_t max_transfer_bytes;
+
+	/**
+	 *  This id is unique for each channel and will correspond with
+	 *  vendor specific data in the inquiry data
+	 */
+	uint64_t unique_id;
+
+} __packed;
+
+/**
+ *  This structure is sent during the storage protocol negotiations.
+ */
+
+struct vmstor_proto_ver
+{
+	/**
+	 * Major and minor version numbers in one 16-bit value (presumably the
+	 * VMSTOR_PROTOCOL_VERSION() layout: major high byte -- TODO confirm).
+	 */
+	uint16_t major_minor;
+
+	uint16_t revision;			/* always zero */
+} __packed;
+
+/**
+ * Channel Property Flags
+ */
+
+#define STORAGE_CHANNEL_REMOVABLE_FLAG                  0x1
+#define STORAGE_CHANNEL_EMULATED_IDE_FLAG               0x2
+
+
+struct vstor_packet {
+	/**
+	 * Requested operation type (one of enum vstor_packet_ops)
+	 */
+	enum vstor_packet_ops operation;
+
+	/*
+	 * Flags - see the "Packet flags" section below for values
+	 */
+	uint32_t flags;
+
+	/**
+	 * Status of the request returned from the server side.
+	 */
+	uint32_t status;
+
+	union
+	{
+	    /**
+	     * Structure used to forward SCSI commands from the client to
+	     * the server.
+	     */
+	    struct vmscsi_req vm_srb;
+
+	    /**
+	     * Structure used to query channel properties.
+	     */
+	    struct vmstor_chan_props chan_props;
+
+	    /**
+	     * Used during version negotiations.
+	     */
+	    struct vmstor_proto_ver version;
+
+	    /**
+	     * Number of multichannels to create
+	     */
+	    uint16_t multi_channels_cnt;
+	} u;
+
+} __packed;
+
+
+/**
+ * SRB (SCSI Request Block) Status Codes (the value SRB_STATUS() extracts)
+ */
+#define SRB_STATUS_PENDING                  0x00
+#define SRB_STATUS_SUCCESS                  0x01
+#define SRB_STATUS_ABORTED                  0x02
+#define SRB_STATUS_ABORT_FAILED             0x03
+#define SRB_STATUS_ERROR                    0x04
+#define SRB_STATUS_BUSY                     0x05
+#define SRB_STATUS_INVALID_REQUEST          0x06
+#define SRB_STATUS_INVALID_PATH_ID          0x07
+#define SRB_STATUS_NO_DEVICE                0x08
+#define SRB_STATUS_TIMEOUT                  0x09
+#define SRB_STATUS_SELECTION_TIMEOUT        0x0A
+#define SRB_STATUS_COMMAND_TIMEOUT          0x0B
+#define SRB_STATUS_MESSAGE_REJECTED         0x0D
+#define SRB_STATUS_BUS_RESET                0x0E
+#define SRB_STATUS_PARITY_ERROR             0x0F
+#define SRB_STATUS_REQUEST_SENSE_FAILED     0x10
+#define SRB_STATUS_NO_HBA                   0x11
+#define SRB_STATUS_DATA_OVERRUN             0x12
+#define SRB_STATUS_UNEXPECTED_BUS_FREE      0x13
+#define SRB_STATUS_PHASE_SEQUENCE_FAILURE   0x14
+#define SRB_STATUS_BAD_SRB_BLOCK_LENGTH     0x15
+#define SRB_STATUS_REQUEST_FLUSHED          0x16
+#define SRB_STATUS_INVALID_LUN              0x20
+#define SRB_STATUS_INVALID_TARGET_ID        0x21
+#define SRB_STATUS_BAD_FUNCTION             0x22
+#define SRB_STATUS_ERROR_RECOVERY           0x23
+#define SRB_STATUS_NOT_POWERED              0x24
+#define SRB_STATUS_LINK_DOWN                0x25
+/**
+ * SRB Status Masks: flag bits that can be OR'ed with the status codes above
+ */
+#define SRB_STATUS_QUEUE_FROZEN         0x40
+#define SRB_STATUS_AUTOSENSE_VALID      0x80
+/* Strip the two mask bits above, leaving only the status code. */
+#define SRB_STATUS(status)	\
+	((status) & ~(SRB_STATUS_AUTOSENSE_VALID | SRB_STATUS_QUEUE_FROZEN))
+/*
+ * SRB Flag Bits (NO_DATA_TRANSFER is 0: neither DATA_IN nor DATA_OUT set)
+ */
+
+#define SRB_FLAGS_QUEUE_ACTION_ENABLE           0x00000002
+#define SRB_FLAGS_DISABLE_DISCONNECT            0x00000004
+#define SRB_FLAGS_DISABLE_SYNCH_TRANSFER        0x00000008
+#define SRB_FLAGS_BYPASS_FROZEN_QUEUE           0x00000010
+#define SRB_FLAGS_DISABLE_AUTOSENSE             0x00000020
+#define SRB_FLAGS_DATA_IN                       0x00000040
+#define SRB_FLAGS_DATA_OUT                      0x00000080
+#define SRB_FLAGS_NO_DATA_TRANSFER              0x00000000
+#define SRB_FLAGS_UNSPECIFIED_DIRECTION (SRB_FLAGS_DATA_IN | SRB_FLAGS_DATA_OUT)
+#define SRB_FLAGS_NO_QUEUE_FREEZE               0x00000100
+#define SRB_FLAGS_ADAPTER_CACHE_ENABLE          0x00000200
+#define SRB_FLAGS_FREE_SENSE_BUFFER             0x00000400
+/**
+ *  Packet flags (values for the 'flags' field of struct vstor_packet)
+ */
+
+/**
+ *  This flag indicates that the server should send back a completion for this
+ *  packet.
+ */
+#define REQUEST_COMPLETION_FLAG	0x1
+
+/**
+ *  This is the set of flags that the vsc can set in any packets it sends
+ */
+#define VSC_LEGAL_FLAGS (REQUEST_COMPLETION_FLAG)
+
+#endif /* __HV_VSTORAGE_H__ */
diff --git a/sys/dev/hyperv/utilities/hv_kvp.c b/sys/dev/hyperv/utilities/hv_kvp.c
new file mode 100644
index 000000000000..8da0936f6cd7
--- /dev/null
+++ b/sys/dev/hyperv/utilities/hv_kvp.c
@@ -0,0 +1,920 @@
+/*-
+ * Copyright (c) 2014,2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ *	Author:	Sainath Varanasi.
+ *	Date:	4/2012
+ *	Email:	bsdic@microsoft.com
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/reboot.h>
+#include <sys/lock.h>
+#include <sys/taskqueue.h>
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/kthread.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/un.h>
+#include <sys/endian.h>
+#include <sys/_null.h>
+#include <sys/sema.h>
+#include <sys/signal.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/mutex.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/utilities/hv_utilreg.h>
+#include <dev/hyperv/utilities/vmbus_icreg.h>
+#include <dev/hyperv/utilities/vmbus_icvar.h>
+
+#include "unicode.h"
+#include "hv_kvp.h"
+#include "vmbus_if.h"
+
+/* hv_kvp defines: message buffer size, header shortcut, IC versions */
+#define BUFFERSIZE	sizeof(struct hv_kvp_msg)
+#define kvp_hdr		hdr.kvp_hdr	/* "m->kvp_hdr" means "m->hdr.kvp_hdr" */
+
+#define KVP_FWVER_MAJOR		3
+#define KVP_FWVER		VMBUS_IC_VERSION(KVP_FWVER_MAJOR, 0)
+
+#define KVP_MSGVER_MAJOR	4
+#define KVP_MSGVER		VMBUS_IC_VERSION(KVP_MSGVER_MAJOR, 0)
+
+/* hv_kvp debug control: 0 = silent, 1 = errors, 2 or more = errors and info */
+static int hv_kvp_log = 0;
+
+#define	hv_kvp_log_error(...)	do {				\
+	if (hv_kvp_log >= 1)					\
+		log(LOG_ERR, "hv_kvp: " __VA_ARGS__);		\
+} while (0)
+
+#define	hv_kvp_log_info(...) do {				\
+	if (hv_kvp_log >= 2)					\
+		log(LOG_INFO, "hv_kvp: " __VA_ARGS__);		\
+} while (0)
+
+static const struct vmbus_ic_desc vmbus_kvp_descs[] = {
+	{
+		.ic_desc = "Hyper-V KVP",
+		.ic_guid = { .hv_guid = {
+		    0xe7, 0xf4, 0xa0, 0xa9, 0x45, 0x5a, 0x96, 0x4d,
+		    0xb8, 0x27, 0x8a, 0x84, 0x1e, 0x8c, 0x03, 0xe6 } }
+	},
+	VMBUS_IC_DESC_END	/* closes the table */
+};
+
+/* character device prototypes */
+static d_open_t		hv_kvp_dev_open;
+static d_close_t	hv_kvp_dev_close;
+static d_read_t		hv_kvp_dev_daemon_read;
+static d_write_t	hv_kvp_dev_daemon_write;
+static d_poll_t		hv_kvp_dev_daemon_poll;
+
+/* cdevsw for the hv_kvp character device; the handlers are declared above */
+static struct cdevsw hv_kvp_cdevsw =
+{
+	.d_version	= D_VERSION,
+	.d_name		= "hv_kvp_dev",
+	.d_open		= hv_kvp_dev_open,
+	.d_close	= hv_kvp_dev_close,
+	.d_read		= hv_kvp_dev_daemon_read,
+	.d_write	= hv_kvp_dev_daemon_write,
+	.d_poll		= hv_kvp_dev_daemon_poll,
+};
+
+
+/*
+ * Softc state used to track and synchronize multiple
+ * KVP transaction requests from the host.
+ */
+typedef struct hv_kvp_sc {
+	struct vmbus_ic_softc	util_sc;
+	device_t		dev;
+
+	/*
+	 * Unless specified, the pending mutex should be held to alter
+	 * req_in_progress and req_timed_out.  hv_kvp_transaction_init()
+	 * is a stated exception: it sets req_in_progress without it.
+	 */
+	struct mtx		pending_mutex;
+
+	struct task		task;
+
+	/* To track if transaction is active or not */
+	boolean_t		req_in_progress;
+	/* Tracks if daemon did not reply back in time */
+	boolean_t		req_timed_out;
+	/* Tracks if daemon is serving a request currently */
+	boolean_t		daemon_busy;
+
+	/* Length of host message; set by hv_kvp_transaction_init() */
+	uint32_t		host_msg_len;
+
+	/* Host message id; set by hv_kvp_transaction_init() */
+	uint64_t		host_msg_id;
+
+	/* Current kvp message from the host; points into rcv_buf */
+	struct hv_kvp_msg	*host_kvp_msg;
+
+	/* Current kvp message for daemon */
+	struct hv_kvp_msg	daemon_kvp_msg;
+
+	/* Rcv buffer for communicating with the host */
+	uint8_t			*rcv_buf;
+
+	/* Device semaphore to control communication */
+	struct sema		dev_sema;
+
+	/* Indicates if daemon registered with driver */
+	boolean_t		register_done;
+
+	/* Character device status */
+	boolean_t		dev_accessed;
+
+	struct cdev *hv_kvp_dev;
+
+	struct proc *daemon_task;
+
+	struct selinfo hv_kvp_selinfo;
+} hv_kvp_sc;
+
+/* hv_kvp prototypes */
+static int	hv_kvp_req_in_progress(hv_kvp_sc *sc);
+static void	hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t, uint64_t, uint8_t *);
+static void	hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc);
+static void	hv_kvp_process_request(void *context, int pending);
+
+/*
+ * hv_kvp low level functions
+ */
+
+/*
+ * Report whether a kvp transaction is in progress.
+ * Returns the softc's req_in_progress flag; no lock is taken here.
+ */
+static int
+hv_kvp_req_in_progress(hv_kvp_sc *sc)
+{
+	return (sc->req_in_progress);
+}
+
+
+/*
+ * Called whenever a message is received from the host: record it in the
+ * softc and mark a transaction as in progress.
+ */
+static void
+hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t rcv_len,
+			uint64_t request_id, uint8_t *rcv_buf)
+{
+	const size_t payload_off = sizeof(struct hv_vmbus_pipe_hdr) +
+	    sizeof(struct hv_vmbus_icmsg_hdr);
+
+	/* Do not need to use mutex for req_in_progress here */
+	sc->req_in_progress = true;
+	sc->host_msg_len = rcv_len;
+	sc->host_msg_id = request_id;
+	sc->rcv_buf = rcv_buf;
+	sc->host_kvp_msg = (struct hv_kvp_msg *)&rcv_buf[payload_off];
+}
+
+/*
+ * Convert the ip related strings in umsg from utf8 to utf16 and store them
+ * in host_ip_msg; dhcp_enabled and addr_family are copied unchanged.
+ * Returns the bitwise OR of the five per-string conversion error values.
+ */
+static int
+hv_kvp_convert_utf8_ipinfo_to_utf16(struct hv_kvp_msg *umsg,
+				    struct hv_kvp_ip_msg *host_ip_msg)
+{
+	int e_addr, e_mask, e_gw, e_dns, e_id;
+	int flags = 1;		/* handed to every utf8_to_utf16() call */
+	char *src;
+
+	/* IP address */
+	src = (char *)umsg->body.kvp_ip_val.ip_addr;
+	utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.ip_addr,
+	    MAX_IP_ADDR_SIZE, src, strlen(src), flags, &e_addr);
+
+	/* Subnet */
+	src = (char *)umsg->body.kvp_ip_val.sub_net;
+	utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.sub_net,
+	    MAX_IP_ADDR_SIZE, src, strlen(src), flags, &e_mask);
+
+	/* Gateway */
+	src = (char *)umsg->body.kvp_ip_val.gate_way;
+	utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.gate_way,
+	    MAX_GATEWAY_SIZE, src, strlen(src), flags, &e_gw);
+
+	/* DNS */
+	src = (char *)umsg->body.kvp_ip_val.dns_addr;
+	utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.dns_addr,
+	    MAX_IP_ADDR_SIZE, src, strlen(src), flags, &e_dns);
+
+	/* Adapter ID */
+	src = (char *)umsg->body.kvp_ip_val.adapter_id;
+	utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.adapter_id,
+	    MAX_ADAPTER_ID_SIZE, src, strlen(src), flags, &e_id);
+
+	/* The two scalar fields need no conversion. */
+	host_ip_msg->kvp_ip_val.dhcp_enabled =
+	    umsg->body.kvp_ip_val.dhcp_enabled;
+	host_ip_msg->kvp_ip_val.addr_family =
+	    umsg->body.kvp_ip_val.addr_family;
+
+	return (e_addr | e_mask | e_gw | e_dns | e_id);
+}
+
+
+/*
+ * Convert the ip related info in host_ip_msg from utf16 to utf8 and store it
+ * in umsg.  When the converted adapter id matches the instance GUID of an
+ * "hn" device, it is replaced with that device's nameunit.  Returns the
+ * bitwise OR of the five per-string conversion error values.
+ */
+static int
+hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg,
+				    struct hv_kvp_msg *umsg)
+{
+	int err_ip, err_subnet, err_gway, err_dns, err_adap;
+	int UNUSED_FLAG = 1;
+	devclass_t hn_dc;
+	device_t *devs;
+	int devcnt;
+
+	/* IP Address */
+	utf16_to_utf8((char *)umsg->body.kvp_ip_val.ip_addr,
+	    MAX_IP_ADDR_SIZE,
+	    (uint16_t *)host_ip_msg->kvp_ip_val.ip_addr,
+	    MAX_IP_ADDR_SIZE, UNUSED_FLAG, &err_ip);
+
+	/* Adapter ID : GUID */
+	utf16_to_utf8((char *)umsg->body.kvp_ip_val.adapter_id,
+	    MAX_ADAPTER_ID_SIZE,
+	    (uint16_t *)host_ip_msg->kvp_ip_val.adapter_id,
+	    MAX_ADAPTER_ID_SIZE, UNUSED_FLAG, &err_adap);
+
+	/*
+	 * devclass_find() returns NULL if no "hn" devclass is registered;
+	 * never hand that to devclass_get_devices().
+	 */
+	hn_dc = devclass_find("hn");
+	if (hn_dc != NULL &&
+	    devclass_get_devices(hn_dc, &devs, &devcnt) == 0) {
+		for (devcnt = devcnt - 1; devcnt >= 0; devcnt--) {
+			device_t dev = devs[devcnt];
+			struct vmbus_channel *chan;
+			char buf[HYPERV_GUID_STRLEN];
+			int n;
+
+			chan = vmbus_get_channel(dev);
+			n = hyperv_guid2str(vmbus_chan_guid_inst(chan), buf,
+			    sizeof(buf));
+
+			/*
+			 * The string in the 'kvp_ip_val.adapter_id' has
+			 * braces around the GUID; skip the leading brace
+			 * in 'kvp_ip_val.adapter_id'.
+			 */
+			if (strncmp(buf,
+			    ((char *)&umsg->body.kvp_ip_val.adapter_id) + 1,
+			    n) == 0) {
+				strlcpy((char *)umsg->body.kvp_ip_val.adapter_id,
+				    device_get_nameunit(dev), MAX_ADAPTER_ID_SIZE);
+				break;
+			}
+		}
+		free(devs, M_TEMP);
+	}
+
+	/* Address Family , DHCP , SUBNET, Gateway, DNS */
+	umsg->kvp_hdr.operation = host_ip_msg->operation;
+	umsg->body.kvp_ip_val.addr_family = host_ip_msg->kvp_ip_val.addr_family;
+	umsg->body.kvp_ip_val.dhcp_enabled = host_ip_msg->kvp_ip_val.dhcp_enabled;
+	utf16_to_utf8((char *)umsg->body.kvp_ip_val.sub_net, MAX_IP_ADDR_SIZE,
+	    (uint16_t *)host_ip_msg->kvp_ip_val.sub_net,
+	    MAX_IP_ADDR_SIZE, UNUSED_FLAG, &err_subnet);
+
+	utf16_to_utf8((char *)umsg->body.kvp_ip_val.gate_way, MAX_GATEWAY_SIZE,
+	    (uint16_t *)host_ip_msg->kvp_ip_val.gate_way,
+	    MAX_GATEWAY_SIZE, UNUSED_FLAG, &err_gway);
+
+	utf16_to_utf8((char *)umsg->body.kvp_ip_val.dns_addr, MAX_IP_ADDR_SIZE,
+	    (uint16_t *)host_ip_msg->kvp_ip_val.dns_addr,
+	    MAX_IP_ADDR_SIZE, UNUSED_FLAG, &err_dns);
+
+	return (err_ip | err_subnet | err_gway | err_dns | err_adap);
+}
+
+
+/*
+ * Prepare a user kvp msg based on host kvp msg (utf16 to utf8).  umsg is
+ * zeroed first and then filled in according to the operation code copied
+ * from hmsg; an unrecognized operation is only logged.
+ * Ensure utf16_utf8 takes care of the additional string terminating char!!
+ */
+static void
+hv_kvp_convert_hostmsg_to_usermsg(struct hv_kvp_msg *hmsg, struct hv_kvp_msg *umsg)
+{
+	int utf_err = 0;
+	uint32_t value_type;
+	struct hv_kvp_ip_msg *host_ip_msg;
+
+	host_ip_msg = (struct hv_kvp_ip_msg*)hmsg;
+	memset(umsg, 0, sizeof(struct hv_kvp_msg));
+
+	umsg->kvp_hdr.operation = hmsg->kvp_hdr.operation;
+	umsg->kvp_hdr.pool = hmsg->kvp_hdr.pool;
+
+	switch (umsg->kvp_hdr.operation) {
+	case HV_KVP_OP_SET_IP_INFO:
+		hv_kvp_convert_utf16_ipinfo_to_utf8(host_ip_msg, umsg);
+		break;
+
+	case HV_KVP_OP_GET_IP_INFO:
+		utf16_to_utf8((char *)umsg->body.kvp_ip_val.adapter_id,
+		    MAX_ADAPTER_ID_SIZE,
+		    (uint16_t *)host_ip_msg->kvp_ip_val.adapter_id,
+		    MAX_ADAPTER_ID_SIZE, 1, &utf_err);
+
+		umsg->body.kvp_ip_val.addr_family =
+		    host_ip_msg->kvp_ip_val.addr_family;
+		break;
+
+	case HV_KVP_OP_SET:
+		value_type = hmsg->body.kvp_set.data.value_type;
+
+		switch (value_type) {
+		case HV_REG_SZ:
+			umsg->body.kvp_set.data.value_size =
+			    utf16_to_utf8(
+				(char *)umsg->body.kvp_set.data.msg_value.value,
+				HV_KVP_EXCHANGE_MAX_VALUE_SIZE - 1,
+				(uint16_t *)hmsg->body.kvp_set.data.msg_value.value,
+				hmsg->body.kvp_set.data.value_size,
+				1, &utf_err);
+			/* utf8 encoding */
+			umsg->body.kvp_set.data.value_size /= 2;
+			break;
+
+		case HV_REG_U32:
+			/* Format as unsigned, matching the U64 case below. */
+			umsg->body.kvp_set.data.value_size =
+			    sprintf(umsg->body.kvp_set.data.msg_value.value, "%u",
+				(unsigned int)
+				hmsg->body.kvp_set.data.msg_value.value_u32) + 1;
+			break;
+
+		case HV_REG_U64:
+			umsg->body.kvp_set.data.value_size =
+			    sprintf(umsg->body.kvp_set.data.msg_value.value, "%llu",
+				(unsigned long long)
+				hmsg->body.kvp_set.data.msg_value.value_u64) + 1;
+			break;
+		}
+
+		umsg->body.kvp_set.data.key_size =
+		    utf16_to_utf8(
+			umsg->body.kvp_set.data.key,
+			HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1,
+			(uint16_t *)hmsg->body.kvp_set.data.key,
+			hmsg->body.kvp_set.data.key_size,
+			1, &utf_err);
+
+		/* utf8 encoding */
+		umsg->body.kvp_set.data.key_size /= 2;
+		break;
+
+	case HV_KVP_OP_GET:
+		umsg->body.kvp_get.data.key_size =
+		    utf16_to_utf8(umsg->body.kvp_get.data.key,
+			HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1,
+			(uint16_t *)hmsg->body.kvp_get.data.key,
+			hmsg->body.kvp_get.data.key_size,
+			1, &utf_err);
+		/* utf8 encoding */
+		umsg->body.kvp_get.data.key_size /= 2;
+		break;
+
+	case HV_KVP_OP_DELETE:
+		umsg->body.kvp_delete.key_size =
+		    utf16_to_utf8(umsg->body.kvp_delete.key,
+			HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1,
+			(uint16_t *)hmsg->body.kvp_delete.key,
+			hmsg->body.kvp_delete.key_size,
+			1, &utf_err);
+		/* utf8 encoding */
+		umsg->body.kvp_delete.key_size /= 2;
+		break;
+
+	case HV_KVP_OP_ENUMERATE:
+		umsg->body.kvp_enum_data.index =
+		    hmsg->body.kvp_enum_data.index;
+		break;
+
+	default:
+		hv_kvp_log_info("%s: daemon_kvp_msg: Invalid operation : %d\n",
+		    __func__, umsg->kvp_hdr.operation);
+	}
+}
+
+
+/*
+ * Fill in the host-bound (utf16) kvp message from the daemon's (utf8) reply.
+ *
+ * Only ENUMERATE and GET carry strings back to the host; the IP-info query
+ * is handed to its own converter and SET_IP_INFO/SET/DELETE need no
+ * conversion.  Returns 0 on success and EINVAL for an unknown operation or
+ * when a string conversion reports a negative length.
+ */
+static int
+hv_kvp_convert_usermsg_to_hostmsg(struct hv_kvp_msg *umsg, struct hv_kvp_msg *hmsg)
+{
+	struct hv_kvp_exchg_msg_value *hdata;
+	char *ukey, *uval;
+	int klen = 0, vlen = 0, utf_err = 0;
+
+	switch (hmsg->kvp_hdr.operation) {
+	case HV_KVP_OP_SET_IP_INFO:
+	case HV_KVP_OP_SET:
+	case HV_KVP_OP_DELETE:
+		/* Nothing but the status travels back for these. */
+		return (0);
+
+	case HV_KVP_OP_GET_IP_INFO:
+		return (hv_kvp_convert_utf8_ipinfo_to_utf16(umsg,
+		    (struct hv_kvp_ip_msg *)hmsg));
+
+	case HV_KVP_OP_ENUMERATE:
+		hdata = &hmsg->body.kvp_enum_data.data;
+
+		/* Key name first ... */
+		ukey = umsg->body.kvp_enum_data.data.key;
+		klen = utf8_to_utf16((uint16_t *)hdata->key,
+		    ((HV_KVP_EXCHANGE_MAX_KEY_SIZE / 2) - 2),
+		    ukey, strlen(ukey), 1, &utf_err);
+		/* Size in bytes of the utf16 string plus its terminator. */
+		hdata->key_size = 2 * (klen + 1);
+
+		/* ... then the value, always reported as a string. */
+		uval = umsg->body.kvp_enum_data.data.msg_value.value;
+		vlen = utf8_to_utf16((uint16_t *)hdata->msg_value.value,
+		    ((HV_KVP_EXCHANGE_MAX_VALUE_SIZE / 2) - 2),
+		    uval, strlen(uval), 1, &utf_err);
+		hdata->value_size = 2 * (vlen + 1);
+		hdata->value_type = HV_REG_SZ;
+
+		if (klen < 0 || vlen < 0)
+			return (EINVAL);
+		return (0);
+
+	case HV_KVP_OP_GET:
+		hdata = &hmsg->body.kvp_get.data;
+
+		uval = umsg->body.kvp_get.data.msg_value.value;
+		vlen = utf8_to_utf16((uint16_t *)hdata->msg_value.value,
+		    ((HV_KVP_EXCHANGE_MAX_VALUE_SIZE / 2) - 2),
+		    uval, strlen(uval), 1, &utf_err);
+		/* Size in bytes of the utf16 string plus its terminator. */
+		hdata->value_size = 2 * (vlen + 1);
+		/* Values are handed back as strings. */
+		hdata->value_type = HV_REG_SZ;
+
+		if (vlen < 0)
+			return (EINVAL);
+		return (0);
+
+	default:
+		return (EINVAL);
+	}
+}
+
+
+/*
+ * Complete the current transaction: write the status and the response
+ * flags into the IC header of the saved receive buffer, then send that
+ * buffer back to the host.  A send failure is only logged.
+ */
+static void
+hv_kvp_respond_host(hv_kvp_sc *sc, uint32_t error)
+{
+	struct hv_vmbus_icmsg_hdr *ichdr;
+	int send_err;
+
+	/* The IC header follows the pipe header in the receive buffer. */
+	ichdr = (struct hv_vmbus_icmsg_hdr *)
+	    (sc->rcv_buf + sizeof(struct hv_vmbus_pipe_hdr));
+	ichdr->status = error;
+	ichdr->icflags = HV_ICMSGHDRFLAG_TRANSACTION |
+	    HV_ICMSGHDRFLAG_RESPONSE;
+
+	send_err = vmbus_chan_send(vmbus_get_channel(sc->dev),
+	    VMBUS_CHANPKT_TYPE_INBAND, 0, sc->rcv_buf, sc->host_msg_len,
+	    sc->host_msg_id);
+	if (send_err != 0) {
+		hv_kvp_log_info("%s: hv_kvp_respond_host: sendpacket error:%d\n",
+			__func__, send_err);
+	}
+}
+
+
+/*
+ * Hand the pending host request to the user daemon: convert it into the
+ * daemon's message format and signal that it is ready to be read.
+ */
+static void
+hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc)
+{
+	/* Build the daemon-side copy of the host message. */
+	hv_kvp_convert_hostmsg_to_usermsg(sc->host_kvp_msg,
+	    &sc->daemon_kvp_msg);
+
+	/* Let the device read routine, which waits on dev_sema, proceed. */
+	sema_post(&sc->dev_sema);
+
+	/* The daemon may be sleeping in poll() instead; wake it as well. */
+	selwakeup(&sc->hv_kvp_selinfo);
+}
+
+
+/*
+ * Taskqueue handler: drain the KVP channel and service each host message.
+ *
+ * For every packet received the transaction state is (re)initialised, then:
+ *  - a NEGOTIATE message is answered directly via vmbus_ic_negomsg();
+ *  - any other message is forwarded to the daemon (unless the daemon is
+ *    still marked busy) and this thread sleeps up to 5 seconds on `sc',
+ *    which the device write routine wakes with wakeup(sc).
+ * After that the request is marked timed out and, if it is still in
+ * progress, failed towards the host with HV_E_FAIL.  The loop ends when
+ * vmbus_chan_recv() returns an error or a zero-length packet.
+ *
+ * `context' is the hv_kvp_sc softc; `pending' is not used.
+ */
+static void
+hv_kvp_process_request(void *context, int pending)
+{
+	uint8_t *kvp_buf;
+	struct vmbus_channel *channel;
+	uint32_t recvlen = 0;
+	uint64_t requestid;
+	struct hv_vmbus_icmsg_hdr *icmsghdrp;
+	int ret = 0, error;
+	hv_kvp_sc *sc;
+
+	hv_kvp_log_info("%s: entering hv_kvp_process_request\n", __func__);
+
+	sc = (hv_kvp_sc*)context;
+	kvp_buf = sc->util_sc.ic_buf;
+	channel = vmbus_get_channel(sc->dev);
+
+	/* recvlen is in/out: buffer capacity in, packet length out. */
+	recvlen = sc->util_sc.ic_buflen;
+	ret = vmbus_chan_recv(channel, kvp_buf, &recvlen, &requestid);
+	KASSERT(ret != ENOBUFS, ("hvkvp recvbuf is not large enough"));
+	/* XXX check recvlen to make sure that it contains enough data */
+
+	while ((ret == 0) && (recvlen > 0)) {
+		/* The IC header follows the pipe header in the packet. */
+		icmsghdrp = (struct hv_vmbus_icmsg_hdr *)
+		    &kvp_buf[sizeof(struct hv_vmbus_pipe_hdr)];
+
+		hv_kvp_transaction_init(sc, recvlen, requestid, kvp_buf);
+		if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) {
+			error = vmbus_ic_negomsg(&sc->util_sc,
+			    kvp_buf, &recvlen, KVP_FWVER, KVP_MSGVER);
+			/* XXX handle vmbus_ic_negomsg failure. */
+			if (!error)
+				hv_kvp_respond_host(sc, HV_S_OK);
+			else
+				hv_kvp_respond_host(sc, HV_E_FAIL);
+			/*
+			 * It is ok to not acquire the mutex before setting
+			 * req_in_progress here because negotiation is the
+			 * first thing that happens and hence there is no
+			 * chance of a race condition.
+			 */
+
+			sc->req_in_progress = false;
+			hv_kvp_log_info("%s :version negotiated\n", __func__);
+
+		} else {
+			/*
+			 * Forward the request only if the daemon is not still
+			 * marked busy; either way, wait below.
+			 */
+			if (!sc->daemon_busy) {
+
+				hv_kvp_log_info("%s: issuing qury to daemon\n", __func__);
+				mtx_lock(&sc->pending_mutex);
+				sc->req_timed_out = false;
+				sc->daemon_busy = true;
+				mtx_unlock(&sc->pending_mutex);
+
+				hv_kvp_send_msg_to_daemon(sc);
+				hv_kvp_log_info("%s: waiting for daemon\n", __func__);
+			}
+
+			/* Wait 5 seconds for daemon to respond back */
+			tsleep(sc, 0, "kvpworkitem", 5 * hz);
+			hv_kvp_log_info("%s: came out of wait\n", __func__);
+		}
+
+		mtx_lock(&sc->pending_mutex);
+
+		/* Notice that once req_timed_out is set to true
+		 * it will remain true until the next request is
+		 * sent to the daemon. The response from daemon
+		 * is forwarded to host only when this flag is
+		 * false.
+		 */
+		sc->req_timed_out = true;
+
+		/*
+		 * Cancel the request if it is still in progress, i.e. it
+		 * was not completed while we were waiting.
+		 */
+		if (hv_kvp_req_in_progress(sc)) {
+			hv_kvp_log_info("%s: request was still active after wait so failing\n", __func__);
+			hv_kvp_respond_host(sc, HV_E_FAIL);
+			sc->req_in_progress = false;
+		}
+
+		mtx_unlock(&sc->pending_mutex);
+
+		/*
+		 * Try reading next buffer
+		 */
+		recvlen = sc->util_sc.ic_buflen;
+		ret = vmbus_chan_recv(channel, kvp_buf, &recvlen, &requestid);
+		KASSERT(ret != ENOBUFS, ("hvkvp recvbuf is not large enough"));
+		/* XXX check recvlen to make sure that it contains enough data */
+
+		hv_kvp_log_info("%s: read: context %p, ret =%d, recvlen=%d\n",
+			__func__, context, ret, recvlen);
+	}
+}
+
+
+/*
+ * Channel callback, invoked whenever there is a message from the host.
+ *
+ * Host requests are not handled until a daemon has registered: without a
+ * registered daemon the callback simply returns.  Once a daemon registers,
+ * the device write routine calls this callback again so that the request
+ * is picked up by the worker task.
+ */
+static void
+hv_kvp_callback(struct vmbus_channel *chan __unused, void *context)
+{
+	hv_kvp_sc *sc = context;
+
+	if (!sc->register_done)
+		return;
+
+	hv_kvp_log_info("%s: Queuing work item\n", __func__);
+	taskqueue_enqueue(taskqueue_thread, &sc->task);
+}
+
+/*
+ * Open handler of the KVP character device.
+ *
+ * Only one opener is allowed at a time.  The first open records the
+ * calling process as the daemon and clears daemon_busy; further opens fail
+ * with EBUSY until the device has been closed.
+ *
+ * Returns 0 on success, EBUSY if the device is already open.
+ */
+static int
+hv_kvp_dev_open(struct cdev *dev, int oflags, int devtype,
+				struct thread *td)
+{
+	hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1;
+
+	/*
+	 * Kernel error returns are positive errno values; a negated value
+	 * would not be reported to the caller as EBUSY.
+	 */
+	if (sc->dev_accessed)
+		return (EBUSY);
+
+	sc->daemon_task = curproc;
+	sc->dev_accessed = true;
+	sc->daemon_busy = false;
+
+	/* Report success only once the open has really been accepted. */
+	hv_kvp_log_info("%s: Opened device \"hv_kvp_device\" successfully.\n", __func__);
+	return (0);
+}
+
+
+/*
+ * Close handler of the KVP character device.
+ *
+ * Marks the device as no longer opened and drops the daemon registration,
+ * so a later opener has to register again.  Always returns 0.
+ */
+static int
+hv_kvp_dev_close(struct cdev *dev, int fflag __unused, int devtype __unused,
+				 struct thread *td __unused)
+{
+	/* dev is used here, so it must not carry the __unused marker. */
+	hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1;
+
+	hv_kvp_log_info("%s: Closing device \"hv_kvp_device\".\n", __func__);
+	sc->dev_accessed = false;
+	sc->register_done = false;
+	return (0);
+}
+
+
+/*
+ * Read handler of the KVP device: delivers the pending request to the
+ * daemon.
+ *
+ * Blocks on dev_sema until a request has been posted, then copies a
+ * snapshot of sc->daemon_kvp_msg out to the caller.
+ *
+ * Returns EPERM if no daemon has registered yet, otherwise the result of
+ * uiomove().
+ */
+static int
+hv_kvp_dev_daemon_read(struct cdev *dev, struct uio *uio, int ioflag __unused)
+{
+	size_t amt;
+	int error = 0;
+	struct hv_kvp_msg *hv_kvp_dev_buf;
+	hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1;
+
+	/* Read is not allowed until registering is done. */
+	if (!sc->register_done)
+		return (EPERM);
+
+	sema_wait(&sc->dev_sema);
+
+	hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK);
+	memcpy(hv_kvp_dev_buf, &sc->daemon_kvp_msg, sizeof(struct hv_kvp_msg));
+
+	amt = MIN(uio->uio_resid, uio->uio_offset >= BUFFERSIZE + 1 ? 0 :
+		BUFFERSIZE + 1 - uio->uio_offset);
+	/*
+	 * Never copy out more than the bounce buffer holds: the expression
+	 * above allows up to BUFFERSIZE + 1 bytes, which must not be allowed
+	 * to run past the allocation and expose adjacent kernel memory.
+	 */
+	amt = MIN(amt, sizeof(*hv_kvp_dev_buf));
+
+	if ((error = uiomove(hv_kvp_dev_buf, amt, uio)) != 0)
+		hv_kvp_log_info("%s: hv_kvp uiomove read failed!\n", __func__);
+
+	free(hv_kvp_dev_buf, M_TEMP);
+	return (error);
+}
+
+
+/*
+ * Write handler of the KVP device: receives a message from the daemon.
+ *
+ * Before registration the only accepted message is HV_KVP_OP_REGISTER,
+ * which marks the daemon as registered and triggers the channel callback.
+ * After registration the message is the daemon's answer to the pending
+ * request: unless the request already timed out it is converted, sent to
+ * the host and the waiting worker is woken up.
+ *
+ * Returns the uiomove() error, EINVAL for a failed registration, or the
+ * result of the user-to-host conversion.
+ */
+static int
+hv_kvp_dev_daemon_write(struct cdev *dev, struct uio *uio, int ioflag __unused)
+{
+	size_t amt;
+	int error = 0;
+	struct hv_kvp_msg *hv_kvp_dev_buf;
+	hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1;
+
+	uio->uio_offset = 0;
+	/*
+	 * Zero the bounce buffer: the daemon may write fewer bytes than a
+	 * full message, and the whole structure is copied into the softc
+	 * below.  Without M_ZERO the unwritten tail would be stale heap data.
+	 */
+	hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP,
+	    M_WAITOK | M_ZERO);
+
+	amt = MIN(uio->uio_resid, BUFFERSIZE);
+	error = uiomove(hv_kvp_dev_buf, amt, uio);
+
+	if (error != 0) {
+		free(hv_kvp_dev_buf, M_TEMP);
+		return (error);
+	}
+	memcpy(&sc->daemon_kvp_msg, hv_kvp_dev_buf, sizeof(struct hv_kvp_msg));
+
+	free(hv_kvp_dev_buf, M_TEMP);
+	if (sc->register_done == false) {
+		if (sc->daemon_kvp_msg.kvp_hdr.operation == HV_KVP_OP_REGISTER) {
+			sc->register_done = true;
+			/* Pick up a host request that arrived before registration. */
+			hv_kvp_callback(vmbus_get_channel(sc->dev), dev->si_drv1);
+		}
+		else {
+			hv_kvp_log_info("%s, KVP Registration Failed\n", __func__);
+			return (EINVAL);
+		}
+	} else {
+
+		mtx_lock(&sc->pending_mutex);
+
+		/* A late answer to a timed out request is dropped. */
+		if(!sc->req_timed_out) {
+			struct hv_kvp_msg *hmsg = sc->host_kvp_msg;
+			struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg;
+
+			error = hv_kvp_convert_usermsg_to_hostmsg(umsg, hmsg);
+			hv_kvp_respond_host(sc, umsg->hdr.error);
+			/* Wake the worker sleeping on sc. */
+			wakeup(sc);
+			sc->req_in_progress = false;
+			if (umsg->hdr.error != HV_S_OK)
+				hv_kvp_log_info("%s, Error 0x%x from daemon\n",
+				    __func__, umsg->hdr.error);
+			if (error)
+				hv_kvp_log_info("%s, Error from convert\n", __func__);
+		}
+
+		sc->daemon_busy = false;
+		mtx_unlock(&sc->pending_mutex);
+	}
+
+	return (error);
+}
+
+
+/*
+ * Poll handler of the KVP device: tells the daemon whether a request is
+ * waiting to be read.
+ *
+ * daemon_busy doubles as the "data available" flag: it is set before a
+ * request is posted to the daemon and cleared once the daemon has written
+ * its response.  When nothing is pending the thread is recorded so that a
+ * later selwakeup() can notify it.
+ */
+static int
+hv_kvp_dev_daemon_poll(struct cdev *dev, int events, struct thread *td)
+{
+	hv_kvp_sc *sc = dev->si_drv1;
+	int ready;
+
+	mtx_lock(&sc->pending_mutex);
+	ready = (sc->daemon_busy == true) ? POLLIN : 0;
+	if (ready == 0)
+		selrecord(td, &sc->hv_kvp_selinfo);
+	mtx_unlock(&sc->pending_mutex);
+
+	return (ready);
+}
+
+/*
+ * Device probe method: defers to vmbus_ic_probe() with the KVP descriptor
+ * table and returns its result.
+ */
+static int
+hv_kvp_probe(device_t dev)
+{
+
+	return (vmbus_ic_probe(dev, vmbus_kvp_descs));
+}
+
+/*
+ * Device attach method.
+ *
+ * Initialises the softc synchronisation objects, registers the log level
+ * sysctl, sets up the worker task, creates the "hv_kvp_dev" character
+ * device for the daemon and finally attaches the IC channel with
+ * hv_kvp_callback as its callback.
+ *
+ * Returns 0 on success or the error of the failing step.  On failure
+ * everything created here is torn down again, so that no character device
+ * is left pointing at the softc.
+ */
+static int
+hv_kvp_attach(device_t dev)
+{
+	int error;
+	struct sysctl_oid_list *child;
+	struct sysctl_ctx_list *ctx;
+
+	hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev);
+
+	sc->dev = dev;
+	sema_init(&sc->dev_sema, 0, "hv_kvp device semaphore");
+	mtx_init(&sc->pending_mutex, "hv-kvp pending mutex",
+		NULL, MTX_DEF);
+
+	ctx = device_get_sysctl_ctx(dev);
+	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "hv_kvp_log",
+	    CTLFLAG_RWTUN, &hv_kvp_log, 0, "Hyperv KVP service log level");
+
+	TASK_INIT(&sc->task, 0, hv_kvp_process_request, sc);
+
+	/* create character device */
+	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
+			&sc->hv_kvp_dev,
+			&hv_kvp_cdevsw,
+			0,
+			UID_ROOT,
+			GID_WHEEL,
+			0640,
+			"hv_kvp_dev");
+
+	if (error != 0)
+		goto fail;
+	sc->hv_kvp_dev->si_drv1 = sc;
+
+	error = vmbus_ic_attach(dev, hv_kvp_callback);
+	if (error != 0) {
+		/* Do not leave a device node behind for a failed attach. */
+		destroy_dev(sc->hv_kvp_dev);
+		goto fail;
+	}
+
+	return (0);
+
+fail:
+	mtx_destroy(&sc->pending_mutex);
+	sema_destroy(&sc->dev_sema);
+	return (error);
+}
+
+/*
+ * Device detach method: kills the recorded daemon process (if any) with
+ * SIGKILL, removes the character device and detaches the IC channel.
+ * Returns the result of vmbus_ic_detach().
+ */
+static int
+hv_kvp_detach(device_t dev)
+{
+	hv_kvp_sc *sc = device_get_softc(dev);
+	struct proc *daemon = sc->daemon_task;
+
+	if (daemon != NULL) {
+		PROC_LOCK(daemon);
+		kern_psignal(daemon, SIGKILL);
+		PROC_UNLOCK(daemon);
+	}
+
+	destroy_dev(sc->hv_kvp_dev);
+
+	return (vmbus_ic_detach(dev));
+}
+
+/* Method table wiring the probe/attach/detach entry points of the driver. */
+static device_method_t kvp_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe, hv_kvp_probe),
+	DEVMETHOD(device_attach, hv_kvp_attach),
+	DEVMETHOD(device_detach, hv_kvp_detach),
+	{ 0, 0 }
+};
+
+/* Driver description: name "hvkvp", methods above, softc is hv_kvp_sc. */
+static driver_t kvp_driver = { "hvkvp", kvp_methods, sizeof(hv_kvp_sc)};
+
+static devclass_t kvp_devclass;
+
+/* Register the driver on the vmbus bus; the module depends on vmbus. */
+DRIVER_MODULE(hv_kvp, vmbus, kvp_driver, kvp_devclass, NULL, NULL);
+MODULE_VERSION(hv_kvp, 1);
+MODULE_DEPEND(hv_kvp, vmbus, 1, 1, 1);
diff --git a/sys/dev/hyperv/utilities/hv_kvp.h b/sys/dev/hyperv/utilities/hv_kvp.h
new file mode 100644
index 000000000000..91e1ea404d4a
--- /dev/null
+++ b/sys/dev/hyperv/utilities/hv_kvp.h
@@ -0,0 +1,229 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014,2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _KVP_H
+#define _KVP_H
+/*
+ * An implementation of HyperV key value pair (KVP) functionality for FreeBSD
+ *
+ */
+
+/*
+ * Maximum value size - used for both key names and value data, and includes
+ * any applicable NULL terminators.
+ *
+ * Note:  This limit is somewhat arbitrary, but falls easily within what is
+ * supported for all native guests (back to Win 2000) and what is reasonable
+ * for the IC KVP exchange functionality.  Note that Windows Me/98/95 are
+ * limited to 255 character key names.
+ *
+ * MSDN recommends not storing data values larger than 2048 bytes in the
+ * registry.
+ *
+ * Note:  This value is used in defining the KVP exchange message - this value
+ * cannot be modified without affecting the message size and compatibility.
+ */
+
+/*
+ * Maximum value size in bytes, including any null terminators
+ */
+#define HV_KVP_EXCHANGE_MAX_VALUE_SIZE    (2048)
+
+
+/*
+ * Maximum key size - the registry limit for the length of an entry name
+ * is 256 characters, including the null terminator.
+ * NOTE(review): 512 is presumably 256 two-byte (utf16) characters - confirm.
+ */
+#define HV_KVP_EXCHANGE_MAX_KEY_SIZE    (512)
+
+
+/*
+ * In FreeBSD, we implement the KVP functionality in two components:
+ * 1) The kernel component which is packaged as part of the hv_utils driver
+ * is responsible for communicating with the host and responsible for
+ * implementing the host/guest protocol. 2) A user level daemon that is
+ * responsible for data gathering.
+ *
+ * Host/Guest Protocol: The host iterates over an index and expects the guest
+ * to assign a key name to the index and also return the value corresponding to
+ * the key. The host will have at most one KVP transaction outstanding at any
+ * given point in time. The host-side iteration stops when the guest returns
+ * an error. Microsoft has specified the following mapping of key names to
+ * host-specified index:
+ *
+ *  Index		Key Name
+ *	0		FullyQualifiedDomainName
+ *	1		IntegrationServicesVersion
+ *	2		NetworkAddressIPv4
+ *	3		NetworkAddressIPv6
+ *	4		OSBuildNumber
+ *	5		OSName
+ *	6		OSMajorVersion
+ *	7		OSMinorVersion
+ *	8		OSVersion
+ *	9		ProcessorArchitecture
+ *
+ * The Windows host expects the Key Name and Key Value to be encoded in utf16.
+ *
+ * Guest Kernel/KVP Daemon Protocol: As noted earlier, we implement all of the
+ * data gathering functionality in a user mode daemon. The user level daemon
+ * is also responsible for binding the key name to the index as well. The
+ * kernel and user-level daemon communicate using a connector channel.
+ *
+ * The user mode component first registers with the kernel component.
+ * Subsequently, the kernel component requests data
+ * for the specified keys. In response to this message the user mode component
+ * fills in the value corresponding to the specified key. We overload the
+ * sequence field in the cn_msg header to define our KVP message types.
+ *
+ *
+ * The kernel component simply acts as a conduit for communication between the
+ * Windows host and the user-level daemon. The kernel component passes up the
+ * index received from the Host to the user-level daemon. If the index is
+ * valid (supported), the corresponding key as well as its
+ * value (both are strings) is returned. If the index is invalid
+ * (not supported), a NULL key string is returned.
+ */
+
+ 
+/*
+ * Registry value types, carried in the value_type field of
+ * struct hv_kvp_exchg_msg_value.
+ */
+#define HV_REG_SZ     1
+#define HV_REG_U32    4
+#define HV_REG_U64    8
+
+
+/*
+ * Daemon code supporting IP injection.
+ *
+ * Operation code of the daemon's registration message.  Its value (4) is
+ * the same as HV_KVP_OP_GET_IP_INFO in enum hv_kvp_exchg_op.
+ */
+#define HV_KVP_OP_REGISTER    4
+
+
+/*
+ * KVP operation codes, carried in the `operation' byte of the message
+ * header.  Values are assigned sequentially starting at 0.
+ */
+enum hv_kvp_exchg_op {
+	HV_KVP_OP_GET = 0,
+	HV_KVP_OP_SET,
+	HV_KVP_OP_DELETE,
+	HV_KVP_OP_ENUMERATE,
+	HV_KVP_OP_GET_IP_INFO,
+	HV_KVP_OP_SET_IP_INFO,
+	HV_KVP_OP_COUNT /* Number of operations, must be last. */
+};
+
+/*
+ * KVP pool identifiers.  Values are assigned sequentially starting at 0.
+ * NOTE(review): presumably carried in the `pool' byte of the message
+ * header - confirm against the users of this enum.
+ */
+enum hv_kvp_exchg_pool {
+	HV_KVP_POOL_EXTERNAL = 0,
+	HV_KVP_POOL_GUEST,
+	HV_KVP_POOL_AUTO,
+	HV_KVP_POOL_AUTO_EXTERNAL,
+	HV_KVP_POOL_AUTO_INTERNAL,
+	HV_KVP_POOL_COUNT /* Number of pools, must be last. */
+};
+
+/* Address family codes (see addr_family in struct hv_kvp_ipaddr_value). */
+#define ADDR_FAMILY_NONE                 0x00
+#define ADDR_FAMILY_IPV4                 0x01
+#define ADDR_FAMILY_IPV6                 0x02
+
+/* Array lengths, in elements, of the fields of struct hv_kvp_ipaddr_value. */
+#define MAX_ADAPTER_ID_SIZE              128
+#define MAX_IP_ADDR_SIZE                 1024
+#define MAX_GATEWAY_SIZE                 512
+
+
+/*
+ * IP configuration of one adapter.  The string members are arrays of
+ * 16-bit units; the structure is packed.
+ */
+struct hv_kvp_ipaddr_value {
+	uint16_t adapter_id[MAX_ADAPTER_ID_SIZE];
+	uint8_t  addr_family;
+	uint8_t  dhcp_enabled;
+	uint16_t ip_addr[MAX_IP_ADDR_SIZE];
+	uint16_t sub_net[MAX_IP_ADDR_SIZE];
+	uint16_t gate_way[MAX_GATEWAY_SIZE];
+	uint16_t dns_addr[MAX_IP_ADDR_SIZE];
+}__attribute__((packed));
+
+/* Four-byte KVP message header: operation code, pool and padding. */
+struct hv_kvp_hdr {
+	uint8_t                 operation;
+	uint8_t                 pool;
+	uint16_t                pad;
+} __attribute__((packed));
+
+/*
+ * One key/value pair.  value_type is one of the HV_REG_* codes and selects
+ * which member of msg_value is meaningful; key_size and value_size give
+ * the sizes of the key and of the value.
+ */
+struct hv_kvp_exchg_msg_value {
+	uint32_t value_type;
+	uint32_t key_size;
+	uint32_t value_size;
+	uint8_t  key[HV_KVP_EXCHANGE_MAX_KEY_SIZE];
+	union {
+		uint8_t  value[HV_KVP_EXCHANGE_MAX_VALUE_SIZE];
+		uint32_t value_u32;
+		uint64_t value_u64;
+	} msg_value;
+} __attribute__((packed));
+
+/* Body of an ENUMERATE message: the index plus the pair found there. */
+struct hv_kvp_msg_enumerate {
+	uint32_t index;
+	struct hv_kvp_exchg_msg_value data;
+} __attribute__((packed));
+
+/* Body of a GET message. */
+struct hv_kvp_msg_get {
+	struct hv_kvp_exchg_msg_value data;
+} __attribute__((packed));
+
+/* Body of a SET message. */
+struct hv_kvp_msg_set {
+	struct hv_kvp_exchg_msg_value data;
+} __attribute__((packed));
+
+/* Body of a DELETE message: only the key is needed. */
+struct hv_kvp_msg_delete {
+	uint32_t key_size;
+	uint8_t key[HV_KVP_EXCHANGE_MAX_KEY_SIZE];
+} __attribute__((packed));
+
+/* Body of the daemon registration message. */
+struct hv_kvp_register {
+	uint8_t version[HV_KVP_EXCHANGE_MAX_KEY_SIZE];
+} __attribute__((packed));
+
+/*
+ * Complete KVP message: a header followed by an operation specific body.
+ * The header is a union, so the same four bytes hold either the
+ * operation header or an error code.
+ */
+struct hv_kvp_msg {
+	union {
+		struct hv_kvp_hdr kvp_hdr;
+		uint32_t error;
+	} hdr;
+	union {
+		struct hv_kvp_msg_get		kvp_get;
+		struct hv_kvp_msg_set		kvp_set;
+		struct hv_kvp_msg_delete	kvp_delete;
+		struct hv_kvp_msg_enumerate	kvp_enum_data;
+		struct hv_kvp_ipaddr_value	kvp_ip_val;
+		struct hv_kvp_register		kvp_register;
+	} body;
+} __attribute__((packed));
+
+/*
+ * IP information message.  Unlike struct hv_kvp_msg there is no padding
+ * after operation and pool, so kvp_ip_val starts at byte offset 2.
+ */
+struct hv_kvp_ip_msg {
+	uint8_t operation;
+	uint8_t pool;
+	struct hv_kvp_ipaddr_value      kvp_ip_val;
+} __attribute__((packed));
+
+#endif /* _KVP_H */
diff --git a/sys/dev/hyperv/utilities/hv_snapshot.c b/sys/dev/hyperv/utilities/hv_snapshot.c
new file mode 100644
index 000000000000..45defe1b0f1e
--- /dev/null
+++ b/sys/dev/hyperv/utilities/hv_snapshot.c
@@ -0,0 +1,1061 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/lock.h>
+#include <sys/taskqueue.h>
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/kthread.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/un.h>
+#include <sys/endian.h>
+#include <sys/sema.h>
+#include <sys/signal.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/mutex.h>
+#include <sys/callout.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/utilities/hv_utilreg.h>
+#include <dev/hyperv/utilities/vmbus_icreg.h>
+#include <dev/hyperv/utilities/vmbus_icvar.h>
+
+#include "hv_snapshot.h"
+#include "vmbus_if.h"
+
+/* VSS message version: 5.0. */
+#define VSS_MAJOR		5
+#define VSS_MINOR		0
+#define VSS_MSGVER		VMBUS_IC_VERSION(VSS_MAJOR, VSS_MINOR)
+
+/* Framework version: 3.0. */
+#define VSS_FWVER_MAJOR		3
+#define VSS_FWVER		VMBUS_IC_VERSION(VSS_FWVER_MAJOR, 0)
+
+/* Timeout limit; the unit is seconds. */
+#define TIMEOUT_LIMIT		(15)	// seconds
+/*
+ * VSS operation codes as found in the `operation' byte of
+ * struct hv_vss_hdr.  Values are assigned sequentially starting at 0.
+ */
+enum hv_vss_op {
+	VSS_OP_CREATE = 0,
+	VSS_OP_DELETE,
+	VSS_OP_HOT_BACKUP,
+	VSS_OP_GET_DM_INFO,
+	VSS_OP_BU_COMPLETE,
+	/*
+	 * Following operations are only supported with IC version >= 5.0
+	 */
+	VSS_OP_FREEZE, /* Freeze the file systems in the VM */
+	VSS_OP_THAW, /* Unfreeze the file systems */
+	VSS_OP_AUTO_RECOVER,
+	VSS_OP_COUNT /* Number of operations, must be last */
+};
+
+/*
+ * Header for all VSS messages: the generic IC message header followed by
+ * the VSS operation code (enum hv_vss_op) and reserved bytes.
+ */
+struct hv_vss_hdr {
+	struct vmbus_icmsg_hdr	ic_hdr;
+	uint8_t			operation;
+	uint8_t			reserved[7];
+} __packed;
+
+
+/*
+ * Flag values for struct hv_vss_check_feature.  Only one value is
+ * supported here.
+ */
+#define VSS_HBU_NO_AUTO_RECOVERY		0x00000005
+
+/* Body carrying the feature flags. */
+struct hv_vss_check_feature {
+	uint32_t flags;
+} __packed;
+
+/* Body carrying the dm_info flags. */
+struct hv_vss_check_dm_info {
+	uint32_t flags;
+} __packed;
+
+/* VSS message as exchanged with the host: header plus body. */
+struct hv_vss_msg {
+	union {
+		struct hv_vss_hdr vss_hdr;
+	} hdr;
+	union {
+		struct hv_vss_check_feature vss_cf;
+		struct hv_vss_check_dm_info dm_info;
+	} body;
+} __packed;
+
+/* One request: the daemon-facing message paired with the host message. */
+struct hv_vss_req {
+	struct hv_vss_opt_msg	opt_msg;	/* used to communicate with daemon */
+	struct hv_vss_msg	msg;		/* used to communicate with host */
+} __packed;
+
+/*
+ * hv_vss debug control: 0 logs nothing, a value above 0 enables error
+ * messages and a value above 1 additionally enables info messages.
+ */
+static int hv_vss_log = 0;
+
+/* Log at LOG_ERR with an "hv_vss: " prefix when hv_vss_log > 0. */
+#define	hv_vss_log_error(...)	do {				\
+	if (hv_vss_log > 0)					\
+		log(LOG_ERR, "hv_vss: " __VA_ARGS__);		\
+} while (0)
+
+/* Log at LOG_INFO with an "hv_vss: " prefix when hv_vss_log > 1. */
+#define	hv_vss_log_info(...) do {				\
+	if (hv_vss_log > 1)					\
+		log(LOG_INFO, "hv_vss: " __VA_ARGS__);		\
+} while (0)
+
+/* IC descriptor table: the GUID and description of the Hyper-V VSS device. */
+static const struct vmbus_ic_desc vmbus_vss_descs[] = {
+	{
+		.ic_guid = { .hv_guid = {
+		    0x29, 0x2e, 0xfa, 0x35, 0x23, 0xea, 0x36, 0x42,
+		    0x96, 0xae, 0x3a, 0x6e, 0xba, 0xcb, 0xa4,  0x40} },
+		.ic_desc = "Hyper-V VSS"
+	},
+	VMBUS_IC_DESC_END
+};
+
+/*
+ * Printable names used in log messages, indexed by the `opt' field of
+ * struct hv_vss_opt_msg.  The table has four entries.
+ */
+static const char * vss_opt_name[] = {"None", "VSSCheck", "Freeze", "Thaw"};
+
+/* character device prototypes */
+/* Entry points of the daemon (file system freeze/thaw) device. */
+static d_open_t		hv_vss_dev_open;
+static d_close_t	hv_vss_dev_close;
+static d_poll_t		hv_vss_dev_daemon_poll;
+static d_ioctl_t	hv_vss_dev_daemon_ioctl;
+
+/* Entry points of the application freeze/thaw device. */
+static d_open_t		hv_appvss_dev_open;
+static d_close_t	hv_appvss_dev_close;
+static d_poll_t		hv_appvss_dev_poll;
+static d_ioctl_t	hv_appvss_dev_ioctl;
+
+/* hv_vss character device structure */
+static struct cdevsw hv_vss_cdevsw =
+{
+	.d_version	= D_VERSION,
+	.d_open		= hv_vss_dev_open,
+	.d_close	= hv_vss_dev_close,
+	.d_poll		= hv_vss_dev_daemon_poll,
+	.d_ioctl	= hv_vss_dev_daemon_ioctl,
+	.d_name		= FS_VSS_DEV_NAME,
+};
+
+/* hv_appvss character device structure */
+static struct cdevsw hv_appvss_cdevsw =
+{
+	.d_version	= D_VERSION,
+	.d_open		= hv_appvss_dev_open,
+	.d_close	= hv_appvss_dev_close,
+	.d_poll		= hv_appvss_dev_poll,
+	.d_ioctl	= hv_appvss_dev_ioctl,
+	.d_name		= APP_VSS_DEV_NAME,
+};
+
+struct hv_vss_sc;
+/*
+ * Per character device state.  struct hv_vss_sc embeds one instance for
+ * the daemon device and one for the application device.
+ */
+struct hv_vss_dev_sc {
+	/*
+	 * A message from the host is first put on the notify queue and
+	 * then moved to the ack queue.  Finally, it is recycled to the
+	 * free list.
+	 */
+	STAILQ_HEAD(, hv_vss_req_internal) 	to_notify_queue;
+	STAILQ_HEAD(, hv_vss_req_internal) 	to_ack_queue;
+	/* Back pointer to the owning driver softc. */
+	struct hv_vss_sc			*sc;
+	struct proc				*proc_task;
+	/* Select/poll bookkeeping, woken when a request is queued. */
+	struct selinfo				hv_vss_selinfo;
+};
+/*
+ * Global state to track and synchronize the transaction requests from the host.
+ * The VSS allows users to register their own function to do freeze/thaw for
+ * an application.  The VSS kernel part notifies both the vss daemon and the
+ * user application, if one is registered.
+ * The implementation state transition is illustrated by:
+ * https://clovertrail.github.io/assets/vssdot.png
+ */
+typedef struct hv_vss_sc {
+	struct vmbus_ic_softc			util_sc;
+	device_t				dev;
+
+	/* Work item queued by the channel callback. */
+	struct task				task;
+
+	/*
+	 * This mutex protects access to the list/queues; the callout in a
+	 * request also uses this mutex.
+	 */
+	struct mtx				pending_mutex;
+	/*
+	 * req_free_list contains all free items
+	 */
+	LIST_HEAD(, hv_vss_req_internal)	req_free_list;
+
+	/* Indicates if daemon registered with driver */
+	boolean_t				register_done;
+
+	/* NOTE(review): presumably set once an application registered. */
+	boolean_t				app_register_done;
+
+	/* cdev for file system freeze/thaw */
+	struct cdev				*hv_vss_dev;
+	/* cdev for application freeze/thaw */
+	struct cdev				*hv_appvss_dev;
+
+	/* sc for app */
+	struct hv_vss_dev_sc			app_sc;
+	/* sc for daemon */
+	struct hv_vss_dev_sc			daemon_sc;
+} hv_vss_sc;
+
+/*
+ * Driver internal bookkeeping for one host request.
+ */
+typedef struct hv_vss_req_internal {
+	/* Linkage on the softc's req_free_list. */
+	LIST_ENTRY(hv_vss_req_internal)		link;
+	/* Linkage on a notify or ack queue. */
+	STAILQ_ENTRY(hv_vss_req_internal)	slink;
+	struct hv_vss_req			vss_req;
+
+	/* Rcv buffer for communicating with the host*/
+	uint8_t					*rcv_buf;
+	/* Length of host message */
+	uint32_t				host_msg_len;
+	/* Host message id */
+	uint64_t				host_msg_id;
+
+	/* Owning driver softc. */
+	hv_vss_sc				*sc;
+
+	struct callout				callout;
+} hv_vss_req_internal;
+
+/*
+ * Look up the request whose message id equals `id' on `queue' and unlink
+ * it.  On return `reqp' points to the removed request, or is NULL when no
+ * request matched (the traversal ends with a NULL iterator).
+ *
+ *  reqp  - iterator variable, receives the result
+ *  queue - pointer to the STAILQ head to search
+ *  link  - name of the STAILQ_ENTRY member
+ *  tmp   - scratch variable of the same type as reqp
+ *  id    - message id to look for
+ *
+ * The name indicates that the caller is expected to hold the lock that
+ * protects the queue.  `reqp' and `id' are parenthesized so that any
+ * expression may safely be passed for them.
+ */
+#define SEARCH_REMOVE_REQ_LOCKED(reqp, queue, link, tmp, id)		\
+	do {								\
+		STAILQ_FOREACH_SAFE(reqp, queue, link, tmp) {		\
+			if ((reqp)->vss_req.opt_msg.msgid == (id)) {	\
+				STAILQ_REMOVE(queue,			\
+				    reqp, hv_vss_req_internal, link);	\
+				break;					\
+			}						\
+		}							\
+	} while (0)
+
+/*
+ * A daemon that opened the device at some point (proc_task is set) but is
+ * no longer registered is taken to have been killed.
+ */
+static bool
+hv_vss_is_daemon_killed_after_launch(hv_vss_sc *sc)
+{
+	if (sc->register_done)
+		return (false);
+
+	return (sc->daemon_sc.proc_task != NULL);
+}
+
+/*
+ * Callback routine that gets called whenever there is a message from host.
+ *
+ * The worker task is queued when a daemon is registered or has been seen
+ * before (proc_task is set); otherwise the message is only logged.
+ */
+static void
+hv_vss_callback(struct vmbus_channel *chan __unused, void *context)
+{
+	hv_vss_sc *sc = (hv_vss_sc*)context;
+
+	/* Report a killed daemon once per callback. */
+	if (hv_vss_is_daemon_killed_after_launch(sc))
+		hv_vss_log_info("%s: daemon was killed!\n", __func__);
+	if (sc->register_done || sc->daemon_sc.proc_task) {
+		hv_vss_log_info("%s: Queuing work item\n", __func__);
+		taskqueue_enqueue(taskqueue_thread, &sc->task);
+	} else {
+		hv_vss_log_info("%s: daemon has never been registered\n", __func__);
+	}
+	hv_vss_log_info("%s: received msg from host\n", __func__);
+}
+/*
+ * Send the response back to the host: the status and the response flags
+ * are written into the IC header at the start of rcv_buf, then the buffer
+ * is sent on the channel.  A send failure is only logged.
+ */
+static void
+hv_vss_respond_host(uint8_t *rcv_buf, struct vmbus_channel *ch,
+    uint32_t recvlen, uint64_t requestid, uint32_t error)
+{
+	struct vmbus_icmsg_hdr *ichdr = (struct vmbus_icmsg_hdr *)rcv_buf;
+	int send_err;
+
+	ichdr->ic_status = error;
+	ichdr->ic_flags = HV_ICMSGHDRFLAG_TRANSACTION | HV_ICMSGHDRFLAG_RESPONSE;
+
+	send_err = vmbus_chan_send(ch, VMBUS_CHANPKT_TYPE_INBAND, 0,
+	    rcv_buf, recvlen, requestid);
+	if (send_err != 0) {
+		hv_vss_log_info("%s: hv_vss_respond_host: sendpacket error:%d\n",
+		    __func__, send_err);
+	}
+}
+
+/*
+ * Report the outcome of a request to the host and put the request back on
+ * the free list.  For a HV_VSS_CHECK request the feature flags of the
+ * reply are filled in first.  The name indicates that the caller holds
+ * pending_mutex.
+ */
+static void
+hv_vss_notify_host_result_locked(struct hv_vss_req_internal *reqp, uint32_t status)
+{
+	hv_vss_sc *sc = reqp->sc;
+	struct hv_vss_msg *hostmsg = (struct hv_vss_msg *)reqp->rcv_buf;
+	const char *outcome = (status == HV_S_OK) ? "Success" : "Fail";
+
+	if (reqp->vss_req.opt_msg.opt == HV_VSS_CHECK)
+		hostmsg->body.vss_cf.flags = VSS_HBU_NO_AUTO_RECOVERY;
+
+	hv_vss_log_info("%s, %s response %s to host\n", __func__,
+	    vss_opt_name[reqp->vss_req.opt_msg.opt], outcome);
+
+	hv_vss_respond_host(reqp->rcv_buf, vmbus_get_channel(sc->dev),
+	    reqp->host_msg_len, reqp->host_msg_id, status);
+
+	/* The request is done; make it available for reuse. */
+	LIST_INSERT_HEAD(&sc->req_free_list, reqp, link);
+}
+
+/*
+ * Same as hv_vss_notify_host_result_locked(), but takes pending_mutex
+ * around the call.
+ */
+static void
+hv_vss_notify_host_result(struct hv_vss_req_internal *reqp, uint32_t status)
+{
+	struct mtx *lock = &reqp->sc->pending_mutex;
+
+	mtx_lock(lock);
+	hv_vss_notify_host_result_locked(reqp, status);
+	mtx_unlock(lock);
+}
+
+/*
+ * Translate the host operation of a request into the user visible option
+ * code, store it in the request and copy the resulting option message to
+ * `userdata'.  Host operations other than freeze, thaw and hot backup map
+ * to HV_VSS_NONE.
+ */
+static void
+hv_vss_cp_vssreq_to_user(struct hv_vss_req_internal *reqp,
+    struct hv_vss_opt_msg *userdata)
+{
+	struct hv_vss_opt_msg *optmsg = &reqp->vss_req.opt_msg;
+
+	switch (reqp->vss_req.msg.hdr.vss_hdr.operation) {
+	case VSS_OP_FREEZE:
+		optmsg->opt = HV_VSS_FREEZE;
+		break;
+	case VSS_OP_THAW:
+		optmsg->opt = HV_VSS_THAW;
+		break;
+	case VSS_OP_HOT_BACKUP:
+		optmsg->opt = HV_VSS_CHECK;
+		break;
+	default:
+		optmsg->opt = HV_VSS_NONE;
+		break;
+	}
+
+	*userdata = *optmsg;
+	hv_vss_log_info("%s, read data from user for "
+	    "%s (%ju) \n", __func__, vss_opt_name[userdata->opt],
+	    (uintmax_t)userdata->msgid);
+}
+
+/**
+ * Find the request with the given id on the notify and ack queues of the
+ * daemon and of the app, and unlink it from the queue it is found on.
+ *
+ * The queues are searched in this order: daemon notify, daemon ack, app
+ * notify, app ack.  Returns the unlinked request, or NULL if it is on none
+ * of them.  Putting the request on the free list is left to the caller.
+ */
+static struct hv_vss_req_internal*
+hv_vss_drain_req_queue_locked(hv_vss_sc *sc, uint64_t req_id)
+{
+	struct hv_vss_req_internal *found, *next;
+
+	SEARCH_REMOVE_REQ_LOCKED(found, &sc->daemon_sc.to_notify_queue,
+	    slink, next, req_id);
+	if (found != NULL)
+		return (found);
+
+	SEARCH_REMOVE_REQ_LOCKED(found, &sc->daemon_sc.to_ack_queue,
+	    slink, next, req_id);
+	if (found != NULL)
+		return (found);
+
+	SEARCH_REMOVE_REQ_LOCKED(found, &sc->app_sc.to_notify_queue,
+	    slink, next, req_id);
+	if (found != NULL)
+		return (found);
+
+	SEARCH_REMOVE_REQ_LOCKED(found, &sc->app_sc.to_ack_queue,
+	    slink, next, req_id);
+	return (found);
+}
+/**
+ * Serve an IOCHVVSSREAD from the daemon or the application.
+ *
+ * The request at the head of the caller's notify queue is described in
+ * *userdata and moved to the tail of the caller's ack queue, and the status
+ * is set to VSS_SUCCESS.  When the notify queue is empty (the request has
+ * timed out and was removed) the status is set to VSS_FAIL instead.
+ */
+static void
+hv_vss_notified(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
+{
+	struct hv_vss_req_internal *head;
+
+	mtx_lock(&dev_sc->sc->pending_mutex);
+	head = STAILQ_FIRST(&dev_sc->to_notify_queue);
+	if (head == NULL) {
+		/* Timeout occur, thus request was removed from queue. */
+		hv_vss_log_info("%s: notify queue is empty!\n", __func__);
+		userdata->status = VSS_FAIL;
+	} else {
+		hv_vss_cp_vssreq_to_user(head, userdata);
+		STAILQ_REMOVE_HEAD(&dev_sc->to_notify_queue, slink);
+		/* The request now waits for the matching write (ack). */
+		STAILQ_INSERT_TAIL(&dev_sc->to_ack_queue, head, slink);
+		userdata->status = VSS_SUCCESS;
+	}
+	mtx_unlock(&dev_sc->sc->pending_mutex);
+}
+
+/**
+ * Queue a request on the notify queue of the given endpoint (daemon or
+ * application) and wake up any thread selecting/polling on that endpoint.
+ */
+static void
+hv_vss_notify(struct hv_vss_dev_sc *dev_sc, struct hv_vss_req_internal *reqp)
+{
+	hv_vss_sc *sc = dev_sc->sc;
+	uint32_t opt = reqp->vss_req.opt_msg.opt;
+	const char *target;
+
+	mtx_lock(&sc->pending_mutex);
+	STAILQ_INSERT_TAIL(&dev_sc->to_notify_queue, reqp, slink);
+	target = (&sc->app_sc == dev_sc) ? "app" : "daemon";
+	hv_vss_log_info("%s: issuing query %s (%ju) to %s\n", __func__,
+	    vss_opt_name[opt], (uintmax_t)reqp->vss_req.opt_msg.msgid,
+	    target);
+	mtx_unlock(&sc->pending_mutex);
+	/* The wakeup is issued after the mutex has been dropped. */
+	selwakeup(&dev_sc->hv_vss_selinfo);
+}
+
+/**
+ * Handle an acknowledgement (IOCHVVSSWRITE) from the file system daemon.
+ *
+ * The request identified by userdata->msgid is removed from the daemon's
+ * ack queue.  A successful THAW is forwarded to the application when one is
+ * registered; in every other case the timeout callout is drained and the
+ * result is reported to the host, which also recycles the request.  When no
+ * matching request is found (it already timed out) userdata->status is set
+ * to VSS_FAIL.
+ *
+ * userdata is filled in by user space.  Only its msgid and status are used
+ * as input; the operation used for logging and dispatch is the one the
+ * kernel recorded in the request.  The caller-supplied operation is merely
+ * cross-checked with KASSERT(), which is compiled out without INVARIANTS,
+ * so it must not be used to index vss_opt_name[].
+ */
+static void
+hv_vss_daemon_acked(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
+{
+	struct hv_vss_req_internal	*reqp, *tmp;
+	uint64_t			req_id;
+	uint32_t			opt;
+	uint32_t			status;
+
+	req_id = userdata->msgid;
+	status = userdata->status;
+	/* make sure the reserved fields are all zeros. */
+	memset(&userdata->reserved, 0, sizeof(struct hv_vss_opt_msg) -
+	    __offsetof(struct hv_vss_opt_msg, reserved));
+	mtx_lock(&dev_sc->sc->pending_mutex);
+	SEARCH_REMOVE_REQ_LOCKED(reqp, &dev_sc->to_ack_queue, slink, tmp, req_id);
+	mtx_unlock(&dev_sc->sc->pending_mutex);
+	if (reqp == NULL) {
+		hv_vss_log_info("%s Timeout: fail to find daemon ack request\n",
+		    __func__);
+		userdata->status = VSS_FAIL;
+		return;
+	}
+	KASSERT(userdata->opt == reqp->vss_req.opt_msg.opt,
+	    ("Mismatched VSS operation!"));
+	/* Trust the kernel's record of the operation, not the caller's. */
+	opt = reqp->vss_req.opt_msg.opt;
+	hv_vss_log_info("%s, get response %d from daemon for %s (%ju) \n", __func__,
+	    status, vss_opt_name[opt], (uintmax_t)req_id);
+	switch (opt) {
+	case HV_VSS_CHECK:
+	case HV_VSS_FREEZE:
+		/* The daemon is the last stage for check/freeze. */
+		callout_drain(&reqp->callout);
+		hv_vss_notify_host_result(reqp,
+		    status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
+		break;
+	case HV_VSS_THAW:
+		if (dev_sc->sc->app_register_done) {
+			if (status == VSS_SUCCESS) {
+				/* Next stage: the application; timer keeps running. */
+				hv_vss_notify(&dev_sc->sc->app_sc, reqp);
+			} else {
+				/* handle error */
+				callout_drain(&reqp->callout);
+				hv_vss_notify_host_result(reqp, HV_E_FAIL);
+			}
+		} else {
+			callout_drain(&reqp->callout);
+			hv_vss_notify_host_result(reqp,
+			    status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+/**
+ * Handle an acknowledgement (IOCHVVSSWRITE) from the application.
+ *
+ * The request identified by userdata->msgid is removed from the
+ * application's ack queue.  While the daemon is registered, a successful
+ * CHECK/FREEZE is forwarded to the daemon and everything else is completed
+ * towards the host.  When the daemon is gone the request is completed with
+ * HV_E_FAIL right away; leaving it dequeued with its timer armed would make
+ * the timeout handler fail to find it in any queue.  When no matching
+ * request is found (it already timed out) userdata->status is set to
+ * VSS_FAIL.
+ *
+ * userdata is filled in by user space.  The status is kept at its full 32
+ * bits (a narrower local would turn e.g. 0x100 into VSS_SUCCESS), and the
+ * operation used for logging and dispatch is the one the kernel recorded
+ * in the request; the caller-supplied one is only cross-checked with
+ * KASSERT(), which is compiled out without INVARIANTS.
+ */
+static void
+hv_vss_app_acked(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
+{
+	struct hv_vss_req_internal	*reqp, *tmp;
+	uint64_t			req_id;
+	uint32_t			opt;
+	uint32_t			status;
+
+	req_id = userdata->msgid;
+	status = userdata->status;
+	/* make sure the reserved fields are all zeros. */
+	memset(&userdata->reserved, 0, sizeof(struct hv_vss_opt_msg) -
+	    __offsetof(struct hv_vss_opt_msg, reserved));
+	mtx_lock(&dev_sc->sc->pending_mutex);
+	SEARCH_REMOVE_REQ_LOCKED(reqp, &dev_sc->to_ack_queue, slink, tmp, req_id);
+	mtx_unlock(&dev_sc->sc->pending_mutex);
+	if (reqp == NULL) {
+		hv_vss_log_info("%s Timeout: fail to find app ack request\n",
+		    __func__);
+		userdata->status = VSS_FAIL;
+		return;
+	}
+	KASSERT(userdata->opt == reqp->vss_req.opt_msg.opt,
+	    ("Mismatched VSS operation!"));
+	/* Trust the kernel's record of the operation, not the caller's. */
+	opt = reqp->vss_req.opt_msg.opt;
+	hv_vss_log_info("%s, get response %d from app for %s (%ju) \n",
+	    __func__, status, vss_opt_name[opt], (uintmax_t)req_id);
+	if (dev_sc->sc->register_done) {
+		switch (opt) {
+		case HV_VSS_CHECK:
+		case HV_VSS_FREEZE:
+			if (status == VSS_SUCCESS) {
+				/* Next stage: the daemon; timer keeps running. */
+				hv_vss_notify(&dev_sc->sc->daemon_sc, reqp);
+			} else {
+				/* handle error */
+				callout_drain(&reqp->callout);
+				hv_vss_notify_host_result(reqp, HV_E_FAIL);
+			}
+			break;
+		case HV_VSS_THAW:
+			/* The application is the last stage for thaw. */
+			callout_drain(&reqp->callout);
+			hv_vss_notify_host_result(reqp,
+			    status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
+			break;
+		default:
+			break;
+		}
+	} else {
+		hv_vss_log_info("%s, Fatal: vss daemon was killed\n", __func__);
+		/* Nobody is left to finish the request: fail it now. */
+		callout_drain(&reqp->callout);
+		hv_vss_notify_host_result(reqp, HV_E_FAIL);
+	}
+}
+
+/**
+ * Open handler of the file system daemon device.
+ *
+ * Only one opener is allowed at a time: the registration flag is tested
+ * and set under the pending mutex so that two concurrent opens cannot both
+ * succeed.  On success the channel callback is invoked once and the opening
+ * process is remembered in proc_task.
+ *
+ * Returns 0 on success, EBUSY when the device is already open.
+ */
+static int
+hv_vss_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
+	struct proc *td_proc = td->td_proc;
+	bool busy;
+
+	mtx_lock(&dev_sc->sc->pending_mutex);
+	busy = dev_sc->sc->register_done;
+	if (!busy)
+		dev_sc->sc->register_done = true;
+	mtx_unlock(&dev_sc->sc->pending_mutex);
+
+	if (busy) {
+		hv_vss_log_info("%s: %s fails to open device \"%s\": busy.\n",
+		    __func__, td_proc->p_comm, FS_VSS_DEV_NAME);
+		return (EBUSY);
+	}
+
+	/* Report success only once the open has really been accepted. */
+	hv_vss_log_info("%s: %s opens device \"%s\" successfully.\n",
+	    __func__, td_proc->p_comm, FS_VSS_DEV_NAME);
+
+	hv_vss_callback(vmbus_get_channel(dev_sc->sc->dev), dev_sc->sc);
+
+	dev_sc->proc_task = curproc;
+	return (0);
+}
+
+/**
+ * Close handler of the file system daemon device: logs the closing process
+ * and clears the daemon registration flag.  Always returns 0.
+ */
+static int
+hv_vss_dev_close(struct cdev *dev, int fflag __unused, int devtype __unused,
+				 struct thread *td)
+{
+	struct hv_vss_dev_sc *daemon_sc = (struct hv_vss_dev_sc *)dev->si_drv1;
+	struct proc *closer = td->td_proc;
+
+	hv_vss_log_info("%s: %s closes device \"%s\"\n",
+	    __func__, closer->p_comm, FS_VSS_DEV_NAME);
+	daemon_sc->sc->register_done = false;
+	return (0);
+}
+
+/**
+ * ioctl handler of the file system daemon device.
+ *
+ * IOCHVVSSREAD hands the next pending request to the daemon and
+ * IOCHVVSSWRITE delivers the daemon's acknowledgement.  Any other command
+ * is ignored.  Always returns 0; the outcome of a read or write is
+ * reported through the status field of the message.
+ */
+static int
+hv_vss_dev_daemon_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
+    struct thread *td)
+{
+	struct hv_vss_dev_sc *daemon_sc = (struct hv_vss_dev_sc *)dev->si_drv1;
+	struct hv_vss_opt_msg *optmsg = (struct hv_vss_opt_msg *)data;
+	struct proc *caller = td->td_proc;
+
+	hv_vss_log_info("%s: %s invoked vss ioctl\n", __func__, caller->p_comm);
+
+	if (cmd == IOCHVVSSREAD)
+		hv_vss_notified(daemon_sc, optmsg);
+	else if (cmd == IOCHVVSSWRITE)
+		hv_vss_daemon_acked(daemon_sc, optmsg);
+	return (0);
+}
+
+/*
+ * Poll handler of the file system daemon device.
+ *
+ * Reports POLLIN when the daemon's notify queue holds a request.  When it
+ * is empty the calling thread is recorded on the endpoint's selinfo and 0
+ * is returned.
+ */
+static int
+hv_vss_dev_daemon_poll(struct cdev *dev, int events, struct thread *td)
+{
+	struct hv_vss_dev_sc *daemon_sc = (struct hv_vss_dev_sc *)dev->si_drv1;
+	int revent;
+
+	mtx_lock(&daemon_sc->sc->pending_mutex);
+	if (STAILQ_EMPTY(&daemon_sc->to_notify_queue)) {
+		/* Nothing to read yet: register for a later wakeup. */
+		revent = 0;
+		selrecord(td, &daemon_sc->hv_vss_selinfo);
+	} else {
+		revent = POLLIN;
+	}
+	hv_vss_log_info("%s return 0x%x\n", __func__, revent);
+	mtx_unlock(&daemon_sc->sc->pending_mutex);
+	return (revent);
+}
+
+/**
+ * Open handler of the application device.
+ *
+ * Only one opener is allowed at a time: the registration flag is tested
+ * and set under the pending mutex so that two concurrent opens cannot both
+ * succeed.  On success the opening process is remembered in proc_task.
+ *
+ * Returns 0 on success, EBUSY when the device is already open.
+ */
+static int
+hv_appvss_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+	struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
+	struct proc *td_proc = td->td_proc;
+	bool busy;
+
+	mtx_lock(&dev_sc->sc->pending_mutex);
+	busy = dev_sc->sc->app_register_done;
+	if (!busy)
+		dev_sc->sc->app_register_done = true;
+	mtx_unlock(&dev_sc->sc->pending_mutex);
+
+	if (busy) {
+		hv_vss_log_info("%s: %s fails to open device \"%s\": busy.\n",
+		    __func__, td_proc->p_comm, APP_VSS_DEV_NAME);
+		return (EBUSY);
+	}
+
+	/* Report success only once the open has really been accepted. */
+	hv_vss_log_info("%s: %s opens device \"%s\" successfully.\n",
+	    __func__, td_proc->p_comm, APP_VSS_DEV_NAME);
+
+	dev_sc->proc_task = curproc;
+	return (0);
+}
+
+/**
+ * Close handler of the application device: logs the closing process and
+ * clears the application registration flag.  Always returns 0.
+ */
+static int
+hv_appvss_dev_close(struct cdev *dev, int fflag __unused, int devtype __unused,
+				 struct thread *td)
+{
+	struct hv_vss_dev_sc *app_sc = (struct hv_vss_dev_sc *)dev->si_drv1;
+	struct proc *closer = td->td_proc;
+
+	hv_vss_log_info("%s: %s closes device \"%s\".\n",
+	    __func__, closer->p_comm, APP_VSS_DEV_NAME);
+	app_sc->sc->app_register_done = false;
+	return (0);
+}
+
+/**
+ * ioctl handler of the application device.
+ *
+ * IOCHVVSSREAD hands the next pending request to the application and
+ * IOCHVVSSWRITE delivers the application's acknowledgement.  Any other
+ * command is ignored.  Always returns 0; the outcome of a read or write is
+ * reported through the status field of the message.
+ */
+static int
+hv_appvss_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
+    struct thread *td)
+{
+	struct hv_vss_dev_sc *app_sc = (struct hv_vss_dev_sc *)dev->si_drv1;
+	struct hv_vss_opt_msg *optmsg = (struct hv_vss_opt_msg *)data;
+	struct proc *caller = td->td_proc;
+
+	hv_vss_log_info("%s: %s invoked vss ioctl\n", __func__, caller->p_comm);
+
+	if (cmd == IOCHVVSSREAD)
+		hv_vss_notified(app_sc, optmsg);
+	else if (cmd == IOCHVVSSWRITE)
+		hv_vss_app_acked(app_sc, optmsg);
+	return (0);
+}
+
+/*
+ * Poll handler of the application device.
+ *
+ * Reports POLLIN when the application's notify queue holds a request.
+ * When it is empty the calling thread is recorded on the endpoint's selinfo
+ * and 0 is returned.
+ */
+static int
+hv_appvss_dev_poll(struct cdev *dev, int events, struct thread *td)
+{
+	struct hv_vss_dev_sc *app_sc = (struct hv_vss_dev_sc *)dev->si_drv1;
+	int revent;
+
+	mtx_lock(&app_sc->sc->pending_mutex);
+	if (STAILQ_EMPTY(&app_sc->to_notify_queue)) {
+		/* Nothing to read yet: register for a later wakeup. */
+		revent = 0;
+		selrecord(td, &app_sc->hv_vss_selinfo);
+	} else {
+		revent = POLLIN;
+	}
+	hv_vss_log_info("%s return 0x%x\n", __func__, revent);
+	mtx_unlock(&app_sc->sc->pending_mutex);
+	return (revent);
+}
+
+/**
+ * Callout handler fired when user space did not finish a request in time.
+ *
+ * It asserts that the pending mutex is held on entry, unlinks the request
+ * from whichever queue still holds it, and reports HV_E_FAIL to the host,
+ * which also returns the request to the free list.
+ */
+static void
+hv_vss_timeout(void *arg)
+{
+	hv_vss_req_internal *expired = arg;
+	hv_vss_sc *sc = expired->sc;
+	hv_vss_req_internal *unlinked;
+
+	KASSERT(mtx_owned(&sc->pending_mutex), ("mutex lock is not owned!"));
+	unlinked = hv_vss_drain_req_queue_locked(sc,
+	    expired->vss_req.opt_msg.msgid);
+	KASSERT(unlinked != NULL, ("timeout but fail to find request"));
+	hv_vss_notify_host_result_locked(expired, HV_E_FAIL);
+}
+
+/*
+ * Prepare a request taken from the free list for a message just received
+ * from the host.
+ *
+ * Everything in front of the callout member is zeroed, the host message
+ * length, host request id, receive buffer pointer and softc are recorded,
+ * the host message is copied into the request, and the user-visible option
+ * is derived from the host operation (it stays HV_VSS_NONE, i.e. zero, for
+ * any other operation).  The transaction id given to user space is the
+ * current time in nanoseconds.
+ */
+static void
+hv_vss_init_req(hv_vss_req_internal *reqp,
+    uint32_t recvlen, uint64_t requestid, uint8_t *vss_buf, hv_vss_sc *sc)
+{
+	struct hv_vss_msg *host_msg = (struct hv_vss_msg *)vss_buf;
+	struct timespec now;
+
+	/* The callout embedded in the request must survive the reset. */
+	memset(reqp, 0, __offsetof(hv_vss_req_internal, callout));
+	reqp->sc = sc;
+	reqp->rcv_buf = vss_buf;
+	reqp->host_msg_id = requestid;
+	reqp->host_msg_len = recvlen;
+	memcpy(&reqp->vss_req.msg, host_msg, sizeof(struct hv_vss_msg));
+
+	/* set the opt for users */
+	switch (host_msg->hdr.vss_hdr.operation) {
+	case VSS_OP_FREEZE:
+		reqp->vss_req.opt_msg.opt = HV_VSS_FREEZE;
+		break;
+	case VSS_OP_THAW:
+		reqp->vss_req.opt_msg.opt = HV_VSS_THAW;
+		break;
+	case VSS_OP_HOT_BACKUP:
+		reqp->vss_req.opt_msg.opt = HV_VSS_CHECK;
+		break;
+	}
+
+	/* Use a timestamp as msg request ID */
+	nanotime(&now);
+	reqp->vss_req.opt_msg.msgid = (now.tv_sec * NANOSEC) + now.tv_nsec;
+}
+
+/**
+ * Take a request off the free list for a new host message.
+ *
+ * Returns NULL when any of the four notify/ack queues is non-empty (a
+ * previous request is still in flight) or when the free list is empty;
+ * otherwise returns the request unlinked from the head of the free list.
+ */
+static hv_vss_req_internal*
+hv_vss_get_new_req_locked(hv_vss_sc *sc)
+{
+	hv_vss_req_internal *fresh;
+	bool in_flight;
+
+	in_flight = !STAILQ_EMPTY(&sc->daemon_sc.to_notify_queue) ||
+	    !STAILQ_EMPTY(&sc->daemon_sc.to_ack_queue) ||
+	    !STAILQ_EMPTY(&sc->app_sc.to_notify_queue) ||
+	    !STAILQ_EMPTY(&sc->app_sc.to_ack_queue);
+	if (in_flight) {
+		/*
+		 * There is request coming from host before
+		 * finishing previous requests
+		 */
+		hv_vss_log_info("%s: Warning: there is new request "
+		    "coming before finishing previous requests\n", __func__);
+		return (NULL);
+	}
+
+	fresh = LIST_FIRST(&sc->req_free_list);
+	if (fresh == NULL) {
+		/* TODO Error: no buffer */
+		hv_vss_log_info("Error: No buffer\n");
+		return (NULL);
+	}
+	LIST_REMOVE(fresh, link);
+	return (fresh);
+}
+
+/**
+ * Hand a freshly initialised request to its first user space stage and
+ * arm the timeout.
+ *
+ * Freeze/Check notification sequence: kernel -> app -> daemon(fs)
+ * Thaw notification sequence:         kernel -> daemon(fs) -> app
+ *
+ * Freeze and hot-backup start at the application when one is registered
+ * and at the daemon otherwise; thaw always starts at the daemon.  The
+ * callout is armed for TIMEOUT_LIMIT * hz ticks and runs hv_vss_timeout()
+ * when it expires.  Other operations are ignored.
+ */
+static void
+hv_vss_start_notify(hv_vss_req_internal *reqp, uint32_t opt)
+{
+	hv_vss_sc *sc = reqp->sc;
+	struct hv_vss_dev_sc *first_stage;
+
+	switch (opt) {
+	case VSS_OP_FREEZE:
+	case VSS_OP_HOT_BACKUP:
+		first_stage = sc->app_register_done ?
+		    &sc->app_sc : &sc->daemon_sc;
+		break;
+	case VSS_OP_THAW:
+		first_stage = &sc->daemon_sc;
+		break;
+	default:
+		return;
+	}
+
+	/* Wake up the first stage, in case it is sitting in poll(). */
+	hv_vss_notify(first_stage, reqp);
+	callout_reset(&reqp->callout, TIMEOUT_LIMIT * hz,
+	    hv_vss_timeout, reqp);
+}
+
+/*
+ * Task handler that reads VSS messages from the host and starts the
+ * interaction with user space.
+ *
+ * context is the hv_vss_sc softc.  Messages are received one at a time
+ * into sc->util_sc.ic_buf until the channel returns an error or an empty
+ * message.  Each message is handled in one of three ways:
+ *   - version negotiation is answered immediately;
+ *   - while the daemon has not been killed after launch, FREEZE, THAW and
+ *     HOT_BACKUP take a request from the free list and notify user space
+ *     (the host is answered later, on acknowledgement or timeout), and
+ *     GET_DM_INFO is answered immediately with HV_S_OK;
+ *   - otherwise the message is answered immediately with HV_E_FAIL.
+ *
+ * NOTE(review): hv_vss_init_req() stores the vss_buf pointer in the request
+ * as rcv_buf, and the same buffer is reused for the next receive below;
+ * confirm that a later message cannot overwrite the data of a request that
+ * is still pending.
+ */
+static void
+hv_vss_process_request(void *context, int pending __unused)
+{
+	uint8_t *vss_buf;
+	struct vmbus_channel *channel;
+	uint32_t recvlen = 0;
+	uint64_t requestid;
+	struct vmbus_icmsg_hdr *icmsghdrp;
+	int ret = 0;
+	hv_vss_sc *sc;
+	hv_vss_req_internal *reqp;
+
+	hv_vss_log_info("%s: entering hv_vss_process_request\n", __func__);
+
+	sc = (hv_vss_sc*)context;
+	vss_buf = sc->util_sc.ic_buf;
+	channel = vmbus_get_channel(sc->dev);
+
+	/* recvlen is in/out: buffer size going in, message length coming out. */
+	recvlen = sc->util_sc.ic_buflen;
+	ret = vmbus_chan_recv(channel, vss_buf, &recvlen, &requestid);
+	KASSERT(ret != ENOBUFS, ("hvvss recvbuf is not large enough"));
+	/* XXX check recvlen to make sure that it contains enough data */
+
+	while ((ret == 0) && (recvlen > 0)) {
+		icmsghdrp = (struct vmbus_icmsg_hdr *)vss_buf;
+
+		if (icmsghdrp->ic_type == HV_ICMSGTYPE_NEGOTIATE) {
+			/* Version negotiation: answer with its result as status. */
+			ret = vmbus_ic_negomsg(&sc->util_sc, vss_buf,
+			    &recvlen, VSS_FWVER, VSS_MSGVER);
+			hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
+			    recvlen, requestid, ret);
+			hv_vss_log_info("%s: version negotiated\n", __func__);
+		} else if (!hv_vss_is_daemon_killed_after_launch(sc)) {
+			struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf;
+			switch(msg->hdr.vss_hdr.operation) {
+			case VSS_OP_FREEZE:
+			case VSS_OP_THAW:
+			case VSS_OP_HOT_BACKUP:
+				/* Only the free-list access needs the mutex. */
+				mtx_lock(&sc->pending_mutex);
+				reqp = hv_vss_get_new_req_locked(sc);
+				mtx_unlock(&sc->pending_mutex);
+				if (reqp == NULL) {
+					/*
+					 * ignore this request from host; no
+					 * response is sent for it.
+					 */
+					break;
+				}
+				hv_vss_init_req(reqp, recvlen, requestid, vss_buf, sc);
+				hv_vss_log_info("%s: receive %s (%ju) from host\n",
+				    __func__,
+				    vss_opt_name[reqp->vss_req.opt_msg.opt],
+				    (uintmax_t)reqp->vss_req.opt_msg.msgid);
+				hv_vss_start_notify(reqp, msg->hdr.vss_hdr.operation);
+				break;
+			case VSS_OP_GET_DM_INFO:
+				/* Answered right here, without user space. */
+				hv_vss_log_info("%s: receive GET_DM_INFO from host\n",
+				    __func__);
+				msg->body.dm_info.flags = 0;
+				hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
+				    recvlen, requestid, HV_S_OK);
+				break;
+			default:
+				/* Unknown operations are logged and not answered. */
+				device_printf(sc->dev, "Unknown opt from host: %d\n",
+				    msg->hdr.vss_hdr.operation);
+				break;
+			}
+		} else {
+			/* daemon was killed for some reason after it was launched */
+			struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf;
+			switch(msg->hdr.vss_hdr.operation) {
+			case VSS_OP_FREEZE:
+				hv_vss_log_info("%s: response fail for FREEZE\n",
+				    __func__);
+				break;
+			case VSS_OP_THAW:
+				hv_vss_log_info("%s: response fail for THAW\n",
+				    __func__);
+				break;
+			case VSS_OP_HOT_BACKUP:
+				hv_vss_log_info("%s: response fail for HOT_BACKUP\n",
+				    __func__);
+				msg->body.vss_cf.flags = VSS_HBU_NO_AUTO_RECOVERY;
+				break;
+			case VSS_OP_GET_DM_INFO:
+				hv_vss_log_info("%s: response fail for GET_DM_INFO\n",
+				    __func__);
+				msg->body.dm_info.flags = 0;
+				break;
+			default:
+				device_printf(sc->dev, "Unknown opt from host: %d\n",
+				    msg->hdr.vss_hdr.operation);
+				break;
+			}
+			/* Every operation, known or not, is failed in this state. */
+			hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
+			    recvlen, requestid, HV_E_FAIL);
+		}
+		/*
+		 * Try reading next buffer
+		 */
+		recvlen = sc->util_sc.ic_buflen;
+		ret = vmbus_chan_recv(channel, vss_buf, &recvlen, &requestid);
+		KASSERT(ret != ENOBUFS, ("hvvss recvbuf is not large enough"));
+		/* XXX check recvlen to make sure that it contains enough data */
+
+		hv_vss_log_info("%s: read: context %p, ret =%d, recvlen=%d\n",
+		    __func__, context, ret, recvlen);
+	}
+}
+
+/**
+ * Device probe method: matches the device against the VSS descriptor
+ * table and returns the result of the generic IC probe.
+ */
+static int
+hv_vss_probe(device_t dev)
+{
+	int result;
+
+	result = vmbus_ic_probe(dev, vmbus_vss_descs);
+	return (result);
+}
+
+/**
+ * Initialise the free list and the four notify/ack queues of the softc and
+ * fill the free list with a fixed pool of zeroed requests.  Each request's
+ * callout is tied to the pending mutex.  Always returns 0 (allocations use
+ * M_WAITOK).
+ */
+static int
+hv_vss_init_send_receive_queue(device_t dev)
+{
+	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
+	const int pool_size = 4; /* It is big enough for the list */
+	struct hv_vss_req_internal *req;
+	int remaining;
+
+	STAILQ_INIT(&sc->daemon_sc.to_notify_queue);
+	STAILQ_INIT(&sc->daemon_sc.to_ack_queue);
+	STAILQ_INIT(&sc->app_sc.to_notify_queue);
+	STAILQ_INIT(&sc->app_sc.to_ack_queue);
+	LIST_INIT(&sc->req_free_list);
+
+	for (remaining = pool_size; remaining > 0; remaining--) {
+		req = malloc(sizeof(*req), M_DEVBUF, M_WAITOK|M_ZERO);
+		callout_init_mtx(&req->callout, &sc->pending_mutex, 0);
+		LIST_INSERT_HEAD(&sc->req_free_list, req, link);
+	}
+	return (0);
+}
+
+/**
+ * Release every request owned by the softc: those still sitting in one of
+ * the four notify/ack queues and those on the free list.  Always returns 0.
+ *
+ * A queued request has its timeout callout armed, so the callout is
+ * drained before the memory is freed; freeing it while armed would leave
+ * the callout subsystem pointing into freed memory.  If the timeout
+ * handler ran while draining it has already moved the request to the free
+ * list, which is why the queue head is re-checked after the drain and why
+ * the free list is released last.
+ *
+ * No lock is taken here: callout_drain() must not be called with the
+ * callout's mutex held.
+ */
+static int
+hv_vss_destroy_send_receive_queue(device_t dev)
+{
+	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
+	hv_vss_req_internal* reqp;
+
+#define	HV_VSS_FREE_PENDING(q) do {					\
+	while ((reqp = STAILQ_FIRST(q)) != NULL) {			\
+		callout_drain(&reqp->callout);				\
+		if (STAILQ_FIRST(q) != reqp)				\
+			continue; /* moved by the timeout handler */	\
+		STAILQ_REMOVE_HEAD(q, slink);				\
+		free(reqp, M_DEVBUF);					\
+	}								\
+} while (0)
+
+	HV_VSS_FREE_PENDING(&sc->daemon_sc.to_notify_queue);
+	HV_VSS_FREE_PENDING(&sc->daemon_sc.to_ack_queue);
+	HV_VSS_FREE_PENDING(&sc->app_sc.to_notify_queue);
+	HV_VSS_FREE_PENDING(&sc->app_sc.to_ack_queue);
+
+#undef HV_VSS_FREE_PENDING
+
+	while ((reqp = LIST_FIRST(&sc->req_free_list)) != NULL) {
+		LIST_REMOVE(reqp, link);
+		/* Harmless for an idle callout; keeps the rule uniform. */
+		callout_drain(&reqp->callout);
+		free(reqp, M_DEVBUF);
+	}
+	return (0);
+}
+
+/**
+ * Device attach method.
+ *
+ * Sets up the pending mutex, the log-level sysctl, the request task and
+ * the request pool, creates the two character devices (file system daemon
+ * and application) and finally attaches to the IC channel.
+ *
+ * Returns 0 on success.  On failure the error of the failing step is
+ * returned and everything set up so far (character devices, request pool,
+ * mutex) is torn down again, so a failed attach leaves nothing behind.
+ */
+static int
+hv_vss_attach(device_t dev)
+{
+	int error;
+	struct sysctl_oid_list *child;
+	struct sysctl_ctx_list *ctx;
+
+	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
+
+	sc->dev = dev;
+	/*
+	 * The cdev handlers reach the softc through these back pointers, so
+	 * set them before any device node can become visible.
+	 */
+	sc->daemon_sc.sc = sc;
+	sc->app_sc.sc = sc;
+	mtx_init(&sc->pending_mutex, "hv_vss pending mutex", NULL, MTX_DEF);
+
+	ctx = device_get_sysctl_ctx(dev);
+	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "hv_vss_log",
+	    CTLFLAG_RWTUN, &hv_vss_log, 0, "Hyperv VSS service log level");
+
+	TASK_INIT(&sc->task, 0, hv_vss_process_request, sc);
+	hv_vss_init_send_receive_queue(dev);
+	/* create character device for file system freeze/thaw */
+	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
+		    &sc->hv_vss_dev,
+		    &hv_vss_cdevsw,
+		    0,
+		    UID_ROOT,
+		    GID_WHEEL,
+		    0640,
+		    FS_VSS_DEV_NAME);
+
+	if (error != 0) {
+		hv_vss_log_info("Fail to create '%s': %d\n", FS_VSS_DEV_NAME, error);
+		goto fail_queue;
+	}
+	/*
+	 * NOTE(review): the node exists before si_drv1 is assigned; an open
+	 * racing with attach could see a NULL si_drv1 -- consider
+	 * make_dev_s() with mda_si_drv1.
+	 */
+	sc->hv_vss_dev->si_drv1 = &sc->daemon_sc;
+	/* create character device for application freeze/thaw */
+	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
+		    &sc->hv_appvss_dev,
+		    &hv_appvss_cdevsw,
+		    0,
+		    UID_ROOT,
+		    GID_WHEEL,
+		    0640,
+		    APP_VSS_DEV_NAME);
+
+	if (error != 0) {
+		hv_vss_log_info("Fail to create '%s': %d\n", APP_VSS_DEV_NAME, error);
+		goto fail_fsdev;
+	}
+	sc->hv_appvss_dev->si_drv1 = &sc->app_sc;
+
+	error = vmbus_ic_attach(dev, hv_vss_callback);
+	if (error != 0)
+		goto fail_appdev;
+	return (0);
+
+	/* Unwind in reverse order of construction. */
+fail_appdev:
+	destroy_dev(sc->hv_appvss_dev);
+fail_fsdev:
+	destroy_dev(sc->hv_vss_dev);
+fail_queue:
+	hv_vss_destroy_send_receive_queue(dev);
+	mtx_destroy(&sc->pending_mutex);
+	return (error);
+}
+
+/**
+ * Device detach method.
+ *
+ * Kills the registered daemon and application processes, removes both
+ * character devices, releases the request pool and detaches from the IC
+ * channel.  Returns the result of vmbus_ic_detach().
+ *
+ * The pending mutex is destroyed last: the character device handlers, the
+ * request task and the timeout callouts all take it, so it has to outlive
+ * the device nodes, the request pool and the channel.
+ */
+static int
+hv_vss_detach(device_t dev)
+{
+	hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
+	int error;
+
+	/*
+	 * NOTE(review): proc_task is recorded at open time and the visible
+	 * close handlers do not clear it, so the process may already have
+	 * exited -- confirm the pointer cannot be stale here.
+	 */
+	if (sc->daemon_sc.proc_task != NULL) {
+		PROC_LOCK(sc->daemon_sc.proc_task);
+		kern_psignal(sc->daemon_sc.proc_task, SIGKILL);
+		PROC_UNLOCK(sc->daemon_sc.proc_task);
+	}
+	if (sc->app_sc.proc_task != NULL) {
+		PROC_LOCK(sc->app_sc.proc_task);
+		kern_psignal(sc->app_sc.proc_task, SIGKILL);
+		PROC_UNLOCK(sc->app_sc.proc_task);
+	}
+	/* Stop user space from entering the driver before freeing requests. */
+	destroy_dev(sc->hv_vss_dev);
+	destroy_dev(sc->hv_appvss_dev);
+	hv_vss_destroy_send_receive_queue(dev);
+	error = vmbus_ic_detach(dev);
+	mtx_destroy(&sc->pending_mutex);
+	return (error);
+}
+
+/* newbus method table of the VSS driver: probe, attach and detach only. */
+static device_method_t vss_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe, hv_vss_probe),
+	DEVMETHOD(device_attach, hv_vss_attach),
+	DEVMETHOD(device_detach, hv_vss_detach),
+	{ 0, 0 }	/* table terminator */
+};
+
+/* Driver "hvvss"; the softc allocated per device is an hv_vss_sc. */
+static driver_t vss_driver = { "hvvss", vss_methods, sizeof(hv_vss_sc)};
+
+static devclass_t vss_devclass;
+
+/* Register the driver on the vmbus bus; the module depends on vmbus. */
+DRIVER_MODULE(hv_vss, vmbus, vss_driver, vss_devclass, NULL, NULL);
+MODULE_VERSION(hv_vss, 1);
+MODULE_DEPEND(hv_vss, vmbus, 1, 1, 1);
diff --git a/sys/dev/hyperv/utilities/hv_snapshot.h b/sys/dev/hyperv/utilities/hv_snapshot.h
new file mode 100644
index 000000000000..e3c9e0c9fef2
--- /dev/null
+++ b/sys/dev/hyperv/utilities/hv_snapshot.h
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VSS_H
+#define _VSS_H
+#include <sys/ioccom.h>
+/* Names of the two character devices (file system daemon / application). */
+#define FS_VSS_DEV_NAME		"hv_fsvss_dev"
+#define APP_VSS_DEV_NAME	"hv_appvss_dev"
+
+/* Builds the /dev path of a device from one of the names above. */
+#define VSS_DEV(VSS)		"/dev/"VSS
+
+/* Values of the status field of struct hv_vss_opt_msg. */
+#define VSS_SUCCESS		0x00000000
+#define VSS_FAIL		0x00000001
+
+/* Operation codes carried in the opt field of struct hv_vss_opt_msg. */
+enum hv_vss_op_t {
+	HV_VSS_NONE = 0,	/* no operation */
+	HV_VSS_CHECK,
+	HV_VSS_FREEZE,
+	HV_VSS_THAW,
+	HV_VSS_COUNT		/* number of codes above; not an operation */
+};
+
+/* Message exchanged with the VSS character devices through ioctl(2). */
+struct hv_vss_opt_msg {
+	uint32_t	opt;		/* operation */
+	uint32_t	status;		/* 0 for success, 1 for error */
+	uint64_t	msgid;		/* an ID used to identify the transaction */
+	uint8_t		reserved[48];	/* reserved values are all zeroes */
+};
+/* _IOR: the message is copied out to the caller (read a request). */
+#define IOCHVVSSREAD		_IOR('v', 2, struct hv_vss_opt_msg)
+/* _IOW: the message is copied in from the caller (write a result). */
+#define IOCHVVSSWRITE		_IOW('v', 3, struct hv_vss_opt_msg)
+#endif
diff --git a/sys/dev/hyperv/utilities/hv_utilreg.h b/sys/dev/hyperv/utilities/hv_utilreg.h
new file mode 100644
index 000000000000..b29c0f99204f
--- /dev/null
+++ b/sys/dev/hyperv/utilities/hv_utilreg.h
@@ -0,0 +1,86 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HV_UTILREG_H_
+#define _HV_UTILREG_H_
+
+/*
+ * Some Hyper-V status codes.
+ *
+ * HV_S_OK is zero; every other code defined here has the top bit set.
+ */
+#define HV_S_OK				0x00000000
+#define HV_E_FAIL			0x80004005
+#define HV_S_CONT			0x80070103
+#define HV_ERROR_NOT_SUPPORTED		0x80070032
+#define HV_ERROR_MACHINE_LOCKED		0x800704F7
+#define HV_ERROR_DEVICE_NOT_CONNECTED	0x8007048F
+#define HV_INVALIDARG			0x80070057
+#define HV_GUID_NOTFOUND		0x80041002
+
+/*
+ * Common defines for Hyper-V ICs
+ */
+/* IC message types. */
+#define HV_ICMSGTYPE_NEGOTIATE		0
+#define HV_ICMSGTYPE_HEARTBEAT		1
+#define HV_ICMSGTYPE_KVPEXCHANGE	2
+#define HV_ICMSGTYPE_SHUTDOWN		3
+#define HV_ICMSGTYPE_TIMESYNC		4
+#define HV_ICMSGTYPE_VSS		5
+
+/* IC message header flags; distinct bits, so they can be OR-ed together. */
+#define HV_ICMSGHDRFLAG_TRANSACTION	1
+#define HV_ICMSGHDRFLAG_REQUEST		2
+#define HV_ICMSGHDRFLAG_RESPONSE	4
+
+/* Packed 8-byte pipe header: a flags word followed by a message size. */
+typedef struct hv_vmbus_pipe_hdr {
+	uint32_t flags;
+	uint32_t msgsize;	/* presumably the size of the message that follows */
+} __packed hv_vmbus_pipe_hdr;
+
+/* Packed major/minor version pair used in IC headers and negotiation. */
+typedef struct hv_vmbus_ic_version {
+	uint16_t major;
+	uint16_t minor;
+} __packed hv_vmbus_ic_version;
+
+/* Packed header found at the start of an IC message. */
+typedef struct hv_vmbus_icmsg_hdr {
+	hv_vmbus_ic_version	icverframe;	/* framework version */
+	uint16_t		icmsgtype;	/* one of HV_ICMSGTYPE_* */
+	hv_vmbus_ic_version	icvermsg;	/* message version */
+	uint16_t		icmsgsize;
+	uint32_t		status;		/* presumably an HV_S_ / HV_E_ code */
+	uint8_t			ictransaction_id;
+	uint8_t			icflags;	/* HV_ICMSGHDRFLAG_* bits */
+	uint8_t			reserved[2];
+} __packed hv_vmbus_icmsg_hdr;
+
+/*
+ * Packed body of a version negotiation message: two counts followed by a
+ * variable number of versions.  The array is declared with one element, so
+ * sizeof() covers exactly one version entry.
+ */
+typedef struct hv_vmbus_icmsg_negotiate {
+	uint16_t		icframe_vercnt;	/* count of framework versions */
+	uint16_t		icmsg_vercnt;	/* count of message versions */
+	uint32_t		reserved;
+	hv_vmbus_ic_version	icversion_data[1]; /* any size array */
+} __packed hv_vmbus_icmsg_negotiate;
+
+#endif	/* !_HV_UTILREG_H_ */
diff --git a/sys/dev/hyperv/utilities/unicode.h b/sys/dev/hyperv/utilities/unicode.h
new file mode 100644
index 000000000000..696777cbbf26
--- /dev/null
+++ b/sys/dev/hyperv/utilities/unicode.h
@@ -0,0 +1,201 @@
+/* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */
+
+/*-
+ * Copyright (c) 2007 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/types.h>
+
+/*
+ * Conversion flags.
+ *
+ * NOTE(review): UNICODE_UTF8_LATIN1_FALLBACK (0x03) shares its bits with
+ * the two flags before it, and utf8_to_utf16() tests it with a plain '&',
+ * so passing either of the other flags also enables the Latin-1 fallback.
+ * Left unchanged because existing callers may rely on it -- confirm.
+ */
+#define UNICODE_DECOMPOSE		0x01
+#define UNICODE_PRECOMPOSE		0x02
+#define UNICODE_UTF8_LATIN1_FALLBACK	0x03
+
+size_t utf8_to_utf16(uint16_t *, size_t, const char *, size_t, int, int *);
+size_t utf16_to_utf8(char *, size_t, const uint16_t *, size_t, int, int *);
+
+/*
+ * Convert src_len bytes of UTF-8 at src into UTF-16 code units.
+ *
+ * dst/dst_len: output buffer and its capacity in code units.  dst may be
+ *	NULL to only compute the required length; units that do not fit are
+ *	counted but not stored.
+ * flags: when a bit of UNICODE_UTF8_LATIN1_FALLBACK is set, a byte
+ *	>= 0xa0 that is not followed by a continuation byte is taken as an
+ *	ISO 8859-1 character instead of being reported as an error.
+ * errp: if not NULL, receives the number of malformed sequences skipped
+ *	(stray continuation bytes, truncated or overlong sequences, encoded
+ *	surrogates, code points above U+10FFFF).
+ *
+ * Returns the number of UTF-16 code units the input converts to, which may
+ * exceed dst_len.  No byte at or beyond src + src_len is ever read.
+ */
+size_t
+utf8_to_utf16(uint16_t *dst, size_t dst_len,
+	      const char *src, size_t src_len,
+	      int flags, int *errp)
+{
+    const unsigned char *s;
+    size_t spos, dpos, avail;
+    int error;
+    uint16_t c;
+
+#define IS_CONT(c)	(((c)&0xc0) == 0x80)
+
+    error = 0;
+    s = (const unsigned char *)src;
+    spos = dpos = 0;
+    while (spos<src_len) {
+	/* bytes left including s[spos]; never zero inside the loop */
+	avail = src_len - spos;
+
+	if (s[spos] < 0x80)
+	    c = s[spos++];
+	else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
+		 && (avail < 2 || !IS_CONT(s[spos+1]))
+		 && s[spos]>=0xa0) {
+	    /* not valid UTF-8, assume ISO 8859-1 */
+	    c = s[spos++];
+	}
+	else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
+	    /* continuation byte without lead byte
+	       or lead byte for codepoint above 0x10ffff */
+	    error++;
+	    spos++;
+	    continue;
+	}
+	else if (s[spos] < 0xe0) {
+	    /* two-byte sequence */
+	    if (avail < 2 || !IS_CONT(s[spos+1])) {
+		spos++;
+		error++;
+		continue;
+	    }
+	    c = ((s[spos] & 0x1f) << 6) | (s[spos+1] & 0x3f);
+	    spos += 2;
+	    if (c < 0x80) {
+		/* overlong encoding */
+		error++;
+		continue;
+	    }
+	}
+	else if (s[spos] < 0xf0) {
+	    /* three-byte sequence */
+	    if (avail < 3
+		|| !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
+		spos++;
+		error++;
+		continue;
+	    }
+	    c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
+		| (s[spos+2] & 0x3f);
+	    spos += 3;
+	    if (c < 0x800 || (c & 0xf800) == 0xd800 ) {
+		/* overlong encoding or encoded surrogate (U+D800..U+DFFF) */
+		error++;
+		continue;
+	    }
+	}
+	else {
+	    uint32_t cc;
+	    /* four-byte sequence, stored as a UTF-16 surrogate pair */
+
+	    if (avail < 4 || !IS_CONT(s[spos+1])
+		|| !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
+		spos++;
+		error++;
+		continue;
+	    }
+	    /* a four-byte lead byte carries three payload bits */
+	    cc = ((uint32_t)(s[spos] & 0x07) << 18)
+		 | ((uint32_t)(s[spos+1] & 0x3f) << 12)
+		 | ((s[spos+2] & 0x3f) << 6) | (s[spos+3] & 0x3f);
+	    spos += 4;
+	    if (cc < 0x10000 || cc > 0x10ffff) {
+		/* overlong encoding or beyond the Unicode range */
+		error++;
+		continue;
+	    }
+	    cc -= 0x10000;
+	    if (dst && dpos < dst_len)
+		dst[dpos] = (0xd800 | (cc >> 10));
+	    dpos++;
+	    /* the low surrogate holds the low ten bits only */
+	    c = 0xdc00 | (cc & 0x3ff);
+	}
+
+	if (dst && dpos < dst_len)
+	    dst[dpos] = c;
+	dpos++;
+    }
+
+    if (errp)
+	*errp = error;
+
+    return dpos;
+
+#undef IS_CONT
+}
+
+
+/*
+ * Convert src_len UTF-16 code units at src into UTF-8.
+ *
+ * dst/dst_len: output buffer and its capacity in bytes.  dst may be NULL
+ *	to only compute the required length.  As soon as a character does
+ *	not fit completely, nothing more is stored but counting continues.
+ * flags: currently unused.
+ * errp: if not NULL, receives the number of unpaired surrogates skipped.
+ *
+ * Returns the number of bytes the input converts to, which may exceed
+ * dst_len.  No terminating NUL is written.
+ */
+size_t
+utf16_to_utf8(char *dst, size_t dst_len,
+	      const uint16_t *src, size_t src_len,
+	      int flags, int *errp)
+{
+    /* size_t indices: 16-bit ones would wrap on inputs over 65535 units */
+    size_t spos, dpos;
+    int error;
+
+    /* stop storing once fewer than (l) bytes are left; no size_t underflow */
+#define CHECK_LENGTH(l)	do {						\
+	if (dst != NULL && (dpos > dst_len || dst_len - dpos < (size_t)(l))) \
+	    dst = NULL;							\
+    } while (0)
+#define ADD_BYTE(b)	do {						\
+	if (dst != NULL)						\
+	    dst[dpos] = (char)(b);					\
+	dpos++;								\
+    } while (0)
+
+    (void)flags;
+    error = 0;
+    dpos = 0;
+    for (spos=0; spos<src_len; spos++) {
+	if (src[spos] < 0x80) {
+	    CHECK_LENGTH(1);
+	    ADD_BYTE(src[spos]);
+	}
+	else if (src[spos] < 0x800) {
+	    CHECK_LENGTH(2);
+	    ADD_BYTE(0xc0 | (src[spos]>>6));
+	    ADD_BYTE(0x80 | (src[spos] & 0x3f));
+	}
+	else if ((src[spos] & 0xfc00) == 0xd800) {
+	    uint32_t c;
+	    /* first (high) surrogate: the next unit must be a low one */
+	    if (spos + 1 >= src_len || (src[spos+1] & 0xfc00) != 0xdc00) {
+		/* no second surrogate present */
+		error++;
+		continue;
+	    }
+	    c = (((uint32_t)(src[spos] & 0x3ff) << 10)
+		 | (src[spos+1] & 0x3ff)) + 0x10000;
+	    /* the low surrogate has been consumed as well */
+	    spos++;
+	    CHECK_LENGTH(4);
+	    ADD_BYTE(0xf0 | (c>>18));
+	    ADD_BYTE(0x80 | ((c>>12) & 0x3f));
+	    ADD_BYTE(0x80 | ((c>>6) & 0x3f));
+	    ADD_BYTE(0x80 | (c & 0x3f));
+	}
+	else if ((src[spos] & 0xfc00) == 0xdc00) {
+	    /* second surrogate without preceding first surrogate */
+	    error++;
+	}
+	else {
+	    CHECK_LENGTH(3);
+	    ADD_BYTE(0xe0 | src[spos]>>12);
+	    ADD_BYTE(0x80 | ((src[spos]>>6) & 0x3f));
+	    ADD_BYTE(0x80 | (src[spos] & 0x3f));
+	}
+    }
+
+    if (errp)
+	*errp = error;
+
+    return dpos;
+
+#undef ADD_BYTE
+#undef CHECK_LENGTH
+}
diff --git a/sys/dev/hyperv/utilities/vmbus_heartbeat.c b/sys/dev/hyperv/utilities/vmbus_heartbeat.c
new file mode 100644
index 000000000000..f15b94822aa9
--- /dev/null
+++ b/sys/dev/hyperv/utilities/vmbus_heartbeat.c
@@ -0,0 +1,152 @@
+/*-
+ * Copyright (c) 2014,2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/systm.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/utilities/vmbus_icreg.h>
+#include <dev/hyperv/utilities/vmbus_icvar.h>
+
+#define VMBUS_HEARTBEAT_FWVER_MAJOR	3	/* negotiation cap: 3.0 */
+#define VMBUS_HEARTBEAT_FWVER		\
+	VMBUS_IC_VERSION(VMBUS_HEARTBEAT_FWVER_MAJOR, 0)
+
+#define VMBUS_HEARTBEAT_MSGVER_MAJOR	3	/* negotiation cap: 3.0 */
+#define VMBUS_HEARTBEAT_MSGVER		\
+	VMBUS_IC_VERSION(VMBUS_HEARTBEAT_MSGVER_MAJOR, 0)
+
+static int			vmbus_heartbeat_probe(device_t);
+static int			vmbus_heartbeat_attach(device_t);
+
+static const struct vmbus_ic_desc vmbus_heartbeat_descs[] = {
+	{
+		.ic_guid = { .hv_guid = {
+		    0x39, 0x4f, 0x16, 0x57, 0x15, 0x91, 0x78, 0x4e,
+		    0xab, 0x55, 0x38, 0x2f, 0x3b, 0xd5, 0x42, 0x2d} },
+		.ic_desc = "Hyper-V Heartbeat"
+	},
+	VMBUS_IC_DESC_END	/* ic_desc == NULL stops vmbus_ic_probe() */
+};
+
+static device_method_t vmbus_heartbeat_methods[] = {
+	/* Device interface; detach is the shared vmbus_ic_detach(). */
+	DEVMETHOD(device_probe,		vmbus_heartbeat_probe),
+	DEVMETHOD(device_attach,	vmbus_heartbeat_attach),
+	DEVMETHOD(device_detach,	vmbus_ic_detach),
+	DEVMETHOD_END
+};
+
+static driver_t vmbus_heartbeat_driver = {
+	"hvheartbeat",
+	vmbus_heartbeat_methods,
+	sizeof(struct vmbus_ic_softc)	/* softc type shared by the IC drivers */
+};
+
+static devclass_t vmbus_heartbeat_devclass;
+
+DRIVER_MODULE(hv_heartbeat, vmbus, vmbus_heartbeat_driver,
+    vmbus_heartbeat_devclass, NULL, NULL);
+MODULE_VERSION(hv_heartbeat, 1);
+MODULE_DEPEND(hv_heartbeat, vmbus, 1, 1, 1);
+
+static void
+vmbus_heartbeat_cb(struct vmbus_channel *chan, void *xsc)
+{
+	struct vmbus_ic_softc *sc = xsc;
+	struct vmbus_icmsg_hdr *hdr;
+	int dlen, error;
+	uint64_t xactid;
+	void *data;
+
+	/*
+	 * Channel callback; xsc is the softc.  Receive a request into ic_buf.
+	 */
+	data = sc->ic_buf;
+	dlen = sc->ic_buflen;
+	error = vmbus_chan_recv(chan, data, &dlen, &xactid);
+	KASSERT(error != ENOBUFS, ("icbuf is not large enough"));
+	if (error)
+		return;
+
+	if (dlen < sizeof(*hdr)) {
+		device_printf(sc->ic_dev, "invalid data len %d\n", dlen);
+		return;
+	}
+	hdr = data;	/* dlen covers at least the common header */
+
+	/*
+	 * Update request, which will be echoed back as response.
+	 */
+	switch (hdr->ic_type) {
+	case VMBUS_ICMSG_TYPE_NEGOTIATE:
+		error = vmbus_ic_negomsg(sc, data, &dlen,
+		    VMBUS_HEARTBEAT_FWVER, VMBUS_HEARTBEAT_MSGVER);
+		if (error)
+			return;	/* failed negotiation: no reply is sent */
+		break;
+
+	case VMBUS_ICMSG_TYPE_HEARTBEAT:
+		/* Only ic_seq is a must; the reply carries it incremented. */
+		if (dlen < VMBUS_ICMSG_HEARTBEAT_SIZE_MIN) {
+			device_printf(sc->ic_dev, "invalid heartbeat len %d\n",
+			    dlen);
+			return;
+		}
+		((struct vmbus_icmsg_heartbeat *)data)->ic_seq++;
+		break;
+
+	default:
+		device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type);
+		break;	/* unknown types are logged and still echoed */
+	}
+
+	/*
+	 * Send response by echoing the (possibly updated) request back.
+	 */
+	vmbus_ic_sendresp(sc, chan, data, dlen, xactid);
+}
+
+static int
+vmbus_heartbeat_probe(device_t dev)
+{
+	/* Probe against the heartbeat GUID table via the shared IC helper. */
+	return (vmbus_ic_probe(dev, vmbus_heartbeat_descs));
+}
+
+static int
+vmbus_heartbeat_attach(device_t dev)
+{
+	/* Shared IC attach, with vmbus_heartbeat_cb as the channel callback. */
+	return (vmbus_ic_attach(dev, vmbus_heartbeat_cb));
+}
diff --git a/sys/dev/hyperv/utilities/vmbus_ic.c b/sys/dev/hyperv/utilities/vmbus_ic.c
new file mode 100644
index 000000000000..574670053918
--- /dev/null
+++ b/sys/dev/hyperv/utilities/vmbus_ic.c
@@ -0,0 +1,299 @@
+/*-
+ * Copyright (c) 2014,2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/utilities/vmbus_icreg.h>
+#include <dev/hyperv/utilities/vmbus_icvar.h>
+
+#include "vmbus_if.h"
+
+#define VMBUS_IC_BRSIZE		(4 * PAGE_SIZE)	/* ic_buf and channel sizes */
+
+#define VMBUS_IC_VERCNT		2	/* reply: 1 framework + 1 message version */
+#define VMBUS_IC_NEGOSZ		\
+	__offsetof(struct vmbus_icmsg_negotiate, ic_ver[VMBUS_IC_VERCNT])
+CTASSERT(VMBUS_IC_NEGOSZ < VMBUS_IC_BRSIZE);	/* reply must fit in ic_buf */
+
+static int	vmbus_ic_fwver_sysctl(SYSCTL_HANDLER_ARGS);
+static int	vmbus_ic_msgver_sysctl(SYSCTL_HANDLER_ARGS);
+
+int
+vmbus_ic_negomsg(struct vmbus_ic_softc *sc, void *data, int *dlen0,
+    uint32_t fw_ver, uint32_t msg_ver)
+{
+	struct vmbus_icmsg_negotiate *nego;
+	int i, cnt, dlen = *dlen0, error;
+	uint32_t sel_fw_ver = 0, sel_msg_ver = 0;
+	/* Both flags are read at "done:", even when the fw search bails out. */
+	bool has_fw_ver = false, has_msg_ver = false;
+
+	/*
+	 * Pick the newest offered fw/msg versions not above fw_ver/msg_ver and
+	 * turn the message into the reply.  Returns 0, EINVAL or EOPNOTSUPP.
+	 */
+	if (dlen < sizeof(*nego)) {
+		device_printf(sc->ic_dev, "truncated ic negotiate, len %d\n",
+		    dlen);
+		return (EINVAL);
+	}
+	nego = data;
+
+	if (nego->ic_fwver_cnt == 0) {
+		device_printf(sc->ic_dev, "ic negotiate does not contain "
+		    "framework version %u\n", nego->ic_fwver_cnt);
+		return (EINVAL);
+	}
+	if (nego->ic_msgver_cnt == 0) {
+		device_printf(sc->ic_dev, "ic negotiate does not contain "
+		    "message version %u\n", nego->ic_msgver_cnt);
+		return (EINVAL);
+	}
+
+	cnt = nego->ic_fwver_cnt + nego->ic_msgver_cnt;
+	if (dlen < __offsetof(struct vmbus_icmsg_negotiate, ic_ver[cnt])) {
+		device_printf(sc->ic_dev, "ic negotiate does not contain "
+		    "versions %d\n", dlen);
+		return (EINVAL);
+	}
+
+	error = EOPNOTSUPP;
+
+	/*
+	 * Find the best match framework version.
+	 */
+	for (i = 0; i < nego->ic_fwver_cnt; ++i) {
+		if (VMBUS_ICVER_LE(nego->ic_ver[i], fw_ver)) {
+			if (!has_fw_ver) {
+				sel_fw_ver = nego->ic_ver[i];
+				has_fw_ver = true;
+			} else if (VMBUS_ICVER_GT(nego->ic_ver[i],
+			    sel_fw_ver)) {
+				sel_fw_ver = nego->ic_ver[i];
+			}
+		}
+	}
+	if (!has_fw_ver) {
+		device_printf(sc->ic_dev, "failed to select framework "
+		    "version\n");
+		goto done;
+	}
+
+	/*
+	 * Find the best match message version.
+	 */
+	for (i = nego->ic_fwver_cnt;
+	    i < nego->ic_fwver_cnt + nego->ic_msgver_cnt; ++i) {
+		if (VMBUS_ICVER_LE(nego->ic_ver[i], msg_ver)) {
+			if (!has_msg_ver) {
+				sel_msg_ver = nego->ic_ver[i];
+				has_msg_ver = true;
+			} else if (VMBUS_ICVER_GT(nego->ic_ver[i],
+			    sel_msg_ver)) {
+				sel_msg_ver = nego->ic_ver[i];
+			}
+		}
+	}
+	if (!has_msg_ver) {
+		device_printf(sc->ic_dev, "failed to select message "
+		    "version\n");
+		goto done;
+	}
+
+	error = 0;
+done:
+	if (bootverbose || !has_fw_ver || !has_msg_ver) {
+		if (has_fw_ver) {
+			device_printf(sc->ic_dev, "sel framework version: "
+			    "%u.%u\n",
+			    VMBUS_ICVER_MAJOR(sel_fw_ver),
+			    VMBUS_ICVER_MINOR(sel_fw_ver));
+		}
+		for (i = 0; i < nego->ic_fwver_cnt; i++) {
+			device_printf(sc->ic_dev, "supp framework version: "
+			    "%u.%u\n",
+			    VMBUS_ICVER_MAJOR(nego->ic_ver[i]),
+			    VMBUS_ICVER_MINOR(nego->ic_ver[i]));
+		}
+
+		if (has_msg_ver) {
+			device_printf(sc->ic_dev, "sel message version: "
+			    "%u.%u\n",
+			    VMBUS_ICVER_MAJOR(sel_msg_ver),
+			    VMBUS_ICVER_MINOR(sel_msg_ver));
+		}
+		for (i = nego->ic_fwver_cnt;
+		    i < nego->ic_fwver_cnt + nego->ic_msgver_cnt; i++) {
+			device_printf(sc->ic_dev, "supp message version: "
+			    "%u.%u\n",
+			    VMBUS_ICVER_MAJOR(nego->ic_ver[i]),
+			    VMBUS_ICVER_MINOR(nego->ic_ver[i]));
+		}
+	}
+	if (error)
+		return (error);
+
+	/* Record the selected versions. */
+	sc->ic_fwver = sel_fw_ver;
+	sc->ic_msgver = sel_msg_ver;
+
+	/* One framework version. */
+	nego->ic_fwver_cnt = 1;
+	nego->ic_ver[0] = sel_fw_ver;
+
+	/* One message version. */
+	nego->ic_msgver_cnt = 1;
+	nego->ic_ver[1] = sel_msg_ver;
+
+	/* Update data size. */
+	nego->ic_hdr.ic_dsize = VMBUS_IC_NEGOSZ -
+	    sizeof(struct vmbus_icmsg_hdr);
+
+	/* Update total size, if necessary. */
+	if (dlen < VMBUS_IC_NEGOSZ)
+		*dlen0 = VMBUS_IC_NEGOSZ;
+
+	return (0);
+}
+
+int
+vmbus_ic_probe(device_t dev, const struct vmbus_ic_desc descs[])
+{
+	device_t parent = device_get_parent(dev);
+	int idx;
+
+	/* Give up if unit 0 is reported disabled, else try each GUID. */
+	if (resource_disabled(device_get_name(dev), 0))
+		return (ENXIO);
+	for (idx = 0; descs[idx].ic_desc != NULL; idx++) {
+		if (VMBUS_PROBE_GUID(parent, dev, &descs[idx].ic_guid) != 0)
+			continue;
+		device_set_desc(dev, descs[idx].ic_desc);
+		return (BUS_PROBE_DEFAULT);
+	}
+	return (ENXIO);
+}
+
+int
+vmbus_ic_attach(device_t dev, vmbus_chan_callback_t cb)
+{
+	struct vmbus_ic_softc *sc = device_get_softc(dev);
+	struct vmbus_channel *chan = vmbus_get_channel(dev);
+	struct sysctl_oid_list *child;
+	struct sysctl_ctx_list *ctx;
+	int error;
+
+	sc->ic_dev = dev;
+	sc->ic_buflen = VMBUS_IC_BRSIZE;
+	sc->ic_buf = malloc(VMBUS_IC_BRSIZE, M_DEVBUF, M_WAITOK | M_ZERO);
+	/*
+	 * Common attach for the IC drivers; ic_buf is the callback's buffer.
+	 * These services are not performance critical and do not need
+	 * batched reading. Furthermore, some services such as KVP can
+	 * only handle one message from the host at a time.
+	 * Turn off batched reading for all util drivers before we open the
+	 * channel.
+	 */
+	vmbus_chan_set_readbatch(chan, false);
+
+	error = vmbus_chan_open(chan, VMBUS_IC_BRSIZE, VMBUS_IC_BRSIZE, NULL, 0,
+	    cb, sc);
+	if (error) {
+		free(sc->ic_buf, M_DEVBUF);	/* only resource held so far */
+		return (error);
+	}
+	/* Export the versions chosen by vmbus_ic_negomsg() as RD strings. */
+	ctx = device_get_sysctl_ctx(dev);
+	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "fw_version",
+	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+	    vmbus_ic_fwver_sysctl, "A", "framework version");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "msg_version",
+	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+	    vmbus_ic_msgver_sysctl, "A", "message version");
+
+	return (0);
+}
+
+static int
+vmbus_ic_fwver_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	const uint32_t ver = ((struct vmbus_ic_softc *)arg1)->ic_fwver;
+	char text[16];	/* "major.minor" of the recorded framework version */
+
+	snprintf(text, sizeof(text), "%u.%u",
+	    VMBUS_ICVER_MAJOR(ver), VMBUS_ICVER_MINOR(ver));
+	return (sysctl_handle_string(oidp, text, sizeof(text), req));
+}
+
+static int
+vmbus_ic_msgver_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	const uint32_t ver = ((struct vmbus_ic_softc *)arg1)->ic_msgver;
+	char text[16];	/* "major.minor" of the recorded message version */
+
+	snprintf(text, sizeof(text), "%u.%u",
+	    VMBUS_ICVER_MAJOR(ver), VMBUS_ICVER_MINOR(ver));
+	return (sysctl_handle_string(oidp, text, sizeof(text), req));
+}
+
+int
+vmbus_ic_detach(device_t dev)
+{
+	struct vmbus_ic_softc *sc = device_get_softc(dev);
+
+	vmbus_chan_close(vmbus_get_channel(dev)); /* opened in vmbus_ic_attach() */
+	free(sc->ic_buf, M_DEVBUF);	/* buffer from vmbus_ic_attach() */
+
+	return (0);
+}
+
+int
+vmbus_ic_sendresp(struct vmbus_ic_softc *sc, struct vmbus_channel *chan,
+    void *data, int dlen, uint64_t xactid)
+{
+	struct vmbus_icmsg_hdr *resp = data;
+	int rc;
+
+	KASSERT(dlen >= sizeof(*resp), ("invalid data length %d", dlen));
+
+	/* Set the XACT and RESP flags, then send the buffer back in-band. */
+	resp->ic_flags = VMBUS_ICMSG_FLAG_XACT | VMBUS_ICMSG_FLAG_RESP;
+	rc = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
+	    data, dlen, xactid);
+	if (rc != 0)
+		device_printf(sc->ic_dev, "resp send failed: %d\n", rc);
+	return (rc);
+}
diff --git a/sys/dev/hyperv/utilities/vmbus_icreg.h b/sys/dev/hyperv/utilities/vmbus_icreg.h
new file mode 100644
index 000000000000..e962102d13dd
--- /dev/null
+++ b/sys/dev/hyperv/utilities/vmbus_icreg.h
@@ -0,0 +1,135 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_ICREG_H_
+#define _VMBUS_ICREG_H_
+
+#define VMBUS_ICMSG_TYPE_NEGOTIATE	0	/* ic_type values */
+#define VMBUS_ICMSG_TYPE_HEARTBEAT	1
+#define VMBUS_ICMSG_TYPE_KVP		2
+#define VMBUS_ICMSG_TYPE_SHUTDOWN	3
+#define VMBUS_ICMSG_TYPE_TIMESYNC	4
+#define VMBUS_ICMSG_TYPE_VSS		5
+
+#define VMBUS_ICMSG_STATUS_OK		0x00000000	/* ic_status values */
+#define VMBUS_ICMSG_STATUS_FAIL		0x80004005
+/* Version: major in bits 0-15, minor in 16-31; _SWAP orders by major. */
+#define VMBUS_IC_VERSION(major, minor)	((major) | (((uint32_t)(minor)) << 16))
+#define VMBUS_ICVER_MAJOR(ver)		((ver) & 0xffff)
+#define VMBUS_ICVER_MINOR(ver)		(((ver) & 0xffff0000) >> 16)
+#define VMBUS_ICVER_SWAP(ver)		\
+	((VMBUS_ICVER_MAJOR((ver)) << 16) | VMBUS_ICVER_MINOR((ver)))
+#define VMBUS_ICVER_LE(v1, v2)		\
+	(VMBUS_ICVER_SWAP((v1)) <= VMBUS_ICVER_SWAP((v2)))
+#define VMBUS_ICVER_GT(v1, v2)		\
+	(VMBUS_ICVER_SWAP((v1)) > VMBUS_ICVER_SWAP((v2)))
+
+struct vmbus_pipe_hdr {
+	uint32_t		ph_flags;
+	uint32_t		ph_msgsz;
+} __packed;
+
+struct vmbus_icmsg_hdr {	/* first member of every IC message below */
+	struct vmbus_pipe_hdr	ic_pipe;
+	uint32_t		ic_fwver;	/* framework version */
+	uint16_t		ic_type;	/* VMBUS_ICMSG_TYPE_ */
+	uint32_t		ic_msgver;	/* message version */
+	uint16_t		ic_dsize;	/* data size, header excluded */
+	uint32_t		ic_status;	/* VMBUS_ICMSG_STATUS_ */
+	uint8_t			ic_xactid;
+	uint8_t			ic_flags;	/* VMBUS_ICMSG_FLAG_ */
+	uint8_t			ic_rsvd[2];
+} __packed;
+
+#define VMBUS_ICMSG_FLAG_XACT		0x0001	/* set in every reply */
+#define VMBUS_ICMSG_FLAG_REQ		0x0002
+#define VMBUS_ICMSG_FLAG_RESP		0x0004	/* set in every reply */
+
+/* VMBUS_ICMSG_TYPE_NEGOTIATE; rewritten in place by vmbus_ic_negomsg(). */
+struct vmbus_icmsg_negotiate {
+	struct vmbus_icmsg_hdr	ic_hdr;
+	uint16_t		ic_fwver_cnt;	/* # framework vers in ic_ver */
+	uint16_t		ic_msgver_cnt;	/* # message vers in ic_ver */
+	uint32_t		ic_rsvd;
+	/*
+	 * This version array contains two set of supported
+	 * versions:
+	 * - The first set consists of #ic_fwver_cnt supported framework
+	 *   versions.
+	 * - The second set consists of #ic_msgver_cnt supported message
+	 *   versions.
+	 */
+	uint32_t		ic_ver[];
+} __packed;
+
+/* VMBUS_ICMSG_TYPE_HEARTBEAT; SIZE_MIN covers ic_hdr and ic_seq only. */
+struct vmbus_icmsg_heartbeat {
+	struct vmbus_icmsg_hdr	ic_hdr;
+	uint64_t		ic_seq;		/* incremented in the reply */
+	uint32_t		ic_rsvd[8];
+} __packed;
+
+#define VMBUS_ICMSG_HEARTBEAT_SIZE_MIN	\
+	__offsetof(struct vmbus_icmsg_heartbeat, ic_rsvd[0])
+
+/* VMBUS_ICMSG_TYPE_SHUTDOWN; SIZE_MIN covers everything before ic_msg[]. */
+struct vmbus_icmsg_shutdown {
+	struct vmbus_icmsg_hdr	ic_hdr;
+	uint32_t		ic_code;
+	uint32_t		ic_timeo;
+	uint32_t 		ic_haltflags;	/* 0 and 1 trigger shutdown */
+	uint8_t			ic_msg[2048];
+} __packed;
+
+#define VMBUS_ICMSG_SHUTDOWN_SIZE_MIN	\
+	__offsetof(struct vmbus_icmsg_shutdown, ic_msg[0])
+
+/* VMBUS_ICMSG_TYPE_TIMESYNC; layout used when message version < 4.0. */
+struct vmbus_icmsg_timesync {
+	struct vmbus_icmsg_hdr	ic_hdr;
+	uint64_t		ic_hvtime;
+	uint64_t		ic_vmtime;
+	uint64_t		ic_rtt;
+	uint8_t			ic_tsflags;	/* VMBUS_ICMSG_TS_FLAG_ */
+} __packed;
+
+/* VMBUS_ICMSG_TYPE_TIMESYNC, MSGVER4 */
+struct vmbus_icmsg_timesync4 {
+	struct vmbus_icmsg_hdr	ic_hdr;
+	uint64_t		ic_hvtime;
+	uint64_t		ic_sent_tc;	/* subtracted from hyperv_tc64() */
+	uint8_t			ic_tsflags;	/* VMBUS_ICMSG_TS_FLAG_ */
+	uint8_t			ic_rsvd[5];
+} __packed;
+
+#define VMBUS_ICMSG_TS_FLAG_SYNC	0x01	/* checked before SAMPLE */
+#define VMBUS_ICMSG_TS_FLAG_SAMPLE	0x02	/* sync only past threshold */
+/* Subtracted from ic_hvtime; presumably the 1601->1970 epoch offset. */
+#define VMBUS_ICMSG_TS_BASE		116444736000000000ULL
+
+#endif	/* !_VMBUS_ICREG_H_ */
diff --git a/sys/dev/hyperv/utilities/vmbus_icvar.h b/sys/dev/hyperv/utilities/vmbus_icvar.h
new file mode 100644
index 000000000000..a60ecfed58a2
--- /dev/null
+++ b/sys/dev/hyperv/utilities/vmbus_icvar.h
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_ICVAR_H_
+#define _VMBUS_ICVAR_H_
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+
+struct vmbus_ic_softc {
+	device_t		ic_dev;		/* used for device_printf() */
+	uint8_t			*ic_buf;	/* request/reply buffer */
+	int			ic_buflen;	/* size of ic_buf */
+	uint32_t		ic_fwver;	/* framework version */
+	uint32_t		ic_msgver;	/* message version */
+};
+
+struct vmbus_ic_desc {
+	const struct hyperv_guid	ic_guid;
+	const char			*ic_desc;	/* NULL ends a table */
+};
+
+#define VMBUS_IC_DESC_END	{ .ic_desc = NULL }
+/* Helpers shared by the IC drivers; implemented in vmbus_ic.c. */
+int		vmbus_ic_attach(device_t dev, vmbus_chan_callback_t cb);
+int		vmbus_ic_detach(device_t dev);
+int		vmbus_ic_probe(device_t dev, const struct vmbus_ic_desc descs[]);
+int		vmbus_ic_negomsg(struct vmbus_ic_softc *sc, void *data,
+		    int *dlen, uint32_t fw_ver, uint32_t msg_ver);
+int		vmbus_ic_sendresp(struct vmbus_ic_softc *sc,
+		    struct vmbus_channel *chan, void *data, int dlen,
+		    uint64_t xactid);
+
+#endif	/* !_VMBUS_ICVAR_H_ */
diff --git a/sys/dev/hyperv/utilities/vmbus_shutdown.c b/sys/dev/hyperv/utilities/vmbus_shutdown.c
new file mode 100644
index 000000000000..7e54dc9866bb
--- /dev/null
+++ b/sys/dev/hyperv/utilities/vmbus_shutdown.c
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (c) 2014,2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/reboot.h>
+#include <sys/systm.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/utilities/vmbus_icreg.h>
+#include <dev/hyperv/utilities/vmbus_icvar.h>
+
+#define VMBUS_SHUTDOWN_FWVER_MAJOR	3	/* negotiation cap: 3.0 */
+#define VMBUS_SHUTDOWN_FWVER		\
+	VMBUS_IC_VERSION(VMBUS_SHUTDOWN_FWVER_MAJOR, 0)
+
+#define VMBUS_SHUTDOWN_MSGVER_MAJOR	3	/* negotiation cap: 3.0 */
+#define VMBUS_SHUTDOWN_MSGVER		\
+	VMBUS_IC_VERSION(VMBUS_SHUTDOWN_MSGVER_MAJOR, 0)
+
+static int			vmbus_shutdown_probe(device_t);
+static int			vmbus_shutdown_attach(device_t);
+
+static const struct vmbus_ic_desc vmbus_shutdown_descs[] = {
+	{
+		.ic_guid = { .hv_guid = {
+		    0x31, 0x60, 0x0b, 0x0e, 0x13, 0x52, 0x34, 0x49,
+		    0x81, 0x8b, 0x38, 0xd9, 0x0c, 0xed, 0x39, 0xdb } },
+		.ic_desc = "Hyper-V Shutdown"
+	},
+	VMBUS_IC_DESC_END	/* ic_desc == NULL stops vmbus_ic_probe() */
+};
+
+static device_method_t vmbus_shutdown_methods[] = {
+	/* Device interface; detach is the shared vmbus_ic_detach(). */
+	DEVMETHOD(device_probe,		vmbus_shutdown_probe),
+	DEVMETHOD(device_attach,	vmbus_shutdown_attach),
+	DEVMETHOD(device_detach,	vmbus_ic_detach),
+	DEVMETHOD_END
+};
+
+static driver_t vmbus_shutdown_driver = {
+	"hvshutdown",
+	vmbus_shutdown_methods,
+	sizeof(struct vmbus_ic_softc)	/* softc type shared by the IC drivers */
+};
+
+static devclass_t vmbus_shutdown_devclass;
+
+DRIVER_MODULE(hv_shutdown, vmbus, vmbus_shutdown_driver,
+    vmbus_shutdown_devclass, NULL, NULL);
+MODULE_VERSION(hv_shutdown, 1);
+MODULE_DEPEND(hv_shutdown, vmbus, 1, 1, 1);
+
+static void
+vmbus_shutdown_cb(struct vmbus_channel *chan, void *xsc)
+{
+	struct vmbus_ic_softc *sc = xsc;
+	struct vmbus_icmsg_hdr *hdr;
+	struct vmbus_icmsg_shutdown *msg;
+	int dlen, error, do_shutdown = 0;
+	uint64_t xactid;
+	void *data;
+
+	/*
+	 * Channel callback; xsc is the softc.  Receive a request into ic_buf.
+	 */
+	data = sc->ic_buf;
+	dlen = sc->ic_buflen;
+	error = vmbus_chan_recv(chan, data, &dlen, &xactid);
+	KASSERT(error != ENOBUFS, ("icbuf is not large enough"));
+	if (error)
+		return;
+
+	if (dlen < sizeof(*hdr)) {
+		device_printf(sc->ic_dev, "invalid data len %d\n", dlen);
+		return;
+	}
+	hdr = data;	/* dlen covers at least the common header */
+
+	/*
+	 * Update request, which will be echoed back as response.
+	 */
+	switch (hdr->ic_type) {
+	case VMBUS_ICMSG_TYPE_NEGOTIATE:
+		error = vmbus_ic_negomsg(sc, data, &dlen,
+		    VMBUS_SHUTDOWN_FWVER, VMBUS_SHUTDOWN_MSGVER);
+		if (error)
+			return;	/* failed negotiation: no reply is sent */
+		break;
+
+	case VMBUS_ICMSG_TYPE_SHUTDOWN:
+		if (dlen < VMBUS_ICMSG_SHUTDOWN_SIZE_MIN) {
+			device_printf(sc->ic_dev, "invalid shutdown len %d\n",
+			    dlen);
+			return;
+		}
+		msg = data;
+
+		/* XXX ic_flags definition?  Only 0 and 1 are accepted. */
+		if (msg->ic_haltflags == 0 || msg->ic_haltflags == 1) {
+			device_printf(sc->ic_dev, "shutdown requested\n");
+			hdr->ic_status = VMBUS_ICMSG_STATUS_OK;
+			do_shutdown = 1;	/* acted on after the reply */
+		} else {
+			device_printf(sc->ic_dev, "unknown shutdown flags "
+			    "0x%08x\n", msg->ic_haltflags);
+			hdr->ic_status = VMBUS_ICMSG_STATUS_FAIL;
+		}
+		break;
+
+	default:
+		device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type);
+		break;	/* unknown types are logged and still echoed */
+	}
+
+	/*
+	 * Send response by echoing the (possibly updated) request back.
+	 */
+	vmbus_ic_sendresp(sc, chan, data, dlen, xactid);
+	/* Request the poweroff only after the reply has been sent. */
+	if (do_shutdown)
+		shutdown_nice(RB_POWEROFF);
+}
+
+static int
+vmbus_shutdown_probe(device_t dev)
+{
+	/* Probe against the shutdown GUID table via the shared IC helper. */
+	return (vmbus_ic_probe(dev, vmbus_shutdown_descs));
+}
+
+static int
+vmbus_shutdown_attach(device_t dev)
+{
+	/* Shared IC attach, with vmbus_shutdown_cb as the channel callback. */
+	return (vmbus_ic_attach(dev, vmbus_shutdown_cb));
+}
diff --git a/sys/dev/hyperv/utilities/vmbus_timesync.c b/sys/dev/hyperv/utilities/vmbus_timesync.c
new file mode 100644
index 000000000000..2a8d3a988b43
--- /dev/null
+++ b/sys/dev/hyperv/utilities/vmbus_timesync.c
@@ -0,0 +1,260 @@
+/*-
+ * Copyright (c) 2014,2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/utilities/vmbus_icreg.h>
+#include <dev/hyperv/utilities/vmbus_icvar.h>
+
+#define VMBUS_TIMESYNC_FWVER_MAJOR	3	/* negotiation cap: 3.0 */
+#define VMBUS_TIMESYNC_FWVER		\
+	VMBUS_IC_VERSION(VMBUS_TIMESYNC_FWVER_MAJOR, 0)
+
+#define VMBUS_TIMESYNC_MSGVER_MAJOR	4	/* negotiation cap: 4.0 */
+#define VMBUS_TIMESYNC_MSGVER		\
+	VMBUS_IC_VERSION(VMBUS_TIMESYNC_MSGVER_MAJOR, 0)
+/* True when the recorded message version is at least 4.0. */
+#define VMBUS_TIMESYNC_MSGVER4(sc)	\
+	VMBUS_ICVER_LE(VMBUS_IC_VERSION(4, 0), (sc)->ic_msgver)
+/* RTT correction needs message version >= 4.0 and a non-NULL hyperv_tc64. */
+#define VMBUS_TIMESYNC_DORTT(sc)	\
+	(VMBUS_TIMESYNC_MSGVER4((sc)) && hyperv_tc64 != NULL)
+
+static int			vmbus_timesync_probe(device_t);
+static int			vmbus_timesync_attach(device_t);
+
+static const struct vmbus_ic_desc vmbus_timesync_descs[] = {
+	{
+		.ic_guid = { .hv_guid = {
+		    0x30, 0xe6, 0x27, 0x95, 0xae, 0xd0, 0x7b, 0x49,
+		    0xad, 0xce, 0xe8, 0x0a, 0xb0, 0x17, 0x5c, 0xaf } },
+		.ic_desc = "Hyper-V Timesync"
+	},
+	VMBUS_IC_DESC_END	/* ic_desc == NULL stops vmbus_ic_probe() */
+};
+
+static device_method_t vmbus_timesync_methods[] = {
+	/* Device interface; detach is the shared vmbus_ic_detach(). */
+	DEVMETHOD(device_probe,		vmbus_timesync_probe),
+	DEVMETHOD(device_attach,	vmbus_timesync_attach),
+	DEVMETHOD(device_detach,	vmbus_ic_detach),
+	DEVMETHOD_END
+};
+
+static driver_t vmbus_timesync_driver = {
+	"hvtimesync",
+	vmbus_timesync_methods,
+	sizeof(struct vmbus_ic_softc)	/* softc type shared by the IC drivers */
+};
+
+static devclass_t vmbus_timesync_devclass;
+
+DRIVER_MODULE(hv_timesync, vmbus, vmbus_timesync_driver,
+    vmbus_timesync_devclass, NULL, NULL);
+MODULE_VERSION(hv_timesync, 1);
+MODULE_DEPEND(hv_timesync, vmbus, 1, 1, 1);
+
+SYSCTL_NODE(_hw, OID_AUTO, hvtimesync, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
+    "Hyper-V timesync interface");
+
+static int vmbus_ts_ignore_sync = 0;	/* non-zero: SYNC flag is not applied */
+SYSCTL_INT(_hw_hvtimesync, OID_AUTO, ignore_sync, CTLFLAG_RWTUN,
+    &vmbus_ts_ignore_sync, 0, "Ignore the sync request.");
+
+/*
+ * Trigger sample sync when drift exceeds threshold (ms).  NOTE(review):
+ * vmbus_timesync() tests ">= 0", so only a negative value ignores samples.
+ */
+static int vmbus_ts_sample_thresh = 100;
+SYSCTL_INT(_hw_hvtimesync, OID_AUTO, sample_thresh, CTLFLAG_RWTUN,
+    &vmbus_ts_sample_thresh, 0,
+    "Threshold that makes sample request trigger the sync (unit: ms).");
+
+static int vmbus_ts_sample_verbose = 0;	/* non-zero: log every sample */
+SYSCTL_INT(_hw_hvtimesync, OID_AUTO, sample_verbose, CTLFLAG_RWTUN,
+    &vmbus_ts_sample_verbose, 0, "Increase sample request verbosity.");
+
+static void
+vmbus_timesync(struct vmbus_ic_softc *sc, uint64_t hvtime, uint64_t sent_tc,
+    uint8_t tsflags)
+{
+	struct timespec vm_ts;
+	uint64_t hv_ns, vm_ns, rtt = 0;
+	/* Steps CLOCK_REALTIME to host time per the SYNC/SAMPLE flags. */
+	if (VMBUS_TIMESYNC_DORTT(sc))
+		rtt = hyperv_tc64() - sent_tc;	/* elapsed since sent_tc */
+
+	hv_ns = (hvtime - VMBUS_ICMSG_TS_BASE + rtt) * HYPERV_TIMER_NS_FACTOR;
+	nanotime(&vm_ts);
+	vm_ns = (vm_ts.tv_sec * NANOSEC) + vm_ts.tv_nsec; /* guest time, ns */
+
+	if ((tsflags & VMBUS_ICMSG_TS_FLAG_SYNC) && !vmbus_ts_ignore_sync) {
+		struct timespec hv_ts;
+
+		if (bootverbose) {
+			device_printf(sc->ic_dev, "apply sync request, "
+			    "hv: %ju, vm: %ju\n",
+			    (uintmax_t)hv_ns, (uintmax_t)vm_ns);
+		}
+		hv_ts.tv_sec = hv_ns / NANOSEC;
+		hv_ts.tv_nsec = hv_ns % NANOSEC;
+		kern_clock_settime(curthread, CLOCK_REALTIME, &hv_ts);
+		/* Done!  An applied SYNC skips the SAMPLE handling below. */
+		return;
+	}
+
+	if ((tsflags & VMBUS_ICMSG_TS_FLAG_SAMPLE) &&
+	    vmbus_ts_sample_thresh >= 0) {
+		int64_t diff;
+
+		if (vmbus_ts_sample_verbose) {
+			device_printf(sc->ic_dev, "sample request, "
+			    "hv: %ju, vm: %ju\n",
+			    (uintmax_t)hv_ns, (uintmax_t)vm_ns);
+		}
+
+		if (hv_ns > vm_ns)
+			diff = hv_ns - vm_ns;
+		else
+			diff = vm_ns - hv_ns;
+		/* nanosec -> millisec (absolute drift, truncated) */
+		diff /= 1000000;
+
+		if (diff > vmbus_ts_sample_thresh) {
+			struct timespec hv_ts;
+
+			if (bootverbose) {
+				device_printf(sc->ic_dev,
+				    "apply sample request, hv: %ju, vm: %ju\n",
+				    (uintmax_t)hv_ns, (uintmax_t)vm_ns);
+			}
+			hv_ts.tv_sec = hv_ns / NANOSEC;
+			hv_ts.tv_nsec = hv_ns % NANOSEC;
+			kern_clock_settime(curthread, CLOCK_REALTIME, &hv_ts);
+		}
+		/* Done */
+		return;
+	}
+}
+
+static void
+vmbus_timesync_cb(struct vmbus_channel *chan, void *xsc)
+{
+	struct vmbus_ic_softc *sc = xsc;
+	struct vmbus_icmsg_hdr *hdr;
+	int dlen, error;
+	uint64_t xactid;
+	void *data;
+
+	/*
+	 * Channel callback; xsc is the softc.  Receive a request into ic_buf.
+	 */
+	data = sc->ic_buf;
+	dlen = sc->ic_buflen;
+	error = vmbus_chan_recv(chan, data, &dlen, &xactid);
+	KASSERT(error != ENOBUFS, ("icbuf is not large enough"));
+	if (error)
+		return;
+
+	if (dlen < sizeof(*hdr)) {
+		device_printf(sc->ic_dev, "invalid data len %d\n", dlen);
+		return;
+	}
+	hdr = data;	/* dlen covers at least the common header */
+
+	/*
+	 * Update request, which will be echoed back as response.
+	 */
+	switch (hdr->ic_type) {
+	case VMBUS_ICMSG_TYPE_NEGOTIATE:
+		error = vmbus_ic_negomsg(sc, data, &dlen,
+		    VMBUS_TIMESYNC_FWVER, VMBUS_TIMESYNC_MSGVER);
+		if (error)
+			return;	/* failed negotiation: no reply is sent */
+		if (VMBUS_TIMESYNC_DORTT(sc))
+			device_printf(sc->ic_dev, "RTT\n");
+		break;
+
+	case VMBUS_ICMSG_TYPE_TIMESYNC:
+		if (VMBUS_TIMESYNC_MSGVER4(sc)) {
+			const struct vmbus_icmsg_timesync4 *msg4;
+
+			if (dlen < sizeof(*msg4)) {
+				device_printf(sc->ic_dev, "invalid timesync4 "
+				    "len %d\n", dlen);
+				return;
+			}
+			msg4 = data;
+			vmbus_timesync(sc, msg4->ic_hvtime, msg4->ic_sent_tc,
+			    msg4->ic_tsflags);
+		} else {
+			const struct vmbus_icmsg_timesync *msg;
+
+			if (dlen < sizeof(*msg)) {
+				device_printf(sc->ic_dev, "invalid timesync "
+				    "len %d\n", dlen);
+				return;
+			}
+			msg = data;	/* older layout: no sent_tc, so pass 0 */
+			vmbus_timesync(sc, msg->ic_hvtime, 0, msg->ic_tsflags);
+		}
+		break;
+
+	default:
+		device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type);
+		break;	/* unknown types are logged and still echoed */
+	}
+
+	/*
+	 * Send response by echoing the request back.
+	 */
+	vmbus_ic_sendresp(sc, chan, data, dlen, xactid);
+}
+
+static int
+vmbus_timesync_probe(device_t dev)
+{
+	/* Probe against the timesync GUID table via the shared IC helper. */
+	return (vmbus_ic_probe(dev, vmbus_timesync_descs));
+}
+
+static int
+vmbus_timesync_attach(device_t dev)
+{
+	/* Shared IC attach, with vmbus_timesync_cb as the channel callback. */
+	return (vmbus_ic_attach(dev, vmbus_timesync_cb));
+}
diff --git a/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c b/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c
new file mode 100644
index 000000000000..11d549dc18d2
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c
@@ -0,0 +1,236 @@
+/*-
+ * Copyright (c) 2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/timetc.h>
+#include <sys/vdso.h>
+
+#include <machine/cpufunc.h>
+#include <machine/cputypes.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+#include <vm/vm.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/vmbus/hyperv_machdep.h>
+#include <dev/hyperv/vmbus/hyperv_reg.h>
+#include <dev/hyperv/vmbus/hyperv_var.h>
+
+/*
+ * Bookkeeping for the reference TSC page.
+ *
+ * tsc_ref     - kernel pointer to the page returned by hyperv_dmamem_alloc()
+ * tsc_ref_dma - DMA state of that allocation; its hv_paddr is the physical
+ *               address programmed into MSR_HV_REFERENCE_TSC and returned
+ *               by the mmap handler
+ */
+struct hyperv_reftsc_ctx {
+	struct hyperv_reftsc	*tsc_ref;
+	struct hyperv_dma	tsc_ref_dma;
+};
+
+static uint32_t			hyperv_tsc_vdso_timehands(
+				    struct vdso_timehands *,
+				    struct timecounter *);
+
+static d_open_t			hyperv_tsc_open;
+static d_mmap_t			hyperv_tsc_mmap;
+
+/*
+ * Timecounter backed by the reference TSC page.  tc_get_timecount is left
+ * NULL here and filled in by hyperv_tsc_tcinit(), which picks the mfence
+ * variant for AMD/Hygon and the lfence variant for Intel before calling
+ * tc_init().
+ */
+static struct timecounter	hyperv_tsc_timecounter = {
+	.tc_get_timecount	= NULL,	/* based on CPU vendor. */
+	.tc_counter_mask	= 0xffffffff,
+	.tc_frequency		= HYPERV_TIMER_FREQ,
+	.tc_name		= "Hyper-V-TSC",
+	.tc_quality		= 3000,
+	.tc_fill_vdso_timehands = hyperv_tsc_vdso_timehands,
+};
+
+/*
+ * Character device switch for the reference TSC device node.  Only open
+ * and mmap are provided: hyperv_tsc_open() refuses opens for writing and
+ * hyperv_tsc_mmap() hands out the physical address of the reference TSC
+ * page.
+ */
+static struct cdevsw		hyperv_tsc_cdevsw = {
+	.d_version		= D_VERSION,
+	.d_open			= hyperv_tsc_open,
+	.d_mmap			= hyperv_tsc_mmap,
+	.d_name			= HYPERV_REFTSC_DEVNAME
+};
+
+static struct hyperv_reftsc_ctx	hyperv_ref_tsc;
+
+/*
+ * Issue a hypercall by calling into the hypercall page.
+ *
+ * hc_addr   - kernel virtual address of the hypercall page (call target)
+ * in_val    - hypercall input value, passed in %rcx
+ * in_paddr  - physical address of the input parameters, passed in %rdx
+ * out_paddr - physical address of the output parameters, passed in %r8
+ *
+ * Returns the status value found in %rax after the call.
+ *
+ * The load of %r8 and the call are emitted as ONE asm statement: with two
+ * separate statements the compiler is free to reuse %r8 between them.
+ * The hypercall page is opaque code entered with a plain call, so the
+ * argument registers are marked read-write and the remaining caller-saved
+ * integer registers, the flags and memory are declared clobbered.
+ */
+uint64_t
+hypercall_md(volatile void *hc_addr, uint64_t in_val,
+    uint64_t in_paddr, uint64_t out_paddr)
+{
+	uint64_t status;
+
+	__asm__ __volatile__ (
+	    "mov %4, %%r8\n\t"
+	    "call *%3"
+	    : "=a" (status), "+c" (in_val), "+d" (in_paddr)
+	    : "m" (hc_addr), "r" (out_paddr)
+	    : "cc", "memory", "r8", "r9", "r10", "r11");
+	return (status);
+}
+
+/*
+ * d_open handler of the reference TSC device.  An open that requests
+ * write access (FWRITE set in oflags) fails with EPERM; any other open
+ * succeeds.
+ */
+static int
+hyperv_tsc_open(struct cdev *dev __unused, int oflags, int devtype __unused,
+    struct thread *td __unused)
+{
+	const int wants_write = (oflags & FWRITE) != 0;
+
+	return (wants_write ? EPERM : 0);
+}
+
+/*
+ * d_mmap handler of the reference TSC device.  Offset 0 resolves to the
+ * physical address of the reference TSC page (stored through 'paddr');
+ * every other offset is rejected with EOPNOTSUPP.
+ */
+static int
+hyperv_tsc_mmap(struct cdev *dev __unused, vm_ooffset_t offset,
+    vm_paddr_t *paddr, int nprot __unused, vm_memattr_t *memattr __unused)
+{
+
+	KASSERT(hyperv_ref_tsc.tsc_ref != NULL, ("reftsc has not been setup"));
+
+	/*
+	 * 'nprot' is deliberately ignored: it carries nothing of interest
+	 * here, and opens for writing are already refused by d_open.
+	 */
+	if (offset == 0) {
+		*paddr = hyperv_ref_tsc.tsc_ref_dma.hv_paddr;
+		return (0);
+	}
+
+	/* Only offset 0 is mappable. */
+	return (EOPNOTSUPP);
+}
+
+/*
+ * tc_fill_vdso_timehands method: marks the vdso timehands as using the
+ * VDSO_TH_ALGO_X86_HVTSC algorithm and zeroes the remaining x86 fields
+ * and the reserved words.  Always returns 1.
+ */
+static uint32_t
+hyperv_tsc_vdso_timehands(struct vdso_timehands *vdso_th,
+    struct timecounter *tc __unused)
+{
+
+	/* Zero the reserved words and the other x86-specific fields. */
+	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
+	vdso_th->th_x86_pvc_stable_mask = 0;
+	vdso_th->th_x86_pvc_last_systime = 0;
+	vdso_th->th_x86_hpet_idx = 0;
+	vdso_th->th_x86_shift = 0;
+
+	/* Select the Hyper-V TSC algorithm. */
+	vdso_th->th_algo = VDSO_TH_ALGO_X86_HVTSC;
+	return (1);
+}
+
+/*
+ * HYPERV_TSC_TIMECOUNT(fence) generates two functions for the given
+ * fence primitive (lfence or mfence):
+ *
+ *   hyperv_tc64_tsc_<fence>()      - 64-bit time derived from the
+ *                                    reference TSC page
+ *   hyperv_tsc_timecount_<fence>() - timecounter method; returns the
+ *                                    same value converted to u_int
+ *
+ * The reader loads tsc_seq, then scale and offset, issues the fence,
+ * reads the TSC and computes ((tsc * scale) >> 64) + ofs.  The result is
+ * returned only if tsc_seq still equals the value loaded at the start;
+ * otherwise the loop retries.  A tsc_seq of zero ends the loop and the
+ * value of MSR_HV_TIME_REF_COUNT is returned instead.
+ *
+ * The trailing 'struct __hack' lets each invocation end with a semicolon.
+ */
+#define HYPERV_TSC_TIMECOUNT(fence)					\
+static uint64_t								\
+hyperv_tc64_tsc_##fence(void)						\
+{									\
+	struct hyperv_reftsc *tsc_ref = hyperv_ref_tsc.tsc_ref;		\
+	uint32_t seq;							\
+									\
+	while ((seq = atomic_load_acq_int(&tsc_ref->tsc_seq)) != 0) {	\
+		uint64_t disc, ret, tsc;				\
+		uint64_t scale = tsc_ref->tsc_scale;			\
+		int64_t ofs = tsc_ref->tsc_ofs;				\
+									\
+		fence();						\
+		tsc = rdtsc();						\
+									\
+		/* ret = ((tsc * scale) >> 64) + ofs */			\
+		__asm__ __volatile__ ("mulq %3" :			\
+		    "=d" (ret), "=a" (disc) :				\
+		    "a" (tsc), "r" (scale));				\
+		ret += ofs;						\
+									\
+		atomic_thread_fence_acq();				\
+		if (tsc_ref->tsc_seq == seq)				\
+			return (ret);					\
+									\
+		/* Sequence changed; re-sync. */			\
+	}								\
+	/* Fallback to the generic timecounter, i.e. rdmsr. */		\
+	return (rdmsr(MSR_HV_TIME_REF_COUNT));				\
+}									\
+									\
+static u_int								\
+hyperv_tsc_timecount_##fence(struct timecounter *tc __unused)		\
+{									\
+									\
+	return (hyperv_tc64_tsc_##fence());				\
+}									\
+struct __hack
+
+HYPERV_TSC_TIMECOUNT(lfence);
+HYPERV_TSC_TIMECOUNT(mfence);
+
+/*
+ * SYSINIT hook that sets up the reference-TSC based timecounter.
+ *
+ * Nothing is done unless Hyper-V advertises both the reference counter
+ * and the reference TSC MSRs, the CPU has SSE2, and the CPU vendor is one
+ * of AMD, Hygon or Intel.  Otherwise the function:
+ *   1. allocates the zeroed, page-aligned reference TSC page;
+ *   2. programs its page frame number into MSR_HV_REFERENCE_TSC with the
+ *      enable bit set, keeping the reserved bits that were read back;
+ *   3. registers hyperv_tsc_timecounter;
+ *   4. publishes the matching 64-bit reader through hyperv_tc64;
+ *   5. creates the read-only device node used for mmap(2).
+ */
+static void
+hyperv_tsc_tcinit(void *dummy __unused)
+{
+	hyperv_tc64_t tc64 = NULL;
+	uint64_t val, orig;
+
+	if ((hyperv_features &
+	     (CPUID_HV_MSR_TIME_REFCNT | CPUID_HV_MSR_REFERENCE_TSC)) !=
+	    (CPUID_HV_MSR_TIME_REFCNT | CPUID_HV_MSR_REFERENCE_TSC) ||
+	    (cpu_feature & CPUID_SSE2) == 0)	/* SSE2 for mfence/lfence */
+		return;
+
+	/* The fence used before rdtsc depends on the CPU vendor. */
+	switch (cpu_vendor_id) {
+	case CPU_VENDOR_AMD:
+	case CPU_VENDOR_HYGON:
+		hyperv_tsc_timecounter.tc_get_timecount =
+		    hyperv_tsc_timecount_mfence;
+		tc64 = hyperv_tc64_tsc_mfence;
+		break;
+
+	case CPU_VENDOR_INTEL:
+		hyperv_tsc_timecounter.tc_get_timecount =
+		    hyperv_tsc_timecount_lfence;
+		tc64 = hyperv_tc64_tsc_lfence;
+		break;
+
+	default:
+		/* Unsupported CPU vendors. */
+		return;
+	}
+
+	hyperv_ref_tsc.tsc_ref = hyperv_dmamem_alloc(NULL, PAGE_SIZE, 0,
+	    sizeof(struct hyperv_reftsc), &hyperv_ref_tsc.tsc_ref_dma,
+	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
+	if (hyperv_ref_tsc.tsc_ref == NULL) {
+		printf("hyperv: reftsc page allocation failed\n");
+		return;
+	}
+
+	/* Read-modify-write: reserved bits of the MSR are carried over. */
+	orig = rdmsr(MSR_HV_REFERENCE_TSC);
+	val = MSR_HV_REFTSC_ENABLE | (orig & MSR_HV_REFTSC_RSVD_MASK) |
+	    ((hyperv_ref_tsc.tsc_ref_dma.hv_paddr >> PAGE_SHIFT) <<
+	     MSR_HV_REFTSC_PGSHIFT);
+	wrmsr(MSR_HV_REFERENCE_TSC, val);
+
+	/* Register "enlightened" timecounter. */
+	tc_init(&hyperv_tsc_timecounter);
+
+	/* Install 64 bits timecounter method for other modules to use. */
+	KASSERT(tc64 != NULL, ("tc64 is not set"));
+	hyperv_tc64 = tc64;
+
+	/* Add device for mmap(2). */
+	make_dev(&hyperv_tsc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0444,
+	    HYPERV_REFTSC_DEVNAME);
+}
+SYSINIT(hyperv_tsc_init, SI_SUB_DRIVERS, SI_ORDER_FIRST, hyperv_tsc_tcinit,
+    NULL);
diff --git a/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S b/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S
new file mode 100644
index 000000000000..30c07348734c
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "assym.inc"
+
+#include <machine/psl.h>
+#include <machine/asmacros.h>
+#include <machine/specialreg.h>
+
+/*
+ * This is the Hyper-V vmbus channel direct callback interrupt.
+ * Only used when it is running on Hyper-V.
+ */
+	.text
+	SUPERALIGN_TEXT
+	INTR_HANDLER	vmbus_isr
+	/*
+	 * Pass the current stack pointer to the C handler in %rdi.
+	 * NOTE(review): presumably this is the trap frame built by
+	 * INTR_HANDLER -- confirm against the macro definition.
+	 */
+	movq	%rsp, %rdi
+	call	vmbus_handle_intr
+	/* Exit through doreti instead of returning directly. */
+	jmp	doreti
diff --git a/sys/dev/hyperv/vmbus/hyperv.c b/sys/dev/hyperv/vmbus/hyperv.c
new file mode 100644
index 000000000000..01e0ad9610d9
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/hyperv.c
@@ -0,0 +1,340 @@
+/*-
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * Implements low-level interactions with Hyper-V/Azure
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/timetc.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/pmap.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/vmbus/hyperv_machdep.h>
+#include <dev/hyperv/vmbus/hyperv_reg.h>
+#include <dev/hyperv/vmbus/hyperv_var.h>
+
+/*
+ * Pieces of the guest OS identity written to MSR_HV_GUEST_OS_ID.
+ * The build number and OS id are fixed at zero; the version field carries
+ * __FreeBSD_version.
+ */
+#define HYPERV_FREEBSD_BUILD		0ULL
+#define HYPERV_FREEBSD_VERSION		((uint64_t)__FreeBSD_version)
+#define HYPERV_FREEBSD_OSID		0ULL
+
+/* Each piece shifted into place and limited to its field mask. */
+#define MSR_HV_GUESTID_BUILD_FREEBSD	\
+	(HYPERV_FREEBSD_BUILD & MSR_HV_GUESTID_BUILD_MASK)
+#define MSR_HV_GUESTID_VERSION_FREEBSD	\
+	((HYPERV_FREEBSD_VERSION << MSR_HV_GUESTID_VERSION_SHIFT) & \
+	 MSR_HV_GUESTID_VERSION_MASK)
+#define MSR_HV_GUESTID_OSID_FREEBSD	\
+	((HYPERV_FREEBSD_OSID << MSR_HV_GUESTID_OSID_SHIFT) & \
+	 MSR_HV_GUESTID_OSID_MASK)
+
+/* Complete guest id: the three fields above plus the FreeBSD OS type. */
+#define MSR_HV_GUESTID_FREEBSD		\
+	(MSR_HV_GUESTID_BUILD_FREEBSD |	\
+	 MSR_HV_GUESTID_VERSION_FREEBSD | \
+	 MSR_HV_GUESTID_OSID_FREEBSD |	\
+	 MSR_HV_GUESTID_OSTYPE_FREEBSD)
+
+/*
+ * Location of the hypercall page.
+ *
+ * hc_addr  - kernel virtual address of the page; NULL once it is freed
+ * hc_paddr - physical address of the page, as obtained with vtophys()
+ */
+struct hypercall_ctx {
+	void			*hc_addr;
+	vm_paddr_t		hc_paddr;
+};
+
+static u_int			hyperv_get_timecount(struct timecounter *);
+static bool			hyperv_identify(void);
+static void			hypercall_memfree(void);
+
+u_int				hyperv_ver_major;
+
+u_int				hyperv_features;
+u_int				hyperv_recommends;
+
+static u_int			hyperv_pm_features;
+static u_int			hyperv_features3;
+
+hyperv_tc64_t			hyperv_tc64;
+
+/*
+ * Timecounter that reads MSR_HV_TIME_REF_COUNT on every query (see
+ * hyperv_get_timecount()).  hyperv_init() registers it when the
+ * CPUID_HV_MSR_TIME_REFCNT feature is present.
+ */
+static struct timecounter	hyperv_timecounter = {
+	.tc_get_timecount	= hyperv_get_timecount,
+	.tc_poll_pps		= NULL,
+	.tc_counter_mask	= 0xffffffff,
+	.tc_frequency		= HYPERV_TIMER_FREQ,
+	.tc_name		= "Hyper-V",
+	.tc_quality		= 2000,
+	.tc_flags		= 0,
+	.tc_priv		= NULL
+};
+
+static struct hypercall_ctx	hypercall_context;
+
+/*
+ * Timecounter read method: returns the value of MSR_HV_TIME_REF_COUNT
+ * converted to u_int.
+ */
+static u_int
+hyperv_get_timecount(struct timecounter *tc __unused)
+{
+	uint64_t refcnt;
+
+	refcnt = rdmsr(MSR_HV_TIME_REF_COUNT);
+	return ((u_int)refcnt);
+}
+
+/*
+ * 64-bit time reader installed in hyperv_tc64 by hyperv_init(): returns
+ * the full value of MSR_HV_TIME_REF_COUNT.
+ */
+static uint64_t
+hyperv_tc64_rdmsr(void)
+{
+	uint64_t refcnt;
+
+	refcnt = rdmsr(MSR_HV_TIME_REF_COUNT);
+	return (refcnt);
+}
+
+/*
+ * Issue the HYPERCALL_POST_MESSAGE hypercall.
+ *
+ * msg_paddr - physical address of the input parameters
+ *
+ * No output address is supplied (0).  Returns the value produced by
+ * hypercall_md().
+ */
+uint64_t
+hypercall_post_message(bus_addr_t msg_paddr)
+{
+	uint64_t status;
+
+	status = hypercall_md(hypercall_context.hc_addr,
+	    HYPERCALL_POST_MESSAGE, msg_paddr, 0);
+	return (status);
+}
+
+/*
+ * Issue the HYPERCALL_SIGNAL_EVENT hypercall.
+ *
+ * monprm_paddr - physical address of the input parameters
+ *
+ * No output address is supplied (0).  Returns the value produced by
+ * hypercall_md().
+ */
+uint64_t
+hypercall_signal_event(bus_addr_t monprm_paddr)
+{
+	uint64_t status;
+
+	status = hypercall_md(hypercall_context.hc_addr,
+	    HYPERCALL_SIGNAL_EVENT, monprm_paddr, 0);
+	return (status);
+}
+
+/*
+ * Format a GUID as "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" into buf.
+ *
+ * The first three groups are emitted with their bytes reversed relative
+ * to their order in guid->hv_guid; the last two groups are emitted in
+ * storage order.
+ *
+ * guid - GUID to format
+ * buf  - destination buffer
+ * sz   - size of buf, passed to snprintf()
+ *
+ * Returns whatever snprintf() returns.
+ */
+int
+hyperv_guid2str(const struct hyperv_guid *guid, char *buf, size_t sz)
+{
+	const uint8_t *b = guid->hv_guid;
+	int len;
+
+	len = snprintf(buf, sz,
+	    "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-"
+	    "%02x%02x%02x%02x%02x%02x",
+	    b[3], b[2], b[1], b[0],
+	    b[5], b[4],
+	    b[7], b[6],
+	    b[8], b[9],
+	    b[10], b[11], b[12], b[13], b[14], b[15]);
+	return (len);
+}
+
+/*
+ * Determine through CPUID whether the kernel really runs on Hyper-V.
+ *
+ * Returns false when vm_guest is not VM_GUEST_HV, when the maximum
+ * hypervisor leaf is below CPUID_LEAF_HV_LIMITS, when the interface
+ * signature is not CPUID_HV_IFACE_HYPERV, or when the hypercall MSRs are
+ * not advertised.  Returns true otherwise.
+ *
+ * On success the feature words (hyperv_features, hyperv_pm_features,
+ * hyperv_features3), hyperv_ver_major and hyperv_recommends are recorded
+ * and a summary is printed; the recommendation, limit and hardware
+ * feature words are printed only with bootverbose.
+ */
+static bool
+hyperv_identify(void)
+{
+	u_int regs[4];
+	unsigned int maxleaf;
+
+	if (vm_guest != VM_GUEST_HV)
+		return (false);
+
+	do_cpuid(CPUID_LEAF_HV_MAXLEAF, regs);
+	maxleaf = regs[0];
+	if (maxleaf < CPUID_LEAF_HV_LIMITS)
+		return (false);
+
+	do_cpuid(CPUID_LEAF_HV_INTERFACE, regs);
+	if (regs[0] != CPUID_HV_IFACE_HYPERV)
+		return (false);
+
+	do_cpuid(CPUID_LEAF_HV_FEATURES, regs);
+	if ((regs[0] & CPUID_HV_MSR_HYPERCALL) == 0) {
+		/*
+		 * Hyper-V w/o Hypercall is impossible; someone
+		 * is faking Hyper-V.
+		 */
+		return (false);
+	}
+	hyperv_features = regs[0];
+	hyperv_pm_features = regs[2];
+	hyperv_features3 = regs[3];
+
+	/* All of the values below are u_int, hence %u rather than %d. */
+	do_cpuid(CPUID_LEAF_HV_IDENTITY, regs);
+	hyperv_ver_major = regs[1] >> 16;
+	printf("Hyper-V Version: %u.%u.%u [SP%u]\n",
+	    hyperv_ver_major, regs[1] & 0xffff, regs[0], regs[2]);
+
+	printf("  Features=0x%b\n", hyperv_features,
+	    "\020"
+	    "\001VPRUNTIME"	/* MSR_HV_VP_RUNTIME */
+	    "\002TMREFCNT"	/* MSR_HV_TIME_REF_COUNT */
+	    "\003SYNIC"		/* MSRs for SynIC */
+	    "\004SYNTM"		/* MSRs for SynTimer */
+	    "\005APIC"		/* MSR_HV_{EOI,ICR,TPR} */
+	    "\006HYPERCALL"	/* MSR_HV_{GUEST_OS_ID,HYPERCALL} */
+	    "\007VPINDEX"	/* MSR_HV_VP_INDEX */
+	    "\010RESET"		/* MSR_HV_RESET */
+	    "\011STATS"		/* MSR_HV_STATS_ */
+	    "\012REFTSC"	/* MSR_HV_REFERENCE_TSC */
+	    "\013IDLE"		/* MSR_HV_GUEST_IDLE */
+	    "\014TMFREQ"	/* MSR_HV_{TSC,APIC}_FREQUENCY */
+	    "\015DEBUG");	/* MSR_HV_SYNTH_DEBUG_ */
+	printf("  PM Features=0x%b [C%u]\n",
+	    (hyperv_pm_features & ~CPUPM_HV_CSTATE_MASK),
+	    "\020"
+	    "\005C3HPET",	/* HPET is required for C3 state */
+	    CPUPM_HV_CSTATE(hyperv_pm_features));
+	printf("  Features3=0x%b\n", hyperv_features3,
+	    "\020"
+	    "\001MWAIT"		/* MWAIT */
+	    "\002DEBUG"		/* guest debug support */
+	    "\003PERFMON"	/* performance monitor */
+	    "\004PCPUDPE"	/* physical CPU dynamic partition event */
+	    "\005XMMHC"		/* hypercall input through XMM regs */
+	    "\006IDLE"		/* guest idle support */
+	    "\007SLEEP"		/* hypervisor sleep support */
+	    "\010NUMA"		/* NUMA distance query support */
+	    "\011TMFREQ"	/* timer frequency query (TSC, LAPIC) */
+	    "\012SYNCMC"	/* inject synthetic machine checks */
+	    "\013CRASH"		/* MSRs for guest crash */
+	    "\014DEBUGMSR"	/* MSRs for guest debug */
+	    "\015NPIEP"		/* NPIEP */
+	    "\016HVDIS");	/* disabling hypervisor */
+
+	do_cpuid(CPUID_LEAF_HV_RECOMMENDS, regs);
+	hyperv_recommends = regs[0];
+	if (bootverbose)
+		printf("  Recommends: %08x %08x\n", regs[0], regs[1]);
+
+	do_cpuid(CPUID_LEAF_HV_LIMITS, regs);
+	if (bootverbose) {
+		printf("  Limits: Vcpu:%u Lcpu:%u Int:%u\n",
+		    regs[0], regs[1], regs[2]);
+	}
+
+	/* The hardware features leaf is optional; query it only if present. */
+	if (maxleaf >= CPUID_LEAF_HV_HWFEATURES) {
+		do_cpuid(CPUID_LEAF_HV_HWFEATURES, regs);
+		if (bootverbose) {
+			printf("  HW Features: %08x, AMD: %08x\n",
+			    regs[0], regs[3]);
+		}
+	}
+
+	return (true);
+}
+
+/*
+ * SYSINIT hook run at SI_SUB_HYPERVISOR: identify Hyper-V, announce the
+ * guest OS id and, when the reference counter MSR is available, register
+ * the MSR-based timecounter and publish its 64-bit reader.
+ */
+static void
+hyperv_init(void *dummy __unused)
+{
+	if (!hyperv_identify()) {
+		/* Not Hyper-V; reset guest id to the generic one. */
+		if (vm_guest == VM_GUEST_HV)
+			vm_guest = VM_GUEST_VM;
+		return;
+	}
+
+	/* Set guest id */
+	wrmsr(MSR_HV_GUEST_OS_ID, MSR_HV_GUESTID_FREEBSD);
+
+	/* Without the reference counter MSR there is nothing left to do. */
+	if ((hyperv_features & CPUID_HV_MSR_TIME_REFCNT) == 0)
+		return;
+
+	/*
+	 * Register Hyper-V timecounter.  This should be done as early
+	 * as possible to let DELAY() work, since the 8254 PIT is not
+	 * reliably emulated or even available.
+	 */
+	tc_init(&hyperv_timecounter);
+
+	/* Install 64 bits timecounter method for other modules to use. */
+	hyperv_tc64 = hyperv_tc64_rdmsr;
+}
+SYSINIT(hyperv_initialize, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, hyperv_init,
+    NULL);
+
+/*
+ * Release the hypercall page and clear the recorded address, so that
+ * hypercall_destroy() sees it as already gone.
+ */
+static void
+hypercall_memfree(void)
+{
+	vm_offset_t page_va;
+
+	page_va = (vm_offset_t)hypercall_context.hc_addr;
+	kmem_free(page_va, PAGE_SIZE);
+	hypercall_context.hc_addr = NULL;
+}
+
+/*
+ * SYSINIT hook that installs the hypercall page.
+ *
+ * Does nothing unless vm_guest is VM_GUEST_HV.  An executable page is
+ * allocated, its page frame number is written to MSR_HV_HYPERCALL with
+ * the enable bit set, and the MSR is read back to check that the enable
+ * bit stuck.  If it did not, the page is freed and vm_guest is downgraded
+ * to VM_GUEST_VM.
+ */
+static void
+hypercall_create(void *arg __unused)
+{
+	uint64_t hc, hc_orig;
+
+	if (vm_guest != VM_GUEST_HV)
+		return;
+
+	/*
+	 * NOTE:
+	 * - busdma(9), i.e. hyperv_dmamem APIs, can _not_ be used due to
+	 *   the NX bit.
+	 * - Assume kmem_malloc() returns properly aligned memory.
+	 */
+	hypercall_context.hc_addr = (void *)kmem_malloc(PAGE_SIZE, M_EXEC |
+	    M_WAITOK);
+	hypercall_context.hc_paddr = vtophys(hypercall_context.hc_addr);
+
+	/* Get the 'reserved' bits, which must be preserved. */
+	hc_orig = rdmsr(MSR_HV_HYPERCALL);
+
+	/*
+	 * Setup the Hypercall page.
+	 *
+	 * NOTE: 'reserved' bits MUST be preserved.
+	 */
+	hc = ((hypercall_context.hc_paddr >> PAGE_SHIFT) <<
+	    MSR_HV_HYPERCALL_PGSHIFT) |
+	    (hc_orig & MSR_HV_HYPERCALL_RSVD_MASK) |
+	    MSR_HV_HYPERCALL_ENABLE;
+	wrmsr(MSR_HV_HYPERCALL, hc);
+
+	/*
+	 * Confirm that Hypercall page did get setup.
+	 */
+	hc = rdmsr(MSR_HV_HYPERCALL);
+	if ((hc & MSR_HV_HYPERCALL_ENABLE) == 0) {
+		printf("hyperv: Hypercall setup failed\n");
+		hypercall_memfree();
+		/* Can't perform any Hyper-V specific actions */
+		vm_guest = VM_GUEST_VM;
+		return;
+	}
+	if (bootverbose)
+		printf("hyperv: Hypercall created\n");
+}
+SYSINIT(hypercall_ctor, SI_SUB_DRIVERS, SI_ORDER_FIRST, hypercall_create, NULL);
+
+/*
+ * SYSUNINIT hook that undoes hypercall_create(): the hypercall MSR is
+ * rewritten with only its reserved bits, then the page is freed.
+ */
+static void
+hypercall_destroy(void *arg __unused)
+{
+	uint64_t msr;
+
+	/* Nothing to tear down if the hypercall page is not allocated. */
+	if (hypercall_context.hc_addr == NULL)
+		return;
+
+	/* Disable Hypercall: keep the reserved bits, drop everything else. */
+	msr = rdmsr(MSR_HV_HYPERCALL) & MSR_HV_HYPERCALL_RSVD_MASK;
+	wrmsr(MSR_HV_HYPERCALL, msr);
+	hypercall_memfree();
+
+	if (bootverbose)
+		printf("hyperv: Hypercall destroyed\n");
+}
+SYSUNINIT(hypercall_dtor, SI_SUB_DRIVERS, SI_ORDER_FIRST, hypercall_destroy,
+    NULL);
diff --git a/sys/dev/hyperv/vmbus/hyperv_busdma.c b/sys/dev/hyperv/vmbus/hyperv_busdma.c
new file mode 100644
index 000000000000..9550540014c4
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/hyperv_busdma.c
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+
+#include <machine/bus.h>
+
+#include <dev/hyperv/include/hyperv_busdma.h>
+
+#define HYPERV_DMA_MASK	(BUS_DMA_WAITOK | BUS_DMA_NOWAIT | BUS_DMA_ZERO)
+
+/*
+ * bus_dmamap_load() callback that records the address of the single
+ * mapped segment.
+ *
+ * arg   - points to the bus_addr_t that receives the address
+ * segs  - segment array; exactly one entry is expected
+ * nseg  - number of entries in segs
+ * error - load status; on error the destination is left untouched
+ */
+void
+hyperv_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error)
+{
+	bus_addr_t *out;
+
+	if (error != 0)
+		return;
+
+	KASSERT(nseg == 1, ("too many segments %d!", nseg));
+	out = arg;
+	*out = segs[0].ds_addr;
+}
+
+/*
+ * Allocate a single-segment DMA buffer and load it.
+ *
+ * parent_dtag - parent DMA tag
+ * alignment   - alignment passed to the new tag
+ * boundary    - boundary passed to the new tag
+ * size        - buffer size; also used as the maximum segment size
+ * dma         - receives the tag (hv_dtag), map (hv_dmap) and the bus
+ *               address of the buffer (hv_paddr)
+ * flags       - only the bits in HYPERV_DMA_MASK are honoured;
+ *               BUS_DMA_COHERENT is always added
+ *
+ * Returns the kernel address of the buffer, or NULL if any step fails,
+ * in which case everything set up so far is released again.
+ */
+void *
+hyperv_dmamem_alloc(bus_dma_tag_t parent_dtag, bus_size_t alignment,
+    bus_addr_t boundary, bus_size_t size, struct hyperv_dma *dma, int flags)
+{
+	void *kva;
+	int error;
+
+	error = bus_dma_tag_create(parent_dtag, /* parent */
+	    alignment,		/* alignment */
+	    boundary,		/* boundary */
+	    BUS_SPACE_MAXADDR,	/* lowaddr */
+	    BUS_SPACE_MAXADDR,	/* highaddr */
+	    NULL, NULL,		/* filter, filterarg */
+	    size,		/* maxsize */
+	    1,			/* nsegments */
+	    size,		/* maxsegsize */
+	    0,			/* flags */
+	    NULL,		/* lockfunc */
+	    NULL,		/* lockfuncarg */
+	    &dma->hv_dtag);
+	if (error != 0)
+		return (NULL);
+
+	error = bus_dmamem_alloc(dma->hv_dtag, &kva,
+	    (flags & HYPERV_DMA_MASK) | BUS_DMA_COHERENT, &dma->hv_dmap);
+	if (error != 0)
+		goto fail_tag;
+
+	error = bus_dmamap_load(dma->hv_dtag, dma->hv_dmap, kva, size,
+	    hyperv_dma_map_paddr, &dma->hv_paddr, BUS_DMA_NOWAIT);
+	if (error != 0)
+		goto fail_mem;
+
+	return (kva);
+
+fail_mem:
+	bus_dmamem_free(dma->hv_dtag, kva, dma->hv_dmap);
+fail_tag:
+	bus_dma_tag_destroy(dma->hv_dtag);
+	return (NULL);
+}
+
+/*
+ * Release a buffer obtained from hyperv_dmamem_alloc().
+ *
+ * dma - the DMA state filled in by the allocation
+ * ptr - the kernel address the allocation returned
+ *
+ * Teardown mirrors the allocation in reverse: unload the map, free the
+ * memory, then destroy the tag.
+ */
+void
+hyperv_dmamem_free(struct hyperv_dma *dma, void *ptr)
+{
+	bus_dmamap_unload(dma->hv_dtag, dma->hv_dmap);
+	bus_dmamem_free(dma->hv_dtag, ptr, dma->hv_dmap);
+	bus_dma_tag_destroy(dma->hv_dtag);
+}
diff --git a/sys/dev/hyperv/vmbus/hyperv_machdep.h b/sys/dev/hyperv/vmbus/hyperv_machdep.h
new file mode 100644
index 000000000000..48cf5b78dc3b
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/hyperv_machdep.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HYPERV_MACHDEP_H_
+#define _HYPERV_MACHDEP_H_
+
+#include <sys/param.h>
+
+/*
+ * Machine-dependent hypercall entry.  hc_addr is the hypercall page to
+ * call into, in_val the hypercall input value, and in_paddr/out_paddr
+ * the physical addresses of the input and output parameters.  Returns
+ * the hypercall status.
+ */
+uint64_t	hypercall_md(volatile void *hc_addr, uint64_t in_val,
+		    uint64_t in_paddr, uint64_t out_paddr);
+
+#endif	/* !_HYPERV_MACHDEP_H_ */
diff --git a/sys/dev/hyperv/vmbus/hyperv_reg.h b/sys/dev/hyperv/vmbus/hyperv_reg.h
new file mode 100644
index 000000000000..b3b133c84881
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/hyperv_reg.h
@@ -0,0 +1,193 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HYPERV_REG_H_
+#define _HYPERV_REG_H_
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+/*
+ * Hyper-V Synthetic MSRs
+ */
+
+/*
+ * Guest OS identity MSR.  Field layout, as given by the masks below:
+ *   bits  0-15  build number
+ *   bits 16-47  version
+ *   bits 48-55  OS id
+ *   bits 56-62  OS type
+ *   bit  63     open source flag
+ */
+#define MSR_HV_GUEST_OS_ID		0x40000000
+#define MSR_HV_GUESTID_BUILD_MASK	0xffffULL
+#define MSR_HV_GUESTID_VERSION_MASK	0x0000ffffffff0000ULL
+#define MSR_HV_GUESTID_VERSION_SHIFT	16
+#define MSR_HV_GUESTID_OSID_MASK	0x00ff000000000000ULL
+#define MSR_HV_GUESTID_OSID_SHIFT	48
+#define MSR_HV_GUESTID_OSTYPE_MASK	0x7f00000000000000ULL
+#define MSR_HV_GUESTID_OSTYPE_SHIFT	56
+#define MSR_HV_GUESTID_OPENSRC		0x8000000000000000ULL
+/* OS type values; both are combined with the open source flag. */
+#define MSR_HV_GUESTID_OSTYPE_LINUX	\
+	((0x01ULL << MSR_HV_GUESTID_OSTYPE_SHIFT) | MSR_HV_GUESTID_OPENSRC)
+#define MSR_HV_GUESTID_OSTYPE_FREEBSD	\
+	((0x02ULL << MSR_HV_GUESTID_OSTYPE_SHIFT) | MSR_HV_GUESTID_OPENSRC)
+
+#define MSR_HV_HYPERCALL		0x40000001
+#define MSR_HV_HYPERCALL_ENABLE		0x0001ULL
+#define MSR_HV_HYPERCALL_RSVD_MASK	0x0ffeULL
+#define MSR_HV_HYPERCALL_PGSHIFT	12
+
+#define MSR_HV_VP_INDEX			0x40000002
+
+#define MSR_HV_REFERENCE_TSC		0x40000021
+#define MSR_HV_REFTSC_ENABLE		0x0001ULL
+#define MSR_HV_REFTSC_RSVD_MASK		0x0ffeULL
+#define MSR_HV_REFTSC_PGSHIFT		12
+
+#define MSR_HV_SCONTROL			0x40000080
+#define MSR_HV_SCTRL_ENABLE		0x0001ULL
+#define MSR_HV_SCTRL_RSVD_MASK		0xfffffffffffffffeULL
+
+#define MSR_HV_SIEFP			0x40000082
+#define MSR_HV_SIEFP_ENABLE		0x0001ULL
+#define MSR_HV_SIEFP_RSVD_MASK		0x0ffeULL
+#define MSR_HV_SIEFP_PGSHIFT		12
+
+#define MSR_HV_SIMP			0x40000083
+#define MSR_HV_SIMP_ENABLE		0x0001ULL
+#define MSR_HV_SIMP_RSVD_MASK		0x0ffeULL
+#define MSR_HV_SIMP_PGSHIFT		12
+
+#define MSR_HV_EOM			0x40000084
+
+#define MSR_HV_SINT0			0x40000090
+#define MSR_HV_SINT_VECTOR_MASK		0x00ffULL
+#define MSR_HV_SINT_RSVD1_MASK		0xff00ULL
+#define MSR_HV_SINT_MASKED		0x00010000ULL
+#define MSR_HV_SINT_AUTOEOI		0x00020000ULL
+#define MSR_HV_SINT_RSVD2_MASK		0xfffffffffffc0000ULL
+#define MSR_HV_SINT_RSVD_MASK		(MSR_HV_SINT_RSVD1_MASK |	\
+					 MSR_HV_SINT_RSVD2_MASK)
+
+#define MSR_HV_STIMER0_CONFIG		0x400000b0
+#define MSR_HV_STIMER_CFG_ENABLE	0x0001ULL
+#define MSR_HV_STIMER_CFG_PERIODIC	0x0002ULL
+#define MSR_HV_STIMER_CFG_LAZY		0x0004ULL
+#define MSR_HV_STIMER_CFG_AUTOEN	0x0008ULL
+#define MSR_HV_STIMER_CFG_SINT_MASK	0x000f0000ULL
+#define MSR_HV_STIMER_CFG_SINT_SHIFT	16
+
+#define MSR_HV_STIMER0_COUNT		0x400000b1
+
+/*
+ * CPUID leaves
+ */
+
+#define CPUID_LEAF_HV_MAXLEAF		0x40000000
+
+#define CPUID_LEAF_HV_INTERFACE		0x40000001
+#define CPUID_HV_IFACE_HYPERV		0x31237648	/* HV#1 */
+
+#define CPUID_LEAF_HV_IDENTITY		0x40000002
+
+#define CPUID_LEAF_HV_FEATURES		0x40000003
+/* EAX: features include/hyperv.h CPUID_HV_MSR */
+/* ECX: power management features */
+#define CPUPM_HV_CSTATE_MASK		0x000f	/* deepest C-state */
+#define CPUPM_HV_C3_HPET		0x0010	/* C3 requires HPET */
+#define CPUPM_HV_CSTATE(f)		((f) & CPUPM_HV_CSTATE_MASK)
+/* EDX: features3 */
+#define CPUID3_HV_MWAIT			0x0001	/* MWAIT */
+#define CPUID3_HV_XMM_HYPERCALL		0x0010	/* Hypercall input through
+						 * XMM regs */
+#define CPUID3_HV_GUEST_IDLE		0x0020	/* guest idle */
+#define CPUID3_HV_NUMA			0x0080	/* NUMA distance query */
+#define CPUID3_HV_TIME_FREQ		0x0100	/* timer frequency query
+						 * (TSC, LAPIC) */
+#define CPUID3_HV_MSR_CRASH		0x0400	/* MSRs for guest crash */
+
+#define CPUID_LEAF_HV_RECOMMENDS	0x40000004
+#define CPUID_LEAF_HV_LIMITS		0x40000005
+#define CPUID_LEAF_HV_HWFEATURES	0x40000006
+
+/*
+ * Hyper-V Monitor Notification Facility
+ */
+/*
+ * Input parameter block of HYPERCALL_SIGNAL_EVENT; packed, 8 bytes.
+ * NOTE(review): judging by the names, mp_connid is a connection id and
+ * mp_evtflag_ofs an event flag offset -- confirm against the users.
+ */
+struct hyperv_mon_param {
+	uint32_t	mp_connid;
+	uint16_t	mp_evtflag_ofs;
+	uint16_t	mp_rsvd;
+} __packed;
+
+/*
+ * Hyper-V message types
+ */
+#define HYPERV_MSGTYPE_NONE		0
+#define HYPERV_MSGTYPE_CHANNEL		1
+#define HYPERV_MSGTYPE_TIMER_EXPIRED	0x80000010
+
+/*
+ * Hypercall status codes
+ */
+#define HYPERCALL_STATUS_SUCCESS	0x0000
+
+/*
+ * Hypercall input values
+ */
+#define HYPERCALL_POST_MESSAGE		0x005c
+#define HYPERCALL_SIGNAL_EVENT		0x005d
+
+/*
+ * Hypercall input parameters
+ */
+#define HYPERCALL_PARAM_ALIGN		8
+#if 0
+/*
+ * XXX
+ * <<Hypervisor Top Level Functional Specification 4.0b>> requires
+ * input parameters size to be multiple of 8, however, many post
+ * message input parameters do _not_ meet this requirement.
+ */
+#define HYPERCALL_PARAM_SIZE_ALIGN	8
+#endif
+
+/*
+ * HYPERCALL_POST_MESSAGE
+ */
+#define HYPERCALL_POSTMSGIN_DSIZE_MAX	240
+#define HYPERCALL_POSTMSGIN_SIZE	256
+
+/*
+ * Input parameter block of HYPERCALL_POST_MESSAGE: a 16-byte header
+ * followed by up to HYPERCALL_POSTMSGIN_DSIZE_MAX bytes of payload.
+ * The CTASSERT pins the total size to HYPERCALL_POSTMSGIN_SIZE.
+ */
+struct hypercall_postmsg_in {
+	uint32_t	hc_connid;
+	uint32_t	hc_rsvd;
+	uint32_t	hc_msgtype;	/* HYPERV_MSGTYPE_ */
+	uint32_t	hc_dsize;
+	uint8_t		hc_data[HYPERCALL_POSTMSGIN_DSIZE_MAX];
+} __packed;
+CTASSERT(sizeof(struct hypercall_postmsg_in) == HYPERCALL_POSTMSGIN_SIZE);
+
+/*
+ * HYPERCALL_SIGNAL_EVENT
+ *
+ * struct hyperv_mon_param.
+ */
+
+#endif	/* !_HYPERV_REG_H_ */
diff --git a/sys/dev/hyperv/vmbus/hyperv_var.h b/sys/dev/hyperv/vmbus/hyperv_var.h
new file mode 100644
index 000000000000..f620e4fd64ae
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/hyperv_var.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HYPERV_VAR_H_
+#define _HYPERV_VAR_H_
+
+/* NOTE(review): defined and filled in outside this header; meaning not visible here. */
+extern u_int	hyperv_recommends;
+
+/*
+ * Hypercall wrappers.  Each takes the physical address of its input
+ * parameter: a struct hypercall_postmsg_in for post_message and, per
+ * hyperv_reg.h, a struct hyperv_mon_param for signal_event.  vmbus.c
+ * compares the value returned by hypercall_post_message() with
+ * HYPERCALL_STATUS_SUCCESS; presumably signal_event returns a status the
+ * same way -- TODO confirm.
+ */
+uint64_t	hypercall_post_message(bus_addr_t msg_paddr);
+uint64_t	hypercall_signal_event(bus_addr_t monprm_paddr);
+
+#endif	/* !_HYPERV_VAR_H_ */
diff --git a/sys/dev/hyperv/vmbus/i386/hyperv_machdep.c b/sys/dev/hyperv/vmbus/i386/hyperv_machdep.c
new file mode 100644
index 000000000000..b12bff855f63
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/i386/hyperv_machdep.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <dev/hyperv/vmbus/hyperv_machdep.h>
+
+/*
+ * Issue a hypercall on i386 by calling through 'hc_addr'.
+ *
+ * Every 64-bit argument is split into two 32-bit halves and handed over
+ * in a register pair (high:low): in_val in EDX:EAX, in_paddr in EBX:ECX
+ * and out_paddr in EDI:ESI.  The 64-bit return value is reassembled from
+ * EDX:EAX after the call.
+ */
+uint64_t
+hypercall_md(volatile void *hc_addr, uint64_t in_val,
+    uint64_t in_paddr, uint64_t out_paddr)
+{
+	uint32_t in_val_hi = in_val >> 32;
+	uint32_t in_val_lo = in_val & 0xFFFFFFFF;
+	uint32_t status_hi, status_lo;
+	uint32_t in_paddr_hi = in_paddr >> 32;
+	uint32_t in_paddr_lo = in_paddr & 0xFFFFFFFF;
+	uint32_t out_paddr_hi = out_paddr >> 32;
+	uint32_t out_paddr_lo = out_paddr & 0xFFFFFFFF;
+
+	/*
+	 * Operands 0-1 are the outputs, 2-7 the register inputs and
+	 * operand 8 is hc_addr as a memory operand, so "call *%8" is an
+	 * indirect call to the address stored in hc_addr.
+	 *
+	 * NOTE(review): no "memory" or "cc" clobber is declared; confirm
+	 * that this is intended.
+	 */
+	__asm__ __volatile__ ("call *%8" : "=d"(status_hi), "=a"(status_lo) :
+	    "d" (in_val_hi), "a" (in_val_lo),
+	    "b" (in_paddr_hi), "c" (in_paddr_lo),
+	    "D"(out_paddr_hi), "S"(out_paddr_lo),
+	    "m" (hc_addr));
+	return (status_lo | ((uint64_t)status_hi << 32));
+}
diff --git a/sys/dev/hyperv/vmbus/i386/vmbus_vector.S b/sys/dev/hyperv/vmbus/i386/vmbus_vector.S
new file mode 100644
index 000000000000..b1ffe89cd55d
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/i386/vmbus_vector.S
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "assym.inc"
+
+#include <machine/psl.h>
+#include <machine/asmacros.h>
+#include <machine/specialreg.h>
+
+/*
+ * This is the Hyper-V vmbus channel direct callback interrupt.
+ * Only used when it is running on Hyper-V.
+ *
+ * Note that this file is not assembled directly, it is included into
+ * i386/exception.s.
+ *
+ * Both entry labels, vmbus_isr_pti and vmbus_isr, mark the same
+ * instruction, so the two vectors share this one body.
+ */
+	.text
+	SUPERALIGN_TEXT
+IDTVEC(vmbus_isr_pti)
+IDTVEC(vmbus_isr)
+	/* NOTE(review): the next macros come from asmacros.h; presumably
+	 * they build the trap frame and switch to kernel state -- confirm. */
+	PUSH_FRAME
+	SET_KERNEL_SREGS
+	cld
+	KENTER
+	/* The stack pointer is the only argument of vmbus_handle_intr(),
+	 * which is declared in C as taking a struct trapframe pointer. */
+	pushl	%esp
+	mov	$vmbus_handle_intr, %eax
+	call	*%eax
+	add	$4, %esp		/* drop the pushed argument */
+	jmp	doreti
diff --git a/sys/dev/hyperv/vmbus/vmbus.c b/sys/dev/hyperv/vmbus/vmbus.c
new file mode 100644
index 000000000000..31951cbf4858
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus.c
@@ -0,0 +1,1679 @@
+/*-
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * VM Bus Driver Implementation
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/linker.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sbuf.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/taskqueue.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+#include <machine/bus.h>
+#include <machine/intr_machdep.h>
+#include <machine/metadata.h>
+#include <machine/md_var.h>
+#include <machine/resource.h>
+#include <x86/include/apicvar.h>
+
+#include <contrib/dev/acpica/include/acpi.h>
+#include <dev/acpica/acpivar.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+#include <dev/hyperv/vmbus/hyperv_reg.h>
+#include <dev/hyperv/vmbus/hyperv_var.h>
+#include <dev/hyperv/vmbus/vmbus_reg.h>
+#include <dev/hyperv/vmbus/vmbus_var.h>
+#include <dev/hyperv/vmbus/vmbus_chanvar.h>
+
+#include "acpi_if.h"
+#include "pcib_if.h"
+#include "vmbus_if.h"
+
+/*
+ * NOTE(review): presumably the initial value of the GPADL id counter
+ * consumed by vmbus_gpadl_alloc(); the initialization is not visible in
+ * this part of the file -- confirm.
+ */
+#define VMBUS_GPADL_START		0xe1e10
+
+/*
+ * Message hypercall context, obtained from the private area of a
+ * transaction (see vmbus_msghc_get()).
+ */
+struct vmbus_msghc {
+	/* Transaction whose request buffer holds the hypercall input. */
+	struct vmbus_xact		*mh_xact;
+	/*
+	 * Copy of the hypercall input taken before posting, used to
+	 * restore the input before each retry.
+	 */
+	struct hypercall_postmsg_in	mh_inprm_save;
+};
+
+static void			vmbus_identify(driver_t *, device_t);
+static int			vmbus_probe(device_t);
+static int			vmbus_attach(device_t);
+static int			vmbus_detach(device_t);
+static int			vmbus_read_ivar(device_t, device_t, int,
+				    uintptr_t *);
+static int			vmbus_child_pnpinfo(device_t, device_t, struct sbuf *);
+static struct resource		*vmbus_alloc_resource(device_t dev,
+				    device_t child, int type, int *rid,
+				    rman_res_t start, rman_res_t end,
+				    rman_res_t count, u_int flags);
+static int			vmbus_alloc_msi(device_t bus, device_t dev,
+				    int count, int maxcount, int *irqs);
+static int			vmbus_release_msi(device_t bus, device_t dev,
+				    int count, int *irqs);
+static int			vmbus_alloc_msix(device_t bus, device_t dev,
+				    int *irq);
+static int			vmbus_release_msix(device_t bus, device_t dev,
+				    int irq);
+static int			vmbus_map_msi(device_t bus, device_t dev,
+				    int irq, uint64_t *addr, uint32_t *data);
+static uint32_t			vmbus_get_version_method(device_t, device_t);
+static int			vmbus_probe_guid_method(device_t, device_t,
+				    const struct hyperv_guid *);
+static uint32_t			vmbus_get_vcpu_id_method(device_t bus,
+				    device_t dev, int cpu);
+static struct taskqueue		*vmbus_get_eventtq_method(device_t, device_t,
+				    int);
+#ifdef EARLY_AP_STARTUP
+static void			vmbus_intrhook(void *);
+#endif
+
+static int			vmbus_init(struct vmbus_softc *);
+static int			vmbus_connect(struct vmbus_softc *, uint32_t);
+static int			vmbus_req_channels(struct vmbus_softc *sc);
+static void			vmbus_disconnect(struct vmbus_softc *);
+static int			vmbus_scan(struct vmbus_softc *);
+static void			vmbus_scan_teardown(struct vmbus_softc *);
+static void			vmbus_scan_done(struct vmbus_softc *,
+				    const struct vmbus_message *);
+static void			vmbus_chanmsg_handle(struct vmbus_softc *,
+				    const struct vmbus_message *);
+static void			vmbus_msg_task(void *, int);
+static void			vmbus_synic_setup(void *);
+static void			vmbus_synic_teardown(void *);
+static int			vmbus_sysctl_version(SYSCTL_HANDLER_ARGS);
+static int			vmbus_dma_alloc(struct vmbus_softc *);
+static void			vmbus_dma_free(struct vmbus_softc *);
+static int			vmbus_intr_setup(struct vmbus_softc *);
+static void			vmbus_intr_teardown(struct vmbus_softc *);
+static int			vmbus_doattach(struct vmbus_softc *);
+static void			vmbus_event_proc_dummy(struct vmbus_softc *,
+				    int);
+
+/* The softc handed out by vmbus_get_softc(). */
+static struct vmbus_softc	*vmbus_sc;
+
+SYSCTL_NODE(_hw, OID_AUTO, vmbus, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
+    "Hyper-V vmbus");
+
+/*
+ * When non-zero (the default), vmbus_intr_setup() starts each per-CPU
+ * event taskqueue thread with a cpuset containing only that CPU.
+ */
+static int			vmbus_pin_evttask = 1;
+SYSCTL_INT(_hw_vmbus, OID_AUTO, pin_evttask, CTLFLAG_RDTUN,
+    &vmbus_pin_evttask, 0, "Pin event tasks to their respective CPU");
+
+/* Interrupt entry points defined in the machine-dependent vmbus_vector.S. */
+extern inthand_t IDTVEC(vmbus_isr), IDTVEC(vmbus_isr_pti);
+/* Start of the page holding vmbus_isr_pti; used for the PTI KVA mapping. */
+#define VMBUS_ISR_ADDR	trunc_page((uintptr_t)IDTVEC(vmbus_isr_pti))
+
+/* Version accepted by the host; set by vmbus_init() on success. */
+uint32_t			vmbus_current_version;
+
+/*
+ * Protocol versions offered to the host.  vmbus_init() walks this table
+ * from the first entry to the last and stops at the first version for
+ * which vmbus_connect() succeeds.
+ */
+static const uint32_t		vmbus_version[] = {
+	VMBUS_VERSION_WIN10,
+	VMBUS_VERSION_WIN8_1,
+	VMBUS_VERSION_WIN8,
+	VMBUS_VERSION_WIN7,
+	VMBUS_VERSION_WS2008
+};
+
+/*
+ * Bus-level channel message handlers, indexed by channel message type.
+ * vmbus_chanmsg_handle() looks up the entry for the received type and
+ * skips entries that are NULL.
+ *
+ * NOTE(review): the VMBUS_CHANMSG_PROC* macros are defined elsewhere;
+ * presumably they expand to designated initializers and the _WAKEUP
+ * variant installs vmbus_msghc_wakeup -- confirm.
+ */
+static const vmbus_chanmsg_proc_t
+vmbus_chanmsg_handlers[VMBUS_CHANMSG_TYPE_MAX] = {
+	VMBUS_CHANMSG_PROC(CHOFFER_DONE, vmbus_scan_done),
+	VMBUS_CHANMSG_PROC_WAKEUP(CONNECT_RESP)
+};
+
+/*
+ * Method table of the vmbus driver.  Entries whose implementation is
+ * named vmbus_* are provided by this driver; the bus_generic_* entries
+ * use the generic bus implementations.
+ */
+static device_method_t vmbus_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_identify,		vmbus_identify),
+	DEVMETHOD(device_probe,			vmbus_probe),
+	DEVMETHOD(device_attach,		vmbus_attach),
+	DEVMETHOD(device_detach,		vmbus_detach),
+	DEVMETHOD(device_shutdown,		bus_generic_shutdown),
+	DEVMETHOD(device_suspend,		bus_generic_suspend),
+	DEVMETHOD(device_resume,		bus_generic_resume),
+
+	/* Bus interface */
+	DEVMETHOD(bus_add_child,		bus_generic_add_child),
+	DEVMETHOD(bus_print_child,		bus_generic_print_child),
+	DEVMETHOD(bus_read_ivar,		vmbus_read_ivar),
+	DEVMETHOD(bus_child_pnpinfo,		vmbus_child_pnpinfo),
+	DEVMETHOD(bus_alloc_resource,		vmbus_alloc_resource),
+	DEVMETHOD(bus_release_resource,		bus_generic_release_resource),
+	DEVMETHOD(bus_activate_resource,	bus_generic_activate_resource),
+	DEVMETHOD(bus_deactivate_resource,	bus_generic_deactivate_resource),
+	DEVMETHOD(bus_setup_intr,		bus_generic_setup_intr),
+	DEVMETHOD(bus_teardown_intr,		bus_generic_teardown_intr),
+#if __FreeBSD_version >= 1100000
+	DEVMETHOD(bus_get_cpus,			bus_generic_get_cpus),
+#endif
+
+	/* pcib interface */
+	DEVMETHOD(pcib_alloc_msi,		vmbus_alloc_msi),
+	DEVMETHOD(pcib_release_msi,		vmbus_release_msi),
+	DEVMETHOD(pcib_alloc_msix,		vmbus_alloc_msix),
+	DEVMETHOD(pcib_release_msix,		vmbus_release_msix),
+	DEVMETHOD(pcib_map_msi,			vmbus_map_msi),
+
+	/* Vmbus interface */
+	DEVMETHOD(vmbus_get_version,		vmbus_get_version_method),
+	DEVMETHOD(vmbus_probe_guid,		vmbus_probe_guid_method),
+	DEVMETHOD(vmbus_get_vcpu_id,		vmbus_get_vcpu_id_method),
+	DEVMETHOD(vmbus_get_event_taskq,	vmbus_get_eventtq_method),
+
+	DEVMETHOD_END
+};
+
+/* Driver description: name, method table and per-device softc size. */
+static driver_t vmbus_driver = {
+	"vmbus",
+	vmbus_methods,
+	sizeof(struct vmbus_softc)
+};
+
+static devclass_t vmbus_devclass;
+
+/* The same driver is registered under two parents: pcib and acpi_syscontainer. */
+DRIVER_MODULE(vmbus, pcib, vmbus_driver, vmbus_devclass, NULL, NULL);
+DRIVER_MODULE(vmbus, acpi_syscontainer, vmbus_driver, vmbus_devclass,
+    NULL, NULL);
+
+MODULE_DEPEND(vmbus, acpi, 1, 1, 1);
+MODULE_DEPEND(vmbus, pci, 1, 1, 1);
+MODULE_VERSION(vmbus, 1);
+
+/*
+ * Accessor for the file-scope vmbus softc pointer.
+ */
+static __inline struct vmbus_softc *
+vmbus_get_softc(void)
+{
+
+	return (vmbus_sc);
+}
+
+/*
+ * Reinitialize the post-message hypercall input owned by 'mh' for a
+ * payload of 'dsize' bytes.  The whole input block is zeroed before the
+ * connection id, message type and data size are filled in.
+ *
+ * Panics if 'dsize' is larger than HYPERCALL_POSTMSGIN_DSIZE_MAX.
+ */
+void
+vmbus_msghc_reset(struct vmbus_msghc *mh, size_t dsize)
+{
+	struct hypercall_postmsg_in *req;
+
+	if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX)
+		panic("invalid data size %zu", dsize);
+
+	req = vmbus_xact_req_data(mh->mh_xact);
+	memset(req, 0, HYPERCALL_POSTMSGIN_SIZE);
+
+	/* Fields not assigned below (hc_rsvd, hc_data[]) stay zero. */
+	req->hc_dsize = dsize;
+	req->hc_msgtype = HYPERV_MSGTYPE_CHANNEL;
+	req->hc_connid = VMBUS_CONNID_MESSAGE;
+}
+
+/*
+ * Get a message hypercall context able to carry 'dsize' bytes of data.
+ *
+ * A transaction is taken from sc->vmbus_xc; the context itself comes
+ * from that transaction's private area and is reset before it is
+ * returned.  Returns NULL if no transaction could be obtained.
+ *
+ * Panics if 'dsize' is larger than HYPERCALL_POSTMSGIN_DSIZE_MAX.
+ */
+struct vmbus_msghc *
+vmbus_msghc_get(struct vmbus_softc *sc, size_t dsize)
+{
+	struct vmbus_xact *xact;
+	struct vmbus_msghc *mh;
+	size_t req_size;
+
+	if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX)
+		panic("invalid data size %zu", dsize);
+
+	/* Request size: the fixed header in front of hc_data[] plus data. */
+	req_size = dsize + __offsetof(struct hypercall_postmsg_in, hc_data[0]);
+	xact = vmbus_xact_get(sc->vmbus_xc, req_size);
+	if (xact == NULL)
+		return (NULL);
+
+	mh = vmbus_xact_priv(xact, sizeof(*mh));
+	mh->mh_xact = xact;
+	vmbus_msghc_reset(mh, dsize);
+
+	return (mh);
+}
+
+/*
+ * Release a message hypercall context by putting back the transaction
+ * it was obtained from.  'sc' is unused.
+ */
+void
+vmbus_msghc_put(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh)
+{
+	struct vmbus_xact *xact = mh->mh_xact;
+
+	vmbus_xact_put(xact);
+}
+
+/*
+ * Return a pointer to the data area (hc_data[]) of the hypercall input
+ * belonging to 'mh'; callers build their channel message there.
+ */
+void *
+vmbus_msghc_dataptr(struct vmbus_msghc *mh)
+{
+	struct hypercall_postmsg_in *req = vmbus_xact_req_data(mh->mh_xact);
+
+	return (req->hc_data);
+}
+
+/*
+ * Post the message held by 'mh' to the host without arranging to wait
+ * for a response.
+ *
+ * The hypercall is attempted up to HC_RETRY_MAX times.  After a failed
+ * attempt the thread sleeps, starting with 1ms and doubling the interval
+ * while it is still below two seconds, and the input parameter is
+ * restored from the copy taken before the first attempt.
+ *
+ * Returns 0 once the hypercall succeeds, EIO if every attempt failed.
+ */
+int
+vmbus_msghc_exec_noresult(struct vmbus_msghc *mh)
+{
+	struct hypercall_postmsg_in *req;
+	bus_addr_t req_paddr;
+	sbintime_t backoff;
+	int attempt;
+
+	req = vmbus_xact_req_data(mh->mh_xact);
+	req_paddr = vmbus_xact_req_paddr(mh->mh_xact);
+
+	/*
+	 * Keep a pristine copy of the input parameter so that it can be
+	 * put back after a failed Hypercall.
+	 *
+	 * XXX
+	 * It is unclear whether the Hypercall can ever modify its input
+	 * parameter, i.e. whether this copy is needed at all.
+	 */
+	memcpy(&mh->mh_inprm_save, req, HYPERCALL_POSTMSGIN_SIZE);
+
+	/*
+	 * Failures may be transient, e.g. the host is short on resources,
+	 * so the post message Hypercall is retried; 20 attempts seem to
+	 * be sufficient.
+	 */
+#define HC_RETRY_MAX	20
+
+	backoff = SBT_1MS;
+	for (attempt = 0; attempt < HC_RETRY_MAX; ++attempt) {
+		if (hypercall_post_message(req_paddr) ==
+		    HYPERCALL_STATUS_SUCCESS)
+			return (0);
+
+		pause_sbt("hcpmsg", backoff, 0, C_HARDCLOCK);
+		if (backoff < SBT_1S * 2)
+			backoff *= 2;
+
+		/* Put the saved input parameter back for the next try. */
+		memcpy(req, &mh->mh_inprm_save, HYPERCALL_POSTMSGIN_SIZE);
+	}
+
+#undef HC_RETRY_MAX
+
+	return (EIO);
+}
+
+/*
+ * Post the message held by 'mh' with its transaction activated, so that
+ * a response can later be collected with vmbus_msghc_wait_result() or
+ * vmbus_msghc_poll_result().  If posting fails the transaction is
+ * deactivated again and the error is returned.  'sc' is unused.
+ */
+int
+vmbus_msghc_exec(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh)
+{
+	int rc;
+
+	vmbus_xact_activate(mh->mh_xact);
+	rc = vmbus_msghc_exec_noresult(mh);
+	if (rc != 0) {
+		/* Nothing was posted; undo the activation. */
+		vmbus_xact_deactivate(mh->mh_xact);
+	}
+	return (rc);
+}
+
+/*
+ * Cancel an executed message hypercall by deactivating its transaction.
+ * 'sc' is unused.
+ */
+void
+vmbus_msghc_exec_cancel(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh)
+{
+	struct vmbus_xact *xact = mh->mh_xact;
+
+	vmbus_xact_deactivate(xact);
+}
+
+/*
+ * Return the response to the message hypercall 'mh' as obtained from
+ * vmbus_xact_wait().  The response length is discarded.  'sc' is unused.
+ */
+const struct vmbus_message *
+vmbus_msghc_wait_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh)
+{
+	const struct vmbus_message *msg;
+	size_t len;	/* filled in by vmbus_xact_wait(); not used */
+
+	msg = vmbus_xact_wait(mh->mh_xact, &len);
+	return (msg);
+}
+
+/*
+ * Return the response to the message hypercall 'mh' as obtained from
+ * vmbus_xact_poll().  The response length is discarded.  'sc' is unused.
+ */
+const struct vmbus_message *
+vmbus_msghc_poll_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh)
+{
+	const struct vmbus_message *msg;
+	size_t len;	/* filled in by vmbus_xact_poll(); not used */
+
+	msg = vmbus_xact_poll(mh->mh_xact, &len);
+	return (msg);
+}
+
+/*
+ * Hand the received message 'msg' to the transaction context of 'sc' as
+ * the response, via vmbus_xact_ctx_wakeup().
+ */
+void
+vmbus_msghc_wakeup(struct vmbus_softc *sc, const struct vmbus_message *msg)
+{
+
+	vmbus_xact_ctx_wakeup(sc->vmbus_xc, msg,
+	    sizeof(struct vmbus_message));
+}
+
+/*
+ * Allocate a GPADL id by atomically fetching and incrementing
+ * sc->vmbus_gpadl.  The value 0 is never returned; if the counter yields
+ * 0 another id is fetched.
+ */
+uint32_t
+vmbus_gpadl_alloc(struct vmbus_softc *sc)
+{
+	uint32_t id;
+
+	do {
+		id = atomic_fetchadd_int(&sc->vmbus_gpadl, 1);
+	} while (id == 0);
+
+	return (id);
+}
+
+/*
+ * Used for Hyper-V socket when guest client connects to host.
+ *
+ * Posts a VMBUS_CHANMSG_TYPE_TL_CONN channel message carrying the guest
+ * endpoint id and the host service id.  No response is waited for.
+ *
+ * Returns ENXIO if there is no vmbus softc or no message hypercall
+ * context could be obtained, otherwise the result of posting the
+ * message.
+ */
+int
+vmbus_req_tl_connect(struct hyperv_guid *guest_srv_id,
+    struct hyperv_guid *host_srv_id)
+{
+	struct vmbus_chanmsg_tl_connect *req;
+	struct vmbus_msghc *mh;
+	struct vmbus_softc *sc;
+	int error;
+
+	sc = vmbus_get_softc();
+	if (sc == NULL)
+		return (ENXIO);
+
+	mh = vmbus_msghc_get(sc, sizeof(*req));
+	if (mh == NULL) {
+		device_printf(sc->vmbus_dev,
+		    "can not get msg hypercall for tl connect\n");
+		return (ENXIO);
+	}
+
+	req = vmbus_msghc_dataptr(mh);
+	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_TL_CONN;
+	req->guest_endpoint_id = *guest_srv_id;
+	req->host_service_id = *host_srv_id;
+
+	error = vmbus_msghc_exec_noresult(mh);
+	vmbus_msghc_put(sc, mh);
+	if (error != 0) {
+		device_printf(sc->vmbus_dev,
+		    "tl connect msg hypercall failed\n");
+	}
+
+	return (error);
+}
+
+/*
+ * Try to establish the vmbus connection using protocol 'version'.
+ *
+ * A CONNECT channel message carrying the physical addresses of the
+ * event flags page and the two MNF pages is posted, and the response is
+ * waited for.
+ *
+ * Returns 0 if the response reports the connection as done, EOPNOTSUPP
+ * if it does not, ENXIO if no message hypercall context is available,
+ * or the error from posting the message.
+ */
+static int
+vmbus_connect(struct vmbus_softc *sc, uint32_t version)
+{
+	const struct vmbus_chanmsg_connect_resp *resp;
+	const struct vmbus_message *msg;
+	struct vmbus_chanmsg_connect *req;
+	struct vmbus_msghc *mh;
+	int error, done;
+
+	mh = vmbus_msghc_get(sc, sizeof(*req));
+	if (mh == NULL)
+		return (ENXIO);
+
+	req = vmbus_msghc_dataptr(mh);
+	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CONNECT;
+	req->chm_ver = version;
+	req->chm_evtflags = sc->vmbus_evtflags_dma.hv_paddr;
+	req->chm_mnf1 = sc->vmbus_mnf1_dma.hv_paddr;
+	req->chm_mnf2 = sc->vmbus_mnf2_dma.hv_paddr;
+
+	error = vmbus_msghc_exec(sc, mh);
+	if (error != 0) {
+		vmbus_msghc_put(sc, mh);
+		return (error);
+	}
+
+	/* Read the verdict before the hypercall context is released. */
+	msg = vmbus_msghc_wait_result(sc, mh);
+	resp = (const struct vmbus_chanmsg_connect_resp *)msg->msg_data;
+	done = resp->chm_done;
+
+	vmbus_msghc_put(sc, mh);
+
+	if (!done)
+		return (EOPNOTSUPP);
+	return (0);
+}
+
+/*
+ * Negotiate the vmbus protocol version.
+ *
+ * The entries of vmbus_version[] are tried in table order.  The first
+ * version for which vmbus_connect() succeeds is recorded in both
+ * vmbus_current_version and sc->vmbus_version and is printed.
+ *
+ * Returns 0 on success, ENXIO if no version was accepted.
+ */
+static int
+vmbus_init(struct vmbus_softc *sc)
+{
+	int i;
+
+	for (i = 0; i < nitems(vmbus_version); ++i) {
+		const uint32_t ver = vmbus_version[i];
+
+		if (vmbus_connect(sc, ver) != 0)
+			continue;
+
+		vmbus_current_version = ver;
+		sc->vmbus_version = ver;
+		device_printf(sc->vmbus_dev, "version %u.%u\n",
+		    VMBUS_VERSION_MAJOR(sc->vmbus_version),
+		    VMBUS_VERSION_MINOR(sc->vmbus_version));
+		return (0);
+	}
+
+	return (ENXIO);
+}
+
+/*
+ * Post a DISCONNECT channel message to the host.  No response is waited
+ * for; failures are only reported on the console.
+ */
+static void
+vmbus_disconnect(struct vmbus_softc *sc)
+{
+	struct vmbus_chanmsg_disconnect *req;
+	struct vmbus_msghc *mh;
+	int error;
+
+	mh = vmbus_msghc_get(sc, sizeof(*req));
+	if (mh == NULL) {
+		device_printf(sc->vmbus_dev,
+		    "can not get msg hypercall for disconnect\n");
+		return;
+	}
+
+	req = vmbus_msghc_dataptr(mh);
+	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_DISCONNECT;
+
+	error = vmbus_msghc_exec_noresult(mh);
+	vmbus_msghc_put(sc, mh);
+	if (error != 0) {
+		device_printf(sc->vmbus_dev,
+		    "disconnect msg hypercall failed\n");
+	}
+}
+
+/*
+ * Post a CHREQUEST channel message to the host.  No response is waited
+ * for here.
+ *
+ * Returns ENXIO if no message hypercall context is available, otherwise
+ * the result of posting the message.
+ */
+static int
+vmbus_req_channels(struct vmbus_softc *sc)
+{
+	struct vmbus_chanmsg_chrequest *req;
+	struct vmbus_msghc *mh;
+	int error;
+
+	mh = vmbus_msghc_get(sc, sizeof(*req));
+	if (mh == NULL)
+		return (ENXIO);
+
+	req = vmbus_msghc_dataptr(mh);
+	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHREQUEST;
+
+	error = vmbus_msghc_exec_noresult(mh);
+	vmbus_msghc_put(sc, mh);
+
+	return (error);
+}
+
+/*
+ * Task run on sc->vmbus_devtq: set sc->vmbus_scandone under Giant and
+ * wake up the thread sleeping on it in vmbus_scan().
+ */
+static void
+vmbus_scan_done_task(void *xsc, int pending __unused)
+{
+	struct vmbus_softc *sc;
+
+	sc = xsc;
+
+	mtx_lock(&Giant);
+	sc->vmbus_scandone = true;
+	mtx_unlock(&Giant);
+
+	wakeup(&sc->vmbus_scandone);
+}
+
+/*
+ * Handler for the CHOFFER_DONE channel message: queue the scan-done
+ * task on the device taskqueue.  The message itself is not examined.
+ */
+static void
+vmbus_scan_done(struct vmbus_softc *sc,
+    const struct vmbus_message *msg __unused)
+{
+	struct task *done_task = &sc->vmbus_scandone_task;
+
+	taskqueue_enqueue(sc->vmbus_devtq, done_task);
+}
+
+/*
+ * Probe and attach the non-channel children, create the device and
+ * sub-channel taskqueues, ask the host for its channel offers and sleep
+ * until the scan-done task has set sc->vmbus_scandone.
+ *
+ * Must be called with Giant held.  Returns 0 on success or the error
+ * from requesting the channels; the taskqueues are not freed here.
+ */
+static int
+vmbus_scan(struct vmbus_softc *sc)
+{
+	int error;
+
+	/*
+	 * Non-channel devices first: identify, probe and attach.
+	 */
+	bus_generic_probe(sc->vmbus_dev);
+	bus_generic_attach(sc->vmbus_dev);
+
+	/*
+	 * Attach and detach of vmbus devices triggered by channel offer
+	 * and rescind messages are serialized on this taskqueue.
+	 */
+	sc->vmbus_devtq = taskqueue_create("vmbus dev", M_WAITOK,
+	    taskqueue_thread_enqueue, &sc->vmbus_devtq);
+	taskqueue_start_threads(&sc->vmbus_devtq, 1, PI_NET, "vmbusdev");
+	TASK_INIT(&sc->vmbus_scandone_task, 0, vmbus_scan_done_task, sc);
+
+	/*
+	 * Sub-channel detach gets its own taskqueue, so that a device
+	 * detach running in vmbus_devtq is able to drain the device's
+	 * sub-channels.
+	 */
+	sc->vmbus_subchtq = taskqueue_create("vmbus subch", M_WAITOK,
+	    taskqueue_thread_enqueue, &sc->vmbus_subchtq);
+	taskqueue_start_threads(&sc->vmbus_subchtq, 1, PI_NET, "vmbussch");
+
+	/*
+	 * Kick off the scan by requesting the channel offers.
+	 */
+	error = vmbus_req_channels(sc);
+	if (error != 0) {
+		device_printf(sc->vmbus_dev, "channel request failed: %d\n",
+		    error);
+		return (error);
+	}
+
+	/*
+	 * Sleep until every vmbus device from the initial channel offers
+	 * has been attached.
+	 */
+	GIANT_REQUIRED;
+	while (!sc->vmbus_scandone)
+		mtx_sleep(&sc->vmbus_scandone, &Giant, 0, "vmbusdev", 0);
+
+	if (bootverbose) {
+		device_printf(sc->vmbus_dev, "device scan, probe and attach "
+		    "done\n");
+	}
+
+	return (0);
+}
+
+/*
+ * Free the device taskqueue and then the sub-channel taskqueue, if they
+ * exist, and clear the corresponding softc pointers.
+ *
+ * Must be called with Giant held.  Giant is dropped around each
+ * taskqueue_free() call and re-taken afterwards (presumably so that
+ * queued tasks needing Giant can finish -- TODO confirm).
+ */
+static void
+vmbus_scan_teardown(struct vmbus_softc *sc)
+{
+	struct taskqueue **tqs[] = { &sc->vmbus_devtq, &sc->vmbus_subchtq };
+	int i;
+
+	GIANT_REQUIRED;
+	for (i = 0; i < nitems(tqs); ++i) {
+		struct taskqueue **tqp = tqs[i];
+
+		if (*tqp == NULL)
+			continue;
+
+		mtx_unlock(&Giant);
+		taskqueue_free(*tqp);
+		mtx_lock(&Giant);
+		*tqp = NULL;
+	}
+}
+
+/*
+ * Dispatch one channel message.
+ *
+ * Messages whose type is VMBUS_CHANMSG_TYPE_MAX or above are reported
+ * and dropped.  Otherwise the bus-level handler registered for the type
+ * in vmbus_chanmsg_handlers[], if there is one, runs first and the
+ * message is then always passed to vmbus_chan_msgproc().
+ */
+static void
+vmbus_chanmsg_handle(struct vmbus_softc *sc, const struct vmbus_message *msg)
+{
+	const struct vmbus_chanmsg_hdr *hdr;
+	vmbus_chanmsg_proc_t handler;
+	uint32_t type;
+
+	hdr = (const struct vmbus_chanmsg_hdr *)msg->msg_data;
+	type = hdr->chm_type;
+	if (type >= VMBUS_CHANMSG_TYPE_MAX) {
+		device_printf(sc->vmbus_dev, "unknown message type 0x%x\n",
+		    type);
+		return;
+	}
+
+	handler = vmbus_chanmsg_handlers[type];
+	if (handler != NULL)
+		handler(sc, msg);
+
+	/* Channel specific processing */
+	vmbus_chan_msgproc(sc, msg);
+}
+
+/*
+ * Taskqueue handler draining the VMBUS_SINT_MESSAGE slot of the current
+ * CPU's message page ('xsc' is the vmbus softc).
+ *
+ * The loop runs until the slot reads HYPERV_MSGTYPE_NONE.  Channel
+ * messages are passed to vmbus_chanmsg_handle(); messages of any other
+ * type are discarded.  After each message the slot is marked free and,
+ * if the pending flag is set, an EOM is written.
+ */
+static void
+vmbus_msg_task(void *xsc, int pending __unused)
+{
+	struct vmbus_softc *sc = xsc;
+	volatile struct vmbus_message *msg;
+
+	msg = VMBUS_PCPU_GET(sc, message, curcpu) + VMBUS_SINT_MESSAGE;
+	for (;;) {
+		if (msg->msg_type == HYPERV_MSGTYPE_NONE) {
+			/* No message */
+			break;
+		} else if (msg->msg_type == HYPERV_MSGTYPE_CHANNEL) {
+			/* Channel message */
+			vmbus_chanmsg_handle(sc,
+			    __DEVOLATILE(const struct vmbus_message *, msg));
+		}
+
+		/* Mark the slot as free. */
+		msg->msg_type = HYPERV_MSGTYPE_NONE;
+		/*
+		 * Make sure the write to msg_type (i.e. set to
+		 * HYPERV_MSGTYPE_NONE) happens before we read the
+		 * msg_flags and EOMing. Otherwise, the EOMing will
+		 * not deliver any more messages since there is no
+		 * empty slot
+		 *
+		 * NOTE:
+		 * mb() is used here, since atomic_thread_fence_seq_cst()
+		 * will become compiler fence on UP kernel.
+		 */
+		mb();
+		if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) {
+			/*
+			 * This will cause message queue rescan to possibly
+			 * deliver another msg from the hypervisor
+			 */
+			wrmsr(MSR_HV_EOM, 0);
+		}
+	}
+}
+
+/*
+ * Body of the vmbus interrupt for CPU 'cpu'.  It does three things, in
+ * this order:
+ *  1. handle an expired event timer found in the VMBUS_SINT_TIMER slot,
+ *  2. run the event processing routine (sc->vmbus_event_proc),
+ *  3. if the VMBUS_SINT_MESSAGE slot is not empty, queue the per-CPU
+ *     message task; the message itself is processed by that task.
+ *
+ * 'frame' is passed on to vmbus_et_intr().  Always returns
+ * FILTER_HANDLED.
+ */
+static __inline int
+vmbus_handle_intr1(struct vmbus_softc *sc, struct trapframe *frame, int cpu)
+{
+	volatile struct vmbus_message *msg;
+	struct vmbus_message *msg_base;
+
+	msg_base = VMBUS_PCPU_GET(sc, message, cpu);
+
+	/*
+	 * Check event timer.
+	 *
+	 * TODO: move this to independent IDT vector.
+	 */
+	msg = msg_base + VMBUS_SINT_TIMER;
+	if (msg->msg_type == HYPERV_MSGTYPE_TIMER_EXPIRED) {
+		/* The slot is marked free before the timer handler runs. */
+		msg->msg_type = HYPERV_MSGTYPE_NONE;
+
+		vmbus_et_intr(frame);
+
+		/*
+		 * Make sure the write to msg_type (i.e. set to
+		 * HYPERV_MSGTYPE_NONE) happens before we read the
+		 * msg_flags and EOMing. Otherwise, the EOMing will
+		 * not deliver any more messages since there is no
+		 * empty slot
+		 *
+		 * NOTE:
+		 * mb() is used here, since atomic_thread_fence_seq_cst()
+		 * will become compiler fence on UP kernel.
+		 */
+		mb();
+		if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) {
+			/*
+			 * This will cause message queue rescan to possibly
+			 * deliver another msg from the hypervisor
+			 */
+			wrmsr(MSR_HV_EOM, 0);
+		}
+	}
+
+	/*
+	 * Check events.  Hot path for network and storage I/O data; high rate.
+	 *
+	 * NOTE:
+	 * As recommended by the Windows guest fellows, we check events before
+	 * checking messages.
+	 */
+	sc->vmbus_event_proc(sc, cpu);
+
+	/*
+	 * Check messages.  Mainly management traffic; ultra low rate.
+	 */
+	msg = msg_base + VMBUS_SINT_MESSAGE;
+	if (__predict_false(msg->msg_type != HYPERV_MSGTYPE_NONE)) {
+		taskqueue_enqueue(VMBUS_PCPU_GET(sc, message_tq, cpu),
+		    VMBUS_PCPU_PTR(sc, message_task, cpu));
+	}
+
+	return (FILTER_HANDLED);
+}
+
+/*
+ * C entry point of the vmbus interrupt, called from the vmbus_isr
+ * assembly stub with the trap frame.  The per-CPU interrupt counter is
+ * bumped and the real work is done by vmbus_handle_intr1(), all inside
+ * a critical section.
+ */
+void
+vmbus_handle_intr(struct trapframe *trap_frame)
+{
+	struct vmbus_softc *sc = vmbus_get_softc();
+	int cpu = curcpu;
+
+	/* No preemption from here on. */
+	critical_enter();
+
+	/* Account for this interrupt. */
+	(*VMBUS_PCPU_GET(sc, intr_cnt, cpu)) += 1;
+
+	(void)vmbus_handle_intr1(sc, trap_frame, cpu);
+
+	/* Preemption is allowed again. */
+	critical_exit();
+}
+
+/*
+ * Set up the SynIC of the CPU this function runs on ('xsc' is the vmbus
+ * softc): record the virtual processor id, program the message and event
+ * flags pages, configure the message and timer SINTs with the vmbus IDT
+ * vector, and finally enable the SynIC.
+ *
+ * Every MSR is read first and the bits selected by its *_RSVD_MASK are
+ * carried over into the value written back.
+ *
+ * NOTE(review): presumably invoked once on every CPU; the caller is not
+ * visible in this part of the file.
+ */
+static void
+vmbus_synic_setup(void *xsc)
+{
+	struct vmbus_softc *sc = xsc;
+	int cpu = curcpu;
+	uint64_t val, orig;
+	uint32_t sint;
+
+	if (hyperv_features & CPUID_HV_MSR_VP_INDEX) {
+		/* Save virtual processor id. */
+		VMBUS_PCPU_GET(sc, vcpuid, cpu) = rdmsr(MSR_HV_VP_INDEX);
+	} else {
+		/* Set virtual processor id to 0 for compatibility. */
+		VMBUS_PCPU_GET(sc, vcpuid, cpu) = 0;
+	}
+
+	/*
+	 * Setup the SynIC message.
+	 *
+	 * The page frame number of this CPU's message page is placed at
+	 * MSR_HV_SIMP_PGSHIFT.
+	 */
+	orig = rdmsr(MSR_HV_SIMP);
+	val = MSR_HV_SIMP_ENABLE | (orig & MSR_HV_SIMP_RSVD_MASK) |
+	    ((VMBUS_PCPU_GET(sc, message_dma.hv_paddr, cpu) >> PAGE_SHIFT) <<
+	     MSR_HV_SIMP_PGSHIFT);
+	wrmsr(MSR_HV_SIMP, val);
+
+	/*
+	 * Setup the SynIC event flags.
+	 *
+	 * Same scheme as above, using this CPU's event flags page.
+	 */
+	orig = rdmsr(MSR_HV_SIEFP);
+	val = MSR_HV_SIEFP_ENABLE | (orig & MSR_HV_SIEFP_RSVD_MASK) |
+	    ((VMBUS_PCPU_GET(sc, event_flags_dma.hv_paddr, cpu)
+	      >> PAGE_SHIFT) << MSR_HV_SIEFP_PGSHIFT);
+	wrmsr(MSR_HV_SIEFP, val);
+
+
+	/*
+	 * Configure and unmask SINT for message and event flags.
+	 */
+	sint = MSR_HV_SINT0 + VMBUS_SINT_MESSAGE;
+	orig = rdmsr(sint);
+	val = sc->vmbus_idtvec | MSR_HV_SINT_AUTOEOI |
+	    (orig & MSR_HV_SINT_RSVD_MASK);
+	wrmsr(sint, val);
+
+	/*
+	 * Configure and unmask SINT for timer.
+	 */
+	sint = MSR_HV_SINT0 + VMBUS_SINT_TIMER;
+	orig = rdmsr(sint);
+	val = sc->vmbus_idtvec | MSR_HV_SINT_AUTOEOI |
+	    (orig & MSR_HV_SINT_RSVD_MASK);
+	wrmsr(sint, val);
+
+	/*
+	 * All done; enable SynIC.
+	 */
+	orig = rdmsr(MSR_HV_SCONTROL);
+	val = MSR_HV_SCTRL_ENABLE | (orig & MSR_HV_SCTRL_RSVD_MASK);
+	wrmsr(MSR_HV_SCONTROL, val);
+}
+
+/*
+ * Undo vmbus_synic_setup() on the CPU this function runs on: disable
+ * the SynIC, mask the message and timer SINTs, then disable the message
+ * and event flags pages.  'arg' is not used.
+ *
+ * The control and page MSRs are rewritten with only the bits selected
+ * by their *_RSVD_MASK; the SINT MSRs keep their contents and get
+ * MSR_HV_SINT_MASKED set.
+ */
+static void
+vmbus_synic_teardown(void *arg)
+{
+	uint64_t orig;
+	uint32_t sint;
+
+	/*
+	 * Disable SynIC.
+	 */
+	orig = rdmsr(MSR_HV_SCONTROL);
+	wrmsr(MSR_HV_SCONTROL, (orig & MSR_HV_SCTRL_RSVD_MASK));
+
+	/*
+	 * Mask message and event flags SINT.
+	 */
+	sint = MSR_HV_SINT0 + VMBUS_SINT_MESSAGE;
+	orig = rdmsr(sint);
+	wrmsr(sint, orig | MSR_HV_SINT_MASKED);
+
+	/*
+	 * Mask timer SINT.
+	 */
+	sint = MSR_HV_SINT0 + VMBUS_SINT_TIMER;
+	orig = rdmsr(sint);
+	wrmsr(sint, orig | MSR_HV_SINT_MASKED);
+
+	/*
+	 * Teardown SynIC message.
+	 */
+	orig = rdmsr(MSR_HV_SIMP);
+	wrmsr(MSR_HV_SIMP, (orig & MSR_HV_SIMP_RSVD_MASK));
+
+	/*
+	 * Teardown SynIC event flags.
+	 */
+	orig = rdmsr(MSR_HV_SIEFP);
+	wrmsr(MSR_HV_SIEFP, (orig & MSR_HV_SIEFP_RSVD_MASK));
+}
+
+/*
+ * Allocate the zeroed, page-aligned DMA memory used by vmbus: a message
+ * page and an event flags page for every CPU, the shared event flags
+ * page (first half RX, second half TX) and the two MNF areas.
+ *
+ * Returns 0 on success or ENOMEM on the first failed allocation.
+ * Nothing is released here on failure; vmbus_dma_free() frees whatever
+ * was allocated.
+ */
+static int
+vmbus_dma_alloc(struct vmbus_softc *sc)
+{
+	bus_dma_tag_t parent_dtag;
+	uint8_t *flags_page;
+	int cpu;
+
+	parent_dtag = bus_get_dma_tag(sc->vmbus_dev);
+
+	CPU_FOREACH(cpu) {
+		void *va;
+
+		/* Per-cpu message page. */
+		va = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0,
+		    PAGE_SIZE, VMBUS_PCPU_PTR(sc, message_dma, cpu),
+		    BUS_DMA_WAITOK | BUS_DMA_ZERO);
+		if (va == NULL)
+			return (ENOMEM);
+		VMBUS_PCPU_GET(sc, message, cpu) = va;
+
+		/* Per-cpu event flags page. */
+		va = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0,
+		    PAGE_SIZE, VMBUS_PCPU_PTR(sc, event_flags_dma, cpu),
+		    BUS_DMA_WAITOK | BUS_DMA_ZERO);
+		if (va == NULL)
+			return (ENOMEM);
+		VMBUS_PCPU_GET(sc, event_flags, cpu) = va;
+	}
+
+	/* Shared event flags page, split in the middle into RX and TX. */
+	flags_page = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0,
+	    PAGE_SIZE, &sc->vmbus_evtflags_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO);
+	if (flags_page == NULL)
+		return (ENOMEM);
+	sc->vmbus_rx_evtflags = (u_long *)flags_page;
+	sc->vmbus_tx_evtflags = (u_long *)(flags_page + (PAGE_SIZE / 2));
+	sc->vmbus_evtflags = flags_page;
+
+	/* First MNF area: a full page. */
+	sc->vmbus_mnf1 = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0,
+	    PAGE_SIZE, &sc->vmbus_mnf1_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO);
+	if (sc->vmbus_mnf1 == NULL)
+		return (ENOMEM);
+
+	/* Second MNF area: sized as struct vmbus_mnf. */
+	sc->vmbus_mnf2 = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0,
+	    sizeof(struct vmbus_mnf), &sc->vmbus_mnf2_dma,
+	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
+	if (sc->vmbus_mnf2 == NULL)
+		return (ENOMEM);
+
+	return (0);
+}
+
+/*
+ * Release the DMA memory obtained by vmbus_dma_alloc().  Every pointer
+ * is checked before it is freed and cleared afterwards, so this also
+ * copes with a partially completed allocation.
+ */
+static void
+vmbus_dma_free(struct vmbus_softc *sc)
+{
+	int cpu;
+
+	/* Shared event flags page; RX/TX pointers alias into it. */
+	if (sc->vmbus_evtflags != NULL) {
+		hyperv_dmamem_free(&sc->vmbus_evtflags_dma, sc->vmbus_evtflags);
+		sc->vmbus_evtflags = NULL;
+		sc->vmbus_rx_evtflags = NULL;
+		sc->vmbus_tx_evtflags = NULL;
+	}
+
+	/* MNF areas. */
+	if (sc->vmbus_mnf1 != NULL) {
+		hyperv_dmamem_free(&sc->vmbus_mnf1_dma, sc->vmbus_mnf1);
+		sc->vmbus_mnf1 = NULL;
+	}
+	if (sc->vmbus_mnf2 != NULL) {
+		hyperv_dmamem_free(&sc->vmbus_mnf2_dma, sc->vmbus_mnf2);
+		sc->vmbus_mnf2 = NULL;
+	}
+
+	/* Per-cpu message and event flags pages. */
+	CPU_FOREACH(cpu) {
+		void *msg_page = VMBUS_PCPU_GET(sc, message, cpu);
+		void *evt_page = VMBUS_PCPU_GET(sc, event_flags, cpu);
+
+		if (msg_page != NULL) {
+			hyperv_dmamem_free(
+			    VMBUS_PCPU_PTR(sc, message_dma, cpu), msg_page);
+			VMBUS_PCPU_GET(sc, message, cpu) = NULL;
+		}
+		if (evt_page != NULL) {
+			hyperv_dmamem_free(
+			    VMBUS_PCPU_PTR(sc, event_flags_dma, cpu), evt_page);
+			VMBUS_PCPU_GET(sc, event_flags, cpu) = NULL;
+		}
+	}
+}
+
+/*
+ * Prepare everything the vmbus interrupt needs and allocate its IDT
+ * vector.
+ *
+ * For every CPU an interrupt counter is registered, an event taskqueue
+ * is created (its thread is bound to the CPU only if vmbus_pin_evttask
+ * is set) and a message taskqueue with a thread bound to the CPU is
+ * created together with the message task.  On amd64 KLD builds the page
+ * holding the ISR is added to the PTI mappings before the vector is
+ * allocated and removed again if the allocation fails.
+ *
+ * Returns 0 on success, ENXIO if no IDT vector is available.
+ */
+static int
+vmbus_intr_setup(struct vmbus_softc *sc)
+{
+	int cpu;
+
+	CPU_FOREACH(cpu) {
+		char name[MAXCOMLEN + 1];
+		cpuset_t self;
+
+		/* A set containing just this CPU, for the bound threads. */
+		CPU_SETOF(cpu, &self);
+
+		/* Interrupt counter for the Hyper-V interrupt. */
+		snprintf(name, sizeof(name), "cpu%d:hyperv", cpu);
+		intrcnt_add(name, VMBUS_PCPU_PTR(sc, intr_cnt, cpu));
+
+		/*
+		 * Event taskqueue; the tasks themselves are per-channel.
+		 */
+		VMBUS_PCPU_GET(sc, event_tq, cpu) = taskqueue_create_fast(
+		    "hyperv event", M_WAITOK, taskqueue_thread_enqueue,
+		    VMBUS_PCPU_PTR(sc, event_tq, cpu));
+		if (vmbus_pin_evttask) {
+			taskqueue_start_threads_cpuset(
+			    VMBUS_PCPU_PTR(sc, event_tq, cpu), 1, PI_NET,
+			    &self, "hvevent%d", cpu);
+		} else {
+			taskqueue_start_threads(
+			    VMBUS_PCPU_PTR(sc, event_tq, cpu), 1, PI_NET,
+			    "hvevent%d", cpu);
+		}
+
+		/*
+		 * Message taskqueue and task; the thread is always bound
+		 * to its CPU.
+		 */
+		VMBUS_PCPU_GET(sc, message_tq, cpu) = taskqueue_create_fast(
+		    "hyperv msg", M_WAITOK, taskqueue_thread_enqueue,
+		    VMBUS_PCPU_PTR(sc, message_tq, cpu));
+		taskqueue_start_threads_cpuset(
+		    VMBUS_PCPU_PTR(sc, message_tq, cpu), 1, PI_NET, &self,
+		    "hvmsg%d", cpu);
+		TASK_INIT(VMBUS_PCPU_PTR(sc, message_task, cpu), 0,
+		    vmbus_msg_task, sc);
+	}
+
+#if defined(__amd64__) && defined(KLD_MODULE)
+	pmap_pti_add_kva(VMBUS_ISR_ADDR, VMBUS_ISR_ADDR + PAGE_SIZE, true);
+#endif
+
+	/*
+	 * Everything the Hyper-V ISR relies on exists now; grab a free
+	 * IDT vector and install the ISR on it.
+	 */
+	sc->vmbus_idtvec = lapic_ipi_alloc(pti ? IDTVEC(vmbus_isr_pti) :
+	    IDTVEC(vmbus_isr));
+	if (sc->vmbus_idtvec < 0) {
+#if defined(__amd64__) && defined(KLD_MODULE)
+		pmap_pti_remove_kva(VMBUS_ISR_ADDR, VMBUS_ISR_ADDR + PAGE_SIZE);
+#endif
+		device_printf(sc->vmbus_dev, "cannot find free IDT vector\n");
+		return (ENXIO);
+	}
+	if (bootverbose) {
+		device_printf(sc->vmbus_dev, "vmbus IDT vector %d\n",
+		    sc->vmbus_idtvec);
+	}
+
+	return (0);
+}
+
+/*
+ * Undo vmbus_intr_setup(): give back the IDT vector (if one was
+ * allocated) and free the per-cpu event and message taskqueues.  The
+ * message task is drained before its taskqueue is freed.
+ */
+static void
+vmbus_intr_teardown(struct vmbus_softc *sc)
+{
+	struct taskqueue *tq;
+	int cpu;
+
+	if (sc->vmbus_idtvec >= 0) {
+		lapic_ipi_free(sc->vmbus_idtvec);
+		sc->vmbus_idtvec = -1;
+	}
+
+#if defined(__amd64__) && defined(KLD_MODULE)
+	pmap_pti_remove_kva(VMBUS_ISR_ADDR, VMBUS_ISR_ADDR + PAGE_SIZE);
+#endif
+
+	CPU_FOREACH(cpu) {
+		tq = VMBUS_PCPU_GET(sc, event_tq, cpu);
+		if (tq != NULL) {
+			taskqueue_free(tq);
+			VMBUS_PCPU_GET(sc, event_tq, cpu) = NULL;
+		}
+
+		tq = VMBUS_PCPU_GET(sc, message_tq, cpu);
+		if (tq != NULL) {
+			taskqueue_drain(tq,
+			    VMBUS_PCPU_PTR(sc, message_task, cpu));
+			taskqueue_free(tq);
+			VMBUS_PCPU_GET(sc, message_tq, cpu) = NULL;
+		}
+	}
+}
+
+/*
+ * The vmbus exports no instance variables: every index is rejected
+ * with ENOENT and *result is left untouched.
+ */
+static int
+vmbus_read_ivar(device_t dev, device_t child, int index, uintptr_t *result)
+{
+
+	return (ENOENT);
+}
+
+/*
+ * Append the PnP information of a child to sb: the channel's type GUID
+ * as "classid" and its instance GUID as "deviceid".  Children without a
+ * channel (the event timer device) contribute nothing.  Always
+ * returns 0.
+ */
+static int
+vmbus_child_pnpinfo(device_t dev, device_t child, struct sbuf *sb)
+{
+	const struct vmbus_channel *chan = vmbus_get_channel(child);
+	char guidstr[HYPERV_GUID_STRLEN];
+
+	if (chan != NULL) {
+		hyperv_guid2str(&chan->ch_guid_type, guidstr,
+		    sizeof(guidstr));
+		sbuf_printf(sb, "classid=%s", guidstr);
+
+		hyperv_guid2str(&chan->ch_guid_inst, guidstr,
+		    sizeof(guidstr));
+		sbuf_printf(sb, " deviceid=%s", guidstr);
+	}
+
+	return (0);
+}
+
+/*
+ * Create a child device for the channel under the vmbus device, attach
+ * the channel to it as ivars, and probe/attach it.  Runs under Giant.
+ *
+ * Returns 0 on success, or ENXIO if the child device could not be
+ * created.  The result of the probe/attach itself is not reported.
+ */
+int
+vmbus_add_child(struct vmbus_channel *chan)
+{
+	device_t parent = chan->ch_vmbus->vmbus_dev;
+	int error = 0;
+
+	mtx_lock(&Giant);
+	chan->ch_dev = device_add_child(parent, NULL, -1);
+	if (chan->ch_dev != NULL) {
+		device_set_ivars(chan->ch_dev, chan);
+		device_probe_and_attach(chan->ch_dev);
+	} else {
+		error = ENXIO;
+	}
+	mtx_unlock(&Giant);
+
+	/* Report the failure only after Giant has been dropped. */
+	if (error != 0) {
+		device_printf(parent, "device_add_child for chan%u failed\n",
+		    chan->ch_id);
+	}
+	return (error);
+}
+
+/*
+ * Delete the child device created for the channel, if there is one,
+ * and forget it.  Runs under Giant.
+ *
+ * Returns the result of device_delete_child(), or 0 if the channel had
+ * no device.  ch_dev is cleared even if the deletion reports an error.
+ */
+int
+vmbus_delete_child(struct vmbus_channel *chan)
+{
+	device_t child;
+	int error;
+
+	error = 0;
+	mtx_lock(&Giant);
+	child = chan->ch_dev;
+	if (child != NULL) {
+		error = device_delete_child(chan->ch_vmbus->vmbus_dev, child);
+		chan->ch_dev = NULL;
+	}
+	mtx_unlock(&Giant);
+
+	return (error);
+}
+
+/*
+ * Sysctl handler reporting the vmbus version of the softc passed in
+ * arg1 as a "major.minor" string.
+ */
+static int
+vmbus_sysctl_version(SYSCTL_HANDLER_ARGS)
+{
+	const struct vmbus_softc *sc = arg1;
+	char buf[16];
+
+	snprintf(buf, sizeof(buf), "%u.%u",
+	    VMBUS_VERSION_MAJOR(sc->vmbus_version),
+	    VMBUS_VERSION_MINOR(sc->vmbus_version));
+
+	return sysctl_handle_string(oidp, buf, sizeof(buf), req);
+}
+
+/*
+ * Resource allocation on behalf of a child.
+ *
+ * With NEW_PCIB, memory resources are taken from the vmbus' own MMIO
+ * resource list, which guarantees that they come from the ranges found
+ * in _CRS.  Everything else is passed up to the parent bus.
+ *
+ * No matching release function is needed; bus_generic_release_resource()
+ * can be used.
+ */
+static struct resource *
+vmbus_alloc_resource(device_t dev, device_t child, int type, int *rid,
+    rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
+{
+
+#ifdef NEW_PCIB
+	if (type == SYS_RES_MEMORY) {
+		struct vmbus_softc *sc = device_get_softc(dev);
+
+		return (pcib_host_res_alloc(&sc->vmbus_mmio_res, child, type,
+		    rid, start, end, count, flags));
+	}
+#endif
+
+	return (BUS_ALLOC_RESOURCE(device_get_parent(dev), child, type, rid,
+	    start, end, count, flags));
+}
+
+/*
+ * Pass a child's MSI allocation request on to the parent of the vmbus
+ * through PCIB_ALLOC_MSI() and return its result.
+ */
+static int
+vmbus_alloc_msi(device_t bus, device_t dev, int count, int maxcount, int *irqs)
+{
+	device_t parent = device_get_parent(bus);
+
+	return (PCIB_ALLOC_MSI(parent, dev, count, maxcount, irqs));
+}
+
+/*
+ * Pass a child's MSI release request on to the parent of the vmbus
+ * through PCIB_RELEASE_MSI() and return its result.
+ */
+static int
+vmbus_release_msi(device_t bus, device_t dev, int count, int *irqs)
+{
+	device_t parent = device_get_parent(bus);
+
+	return (PCIB_RELEASE_MSI(parent, dev, count, irqs));
+}
+
+/*
+ * Pass a child's MSI-X allocation request on to the parent of the
+ * vmbus through PCIB_ALLOC_MSIX() and return its result.
+ */
+static int
+vmbus_alloc_msix(device_t bus, device_t dev, int *irq)
+{
+	device_t parent = device_get_parent(bus);
+
+	return (PCIB_ALLOC_MSIX(parent, dev, irq));
+}
+
+/*
+ * Pass a child's MSI-X release request on to the parent of the vmbus
+ * through PCIB_RELEASE_MSIX() and return its result.
+ */
+static int
+vmbus_release_msix(device_t bus, device_t dev, int irq)
+{
+	device_t parent = device_get_parent(bus);
+
+	return (PCIB_RELEASE_MSIX(parent, dev, irq));
+}
+
+/*
+ * Pass a child's MSI mapping request on to the parent of the vmbus
+ * through PCIB_MAP_MSI() and return its result; addr and data are
+ * handed through unchanged.
+ */
+static int
+vmbus_map_msi(device_t bus, device_t dev, int irq, uint64_t *addr,
+	uint32_t *data)
+{
+	device_t parent = device_get_parent(bus);
+
+	return (PCIB_MAP_MSI(parent, dev, irq, addr, data));
+}
+
+/*
+ * Bus method: return the vmbus version stored in the softc of bus.
+ * The requesting child (dev) is not used.
+ */
+static uint32_t
+vmbus_get_version_method(device_t bus, device_t dev)
+{
+	const struct vmbus_softc *sc = device_get_softc(bus);
+
+	return (sc->vmbus_version);
+}
+
+/*
+ * Bus method: check whether the channel behind dev has the given type
+ * GUID.
+ *
+ * Returns 0 if the channel's type GUID equals *guid, and ENXIO if it
+ * differs or if dev has no channel at all.  vmbus_get_channel() yields
+ * NULL for children that do not belong to a channel (e.g. the event
+ * timer device, see vmbus_child_pnpinfo()); such a child can never
+ * match a channel type GUID.
+ */
+static int
+vmbus_probe_guid_method(device_t bus, device_t dev,
+    const struct hyperv_guid *guid)
+{
+	const struct vmbus_channel *chan = vmbus_get_channel(dev);
+
+	/* Don't dereference a missing channel. */
+	if (chan == NULL)
+		return ENXIO;
+
+	if (memcmp(&chan->ch_guid_type, guid, sizeof(struct hyperv_guid)) == 0)
+		return 0;
+	return ENXIO;
+}
+
+/*
+ * Bus method: return the per-cpu vcpuid value that the vmbus softc
+ * holds for the given cpu.  The requesting child (dev) is not used.
+ */
+static uint32_t
+vmbus_get_vcpu_id_method(device_t bus, device_t dev, int cpu)
+{
+	const struct vmbus_softc *sc;
+
+	sc = device_get_softc(bus);
+	return (VMBUS_PCPU_GET(sc, vcpuid, cpu));
+}
+
+/*
+ * Bus method: return the event taskqueue of the given cpu.  The cpu
+ * number must be within [0, mp_ncpus); this is asserted.
+ */
+static struct taskqueue *
+vmbus_get_eventtq_method(device_t bus, device_t dev __unused, int cpu)
+{
+	const struct vmbus_softc *sc;
+
+	KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu%d", cpu));
+
+	sc = device_get_softc(bus);
+	return (VMBUS_PCPU_GET(sc, event_tq, cpu));
+}
+
+#ifdef NEW_PCIB
+/* vTPM base address; parse_crs() clips ranges so they end below it */
+#define VTPM_BASE_ADDR 0xfed40000
+/* First address that is not reachable with 32 bits */
+#define FOUR_GB (1ULL << 32)
+
+/* Selects which _CRS ranges one resource walk records */
+enum parse_pass {
+	parse_64,	/* ranges starting at or above 4GB */
+	parse_32	/* ranges starting below 4GB */
+};
+
+/* Context handed to parse_crs() through AcpiWalkResources() */
+struct parse_context {
+	device_t vmbus_dev;	/* vmbus device whose softc gets the ranges */
+	enum parse_pass pass;	/* pass being run */
+};
+
+/*
+ * AcpiWalkResources() callback: record one _CRS address range in the
+ * MMIO resource list of the vmbus named by the parse context.
+ *
+ * Only 32-bit and 64-bit address resources are considered.  Ranges
+ * ending below 1MB are ignored, a range that crosses the vTPM base is
+ * cut off right before it, and a range is recorded only in the pass
+ * that matches the side of 4GB it starts on.  Always returns AE_OK so
+ * that the walk continues.
+ */
+static ACPI_STATUS
+parse_crs(ACPI_RESOURCE *res, void *ctx)
+{
+	const struct parse_context *pc = ctx;
+	struct vmbus_softc *sc = device_get_softc(pc->vmbus_dev);
+	UINT64 start, end;
+	int below_4g;
+
+	if (res->Type == ACPI_RESOURCE_TYPE_ADDRESS32) {
+		start = res->Data.Address32.Address.Minimum;
+		end = res->Data.Address32.Address.Maximum;
+	} else if (res->Type == ACPI_RESOURCE_TYPE_ADDRESS64) {
+		start = res->Data.Address64.Address.Minimum;
+		end = res->Data.Address64.Address.Maximum;
+	} else {
+		/* Unused types. */
+		return (AE_OK);
+	}
+
+	/* We don't use <1MB addresses. */
+	if (end < 0x100000)
+		return (AE_OK);
+
+	/* Don't conflict with vTPM. */
+	if (start < VTPM_BASE_ADDR && end >= VTPM_BASE_ADDR)
+		end = VTPM_BASE_ADDR - 1;
+
+	below_4g = (start < FOUR_GB);
+	if ((pc->pass == parse_32 && below_4g) ||
+	    (pc->pass == parse_64 && !below_4g)) {
+		pcib_host_res_decodes(&sc->vmbus_mmio_res, SYS_RES_MEMORY,
+		    start, end, 0);
+	}
+
+	return (AE_OK);
+}
+
+/*
+ * Walk the _CRS of the ACPI device dev and let parse_crs() record the
+ * ranges belonging to the given pass for vmbus_dev.  A failed walk is
+ * only reported (when booting verbosely), not treated as an error.
+ */
+static void
+vmbus_get_crs(device_t dev, device_t vmbus_dev, enum parse_pass pass)
+{
+	struct parse_context pc = {
+		.vmbus_dev = vmbus_dev,
+		.pass = pass
+	};
+	ACPI_STATUS status;
+
+	if (bootverbose)
+		device_printf(dev, "walking _CRS, pass=%d\n", pass);
+
+	status = AcpiWalkResources(acpi_get_handle(dev), "_CRS",
+	    parse_crs, &pc);
+	if (ACPI_FAILURE(status) && bootverbose)
+		device_printf(dev, "_CRS: not found, pass=%d\n", pass);
+}
+
+/*
+ * Run one pass of MMIO range discovery for the vmbus device dev.
+ *
+ * If the grandparent of dev is acpi0, the _CRS of every attached
+ * "vmbus_res" child of acpi0 is walked, followed by the _CRS of acpi0
+ * itself.  The _CRS of the direct parent is walked in any case.
+ */
+static void
+vmbus_get_mmio_res_pass(device_t dev, enum parse_pass pass)
+{
+	device_t parent = device_get_parent(dev);
+	device_t acpi0 = device_get_parent(parent);
+
+	if (strcmp("acpi0", device_get_nameunit(acpi0)) != 0) {
+		device_printf(dev, "not grandchild of acpi\n");
+	} else {
+		device_t *kids;
+		int nkids, i;
+
+		/*
+		 * Try to locate VMBUS resources and find _CRS on them.
+		 */
+		if (device_get_children(acpi0, &kids, &nkids) == 0) {
+			for (i = 0; i < nkids; ++i) {
+				if (device_is_attached(kids[i]) &&
+				    strcmp("vmbus_res",
+				    device_get_name(kids[i])) == 0)
+					vmbus_get_crs(kids[i], dev, pass);
+			}
+			free(kids, M_TEMP);
+		}
+
+		/*
+		 * Try to find _CRS on acpi.
+		 */
+		vmbus_get_crs(acpi0, dev, pass);
+	}
+
+	/*
+	 * Try to find _CRS on parent.
+	 */
+	vmbus_get_crs(parent, dev, pass);
+}
+
+/*
+ * Initialize the MMIO resource list of the vmbus and fill it from the
+ * _CRS ranges.
+ */
+static void
+vmbus_get_mmio_res(device_t dev)
+{
+	struct vmbus_softc *sc = device_get_softc(dev);
+
+	pcib_host_res_init(dev, &sc->vmbus_mmio_res);
+
+	/*
+	 * The resources are walked twice, 64-bit ranges first, so that
+	 * in the resource list the 32-bit ranges come after the 64-bit
+	 * ones (NB: resource_list_add() uses INSERT_TAIL).  When
+	 * vmbus_alloc_resource() then iterates the list looking for a
+	 * range for a 64-bit BAR, the >4GB ranges are tried first.
+	 */
+	vmbus_get_mmio_res_pass(dev, parse_64);
+	vmbus_get_mmio_res_pass(dev, parse_32);
+}
+
+/*
+ * On Gen2 VMs, Hyper-V provides an MMIO range for the framebuffer.
+ * This MMIO address range is not usable by other PCI devices.
+ * Currently only the efifb and vbefb drivers use this range, and they
+ * do so without reserving it from the system.
+ * Therefore, the vmbus driver reserves it (see vmbus_fb_mmio_res())
+ * before any other PCI device driver starts to request MMIO addresses.
+ * hv_fb_res holds that reservation, or NULL if there is none.
+ */
+static struct resource *hv_fb_res;
+
+/*
+ * Reserve the framebuffer MMIO range described by the preloaded kernel
+ * metadata (EFI framebuffer info preferred, VBE as fallback) from the
+ * vmbus MMIO resource list and remember it in hv_fb_res.  Without any
+ * framebuffer metadata nothing is reserved.
+ */
+static void
+vmbus_fb_mmio_res(device_t dev)
+{
+	struct vmbus_softc *sc = device_get_softc(dev);
+	struct efi_fb *efifb;
+	struct vbe_fb *vbefb;
+	rman_res_t fb_start, fb_end, fb_count;
+	int fb_height, fb_width;
+	int rid = 0;
+	caddr_t kmdp;
+
+	kmdp = preload_search_by_type("elf kernel");
+	if (kmdp == NULL)
+		kmdp = preload_search_by_type("elf64 kernel");
+	efifb = (struct efi_fb *)preload_search_info(kmdp,
+	    MODINFO_METADATA | MODINFOMD_EFI_FB);
+	vbefb = (struct vbe_fb *)preload_search_info(kmdp,
+	    MODINFO_METADATA | MODINFOMD_VBE_FB);
+
+	if (efifb == NULL && vbefb == NULL) {
+		if (bootverbose)
+			device_printf(dev,
+			    "no preloaded kernel fb information\n");
+		/* We are on Gen1 VM, just return. */
+		return;
+	}
+
+	if (efifb != NULL) {
+		fb_start = efifb->fb_addr;
+		fb_end = efifb->fb_addr + efifb->fb_size;
+		fb_count = efifb->fb_size;
+		fb_height = efifb->fb_height;
+		fb_width = efifb->fb_width;
+	} else {
+		fb_start = vbefb->fb_addr;
+		fb_end = vbefb->fb_addr + vbefb->fb_size;
+		fb_count = vbefb->fb_size;
+		fb_height = vbefb->fb_height;
+		fb_width = vbefb->fb_width;
+	}
+
+	if (bootverbose)
+		device_printf(dev,
+		    "fb: fb_addr: %#jx, size: %#jx, "
+		    "actual size needed: 0x%x\n",
+		    fb_start, fb_count, fb_height * fb_width);
+
+	/* Page-aligned, active reservation out of the vmbus MMIO list. */
+	hv_fb_res = pcib_host_res_alloc(&sc->vmbus_mmio_res, dev,
+	    SYS_RES_MEMORY, &rid, fb_start, fb_end, fb_count,
+	    RF_ACTIVE | rman_make_alignment_flags(PAGE_SIZE));
+
+	if (hv_fb_res != NULL && bootverbose)
+		device_printf(dev,
+		    "successfully reserved memory for framebuffer "
+		    "starting at %#jx, size %#jx\n",
+		    fb_start, fb_count);
+}
+
+/*
+ * Free the MMIO resource list of the vmbus and drop the reference to
+ * the framebuffer reservation.
+ */
+static void
+vmbus_free_mmio_res(device_t dev)
+{
+	struct vmbus_softc *sc;
+
+	sc = device_get_softc(dev);
+	pcib_host_res_free(dev, &sc->vmbus_mmio_res);
+
+	/* Only the pointer is forgotten here. */
+	hv_fb_res = NULL;
+}
+#endif	/* NEW_PCIB */
+
+/*
+ * Add a "vmbus" child to parent, but only for unit 0 of the parent,
+ * when running as a Hyper-V guest, and when the hypervisor advertises
+ * the SynIC MSRs.
+ */
+static void
+vmbus_identify(driver_t *driver, device_t parent)
+{
+
+	if (device_get_unit(parent) == 0 && vm_guest == VM_GUEST_HV &&
+	    (hyperv_features & CPUID_HV_MSR_SYNIC) != 0)
+		device_add_child(parent, "vmbus", -1);
+}
+
+/*
+ * Probe: accept only unit 0, and only on a Hyper-V guest whose
+ * hypervisor advertises the SynIC MSRs.  Returns BUS_PROBE_DEFAULT on
+ * a match and ENXIO otherwise.
+ */
+static int
+vmbus_probe(device_t dev)
+{
+
+	if (device_get_unit(dev) != 0)
+		return (ENXIO);
+	if (vm_guest != VM_GUEST_HV)
+		return (ENXIO);
+	if ((hyperv_features & CPUID_HV_MSR_SYNIC) == 0)
+		return (ENXIO);
+
+	device_set_desc(dev, "Hyper-V Vmbus");
+	return (BUS_PROBE_DEFAULT);
+}
+
+/**
+ * @brief Main vmbus driver initialization routine.
+ *
+ * Here, we
+ * - collect the MMIO ranges and reserve the framebuffer (NEW_PCIB only)
+ * - initialize the vmbus driver context (locks, channel lists and the
+ *   channel map)
+ * - create the context for "post message" Hypercalls
+ * - allocate the DMA memory
+ * - setup the interrupt resources
+ * - setup SynIC on all CPUs
+ * - initialize the vmbus (vmbus_init()) and select the event
+ *   processing routine matching the vmbus version
+ * - scan the bus (vmbus_scan())
+ * - publish the "version" sysctl
+ *
+ * The routine does its work only once; later calls return 0
+ * immediately because VMBUS_FLAG_ATTACHED is already set.
+ *
+ * @return 0 on success, otherwise an errno value.  On failure
+ * everything that was set up here is torn down again before
+ * returning.
+ */
+static int
+vmbus_doattach(struct vmbus_softc *sc)
+{
+	struct sysctl_oid_list *child;
+	struct sysctl_ctx_list *ctx;
+	int ret;
+
+	if (sc->vmbus_flags & VMBUS_FLAG_ATTACHED)
+		return (0);
+
+#ifdef NEW_PCIB
+	vmbus_get_mmio_res(sc->vmbus_dev);
+	vmbus_fb_mmio_res(sc->vmbus_dev);
+#endif
+
+	sc->vmbus_flags |= VMBUS_FLAG_ATTACHED;
+
+	sc->vmbus_gpadl = VMBUS_GPADL_START;
+	mtx_init(&sc->vmbus_prichan_lock, "vmbus prichan", NULL, MTX_DEF);
+	TAILQ_INIT(&sc->vmbus_prichans);
+	mtx_init(&sc->vmbus_chan_lock, "vmbus channel", NULL, MTX_DEF);
+	TAILQ_INIT(&sc->vmbus_chans);
+	sc->vmbus_chmap = malloc(
+	    sizeof(struct vmbus_channel *) * VMBUS_CHAN_MAX, M_DEVBUF,
+	    M_WAITOK | M_ZERO);
+
+	/*
+	 * Create context for "post message" Hypercalls
+	 */
+	sc->vmbus_xc = vmbus_xact_ctx_create(bus_get_dma_tag(sc->vmbus_dev),
+	    HYPERCALL_POSTMSGIN_SIZE, VMBUS_MSG_SIZE,
+	    sizeof(struct vmbus_msghc));
+	if (sc->vmbus_xc == NULL) {
+		ret = ENXIO;
+		goto cleanup;
+	}
+
+	/*
+	 * Allocate DMA stuffs.
+	 */
+	ret = vmbus_dma_alloc(sc);
+	if (ret != 0)
+		goto cleanup;
+
+	/*
+	 * Setup interrupt.
+	 */
+	ret = vmbus_intr_setup(sc);
+	if (ret != 0)
+		goto cleanup;
+
+	/*
+	 * Setup SynIC.
+	 */
+	if (bootverbose)
+		device_printf(sc->vmbus_dev, "smp_started = %d\n", smp_started);
+	smp_rendezvous(NULL, vmbus_synic_setup, NULL, sc);
+	sc->vmbus_flags |= VMBUS_FLAG_SYNIC;
+
+	/*
+	 * Initialize vmbus, e.g. connect to Hypervisor.
+	 */
+	ret = vmbus_init(sc);
+	if (ret != 0)
+		goto cleanup;
+
+	/* Pick the event processing routine matching the vmbus version. */
+	if (sc->vmbus_version == VMBUS_VERSION_WS2008 ||
+	    sc->vmbus_version == VMBUS_VERSION_WIN7)
+		sc->vmbus_event_proc = vmbus_event_proc_compat;
+	else
+		sc->vmbus_event_proc = vmbus_event_proc;
+
+	ret = vmbus_scan(sc);
+	if (ret != 0)
+		goto cleanup;
+
+	ctx = device_get_sysctl_ctx(sc->vmbus_dev);
+	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->vmbus_dev));
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "version",
+	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+	    vmbus_sysctl_version, "A", "vmbus version");
+
+	return (ret);
+
+cleanup:
+	vmbus_scan_teardown(sc);
+
+	/*
+	 * If we got past the SynIC setup, undo it on all CPUs the same
+	 * way vmbus_detach() does, and do so before the interrupt
+	 * resources and the DMA memory that SynIC may still reference
+	 * are released below.
+	 */
+	if (sc->vmbus_flags & VMBUS_FLAG_SYNIC) {
+		sc->vmbus_flags &= ~VMBUS_FLAG_SYNIC;
+		smp_rendezvous(NULL, vmbus_synic_teardown, NULL, NULL);
+	}
+
+	vmbus_intr_teardown(sc);
+	vmbus_dma_free(sc);
+	if (sc->vmbus_xc != NULL) {
+		vmbus_xact_ctx_destroy(sc->vmbus_xc);
+		sc->vmbus_xc = NULL;
+	}
+	free(__DEVOLATILE(void *, sc->vmbus_chmap), M_DEVBUF);
+	/* Don't leave a dangling channel map pointer behind. */
+	sc->vmbus_chmap = NULL;
+	mtx_destroy(&sc->vmbus_prichan_lock);
+	mtx_destroy(&sc->vmbus_chan_lock);
+
+	return (ret);
+}
+
+/*
+ * Event processing routine that does nothing.  vmbus_attach() installs
+ * it as vmbus_event_proc until the real routine is configured.
+ */
+static void
+vmbus_event_proc_dummy(struct vmbus_softc *sc __unused, int cpu __unused)
+{
+	/* Nothing to do. */
+}
+
+#ifdef EARLY_AP_STARTUP
+
+/*
+ * Interrupt config hook established by vmbus_attach(): performs the
+ * deferred attach and then removes the hook again.  The result of
+ * vmbus_doattach() is not checked.
+ */
+static void
+vmbus_intrhook(void *xsc)
+{
+	struct vmbus_softc *softc = xsc;
+
+	if (bootverbose)
+		device_printf(softc->vmbus_dev, "intrhook\n");
+
+	vmbus_doattach(softc);
+	config_intrhook_disestablish(&softc->vmbus_intrhook);
+}
+
+#endif	/* EARLY_AP_STARTUP */
+
+/*
+ * Device attach: record the softc in the global vmbus_sc and set up
+ * the minimal state; the real work is done by vmbus_doattach().
+ *
+ * With EARLY_AP_STARTUP the real attach is deferred to an interrupt
+ * config hook.  Otherwise it is run right away only if the system is
+ * no longer cold.  Always returns 0.
+ */
+static int
+vmbus_attach(device_t dev)
+{
+	struct vmbus_softc *sc = device_get_softc(dev);
+
+	vmbus_sc = sc;
+	sc->vmbus_dev = dev;
+	sc->vmbus_idtvec = -1;
+
+	/*
+	 * Event processing logic will be configured:
+	 * - After the vmbus protocol version negotiation.
+	 * - Before we request channel offers.
+	 */
+	sc->vmbus_event_proc = vmbus_event_proc_dummy;
+
+#ifdef EARLY_AP_STARTUP
+	/*
+	 * Defer the real attach until the pause(9) works as expected.
+	 */
+	sc->vmbus_intrhook.ich_func = vmbus_intrhook;
+	sc->vmbus_intrhook.ich_arg = sc;
+	config_intrhook_establish(&sc->vmbus_intrhook);
+#else	/* !EARLY_AP_STARTUP */
+	/*
+	 * If the system has already booted and thread scheduling is
+	 * possible, as indicated by the global cold being zero, call
+	 * the driver initialization directly.
+	 */
+	if (!cold)
+		vmbus_doattach(sc);
+#endif	/* EARLY_AP_STARTUP */
+
+	return (0);
+}
+
+/*
+ * Device detach: detach the children, destroy all channels, and then
+ * release what vmbus_doattach() set up, in this order: scan state,
+ * hypervisor connection, SynIC, interrupt resources, DMA memory,
+ * Hypercall context, channel map and locks, and finally (NEW_PCIB
+ * only) the MMIO resources.  Always returns 0.
+ */
+static int
+vmbus_detach(device_t dev)
+{
+	struct vmbus_softc *sc;
+
+	sc = device_get_softc(dev);
+
+	bus_generic_detach(dev);
+	vmbus_chan_destroy_all(sc);
+
+	vmbus_scan_teardown(sc);
+
+	vmbus_disconnect(sc);
+
+	/* Tear SynIC down on all CPUs, if it was set up. */
+	if ((sc->vmbus_flags & VMBUS_FLAG_SYNIC) != 0) {
+		sc->vmbus_flags &= ~VMBUS_FLAG_SYNIC;
+		smp_rendezvous(NULL, vmbus_synic_teardown, NULL, NULL);
+	}
+
+	vmbus_intr_teardown(sc);
+	vmbus_dma_free(sc);
+
+	if (sc->vmbus_xc != NULL) {
+		vmbus_xact_ctx_destroy(sc->vmbus_xc);
+		sc->vmbus_xc = NULL;
+	}
+
+	free(__DEVOLATILE(void *, sc->vmbus_chmap), M_DEVBUF);
+	mtx_destroy(&sc->vmbus_prichan_lock);
+	mtx_destroy(&sc->vmbus_chan_lock);
+
+#ifdef NEW_PCIB
+	vmbus_free_mmio_res(dev);
+#endif
+
+	return (0);
+}
+
+#ifndef EARLY_AP_STARTUP
+
+/*
+ * SYSINIT hook (non-EARLY_AP_STARTUP kernels): run the real attach for
+ * the vmbus softc, provided that we are a Hyper-V guest, a softc
+ * exists, and the system is no longer cold, i.e. it has booted far
+ * enough for thread scheduling to be possible.
+ */
+static void
+vmbus_sysinit(void *arg __unused)
+{
+	struct vmbus_softc *sc = vmbus_get_softc();
+
+	if (vm_guest == VM_GUEST_HV && sc != NULL && !cold)
+		vmbus_doattach(sc);
+}
+/*
+ * NOTE:
+ * This has to run as the last step of SI_SUB_SMP, i.e. after SMP is
+ * initialized.
+ */
+SYSINIT(vmbus_initialize, SI_SUB_SMP, SI_ORDER_ANY, vmbus_sysinit, NULL);
+
+#endif	/* !EARLY_AP_STARTUP */
diff --git a/sys/dev/hyperv/vmbus/vmbus_br.c b/sys/dev/hyperv/vmbus/vmbus_br.c
new file mode 100644
index 000000000000..7311f87fd596
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_br.c
@@ -0,0 +1,720 @@
+/*-
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
+
+#include <dev/hyperv/vmbus/vmbus_reg.h>
+#include <dev/hyperv/vmbus/vmbus_brvar.h>
+
+/*
+ * Amount of space available for write, given the read index (r), the
+ * write index (w) and the size of the data area (z).  NOTE: for r == w
+ * this yields z, i.e. equal indices mean the bufring is empty.
+ */
+#define	VMBUS_BR_WAVAIL(r, w, z)	\
+	(((w) >= (r)) ? ((z) - ((w) - (r))) : ((r) - (w)))
+
+/* Advance a bufring index by inc, wrapping around at sz */
+#define VMBUS_BR_IDXINC(idx, inc, sz)	(((idx) + (inc)) % (sz))
+
+/* Forward declarations of file-local functions */
+static int	vmbus_br_sysctl_state(SYSCTL_HANDLER_ARGS);
+static int	vmbus_br_sysctl_state_bin(SYSCTL_HANDLER_ARGS);
+static void	vmbus_br_setup(struct vmbus_br *br, void *buf, int blen);
+
+/*
+ * Sysctl handler reporting the state of the bufring passed in arg1 as
+ * one human readable string: interrupt count, read/write indices,
+ * interrupt mask, pending send size, feature value, and the number of
+ * bytes available for read and for write.
+ */
+static int
+vmbus_br_sysctl_state(SYSCTL_HANDLER_ARGS)
+{
+	const struct vmbus_br *br = arg1;
+	/* Take a snapshot of the fields first. */
+	uint64_t intrcnt = br->vbr_intrcnt;
+	uint32_t rindex = br->vbr_rindex;
+	uint32_t windex = br->vbr_windex;
+	uint32_t imask = br->vbr_imask;
+	uint32_t psndsz = br->vbr_psndsz;
+	uint32_t fvalue = br->vbr_fvalue;
+	uint32_t wavail, ravail;
+	char buf[256];
+
+	/* Whatever is not available for write is available for read. */
+	wavail = VMBUS_BR_WAVAIL(rindex, windex, br->vbr_dsize);
+	ravail = br->vbr_dsize - wavail;
+
+	snprintf(buf, sizeof(buf),
+	    "intrcnt:%ju rindex:%u windex:%u imask:%u psndsz:%u fvalue:%u "
+	    "ravail:%u wavail:%u",
+	    (uintmax_t)intrcnt, rindex, windex, imask, psndsz, fvalue,
+	    ravail, wavail);
+
+	return sysctl_handle_string(oidp, buf, sizeof(buf), req);
+}
+
+/*
+ * Binary bufring states.
+ *
+ * Sysctl handler exporting the state of the bufring passed in arg1 as
+ * an array of BR_STATE_MAX uint32_t values, indexed by the BR_STATE_*
+ * constants defined below.
+ */
+static int
+vmbus_br_sysctl_state_bin(SYSCTL_HANDLER_ARGS)
+{
+#define BR_STATE_RIDX	0
+#define BR_STATE_WIDX	1
+#define BR_STATE_IMSK	2
+#define BR_STATE_PSSZ	3
+#define BR_STATE_FVAL	4
+#define BR_STATE_RSPC	5
+#define BR_STATE_WSPC	6
+#define BR_STATE_MAX	7
+
+	const struct vmbus_br *br = arg1;
+	uint32_t snap[BR_STATE_MAX];
+	uint32_t wspace;
+
+	/* Snapshot the indices and derive the writable space from them. */
+	snap[BR_STATE_RIDX] = br->vbr_rindex;
+	snap[BR_STATE_WIDX] = br->vbr_windex;
+	wspace = VMBUS_BR_WAVAIL(snap[BR_STATE_RIDX], snap[BR_STATE_WIDX],
+	    br->vbr_dsize);
+
+	snap[BR_STATE_IMSK] = br->vbr_imask;
+	snap[BR_STATE_PSSZ] = br->vbr_psndsz;
+	snap[BR_STATE_FVAL] = br->vbr_fvalue;
+	snap[BR_STATE_WSPC] = wspace;
+	snap[BR_STATE_RSPC] = br->vbr_dsize - wspace;
+
+	return sysctl_handle_opaque(oidp, snap, sizeof(snap), req);
+}
+
+/*
+ * Create a sysctl node called name below br_tree and populate it with
+ * two read-only entries for the bufring br: "state" (string form) and
+ * "state_bin" (binary form).  If the node cannot be created, nothing
+ * is added.
+ */
+void
+vmbus_br_sysctl_create(struct sysctl_ctx_list *ctx, struct sysctl_oid *br_tree,
+    struct vmbus_br *br, const char *name)
+{
+	struct sysctl_oid *node;
+	char descr[64];
+
+	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(br_tree), OID_AUTO,
+	    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+	if (node != NULL) {
+		snprintf(descr, sizeof(descr), "%s state", name);
+		SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(node), OID_AUTO,
+		    "state", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+		    br, 0, vmbus_br_sysctl_state, "A", descr);
+
+		snprintf(descr, sizeof(descr), "%s binary state", name);
+		SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(node), OID_AUTO,
+		    "state_bin", CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
+		    br, 0, vmbus_br_sysctl_state_bin, "IU", descr);
+	}
+}
+
+/*
+ * Set the interrupt mask of the RX bufring, followed by a full memory
+ * barrier.
+ */
+void
+vmbus_rxbr_intr_mask(struct vmbus_rxbr *rbr)
+{
+
+	rbr->rxbr_imask = 1;
+	mb();
+}
+
+/*
+ * Number of bytes available for read in the RX bufring, i.e. the data
+ * size minus the space available for write, computed from a snapshot
+ * of the two indices.
+ */
+static __inline uint32_t
+vmbus_rxbr_avail(const struct vmbus_rxbr *rbr)
+{
+	/* Get snapshot */
+	const uint32_t rd = rbr->rxbr_rindex;
+	const uint32_t wr = rbr->rxbr_windex;
+	uint32_t writable;
+
+	writable = VMBUS_BR_WAVAIL(rd, wr, rbr->rxbr_dsize);
+	return (rbr->rxbr_dsize - writable);
+}
+
+/*
+ * Exported wrapper around vmbus_rxbr_avail(): number of bytes
+ * available for read in the RX bufring.
+ */
+uint32_t
+vmbus_rxbr_available(const struct vmbus_rxbr *rbr)
+{
+	uint32_t nbytes;
+
+	nbytes = vmbus_rxbr_avail(rbr);
+	return (nbytes);
+}
+
+/*
+ * Clear the interrupt mask of the RX bufring and return the number of
+ * bytes available for read afterwards.  A non-zero result means data
+ * arrived in the meantime (we raced) and the new incoming channel
+ * packets have to be processed.
+ */
+uint32_t
+vmbus_rxbr_intr_unmask(struct vmbus_rxbr *rbr)
+{
+	uint32_t pending;
+
+	rbr->rxbr_imask = 0;
+	mb();
+
+	/* Re-check the bufring only after the mask has been cleared. */
+	pending = vmbus_rxbr_avail(rbr);
+	return pending;
+}
+
+/*
+ * Attach the buffer buf of blen bytes to the bufring.  The buffer
+ * starts with a struct vmbus_bufring; the rest is the data area, whose
+ * size is recorded in vbr_dsize.
+ */
+static void
+vmbus_br_setup(struct vmbus_br *br, void *buf, int blen)
+{
+
+	br->vbr = buf;
+	br->vbr_dsize = blen - sizeof(struct vmbus_bufring);
+}
+
+/*
+ * Initialize the spin mutex of the RX bufring.
+ */
+void
+vmbus_rxbr_init(struct vmbus_rxbr *rbr)
+{
+
+	mtx_init(&rbr->rxbr_lock, "vmbus_rxbr", NULL, MTX_SPIN);
+}
+
+/*
+ * Destroy the spin mutex of the RX bufring.
+ */
+void
+vmbus_rxbr_deinit(struct vmbus_rxbr *rbr)
+{
+
+	mtx_destroy(&rbr->rxbr_lock);
+}
+
+/*
+ * Attach the buffer buf of blen bytes to the RX bufring; see
+ * vmbus_br_setup().
+ */
+void
+vmbus_rxbr_setup(struct vmbus_rxbr *rbr, void *buf, int blen)
+{
+
+	vmbus_br_setup(&rbr->rxbr, buf, blen);
+}
+
+/*
+ * Decide whether the host has to be signaled after bytes_read bytes
+ * were consumed from the RX bufring.
+ *
+ * The host is signaled only if the pending send size feature is on,
+ * the host has set a non-zero pending send size, and this read is the
+ * one that lifted the writable space above that size.
+ */
+static __inline boolean_t
+vmbus_rxbr_need_signal(const struct vmbus_rxbr *rbr, uint32_t bytes_read)
+{
+	uint32_t wanted, writable;
+
+	/* No need to signal if host doesn't want us to */
+	if (!rbr->rxbr_fpsndsz)
+		return false;
+
+	mb();
+
+	/* No need to signal if host sets pending_snd_sz to 0 */
+	wanted = rbr->rxbr_psndsz;
+	if (wanted == 0)
+		return false;
+
+	mb();
+
+	writable = rbr->rxbr_dsize - vmbus_rxbr_avail(rbr);
+
+	/*
+	 * Signal only if the space was not yet sufficient before this
+	 * read (writable - bytes_read <= wanted) and is sufficient now
+	 * (writable > wanted).
+	 */
+	return (writable - bytes_read <= wanted && writable > wanted);
+}
+
+/*
+ * Initialize the spin mutex of the TX bufring.
+ */
+void
+vmbus_txbr_init(struct vmbus_txbr *tbr)
+{
+
+	mtx_init(&tbr->txbr_lock, "vmbus_txbr", NULL, MTX_SPIN);
+}
+
+/*
+ * Destroy the spin mutex of the TX bufring.
+ */
+void
+vmbus_txbr_deinit(struct vmbus_txbr *tbr)
+{
+
+	mtx_destroy(&tbr->txbr_lock);
+}
+
+/*
+ * Attach the buffer buf of blen bytes to the TX bufring (see
+ * vmbus_br_setup()) and turn on the flow control feature bit.
+ */
+void
+vmbus_txbr_setup(struct vmbus_txbr *tbr, void *buf, int blen)
+{
+
+	vmbus_br_setup(&tbr->txbr, buf, blen);
+
+	/* Set feature bit enabling flow control */
+	tbr->txbr_fpsndsz = 1;
+}
+
+/*
+ * Return the current interrupt mask of the TX bufring.  A full memory
+ * barrier is issued before the mask is read.
+ */
+uint32_t
+vmbus_txbr_get_imask(const struct vmbus_txbr *tbr)
+{
+	uint32_t imask;
+
+	mb();
+	imask = tbr->txbr_imask;
+
+	return (imask);
+}
+
+/*
+ * Store size as the pending send size of the TX bufring.
+ */
+void
+vmbus_txbr_set_pending_snd_sz(struct vmbus_txbr *tbr, uint32_t size)
+{
+
+	tbr->txbr_psndsz = size;
+}
+
+/*
+ * When we write to the ring buffer, check if the host needs to be
+ * signaled.
+ *
+ * The contract:
+ * - The host guarantees that while it is draining the TX bufring,
+ *   it will set the br_imask to indicate it does not need to be
+ *   interrupted when new data are added.
+ * - The host guarantees that it will completely drain the TX bufring
+ *   before exiting the read loop.  Further, once the TX bufring is
+ *   empty, it will clear the br_imask and re-check to see if new
+ *   data have arrived.
+ *
+ * old_windex is the write index as it was before the write that was
+ * just published.  Returns TRUE if the host has to be signaled, FALSE
+ * otherwise.
+ */
+static __inline boolean_t
+vmbus_txbr_need_signal(const struct vmbus_txbr *tbr, uint32_t old_windex)
+{
+	mb();	/* full barrier before the interrupt mask is sampled */
+	if (tbr->txbr_imask)
+		return (FALSE);
+
+	__compiler_membar();
+
+	/*
+	 * The only case in which we need to signal is when the ring
+	 * transitions from being empty to non-empty, i.e. when the
+	 * read index equals the write index from before our write.
+	 */
+	if (old_windex == tbr->txbr_rindex)
+		return (TRUE);
+
+	return (FALSE);
+}
+
+/*
+ * Number of bytes available for write in the TX bufring, computed from
+ * a snapshot of the two indices.
+ */
+static __inline uint32_t
+vmbus_txbr_avail(const struct vmbus_txbr *tbr)
+{
+	/* Get snapshot */
+	const uint32_t rd = tbr->txbr_rindex;
+	const uint32_t wr = tbr->txbr_windex;
+
+	return VMBUS_BR_WAVAIL(rd, wr, tbr->txbr_dsize);
+}
+
+/*
+ * Copy cplen bytes from src0 into the data area of the TX bufring,
+ * starting at index windex and wrapping around at the end of the data
+ * area if necessary.  Returns the index following the copied data.
+ */
+static __inline uint32_t
+vmbus_txbr_copyto(const struct vmbus_txbr *tbr, uint32_t windex,
+    const void *src0, uint32_t cplen)
+{
+	const uint8_t *from = src0;
+	uint8_t *ring = tbr->txbr_data;
+	uint32_t ring_size = tbr->txbr_dsize;
+	uint32_t tail = ring_size - windex;	/* bytes up to the end */
+
+	if (cplen <= tail) {
+		memcpy(ring + windex, from, cplen);
+	} else {
+		/* Wrap-around: fill the tail, continue at the start. */
+		memcpy(ring + windex, from, tail);
+		memcpy(ring, from + tail, cplen - tail);
+	}
+	return VMBUS_BR_IDXINC(windex, cplen, ring_size);
+}
+
+/*
+ * Like vmbus_txbr_copyto(), but the data is produced by the callback
+ * cb, which is called with the destination address inside the data
+ * area, the length to fill and cbarg.  On wrap-around the callback is
+ * called once per fragment; the second call is skipped if the first
+ * one fails.
+ *
+ * The callback result is stored in *ret.  The returned index is the
+ * one following the cplen bytes, no matter what the callback returned.
+ */
+static __inline uint32_t
+vmbus_txbr_copyto_call(const struct vmbus_txbr *tbr, uint32_t windex,
+    uint32_t cplen, vmbus_br_copy_callback_t cb, void *cbarg, int *ret)
+{
+	uint8_t *ring = tbr->txbr_data;
+	uint32_t ring_size = tbr->txbr_dsize;
+	uint32_t tail = ring_size - windex;	/* bytes up to the end */
+	int error;
+
+	if (cplen <= tail) {
+		error = cb((void *)(ring + windex), cplen, cbarg);
+	} else {
+		/* Wrap-around: fill the tail, continue at the start. */
+		error = cb((void *)(ring + windex), tail, cbarg);
+		if (!error)
+			error = cb((void *)ring, cplen - tail, cbarg);
+	}
+	*ret = error;
+
+	return VMBUS_BR_IDXINC(windex, cplen, ring_size);
+}
+
+/*
+ * Exported wrapper around vmbus_txbr_avail(): number of bytes
+ * available for write in the TX bufring.
+ */
+uint32_t
+vmbus_txbr_available(const struct vmbus_txbr *tbr)
+{
+	uint32_t nbytes;
+
+	nbytes = vmbus_txbr_avail(tbr);
+	return (nbytes);
+}
+
+/*
+ * Write a scattered channel packet to the TX bufring, optionally
+ * letting a callback produce part of the data.
+ *
+ * iov entries with a non-NULL iov_base are copied as they are.  For
+ * entries with a NULL iov_base the callback cb (if not NULL) is called
+ * to fill iov_len bytes directly in the bufring.  As in
+ * vmbus_txbr_write(), the offset of the packet is written as a 64bits
+ * value right after the packet.
+ *
+ * Returns 0 on success, EAGAIN if the bufring does not have enough
+ * room, or the error returned by the callback; in the error cases the
+ * write index is not updated.  If need_sig is not NULL, it is set to
+ * tell whether the host has to be signaled.
+ *
+ * NOTE:
+ * Not holding lock when calling user provided callback routine.
+ * Caller should hold lock to serialize ring buffer accesses.
+ * The bufring spin lock is only taken around writing the packet offset
+ * and updating the write index.
+ */
+int
+vmbus_txbr_write_call(struct vmbus_txbr *tbr,
+    const struct iovec iov[], int iovlen,
+    vmbus_br_copy_callback_t cb, void *cbarg,
+    boolean_t *need_sig)
+{
+	uint32_t old_windex, windex, total;
+	uint64_t save_windex;
+	int i;
+	int cb_ret = 0;
+
+	/* Total size: all iov entries plus the trailing 64bits offset. */
+	total = 0;
+	for (i = 0; i < iovlen; i++)
+		total += iov[i].iov_len;
+	total += sizeof(save_windex);
+
+
+	/*
+	 * NOTE:
+	 * If this write is going to make br_windex same as br_rindex,
+	 * i.e. the available space for write is same as the write size,
+	 * we can't do it then, since br_windex == br_rindex means that
+	 * the bufring is empty.
+	 */
+	if (vmbus_txbr_avail(tbr) <= total) {
+		return (EAGAIN);
+	}
+
+	/* Save br_windex for later use */
+	old_windex = tbr->txbr_windex;
+
+	/*
+	 * Copy the scattered channel packet to the TX bufring.
+	 * Entries that have neither a base address nor a callback to
+	 * produce them are skipped.
+	 */
+	windex = old_windex;
+	for (i = 0; i < iovlen; i++) {
+		if (iov[i].iov_base != NULL) {
+			windex = vmbus_txbr_copyto(tbr, windex,
+			    iov[i].iov_base, iov[i].iov_len);
+		} else if (cb != NULL) {
+			windex = vmbus_txbr_copyto_call(tbr, windex,
+			    iov[i].iov_len, cb, cbarg, &cb_ret);
+			/*
+			 * If callback fails, return without updating
+			 * write index.
+			 */
+			if (cb_ret)
+				return (cb_ret);
+		}
+	}
+
+	mtx_lock_spin(&tbr->txbr_lock);
+
+	/*
+	 * Set the offset of the current channel packet; it is kept in
+	 * the upper 32 bits of the 64bits value.
+	 */
+	save_windex = ((uint64_t)old_windex) << 32;
+	windex = vmbus_txbr_copyto(tbr, windex, &save_windex,
+	    sizeof(save_windex));
+
+	/*
+	 * Update the write index _after_ the channel packet
+	 * is copied.
+	 */
+	__compiler_membar();
+	tbr->txbr_windex = windex;
+
+	mtx_unlock_spin(&tbr->txbr_lock);
+
+	/* The signal decision is optional for the caller. */
+	if (need_sig)
+		*need_sig = vmbus_txbr_need_signal(tbr, old_windex);
+
+	return (0);
+}
+
+/*
+ * Write scattered channel packet to TX bufring.
+ *
+ * The offset of this channel packet is written as a 64bits value
+ * immediately after this channel packet.
+ *
+ * The whole operation is done with the bufring spin lock held.
+ *
+ * Returns 0 on success, or EAGAIN if the bufring does not have enough
+ * room, in which case nothing is written.  If need_sig is not NULL, it
+ * is set on success to tell whether the host has to be signaled; as in
+ * vmbus_txbr_write_call(), callers not interested in that may pass
+ * NULL.
+ */
+int
+vmbus_txbr_write(struct vmbus_txbr *tbr, const struct iovec iov[], int iovlen,
+    boolean_t *need_sig)
+{
+	uint32_t old_windex, windex, total;
+	uint64_t save_windex;
+	int i;
+
+	/* Total size: all iov entries plus the trailing 64bits offset. */
+	total = 0;
+	for (i = 0; i < iovlen; i++)
+		total += iov[i].iov_len;
+	total += sizeof(save_windex);
+
+	mtx_lock_spin(&tbr->txbr_lock);
+
+	/*
+	 * NOTE:
+	 * If this write is going to make br_windex same as br_rindex,
+	 * i.e. the available space for write is same as the write size,
+	 * we can't do it then, since br_windex == br_rindex means that
+	 * the bufring is empty.
+	 */
+	if (vmbus_txbr_avail(tbr) <= total) {
+		mtx_unlock_spin(&tbr->txbr_lock);
+		return (EAGAIN);
+	}
+
+	/* Save br_windex for later use */
+	old_windex = tbr->txbr_windex;
+
+	/*
+	 * Copy the scattered channel packet to the TX bufring.
+	 */
+	windex = old_windex;
+	for (i = 0; i < iovlen; i++) {
+		windex = vmbus_txbr_copyto(tbr, windex,
+		    iov[i].iov_base, iov[i].iov_len);
+	}
+
+	/*
+	 * Set the offset of the current channel packet; it is kept in
+	 * the upper 32 bits of the 64bits value.
+	 */
+	save_windex = ((uint64_t)old_windex) << 32;
+	windex = vmbus_txbr_copyto(tbr, windex, &save_windex,
+	    sizeof(save_windex));
+
+	/*
+	 * Update the write index _after_ the channel packet
+	 * is copied.
+	 */
+	__compiler_membar();
+	tbr->txbr_windex = windex;
+
+	mtx_unlock_spin(&tbr->txbr_lock);
+
+	/* The signal decision is optional for the caller. */
+	if (need_sig != NULL)
+		*need_sig = vmbus_txbr_need_signal(tbr, old_windex);
+
+	return (0);
+}
+
+/*
+ * Copy cplen bytes out of the data area of the RX bufring into dst0,
+ * starting at index rindex and wrapping around at the end of the data
+ * area if necessary.  Returns the index following the copied data.
+ */
+static __inline uint32_t
+vmbus_rxbr_copyfrom(const struct vmbus_rxbr *rbr, uint32_t rindex,
+    void *dst0, int cplen)
+{
+	uint8_t *to = dst0;
+	const uint8_t *ring = rbr->rxbr_data;
+	uint32_t ring_size = rbr->rxbr_dsize;
+	uint32_t tail = ring_size - rindex;	/* bytes up to the end */
+
+	if (cplen <= tail) {
+		memcpy(to, ring + rindex, cplen);
+	} else {
+		/* Wrap-around: take the tail, continue at the start. */
+		memcpy(to, ring + rindex, tail);
+		memcpy(to + tail, ring, cplen - tail);
+	}
+	return VMBUS_BR_IDXINC(rindex, cplen, ring_size);
+}
+
+/*
+ * Hand cplen bytes of the data area of the RX bufring, starting at
+ * index rindex, to the callback cb, which is called with the address
+ * inside the data area, the length and cbarg.  On wrap-around the
+ * callback is called once per fragment; the second call is skipped if
+ * the first one fails.
+ *
+ * NOTE: unlike vmbus_rxbr_copyfrom(), this returns the result of the
+ * callback, not an index.
+ */
+static __inline uint32_t
+vmbus_rxbr_copyfrom_call(const struct vmbus_rxbr *rbr, uint32_t rindex,
+    int cplen, vmbus_br_copy_callback_t cb, void *cbarg)
+{
+	uint8_t *ring = rbr->rxbr_data;
+	uint32_t ring_size = rbr->rxbr_dsize;
+	uint32_t tail = ring_size - rindex;	/* bytes up to the end */
+	int error;
+
+	if (cplen <= tail) {
+		error = cb((void *)(ring + rindex), cplen, cbarg);
+	} else {
+		/* Wrap-around: take the tail, continue at the start. */
+		error = cb((void *)(ring + rindex), tail, cbarg);
+		if (!error)
+			error = cb((void *)ring, cplen - tail, cbarg);
+	}
+	return (error);
+}
+
+/*
+ * Copy dlen bytes from the current read position of the RX bufring into
+ * data, without moving the read index.
+ *
+ * Returns EAGAIN unless the bufring holds at least dlen bytes plus the
+ * 64bits channel packet offset; returns 0 once the data is copied.
+ */
+int
+vmbus_rxbr_peek(struct vmbus_rxbr *rbr, void *data, int dlen)
+{
+	int error = EAGAIN;
+
+	mtx_lock_spin(&rbr->rxbr_lock);
+	if (vmbus_rxbr_avail(rbr) >= dlen + sizeof(uint64_t)) {
+		vmbus_rxbr_copyfrom(rbr, rbr->rxbr_rindex, data, dlen);
+		error = 0;
+	}
+	mtx_unlock_spin(&rbr->rxbr_lock);
+
+	return (error);
+}
+
+/*
+ * Hand dlen bytes of the RX bufring, located skip bytes past the current
+ * read position, to the callback cb.  The read index is not moved.
+ *
+ * NOTE:
+ * The spin lock only covers the space check and the computation of the
+ * start index; it is dropped before the user provided callback runs.
+ * Caller should hold lock to serialize ring buffer accesses.
+ *
+ * Returns EAGAIN if fewer than skip + dlen bytes plus the 64bits channel
+ * packet offset are available; otherwise whatever
+ * vmbus_rxbr_copyfrom_call() returns.
+ */
+int
+vmbus_rxbr_peek_call(struct vmbus_rxbr *rbr, int dlen, uint32_t skip,
+    vmbus_br_copy_callback_t cb, void *cbarg)
+{
+	uint32_t ring_size = rbr->rxbr_dsize;
+	uint32_t start;
+
+	mtx_lock_spin(&rbr->rxbr_lock);
+	if (vmbus_rxbr_avail(rbr) < skip + dlen + sizeof(uint64_t)) {
+		mtx_unlock_spin(&rbr->rxbr_lock);
+		return (EAGAIN);
+	}
+	start = VMBUS_BR_IDXINC(rbr->rxbr_rindex, skip, ring_size);
+	mtx_unlock_spin(&rbr->rxbr_lock);
+
+	return (vmbus_rxbr_copyfrom_call(rbr, start, dlen, cb, cbarg));
+}
+
+/*
+ * Optionally consume the current channel packet, then peek at the bytes
+ * that follow it.
+ *
+ * When idx_adv is non-zero the read index is first advanced by idx_adv
+ * plus the channel's 64bit previous write offset.  After that dlen bytes
+ * at the (possibly updated) read index are copied into data; the read
+ * index is not moved past the copied bytes.
+ *
+ * NOTE:
+ * We assume idx_adv == sizeof(channel packet).
+ *
+ * Returns EAGAIN if the bufring holds fewer than
+ * idx_adv + sizeof(uint64_t) + dlen bytes, 0 otherwise.  If need_sig is
+ * not NULL it receives the result of vmbus_rxbr_need_signal() for the
+ * advance, or false when nothing was consumed.
+ */
+int
+vmbus_rxbr_idxadv_peek(struct vmbus_rxbr *rbr, void *data, int dlen,
+    uint32_t idx_adv, boolean_t *need_sig)
+{
+	const uint32_t ring_size = rbr->rxbr_dsize;
+	const size_t consumed = idx_adv + sizeof(uint64_t);
+
+	mtx_lock_spin(&rbr->rxbr_lock);
+
+	/* Both the dropped packet and the peeked bytes must be present. */
+	if (vmbus_rxbr_avail(rbr) < consumed + dlen) {
+		mtx_unlock_spin(&rbr->rxbr_lock);
+		return (EAGAIN);
+	}
+
+	if (idx_adv != 0) {
+		uint32_t next;
+
+		next = VMBUS_BR_IDXINC(rbr->rxbr_rindex, consumed, ring_size);
+		/* Compiler barrier before the new read index is stored. */
+		__compiler_membar();
+		rbr->rxbr_rindex = next;
+	}
+	vmbus_rxbr_copyfrom(rbr, rbr->rxbr_rindex, data, dlen);
+
+	mtx_unlock_spin(&rbr->rxbr_lock);
+
+	if (need_sig != NULL) {
+		*need_sig = (idx_adv != 0) ?
+		    vmbus_rxbr_need_signal(rbr, consumed) : false;
+	}
+	return (0);
+}
+
+/*
+ * NOTE:
+ * Just update the RX rb index.
+ *
+ * The read index is advanced by idx_adv plus the channel's 64bit
+ * previous write offset; no data is copied.
+ *
+ * Returns EAGAIN if fewer than idx_adv + sizeof(uint64_t) bytes are
+ * available, 0 otherwise.  If need_sig is not NULL it receives the
+ * result of vmbus_rxbr_need_signal() for the advanced byte count.
+ */
+int
+vmbus_rxbr_idxadv(struct vmbus_rxbr *rbr, uint32_t idx_adv,
+    boolean_t *need_sig)
+{
+	const uint32_t ring_size = rbr->rxbr_dsize;
+	const size_t consumed = idx_adv + sizeof(uint64_t);
+	uint32_t next;
+
+	mtx_lock_spin(&rbr->rxbr_lock);
+
+	/* Refuse to step over bytes that are not there yet. */
+	if (vmbus_rxbr_avail(rbr) < consumed) {
+		mtx_unlock_spin(&rbr->rxbr_lock);
+		return (EAGAIN);
+	}
+
+	next = VMBUS_BR_IDXINC(rbr->rxbr_rindex, consumed, ring_size);
+	/* Compiler barrier before the new read index is stored. */
+	__compiler_membar();
+	rbr->rxbr_rindex = next;
+
+	mtx_unlock_spin(&rbr->rxbr_lock);
+
+	if (need_sig != NULL)
+		*need_sig = vmbus_rxbr_need_signal(rbr, consumed);
+
+	return (0);
+}
+
+/*
+ * Read one channel packet from the RX bufring: step over the first skip
+ * bytes, copy the following dlen bytes into data, then drop the
+ * packet's trailing 64bits offset and store the new read index.
+ *
+ * NOTE:
+ * We assume (dlen + skip) == sizeof(channel packet).
+ *
+ * Returns EAGAIN if fewer than dlen + skip + sizeof(uint64_t) bytes are
+ * available, 0 otherwise.
+ */
+int
+vmbus_rxbr_read(struct vmbus_rxbr *rbr, void *data, int dlen, uint32_t skip)
+{
+	const uint32_t ring_size = rbr->rxbr_dsize;
+	uint32_t cursor;
+	int error = EAGAIN;
+
+	KASSERT(dlen + skip > 0, ("invalid dlen %d, offset %u", dlen, skip));
+
+	mtx_lock_spin(&rbr->rxbr_lock);
+
+	if (vmbus_rxbr_avail(rbr) >= dlen + skip + sizeof(uint64_t)) {
+		/* Step over the skipped prefix and fetch the payload. */
+		cursor = VMBUS_BR_IDXINC(rbr->rxbr_rindex, skip, ring_size);
+		cursor = vmbus_rxbr_copyfrom(rbr, cursor, data, dlen);
+
+		/* The trailing 64bits offset is useless to us; discard it. */
+		cursor = VMBUS_BR_IDXINC(cursor, sizeof(uint64_t), ring_size);
+
+		/* Store the read index only _after_ the packet is fetched. */
+		__compiler_membar();
+		rbr->rxbr_rindex = cursor;
+
+		error = 0;
+	}
+
+	mtx_unlock_spin(&rbr->rxbr_lock);
+
+	return (error);
+}
diff --git a/sys/dev/hyperv/vmbus/vmbus_brvar.h b/sys/dev/hyperv/vmbus/vmbus_brvar.h
new file mode 100644
index 000000000000..95bf4338ff1c
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_brvar.h
@@ -0,0 +1,157 @@
+/*-
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_BRVAR_H_
+#define _VMBUS_BRVAR_H_
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/_iovec.h>
+
+struct vmbus_br {
+	struct vmbus_bufring	*vbr;		/* bufring reached by vbr_* below */
+	uint32_t		vbr_dsize;	/* total data size */
+};
+
+#define vbr_windex		vbr->br_windex	/* vbr_*: bufring fields via vbr */
+#define vbr_rindex		vbr->br_rindex
+#define vbr_imask		vbr->br_imask
+#define vbr_psndsz		vbr->br_pending_snd_sz
+#define vbr_fpsndsz		vbr->br_feature_bits.feat_pending_snd_sz
+#define vbr_fvalue		vbr->br_feature_bits.value
+#define vbr_intrcnt		vbr->br_g2h_intr_cnt
+#define vbr_data		vbr->br_data
+
+struct vmbus_rxbr {
+	struct mtx		rxbr_lock;	/* taken with mtx_lock_spin() */
+	struct vmbus_br		rxbr;		/* bufring pointer + data size */
+};
+
+#define rxbr_windex		rxbr.vbr_windex	/* rxbr_*: forward to rxbr.vbr_* */
+#define rxbr_rindex		rxbr.vbr_rindex
+#define rxbr_imask		rxbr.vbr_imask
+#define rxbr_psndsz		rxbr.vbr_psndsz
+#define rxbr_fpsndsz		rxbr.vbr_fpsndsz
+#define rxbr_fvalue		rxbr.vbr_fvalue
+#define rxbr_intrcnt		rxbr.vbr_intrcnt
+#define rxbr_data		rxbr.vbr_data
+#define rxbr_dsize		rxbr.vbr_dsize
+
+struct vmbus_txbr {
+	struct mtx		txbr_lock;	/* taken with mtx_lock_spin() */
+	struct vmbus_br		txbr;		/* bufring pointer + data size */
+};
+
+#define txbr_windex		txbr.vbr_windex	/* txbr_*: forward to txbr.vbr_* */
+#define txbr_rindex		txbr.vbr_rindex
+#define txbr_imask		txbr.vbr_imask
+#define txbr_psndsz		txbr.vbr_psndsz
+#define txbr_fpsndsz		txbr.vbr_fpsndsz
+#define txbr_fvalue		txbr.vbr_fvalue
+#define txbr_intrcnt		txbr.vbr_intrcnt
+#define txbr_data		txbr.vbr_data
+#define txbr_dsize		txbr.vbr_dsize
+
+struct sysctl_ctx_list;
+struct sysctl_oid;
+
+/*
+ * Largest packet size that can be written to the TX bufring: the data
+ * size minus the per-packet overhead described below.
+ */
+static __inline int
+vmbus_txbr_maxpktsz(const struct vmbus_txbr *tbr)
+{
+	/*
+	 * Overhead that is never usable for packet data:
+	 * - sizeof(uint64_t) for the trailing start index.
+	 * - 1, since the rindex and windex can't be same.  See
+	 *   the comment near vmbus_bufring.br_{r,w}index.
+	 */
+	const size_t reserved = sizeof(uint64_t) + 1;
+
+	return (tbr->txbr_dsize - reserved);
+}
+
+/* True when the TX bufring's write index equals its read index. */
+static __inline bool
+vmbus_txbr_empty(const struct vmbus_txbr *tbr)
+{
+
+	return (tbr->txbr_windex == tbr->txbr_rindex);
+}
+
+/* True when the RX bufring's write index equals its read index. */
+static __inline bool
+vmbus_rxbr_empty(const struct vmbus_rxbr *rbr)
+{
+
+	return (rbr->rxbr_windex == rbr->rxbr_rindex);
+}
+
+/*
+ * Number of elem_size byte elements that fit into a bufring of br_size
+ * bytes, once the bufring header is stripped and each element is
+ * charged for its trailing 64bit index.
+ */
+static __inline int
+vmbus_br_nelem(int br_size, int elem_size)
+{
+	/* Space left after the bufring header. */
+	const int payload = br_size - sizeof(struct vmbus_bufring);
+	/* Element plus its per-element trailing index. */
+	const int slot = elem_size + sizeof(uint64_t);
+
+	return (payload / slot);
+}
+
+void		vmbus_br_sysctl_create(struct sysctl_ctx_list *ctx,
+		    struct sysctl_oid *br_tree, struct vmbus_br *br,
+		    const char *name);
+
+void		vmbus_rxbr_init(struct vmbus_rxbr *rbr);
+void		vmbus_rxbr_deinit(struct vmbus_rxbr *rbr);
+void		vmbus_rxbr_setup(struct vmbus_rxbr *rbr, void *buf, int blen);
+int		vmbus_rxbr_peek(struct vmbus_rxbr *rbr, void *data, int dlen);
+int		vmbus_rxbr_read(struct vmbus_rxbr *rbr, void *data, int dlen,
+		    uint32_t skip);
+int		vmbus_rxbr_idxadv(struct vmbus_rxbr *rbr, uint32_t idx_adv,
+		    boolean_t *need_sig);
+int		vmbus_rxbr_idxadv_peek(struct vmbus_rxbr *rbr, void *data,
+		    int dlen, uint32_t idx_adv, boolean_t *need_sig);
+int		vmbus_rxbr_peek_call(struct vmbus_rxbr *rbr, int dlen,
+		    uint32_t skip, vmbus_br_copy_callback_t cb, void *cbarg);
+void		vmbus_rxbr_intr_mask(struct vmbus_rxbr *rbr);
+uint32_t	vmbus_rxbr_intr_unmask(struct vmbus_rxbr *rbr);
+uint32_t	vmbus_rxbr_available(const struct vmbus_rxbr *rbr);
+
+void		vmbus_txbr_init(struct vmbus_txbr *tbr);
+void		vmbus_txbr_deinit(struct vmbus_txbr *tbr);
+void		vmbus_txbr_setup(struct vmbus_txbr *tbr, void *buf, int blen);
+int		vmbus_txbr_write(struct vmbus_txbr *tbr,
+		    const struct iovec iov[], int iovlen, boolean_t *need_sig);
+int		vmbus_txbr_write_call(struct vmbus_txbr *tbr,
+		    const struct iovec iov[], int iovlen,
+		    vmbus_br_copy_callback_t cb, void *cbarg,
+		    boolean_t *need_sig);
+uint32_t	vmbus_txbr_available(const struct vmbus_txbr *tbr);
+uint32_t	vmbus_txbr_get_imask(const struct vmbus_txbr *tbr);
+void		vmbus_txbr_set_pending_snd_sz(struct vmbus_txbr *tbr,
+		    uint32_t size);
+
+#endif  /* _VMBUS_BRVAR_H_ */
diff --git a/sys/dev/hyperv/vmbus/vmbus_chan.c b/sys/dev/hyperv/vmbus/vmbus_chan.c
new file mode 100644
index 000000000000..032e06c47c95
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_chan.c
@@ -0,0 +1,2390 @@
+/*-
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <machine/atomic.h>
+#include <machine/stdarg.h>
+
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+#include <dev/hyperv/vmbus/hyperv_var.h>
+#include <dev/hyperv/vmbus/vmbus_reg.h>
+#include <dev/hyperv/vmbus/vmbus_var.h>
+#include <dev/hyperv/vmbus/vmbus_brvar.h>
+#include <dev/hyperv/vmbus/vmbus_chanvar.h>
+
+struct vmbus_chan_pollarg {
+	struct vmbus_channel	*poll_chan;	/* channel the request refers to */
+	u_int			poll_hz;	/* presumably poll rate in Hz -- TODO confirm */
+};
+
+static void			vmbus_chan_update_evtflagcnt(
+				    struct vmbus_softc *,
+				    const struct vmbus_channel *);
+static int			vmbus_chan_close_internal(
+				    struct vmbus_channel *);
+static int			vmbus_chan_sysctl_mnf(SYSCTL_HANDLER_ARGS);
+static void			vmbus_chan_sysctl_create(
+				    struct vmbus_channel *);
+static struct vmbus_channel	*vmbus_chan_alloc(struct vmbus_softc *);
+static void			vmbus_chan_free(struct vmbus_channel *);
+static int			vmbus_chan_add(struct vmbus_channel *);
+static void			vmbus_chan_cpu_default(struct vmbus_channel *);
+static int			vmbus_chan_release(struct vmbus_channel *);
+static void			vmbus_chan_set_chmap(struct vmbus_channel *);
+static void			vmbus_chan_clear_chmap(struct vmbus_channel *);
+static void			vmbus_chan_detach(struct vmbus_channel *);
+static bool			vmbus_chan_wait_revoke(
+				    const struct vmbus_channel *, bool);
+static void			vmbus_chan_poll_timeout(void *);
+static bool			vmbus_chan_poll_cancel_intq(
+				    struct vmbus_channel *);
+static void			vmbus_chan_poll_cancel(struct vmbus_channel *);
+
+static void			vmbus_chan_ins_prilist(struct vmbus_softc *,
+				    struct vmbus_channel *);
+static void			vmbus_chan_rem_prilist(struct vmbus_softc *,
+				    struct vmbus_channel *);
+static void			vmbus_chan_ins_list(struct vmbus_softc *,
+				    struct vmbus_channel *);
+static void			vmbus_chan_rem_list(struct vmbus_softc *,
+				    struct vmbus_channel *);
+static void			vmbus_chan_ins_sublist(struct vmbus_channel *,
+				    struct vmbus_channel *);
+static void			vmbus_chan_rem_sublist(struct vmbus_channel *,
+				    struct vmbus_channel *);
+
+static void			vmbus_chan_task(void *, int);
+static void			vmbus_chan_task_nobatch(void *, int);
+static void			vmbus_chan_poll_task(void *, int);
+static void			vmbus_chan_clrchmap_task(void *, int);
+static void			vmbus_chan_pollcfg_task(void *, int);
+static void			vmbus_chan_polldis_task(void *, int);
+static void			vmbus_chan_poll_cancel_task(void *, int);
+static void			vmbus_prichan_attach_task(void *, int);
+static void			vmbus_subchan_attach_task(void *, int);
+static void			vmbus_prichan_detach_task(void *, int);
+static void			vmbus_subchan_detach_task(void *, int);
+
+static void			vmbus_chan_msgproc_choffer(struct vmbus_softc *,
+				    const struct vmbus_message *);
+static void			vmbus_chan_msgproc_chrescind(
+				    struct vmbus_softc *,
+				    const struct vmbus_message *);
+
+static int			vmbus_chan_printf(const struct vmbus_channel *,
+				    const char *, ...) __printflike(2, 3);
+
+/*
+ * Vmbus channel message processing.
+ *
+ * Table of VMBUS_CHANMSG_TYPE_MAX entries, filled in through the
+ * VMBUS_CHANMSG_PROC*() macros: CHOFFER and CHRESCIND are given the
+ * dedicated handlers of this file, while CHOPEN_RESP, GPADL_CONNRESP and
+ * GPADL_DISCONNRESP use VMBUS_CHANMSG_PROC_WAKEUP().  Entries that are
+ * not initialized here are zero, as in any partially initialized array.
+ * NOTE(review): the macros are defined outside this file; presumably they
+ * expand to designated initializers indexed by message type -- confirm.
+ */
+static const vmbus_chanmsg_proc_t
+vmbus_chan_msgprocs[VMBUS_CHANMSG_TYPE_MAX] = {
+	VMBUS_CHANMSG_PROC(CHOFFER,	vmbus_chan_msgproc_choffer),
+	VMBUS_CHANMSG_PROC(CHRESCIND,	vmbus_chan_msgproc_chrescind),
+
+	VMBUS_CHANMSG_PROC_WAKEUP(CHOPEN_RESP),
+	VMBUS_CHANMSG_PROC_WAKEUP(GPADL_CONNRESP),
+	VMBUS_CHANMSG_PROC_WAKEUP(GPADL_DISCONNRESP)
+};
+
+/*
+ * Notify host that there are data pending on our TX bufring or
+ * we have put some data on the TX bufring.
+ *
+ * The channel's event flag bits are always set.  After that, a channel
+ * with VMBUS_CHAN_TXF_HASMNF set has its monitor trigger bits set;
+ * any other channel issues the signal-event hypercall instead.
+ */
+static __inline void
+vmbus_chan_signal(const struct vmbus_channel *chan)
+{
+	atomic_set_long(chan->ch_evtflag, chan->ch_evtflag_mask);
+
+	if ((chan->ch_txflags & VMBUS_CHAN_TXF_HASMNF) == 0) {
+		hypercall_signal_event(chan->ch_monprm_dma.hv_paddr);
+		return;
+	}
+	atomic_set_int(chan->ch_montrig, chan->ch_montrig_mask);
+}
+
+/* Bump the TX bufring's interrupt counter, then signal the channel. */
+static __inline void
+vmbus_chan_signal_tx(struct vmbus_channel *chan)
+{
+	struct vmbus_txbr *tbr = &chan->ch_txbr;
+
+	tbr->txbr_intrcnt++;
+	vmbus_chan_signal(chan);
+}
+
+/* Bump the RX bufring's interrupt counter, then signal the channel. */
+static __inline void
+vmbus_chan_signal_rx(struct vmbus_channel *chan)
+{
+	struct vmbus_rxbr *rbr = &chan->ch_rxbr;
+
+	rbr->rxbr_intrcnt++;
+	vmbus_chan_signal(chan);
+}
+
+/*
+ * Mark chan as being on the primary channel list and append it to
+ * sc->vmbus_prichans.  vmbus_prichan_lock is asserted to be owned.
+ * Panics if the channel is already marked as being on that list.
+ */
+static void
+vmbus_chan_ins_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan)
+{
+	int was_on_list;
+
+	mtx_assert(&sc->vmbus_prichan_lock, MA_OWNED);
+
+	was_on_list = atomic_testandset_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_ONPRIL_SHIFT);
+	if (was_on_list != 0)
+		panic("channel is already on the prilist");
+
+	TAILQ_INSERT_TAIL(&sc->vmbus_prichans, chan, ch_prilink);
+}
+
+/*
+ * Clear chan's on-primary-list mark and unlink it from
+ * sc->vmbus_prichans.  vmbus_prichan_lock is asserted to be owned.
+ * Panics if the channel is not marked as being on that list.
+ */
+static void
+vmbus_chan_rem_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan)
+{
+	int was_on_list;
+
+	mtx_assert(&sc->vmbus_prichan_lock, MA_OWNED);
+
+	was_on_list = atomic_testandclear_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_ONPRIL_SHIFT);
+	if (!was_on_list)
+		panic("channel is not on the prilist");
+
+	TAILQ_REMOVE(&sc->vmbus_prichans, chan, ch_prilink);
+}
+
+/*
+ * Mark chan as being on a sub-channel list, append it to
+ * prichan->ch_subchans and bump the primary channel's sub-channel
+ * count.  prichan->ch_subchan_lock is asserted to be owned.
+ * Panics if the channel is already marked as being on a sublist.
+ */
+static void
+vmbus_chan_ins_sublist(struct vmbus_channel *prichan,
+    struct vmbus_channel *chan)
+{
+	int was_on_list;
+
+	mtx_assert(&prichan->ch_subchan_lock, MA_OWNED);
+
+	was_on_list = atomic_testandset_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_ONSUBL_SHIFT);
+	if (was_on_list != 0)
+		panic("channel is already on the sublist");
+
+	TAILQ_INSERT_TAIL(&prichan->ch_subchans, chan, ch_sublink);
+	prichan->ch_subchan_cnt += 1;
+}
+
+/*
+ * Drop the primary channel's sub-channel count, clear chan's
+ * on-sublist mark and unlink it from prichan->ch_subchans.
+ * prichan->ch_subchan_lock is asserted to be owned.
+ * Panics if the channel is not marked as being on a sublist.
+ */
+static void
+vmbus_chan_rem_sublist(struct vmbus_channel *prichan,
+    struct vmbus_channel *chan)
+{
+	int was_on_list;
+
+	mtx_assert(&prichan->ch_subchan_lock, MA_OWNED);
+
+	/* The count is dropped before the list mark is checked. */
+	KASSERT(prichan->ch_subchan_cnt > 0,
+	    ("invalid subchan_cnt %d", prichan->ch_subchan_cnt));
+	prichan->ch_subchan_cnt -= 1;
+
+	was_on_list = atomic_testandclear_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_ONSUBL_SHIFT);
+	if (!was_on_list)
+		panic("channel is not on the sublist");
+
+	TAILQ_REMOVE(&prichan->ch_subchans, chan, ch_sublink);
+}
+
+/*
+ * Mark chan as being on the channel list and append it to
+ * sc->vmbus_chans.  vmbus_chan_lock is asserted to be owned.
+ * Panics if the channel is already marked as being on that list.
+ */
+static void
+vmbus_chan_ins_list(struct vmbus_softc *sc, struct vmbus_channel *chan)
+{
+	int was_on_list;
+
+	mtx_assert(&sc->vmbus_chan_lock, MA_OWNED);
+
+	was_on_list = atomic_testandset_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_ONLIST_SHIFT);
+	if (was_on_list != 0)
+		panic("channel is already on the list");
+
+	TAILQ_INSERT_TAIL(&sc->vmbus_chans, chan, ch_link);
+}
+
+/*
+ * Clear chan's on-list mark and unlink it from sc->vmbus_chans.
+ * vmbus_chan_lock is asserted to be owned.
+ * Panics if the channel is not marked as being on that list.
+ */
+static void
+vmbus_chan_rem_list(struct vmbus_softc *sc, struct vmbus_channel *chan)
+{
+	int was_on_list;
+
+	mtx_assert(&sc->vmbus_chan_lock, MA_OWNED);
+
+	was_on_list = atomic_testandclear_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_ONLIST_SHIFT);
+	if (!was_on_list)
+		panic("channel is not on the list");
+
+	TAILQ_REMOVE(&sc->vmbus_chans, chan, ch_link);
+}
+
+/*
+ * Sysctl handler: reports 1 if the channel (arg1) has
+ * VMBUS_CHAN_TXF_HASMNF set in its TX flags, 0 otherwise.
+ */
+static int
+vmbus_chan_sysctl_mnf(SYSCTL_HANDLER_ARGS)
+{
+	const struct vmbus_channel *chan = arg1;
+	int mnf;
+
+	mnf = (chan->ch_txflags & VMBUS_CHAN_TXF_HASMNF) ? 1 : 0;
+	return (sysctl_handle_int(oidp, &mnf, 0, req));
+}
+
+/*
+ * Create the sysctl nodes describing chan below the sysctl tree of the
+ * device the channel belongs to:
+ *
+ *   dev.NAME.UNIT.channel.CHANID[.sub.SUBIDX].{cpu,mnf,br.rx,br.tx}
+ *
+ * CHANID is always the id of the primary channel; sub-channels get the
+ * additional "sub.SUBIDX" level plus a "chanid" leaf holding their own
+ * channel id.  The function gives up silently as soon as one of the
+ * intermediate nodes can not be created.
+ */
+static void
+vmbus_chan_sysctl_create(struct vmbus_channel *chan)
+{
+	struct sysctl_oid *ch_tree, *chid_tree, *br_tree;
+	struct sysctl_ctx_list *ctx;
+	uint32_t ch_id;
+	char name[16];
+
+	/*
+	 * Add sysctl nodes related to this channel to this
+	 * channel's sysctl ctx, so that they can be destroyed
+	 * independently upon close of this channel, which can
+	 * happen even if the device is not detached.
+	 */
+	ctx = &chan->ch_sysctl_ctx;
+	sysctl_ctx_init(ctx);
+
+	/*
+	 * Create dev.NAME.UNIT.channel tree.
+	 */
+	ch_tree = SYSCTL_ADD_NODE(ctx,
+	    SYSCTL_CHILDREN(device_get_sysctl_tree(chan->ch_dev)),
+	    OID_AUTO, "channel", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+	if (ch_tree == NULL)
+		return;
+
+	/*
+	 * Create dev.NAME.UNIT.channel.CHANID tree.
+	 */
+	if (VMBUS_CHAN_ISPRIMARY(chan))
+		ch_id = chan->ch_id;
+	else
+		ch_id = chan->ch_prichan->ch_id;
+	/* ch_id is unsigned; format it as such. */
+	snprintf(name, sizeof(name), "%u", ch_id);
+	chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree),
+	    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+	if (chid_tree == NULL)
+		return;
+
+	if (!VMBUS_CHAN_ISPRIMARY(chan)) {
+		/*
+		 * Create dev.NAME.UNIT.channel.CHANID.sub tree.
+		 */
+		ch_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree),
+		    OID_AUTO, "sub", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+		if (ch_tree == NULL)
+			return;
+
+		/*
+		 * Create dev.NAME.UNIT.channel.CHANID.sub.SUBIDX tree.
+		 *
+		 * NOTE:
+		 * chid_tree is changed to this new sysctl tree.
+		 */
+		snprintf(name, sizeof(name), "%d", chan->ch_subidx);
+		chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree),
+		    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+		if (chid_tree == NULL)
+			return;
+
+		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
+		    "chanid", CTLFLAG_RD, &chan->ch_id, 0, "channel id");
+	}
+
+	/* Leaves shared by primary channels and sub-channels. */
+	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
+	    "cpu", CTLFLAG_RD, &chan->ch_cpuid, 0, "owner CPU id");
+	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
+	    "mnf", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
+	    chan, 0, vmbus_chan_sysctl_mnf, "I",
+	    "has monitor notification facilities");
+
+	br_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
+	    "br", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+	if (br_tree != NULL) {
+		/*
+		 * Create sysctl tree for RX bufring.
+		 */
+		vmbus_br_sysctl_create(ctx, br_tree, &chan->ch_rxbr.rxbr, "rx");
+		/*
+		 * Create sysctl tree for TX bufring.
+		 */
+		vmbus_br_sysctl_create(ctx, br_tree, &chan->ch_txbr.txbr, "tx");
+	}
+}
+
+/*
+ * Open chan with freshly allocated bufrings.
+ *
+ * One DMA area of txbr_size + rxbr_size bytes is allocated for the
+ * TX+RX bufrings and handed to vmbus_chan_open_br().  If that fails the
+ * area is released again, unless the error is EISCONN, in which case
+ * the area is deliberately abandoned (see below).
+ *
+ * Returns 0 on success, ENOMEM if the allocation fails, or the error
+ * reported by vmbus_chan_open_br().
+ */
+int
+vmbus_chan_open(struct vmbus_channel *chan, int txbr_size, int rxbr_size,
+    const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg)
+{
+	const int br_size = txbr_size + rxbr_size;
+	struct vmbus_chan_br cbr;
+	int error;
+
+	KASSERT(chan->ch_bufring == NULL, ("bufrings are allocated"));
+	chan->ch_bufring = hyperv_dmamem_alloc(bus_get_dma_tag(chan->ch_dev),
+	    PAGE_SIZE, 0, br_size, &chan->ch_bufring_dma, BUS_DMA_WAITOK);
+	if (chan->ch_bufring == NULL) {
+		vmbus_chan_printf(chan, "bufring allocation failed\n");
+		return (ENOMEM);
+	}
+
+	cbr.cbr = chan->ch_bufring;
+	cbr.cbr_paddr = chan->ch_bufring_dma.hv_paddr;
+	cbr.cbr_txsz = txbr_size;
+	cbr.cbr_rxsz = rxbr_size;
+
+	error = vmbus_chan_open_br(chan, &cbr, udata, udlen, cb, cbarg);
+	if (error == 0)
+		return (0);
+
+	if (error != EISCONN) {
+		hyperv_dmamem_free(&chan->ch_bufring_dma, chan->ch_bufring);
+	} else {
+		/*
+		 * XXX
+		 * The bufring GPADL is still connected; abandon
+		 * this bufring, instead of having mysterious
+		 * crash or trashed data later on.
+		 */
+		vmbus_chan_printf(chan, "chan%u bufring GPADL "
+		    "is still connected upon channel open error; "
+		    "leak %d bytes memory\n", chan->ch_id, br_size);
+	}
+	chan->ch_bufring = NULL;
+	return (error);
+}
+
+int
+vmbus_chan_open_br(struct vmbus_channel *chan, const struct vmbus_chan_br *cbr,
+    const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg)
+{	/*
+	 * Open chan on top of the caller supplied bufring memory described
+	 * by cbr: zero and set up the TX/RX bufrings, connect the bufring
+	 * GPADL, install the channel into the channel map and send CHOPEN
+	 * (carrying udlen bytes of udata) to the hypervisor.
+	 *
+	 * Returns 0 once the hypervisor reports status 0.  EINVAL is
+	 * returned for an oversized udata.  On any later failure the
+	 * setup is undone at the "failed" label and an errno is returned;
+	 * EISCONN tells the caller that disconnecting the bufring GPADL
+	 * failed.
+	 */
+	struct vmbus_softc *sc = chan->ch_vmbus;
+	const struct vmbus_message *msg;
+	struct vmbus_chanmsg_chopen *req;
+	struct vmbus_msghc *mh;
+	uint32_t status;
+	int error, txbr_size, rxbr_size;
+	task_fn_t *task_fn;
+	uint8_t *br;
+
+	if (udlen > VMBUS_CHANMSG_CHOPEN_UDATA_SIZE) {
+		vmbus_chan_printf(chan,
+		    "invalid udata len %d for chan%u\n", udlen, chan->ch_id);
+		return (EINVAL);
+	}
+
+	br = cbr->cbr;
+	txbr_size = cbr->cbr_txsz;
+	rxbr_size = cbr->cbr_rxsz;
+	KASSERT((txbr_size & PAGE_MASK) == 0,
+	    ("send bufring size is not multiple page"));
+	KASSERT((rxbr_size & PAGE_MASK) == 0,
+	    ("recv bufring size is not multiple page"));
+	KASSERT((cbr->cbr_paddr & PAGE_MASK) == 0,
+	    ("bufring is not page aligned"));
+
+	/*
+	 * Zero out the TX/RX bufrings, in case that they were used before.
+	 */
+	memset(br, 0, txbr_size + rxbr_size);
+
+	if (atomic_testandset_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_OPENED_SHIFT))
+		panic("double-open chan%u", chan->ch_id);	/* bit was already set */
+
+	chan->ch_cb = cb;
+	chan->ch_cbarg = cbarg;
+
+	vmbus_chan_update_evtflagcnt(sc, chan);
+
+	chan->ch_tq = VMBUS_PCPU_GET(chan->ch_vmbus, event_tq, chan->ch_cpuid);
+	if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD)
+		task_fn = vmbus_chan_task;
+	else
+		task_fn = vmbus_chan_task_nobatch;
+	TASK_INIT(&chan->ch_task, 0, task_fn, chan);
+
+	/* TX bufring comes first */
+	vmbus_txbr_setup(&chan->ch_txbr, br, txbr_size);
+	/* RX bufring immediately follows TX bufring */
+	vmbus_rxbr_setup(&chan->ch_rxbr, br + txbr_size, rxbr_size);
+
+	/* Create sysctl tree for this channel */
+	vmbus_chan_sysctl_create(chan);
+
+	/*
+	 * Connect the bufrings, both RX and TX, to this channel.
+	 */
+	error = vmbus_chan_gpadl_connect(chan, cbr->cbr_paddr,
+	    txbr_size + rxbr_size, &chan->ch_bufring_gpadl);
+	if (error) {
+		vmbus_chan_printf(chan,
+		    "failed to connect bufring GPADL to chan%u\n", chan->ch_id);
+		goto failed;
+	}
+
+	/*
+	 * Install this channel, before it is opened, but after everything
+	 * else has been setup.
+	 */
+	vmbus_chan_set_chmap(chan);
+
+	/*
+	 * Open channel w/ the bufring GPADL on the target CPU.
+	 */
+	mh = vmbus_msghc_get(sc, sizeof(*req));
+	if (mh == NULL) {
+		vmbus_chan_printf(chan,
+		    "can not get msg hypercall for chopen(chan%u)\n",
+		    chan->ch_id);
+		error = ENXIO;
+		goto failed;
+	}
+
+	req = vmbus_msghc_dataptr(mh);
+	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHOPEN;
+	req->chm_chanid = chan->ch_id;
+	req->chm_openid = chan->ch_id;
+	req->chm_gpadl = chan->ch_bufring_gpadl;
+	req->chm_vcpuid = chan->ch_vcpuid;
+	req->chm_txbr_pgcnt = txbr_size >> PAGE_SHIFT;	/* TX size in pages */
+	if (udlen > 0)
+		memcpy(req->chm_udata, udata, udlen);
+
+	error = vmbus_msghc_exec(sc, mh);
+	if (error) {
+		vmbus_chan_printf(chan,
+		    "chopen(chan%u) msg hypercall exec failed: %d\n",
+		    chan->ch_id, error);
+		vmbus_msghc_put(sc, mh);
+		goto failed;
+	}
+
+	for (;;) {	/* poll for the response, or until the channel is revoked */
+		msg = vmbus_msghc_poll_result(sc, mh);
+		if (msg != NULL)
+			break;
+		if (vmbus_chan_is_revoked(chan)) {
+			int i;
+
+			/*
+			 * NOTE:
+			 * Hypervisor does _not_ send response CHOPEN to
+			 * a revoked channel.
+			 */
+			vmbus_chan_printf(chan,
+			    "chan%u is revoked, when it is being opened\n",
+			    chan->ch_id);
+
+			/*
+			 * XXX
+			 * Add extra delay before cancel the hypercall
+			 * execution; mainly to close any possible
+			 * CHRESCIND and CHOPEN_RESP races on the
+			 * hypervisor side.
+			 */
+#define REVOKE_LINGER	100
+			for (i = 0; i < REVOKE_LINGER; ++i) {
+				msg = vmbus_msghc_poll_result(sc, mh);
+				if (msg != NULL)
+					break;
+				pause("rchopen", 1);
+			}
+#undef REVOKE_LINGER
+			if (msg == NULL)
+				vmbus_msghc_exec_cancel(sc, mh);
+			break;
+		}
+		pause("chopen", 1);
+	}
+	if (msg != NULL) {
+		status = ((const struct vmbus_chanmsg_chopen_resp *)
+		    msg->msg_data)->chm_status;
+	} else {
+		/* XXX any non-0 value is ok here. */
+		status = 0xff;
+	}
+
+	vmbus_msghc_put(sc, mh);
+
+	if (status == 0) {
+		if (bootverbose)
+			vmbus_chan_printf(chan, "chan%u opened\n", chan->ch_id);
+		return (0);
+	}
+
+	vmbus_chan_printf(chan, "failed to open chan%u\n", chan->ch_id);
+	error = ENXIO;
+
+failed:	/* undo the setup done above; error holds the value to return */
+	sysctl_ctx_free(&chan->ch_sysctl_ctx);
+	vmbus_chan_clear_chmap(chan);
+	if (chan->ch_bufring_gpadl != 0) {
+		int error1;
+
+		error1 = vmbus_chan_gpadl_disconnect(chan,
+		    chan->ch_bufring_gpadl);
+		if (error1) {
+			/*
+			 * Give caller a hint that the bufring GPADL is still
+			 * connected.
+			 */
+			error = EISCONN;
+		}
+		chan->ch_bufring_gpadl = 0;
+	}
+	atomic_clear_int(&chan->ch_stflags, VMBUS_CHAN_ST_OPENED);	/* drop the open mark */
+	return (error);
+}
+
+int
+vmbus_chan_gpadl_connect(struct vmbus_channel *chan, bus_addr_t paddr,
+    int size, uint32_t *gpadl0)
+{	/*
+	 * Connect the size bytes of guest physical memory starting at
+	 * paddr to chan as a GPADL.  Both paddr and size are asserted to
+	 * be page aligned and *gpadl0 is asserted to be 0 on entry.
+	 *
+	 * Returns 0 on success, after storing the new GPADL id in *gpadl0.
+	 * Returns EOPNOTSUPP if the GPA range description would not fit in
+	 * UINT16_MAX bytes, EIO if no msg hypercall could be obtained or
+	 * the response carries a non-zero status, or the error returned by
+	 * vmbus_msghc_exec().  *gpadl0 is left untouched on failure.
+	 */
+	struct vmbus_softc *sc = chan->ch_vmbus;
+	struct vmbus_msghc *mh;
+	struct vmbus_chanmsg_gpadl_conn *req;
+	const struct vmbus_message *msg;
+	size_t reqsz;
+	uint32_t gpadl, status;
+	int page_count, range_len, i, cnt, error;
+	uint64_t page_id;
+
+	KASSERT(*gpadl0 == 0, ("GPADL is not zero"));
+
+	/*
+	 * Preliminary checks.
+	 */
+
+	KASSERT((size & PAGE_MASK) == 0,
+	    ("invalid GPA size %d, not multiple page size", size));
+	page_count = size >> PAGE_SHIFT;
+
+	KASSERT((paddr & PAGE_MASK) == 0,
+	    ("GPA is not page aligned %jx", (uintmax_t)paddr));
+	page_id = paddr >> PAGE_SHIFT;	/* first page frame number */
+
+	range_len = __offsetof(struct vmbus_gpa_range, gpa_page[page_count]);
+	/*
+	 * We don't support multiple GPA ranges.
+	 */
+	if (range_len > UINT16_MAX) {
+		vmbus_chan_printf(chan, "GPA too large, %d pages\n",
+		    page_count);
+		return EOPNOTSUPP;
+	}
+
+	/*
+	 * Allocate GPADL id.
+	 */
+	gpadl = vmbus_gpadl_alloc(sc);
+
+	/*
+	 * Connect this GPADL to the target channel.
+	 *
+	 * NOTE:
+	 * Since each message can only hold small set of page
+	 * addresses, several messages may be required to
+	 * complete the connection.
+	 */
+	if (page_count > VMBUS_CHANMSG_GPADL_CONN_PGMAX)
+		cnt = VMBUS_CHANMSG_GPADL_CONN_PGMAX;
+	else
+		cnt = page_count;
+	page_count -= cnt;	/* pages left for the SUBCONN messages */
+
+	reqsz = __offsetof(struct vmbus_chanmsg_gpadl_conn,
+	    chm_range.gpa_page[cnt]);
+	mh = vmbus_msghc_get(sc, reqsz);
+	if (mh == NULL) {
+		vmbus_chan_printf(chan,
+		    "can not get msg hypercall for gpadl_conn(chan%u)\n",
+		    chan->ch_id);
+		return EIO;
+	}
+
+	req = vmbus_msghc_dataptr(mh);
+	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_CONN;
+	req->chm_chanid = chan->ch_id;
+	req->chm_gpadl = gpadl;
+	req->chm_range_len = range_len;
+	req->chm_range_cnt = 1;
+	req->chm_range.gpa_len = size;
+	req->chm_range.gpa_ofs = 0;
+	for (i = 0; i < cnt; ++i)
+		req->chm_range.gpa_page[i] = page_id++;	/* consecutive page frames */
+
+	error = vmbus_msghc_exec(sc, mh);
+	if (error) {
+		vmbus_chan_printf(chan,
+		    "gpadl_conn(chan%u) msg hypercall exec failed: %d\n",
+		    chan->ch_id, error);
+		vmbus_msghc_put(sc, mh);
+		return error;
+	}
+
+	while (page_count > 0) {	/* remaining pages go out in SUBCONN messages */
+		struct vmbus_chanmsg_gpadl_subconn *subreq;
+
+		if (page_count > VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX)
+			cnt = VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX;
+		else
+			cnt = page_count;
+		page_count -= cnt;
+
+		reqsz = __offsetof(struct vmbus_chanmsg_gpadl_subconn,
+		    chm_gpa_page[cnt]);
+		vmbus_msghc_reset(mh, reqsz);	/* the same mh is reused */
+
+		subreq = vmbus_msghc_dataptr(mh);
+		subreq->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_SUBCONN;
+		subreq->chm_gpadl = gpadl;
+		for (i = 0; i < cnt; ++i)
+			subreq->chm_gpa_page[i] = page_id++;
+
+		vmbus_msghc_exec_noresult(mh);	/* no result is waited for here */
+	}
+	KASSERT(page_count == 0, ("invalid page count %d", page_count));
+
+	msg = vmbus_msghc_wait_result(sc, mh);	/* waited for once, after all messages */
+	status = ((const struct vmbus_chanmsg_gpadl_connresp *)
+	    msg->msg_data)->chm_status;
+
+	vmbus_msghc_put(sc, mh);
+
+	if (status != 0) {
+		vmbus_chan_printf(chan, "gpadl_conn(chan%u) failed: %u\n",
+		    chan->ch_id, status);
+		return EIO;
+	}
+
+	/* Done; commit the GPADL id. */
+	*gpadl0 = gpadl;
+	if (bootverbose) {
+		vmbus_chan_printf(chan, "gpadl_conn(chan%u) succeeded\n",
+		    chan->ch_id);
+	}
+	return 0;
+}
+
+/*
+ * Wait for chan to become revoked, checking up to 200 times.  Between
+ * two checks the function either sleeps through pause() (can_sleep) or
+ * busy-waits through DELAY().
+ *
+ * Returns true as soon as the channel is seen revoked, false if it is
+ * still not revoked after the last check.
+ */
+static bool
+vmbus_chan_wait_revoke(const struct vmbus_channel *chan, bool can_sleep)
+{
+	int tries_left = 200;	/* 200ms */
+
+	while (tries_left-- > 0) {
+		if (vmbus_chan_is_revoked(chan))
+			return (true);
+		if (!can_sleep)
+			DELAY(1000);
+		else
+			pause("wchrev", 1);
+	}
+	return (false);
+}
+
+/*
+ * Disconnect the GPA from the target channel
+ *
+ * Sends GPADL_DISCONN for gpadl (asserted to be non-zero) and waits for
+ * the response, whose content is discarded.
+ *
+ * Returns 0 on success, EBUSY if no msg hypercall could be obtained,
+ * or the error from vmbus_msghc_exec().  An exec failure is reported
+ * as success if the channel turns out to be revoked.
+ */
+int
+vmbus_chan_gpadl_disconnect(struct vmbus_channel *chan, uint32_t gpadl)
+{
+	struct vmbus_softc *sc = chan->ch_vmbus;
+	struct vmbus_chanmsg_gpadl_disconn *req;
+	struct vmbus_msghc *mh;
+	int error;
+
+	KASSERT(gpadl != 0, ("GPADL is zero"));
+
+	mh = vmbus_msghc_get(sc, sizeof(*req));
+	if (mh == NULL) {
+		vmbus_chan_printf(chan,
+		    "can not get msg hypercall for gpadl_disconn(chan%u)\n",
+		    chan->ch_id);
+		return (EBUSY);
+	}
+
+	req = vmbus_msghc_dataptr(mh);
+	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_DISCONN;
+	req->chm_chanid = chan->ch_id;
+	req->chm_gpadl = gpadl;
+
+	error = vmbus_msghc_exec(sc, mh);
+	if (error == 0) {
+		/* Discard result; no useful information */
+		vmbus_msghc_wait_result(sc, mh);
+		vmbus_msghc_put(sc, mh);
+		return (0);
+	}
+
+	/* The request could not be executed. */
+	vmbus_msghc_put(sc, mh);
+
+	if (!vmbus_chan_wait_revoke(chan, true)) {
+		vmbus_chan_printf(chan,
+		    "gpadl_disconn(chan%u) msg hypercall exec failed: %d\n",
+		    chan->ch_id, error);
+		return (error);
+	}
+
+	/*
+	 * Error is benign; this channel is revoked,
+	 * so this GPADL will not be touched anymore.
+	 */
+	vmbus_chan_printf(chan,
+	    "gpadl_disconn(revoked chan%u) msg hypercall "
+	    "exec failed: %d\n", chan->ch_id, error);
+	return (0);
+}
+
+/*
+ * Drop one reference on the channel.  When the reference count seen
+ * before the decrement was 1, queue the channel's detach task on its
+ * management taskqueue.
+ */
+static void
+vmbus_chan_detach(struct vmbus_channel *chan)
+{
+	int prev_refs;
+
+	KASSERT(chan->ch_refs > 0, ("chan%u: invalid refcnt %d",
+	    chan->ch_id, chan->ch_refs));
+	prev_refs = atomic_fetchadd_int(&chan->ch_refs, -1);
+#ifdef INVARIANTS
+	if (VMBUS_CHAN_ISPRIMARY(chan)) {
+		KASSERT(prev_refs == 1,
+		    ("chan%u: invalid refcnt %d for prichan",
+		    chan->ch_id, prev_refs + 1));
+	}
+#endif
+	if (prev_refs != 1) {
+		/* Somebody else still holds a reference. */
+		return;
+	}
+
+	/*
+	 * Detach the target channel.
+	 */
+	if (bootverbose)
+		vmbus_chan_printf(chan, "chan%u detached\n", chan->ch_id);
+	taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_detach_task);
+}
+
+/*
+ * Task handler: remove the channel from its vmbus softc's channel map
+ * by clearing the slot indexed by the channel id.
+ */
+static void
+vmbus_chan_clrchmap_task(void *xchan, int pending __unused)
+{
+	struct vmbus_channel *chan = xchan;
+	struct vmbus_softc *sc = chan->ch_vmbus;
+
+	sc->vmbus_chmap[chan->ch_id] = NULL;
+}
+
+/*
+ * Clear the channel's slot in the channel map by running
+ * vmbus_chan_clrchmap_task() through vmbus_chan_run_task().
+ *
+ * NOTE(review): the task lives on this stack frame, so
+ * vmbus_chan_run_task() presumably does not return before the task
+ * has run -- confirm against its definition.
+ */
+static void
+vmbus_chan_clear_chmap(struct vmbus_channel *chan)
+{
+	struct task clr_task;
+
+	TASK_INIT(&clr_task, 0, vmbus_chan_clrchmap_task, chan);
+	vmbus_chan_run_task(chan, &clr_task);
+}
+
+/*
+ * Install the channel into its vmbus softc's channel map, indexed by
+ * the channel id.
+ */
+static void
+vmbus_chan_set_chmap(struct vmbus_channel *chan)
+{
+	/*
+	 * Barrier before publishing the pointer; presumably so that the
+	 * compiler does not move earlier channel setup past the map store.
+	 */
+	__compiler_membar();
+	chan->ch_vmbus->vmbus_chmap[chan->ch_id] = chan;
+}
+
+/*
+ * Task handler: cancel polling on the channel passed as 'xchan'.
+ * The "was polling enabled" result is not needed here.
+ */
+static void
+vmbus_chan_poll_cancel_task(void *xchan, int pending __unused)
+{
+	struct vmbus_channel *chan = xchan;
+
+	(void)vmbus_chan_poll_cancel_intq(chan);
+}
+
+/*
+ * Cancel polling on the channel by running
+ * vmbus_chan_poll_cancel_task() through vmbus_chan_run_task().
+ */
+static void
+vmbus_chan_poll_cancel(struct vmbus_channel *chan)
+{
+	struct task cancel_task;
+
+	TASK_INIT(&cancel_task, 0, vmbus_chan_poll_cancel_task, chan);
+	vmbus_chan_run_task(chan, &cancel_task);
+}
+
+/*
+ * Close one channel (primary or sub-channel) and tear down its bufring.
+ *
+ * The OPENED state flag is cleared first; if it was not set, nothing
+ * else is done and 0 is returned.  Otherwise the channel's sysctl
+ * context is freed, polling is cancelled, the channel is removed from
+ * the channel map, its task is drained, a CHCLOSE message is sent, the
+ * bufring GPADL is disconnected and the bufring memory is freed.
+ *
+ * Returns 0 on success; ENXIO if no message hypercall could be
+ * obtained; the vmbus_msghc_exec_noresult() error if CHCLOSE could not
+ * be sent; or EISCONN if the bufring GPADL could not be disconnected,
+ * in which case the bufring memory is deliberately not freed.
+ */
+static int
+vmbus_chan_close_internal(struct vmbus_channel *chan)
+{
+	struct vmbus_softc *sc = chan->ch_vmbus;
+	struct vmbus_msghc *mh;
+	struct vmbus_chanmsg_chclose *req;
+	uint32_t old_stflags;
+	int error;
+
+	/*
+	 * NOTE:
+	 * Sub-channels are closed upon their primary channel closing,
+	 * so they can be closed even before they are opened.
+	 */
+	/* Atomically clear OPENED, remembering the previous flags. */
+	for (;;) {
+		old_stflags = chan->ch_stflags;
+		if (atomic_cmpset_int(&chan->ch_stflags, old_stflags,
+		    old_stflags & ~VMBUS_CHAN_ST_OPENED))
+			break;
+	}
+	if ((old_stflags & VMBUS_CHAN_ST_OPENED) == 0) {
+		/* Not opened yet; done */
+		if (bootverbose) {
+			vmbus_chan_printf(chan, "chan%u not opened\n",
+			    chan->ch_id);
+		}
+		return (0);
+	}
+
+	/*
+	 * Free this channel's sysctl tree attached to its device's
+	 * sysctl tree.
+	 */
+	sysctl_ctx_free(&chan->ch_sysctl_ctx);
+
+	/*
+	 * Cancel polling, if it is enabled.
+	 */
+	vmbus_chan_poll_cancel(chan);
+
+	/*
+	 * NOTE:
+	 * Order is critical.  This channel _must_ be uninstalled first,
+	 * else the channel task may be enqueued by the IDT after it has
+	 * been drained.
+	 */
+	vmbus_chan_clear_chmap(chan);
+	taskqueue_drain(chan->ch_tq, &chan->ch_task);
+	chan->ch_tq = NULL;
+
+	/*
+	 * Close this channel.
+	 */
+	mh = vmbus_msghc_get(sc, sizeof(*req));
+	if (mh == NULL) {
+		vmbus_chan_printf(chan,
+		    "can not get msg hypercall for chclose(chan%u)\n",
+		    chan->ch_id);
+		error = ENXIO;
+		goto disconnect;
+	}
+
+	req = vmbus_msghc_dataptr(mh);
+	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHCLOSE;
+	req->chm_chanid = chan->ch_id;
+
+	error = vmbus_msghc_exec_noresult(mh);
+	vmbus_msghc_put(sc, mh);
+
+	if (error) {
+		vmbus_chan_printf(chan,
+		    "chclose(chan%u) msg hypercall exec failed: %d\n",
+		    chan->ch_id, error);
+		goto disconnect;
+	}
+
+	if (bootverbose)
+		vmbus_chan_printf(chan, "chan%u closed\n", chan->ch_id);
+
+	/* The bufring teardown below runs on both success and failure. */
+disconnect:
+	/*
+	 * Disconnect the TX+RX bufrings from this channel.
+	 */
+	if (chan->ch_bufring_gpadl != 0) {
+		int error1;
+
+		error1 = vmbus_chan_gpadl_disconnect(chan,
+		    chan->ch_bufring_gpadl);
+		if (error1) {
+			/*
+			 * XXX
+			 * The bufring GPADL is still connected; abandon
+			 * this bufring, instead of having mysterious
+			 * crash or trashed data later on.
+			 */
+			vmbus_chan_printf(chan, "chan%u bufring GPADL "
+			    "is still connected after close\n", chan->ch_id);
+			chan->ch_bufring = NULL;
+			/*
+			 * Give caller a hint that the bufring GPADL is
+			 * still connected.
+			 */
+			error = EISCONN;
+		}
+		chan->ch_bufring_gpadl = 0;
+	}
+
+	/*
+	 * Destroy the TX+RX bufrings.
+	 */
+	if (chan->ch_bufring != NULL) {
+		hyperv_dmamem_free(&chan->ch_bufring_dma, chan->ch_bufring);
+		chan->ch_bufring = NULL;
+	}
+	return (error);
+}
+
+/*
+ * Close exactly one channel.  For a sub-channel, also drop the
+ * reference it took when it was linked to its primary channel.
+ *
+ * Returns whatever vmbus_chan_close_internal() returned.
+ */
+int
+vmbus_chan_close_direct(struct vmbus_channel *chan)
+{
+	int error;
+
+#ifdef INVARIANTS
+	if (VMBUS_CHAN_ISPRIMARY(chan)) {
+		struct vmbus_channel *sub;
+
+		/*
+		 * A primary channel may only be closed this way once
+		 * none of its sub-channels is (still) opened.
+		 */
+		mtx_lock(&chan->ch_subchan_lock);
+		TAILQ_FOREACH(sub, &chan->ch_subchans, ch_sublink) {
+			KASSERT(
+			   (sub->ch_stflags & VMBUS_CHAN_ST_OPENED) == 0,
+			   ("chan%u: subchan%u is still opened",
+			    chan->ch_id, sub->ch_subidx));
+		}
+		mtx_unlock(&chan->ch_subchan_lock);
+	}
+#endif
+
+	error = vmbus_chan_close_internal(chan);
+	if (VMBUS_CHAN_ISPRIMARY(chan))
+		return (error);
+
+	/*
+	 * Sub-channel: release the reference taken when it was linked
+	 * to the primary channel.
+	 */
+	vmbus_chan_detach(chan);
+	return (error);
+}
+
+/*
+ * Close a primary channel together with all of its sub-channels.
+ * Calling this on a sub-channel does nothing.
+ *
+ * The caller must make sure that all sub-channels have been added to
+ * 'chan' and that none of the to-be-closed channels is being opened.
+ */
+void
+vmbus_chan_close(struct vmbus_channel *chan)
+{
+	int nsub;
+
+	if (!VMBUS_CHAN_ISPRIMARY(chan)) {
+		/* Sub-channels go away with their primary channel. */
+		return;
+	}
+
+	/*
+	 * Sub-channels first, if there are any.
+	 */
+	nsub = chan->ch_subchan_cnt;
+	if (nsub > 0) {
+		struct vmbus_channel **subchans;
+		int idx;
+
+		subchans = vmbus_subchan_get(chan, nsub);
+		for (idx = 0; idx < nsub; ++idx) {
+			struct vmbus_channel *sub = subchans[idx];
+
+			vmbus_chan_close_internal(sub);
+			/*
+			 * Drop the reference the sub-channel took when
+			 * it was linked to the primary channel.
+			 */
+			vmbus_chan_detach(sub);
+		}
+		vmbus_subchan_rel(subchans, nsub);
+	}
+
+	/* The primary channel goes last. */
+	vmbus_chan_close_internal(chan);
+}
+
+/*
+ * Drain the channel's task (ch_task) from the channel's taskqueue.
+ */
+void
+vmbus_chan_intr_drain(struct vmbus_channel *chan)
+{
+	struct task *chtask = &chan->ch_task;
+
+	taskqueue_drain(chan->ch_tq, chtask);
+}
+
+/*
+ * Return what vmbus_txbr_available() reports for the channel's TX
+ * bufring.
+ */
+uint32_t
+vmbus_chan_write_available(struct vmbus_channel *chan)
+{
+	uint32_t avail;
+
+	avail = vmbus_txbr_available(&chan->ch_txbr);
+	return (avail);
+}
+
+/*
+ * Signal the TX side of the channel unless suppressed.
+ *
+ * No signal is sent when 'min_signal_size' is non-negative and the TX
+ * bufring has more than that much space available, or when the TX
+ * bufring's interrupt mask is set.
+ *
+ * Returns true if vmbus_chan_signal_tx() was called, false otherwise.
+ */
+bool
+vmbus_chan_write_signal(struct vmbus_channel *chan,
+    int32_t min_signal_size)
+{
+	if (min_signal_size >= 0) {
+		if (vmbus_chan_write_available(chan) > min_signal_size)
+			return (false);
+	}
+
+	if (vmbus_txbr_get_imask(&chan->ch_txbr)) {
+		/* txbr imask is set; do not signal. */
+		return (false);
+	}
+
+	/* txbr imask is not set, signal the reader */
+	vmbus_chan_signal_tx(chan);
+	return (true);
+}
+
+/*
+ * Record 'size' as the pending send size of the channel's TX bufring.
+ * A NULL channel is silently ignored.
+ */
+void
+vmbus_chan_set_pending_send_size(struct vmbus_channel *chan,
+    uint32_t size)
+{
+	if (chan == NULL)
+		return;
+
+	vmbus_txbr_set_pending_snd_sz(&chan->ch_txbr, size);
+}
+
+/*
+ * Write 'iovlen' iovec entries to the channel's TX bufring through
+ * vmbus_txbr_write_call(), passing 'cb'/'cbarg' along, and signal the
+ * host if the write succeeded and asked for an event.
+ *
+ * Returns 0 immediately when 'iovlen' is 0; otherwise the result of
+ * vmbus_txbr_write_call().
+ */
+int
+vmbus_chan_iov_send(struct vmbus_channel *chan,
+    const struct iovec iov[], int iovlen,
+    vmbus_br_copy_callback_t cb, void *cbarg)
+{
+	boolean_t need_signal;
+	int error;
+
+	if (iovlen == 0)
+		return (0);
+
+	error = vmbus_txbr_write_call(&chan->ch_txbr, iov, iovlen,
+	    cb, cbarg, &need_signal);
+	if (error == 0 && need_signal)
+		vmbus_chan_signal_tx(chan);
+
+	return (error);
+}
+
+/*
+ * Send an inband packet: a channel packet header, followed by 'dlen'
+ * bytes of 'data', followed by zero padding up to the length computed
+ * by VMBUS_CHANPKT_TOTLEN().
+ *
+ * The host is signalled if the write succeeded and asked for an event.
+ * Returns the result of vmbus_txbr_write().
+ */
+int
+vmbus_chan_send(struct vmbus_channel *chan, uint16_t type, uint16_t flags,
+    void *data, int dlen, uint64_t xactid)
+{
+	struct vmbus_chanpkt pkt;
+	struct iovec iov[3];
+	uint64_t zeropad = 0;
+	boolean_t need_signal;
+	int hdr_len, raw_len, total_len, error;
+
+	hdr_len = sizeof(pkt);
+	raw_len = hdr_len + dlen;
+	total_len = VMBUS_CHANPKT_TOTLEN(raw_len);
+	KASSERT(total_len <= vmbus_txbr_maxpktsz(&chan->ch_txbr),
+	    ("invalid packet size %d", total_len));
+
+	/* Packet header. */
+	pkt.cp_hdr.cph_type = type;
+	pkt.cp_hdr.cph_flags = flags;
+	pkt.cp_hdr.cph_xactid = xactid;
+	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hdr_len);
+	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, total_len);
+
+	/* Header, payload, padding. */
+	iov[0].iov_base = &pkt;
+	iov[0].iov_len = hdr_len;
+
+	iov[1].iov_base = data;
+	iov[1].iov_len = dlen;
+
+	iov[2].iov_base = &zeropad;
+	iov[2].iov_len = total_len - raw_len;
+
+	error = vmbus_txbr_write(&chan->ch_txbr, iov, 3, &need_signal);
+	if (error == 0 && need_signal)
+		vmbus_chan_signal_tx(chan);
+	return (error);
+}
+
+/*
+ * Send a GPA packet carrying a scatter/gather list: the fixed packet
+ * header, 'sglen' vmbus_gpa entries from 'sg', 'dlen' bytes of 'data',
+ * and zero padding up to the length computed by VMBUS_CHANPKT_TOTLEN().
+ *
+ * The host is signalled if the write succeeded and asked for an event.
+ * Returns the result of vmbus_txbr_write().
+ */
+int
+vmbus_chan_send_sglist(struct vmbus_channel *chan,
+    struct vmbus_gpa sg[], int sglen, void *data, int dlen, uint64_t xactid)
+{
+	struct vmbus_chanpkt_sglist pkt;
+	struct iovec iov[4];
+	uint64_t zeropad = 0;
+	boolean_t need_signal;
+	int hdr_len, raw_len, total_len, error;
+
+	/* The header length covers the GPA array as well. */
+	hdr_len = __offsetof(struct vmbus_chanpkt_sglist, cp_gpa[sglen]);
+	raw_len = hdr_len + dlen;
+	total_len = VMBUS_CHANPKT_TOTLEN(raw_len);
+	KASSERT(total_len <= vmbus_txbr_maxpktsz(&chan->ch_txbr),
+	    ("invalid packet size %d", total_len));
+
+	pkt.cp_hdr.cph_type = VMBUS_CHANPKT_TYPE_GPA;
+	pkt.cp_hdr.cph_flags = VMBUS_CHANPKT_FLAG_RC;
+	pkt.cp_hdr.cph_xactid = xactid;
+	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hdr_len);
+	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, total_len);
+	pkt.cp_gpa_cnt = sglen;
+	pkt.cp_rsvd = 0;
+
+	/* Fixed header, GPA list, payload, padding. */
+	iov[0].iov_base = &pkt;
+	iov[0].iov_len = sizeof(pkt);
+
+	iov[1].iov_base = sg;
+	iov[1].iov_len = sizeof(struct vmbus_gpa) * sglen;
+
+	iov[2].iov_base = data;
+	iov[2].iov_len = dlen;
+
+	iov[3].iov_base = &zeropad;
+	iov[3].iov_len = total_len - raw_len;
+
+	error = vmbus_txbr_write(&chan->ch_txbr, iov, 4, &need_signal);
+	if (error == 0 && need_signal)
+		vmbus_chan_signal_tx(chan);
+	return (error);
+}
+
+/*
+ * Send a GPA packet carrying a single GPA range: the fixed packet
+ * header, the range 'prp' with 'prp_cnt' page entries, 'dlen' bytes of
+ * 'data', and zero padding up to the length computed by
+ * VMBUS_CHANPKT_TOTLEN().
+ *
+ * The host is signalled if the write succeeded and asked for an event.
+ * Returns the result of vmbus_txbr_write().
+ */
+int
+vmbus_chan_send_prplist(struct vmbus_channel *chan,
+    struct vmbus_gpa_range *prp, int prp_cnt, void *data, int dlen,
+    uint64_t xactid)
+{
+	struct vmbus_chanpkt_prplist pkt;
+	struct iovec iov[4];
+	uint64_t zeropad = 0;
+	boolean_t need_signal;
+	int hdr_len, raw_len, total_len, error;
+
+	/* The header length covers the range and its page array. */
+	hdr_len = __offsetof(struct vmbus_chanpkt_prplist,
+	    cp_range[0].gpa_page[prp_cnt]);
+	raw_len = hdr_len + dlen;
+	total_len = VMBUS_CHANPKT_TOTLEN(raw_len);
+	KASSERT(total_len <= vmbus_txbr_maxpktsz(&chan->ch_txbr),
+	    ("invalid packet size %d", total_len));
+
+	pkt.cp_hdr.cph_type = VMBUS_CHANPKT_TYPE_GPA;
+	pkt.cp_hdr.cph_flags = VMBUS_CHANPKT_FLAG_RC;
+	pkt.cp_hdr.cph_xactid = xactid;
+	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hdr_len);
+	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, total_len);
+	pkt.cp_range_cnt = 1;
+	pkt.cp_rsvd = 0;
+
+	/* Fixed header, GPA range, payload, padding. */
+	iov[0].iov_base = &pkt;
+	iov[0].iov_len = sizeof(pkt);
+
+	iov[1].iov_base = prp;
+	iov[1].iov_len = __offsetof(struct vmbus_gpa_range, gpa_page[prp_cnt]);
+
+	iov[2].iov_base = data;
+	iov[2].iov_len = dlen;
+
+	iov[3].iov_base = &zeropad;
+	iov[3].iov_len = total_len - raw_len;
+
+	error = vmbus_txbr_write(&chan->ch_txbr, iov, 4, &need_signal);
+	if (error == 0 && need_signal)
+		vmbus_chan_signal_tx(chan);
+	return (error);
+}
+
+/*
+ * Receive the data portion of the next packet in the RX bufring.
+ *
+ * On entry '*dlen0' is the capacity of 'data'.  On success the data
+ * (packet minus header) is copied to 'data', '*dlen0' is set to the
+ * data length and '*xactid' to the packet's transaction id.
+ *
+ * Returns 0 on success; the vmbus_rxbr_peek() error if the header
+ * could not be peeked; EIO for an inconsistent header; ENOBUFS if
+ * 'data' is too small, with '*dlen0' updated to the required size.
+ */
+int
+vmbus_chan_recv(struct vmbus_channel *chan, void *data, int *dlen0,
+    uint64_t *xactid)
+{
+	struct vmbus_chanpkt_hdr hdr;
+	int error, hdr_len, data_len;
+
+	error = vmbus_rxbr_peek(&chan->ch_rxbr, &hdr, sizeof(hdr));
+	if (error != 0)
+		return (error);
+
+	/* Sanity check the lengths advertised by the header. */
+	if (__predict_false(hdr.cph_hlen < VMBUS_CHANPKT_HLEN_MIN)) {
+		vmbus_chan_printf(chan, "invalid hlen %u\n", hdr.cph_hlen);
+		/* XXX this channel is dead actually. */
+		return (EIO);
+	}
+	if (__predict_false(hdr.cph_hlen > hdr.cph_tlen)) {
+		vmbus_chan_printf(chan, "invalid hlen %u and tlen %u\n",
+		    hdr.cph_hlen, hdr.cph_tlen);
+		/* XXX this channel is dead actually. */
+		return (EIO);
+	}
+
+	hdr_len = VMBUS_CHANPKT_GETLEN(hdr.cph_hlen);
+	data_len = VMBUS_CHANPKT_GETLEN(hdr.cph_tlen) - hdr_len;
+
+	if (data_len > *dlen0) {
+		/* Tell the caller how much room this packet needs. */
+		*dlen0 = data_len;
+		return (ENOBUFS);
+	}
+
+	*dlen0 = data_len;
+	*xactid = hdr.cph_xactid;
+
+	/* Read the data, skipping over the packet header. */
+	error = vmbus_rxbr_read(&chan->ch_rxbr, data, data_len, hdr_len);
+	KASSERT(!error, ("vmbus_rxbr_read failed"));
+
+	return (0);
+}
+
+/*
+ * Receive the next whole packet (header included) from the RX bufring.
+ *
+ * On entry '*pktlen0' is the capacity of 'pkt'.  On success 'pkt' holds
+ * the complete packet and '*pktlen0' its total length.
+ *
+ * Returns 0 on success; the vmbus_rxbr_peek() error if the header
+ * could not be peeked; EIO for an inconsistent header; ENOBUFS if
+ * 'pkt' is too small, with '*pktlen0' updated to the required size.
+ * In the EIO and ENOBUFS cases the fixed-size header has already been
+ * written to 'pkt'.
+ */
+int
+vmbus_chan_recv_pkt(struct vmbus_channel *chan,
+    struct vmbus_chanpkt_hdr *pkt, int *pktlen0)
+{
+	int error, total_len, fixed_hlen;
+
+	fixed_hlen = sizeof(*pkt);
+	error = vmbus_rxbr_peek(&chan->ch_rxbr, pkt, fixed_hlen);
+	if (error != 0)
+		return (error);
+
+	/* Sanity check the lengths advertised by the header. */
+	if (__predict_false(pkt->cph_hlen < VMBUS_CHANPKT_HLEN_MIN)) {
+		vmbus_chan_printf(chan, "invalid hlen %u\n", pkt->cph_hlen);
+		/* XXX this channel is dead actually. */
+		return (EIO);
+	}
+	if (__predict_false(pkt->cph_hlen > pkt->cph_tlen)) {
+		vmbus_chan_printf(chan, "invalid hlen %u and tlen %u\n",
+		    pkt->cph_hlen, pkt->cph_tlen);
+		/* XXX this channel is dead actually. */
+		return (EIO);
+	}
+
+	total_len = VMBUS_CHANPKT_GETLEN(pkt->cph_tlen);
+	if (total_len > *pktlen0) {
+		/* Tell the caller how much room this packet needs. */
+		*pktlen0 = total_len;
+		return (ENOBUFS);
+	}
+	*pktlen0 = total_len;
+
+	/*
+	 * The fixed-size header is already in place thanks to the
+	 * vmbus_rxbr_peek() above; read only what follows it.
+	 */
+	error = vmbus_rxbr_read(&chan->ch_rxbr, pkt + 1,
+	    total_len - fixed_hlen, fixed_hlen);
+	KASSERT(!error, ("vmbus_rxbr_read failed"));
+
+	return (0);
+}
+
+/*
+ * Return what vmbus_rxbr_available() reports for the channel's RX
+ * bufring.
+ */
+uint32_t
+vmbus_chan_read_available(struct vmbus_channel *chan)
+{
+	uint32_t avail;
+
+	avail = vmbus_rxbr_available(&chan->ch_rxbr);
+	return (avail);
+}
+
+/*
+ * This routine does:
+ *     - Advance the channel read index by 'advance' bytes
+ *     - Copy 'data_len' bytes into the buffer pointed to by 'data'
+ * Returns 0 if the operation succeeds and EINVAL if 'data' is NULL or
+ * 'data_len' is not positive.  Otherwise the error from
+ * vmbus_rxbr_idxadv_peek() is returned (EAGAIN, per the original
+ * author's note); in that case the buffer pointed to by 'data' is
+ * left intact and the channel read index is not advanced at all.
+ */
+int
+vmbus_chan_recv_peek(struct vmbus_channel *chan,
+    void *data, int data_len, uint32_t advance)
+{
+	boolean_t need_signal;
+	int error;
+
+	if (data == NULL || data_len <= 0)
+		return (EINVAL);
+
+	error = vmbus_rxbr_idxadv_peek(&chan->ch_rxbr,
+	    data, data_len, advance, &need_signal);
+	if (error == 0 && need_signal)
+		vmbus_chan_signal_rx(chan);
+
+	return (error);
+}
+
+/*
+ * Advance the channel read index by 'advance' bytes, signalling the
+ * RX side if vmbus_rxbr_idxadv() succeeded and asked for it.
+ *
+ * Returns EINVAL if 'advance' is 0; otherwise the result of
+ * vmbus_rxbr_idxadv().
+ */
+int
+vmbus_chan_recv_idxadv(struct vmbus_channel *chan, uint32_t advance)
+{
+	boolean_t need_signal;
+	int error;
+
+	if (advance == 0)
+		return (EINVAL);
+
+	error = vmbus_rxbr_idxadv(&chan->ch_rxbr, advance, &need_signal);
+	if (error == 0 && need_signal)
+		vmbus_chan_signal_rx(chan);
+
+	return (error);
+}
+
+
+/*
+ * Hand 'data_len' bytes of the RX bufring, starting 'skip' bytes in,
+ * to the callback 'cb' via vmbus_rxbr_peek_call().
+ *
+ * The caller should hold its own lock to serialize the ring buffer
+ * copy.
+ *
+ * Returns EINVAL if 'chan' or 'cb' is NULL or 'data_len' is not
+ * positive; otherwise the result of vmbus_rxbr_peek_call().
+ */
+int
+vmbus_chan_recv_peek_call(struct vmbus_channel *chan, int data_len,
+    uint32_t skip, vmbus_br_copy_callback_t cb, void *cbarg)
+{
+	int error;
+
+	if (chan == NULL || cb == NULL || data_len <= 0)
+		return (EINVAL);
+
+	error = vmbus_rxbr_peek_call(&chan->ch_rxbr, data_len, skip,
+	    cb, cbarg);
+	return (error);
+}
+
+/*
+ * Channel task for batch-reading channels: run the channel callback
+ * until the RX bufring is reported empty after unmasking interrupts.
+ */
+static void
+vmbus_chan_task(void *xchan, int pending __unused)
+{
+	struct vmbus_channel *chan = xchan;
+	vmbus_chan_callback_t cb = chan->ch_cb;
+	void *cbarg = chan->ch_cbarg;
+	uint32_t left;
+
+	KASSERT(chan->ch_poll_intvl == 0,
+	    ("chan%u: interrupted in polling mode", chan->ch_id));
+
+	/*
+	 * Host to guest signaling is kept cheap as follows:
+	 * 1. Interrupts from the host stay disabled while the channel
+	 *    is being read.
+	 * 2. Everything the host has posted is processed before this
+	 *    task returns.
+	 * 3. Signaling is re-enabled afterwards; if more packets showed
+	 *    up in the meantime, it is disabled again and the whole
+	 *    process repeats.
+	 *
+	 * NOTE: Interrupt has been disabled in the ISR.
+	 */
+	do {
+		cb(chan, cbarg);
+
+		left = vmbus_rxbr_intr_unmask(&chan->ch_rxbr);
+		if (left != 0) {
+			/* More data arrived; mask again and go around. */
+			vmbus_rxbr_intr_mask(&chan->ch_rxbr);
+		}
+	} while (left != 0);
+}
+
+/*
+ * Channel task for non-batch channels: invoke the channel callback
+ * exactly once.
+ */
+static void
+vmbus_chan_task_nobatch(void *xchan, int pending __unused)
+{
+	struct vmbus_channel *chan = xchan;
+	vmbus_chan_callback_t cb;
+
+	KASSERT(chan->ch_poll_intvl == 0,
+	    ("chan%u: interrupted in polling mode", chan->ch_id));
+
+	cb = chan->ch_cb;
+	cb(chan, chan->ch_cbarg);
+}
+
+/*
+ * Polling callout handler: queue the channel's polling task on the
+ * channel's taskqueue.
+ */
+static void
+vmbus_chan_poll_timeout(void *xchan)
+{
+	struct vmbus_channel *chan = xchan;
+	struct task *poll_task = &chan->ch_poll_task;
+
+	KASSERT(chan->ch_poll_intvl != 0,
+	    ("chan%u: polling timeout in interrupt mode", chan->ch_id));
+	taskqueue_enqueue(chan->ch_tq, poll_task);
+}
+
+/*
+ * Polling task: re-arm the polling callout for the next interval,
+ * then run the channel callback once.
+ */
+static void
+vmbus_chan_poll_task(void *xchan, int pending __unused)
+{
+	struct vmbus_channel *chan = xchan;
+
+	KASSERT(chan->ch_poll_intvl != 0,
+	    ("chan%u: polling in interrupt mode", chan->ch_id));
+	/* The callout is scheduled before the callback is invoked. */
+	callout_reset_sbt_curcpu(&chan->ch_poll_timeo, chan->ch_poll_intvl, 0,
+	    vmbus_chan_poll_timeout, chan, chan->ch_poll_flags);
+	chan->ch_cb(chan, chan->ch_cbarg);
+}
+
+/*
+ * Task handler that switches a channel to polling mode.
+ *
+ * 'xarg' points to a vmbus_chan_pollarg naming the channel and the
+ * polling frequency.  If the derived interval equals the one already
+ * configured, nothing is done.  Otherwise the channel is taken out of
+ * the channel map, its RX bufring interrupt is masked, any pending
+ * channel task is cancelled and the polling task is queued.
+ *
+ * NOTE(review): arg->poll_hz is used as a divisor without a check here;
+ * presumably the caller guarantees it is non-zero -- confirm.
+ */
+static void
+vmbus_chan_pollcfg_task(void *xarg, int pending __unused)
+{
+	const struct vmbus_chan_pollarg *arg = xarg;
+	struct vmbus_channel *chan = arg->poll_chan;
+	sbintime_t intvl;
+	int poll_flags;
+
+	/*
+	 * Save polling interval.
+	 */
+	intvl = SBT_1S / arg->poll_hz;
+	/* Never store a zero interval: zero means "polling disabled". */
+	if (intvl == 0)
+		intvl = 1;
+	if (intvl == chan->ch_poll_intvl) {
+		/* Nothing changes; done */
+		return;
+	}
+	chan->ch_poll_intvl = intvl;
+
+	/* Adjust callout flags. */
+	poll_flags = C_DIRECT_EXEC;
+	if (arg->poll_hz <= hz)
+		poll_flags |= C_HARDCLOCK;
+	chan->ch_poll_flags = poll_flags;
+
+	/*
+	 * Disconnect this channel from the channel map to make sure that
+	 * the RX bufring interrupt enabling bit can not be touched, and
+	 * ISR can not enqueue this channel task anymore.  THEN, disable
+	 * interrupt from the RX bufring (TX bufring does not generate
+	 * interrupt to VM).
+	 *
+	 * NOTE: order is critical.
+	 */
+	chan->ch_vmbus->vmbus_chmap[chan->ch_id] = NULL;
+	__compiler_membar();
+	vmbus_rxbr_intr_mask(&chan->ch_rxbr);
+
+	/*
+	 * NOTE:
+	 * At this point, this channel task will not be enqueued by
+	 * the ISR anymore, time to cancel the pending one.
+	 */
+	taskqueue_cancel(chan->ch_tq, &chan->ch_task, NULL);
+
+	/* Kick start! */
+	taskqueue_enqueue(chan->ch_tq, &chan->ch_poll_task);
+}
+
+/*
+ * Stop polling on the channel: drain the polling callout, reset the
+ * polling interval to 0 and cancel any pending polling task.
+ *
+ * Returns true if polling was enabled (non-zero interval) on entry,
+ * false if there was nothing to cancel.
+ */
+static bool
+vmbus_chan_poll_cancel_intq(struct vmbus_channel *chan)
+{
+
+	if (chan->ch_poll_intvl == 0) {
+		/* Not enabled. */
+		return (false);
+	}
+
+	/*
+	 * Stop polling callout, so that channel polling task
+	 * will not be enqueued anymore.
+	 */
+	callout_drain(&chan->ch_poll_timeo);
+
+	/*
+	 * Disable polling by resetting polling interval.
+	 *
+	 * NOTE:
+	 * The polling interval resetting MUST be conducted
+	 * after the callout is drained; mainly to keep the
+	 * proper assertion in place.
+	 */
+	chan->ch_poll_intvl = 0;
+
+	/*
+	 * NOTE:
+	 * At this point, this channel polling task will not be
+	 * enqueued by the callout anymore, time to cancel the
+	 * pending one.
+	 */
+	taskqueue_cancel(chan->ch_tq, &chan->ch_poll_task, NULL);
+
+	/* Polling was enabled. */
+	return (true);
+}
+
+/*
+ * Task handler that switches a channel from polling mode back to
+ * interrupt mode.  Does nothing if polling was not enabled.
+ */
+static void
+vmbus_chan_polldis_task(void *xchan, int pending __unused)
+{
+	struct vmbus_channel *chan = xchan;
+
+	if (!vmbus_chan_poll_cancel_intq(chan)) {
+		/* Already disabled; done. */
+		return;
+	}
+
+	/*
+	 * Plug this channel back to the channel map and unmask
+	 * the RX bufring interrupt.
+	 */
+	chan->ch_vmbus->vmbus_chmap[chan->ch_id] = chan;
+	__compiler_membar();
+	vmbus_rxbr_intr_unmask(&chan->ch_rxbr);
+
+	/*
+	 * Kick start the interrupt task, just in case unmasking
+	 * interrupt races ISR.
+	 */
+	taskqueue_enqueue(chan->ch_tq, &chan->ch_task);
+}
+
+/*
+ * Scan 'flag_cnt' words of event flags and, for every set bit, queue
+ * the task of the channel found in the channel map at the matching
+ * channel id.
+ *
+ * Each non-zero word is atomically swapped with 0 before its bits are
+ * processed.  Bits whose channel map slot is NULL are skipped.
+ * For channels with VMBUS_CHAN_FLAG_BATCHREAD set, the RX bufring
+ * interrupt is masked before the task is queued.
+ */
+static __inline void
+vmbus_event_flags_proc(struct vmbus_softc *sc, volatile u_long *event_flags,
+    int flag_cnt)
+{
+	int f;
+
+	for (f = 0; f < flag_cnt; ++f) {
+		uint32_t chid_base;
+		u_long flags;
+		int chid_ofs;
+
+		/* Cheap check first; skip the atomic swap for empty words. */
+		if (event_flags[f] == 0)
+			continue;
+
+		flags = atomic_swap_long(&event_flags[f], 0);
+		/* Channel id of bit 0 in this word. */
+		chid_base = f << VMBUS_EVTFLAG_SHIFT;
+
+		while ((chid_ofs = ffsl(flags)) != 0) {
+			struct vmbus_channel *chan;
+
+			--chid_ofs; /* NOTE: ffsl is 1-based */
+			flags &= ~(1UL << chid_ofs);
+
+			chan = sc->vmbus_chmap[chid_base + chid_ofs];
+			if (__predict_false(chan == NULL)) {
+				/* Channel is closed. */
+				continue;
+			}
+			__compiler_membar();
+
+			if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD)
+				vmbus_rxbr_intr_mask(&chan->ch_rxbr);
+			taskqueue_enqueue(chan->ch_tq, &chan->ch_task);
+		}
+	}
+}
+
+/*
+ * Process the pending channel events of 'cpu' straight from that
+ * CPU's event flags page.
+ */
+void
+vmbus_event_proc(struct vmbus_softc *sc, int cpu)
+{
+	struct vmbus_evtflags *eventf;
+	int flag_cnt;
+
+	/*
+	 * On Host with Win8 or above, the event page can be checked directly
+	 * to get the id of the channel that has the pending interrupt.
+	 */
+	eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE;
+	flag_cnt = VMBUS_PCPU_GET(sc, event_flags_cnt, cpu);
+
+	vmbus_event_flags_proc(sc, eventf->evt_flags, flag_cnt);
+}
+
+/*
+ * Compat variant of event processing: bit 0 of the first word of the
+ * per-CPU event flags is tested and cleared; only if it was set are
+ * the softc's RX event flags scanned for pending channels.
+ */
+void
+vmbus_event_proc_compat(struct vmbus_softc *sc, int cpu)
+{
+	struct vmbus_evtflags *eventf;
+
+	eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE;
+	if (!atomic_testandclear_long(&eventf->evt_flags[0], 0)) {
+		/* Nothing pending. */
+		return;
+	}
+
+	vmbus_event_flags_proc(sc, sc->vmbus_rx_evtflags,
+	    VMBUS_CHAN_MAX_COMPAT >> VMBUS_EVTFLAG_SHIFT);
+}
+
+/*
+ * Make sure the event flag word count of the channel's CPU is large
+ * enough to cover the channel's id.  The count is only ever raised,
+ * using a compare-and-set retry loop.
+ */
+static void
+vmbus_chan_update_evtflagcnt(struct vmbus_softc *sc,
+    const struct vmbus_channel *chan)
+{
+	volatile int *cntp;
+	int want, cur;
+
+	want = (chan->ch_id / VMBUS_EVTFLAG_LEN) + 1;
+	cntp = VMBUS_PCPU_PTR(sc, event_flags_cnt, chan->ch_cpuid);
+
+	while ((cur = *cntp) < want) {
+		if (!atomic_cmpset_int(cntp, cur, want)) {
+			/* Lost a race; re-read and try again. */
+			continue;
+		}
+		if (bootverbose) {
+			vmbus_chan_printf(chan,
+			    "chan%u update cpu%d flag_cnt to %d\n",
+			    chan->ch_id, chan->ch_cpuid, want);
+		}
+		break;
+	}
+}
+
+/*
+ * Allocate and initialize a channel for the given vmbus softc.
+ *
+ * The channel is zero-filled, gets its monitor parameter DMA memory,
+ * starts with one reference, and has its locks, sub-channel list,
+ * bufrings, polling task and polling callout initialized.
+ *
+ * Returns the new channel, or NULL if the monitor parameter memory
+ * could not be allocated.
+ */
+static struct vmbus_channel *
+vmbus_chan_alloc(struct vmbus_softc *sc)
+{
+	struct vmbus_channel *ch;
+
+	ch = malloc(sizeof(*ch), M_DEVBUF, M_WAITOK | M_ZERO);
+
+	ch->ch_monprm = hyperv_dmamem_alloc(bus_get_dma_tag(sc->vmbus_dev),
+	    HYPERCALL_PARAM_ALIGN, 0, sizeof(struct hyperv_mon_param),
+	    &ch->ch_monprm_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO);
+	if (ch->ch_monprm == NULL) {
+		device_printf(sc->vmbus_dev, "monprm alloc failed\n");
+		free(ch, M_DEVBUF);
+		return (NULL);
+	}
+
+	/* Ownership and back pointer. */
+	ch->ch_refs = 1;
+	ch->ch_vmbus = sc;
+
+	/* Locks and sub-channel bookkeeping. */
+	mtx_init(&ch->ch_subchan_lock, "vmbus subchan", NULL, MTX_DEF);
+	sx_init(&ch->ch_orphan_lock, "vmbus chorphan");
+	TAILQ_INIT(&ch->ch_subchans);
+
+	/* RX and TX bufrings. */
+	vmbus_rxbr_init(&ch->ch_rxbr);
+	vmbus_txbr_init(&ch->ch_txbr);
+
+	/* Polling support. */
+	TASK_INIT(&ch->ch_poll_task, 0, vmbus_chan_poll_task, ch);
+	callout_init(&ch->ch_poll_timeo, 1);
+
+	return (ch);
+}
+
+/*
+ * Free a channel and the resources set up by vmbus_chan_alloc().
+ *
+ * The assertions spell out the expected state: no sub-channels left,
+ * none of the OPENED/ONPRIL/ONSUBL/ONLIST state flags set, no orphan
+ * xact installed, a reference count of 0 and polling disabled.
+ */
+static void
+vmbus_chan_free(struct vmbus_channel *chan)
+{
+
+	KASSERT(TAILQ_EMPTY(&chan->ch_subchans) && chan->ch_subchan_cnt == 0,
+	    ("still owns sub-channels"));
+	KASSERT((chan->ch_stflags &
+	    (VMBUS_CHAN_ST_OPENED |
+	     VMBUS_CHAN_ST_ONPRIL |
+	     VMBUS_CHAN_ST_ONSUBL |
+	     VMBUS_CHAN_ST_ONLIST)) == 0, ("free busy channel"));
+	KASSERT(chan->ch_orphan_xact == NULL,
+	    ("still has orphan xact installed"));
+	KASSERT(chan->ch_refs == 0, ("chan%u: invalid refcnt %d",
+	    chan->ch_id, chan->ch_refs));
+	KASSERT(chan->ch_poll_intvl == 0, ("chan%u: polling is activated",
+	    chan->ch_id));
+
+	/* Release everything vmbus_chan_alloc() set up, then the channel. */
+	hyperv_dmamem_free(&chan->ch_monprm_dma, chan->ch_monprm);
+	mtx_destroy(&chan->ch_subchan_lock);
+	sx_destroy(&chan->ch_orphan_lock);
+	vmbus_rxbr_deinit(&chan->ch_rxbr);
+	vmbus_txbr_deinit(&chan->ch_txbr);
+	free(chan, M_DEVBUF);
+}
+
+/*
+ * Register a newly offered channel with the vmbus softc.
+ *
+ * Channel id 0 and ids >= VMBUS_CHAN_MAX are rejected.  A primary
+ * channel is inserted into the primary channel list, unless a primary
+ * channel with the same type and instance GUIDs is already there.  A
+ * sub-channel is linked to the primary channel with matching GUIDs:
+ * it takes an extra reference, inherits the primary's ch_dev, is put
+ * on the primary's sub-channel list, and sleepers on the primary are
+ * woken up.  Either kind is finally put on the softc's channel list
+ * and assigned the default CPU.
+ *
+ * Returns 0 on success, EINVAL for an invalid id, a duplicated primary
+ * channel, or a sub-channel without primary channel.
+ */
+static int
+vmbus_chan_add(struct vmbus_channel *newchan)
+{
+	struct vmbus_softc *sc = newchan->ch_vmbus;
+	struct vmbus_channel *prichan;
+
+	if (newchan->ch_id == 0) {
+		/*
+		 * XXX
+		 * Chan0 will neither be processed nor should be offered;
+		 * skip it.
+		 */
+		device_printf(sc->vmbus_dev, "got chan0 offer, discard\n");
+		return EINVAL;
+	} else if (newchan->ch_id >= VMBUS_CHAN_MAX) {
+		device_printf(sc->vmbus_dev, "invalid chan%u offer\n",
+		    newchan->ch_id);
+		return EINVAL;
+	}
+
+	/*
+	 * Look for a primary channel with matching GUIDs; prichan is
+	 * NULL after the loop if there is none.
+	 */
+	mtx_lock(&sc->vmbus_prichan_lock);
+	TAILQ_FOREACH(prichan, &sc->vmbus_prichans, ch_prilink) {
+		/*
+		 * Sub-channel will have the same type GUID and instance
+		 * GUID as its primary channel.
+		 */
+		if (memcmp(&prichan->ch_guid_type, &newchan->ch_guid_type,
+		    sizeof(struct hyperv_guid)) == 0 &&
+		    memcmp(&prichan->ch_guid_inst, &newchan->ch_guid_inst,
+		    sizeof(struct hyperv_guid)) == 0)
+			break;
+	}
+	if (VMBUS_CHAN_ISPRIMARY(newchan)) {
+		if (prichan == NULL) {
+			/* Install the new primary channel */
+			vmbus_chan_ins_prilist(sc, newchan);
+			mtx_unlock(&sc->vmbus_prichan_lock);
+			goto done;
+		} else {
+			mtx_unlock(&sc->vmbus_prichan_lock);
+			device_printf(sc->vmbus_dev,
+			    "duplicated primary chan%u\n", newchan->ch_id);
+			return EINVAL;
+		}
+	} else { /* Sub-channel */
+		if (prichan == NULL) {
+			mtx_unlock(&sc->vmbus_prichan_lock);
+			device_printf(sc->vmbus_dev,
+			    "no primary chan for chan%u\n", newchan->ch_id);
+			return EINVAL;
+		}
+		/*
+		 * Found the primary channel for this sub-channel and
+		 * move on.
+		 *
+		 * XXX refcnt prichan
+		 */
+	}
+	mtx_unlock(&sc->vmbus_prichan_lock);
+
+	/*
+	 * This is a sub-channel; link it with the primary channel.
+	 */
+	KASSERT(!VMBUS_CHAN_ISPRIMARY(newchan),
+	    ("new channel is not sub-channel"));
+	KASSERT(prichan != NULL, ("no primary channel"));
+
+	/*
+	 * Reference count this sub-channel; it will be dereferenced
+	 * when this sub-channel is closed.
+	 */
+	KASSERT(newchan->ch_refs == 1, ("chan%u: invalid refcnt %d",
+	    newchan->ch_id, newchan->ch_refs));
+	atomic_add_int(&newchan->ch_refs, 1);
+
+	newchan->ch_prichan = prichan;
+	newchan->ch_dev = prichan->ch_dev;
+
+	mtx_lock(&prichan->ch_subchan_lock);
+	vmbus_chan_ins_sublist(prichan, newchan);
+	mtx_unlock(&prichan->ch_subchan_lock);
+	/*
+	 * Notify anyone that is interested in this sub-channel,
+	 * after this sub-channel is setup.
+	 */
+	wakeup(prichan);
+	/* Primary channels jump here; sub-channels fall through. */
+done:
+	/*
+	 * Hook this channel up for later revocation.
+	 */
+	mtx_lock(&sc->vmbus_chan_lock);
+	vmbus_chan_ins_list(sc, newchan);
+	mtx_unlock(&sc->vmbus_chan_lock);
+
+	if (bootverbose) {
+		vmbus_chan_printf(newchan, "chan%u subidx%u offer\n",
+		    newchan->ch_id, newchan->ch_subidx);
+	}
+
+	/* Select default cpu for this channel. */
+	vmbus_chan_cpu_default(newchan);
+
+	return 0;
+}
+
+/*
+ * Bind the channel to 'cpu', recording both the CPU id and the
+ * matching virtual CPU id.  On WS2008 and WIN7 vmbus versions the
+ * request is overridden and cpu0 is used.
+ */
+void
+vmbus_chan_cpu_set(struct vmbus_channel *chan, int cpu)
+{
+	struct vmbus_softc *sc;
+
+	KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu %d", cpu));
+
+	sc = chan->ch_vmbus;
+	if (sc->vmbus_version == VMBUS_VERSION_WS2008 ||
+	    sc->vmbus_version == VMBUS_VERSION_WIN7) {
+		/* Only cpu0 is supported */
+		cpu = 0;
+	}
+
+	chan->ch_cpuid = cpu;
+	chan->ch_vcpuid = VMBUS_PCPU_GET(sc, vcpuid, cpu);
+
+	if (!bootverbose)
+		return;
+	vmbus_chan_printf(chan,
+	    "chan%u assigned to cpu%u [vcpu%u]\n",
+	    chan->ch_id, chan->ch_cpuid, chan->ch_vcpuid);
+}
+
+/*
+ * Bind the channel to a CPU picked round-robin: a function-static
+ * counter is atomically incremented and its previous value, modulo
+ * mp_ncpus, selects the CPU.
+ */
+void
+vmbus_chan_cpu_rr(struct vmbus_channel *chan)
+{
+	static uint32_t vmbus_chan_nextcpu;
+	uint32_t ticket;
+	int cpu;
+
+	ticket = atomic_fetchadd_int(&vmbus_chan_nextcpu, 1);
+	cpu = ticket % mp_ncpus;
+	vmbus_chan_cpu_set(chan, cpu);
+}
+
+/*
+ * Apply the default channel-to-CPU binding.
+ */
+static void
+vmbus_chan_cpu_default(struct vmbus_channel *chan)
+{
+	/*
+	 * Channels are pinned to cpu0 unless told otherwise.  Devices
+	 * with special channel-cpu mapping requirements should call
+	 * vmbus_chan_cpu_{set,rr}().
+	 */
+	const int default_cpu = 0;
+
+	vmbus_chan_cpu_set(chan, default_cpu);
+}
+
+/*
+ * Handle a channel offer message from the host.
+ *
+ * A channel is allocated and filled in from the offer (id, sub-channel
+ * index, GUIDs, connection id, optional monitor trigger, Hyper-V
+ * socket information, TX event flag), its attach/detach tasks are set
+ * up, and it is registered through vmbus_chan_add().  On success the
+ * attach task is queued on the channel's management taskqueue.
+ *
+ * If allocation or registration fails, the failure is logged and the
+ * offer is dropped; a channel that failed registration is freed.
+ */
+static void
+vmbus_chan_msgproc_choffer(struct vmbus_softc *sc,
+    const struct vmbus_message *msg)
+{
+	const struct vmbus_chanmsg_choffer *offer;
+	struct vmbus_channel *chan;
+	task_fn_t *detach_fn, *attach_fn;
+	int error;
+
+	offer = (const struct vmbus_chanmsg_choffer *)msg->msg_data;
+
+	chan = vmbus_chan_alloc(sc);
+	if (chan == NULL) {
+		device_printf(sc->vmbus_dev, "allocate chan%u failed\n",
+		    offer->chm_chanid);
+		return;
+	}
+
+	chan->ch_id = offer->chm_chanid;
+	chan->ch_subidx = offer->chm_subidx;
+	chan->ch_guid_type = offer->chm_chtype;
+	chan->ch_guid_inst = offer->chm_chinst;
+
+	/* Batch reading is on by default */
+	chan->ch_flags |= VMBUS_CHAN_FLAG_BATCHREAD;
+
+	/* WS2008 keeps the fixed event connection id. */
+	chan->ch_monprm->mp_connid = VMBUS_CONNID_EVENT;
+	if (sc->vmbus_version != VMBUS_VERSION_WS2008)
+		chan->ch_monprm->mp_connid = offer->chm_connid;
+
+	if (offer->chm_flags1 & VMBUS_CHOFFER_FLAG1_HASMNF) {
+		int trig_idx;
+
+		/*
+		 * Setup MNF stuffs.
+		 */
+		chan->ch_txflags |= VMBUS_CHAN_TXF_HASMNF;
+
+		trig_idx = offer->chm_montrig / VMBUS_MONTRIG_LEN;
+		if (trig_idx >= VMBUS_MONTRIGS_MAX)
+			panic("invalid monitor trigger %u", offer->chm_montrig);
+		chan->ch_montrig =
+		    &sc->vmbus_mnf2->mnf_trigs[trig_idx].mt_pending;
+
+		/*
+		 * Shift an unsigned 1: the bit index can be as large as
+		 * VMBUS_MONTRIG_LEN - 1, and shifting a signed 1 into the
+		 * sign bit of an int is undefined behavior.
+		 */
+		chan->ch_montrig_mask =
+		    1U << (offer->chm_montrig % VMBUS_MONTRIG_LEN);
+	}
+
+	if (offer->chm_chflags & VMBUS_CHAN_TLNPI_PROVIDER_OFFER) {
+		/* This is HyperV socket channel */
+		chan->ch_is_hvs = true;
+		/* The first byte != 0 means the host initiated connection. */
+		chan->ch_hvs_conn_from_host =
+		    offer->chm_udata.pipe.user_def[0];
+
+		if (bootverbose) {
+			device_printf(sc->vmbus_dev,
+			    "chan%u is hyperv socket channel "
+			    "connected %s host\n",
+			    chan->ch_id,
+			    (chan->ch_hvs_conn_from_host != 0) ?
+			    "from" : "to");
+		}
+	} else {
+		chan->ch_is_hvs = false;
+	}
+
+	/*
+	 * Setup event flag.
+	 */
+	chan->ch_evtflag =
+	    &sc->vmbus_tx_evtflags[chan->ch_id >> VMBUS_EVTFLAG_SHIFT];
+	chan->ch_evtflag_mask = 1UL << (chan->ch_id & VMBUS_EVTFLAG_MASK);
+
+	/*
+	 * Setup attach and detach tasks.
+	 */
+	if (VMBUS_CHAN_ISPRIMARY(chan)) {
+		chan->ch_mgmt_tq = sc->vmbus_devtq;
+		attach_fn = vmbus_prichan_attach_task;
+		detach_fn = vmbus_prichan_detach_task;
+	} else {
+		chan->ch_mgmt_tq = sc->vmbus_subchtq;
+		attach_fn = vmbus_subchan_attach_task;
+		detach_fn = vmbus_subchan_detach_task;
+	}
+	TASK_INIT(&chan->ch_attach_task, 0, attach_fn, chan);
+	TASK_INIT(&chan->ch_detach_task, 0, detach_fn, chan);
+
+	error = vmbus_chan_add(chan);
+	if (error) {
+		device_printf(sc->vmbus_dev, "add chan%u failed: %d\n",
+		    chan->ch_id, error);
+		/* Drop the allocation reference; vmbus_chan_free() expects 0. */
+		atomic_subtract_int(&chan->ch_refs, 1);
+		vmbus_chan_free(chan);
+		return;
+	}
+	taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_attach_task);
+}
+
+/*
+ * Handle a channel rescind (revoke) message from the host.
+ *
+ * The channel named in the message is looked up on, and removed from,
+ * the softc's channel list; a primary channel is also removed from
+ * the primary channel list.  The channel is then flagged as revoked,
+ * any installed orphan xact is orphaned, and one reference is dropped
+ * through vmbus_chan_detach().
+ *
+ * Messages naming an out-of-range or unknown channel id are logged
+ * and ignored.  A second revocation of the same channel panics.
+ */
+static void
+vmbus_chan_msgproc_chrescind(struct vmbus_softc *sc,
+    const struct vmbus_message *msg)
+{
+	const struct vmbus_chanmsg_chrescind *note;
+	struct vmbus_channel *chan;
+
+	note = (const struct vmbus_chanmsg_chrescind *)msg->msg_data;
+	/*
+	 * Same bound as the offer path in vmbus_chan_add(): an id equal
+	 * to VMBUS_CHAN_MAX is already out of range.
+	 */
+	if (note->chm_chanid >= VMBUS_CHAN_MAX) {
+		device_printf(sc->vmbus_dev, "invalid revoked chan%u\n",
+		    note->chm_chanid);
+		return;
+	}
+
+	/*
+	 * Find and remove the target channel from the channel list.
+	 */
+	mtx_lock(&sc->vmbus_chan_lock);
+	TAILQ_FOREACH(chan, &sc->vmbus_chans, ch_link) {
+		if (chan->ch_id == note->chm_chanid)
+			break;
+	}
+	if (chan == NULL) {
+		mtx_unlock(&sc->vmbus_chan_lock);
+		device_printf(sc->vmbus_dev, "chan%u is not offered\n",
+		    note->chm_chanid);
+		return;
+	}
+	vmbus_chan_rem_list(sc, chan);
+	mtx_unlock(&sc->vmbus_chan_lock);
+
+	if (VMBUS_CHAN_ISPRIMARY(chan)) {
+		/*
+		 * The target channel is a primary channel; remove the
+		 * target channel from the primary channel list now,
+		 * instead of later, so that it will not be found by
+		 * other sub-channel offers, which are processed in
+		 * this thread.
+		 */
+		mtx_lock(&sc->vmbus_prichan_lock);
+		vmbus_chan_rem_prilist(sc, chan);
+		mtx_unlock(&sc->vmbus_prichan_lock);
+	}
+
+	/*
+	 * NOTE:
+	 * The following processing order is critical:
+	 * Set the REVOKED state flag before orphaning the installed xact.
+	 */
+
+	/* A non-zero result means the REVOKED bit was already set. */
+	if (atomic_testandset_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_REVOKED_SHIFT))
+		panic("channel has already been revoked");
+
+	sx_xlock(&chan->ch_orphan_lock);
+	if (chan->ch_orphan_xact != NULL)
+		vmbus_xact_ctx_orphan(chan->ch_orphan_xact);
+	sx_xunlock(&chan->ch_orphan_lock);
+
+	if (bootverbose)
+		vmbus_chan_printf(chan, "chan%u revoked\n", note->chm_chanid);
+	vmbus_chan_detach(chan);
+}
+
+static int
+vmbus_chan_release(struct vmbus_channel *chan)
+{
+	struct vmbus_softc *vsc = chan->ch_vmbus;
+	struct vmbus_chanmsg_chfree *chfree;
+	struct vmbus_msghc *hc;
+	int rc;
+
+	/* Build a CHFREE request carrying this channel's id and execute it. */
+	hc = vmbus_msghc_get(vsc, sizeof(*chfree));
+	if (hc == NULL) {
+		vmbus_chan_printf(chan,
+		    "can not get msg hypercall for chfree(chan%u)\n",
+		    chan->ch_id);
+		return (ENXIO);
+	}
+	chfree = vmbus_msghc_dataptr(hc);
+	chfree->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHFREE;
+	chfree->chm_chanid = chan->ch_id;
+
+	rc = vmbus_msghc_exec_noresult(hc);
+	vmbus_msghc_put(vsc, hc);
+	if (rc != 0) {
+		vmbus_chan_printf(chan,
+		    "chfree(chan%u) msg hypercall exec failed: %d\n",
+		    chan->ch_id, rc);
+		return (rc);
+	}
+
+	if (bootverbose)
+		vmbus_chan_printf(chan, "chan%u freed\n", chan->ch_id);
+	return (0);
+}
+
+static void
+vmbus_prichan_detach_task(void *xchan, int pending __unused)
+{
+	struct vmbus_channel *prichan;
+
+	prichan = xchan;
+	KASSERT(VMBUS_CHAN_ISPRIMARY(prichan),
+	    ("chan%u is not primary channel", prichan->ch_id));
+
+	/*
+	 * Teardown order: delete the child device first, then release
+	 * the channel back to vmbus, and only then free its resources.
+	 */
+	vmbus_delete_child(prichan);
+	vmbus_chan_release(prichan);
+	vmbus_chan_free(prichan);
+}
+
+static void
+vmbus_subchan_detach_task(void *xchan, int pending __unused)
+{
+	struct vmbus_channel *subchan = xchan;
+	struct vmbus_channel *owner = subchan->ch_prichan;
+
+	KASSERT(!VMBUS_CHAN_ISPRIMARY(subchan),
+	    ("chan%u is primary channel", subchan->ch_id));
+
+	/* Release the sub-channel (back to vmbus) before unlinking it. */
+	vmbus_chan_release(subchan);
+
+	/*
+	 * Drop it from the owner's sub-channel list, then wake up any
+	 * thread sleeping on the owner until this sub-channel vanishes.
+	 */
+	mtx_lock(&owner->ch_subchan_lock);
+	vmbus_chan_rem_sublist(owner, subchan);
+	mtx_unlock(&owner->ch_subchan_lock);
+	wakeup(owner);
+	vmbus_chan_free(subchan);
+}
+
+static void
+vmbus_prichan_attach_task(void *xchan, int pending __unused)
+{
+	struct vmbus_channel *prichan;
+
+	/* Attaching a primary channel means adding a device for it. */
+	prichan = xchan;
+	vmbus_add_child(prichan);
+}
+
+static void
+vmbus_subchan_attach_task(void *xchan __unused, int pending __unused)
+{
+
+	/* Nothing: the attach task of a sub-channel is intentionally empty. */
+}
+
+void
+vmbus_chan_destroy_all(struct vmbus_softc *sc)
+{
+	struct vmbus_channel *pri;
+
+	/*
+	 * Repeatedly take one primary channel off the channel lists and
+	 * queue its detach task, until no primary channel is left.
+	 */
+	for (;;) {
+		mtx_lock(&sc->vmbus_chan_lock);
+		TAILQ_FOREACH(pri, &sc->vmbus_chans, ch_link) {
+			if (VMBUS_CHAN_ISPRIMARY(pri))
+				break;
+		}
+		if (pri != NULL)
+			vmbus_chan_rem_list(sc, pri);
+		mtx_unlock(&sc->vmbus_chan_lock);
+
+		/* TAILQ_FOREACH leaves 'pri' NULL when nothing matched. */
+		if (pri == NULL)
+			break;
+
+		mtx_lock(&sc->vmbus_prichan_lock);
+		vmbus_chan_rem_prilist(sc, pri);
+		mtx_unlock(&sc->vmbus_prichan_lock);
+
+		/* The detach task is queued on the channel's ch_mgmt_tq. */
+		taskqueue_enqueue(pri->ch_mgmt_tq, &pri->ch_detach_task);
+	}
+}
+
+struct vmbus_channel **
+vmbus_subchan_get(struct vmbus_channel *pri_chan, int subchan_cnt)
+{
+	struct vmbus_channel **list, *sub;
+	int n = 0;
+
+	KASSERT(subchan_cnt > 0, ("invalid sub-channel count %d", subchan_cnt));
+
+	/* M_TEMP array of pointers; vmbus_subchan_rel() frees it. */
+	list = malloc(subchan_cnt * sizeof(struct vmbus_channel *), M_TEMP,
+	    M_WAITOK);
+
+	mtx_lock(&pri_chan->ch_subchan_lock);
+
+	/* Sleep on pri_chan until enough sub-channels are on its list. */
+	while (pri_chan->ch_subchan_cnt < subchan_cnt)
+		mtx_sleep(pri_chan, &pri_chan->ch_subchan_lock, 0, "subch", 0);
+
+	/* Copy out the first subchan_cnt entries of the sub-channel list. */
+	TAILQ_FOREACH(sub, &pri_chan->ch_subchans, ch_sublink) {
+		/* TODO: refcnt chan */
+		list[n++] = sub;
+		if (n == subchan_cnt)
+			break;
+	}
+	KASSERT(n == subchan_cnt, ("invalid subchan count %d, should be %d",
+	    pri_chan->ch_subchan_cnt, subchan_cnt));
+
+	mtx_unlock(&pri_chan->ch_subchan_lock);
+
+	return (list);
+}
+
+void
+vmbus_subchan_rel(struct vmbus_channel **subchan, int subchan_cnt __unused)
+{
+	/* Frees only the M_TEMP pointer array, not the channels it lists. */
+	free(subchan, M_TEMP);
+}
+
+void
+vmbus_subchan_drain(struct vmbus_channel *pri_chan)
+{
+	mtx_lock(&pri_chan->ch_subchan_lock);
+	while (pri_chan->ch_subchan_cnt > 0)	/* woken via wakeup(pri_chan) */
+		mtx_sleep(pri_chan, &pri_chan->ch_subchan_lock, 0, "dsubch", 0);
+	mtx_unlock(&pri_chan->ch_subchan_lock);
+}
+
+void
+vmbus_chan_msgproc(struct vmbus_softc *sc, const struct vmbus_message *msg)
+{
+	vmbus_chanmsg_proc_t msg_proc;
+	uint32_t msg_type;
+
+	msg_type = ((const struct vmbus_chanmsg_hdr *)msg->msg_data)->chm_type;
+	KASSERT(msg_type < VMBUS_CHANMSG_TYPE_MAX,
+	    ("invalid message type %u", msg_type));
+	/* KASSERT vanishes without INVARIANTS; never index out of bounds. */
+	if (msg_type < VMBUS_CHANMSG_TYPE_MAX &&
+	    (msg_proc = vmbus_chan_msgprocs[msg_type]) != NULL)
+		msg_proc(sc, msg);
+}
+
+void
+vmbus_chan_set_readbatch(struct vmbus_channel *chan, bool on)
+{
+	if (on)
+		chan->ch_flags |= VMBUS_CHAN_FLAG_BATCHREAD;
+	else
+		chan->ch_flags &= ~VMBUS_CHAN_FLAG_BATCHREAD;
+}
+
+uint32_t
+vmbus_chan_id(const struct vmbus_channel *chan)
+{
+	return (chan->ch_id);
+}
+
+uint32_t
+vmbus_chan_subidx(const struct vmbus_channel *chan)
+{
+	return (chan->ch_subidx);
+}
+
+bool
+vmbus_chan_is_primary(const struct vmbus_channel *chan)
+{
+	/*
+	 * Primary channels are the ones whose sub-channel index is 0.
+	 */
+	return (VMBUS_CHAN_ISPRIMARY(chan) ? true : false);
+}
+
+bool
+vmbus_chan_is_hvs(const struct vmbus_channel *chan)
+{
+	return (chan->ch_is_hvs);
+}
+
+bool
+vmbus_chan_is_hvs_conn_from_host(const struct vmbus_channel *chan)
+{
+	/* Only meaningful for HyperV socket channels; asserted here. */
+	KASSERT(vmbus_chan_is_hvs(chan) == true,
+	    ("Not a HyperV Socket channel %u", chan->ch_id));
+
+	/* Any non-zero ch_hvs_conn_from_host counts as "from host". */
+	return (chan->ch_hvs_conn_from_host != 0);
+}
+
+struct hyperv_guid *
+vmbus_chan_guid_type(struct vmbus_channel *chan)
+{
+	return (&chan->ch_guid_type);
+}
+
+struct hyperv_guid *
+vmbus_chan_guid_inst(struct vmbus_channel *chan)
+{
+	return (&chan->ch_guid_inst);
+}
+
+int
+vmbus_chan_prplist_nelem(int br_size, int prpcnt_max, int dlen_max)
+{
+	int len;
+
+	/* A prplist packet holding one range of prpcnt_max pages ... */
+	len = __offsetof(struct vmbus_chanpkt_prplist,
+	    cp_range[0].gpa_page[prpcnt_max]);
+	len += dlen_max;	/* ... plus dlen_max bytes of data */
+	len = VMBUS_CHANPKT_TOTLEN(len);
+	return (vmbus_br_nelem(br_size, len));
+}
+
+bool
+vmbus_chan_tx_empty(const struct vmbus_channel *chan)
+{
+	/* Thin wrapper: asks the TX bufring (ch_txbr) whether it is empty. */
+	return (vmbus_txbr_empty(&chan->ch_txbr));
+}
+
+bool
+vmbus_chan_rx_empty(const struct vmbus_channel *chan)
+{
+	/* Thin wrapper: asks the RX bufring (ch_rxbr) whether it is empty. */
+	return (vmbus_rxbr_empty(&chan->ch_rxbr));
+}
+
+static int
+vmbus_chan_printf(const struct vmbus_channel *chan, const char *fmt, ...)
+{
+	device_t prdev;
+	va_list args;
+	int len;
+
+	/* Prefix with the channel's device if alive, else the vmbus device. */
+	if (chan->ch_dev != NULL && device_is_alive(chan->ch_dev))
+		prdev = chan->ch_dev;
+	else
+		prdev = chan->ch_vmbus->vmbus_dev;
+	len = device_print_prettyname(prdev);
+
+	va_start(args, fmt);
+	len += vprintf(fmt, args);
+	va_end(args);
+	return (len);
+}
+
+void
+vmbus_chan_run_task(struct vmbus_channel *chan, struct task *task)
+{
+	/* Enqueue on ch_tq and wait for it, so 'task' may live on a stack. */
+	taskqueue_enqueue(chan->ch_tq, task);
+	taskqueue_drain(chan->ch_tq, task);
+}
+
+struct taskqueue *
+vmbus_chan_mgmt_tq(const struct vmbus_channel *chan)
+{
+	/* Accessor for ch_mgmt_tq, where the attach/detach tasks are run. */
+	return (chan->ch_mgmt_tq);
+}
+
+bool
+vmbus_chan_is_revoked(const struct vmbus_channel *chan)
+{
+	/*
+	 * REVOKED is a sticky bit in the channel's ch_stflags.
+	 */
+	return ((chan->ch_stflags & VMBUS_CHAN_ST_REVOKED) != 0);
+}
+
+void
+vmbus_chan_set_orphan(struct vmbus_channel *chan, struct vmbus_xact_ctx *xact)
+{
+	/* Install the xact ctx that the rescind path orphans on revocation. */
+	sx_xlock(&chan->ch_orphan_lock);
+	chan->ch_orphan_xact = xact;
+	sx_xunlock(&chan->ch_orphan_lock);
+}
+
+void
+vmbus_chan_unset_orphan(struct vmbus_channel *chan)
+{
+	/* Clear ch_orphan_xact under the same lock the rescind path takes. */
+	sx_xlock(&chan->ch_orphan_lock);
+	chan->ch_orphan_xact = NULL;
+	sx_xunlock(&chan->ch_orphan_lock);
+}
+
+const void *
+vmbus_chan_xact_wait(const struct vmbus_channel *chan,
+    struct vmbus_xact *xact, size_t *resp_len, bool can_sleep)
+{
+	const void *resp;
+
+	/* Wait for the response, sleeping or spinning as the caller allows. */
+	resp = can_sleep ? vmbus_xact_wait(xact, resp_len) :
+	    vmbus_xact_busywait(xact, resp_len);
+	if (!vmbus_chan_is_revoked(chan))
+		return (resp);
+
+	/*
+	 * The channel is revoked, so this xact was probably interrupted.
+	 * The interruption can race the reply reception, so make sure
+	 * that nothing is left on the RX bufring, i.e. that this xact
+	 * will not be touched once this function returns.
+	 *
+	 * Since the hypervisor will not put more data onto the RX
+	 * bufring once the channel is revoked, the loop below
+	 * terminates once all data has been drained by the driver's
+	 * channel callback.
+	 */
+	while (!vmbus_chan_rx_empty(chan)) {
+		/* Back off: one tick, or 1000us if we must not sleep. */
+		if (can_sleep)
+			pause("chxact", 1);
+		else
+			DELAY(1000);
+	}
+
+	return (resp);
+}
+
+void
+vmbus_chan_poll_enable(struct vmbus_channel *chan, u_int pollhz)
+{
+	struct vmbus_chan_pollarg cfg;
+	struct task cfg_task;
+
+	KASSERT(chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD,
+	    ("enable polling on non-batch chan%u", chan->ch_id));
+	KASSERT(pollhz >= VMBUS_CHAN_POLLHZ_MIN &&
+	    pollhz <= VMBUS_CHAN_POLLHZ_MAX, ("invalid pollhz %u", pollhz));
+	/* Stack objects are fine: vmbus_chan_run_task() drains the task. */
+	cfg.poll_hz = pollhz;
+	cfg.poll_chan = chan;
+	TASK_INIT(&cfg_task, 0, vmbus_chan_pollcfg_task, &cfg);
+	vmbus_chan_run_task(chan, &cfg_task);
+}
+
+void
+vmbus_chan_poll_disable(struct vmbus_channel *chan)
+{
+	struct task dis_task;
+
+	KASSERT(chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD,
+	    ("disable polling on non-batch chan%u", chan->ch_id));
+	/* Run the disable handler to completion via vmbus_chan_run_task(). */
+	TASK_INIT(&dis_task, 0, vmbus_chan_polldis_task, chan);
+	vmbus_chan_run_task(chan, &dis_task);
+}
diff --git a/sys/dev/hyperv/vmbus/vmbus_chanvar.h b/sys/dev/hyperv/vmbus/vmbus_chanvar.h
new file mode 100644
index 000000000000..b20b0119bc04
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_chanvar.h
@@ -0,0 +1,195 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_CHANVAR_H_
+#define _VMBUS_CHANVAR_H_
+
+#include <sys/param.h>
+#include <sys/callout.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/vmbus/vmbus_brvar.h>
+
+struct vmbus_channel {
+	/*
+	 * NOTE:
+	 * Fields before ch_txbr are only accessed on this channel's
+	 * target CPU.
+	 */
+	uint32_t			ch_flags;	/* VMBUS_CHAN_FLAG_ */
+	int				ch_poll_flags;	/* callout flags */
+
+	/*
+	 * RX bufring; its memory follows the TX bufring in ch_bufring.
+	 */
+	struct vmbus_rxbr		ch_rxbr;
+
+	struct taskqueue		*ch_tq;
+	struct task			ch_task;
+	struct task			ch_poll_task;
+	sbintime_t			ch_poll_intvl;
+	struct callout			ch_poll_timeo;
+	vmbus_chan_callback_t		ch_cb;
+	void				*ch_cbarg;
+
+	/*
+	 * TX bufring; at the beginning of ch_bufring.
+	 *
+	 * NOTE:
+	 * Put TX bufring and the following MNF/evtflag to a new
+	 * cacheline, since they will be accessed on all CPUs by
+	 * locking ch_txbr first.
+	 *
+	 * XXX
+	 * TX bufring and following MNF/evtflags do _not_ fit in
+	 * one 64B cacheline.
+	 */
+	struct vmbus_txbr		ch_txbr __aligned(CACHE_LINE_SIZE);
+	uint32_t			ch_txflags;	/* VMBUS_CHAN_TXF_ */
+
+	/*
+	 * These are based on the vmbus_chanmsg_choffer.chm_montrig.
+	 * Save it here for easy access.
+	 */
+	uint32_t			ch_montrig_mask;/* MNF trig mask */
+	volatile uint32_t		*ch_montrig;	/* MNF trigger loc. */
+
+	/*
+	 * These are based on the vmbus_chanmsg_choffer.chm_chanid.
+	 * Save it here for easy access.
+	 */
+	u_long				ch_evtflag_mask;/* event flag */
+	volatile u_long			*ch_evtflag;	/* event flag loc. */
+
+	/*
+	 * Rarely used fields.
+	 */
+
+	struct hyperv_mon_param		*ch_monprm;
+	struct hyperv_dma		ch_monprm_dma;
+
+	uint32_t			ch_id;		/* channel id */
+	device_t			ch_dev;		/* may be NULL */
+	struct vmbus_softc		*ch_vmbus;
+
+	int				ch_cpuid;	/* owner cpu */
+	/*
+	 * Virtual cpuid for ch_cpuid; it is used to communicate cpuid
+	 * related information w/ Hyper-V.  If MSR_HV_VP_INDEX does not
+	 * exist, ch_vcpuid will always be 0 for compatibility.
+	 */
+	uint32_t			ch_vcpuid;
+
+	/*
+	 * If this is a primary channel, ch_subchan* fields
+	 * contain sub-channels belonging to this primary
+	 * channel.
+	 */
+	struct mtx			ch_subchan_lock;
+	TAILQ_HEAD(, vmbus_channel)	ch_subchans;	/* ch_subchan_lock */
+	int				ch_subchan_cnt;	/* ch_subchan_lock */
+
+	/* If this is a sub-channel */
+	TAILQ_ENTRY(vmbus_channel)	ch_sublink;	/* sub-channel link */
+	struct vmbus_channel		*ch_prichan;	/* owner primary chan */
+
+	void				*ch_bufring;	/* TX+RX bufrings */
+	struct hyperv_dma		ch_bufring_dma;
+	uint32_t			ch_bufring_gpadl;
+
+	struct task			ch_attach_task;	/* run in ch_mgmt_tq */
+	struct task			ch_detach_task;	/* run in ch_mgmt_tq */
+	struct taskqueue		*ch_mgmt_tq;
+
+	/* If this is a primary channel */
+	TAILQ_ENTRY(vmbus_channel)	ch_prilink;	/* primary chan link */
+
+	TAILQ_ENTRY(vmbus_channel)	ch_link;	/* channel link */
+	uint32_t			ch_subidx;	/* subchan index */
+	volatile uint32_t		ch_stflags;	/* atomic-op */
+							/* VMBUS_CHAN_ST_ */
+	struct hyperv_guid		ch_guid_type;
+	struct hyperv_guid		ch_guid_inst;
+
+	struct sx			ch_orphan_lock;
+	struct vmbus_xact_ctx		*ch_orphan_xact; /* ch_orphan_lock */
+
+	int				ch_refs;	/* atomic-op */
+
+	/*
+	 * These are for HyperV socket channel only
+	 */
+	bool				ch_is_hvs;
+	uint8_t				ch_hvs_conn_from_host;
+
+	struct sysctl_ctx_list		ch_sysctl_ctx;
+} __aligned(CACHE_LINE_SIZE);
+
+#define VMBUS_CHAN_ISPRIMARY(chan)	((chan)->ch_subidx == 0)
+
+/*
+ * If this flag is set, this channel's interrupt will be masked in ISR,
+ * and the RX bufring will be drained before this channel's interrupt is
+ * unmasked.
+ *
+ * This flag is turned on by default.  Drivers can turn it off according
+ * to their own requirement.
+ */
+#define VMBUS_CHAN_FLAG_BATCHREAD	0x0002
+
+#define VMBUS_CHAN_TXF_HASMNF		0x0001
+
+#define VMBUS_CHAN_ST_OPENED_SHIFT	0
+#define VMBUS_CHAN_ST_ONPRIL_SHIFT	1
+#define VMBUS_CHAN_ST_ONSUBL_SHIFT	2
+#define VMBUS_CHAN_ST_ONLIST_SHIFT	3
+#define VMBUS_CHAN_ST_REVOKED_SHIFT	4	/* sticky */
+#define VMBUS_CHAN_ST_OPENED		(1 << VMBUS_CHAN_ST_OPENED_SHIFT)
+#define VMBUS_CHAN_ST_ONPRIL		(1 << VMBUS_CHAN_ST_ONPRIL_SHIFT)
+#define VMBUS_CHAN_ST_ONSUBL		(1 << VMBUS_CHAN_ST_ONSUBL_SHIFT)
+#define VMBUS_CHAN_ST_ONLIST		(1 << VMBUS_CHAN_ST_ONLIST_SHIFT)
+#define VMBUS_CHAN_ST_REVOKED		(1 << VMBUS_CHAN_ST_REVOKED_SHIFT)
+
+struct vmbus_softc;
+struct vmbus_message;
+
+void		vmbus_event_proc(struct vmbus_softc *, int);
+void		vmbus_event_proc_compat(struct vmbus_softc *, int);
+void		vmbus_chan_msgproc(struct vmbus_softc *,
+		    const struct vmbus_message *);
+void		vmbus_chan_destroy_all(struct vmbus_softc *);
+
+#endif	/* !_VMBUS_CHANVAR_H_ */
diff --git a/sys/dev/hyperv/vmbus/vmbus_et.c b/sys/dev/hyperv/vmbus/vmbus_et.c
new file mode 100644
index 000000000000..d9ab2a9485e7
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_et.c
@@ -0,0 +1,201 @@
+/*-
+ * Copyright (c) 2015,2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+#include <sys/timeet.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/vmbus/hyperv_reg.h>
+#include <dev/hyperv/vmbus/hyperv_var.h>
+#include <dev/hyperv/vmbus/vmbus_var.h>
+
+#define VMBUS_ET_NAME			"hvet"
+/* VMBUS_SINT_TIMER placed into the SINT field of an STIMER config value. */
+#define MSR_HV_STIMER0_CFG_SINT		\
+	((((uint64_t)VMBUS_SINT_TIMER) << MSR_HV_STIMER_CFG_SINT_SHIFT) & \
+	 MSR_HV_STIMER_CFG_SINT_MASK)
+
+/*
+ * Additionally required feature:
+ * - SynIC is needed for interrupt generation.
+ */
+#define CPUID_HV_ET_MASK		(CPUID_HV_MSR_SYNIC |		\
+					 CPUID_HV_MSR_SYNTIMER)
+
+static void			vmbus_et_identify(driver_t *, device_t);
+static int			vmbus_et_probe(device_t);
+static int			vmbus_et_attach(device_t);
+static int			vmbus_et_detach(device_t);
+static int			vmbus_et_start(struct eventtimer *, sbintime_t,
+				    sbintime_t);
+/* The one event timer instance; filled in by vmbus_et_attach(). */
+static struct eventtimer	vmbus_et;
+
+static device_method_t vmbus_et_methods[] = {
+	DEVMETHOD(device_identify,	vmbus_et_identify),
+	DEVMETHOD(device_probe,		vmbus_et_probe),
+	DEVMETHOD(device_attach,	vmbus_et_attach),
+	DEVMETHOD(device_detach,	vmbus_et_detach),
+
+	DEVMETHOD_END
+};
+
+static driver_t vmbus_et_driver = {
+	VMBUS_ET_NAME,
+	vmbus_et_methods,
+	0
+};
+
+static devclass_t vmbus_et_devclass;
+
+DRIVER_MODULE(hv_et, vmbus, vmbus_et_driver, vmbus_et_devclass, NULL, NULL);
+MODULE_VERSION(hv_et, 1);
+
+static __inline uint64_t
+hyperv_sbintime2count(sbintime_t time)
+{
+	const struct timespec ts = sbttots(time);
+
+	/* Convert the seconds and the nanoseconds parts separately. */
+	return ((ts.tv_sec * HYPERV_TIMER_FREQ) +
+	    (ts.tv_nsec / HYPERV_TIMER_NS_FACTOR));
+}
+
+static int
+vmbus_et_start(struct eventtimer *et __unused, sbintime_t first,
+    sbintime_t period __unused)
+{
+	uint64_t deadline;
+
+	/* Current hyperv_tc64() reading plus 'first' converted to counts. */
+	deadline = hyperv_tc64() + hyperv_sbintime2count(first);
+	wrmsr(MSR_HV_STIMER0_COUNT, deadline);
+
+	return (0);
+}
+
+void
+vmbus_et_intr(struct trapframe *frame)
+{
+	struct thread *td = curthread;
+	struct trapframe *saved;
+
+	if (!vmbus_et.et_active)
+		return;
+	/* Run the callback with 'frame' installed as the interrupt frame. */
+	td->td_intr_nesting_level++;
+	saved = td->td_intr_frame;
+	td->td_intr_frame = frame;
+	vmbus_et.et_event_cb(&vmbus_et, vmbus_et.et_arg);
+	td->td_intr_frame = saved;
+	td->td_intr_nesting_level--;
+}
+
+static void
+vmbus_et_identify(driver_t *driver, device_t parent)
+{
+
+	/* One hvet child, under unit 0 only, when all prerequisites exist. */
+	if (device_get_unit(parent) == 0 &&
+	    device_find_child(parent, VMBUS_ET_NAME, -1) == NULL &&
+	    (hyperv_features & CPUID_HV_ET_MASK) == CPUID_HV_ET_MASK &&
+	    hyperv_tc64 != NULL)
+		device_add_child(parent, VMBUS_ET_NAME, -1);
+}
+
+static int
+vmbus_et_probe(device_t dev)
+{
+	if (resource_disabled(VMBUS_ET_NAME, 0))
+		return (ENXIO);
+
+	device_set_desc(dev, "Hyper-V event timer");
+	/* NOWILDCARD: match only the child named by vmbus_et_identify(). */
+	return (BUS_PROBE_NOWILDCARD);
+}
+
+static void
+vmbus_et_config(void *arg __unused)
+{
+	/*
+	 * Runs on each CPU (see smp_rendezvous() in vmbus_et_attach()).
+	 * Make sure that STIMER0 is really disabled before writing to
+	 * STIMER0_CONFIG:
+	 * "Writing to the configuration register of a timer that
+	 *  is already enabled may result in undefined behaviour."
+	 */
+	for (;;) {
+		uint64_t val;
+
+		/* A zero count stops the timer, which also disables STIMER0 */
+		wrmsr(MSR_HV_STIMER0_COUNT, 0);
+
+		val = rdmsr(MSR_HV_STIMER0_CONFIG);
+		if ((val & MSR_HV_STIMER_CFG_ENABLE) == 0)
+			break;
+		cpu_spinwait();
+	}
+	wrmsr(MSR_HV_STIMER0_CONFIG,
+	    MSR_HV_STIMER_CFG_AUTOEN | MSR_HV_STIMER0_CFG_SINT);
+}
+
+static int
+vmbus_et_attach(device_t dev)
+{
+	struct eventtimer *et = &vmbus_et;
+
+	/* TODO: use independent IDT vector */
+	et->et_name = "Hyper-V";
+	et->et_flags = ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU;
+	et->et_quality = 1000;
+	et->et_frequency = HYPERV_TIMER_FREQ;
+	et->et_min_period = (0x00000001ULL << 32) / HYPERV_TIMER_FREQ;
+	et->et_max_period = (0xfffffffeULL << 32) / HYPERV_TIMER_FREQ;
+	et->et_start = vmbus_et_start;
+
+	/*
+	 * Delay a bit to make sure that hyperv_tc64 will not return 0,
+	 * since writing 0 to STIMER0_COUNT will disable STIMER0.
+	 */
+	DELAY(100);
+	smp_rendezvous(NULL, vmbus_et_config, NULL, NULL);
+	return (et_register(et));
+}
+
+static int
+vmbus_et_detach(device_t dev)
+{
+	return (et_deregister(&vmbus_et));	/* undoes et_register() */
+}
diff --git a/sys/dev/hyperv/vmbus/vmbus_if.m b/sys/dev/hyperv/vmbus/vmbus_if.m
new file mode 100644
index 000000000000..3b41c5148fdf
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_if.m
@@ -0,0 +1,60 @@
+#-
+# Copyright (c) 2016 Microsoft Corp.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice unmodified, this list of conditions, and the following
+#    disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/param.h>
+#include <sys/bus.h>
+
+INTERFACE vmbus;
+
+HEADER {
+	struct hyperv_guid;
+	struct taskqueue;
+};
+# NOTE(review): presumably reports the vmbus version to 'dev'; confirm.
+METHOD uint32_t get_version {
+	device_t bus;
+	device_t dev;
+};
+# Hands a GUID for 'dev' to 'bus'; result semantics are not shown here.
+METHOD int probe_guid {
+	device_t bus;
+	device_t dev;
+	const struct hyperv_guid *guid;
+};
+# Maps 'cpu' to a uint32_t; presumably the Hyper-V virtual CPU id (confirm).
+METHOD uint32_t get_vcpu_id {
+	device_t bus;
+	device_t dev;
+	int cpu;
+};
+# Maps 'cpu' to a taskqueue; presumably that CPU's event taskqueue (confirm).
+METHOD struct taskqueue * get_event_taskq {
+	device_t bus;
+	device_t dev;
+	int cpu;
+};
diff --git a/sys/dev/hyperv/vmbus/vmbus_reg.h b/sys/dev/hyperv/vmbus/vmbus_reg.h
new file mode 100644
index 000000000000..80d197c48ee4
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_reg.h
@@ -0,0 +1,427 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_REG_H_
+#define _VMBUS_REG_H_
+
+#include <sys/param.h>
+#include <dev/hyperv/include/hyperv.h> /* XXX for hyperv_guid */
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/vmbus/hyperv_reg.h>
+
+/*
+ * Hyper-V SynIC message format.
+ */
+
+#define VMBUS_MSG_DSIZE_MAX		240
+#define VMBUS_MSG_SIZE			256
+/* 16 bytes of header fields followed by VMBUS_MSG_DSIZE_MAX data bytes. */
+struct vmbus_message {
+	uint32_t	msg_type;	/* HYPERV_MSGTYPE_ */
+	uint8_t		msg_dsize;	/* data size */
+	uint8_t		msg_flags;	/* VMBUS_MSGFLAG_ */
+	uint16_t	msg_rsvd;
+	uint64_t	msg_id;
+	uint8_t		msg_data[VMBUS_MSG_DSIZE_MAX];
+} __packed;
+CTASSERT(sizeof(struct vmbus_message) == VMBUS_MSG_SIZE);
+
+#define VMBUS_MSGFLAG_PENDING		0x01
+
+/*
+ * Hyper-V SynIC event flags
+ */
+
+#ifdef __LP64__
+#define VMBUS_EVTFLAGS_MAX	32
+#define VMBUS_EVTFLAG_SHIFT	6
+#else
+#define VMBUS_EVTFLAGS_MAX	64
+#define VMBUS_EVTFLAG_SHIFT	5
+#endif
+#define VMBUS_EVTFLAG_LEN	(1 << VMBUS_EVTFLAG_SHIFT)
+#define VMBUS_EVTFLAG_MASK	(VMBUS_EVTFLAG_LEN - 1)
+#define VMBUS_EVTFLAGS_SIZE	256
+/* VMBUS_EVTFLAGS_MAX longs of VMBUS_EVTFLAG_LEN bits each: 2048 flags. */
+struct vmbus_evtflags {
+	u_long		evt_flags[VMBUS_EVTFLAGS_MAX];
+} __packed;
+CTASSERT(sizeof(struct vmbus_evtflags) == VMBUS_EVTFLAGS_SIZE);
+
+/*
+ * Hyper-V Monitor Notification Facility
+ */
+
+struct vmbus_mon_trig {
+	uint32_t	mt_pending;
+	uint32_t	mt_armed;
+} __packed;
+
+#define VMBUS_MONTRIGS_MAX	4
+#define VMBUS_MONTRIG_LEN	32
+
+struct vmbus_mnf {
+	uint32_t	mnf_state;
+	uint32_t	mnf_rsvd1;
+
+	struct vmbus_mon_trig mnf_trigs[VMBUS_MONTRIGS_MAX];
+	uint8_t		mnf_rsvd2[536];
+
+	uint16_t	mnf_lat[VMBUS_MONTRIGS_MAX][VMBUS_MONTRIG_LEN];
+	uint8_t		mnf_rsvd3[256];
+
+	struct hyperv_mon_param
+			mnf_param[VMBUS_MONTRIGS_MAX][VMBUS_MONTRIG_LEN];
+	uint8_t		mnf_rsvd4[1984];	/* tail pad; see CTASSERT */
+} __packed;
+CTASSERT(sizeof(struct vmbus_mnf) == PAGE_SIZE);
+
+/*
+ * Buffer ring
+ */
+struct vmbus_bufring {
+	/*
+	 * If br_windex == br_rindex, this bufring is empty; this
+	 * means we can _not_ write data to the bufring, if the
+	 * write is going to make br_windex same as br_rindex.
+	 */
+	volatile uint32_t	br_windex;
+	volatile uint32_t	br_rindex;
+
+	/*
+	 * Interrupt mask {0,1}
+	 *
+	 * For TX bufring, host set this to 1, when it is processing
+	 * the TX bufring, so that we can safely skip the TX event
+	 * notification to host.
+	 *
+	 * For RX bufring, once this is set to 1 by us, host will not
+	 * further dispatch interrupts to us, even if there are data
+	 * pending on the RX bufring.  This effectively disables the
+	 * interrupt of the channel to which this RX bufring is attached.
+	 */
+	volatile uint32_t	br_imask;
+
+	/*
+	 * WS2012/Win8 and later versions of Hyper-V implement interrupt
+	 * driven flow management. The feature bit feat_pending_snd_sz
+	 * is set by the host on the host->guest buffer ring, and by the
+	 * guest on the guest->host buffer ring.
+	 *
+	 * The meaning of the feature bit is a bit complex in that it has
+	 * semantics that apply to both buffer rings.  If the guest sets
+	 * the feature bit in the guest->host buffer ring, the guest is
+	 * telling the host that:
+	 * 1) It will set the br_pending_snd_sz field in the guest->host buffer
+	 *    ring when it is waiting for space to become available, and
+	 * 2) It will read the br_pending_snd_sz field in the host->guest
+	 *    ring buffer and interrupt the host when it frees enough space
+	 *
+	 * Similarly, if the host sets the feature bit in the host->guest
+	 * ring buffer, the host is telling the guest that:
+	 * 1) It will set the br_pending_snd_sz field in the host->guest ring
+	 *    buffer when it is waiting for space to become available, and
+	 * 2) It will read the br_pending_snd_sz field in the guest->host
+	 *    ring buffer and interrupt the guest when it frees enough space
+	 *
+	 * If either the guest or host does not set the feature bit that it
+	 * owns, that guest or host must do polling if it encounters a full
+	 * ring buffer, and not signal the other end with an interrupt.
+	 */
+	volatile uint32_t	br_pending_snd_sz;
+	uint32_t		br_rsvd1[12];
+	union	{
+		struct {
+			uint32_t feat_pending_snd_sz:1;
+		};
+		uint32_t value;
+	} br_feature_bits;
+
+	/* Padding that places br_g2h_intr_cnt at the end of the page */
+	uint8_t			br_rsvd2[4020];
+
+	/*
+	 * Total guest to host interrupt count
+	 * - For the rx ring, this counts the guest signaling the host when
+	 * this rx ring changes from full to not full.
+	 *
+	 * - For the tx ring, this counts the guest signaling the host when
+	 * this tx ring changes from empty to non-empty.
+	 */
+	uint64_t		br_g2h_intr_cnt;
+
+	uint8_t			br_data[];
+} __packed;
+CTASSERT(sizeof(struct vmbus_bufring) == PAGE_SIZE);
+
+/*
+ * Channel
+ */
+
+#define VMBUS_CHAN_MAX_COMPAT	256
+#define VMBUS_CHAN_MAX		(VMBUS_EVTFLAG_LEN * VMBUS_EVTFLAGS_MAX)
+
+/*
+ * Channel packets
+ */
+
+#define VMBUS_CHANPKT_SIZE_ALIGN	(1 << VMBUS_CHANPKT_SIZE_SHIFT)
+/* Store 'len' (bytes) into 'pktlen' in VMBUS_CHANPKT_SIZE_ALIGN units. */
+#define VMBUS_CHANPKT_SETLEN(pktlen, len)		\
+do {							\
+	(pktlen) = (len) >> VMBUS_CHANPKT_SIZE_SHIFT;	\
+} while (0)
+/* Round a total packet length up to VMBUS_CHANPKT_SIZE_ALIGN. */
+#define VMBUS_CHANPKT_TOTLEN(tlen)	\
+	roundup2((tlen), VMBUS_CHANPKT_SIZE_ALIGN)
+/* Size of the bare packet header, in VMBUS_CHANPKT_SIZE_ALIGN units. */
+#define VMBUS_CHANPKT_HLEN_MIN		\
+	(sizeof(struct vmbus_chanpkt_hdr) >> VMBUS_CHANPKT_SIZE_SHIFT)
+
+struct vmbus_chanpkt {
+	struct vmbus_chanpkt_hdr cp_hdr;
+} __packed;
+
+struct vmbus_chanpkt_sglist {
+	struct vmbus_chanpkt_hdr cp_hdr;
+	uint32_t	cp_rsvd;
+	uint32_t	cp_gpa_cnt;
+	struct vmbus_gpa cp_gpa[];
+} __packed;
+
+struct vmbus_chanpkt_prplist {
+	struct vmbus_chanpkt_hdr cp_hdr;
+	uint32_t	cp_rsvd;
+	uint32_t	cp_range_cnt;
+	struct vmbus_gpa_range cp_range[];
+} __packed;
+
+/*
+ * Channel messages
+ * - Embedded in vmbus_message.msg_data, e.g. response and notification.
+ * - Embedded in hypercall_postmsg_in.hc_data, e.g. request.
+ */
+
+#define VMBUS_CHANMSG_TYPE_CHOFFER		1	/* NOTE */
+#define VMBUS_CHANMSG_TYPE_CHRESCIND		2	/* NOTE */
+#define VMBUS_CHANMSG_TYPE_CHREQUEST		3	/* REQ */
+#define VMBUS_CHANMSG_TYPE_CHOFFER_DONE		4	/* NOTE */
+#define VMBUS_CHANMSG_TYPE_CHOPEN		5	/* REQ */
+#define VMBUS_CHANMSG_TYPE_CHOPEN_RESP		6	/* RESP */
+#define VMBUS_CHANMSG_TYPE_CHCLOSE		7	/* REQ */
+#define VMBUS_CHANMSG_TYPE_GPADL_CONN		8	/* REQ */
+#define VMBUS_CHANMSG_TYPE_GPADL_SUBCONN	9	/* REQ */
+#define VMBUS_CHANMSG_TYPE_GPADL_CONNRESP	10	/* RESP */
+#define VMBUS_CHANMSG_TYPE_GPADL_DISCONN	11	/* REQ */
+#define VMBUS_CHANMSG_TYPE_GPADL_DISCONNRESP	12	/* RESP */
+#define VMBUS_CHANMSG_TYPE_CHFREE		13	/* REQ */
+#define VMBUS_CHANMSG_TYPE_CONNECT		14	/* REQ */
+#define VMBUS_CHANMSG_TYPE_CONNECT_RESP		15	/* RESP */
+#define VMBUS_CHANMSG_TYPE_DISCONNECT		16	/* REQ */
+#define VMBUS_CHANMSG_TYPE_17			17	/* 17-20, 22: numeric names only */
+#define VMBUS_CHANMSG_TYPE_18			18
+#define VMBUS_CHANMSG_TYPE_19			19
+#define VMBUS_CHANMSG_TYPE_20			20
+#define VMBUS_CHANMSG_TYPE_TL_CONN		21	/* REQ */
+#define VMBUS_CHANMSG_TYPE_22			22
+#define VMBUS_CHANMSG_TYPE_TL_RESULT		23	/* RESP */
+#define VMBUS_CHANMSG_TYPE_MAX			24	/* one past TL_RESULT */
+
+struct vmbus_chanmsg_hdr {	/* first member of every chanmsg struct below */
+	uint32_t	chm_type;	/* VMBUS_CHANMSG_TYPE_ */
+	uint32_t	chm_rsvd;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_CONNECT (REQ) */
+struct vmbus_chanmsg_connect {
+	struct vmbus_chanmsg_hdr chm_hdr;
+	uint32_t	chm_ver;
+	uint32_t	chm_rsvd;	/* separate from chm_hdr.chm_rsvd */
+	uint64_t	chm_evtflags;
+	uint64_t	chm_mnf1;
+	uint64_t	chm_mnf2;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_CONNECT_RESP (RESP) */
+struct vmbus_chanmsg_connect_resp {
+	struct vmbus_chanmsg_hdr chm_hdr;
+	uint8_t		chm_done;	/* __packed: no padding follows */
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_CHREQUEST (REQ): header only, no payload */
+struct vmbus_chanmsg_chrequest {
+	struct vmbus_chanmsg_hdr chm_hdr;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_DISCONNECT (REQ): header only, no payload */
+struct vmbus_chanmsg_disconnect {
+	struct vmbus_chanmsg_hdr chm_hdr;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_TL_CONN (REQ) */
+/* Hyper-V socket guest connect request: header followed by two GUIDs */
+struct vmbus_chanmsg_tl_connect {
+	struct vmbus_chanmsg_hdr chm_hdr;
+	struct hyperv_guid guest_endpoint_id;
+	struct hyperv_guid host_service_id;
+} __packed;
+
+
+/* VMBUS_CHANMSG_TYPE_CHOPEN (REQ) */
+struct vmbus_chanmsg_chopen {
+	struct vmbus_chanmsg_hdr chm_hdr;
+	uint32_t	chm_chanid;
+	uint32_t	chm_openid;
+	uint32_t	chm_gpadl;
+	uint32_t	chm_vcpuid;
+	uint32_t	chm_txbr_pgcnt;
+#define VMBUS_CHANMSG_CHOPEN_UDATA_SIZE	120	/* bytes in chm_udata[] */
+	uint8_t		chm_udata[VMBUS_CHANMSG_CHOPEN_UDATA_SIZE];
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_CHOPEN_RESP (RESP) */
+struct vmbus_chanmsg_chopen_resp {
+	struct vmbus_chanmsg_hdr chm_hdr;
+	uint32_t	chm_chanid;
+	uint32_t	chm_openid;
+	uint32_t	chm_status;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_GPADL_CONN (REQ) */
+struct vmbus_chanmsg_gpadl_conn {
+	struct vmbus_chanmsg_hdr chm_hdr;
+	uint32_t	chm_chanid;
+	uint32_t	chm_gpadl;
+	uint16_t	chm_range_len;
+	uint16_t	chm_range_cnt;
+	struct vmbus_gpa_range chm_range;	/* last member; pages trail it */
+} __packed;
+
+#define VMBUS_CHANMSG_GPADL_CONN_PGMAX		26	/* gpa_page[] bound, see below */
+CTASSERT(__offsetof(struct vmbus_chanmsg_gpadl_conn,
+    chm_range.gpa_page[VMBUS_CHANMSG_GPADL_CONN_PGMAX]) <=
+    HYPERCALL_POSTMSGIN_DSIZE_MAX);
+
+/* VMBUS_CHANMSG_TYPE_GPADL_SUBCONN (REQ) */
+struct vmbus_chanmsg_gpadl_subconn {
+	struct vmbus_chanmsg_hdr chm_hdr;
+	uint32_t	chm_msgno;
+	uint32_t	chm_gpadl;
+	uint64_t	chm_gpa_page[];	/* flexible array member */
+} __packed;
+
+#define VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX	28	/* chm_gpa_page[] bound, see below */
+CTASSERT(__offsetof(struct vmbus_chanmsg_gpadl_subconn,
+    chm_gpa_page[VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX]) <=
+    HYPERCALL_POSTMSGIN_DSIZE_MAX);
+
+/* VMBUS_CHANMSG_TYPE_GPADL_CONNRESP (RESP) */
+struct vmbus_chanmsg_gpadl_connresp {
+	struct vmbus_chanmsg_hdr chm_hdr;
+	uint32_t	chm_chanid;
+	uint32_t	chm_gpadl;
+	uint32_t	chm_status;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_CHCLOSE (REQ): header plus a 32-bit chm_chanid */
+struct vmbus_chanmsg_chclose {
+	struct vmbus_chanmsg_hdr chm_hdr;
+	uint32_t	chm_chanid;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_GPADL_DISCONN (REQ) */
+struct vmbus_chanmsg_gpadl_disconn {
+	struct vmbus_chanmsg_hdr chm_hdr;
+	uint32_t	chm_chanid;
+	uint32_t	chm_gpadl;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_CHFREE (REQ): header plus a 32-bit chm_chanid */
+struct vmbus_chanmsg_chfree {
+	struct vmbus_chanmsg_hdr chm_hdr;
+	uint32_t	chm_chanid;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_CHRESCIND (NOTE): header plus a 32-bit chm_chanid */
+struct vmbus_chanmsg_chrescind {
+	struct vmbus_chanmsg_hdr chm_hdr;
+	uint32_t	chm_chanid;
+} __packed;
+
+/* Size, in bytes, of the user-defined data buffer for non-pipe offers. */
+#define VMBUS_CHANMSG_CHOFFER_UDATA_SIZE		120
+
+/* Same for pipe offers: 120 minus the 4-byte pipe_mode that precedes it. */
+#define VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE		116
+
+/* VMBUS_CHANMSG_TYPE_CHOFFER (NOTE) */
+struct vmbus_chanmsg_choffer {
+	struct vmbus_chanmsg_hdr chm_hdr;
+	struct hyperv_guid chm_chtype;
+	struct hyperv_guid chm_chinst;
+	uint64_t	chm_chlat;	/* unit: 100ns */
+	uint32_t	chm_chrev;
+	uint32_t	chm_svrctx_sz;
+	uint16_t	chm_chflags;
+	uint16_t	chm_mmio_sz;	/* unit: MB */
+
+	union {
+		/* Non-pipes */
+		struct {
+			uint8_t	user_def[VMBUS_CHANMSG_CHOFFER_UDATA_SIZE];
+		} std;
+		/*
+		 * Pipes:
+		 * For the integrated pipe protocol, which is implemented
+		 * on top of the standard user-defined data.  Pipe clients
+		 * have VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE bytes left
+		 * for their own use.
+		 */
+		struct {
+			uint32_t pipe_mode;	/* overlays std.user_def[0..3] */
+			uint8_t
+			    user_def[VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE];
+		} pipe;
+	} chm_udata;
+
+	uint16_t	chm_subidx;
+	uint16_t	chm_rsvd;
+	uint32_t	chm_chanid;
+	uint8_t		chm_montrig;
+	uint8_t		chm_flags1;	/* VMBUS_CHOFFER_FLAG1_ */
+	uint16_t	chm_flags2;
+	uint32_t	chm_connid;
+} __packed;
+CTASSERT(sizeof(struct vmbus_chanmsg_choffer) <= VMBUS_MSG_DSIZE_MAX);
+
+/* Server Flag; NOTE(review): the field it is tested against is not shown here */
+#define VMBUS_CHAN_TLNPI_PROVIDER_OFFER			0x2000
+
+#define VMBUS_CHOFFER_FLAG1_HASMNF	0x01	/* bit in choffer chm_flags1 */
+
+#endif	/* !_VMBUS_REG_H_ */
diff --git a/sys/dev/hyperv/vmbus/vmbus_res.c b/sys/dev/hyperv/vmbus/vmbus_res.c
new file mode 100644
index 000000000000..fba5a732ca58
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_res.c
@@ -0,0 +1,99 @@
+/*-
+ * Copyright (c) 2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+#include <sys/module.h>
+
+#include <contrib/dev/acpica/include/acpi.h>
+#include <dev/acpica/acpivar.h>
+
+#include <dev/hyperv/include/hyperv.h>
+
+#include "acpi_if.h"
+#include "bus_if.h"
+
+static int		vmbus_res_probe(device_t);
+static int		vmbus_res_attach(device_t);
+static int		vmbus_res_detach(device_t);
+
+static device_method_t vmbus_res_methods[] = {
+	/* Device interface; the last three use the bus_generic_* handlers */
+	DEVMETHOD(device_probe,			vmbus_res_probe),
+	DEVMETHOD(device_attach,		vmbus_res_attach),
+	DEVMETHOD(device_detach,		vmbus_res_detach),
+	DEVMETHOD(device_shutdown,		bus_generic_shutdown),
+	DEVMETHOD(device_suspend,		bus_generic_suspend),
+	DEVMETHOD(device_resume,		bus_generic_resume),
+
+	DEVMETHOD_END
+};
+
+static driver_t vmbus_res_driver = {
+	"vmbus_res",
+	vmbus_res_methods,
+	1	/* NOTE(review): size slot; no softc struct exists in this file */
+};
+
+static devclass_t vmbus_res_devclass;
+/* Registers vmbus_res against "acpi"; no module event handler is passed. */
+DRIVER_MODULE(vmbus_res, acpi, vmbus_res_driver, vmbus_res_devclass,
+    NULL, NULL);
+MODULE_DEPEND(vmbus_res, acpi, 1, 1, 1);
+MODULE_VERSION(vmbus_res, 1);
+
+static int
+vmbus_res_probe(device_t dev)
+{
+	char *match_ids[] = { "VMBUS", NULL };
+	int probe_rv;
+
+	if (!(device_get_unit(dev) == 0 && vm_guest == VM_GUEST_HV &&
+	    (hyperv_features & CPUID_HV_MSR_SYNIC) != 0))
+		return (ENXIO);	/* not unit 0, not Hyper-V, or no SynIC */
+	probe_rv = ACPI_ID_PROBE(device_get_parent(dev), dev, match_ids, NULL);
+	if (!(probe_rv > 0))	/* describe the device unless the probe is > 0 */
+		device_set_desc(dev, "Hyper-V Vmbus Resource");
+	return (probe_rv);
+}
+
+static int
+vmbus_res_attach(device_t dev __unused)
+{
+	/* Nothing to set up: dev is unused and success is always returned. */
+	return (0);
+}
+
+static int
+vmbus_res_detach(device_t dev __unused)
+{
+	/* Nothing to tear down: dev is unused and success is always returned. */
+	return (0);
+}
diff --git a/sys/dev/hyperv/vmbus/vmbus_var.h b/sys/dev/hyperv/vmbus/vmbus_var.h
new file mode 100644
index 000000000000..0e42d70d8257
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_var.h
@@ -0,0 +1,175 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_VAR_H_
+#define _VMBUS_VAR_H_
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/taskqueue.h>
+#include <sys/rman.h>
+
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcib_private.h>
+
+/*
+ * NOTE: DO NOT CHANGE THIS.
+ */
+#define VMBUS_SINT_MESSAGE	2
+/*
+ * NOTE:
+ * - DO NOT set it to the same value as VMBUS_SINT_MESSAGE.
+ * - DO NOT set it to 0.
+ */
+#define VMBUS_SINT_TIMER	4	/* satisfies both rules: != 2, != 0 */
+
+/*
+ * NOTE: DO NOT CHANGE THESE.
+ */
+#define VMBUS_CONNID_MESSAGE		1
+#define VMBUS_CONNID_EVENT		2
+
+struct vmbus_message;
+struct vmbus_softc;
+
+typedef void		(*vmbus_chanmsg_proc_t)(struct vmbus_softc *,
+			    const struct vmbus_message *);
+
+#define VMBUS_CHANMSG_PROC(name, func)	/* designated array initializer */ \
+	[VMBUS_CHANMSG_TYPE_##name] = func
+#define VMBUS_CHANMSG_PROC_WAKEUP(name)	/* slot -> vmbus_msghc_wakeup */ \
+	VMBUS_CHANMSG_PROC(name, vmbus_msghc_wakeup)
+
+struct vmbus_pcpu_data {
+	u_long			*intr_cnt;	/* Hyper-V interrupt counter */
+	struct vmbus_message	*message;	/* shared messages */
+	uint32_t		vcpuid;		/* virtual cpuid */
+	int			event_flags_cnt;/* # of event flags */
+	struct vmbus_evtflags	*event_flags;	/* event flags from host */
+
+	/* Rarely used fields */
+	struct hyperv_dma	message_dma;	/* busdma glue */
+	struct hyperv_dma	event_flags_dma;/* busdma glue */
+	struct taskqueue	*event_tq;	/* event taskq */
+	struct taskqueue	*message_tq;	/* message taskq */
+	struct task		message_task;	/* message task */
+} __aligned(CACHE_LINE_SIZE);	/* elements of vmbus_softc.vmbus_pcpu[] */
+
+#if __FreeBSD_version < 1100000	/* compat: supply rman_res_t here */
+typedef u_long rman_res_t;
+#endif
+
+struct vmbus_softc {
+	void			(*vmbus_event_proc)(struct vmbus_softc *, int);
+	u_long			*vmbus_tx_evtflags;
+						/* event flags to host */
+	struct vmbus_mnf	*vmbus_mnf2;	/* monitored by host */
+
+	u_long			*vmbus_rx_evtflags;
+						/* compat evtflgs from host */
+	struct vmbus_channel *volatile *vmbus_chmap;
+	struct vmbus_xact_ctx	*vmbus_xc;	/* defined in vmbus_xact.c */
+	struct vmbus_pcpu_data	vmbus_pcpu[MAXCPU];	/* via VMBUS_PCPU_* */
+
+	/*
+	 * Rarely used fields
+	 */
+
+	device_t		vmbus_dev;
+	int			vmbus_idtvec;
+	uint32_t		vmbus_flags;	/* see VMBUS_FLAG_ */
+	uint32_t		vmbus_version;
+	uint32_t		vmbus_gpadl;
+
+	/* Shared memory for vmbus_{rx,tx}_evtflags */
+	void			*vmbus_evtflags;
+	struct hyperv_dma	vmbus_evtflags_dma;
+
+	void			*vmbus_mnf1;	/* monitored by VM, unused */
+	struct hyperv_dma	vmbus_mnf1_dma;
+	struct hyperv_dma	vmbus_mnf2_dma;
+
+	bool			vmbus_scandone;
+	struct task		vmbus_scandone_task;
+
+	struct taskqueue	*vmbus_devtq;	/* for dev attach/detach */
+	struct taskqueue	*vmbus_subchtq;	/* for sub-chan attach/detach */
+
+	/* Primary channels */
+	struct mtx		vmbus_prichan_lock;
+	TAILQ_HEAD(, vmbus_channel) vmbus_prichans;
+
+	/* Complete channel list */
+	struct mtx		vmbus_chan_lock;
+	TAILQ_HEAD(, vmbus_channel) vmbus_chans;
+
+	struct intr_config_hook	vmbus_intrhook;
+
+#ifdef NEW_PCIB
+	/* The list of usable MMIO ranges for PCIe pass-through */
+	struct pcib_host_resources vmbus_mmio_res;
+#endif
+};
+
+#define VMBUS_FLAG_ATTACHED	0x0001	/* vmbus was attached */
+#define VMBUS_FLAG_SYNIC	0x0002	/* SynIC was setup */
+/* Per-CPU accessors: GET is the field itself (an lvalue), PTR its address. */
+#define VMBUS_PCPU_GET(sc, field, cpu)	((sc)->vmbus_pcpu[(cpu)].field)
+#define VMBUS_PCPU_PTR(sc, field, cpu)	(&(sc)->vmbus_pcpu[(cpu)].field)
+
+struct vmbus_channel;
+struct trapframe;
+struct vmbus_message;
+struct vmbus_msghc;
+
+void		vmbus_handle_intr(struct trapframe *);
+int		vmbus_add_child(struct vmbus_channel *);
+int		vmbus_delete_child(struct vmbus_channel *);
+void		vmbus_et_intr(struct trapframe *);
+uint32_t	vmbus_gpadl_alloc(struct vmbus_softc *);
+/* NB: vmbus_msghc_wakeup() below has the vmbus_chanmsg_proc_t signature. */
+struct vmbus_msghc *
+		vmbus_msghc_get(struct vmbus_softc *, size_t);
+void		vmbus_msghc_put(struct vmbus_softc *, struct vmbus_msghc *);
+void		*vmbus_msghc_dataptr(struct vmbus_msghc *);
+int		vmbus_msghc_exec_noresult(struct vmbus_msghc *);
+int		vmbus_msghc_exec(struct vmbus_softc *, struct vmbus_msghc *);
+void		vmbus_msghc_exec_cancel(struct vmbus_softc *,
+		    struct vmbus_msghc *);
+const struct vmbus_message *
+		vmbus_msghc_wait_result(struct vmbus_softc *,
+		    struct vmbus_msghc *);
+const struct vmbus_message *
+		vmbus_msghc_poll_result(struct vmbus_softc *,
+		    struct vmbus_msghc *);
+void		vmbus_msghc_wakeup(struct vmbus_softc *,
+		    const struct vmbus_message *);
+void		vmbus_msghc_reset(struct vmbus_msghc *, size_t);
+
+#endif	/* !_VMBUS_VAR_H_ */
diff --git a/sys/dev/hyperv/vmbus/vmbus_xact.c b/sys/dev/hyperv/vmbus/vmbus_xact.c
new file mode 100644
index 000000000000..90bdba7e1058
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_xact.c
@@ -0,0 +1,442 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+
+struct vmbus_xact {
+	struct vmbus_xact_ctx		*x_ctx;		/* owning context */
+	void				*x_priv;	/* xc_priv_size B or NULL */
+
+	void				*x_req;		/* xc_req_size B, DMA mem */
+	struct hyperv_dma		x_req_dma;	/* supplies hv_paddr */
+
+	const void			*x_resp;	/* NULL until resp saved */
+	size_t				x_resp_len;	/* bytes valid in x_resp */
+	void				*x_resp0;	/* xc_resp_size B storage */
+};
+
+struct vmbus_xact_ctx {
+	size_t				xc_req_size;
+	size_t				xc_resp_size;
+	size_t				xc_priv_size;
+
+	struct mtx			xc_lock;
+	/*
+	 * Protected by xc_lock; &xc_free and &xc_active are sleep channels.
+	 */
+	uint32_t			xc_flags;	/* VMBUS_XACT_CTXF_ */
+	struct vmbus_xact		*xc_free;	/* idle xact or NULL */
+	struct vmbus_xact		*xc_active;	/* awaiting response */
+	struct vmbus_xact		*xc_orphan;	/* set by ctx_orphan() */
+};
+
+#define VMBUS_XACT_CTXF_DESTROY		0x0001	/* set once, by ctx_orphan() */
+
+static struct vmbus_xact	*vmbus_xact_alloc(struct vmbus_xact_ctx *,
+				    bus_dma_tag_t);
+static void			vmbus_xact_free(struct vmbus_xact *);
+static struct vmbus_xact	*vmbus_xact_get1(struct vmbus_xact_ctx *,
+				    uint32_t);
+static const void		*vmbus_xact_wait1(struct vmbus_xact *, size_t *,
+				    bool);
+static const void		*vmbus_xact_return(struct vmbus_xact *,
+				    size_t *);
+static void			vmbus_xact_save_resp(struct vmbus_xact *,
+				    const void *, size_t);
+static void			vmbus_xact_ctx_free(struct vmbus_xact_ctx *);
+
+static struct vmbus_xact *
+vmbus_xact_alloc(struct vmbus_xact_ctx *ctx, bus_dma_tag_t parent_dtag)
+{
+	struct vmbus_xact *nx;
+
+	/* M_ZERO leaves x_priv and x_resp NULL unless assigned later. */
+	nx = malloc(sizeof(*nx), M_DEVBUF, M_WAITOK | M_ZERO);
+	nx->x_ctx = ctx;
+	/* XXX assume that page aligned is enough */
+	nx->x_req = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0,
+	    ctx->xc_req_size, &nx->x_req_dma, BUS_DMA_WAITOK);
+	if (nx->x_req == NULL) {
+		free(nx, M_DEVBUF);
+		return (NULL);
+	}
+	if (ctx->xc_priv_size > 0)
+		nx->x_priv = malloc(ctx->xc_priv_size, M_DEVBUF, M_WAITOK);
+	nx->x_resp0 = malloc(ctx->xc_resp_size, M_DEVBUF, M_WAITOK);
+
+	return (nx);
+}
+
+static void
+vmbus_xact_free(struct vmbus_xact *xact)
+{
+	/* Undo vmbus_xact_alloc(): DMA request, response, private, then xact. */
+	hyperv_dmamem_free(&xact->x_req_dma, xact->x_req);
+	free(xact->x_resp0, M_DEVBUF);
+	if (xact->x_priv != NULL)
+		free(xact->x_priv, M_DEVBUF);
+	free(xact, M_DEVBUF);
+}
+
+static struct vmbus_xact *
+vmbus_xact_get1(struct vmbus_xact_ctx *ctx, uint32_t dtor_flag)
+{
+	struct vmbus_xact *taken = NULL;
+
+	mtx_lock(&ctx->xc_lock);
+	/*
+	 * Sleep until a free xact shows up or a dtor_flag bit gets set.
+	 */
+	while (ctx->xc_free == NULL && (ctx->xc_flags & dtor_flag) == 0)
+		mtx_sleep(&ctx->xc_free, &ctx->xc_lock, 0, "gxact", 0);
+	if ((ctx->xc_flags & dtor_flag) == 0) {
+		/* Not being destroyed: claim the free xact. */
+		taken = ctx->xc_free;
+		KASSERT(taken != NULL, ("no free xact"));
+		KASSERT(taken->x_resp == NULL, ("xact has pending response"));
+		ctx->xc_free = NULL;
+	}
+
+	mtx_unlock(&ctx->xc_lock);
+
+	return (taken);
+}
+
+struct vmbus_xact_ctx *
+vmbus_xact_ctx_create(bus_dma_tag_t dtag, size_t req_size, size_t resp_size,
+    size_t priv_size)
+{
+	struct vmbus_xact_ctx *nctx;
+
+	KASSERT(req_size > 0, ("request size is 0"));
+	KASSERT(resp_size > 0, ("response size is 0"));
+
+	nctx = malloc(sizeof(*nctx), M_DEVBUF, M_WAITOK | M_ZERO);
+	nctx->xc_req_size = req_size;
+	nctx->xc_resp_size = resp_size;
+	nctx->xc_priv_size = priv_size;
+
+	/* The context's single xact starts out in the free slot. */
+	nctx->xc_free = vmbus_xact_alloc(nctx, dtag);
+	if (nctx->xc_free != NULL) {
+		mtx_init(&nctx->xc_lock, "vmbus xact", NULL, MTX_DEF);
+		return (nctx);
+	}
+	/* No xact could be allocated; only the ctx itself needs freeing. */
+	free(nctx, M_DEVBUF);
+	return (NULL);
+}
+
+bool
+vmbus_xact_ctx_orphan(struct vmbus_xact_ctx *ctx)
+{
+	mtx_lock(&ctx->xc_lock);
+	if (ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) {
+		mtx_unlock(&ctx->xc_lock);
+		return (false);	/* already orphaned */
+	}
+	ctx->xc_flags |= VMBUS_XACT_CTXF_DESTROY;
+	mtx_unlock(&ctx->xc_lock);
+	/* Kick sleepers in vmbus_xact_get1() and vmbus_xact_wait1(). */
+	wakeup(&ctx->xc_free);
+	wakeup(&ctx->xc_active);
+	/* dtor_flag 0: wait for the xact to be put back, ignoring DESTROY. */
+	ctx->xc_orphan = vmbus_xact_get1(ctx, 0);
+	if (ctx->xc_orphan == NULL)
+		panic("can't get xact");
+	return (true);
+}
+
+static void
+vmbus_xact_ctx_free(struct vmbus_xact_ctx *ctx)
+{
+	KASSERT(ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY,
+	    ("xact ctx was not orphaned"));
+	KASSERT(ctx->xc_orphan != NULL, ("no orphaned xact"));
+	/* Orphaning parked the xact in xc_orphan; free it, then the ctx. */
+	vmbus_xact_free(ctx->xc_orphan);
+	mtx_destroy(&ctx->xc_lock);
+	free(ctx, M_DEVBUF);
+}
+
+void
+vmbus_xact_ctx_destroy(struct vmbus_xact_ctx *ctx)
+{
+	/* Orphan first (returns false if already done), then free. */
+	vmbus_xact_ctx_orphan(ctx);
+	vmbus_xact_ctx_free(ctx);
+}
+
+struct vmbus_xact *
+vmbus_xact_get(struct vmbus_xact_ctx *ctx, size_t req_len)
+{
+	struct vmbus_xact *got;
+
+	if (ctx->xc_req_size < req_len)
+		panic("invalid request size %zu", req_len);
+
+	got = vmbus_xact_get1(ctx, VMBUS_XACT_CTXF_DESTROY);
+	if (got != NULL) {
+		/* Only the first req_len bytes of the request are cleared. */
+		memset(got->x_req, 0, req_len);
+	}
+	return (got);
+}
+
+void
+vmbus_xact_put(struct vmbus_xact *xact)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+
+	/* xc_active and xc_free are protected by xc_lock; check under it. */
+	mtx_lock(&ctx->xc_lock);
+	KASSERT(ctx->xc_active == NULL, ("pending active xact"));
+	KASSERT(ctx->xc_free == NULL, ("has free xact"));
+	xact->x_resp = NULL;
+	ctx->xc_free = xact;
+	mtx_unlock(&ctx->xc_lock);
+	wakeup(&ctx->xc_free);
+}
+
+void *
+vmbus_xact_req_data(const struct vmbus_xact *xact)
+{
+	/* The request buffer pointer obtained in vmbus_xact_alloc(). */
+	return (xact->x_req);
+}
+
+bus_addr_t
+vmbus_xact_req_paddr(const struct vmbus_xact *xact)
+{
+	/* hv_paddr recorded in the request buffer's hyperv_dma. */
+	return (xact->x_req_dma.hv_paddr);
+}
+
+void *
+vmbus_xact_priv(const struct vmbus_xact *xact, size_t priv_len)
+{
+	/* priv_len is only validated: more than xc_priv_size panics. */
+	if (priv_len > xact->x_ctx->xc_priv_size)
+		panic("invalid priv size %zu", priv_len);
+	return (xact->x_priv);
+}
+
+void
+vmbus_xact_activate(struct vmbus_xact *xact)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+
+	KASSERT(xact->x_resp == NULL, ("xact has pending response"));
+	/* Install xact in the ctx's single active slot, which must be empty. */
+	mtx_lock(&ctx->xc_lock);
+	KASSERT(ctx->xc_active == NULL, ("pending active xact"));
+	ctx->xc_active = xact;
+	mtx_unlock(&ctx->xc_lock);
+}
+
+void
+vmbus_xact_deactivate(struct vmbus_xact *xact)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+	/* Clear the active slot; xact must be the one currently installed. */
+	mtx_lock(&ctx->xc_lock);
+	KASSERT(ctx->xc_active == xact, ("xact mismatch"));
+	ctx->xc_active = NULL;
+	mtx_unlock(&ctx->xc_lock);
+}
+
+static const void *
+vmbus_xact_return(struct vmbus_xact *xact, size_t *resp_len)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+	const void *resp;
+	/* Caller holds xc_lock; hands back x_resp and clears xc_active. */
+	mtx_assert(&ctx->xc_lock, MA_OWNED);
+	KASSERT(ctx->xc_active == xact, ("xact trashed"));
+
+	if ((ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) && xact->x_resp == NULL) {
+		uint8_t b = 0;
+
+		/*
+		 * Orphaned and no response was received yet; fake up
+		 * a one byte response.
+		 */
+		printf("vmbus: xact ctx was orphaned w/ pending xact\n");
+		vmbus_xact_save_resp(ctx->xc_active, &b, sizeof(b));
+	}
+	KASSERT(xact->x_resp != NULL, ("no response"));
+
+	ctx->xc_active = NULL;	/* xact is no longer the active one */
+
+	resp = xact->x_resp;
+	*resp_len = xact->x_resp_len;
+
+	return (resp);
+}
+
+static const void *
+vmbus_xact_wait1(struct vmbus_xact *xact, size_t *resp_len,
+    bool can_sleep)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+	const void *resp;
+
+	mtx_lock(&ctx->xc_lock);
+	KASSERT(ctx->xc_active == xact, ("xact mismatch"));
+	for (;;) {
+		if (xact->x_resp != NULL ||
+		    (ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) != 0)
+			break;
+		if (!can_sleep) {
+			/* Busy-wait: drop the lock around the DELAY(). */
+			mtx_unlock(&ctx->xc_lock);
+			DELAY(1000);
+			mtx_lock(&ctx->xc_lock);
+		} else {
+			mtx_sleep(&ctx->xc_active, &ctx->xc_lock, 0, "wxact", 0);
+		}
+	}
+	resp = vmbus_xact_return(xact, resp_len);
+	mtx_unlock(&ctx->xc_lock);
+
+	return (resp);
+}
+
+const void *
+vmbus_xact_wait(struct vmbus_xact *xact, size_t *resp_len)
+{
+	/* mtx_sleep()-based variant of vmbus_xact_wait1(). */
+	return (vmbus_xact_wait1(xact, resp_len, true /* can sleep */));
+}
+
+const void *
+vmbus_xact_busywait(struct vmbus_xact *xact, size_t *resp_len)
+{
+	/* DELAY()-polling variant of vmbus_xact_wait1(). */
+	return (vmbus_xact_wait1(xact, resp_len, false /* can't sleep */));
+}
+
+const void *
+vmbus_xact_poll(struct vmbus_xact *xact, size_t *resp_len)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+	const void *resp = NULL;
+
+	mtx_lock(&ctx->xc_lock);
+
+	KASSERT(ctx->xc_active == xact, ("xact mismatch"));
+	if (xact->x_resp != NULL ||
+	    (ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) != 0) {
+		/* Response saved, or ctx orphaned: finish the transaction. */
+		resp = vmbus_xact_return(xact, resp_len);
+	} else {
+		/* Nothing yet; xact stays active for a later poll/wait. */
+		*resp_len = 0;
+	}
+	mtx_unlock(&ctx->xc_lock);
+
+	return (resp);
+}
+
+static void
+vmbus_xact_save_resp(struct vmbus_xact *xact, const void *data, size_t dlen)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+	size_t ncopy = dlen;
+
+	mtx_assert(&ctx->xc_lock, MA_OWNED);
+
+	/* Clamp to the preallocated response buffer, logging any loss. */
+	if (ctx->xc_resp_size < dlen) {
+		ncopy = ctx->xc_resp_size;
+		printf("vmbus: xact response truncated %zu -> %zu\n",
+		    dlen, ncopy);
+	}
+	KASSERT(ctx->xc_active == xact, ("xact mismatch"));
+	memcpy(xact->x_resp0, data, ncopy);
+	xact->x_resp_len = ncopy;
+	xact->x_resp = xact->x_resp0;
+}
+
+void
+vmbus_xact_wakeup(struct vmbus_xact *xact, const void *data, size_t dlen)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+	bool saved;
+
+	mtx_lock(&ctx->xc_lock);
+	/*
+	 * With no active xact the ctx must have been orphaned and the
+	 * response is dropped; otherwise it is stashed for the waiter.
+	 */
+	saved = (ctx->xc_active != NULL);
+	if (saved) {
+		vmbus_xact_save_resp(xact, data, dlen);
+	} else {
+		KASSERT(ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY,
+		    ("no active xact pending"));
+		printf("vmbus: drop xact response\n");
+	}
+	mtx_unlock(&ctx->xc_lock);
+
+	if (saved)
+		wakeup(&ctx->xc_active);
+}
+
+void
+vmbus_xact_ctx_wakeup(struct vmbus_xact_ctx *ctx, const void *data, size_t dlen)
+{
+	struct vmbus_xact *target;
+
+	mtx_lock(&ctx->xc_lock);
+	target = ctx->xc_active;
+	if (target == NULL) {
+		/*
+		 * No active xact; this is only expected once the ctx
+		 * has been orphaned.  Drop the response.
+		 */
+		KASSERT(ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY,
+		    ("no active xact pending"));
+		printf("vmbus: drop xact response\n");
+		mtx_unlock(&ctx->xc_lock);
+		return;
+	}
+	vmbus_xact_save_resp(target, data, dlen);
+	mtx_unlock(&ctx->xc_lock);
+
+	wakeup(&ctx->xc_active);
+}