Diffstat (limited to 'sys/dev/hyperv')
-rw-r--r--  sys/dev/hyperv/hvsock/hv_sock.c  1762
-rw-r--r--  sys/dev/hyperv/hvsock/hv_sock.h  122
-rw-r--r--  sys/dev/hyperv/include/hyperv.h  104
-rw-r--r--  sys/dev/hyperv/include/hyperv_busdma.h  49
-rw-r--r--  sys/dev/hyperv/include/vmbus.h  261
-rw-r--r--  sys/dev/hyperv/include/vmbus_xact.h  65
-rw-r--r--  sys/dev/hyperv/input/hv_kbd.c  857
-rw-r--r--  sys/dev/hyperv/input/hv_kbdc.c  530
-rw-r--r--  sys/dev/hyperv/input/hv_kbdc.h  118
-rw-r--r--  sys/dev/hyperv/netvsc/hn_nvs.c  751
-rw-r--r--  sys/dev/hyperv/netvsc/hn_nvs.h  107
-rw-r--r--  sys/dev/hyperv/netvsc/hn_rndis.c  1061
-rw-r--r--  sys/dev/hyperv/netvsc/hn_rndis.h  50
-rw-r--r--  sys/dev/hyperv/netvsc/if_hn.c  7717
-rw-r--r--  sys/dev/hyperv/netvsc/if_hnreg.h  270
-rw-r--r--  sys/dev/hyperv/netvsc/if_hnvar.h  335
-rw-r--r--  sys/dev/hyperv/netvsc/ndis.h  422
-rw-r--r--  sys/dev/hyperv/pcib/vmbus_pcib.c  1897
-rw-r--r--  sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c  2515
-rw-r--r--  sys/dev/hyperv/storvsc/hv_vstorage.h  311
-rw-r--r--  sys/dev/hyperv/utilities/hv_kvp.c  920
-rw-r--r--  sys/dev/hyperv/utilities/hv_kvp.h  229
-rw-r--r--  sys/dev/hyperv/utilities/hv_snapshot.c  1061
-rw-r--r--  sys/dev/hyperv/utilities/hv_snapshot.h  56
-rw-r--r--  sys/dev/hyperv/utilities/hv_utilreg.h  86
-rw-r--r--  sys/dev/hyperv/utilities/unicode.h  201
-rw-r--r--  sys/dev/hyperv/utilities/vmbus_heartbeat.c  152
-rw-r--r--  sys/dev/hyperv/utilities/vmbus_ic.c  299
-rw-r--r--  sys/dev/hyperv/utilities/vmbus_icreg.h  135
-rw-r--r--  sys/dev/hyperv/utilities/vmbus_icvar.h  61
-rw-r--r--  sys/dev/hyperv/utilities/vmbus_shutdown.c  167
-rw-r--r--  sys/dev/hyperv/utilities/vmbus_timesync.c  260
-rw-r--r--  sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c  236
-rw-r--r--  sys/dev/hyperv/vmbus/amd64/vmbus_vector.S  44
-rw-r--r--  sys/dev/hyperv/vmbus/hyperv.c  340
-rw-r--r--  sys/dev/hyperv/vmbus/hyperv_busdma.c  98
-rw-r--r--  sys/dev/hyperv/vmbus/hyperv_machdep.h  37
-rw-r--r--  sys/dev/hyperv/vmbus/hyperv_reg.h  193
-rw-r--r--  sys/dev/hyperv/vmbus/hyperv_var.h  37
-rw-r--r--  sys/dev/hyperv/vmbus/i386/hyperv_machdep.c  51
-rw-r--r--  sys/dev/hyperv/vmbus/i386/vmbus_vector.S  54
-rw-r--r--  sys/dev/hyperv/vmbus/vmbus.c  1679
-rw-r--r--  sys/dev/hyperv/vmbus/vmbus_br.c  720
-rw-r--r--  sys/dev/hyperv/vmbus/vmbus_brvar.h  157
-rw-r--r--  sys/dev/hyperv/vmbus/vmbus_chan.c  2390
-rw-r--r--  sys/dev/hyperv/vmbus/vmbus_chanvar.h  195
-rw-r--r--  sys/dev/hyperv/vmbus/vmbus_et.c  201
-rw-r--r--  sys/dev/hyperv/vmbus/vmbus_if.m  60
-rw-r--r--  sys/dev/hyperv/vmbus/vmbus_reg.h  427
-rw-r--r--  sys/dev/hyperv/vmbus/vmbus_res.c  99
-rw-r--r--  sys/dev/hyperv/vmbus/vmbus_var.h  175
-rw-r--r--  sys/dev/hyperv/vmbus/vmbus_xact.c  442
52 files changed, 30566 insertions, 0 deletions
diff --git a/sys/dev/hyperv/hvsock/hv_sock.c b/sys/dev/hyperv/hvsock/hv_sock.c
new file mode 100644
index 000000000000..6d5ad4fc6609
--- /dev/null
+++ b/sys/dev/hyperv/hvsock/hv_sock.c
@@ -0,0 +1,1762 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/domain.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/sockbuf.h>
+#include <sys/sx.h>
+#include <sys/uio.h>
+
+#include <net/vnet.h>
+
+#include <dev/hyperv/vmbus/vmbus_reg.h>
+
+#include "hv_sock.h"
+
+#define HVSOCK_DBG_NONE 0x0
+#define HVSOCK_DBG_INFO 0x1
+#define HVSOCK_DBG_ERR 0x2
+#define HVSOCK_DBG_VERBOSE 0x3
+
+
+SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket");
+
+static int hvs_dbg_level;
+SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level,
+ 0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose");
+
+
+#define HVSOCK_DBG(level, ...) do { \
+ if (hvs_dbg_level >= (level)) \
+ printf(__VA_ARGS__); \
+ } while (0)
+
+MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures");
+
+static int hvs_dom_probe(void);
+
+/* The MTU is 16KB, per the host side's design */
+#define HVSOCK_MTU_SIZE (1024 * 16)
+#define HVSOCK_SEND_BUF_SZ (PAGE_SIZE - sizeof(struct vmpipe_proto_header))
+
+#define HVSOCK_HEADER_LEN (sizeof(struct hvs_pkt_header))
+
+#define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \
+ roundup2(payload_len, 8) + \
+ sizeof(uint64_t))
+
+
+static struct domain hv_socket_domain;
+
+/*
+ * HyperV Transport sockets
+ */
+static struct pr_usrreqs hvs_trans_usrreqs = {
+ .pru_attach = hvs_trans_attach,
+ .pru_bind = hvs_trans_bind,
+ .pru_listen = hvs_trans_listen,
+ .pru_accept = hvs_trans_accept,
+ .pru_connect = hvs_trans_connect,
+ .pru_peeraddr = hvs_trans_peeraddr,
+ .pru_sockaddr = hvs_trans_sockaddr,
+ .pru_soreceive = hvs_trans_soreceive,
+ .pru_sosend = hvs_trans_sosend,
+ .pru_disconnect = hvs_trans_disconnect,
+ .pru_close = hvs_trans_close,
+ .pru_detach = hvs_trans_detach,
+ .pru_shutdown = hvs_trans_shutdown,
+ .pru_abort = hvs_trans_abort,
+};
+
+/*
+ * Definitions of protocols supported in HyperV socket domain
+ */
+static struct protosw hv_socket_protosw[] = {
+{
+ .pr_type = SOCK_STREAM,
+ .pr_domain = &hv_socket_domain,
+ .pr_protocol = HYPERV_SOCK_PROTO_TRANS,
+ .pr_flags = PR_CONNREQUIRED,
+ .pr_init = hvs_trans_init,
+ .pr_usrreqs = &hvs_trans_usrreqs,
+},
+};
+
+static struct domain hv_socket_domain = {
+ .dom_family = AF_HYPERV,
+ .dom_name = "hyperv",
+ .dom_probe = hvs_dom_probe,
+ .dom_protosw = hv_socket_protosw,
+ .dom_protoswNPROTOSW = &hv_socket_protosw[nitems(hv_socket_protosw)]
+};
+
+VNET_DOMAIN_SET(hv_socket_);
+
+#define MAX_PORT ((uint32_t)0xFFFFFFFF)
+#define MIN_PORT ((uint32_t)0x0)
+
+/* 00000000-facb-11e6-bd58-64006a7986d3 */
+static const struct hyperv_guid srv_id_template = {
+ .hv_guid = {
+ 0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11,
+ 0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 }
+};
+
+static int hvsock_br_callback(void *, int, void *);
+static uint32_t hvsock_canread_check(struct hvs_pcb *);
+static uint32_t hvsock_canwrite_check(struct hvs_pcb *);
+static int hvsock_send_data(struct vmbus_channel *chan,
+ struct uio *uio, uint32_t to_write, struct sockbuf *sb);
+
+
+
+/* Globals */
+static struct sx hvs_trans_socks_sx;
+static struct mtx hvs_trans_socks_mtx;
+static LIST_HEAD(, hvs_pcb) hvs_trans_bound_socks;
+static LIST_HEAD(, hvs_pcb) hvs_trans_connected_socks;
+static uint32_t previous_auto_bound_port;
+
+static void
+hvsock_print_guid(struct hyperv_guid *guid)
+{
+ unsigned char *p = (unsigned char *)guid;
+
+ HVSOCK_DBG(HVSOCK_DBG_INFO,
+ "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n",
+ *(unsigned int *)p,
+ *((unsigned short *) &p[4]),
+ *((unsigned short *) &p[6]),
+ p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+}
+
+static bool
+is_valid_srv_id(const struct hyperv_guid *id)
+{
+ return !memcmp(&id->hv_guid[4],
+ &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4);
+}
+
+static unsigned int
+get_port_by_srv_id(const struct hyperv_guid *srv_id)
+{
+ return *((const unsigned int *)srv_id);
+}
+
+static void
+set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port)
+{
+ *((unsigned int *)srv_id) = port;
+}
+
+
+static void
+__hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list)
+{
+ struct hvs_pcb *p = NULL;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
+
+ if (!pcb)
+ return;
+
+ if (list & HVS_LIST_BOUND) {
+ LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
+ if (p == pcb)
+ LIST_REMOVE(p, bound_next);
+ }
+
+ if (list & HVS_LIST_CONNECTED) {
+ LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
+ if (p == pcb)
+ LIST_REMOVE(pcb, connected_next);
+ }
+}
+
+static void
+__hvs_remove_socket_from_list(struct socket *so, unsigned char list)
+{
+ struct hvs_pcb *pcb = so2hvspcb(so);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
+
+ __hvs_remove_pcb_from_list(pcb, list);
+}
+
+static void
+__hvs_insert_socket_on_list(struct socket *so, unsigned char list)
+{
+ struct hvs_pcb *pcb = so2hvspcb(so);
+
+ if (list & HVS_LIST_BOUND)
+ LIST_INSERT_HEAD(&hvs_trans_bound_socks,
+ pcb, bound_next);
+
+ if (list & HVS_LIST_CONNECTED)
+ LIST_INSERT_HEAD(&hvs_trans_connected_socks,
+ pcb, connected_next);
+}
+
+void
+hvs_remove_socket_from_list(struct socket *so, unsigned char list)
+{
+ if (!so || !so->so_pcb) {
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: socket or so_pcb is null\n", __func__);
+ return;
+ }
+
+ mtx_lock(&hvs_trans_socks_mtx);
+ __hvs_remove_socket_from_list(so, list);
+ mtx_unlock(&hvs_trans_socks_mtx);
+}
+
+static void
+hvs_insert_socket_on_list(struct socket *so, unsigned char list)
+{
+ if (!so || !so->so_pcb) {
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: socket or so_pcb is null\n", __func__);
+ return;
+ }
+
+ mtx_lock(&hvs_trans_socks_mtx);
+ __hvs_insert_socket_on_list(so, list);
+ mtx_unlock(&hvs_trans_socks_mtx);
+}
+
+static struct socket *
+__hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
+{
+ struct hvs_pcb *p = NULL;
+
+ if (list & HVS_LIST_BOUND)
+ LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
+ if (p->so != NULL &&
+ addr->hvs_port == p->local_addr.hvs_port)
+ return p->so;
+
+ if (list & HVS_LIST_CONNECTED)
+ LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
+ if (p->so != NULL &&
+ addr->hvs_port == p->local_addr.hvs_port)
+ return p->so;
+
+ return NULL;
+}
+
+static struct socket *
+hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
+{
+ struct socket *s = NULL;
+
+ mtx_lock(&hvs_trans_socks_mtx);
+ s = __hvs_find_socket_on_list(addr, list);
+ mtx_unlock(&hvs_trans_socks_mtx);
+
+ return s;
+}
+
+static inline void
+hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port)
+{
+ memset(addr, 0, sizeof(*addr));
+ addr->sa_family = AF_HYPERV;
+ addr->sa_len = sizeof(*addr);
+ addr->hvs_port = port;
+}
+
+void
+hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id)
+{
+ hvs_addr_set(addr, get_port_by_srv_id(svr_id));
+}
+
+int
+hvs_trans_lock(void)
+{
+ sx_xlock(&hvs_trans_socks_sx);
+ return (0);
+}
+
+void
+hvs_trans_unlock(void)
+{
+ sx_xunlock(&hvs_trans_socks_sx);
+}
+
+static int
+hvs_dom_probe(void)
+{
+
+ /* Don't even give us a chance to attach on non-HyperV. */
+ if (vm_guest != VM_GUEST_HV)
+ return (ENXIO);
+ return (0);
+}
+
+void
+hvs_trans_init(void)
+{
+ /* Skip initialization of globals for non-default instances. */
+ if (!IS_DEFAULT_VNET(curvnet))
+ return;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_init called\n", __func__);
+
+ /* Initialize Globals */
+ previous_auto_bound_port = MAX_PORT;
+ sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx");
+ mtx_init(&hvs_trans_socks_mtx,
+ "hvs_trans_socks_mtx", NULL, MTX_DEF);
+ LIST_INIT(&hvs_trans_bound_socks);
+ LIST_INIT(&hvs_trans_connected_socks);
+}
+
+/*
+ * Called in two cases:
+ * 1) When the user calls socket();
+ * 2) When we accept a new incoming connection and call sonewconn().
+ */
+int
+hvs_trans_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct hvs_pcb *pcb = so2hvspcb(so);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_attach called\n", __func__);
+
+ if (so->so_type != SOCK_STREAM)
+ return (ESOCKTNOSUPPORT);
+
+ if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS)
+ return (EPROTONOSUPPORT);
+
+ if (pcb != NULL)
+ return (EISCONN);
+ pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO);
+ if (pcb == NULL)
+ return (ENOMEM);
+
+ pcb->so = so;
+ so->so_pcb = (void *)pcb;
+
+ return (0);
+}
+
+void
+hvs_trans_detach(struct socket *so)
+{
+ struct hvs_pcb *pcb;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_detach called\n", __func__);
+
+ (void) hvs_trans_lock();
+ pcb = so2hvspcb(so);
+ if (pcb == NULL) {
+ hvs_trans_unlock();
+ return;
+ }
+
+ if (SOLISTENING(so)) {
+ bzero(pcb, sizeof(*pcb));
+ free(pcb, M_HVSOCK);
+ }
+
+ so->so_pcb = NULL;
+
+ hvs_trans_unlock();
+}
+
+int
+hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td)
+{
+ struct hvs_pcb *pcb = so2hvspcb(so);
+ struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr;
+ int error = 0;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_bind called\n", __func__);
+
+ if (sa == NULL) {
+ return (EINVAL);
+ }
+
+ if (pcb == NULL) {
+ return (EINVAL);
+ }
+
+ if (sa->sa_family != AF_HYPERV) {
+ HVSOCK_DBG(HVSOCK_DBG_ERR,
+ "%s: Not supported, sa_family is %u\n",
+ __func__, sa->sa_family);
+ return (EAFNOSUPPORT);
+ }
+ if (sa->sa_len != sizeof(*sa)) {
+ HVSOCK_DBG(HVSOCK_DBG_ERR,
+ "%s: Not supported, sa_len is %u\n",
+ __func__, sa->sa_len);
+ return (EINVAL);
+ }
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: binding port = 0x%x\n", __func__, sa->hvs_port);
+
+ mtx_lock(&hvs_trans_socks_mtx);
+ if (__hvs_find_socket_on_list(sa,
+ HVS_LIST_BOUND | HVS_LIST_CONNECTED)) {
+ error = EADDRINUSE;
+ } else {
+ /*
+ * The address is available for us to bind.
+ * Add socket to the bound list.
+ */
+ hvs_addr_set(&pcb->local_addr, sa->hvs_port);
+ hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY);
+ __hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
+ }
+ mtx_unlock(&hvs_trans_socks_mtx);
+
+ return (error);
+}
+
+int
+hvs_trans_listen(struct socket *so, int backlog, struct thread *td)
+{
+ struct hvs_pcb *pcb = so2hvspcb(so);
+ struct socket *bound_so;
+ int error;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_listen called\n", __func__);
+
+ if (pcb == NULL)
+ return (EINVAL);
+
+ /* Check if the address is already bound and it was by us. */
+ bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND);
+ if (bound_so == NULL || bound_so != so) {
+ HVSOCK_DBG(HVSOCK_DBG_ERR,
+ "%s: Address not bound or not by us.\n", __func__);
+ return (EADDRNOTAVAIL);
+ }
+
+ SOCK_LOCK(so);
+ error = solisten_proto_check(so);
+ if (error == 0)
+ solisten_proto(so, backlog);
+ SOCK_UNLOCK(so);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket listen error = %d\n", __func__, error);
+ return (error);
+}
+
+int
+hvs_trans_accept(struct socket *so, struct sockaddr **nam)
+{
+ struct hvs_pcb *pcb = so2hvspcb(so);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_accept called\n", __func__);
+
+ if (pcb == NULL)
+ return (EINVAL);
+
+ *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr,
+ M_NOWAIT);
+
+ return ((*nam == NULL) ? ENOMEM : 0);
+}
+
+int
+hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct hvs_pcb *pcb = so2hvspcb(so);
+ struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam;
+ bool found_auto_bound_port = false;
+ int i, error = 0;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n",
+ __func__, raddr->hvs_port);
+
+ if (pcb == NULL)
+ return (EINVAL);
+
+ /* Verify the remote address */
+ if (raddr == NULL)
+ return (EINVAL);
+ if (raddr->sa_family != AF_HYPERV)
+ return (EAFNOSUPPORT);
+ if (raddr->sa_len != sizeof(*raddr))
+ return (EINVAL);
+
+ mtx_lock(&hvs_trans_socks_mtx);
+ if (so->so_state &
+ (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) {
+ HVSOCK_DBG(HVSOCK_DBG_ERR,
+ "%s: socket connect in progress\n",
+ __func__);
+ error = EINPROGRESS;
+ goto out;
+ }
+
+ /*
+ * Find an available port for us to auto bind the local
+ * address.
+ */
+ hvs_addr_set(&pcb->local_addr, 0);
+
+ for (i = previous_auto_bound_port - 1;
+ i != previous_auto_bound_port; i --) {
+ if (i == MIN_PORT)
+ i = MAX_PORT;
+
+ pcb->local_addr.hvs_port = i;
+
+ if (__hvs_find_socket_on_list(&pcb->local_addr,
+ HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) {
+ found_auto_bound_port = true;
+ previous_auto_bound_port = i;
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: found local bound port is %x\n",
+ __func__, pcb->local_addr.hvs_port);
+ break;
+ }
+ }
+
+ if (found_auto_bound_port == true) {
+ /* Found available port for auto bound, put on list */
+ __hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
+ /* Set VM service ID */
+ pcb->vm_srv_id = srv_id_template;
+ set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port);
+ /* Set host service ID and remote port */
+ pcb->host_srv_id = srv_id_template;
+ set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port);
+ hvs_addr_set(&pcb->remote_addr, raddr->hvs_port);
+
+ /* Change the socket state to SS_ISCONNECTING */
+ soisconnecting(so);
+ } else {
+ HVSOCK_DBG(HVSOCK_DBG_ERR,
+ "%s: No local port available for auto bound\n",
+ __func__);
+ error = EADDRINUSE;
+ }
+
+ HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is ");
+ hvsock_print_guid(&pcb->vm_srv_id);
+ HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is ");
+ hvsock_print_guid(&pcb->host_srv_id);
+
+out:
+ mtx_unlock(&hvs_trans_socks_mtx);
+
+ if (found_auto_bound_port == true)
+ vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id);
+
+ return (error);
+}
+
+int
+hvs_trans_disconnect(struct socket *so)
+{
+ struct hvs_pcb *pcb;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_disconnect called\n", __func__);
+
+ (void) hvs_trans_lock();
+ pcb = so2hvspcb(so);
+ if (pcb == NULL) {
+ hvs_trans_unlock();
+ return (EINVAL);
+ }
+
+ /* If socket is already disconnected, skip this */
+ if ((so->so_state & SS_ISDISCONNECTED) == 0)
+ soisdisconnecting(so);
+
+ hvs_trans_unlock();
+
+ return (0);
+}
+
+struct hvs_callback_arg {
+ struct uio *uio;
+ struct sockbuf *sb;
+};
+
+int
+hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr,
+ struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ struct hvs_pcb *pcb = so2hvspcb(so);
+ struct sockbuf *sb;
+ ssize_t orig_resid;
+ uint32_t canread, to_read;
+ int flags, error = 0;
+ struct hvs_callback_arg cbarg;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_soreceive called\n", __func__);
+
+ if (so->so_type != SOCK_STREAM)
+ return (EINVAL);
+ if (pcb == NULL)
+ return (EINVAL);
+
+ if (flagsp != NULL)
+ flags = *flagsp &~ MSG_EOR;
+ else
+ flags = 0;
+
+ if (flags & MSG_PEEK)
+ return (EOPNOTSUPP);
+
+ /* If no space to copy out anything */
+ if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ)
+ return (EINVAL);
+
+ orig_resid = uio->uio_resid;
+
+ /* Prevent other readers from entering the socket. */
+ error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
+ if (error) {
+ HVSOCK_DBG(HVSOCK_DBG_ERR,
+ "%s: soiolock returned error = %d\n", __func__, error);
+ return (error);
+ }
+
+ sb = &so->so_rcv;
+ SOCKBUF_LOCK(sb);
+
+ cbarg.uio = uio;
+ cbarg.sb = sb;
+ /*
+ * If the socket is closing, there might still be some data
+ * in rx br to read. However we need to make sure
+ * the channel is still open.
+ */
+ if ((sb->sb_state & SBS_CANTRCVMORE) &&
+ (so->so_state & SS_ISDISCONNECTED)) {
+ /* Other thread already closed the channel */
+ error = EPIPE;
+ goto out;
+ }
+
+ while (true) {
+ while (uio->uio_resid > 0 &&
+ (canread = hvsock_canread_check(pcb)) > 0) {
+ to_read = MIN(canread, uio->uio_resid);
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: to_read = %u, skip = %u\n", __func__, to_read,
+ (unsigned int)(sizeof(struct hvs_pkt_header) +
+ pcb->recv_data_off));
+
+ error = vmbus_chan_recv_peek_call(pcb->chan, to_read,
+ sizeof(struct hvs_pkt_header) + pcb->recv_data_off,
+ hvsock_br_callback, (void *)&cbarg);
+ /*
+ * It is possible the socket is disconnected because
+ * we released lock in hvsock_br_callback. So we
+ * need to check the state to make sure it is not
+ * disconnected.
+ */
+ if (error || so->so_state & SS_ISDISCONNECTED) {
+ break;
+ }
+
+ pcb->recv_data_len -= to_read;
+ pcb->recv_data_off += to_read;
+ }
+
+ if (error)
+ break;
+
+ /* Abort if socket has reported problems. */
+ if (so->so_error) {
+ if (so->so_error == ESHUTDOWN &&
+ orig_resid > uio->uio_resid) {
+ /*
+ * Although we got a FIN, we also received
+ * some data in this round. Deliver it
+ * to the user.
+ */
+ error = 0;
+ } else {
+ if (so->so_error != ESHUTDOWN)
+ error = so->so_error;
+ }
+
+ break;
+ }
+
+ /* Cannot receive more. */
+ if (sb->sb_state & SBS_CANTRCVMORE)
+ break;
+
+ /* We are done if buffer has been filled */
+ if (uio->uio_resid == 0)
+ break;
+
+ if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid)
+ break;
+
+ /* Buffer ring is empty and we shall not block */
+ if ((so->so_state & SS_NBIO) ||
+ (flags & (MSG_DONTWAIT|MSG_NBIO))) {
+ if (orig_resid == uio->uio_resid) {
+ /* We have not read anything */
+ error = EAGAIN;
+ }
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: non blocked read return, error %d.\n",
+ __func__, error);
+ break;
+ }
+
+ /*
+ * Wait and block until (more) data comes in.
+ * Note: Drops the sockbuf lock during wait.
+ */
+ error = sbwait(sb);
+
+ if (error)
+ break;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: wake up from sbwait, read available is %u\n",
+ __func__, vmbus_chan_read_available(pcb->chan));
+ }
+
+out:
+ SOCKBUF_UNLOCK(sb);
+ SOCK_IO_RECV_UNLOCK(so);
+
+ /* We received a FIN in this call */
+ if (so->so_error == ESHUTDOWN) {
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ /* Send has already closed */
+ soisdisconnecting(so);
+ } else {
+ /* Just close the receive side */
+ socantrcvmore(so);
+ }
+ }
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: returning error = %d, so_error = %d\n",
+ __func__, error, so->so_error);
+
+ return (error);
+}
+
+int
+hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td)
+{
+ struct hvs_pcb *pcb = so2hvspcb(so);
+ struct sockbuf *sb;
+ ssize_t orig_resid;
+ uint32_t canwrite, to_write;
+ int error = 0;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n",
+ __func__, uio->uio_resid);
+
+ if (so->so_type != SOCK_STREAM)
+ return (EINVAL);
+ if (pcb == NULL)
+ return (EINVAL);
+
+ /* If nothing to send */
+ if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE)
+ return (EINVAL);
+
+ orig_resid = uio->uio_resid;
+
+ /* Prevent other writers from entering the socket. */
+ error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
+ if (error) {
+ HVSOCK_DBG(HVSOCK_DBG_ERR,
+ "%s: soiolocak returned error = %d\n", __func__, error);
+ return (error);
+ }
+
+ sb = &so->so_snd;
+ SOCKBUF_LOCK(sb);
+
+ if ((sb->sb_state & SBS_CANTSENDMORE) ||
+ so->so_error == ESHUTDOWN) {
+ error = EPIPE;
+ goto out;
+ }
+
+ while (uio->uio_resid > 0) {
+ canwrite = hvsock_canwrite_check(pcb);
+ if (canwrite == 0) {
+ /* We have sent some data */
+ if (orig_resid > uio->uio_resid)
+ break;
+ /*
+ * We have not sent any data and this is
+ * non-blocking I/O.
+ */
+ if (so->so_state & SS_NBIO ||
+ (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
+ error = EWOULDBLOCK;
+ break;
+ } else {
+ /*
+ * We are here because there is no space on
+ * send buffer ring. Signal the other side
+ * to read and free more space.
+ * Sleep and wait until space is available to send.
+ * Note: Drops the sockbuf lock during wait.
+ */
+ error = sbwait(sb);
+
+ if (error)
+ break;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: wake up from sbwait, space avail on "
+ "tx ring is %u\n",
+ __func__,
+ vmbus_chan_write_available(pcb->chan));
+
+ continue;
+ }
+ }
+ to_write = MIN(canwrite, uio->uio_resid);
+ to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: canwrite is %u, to_write = %u\n", __func__,
+ canwrite, to_write);
+ error = hvsock_send_data(pcb->chan, uio, to_write, sb);
+
+ if (error)
+ break;
+ }
+
+out:
+ SOCKBUF_UNLOCK(sb);
+ SOCK_IO_SEND_UNLOCK(so);
+
+ return (error);
+}
+
+int
+hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam)
+{
+ struct hvs_pcb *pcb = so2hvspcb(so);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__);
+
+ if (pcb == NULL)
+ return (EINVAL);
+
+ *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT);
+
+ return ((*nam == NULL)? ENOMEM : 0);
+}
+
+int
+hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam)
+{
+ struct hvs_pcb *pcb = so2hvspcb(so);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__);
+
+ if (pcb == NULL)
+ return (EINVAL);
+
+ *nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT);
+
+ return ((*nam == NULL)? ENOMEM : 0);
+}
+
+void
+hvs_trans_close(struct socket *so)
+{
+ struct hvs_pcb *pcb;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_close called\n", __func__);
+
+ (void) hvs_trans_lock();
+ pcb = so2hvspcb(so);
+ if (!pcb) {
+ hvs_trans_unlock();
+ return;
+ }
+
+ if (so->so_state & SS_ISCONNECTED) {
+ /* Send a FIN to peer */
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: hvs_trans_close sending a FIN to host\n", __func__);
+ (void) hvsock_send_data(pcb->chan, NULL, 0, NULL);
+ }
+
+ if (so->so_state &
+ (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
+ soisdisconnected(so);
+
+ pcb->chan = NULL;
+ pcb->so = NULL;
+
+ if (SOLISTENING(so)) {
+ mtx_lock(&hvs_trans_socks_mtx);
+ /* Remove from bound list */
+ __hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
+ mtx_unlock(&hvs_trans_socks_mtx);
+ }
+
+ hvs_trans_unlock();
+
+ return;
+}
+
+void
+hvs_trans_abort(struct socket *so)
+{
+ struct hvs_pcb *pcb = so2hvspcb(so);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_abort called\n", __func__);
+
+ (void) hvs_trans_lock();
+ if (pcb == NULL) {
+ hvs_trans_unlock();
+ return;
+ }
+
+ if (SOLISTENING(so)) {
+ mtx_lock(&hvs_trans_socks_mtx);
+ /* Remove from bound list */
+ __hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
+ mtx_unlock(&hvs_trans_socks_mtx);
+ }
+
+ if (so->so_state & SS_ISCONNECTED) {
+ (void) sodisconnect(so);
+ }
+ hvs_trans_unlock();
+
+ return;
+}
+
+int
+hvs_trans_shutdown(struct socket *so)
+{
+ struct hvs_pcb *pcb = so2hvspcb(so);
+ struct sockbuf *sb;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: HyperV Socket hvs_trans_shutdown called\n", __func__);
+
+ if (pcb == NULL)
+ return (EINVAL);
+
+ /*
+ * Only gets called when the shutdown method is SHUT_WR or
+ * SHUT_RDWR.
+ * When the method is SHUT_RD or SHUT_RDWR, the caller has
+ * already set SBS_CANTRCVMORE on the receive-side socket
+ * buffer.
+ */
+ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
+ /*
+ * SHUT_WR only case.
+ * Receive side is still open. Just close
+ * the send side.
+ */
+ socantsendmore(so);
+ } else {
+ /* SHUT_RDWR case */
+ if (so->so_state & SS_ISCONNECTED) {
+ /* Send a FIN to peer */
+ sb = &so->so_snd;
+ SOCKBUF_LOCK(sb);
+ (void) hvsock_send_data(pcb->chan, NULL, 0, sb);
+ SOCKBUF_UNLOCK(sb);
+
+ soisdisconnecting(so);
+ }
+ }
+
+ return (0);
+}
+
+/* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is
+ * <port> (see struct sockaddr_hvs).
+ *
+ * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
+ * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-
+ * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with
+ * the below sockaddr:
+ *
+ * struct SOCKADDR_HV
+ * {
+ * ADDRESS_FAMILY Family;
+ * USHORT Reserved;
+ * GUID VmId;
+ * GUID ServiceId;
+ * };
+ * Note: the VmID is not used by the FreeBSD VM and is not actually transmitted
+ * via VMBus, because the host and the VM can easily identify each other.
+ * Though the VmID is useful on the host, especially in the case of Windows
+ * containers, the FreeBSD VM doesn't need it at all.
+ *
+ * To be compatible with similar infrastructure in Linux VMs, we have
+ * to limit the available GUID space of SOCKADDR_HV so that we can create
+ * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID.
+ * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
+ *
+ ****************************************************************************
+ * The only valid Service GUIDs, from the perspectives of both the host and *
+ * FreeBSD VM, that can be connected by the other end, must conform to this *
+ * format: <port>-facb-11e6-bd58-64006a7986d3. *
+ ****************************************************************************
+ *
+ * When we write apps on the host to connect(), the GUID ServiceID is used.
+ * When we write apps in FreeBSD VM to connect(), we only need to specify the
+ * port and the driver will form the GUID and use that to request the host.
+ *
+ * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
+ * auto-generated remote port for a connect request initiated by the host's
+ * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
+ * FreeBSD guest.
+ */
+
+/*
+ * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
+ * restrict the HyperV socket ring buffer size to six 4K pages. Newer
+ * HyperV hosts don't have this limit.
+ */
+#define HVS_RINGBUF_RCV_SIZE (PAGE_SIZE * 6)
+#define HVS_RINGBUF_SND_SIZE (PAGE_SIZE * 6)
+#define HVS_RINGBUF_MAX_SIZE (PAGE_SIZE * 64)
+
+struct hvsock_sc {
+ device_t dev;
+ struct hvs_pcb *pcb;
+ struct vmbus_channel *channel;
+};
+
+static bool
+hvsock_chan_readable(struct vmbus_channel *chan)
+{
+ uint32_t readable = vmbus_chan_read_available(chan);
+
+ return (readable >= HVSOCK_PKT_LEN(0));
+}
+
+static void
+hvsock_chan_cb(struct vmbus_channel *chan, void *context)
+{
+ struct hvs_pcb *pcb = (struct hvs_pcb *) context;
+ struct socket *so;
+ uint32_t canwrite;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: host send us a wakeup on rb data, pcb = %p\n",
+ __func__, pcb);
+
+ /*
+ * Check if the socket is still attached and valid.
+ * Here we know channel is still open. Need to make
+ * sure the socket has not been closed or freed.
+ */
+ (void) hvs_trans_lock();
+ so = hsvpcb2so(pcb);
+
+ if (pcb->chan != NULL && so != NULL) {
+ /*
+ * Wake up reader if there are data to read.
+ */
+ SOCKBUF_LOCK(&(so)->so_rcv);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: read available = %u\n", __func__,
+ vmbus_chan_read_available(pcb->chan));
+
+ if (hvsock_chan_readable(pcb->chan))
+ sorwakeup_locked(so);
+ else
+ SOCKBUF_UNLOCK(&(so)->so_rcv);
+
+ /*
+ * Wake up sender if space becomes available to write.
+ */
+ SOCKBUF_LOCK(&(so)->so_snd);
+ canwrite = hvsock_canwrite_check(pcb);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: canwrite = %u\n", __func__, canwrite);
+
+ if (canwrite > 0) {
+ sowwakeup_locked(so);
+ } else {
+ SOCKBUF_UNLOCK(&(so)->so_snd);
+ }
+ }
+
+ hvs_trans_unlock();
+
+ return;
+}
+
+static int
+hvsock_br_callback(void *datap, int cplen, void *cbarg)
+{
+ struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg;
+ struct uio *uio = arg->uio;
+ struct sockbuf *sb = arg->sb;
+ int error = 0;
+
+ if (cbarg == NULL || datap == NULL)
+ return (EINVAL);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, "
+ "datap = %p\n",
+ __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br",
+ uio->uio_resid, cplen, datap);
+
+ if (sb)
+ SOCKBUF_UNLOCK(sb);
+
+ error = uiomove(datap, cplen, uio);
+
+ if (sb)
+ SOCKBUF_LOCK(sb);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: after uiomove, uio_resid = %zd, error = %d\n",
+ __func__, uio->uio_resid, error);
+
+ return (error);
+}
+
+static int
+hvsock_send_data(struct vmbus_channel *chan, struct uio *uio,
+ uint32_t to_write, struct sockbuf *sb)
+{
+ struct hvs_pkt_header hvs_pkt;
+ int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0;
+ uint64_t pad = 0;
+ struct iovec iov[3];
+ struct hvs_callback_arg cbarg;
+
+ if (chan == NULL)
+ return (ENOTCONN);
+
+ hlen = sizeof(struct vmbus_chanpkt_hdr);
+ hvs_pkthlen = sizeof(struct hvs_pkt_header);
+ hvs_pktlen = hvs_pkthlen + to_write;
+ pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, "
+ "pad_pktlen = %u, data_len = %u\n",
+ __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write);
+
+ hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND;
+ hvs_pkt.chan_pkt_hdr.cph_flags = 0;
+ VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen);
+ VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen);
+ hvs_pkt.chan_pkt_hdr.cph_xactid = 0;
+
+ hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1;
+ hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write;
+
+ cbarg.uio = uio;
+ cbarg.sb = sb;
+
+ if (uio && to_write > 0) {
+ iov[0].iov_base = &hvs_pkt;
+ iov[0].iov_len = hvs_pkthlen;
+ iov[1].iov_base = NULL;
+ iov[1].iov_len = to_write;
+ iov[2].iov_base = &pad;
+ iov[2].iov_len = pad_pktlen - hvs_pktlen;
+
+ error = vmbus_chan_iov_send(chan, iov, 3,
+ hvsock_br_callback, &cbarg);
+ } else {
+ if (to_write == 0) {
+ iov[0].iov_base = &hvs_pkt;
+ iov[0].iov_len = hvs_pkthlen;
+ iov[1].iov_base = &pad;
+ iov[1].iov_len = pad_pktlen - hvs_pktlen;
+ error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL);
+ }
+ }
+
+ if (error) {
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: error = %d\n", __func__, error);
+ }
+
+ return (error);
+}
+
+/*
+ * Check whether we have data to read on the current ring buffer.
+ * If not, advance the ring buffer read index to the next packet,
+ * and update recv_data_len and recv_data_off to their new values.
+ * Return the number of bytes that can be read.
+ */
+static uint32_t
+hvsock_canread_check(struct hvs_pcb *pcb)
+{
+ uint32_t advance;
+ uint32_t tlen, hlen, dlen;
+ uint32_t bytes_canread = 0;
+ int error;
+
+ if (pcb == NULL || pcb->chan == NULL) {
+ pcb->so->so_error = EIO;
+ return (0);
+ }
+
+ /* Still have data not read yet on current packet */
+ if (pcb->recv_data_len > 0)
+ return (pcb->recv_data_len);
+
+ if (pcb->rb_init)
+ advance =
+ VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
+ else
+ advance = 0;
+
+ bytes_canread = vmbus_chan_read_available(pcb->chan);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: bytes_canread on br = %u, advance = %u\n",
+ __func__, bytes_canread, advance);
+
+ if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) {
+ /*
+ * Nothing to read. Need to advance the rindex before
+ * calling sbwait, so host knows to wake us up when data
+ * is available to read on rb.
+ */
+ error = vmbus_chan_recv_idxadv(pcb->chan, advance);
+ if (error) {
+ HVSOCK_DBG(HVSOCK_DBG_ERR,
+ "%s: after calling vmbus_chan_recv_idxadv, "
+ "got error = %d\n", __func__, error);
+ return (0);
+ } else {
+ pcb->rb_init = false;
+ pcb->recv_data_len = 0;
+ pcb->recv_data_off = 0;
+ bytes_canread = vmbus_chan_read_available(pcb->chan);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: advanced %u bytes, "
+ " bytes_canread on br now = %u\n",
+ __func__, advance, bytes_canread);
+
+ if (bytes_canread == 0)
+ return (0);
+ else
+ advance = 0;
+ }
+ }
+
+ if (bytes_canread <
+ advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t)))
+ return (0);
+
+ error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt,
+ sizeof(struct hvs_pkt_header), advance);
+
+ /* Don't have anything to read */
+ if (error) {
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: after calling vmbus_chan_recv_peek, got error = %d\n",
+ __func__, error);
+ return (0);
+ }
+
+ /*
+ * We just read in a new packet header. Do some sanity checks.
+ */
+ tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
+ hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen);
+ dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size;
+ if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) ||
+ __predict_false(hlen > tlen) ||
+ __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) {
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "invalid tlen(%u), hlen(%u) or dlen(%u)\n",
+ tlen, hlen, dlen);
+ pcb->so->so_error = EIO;
+ return (0);
+ }
+ if (pcb->rb_init == false)
+ pcb->rb_init = true;
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n",
+ tlen, hlen, dlen);
+
+ /* The other side has sent a close FIN */
+ if (dlen == 0) {
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: Received FIN from other side\n", __func__);
+ /* Inform the caller by setting so_error to ESHUTDOWN */
+ pcb->so->so_error = ESHUTDOWN;
+ }
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: canread on receive ring is %u \n", __func__, dlen);
+
+ pcb->recv_data_len = dlen;
+ pcb->recv_data_off = 0;
+
+ return (pcb->recv_data_len);
+}
+
+static uint32_t
+hvsock_canwrite_check(struct hvs_pcb *pcb)
+{
+ uint32_t writeable;
+ uint32_t ret;
+
+ if (pcb == NULL || pcb->chan == NULL)
+ return (0);
+
+ writeable = vmbus_chan_write_available(pcb->chan);
+
+ /*
+ * We must always reserve a 0-length-payload packet for the FIN.
+ */
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: writeable is %u, should be greater than %ju\n",
+ __func__, writeable,
+ (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)));
+
+ if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) {
+ /*
+ * The Tx ring seems full.
+ */
+ return (0);
+ }
+
+ ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: available size is %u\n", __func__, rounddown2(ret, 8));
+
+ return (rounddown2(ret, 8));
+}
+
+static void
+hvsock_set_chan_pending_send_size(struct vmbus_channel *chan)
+{
+ vmbus_chan_set_pending_send_size(chan,
+ HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ));
+}
+
+static int
+hvsock_open_channel(struct vmbus_channel *chan, struct socket *so)
+{
+ unsigned int rcvbuf, sndbuf;
+ struct hvs_pcb *pcb = so2hvspcb(so);
+ int ret;
+
+ if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) {
+ sndbuf = HVS_RINGBUF_SND_SIZE;
+ rcvbuf = HVS_RINGBUF_RCV_SIZE;
+ } else {
+ sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE);
+ sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE);
+ sndbuf = rounddown2(sndbuf, PAGE_SIZE);
+ rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE);
+ rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE);
+ rcvbuf = rounddown2(rcvbuf, PAGE_SIZE);
+ }
+
+ /*
+ * We can only read as much data from the ring buffer as the
+ * user asked for, so turn off batched reading.
+ */
+ vmbus_chan_set_readbatch(chan, false);
+
+ ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0,
+ hvsock_chan_cb, pcb);
+
+ if (ret != 0) {
+ HVSOCK_DBG(HVSOCK_DBG_ERR,
+ "%s: failed to open hvsock channel, sndbuf = %u, "
+ "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
+ } else {
+ HVSOCK_DBG(HVSOCK_DBG_INFO,
+ "%s: hvsock channel opened, sndbuf = %u, i"
+ "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
+ /*
+ * Set the pending send size so that we receive wakeup
+ * signals from the host when there is enough space on
+ * the rx buffer ring to write.
+ */
+ hvsock_set_chan_pending_send_size(chan);
+ }
+
+ return ret;
+}
+
+/*
+ * Guest is listening passively on the socket. Open the channel and
+ * create a new socket for the connection.
+ */
+static void
+hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so,
+ struct hvsock_sc *sc)
+{
+ struct socket *new_so;
+ struct hvs_pcb *new_pcb, *pcb;
+ int error;
+
+ /* Do nothing if socket is not listening */
+ if (!SOLISTENING(so)) {
+ HVSOCK_DBG(HVSOCK_DBG_ERR,
+ "%s: socket is not a listening one\n", __func__);
+ return;
+ }
+
+ /*
+ * Create a new socket. This will call pru_attach to complete
+ * the socket initialization and put the new socket onto
+ * listening socket's sol_incomp list, waiting to be promoted
+ * to sol_comp list.
+ * The new socket created has ref count 0. There is no other
+ * thread that changes the state of this new one at the
+ * moment, so we don't need to hold its lock while opening
+ * channel and filling out its pcb information.
+ */
+ new_so = sonewconn(so, 0);
+ if (!new_so)
+ HVSOCK_DBG(HVSOCK_DBG_ERR,
+ "%s: creating new socket failed\n", __func__);
+
+ /*
+ * Now open the vmbus channel. If it fails, the socket will be
+ * on the listening socket's sol_incomp queue until it is
+ * replaced and aborted.
+ */
+ error = hvsock_open_channel(chan, new_so);
+ if (error) {
+ new_so->so_error = error;
+ return;
+ }
+
+ pcb = so->so_pcb;
+ new_pcb = new_so->so_pcb;
+
+ hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port);
+ /* Remote port is unknown to the guest in this type of connection */
+ hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN);
+ new_pcb->chan = chan;
+ new_pcb->recv_data_len = 0;
+ new_pcb->recv_data_off = 0;
+ new_pcb->rb_init = false;
+
+ new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan);
+ new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan);
+
+ hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED);
+
+ sc->pcb = new_pcb;
+
+ /*
+ * Change the socket state to SS_ISCONNECTED. This will promote
+ * the socket to sol_comp queue and wake up the thread which
+ * is accepting connection.
+ */
+ soisconnected(new_so);
+}
+
+
+/*
+ * Guest is actively connecting to host.
+ */
+static void
+hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so)
+{
+ struct hvs_pcb *pcb;
+ int error;
+
+ error = hvsock_open_channel(chan, so);
+ if (error) {
+ so->so_error = error;
+ return;
+ }
+
+ pcb = so->so_pcb;
+ pcb->chan = chan;
+ pcb->recv_data_len = 0;
+ pcb->recv_data_off = 0;
+ pcb->rb_init = false;
+
+ mtx_lock(&hvs_trans_socks_mtx);
+ __hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
+ __hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED);
+ mtx_unlock(&hvs_trans_socks_mtx);
+
+ /*
+ * Change the socket state to SS_ISCONNECTED. This will wake up
+ * the thread sleeping in connect call.
+ */
+ soisconnected(so);
+}
+
+static void
+hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc)
+{
+ struct hyperv_guid *inst_guid, *type_guid;
+ bool conn_from_host;
+ struct sockaddr_hvs addr;
+ struct socket *so;
+ struct hvs_pcb *pcb;
+
+ type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan);
+ inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan);
+ conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan);
+
+ HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is ");
+ hvsock_print_guid(type_guid);
+ HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is ");
+ hvsock_print_guid(inst_guid);
+ HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n",
+ (conn_from_host == true ) ? "from" : "to");
+
+ /*
+ * The listening port should be in [0, MAX_LISTEN_PORT]
+ */
+ if (!is_valid_srv_id(type_guid))
+ return;
+
+ /*
+ * There should already be a bound socket created, no matter
+ * whether it is a passive or an active connection.
+ * For host initiated connection (passive on guest side),
+ * the type_guid contains the port which guest is bound and
+ * listening.
+ * For the guest initiated connection (active on guest side),
+ * the inst_guid contains the port that guest has auto bound
+ * to.
+ */
+ hvs_addr_init(&addr, conn_from_host ? type_guid : inst_guid);
+ so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND);
+ if (!so) {
+ HVSOCK_DBG(HVSOCK_DBG_ERR,
+ "%s: no bound socket found for port %u\n",
+ __func__, addr.hvs_port);
+ return;
+ }
+
+ if (conn_from_host) {
+ hvsock_open_conn_passive(chan, so, sc);
+ } else {
+ (void) hvs_trans_lock();
+ pcb = so->so_pcb;
+ if (pcb && pcb->so) {
+ sc->pcb = so2hvspcb(so);
+ hvsock_open_conn_active(chan, so);
+ } else {
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "%s: channel detached before open\n", __func__);
+ }
+ hvs_trans_unlock();
+ }
+
+}
+
+static int
+hvsock_probe(device_t dev)
+{
+ struct vmbus_channel *channel = vmbus_get_channel(dev);
+
+ if (!channel || !vmbus_chan_is_hvs(channel)) {
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "hvsock_probe called but not a hvsock channel id %u\n",
+ vmbus_chan_id(channel));
+
+ return ENXIO;
+ } else {
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "hvsock_probe got a hvsock channel id %u\n",
+ vmbus_chan_id(channel));
+
+ return BUS_PROBE_DEFAULT;
+ }
+}
+
+static int
+hvsock_attach(device_t dev)
+{
+ struct vmbus_channel *channel = vmbus_get_channel(dev);
+ struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n");
+
+ hvsock_open_connection(channel, sc);
+
+ /*
+ * Always return success. On error the host will rescind the device
+ * in 30 seconds and we can do cleanup at that time in
+ * vmbus_chan_msgproc_chrescind().
+ */
+ return (0);
+}
+
+static int
+hvsock_detach(device_t dev)
+{
+ struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
+ struct socket *so;
+ int retry;
+
+ if (bootverbose)
+ device_printf(dev, "hvsock_detach called.\n");
+
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n");
+
+ if (sc->pcb != NULL) {
+ (void) hvs_trans_lock();
+
+ so = hsvpcb2so(sc->pcb);
+ if (so) {
+ /* Close the connection */
+ if (so->so_state &
+ (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
+ soisdisconnected(so);
+ }
+
+ mtx_lock(&hvs_trans_socks_mtx);
+ __hvs_remove_pcb_from_list(sc->pcb,
+ HVS_LIST_BOUND | HVS_LIST_CONNECTED);
+ mtx_unlock(&hvs_trans_socks_mtx);
+
+ /*
+ * Close channel while no reader and sender are working
+ * on the buffer rings.
+ */
+ if (so) {
+ retry = 0;
+ while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) {
+ /*
+ * Someone is reading, rx br is busy
+ */
+ soisdisconnected(so);
+ DELAY(500);
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "waiting for rx reader to exit, "
+ "retry = %d\n", retry++);
+ }
+ retry = 0;
+ while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) {
+ /*
+ * Someone is sending, tx br is busy
+ */
+ soisdisconnected(so);
+ DELAY(500);
+ HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
+ "waiting for tx sender to exit, "
+ "retry = %d\n", retry++);
+ }
+ }
+
+
+ bzero(sc->pcb, sizeof(struct hvs_pcb));
+ free(sc->pcb, M_HVSOCK);
+ sc->pcb = NULL;
+
+ if (so) {
+ SOCK_IO_RECV_UNLOCK(so);
+ SOCK_IO_SEND_UNLOCK(so);
+ so->so_pcb = NULL;
+ }
+
+ hvs_trans_unlock();
+ }
+
+ vmbus_chan_close(vmbus_get_channel(dev));
+
+ return (0);
+}
+
+static device_method_t hvsock_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, hvsock_probe),
+ DEVMETHOD(device_attach, hvsock_attach),
+ DEVMETHOD(device_detach, hvsock_detach),
+ DEVMETHOD_END
+};
+
+static driver_t hvsock_driver = {
+ "hv_sock",
+ hvsock_methods,
+ sizeof(struct hvsock_sc)
+};
+
+static devclass_t hvsock_devclass;
+
+DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, NULL);
+MODULE_VERSION(hvsock, 1);
+MODULE_DEPEND(hvsock, vmbus, 1, 1, 1);
diff --git a/sys/dev/hyperv/hvsock/hv_sock.h b/sys/dev/hyperv/hvsock/hv_sock.h
new file mode 100644
index 000000000000..877425968345
--- /dev/null
+++ b/sys/dev/hyperv/hvsock/hv_sock.h
@@ -0,0 +1,122 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HVSOCK_H
+#define _HVSOCK_H
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/queue.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+
+/*
+ * HyperV Socket Protocols
+ */
+#define HYPERV_SOCK_PROTO_TRANS 1 /* Transport protocol */
+
+#define HVADDR_PORT_ANY -1U
+#define HVADDR_PORT_UNKNOWN -1U
+
+#define HVS_LIST_BOUND 0x01
+#define HVS_LIST_CONNECTED 0x02
+#define HVS_LIST_ALL (HVS_LIST_BOUND | HVS_LIST_CONNECTED)
+
+struct sockaddr_hvs {
+ unsigned char sa_len;
+ sa_family_t sa_family;
+ unsigned int hvs_port;
+ unsigned char hvs_zero[sizeof(struct sockaddr) -
+ sizeof(sa_family_t) -
+ sizeof(unsigned char) -
+ sizeof(unsigned int)];
+};
+
+struct vmpipe_proto_header {
+ uint32_t vmpipe_pkt_type;
+ uint32_t vmpipe_data_size;
+} __packed;
+
+struct hvs_pkt_header {
+ struct vmbus_chanpkt_hdr chan_pkt_hdr;
+ struct vmpipe_proto_header vmpipe_pkt_hdr;
+} __packed;
+
+struct hvs_pcb {
+ struct socket *so; /* Pointer to socket */
+ struct sockaddr_hvs local_addr;
+ struct sockaddr_hvs remote_addr;
+
+ struct hyperv_guid vm_srv_id;
+ struct hyperv_guid host_srv_id;
+
+ struct vmbus_channel *chan;
+ /* Current packet header on rx ring */
+ struct hvs_pkt_header hvs_pkt;
+ /* Available data in receive br in current packet */
+ uint32_t recv_data_len;
+ /* offset in the packet */
+ uint32_t recv_data_off;
+ bool rb_init;
+ /* Linked lists for global bound and connected sockets */
+ LIST_ENTRY(hvs_pcb) bound_next;
+ LIST_ENTRY(hvs_pcb) connected_next;
+};
+
+#define so2hvspcb(so) \
+ ((struct hvs_pcb *)((so)->so_pcb))
+#define hsvpcb2so(hvspcb) \
+ ((struct socket *)((hvspcb)->so))
+
+void hvs_addr_init(struct sockaddr_hvs *, const struct hyperv_guid *);
+void hvs_trans_init(void);
+void hvs_trans_close(struct socket *);
+void hvs_trans_detach(struct socket *);
+void hvs_trans_abort(struct socket *);
+int hvs_trans_attach(struct socket *, int, struct thread *);
+int hvs_trans_bind(struct socket *, struct sockaddr *, struct thread *);
+int hvs_trans_listen(struct socket *, int, struct thread *);
+int hvs_trans_accept(struct socket *, struct sockaddr **);
+int hvs_trans_connect(struct socket *,
+ struct sockaddr *, struct thread *);
+int hvs_trans_peeraddr(struct socket *, struct sockaddr **);
+int hvs_trans_sockaddr(struct socket *, struct sockaddr **);
+int hvs_trans_soreceive(struct socket *, struct sockaddr **,
+ struct uio *, struct mbuf **, struct mbuf **, int *);
+int hvs_trans_sosend(struct socket *, struct sockaddr *, struct uio *,
+ struct mbuf *, struct mbuf *, int, struct thread *);
+int hvs_trans_disconnect(struct socket *);
+int hvs_trans_shutdown(struct socket *);
+
+int hvs_trans_lock(void);
+void hvs_trans_unlock(void);
+
+void hvs_remove_socket_from_list(struct socket *, unsigned char);
+#endif /* _HVSOCK_H */
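The long comment in hv_sock.c above (just after hvs_trans_shutdown()) describes the addressing model: a guest application supplies only a 32-bit port, and the driver expands it into a <port>-facb-11e6-bd58-64006a7986d3 service GUID when talking to the host. The sketch below is a hypothetical guest-side userland client illustrating that model; it is not part of this commit, it assumes AF_HYPERV and struct sockaddr_hvs (mirroring the definitions in hv_sock.h above) are visible to the application, and port 0x808 is an arbitrary example.

/*
 * Hypothetical guest-side client (illustration only, not part of this
 * commit): connect to the host service whose GUID is
 * 00000808-facb-11e6-bd58-64006a7986d3 by specifying only port 0x808.
 * Assumes AF_HYPERV and struct sockaddr_hvs are visible to userland.
 */
#include <sys/socket.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_hvs addr;
	int fd;

	/* Protocol 0 selects the transport protocol (see hvs_trans_attach()). */
	fd = socket(AF_HYPERV, SOCK_STREAM, 0);
	if (fd < 0)
		err(1, "socket");

	memset(&addr, 0, sizeof(addr));
	addr.sa_len = sizeof(addr);
	addr.sa_family = AF_HYPERV;
	addr.hvs_port = 0x808;	/* becomes the first 4 bytes of the service GUID */

	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) == -1)
		err(1, "connect");

	(void)write(fd, "hello", 5);
	close(fd);
	return (0);
}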
diff --git a/sys/dev/hyperv/include/hyperv.h b/sys/dev/hyperv/include/hyperv.h
new file mode 100644
index 000000000000..8b985b2f31a7
--- /dev/null
+++ b/sys/dev/hyperv/include/hyperv.h
@@ -0,0 +1,104 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HYPERV_H_
+#define _HYPERV_H_
+
+#ifdef _KERNEL
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#define MSR_HV_TIME_REF_COUNT 0x40000020
+
+#define CPUID_HV_MSR_TIME_REFCNT 0x0002 /* MSR_HV_TIME_REF_COUNT */
+#define CPUID_HV_MSR_SYNIC 0x0004 /* MSRs for SynIC */
+#define CPUID_HV_MSR_SYNTIMER 0x0008 /* MSRs for SynTimer */
+#define CPUID_HV_MSR_APIC 0x0010 /* MSR_HV_{EOI,ICR,TPR} */
+#define CPUID_HV_MSR_HYPERCALL 0x0020 /* MSR_HV_GUEST_OS_ID
+ * MSR_HV_HYPERCALL */
+#define CPUID_HV_MSR_VP_INDEX 0x0040 /* MSR_HV_VP_INDEX */
+#define CPUID_HV_MSR_REFERENCE_TSC 0x0200 /* MSR_HV_REFERENCE_TSC */
+#define CPUID_HV_MSR_GUEST_IDLE 0x0400 /* MSR_HV_GUEST_IDLE */
+
+#ifndef NANOSEC
+#define NANOSEC 1000000000ULL
+#endif
+#define HYPERV_TIMER_NS_FACTOR 100ULL
+#define HYPERV_TIMER_FREQ (NANOSEC / HYPERV_TIMER_NS_FACTOR)
+
+#endif /* _KERNEL */
+
+#define HYPERV_REFTSC_DEVNAME "hv_tsc"
+
+/*
+ * Hyper-V Reference TSC
+ */
+struct hyperv_reftsc {
+ volatile uint32_t tsc_seq;
+ volatile uint32_t tsc_rsvd1;
+ volatile uint64_t tsc_scale;
+ volatile int64_t tsc_ofs;
+} __packed __aligned(PAGE_SIZE);
+#ifdef CTASSERT
+CTASSERT(sizeof(struct hyperv_reftsc) == PAGE_SIZE);
+#endif
+
+#ifdef _KERNEL
+
+struct hyperv_guid {
+ uint8_t hv_guid[16];
+} __packed;
+
+#define HYPERV_GUID_STRLEN 40
+
+typedef uint64_t (*hyperv_tc64_t)(void);
+
+int hyperv_guid2str(const struct hyperv_guid *, char *,
+ size_t);
+
+/*
+ * hyperv_tc64 may be NULL if there is no suitable Hyper-V
+ * specific timecounter.
+ */
+extern hyperv_tc64_t hyperv_tc64;
+extern u_int hyperv_features; /* CPUID_HV_MSR_ */
+extern u_int hyperv_ver_major;
+
+/*
+ * Vmbus version after negotiation with host.
+ */
+extern uint32_t vmbus_current_version;
+
+#endif /* _KERNEL */
+
+#endif /* _HYPERV_H_ */
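For context on struct hyperv_reftsc above: the hypervisor publishes a scale/offset pair that converts the guest's TSC into the partition reference time in 100ns units (HYPERV_TIMER_FREQ ticks per second), guarded by the tsc_seq sequence counter. The reader below is an illustrative sketch of that algorithm as described in the Hyper-V TLFS, not code from this commit; it assumes an amd64 build (rdtsc() and unsigned __int128) and omits the compiler barriers and the MSR_HV_TIME_REF_COUNT fallback a production implementation would add.

/*
 * Illustrative reference-TSC reader (sketch only): returns the partition
 * reference time in 100ns units, or 0 if the page is invalid and the
 * caller must fall back to reading MSR_HV_TIME_REF_COUNT.
 */
#include <sys/types.h>
#include <machine/cpufunc.h>	/* rdtsc() */

static uint64_t
hyperv_reftsc_read(const struct hyperv_reftsc *tsc_ref)
{
	uint32_t seq;
	uint64_t scale, tsc;
	int64_t ofs;

	do {
		seq = tsc_ref->tsc_seq;
		if (seq == 0)
			return (0);	/* page invalid; use the MSR instead */
		scale = tsc_ref->tsc_scale;
		ofs = tsc_ref->tsc_ofs;
		tsc = rdtsc();
		/* Re-read the sequence to detect a concurrent update. */
	} while (tsc_ref->tsc_seq != seq);

	return ((uint64_t)(((unsigned __int128)tsc * scale) >> 64) + ofs);
}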
diff --git a/sys/dev/hyperv/include/hyperv_busdma.h b/sys/dev/hyperv/include/hyperv_busdma.h
new file mode 100644
index 000000000000..ff01b3e27a95
--- /dev/null
+++ b/sys/dev/hyperv/include/hyperv_busdma.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HYPERV_BUSDMA_H_
+#define _HYPERV_BUSDMA_H_
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <machine/bus.h>
+
+struct hyperv_dma {
+ bus_addr_t hv_paddr;
+ bus_dma_tag_t hv_dtag;
+ bus_dmamap_t hv_dmap;
+};
+
+void hyperv_dma_map_paddr(void *arg, bus_dma_segment_t *segs,
+ int nseg, int error);
+void *hyperv_dmamem_alloc(bus_dma_tag_t parent_dtag,
+ bus_size_t alignment, bus_addr_t boundary, bus_size_t size,
+ struct hyperv_dma *dma, int flags);
+void hyperv_dmamem_free(struct hyperv_dma *dma, void *ptr);
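+
+/*
+ * Usage sketch: a hypothetical wrapper showing a typical call into
+ * hyperv_dmamem_alloc(); it allocates one zeroed, page-aligned page of
+ * DMA-able memory. On success the physical address is left in
+ * dma->hv_paddr and the memory is later released with
+ * hyperv_dmamem_free(). The flag choice here is an assumption, not a
+ * requirement of the API.
+ */
+static __inline void *
+hyperv_dmamem_alloc_page(bus_dma_tag_t parent_dtag, struct hyperv_dma *dma)
+{
+ return (hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, PAGE_SIZE,
+     dma, BUS_DMA_WAITOK | BUS_DMA_ZERO));
+}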
+
+#endif /* !_HYPERV_BUSDMA_H_ */
diff --git a/sys/dev/hyperv/include/vmbus.h b/sys/dev/hyperv/include/vmbus.h
new file mode 100644
index 000000000000..76c1ad632765
--- /dev/null
+++ b/sys/dev/hyperv/include/vmbus.h
@@ -0,0 +1,261 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_H_
+#define _VMBUS_H_
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/_iovec.h>
+
+/*
+ * The VMBUS version is a 32 bit value: the upper 16 bits hold the
+ * major number and the lower 16 bits hold the minor number.
+ *
+ * 0.13 -- Windows Server 2008
+ * 1.1 -- Windows 7
+ * 2.4 -- Windows 8
+ * 3.0 -- Windows 8.1
+ * 4.0 -- Windows 10
+ * 5.0 -- Newer Windows 10
+ */
+#define VMBUS_VERSION_WS2008 ((0 << 16) | (13))
+#define VMBUS_VERSION_WIN7 ((1 << 16) | (1))
+#define VMBUS_VERSION_WIN8 ((2 << 16) | (4))
+#define VMBUS_VERSION_WIN8_1 ((3 << 16) | (0))
+#define VMBUS_VERSION_WIN10 ((4 << 16) | (0))
+#define VMBUS_VERSION_WIN10_V5 ((5 << 16) | (0))
+
+#define VMBUS_VERSION_MAJOR(ver) (((uint32_t)(ver)) >> 16)
+#define VMBUS_VERSION_MINOR(ver) (((uint32_t)(ver)) & 0xffff)
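+
+/*
+ * For example, with the macros above VMBUS_VERSION_WIN8 decomposes
+ * into a major number of 2 and a minor number of 4.
+ */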
+
+#define VMBUS_CHAN_POLLHZ_MIN 100 /* 10ms interval */
+#define VMBUS_CHAN_POLLHZ_MAX 1000000 /* 1us interval */
+
+/*
+ * Guest physical address (GPA) structures.
+ */
+struct vmbus_gpa_range {
+ uint32_t gpa_len;
+ uint32_t gpa_ofs;
+ uint64_t gpa_page[0];
+} __packed;
+
+/* Same layout as a vmbus_gpa_range whose gpa_page[] has one entry. */
+struct vmbus_gpa {
+ uint32_t gpa_len;
+ uint32_t gpa_ofs;
+ uint64_t gpa_page;
+} __packed;
+
+#define VMBUS_CHANPKT_SIZE_SHIFT 3
+
+#define VMBUS_CHANPKT_GETLEN(pktlen) \
+ (((int)(pktlen)) << VMBUS_CHANPKT_SIZE_SHIFT)
+
+struct vmbus_chanpkt_hdr {
+ uint16_t cph_type; /* VMBUS_CHANPKT_TYPE_ */
+ uint16_t cph_hlen; /* header len, in 8 bytes */
+ uint16_t cph_tlen; /* total len, in 8 bytes */
+ uint16_t cph_flags; /* VMBUS_CHANPKT_FLAG_ */
+ uint64_t cph_xactid;
+} __packed;
+
+#define VMBUS_CHANPKT_TYPE_INBAND 0x0006
+#define VMBUS_CHANPKT_TYPE_RXBUF 0x0007
+#define VMBUS_CHANPKT_TYPE_GPA 0x0009
+#define VMBUS_CHANPKT_TYPE_COMP 0x000b
+
+#define VMBUS_CHANPKT_FLAG_NONE 0
+#define VMBUS_CHANPKT_FLAG_RC 0x0001 /* report completion */
+
+#define VMBUS_CHANPKT_CONST_DATA(pkt) \
+ (const void *)((const uint8_t *)(pkt) + \
+ VMBUS_CHANPKT_GETLEN((pkt)->cph_hlen))
+
+/* Include padding */
+#define VMBUS_CHANPKT_DATALEN(pkt) \
+ (VMBUS_CHANPKT_GETLEN((pkt)->cph_tlen) -\
+ VMBUS_CHANPKT_GETLEN((pkt)->cph_hlen))
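+
+/*
+ * Usage sketch: a hypothetical helper showing how the two macros above
+ * are normally combined; it returns a pointer to a received packet's
+ * payload and reports the payload length in bytes (padding included),
+ * given that cph_hlen and cph_tlen are counted in 8-byte units.
+ */
+static __inline const void *
+vmbus_chanpkt_payload(const struct vmbus_chanpkt_hdr *pkt, int *dlen)
+{
+ *dlen = VMBUS_CHANPKT_DATALEN(pkt);
+ return (VMBUS_CHANPKT_CONST_DATA(pkt));
+}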
+
+struct vmbus_rxbuf_desc {
+ uint32_t rb_len;
+ uint32_t rb_ofs;
+} __packed;
+
+struct vmbus_chanpkt_rxbuf {
+ struct vmbus_chanpkt_hdr cp_hdr;
+ uint16_t cp_rxbuf_id;
+ uint16_t cp_rsvd;
+ uint32_t cp_rxbuf_cnt;
+ struct vmbus_rxbuf_desc cp_rxbuf[];
+} __packed;
+
+struct vmbus_chan_br {
+ void *cbr;
+ bus_addr_t cbr_paddr;
+ int cbr_txsz;
+ int cbr_rxsz;
+};
+
+struct vmbus_channel;
+struct vmbus_xact;
+struct vmbus_xact_ctx;
+struct hyperv_guid;
+struct task;
+struct taskqueue;
+
+typedef void (*vmbus_chan_callback_t)(struct vmbus_channel *, void *);
+typedef int (*vmbus_br_copy_callback_t)(void *, int, void *);
+
+static __inline struct vmbus_channel *
+vmbus_get_channel(device_t dev)
+{
+ return device_get_ivars(dev);
+}
+
+/*
+ * vmbus_chan_open_br()
+ *
+ * Return values:
+ * 0 Succeeded.
+ * EISCONN Failed, and the memory passed through 'br' is still
+ * connected. Callers must _not_ free the memory
+ * passed through 'br', if this error happens.
+ * other values Failed. The memory passed through 'br' is no longer
+ * connected. Callers are free to do anything with the
+ * memory passed through 'br'.
+ *
+ *
+ *
+ * vmbus_chan_close_direct()
+ *
+ * NOTE:
+ * Callers of this function _must_ make sure to close all sub-channels before
+ * closing the primary channel.
+ *
+ * Return values:
+ * 0 Succeeded.
+ * EISCONN Failed, and the memory associated with the bufring
+ * is still connected. Callers must _not_ free the
+ * memory associated with the bufring, if this error
+ * happens.
+ * other values Failed. The memory associated with the bufring is
+ * no longer connected. Callers are free to do anything
+ * with the memory associated with the bufring.
+ */
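+
+/*
+ * Sketch of the EISCONN rule above (illustrative only; the channel,
+ * the bufring described by 'cbr' and the callback are assumed to be
+ * set up by the caller):
+ *
+ * error = vmbus_chan_open_br(chan, &cbr, NULL, 0, cb, cbarg);
+ * if (error != 0 && error != EISCONN) {
+ *  (the memory passed through 'cbr' may be reclaimed here)
+ * }
+ *  (on EISCONN that memory is still connected and must be kept)
+ */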
+int vmbus_chan_open(struct vmbus_channel *chan,
+ int txbr_size, int rxbr_size, const void *udata, int udlen,
+ vmbus_chan_callback_t cb, void *cbarg);
+int vmbus_chan_open_br(struct vmbus_channel *chan,
+ const struct vmbus_chan_br *cbr, const void *udata,
+ int udlen, vmbus_chan_callback_t cb, void *cbarg);
+void vmbus_chan_close(struct vmbus_channel *chan);
+int vmbus_chan_close_direct(struct vmbus_channel *chan);
+void vmbus_chan_intr_drain(struct vmbus_channel *chan);
+void vmbus_chan_run_task(struct vmbus_channel *chan,
+ struct task *task);
+void vmbus_chan_set_orphan(struct vmbus_channel *chan,
+ struct vmbus_xact_ctx *);
+void vmbus_chan_unset_orphan(struct vmbus_channel *chan);
+const void *vmbus_chan_xact_wait(const struct vmbus_channel *chan,
+ struct vmbus_xact *xact, size_t *resp_len, bool can_sleep);
+
+int vmbus_chan_gpadl_connect(struct vmbus_channel *chan,
+ bus_addr_t paddr, int size, uint32_t *gpadl);
+int vmbus_chan_gpadl_disconnect(struct vmbus_channel *chan,
+ uint32_t gpadl);
+
+void vmbus_chan_cpu_set(struct vmbus_channel *chan, int cpu);
+void vmbus_chan_cpu_rr(struct vmbus_channel *chan);
+void vmbus_chan_set_readbatch(struct vmbus_channel *chan, bool on);
+
+struct vmbus_channel **
+ vmbus_subchan_get(struct vmbus_channel *pri_chan,
+ int subchan_cnt);
+void vmbus_subchan_rel(struct vmbus_channel **subchan,
+ int subchan_cnt);
+void vmbus_subchan_drain(struct vmbus_channel *pri_chan);
+
+int vmbus_chan_recv(struct vmbus_channel *chan, void *data, int *dlen,
+ uint64_t *xactid);
+int vmbus_chan_recv_pkt(struct vmbus_channel *chan,
+ struct vmbus_chanpkt_hdr *pkt, int *pktlen);
+
+int vmbus_chan_recv_idxadv(struct vmbus_channel *chan,
+ uint32_t advance);
+int vmbus_chan_recv_peek(struct vmbus_channel *chan,
+ void *data, int data_len, uint32_t advance);
+int vmbus_chan_recv_peek_call(struct vmbus_channel *chan,
+ int data_len, uint32_t skip,
+ vmbus_br_copy_callback_t cb, void *cbarg);
+
+int vmbus_chan_send(struct vmbus_channel *chan, uint16_t type,
+ uint16_t flags, void *data, int dlen, uint64_t xactid);
+int vmbus_chan_send_sglist(struct vmbus_channel *chan,
+ struct vmbus_gpa sg[], int sglen, void *data, int dlen,
+ uint64_t xactid);
+int vmbus_chan_send_prplist(struct vmbus_channel *chan,
+ struct vmbus_gpa_range *prp, int prp_cnt, void *data,
+ int dlen, uint64_t xactid);
+int vmbus_chan_iov_send(struct vmbus_channel *chan,
+ const struct iovec iov[], int iovlen,
+ vmbus_br_copy_callback_t cb, void *cbarg);
+uint32_t vmbus_chan_write_available(struct vmbus_channel *chan);
+uint32_t vmbus_chan_read_available(struct vmbus_channel *chan);
+bool vmbus_chan_write_signal(struct vmbus_channel *chan,
+ int32_t min_signal_size);
+void vmbus_chan_set_pending_send_size(struct vmbus_channel *chan,
+ uint32_t size);
+
+uint32_t vmbus_chan_id(const struct vmbus_channel *chan);
+uint32_t vmbus_chan_subidx(const struct vmbus_channel *chan);
+bool vmbus_chan_is_primary(const struct vmbus_channel *chan);
+bool vmbus_chan_is_revoked(const struct vmbus_channel *chan);
+bool vmbus_chan_is_hvs(const struct vmbus_channel *chan);
+bool vmbus_chan_is_hvs_conn_from_host(
+ const struct vmbus_channel *chan);
+int vmbus_req_tl_connect(struct hyperv_guid *,
+ struct hyperv_guid *);
+
+struct hyperv_guid *
+ vmbus_chan_guid_type(struct vmbus_channel *chan);
+struct hyperv_guid *
+ vmbus_chan_guid_inst(struct vmbus_channel *chan);
+int vmbus_chan_prplist_nelem(int br_size, int prpcnt_max,
+ int dlen_max);
+bool vmbus_chan_rx_empty(const struct vmbus_channel *chan);
+bool vmbus_chan_tx_empty(const struct vmbus_channel *chan);
+struct taskqueue *
+ vmbus_chan_mgmt_tq(const struct vmbus_channel *chan);
+
+void vmbus_chan_poll_enable(struct vmbus_channel *chan,
+ u_int pollhz);
+void vmbus_chan_poll_disable(struct vmbus_channel *chan);
+
+#endif /* !_VMBUS_H_ */
diff --git a/sys/dev/hyperv/include/vmbus_xact.h b/sys/dev/hyperv/include/vmbus_xact.h
new file mode 100644
index 000000000000..90711a0be774
--- /dev/null
+++ b/sys/dev/hyperv/include/vmbus_xact.h
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_XACT_H_
+#define _VMBUS_XACT_H_
+
+#include <sys/param.h>
+#include <sys/bus.h>
+
+struct vmbus_xact;
+struct vmbus_xact_ctx;
+
+struct vmbus_xact_ctx *vmbus_xact_ctx_create(bus_dma_tag_t dtag,
+ size_t req_size, size_t resp_size,
+ size_t priv_size);
+void vmbus_xact_ctx_destroy(struct vmbus_xact_ctx *ctx);
+bool vmbus_xact_ctx_orphan(struct vmbus_xact_ctx *ctx);
+
+struct vmbus_xact *vmbus_xact_get(struct vmbus_xact_ctx *ctx,
+ size_t req_len);
+void vmbus_xact_put(struct vmbus_xact *xact);
+
+void *vmbus_xact_req_data(const struct vmbus_xact *xact);
+bus_addr_t vmbus_xact_req_paddr(const struct vmbus_xact *xact);
+void *vmbus_xact_priv(const struct vmbus_xact *xact,
+ size_t priv_len);
+void vmbus_xact_activate(struct vmbus_xact *xact);
+void vmbus_xact_deactivate(struct vmbus_xact *xact);
+const void *vmbus_xact_wait(struct vmbus_xact *xact,
+ size_t *resp_len);
+const void *vmbus_xact_busywait(struct vmbus_xact *xact,
+ size_t *resp_len);
+const void *vmbus_xact_poll(struct vmbus_xact *xact,
+ size_t *resp_len);
+void vmbus_xact_wakeup(struct vmbus_xact *xact,
+ const void *data, size_t dlen);
+void vmbus_xact_ctx_wakeup(struct vmbus_xact_ctx *ctx,
+ const void *data, size_t dlen);
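+
+/*
+ * Typical transaction flow (a sketch; vmbus_chan_send() and
+ * vmbus_chan_xact_wait() are declared in vmbus.h and appear here only
+ * for illustration):
+ *
+ * xact = vmbus_xact_get(ctx, sizeof(*req));
+ * req = vmbus_xact_req_data(xact);
+ * (fill in the request)
+ * vmbus_xact_activate(xact);
+ * error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND,
+ *     VMBUS_CHANPKT_FLAG_RC, req, sizeof(*req),
+ *     (uint64_t)(uintptr_t)xact);
+ * if (error)
+ *  vmbus_xact_deactivate(xact);
+ * else
+ *  resp = vmbus_chan_xact_wait(chan, xact, &resplen, can_sleep);
+ * vmbus_xact_put(xact);
+ */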
+
+#endif /* !_VMBUS_XACT_H_ */
diff --git a/sys/dev/hyperv/input/hv_kbd.c b/sys/dev/hyperv/input/hv_kbd.c
new file mode 100644
index 000000000000..53aacda7fbcb
--- /dev/null
+++ b/sys/dev/hyperv/input/hv_kbd.c
@@ -0,0 +1,857 @@
+/*-
+ * Copyright (c) 2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_evdev.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/taskqueue.h>
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/kthread.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/sema.h>
+#include <sys/signal.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/mutex.h>
+#include <sys/callout.h>
+
+#include <sys/kbio.h>
+#include <dev/kbd/kbdreg.h>
+#include <dev/kbd/kbdtables.h>
+
+#ifdef EVDEV_SUPPORT
+#include <dev/evdev/evdev.h>
+#include <dev/evdev/input.h>
+#endif
+
+#include "dev/hyperv/input/hv_kbdc.h"
+
+#define HVKBD_MTX_LOCK(_m) do { \
+ mtx_lock(_m); \
+} while (0)
+
+#define HVKBD_MTX_UNLOCK(_m) do { \
+ mtx_unlock(_m); \
+} while (0)
+
+#define HVKBD_MTX_ASSERT(_m, _t) do { \
+ mtx_assert(_m, _t); \
+} while (0)
+
+#define HVKBD_LOCK() HVKBD_MTX_LOCK(&Giant)
+#define HVKBD_UNLOCK() HVKBD_MTX_UNLOCK(&Giant)
+#define HVKBD_LOCK_ASSERT() HVKBD_MTX_ASSERT(&Giant, MA_OWNED)
+
+#define HVKBD_FLAG_COMPOSE 0x00000001 /* compose char flag */
+#define HVKBD_FLAG_POLLING 0x00000002
+
+#ifdef EVDEV_SUPPORT
+static evdev_event_t hvkbd_ev_event;
+
+static const struct evdev_methods hvkbd_evdev_methods = {
+ .ev_event = hvkbd_ev_event,
+};
+#endif
+
+/* early keyboard probe, not supported */
+static int
+hvkbd_configure(int flags)
+{
+ return (0);
+}
+
+/* detect a keyboard, not used */
+static int
+hvkbd_probe(int unit, void *arg, int flags)
+{
+ return (ENXIO);
+}
+
+/* reset and initialize the device, not used */
+static int
+hvkbd_init(int unit, keyboard_t **kbdp, void *arg, int flags)
+{
+ DEBUG_HVKBD(*kbdp, "%s\n", __func__);
+ return (ENXIO);
+}
+
+/* test the interface to the device, not used */
+static int
+hvkbd_test_if(keyboard_t *kbd)
+{
+ DEBUG_HVKBD(kbd, "%s\n", __func__);
+ return (0);
+}
+
+/* finish using this keyboard, not used */
+static int
+hvkbd_term(keyboard_t *kbd)
+{
+ DEBUG_HVKBD(kbd, "%s\n", __func__);
+ return (ENXIO);
+}
+
+/* keyboard interrupt routine, not used */
+static int
+hvkbd_intr(keyboard_t *kbd, void *arg)
+{
+ DEBUG_HVKBD(kbd, "%s\n", __func__);
+ return (0);
+}
+
+/* lock the access to the keyboard, not used */
+static int
+hvkbd_lock(keyboard_t *kbd, int lock)
+{
+ DEBUG_HVKBD(kbd, "%s\n", __func__);
+ return (1);
+}
+
+/* save the internal state, not used */
+static int
+hvkbd_get_state(keyboard_t *kbd, void *buf, size_t len)
+{
+ DEBUG_HVKBD(kbd,"%s\n", __func__);
+ return (len == 0) ? 1 : -1;
+}
+
+/* set the internal state, not used */
+static int
+hvkbd_set_state(keyboard_t *kbd, void *buf, size_t len)
+{
+ DEBUG_HVKBD(kbd, "%s\n", __func__);
+ return (EINVAL);
+}
+
+static int
+hvkbd_poll(keyboard_t *kbd, int on)
+{
+ hv_kbd_sc *sc = kbd->kb_data;
+
+ HVKBD_LOCK();
+ /*
+ * Keep a reference count on polling to allow recursive
+ * cngrab() during a panic for example.
+ */
+ if (on)
+ sc->sc_polling++;
+ else if (sc->sc_polling > 0)
+ sc->sc_polling--;
+
+ if (sc->sc_polling != 0) {
+ sc->sc_flags |= HVKBD_FLAG_POLLING;
+ } else {
+ sc->sc_flags &= ~HVKBD_FLAG_POLLING;
+ }
+ HVKBD_UNLOCK();
+ return (0);
+}
+
+/*
+ * Enable the access to the device; until this function is called,
+ * the client cannot read from the keyboard.
+ */
+static int
+hvkbd_enable(keyboard_t *kbd)
+{
+ HVKBD_LOCK();
+ KBD_ACTIVATE(kbd);
+ HVKBD_UNLOCK();
+ return (0);
+}
+
+/* disallow the access to the device */
+static int
+hvkbd_disable(keyboard_t *kbd)
+{
+ DEBUG_HVKBD(kbd, "%s\n", __func__);
+ HVKBD_LOCK();
+ KBD_DEACTIVATE(kbd);
+ HVKBD_UNLOCK();
+ return (0);
+}
+
+static void
+hvkbd_do_poll(hv_kbd_sc *sc, uint8_t wait)
+{
+ while (!hv_kbd_prod_is_ready(sc)) {
+ hv_kbd_read_channel(sc->hs_chan, sc);
+ if (!wait)
+ break;
+ }
+}
+
+/* check if data is waiting */
+/* Currently unused. */
+static int
+hvkbd_check(keyboard_t *kbd)
+{
+ DEBUG_HVKBD(kbd, "%s\n", __func__);
+ return (0);
+}
+
+/* check if char is waiting */
+static int
+hvkbd_check_char_locked(keyboard_t *kbd)
+{
+ HVKBD_LOCK_ASSERT();
+ if (!KBD_IS_ACTIVE(kbd))
+ return (FALSE);
+
+ hv_kbd_sc *sc = kbd->kb_data;
+ if (!(sc->sc_flags & HVKBD_FLAG_COMPOSE) && sc->sc_composed_char != 0)
+ return (TRUE);
+ if (sc->sc_flags & HVKBD_FLAG_POLLING)
+ hvkbd_do_poll(sc, 0);
+ if (hv_kbd_prod_is_ready(sc)) {
+ return (TRUE);
+ }
+ return (FALSE);
+}
+
+static int
+hvkbd_check_char(keyboard_t *kbd)
+{
+ int result;
+
+ HVKBD_LOCK();
+ result = hvkbd_check_char_locked(kbd);
+ HVKBD_UNLOCK();
+
+ return (result);
+}
+
+/* read char from the keyboard */
+static uint32_t
+hvkbd_read_char_locked(keyboard_t *kbd, int wait)
+{
+ uint32_t scancode = NOKEY;
+ uint32_t action;
+ keystroke ks;
+ hv_kbd_sc *sc = kbd->kb_data;
+ int keycode;
+
+ HVKBD_LOCK_ASSERT();
+
+ if (!KBD_IS_ACTIVE(kbd) || !hv_kbd_prod_is_ready(sc))
+ return (NOKEY);
+
+next_code:
+
+ /* do we have a composed char to return? */
+ if (!(sc->sc_flags & HVKBD_FLAG_COMPOSE) && sc->sc_composed_char > 0) {
+ action = sc->sc_composed_char;
+ sc->sc_composed_char = 0;
+ if (action > UCHAR_MAX) {
+ return (ERRKEY);
+ }
+ return (action);
+ }
+
+ if (hv_kbd_fetch_top(sc, &ks)) {
+ return (NOKEY);
+ }
+ if ((ks.info & IS_E0) || (ks.info & IS_E1)) {
+ /**
+ * Emulate the generation of the E0 or E1 scancode;
+ * the real scancode will be consumed next time.
+ */
+ if (ks.info & IS_E0) {
+ scancode = XTKBD_EMUL0;
+ ks.info &= ~IS_E0;
+ } else if (ks.info & IS_E1) {
+ scancode = XTKBD_EMUL1;
+ ks.info &= ~IS_E1;
+ }
+ /**
+ * Change the top item to avoid encountering
+ * E0 or E1 twice.
+ */
+ hv_kbd_modify_top(sc, &ks);
+ } else if (ks.info & IS_UNICODE) {
+ /**
+ * XXX: The Hyper-V host sends unicode to the VM through
+ * 'Type clipboard text'. The mapping from unicode to
+ * scancode depends on the keymap, and it is complicated
+ * enough that we do not support it yet.
+ */
+ if (bootverbose)
+ device_printf(sc->dev, "Unsupported unicode\n");
+ hv_kbd_remove_top(sc);
+ return (NOKEY);
+ } else {
+ scancode = ks.makecode;
+ if (ks.info & IS_BREAK) {
+ scancode |= XTKBD_RELEASE;
+ }
+ hv_kbd_remove_top(sc);
+ }
+#ifdef EVDEV_SUPPORT
+ /* push evdev event */
+ if (evdev_rcpt_mask & EVDEV_RCPT_HW_KBD &&
+ sc->ks_evdev != NULL) {
+ keycode = evdev_scancode2key(&sc->ks_evdev_state,
+ scancode);
+
+ if (keycode != KEY_RESERVED) {
+ evdev_push_event(sc->ks_evdev, EV_KEY,
+ (uint16_t)keycode, scancode & 0x80 ? 0 : 1);
+ evdev_sync(sc->ks_evdev);
+ }
+ }
+#endif
+ ++kbd->kb_count;
+ DEBUG_HVKBD(kbd, "read scan: 0x%x\n", scancode);
+
+ /* return the byte as is for the K_RAW mode */
+ if (sc->sc_mode == K_RAW)
+ return scancode;
+
+ /* translate the scan code into a keycode */
+ keycode = scancode & 0x7F;
+ switch (sc->sc_prefix) {
+ case 0x00: /* normal scancode */
+ switch(scancode) {
+ case 0xB8: /* left alt (compose key) released */
+ if (sc->sc_flags & HVKBD_FLAG_COMPOSE) {
+ sc->sc_flags &= ~HVKBD_FLAG_COMPOSE;
+ if (sc->sc_composed_char > UCHAR_MAX)
+ sc->sc_composed_char = 0;
+ }
+ break;
+ case 0x38: /* left alt (compose key) pressed */
+ if (!(sc->sc_flags & HVKBD_FLAG_COMPOSE)) {
+ sc->sc_flags |= HVKBD_FLAG_COMPOSE;
+ sc->sc_composed_char = 0;
+ }
+ break;
+ case 0xE0:
+ case 0xE1:
+ sc->sc_prefix = scancode;
+ goto next_code;
+ }
+ break;
+ case 0xE0: /* 0xE0 prefix */
+ sc->sc_prefix = 0;
+ switch (keycode) {
+ case 0x1C: /* right enter key */
+ keycode = 0x59;
+ break;
+ case 0x1D: /* right ctrl key */
+ keycode = 0x5A;
+ break;
+ case 0x35: /* keypad divide key */
+ keycode = 0x5B;
+ break;
+ case 0x37: /* print scrn key */
+ keycode = 0x5C;
+ break;
+ case 0x38: /* right alt key (alt gr) */
+ keycode = 0x5D;
+ break;
+ case 0x46: /* ctrl-pause/break on AT 101 (see below) */
+ keycode = 0x68;
+ break;
+ case 0x47: /* grey home key */
+ keycode = 0x5E;
+ break;
+ case 0x48: /* grey up arrow key */
+ keycode = 0x5F;
+ break;
+ case 0x49: /* grey page up key */
+ keycode = 0x60;
+ break;
+ case 0x4B: /* grey left arrow key */
+ keycode = 0x61;
+ break;
+ case 0x4D: /* grey right arrow key */
+ keycode = 0x62;
+ break;
+ case 0x4F: /* grey end key */
+ keycode = 0x63;
+ break;
+ case 0x50: /* grey down arrow key */
+ keycode = 0x64;
+ break;
+ case 0x51: /* grey page down key */
+ keycode = 0x65;
+ break;
+ case 0x52: /* grey insert key */
+ keycode = 0x66;
+ break;
+ case 0x53: /* grey delete key */
+ keycode = 0x67;
+ break;
+ /* the following 3 are only used on the MS "Natural" keyboard */
+ case 0x5b: /* left Window key */
+ keycode = 0x69;
+ break;
+ case 0x5c: /* right Window key */
+ keycode = 0x6a;
+ break;
+ case 0x5d: /* menu key */
+ keycode = 0x6b;
+ break;
+ case 0x5e: /* power key */
+ keycode = 0x6d;
+ break;
+ case 0x5f: /* sleep key */
+ keycode = 0x6e;
+ break;
+ case 0x63: /* wake key */
+ keycode = 0x6f;
+ break;
+ default: /* ignore everything else */
+ goto next_code;
+ }
+ break;
+ case 0xE1: /* 0xE1 prefix */
+ /*
+ * The pause/break key on the 101 keyboard produces:
+ * E1-1D-45 E1-9D-C5
+ * Ctrl-pause/break produces:
+ * E0-46 E0-C6 (See above.)
+ */
+ sc->sc_prefix = 0;
+ if (keycode == 0x1D)
+ sc->sc_prefix = 0x1D;
+ goto next_code;
+ /* NOT REACHED */
+ case 0x1D: /* pause / break */
+ sc->sc_prefix = 0;
+ if (keycode != 0x45)
+ goto next_code;
+ keycode = 0x68;
+ break;
+ }
+
+ /* XXX assume 101/102 keys AT keyboard */
+ switch (keycode) {
+ case 0x5c: /* print screen */
+ if (sc->sc_flags & ALTS)
+ keycode = 0x54; /* sysrq */
+ break;
+ case 0x68: /* pause/break */
+ if (sc->sc_flags & CTLS)
+ keycode = 0x6c; /* break */
+ break;
+ }
+
+ /* return the key code in the K_CODE mode */
+ if (sc->sc_mode == K_CODE)
+ return (keycode | (scancode & 0x80));
+
+ /* compose a character code */
+ if (sc->sc_flags & HVKBD_FLAG_COMPOSE) {
+ switch (keycode | (scancode & 0x80)) {
+ /* key pressed, process it */
+ case 0x47: case 0x48: case 0x49: /* keypad 7,8,9 */
+ sc->sc_composed_char *= 10;
+ sc->sc_composed_char += keycode - 0x40;
+ if (sc->sc_composed_char > UCHAR_MAX)
+ return ERRKEY;
+ goto next_code;
+ case 0x4B: case 0x4C: case 0x4D: /* keypad 4,5,6 */
+ sc->sc_composed_char *= 10;
+ sc->sc_composed_char += keycode - 0x47;
+ if (sc->sc_composed_char > UCHAR_MAX)
+ return ERRKEY;
+ goto next_code;
+ case 0x4F: case 0x50: case 0x51: /* keypad 1,2,3 */
+ sc->sc_composed_char *= 10;
+ sc->sc_composed_char += keycode - 0x4E;
+ if (sc->sc_composed_char > UCHAR_MAX)
+ return ERRKEY;
+ goto next_code;
+ case 0x52: /* keypad 0 */
+ sc->sc_composed_char *= 10;
+ if (sc->sc_composed_char > UCHAR_MAX)
+ return ERRKEY;
+ goto next_code;
+
+ /* key released, no interest here */
+ case 0xC7: case 0xC8: case 0xC9: /* keypad 7,8,9 */
+ case 0xCB: case 0xCC: case 0xCD: /* keypad 4,5,6 */
+ case 0xCF: case 0xD0: case 0xD1: /* keypad 1,2,3 */
+ case 0xD2: /* keypad 0 */
+ goto next_code;
+
+ case 0x38: /* left alt key */
+ break;
+
+ default:
+ if (sc->sc_composed_char > 0) {
+ sc->sc_flags &= ~HVKBD_FLAG_COMPOSE;
+ sc->sc_composed_char = 0;
+ return (ERRKEY);
+ }
+ break;
+ }
+ }
+
+ /* keycode to key action */
+ action = genkbd_keyaction(kbd, keycode, scancode & 0x80,
+ &sc->sc_state, &sc->sc_accents);
+ if (action == NOKEY)
+ goto next_code;
+ else
+ return (action);
+}
+
+/* Currently wait is always false. */
+static uint32_t
+hvkbd_read_char(keyboard_t *kbd, int wait)
+{
+ uint32_t keycode;
+
+ HVKBD_LOCK();
+ keycode = hvkbd_read_char_locked(kbd, wait);
+ HVKBD_UNLOCK();
+
+ return (keycode);
+}
+
+/* clear the internal state of the keyboard */
+static void
+hvkbd_clear_state(keyboard_t *kbd)
+{
+ hv_kbd_sc *sc = kbd->kb_data;
+ sc->sc_state &= LOCK_MASK; /* preserve locking key state */
+ sc->sc_flags &= ~(HVKBD_FLAG_POLLING | HVKBD_FLAG_COMPOSE);
+ sc->sc_accents = 0;
+ sc->sc_composed_char = 0;
+}
+
+static int
+hvkbd_ioctl_locked(keyboard_t *kbd, u_long cmd, caddr_t arg)
+{
+ int i;
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+ int ival;
+#endif
+ hv_kbd_sc *sc = kbd->kb_data;
+ switch (cmd) {
+ case KDGKBMODE:
+ *(int *)arg = sc->sc_mode;
+ break;
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+ case _IO('K', 7):
+ ival = IOCPARM_IVAL(arg);
+ arg = (caddr_t)&ival;
+ /* FALLTHROUGH */
+#endif
+ case KDSKBMODE: /* set keyboard mode */
+ DEBUG_HVKBD(kbd, "expected mode: %x\n", *(int *)arg);
+ switch (*(int *)arg) {
+ case K_XLATE:
+ if (sc->sc_mode != K_XLATE) {
+ /* make lock key state and LED state match */
+ sc->sc_state &= ~LOCK_MASK;
+ sc->sc_state |= KBD_LED_VAL(kbd);
+ }
+ /* FALLTHROUGH */
+ case K_RAW:
+ case K_CODE:
+ if (sc->sc_mode != *(int *)arg) {
+ DEBUG_HVKBD(kbd, "mod changed to %x\n", *(int *)arg);
+ if ((sc->sc_flags & HVKBD_FLAG_POLLING) == 0)
+ hvkbd_clear_state(kbd);
+ sc->sc_mode = *(int *)arg;
+ }
+ break;
+ default:
+ return (EINVAL);
+ }
+ break;
+ case KDGKBSTATE: /* get lock key state */
+ *(int *)arg = sc->sc_state & LOCK_MASK;
+ break;
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+ case _IO('K', 20):
+ ival = IOCPARM_IVAL(arg);
+ arg = (caddr_t)&ival;
+ /* FALLTHROUGH */
+#endif
+ case KDSKBSTATE: /* set lock key state */
+ if (*(int *)arg & ~LOCK_MASK) {
+ return (EINVAL);
+ }
+ sc->sc_state &= ~LOCK_MASK;
+ sc->sc_state |= *(int *)arg;
+ return hvkbd_ioctl_locked(kbd, KDSETLED, arg);
+ case KDGETLED: /* get keyboard LED */
+ *(int *)arg = KBD_LED_VAL(kbd);
+ break;
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+ case _IO('K', 66):
+ ival = IOCPARM_IVAL(arg);
+ arg = (caddr_t)&ival;
+ /* FALLTHROUGH */
+#endif
+ case KDSETLED: /* set keyboard LED */
+ /* NOTE: lock key state in "sc_state" won't be changed */
+ if (*(int *)arg & ~LOCK_MASK)
+ return (EINVAL);
+
+ i = *(int *)arg;
+
+ /* replace CAPS LED with ALTGR LED for ALTGR keyboards */
+ if (sc->sc_mode == K_XLATE &&
+ kbd->kb_keymap->n_keys > ALTGR_OFFSET) {
+ if (i & ALKED)
+ i |= CLKED;
+ else
+ i &= ~CLKED;
+ }
+ if (KBD_HAS_DEVICE(kbd)) {
+ DEBUG_HVSC(sc, "setled 0x%x\n", *(int *)arg);
+ }
+
+#ifdef EVDEV_SUPPORT
+ /* push LED states to evdev */
+ if (sc->ks_evdev != NULL &&
+ evdev_rcpt_mask & EVDEV_RCPT_HW_KBD)
+ evdev_push_leds(sc->ks_evdev, *(int *)arg);
+#endif
+ KBD_LED_VAL(kbd) = *(int *)arg;
+ break;
+ case PIO_KEYMAP: /* set keyboard translation table */
+ case OPIO_KEYMAP: /* set keyboard translation table (compat) */
+ case PIO_KEYMAPENT: /* set keyboard translation table entry */
+ case PIO_DEADKEYMAP: /* set accent key translation table */
+ sc->sc_accents = 0;
+ /* FALLTHROUGH */
+ default:
+ return (genkbd_commonioctl(kbd, cmd, arg));
+ }
+ return (0);
+}
+
+/* some useful control functions */
+static int
+hvkbd_ioctl(keyboard_t *kbd, u_long cmd, caddr_t arg)
+{
+ DEBUG_HVKBD(kbd, "%s: %lx start\n", __func__, cmd);
+ HVKBD_LOCK();
+ int ret = hvkbd_ioctl_locked(kbd, cmd, arg);
+ HVKBD_UNLOCK();
+ DEBUG_HVKBD(kbd, "%s: %lx end %d\n", __func__, cmd, ret);
+ return (ret);
+}
+
+/* read one byte from the keyboard if it's allowed */
+/* Currently unused. */
+static int
+hvkbd_read(keyboard_t *kbd, int wait)
+{
+ DEBUG_HVKBD(kbd, "%s\n", __func__);
+ HVKBD_LOCK_ASSERT();
+ if (!KBD_IS_ACTIVE(kbd))
+ return (-1);
+ return hvkbd_read_char_locked(kbd, wait);
+}
+
+#ifdef EVDEV_SUPPORT
+static void
+hvkbd_ev_event(struct evdev_dev *evdev, uint16_t type, uint16_t code,
+ int32_t value)
+{
+ keyboard_t *kbd = evdev_get_softc(evdev);
+
+ if (evdev_rcpt_mask & EVDEV_RCPT_HW_KBD &&
+ (type == EV_LED || type == EV_REP)) {
+ mtx_lock(&Giant);
+ kbd_ev_event(kbd, type, code, value);
+ mtx_unlock(&Giant);
+ }
+}
+#endif
+
+static keyboard_switch_t hvkbdsw = {
+ .probe = hvkbd_probe, /* not used */
+ .init = hvkbd_init,
+ .term = hvkbd_term, /* not used */
+ .intr = hvkbd_intr, /* not used */
+ .test_if = hvkbd_test_if, /* not used */
+ .enable = hvkbd_enable,
+ .disable = hvkbd_disable,
+ .read = hvkbd_read,
+ .check = hvkbd_check,
+ .read_char = hvkbd_read_char,
+ .check_char = hvkbd_check_char,
+ .ioctl = hvkbd_ioctl,
+ .lock = hvkbd_lock, /* not used */
+ .clear_state = hvkbd_clear_state,
+ .get_state = hvkbd_get_state, /* not used */
+ .set_state = hvkbd_set_state, /* not used */
+ .poll = hvkbd_poll,
+};
+
+KEYBOARD_DRIVER(hvkbd, hvkbdsw, hvkbd_configure);
+
+void
+hv_kbd_intr(hv_kbd_sc *sc)
+{
+ uint32_t c;
+ if ((sc->sc_flags & HVKBD_FLAG_POLLING) != 0)
+ return;
+
+ if (KBD_IS_ACTIVE(&sc->sc_kbd) &&
+ KBD_IS_BUSY(&sc->sc_kbd)) {
+ /* let the callback function process the input */
+ (sc->sc_kbd.kb_callback.kc_func) (&sc->sc_kbd, KBDIO_KEYINPUT,
+ sc->sc_kbd.kb_callback.kc_arg);
+ } else {
+ /* read and discard the input, no one is waiting for it */
+ do {
+ c = hvkbd_read_char(&sc->sc_kbd, 0);
+ } while (c != NOKEY);
+ }
+}
+
+int
+hvkbd_driver_load(module_t mod, int what, void *arg)
+{
+ switch (what) {
+ case MOD_LOAD:
+ kbd_add_driver(&hvkbd_kbd_driver);
+ break;
+ case MOD_UNLOAD:
+ kbd_delete_driver(&hvkbd_kbd_driver);
+ break;
+ }
+ return (0);
+}
+
+int
+hv_kbd_drv_attach(device_t dev)
+{
+ hv_kbd_sc *sc = device_get_softc(dev);
+ int unit = device_get_unit(dev);
+ keyboard_t *kbd = &sc->sc_kbd;
+ keyboard_switch_t *sw;
+#ifdef EVDEV_SUPPORT
+ struct evdev_dev *evdev;
+#endif
+
+ sw = kbd_get_switch(HVKBD_DRIVER_NAME);
+ if (sw == NULL) {
+ return (ENXIO);
+ }
+
+ kbd_init_struct(kbd, HVKBD_DRIVER_NAME, KB_OTHER, unit, 0, 0, 0);
+ kbd->kb_data = (void *)sc;
+ kbd_set_maps(kbd, &key_map, &accent_map, fkey_tab, nitems(fkey_tab));
+ KBD_FOUND_DEVICE(kbd);
+ hvkbd_clear_state(kbd);
+ KBD_PROBE_DONE(kbd);
+ KBD_INIT_DONE(kbd);
+ sc->sc_mode = K_XLATE;
+ (*sw->enable)(kbd);
+
+#ifdef EVDEV_SUPPORT
+ evdev = evdev_alloc();
+ evdev_set_name(evdev, "Hyper-V keyboard");
+ evdev_set_phys(evdev, device_get_nameunit(dev));
+ evdev_set_id(evdev, BUS_VIRTUAL, 0, 0, 0);
+ evdev_set_methods(evdev, kbd, &hvkbd_evdev_methods);
+ evdev_support_event(evdev, EV_SYN);
+ evdev_support_event(evdev, EV_KEY);
+ evdev_support_event(evdev, EV_LED);
+ evdev_support_event(evdev, EV_REP);
+ evdev_support_all_known_keys(evdev);
+ evdev_support_led(evdev, LED_NUML);
+ evdev_support_led(evdev, LED_CAPSL);
+ evdev_support_led(evdev, LED_SCROLLL);
+ if (evdev_register_mtx(evdev, &Giant))
+ evdev_free(evdev);
+ else
+ sc->ks_evdev = evdev;
+ sc->ks_evdev_state = 0;
+#endif
+
+ if (kbd_register(kbd) < 0) {
+ goto detach;
+ }
+ KBD_CONFIG_DONE(kbd);
+#ifdef KBD_INSTALL_CDEV
+ if (kbd_attach(kbd)) {
+ goto detach;
+ }
+#endif
+ if (bootverbose) {
+ kbdd_diag(kbd, bootverbose);
+ }
+ return (0);
+detach:
+ hv_kbd_drv_detach(dev);
+ return (ENXIO);
+}
+
+int
+hv_kbd_drv_detach(device_t dev)
+{
+ int error = 0;
+ hv_kbd_sc *sc = device_get_softc(dev);
+ hvkbd_disable(&sc->sc_kbd);
+#ifdef EVDEV_SUPPORT
+ evdev_free(sc->ks_evdev);
+#endif
+ if (KBD_IS_CONFIGURED(&sc->sc_kbd)) {
+ error = kbd_unregister(&sc->sc_kbd);
+ if (error) {
+ device_printf(dev, "WARNING: kbd_unregister() "
+ "returned non-zero! (ignored)\n");
+ }
+ }
+#ifdef KBD_INSTALL_CDEV
+ error = kbd_detach(&sc->sc_kbd);
+#endif
+ return (error);
+}
+
diff --git a/sys/dev/hyperv/input/hv_kbdc.c b/sys/dev/hyperv/input/hv_kbdc.c
new file mode 100644
index 000000000000..7065ff3057a7
--- /dev/null
+++ b/sys/dev/hyperv/input/hv_kbdc.c
@@ -0,0 +1,530 @@
+/*-
+ * Copyright (c) 2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/lock.h>
+#include <sys/taskqueue.h>
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/mutex.h>
+
+#include <sys/kbio.h>
+#include <dev/kbd/kbdreg.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/utilities/hv_utilreg.h>
+#include <dev/hyperv/utilities/vmbus_icreg.h>
+#include <dev/hyperv/utilities/vmbus_icvar.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+
+#include "dev/hyperv/input/hv_kbdc.h"
+#include "vmbus_if.h"
+
+#define HV_KBD_VER_MAJOR (1)
+#define HV_KBD_VER_MINOR (0)
+
+#define HV_KBD_VER (HV_KBD_VER_MINOR | (HV_KBD_VER_MAJOR) << 16)
+
+#define HV_KBD_PROTO_ACCEPTED (1)
+
+#define HV_BUFF_SIZE (4*PAGE_SIZE)
+#define HV_KBD_RINGBUFF_SEND_SZ (10*PAGE_SIZE)
+#define HV_KBD_RINGBUFF_RECV_SZ (10*PAGE_SIZE)
+
+enum hv_kbd_msg_type_t {
+ HV_KBD_PROTO_REQUEST = 1,
+ HV_KBD_PROTO_RESPONSE = 2,
+ HV_KBD_PROTO_EVENT = 3,
+ HV_KBD_PROTO_LED_INDICATORS = 4,
+};
+
+typedef struct hv_kbd_msg_hdr_t {
+ uint32_t type;
+} hv_kbd_msg_hdr;
+
+typedef struct hv_kbd_msg_t {
+ hv_kbd_msg_hdr hdr;
+ char data[];
+} hv_kbd_msg;
+
+typedef struct hv_kbd_proto_req_t {
+ hv_kbd_msg_hdr hdr;
+ uint32_t ver;
+} hv_kbd_proto_req;
+
+typedef struct hv_kbd_proto_resp_t {
+ hv_kbd_msg_hdr hdr;
+ uint32_t status;
+} hv_kbd_proto_resp;
+
+#define HV_KBD_PROTO_REQ_SZ (sizeof(hv_kbd_proto_req))
+#define HV_KBD_PROTO_RESP_SZ (sizeof(hv_kbd_proto_resp))
+
+/**
+ * The corresponding struct on the Windows host:
+ * typedef struct _HK_MESSAGE_KEYSTROKE
+ * {
+ * HK_MESSAGE_HEADER Header;
+ * UINT16 MakeCode;
+ * UINT32 IsUnicode:1;
+ * UINT32 IsBreak:1;
+ * UINT32 IsE0:1;
+ * UINT32 IsE1:1;
+ * UINT32 Reserved:28;
+ * } HK_MESSAGE_KEYSTROKE
+ */
+typedef struct hv_kbd_keystroke_t {
+ hv_kbd_msg_hdr hdr;
+ keystroke ks;
+} hv_kbd_keystroke;
+
+static const struct vmbus_ic_desc vmbus_kbd_descs[] = {
+ {
+ .ic_guid = { .hv_guid = {
+ 0x6d, 0xad, 0x12, 0xf9, 0x17, 0x2b, 0xea, 0x48,
+ 0xbd, 0x65, 0xf9, 0x27, 0xa6, 0x1c, 0x76, 0x84} },
+ .ic_desc = "Hyper-V KBD"
+ },
+ VMBUS_IC_DESC_END
+};
+
+static int hv_kbd_attach(device_t dev);
+static int hv_kbd_detach(device_t dev);
+
+/**
+ * Return 1 if the producer is ready.
+ */
+int
+hv_kbd_prod_is_ready(hv_kbd_sc *sc)
+{
+ int ret;
+ mtx_lock(&sc->ks_mtx);
+ ret = !STAILQ_EMPTY(&sc->ks_queue);
+ mtx_unlock(&sc->ks_mtx);
+ return (ret);
+}
+
+int
+hv_kbd_produce_ks(hv_kbd_sc *sc, const keystroke *ks)
+{
+ int ret = 0;
+ keystroke_info *ksi;
+ mtx_lock(&sc->ks_mtx);
+ if (LIST_EMPTY(&sc->ks_free_list)) {
+ DEBUG_HVSC(sc, "NO buffer!\n");
+ ret = 1;
+ } else {
+ ksi = LIST_FIRST(&sc->ks_free_list);
+ LIST_REMOVE(ksi, link);
+ ksi->ks = *ks;
+ STAILQ_INSERT_TAIL(&sc->ks_queue, ksi, slink);
+ }
+ mtx_unlock(&sc->ks_mtx);
+ return (ret);
+}
+
+/**
+ * Return 0 on success: fetch the first item of the queue without removing it.
+ */
+int
+hv_kbd_fetch_top(hv_kbd_sc *sc, keystroke *result)
+{
+ int ret = 0;
+ keystroke_info *ksi = NULL;
+ mtx_lock(&sc->ks_mtx);
+ if (STAILQ_EMPTY(&sc->ks_queue)) {
+ DEBUG_HVSC(sc, "Empty queue!\n");
+ ret = 1;
+ } else {
+ ksi = STAILQ_FIRST(&sc->ks_queue);
+ *result = ksi->ks;
+ }
+ mtx_unlock(&sc->ks_mtx);
+ return (ret);
+}
+
+/**
+ * Return 0 on success: the top item of the queue has been removed.
+ */
+int
+hv_kbd_remove_top(hv_kbd_sc *sc)
+{
+ int ret = 0;
+ keystroke_info *ksi = NULL;
+ mtx_lock(&sc->ks_mtx);
+ if (STAILQ_EMPTY(&sc->ks_queue)) {
+ DEBUG_HVSC(sc, "Empty queue!\n");
+ ret = 1;
+ } else {
+ ksi = STAILQ_FIRST(&sc->ks_queue);
+ STAILQ_REMOVE_HEAD(&sc->ks_queue, slink);
+ LIST_INSERT_HEAD(&sc->ks_free_list, ksi, link);
+ }
+ mtx_unlock(&sc->ks_mtx);
+ return (ret);
+}
+
+/**
+ * return 0 if successfully modify the 1st item of queue
+ */
+int
+hv_kbd_modify_top(hv_kbd_sc *sc, keystroke *top)
+{
+ int ret = 0;
+ keystroke_info *ksi = NULL;
+ mtx_lock(&sc->ks_mtx);
+ if (STAILQ_EMPTY(&sc->ks_queue)) {
+ DEBUG_HVSC(sc, "Empty queue!\n");
+ ret = 1;
+ } else {
+ ksi = STAILQ_FIRST(&sc->ks_queue);
+ ksi->ks = *top;
+ }
+ mtx_unlock(&sc->ks_mtx);
+ return (ret);
+}
+
+static int
+hv_kbd_probe(device_t dev)
+{
+ device_t bus = device_get_parent(dev);
+ const struct vmbus_ic_desc *d;
+
+ if (resource_disabled(device_get_name(dev), 0))
+ return (ENXIO);
+
+ for (d = vmbus_kbd_descs; d->ic_desc != NULL; ++d) {
+ if (VMBUS_PROBE_GUID(bus, dev, &d->ic_guid) == 0) {
+ device_set_desc(dev, d->ic_desc);
+ return (BUS_PROBE_DEFAULT);
+ }
+ }
+ return (ENXIO);
+}
+
+static void
+hv_kbd_on_response(hv_kbd_sc *sc, struct vmbus_chanpkt_hdr *pkt)
+{
+ struct vmbus_xact_ctx *xact = sc->hs_xact_ctx;
+ if (xact != NULL) {
+ DEBUG_HVSC(sc, "hvkbd is ready\n");
+ vmbus_xact_ctx_wakeup(xact, VMBUS_CHANPKT_CONST_DATA(pkt),
+ VMBUS_CHANPKT_DATALEN(pkt));
+ }
+}
+
+static void
+hv_kbd_on_received(hv_kbd_sc *sc, struct vmbus_chanpkt_hdr *pkt)
+{
+
+ const hv_kbd_msg *msg = VMBUS_CHANPKT_CONST_DATA(pkt);
+ const hv_kbd_proto_resp *resp =
+ VMBUS_CHANPKT_CONST_DATA(pkt);
+ const hv_kbd_keystroke *keystroke =
+ VMBUS_CHANPKT_CONST_DATA(pkt);
+ uint32_t msg_len = VMBUS_CHANPKT_DATALEN(pkt);
+ enum hv_kbd_msg_type_t msg_type;
+ uint32_t info;
+ uint16_t scan_code;
+
+ if (msg_len <= sizeof(hv_kbd_msg)) {
+ device_printf(sc->dev, "Illegal packet\n");
+ return;
+ }
+ msg_type = msg->hdr.type;
+ switch (msg_type) {
+ case HV_KBD_PROTO_RESPONSE:
+ hv_kbd_on_response(sc, pkt);
+ DEBUG_HVSC(sc, "keyboard resp: 0x%x\n",
+ resp->status);
+ break;
+ case HV_KBD_PROTO_EVENT:
+ info = keystroke->ks.info;
+ scan_code = keystroke->ks.makecode;
+ DEBUG_HVSC(sc, "keystroke info: 0x%x, scan: 0x%x\n",
+ info, scan_code);
+ hv_kbd_produce_ks(sc, &keystroke->ks);
+ hv_kbd_intr(sc);
+ break;
+ default:
+ break;
+ }
+}
+
+void
+hv_kbd_read_channel(struct vmbus_channel *channel, void *context)
+{
+ uint8_t *buf;
+ uint32_t buflen = 0;
+ int ret = 0;
+
+ hv_kbd_sc *sc = (hv_kbd_sc*)context;
+ buf = sc->buf;
+ buflen = sc->buflen;
+ for (;;) {
+ struct vmbus_chanpkt_hdr *pkt = (struct vmbus_chanpkt_hdr *)buf;
+ uint32_t rxed = buflen;
+
+ ret = vmbus_chan_recv_pkt(channel, pkt, &rxed);
+ if (__predict_false(ret == ENOBUFS)) {
+ buflen = sc->buflen * 2;
+ while (buflen < rxed)
+ buflen *= 2;
+ buf = malloc(buflen, M_DEVBUF, M_WAITOK | M_ZERO);
+ device_printf(sc->dev, "expand recvbuf %d -> %d\n",
+ sc->buflen, buflen);
+ free(sc->buf, M_DEVBUF);
+ sc->buf = buf;
+ sc->buflen = buflen;
+ continue;
+ } else if (__predict_false(ret == EAGAIN)) {
+ /* No more channel packets; done! */
+ break;
+ }
+ KASSERT(!ret, ("vmbus_chan_recv_pkt failed: %d", ret));
+
+ DEBUG_HVSC(sc, "event: 0x%x\n", pkt->cph_type);
+ switch (pkt->cph_type) {
+ case VMBUS_CHANPKT_TYPE_COMP:
+ case VMBUS_CHANPKT_TYPE_RXBUF:
+ device_printf(sc->dev, "unhandled event: %d\n",
+ pkt->cph_type);
+ break;
+ case VMBUS_CHANPKT_TYPE_INBAND:
+ hv_kbd_on_received(sc, pkt);
+ break;
+ default:
+ device_printf(sc->dev, "unknown event: %d\n",
+ pkt->cph_type);
+ break;
+ }
+ }
+}
+
+static int
+hv_kbd_connect_vsp(hv_kbd_sc *sc)
+{
+ int ret;
+ size_t resplen;
+ struct vmbus_xact *xact;
+ hv_kbd_proto_req *req;
+ const hv_kbd_proto_resp *resp;
+
+ xact = vmbus_xact_get(sc->hs_xact_ctx, sizeof(*req));
+ if (xact == NULL) {
+ device_printf(sc->dev, "no xact for kbd init");
+ return (ENODEV);
+ }
+ req = vmbus_xact_req_data(xact);
+ req->hdr.type = HV_KBD_PROTO_REQUEST;
+ req->ver = HV_KBD_VER;
+
+ vmbus_xact_activate(xact);
+ ret = vmbus_chan_send(sc->hs_chan,
+ VMBUS_CHANPKT_TYPE_INBAND,
+ VMBUS_CHANPKT_FLAG_RC,
+ req, sizeof(hv_kbd_proto_req),
+ (uint64_t)(uintptr_t)xact);
+ if (ret) {
+ device_printf(sc->dev, "fail to send\n");
+ vmbus_xact_deactivate(xact);
+ return (ret);
+ }
+ resp = vmbus_chan_xact_wait(sc->hs_chan, xact, &resplen, true);
+ if (resplen < HV_KBD_PROTO_RESP_SZ) {
+ device_printf(sc->dev, "hv_kbd init communicate failed\n");
+ ret = ENODEV;
+ goto clean;
+ }
+
+ if (!(resp->status & HV_KBD_PROTO_ACCEPTED)) {
+ device_printf(sc->dev, "hv_kbd protocol request failed\n");
+ ret = ENODEV;
+ }
+clean:
+ vmbus_xact_put(xact);
+ DEBUG_HVSC(sc, "finish connect vsp\n");
+ return (ret);
+}
+
+static int
+hv_kbd_attach1(device_t dev, vmbus_chan_callback_t cb)
+{
+ int ret;
+ hv_kbd_sc *sc;
+
+ sc = device_get_softc(dev);
+ sc->buflen = HV_BUFF_SIZE;
+ sc->buf = malloc(sc->buflen, M_DEVBUF, M_WAITOK | M_ZERO);
+ vmbus_chan_set_readbatch(sc->hs_chan, false);
+ ret = vmbus_chan_open(
+ sc->hs_chan,
+ HV_KBD_RINGBUFF_SEND_SZ,
+ HV_KBD_RINGBUFF_RECV_SZ,
+ NULL, 0,
+ cb,
+ sc);
+ if (ret != 0) {
+ free(sc->buf, M_DEVBUF);
+ }
+ return (ret);
+}
+
+static int
+hv_kbd_detach1(device_t dev)
+{
+ hv_kbd_sc *sc = device_get_softc(dev);
+ vmbus_chan_close(vmbus_get_channel(dev));
+ free(sc->buf, M_DEVBUF);
+ return (0);
+}
+
+static void
+hv_kbd_init(hv_kbd_sc *sc)
+{
+ const int max_list = 16;
+ int i;
+ keystroke_info *ksi;
+
+ mtx_init(&sc->ks_mtx, "hv_kbdc mutex", NULL, MTX_DEF);
+ LIST_INIT(&sc->ks_free_list);
+ STAILQ_INIT(&sc->ks_queue);
+ for (i = 0; i < max_list; i++) {
+ ksi = malloc(sizeof(keystroke_info),
+ M_DEVBUF, M_WAITOK|M_ZERO);
+ LIST_INSERT_HEAD(&sc->ks_free_list, ksi, link);
+ }
+}
+
+static void
+hv_kbd_fini(hv_kbd_sc *sc)
+{
+ keystroke_info *ksi;
+ while (!LIST_EMPTY(&sc->ks_free_list)) {
+ ksi = LIST_FIRST(&sc->ks_free_list);
+ LIST_REMOVE(ksi, link);
+ free(ksi, M_DEVBUF);
+ }
+ while (!STAILQ_EMPTY(&sc->ks_queue)) {
+ ksi = STAILQ_FIRST(&sc->ks_queue);
+ STAILQ_REMOVE_HEAD(&sc->ks_queue, slink);
+ free(ksi, M_DEVBUF);
+ }
+ mtx_destroy(&sc->ks_mtx);
+}
+
+static void
+hv_kbd_sysctl(device_t dev)
+{
+ struct sysctl_oid_list *child;
+ struct sysctl_ctx_list *ctx;
+ hv_kbd_sc *sc;
+
+ sc = device_get_softc(dev);
+ ctx = device_get_sysctl_ctx(dev);
+ child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "debug", CTLFLAG_RW,
+ &sc->debug, 0, "debug hyperv keyboard");
+}
+
+static int
+hv_kbd_attach(device_t dev)
+{
+ int error = 0;
+ hv_kbd_sc *sc;
+
+ sc = device_get_softc(dev);
+ sc->hs_chan = vmbus_get_channel(dev);
+ sc->dev = dev;
+ hv_kbd_init(sc);
+ sc->hs_xact_ctx = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
+ HV_KBD_PROTO_REQ_SZ, HV_KBD_PROTO_RESP_SZ, 0);
+ if (sc->hs_xact_ctx == NULL) {
+ error = ENOMEM;
+ goto failed;
+ }
+
+ error = hv_kbd_attach1(dev, hv_kbd_read_channel);
+ if (error)
+ goto failed;
+ error = hv_kbd_connect_vsp(sc);
+ if (error)
+ goto failed;
+
+ error = hv_kbd_drv_attach(dev);
+ if (error)
+ goto failed;
+ hv_kbd_sysctl(dev);
+ return (0);
+failed:
+ hv_kbd_detach(dev);
+ return (error);
+}
+
+static int
+hv_kbd_detach(device_t dev)
+{
+ int ret;
+ hv_kbd_sc *sc = device_get_softc(dev);
+ hv_kbd_fini(sc);
+ if (sc->hs_xact_ctx != NULL)
+ vmbus_xact_ctx_destroy(sc->hs_xact_ctx);
+ ret = hv_kbd_detach1(dev);
+ if (ret)
+ device_printf(dev, "Failed to detach\n");
+ return hv_kbd_drv_detach(dev);
+}
+
+static device_method_t kbd_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, hv_kbd_probe),
+ DEVMETHOD(device_attach, hv_kbd_attach),
+ DEVMETHOD(device_detach, hv_kbd_detach),
+ { 0, 0 }
+};
+
+static driver_t kbd_driver = {HVKBD_DRIVER_NAME , kbd_methods, sizeof(hv_kbd_sc)};
+
+static devclass_t kbd_devclass;
+
+DRIVER_MODULE(hv_kbd, vmbus, kbd_driver, kbd_devclass, hvkbd_driver_load, NULL);
+MODULE_VERSION(hv_kbd, 1);
+MODULE_DEPEND(hv_kbd, vmbus, 1, 1, 1);
diff --git a/sys/dev/hyperv/input/hv_kbdc.h b/sys/dev/hyperv/input/hv_kbdc.h
new file mode 100644
index 000000000000..f6f76035e8c3
--- /dev/null
+++ b/sys/dev/hyperv/input/hv_kbdc.h
@@ -0,0 +1,118 @@
+/*-
+ * Copyright (c) 2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HV_KBD_H
+#define _HV_KBD_H
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/systm.h>
+
+#include <dev/kbd/kbdreg.h>
+
+#include "opt_evdev.h"
+#ifdef EVDEV_SUPPORT
+#include <dev/evdev/evdev.h>
+#include <dev/evdev/input.h>
+#endif
+
+#define HVKBD_DRIVER_NAME "hvkbd"
+#define IS_UNICODE (1)
+#define IS_BREAK (2)
+#define IS_E0 (4)
+#define IS_E1 (8)
+
+#define XTKBD_EMUL0 (0xe0)
+#define XTKBD_EMUL1 (0xe1)
+#define XTKBD_RELEASE (0x80)
+
+#define DEBUG_HVSC(sc, ...) do { \
+ if (sc->debug > 0) { \
+ device_printf(sc->dev, __VA_ARGS__); \
+ } \
+} while (0)
+#define DEBUG_HVKBD(kbd, ...) do { \
+ hv_kbd_sc *sc = (kbd)->kb_data; \
+ DEBUG_HVSC(sc, __VA_ARGS__); \
+} while (0)
+
+struct vmbus_channel;
+struct vmbus_xact_ctx;
+
+typedef struct keystroke_t {
+ uint16_t makecode;
+ uint32_t info;
+} keystroke;
+
+typedef struct keystroke_info {
+ LIST_ENTRY(keystroke_info) link;
+ STAILQ_ENTRY(keystroke_info) slink;
+ keystroke ks;
+} keystroke_info;
+
+typedef struct hv_kbd_sc_t {
+ struct vmbus_channel *hs_chan;
+ device_t dev;
+ struct vmbus_xact_ctx *hs_xact_ctx;
+ int32_t buflen;
+ uint8_t *buf;
+
+ struct mtx ks_mtx;
+ LIST_HEAD(, keystroke_info) ks_free_list;
+ STAILQ_HEAD(, keystroke_info) ks_queue; /* keystroke info queue */
+
+ keyboard_t sc_kbd;
+ int sc_mode;
+ int sc_state;
+ uint32_t sc_accents; /* accent key index (> 0) */
+ uint32_t sc_composed_char; /* composed char code */
+ uint8_t sc_prefix; /* AT scan code prefix */
+ int sc_polling; /* polling recursion count */
+ uint32_t sc_flags;
+ int debug;
+
+#ifdef EVDEV_SUPPORT
+ struct evdev_dev *ks_evdev;
+ int ks_evdev_state;
+#endif
+} hv_kbd_sc;
+
+int hv_kbd_produce_ks(hv_kbd_sc *sc, const keystroke *ks);
+int hv_kbd_fetch_top(hv_kbd_sc *sc, keystroke *top);
+int hv_kbd_modify_top(hv_kbd_sc *sc, keystroke *top);
+int hv_kbd_remove_top(hv_kbd_sc *sc);
+int hv_kbd_prod_is_ready(hv_kbd_sc *sc);
+void hv_kbd_read_channel(struct vmbus_channel *, void *);
+
+int hv_kbd_drv_attach(device_t dev);
+int hv_kbd_drv_detach(device_t dev);
+
+int hvkbd_driver_load(module_t, int, void *);
+void hv_kbd_intr(hv_kbd_sc *sc);
+#endif
diff --git a/sys/dev/hyperv/netvsc/hn_nvs.c b/sys/dev/hyperv/netvsc/hn_nvs.c
new file mode 100644
index 000000000000..4dbc28996617
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/hn_nvs.c
@@ -0,0 +1,751 @@
+/*-
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2010-2012 Citrix Inc.
+ * Copyright (c) 2012 NetApp Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Network Virtualization Service.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet6.h"
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/socket.h>
+#include <sys/systm.h>
+#include <sys/taskqueue.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_media.h>
+
+#include <netinet/in.h>
+#include <netinet/tcp_lro.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+
+#include <dev/hyperv/netvsc/ndis.h>
+#include <dev/hyperv/netvsc/if_hnreg.h>
+#include <dev/hyperv/netvsc/if_hnvar.h>
+#include <dev/hyperv/netvsc/hn_nvs.h>
+
+static int hn_nvs_conn_chim(struct hn_softc *);
+static int hn_nvs_conn_rxbuf(struct hn_softc *);
+static void hn_nvs_disconn_chim(struct hn_softc *);
+static void hn_nvs_disconn_rxbuf(struct hn_softc *);
+static int hn_nvs_conf_ndis(struct hn_softc *, int);
+static int hn_nvs_init_ndis(struct hn_softc *);
+static int hn_nvs_doinit(struct hn_softc *, uint32_t);
+static int hn_nvs_init(struct hn_softc *);
+static const void *hn_nvs_xact_execute(struct hn_softc *,
+ struct vmbus_xact *, void *, int,
+ size_t *, uint32_t);
+static void hn_nvs_sent_none(struct hn_nvs_sendctx *,
+ struct hn_softc *, struct vmbus_channel *,
+ const void *, int);
+
+struct hn_nvs_sendctx hn_nvs_sendctx_none =
+ HN_NVS_SENDCTX_INITIALIZER(hn_nvs_sent_none, NULL);
+
+static const uint32_t hn_nvs_version[] = {
+ HN_NVS_VERSION_61,
+ HN_NVS_VERSION_6,
+ HN_NVS_VERSION_5,
+ HN_NVS_VERSION_4,
+ HN_NVS_VERSION_2,
+ HN_NVS_VERSION_1
+};
+
+static const void *
+hn_nvs_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact,
+ void *req, int reqlen, size_t *resplen0, uint32_t type)
+{
+ struct hn_nvs_sendctx sndc;
+ size_t resplen, min_resplen = *resplen0;
+ const struct hn_nvs_hdr *hdr;
+ int error;
+
+ KASSERT(min_resplen >= sizeof(*hdr),
+ ("invalid minimum response len %zu", min_resplen));
+
+ /*
+ * Execute the xact setup by the caller.
+ */
+ hn_nvs_sendctx_init(&sndc, hn_nvs_sent_xact, xact);
+
+ vmbus_xact_activate(xact);
+ error = hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_RC,
+ req, reqlen, &sndc);
+ if (error) {
+ vmbus_xact_deactivate(xact);
+ return (NULL);
+ }
+ hdr = vmbus_chan_xact_wait(sc->hn_prichan, xact, &resplen,
+ HN_CAN_SLEEP(sc));
+
+ /*
+ * Check this NVS response message.
+ */
+ if (resplen < min_resplen) {
+ if_printf(sc->hn_ifp, "invalid NVS resp len %zu\n", resplen);
+ return (NULL);
+ }
+ if (hdr->nvs_type != type) {
+ if_printf(sc->hn_ifp, "unexpected NVS resp 0x%08x, "
+ "expect 0x%08x\n", hdr->nvs_type, type);
+ return (NULL);
+ }
+ /* All pass! */
+ *resplen0 = resplen;
+ return (hdr);
+}
+
+static __inline int
+hn_nvs_req_send(struct hn_softc *sc, void *req, int reqlen)
+{
+
+ return (hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_NONE,
+ req, reqlen, &hn_nvs_sendctx_none));
+}
+
+static int
+hn_nvs_conn_rxbuf(struct hn_softc *sc)
+{
+ struct vmbus_xact *xact = NULL;
+ struct hn_nvs_rxbuf_conn *conn;
+ const struct hn_nvs_rxbuf_connresp *resp;
+ size_t resp_len;
+ uint32_t status;
+ int error, rxbuf_size;
+
+ /*
+ * Limit RXBUF size for old NVS.
+ */
+ if (sc->hn_nvs_ver <= HN_NVS_VERSION_2)
+ rxbuf_size = HN_RXBUF_SIZE_COMPAT;
+ else
+ rxbuf_size = HN_RXBUF_SIZE;
+
+ /*
+ * Connect the RXBUF GPADL to the primary channel.
+ *
+ * NOTE:
+ * Only the primary channel has the RXBUF connected to it;
+ * sub-channels just share this RXBUF.
+ */
+ error = vmbus_chan_gpadl_connect(sc->hn_prichan,
+ sc->hn_rxbuf_dma.hv_paddr, rxbuf_size, &sc->hn_rxbuf_gpadl);
+ if (error) {
+ if_printf(sc->hn_ifp, "rxbuf gpadl conn failed: %d\n",
+ error);
+ goto cleanup;
+ }
+
+ /*
+ * Connect RXBUF to NVS.
+ */
+
+ xact = vmbus_xact_get(sc->hn_xact, sizeof(*conn));
+ if (xact == NULL) {
+ if_printf(sc->hn_ifp, "no xact for nvs rxbuf conn\n");
+ error = ENXIO;
+ goto cleanup;
+ }
+ conn = vmbus_xact_req_data(xact);
+ conn->nvs_type = HN_NVS_TYPE_RXBUF_CONN;
+ conn->nvs_gpadl = sc->hn_rxbuf_gpadl;
+ conn->nvs_sig = HN_NVS_RXBUF_SIG;
+
+ resp_len = sizeof(*resp);
+ resp = hn_nvs_xact_execute(sc, xact, conn, sizeof(*conn), &resp_len,
+ HN_NVS_TYPE_RXBUF_CONNRESP);
+ if (resp == NULL) {
+ if_printf(sc->hn_ifp, "exec nvs rxbuf conn failed\n");
+ error = EIO;
+ goto cleanup;
+ }
+
+ status = resp->nvs_status;
+ vmbus_xact_put(xact);
+ xact = NULL;
+
+ if (status != HN_NVS_STATUS_OK) {
+ if_printf(sc->hn_ifp, "nvs rxbuf conn failed: %x\n", status);
+ error = EIO;
+ goto cleanup;
+ }
+ sc->hn_flags |= HN_FLAG_RXBUF_CONNECTED;
+
+ return (0);
+
+cleanup:
+ if (xact != NULL)
+ vmbus_xact_put(xact);
+ hn_nvs_disconn_rxbuf(sc);
+ return (error);
+}
+
+static int
+hn_nvs_conn_chim(struct hn_softc *sc)
+{
+ struct vmbus_xact *xact = NULL;
+ struct hn_nvs_chim_conn *chim;
+ const struct hn_nvs_chim_connresp *resp;
+ size_t resp_len;
+ uint32_t status, sectsz;
+ int error;
+
+ /*
+ * Connect chimney sending buffer GPADL to the primary channel.
+ *
+ * NOTE:
+	 * Only the primary channel has the chimney sending buffer connected
+	 * to it.  Sub-channels just share this chimney sending buffer.
+ */
+ error = vmbus_chan_gpadl_connect(sc->hn_prichan,
+ sc->hn_chim_dma.hv_paddr, HN_CHIM_SIZE, &sc->hn_chim_gpadl);
+ if (error) {
+ if_printf(sc->hn_ifp, "chim gpadl conn failed: %d\n", error);
+ goto cleanup;
+ }
+
+ /*
+ * Connect chimney sending buffer to NVS
+ */
+
+ xact = vmbus_xact_get(sc->hn_xact, sizeof(*chim));
+ if (xact == NULL) {
+ if_printf(sc->hn_ifp, "no xact for nvs chim conn\n");
+ error = ENXIO;
+ goto cleanup;
+ }
+ chim = vmbus_xact_req_data(xact);
+ chim->nvs_type = HN_NVS_TYPE_CHIM_CONN;
+ chim->nvs_gpadl = sc->hn_chim_gpadl;
+ chim->nvs_sig = HN_NVS_CHIM_SIG;
+
+ resp_len = sizeof(*resp);
+ resp = hn_nvs_xact_execute(sc, xact, chim, sizeof(*chim), &resp_len,
+ HN_NVS_TYPE_CHIM_CONNRESP);
+ if (resp == NULL) {
+ if_printf(sc->hn_ifp, "exec nvs chim conn failed\n");
+ error = EIO;
+ goto cleanup;
+ }
+
+ status = resp->nvs_status;
+ sectsz = resp->nvs_sectsz;
+ vmbus_xact_put(xact);
+ xact = NULL;
+
+ if (status != HN_NVS_STATUS_OK) {
+ if_printf(sc->hn_ifp, "nvs chim conn failed: %x\n", status);
+ error = EIO;
+ goto cleanup;
+ }
+ if (sectsz == 0 || sectsz % sizeof(uint32_t) != 0) {
+ /*
+ * Can't use chimney sending buffer; done!
+ */
+ if (sectsz == 0) {
+ if_printf(sc->hn_ifp, "zero chimney sending buffer "
+ "section size\n");
+ } else {
+ if_printf(sc->hn_ifp, "misaligned chimney sending "
+ "buffers, section size: %u\n", sectsz);
+ }
+ sc->hn_chim_szmax = 0;
+ sc->hn_chim_cnt = 0;
+ sc->hn_flags |= HN_FLAG_CHIM_CONNECTED;
+ return (0);
+ }
+
+ sc->hn_chim_szmax = sectsz;
+ sc->hn_chim_cnt = HN_CHIM_SIZE / sc->hn_chim_szmax;
+ if (HN_CHIM_SIZE % sc->hn_chim_szmax != 0) {
+ if_printf(sc->hn_ifp, "chimney sending sections are "
+ "not properly aligned\n");
+ }
+ if (sc->hn_chim_cnt % LONG_BIT != 0) {
+ if_printf(sc->hn_ifp, "discard %d chimney sending sections\n",
+ sc->hn_chim_cnt % LONG_BIT);
+ }
+
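+	/*
+	 * The usage bitmap is sized in whole longs, so any sections beyond
+	 * a LONG_BIT multiple (reported above) are not tracked and go unused.
+	 */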
+ sc->hn_chim_bmap_cnt = sc->hn_chim_cnt / LONG_BIT;
+ sc->hn_chim_bmap = malloc(sc->hn_chim_bmap_cnt * sizeof(u_long),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+
+ /* Done! */
+ sc->hn_flags |= HN_FLAG_CHIM_CONNECTED;
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "chimney sending buffer %d/%d\n",
+ sc->hn_chim_szmax, sc->hn_chim_cnt);
+ }
+ return (0);
+
+cleanup:
+ if (xact != NULL)
+ vmbus_xact_put(xact);
+ hn_nvs_disconn_chim(sc);
+ return (error);
+}
+
+static void
+hn_nvs_disconn_rxbuf(struct hn_softc *sc)
+{
+ int error;
+
+ if (sc->hn_flags & HN_FLAG_RXBUF_CONNECTED) {
+ struct hn_nvs_rxbuf_disconn disconn;
+
+ /*
+ * Disconnect RXBUF from NVS.
+ */
+ memset(&disconn, 0, sizeof(disconn));
+ disconn.nvs_type = HN_NVS_TYPE_RXBUF_DISCONN;
+ disconn.nvs_sig = HN_NVS_RXBUF_SIG;
+
+ /* NOTE: No response. */
+ error = hn_nvs_req_send(sc, &disconn, sizeof(disconn));
+ if (error) {
+ if_printf(sc->hn_ifp,
+ "send nvs rxbuf disconn failed: %d\n", error);
+ /*
+ * Fine for a revoked channel, since the hypervisor
+ * does not drain TX bufring for a revoked channel.
+ */
+ if (!vmbus_chan_is_revoked(sc->hn_prichan))
+ sc->hn_flags |= HN_FLAG_RXBUF_REF;
+ }
+ sc->hn_flags &= ~HN_FLAG_RXBUF_CONNECTED;
+
+ /*
+ * Wait for the hypervisor to receive this NVS request.
+ *
+ * NOTE:
+ * The TX bufring will not be drained by the hypervisor,
+ * if the primary channel is revoked.
+ */
+ while (!vmbus_chan_tx_empty(sc->hn_prichan) &&
+ !vmbus_chan_is_revoked(sc->hn_prichan))
+ pause("waittx", 1);
+ /*
+ * Linger long enough for NVS to disconnect RXBUF.
+ */
+ pause("lingtx", (200 * hz) / 1000);
+ }
+
+ if (vmbus_current_version < VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
+ /*
+ * Disconnect RXBUF from primary channel.
+ */
+ error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
+ sc->hn_rxbuf_gpadl);
+ if (error) {
+ if_printf(sc->hn_ifp,
+ "rxbuf gpadl disconn failed: %d\n", error);
+ sc->hn_flags |= HN_FLAG_RXBUF_REF;
+ }
+ sc->hn_rxbuf_gpadl = 0;
+ }
+}
+
+static void
+hn_nvs_disconn_chim(struct hn_softc *sc)
+{
+ int error;
+
+ if (sc->hn_flags & HN_FLAG_CHIM_CONNECTED) {
+ struct hn_nvs_chim_disconn disconn;
+
+ /*
+ * Disconnect chimney sending buffer from NVS.
+ */
+ memset(&disconn, 0, sizeof(disconn));
+ disconn.nvs_type = HN_NVS_TYPE_CHIM_DISCONN;
+ disconn.nvs_sig = HN_NVS_CHIM_SIG;
+
+ /* NOTE: No response. */
+ error = hn_nvs_req_send(sc, &disconn, sizeof(disconn));
+ if (error) {
+ if_printf(sc->hn_ifp,
+ "send nvs chim disconn failed: %d\n", error);
+ /*
+ * Fine for a revoked channel, since the hypervisor
+ * does not drain TX bufring for a revoked channel.
+ */
+ if (!vmbus_chan_is_revoked(sc->hn_prichan))
+ sc->hn_flags |= HN_FLAG_CHIM_REF;
+ }
+ sc->hn_flags &= ~HN_FLAG_CHIM_CONNECTED;
+
+ /*
+ * Wait for the hypervisor to receive this NVS request.
+ *
+ * NOTE:
+ * The TX bufring will not be drained by the hypervisor,
+ * if the primary channel is revoked.
+ */
+ while (!vmbus_chan_tx_empty(sc->hn_prichan) &&
+ !vmbus_chan_is_revoked(sc->hn_prichan))
+ pause("waittx", 1);
+ /*
+ * Linger long enough for NVS to disconnect chimney
+ * sending buffer.
+ */
+ pause("lingtx", (200 * hz) / 1000);
+ }
+
+ if (vmbus_current_version < VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
+ /*
+ * Disconnect chimney sending buffer from primary channel.
+ */
+ error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
+ sc->hn_chim_gpadl);
+ if (error) {
+ if_printf(sc->hn_ifp,
+ "chim gpadl disconn failed: %d\n", error);
+ sc->hn_flags |= HN_FLAG_CHIM_REF;
+ }
+ sc->hn_chim_gpadl = 0;
+ }
+
+ if (sc->hn_chim_bmap != NULL) {
+ free(sc->hn_chim_bmap, M_DEVBUF);
+ sc->hn_chim_bmap = NULL;
+ sc->hn_chim_bmap_cnt = 0;
+ }
+}
+
+static int
+hn_nvs_doinit(struct hn_softc *sc, uint32_t nvs_ver)
+{
+ struct vmbus_xact *xact;
+ struct hn_nvs_init *init;
+ const struct hn_nvs_init_resp *resp;
+ size_t resp_len;
+ uint32_t status;
+
+ xact = vmbus_xact_get(sc->hn_xact, sizeof(*init));
+ if (xact == NULL) {
+ if_printf(sc->hn_ifp, "no xact for nvs init\n");
+ return (ENXIO);
+ }
+ init = vmbus_xact_req_data(xact);
+ init->nvs_type = HN_NVS_TYPE_INIT;
+ init->nvs_ver_min = nvs_ver;
+ init->nvs_ver_max = nvs_ver;
+
+ resp_len = sizeof(*resp);
+ resp = hn_nvs_xact_execute(sc, xact, init, sizeof(*init), &resp_len,
+ HN_NVS_TYPE_INIT_RESP);
+ if (resp == NULL) {
+ if_printf(sc->hn_ifp, "exec init failed\n");
+ vmbus_xact_put(xact);
+ return (EIO);
+ }
+
+ status = resp->nvs_status;
+ vmbus_xact_put(xact);
+
+ if (status != HN_NVS_STATUS_OK) {
+ if (bootverbose) {
+ /*
+ * Caller may try another NVS version, and will log
+ * error if there are no more NVS versions to try,
+ * so don't bark out loud here.
+ */
+ if_printf(sc->hn_ifp, "nvs init failed for ver 0x%x\n",
+ nvs_ver);
+ }
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * Configure MTU and enable VLAN.
+ */
+static int
+hn_nvs_conf_ndis(struct hn_softc *sc, int mtu)
+{
+ struct hn_nvs_ndis_conf conf;
+ int error;
+
+ memset(&conf, 0, sizeof(conf));
+ conf.nvs_type = HN_NVS_TYPE_NDIS_CONF;
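+	/* The NVS MTU covers the Ethernet header, unlike the interface MTU. */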
+ conf.nvs_mtu = mtu + ETHER_HDR_LEN;
+ conf.nvs_caps = HN_NVS_NDIS_CONF_VLAN;
+ if (sc->hn_nvs_ver >= HN_NVS_VERSION_5)
+ conf.nvs_caps |= HN_NVS_NDIS_CONF_SRIOV;
+ if (sc->hn_nvs_ver >= HN_NVS_VERSION_61)
+ conf.nvs_caps |= HN_NVS_NDIS_CONF_RSC;
+
+ /* NOTE: No response. */
+ error = hn_nvs_req_send(sc, &conf, sizeof(conf));
+ if (error) {
+ if_printf(sc->hn_ifp, "send nvs ndis conf failed: %d\n", error);
+ return (error);
+ }
+
+ if (bootverbose)
+ if_printf(sc->hn_ifp, "nvs ndis conf done\n");
+ sc->hn_caps |= HN_CAP_MTU | HN_CAP_VLAN;
+ return (0);
+}
+
+static int
+hn_nvs_init_ndis(struct hn_softc *sc)
+{
+ struct hn_nvs_ndis_init ndis;
+ int error;
+
+ memset(&ndis, 0, sizeof(ndis));
+ ndis.nvs_type = HN_NVS_TYPE_NDIS_INIT;
+ ndis.nvs_ndis_major = HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver);
+ ndis.nvs_ndis_minor = HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver);
+
+ /* NOTE: No response. */
+ error = hn_nvs_req_send(sc, &ndis, sizeof(ndis));
+ if (error)
+ if_printf(sc->hn_ifp, "send nvs ndis init failed: %d\n", error);
+ return (error);
+}
+
+static int
+hn_nvs_init(struct hn_softc *sc)
+{
+ int i, error;
+
+ if (device_is_attached(sc->hn_dev)) {
+ /*
+ * NVS version and NDIS version MUST NOT be changed.
+ */
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "reinit NVS version 0x%x, "
+ "NDIS version %u.%u\n", sc->hn_nvs_ver,
+ HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
+ HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
+ }
+
+ error = hn_nvs_doinit(sc, sc->hn_nvs_ver);
+ if (error) {
+ if_printf(sc->hn_ifp, "reinit NVS version 0x%x "
+ "failed: %d\n", sc->hn_nvs_ver, error);
+ return (error);
+ }
+ goto done;
+ }
+
+ /*
+ * Find the supported NVS version and set NDIS version accordingly.
+ */
+ for (i = 0; i < nitems(hn_nvs_version); ++i) {
+ error = hn_nvs_doinit(sc, hn_nvs_version[i]);
+ if (!error) {
+ sc->hn_nvs_ver = hn_nvs_version[i];
+
+ /* Set NDIS version according to NVS version. */
+ sc->hn_ndis_ver = HN_NDIS_VERSION_6_30;
+ if (sc->hn_nvs_ver <= HN_NVS_VERSION_4)
+ sc->hn_ndis_ver = HN_NDIS_VERSION_6_1;
+
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "NVS version 0x%x, "
+ "NDIS version %u.%u\n", sc->hn_nvs_ver,
+ HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
+ HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
+ }
+ goto done;
+ }
+ }
+ if_printf(sc->hn_ifp, "no NVS available\n");
+ return (ENXIO);
+
+done:
+ if (sc->hn_nvs_ver >= HN_NVS_VERSION_5)
+ sc->hn_caps |= HN_CAP_HASHVAL;
+ return (0);
+}
+
+int
+hn_nvs_attach(struct hn_softc *sc, int mtu)
+{
+ int error;
+
+ if (hyperv_ver_major >= 10) {
+ /* UDP 4-tuple hash is enforced. */
+ sc->hn_caps |= HN_CAP_UDPHASH;
+ }
+
+ /*
+ * Initialize NVS.
+ */
+ error = hn_nvs_init(sc);
+ if (error)
+ return (error);
+
+ if (sc->hn_nvs_ver >= HN_NVS_VERSION_2) {
+ /*
+ * Configure NDIS before initializing it.
+ */
+ error = hn_nvs_conf_ndis(sc, mtu);
+ if (error)
+ return (error);
+ }
+
+ /*
+ * Initialize NDIS.
+ */
+ error = hn_nvs_init_ndis(sc);
+ if (error)
+ return (error);
+
+ /*
+ * Connect RXBUF.
+ */
+ error = hn_nvs_conn_rxbuf(sc);
+ if (error)
+ return (error);
+
+ /*
+ * Connect chimney sending buffer.
+ */
+ error = hn_nvs_conn_chim(sc);
+ if (error) {
+ hn_nvs_disconn_rxbuf(sc);
+ return (error);
+ }
+ return (0);
+}
+
+void
+hn_nvs_detach(struct hn_softc *sc)
+{
+
+ /* NOTE: there are no requests to stop the NVS. */
+ hn_nvs_disconn_rxbuf(sc);
+ hn_nvs_disconn_chim(sc);
+}
+
+void
+hn_nvs_sent_xact(struct hn_nvs_sendctx *sndc,
+ struct hn_softc *sc __unused, struct vmbus_channel *chan __unused,
+ const void *data, int dlen)
+{
+
+ vmbus_xact_wakeup(sndc->hn_cbarg, data, dlen);
+}
+
+static void
+hn_nvs_sent_none(struct hn_nvs_sendctx *sndc __unused,
+ struct hn_softc *sc __unused, struct vmbus_channel *chan __unused,
+ const void *data __unused, int dlen __unused)
+{
+ /* EMPTY */
+}
+
+int
+hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch0)
+{
+ struct vmbus_xact *xact;
+ struct hn_nvs_subch_req *req;
+ const struct hn_nvs_subch_resp *resp;
+ int error, nsubch_req;
+ uint32_t nsubch;
+ size_t resp_len;
+
+ nsubch_req = *nsubch0;
+ KASSERT(nsubch_req > 0, ("invalid # of sub-channels %d", nsubch_req));
+
+ xact = vmbus_xact_get(sc->hn_xact, sizeof(*req));
+ if (xact == NULL) {
+ if_printf(sc->hn_ifp, "no xact for nvs subch alloc\n");
+ return (ENXIO);
+ }
+ req = vmbus_xact_req_data(xact);
+ req->nvs_type = HN_NVS_TYPE_SUBCH_REQ;
+ req->nvs_op = HN_NVS_SUBCH_OP_ALLOC;
+ req->nvs_nsubch = nsubch_req;
+
+ resp_len = sizeof(*resp);
+ resp = hn_nvs_xact_execute(sc, xact, req, sizeof(*req), &resp_len,
+ HN_NVS_TYPE_SUBCH_RESP);
+ if (resp == NULL) {
+ if_printf(sc->hn_ifp, "exec nvs subch alloc failed\n");
+ error = EIO;
+ goto done;
+ }
+ if (resp->nvs_status != HN_NVS_STATUS_OK) {
+ if_printf(sc->hn_ifp, "nvs subch alloc failed: %x\n",
+ resp->nvs_status);
+ error = EIO;
+ goto done;
+ }
+
+ nsubch = resp->nvs_nsubch;
+ if (nsubch > nsubch_req) {
+ if_printf(sc->hn_ifp, "%u subchans are allocated, "
+ "requested %d\n", nsubch, nsubch_req);
+ nsubch = nsubch_req;
+ }
+ *nsubch0 = nsubch;
+ error = 0;
+done:
+ vmbus_xact_put(xact);
+ return (error);
+}
+
+int
+hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan,
+ struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt)
+{
+
+ return hn_nvs_send_rndis_sglist(chan, HN_NVS_RNDIS_MTYPE_CTRL,
+ sndc, gpa, gpa_cnt);
+}
+
+void
+hn_nvs_set_datapath(struct hn_softc *sc, uint32_t path)
+{
+ struct hn_nvs_datapath dp;
+
+ memset(&dp, 0, sizeof(dp));
+ dp.nvs_type = HN_NVS_TYPE_SET_DATAPATH;
+ dp.nvs_active_path = path;
+
+ hn_nvs_req_send(sc, &dp, sizeof(dp));
+}
diff --git a/sys/dev/hyperv/netvsc/hn_nvs.h b/sys/dev/hyperv/netvsc/hn_nvs.h
new file mode 100644
index 000000000000..a14d7b765590
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/hn_nvs.h
@@ -0,0 +1,107 @@
+/*-
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2010-2012 Citrix Inc.
+ * Copyright (c) 2012 NetApp Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HN_NVS_H_
+#define _HN_NVS_H_
+
+struct hn_nvs_sendctx;
+struct vmbus_channel;
+struct hn_softc;
+
+typedef void (*hn_nvs_sent_t)
+ (struct hn_nvs_sendctx *, struct hn_softc *,
+ struct vmbus_channel *, const void *, int);
+
+struct hn_nvs_sendctx {
+ hn_nvs_sent_t hn_cb;
+ void *hn_cbarg;
+};
+
+#define HN_NVS_SENDCTX_INITIALIZER(cb, cbarg) \
+{ \
+ .hn_cb = cb, \
+ .hn_cbarg = cbarg \
+}
+
+static __inline void
+hn_nvs_sendctx_init(struct hn_nvs_sendctx *sndc, hn_nvs_sent_t cb, void *cbarg)
+{
+
+ sndc->hn_cb = cb;
+ sndc->hn_cbarg = cbarg;
+}
+
+static __inline int
+hn_nvs_send(struct vmbus_channel *chan, uint16_t flags,
+ void *nvs_msg, int nvs_msglen, struct hn_nvs_sendctx *sndc)
+{
+
+ return (vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, flags,
+ nvs_msg, nvs_msglen, (uint64_t)(uintptr_t)sndc));
+}
+
+static __inline int
+hn_nvs_send_sglist(struct vmbus_channel *chan, struct vmbus_gpa sg[], int sglen,
+ void *nvs_msg, int nvs_msglen, struct hn_nvs_sendctx *sndc)
+{
+
+ return (vmbus_chan_send_sglist(chan, sg, sglen, nvs_msg, nvs_msglen,
+ (uint64_t)(uintptr_t)sndc));
+}
+
+static __inline int
+hn_nvs_send_rndis_sglist(struct vmbus_channel *chan, uint32_t rndis_mtype,
+ struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt)
+{
+ struct hn_nvs_rndis rndis;
+
+ rndis.nvs_type = HN_NVS_TYPE_RNDIS;
+ rndis.nvs_rndis_mtype = rndis_mtype;
+ rndis.nvs_chim_idx = HN_NVS_CHIM_IDX_INVALID;
+ rndis.nvs_chim_sz = 0;
+
+ return (hn_nvs_send_sglist(chan, gpa, gpa_cnt,
+ &rndis, sizeof(rndis), sndc));
+}
+
+int hn_nvs_attach(struct hn_softc *sc, int mtu);
+void hn_nvs_detach(struct hn_softc *sc);
+int hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch);
+void hn_nvs_sent_xact(struct hn_nvs_sendctx *sndc,
+ struct hn_softc *sc, struct vmbus_channel *chan,
+ const void *data, int dlen);
+int hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan,
+ struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa,
+ int gpa_cnt);
+void hn_nvs_set_datapath(struct hn_softc *sc, uint32_t path);
+
+extern struct hn_nvs_sendctx hn_nvs_sendctx_none;
+
+#endif /* !_HN_NVS_H_ */
diff --git a/sys/dev/hyperv/netvsc/hn_rndis.c b/sys/dev/hyperv/netvsc/hn_rndis.c
new file mode 100644
index 000000000000..108950aa3f9b
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/hn_rndis.c
@@ -0,0 +1,1061 @@
+/*-
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2010-2012 Citrix Inc.
+ * Copyright (c) 2012 NetApp Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet6.h"
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/systm.h>
+#include <sys/taskqueue.h>
+
+#include <machine/atomic.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_media.h>
+#include <net/rndis.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp_lro.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+
+#include <dev/hyperv/netvsc/ndis.h>
+#include <dev/hyperv/netvsc/if_hnreg.h>
+#include <dev/hyperv/netvsc/if_hnvar.h>
+#include <dev/hyperv/netvsc/hn_nvs.h>
+#include <dev/hyperv/netvsc/hn_rndis.h>
+
+#define HN_RNDIS_RID_COMPAT_MASK 0xffff
+#define HN_RNDIS_RID_COMPAT_MAX HN_RNDIS_RID_COMPAT_MASK
+
+#define HN_RNDIS_XFER_SIZE 2048
+
+#define HN_NDIS_TXCSUM_CAP_IP4 \
+ (NDIS_TXCSUM_CAP_IP4 | NDIS_TXCSUM_CAP_IP4OPT)
+#define HN_NDIS_TXCSUM_CAP_TCP4 \
+ (NDIS_TXCSUM_CAP_TCP4 | NDIS_TXCSUM_CAP_TCP4OPT)
+#define HN_NDIS_TXCSUM_CAP_TCP6 \
+ (NDIS_TXCSUM_CAP_TCP6 | NDIS_TXCSUM_CAP_TCP6OPT | \
+ NDIS_TXCSUM_CAP_IP6EXT)
+#define HN_NDIS_TXCSUM_CAP_UDP6 \
+ (NDIS_TXCSUM_CAP_UDP6 | NDIS_TXCSUM_CAP_IP6EXT)
+#define HN_NDIS_LSOV2_CAP_IP6 \
+ (NDIS_LSOV2_CAP_IP6EXT | NDIS_LSOV2_CAP_TCP6OPT)
+
+static const void *hn_rndis_xact_exec1(struct hn_softc *,
+ struct vmbus_xact *, size_t,
+ struct hn_nvs_sendctx *, size_t *);
+static const void *hn_rndis_xact_execute(struct hn_softc *,
+ struct vmbus_xact *, uint32_t, size_t, size_t *,
+ uint32_t);
+static int hn_rndis_query(struct hn_softc *, uint32_t,
+ const void *, size_t, void *, size_t *);
+static int hn_rndis_query2(struct hn_softc *, uint32_t,
+ const void *, size_t, void *, size_t *, size_t);
+static int hn_rndis_set(struct hn_softc *, uint32_t,
+ const void *, size_t);
+static int hn_rndis_init(struct hn_softc *);
+static int hn_rndis_halt(struct hn_softc *);
+static int hn_rndis_conf_offload(struct hn_softc *, int);
+static int hn_rndis_query_hwcaps(struct hn_softc *,
+ struct ndis_offload *);
+
+static __inline uint32_t
+hn_rndis_rid(struct hn_softc *sc)
+{
+ uint32_t rid;
+
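+	/* 0 is never used as a RID; retry on the initial value and on wrap. */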
+again:
+ rid = atomic_fetchadd_int(&sc->hn_rndis_rid, 1);
+ if (rid == 0)
+ goto again;
+
+ /* Use upper 16 bits for non-compat RNDIS messages. */
+ return ((rid & 0xffff) << 16);
+}
+
+void
+hn_rndis_rx_ctrl(struct hn_softc *sc, const void *data, int dlen)
+{
+ const struct rndis_comp_hdr *comp;
+ const struct rndis_msghdr *hdr;
+
+ KASSERT(dlen >= sizeof(*hdr), ("invalid RNDIS msg\n"));
+ hdr = data;
+
+ switch (hdr->rm_type) {
+ case REMOTE_NDIS_INITIALIZE_CMPLT:
+ case REMOTE_NDIS_QUERY_CMPLT:
+ case REMOTE_NDIS_SET_CMPLT:
+ case REMOTE_NDIS_KEEPALIVE_CMPLT: /* unused */
+ if (dlen < sizeof(*comp)) {
+ if_printf(sc->hn_ifp, "invalid RNDIS cmplt\n");
+ return;
+ }
+ comp = data;
+
+ KASSERT(comp->rm_rid > HN_RNDIS_RID_COMPAT_MAX,
+ ("invalid RNDIS rid 0x%08x\n", comp->rm_rid));
+ vmbus_xact_ctx_wakeup(sc->hn_xact, comp, dlen);
+ break;
+
+ case REMOTE_NDIS_RESET_CMPLT:
+ /*
+ * Reset completed, no rid.
+ *
+ * NOTE:
+ * RESET is not issued by hn(4), so this message should
+ * _not_ be observed.
+ */
+ if_printf(sc->hn_ifp, "RESET cmplt received\n");
+ break;
+
+ default:
+ if_printf(sc->hn_ifp, "unknown RNDIS msg 0x%x\n",
+ hdr->rm_type);
+ break;
+ }
+}
+
+int
+hn_rndis_get_eaddr(struct hn_softc *sc, uint8_t *eaddr)
+{
+ size_t eaddr_len;
+ int error;
+
+ eaddr_len = ETHER_ADDR_LEN;
+ error = hn_rndis_query(sc, OID_802_3_PERMANENT_ADDRESS, NULL, 0,
+ eaddr, &eaddr_len);
+ if (error)
+ return (error);
+ if (eaddr_len != ETHER_ADDR_LEN) {
+ if_printf(sc->hn_ifp, "invalid eaddr len %zu\n", eaddr_len);
+ return (EINVAL);
+ }
+ return (0);
+}
+
+int
+hn_rndis_get_linkstatus(struct hn_softc *sc, uint32_t *link_status)
+{
+ size_t size;
+ int error;
+
+ size = sizeof(*link_status);
+ error = hn_rndis_query(sc, OID_GEN_MEDIA_CONNECT_STATUS, NULL, 0,
+ link_status, &size);
+ if (error)
+ return (error);
+ if (size != sizeof(uint32_t)) {
+ if_printf(sc->hn_ifp, "invalid link status len %zu\n", size);
+ return (EINVAL);
+ }
+ return (0);
+}
+
+int
+hn_rndis_get_mtu(struct hn_softc *sc, uint32_t *mtu)
+{
+ size_t size;
+ int error;
+
+ size = sizeof(*mtu);
+ error = hn_rndis_query(sc, OID_GEN_MAXIMUM_FRAME_SIZE, NULL, 0,
+ mtu, &size);
+ if (error)
+ return (error);
+ if (size != sizeof(uint32_t)) {
+ if_printf(sc->hn_ifp, "invalid mtu len %zu\n", size);
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static const void *
+hn_rndis_xact_exec1(struct hn_softc *sc, struct vmbus_xact *xact, size_t reqlen,
+ struct hn_nvs_sendctx *sndc, size_t *comp_len)
+{
+ struct vmbus_gpa gpa[HN_XACT_REQ_PGCNT];
+ int gpa_cnt, error;
+ bus_addr_t paddr;
+
+ KASSERT(reqlen <= HN_XACT_REQ_SIZE && reqlen > 0,
+ ("invalid request length %zu", reqlen));
+
+ /*
+ * Setup the SG list.
+ */
+ paddr = vmbus_xact_req_paddr(xact);
+ KASSERT((paddr & PAGE_MASK) == 0,
+ ("vmbus xact request is not page aligned 0x%jx", (uintmax_t)paddr));
+ for (gpa_cnt = 0; gpa_cnt < HN_XACT_REQ_PGCNT; ++gpa_cnt) {
+ int len = PAGE_SIZE;
+
+ if (reqlen == 0)
+ break;
+ if (reqlen < len)
+ len = reqlen;
+
+ gpa[gpa_cnt].gpa_page = atop(paddr) + gpa_cnt;
+ gpa[gpa_cnt].gpa_len = len;
+ gpa[gpa_cnt].gpa_ofs = 0;
+
+ reqlen -= len;
+ }
+ KASSERT(reqlen == 0, ("still have %zu request data left", reqlen));
+
+ /*
+ * Send this RNDIS control message and wait for its completion
+ * message.
+ */
+ vmbus_xact_activate(xact);
+ error = hn_nvs_send_rndis_ctrl(sc->hn_prichan, sndc, gpa, gpa_cnt);
+ if (error) {
+ vmbus_xact_deactivate(xact);
+ if_printf(sc->hn_ifp, "RNDIS ctrl send failed: %d\n", error);
+ return (NULL);
+ }
+ return (vmbus_chan_xact_wait(sc->hn_prichan, xact, comp_len,
+ HN_CAN_SLEEP(sc)));
+}
+
+static const void *
+hn_rndis_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact, uint32_t rid,
+ size_t reqlen, size_t *comp_len0, uint32_t comp_type)
+{
+ const struct rndis_comp_hdr *comp;
+ size_t comp_len, min_complen = *comp_len0;
+
+ KASSERT(rid > HN_RNDIS_RID_COMPAT_MAX, ("invalid rid %u\n", rid));
+ KASSERT(min_complen >= sizeof(*comp),
+ ("invalid minimum complete len %zu", min_complen));
+
+ /*
+	 * Execute the xact set up by the caller.
+ */
+ comp = hn_rndis_xact_exec1(sc, xact, reqlen, &hn_nvs_sendctx_none,
+ &comp_len);
+ if (comp == NULL)
+ return (NULL);
+
+ /*
+ * Check this RNDIS complete message.
+ */
+ if (comp_len < min_complen) {
+ if (comp_len >= sizeof(*comp)) {
+ /* rm_status field is valid */
+ if_printf(sc->hn_ifp, "invalid RNDIS comp len %zu, "
+ "status 0x%08x\n", comp_len, comp->rm_status);
+ } else {
+ if_printf(sc->hn_ifp, "invalid RNDIS comp len %zu\n",
+ comp_len);
+ }
+ return (NULL);
+ }
+ if (comp->rm_len < min_complen) {
+ if_printf(sc->hn_ifp, "invalid RNDIS comp msglen %u\n",
+ comp->rm_len);
+ return (NULL);
+ }
+ if (comp->rm_type != comp_type) {
+ if_printf(sc->hn_ifp, "unexpected RNDIS comp 0x%08x, "
+ "expect 0x%08x\n", comp->rm_type, comp_type);
+ return (NULL);
+ }
+ if (comp->rm_rid != rid) {
+ if_printf(sc->hn_ifp, "RNDIS comp rid mismatch %u, "
+ "expect %u\n", comp->rm_rid, rid);
+ return (NULL);
+ }
+ /* All pass! */
+ *comp_len0 = comp_len;
+ return (comp);
+}
+
+static int
+hn_rndis_query(struct hn_softc *sc, uint32_t oid,
+ const void *idata, size_t idlen, void *odata, size_t *odlen0)
+{
+
+ return (hn_rndis_query2(sc, oid, idata, idlen, odata, odlen0, *odlen0));
+}
+
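+/*
+ * Like hn_rndis_query(), but the acceptable completion may carry less
+ * output data (min_odlen) than the caller's buffer (*odlen0) can hold.
+ */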
+static int
+hn_rndis_query2(struct hn_softc *sc, uint32_t oid,
+ const void *idata, size_t idlen, void *odata, size_t *odlen0,
+ size_t min_odlen)
+{
+ struct rndis_query_req *req;
+ const struct rndis_query_comp *comp;
+ struct vmbus_xact *xact;
+ size_t reqlen, odlen = *odlen0, comp_len;
+ int error, ofs;
+ uint32_t rid;
+
+ reqlen = sizeof(*req) + idlen;
+ xact = vmbus_xact_get(sc->hn_xact, reqlen);
+ if (xact == NULL) {
+ if_printf(sc->hn_ifp, "no xact for RNDIS query 0x%08x\n", oid);
+ return (ENXIO);
+ }
+ rid = hn_rndis_rid(sc);
+ req = vmbus_xact_req_data(xact);
+ req->rm_type = REMOTE_NDIS_QUERY_MSG;
+ req->rm_len = reqlen;
+ req->rm_rid = rid;
+ req->rm_oid = oid;
+ /*
+ * XXX
+ * This is _not_ RNDIS Spec conforming:
+ * "This MUST be set to 0 when there is no input data
+ * associated with the OID."
+ *
+ * If this field was set to 0 according to the RNDIS Spec,
+ * Hyper-V would set non-SUCCESS status in the query
+ * completion.
+ */
+ req->rm_infobufoffset = RNDIS_QUERY_REQ_INFOBUFOFFSET;
+
+ if (idlen > 0) {
+ req->rm_infobuflen = idlen;
+ /* Input data immediately follows RNDIS query. */
+ memcpy(req + 1, idata, idlen);
+ }
+
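+	/*
+	 * comp_len is in/out: the minimum acceptable completion size going
+	 * in, the actual completion size coming back; the latter bounds the
+	 * info-buffer checks below.
+	 */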
+ comp_len = sizeof(*comp) + min_odlen;
+ comp = hn_rndis_xact_execute(sc, xact, rid, reqlen, &comp_len,
+ REMOTE_NDIS_QUERY_CMPLT);
+ if (comp == NULL) {
+ if_printf(sc->hn_ifp, "exec RNDIS query 0x%08x failed\n", oid);
+ error = EIO;
+ goto done;
+ }
+
+ if (comp->rm_status != RNDIS_STATUS_SUCCESS) {
+ if_printf(sc->hn_ifp, "RNDIS query 0x%08x failed: "
+ "status 0x%08x\n", oid, comp->rm_status);
+ error = EIO;
+ goto done;
+ }
+ if (comp->rm_infobuflen == 0 || comp->rm_infobufoffset == 0) {
+ /* No output data! */
+ if_printf(sc->hn_ifp, "RNDIS query 0x%08x, no data\n", oid);
+ *odlen0 = 0;
+ error = 0;
+ goto done;
+ }
+
+ /*
+ * Check output data length and offset.
+ */
+ /* ofs is the offset from the beginning of comp. */
+ ofs = RNDIS_QUERY_COMP_INFOBUFOFFSET_ABS(comp->rm_infobufoffset);
+ if (ofs < sizeof(*comp) || ofs + comp->rm_infobuflen > comp_len) {
+ if_printf(sc->hn_ifp, "RNDIS query invalid comp ib off/len, "
+ "%u/%u\n", comp->rm_infobufoffset, comp->rm_infobuflen);
+ error = EINVAL;
+ goto done;
+ }
+
+ /*
+ * Save output data.
+ */
+ if (comp->rm_infobuflen < odlen)
+ odlen = comp->rm_infobuflen;
+ memcpy(odata, ((const uint8_t *)comp) + ofs, odlen);
+ *odlen0 = odlen;
+
+ error = 0;
+done:
+ vmbus_xact_put(xact);
+ return (error);
+}
+
+int
+hn_rndis_query_rsscaps(struct hn_softc *sc, int *rxr_cnt0)
+{
+ struct ndis_rss_caps in, caps;
+ size_t caps_len;
+ int error, indsz, rxr_cnt, hash_fnidx;
+ uint32_t hash_func = 0, hash_types = 0;
+
+ *rxr_cnt0 = 0;
+
+ if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_20)
+ return (EOPNOTSUPP);
+
+ memset(&in, 0, sizeof(in));
+ in.ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_CAPS;
+ in.ndis_hdr.ndis_rev = NDIS_RSS_CAPS_REV_2;
+ in.ndis_hdr.ndis_size = NDIS_RSS_CAPS_SIZE;
+
+ caps_len = NDIS_RSS_CAPS_SIZE;
+ error = hn_rndis_query2(sc, OID_GEN_RECEIVE_SCALE_CAPABILITIES,
+ &in, NDIS_RSS_CAPS_SIZE, &caps, &caps_len, NDIS_RSS_CAPS_SIZE_6_0);
+ if (error)
+ return (error);
+
+ /*
+ * Preliminary verification.
+ */
+ if (caps.ndis_hdr.ndis_type != NDIS_OBJTYPE_RSS_CAPS) {
+ if_printf(sc->hn_ifp, "invalid NDIS objtype 0x%02x\n",
+ caps.ndis_hdr.ndis_type);
+ return (EINVAL);
+ }
+ if (caps.ndis_hdr.ndis_rev < NDIS_RSS_CAPS_REV_1) {
+ if_printf(sc->hn_ifp, "invalid NDIS objrev 0x%02x\n",
+ caps.ndis_hdr.ndis_rev);
+ return (EINVAL);
+ }
+ if (caps.ndis_hdr.ndis_size > caps_len) {
+ if_printf(sc->hn_ifp, "invalid NDIS objsize %u, "
+ "data size %zu\n", caps.ndis_hdr.ndis_size, caps_len);
+ return (EINVAL);
+ } else if (caps.ndis_hdr.ndis_size < NDIS_RSS_CAPS_SIZE_6_0) {
+ if_printf(sc->hn_ifp, "invalid NDIS objsize %u\n",
+ caps.ndis_hdr.ndis_size);
+ return (EINVAL);
+ }
+
+ /*
+ * Save information for later RSS configuration.
+ */
+ if (caps.ndis_nrxr == 0) {
+ if_printf(sc->hn_ifp, "0 RX rings!?\n");
+ return (EINVAL);
+ }
+ if (bootverbose)
+ if_printf(sc->hn_ifp, "%u RX rings\n", caps.ndis_nrxr);
+ rxr_cnt = caps.ndis_nrxr;
+
+ if (caps.ndis_hdr.ndis_size == NDIS_RSS_CAPS_SIZE &&
+ caps.ndis_hdr.ndis_rev >= NDIS_RSS_CAPS_REV_2) {
+ if (caps.ndis_nind > NDIS_HASH_INDCNT) {
+ if_printf(sc->hn_ifp,
+ "too many RSS indirect table entries %u\n",
+ caps.ndis_nind);
+ return (EOPNOTSUPP);
+ }
+ if (!powerof2(caps.ndis_nind)) {
+ if_printf(sc->hn_ifp, "RSS indirect table size is not "
+ "power-of-2 %u\n", caps.ndis_nind);
+ }
+
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "RSS indirect table size %u\n",
+ caps.ndis_nind);
+ }
+ indsz = caps.ndis_nind;
+ } else {
+ indsz = NDIS_HASH_INDCNT;
+ }
+ if (indsz < rxr_cnt) {
+ if_printf(sc->hn_ifp, "# of RX rings (%d) > "
+ "RSS indirect table size %d\n", rxr_cnt, indsz);
+ rxr_cnt = indsz;
+ }
+
+ /*
+ * NOTE:
+ * Toeplitz is at the lowest bit, and it is preferred; so ffs(),
+ * instead of fls(), is used here.
+ */
+ hash_fnidx = ffs(caps.ndis_caps & NDIS_RSS_CAP_HASHFUNC_MASK);
+ if (hash_fnidx == 0) {
+ if_printf(sc->hn_ifp, "no hash functions, caps 0x%08x\n",
+ caps.ndis_caps);
+ return (EOPNOTSUPP);
+ }
+ hash_func = 1 << (hash_fnidx - 1); /* ffs is 1-based */
+
+ if (caps.ndis_caps & NDIS_RSS_CAP_IPV4)
+ hash_types |= NDIS_HASH_IPV4 | NDIS_HASH_TCP_IPV4;
+ if (caps.ndis_caps & NDIS_RSS_CAP_IPV6)
+ hash_types |= NDIS_HASH_IPV6 | NDIS_HASH_TCP_IPV6;
+ if (caps.ndis_caps & NDIS_RSS_CAP_IPV6_EX)
+ hash_types |= NDIS_HASH_IPV6_EX | NDIS_HASH_TCP_IPV6_EX;
+ if (hash_types == 0) {
+ if_printf(sc->hn_ifp, "no hash types, caps 0x%08x\n",
+ caps.ndis_caps);
+ return (EOPNOTSUPP);
+ }
+ if (bootverbose)
+ if_printf(sc->hn_ifp, "RSS caps %#x\n", caps.ndis_caps);
+
+ /* Commit! */
+ sc->hn_rss_ind_size = indsz;
+ sc->hn_rss_hcap = hash_func | hash_types;
+ if (sc->hn_caps & HN_CAP_UDPHASH) {
+ /* UDP 4-tuple hash is unconditionally enabled. */
+ sc->hn_rss_hcap |= NDIS_HASH_UDP_IPV4_X;
+ }
+ *rxr_cnt0 = rxr_cnt;
+ return (0);
+}
+
+static int
+hn_rndis_set(struct hn_softc *sc, uint32_t oid, const void *data, size_t dlen)
+{
+ struct rndis_set_req *req;
+ const struct rndis_set_comp *comp;
+ struct vmbus_xact *xact;
+ size_t reqlen, comp_len;
+ uint32_t rid;
+ int error;
+
+ KASSERT(dlen > 0, ("invalid dlen %zu", dlen));
+
+ reqlen = sizeof(*req) + dlen;
+ xact = vmbus_xact_get(sc->hn_xact, reqlen);
+ if (xact == NULL) {
+ if_printf(sc->hn_ifp, "no xact for RNDIS set 0x%08x\n", oid);
+ return (ENXIO);
+ }
+ rid = hn_rndis_rid(sc);
+ req = vmbus_xact_req_data(xact);
+ req->rm_type = REMOTE_NDIS_SET_MSG;
+ req->rm_len = reqlen;
+ req->rm_rid = rid;
+ req->rm_oid = oid;
+ req->rm_infobuflen = dlen;
+ req->rm_infobufoffset = RNDIS_SET_REQ_INFOBUFOFFSET;
+ /* Data immediately follows RNDIS set. */
+ memcpy(req + 1, data, dlen);
+
+ comp_len = sizeof(*comp);
+ comp = hn_rndis_xact_execute(sc, xact, rid, reqlen, &comp_len,
+ REMOTE_NDIS_SET_CMPLT);
+ if (comp == NULL) {
+ if_printf(sc->hn_ifp, "exec RNDIS set 0x%08x failed\n", oid);
+ error = EIO;
+ goto done;
+ }
+
+ if (comp->rm_status != RNDIS_STATUS_SUCCESS) {
+ if_printf(sc->hn_ifp, "RNDIS set 0x%08x failed: "
+ "status 0x%08x\n", oid, comp->rm_status);
+ error = EIO;
+ goto done;
+ }
+ error = 0;
+done:
+ vmbus_xact_put(xact);
+ return (error);
+}
+
+static int
+hn_rndis_conf_offload(struct hn_softc *sc, int mtu)
+{
+ struct ndis_offload hwcaps;
+ struct ndis_offload_params params;
+ uint32_t caps = 0;
+ size_t paramsz;
+ int error, tso_maxsz, tso_minsg;
+
+ error = hn_rndis_query_hwcaps(sc, &hwcaps);
+ if (error) {
+ if_printf(sc->hn_ifp, "hwcaps query failed: %d\n", error);
+ return (error);
+ }
+
+ /* NOTE: 0 means "no change" */
+ memset(&params, 0, sizeof(params));
+
+ params.ndis_hdr.ndis_type = NDIS_OBJTYPE_DEFAULT;
+ if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_30) {
+ params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_2;
+ paramsz = NDIS_OFFLOAD_PARAMS_SIZE_6_1;
+ } else {
+ params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_3;
+ paramsz = NDIS_OFFLOAD_PARAMS_SIZE;
+ }
+ params.ndis_hdr.ndis_size = paramsz;
+
+ /*
+ * TSO4/TSO6 setup.
+ */
+ tso_maxsz = IP_MAXPACKET;
+ tso_minsg = 2;
+ if (hwcaps.ndis_lsov2.ndis_ip4_encap & NDIS_OFFLOAD_ENCAP_8023) {
+ caps |= HN_CAP_TSO4;
+ params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_ON;
+
+ if (hwcaps.ndis_lsov2.ndis_ip4_maxsz < tso_maxsz)
+ tso_maxsz = hwcaps.ndis_lsov2.ndis_ip4_maxsz;
+ if (hwcaps.ndis_lsov2.ndis_ip4_minsg > tso_minsg)
+ tso_minsg = hwcaps.ndis_lsov2.ndis_ip4_minsg;
+ }
+ if ((hwcaps.ndis_lsov2.ndis_ip6_encap & NDIS_OFFLOAD_ENCAP_8023) &&
+ (hwcaps.ndis_lsov2.ndis_ip6_opts & HN_NDIS_LSOV2_CAP_IP6) ==
+ HN_NDIS_LSOV2_CAP_IP6) {
+ caps |= HN_CAP_TSO6;
+ params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_ON;
+
+ if (hwcaps.ndis_lsov2.ndis_ip6_maxsz < tso_maxsz)
+ tso_maxsz = hwcaps.ndis_lsov2.ndis_ip6_maxsz;
+ if (hwcaps.ndis_lsov2.ndis_ip6_minsg > tso_minsg)
+ tso_minsg = hwcaps.ndis_lsov2.ndis_ip6_minsg;
+ }
+ sc->hn_ndis_tso_szmax = 0;
+ sc->hn_ndis_tso_sgmin = 0;
+ if (caps & (HN_CAP_TSO4 | HN_CAP_TSO6)) {
+ KASSERT(tso_maxsz <= IP_MAXPACKET,
+ ("invalid NDIS TSO maxsz %d", tso_maxsz));
+ KASSERT(tso_minsg >= 2,
+ ("invalid NDIS TSO minsg %d", tso_minsg));
+ if (tso_maxsz < tso_minsg * mtu) {
+ if_printf(sc->hn_ifp, "invalid NDIS TSO config: "
+ "maxsz %d, minsg %d, mtu %d; "
+ "disable TSO4 and TSO6\n",
+ tso_maxsz, tso_minsg, mtu);
+ caps &= ~(HN_CAP_TSO4 | HN_CAP_TSO6);
+ params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_OFF;
+ params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_OFF;
+ } else {
+ sc->hn_ndis_tso_szmax = tso_maxsz;
+ sc->hn_ndis_tso_sgmin = tso_minsg;
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "NDIS TSO "
+ "szmax %d sgmin %d\n",
+ sc->hn_ndis_tso_szmax,
+ sc->hn_ndis_tso_sgmin);
+ }
+ }
+ }
+
+ /* IPv4 checksum */
+ if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HN_NDIS_TXCSUM_CAP_IP4) ==
+ HN_NDIS_TXCSUM_CAP_IP4) {
+ caps |= HN_CAP_IPCS;
+ params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TX;
+ }
+ if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_IP4) {
+ if (params.ndis_ip4csum == NDIS_OFFLOAD_PARAM_TX)
+ params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TXRX;
+ else
+ params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_RX;
+ }
+
+ /* TCP4 checksum */
+ if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HN_NDIS_TXCSUM_CAP_TCP4) ==
+ HN_NDIS_TXCSUM_CAP_TCP4) {
+ caps |= HN_CAP_TCP4CS;
+ params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TX;
+ }
+ if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_TCP4) {
+ if (params.ndis_tcp4csum == NDIS_OFFLOAD_PARAM_TX)
+ params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TXRX;
+ else
+ params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_RX;
+ }
+
+ /* UDP4 checksum */
+ if (hwcaps.ndis_csum.ndis_ip4_txcsum & NDIS_TXCSUM_CAP_UDP4) {
+ caps |= HN_CAP_UDP4CS;
+ params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TX;
+ }
+ if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_UDP4) {
+ if (params.ndis_udp4csum == NDIS_OFFLOAD_PARAM_TX)
+ params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TXRX;
+ else
+ params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_RX;
+ }
+
+ /* TCP6 checksum */
+ if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HN_NDIS_TXCSUM_CAP_TCP6) ==
+ HN_NDIS_TXCSUM_CAP_TCP6) {
+ caps |= HN_CAP_TCP6CS;
+ params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TX;
+ }
+ if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_TCP6) {
+ if (params.ndis_tcp6csum == NDIS_OFFLOAD_PARAM_TX)
+ params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TXRX;
+ else
+ params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_RX;
+ }
+
+ /* UDP6 checksum */
+ if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HN_NDIS_TXCSUM_CAP_UDP6) ==
+ HN_NDIS_TXCSUM_CAP_UDP6) {
+ caps |= HN_CAP_UDP6CS;
+ params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TX;
+ }
+ if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_UDP6) {
+ if (params.ndis_udp6csum == NDIS_OFFLOAD_PARAM_TX)
+ params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TXRX;
+ else
+ params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_RX;
+ }
+
+ /* RSC offload */
+ if (hwcaps.ndis_hdr.ndis_rev >= NDIS_OFFLOAD_PARAMS_REV_3) {
+ if (hwcaps.ndis_rsc.ndis_ip4 && hwcaps.ndis_rsc.ndis_ip6) {
+ params.ndis_rsc_ip4 = NDIS_OFFLOAD_RSC_ON;
+ params.ndis_rsc_ip6 = NDIS_OFFLOAD_RSC_ON;
+ } else {
+ params.ndis_rsc_ip4 = NDIS_OFFLOAD_RSC_OFF;
+ params.ndis_rsc_ip6 = NDIS_OFFLOAD_RSC_OFF;
+ }
+ }
+
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "offload csum: "
+ "ip4 %u, tcp4 %u, udp4 %u, tcp6 %u, udp6 %u\n",
+ params.ndis_ip4csum,
+ params.ndis_tcp4csum,
+ params.ndis_udp4csum,
+ params.ndis_tcp6csum,
+ params.ndis_udp6csum);
+ if_printf(sc->hn_ifp, "offload lsov2: ip4 %u, ip6 %u\n",
+ params.ndis_lsov2_ip4,
+ params.ndis_lsov2_ip6);
+ if (hwcaps.ndis_hdr.ndis_rev >= NDIS_OFFLOAD_PARAMS_REV_3)
+ if_printf(sc->hn_ifp, "offload rsc: ip4 %u, ip6 %u\n",
+ params.ndis_rsc_ip4,
+ params.ndis_rsc_ip6);
+ }
+
+ error = hn_rndis_set(sc, OID_TCP_OFFLOAD_PARAMETERS, &params, paramsz);
+ if (error) {
+ if_printf(sc->hn_ifp, "offload config failed: %d\n", error);
+ return (error);
+ }
+
+ if (bootverbose)
+ if_printf(sc->hn_ifp, "offload config done\n");
+ sc->hn_caps |= caps;
+ return (0);
+}
+
+int
+hn_rndis_conf_rss(struct hn_softc *sc, uint16_t flags)
+{
+ struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
+ struct ndis_rss_params *prm = &rss->rss_params;
+ int error, rss_size;
+
+ /*
+ * Only NDIS 6.20+ is supported:
+	 * We only support 4-byte indirect table elements, which were
+	 * adopted in NDIS 6.20.
+ */
+ KASSERT(sc->hn_ndis_ver >= HN_NDIS_VERSION_6_20,
+ ("NDIS 6.20+ is required, NDIS version 0x%08x", sc->hn_ndis_ver));
+
+	/* XXX only one can be specified, though; popcnt? */
+ KASSERT((sc->hn_rss_hash & NDIS_HASH_FUNCTION_MASK),
+ ("no hash func %08x", sc->hn_rss_hash));
+ KASSERT((sc->hn_rss_hash & NDIS_HASH_STD),
+ ("no standard hash types %08x", sc->hn_rss_hash));
+ KASSERT(sc->hn_rss_ind_size > 0, ("no indirect table size"));
+
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "RSS indirect table size %d, "
+ "hash 0x%08x\n", sc->hn_rss_ind_size, sc->hn_rss_hash);
+ }
+
+ /*
+ * NOTE:
+ * DO NOT whack rss_key and rss_ind, which are setup by the caller.
+ */
+ memset(prm, 0, sizeof(*prm));
+ rss_size = NDIS_RSSPRM_TOEPLITZ_SIZE(sc->hn_rss_ind_size);
+
+ prm->ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_PARAMS;
+ prm->ndis_hdr.ndis_rev = NDIS_RSS_PARAMS_REV_2;
+ prm->ndis_hdr.ndis_size = rss_size;
+ prm->ndis_flags = flags;
+ prm->ndis_hash = sc->hn_rss_hash &
+ (NDIS_HASH_FUNCTION_MASK | NDIS_HASH_STD);
+ prm->ndis_indsize = sizeof(rss->rss_ind[0]) * sc->hn_rss_ind_size;
+ prm->ndis_indoffset =
+ __offsetof(struct ndis_rssprm_toeplitz, rss_ind[0]);
+ prm->ndis_keysize = sizeof(rss->rss_key);
+ prm->ndis_keyoffset =
+ __offsetof(struct ndis_rssprm_toeplitz, rss_key[0]);
+
+ error = hn_rndis_set(sc, OID_GEN_RECEIVE_SCALE_PARAMETERS,
+ rss, rss_size);
+ if (error) {
+ if_printf(sc->hn_ifp, "RSS config failed: %d\n", error);
+ } else {
+ if (bootverbose)
+ if_printf(sc->hn_ifp, "RSS config done\n");
+ }
+ return (error);
+}
+
+int
+hn_rndis_set_rxfilter(struct hn_softc *sc, uint32_t filter)
+{
+ int error;
+
+ error = hn_rndis_set(sc, OID_GEN_CURRENT_PACKET_FILTER,
+ &filter, sizeof(filter));
+ if (error) {
+ if_printf(sc->hn_ifp, "set RX filter 0x%08x failed: %d\n",
+ filter, error);
+ } else {
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "set RX filter 0x%08x done\n",
+ filter);
+ }
+ }
+ return (error);
+}
+
+static int
+hn_rndis_init(struct hn_softc *sc)
+{
+ struct rndis_init_req *req;
+ const struct rndis_init_comp *comp;
+ struct vmbus_xact *xact;
+ size_t comp_len;
+ uint32_t rid;
+ int error;
+
+ xact = vmbus_xact_get(sc->hn_xact, sizeof(*req));
+ if (xact == NULL) {
+ if_printf(sc->hn_ifp, "no xact for RNDIS init\n");
+ return (ENXIO);
+ }
+ rid = hn_rndis_rid(sc);
+ req = vmbus_xact_req_data(xact);
+ req->rm_type = REMOTE_NDIS_INITIALIZE_MSG;
+ req->rm_len = sizeof(*req);
+ req->rm_rid = rid;
+ req->rm_ver_major = RNDIS_VERSION_MAJOR;
+ req->rm_ver_minor = RNDIS_VERSION_MINOR;
+ req->rm_max_xfersz = HN_RNDIS_XFER_SIZE;
+
+ comp_len = RNDIS_INIT_COMP_SIZE_MIN;
+ comp = hn_rndis_xact_execute(sc, xact, rid, sizeof(*req), &comp_len,
+ REMOTE_NDIS_INITIALIZE_CMPLT);
+ if (comp == NULL) {
+ if_printf(sc->hn_ifp, "exec RNDIS init failed\n");
+ error = EIO;
+ goto done;
+ }
+
+ if (comp->rm_status != RNDIS_STATUS_SUCCESS) {
+ if_printf(sc->hn_ifp, "RNDIS init failed: status 0x%08x\n",
+ comp->rm_status);
+ error = EIO;
+ goto done;
+ }
+ sc->hn_rndis_agg_size = comp->rm_pktmaxsz;
+ sc->hn_rndis_agg_pkts = comp->rm_pktmaxcnt;
+ sc->hn_rndis_agg_align = 1U << comp->rm_align;
+
+ if (sc->hn_rndis_agg_align < sizeof(uint32_t)) {
+ /*
+		 * The RNDIS packet message encapsulation assumes that the
+		 * RNDIS packet message is at least 4-byte aligned.  Fix up the
+ * alignment here, if the remote side sets the alignment
+ * too low.
+ */
+ if_printf(sc->hn_ifp, "fixup RNDIS aggpkt align: %u -> %zu\n",
+ sc->hn_rndis_agg_align, sizeof(uint32_t));
+ sc->hn_rndis_agg_align = sizeof(uint32_t);
+ }
+
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "RNDIS ver %u.%u, "
+ "aggpkt size %u, aggpkt cnt %u, aggpkt align %u\n",
+ comp->rm_ver_major, comp->rm_ver_minor,
+ sc->hn_rndis_agg_size, sc->hn_rndis_agg_pkts,
+ sc->hn_rndis_agg_align);
+ }
+ error = 0;
+done:
+ vmbus_xact_put(xact);
+ return (error);
+}
+
+static int
+hn_rndis_halt(struct hn_softc *sc)
+{
+ struct vmbus_xact *xact;
+ struct rndis_halt_req *halt;
+ struct hn_nvs_sendctx sndc;
+ size_t comp_len;
+
+ xact = vmbus_xact_get(sc->hn_xact, sizeof(*halt));
+ if (xact == NULL) {
+ if_printf(sc->hn_ifp, "no xact for RNDIS halt\n");
+ return (ENXIO);
+ }
+ halt = vmbus_xact_req_data(xact);
+ halt->rm_type = REMOTE_NDIS_HALT_MSG;
+ halt->rm_len = sizeof(*halt);
+ halt->rm_rid = hn_rndis_rid(sc);
+
+ /* No RNDIS completion; rely on NVS message send completion */
+ hn_nvs_sendctx_init(&sndc, hn_nvs_sent_xact, xact);
+ hn_rndis_xact_exec1(sc, xact, sizeof(*halt), &sndc, &comp_len);
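+	/* The result is intentionally ignored; RNDIS is being torn down. */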
+
+ vmbus_xact_put(xact);
+ if (bootverbose)
+ if_printf(sc->hn_ifp, "RNDIS halt done\n");
+ return (0);
+}
+
+static int
+hn_rndis_query_hwcaps(struct hn_softc *sc, struct ndis_offload *caps)
+{
+ struct ndis_offload in;
+ size_t caps_len, size;
+ int error;
+
+ memset(&in, 0, sizeof(in));
+ in.ndis_hdr.ndis_type = NDIS_OBJTYPE_OFFLOAD;
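+/*
+ * The sendctx pointer doubles as the VMBus transaction id of the packet,
+ * so the send-completion path can recover the callback and its argument.
+ */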
+ if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_30) {
+ in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_3;
+ size = NDIS_OFFLOAD_SIZE;
+ } else if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_1) {
+ in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_2;
+ size = NDIS_OFFLOAD_SIZE_6_1;
+ } else {
+ in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_1;
+ size = NDIS_OFFLOAD_SIZE_6_0;
+ }
+ in.ndis_hdr.ndis_size = size;
+
+ caps_len = NDIS_OFFLOAD_SIZE;
+ error = hn_rndis_query2(sc, OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES,
+ &in, size, caps, &caps_len, NDIS_OFFLOAD_SIZE_6_0);
+ if (error)
+ return (error);
+
+ /*
+ * Preliminary verification.
+ */
+ if (caps->ndis_hdr.ndis_type != NDIS_OBJTYPE_OFFLOAD) {
+ if_printf(sc->hn_ifp, "invalid NDIS objtype 0x%02x\n",
+ caps->ndis_hdr.ndis_type);
+ return (EINVAL);
+ }
+ if (caps->ndis_hdr.ndis_rev < NDIS_OFFLOAD_REV_1) {
+ if_printf(sc->hn_ifp, "invalid NDIS objrev 0x%02x\n",
+ caps->ndis_hdr.ndis_rev);
+ return (EINVAL);
+ }
+ if (caps->ndis_hdr.ndis_size > caps_len) {
+ if_printf(sc->hn_ifp, "invalid NDIS objsize %u, "
+ "data size %zu\n", caps->ndis_hdr.ndis_size, caps_len);
+ return (EINVAL);
+ } else if (caps->ndis_hdr.ndis_size < NDIS_OFFLOAD_SIZE_6_0) {
+ if_printf(sc->hn_ifp, "invalid NDIS objsize %u\n",
+ caps->ndis_hdr.ndis_size);
+ return (EINVAL);
+ } else if (caps->ndis_hdr.ndis_rev >= NDIS_OFFLOAD_REV_3 &&
+ caps->ndis_hdr.ndis_size < NDIS_OFFLOAD_SIZE) {
+ if_printf(sc->hn_ifp, "invalid NDIS rev3 objsize %u\n",
+ caps->ndis_hdr.ndis_size);
+ return (EINVAL);
+ }
+
+ if (bootverbose) {
+ /*
+ * NOTE:
+ * caps->ndis_hdr.ndis_size MUST be checked before accessing
+ * NDIS 6.1+ specific fields.
+ */
+ if_printf(sc->hn_ifp, "hwcaps rev %u\n",
+ caps->ndis_hdr.ndis_rev);
+
+ if_printf(sc->hn_ifp, "hwcaps csum: "
+ "ip4 tx 0x%x/0x%x rx 0x%x/0x%x, "
+ "ip6 tx 0x%x/0x%x rx 0x%x/0x%x\n",
+ caps->ndis_csum.ndis_ip4_txcsum,
+ caps->ndis_csum.ndis_ip4_txenc,
+ caps->ndis_csum.ndis_ip4_rxcsum,
+ caps->ndis_csum.ndis_ip4_rxenc,
+ caps->ndis_csum.ndis_ip6_txcsum,
+ caps->ndis_csum.ndis_ip6_txenc,
+ caps->ndis_csum.ndis_ip6_rxcsum,
+ caps->ndis_csum.ndis_ip6_rxenc);
+ if_printf(sc->hn_ifp, "hwcaps lsov2: "
+ "ip4 maxsz %u minsg %u encap 0x%x, "
+ "ip6 maxsz %u minsg %u encap 0x%x opts 0x%x\n",
+ caps->ndis_lsov2.ndis_ip4_maxsz,
+ caps->ndis_lsov2.ndis_ip4_minsg,
+ caps->ndis_lsov2.ndis_ip4_encap,
+ caps->ndis_lsov2.ndis_ip6_maxsz,
+ caps->ndis_lsov2.ndis_ip6_minsg,
+ caps->ndis_lsov2.ndis_ip6_encap,
+ caps->ndis_lsov2.ndis_ip6_opts);
+ if (caps->ndis_hdr.ndis_rev >= NDIS_OFFLOAD_REV_3)
+ if_printf(sc->hn_ifp, "hwcaps rsc: "
+ "ip4 %u ip6 %u\n",
+ caps->ndis_rsc.ndis_ip4,
+ caps->ndis_rsc.ndis_ip6);
+ }
+ return (0);
+}
+
+int
+hn_rndis_attach(struct hn_softc *sc, int mtu, int *init_done)
+{
+ int error;
+
+ *init_done = 0;
+
+ /*
+ * Initialize RNDIS.
+ */
+ error = hn_rndis_init(sc);
+ if (error)
+ return (error);
+ *init_done = 1;
+
+ /*
+ * Configure NDIS offload settings.
+ */
+ hn_rndis_conf_offload(sc, mtu);
+ return (0);
+}
+
+void
+hn_rndis_detach(struct hn_softc *sc)
+{
+
+ /* Halt the RNDIS. */
+ hn_rndis_halt(sc);
+}
diff --git a/sys/dev/hyperv/netvsc/hn_rndis.h b/sys/dev/hyperv/netvsc/hn_rndis.h
new file mode 100644
index 000000000000..4610d5a10526
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/hn_rndis.h
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2010-2012 Citrix Inc.
+ * Copyright (c) 2012 NetApp Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HN_RNDIS_H_
+#define _HN_RNDIS_H_
+
+struct hn_softc;
+
+int hn_rndis_attach(struct hn_softc *sc, int mtu, int *init_done);
+void hn_rndis_detach(struct hn_softc *sc);
+int hn_rndis_conf_rss(struct hn_softc *sc, uint16_t flags);
+int hn_rndis_query_rsscaps(struct hn_softc *sc, int *rxr_cnt);
+int hn_rndis_get_eaddr(struct hn_softc *sc, uint8_t *eaddr);
+/* link_status: NDIS_MEDIA_STATE_ */
+int hn_rndis_get_linkstatus(struct hn_softc *sc,
+ uint32_t *link_status);
+int hn_rndis_get_mtu(struct hn_softc *sc, uint32_t *mtu);
+/* filter: NDIS_PACKET_TYPE_. */
+int hn_rndis_set_rxfilter(struct hn_softc *sc, uint32_t filter);
+void hn_rndis_rx_ctrl(struct hn_softc *sc, const void *data,
+ int dlen);
+
+#endif /* !_HN_RNDIS_H_ */
diff --git a/sys/dev/hyperv/netvsc/if_hn.c b/sys/dev/hyperv/netvsc/if_hn.c
new file mode 100644
index 000000000000..d562a937ecad
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/if_hn.c
@@ -0,0 +1,7717 @@
+/*-
+ * Copyright (c) 2010-2012 Citrix Inc.
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 2004-2006 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_hn.h"
+#include "opt_inet6.h"
+#include "opt_inet.h"
+#include "opt_rss.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/counter.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/proc.h>
+#include <sys/rmlock.h>
+#include <sys/sbuf.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/buf_ring.h>
+#include <sys/eventhandler.h>
+#include <sys/epoch.h>
+
+#include <machine/atomic.h>
+#include <machine/in_cksum.h>
+
+#include <net/bpf.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_media.h>
+#include <net/if_types.h>
+#include <net/if_var.h>
+#include <net/rndis.h>
+#ifdef RSS
+#include <net/rss_config.h>
+#endif
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_lro.h>
+#include <netinet/udp.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+
+#include <dev/hyperv/netvsc/ndis.h>
+#include <dev/hyperv/netvsc/if_hnreg.h>
+#include <dev/hyperv/netvsc/if_hnvar.h>
+#include <dev/hyperv/netvsc/hn_nvs.h>
+#include <dev/hyperv/netvsc/hn_rndis.h>
+
+#include "vmbus_if.h"
+
+#define HN_IFSTART_SUPPORT
+
+#define HN_RING_CNT_DEF_MAX 8
+
+#define HN_VFMAP_SIZE_DEF 8
+
+#define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */
+
+/* YYY should get it from the underlying channel */
+#define HN_TX_DESC_CNT 512
+
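+/*
+ * Worst-case size of the RNDIS packet message prepended to each TX
+ * packet: the fixed header plus per-packet info for the hash value,
+ * VLAN, LSOv2 and TX checksum.
+ */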
+#define HN_RNDIS_PKT_LEN \
+ (sizeof(struct rndis_packet_msg) + \
+ HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \
+ HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \
+ HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \
+ HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
+#define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE
+#define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE
+
+#define HN_TX_DATA_BOUNDARY PAGE_SIZE
+#define HN_TX_DATA_MAXSIZE IP_MAXPACKET
+#define HN_TX_DATA_SEGSIZE PAGE_SIZE
+/* -1 for RNDIS packet message */
+#define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1)
+
+#define HN_DIRECT_TX_SIZE_DEF 128
+
+#define HN_EARLY_TXEOF_THRESH 8
+
+#define HN_PKTBUF_LEN_DEF (16 * 1024)
+
+#define HN_LROENT_CNT_DEF 128
+
+#define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU)
+#define HN_LRO_LENLIM_DEF (25 * ETHERMTU)
+/* YYY 2*MTU is a bit rough, but should be good enough. */
+#define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu)
+
+#define HN_LRO_ACKCNT_DEF 1
+
+#define HN_LOCK_INIT(sc) \
+ sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
+#define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock)
+#define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED)
+#define HN_LOCK(sc) \
+do { \
+ while (sx_try_xlock(&(sc)->hn_lock) == 0) { \
+ /* Relinquish cpu to avoid deadlock */ \
+ sched_relinquish(curthread); \
+ DELAY(1000); \
+ } \
+} while (0)
+#define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock)
+
+#define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
+#define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP)
+#define HN_CSUM_IP_HWASSIST(sc) \
+ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
+#define HN_CSUM_IP6_HWASSIST(sc) \
+ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
+
+#define HN_PKTSIZE_MIN(align) \
+ roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
+ HN_RNDIS_PKT_LEN, (align))
+#define HN_PKTSIZE(m, align) \
+ roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
+
+#ifdef RSS
+#define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets())
+#else
+#define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus)
+#endif
+
+struct hn_txdesc {
+#ifndef HN_USE_TXDESC_BUFRING
+ SLIST_ENTRY(hn_txdesc) link;
+#endif
+ STAILQ_ENTRY(hn_txdesc) agg_link;
+
+ /* Aggregated txdescs, in sending order. */
+ STAILQ_HEAD(, hn_txdesc) agg_list;
+
+ /* The oldest packet, if transmission aggregation happens. */
+ struct mbuf *m;
+ struct hn_tx_ring *txr;
+ int refs;
+ uint32_t flags; /* HN_TXD_FLAG_ */
+ struct hn_nvs_sendctx send_ctx;
+ uint32_t chim_index;
+ int chim_size;
+
+ bus_dmamap_t data_dmap;
+
+ bus_addr_t rndis_pkt_paddr;
+ struct rndis_packet_msg *rndis_pkt;
+ bus_dmamap_t rndis_pkt_dmap;
+};
+
+#define HN_TXD_FLAG_ONLIST 0x0001
+#define HN_TXD_FLAG_DMAMAP 0x0002
+#define HN_TXD_FLAG_ONAGG 0x0004
+
+#define HN_NDIS_PKTINFO_SUBALLOC 0x01
+#define HN_NDIS_PKTINFO_1ST_FRAG 0x02
+#define HN_NDIS_PKTINFO_LAST_FRAG 0x04
+
+struct packet_info_id {
+ uint8_t ver;
+ uint8_t flag;
+ uint16_t pkt_id;
+};
+
+#define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id)
+
+
+struct hn_rxinfo {
+ const uint32_t *vlan_info;
+ const uint32_t *csum_info;
+ const uint32_t *hash_info;
+ const uint32_t *hash_value;
+ const struct packet_info_id *pktinfo_id;
+};
+
+struct hn_rxvf_setarg {
+ struct hn_rx_ring *rxr;
+ struct ifnet *vf_ifp;
+};
+
+#define HN_RXINFO_VLAN 0x0001
+#define HN_RXINFO_CSUM 0x0002
+#define HN_RXINFO_HASHINF 0x0004
+#define HN_RXINFO_HASHVAL 0x0008
+#define HN_RXINFO_PKTINFO_ID 0x0010
+#define HN_RXINFO_ALL \
+ (HN_RXINFO_VLAN | \
+ HN_RXINFO_CSUM | \
+ HN_RXINFO_HASHINF | \
+ HN_RXINFO_HASHVAL | \
+ HN_RXINFO_PKTINFO_ID)
+
+static int hn_probe(device_t);
+static int hn_attach(device_t);
+static int hn_detach(device_t);
+static int hn_shutdown(device_t);
+static void hn_chan_callback(struct vmbus_channel *,
+ void *);
+
+static void hn_init(void *);
+static int hn_ioctl(struct ifnet *, u_long, caddr_t);
+#ifdef HN_IFSTART_SUPPORT
+static void hn_start(struct ifnet *);
+#endif
+static int hn_transmit(struct ifnet *, struct mbuf *);
+static void hn_xmit_qflush(struct ifnet *);
+static int hn_ifmedia_upd(struct ifnet *);
+static void hn_ifmedia_sts(struct ifnet *,
+ struct ifmediareq *);
+
+static void hn_ifnet_event(void *, struct ifnet *, int);
+static void hn_ifaddr_event(void *, struct ifnet *);
+static void hn_ifnet_attevent(void *, struct ifnet *);
+static void hn_ifnet_detevent(void *, struct ifnet *);
+static void hn_ifnet_lnkevent(void *, struct ifnet *, int);
+
+static bool hn_ismyvf(const struct hn_softc *,
+ const struct ifnet *);
+static void hn_rxvf_change(struct hn_softc *,
+ struct ifnet *, bool);
+static void hn_rxvf_set(struct hn_softc *, struct ifnet *);
+static void hn_rxvf_set_task(void *, int);
+static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
+static int hn_xpnt_vf_iocsetflags(struct hn_softc *);
+static int hn_xpnt_vf_iocsetcaps(struct hn_softc *,
+ struct ifreq *);
+static void hn_xpnt_vf_saveifflags(struct hn_softc *);
+static bool hn_xpnt_vf_isready(struct hn_softc *);
+static void hn_xpnt_vf_setready(struct hn_softc *);
+static void hn_xpnt_vf_init_taskfunc(void *, int);
+static void hn_xpnt_vf_init(struct hn_softc *);
+static void hn_xpnt_vf_setenable(struct hn_softc *);
+static void hn_xpnt_vf_setdisable(struct hn_softc *, bool);
+static void hn_vf_rss_fixup(struct hn_softc *, bool);
+static void hn_vf_rss_restore(struct hn_softc *);
+
+static int hn_rndis_rxinfo(const void *, int,
+ struct hn_rxinfo *);
+static void hn_rndis_rx_data(struct hn_rx_ring *,
+ const void *, int);
+static void hn_rndis_rx_status(struct hn_softc *,
+ const void *, int);
+static void hn_rndis_init_fixat(struct hn_softc *, int);
+
+static void hn_nvs_handle_notify(struct hn_softc *,
+ const struct vmbus_chanpkt_hdr *);
+static void hn_nvs_handle_comp(struct hn_softc *,
+ struct vmbus_channel *,
+ const struct vmbus_chanpkt_hdr *);
+static void hn_nvs_handle_rxbuf(struct hn_rx_ring *,
+ struct vmbus_channel *,
+ const struct vmbus_chanpkt_hdr *);
+static void hn_nvs_ack_rxbuf(struct hn_rx_ring *,
+ struct vmbus_channel *, uint64_t);
+
+#if __FreeBSD_version >= 1100099
+static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
+#endif
+static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
+#if __FreeBSD_version < 1100095
+static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
+#else
+static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
+#endif
+static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
+#ifndef RSS
+static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
+#endif
+static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
+
+static void hn_stop(struct hn_softc *, bool);
+static void hn_init_locked(struct hn_softc *);
+static int hn_chan_attach(struct hn_softc *,
+ struct vmbus_channel *);
+static void hn_chan_detach(struct hn_softc *,
+ struct vmbus_channel *);
+static int hn_attach_subchans(struct hn_softc *);
+static void hn_detach_allchans(struct hn_softc *);
+static void hn_chan_rollup(struct hn_rx_ring *,
+ struct hn_tx_ring *);
+static void hn_set_ring_inuse(struct hn_softc *, int);
+static int hn_synth_attach(struct hn_softc *, int);
+static void hn_synth_detach(struct hn_softc *);
+static int hn_synth_alloc_subchans(struct hn_softc *,
+ int *);
+static bool hn_synth_attachable(const struct hn_softc *);
+static void hn_suspend(struct hn_softc *);
+static void hn_suspend_data(struct hn_softc *);
+static void hn_suspend_mgmt(struct hn_softc *);
+static void hn_resume(struct hn_softc *);
+static void hn_resume_data(struct hn_softc *);
+static void hn_resume_mgmt(struct hn_softc *);
+static void hn_suspend_mgmt_taskfunc(void *, int);
+static void hn_chan_drain(struct hn_softc *,
+ struct vmbus_channel *);
+static void hn_disable_rx(struct hn_softc *);
+static void hn_drain_rxtx(struct hn_softc *, int);
+static void hn_polling(struct hn_softc *, u_int);
+static void hn_chan_polling(struct vmbus_channel *, u_int);
+static void hn_mtu_change_fixup(struct hn_softc *);
+
+static void hn_update_link_status(struct hn_softc *);
+static void hn_change_network(struct hn_softc *);
+static void hn_link_taskfunc(void *, int);
+static void hn_netchg_init_taskfunc(void *, int);
+static void hn_netchg_status_taskfunc(void *, int);
+static void hn_link_status(struct hn_softc *);
+
+static int hn_create_rx_data(struct hn_softc *, int);
+static void hn_destroy_rx_data(struct hn_softc *);
+static int hn_check_iplen(const struct mbuf *, int);
+static void hn_rxpkt_proto(const struct mbuf *, int *, int *);
+static int hn_set_rxfilter(struct hn_softc *, uint32_t);
+static int hn_rxfilter_config(struct hn_softc *);
+static int hn_rss_reconfig(struct hn_softc *);
+static void hn_rss_ind_fixup(struct hn_softc *);
+static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
+static int hn_rxpkt(struct hn_rx_ring *);
+static uint32_t hn_rss_type_fromndis(uint32_t);
+static uint32_t hn_rss_type_tondis(uint32_t);
+
+static int hn_tx_ring_create(struct hn_softc *, int);
+static void hn_tx_ring_destroy(struct hn_tx_ring *);
+static int hn_create_tx_data(struct hn_softc *, int);
+static void hn_fixup_tx_data(struct hn_softc *);
+static void hn_fixup_rx_data(struct hn_softc *);
+static void hn_destroy_tx_data(struct hn_softc *);
+static void hn_txdesc_dmamap_destroy(struct hn_txdesc *);
+static void hn_txdesc_gc(struct hn_tx_ring *,
+ struct hn_txdesc *);
+static int hn_encap(struct ifnet *, struct hn_tx_ring *,
+ struct hn_txdesc *, struct mbuf **);
+static int hn_txpkt(struct ifnet *, struct hn_tx_ring *,
+ struct hn_txdesc *);
+static void hn_set_chim_size(struct hn_softc *, int);
+static void hn_set_tso_maxsize(struct hn_softc *, int, int);
+static bool hn_tx_ring_pending(struct hn_tx_ring *);
+static void hn_tx_ring_qflush(struct hn_tx_ring *);
+static void hn_resume_tx(struct hn_softc *, int);
+static void hn_set_txagg(struct hn_softc *);
+static void *hn_try_txagg(struct ifnet *,
+ struct hn_tx_ring *, struct hn_txdesc *,
+ int);
+static int hn_get_txswq_depth(const struct hn_tx_ring *);
+static void hn_txpkt_done(struct hn_nvs_sendctx *,
+ struct hn_softc *, struct vmbus_channel *,
+ const void *, int);
+static int hn_txpkt_sglist(struct hn_tx_ring *,
+ struct hn_txdesc *);
+static int hn_txpkt_chim(struct hn_tx_ring *,
+ struct hn_txdesc *);
+static int hn_xmit(struct hn_tx_ring *, int);
+static void hn_xmit_taskfunc(void *, int);
+static void hn_xmit_txeof(struct hn_tx_ring *);
+static void hn_xmit_txeof_taskfunc(void *, int);
+#ifdef HN_IFSTART_SUPPORT
+static int hn_start_locked(struct hn_tx_ring *, int);
+static void hn_start_taskfunc(void *, int);
+static void hn_start_txeof(struct hn_tx_ring *);
+static void hn_start_txeof_taskfunc(void *, int);
+#endif
+
+SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
+ "Hyper-V network interface");
+
+/* Trust tcp segment verification on host side. */
+static int hn_trust_hosttcp = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
+ &hn_trust_hosttcp, 0,
+ "Trust tcp segment verification on host side, "
+ "when csum info is missing (global setting)");
+
+/* Trust udp datagrams verification on host side. */
+static int hn_trust_hostudp = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
+ &hn_trust_hostudp, 0,
+ "Trust udp datagram verification on host side, "
+ "when csum info is missing (global setting)");
+
+/* Trust ip packets verification on host side. */
+static int hn_trust_hostip = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
+ &hn_trust_hostip, 0,
+ "Trust ip packet verification on host side, "
+ "when csum info is missing (global setting)");
+
+/*
+ * Offload UDP/IPv4 checksum.
+ */
+static int hn_enable_udp4cs = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
+ &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
+
+/*
+ * Offload UDP/IPv6 checksum.
+ */
+static int hn_enable_udp6cs = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
+ &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
+
+/* Stats. */
+static counter_u64_t hn_udpcs_fixup;
+SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
+ &hn_udpcs_fixup, "# of UDP checksum fixup");
+
+/*
+ * See hn_set_hlen().
+ *
+ * This value is for Azure. For Hyper-V, set this above
+ * 65536 to disable UDP datagram checksum fixup.
+ */
+static int hn_udpcs_fixup_mtu = 1420;
+SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
+ &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
+
+/* Limit TSO burst size */
+static int hn_tso_maxlen = IP_MAXPACKET;
+SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
+ &hn_tso_maxlen, 0, "TSO burst limit");
+
+/* Limit chimney send size */
+static int hn_tx_chimney_size = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
+ &hn_tx_chimney_size, 0, "Chimney send packet size limit");
+
+/* Limit the size of packet for direct transmission */
+static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
+SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
+ &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
+
+/* # of LRO entries per RX ring */
+#if defined(INET) || defined(INET6)
+#if __FreeBSD_version >= 1100095
+static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
+SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
+ &hn_lro_entry_count, 0, "LRO entry count");
+#endif
+#endif
+
+static int hn_tx_taskq_cnt = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
+ &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
+
+#define HN_TX_TASKQ_M_INDEP 0
+#define HN_TX_TASKQ_M_GLOBAL 1
+#define HN_TX_TASKQ_M_EVTTQ 2
+
+static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
+ &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
+ "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
+
+#ifndef HN_USE_TXDESC_BUFRING
+static int hn_use_txdesc_bufring = 0;
+#else
+static int hn_use_txdesc_bufring = 1;
+#endif
+SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
+ &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
+
+#ifdef HN_IFSTART_SUPPORT
+/* Use ifnet.if_start instead of ifnet.if_transmit */
+static int hn_use_if_start = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
+ &hn_use_if_start, 0, "Use if_start TX method");
+#endif
+
+/* # of channels to use */
+static int hn_chan_cnt = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
+ &hn_chan_cnt, 0,
+ "# of channels to use; each channel has one RX ring and one TX ring");
+
+/* # of transmit rings to use */
+static int hn_tx_ring_cnt = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
+ &hn_tx_ring_cnt, 0, "# of TX rings to use");
+
+/* Software TX ring depth */
+static int hn_tx_swq_depth = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
+ &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
+
+/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
+#if __FreeBSD_version >= 1100095
+static u_int hn_lro_mbufq_depth = 0;
+SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
+ &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
+#endif
+
+/* Packet transmission aggregation size limit */
+static int hn_tx_agg_size = -1;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
+ &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
+
+/* Packet transmission aggregation count limit */
+static int hn_tx_agg_pkts = -1;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
+ &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
+
+/* VF list */
+SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
+ CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
+ hn_vflist_sysctl, "A",
+ "VF list");
+
+/* VF mapping */
+SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
+ CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
+ hn_vfmap_sysctl, "A",
+ "VF mapping");
+
+/* Transparent VF */
+static int hn_xpnt_vf = 1;
+SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
+    &hn_xpnt_vf, 0, "Transparent VF mode");
+
+/* Accurate BPF support for Transparent VF */
+static int hn_xpnt_vf_accbpf = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
+ &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
+
+/* Extra wait for the transparent VF attach routine; unit: seconds. */
+static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
+SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
+ &hn_xpnt_vf_attwait, 0,
+    "Extra wait for transparent VF attach routine; unit: seconds");
+
+static u_int hn_cpu_index; /* next CPU for channel */
+static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */
+
+static struct rmlock hn_vfmap_lock;
+static int hn_vfmap_size;
+static struct ifnet **hn_vfmap;
+
+#ifndef RSS
+static const uint8_t
+hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
+ 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+ 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+ 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+ 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+ 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
+};
+#endif /* !RSS */
+
+static const struct hyperv_guid hn_guid = {
+ .hv_guid = {
+ 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
+ 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
+};
+
+static device_method_t hn_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, hn_probe),
+ DEVMETHOD(device_attach, hn_attach),
+ DEVMETHOD(device_detach, hn_detach),
+ DEVMETHOD(device_shutdown, hn_shutdown),
+ DEVMETHOD_END
+};
+
+static driver_t hn_driver = {
+ "hn",
+ hn_methods,
+ sizeof(struct hn_softc)
+};
+
+static devclass_t hn_devclass;
+
+DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
+MODULE_VERSION(hn, 1);
+MODULE_DEPEND(hn, vmbus, 1, 1, 1);
+
+#if __FreeBSD_version >= 1100099
+static void
+hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
+{
+ int i;
+
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
+ sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
+}
+#endif
+
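+/*
+ * Transmit an RNDIS data packet by handing the channel the scatter/gather
+ * list of guest physical addresses prepared in txr->hn_gpa; no chimney
+ * buffer is involved, so the txdesc must not own a chimney slot.
+ */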
+static int
+hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
+{
+
+ KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
+ txd->chim_size == 0, ("invalid rndis sglist txd"));
+ return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
+ &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
+}
+
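+/*
+ * Transmit an RNDIS data packet that is expected to have been staged in
+ * the chimney (pre-allocated send) buffer slot named by txd->chim_index;
+ * only the small HN_NVS_TYPE_RNDIS command travels over the channel.
+ */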
+static int
+hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
+{
+ struct hn_nvs_rndis rndis;
+
+ KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
+ txd->chim_size > 0, ("invalid rndis chim txd"));
+
+ rndis.nvs_type = HN_NVS_TYPE_RNDIS;
+ rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
+ rndis.nvs_chim_idx = txd->chim_index;
+ rndis.nvs_chim_sz = txd->chim_size;
+
+ return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
+ &rndis, sizeof(rndis), &txd->send_ctx));
+}
+
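+/*
+ * Claim a free chimney sending buffer slot: scan the allocation bitmap
+ * and atomically test-and-set the first clear bit.  Returns
+ * HN_NVS_CHIM_IDX_INVALID if all slots are in use.
+ */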
+static __inline uint32_t
+hn_chim_alloc(struct hn_softc *sc)
+{
+ int i, bmap_cnt = sc->hn_chim_bmap_cnt;
+ u_long *bmap = sc->hn_chim_bmap;
+ uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
+
+ for (i = 0; i < bmap_cnt; ++i) {
+ int idx;
+
+ idx = ffsl(~bmap[i]);
+ if (idx == 0)
+ continue;
+
+ --idx; /* ffsl is 1-based */
+ KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
+ ("invalid i %d and idx %d", i, idx));
+
+ if (atomic_testandset_long(&bmap[i], idx))
+ continue;
+
+ ret = i * LONG_BIT + idx;
+ break;
+ }
+ return (ret);
+}
+
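+/*
+ * Release a chimney sending buffer slot; the slot must currently be
+ * marked allocated in the bitmap.
+ */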
+static __inline void
+hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
+{
+ u_long mask;
+ uint32_t idx;
+
+ idx = chim_idx / LONG_BIT;
+ KASSERT(idx < sc->hn_chim_bmap_cnt,
+ ("invalid chimney index 0x%x", chim_idx));
+
+ mask = 1UL << (chim_idx % LONG_BIT);
+ KASSERT(sc->hn_chim_bmap[idx] & mask,
+ ("index bitmap 0x%lx, chimney index %u, "
+ "bitmap idx %d, bitmask 0x%lx",
+ sc->hn_chim_bmap[idx], chim_idx, idx, mask));
+
+ atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
+}
+
+#if defined(INET6) || defined(INET)
+
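+/*
+ * Ensure that at least 'len' bytes are contiguous in the first mbuf;
+ * if m_pullup() fails, the chain has been freed and the enclosing
+ * function bails out by returning NULL.
+ */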
+#define PULLUP_HDR(m, len) \
+do { \
+ if (__predict_false((m)->m_len < (len))) { \
+ (m) = m_pullup((m), (len)); \
+ if ((m) == NULL) \
+ return (NULL); \
+ } \
+} while (0)
+
+/*
+ * NOTE: If this function fails, the m_head will be freed.
+ */
+static __inline struct mbuf *
+hn_tso_fixup(struct mbuf *m_head)
+{
+ struct ether_vlan_header *evl;
+ struct tcphdr *th;
+ int ehlen;
+
+ KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
+
+ PULLUP_HDR(m_head, sizeof(*evl));
+ evl = mtod(m_head, struct ether_vlan_header *);
+ if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
+ ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
+ else
+ ehlen = ETHER_HDR_LEN;
+ m_head->m_pkthdr.l2hlen = ehlen;
+
+#ifdef INET
+ if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
+ struct ip *ip;
+ int iphlen;
+
+ PULLUP_HDR(m_head, ehlen + sizeof(*ip));
+ ip = mtodo(m_head, ehlen);
+ iphlen = ip->ip_hl << 2;
+ m_head->m_pkthdr.l3hlen = iphlen;
+
+ PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
+ th = mtodo(m_head, ehlen + iphlen);
+
+ ip->ip_len = 0;
+ ip->ip_sum = 0;
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htons(IPPROTO_TCP));
+ }
+#endif
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET6
+ {
+ struct ip6_hdr *ip6;
+
+ PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
+ ip6 = mtodo(m_head, ehlen);
+ if (ip6->ip6_nxt != IPPROTO_TCP) {
+ m_freem(m_head);
+ return (NULL);
+ }
+ m_head->m_pkthdr.l3hlen = sizeof(*ip6);
+
+ PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
+ th = mtodo(m_head, ehlen + sizeof(*ip6));
+
+ ip6->ip6_plen = 0;
+ th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
+ }
+#endif
+ return (m_head);
+}
+
+/*
+ * NOTE: If this function fails, the m_head will be freed.
+ */
+static __inline struct mbuf *
+hn_set_hlen(struct mbuf *m_head)
+{
+ const struct ether_vlan_header *evl;
+ int ehlen;
+
+ PULLUP_HDR(m_head, sizeof(*evl));
+ evl = mtod(m_head, const struct ether_vlan_header *);
+ if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
+ ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
+ else
+ ehlen = ETHER_HDR_LEN;
+ m_head->m_pkthdr.l2hlen = ehlen;
+
+#ifdef INET
+ if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
+ const struct ip *ip;
+ int iphlen;
+
+ PULLUP_HDR(m_head, ehlen + sizeof(*ip));
+ ip = mtodo(m_head, ehlen);
+ iphlen = ip->ip_hl << 2;
+ m_head->m_pkthdr.l3hlen = iphlen;
+
+ /*
+		 * UDP checksum offload does not work in Azure if the
+		 * following conditions are met:
+ * - sizeof(IP hdr + UDP hdr + payload) > 1420.
+ * - IP_DF is not set in the IP hdr.
+ *
+		 * Fall back to software checksum for these UDP datagrams.
+ */
+ if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
+ m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
+ (ntohs(ip->ip_off) & IP_DF) == 0) {
+ uint16_t off = ehlen + iphlen;
+
+ counter_u64_add(hn_udpcs_fixup, 1);
+ PULLUP_HDR(m_head, off + sizeof(struct udphdr));
+ *(uint16_t *)(m_head->m_data + off +
+ m_head->m_pkthdr.csum_data) = in_cksum_skip(
+ m_head, m_head->m_pkthdr.len, off);
+ m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
+ }
+ }
+#endif
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET6
+ {
+ const struct ip6_hdr *ip6;
+
+ PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
+ ip6 = mtodo(m_head, ehlen);
+ if (ip6->ip6_nxt != IPPROTO_TCP &&
+ ip6->ip6_nxt != IPPROTO_UDP) {
+ m_freem(m_head);
+ return (NULL);
+ }
+ m_head->m_pkthdr.l3hlen = sizeof(*ip6);
+ }
+#endif
+ return (m_head);
+}
+
+/*
+ * NOTE: If this function fails, the m_head will be freed.
+ */
+static __inline struct mbuf *
+hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
+{
+ const struct tcphdr *th;
+ int ehlen, iphlen;
+
+ *tcpsyn = 0;
+ ehlen = m_head->m_pkthdr.l2hlen;
+ iphlen = m_head->m_pkthdr.l3hlen;
+
+ PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
+ th = mtodo(m_head, ehlen + iphlen);
+ if (th->th_flags & TH_SYN)
+ *tcpsyn = 1;
+ return (m_head);
+}
+
+#undef PULLUP_HDR
+
+#endif /* INET6 || INET */
+
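+/*
+ * Program the RNDIS RX filter, caching the last value so that the host
+ * is only asked when the filter actually changes.
+ */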
+static int
+hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
+{
+ int error = 0;
+
+ HN_LOCK_ASSERT(sc);
+
+ if (sc->hn_rx_filter != filter) {
+ error = hn_rndis_set_rxfilter(sc, filter);
+ if (!error)
+ sc->hn_rx_filter = filter;
+ }
+ return (error);
+}
+
+static int
+hn_rxfilter_config(struct hn_softc *sc)
+{
+ struct ifnet *ifp = sc->hn_ifp;
+ uint32_t filter;
+
+ HN_LOCK_ASSERT(sc);
+
+ /*
+ * If the non-transparent mode VF is activated, we don't know how
+ * its RX filter is configured, so stick the synthetic device in
+	 * promiscuous mode.
+ */
+ if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
+ filter = NDIS_PACKET_TYPE_PROMISCUOUS;
+ } else {
+ filter = NDIS_PACKET_TYPE_DIRECTED;
+ if (ifp->if_flags & IFF_BROADCAST)
+ filter |= NDIS_PACKET_TYPE_BROADCAST;
+ /* TODO: support multicast list */
+ if ((ifp->if_flags & IFF_ALLMULTI) ||
+ !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
+ filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
+ }
+ return (hn_set_rxfilter(sc, filter));
+}
+
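+/*
+ * Propagate the effective transmission aggregation limits (size, packet
+ * count and alignment), derived from the RNDIS-offered values, the
+ * chimney buffer size and the tunables, to every TX ring; aggregation is
+ * disabled entirely when the limits are too small to be useful.
+ */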
+static void
+hn_set_txagg(struct hn_softc *sc)
+{
+ uint32_t size, pkts;
+ int i;
+
+ /*
+ * Setup aggregation size.
+ */
+ if (sc->hn_agg_size < 0)
+ size = UINT32_MAX;
+ else
+ size = sc->hn_agg_size;
+
+ if (sc->hn_rndis_agg_size < size)
+ size = sc->hn_rndis_agg_size;
+
+ /* NOTE: We only aggregate packets using chimney sending buffers. */
+ if (size > (uint32_t)sc->hn_chim_szmax)
+ size = sc->hn_chim_szmax;
+
+ if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
+ /* Disable */
+ size = 0;
+ pkts = 0;
+ goto done;
+ }
+
+ /* NOTE: Type of the per TX ring setting is 'int'. */
+ if (size > INT_MAX)
+ size = INT_MAX;
+
+ /*
+ * Setup aggregation packet count.
+ */
+ if (sc->hn_agg_pkts < 0)
+ pkts = UINT32_MAX;
+ else
+ pkts = sc->hn_agg_pkts;
+
+ if (sc->hn_rndis_agg_pkts < pkts)
+ pkts = sc->hn_rndis_agg_pkts;
+
+ if (pkts <= 1) {
+ /* Disable */
+ size = 0;
+ pkts = 0;
+ goto done;
+ }
+
+ /* NOTE: Type of the per TX ring setting is 'short'. */
+ if (pkts > SHRT_MAX)
+ pkts = SHRT_MAX;
+
+done:
+ /* NOTE: Type of the per TX ring setting is 'short'. */
+ if (sc->hn_rndis_agg_align > SHRT_MAX) {
+ /* Disable */
+ size = 0;
+ pkts = 0;
+ }
+
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
+ size, pkts, sc->hn_rndis_agg_align);
+ }
+
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+ struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
+
+ mtx_lock(&txr->hn_tx_lock);
+ txr->hn_agg_szmax = size;
+ txr->hn_agg_pktmax = pkts;
+ txr->hn_agg_align = sc->hn_rndis_agg_align;
+ mtx_unlock(&txr->hn_tx_lock);
+ }
+}
+
+static int
+hn_get_txswq_depth(const struct hn_tx_ring *txr)
+{
+
+ KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
+ if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
+ return txr->hn_txdesc_cnt;
+ return hn_tx_swq_depth;
+}
+
+static int
+hn_rss_reconfig(struct hn_softc *sc)
+{
+ int error;
+
+ HN_LOCK_ASSERT(sc);
+
+ if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
+ return (ENXIO);
+
+ /*
+ * Disable RSS first.
+ *
+ * NOTE:
+ * Direct reconfiguration by setting the UNCHG flags does
+ * _not_ work properly.
+ */
+ if (bootverbose)
+ if_printf(sc->hn_ifp, "disable RSS\n");
+ error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
+ if (error) {
+ if_printf(sc->hn_ifp, "RSS disable failed\n");
+ return (error);
+ }
+
+ /*
+ * Reenable the RSS w/ the updated RSS key or indirect
+ * table.
+ */
+ if (bootverbose)
+ if_printf(sc->hn_ifp, "reconfig RSS\n");
+ error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
+ if (error) {
+ if_printf(sc->hn_ifp, "RSS reconfig failed\n");
+ return (error);
+ }
+ return (0);
+}
+
+static void
+hn_rss_ind_fixup(struct hn_softc *sc)
+{
+ struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
+ int i, nchan;
+
+ nchan = sc->hn_rx_ring_inuse;
+ KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
+
+ /*
+ * Check indirect table to make sure that all channels in it
+ * can be used.
+ */
+ for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
+ if (rss->rss_ind[i] >= nchan) {
+ if_printf(sc->hn_ifp,
+ "RSS indirect table %d fixup: %u -> %d\n",
+ i, rss->rss_ind[i], nchan - 1);
+ rss->rss_ind[i] = nchan - 1;
+ }
+ }
+}
+
+static int
+hn_ifmedia_upd(struct ifnet *ifp __unused)
+{
+
+ return EOPNOTSUPP;
+}
+
+static void
+hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
+{
+ struct hn_softc *sc = ifp->if_softc;
+
+ ifmr->ifm_status = IFM_AVALID;
+ ifmr->ifm_active = IFM_ETHER;
+
+ if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
+ ifmr->ifm_active |= IFM_NONE;
+ return;
+ }
+ ifmr->ifm_status |= IFM_ACTIVE;
+ ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
+}
+
+static void
+hn_rxvf_set_task(void *xarg, int pending __unused)
+{
+ struct hn_rxvf_setarg *arg = xarg;
+
+ arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
+}
+
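+/*
+ * Point every RX ring at the new VF ifnet.  For rings that are in use the
+ * assignment is executed as a task on the ring's channel, which keeps it
+ * ordered with that channel's RX processing.
+ */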
+static void
+hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
+{
+ struct hn_rx_ring *rxr;
+ struct hn_rxvf_setarg arg;
+ struct task task;
+ int i;
+
+ HN_LOCK_ASSERT(sc);
+
+ TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
+
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ rxr = &sc->hn_rx_ring[i];
+
+ if (i < sc->hn_rx_ring_inuse) {
+ arg.rxr = rxr;
+ arg.vf_ifp = vf_ifp;
+ vmbus_chan_run_task(rxr->hn_chan, &task);
+ } else {
+ rxr->hn_rxvf_ifp = vf_ifp;
+ }
+ }
+}
+
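+/*
+ * Check whether the given ifnet is the VF paired with this synthetic
+ * device: it must be a plain Ethernet interface (not lagg/vlan) whose
+ * link-level address matches hn(4)'s.
+ */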
+static bool
+hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
+{
+ const struct ifnet *hn_ifp;
+
+ hn_ifp = sc->hn_ifp;
+
+ if (ifp == hn_ifp)
+ return (false);
+
+ if (ifp->if_alloctype != IFT_ETHER)
+ return (false);
+
+ /* Ignore lagg/vlan interfaces */
+ if (strcmp(ifp->if_dname, "lagg") == 0 ||
+ strcmp(ifp->if_dname, "vlan") == 0)
+ return (false);
+
+ /*
+ * During detach events ifp->if_addr might be NULL.
+ * Make sure the bcmp() below doesn't panic on that:
+ */
+ if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
+ return (false);
+
+ if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
+ return (false);
+
+ return (true);
+}
+
+static void
+hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
+{
+ struct ifnet *hn_ifp;
+
+ HN_LOCK(sc);
+
+ if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
+ goto out;
+
+ if (!hn_ismyvf(sc, ifp))
+ goto out;
+ hn_ifp = sc->hn_ifp;
+
+ if (rxvf) {
+ if (sc->hn_flags & HN_FLAG_RXVF)
+ goto out;
+
+ sc->hn_flags |= HN_FLAG_RXVF;
+ hn_rxfilter_config(sc);
+ } else {
+ if (!(sc->hn_flags & HN_FLAG_RXVF))
+ goto out;
+
+ sc->hn_flags &= ~HN_FLAG_RXVF;
+ if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
+ hn_rxfilter_config(sc);
+ else
+ hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
+ }
+
+ hn_nvs_set_datapath(sc,
+ rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
+
+ hn_rxvf_set(sc, rxvf ? ifp : NULL);
+
+ if (rxvf) {
+ hn_vf_rss_fixup(sc, true);
+ hn_suspend_mgmt(sc);
+ sc->hn_link_flags &=
+ ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
+ if_link_state_change(hn_ifp, LINK_STATE_DOWN);
+ } else {
+ hn_vf_rss_restore(sc);
+ hn_resume_mgmt(sc);
+ }
+
+ devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
+ rxvf ? "VF_UP" : "VF_DOWN", NULL);
+
+ if (bootverbose) {
+ if_printf(hn_ifp, "datapath is switched %s %s\n",
+ rxvf ? "to" : "from", ifp->if_xname);
+ }
+out:
+ HN_UNLOCK(sc);
+}
+
+static void
+hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
+{
+
+ if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
+ return;
+ hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
+}
+
+static void
+hn_ifaddr_event(void *arg, struct ifnet *ifp)
+{
+
+ hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
+}
+
+static int
+hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
+{
+ struct ifnet *ifp, *vf_ifp;
+ uint64_t tmp;
+ int error;
+
+ HN_LOCK_ASSERT(sc);
+ ifp = sc->hn_ifp;
+ vf_ifp = sc->hn_vf_ifp;
+
+ /*
+ * Fix up requested capabilities w/ supported capabilities,
+ * since the supported capabilities could have been changed.
+ */
+ ifr->ifr_reqcap &= ifp->if_capabilities;
+ /* Pass SIOCSIFCAP to VF. */
+ error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
+
+ /*
+ * NOTE:
+ * The error will be propagated to the callers, however, it
+ * is _not_ useful here.
+ */
+
+ /*
+ * Merge VF's enabled capabilities.
+ */
+ ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
+
+ tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
+ if (ifp->if_capenable & IFCAP_TXCSUM)
+ ifp->if_hwassist |= tmp;
+ else
+ ifp->if_hwassist &= ~tmp;
+
+ tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
+ if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
+ ifp->if_hwassist |= tmp;
+ else
+ ifp->if_hwassist &= ~tmp;
+
+ tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
+ if (ifp->if_capenable & IFCAP_TSO4)
+ ifp->if_hwassist |= tmp;
+ else
+ ifp->if_hwassist &= ~tmp;
+
+ tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
+ if (ifp->if_capenable & IFCAP_TSO6)
+ ifp->if_hwassist |= tmp;
+ else
+ ifp->if_hwassist &= ~tmp;
+
+ return (error);
+}
+
+static int
+hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
+{
+ struct ifnet *vf_ifp;
+ struct ifreq ifr;
+
+ HN_LOCK_ASSERT(sc);
+ vf_ifp = sc->hn_vf_ifp;
+
+ memset(&ifr, 0, sizeof(ifr));
+ strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
+ ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
+ ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
+ return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
+}
+
+static void
+hn_xpnt_vf_saveifflags(struct hn_softc *sc)
+{
+ struct ifnet *ifp = sc->hn_ifp;
+ int allmulti = 0;
+
+ HN_LOCK_ASSERT(sc);
+
+ /* XXX vlan(4) style mcast addr maintenance */
+ if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
+ allmulti = IFF_ALLMULTI;
+
+ /* Always set the VF's if_flags */
+ sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
+}
+
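+/*
+ * if_input replacement installed on the VF in transparent mode: packets
+ * received by the VF are tapped and accounted, re-tagged with hn(4) as
+ * the receiving interface and then fed into hn(4)'s if_input, so the VF
+ * stays invisible to the rest of the stack.
+ */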
+static void
+hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
+{
+ struct rm_priotracker pt;
+ struct ifnet *hn_ifp = NULL;
+ struct mbuf *mn;
+
+ /*
+ * XXX racy, if hn(4) ever detached.
+ */
+ rm_rlock(&hn_vfmap_lock, &pt);
+ if (vf_ifp->if_index < hn_vfmap_size)
+ hn_ifp = hn_vfmap[vf_ifp->if_index];
+ rm_runlock(&hn_vfmap_lock, &pt);
+
+ if (hn_ifp != NULL) {
+ for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
+ /*
+ * Allow tapping on the VF.
+ */
+ ETHER_BPF_MTAP(vf_ifp, mn);
+
+ /*
+ * Update VF stats.
+ */
+ if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
+ if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
+ mn->m_pkthdr.len);
+ }
+ /*
+ * XXX IFCOUNTER_IMCAST
+ * This stat updating is kinda invasive, since it
+ * requires two checks on the mbuf: the length check
+			 * and the ethernet header check.  As of this writing,
+			 * all multicast packets go directly to hn(4), which
+			 * makes imcast stat updating in the VF futile.
+ */
+
+ /*
+ * Fix up rcvif and increase hn(4)'s ipackets.
+ */
+ mn->m_pkthdr.rcvif = hn_ifp;
+ if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
+ }
+ /*
+ * Go through hn(4)'s if_input.
+ */
+ hn_ifp->if_input(hn_ifp, m);
+ } else {
+ /*
+ * In the middle of the transition; free this
+ * mbuf chain.
+ */
+ while (m != NULL) {
+ mn = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m_freem(m);
+ m = mn;
+ }
+ }
+}
+
+static void
+hn_mtu_change_fixup(struct hn_softc *sc)
+{
+ struct ifnet *ifp;
+
+ HN_LOCK_ASSERT(sc);
+ ifp = sc->hn_ifp;
+
+ hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
+#if __FreeBSD_version >= 1100099
+ if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
+ hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
+#endif
+}
+
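+/* Translate NDIS hash type bits into the kernel's RSS_TYPE_ flags. */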
+static uint32_t
+hn_rss_type_fromndis(uint32_t rss_hash)
+{
+ uint32_t types = 0;
+
+ if (rss_hash & NDIS_HASH_IPV4)
+ types |= RSS_TYPE_IPV4;
+ if (rss_hash & NDIS_HASH_TCP_IPV4)
+ types |= RSS_TYPE_TCP_IPV4;
+ if (rss_hash & NDIS_HASH_IPV6)
+ types |= RSS_TYPE_IPV6;
+ if (rss_hash & NDIS_HASH_IPV6_EX)
+ types |= RSS_TYPE_IPV6_EX;
+ if (rss_hash & NDIS_HASH_TCP_IPV6)
+ types |= RSS_TYPE_TCP_IPV6;
+ if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
+ types |= RSS_TYPE_TCP_IPV6_EX;
+ if (rss_hash & NDIS_HASH_UDP_IPV4_X)
+ types |= RSS_TYPE_UDP_IPV4;
+ return (types);
+}
+
+static uint32_t
+hn_rss_type_tondis(uint32_t types)
+{
+ uint32_t rss_hash = 0;
+
+ KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
+ ("UDP6 and UDP6EX are not supported"));
+
+ if (types & RSS_TYPE_IPV4)
+ rss_hash |= NDIS_HASH_IPV4;
+ if (types & RSS_TYPE_TCP_IPV4)
+ rss_hash |= NDIS_HASH_TCP_IPV4;
+ if (types & RSS_TYPE_IPV6)
+ rss_hash |= NDIS_HASH_IPV6;
+ if (types & RSS_TYPE_IPV6_EX)
+ rss_hash |= NDIS_HASH_IPV6_EX;
+ if (types & RSS_TYPE_TCP_IPV6)
+ rss_hash |= NDIS_HASH_TCP_IPV6;
+ if (types & RSS_TYPE_TCP_IPV6_EX)
+ rss_hash |= NDIS_HASH_TCP_IPV6_EX;
+ if (types & RSS_TYPE_UDP_IPV4)
+ rss_hash |= NDIS_HASH_UDP_IPV4_X;
+ return (rss_hash);
+}
+
+static void
+hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
+{
+ int i;
+
+ HN_LOCK_ASSERT(sc);
+
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
+ sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
+}
+
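+/*
+ * Reconcile the synthetic device's RSS configuration with the VF's:
+ * adopt the VF's Toeplitz key, intersect the supported hash types and,
+ * where the two sides would report conflicting hash values, stop
+ * delivering the affected hash types via mbufs.
+ */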
+static void
+hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
+{
+ struct ifnet *ifp, *vf_ifp;
+ struct ifrsshash ifrh;
+ struct ifrsskey ifrk;
+ int error;
+ uint32_t my_types, diff_types, mbuf_types = 0;
+
+ HN_LOCK_ASSERT(sc);
+ KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
+ ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
+
+ if (sc->hn_rx_ring_inuse == 1) {
+ /* No RSS on synthetic parts; done. */
+ return;
+ }
+ if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
+ /* Synthetic parts do not support Toeplitz; done. */
+ return;
+ }
+
+ ifp = sc->hn_ifp;
+ vf_ifp = sc->hn_vf_ifp;
+
+ /*
+	 * Extract the VF's RSS key.  Only a 40-byte Toeplitz key is
+ * supported.
+ */
+ memset(&ifrk, 0, sizeof(ifrk));
+ strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
+ error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
+ if (error) {
+ if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
+ vf_ifp->if_xname, error);
+ goto done;
+ }
+ if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
+ if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
+ vf_ifp->if_xname, ifrk.ifrk_func);
+ goto done;
+ }
+ if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
+ if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
+ vf_ifp->if_xname, ifrk.ifrk_keylen);
+ goto done;
+ }
+
+ /*
+ * Extract VF's RSS hash. Only Toeplitz is supported.
+ */
+ memset(&ifrh, 0, sizeof(ifrh));
+ strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
+ error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
+ if (error) {
+		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
+ vf_ifp->if_xname, error);
+ goto done;
+ }
+ if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
+ if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
+ vf_ifp->if_xname, ifrh.ifrh_func);
+ goto done;
+ }
+
+ my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
+ if ((ifrh.ifrh_types & my_types) == 0) {
+ /* This disables RSS; ignore it then */
+ if_printf(ifp, "%s intersection of RSS types failed. "
+ "VF %#x, mine %#x\n", vf_ifp->if_xname,
+ ifrh.ifrh_types, my_types);
+ goto done;
+ }
+
+ diff_types = my_types ^ ifrh.ifrh_types;
+ my_types &= ifrh.ifrh_types;
+ mbuf_types = my_types;
+
+ /*
+ * Detect RSS hash value/type confliction.
+ *
+ * NOTE:
+	 * We don't disable the hash type, but stop delivering the hash
+	 * value/type through mbufs on the RX path.
+ *
+ * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
+ * hash is delivered with type of TCP_IPV4. This means if
+ * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
+ * least to hn_mbuf_hash. However, given that _all_ of the
+ * NICs implement TCP_IPV4, this will _not_ impose any issues
+ * here.
+ */
+ if ((my_types & RSS_TYPE_IPV4) &&
+ (diff_types & ifrh.ifrh_types &
+ (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
+ /* Conflict; disable IPV4 hash type/value delivery. */
+ if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
+ mbuf_types &= ~RSS_TYPE_IPV4;
+ }
+ if ((my_types & RSS_TYPE_IPV6) &&
+ (diff_types & ifrh.ifrh_types &
+ (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
+ RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
+ RSS_TYPE_IPV6_EX))) {
+ /* Conflict; disable IPV6 hash type/value delivery. */
+ if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
+ mbuf_types &= ~RSS_TYPE_IPV6;
+ }
+ if ((my_types & RSS_TYPE_IPV6_EX) &&
+ (diff_types & ifrh.ifrh_types &
+ (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
+ RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
+ RSS_TYPE_IPV6))) {
+ /* Conflict; disable IPV6_EX hash type/value delivery. */
+ if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
+ mbuf_types &= ~RSS_TYPE_IPV6_EX;
+ }
+ if ((my_types & RSS_TYPE_TCP_IPV6) &&
+ (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
+ /* Conflict; disable TCP_IPV6 hash type/value delivery. */
+ if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
+ mbuf_types &= ~RSS_TYPE_TCP_IPV6;
+ }
+ if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
+ (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
+ /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
+ if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
+ mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
+ }
+ if ((my_types & RSS_TYPE_UDP_IPV6) &&
+ (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
+ /* Conflict; disable UDP_IPV6 hash type/value delivery. */
+ if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
+ mbuf_types &= ~RSS_TYPE_UDP_IPV6;
+ }
+ if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
+ (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
+ /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
+ if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
+ mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
+ }
+
+ /*
+ * Indirect table does not matter.
+ */
+
+ sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
+ hn_rss_type_tondis(my_types);
+ memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
+ sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
+
+ if (reconf) {
+ error = hn_rss_reconfig(sc);
+ if (error) {
+ /* XXX roll-back? */
+ if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
+ /* XXX keep going. */
+ }
+ }
+done:
+ /* Hash deliverability for mbufs. */
+ hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
+}
+
+static void
+hn_vf_rss_restore(struct hn_softc *sc)
+{
+
+ HN_LOCK_ASSERT(sc);
+ KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
+ ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
+
+ if (sc->hn_rx_ring_inuse == 1)
+ goto done;
+
+ /*
+ * Restore hash types. Key does _not_ matter.
+ */
+ if (sc->hn_rss_hash != sc->hn_rss_hcap) {
+ int error;
+
+ sc->hn_rss_hash = sc->hn_rss_hcap;
+ error = hn_rss_reconfig(sc);
+ if (error) {
+ if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
+ error);
+ /* XXX keep going. */
+ }
+ }
+done:
+ /* Hash deliverability for mbufs. */
+ hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
+}
+
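+/*
+ * The VF has finished attaching: save hn(4)'s current capabilities and
+ * TSO limits for later restoration, clamp them to what the VF supports,
+ * and push the merged capability and MTU settings down to the VF.
+ */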
+static void
+hn_xpnt_vf_setready(struct hn_softc *sc)
+{
+ struct ifnet *ifp, *vf_ifp;
+ struct ifreq ifr;
+
+ HN_LOCK_ASSERT(sc);
+ ifp = sc->hn_ifp;
+ vf_ifp = sc->hn_vf_ifp;
+
+ /*
+ * Mark the VF ready.
+ */
+ sc->hn_vf_rdytick = 0;
+
+ /*
+ * Save information for restoration.
+ */
+ sc->hn_saved_caps = ifp->if_capabilities;
+ sc->hn_saved_tsomax = ifp->if_hw_tsomax;
+ sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
+ sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
+
+ /*
+ * Intersect supported/enabled capabilities.
+ *
+ * NOTE:
+ * if_hwassist is not changed here.
+ */
+ ifp->if_capabilities &= vf_ifp->if_capabilities;
+ ifp->if_capenable &= ifp->if_capabilities;
+
+ /*
+ * Fix TSO settings.
+ */
+ if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
+ ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
+ if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
+ ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
+ if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
+ ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
+
+ /*
+ * Change VF's enabled capabilities.
+ */
+ memset(&ifr, 0, sizeof(ifr));
+ strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
+ ifr.ifr_reqcap = ifp->if_capenable;
+ hn_xpnt_vf_iocsetcaps(sc, &ifr);
+
+ if (ifp->if_mtu != ETHERMTU) {
+ int error;
+
+ /*
+ * Change VF's MTU.
+ */
+ memset(&ifr, 0, sizeof(ifr));
+ strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
+ ifr.ifr_mtu = ifp->if_mtu;
+ error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
+ if (error) {
+ if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
+ vf_ifp->if_xname, ifp->if_mtu);
+ if (ifp->if_mtu > ETHERMTU) {
+ if_printf(ifp, "change MTU to %d\n", ETHERMTU);
+
+ /*
+ * XXX
+ * No need to adjust the synthetic parts' MTU;
+				 * a failed adjustment would cause us endless
+				 * headaches.
+ */
+ ifp->if_mtu = ETHERMTU;
+ hn_mtu_change_fixup(sc);
+ }
+ }
+ }
+}
+
+static bool
+hn_xpnt_vf_isready(struct hn_softc *sc)
+{
+
+ HN_LOCK_ASSERT(sc);
+
+ if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
+ return (false);
+
+ if (sc->hn_vf_rdytick == 0)
+ return (true);
+
+ if (sc->hn_vf_rdytick > ticks)
+ return (false);
+
+ /* Mark VF as ready. */
+ hn_xpnt_vf_setready(sc);
+ return (true);
+}
+
+static void
+hn_xpnt_vf_setenable(struct hn_softc *sc)
+{
+ int i;
+
+ HN_LOCK_ASSERT(sc);
+
+ /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
+ rm_wlock(&sc->hn_vf_lock);
+ sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
+ rm_wunlock(&sc->hn_vf_lock);
+
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
+ sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
+}
+
+static void
+hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
+{
+ int i;
+
+ HN_LOCK_ASSERT(sc);
+
+ /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
+ rm_wlock(&sc->hn_vf_lock);
+ sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
+ if (clear_vf)
+ sc->hn_vf_ifp = NULL;
+ rm_wunlock(&sc->hn_vf_lock);
+
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
+ sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
+}
+
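+/*
+ * Bring the transparent mode VF up: set IFF_UP on the VF, switch the NVS
+ * datapath to the VF, fix up RSS afterwards (many VFs only generate their
+ * RSS key while coming up), then mark the VF enabled.
+ */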
+static void
+hn_xpnt_vf_init(struct hn_softc *sc)
+{
+ int error;
+
+ HN_LOCK_ASSERT(sc);
+
+ KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
+ ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
+
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "try bringing up %s\n",
+ sc->hn_vf_ifp->if_xname);
+ }
+
+ /*
+ * Bring the VF up.
+ */
+ hn_xpnt_vf_saveifflags(sc);
+ sc->hn_vf_ifp->if_flags |= IFF_UP;
+ error = hn_xpnt_vf_iocsetflags(sc);
+ if (error) {
+ if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
+ sc->hn_vf_ifp->if_xname, error);
+ return;
+ }
+
+ /*
+ * NOTE:
+ * Datapath setting must happen _after_ bringing the VF up.
+ */
+ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
+
+ /*
+ * NOTE:
+	 * Fix up RSS related bits _after_ the VF is brought up, since
+	 * many VFs generate their RSS key during initialization.
+ */
+ hn_vf_rss_fixup(sc, true);
+
+ /* Mark transparent mode VF as enabled. */
+ hn_xpnt_vf_setenable(sc);
+}
+
+static void
+hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
+{
+ struct hn_softc *sc = xsc;
+
+ HN_LOCK(sc);
+
+ if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
+ goto done;
+ if (sc->hn_vf_ifp == NULL)
+ goto done;
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
+ goto done;
+
+ if (sc->hn_vf_rdytick != 0) {
+ /* Mark VF as ready. */
+ hn_xpnt_vf_setready(sc);
+ }
+
+ if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
+ /*
+ * Delayed VF initialization.
+ */
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "delayed initialize %s\n",
+ sc->hn_vf_ifp->if_xname);
+ }
+ hn_xpnt_vf_init(sc);
+ }
+done:
+ HN_UNLOCK(sc);
+}
+
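+/*
+ * Handle the arrival of a new ifnet: if it is our VF, record it in the
+ * global VF map and, in transparent mode, take over its if_input and
+ * schedule the delayed VF bring-up.
+ */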
+static void
+hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
+{
+ struct hn_softc *sc = xsc;
+
+ HN_LOCK(sc);
+
+ if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
+ goto done;
+
+ if (!hn_ismyvf(sc, ifp))
+ goto done;
+
+ if (sc->hn_vf_ifp != NULL) {
+ if_printf(sc->hn_ifp, "%s was attached as VF\n",
+ sc->hn_vf_ifp->if_xname);
+ goto done;
+ }
+
+ if (hn_xpnt_vf && ifp->if_start != NULL) {
+ /*
+ * ifnet.if_start is _not_ supported by transparent
+ * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
+ */
+ if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
+ "in transparent VF mode.\n", ifp->if_xname);
+ goto done;
+ }
+
+ rm_wlock(&hn_vfmap_lock);
+
+ if (ifp->if_index >= hn_vfmap_size) {
+ struct ifnet **newmap;
+ int newsize;
+
+ newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
+ newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
+ M_WAITOK | M_ZERO);
+
+ memcpy(newmap, hn_vfmap,
+ sizeof(struct ifnet *) * hn_vfmap_size);
+ free(hn_vfmap, M_DEVBUF);
+ hn_vfmap = newmap;
+ hn_vfmap_size = newsize;
+ }
+ KASSERT(hn_vfmap[ifp->if_index] == NULL,
+ ("%s: ifindex %d was mapped to %s",
+ ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
+ hn_vfmap[ifp->if_index] = sc->hn_ifp;
+
+ rm_wunlock(&hn_vfmap_lock);
+
+ /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
+ rm_wlock(&sc->hn_vf_lock);
+ KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
+ ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
+ sc->hn_vf_ifp = ifp;
+ rm_wunlock(&sc->hn_vf_lock);
+
+ if (hn_xpnt_vf) {
+ int wait_ticks;
+
+ /*
+ * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
+ * Save vf_ifp's current if_input for later restoration.
+ */
+ sc->hn_vf_input = ifp->if_input;
+ ifp->if_input = hn_xpnt_vf_input;
+
+ /*
+ * Stop link status management; use the VF's.
+ */
+ hn_suspend_mgmt(sc);
+
+ /*
+		 * Give the VF some time to complete its attach routine.
+ */
+ wait_ticks = hn_xpnt_vf_attwait * hz;
+ sc->hn_vf_rdytick = ticks + wait_ticks;
+
+ taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
+ wait_ticks);
+ }
+done:
+ HN_UNLOCK(sc);
+}
+
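+/*
+ * Handle the departure of an ifnet: if it is our VF, undo what
+ * hn_ifnet_attevent() did; restore the VF's if_input, fall back to the
+ * synthetic datapath and RSS settings, and drop the VF from the global
+ * map.
+ */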
+static void
+hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
+{
+ struct hn_softc *sc = xsc;
+
+ HN_LOCK(sc);
+
+ if (sc->hn_vf_ifp == NULL)
+ goto done;
+
+ if (!hn_ismyvf(sc, ifp))
+ goto done;
+
+ if (hn_xpnt_vf) {
+ /*
+ * Make sure that the delayed initialization is not running.
+ *
+ * NOTE:
+ * - This lock _must_ be released, since the hn_vf_init task
+ * will try holding this lock.
+		 * - It is safe to release this lock here, since
+		 *   hn_ifnet_attevent() is interlocked by hn_vf_ifp.
+ *
+ * XXX racy, if hn(4) ever detached.
+ */
+ HN_UNLOCK(sc);
+ taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
+ HN_LOCK(sc);
+
+ KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
+ sc->hn_ifp->if_xname));
+ ifp->if_input = sc->hn_vf_input;
+ sc->hn_vf_input = NULL;
+
+ if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
+ (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
+ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
+
+ if (sc->hn_vf_rdytick == 0) {
+ /*
+ * The VF was ready; restore some settings.
+ */
+ sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
+ /*
+ * NOTE:
+ * There is _no_ need to fixup if_capenable and
+ * if_hwassist, since the if_capabilities before
+ * restoration was an intersection of the VF's
+			 * if_capabilities and the synthetic device's
+			 * if_capabilities.
+ */
+ sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
+ sc->hn_ifp->if_hw_tsomaxsegcount =
+ sc->hn_saved_tsosegcnt;
+ sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
+ }
+
+ if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
+ /*
+ * Restore RSS settings.
+ */
+ hn_vf_rss_restore(sc);
+
+ /*
+ * Resume link status management, which was suspended
+ * by hn_ifnet_attevent().
+ */
+ hn_resume_mgmt(sc);
+ }
+ }
+
+ /* Mark transparent mode VF as disabled. */
+ hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
+
+ rm_wlock(&hn_vfmap_lock);
+
+ KASSERT(ifp->if_index < hn_vfmap_size,
+ ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
+ if (hn_vfmap[ifp->if_index] != NULL) {
+ KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
+ ("%s: ifindex %d was mapped to %s",
+ ifp->if_xname, ifp->if_index,
+ hn_vfmap[ifp->if_index]->if_xname));
+ hn_vfmap[ifp->if_index] = NULL;
+ }
+
+ rm_wunlock(&hn_vfmap_lock);
+done:
+ HN_UNLOCK(sc);
+}
+
+static void
+hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
+{
+ struct hn_softc *sc = xsc;
+
+ if (sc->hn_vf_ifp == ifp)
+ if_link_state_change(sc->hn_ifp, link_state);
+}
+
+static int
+hn_probe(device_t dev)
+{
+
+ if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
+ device_set_desc(dev, "Hyper-V Network Interface");
+ return BUS_PROBE_DEFAULT;
+ }
+ return ENXIO;
+}
+
+static int
+hn_attach(device_t dev)
+{
+ struct hn_softc *sc = device_get_softc(dev);
+ struct sysctl_oid_list *child;
+ struct sysctl_ctx_list *ctx;
+ uint8_t eaddr[ETHER_ADDR_LEN];
+ struct ifnet *ifp = NULL;
+ int error, ring_cnt, tx_ring_cnt;
+ uint32_t mtu;
+
+ sc->hn_dev = dev;
+ sc->hn_prichan = vmbus_get_channel(dev);
+ HN_LOCK_INIT(sc);
+ rm_init(&sc->hn_vf_lock, "hnvf");
+ if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
+ sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
+
+ /*
+ * Initialize these tunables once.
+ */
+ sc->hn_agg_size = hn_tx_agg_size;
+ sc->hn_agg_pkts = hn_tx_agg_pkts;
+
+ /*
+ * Setup taskqueue for transmission.
+ */
+ if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
+ int i;
+
+ sc->hn_tx_taskqs =
+ malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
+ M_DEVBUF, M_WAITOK);
+ for (i = 0; i < hn_tx_taskq_cnt; ++i) {
+ sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
+ M_WAITOK, taskqueue_thread_enqueue,
+ &sc->hn_tx_taskqs[i]);
+ taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
+ "%s tx%d", device_get_nameunit(dev), i);
+ }
+ } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
+ sc->hn_tx_taskqs = hn_tx_taskque;
+ }
+
+ /*
+	 * Setup taskqueue for management tasks, e.g. link status.
+ */
+ sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
+ taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
+ taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
+ device_get_nameunit(dev));
+ TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
+ TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
+ TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
+ hn_netchg_status_taskfunc, sc);
+
+ if (hn_xpnt_vf) {
+ /*
+		 * Setup taskqueue for VF tasks, e.g. delayed VF bring-up.
+ */
+ sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
+ taskqueue_thread_enqueue, &sc->hn_vf_taskq);
+ taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
+ device_get_nameunit(dev));
+ TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
+ hn_xpnt_vf_init_taskfunc, sc);
+ }
+
+ /*
+ * Allocate ifnet and setup its name earlier, so that if_printf
+	 * can be used by functions that will be called after
+ * ether_ifattach().
+ */
+ ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
+ ifp->if_softc = sc;
+ if_initname(ifp, device_get_name(dev), device_get_unit(dev));
+
+ /*
+ * Initialize ifmedia earlier so that it can be unconditionally
+	 * destroyed if an error happens later on.
+ */
+ ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
+
+ /*
+ * Figure out the # of RX rings (ring_cnt) and the # of TX rings
+ * to use (tx_ring_cnt).
+ *
+ * NOTE:
+	 * The # of RX rings to use is the same as the # of channels to use.
+ */
+ ring_cnt = hn_chan_cnt;
+ if (ring_cnt <= 0) {
+ /* Default */
+ ring_cnt = mp_ncpus;
+ if (ring_cnt > HN_RING_CNT_DEF_MAX)
+ ring_cnt = HN_RING_CNT_DEF_MAX;
+ } else if (ring_cnt > mp_ncpus) {
+ ring_cnt = mp_ncpus;
+ }
+#ifdef RSS
+ if (ring_cnt > rss_getnumbuckets())
+ ring_cnt = rss_getnumbuckets();
+#endif
+
+ tx_ring_cnt = hn_tx_ring_cnt;
+ if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
+ tx_ring_cnt = ring_cnt;
+#ifdef HN_IFSTART_SUPPORT
+ if (hn_use_if_start) {
+ /* ifnet.if_start only needs one TX ring. */
+ tx_ring_cnt = 1;
+ }
+#endif
+
+ /*
+ * Set the leader CPU for channels.
+ */
+ sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
+
+ /*
+	 * Create enough TX/RX rings, even if only a limited number of
+ * channels can be allocated.
+ */
+ error = hn_create_tx_data(sc, tx_ring_cnt);
+ if (error)
+ goto failed;
+ error = hn_create_rx_data(sc, ring_cnt);
+ if (error)
+ goto failed;
+
+ /*
+ * Create transaction context for NVS and RNDIS transactions.
+ */
+ sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
+ HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
+ if (sc->hn_xact == NULL) {
+ error = ENXIO;
+ goto failed;
+ }
+
+ /*
+ * Install orphan handler for the revocation of this device's
+ * primary channel.
+ *
+ * NOTE:
+ * The processing order is critical here:
+ * Install the orphan handler, _before_ testing whether this
+ * device's primary channel has been revoked or not.
+ */
+ vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
+ if (vmbus_chan_is_revoked(sc->hn_prichan)) {
+ error = ENXIO;
+ goto failed;
+ }
+
+ /*
+ * Attach the synthetic parts, i.e. NVS and RNDIS.
+ */
+ error = hn_synth_attach(sc, ETHERMTU);
+ if (error)
+ goto failed;
+
+ error = hn_rndis_get_eaddr(sc, eaddr);
+ if (error)
+ goto failed;
+
+ error = hn_rndis_get_mtu(sc, &mtu);
+ if (error)
+ mtu = ETHERMTU;
+ else if (bootverbose)
+ device_printf(dev, "RNDIS mtu %u\n", mtu);
+
+#if __FreeBSD_version >= 1100099
+ if (sc->hn_rx_ring_inuse > 1) {
+ /*
+ * Reduce TCP segment aggregation limit for multiple
+ * RX rings to increase ACK timeliness.
+ */
+ hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
+ }
+#endif
+
+ /*
+	 * Fix up TX/RX settings after the synthetic parts are attached.
+ */
+ hn_fixup_tx_data(sc);
+ hn_fixup_rx_data(sc);
+
+ ctx = device_get_sysctl_ctx(dev);
+ child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
+ &sc->hn_nvs_ver, 0, "NVS version");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_ndis_version_sysctl, "A", "NDIS version");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_caps_sysctl, "A", "capabilities");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_hwassist_sysctl, "A", "hwassist");
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
+ CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
+ CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
+ "max # of TSO segments");
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
+ CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
+ "max size of TSO segment");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_rxfilter_sysctl, "A", "rxfilter");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_rss_hash_sysctl, "A", "RSS hash");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
+ CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
+#ifndef RSS
+ /*
+	 * Don't allow RSS key/indirect table changes if RSS is defined.
+ */
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
+ CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+ hn_rss_key_sysctl, "IU", "RSS key");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
+ CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+ hn_rss_ind_sysctl, "IU", "RSS indirect table");
+#endif
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
+ CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
+ "RNDIS offered packet transmission aggregation size limit");
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
+ CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
+ "RNDIS offered packet transmission aggregation count limit");
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
+ CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
+ "RNDIS packet transmission aggregation alignment");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+ hn_txagg_size_sysctl, "I",
+ "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+ hn_txagg_pkts_sysctl, "I",
+ "Packet transmission aggregation packets, "
+ "0 -- disable, -1 -- auto");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
+ CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+ hn_polling_sysctl, "I",
+ "Polling frequency: [100,1000000], 0 disable polling");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_vf_sysctl, "A", "Virtual Function's name");
+ if (!hn_xpnt_vf) {
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_rxvf_sysctl, "A", "activated Virtual Function's name");
+ } else {
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
+ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_xpnt_vf_enabled_sysctl, "I",
+ "Transparent VF enabled");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+ hn_xpnt_vf_accbpf_sysctl, "I",
+ "Accurate BPF for transparent VF");
+ }
+
+ /*
+ * Setup the ifmedia, which has been initialized earlier.
+ */
+ ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
+ ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
+ /* XXX ifmedia_set really should do this for us */
+ sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
+
+ /*
+ * Setup the ifnet for this interface.
+ */
+
+ ifp->if_baudrate = IF_Gbps(10);
+ ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+ ifp->if_ioctl = hn_ioctl;
+ ifp->if_init = hn_init;
+#ifdef HN_IFSTART_SUPPORT
+ if (hn_use_if_start) {
+ int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
+
+ ifp->if_start = hn_start;
+ IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
+ ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
+ IFQ_SET_READY(&ifp->if_snd);
+ } else
+#endif
+ {
+ ifp->if_transmit = hn_transmit;
+ ifp->if_qflush = hn_xmit_qflush;
+ }
+
+ ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
+#ifdef foo
+	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
+ ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
+#endif
+ if (sc->hn_caps & HN_CAP_VLAN) {
+ /* XXX not sure about VLAN_MTU. */
+ ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
+ }
+
+ ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
+ if (ifp->if_hwassist & HN_CSUM_IP_MASK)
+ ifp->if_capabilities |= IFCAP_TXCSUM;
+ if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
+ ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
+ if (sc->hn_caps & HN_CAP_TSO4) {
+ ifp->if_capabilities |= IFCAP_TSO4;
+ ifp->if_hwassist |= CSUM_IP_TSO;
+ }
+ if (sc->hn_caps & HN_CAP_TSO6) {
+ ifp->if_capabilities |= IFCAP_TSO6;
+ ifp->if_hwassist |= CSUM_IP6_TSO;
+ }
+
+ /* Enable all available capabilities by default. */
+ ifp->if_capenable = ifp->if_capabilities;
+
+ /*
+ * Disable IPv6 TSO and TXCSUM by default, they still can
+ * be enabled through SIOCSIFCAP.
+ */
+ ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
+ ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
+
+ if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
+ /*
+ * Lock hn_set_tso_maxsize() to simplify its
+ * internal logic.
+ */
+ HN_LOCK(sc);
+ hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
+ HN_UNLOCK(sc);
+ ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
+ ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
+ }
+
+ ether_ifattach(ifp, eaddr);
+
+ if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
+ if_printf(ifp, "TSO segcnt %u segsz %u\n",
+ ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
+ }
+ if (mtu < ETHERMTU) {
+ if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
+ ifp->if_mtu = mtu;
+ }
+
+ /* Inform the upper layer about the long frame support. */
+ ifp->if_hdrlen = sizeof(struct ether_vlan_header);
+
+ /*
+ * Kick off link status check.
+ */
+ sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
+ hn_update_link_status(sc);
+
+ if (!hn_xpnt_vf) {
+ sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
+ hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
+ sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
+ hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
+ } else {
+ sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
+ hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
+ }
+
+ /*
+ * NOTE:
+	 * Subscribe to the ether_ifattach event, instead of the ifnet_arrival
+	 * event, since the interface's LLADDR is needed; the LLADDR is not
+	 * yet available when the ifnet_arrival event is triggered.
+ */
+ sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
+ hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
+ sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
+ hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
+
+ return (0);
+failed:
+ if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
+ hn_synth_detach(sc);
+ hn_detach(dev);
+ return (error);
+}
+
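+/*
+ * Device detach: undo hn_attach().  Tear down the synthetic parts,
+ * event handlers, TX/RX rings, taskqueues and the transaction context.
+ */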
+static int
+hn_detach(device_t dev)
+{
+ struct hn_softc *sc = device_get_softc(dev);
+ struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
+
+ if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
+ /*
+		 * The vmbus may have missed the orphan handler
+		 * installation; orphan the transaction context here.
+ */
+ vmbus_xact_ctx_orphan(sc->hn_xact);
+ }
+
+ if (sc->hn_ifaddr_evthand != NULL)
+ EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
+ if (sc->hn_ifnet_evthand != NULL)
+ EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
+ if (sc->hn_ifnet_atthand != NULL) {
+ EVENTHANDLER_DEREGISTER(ether_ifattach_event,
+ sc->hn_ifnet_atthand);
+ }
+ if (sc->hn_ifnet_dethand != NULL) {
+ EVENTHANDLER_DEREGISTER(ifnet_departure_event,
+ sc->hn_ifnet_dethand);
+ }
+ if (sc->hn_ifnet_lnkhand != NULL)
+ EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
+
+ vf_ifp = sc->hn_vf_ifp;
+ __compiler_membar();
+ if (vf_ifp != NULL)
+ hn_ifnet_detevent(sc, vf_ifp);
+
+ if (device_is_attached(dev)) {
+ HN_LOCK(sc);
+ if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+ hn_stop(sc, true);
+ /*
+ * NOTE:
+			 * hn_stop() only suspends data, so management
+			 * tasks have to be suspended manually here.
+ */
+ hn_suspend_mgmt(sc);
+ hn_synth_detach(sc);
+ }
+ HN_UNLOCK(sc);
+ ether_ifdetach(ifp);
+ }
+
+ ifmedia_removeall(&sc->hn_media);
+ hn_destroy_rx_data(sc);
+ hn_destroy_tx_data(sc);
+
+ if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
+ int i;
+
+ for (i = 0; i < hn_tx_taskq_cnt; ++i)
+ taskqueue_free(sc->hn_tx_taskqs[i]);
+ free(sc->hn_tx_taskqs, M_DEVBUF);
+ }
+ taskqueue_free(sc->hn_mgmt_taskq0);
+ if (sc->hn_vf_taskq != NULL)
+ taskqueue_free(sc->hn_vf_taskq);
+
+ if (sc->hn_xact != NULL) {
+ /*
+ * Uninstall the orphan handler _before_ the xact is
+ * destructed.
+ */
+ vmbus_chan_unset_orphan(sc->hn_prichan);
+ vmbus_xact_ctx_destroy(sc->hn_xact);
+ }
+
+ if_free(ifp);
+
+ HN_LOCK_DESTROY(sc);
+ rm_destroy(&sc->hn_vf_lock);
+ return (0);
+}
+
+static int
+hn_shutdown(device_t dev)
+{
+
+ return (0);
+}
+
+static void
+hn_link_status(struct hn_softc *sc)
+{
+ uint32_t link_status;
+ int error;
+
+ error = hn_rndis_get_linkstatus(sc, &link_status);
+ if (error) {
+ /* XXX what to do? */
+ return;
+ }
+
+ if (link_status == NDIS_MEDIA_STATE_CONNECTED)
+ sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
+ else
+ sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
+ if_link_state_change(sc->hn_ifp,
+ (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
+ LINK_STATE_UP : LINK_STATE_DOWN);
+}
+
+static void
+hn_link_taskfunc(void *xsc, int pending __unused)
+{
+ struct hn_softc *sc = xsc;
+
+ if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
+ return;
+ hn_link_status(sc);
+}
+
+static void
+hn_netchg_init_taskfunc(void *xsc, int pending __unused)
+{
+ struct hn_softc *sc = xsc;
+
+ /* Prevent any link status checks from running. */
+ sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
+
+ /*
+ * Fake up a [link down --> link up] state change; 5 seconds
+ * delay is used, which closely simulates miibus reaction
+ * upon link down event.
+ */
+ sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
+ if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
+ taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
+ &sc->hn_netchg_status, 5 * hz);
+}
+
+static void
+hn_netchg_status_taskfunc(void *xsc, int pending __unused)
+{
+ struct hn_softc *sc = xsc;
+
+ /* Re-allow link status checks. */
+ sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
+ hn_link_status(sc);
+}
+
+static void
+hn_update_link_status(struct hn_softc *sc)
+{
+
+ if (sc->hn_mgmt_taskq != NULL)
+ taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
+}
+
+static void
+hn_change_network(struct hn_softc *sc)
+{
+
+ if (sc->hn_mgmt_taskq != NULL)
+ taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
+}
+
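+/*
+ * Load the mbuf chain into the txdesc's busdma map for SGL sending;
+ * collapse the chain once, if it has too many segments.
+ */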
+static __inline int
+hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
+ struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
+{
+ struct mbuf *m = *m_head;
+ int error;
+
+ KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
+
+ error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
+ m, segs, nsegs, BUS_DMA_NOWAIT);
+ if (error == EFBIG) {
+ struct mbuf *m_new;
+
+ m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
+ if (m_new == NULL)
+ return ENOBUFS;
+ else
+ *m_head = m = m_new;
+ txr->hn_tx_collapsed++;
+
+ error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
+ txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
+ }
+ if (!error) {
+ bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
+ BUS_DMASYNC_PREWRITE);
+ txd->flags |= HN_TXD_FLAG_DMAMAP;
+ }
+ return error;
+}
+
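+/*
+ * Release one reference on the txdesc.  On the last reference, free
+ * any aggregated txdescs, the chimney buffer or the busdma map, free
+ * the mbuf and put the txdesc back onto the free list/buf_ring.
+ * Returns 1 if the txdesc was actually freed, 0 otherwise.
+ */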
+static __inline int
+hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
+{
+
+ KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
+ ("put an onlist txd %#x", txd->flags));
+ KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
+ ("put an onagg txd %#x", txd->flags));
+
+ KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
+ if (atomic_fetchadd_int(&txd->refs, -1) != 1)
+ return 0;
+
+ if (!STAILQ_EMPTY(&txd->agg_list)) {
+ struct hn_txdesc *tmp_txd;
+
+ while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
+ int freed;
+
+ KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
+			    ("recursive aggregation on aggregated txdesc"));
+ KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
+ ("not aggregated txdesc"));
+ KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
+ ("aggregated txdesc uses dmamap"));
+ KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
+ ("aggregated txdesc consumes "
+ "chimney sending buffer"));
+ KASSERT(tmp_txd->chim_size == 0,
+ ("aggregated txdesc has non-zero "
+ "chimney sending size"));
+
+ STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
+ tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
+ freed = hn_txdesc_put(txr, tmp_txd);
+ KASSERT(freed, ("failed to free aggregated txdesc"));
+ }
+ }
+
+ if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
+ KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
+ ("chim txd uses dmamap"));
+ hn_chim_free(txr->hn_sc, txd->chim_index);
+ txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
+ txd->chim_size = 0;
+ } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
+ bus_dmamap_sync(txr->hn_tx_data_dtag,
+ txd->data_dmap, BUS_DMASYNC_POSTWRITE);
+ bus_dmamap_unload(txr->hn_tx_data_dtag,
+ txd->data_dmap);
+ txd->flags &= ~HN_TXD_FLAG_DMAMAP;
+ }
+
+ if (txd->m != NULL) {
+ m_freem(txd->m);
+ txd->m = NULL;
+ }
+
+ txd->flags |= HN_TXD_FLAG_ONLIST;
+#ifndef HN_USE_TXDESC_BUFRING
+ mtx_lock_spin(&txr->hn_txlist_spin);
+ KASSERT(txr->hn_txdesc_avail >= 0 &&
+ txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
+ ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
+ txr->hn_txdesc_avail++;
+ SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
+ mtx_unlock_spin(&txr->hn_txlist_spin);
+#else /* HN_USE_TXDESC_BUFRING */
+#ifdef HN_DEBUG
+ atomic_add_int(&txr->hn_txdesc_avail, 1);
+#endif
+ buf_ring_enqueue(txr->hn_txdesc_br, txd);
+#endif /* !HN_USE_TXDESC_BUFRING */
+
+ return 1;
+}
+
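+/*
+ * Allocate a txdesc from the per-ring free list/buf_ring and grab
+ * the initial reference; returns NULL if none is available.
+ */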
+static __inline struct hn_txdesc *
+hn_txdesc_get(struct hn_tx_ring *txr)
+{
+ struct hn_txdesc *txd;
+
+#ifndef HN_USE_TXDESC_BUFRING
+ mtx_lock_spin(&txr->hn_txlist_spin);
+ txd = SLIST_FIRST(&txr->hn_txlist);
+ if (txd != NULL) {
+ KASSERT(txr->hn_txdesc_avail > 0,
+ ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
+ txr->hn_txdesc_avail--;
+ SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
+ }
+ mtx_unlock_spin(&txr->hn_txlist_spin);
+#else
+ txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
+#endif
+
+ if (txd != NULL) {
+#ifdef HN_USE_TXDESC_BUFRING
+#ifdef HN_DEBUG
+ atomic_subtract_int(&txr->hn_txdesc_avail, 1);
+#endif
+#endif /* HN_USE_TXDESC_BUFRING */
+ KASSERT(txd->m == NULL && txd->refs == 0 &&
+ STAILQ_EMPTY(&txd->agg_list) &&
+ txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
+ txd->chim_size == 0 &&
+ (txd->flags & HN_TXD_FLAG_ONLIST) &&
+ (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
+ (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
+ txd->flags &= ~HN_TXD_FLAG_ONLIST;
+ txd->refs = 1;
+ }
+ return txd;
+}
+
+static __inline void
+hn_txdesc_hold(struct hn_txdesc *txd)
+{
+
+ /* 0->1 transition will never work */
+ KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
+ atomic_add_int(&txd->refs, 1);
+}
+
+static __inline void
+hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
+{
+
+ KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
+ ("recursive aggregation on aggregating txdesc"));
+
+ KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
+ ("already aggregated"));
+ KASSERT(STAILQ_EMPTY(&txd->agg_list),
+ ("recursive aggregation on to-be-aggregated txdesc"));
+
+ txd->flags |= HN_TXD_FLAG_ONAGG;
+ STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
+}
+
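+/*
+ * Returns true if any txdesc of this TX ring is still in flight.
+ */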
+static bool
+hn_tx_ring_pending(struct hn_tx_ring *txr)
+{
+ bool pending = false;
+
+#ifndef HN_USE_TXDESC_BUFRING
+ mtx_lock_spin(&txr->hn_txlist_spin);
+ if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
+ pending = true;
+ mtx_unlock_spin(&txr->hn_txlist_spin);
+#else
+ if (!buf_ring_full(txr->hn_txdesc_br))
+ pending = true;
+#endif
+ return (pending);
+}
+
+static __inline void
+hn_txeof(struct hn_tx_ring *txr)
+{
+ txr->hn_has_txeof = 0;
+ txr->hn_txeof(txr);
+}
+
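+/*
+ * NVS send-completion callback: return the txdesc and reap TX
+ * completions, once enough sends have been completed.
+ */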
+static void
+hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
+ struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
+{
+ struct hn_txdesc *txd = sndc->hn_cbarg;
+ struct hn_tx_ring *txr;
+
+ txr = txd->txr;
+ KASSERT(txr->hn_chan == chan,
+ ("channel mismatch, on chan%u, should be chan%u",
+ vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
+
+ txr->hn_has_txeof = 1;
+ hn_txdesc_put(txr, txd);
+
+ ++txr->hn_txdone_cnt;
+ if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
+ txr->hn_txdone_cnt = 0;
+ if (txr->hn_oactive)
+ hn_txeof(txr);
+ }
+}
+
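+/*
+ * Per-channel RX/TX rollup: flush pending LRO aggregations and reap
+ * TX completions accumulated during channel processing.
+ */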
+static void
+hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
+{
+#if defined(INET) || defined(INET6)
+ struct epoch_tracker et;
+
+ NET_EPOCH_ENTER(et);
+ tcp_lro_flush_all(&rxr->hn_lro);
+ NET_EPOCH_EXIT(et);
+#endif
+
+ /*
+ * NOTE:
+	 * 'txr' could be NULL, if multiple channels are used
+	 * while the ifnet.if_start method is enabled.
+ */
+ if (txr == NULL || !txr->hn_has_txeof)
+ return;
+
+ txr->hn_txdone_cnt = 0;
+ hn_txeof(txr);
+}
+
+static __inline uint32_t
+hn_rndis_pktmsg_offset(uint32_t ofs)
+{
+
+ KASSERT(ofs >= sizeof(struct rndis_packet_msg),
+ ("invalid RNDIS packet msg offset %u", ofs));
+ return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
+}
+
+static __inline void *
+hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
+ size_t pi_dlen, uint32_t pi_type)
+{
+ const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
+ struct rndis_pktinfo *pi;
+
+ KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
+ ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
+
+ /*
+ * Per-packet-info does not move; it only grows.
+ *
+ * NOTE:
+ * rm_pktinfooffset in this phase counts from the beginning
+ * of rndis_packet_msg.
+ */
+ KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
+ ("%u pktinfo overflows RNDIS packet msg", pi_type));
+ pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
+ pkt->rm_pktinfolen);
+ pkt->rm_pktinfolen += pi_size;
+
+ pi->rm_size = pi_size;
+ pi->rm_type = pi_type;
+ pi->rm_internal = 0;
+ pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
+
+ return (pi->rm_data);
+}
+
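+/*
+ * Send the currently aggregating txdesc and reset the aggregation
+ * state of this TX ring.
+ */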
+static __inline int
+hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
+{
+ struct hn_txdesc *txd;
+ struct mbuf *m;
+ int error, pkts;
+
+ txd = txr->hn_agg_txd;
+ KASSERT(txd != NULL, ("no aggregate txdesc"));
+
+ /*
+ * Since hn_txpkt() will reset this temporary stat, save
+ * it now, so that oerrors can be updated properly, if
+ * hn_txpkt() ever fails.
+ */
+ pkts = txr->hn_stat_pkts;
+
+ /*
+ * Since txd's mbuf will _not_ be freed upon hn_txpkt()
+ * failure, save it for later freeing, if hn_txpkt() ever
+ * fails.
+ */
+ m = txd->m;
+ error = hn_txpkt(ifp, txr, txd);
+ if (__predict_false(error)) {
+ /* txd is freed, but m is not. */
+ m_freem(m);
+
+ txr->hn_flush_failed++;
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
+ }
+
+ /* Reset all aggregation states. */
+ txr->hn_agg_txd = NULL;
+ txr->hn_agg_szleft = 0;
+ txr->hn_agg_pktleft = 0;
+ txr->hn_agg_prevpkt = NULL;
+
+ return (error);
+}
+
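+/*
+ * Try to reserve space in the chimney sending buffer for this packet,
+ * either by appending it to the currently aggregating txdesc or by
+ * allocating a new chimney buffer; returns NULL if chimney sending
+ * cannot be used.
+ */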
+static void *
+hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
+ int pktsize)
+{
+ void *chim;
+
+ if (txr->hn_agg_txd != NULL) {
+ if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
+ struct hn_txdesc *agg_txd = txr->hn_agg_txd;
+ struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
+ int olen;
+
+ /*
+ * Update the previous RNDIS packet's total length,
+ * it can be increased due to the mandatory alignment
+ * padding for this RNDIS packet. And update the
+ * aggregating txdesc's chimney sending buffer size
+ * accordingly.
+ *
+ * XXX
+ * Zero-out the padding, as required by the RNDIS spec.
+ */
+ olen = pkt->rm_len;
+ pkt->rm_len = roundup2(olen, txr->hn_agg_align);
+ agg_txd->chim_size += pkt->rm_len - olen;
+
+ /* Link this txdesc to the parent. */
+ hn_txdesc_agg(agg_txd, txd);
+
+ chim = (uint8_t *)pkt + pkt->rm_len;
+ /* Save the current packet for later fixup. */
+ txr->hn_agg_prevpkt = chim;
+
+ txr->hn_agg_pktleft--;
+ txr->hn_agg_szleft -= pktsize;
+ if (txr->hn_agg_szleft <=
+ HN_PKTSIZE_MIN(txr->hn_agg_align)) {
+ /*
+ * Probably can't aggregate more packets,
+ * flush this aggregating txdesc proactively.
+ */
+ txr->hn_agg_pktleft = 0;
+ }
+ /* Done! */
+ return (chim);
+ }
+ hn_flush_txagg(ifp, txr);
+ }
+ KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
+
+ txr->hn_tx_chimney_tried++;
+ txd->chim_index = hn_chim_alloc(txr->hn_sc);
+ if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
+ return (NULL);
+ txr->hn_tx_chimney++;
+
+ chim = txr->hn_sc->hn_chim +
+ (txd->chim_index * txr->hn_sc->hn_chim_szmax);
+
+ if (txr->hn_agg_pktmax > 1 &&
+ txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
+ txr->hn_agg_txd = txd;
+ txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
+ txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
+ txr->hn_agg_prevpkt = chim;
+ }
+ return (chim);
+}
+
+/*
+ * NOTE:
+ * If this function fails, then both txd and m_head0 will be freed.
+ */
+static int
+hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
+ struct mbuf **m_head0)
+{
+ bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
+ int error, nsegs, i;
+ struct mbuf *m_head = *m_head0;
+ struct rndis_packet_msg *pkt;
+ uint32_t *pi_data;
+ void *chim = NULL;
+ int pkt_hlen, pkt_size;
+
+ pkt = txd->rndis_pkt;
+ pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
+ if (pkt_size < txr->hn_chim_size) {
+ chim = hn_try_txagg(ifp, txr, txd, pkt_size);
+ if (chim != NULL)
+ pkt = chim;
+ } else {
+ if (txr->hn_agg_txd != NULL)
+ hn_flush_txagg(ifp, txr);
+ }
+
+ pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
+ pkt->rm_len = m_head->m_pkthdr.len;
+ pkt->rm_dataoffset = 0;
+ pkt->rm_datalen = m_head->m_pkthdr.len;
+ pkt->rm_oobdataoffset = 0;
+ pkt->rm_oobdatalen = 0;
+ pkt->rm_oobdataelements = 0;
+ pkt->rm_pktinfooffset = sizeof(*pkt);
+ pkt->rm_pktinfolen = 0;
+ pkt->rm_vchandle = 0;
+ pkt->rm_reserved = 0;
+
+ if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
+ /*
+ * Set the hash value for this packet.
+ */
+ pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
+ HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
+
+ if (M_HASHTYPE_ISHASH(m_head))
+ /*
+			 * The flowid field contains the hash value that
+			 * the host set in the RX queue, if this is an IP
+			 * forwarding packet.  Use the same hash value, so
+			 * the host can send on the CPU the packet was
+			 * received on.
+ */
+ *pi_data = m_head->m_pkthdr.flowid;
+ else
+ /*
+ * Otherwise just put the tx queue index.
+ */
+ *pi_data = txr->hn_tx_idx;
+ }
+
+ if (m_head->m_flags & M_VLANTAG) {
+ pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
+ NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
+ *pi_data = NDIS_VLAN_INFO_MAKE(
+ EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
+ EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
+ EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
+ }
+
+ if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
+#if defined(INET6) || defined(INET)
+ pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
+ NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
+#ifdef INET
+ if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
+ *pi_data = NDIS_LSO2_INFO_MAKEIPV4(
+ m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
+ m_head->m_pkthdr.tso_segsz);
+ }
+#endif
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET6
+ {
+ *pi_data = NDIS_LSO2_INFO_MAKEIPV6(
+ m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
+ m_head->m_pkthdr.tso_segsz);
+ }
+#endif
+#endif /* INET6 || INET */
+ } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
+ pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
+ NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
+ if (m_head->m_pkthdr.csum_flags &
+ (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
+ *pi_data = NDIS_TXCSUM_INFO_IPV6;
+ } else {
+ *pi_data = NDIS_TXCSUM_INFO_IPV4;
+ if (m_head->m_pkthdr.csum_flags & CSUM_IP)
+ *pi_data |= NDIS_TXCSUM_INFO_IPCS;
+ }
+
+ if (m_head->m_pkthdr.csum_flags &
+ (CSUM_IP_TCP | CSUM_IP6_TCP)) {
+ *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
+ m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
+ } else if (m_head->m_pkthdr.csum_flags &
+ (CSUM_IP_UDP | CSUM_IP6_UDP)) {
+ *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
+ m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
+ }
+ }
+
+ pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
+ /* Fixup RNDIS packet message total length */
+ pkt->rm_len += pkt_hlen;
+ /* Convert RNDIS packet message offsets */
+ pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
+ pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
+
+ /*
+ * Fast path: Chimney sending.
+ */
+ if (chim != NULL) {
+ struct hn_txdesc *tgt_txd = txd;
+
+ if (txr->hn_agg_txd != NULL) {
+ tgt_txd = txr->hn_agg_txd;
+#ifdef INVARIANTS
+ *m_head0 = NULL;
+#endif
+ }
+
+ KASSERT(pkt == chim,
+ ("RNDIS pkt not in chimney sending buffer"));
+ KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
+ ("chimney sending buffer is not used"));
+ tgt_txd->chim_size += pkt->rm_len;
+
+ m_copydata(m_head, 0, m_head->m_pkthdr.len,
+ ((uint8_t *)chim) + pkt_hlen);
+
+ txr->hn_gpa_cnt = 0;
+ txr->hn_sendpkt = hn_txpkt_chim;
+ goto done;
+ }
+
+ KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
+ KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
+ ("chimney buffer is used"));
+ KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
+
+ error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
+ if (__predict_false(error)) {
+ int freed;
+
+ /*
+ * This mbuf is not linked w/ the txd yet, so free it now.
+ */
+ m_freem(m_head);
+ *m_head0 = NULL;
+
+ freed = hn_txdesc_put(txr, txd);
+ KASSERT(freed != 0,
+ ("fail to free txd upon txdma error"));
+
+ txr->hn_txdma_failed++;
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return error;
+ }
+ *m_head0 = m_head;
+
+ /* +1 RNDIS packet message */
+ txr->hn_gpa_cnt = nsegs + 1;
+
+ /* send packet with page buffer */
+ txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
+ txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
+ txr->hn_gpa[0].gpa_len = pkt_hlen;
+
+ /*
+ * Fill the page buffers with mbuf info after the page
+ * buffer for RNDIS packet message.
+ */
+ for (i = 0; i < nsegs; ++i) {
+ struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
+
+ gpa->gpa_page = atop(segs[i].ds_addr);
+ gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
+ gpa->gpa_len = segs[i].ds_len;
+ }
+
+ txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
+ txd->chim_size = 0;
+ txr->hn_sendpkt = hn_txpkt_sglist;
+done:
+ txd->m = m_head;
+
+ /* Set the completion routine */
+ hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
+
+ /* Update temporary stats for later use. */
+ txr->hn_stat_pkts++;
+ txr->hn_stat_size += m_head->m_pkthdr.len;
+ if (m_head->m_flags & M_MCAST)
+ txr->hn_stat_mcasts++;
+
+ return 0;
+}
+
+/*
+ * NOTE:
+ * If this function fails, then txd will be freed, but the mbuf
+ * associated w/ the txd will _not_ be freed.
+ */
+static int
+hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
+{
+ int error, send_failed = 0, has_bpf;
+
+again:
+ has_bpf = bpf_peers_present(ifp->if_bpf);
+ if (has_bpf) {
+ /*
+ * Make sure that this txd and any aggregated txds are not
+ * freed before ETHER_BPF_MTAP.
+ */
+ hn_txdesc_hold(txd);
+ }
+ error = txr->hn_sendpkt(txr, txd);
+ if (!error) {
+ if (has_bpf) {
+ const struct hn_txdesc *tmp_txd;
+
+ ETHER_BPF_MTAP(ifp, txd->m);
+ STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
+ ETHER_BPF_MTAP(ifp, tmp_txd->m);
+ }
+
+ if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
+#ifdef HN_IFSTART_SUPPORT
+ if (!hn_use_if_start)
+#endif
+ {
+ if_inc_counter(ifp, IFCOUNTER_OBYTES,
+ txr->hn_stat_size);
+ if (txr->hn_stat_mcasts != 0) {
+ if_inc_counter(ifp, IFCOUNTER_OMCASTS,
+ txr->hn_stat_mcasts);
+ }
+ }
+ txr->hn_pkts += txr->hn_stat_pkts;
+ txr->hn_sends++;
+ }
+ if (has_bpf)
+ hn_txdesc_put(txr, txd);
+
+ if (__predict_false(error)) {
+ int freed;
+
+ /*
+ * This should "really rarely" happen.
+ *
+ * XXX Too many RX to be acked or too many sideband
+ * commands to run? Ask netvsc_channel_rollup()
+ * to kick start later.
+ */
+ txr->hn_has_txeof = 1;
+ if (!send_failed) {
+ txr->hn_send_failed++;
+ send_failed = 1;
+ /*
+			 * Try sending again after setting hn_has_txeof,
+			 * in case we missed the last
+ * netvsc_channel_rollup().
+ */
+ goto again;
+ }
+ if_printf(ifp, "send failed\n");
+
+ /*
+ * Caller will perform further processing on the
+ * associated mbuf, so don't free it in hn_txdesc_put();
+ * only unload it from the DMA map in hn_txdesc_put(),
+ * if it was loaded.
+ */
+ txd->m = NULL;
+ freed = hn_txdesc_put(txr, txd);
+ KASSERT(freed != 0,
+ ("fail to free txd upon send error"));
+
+ txr->hn_send_failed++;
+ }
+
+ /* Reset temporary stats, after this sending is done. */
+ txr->hn_stat_size = 0;
+ txr->hn_stat_pkts = 0;
+ txr->hn_stat_mcasts = 0;
+
+ return (error);
+}
+
+/*
+ * Append the specified data to the indicated mbuf chain,
+ * Extend the mbuf chain if the new data does not fit in
+ * extending the chain if the new data does not fit in the
+ * existing space.
+ * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
+ * There should be an equivalent in the kernel mbuf code,
+ * but there does not appear to be one yet.
+ *
+ * Differs from m_append() in that additional mbufs are
+ * allocated with cluster size MJUMPAGESIZE, and filled
+ * accordingly.
+ *
+ * Return the last mbuf in the chain, or NULL if a new mbuf
+ * could not be allocated.
+ */
+static struct mbuf *
+hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
+{
+ struct mbuf *m, *n;
+ int remainder, space;
+
+ for (m = m0; m->m_next != NULL; m = m->m_next)
+ ;
+ remainder = len;
+ space = M_TRAILINGSPACE(m);
+ if (space > 0) {
+ /*
+ * Copy into available space.
+ */
+ if (space > remainder)
+ space = remainder;
+ bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
+ m->m_len += space;
+ cp += space;
+ remainder -= space;
+ }
+ while (remainder > 0) {
+ /*
+ * Allocate a new mbuf; could check space
+ * and allocate a cluster instead.
+ */
+ n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
+ if (n == NULL)
+ return NULL;
+ n->m_len = min(MJUMPAGESIZE, remainder);
+ bcopy(cp, mtod(n, caddr_t), n->m_len);
+ cp += n->m_len;
+ remainder -= n->m_len;
+ m->m_next = n;
+ m = n;
+ }
+
+ return m;
+}
+
+#if defined(INET) || defined(INET6)
+static __inline int
+hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
+{
+#if __FreeBSD_version >= 1100095
+ if (hn_lro_mbufq_depth) {
+ tcp_lro_queue_mbuf(lc, m);
+ return 0;
+ }
+#endif
+ return tcp_lro_rx(lc, m, 0);
+}
+#endif
+
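+/*
+ * RX path: assemble the received fragments into an mbuf, apply
+ * checksum/VLAN/RSS hash metadata and hand the packet to LRO or
+ * the ifnet input routine.
+ */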
+static int
+hn_rxpkt(struct hn_rx_ring *rxr)
+{
+ struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
+ struct mbuf *m_new, *n;
+ int size, do_lro = 0, do_csum = 1, is_vf = 0;
+ int hash_type = M_HASHTYPE_NONE;
+ int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
+ int i;
+
+ ifp = hn_ifp;
+ if (rxr->hn_rxvf_ifp != NULL) {
+ /*
+ * Non-transparent mode VF; pretend this packet is from
+ * the VF.
+ */
+ ifp = rxr->hn_rxvf_ifp;
+ is_vf = 1;
+ } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
+ /* Transparent mode VF. */
+ is_vf = 1;
+ }
+
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
+ /*
+ * NOTE:
+ * See the NOTE of hn_rndis_init_fixat(). This
+		 * function can be reached immediately after the
+		 * RNDIS is initialized but before the ifnet is
+		 * set up on the hn_attach() path; drop the unexpected
+ * packets.
+ */
+ return (0);
+ }
+
+ if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
+ if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
+ return (0);
+ }
+
+ if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
+ m_new = m_gethdr(M_NOWAIT, MT_DATA);
+ if (m_new == NULL) {
+ if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
+ return (0);
+ }
+ memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
+ rxr->rsc.frag_len[0]);
+ m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
+ } else {
+ /*
+ * Get an mbuf with a cluster. For packets 2K or less,
+ * get a standard 2K cluster. For anything larger, get a
+ * 4K cluster. Any buffers larger than 4K can cause problems
+ * if looped around to the Hyper-V TX channel, so avoid them.
+ */
+ size = MCLBYTES;
+ if (rxr->rsc.pktlen > MCLBYTES) {
+ /* 4096 */
+ size = MJUMPAGESIZE;
+ }
+
+ m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
+ if (m_new == NULL) {
+ if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
+ return (0);
+ }
+
+ n = m_new;
+ for (i = 0; i < rxr->rsc.cnt; i++) {
+ n = hv_m_append(n, rxr->rsc.frag_len[i],
+ rxr->rsc.frag_data[i]);
+ if (n == NULL) {
+ if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
+ return (0);
+ } else {
+ m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
+ }
+ }
+ }
+ if (rxr->rsc.pktlen <= MHLEN)
+ rxr->hn_small_pkts++;
+
+ m_new->m_pkthdr.rcvif = ifp;
+
+ if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
+ do_csum = 0;
+
+ /* receive side checksum offload */
+ if (rxr->rsc.csum_info != NULL) {
+ /* IP csum offload */
+ if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
+ m_new->m_pkthdr.csum_flags |=
+ (CSUM_IP_CHECKED | CSUM_IP_VALID);
+ rxr->hn_csum_ip++;
+ }
+
+ /* TCP/UDP csum offload */
+ if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
+ NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
+ m_new->m_pkthdr.csum_flags |=
+ (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+ m_new->m_pkthdr.csum_data = 0xffff;
+ if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
+ rxr->hn_csum_tcp++;
+ else
+ rxr->hn_csum_udp++;
+ }
+
+ /*
+ * XXX
+		 * As of this writing (Oct 28th, 2016), the host side will turn
+ * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
+ * the do_lro setting here is actually _not_ accurate. We
+ * depend on the RSS hash type check to reset do_lro.
+ */
+ if ((*(rxr->rsc.csum_info) &
+ (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
+ (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
+ do_lro = 1;
+ } else {
+ hn_rxpkt_proto(m_new, &l3proto, &l4proto);
+ if (l3proto == ETHERTYPE_IP) {
+ if (l4proto == IPPROTO_TCP) {
+ if (do_csum &&
+ (rxr->hn_trust_hcsum &
+ HN_TRUST_HCSUM_TCP)) {
+ rxr->hn_csum_trusted++;
+ m_new->m_pkthdr.csum_flags |=
+ (CSUM_IP_CHECKED | CSUM_IP_VALID |
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+ m_new->m_pkthdr.csum_data = 0xffff;
+ }
+ do_lro = 1;
+ } else if (l4proto == IPPROTO_UDP) {
+ if (do_csum &&
+ (rxr->hn_trust_hcsum &
+ HN_TRUST_HCSUM_UDP)) {
+ rxr->hn_csum_trusted++;
+ m_new->m_pkthdr.csum_flags |=
+ (CSUM_IP_CHECKED | CSUM_IP_VALID |
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+ m_new->m_pkthdr.csum_data = 0xffff;
+ }
+ } else if (l4proto != IPPROTO_DONE && do_csum &&
+ (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
+ rxr->hn_csum_trusted++;
+ m_new->m_pkthdr.csum_flags |=
+ (CSUM_IP_CHECKED | CSUM_IP_VALID);
+ }
+ }
+ }
+
+ if (rxr->rsc.vlan_info != NULL) {
+ m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
+ NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
+ NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
+ NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
+ m_new->m_flags |= M_VLANTAG;
+ }
+
+ /*
+	 * If VF is activated (transparent/non-transparent mode does not
+ * matter here).
+ *
+ * - Disable LRO
+ *
+ * hn(4) will only receive broadcast packets, multicast packets,
+	 * TCP SYN and SYN|ACK (in Azure); LRO is useless for these
+ * packet types.
+ *
+ * For non-transparent, we definitely _cannot_ enable LRO at
+ * all, since the LRO flush will use hn(4) as the receiving
+ * interface; i.e. hn_ifp->if_input(hn_ifp, m).
+ */
+ if (is_vf)
+ do_lro = 0;
+
+ /*
+	 * If VF is activated (transparent/non-transparent mode does not
+ * matter here), do _not_ mess with unsupported hash types or
+ * functions.
+ */
+ if (rxr->rsc.hash_info != NULL) {
+ rxr->hn_rss_pkts++;
+ m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
+ if (!is_vf)
+ hash_type = M_HASHTYPE_OPAQUE_HASH;
+ if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
+ NDIS_HASH_FUNCTION_TOEPLITZ) {
+ uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
+ rxr->hn_mbuf_hash);
+
+ /*
+ * NOTE:
+			 * do_lro is reset, if the hash types are not TCP
+ * related. See the comment in the above csum_flags
+ * setup section.
+ */
+ switch (type) {
+ case NDIS_HASH_IPV4:
+ hash_type = M_HASHTYPE_RSS_IPV4;
+ do_lro = 0;
+ break;
+
+ case NDIS_HASH_TCP_IPV4:
+ hash_type = M_HASHTYPE_RSS_TCP_IPV4;
+ if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
+ int def_htype = M_HASHTYPE_OPAQUE_HASH;
+
+ if (is_vf)
+ def_htype = M_HASHTYPE_NONE;
+
+ /*
+ * UDP 4-tuple hash is delivered as
+ * TCP 4-tuple hash.
+ */
+ if (l3proto == ETHERTYPE_MAX) {
+ hn_rxpkt_proto(m_new,
+ &l3proto, &l4proto);
+ }
+ if (l3proto == ETHERTYPE_IP) {
+ if (l4proto == IPPROTO_UDP &&
+ (rxr->hn_mbuf_hash &
+ NDIS_HASH_UDP_IPV4_X)) {
+ hash_type =
+ M_HASHTYPE_RSS_UDP_IPV4;
+ do_lro = 0;
+ } else if (l4proto !=
+ IPPROTO_TCP) {
+ hash_type = def_htype;
+ do_lro = 0;
+ }
+ } else {
+ hash_type = def_htype;
+ do_lro = 0;
+ }
+ }
+ break;
+
+ case NDIS_HASH_IPV6:
+ hash_type = M_HASHTYPE_RSS_IPV6;
+ do_lro = 0;
+ break;
+
+ case NDIS_HASH_IPV6_EX:
+ hash_type = M_HASHTYPE_RSS_IPV6_EX;
+ do_lro = 0;
+ break;
+
+ case NDIS_HASH_TCP_IPV6:
+ hash_type = M_HASHTYPE_RSS_TCP_IPV6;
+ break;
+
+ case NDIS_HASH_TCP_IPV6_EX:
+ hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
+ break;
+ }
+ }
+ } else if (!is_vf) {
+ m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
+ hash_type = M_HASHTYPE_OPAQUE;
+ }
+ M_HASHTYPE_SET(m_new, hash_type);
+
+ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
+ if (hn_ifp != ifp) {
+ const struct ether_header *eh;
+
+ /*
+ * Non-transparent mode VF is activated.
+ */
+
+ /*
+ * Allow tapping on hn(4).
+ */
+ ETHER_BPF_MTAP(hn_ifp, m_new);
+
+ /*
+ * Update hn(4)'s stats.
+ */
+ if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
+ if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
+ /* Checked at the beginning of this function. */
+ KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
+ eh = mtod(m_new, struct ether_header *);
+ if (ETHER_IS_MULTICAST(eh->ether_dhost))
+ if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
+ }
+ rxr->hn_pkts++;
+
+ if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
+#if defined(INET) || defined(INET6)
+ struct lro_ctrl *lro = &rxr->hn_lro;
+
+ if (lro->lro_cnt) {
+ rxr->hn_lro_tried++;
+ if (hn_lro_rx(lro, m_new) == 0) {
+ /* DONE! */
+ return 0;
+ }
+ }
+#endif
+ }
+ ifp->if_input(ifp, m_new);
+
+ return (0);
+}
+
+static int
+hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+ struct hn_softc *sc = ifp->if_softc;
+ struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
+ struct ifnet *vf_ifp;
+ int mask, error = 0;
+ struct ifrsskey *ifrk;
+ struct ifrsshash *ifrh;
+ uint32_t mtu;
+
+ switch (cmd) {
+ case SIOCSIFMTU:
+ if (ifr->ifr_mtu > HN_MTU_MAX) {
+ error = EINVAL;
+ break;
+ }
+
+ HN_LOCK(sc);
+
+ if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
+ HN_UNLOCK(sc);
+ break;
+ }
+
+ if ((sc->hn_caps & HN_CAP_MTU) == 0) {
+ /* Can't change MTU */
+ HN_UNLOCK(sc);
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ if (ifp->if_mtu == ifr->ifr_mtu) {
+ HN_UNLOCK(sc);
+ break;
+ }
+
+ if (hn_xpnt_vf_isready(sc)) {
+ vf_ifp = sc->hn_vf_ifp;
+ ifr_vf = *ifr;
+ strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
+ sizeof(ifr_vf.ifr_name));
+ error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
+ (caddr_t)&ifr_vf);
+ if (error) {
+ HN_UNLOCK(sc);
+ if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
+ vf_ifp->if_xname, ifr->ifr_mtu, error);
+ break;
+ }
+ }
+
+ /*
+ * Suspend this interface before the synthetic parts
+ * are ripped.
+ */
+ hn_suspend(sc);
+
+ /*
+ * Detach the synthetics parts, i.e. NVS and RNDIS.
+ */
+ hn_synth_detach(sc);
+
+ /*
+ * Reattach the synthetic parts, i.e. NVS and RNDIS,
+ * with the new MTU setting.
+ */
+ error = hn_synth_attach(sc, ifr->ifr_mtu);
+ if (error) {
+ HN_UNLOCK(sc);
+ break;
+ }
+
+ error = hn_rndis_get_mtu(sc, &mtu);
+ if (error)
+ mtu = ifr->ifr_mtu;
+ else if (bootverbose)
+ if_printf(ifp, "RNDIS mtu %u\n", mtu);
+
+ /*
+ * Commit the requested MTU, after the synthetic parts
+ * have been successfully attached.
+ */
+ if (mtu >= ifr->ifr_mtu) {
+ mtu = ifr->ifr_mtu;
+ } else {
+ if_printf(ifp, "fixup mtu %d -> %u\n",
+ ifr->ifr_mtu, mtu);
+ }
+ ifp->if_mtu = mtu;
+
+ /*
+ * Synthetic parts' reattach may change the chimney
+ * sending size; update it.
+ */
+ if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
+ hn_set_chim_size(sc, sc->hn_chim_szmax);
+
+ /*
+ * Make sure that various parameters based on MTU are
+ * still valid, after the MTU change.
+ */
+ hn_mtu_change_fixup(sc);
+
+ /*
+ * All done! Resume the interface now.
+ */
+ hn_resume(sc);
+
+ if ((sc->hn_flags & HN_FLAG_RXVF) ||
+ (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
+ /*
+ * Since we have reattached the NVS part,
+			 * change the datapath to VF again, in case
+			 * the setting was lost when the NVS was detached.
+ */
+ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
+ }
+
+ HN_UNLOCK(sc);
+ break;
+
+ case SIOCSIFFLAGS:
+ HN_LOCK(sc);
+
+ if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
+ HN_UNLOCK(sc);
+ break;
+ }
+
+ if (hn_xpnt_vf_isready(sc))
+ hn_xpnt_vf_saveifflags(sc);
+
+ if (ifp->if_flags & IFF_UP) {
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+ /*
+				 * Caller might hold a mutex, e.g.
+ * bpf; use busy-wait for the RNDIS
+ * reply.
+ */
+ HN_NO_SLEEPING(sc);
+ hn_rxfilter_config(sc);
+ HN_SLEEPING_OK(sc);
+
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
+ error = hn_xpnt_vf_iocsetflags(sc);
+ } else {
+ hn_init_locked(sc);
+ }
+ } else {
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+ hn_stop(sc, false);
+ }
+ sc->hn_if_flags = ifp->if_flags;
+
+ HN_UNLOCK(sc);
+ break;
+
+ case SIOCSIFCAP:
+ HN_LOCK(sc);
+
+ if (hn_xpnt_vf_isready(sc)) {
+ ifr_vf = *ifr;
+ strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
+ sizeof(ifr_vf.ifr_name));
+ error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
+ HN_UNLOCK(sc);
+ break;
+ }
+
+ /*
+ * Fix up requested capabilities w/ supported capabilities,
+ * since the supported capabilities could have been changed.
+ */
+ mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
+ ifp->if_capenable;
+
+ if (mask & IFCAP_TXCSUM) {
+ ifp->if_capenable ^= IFCAP_TXCSUM;
+ if (ifp->if_capenable & IFCAP_TXCSUM)
+ ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
+ else
+ ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
+ }
+ if (mask & IFCAP_TXCSUM_IPV6) {
+ ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
+ if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
+ ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
+ else
+ ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
+ }
+
+ /* TODO: flip RNDIS offload parameters for RXCSUM. */
+ if (mask & IFCAP_RXCSUM)
+ ifp->if_capenable ^= IFCAP_RXCSUM;
+#ifdef foo
+		/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
+ if (mask & IFCAP_RXCSUM_IPV6)
+ ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
+#endif
+
+ if (mask & IFCAP_LRO)
+ ifp->if_capenable ^= IFCAP_LRO;
+
+ if (mask & IFCAP_TSO4) {
+ ifp->if_capenable ^= IFCAP_TSO4;
+ if (ifp->if_capenable & IFCAP_TSO4)
+ ifp->if_hwassist |= CSUM_IP_TSO;
+ else
+ ifp->if_hwassist &= ~CSUM_IP_TSO;
+ }
+ if (mask & IFCAP_TSO6) {
+ ifp->if_capenable ^= IFCAP_TSO6;
+ if (ifp->if_capenable & IFCAP_TSO6)
+ ifp->if_hwassist |= CSUM_IP6_TSO;
+ else
+ ifp->if_hwassist &= ~CSUM_IP6_TSO;
+ }
+
+ HN_UNLOCK(sc);
+ break;
+
+ case SIOCADDMULTI:
+ case SIOCDELMULTI:
+ HN_LOCK(sc);
+
+ if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
+ HN_UNLOCK(sc);
+ break;
+ }
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+ /*
+			 * Multicast handling uses a mutex; use busy-wait for
+ * the RNDIS reply.
+ */
+ HN_NO_SLEEPING(sc);
+ hn_rxfilter_config(sc);
+ HN_SLEEPING_OK(sc);
+ }
+
+ /* XXX vlan(4) style mcast addr maintenance */
+ if (hn_xpnt_vf_isready(sc)) {
+ int old_if_flags;
+
+ old_if_flags = sc->hn_vf_ifp->if_flags;
+ hn_xpnt_vf_saveifflags(sc);
+
+ if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
+ ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
+ IFF_ALLMULTI))
+ error = hn_xpnt_vf_iocsetflags(sc);
+ }
+
+ HN_UNLOCK(sc);
+ break;
+
+ case SIOCSIFMEDIA:
+ case SIOCGIFMEDIA:
+ HN_LOCK(sc);
+ if (hn_xpnt_vf_isready(sc)) {
+ /*
+ * SIOCGIFMEDIA expects ifmediareq, so don't
+ * create and pass ifr_vf to the VF here; just
+ * replace the ifr_name.
+ */
+ vf_ifp = sc->hn_vf_ifp;
+ strlcpy(ifr->ifr_name, vf_ifp->if_xname,
+ sizeof(ifr->ifr_name));
+ error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
+ /* Restore the ifr_name. */
+ strlcpy(ifr->ifr_name, ifp->if_xname,
+ sizeof(ifr->ifr_name));
+ HN_UNLOCK(sc);
+ break;
+ }
+ HN_UNLOCK(sc);
+ error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
+ break;
+
+ case SIOCGIFRSSHASH:
+ ifrh = (struct ifrsshash *)data;
+ HN_LOCK(sc);
+ if (sc->hn_rx_ring_inuse == 1) {
+ HN_UNLOCK(sc);
+ ifrh->ifrh_func = RSS_FUNC_NONE;
+ ifrh->ifrh_types = 0;
+ break;
+ }
+
+ if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
+ ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
+ else
+ ifrh->ifrh_func = RSS_FUNC_PRIVATE;
+ ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
+ HN_UNLOCK(sc);
+ break;
+
+ case SIOCGIFRSSKEY:
+ ifrk = (struct ifrsskey *)data;
+ HN_LOCK(sc);
+ if (sc->hn_rx_ring_inuse == 1) {
+ HN_UNLOCK(sc);
+ ifrk->ifrk_func = RSS_FUNC_NONE;
+ ifrk->ifrk_keylen = 0;
+ break;
+ }
+ if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
+ ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
+ else
+ ifrk->ifrk_func = RSS_FUNC_PRIVATE;
+ ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
+ memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
+ NDIS_HASH_KEYSIZE_TOEPLITZ);
+ HN_UNLOCK(sc);
+ break;
+
+ default:
+ error = ether_ioctl(ifp, cmd, data);
+ break;
+ }
+ return (error);
+}
+
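+/*
+ * Stop the interface: clear RUNNING, disable polling, bring down the
+ * transparent VF if it is enabled, and suspend data transfers.
+ */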
+static void
+hn_stop(struct hn_softc *sc, bool detaching)
+{
+ struct ifnet *ifp = sc->hn_ifp;
+ int i;
+
+ HN_LOCK_ASSERT(sc);
+
+ KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
+ ("synthetic parts were not attached"));
+
+ /* Clear RUNNING bit ASAP. */
+ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
+
+ /* Disable polling. */
+ hn_polling(sc, 0);
+
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
+ KASSERT(sc->hn_vf_ifp != NULL,
+ ("%s: VF is not attached", ifp->if_xname));
+
+ /* Mark transparent mode VF as disabled. */
+ hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
+
+ /*
+ * NOTE:
+ * Datapath setting must happen _before_ bringing
+ * the VF down.
+ */
+ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
+
+ /*
+ * Bring the VF down.
+ */
+ hn_xpnt_vf_saveifflags(sc);
+ sc->hn_vf_ifp->if_flags &= ~IFF_UP;
+ hn_xpnt_vf_iocsetflags(sc);
+ }
+
+ /* Suspend data transfers. */
+ hn_suspend_data(sc);
+
+ /* Clear OACTIVE bit. */
+ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+ for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
+ sc->hn_tx_ring[i].hn_oactive = 0;
+
+ /*
+ * If the non-transparent mode VF is active, make sure
+ * that the RX filter still allows packet reception.
+ */
+ if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
+ hn_rxfilter_config(sc);
+}
+
+static void
+hn_init_locked(struct hn_softc *sc)
+{
+ struct ifnet *ifp = sc->hn_ifp;
+ int i;
+
+ HN_LOCK_ASSERT(sc);
+
+ if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
+ return;
+
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+ return;
+
+ /* Configure RX filter */
+ hn_rxfilter_config(sc);
+
+ /* Clear OACTIVE bit. */
+ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+ for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
+ sc->hn_tx_ring[i].hn_oactive = 0;
+
+ /* Clear TX 'suspended' bit. */
+ hn_resume_tx(sc, sc->hn_tx_ring_inuse);
+
+ if (hn_xpnt_vf_isready(sc)) {
+ /* Initialize transparent VF. */
+ hn_xpnt_vf_init(sc);
+ }
+
+ /* Everything is ready; unleash! */
+ atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
+
+ /* Re-enable polling if requested. */
+ if (sc->hn_pollhz > 0)
+ hn_polling(sc, sc->hn_pollhz);
+}
+
+static void
+hn_init(void *xsc)
+{
+ struct hn_softc *sc = xsc;
+
+ HN_LOCK(sc);
+ hn_init_locked(sc);
+ HN_UNLOCK(sc);
+}
+
+#if __FreeBSD_version >= 1100099
+
+static int
+hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ unsigned int lenlim;
+ int error;
+
+ lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
+ error = sysctl_handle_int(oidp, &lenlim, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ HN_LOCK(sc);
+ if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
+ lenlim > TCP_LRO_LENGTH_MAX) {
+ HN_UNLOCK(sc);
+ return EINVAL;
+ }
+ hn_set_lro_lenlim(sc, lenlim);
+ HN_UNLOCK(sc);
+
+ return 0;
+}
+
+static int
+hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int ackcnt, error, i;
+
+ /*
+	 * lro_ackcnt_lim is the append count limit;
+	 * +1 turns it into the aggregation limit.
+ */
+ ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
+ error = sysctl_handle_int(oidp, &ackcnt, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
+ return EINVAL;
+
+ /*
+ * Convert aggregation limit back to append
+ * count limit.
+ */
+ --ackcnt;
+ HN_LOCK(sc);
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
+ sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
+ HN_UNLOCK(sc);
+ return 0;
+}
+
+#endif
+
+static int
+hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int hcsum = arg2;
+ int on, error, i;
+
+ on = 0;
+ if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
+ on = 1;
+
+ error = sysctl_handle_int(oidp, &on, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ HN_LOCK(sc);
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
+
+ if (on)
+ rxr->hn_trust_hcsum |= hcsum;
+ else
+ rxr->hn_trust_hcsum &= ~hcsum;
+ }
+ HN_UNLOCK(sc);
+ return 0;
+}
+
+static int
+hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int chim_size, error;
+
+ chim_size = sc->hn_tx_ring[0].hn_chim_size;
+ error = sysctl_handle_int(oidp, &chim_size, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
+ return EINVAL;
+
+ HN_LOCK(sc);
+ hn_set_chim_size(sc, chim_size);
+ HN_UNLOCK(sc);
+ return 0;
+}
+
+#if __FreeBSD_version < 1100095
+static int
+hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int ofs = arg2, i, error;
+ struct hn_rx_ring *rxr;
+ uint64_t stat;
+
+ stat = 0;
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ rxr = &sc->hn_rx_ring[i];
+ stat += *((int *)((uint8_t *)rxr + ofs));
+ }
+
+ error = sysctl_handle_64(oidp, &stat, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ /* Zero out this stat. */
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ rxr = &sc->hn_rx_ring[i];
+ *((int *)((uint8_t *)rxr + ofs)) = 0;
+ }
+ return 0;
+}
+#else
+static int
+hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int ofs = arg2, i, error;
+ struct hn_rx_ring *rxr;
+ uint64_t stat;
+
+ stat = 0;
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ rxr = &sc->hn_rx_ring[i];
+ stat += *((uint64_t *)((uint8_t *)rxr + ofs));
+ }
+
+ error = sysctl_handle_64(oidp, &stat, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ /* Zero out this stat. */
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ rxr = &sc->hn_rx_ring[i];
+ *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
+ }
+ return 0;
+}
+
+#endif
+
+static int
+hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int ofs = arg2, i, error;
+ struct hn_rx_ring *rxr;
+ u_long stat;
+
+ stat = 0;
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ rxr = &sc->hn_rx_ring[i];
+ stat += *((u_long *)((uint8_t *)rxr + ofs));
+ }
+
+ error = sysctl_handle_long(oidp, &stat, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ /* Zero out this stat. */
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ rxr = &sc->hn_rx_ring[i];
+ *((u_long *)((uint8_t *)rxr + ofs)) = 0;
+ }
+ return 0;
+}
+
+static int
+hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int ofs = arg2, i, error;
+ struct hn_tx_ring *txr;
+ u_long stat;
+
+ stat = 0;
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+ txr = &sc->hn_tx_ring[i];
+ stat += *((u_long *)((uint8_t *)txr + ofs));
+ }
+
+ error = sysctl_handle_long(oidp, &stat, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ /* Zero out this stat. */
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+ txr = &sc->hn_tx_ring[i];
+ *((u_long *)((uint8_t *)txr + ofs)) = 0;
+ }
+ return 0;
+}
+
+static int
+hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int ofs = arg2, i, error, conf;
+ struct hn_tx_ring *txr;
+
+ txr = &sc->hn_tx_ring[0];
+ conf = *((int *)((uint8_t *)txr + ofs));
+
+ error = sysctl_handle_int(oidp, &conf, 0, req);
+ if (error || req->newptr == NULL)
+ return error;
+
+ HN_LOCK(sc);
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+ txr = &sc->hn_tx_ring[i];
+ *((int *)((uint8_t *)txr + ofs)) = conf;
+ }
+ HN_UNLOCK(sc);
+
+ return 0;
+}
+
+static int
+hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int error, size;
+
+ size = sc->hn_agg_size;
+ error = sysctl_handle_int(oidp, &size, 0, req);
+ if (error || req->newptr == NULL)
+ return (error);
+
+ HN_LOCK(sc);
+ sc->hn_agg_size = size;
+ hn_set_txagg(sc);
+ HN_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int error, pkts;
+
+ pkts = sc->hn_agg_pkts;
+ error = sysctl_handle_int(oidp, &pkts, 0, req);
+ if (error || req->newptr == NULL)
+ return (error);
+
+ HN_LOCK(sc);
+ sc->hn_agg_pkts = pkts;
+ hn_set_txagg(sc);
+ HN_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int pkts;
+
+ pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
+ return (sysctl_handle_int(oidp, &pkts, 0, req));
+}
+
+static int
+hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int align;
+
+ align = sc->hn_tx_ring[0].hn_agg_align;
+ return (sysctl_handle_int(oidp, &align, 0, req));
+}
+
+static void
+hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
+{
+ if (pollhz == 0)
+ vmbus_chan_poll_disable(chan);
+ else
+ vmbus_chan_poll_enable(chan, pollhz);
+}
+
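+/*
+ * Apply the polling frequency to the primary channel and all
+ * sub-channels; 0 disables polling.
+ */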
+static void
+hn_polling(struct hn_softc *sc, u_int pollhz)
+{
+ int nsubch = sc->hn_rx_ring_inuse - 1;
+
+ HN_LOCK_ASSERT(sc);
+
+ if (nsubch > 0) {
+ struct vmbus_channel **subch;
+ int i;
+
+ subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
+ for (i = 0; i < nsubch; ++i)
+ hn_chan_polling(subch[i], pollhz);
+ vmbus_subchan_rel(subch, nsubch);
+ }
+ hn_chan_polling(sc->hn_prichan, pollhz);
+}
+
+static int
+hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int pollhz, error;
+
+ pollhz = sc->hn_pollhz;
+ error = sysctl_handle_int(oidp, &pollhz, 0, req);
+ if (error || req->newptr == NULL)
+ return (error);
+
+ if (pollhz != 0 &&
+ (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
+ return (EINVAL);
+
+ HN_LOCK(sc);
+ if (sc->hn_pollhz != pollhz) {
+ sc->hn_pollhz = pollhz;
+ if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
+ (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
+ hn_polling(sc, sc->hn_pollhz);
+ }
+ HN_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ char verstr[16];
+
+ snprintf(verstr, sizeof(verstr), "%u.%u",
+ HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
+ HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
+ return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
+}
+
+static int
+hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ char caps_str[128];
+ uint32_t caps;
+
+ HN_LOCK(sc);
+ caps = sc->hn_caps;
+ HN_UNLOCK(sc);
+ snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
+ return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
+}
+
+static int
+hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ char assist_str[128];
+ uint32_t hwassist;
+
+ HN_LOCK(sc);
+ hwassist = sc->hn_ifp->if_hwassist;
+ HN_UNLOCK(sc);
+ snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
+ return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
+}
+
+static int
+hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ char filter_str[128];
+ uint32_t filter;
+
+ HN_LOCK(sc);
+ filter = sc->hn_rx_filter;
+ HN_UNLOCK(sc);
+ snprintf(filter_str, sizeof(filter_str), "%b", filter,
+ NDIS_PACKET_TYPES);
+ return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
+}
+
+#ifndef RSS
+
+static int
+hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int error;
+
+ HN_LOCK(sc);
+
+ error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
+ if (error || req->newptr == NULL)
+ goto back;
+
+ if ((sc->hn_flags & HN_FLAG_RXVF) ||
+ (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
+ /*
+		 * The RSS key is synchronized w/ the VF's; don't allow users
+ * to change it.
+ */
+ error = EBUSY;
+ goto back;
+ }
+
+ error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
+ if (error)
+ goto back;
+ sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
+
+ if (sc->hn_rx_ring_inuse > 1) {
+ error = hn_rss_reconfig(sc);
+ } else {
+ /* Not RSS capable, at least for now; just save the RSS key. */
+ error = 0;
+ }
+back:
+ HN_UNLOCK(sc);
+ return (error);
+}
+
+static int
+hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int error;
+
+ HN_LOCK(sc);
+
+ error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
+ if (error || req->newptr == NULL)
+ goto back;
+
+ /*
+	 * Don't allow the RSS indirect table to be changed, if this
+	 * interface is currently not RSS capable.
+ */
+ if (sc->hn_rx_ring_inuse == 1) {
+ error = EOPNOTSUPP;
+ goto back;
+ }
+
+ error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
+ if (error)
+ goto back;
+ sc->hn_flags |= HN_FLAG_HAS_RSSIND;
+
+ hn_rss_ind_fixup(sc);
+ error = hn_rss_reconfig(sc);
+back:
+ HN_UNLOCK(sc);
+ return (error);
+}
+
+#endif /* !RSS */
+
+static int
+hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ char hash_str[128];
+ uint32_t hash;
+
+ HN_LOCK(sc);
+ hash = sc->hn_rss_hash;
+ HN_UNLOCK(sc);
+ snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
+ return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
+}
+
+static int
+hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ char hash_str[128];
+ uint32_t hash;
+
+ HN_LOCK(sc);
+ hash = sc->hn_rss_hcap;
+ HN_UNLOCK(sc);
+ snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
+ return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
+}
+
+static int
+hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ char hash_str[128];
+ uint32_t hash;
+
+ HN_LOCK(sc);
+ hash = sc->hn_rx_ring[0].hn_mbuf_hash;
+ HN_UNLOCK(sc);
+ snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
+ return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
+}
+
+static int
+hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ char vf_name[IFNAMSIZ + 1];
+ struct ifnet *vf_ifp;
+
+ HN_LOCK(sc);
+ vf_name[0] = '\0';
+ vf_ifp = sc->hn_vf_ifp;
+ if (vf_ifp != NULL)
+ snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
+ HN_UNLOCK(sc);
+ return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
+}
+
+static int
+hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ char vf_name[IFNAMSIZ + 1];
+ struct ifnet *vf_ifp;
+
+ HN_LOCK(sc);
+ vf_name[0] = '\0';
+ vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
+ if (vf_ifp != NULL)
+ snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
+ HN_UNLOCK(sc);
+ return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
+}
+
+static int
+hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct rm_priotracker pt;
+ struct sbuf *sb;
+ int error, i;
+ bool first;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+
+ sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
+ if (sb == NULL)
+ return (ENOMEM);
+
+ rm_rlock(&hn_vfmap_lock, &pt);
+
+ first = true;
+ for (i = 0; i < hn_vfmap_size; ++i) {
+ struct ifnet *ifp;
+
+ if (hn_vfmap[i] == NULL)
+ continue;
+
+ ifp = ifnet_byindex(i);
+ if (ifp != NULL) {
+ if (first)
+ sbuf_printf(sb, "%s", ifp->if_xname);
+ else
+ sbuf_printf(sb, " %s", ifp->if_xname);
+ first = false;
+ }
+ }
+
+ rm_runlock(&hn_vfmap_lock, &pt);
+
+ error = sbuf_finish(sb);
+ sbuf_delete(sb);
+ return (error);
+}
+
+static int
+hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct rm_priotracker pt;
+ struct sbuf *sb;
+ int error, i;
+ bool first;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+
+ sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
+ if (sb == NULL)
+ return (ENOMEM);
+
+ rm_rlock(&hn_vfmap_lock, &pt);
+
+ first = true;
+ for (i = 0; i < hn_vfmap_size; ++i) {
+ struct ifnet *ifp, *hn_ifp;
+
+ hn_ifp = hn_vfmap[i];
+ if (hn_ifp == NULL)
+ continue;
+
+ ifp = ifnet_byindex(i);
+ if (ifp != NULL) {
+ if (first) {
+ sbuf_printf(sb, "%s:%s", ifp->if_xname,
+ hn_ifp->if_xname);
+ } else {
+ sbuf_printf(sb, " %s:%s", ifp->if_xname,
+ hn_ifp->if_xname);
+ }
+ first = false;
+ }
+ }
+
+ rm_runlock(&hn_vfmap_lock, &pt);
+
+ error = sbuf_finish(sb);
+ sbuf_delete(sb);
+ return (error);
+}
+
+static int
+hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int error, onoff = 0;
+
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
+ onoff = 1;
+ error = sysctl_handle_int(oidp, &onoff, 0, req);
+ if (error || req->newptr == NULL)
+ return (error);
+
+ HN_LOCK(sc);
+ /* NOTE: hn_vf_lock for hn_transmit() */
+ rm_wlock(&sc->hn_vf_lock);
+ if (onoff)
+ sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
+ else
+ sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
+ rm_wunlock(&sc->hn_vf_lock);
+ HN_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int enabled = 0;
+
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
+ enabled = 1;
+ return (sysctl_handle_int(oidp, &enabled, 0, req));
+}
+
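+/*
+ * Sanity check the IP (and TCP/UDP) headers of the packet starting at
+ * offset @hoff in @m: the headers must be complete, must reside in the
+ * first mbuf, and the packet must not be an IP fragment.  Returns the
+ * IP protocol number on success, or IPPROTO_DONE if any check fails.
+ */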
+static int
+hn_check_iplen(const struct mbuf *m, int hoff)
+{
+ const struct ip *ip;
+ int len, iphlen, iplen;
+ const struct tcphdr *th;
+ int thoff; /* TCP data offset */
+
+ len = hoff + sizeof(struct ip);
+
+ /* The packet must be at least the size of an IP header. */
+ if (m->m_pkthdr.len < len)
+ return IPPROTO_DONE;
+
+ /* The fixed IP header must reside completely in the first mbuf. */
+ if (m->m_len < len)
+ return IPPROTO_DONE;
+
+ ip = mtodo(m, hoff);
+
+ /* Bound check the packet's stated IP header length. */
+ iphlen = ip->ip_hl << 2;
+ if (iphlen < sizeof(struct ip)) /* minimum header length */
+ return IPPROTO_DONE;
+
+	/* The full IP header must reside completely in the first mbuf. */
+ if (m->m_len < hoff + iphlen)
+ return IPPROTO_DONE;
+
+ iplen = ntohs(ip->ip_len);
+
+ /*
+	 * Check that the amount of data in the buffers is at least
+	 * as much as the IP header would have us expect.
+ */
+ if (m->m_pkthdr.len < hoff + iplen)
+ return IPPROTO_DONE;
+
+ /*
+ * Ignore IP fragments.
+ */
+ if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
+ return IPPROTO_DONE;
+
+ /*
+ * The TCP/IP or UDP/IP header must be entirely contained within
+ * the first fragment of a packet.
+ */
+ switch (ip->ip_p) {
+ case IPPROTO_TCP:
+ if (iplen < iphlen + sizeof(struct tcphdr))
+ return IPPROTO_DONE;
+ if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
+ return IPPROTO_DONE;
+ th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
+ thoff = th->th_off << 2;
+ if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
+ return IPPROTO_DONE;
+ if (m->m_len < hoff + iphlen + thoff)
+ return IPPROTO_DONE;
+ break;
+ case IPPROTO_UDP:
+ if (iplen < iphlen + sizeof(struct udphdr))
+ return IPPROTO_DONE;
+ if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
+ return IPPROTO_DONE;
+ break;
+ default:
+ if (iplen < iphlen)
+ return IPPROTO_DONE;
+ break;
+ }
+ return ip->ip_p;
+}
+
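+/*
+ * Extract the L3 (ethertype) and L4 (IP protocol) of the received
+ * frame, taking an optional VLAN encapsulation into account.  *l4proto
+ * is set to IPPROTO_DONE for non-IPv4 frames or when the headers do
+ * not pass hn_check_iplen().
+ */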
+static void
+hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
+{
+ const struct ether_header *eh;
+ uint16_t etype;
+ int hoff;
+
+ hoff = sizeof(*eh);
+ /* Checked at the beginning of this function. */
+ KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
+
+ eh = mtod(m_new, const struct ether_header *);
+ etype = ntohs(eh->ether_type);
+ if (etype == ETHERTYPE_VLAN) {
+ const struct ether_vlan_header *evl;
+
+ hoff = sizeof(*evl);
+ if (m_new->m_len < hoff)
+ return;
+ evl = mtod(m_new, const struct ether_vlan_header *);
+ etype = ntohs(evl->evl_proto);
+ }
+ *l3proto = etype;
+
+ if (etype == ETHERTYPE_IP)
+ *l4proto = hn_check_iplen(m_new, hoff);
+ else
+ *l4proto = IPPROTO_DONE;
+}
+
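+/*
+ * Allocate the RXBUF shared by all channels, the per-ring bufrings and
+ * LRO state, and create the dev.hn.UNIT.rx sysctl tree along with the
+ * per-ring and aggregate RX statistics nodes.
+ */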
+static int
+hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
+{
+ struct sysctl_oid_list *child;
+ struct sysctl_ctx_list *ctx;
+ device_t dev = sc->hn_dev;
+#if defined(INET) || defined(INET6)
+#if __FreeBSD_version >= 1100095
+ int lroent_cnt;
+#endif
+#endif
+ int i;
+
+ /*
+ * Create RXBUF for reception.
+ *
+ * NOTE:
+ * - It is shared by all channels.
+	 * - A large enough buffer is allocated; certain versions of NVS
+	 *   may further limit the usable space.
+ */
+ sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
+ PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
+ BUS_DMA_WAITOK | BUS_DMA_ZERO);
+ if (sc->hn_rxbuf == NULL) {
+ device_printf(sc->hn_dev, "allocate rxbuf failed\n");
+ return (ENOMEM);
+ }
+
+ sc->hn_rx_ring_cnt = ring_cnt;
+ sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
+
+ sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
+ M_DEVBUF, M_WAITOK | M_ZERO);
+
+#if defined(INET) || defined(INET6)
+#if __FreeBSD_version >= 1100095
+ lroent_cnt = hn_lro_entry_count;
+ if (lroent_cnt < TCP_LRO_ENTRIES)
+ lroent_cnt = TCP_LRO_ENTRIES;
+ if (bootverbose)
+ device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
+#endif
+#endif /* INET || INET6 */
+
+ ctx = device_get_sysctl_ctx(dev);
+ child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+
+ /* Create dev.hn.UNIT.rx sysctl tree */
+ sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
+ CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
+
+ rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
+ PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
+ &rxr->hn_br_dma, BUS_DMA_WAITOK);
+ if (rxr->hn_br == NULL) {
+ device_printf(dev, "allocate bufring failed\n");
+ return (ENOMEM);
+ }
+
+ if (hn_trust_hosttcp)
+ rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
+ if (hn_trust_hostudp)
+ rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
+ if (hn_trust_hostip)
+ rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
+ rxr->hn_mbuf_hash = NDIS_HASH_ALL;
+ rxr->hn_ifp = sc->hn_ifp;
+ if (i < sc->hn_tx_ring_cnt)
+ rxr->hn_txr = &sc->hn_tx_ring[i];
+ rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
+ rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
+ rxr->hn_rx_idx = i;
+ rxr->hn_rxbuf = sc->hn_rxbuf;
+
+ /*
+ * Initialize LRO.
+ */
+#if defined(INET) || defined(INET6)
+#if __FreeBSD_version >= 1100095
+ tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
+ hn_lro_mbufq_depth);
+#else
+ tcp_lro_init(&rxr->hn_lro);
+ rxr->hn_lro.ifp = sc->hn_ifp;
+#endif
+#if __FreeBSD_version >= 1100099
+ rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
+ rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
+#endif
+#endif /* INET || INET6 */
+
+ if (sc->hn_rx_sysctl_tree != NULL) {
+ char name[16];
+
+ /*
+ * Create per RX ring sysctl tree:
+ * dev.hn.UNIT.rx.RINGID
+ */
+ snprintf(name, sizeof(name), "%d", i);
+ rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
+ SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
+ OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+
+ if (rxr->hn_rx_sysctl_tree != NULL) {
+ SYSCTL_ADD_ULONG(ctx,
+ SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
+ OID_AUTO, "packets", CTLFLAG_RW,
+ &rxr->hn_pkts, "# of packets received");
+ SYSCTL_ADD_ULONG(ctx,
+ SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
+ OID_AUTO, "rss_pkts", CTLFLAG_RW,
+ &rxr->hn_rss_pkts,
+ "# of packets w/ RSS info received");
+ SYSCTL_ADD_ULONG(ctx,
+ SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
+ OID_AUTO, "rsc_pkts", CTLFLAG_RW,
+ &rxr->hn_rsc_pkts,
+ "# of RSC packets received");
+ SYSCTL_ADD_ULONG(ctx,
+ SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
+ OID_AUTO, "rsc_drop", CTLFLAG_RW,
+ &rxr->hn_rsc_drop,
+ "# of RSC fragments dropped");
+ SYSCTL_ADD_INT(ctx,
+ SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
+ OID_AUTO, "pktbuf_len", CTLFLAG_RD,
+ &rxr->hn_pktbuf_len, 0,
+ "Temporary channel packet buffer length");
+ }
+ }
+ }
+
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
+ CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
+#if __FreeBSD_version < 1100095
+ hn_rx_stat_int_sysctl,
+#else
+ hn_rx_stat_u64_sysctl,
+#endif
+ "LU", "LRO queued");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
+ CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
+#if __FreeBSD_version < 1100095
+ hn_rx_stat_int_sysctl,
+#else
+ hn_rx_stat_u64_sysctl,
+#endif
+ "LU", "LRO flushed");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_rx_ring, hn_lro_tried),
+ hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
+#if __FreeBSD_version >= 1100099
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
+ CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+ hn_lro_lenlim_sysctl, "IU",
+ "Max # of data bytes to be aggregated by LRO");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+ hn_lro_ackcnt_sysctl, "I",
+ "Max # of ACKs to be aggregated by LRO");
+#endif
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
+ hn_trust_hcsum_sysctl, "I",
+ "Trust tcp segment verification on host side, "
+ "when csum info is missing");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
+ hn_trust_hcsum_sysctl, "I",
+ "Trust udp datagram verification on host side, "
+ "when csum info is missing");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
+ hn_trust_hcsum_sysctl, "I",
+ "Trust ip packet verification on host side, "
+ "when csum info is missing");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_rx_ring, hn_csum_ip),
+ hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_rx_ring, hn_csum_tcp),
+ hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_rx_ring, hn_csum_udp),
+ hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_rx_ring, hn_csum_trusted),
+ hn_rx_stat_ulong_sysctl, "LU",
+ "# of packets that we trust host's csum verification");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_rx_ring, hn_small_pkts),
+ hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_rx_ring, hn_ack_failed),
+ hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
+ CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
+ CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
+
+ return (0);
+}
+
+static void
+hn_destroy_rx_data(struct hn_softc *sc)
+{
+ int i;
+
+ if (sc->hn_rxbuf != NULL) {
+ if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
+ hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
+ else
+ device_printf(sc->hn_dev, "RXBUF is referenced\n");
+ sc->hn_rxbuf = NULL;
+ }
+
+ if (sc->hn_rx_ring_cnt == 0)
+ return;
+
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
+
+ if (rxr->hn_br == NULL)
+ continue;
+ if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
+ hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
+ } else {
+ device_printf(sc->hn_dev,
+			    "%dth channel bufring is referenced\n", i);
+ }
+ rxr->hn_br = NULL;
+
+#if defined(INET) || defined(INET6)
+ tcp_lro_free(&rxr->hn_lro);
+#endif
+ free(rxr->hn_pktbuf, M_DEVBUF);
+ }
+ free(sc->hn_rx_ring, M_DEVBUF);
+ sc->hn_rx_ring = NULL;
+
+ sc->hn_rx_ring_cnt = 0;
+ sc->hn_rx_ring_inuse = 0;
+}
+
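+/*
+ * Set up one TX ring: its locks, the TX descriptor pool (list or
+ * bufring), taskqueue and tasks, the busdma tags/maps for the RNDIS
+ * packet messages and packet data, and the per-ring
+ * dev.hn.UNIT.tx.RINGID sysctl tree.
+ */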
+static int
+hn_tx_ring_create(struct hn_softc *sc, int id)
+{
+ struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
+ device_t dev = sc->hn_dev;
+ bus_dma_tag_t parent_dtag;
+ int error, i;
+
+ txr->hn_sc = sc;
+ txr->hn_tx_idx = id;
+
+#ifndef HN_USE_TXDESC_BUFRING
+ mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
+#endif
+ mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
+
+ txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
+ txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
+ M_DEVBUF, M_WAITOK | M_ZERO);
+#ifndef HN_USE_TXDESC_BUFRING
+ SLIST_INIT(&txr->hn_txlist);
+#else
+ txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
+ M_WAITOK, &txr->hn_tx_lock);
+#endif
+
+ if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
+ txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
+ device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
+ } else {
+ txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
+ }
+
+#ifdef HN_IFSTART_SUPPORT
+ if (hn_use_if_start) {
+ txr->hn_txeof = hn_start_txeof;
+ TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
+ TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
+ } else
+#endif
+ {
+ int br_depth;
+
+ txr->hn_txeof = hn_xmit_txeof;
+ TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
+ TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
+
+ br_depth = hn_get_txswq_depth(txr);
+ txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
+ M_WAITOK, &txr->hn_tx_lock);
+ }
+
+ txr->hn_direct_tx_size = hn_direct_tx_size;
+
+ /*
+ * Always schedule transmission instead of trying to do direct
+ * transmission. This one gives the best performance so far.
+ */
+ txr->hn_sched_tx = 1;
+
+ parent_dtag = bus_get_dma_tag(dev);
+
+ /* DMA tag for RNDIS packet messages. */
+ error = bus_dma_tag_create(parent_dtag, /* parent */
+ HN_RNDIS_PKT_ALIGN, /* alignment */
+ HN_RNDIS_PKT_BOUNDARY, /* boundary */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ HN_RNDIS_PKT_LEN, /* maxsize */
+ 1, /* nsegments */
+ HN_RNDIS_PKT_LEN, /* maxsegsize */
+ 0, /* flags */
+ NULL, /* lockfunc */
+ NULL, /* lockfuncarg */
+ &txr->hn_tx_rndis_dtag);
+ if (error) {
+ device_printf(dev, "failed to create rndis dmatag\n");
+ return error;
+ }
+
+ /* DMA tag for data. */
+ error = bus_dma_tag_create(parent_dtag, /* parent */
+ 1, /* alignment */
+ HN_TX_DATA_BOUNDARY, /* boundary */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ HN_TX_DATA_MAXSIZE, /* maxsize */
+ HN_TX_DATA_SEGCNT_MAX, /* nsegments */
+ HN_TX_DATA_SEGSIZE, /* maxsegsize */
+ 0, /* flags */
+ NULL, /* lockfunc */
+ NULL, /* lockfuncarg */
+ &txr->hn_tx_data_dtag);
+ if (error) {
+ device_printf(dev, "failed to create data dmatag\n");
+ return error;
+ }
+
+ for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
+ struct hn_txdesc *txd = &txr->hn_txdesc[i];
+
+ txd->txr = txr;
+ txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
+ STAILQ_INIT(&txd->agg_list);
+
+ /*
+ * Allocate and load RNDIS packet message.
+ */
+ error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
+ (void **)&txd->rndis_pkt,
+ BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
+ &txd->rndis_pkt_dmap);
+ if (error) {
+ device_printf(dev,
+ "failed to allocate rndis_packet_msg, %d\n", i);
+ return error;
+ }
+
+ error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
+ txd->rndis_pkt_dmap,
+ txd->rndis_pkt, HN_RNDIS_PKT_LEN,
+ hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
+ BUS_DMA_NOWAIT);
+ if (error) {
+ device_printf(dev,
+ "failed to load rndis_packet_msg, %d\n", i);
+ bus_dmamem_free(txr->hn_tx_rndis_dtag,
+ txd->rndis_pkt, txd->rndis_pkt_dmap);
+ return error;
+ }
+
+ /* DMA map for TX data. */
+ error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
+ &txd->data_dmap);
+ if (error) {
+ device_printf(dev,
+ "failed to allocate tx data dmamap\n");
+ bus_dmamap_unload(txr->hn_tx_rndis_dtag,
+ txd->rndis_pkt_dmap);
+ bus_dmamem_free(txr->hn_tx_rndis_dtag,
+ txd->rndis_pkt, txd->rndis_pkt_dmap);
+ return error;
+ }
+
+		/* All set; put it on the list. */
+ txd->flags |= HN_TXD_FLAG_ONLIST;
+#ifndef HN_USE_TXDESC_BUFRING
+ SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
+#else
+ buf_ring_enqueue(txr->hn_txdesc_br, txd);
+#endif
+ }
+ txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
+
+ if (sc->hn_tx_sysctl_tree != NULL) {
+ struct sysctl_oid_list *child;
+ struct sysctl_ctx_list *ctx;
+ char name[16];
+
+ /*
+ * Create per TX ring sysctl tree:
+ * dev.hn.UNIT.tx.RINGID
+ */
+ ctx = device_get_sysctl_ctx(dev);
+ child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
+
+ snprintf(name, sizeof(name), "%d", id);
+ txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
+ name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+
+ if (txr->hn_tx_sysctl_tree != NULL) {
+ child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
+
+#ifdef HN_DEBUG
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
+ CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
+ "# of available TX descs");
+#endif
+#ifdef HN_IFSTART_SUPPORT
+ if (!hn_use_if_start)
+#endif
+ {
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
+ CTLFLAG_RD, &txr->hn_oactive, 0,
+ "over active");
+ }
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
+ CTLFLAG_RW, &txr->hn_pkts,
+ "# of packets transmitted");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
+ CTLFLAG_RW, &txr->hn_sends, "# of sends");
+ }
+ }
+
+ return 0;
+}
+
+static void
+hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
+{
+ struct hn_tx_ring *txr = txd->txr;
+
+ KASSERT(txd->m == NULL, ("still has mbuf installed"));
+ KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
+
+ bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
+ bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
+ txd->rndis_pkt_dmap);
+ bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
+}
+
+static void
+hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
+{
+
+ KASSERT(txd->refs == 0 || txd->refs == 1,
+ ("invalid txd refs %d", txd->refs));
+
+ /* Aggregated txds will be freed by their aggregating txd. */
+ if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
+ int freed;
+
+ freed = hn_txdesc_put(txr, txd);
+ KASSERT(freed, ("can't free txdesc"));
+ }
+}
+
+static void
+hn_tx_ring_destroy(struct hn_tx_ring *txr)
+{
+ int i;
+
+ if (txr->hn_txdesc == NULL)
+ return;
+
+ /*
+ * NOTE:
+ * Because the freeing of aggregated txds will be deferred
+ * to the aggregating txd, two passes are used here:
+	 *   - The first pass GCes any pending txds.  This GC is necessary,
+	 *     since the hypervisor will not deliver send-done for all
+	 *     pending txds if the channels are revoked.
+	 *   - The second pass frees the busdma resources, i.e. after all
+	 *     txds have been freed.
+ */
+ for (i = 0; i < txr->hn_txdesc_cnt; ++i)
+ hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
+ for (i = 0; i < txr->hn_txdesc_cnt; ++i)
+ hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
+
+ if (txr->hn_tx_data_dtag != NULL)
+ bus_dma_tag_destroy(txr->hn_tx_data_dtag);
+ if (txr->hn_tx_rndis_dtag != NULL)
+ bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
+
+#ifdef HN_USE_TXDESC_BUFRING
+ buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
+#endif
+
+ free(txr->hn_txdesc, M_DEVBUF);
+ txr->hn_txdesc = NULL;
+
+ if (txr->hn_mbuf_br != NULL)
+ buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
+
+#ifndef HN_USE_TXDESC_BUFRING
+ mtx_destroy(&txr->hn_txlist_spin);
+#endif
+ mtx_destroy(&txr->hn_tx_lock);
+}
+
+static int
+hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
+{
+ struct sysctl_oid_list *child;
+ struct sysctl_ctx_list *ctx;
+ int i;
+
+ /*
+ * Create TXBUF for chimney sending.
+ *
+ * NOTE: It is shared by all channels.
+ */
+ sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
+ PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
+ BUS_DMA_WAITOK | BUS_DMA_ZERO);
+ if (sc->hn_chim == NULL) {
+ device_printf(sc->hn_dev, "allocate txbuf failed\n");
+ return (ENOMEM);
+ }
+
+ sc->hn_tx_ring_cnt = ring_cnt;
+ sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
+
+ sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
+ M_DEVBUF, M_WAITOK | M_ZERO);
+
+ ctx = device_get_sysctl_ctx(sc->hn_dev);
+ child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
+
+ /* Create dev.hn.UNIT.tx sysctl tree */
+ sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
+ CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+ int error;
+
+ error = hn_tx_ring_create(sc, i);
+ if (error)
+ return error;
+ }
+
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_tx_ring, hn_no_txdescs),
+ hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_tx_ring, hn_send_failed),
+ hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_tx_ring, hn_txdma_failed),
+ hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_tx_ring, hn_flush_failed),
+ hn_tx_stat_ulong_sysctl, "LU",
+ "# of packet transmission aggregation flush failure");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_tx_ring, hn_tx_collapsed),
+ hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_tx_ring, hn_tx_chimney),
+ hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
+ hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
+ CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
+ "# of total TX descs");
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
+ CTLFLAG_RD, &sc->hn_chim_szmax, 0,
+ "Chimney send packet size upper boundary");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+ hn_chim_size_sysctl, "I", "Chimney send packet size limit");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_tx_ring, hn_direct_tx_size),
+ hn_tx_conf_int_sysctl, "I",
+ "Size of the packet for direct transmission");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+ __offsetof(struct hn_tx_ring, hn_sched_tx),
+ hn_tx_conf_int_sysctl, "I",
+ "Always schedule transmission "
+ "instead of doing direct transmission");
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
+ CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
+ CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
+ CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
+ "Applied packet transmission aggregation size");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
+ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_txagg_pktmax_sysctl, "I",
+ "Applied packet transmission aggregation packets");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
+ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_txagg_align_sysctl, "I",
+ "Applied packet transmission aggregation alignment");
+
+ return 0;
+}
+
+static void
+hn_set_chim_size(struct hn_softc *sc, int chim_size)
+{
+ int i;
+
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
+ sc->hn_tx_ring[i].hn_chim_size = chim_size;
+}
+
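+/*
+ * Clamp the TSO size advertised to the stack: it must cover at least
+ * hn_ndis_tso_sgmin segments of @mtu bytes, must not exceed IP_MAXPACKET
+ * or the NDIS limit (hn_ndis_tso_szmax), and is further capped by the
+ * transparent VF's if_hw_tsomax when the VF is ready.  The ethernet and
+ * VLAN header sizes are subtracted before setting if_hw_tsomax.
+ */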
+static void
+hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
+{
+ struct ifnet *ifp = sc->hn_ifp;
+ u_int hw_tsomax;
+ int tso_minlen;
+
+ HN_LOCK_ASSERT(sc);
+
+ if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
+ return;
+
+ KASSERT(sc->hn_ndis_tso_sgmin >= 2,
+ ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
+ tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
+
+ KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
+ sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
+ ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
+
+ if (tso_maxlen < tso_minlen)
+ tso_maxlen = tso_minlen;
+ else if (tso_maxlen > IP_MAXPACKET)
+ tso_maxlen = IP_MAXPACKET;
+ if (tso_maxlen > sc->hn_ndis_tso_szmax)
+ tso_maxlen = sc->hn_ndis_tso_szmax;
+ hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
+
+ if (hn_xpnt_vf_isready(sc)) {
+ if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
+ hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
+ }
+ ifp->if_hw_tsomax = hw_tsomax;
+ if (bootverbose)
+ if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
+}
+
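+/*
+ * Apply the negotiated TX parameters to all TX rings: the chimney
+ * (pre-mapped send buffer) size, the checksum offload assists derived
+ * from the host capabilities, and the HASHVAL pktinfo flag if the host
+ * supports it.
+ */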
+static void
+hn_fixup_tx_data(struct hn_softc *sc)
+{
+ uint64_t csum_assist;
+ int i;
+
+ hn_set_chim_size(sc, sc->hn_chim_szmax);
+ if (hn_tx_chimney_size > 0 &&
+ hn_tx_chimney_size < sc->hn_chim_szmax)
+ hn_set_chim_size(sc, hn_tx_chimney_size);
+
+ csum_assist = 0;
+ if (sc->hn_caps & HN_CAP_IPCS)
+ csum_assist |= CSUM_IP;
+ if (sc->hn_caps & HN_CAP_TCP4CS)
+ csum_assist |= CSUM_IP_TCP;
+ if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
+ csum_assist |= CSUM_IP_UDP;
+ if (sc->hn_caps & HN_CAP_TCP6CS)
+ csum_assist |= CSUM_IP6_TCP;
+ if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
+ csum_assist |= CSUM_IP6_UDP;
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
+ sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
+
+ if (sc->hn_caps & HN_CAP_HASHVAL) {
+ /*
+ * Support HASHVAL pktinfo on TX path.
+ */
+ if (bootverbose)
+ if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
+ sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
+ }
+}
+
+static void
+hn_fixup_rx_data(struct hn_softc *sc)
+{
+
+ if (sc->hn_caps & HN_CAP_UDPHASH) {
+ int i;
+
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
+ sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
+ }
+}
+
+static void
+hn_destroy_tx_data(struct hn_softc *sc)
+{
+ int i;
+
+ if (sc->hn_chim != NULL) {
+ if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
+ hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
+ } else {
+ device_printf(sc->hn_dev,
+			    "chimney sending buffer is referenced\n");
+ }
+ sc->hn_chim = NULL;
+ }
+
+ if (sc->hn_tx_ring_cnt == 0)
+ return;
+
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
+ hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
+
+ free(sc->hn_tx_ring, M_DEVBUF);
+ sc->hn_tx_ring = NULL;
+
+ sc->hn_tx_ring_cnt = 0;
+ sc->hn_tx_ring_inuse = 0;
+}
+
+#ifdef HN_IFSTART_SUPPORT
+
+static void
+hn_start_taskfunc(void *xtxr, int pending __unused)
+{
+ struct hn_tx_ring *txr = xtxr;
+
+ mtx_lock(&txr->hn_tx_lock);
+ hn_start_locked(txr, 0);
+ mtx_unlock(&txr->hn_tx_lock);
+}
+
+static int
+hn_start_locked(struct hn_tx_ring *txr, int len)
+{
+ struct hn_softc *sc = txr->hn_sc;
+ struct ifnet *ifp = sc->hn_ifp;
+ int sched = 0;
+
+ KASSERT(hn_use_if_start,
+ ("hn_start_locked is called, when if_start is disabled"));
+ KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
+ mtx_assert(&txr->hn_tx_lock, MA_OWNED);
+ KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
+
+ if (__predict_false(txr->hn_suspended))
+ return (0);
+
+ if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
+ IFF_DRV_RUNNING)
+ return (0);
+
+ while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
+ struct hn_txdesc *txd;
+ struct mbuf *m_head;
+ int error;
+
+ IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
+ if (m_head == NULL)
+ break;
+
+ if (len > 0 && m_head->m_pkthdr.len > len) {
+ /*
+			 * This send could be time consuming; let callers
+			 * dispatch this packet (and any follow-up packets)
+			 * to the TX taskqueue.
+ */
+ IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
+ sched = 1;
+ break;
+ }
+
+#if defined(INET6) || defined(INET)
+ if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
+ m_head = hn_tso_fixup(m_head);
+ if (__predict_false(m_head == NULL)) {
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ continue;
+ }
+ } else if (m_head->m_pkthdr.csum_flags &
+ (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
+ m_head = hn_set_hlen(m_head);
+ if (__predict_false(m_head == NULL)) {
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ continue;
+ }
+ }
+#endif
+
+ txd = hn_txdesc_get(txr);
+ if (txd == NULL) {
+ txr->hn_no_txdescs++;
+ IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
+ atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+ break;
+ }
+
+ error = hn_encap(ifp, txr, txd, &m_head);
+ if (error) {
+ /* Both txd and m_head are freed */
+ KASSERT(txr->hn_agg_txd == NULL,
+ ("encap failed w/ pending aggregating txdesc"));
+ continue;
+ }
+
+ if (txr->hn_agg_pktleft == 0) {
+ if (txr->hn_agg_txd != NULL) {
+ KASSERT(m_head == NULL,
+ ("pending mbuf for aggregating txdesc"));
+ error = hn_flush_txagg(ifp, txr);
+ if (__predict_false(error)) {
+ atomic_set_int(&ifp->if_drv_flags,
+ IFF_DRV_OACTIVE);
+ break;
+ }
+ } else {
+ KASSERT(m_head != NULL, ("mbuf was freed"));
+ error = hn_txpkt(ifp, txr, txd);
+ if (__predict_false(error)) {
+ /* txd is freed, but m_head is not */
+ IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
+ atomic_set_int(&ifp->if_drv_flags,
+ IFF_DRV_OACTIVE);
+ break;
+ }
+ }
+ }
+#ifdef INVARIANTS
+ else {
+ KASSERT(txr->hn_agg_txd != NULL,
+ ("no aggregating txdesc"));
+ KASSERT(m_head == NULL,
+ ("pending mbuf for aggregating txdesc"));
+ }
+#endif
+ }
+
+	/* Flush pending aggregated transmission. */
+ if (txr->hn_agg_txd != NULL)
+ hn_flush_txagg(ifp, txr);
+ return (sched);
+}
+
+static void
+hn_start(struct ifnet *ifp)
+{
+ struct hn_softc *sc = ifp->if_softc;
+ struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
+
+ if (txr->hn_sched_tx)
+ goto do_sched;
+
+ if (mtx_trylock(&txr->hn_tx_lock)) {
+ int sched;
+
+ sched = hn_start_locked(txr, txr->hn_direct_tx_size);
+ mtx_unlock(&txr->hn_tx_lock);
+ if (!sched)
+ return;
+ }
+do_sched:
+ taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
+}
+
+static void
+hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
+{
+ struct hn_tx_ring *txr = xtxr;
+
+ mtx_lock(&txr->hn_tx_lock);
+ atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
+ hn_start_locked(txr, 0);
+ mtx_unlock(&txr->hn_tx_lock);
+}
+
+static void
+hn_start_txeof(struct hn_tx_ring *txr)
+{
+ struct hn_softc *sc = txr->hn_sc;
+ struct ifnet *ifp = sc->hn_ifp;
+
+ KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
+
+ if (txr->hn_sched_tx)
+ goto do_sched;
+
+ if (mtx_trylock(&txr->hn_tx_lock)) {
+ int sched;
+
+ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+ sched = hn_start_locked(txr, txr->hn_direct_tx_size);
+ mtx_unlock(&txr->hn_tx_lock);
+ if (sched) {
+ taskqueue_enqueue(txr->hn_tx_taskq,
+ &txr->hn_tx_task);
+ }
+ } else {
+do_sched:
+ /*
+		 * Release OACTIVE earlier, in the hope that others
+		 * can catch up.  The task will clear the flag again
+		 * with the hn_tx_lock held to avoid possible races.
+ */
+ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+ taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
+ }
+}
+
+#endif /* HN_IFSTART_SUPPORT */
+
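+/*
+ * Drain the per-ring mbuf bufring and transmit the queued packets.
+ * Packets larger than @len (when @len > 0) are pushed back and the
+ * caller is asked, via a non-zero return value, to reschedule the
+ * work on the TX taskqueue.
+ */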
+static int
+hn_xmit(struct hn_tx_ring *txr, int len)
+{
+ struct hn_softc *sc = txr->hn_sc;
+ struct ifnet *ifp = sc->hn_ifp;
+ struct mbuf *m_head;
+ int sched = 0;
+
+ mtx_assert(&txr->hn_tx_lock, MA_OWNED);
+#ifdef HN_IFSTART_SUPPORT
+ KASSERT(hn_use_if_start == 0,
+ ("hn_xmit is called, when if_start is enabled"));
+#endif
+ KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
+
+ if (__predict_false(txr->hn_suspended))
+ return (0);
+
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
+ return (0);
+
+ while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
+ struct hn_txdesc *txd;
+ int error;
+
+ if (len > 0 && m_head->m_pkthdr.len > len) {
+ /*
+			 * This send could be time consuming; let callers
+			 * dispatch this packet (and any follow-up packets)
+			 * to the TX taskqueue.
+ */
+ drbr_putback(ifp, txr->hn_mbuf_br, m_head);
+ sched = 1;
+ break;
+ }
+
+ txd = hn_txdesc_get(txr);
+ if (txd == NULL) {
+ txr->hn_no_txdescs++;
+ drbr_putback(ifp, txr->hn_mbuf_br, m_head);
+ txr->hn_oactive = 1;
+ break;
+ }
+
+ error = hn_encap(ifp, txr, txd, &m_head);
+ if (error) {
+ /* Both txd and m_head are freed; discard */
+ KASSERT(txr->hn_agg_txd == NULL,
+ ("encap failed w/ pending aggregating txdesc"));
+ drbr_advance(ifp, txr->hn_mbuf_br);
+ continue;
+ }
+
+ if (txr->hn_agg_pktleft == 0) {
+ if (txr->hn_agg_txd != NULL) {
+ KASSERT(m_head == NULL,
+ ("pending mbuf for aggregating txdesc"));
+ error = hn_flush_txagg(ifp, txr);
+ if (__predict_false(error)) {
+ txr->hn_oactive = 1;
+ break;
+ }
+ } else {
+ KASSERT(m_head != NULL, ("mbuf was freed"));
+ error = hn_txpkt(ifp, txr, txd);
+ if (__predict_false(error)) {
+ /* txd is freed, but m_head is not */
+ drbr_putback(ifp, txr->hn_mbuf_br,
+ m_head);
+ txr->hn_oactive = 1;
+ break;
+ }
+ }
+ }
+#ifdef INVARIANTS
+ else {
+ KASSERT(txr->hn_agg_txd != NULL,
+ ("no aggregating txdesc"));
+ KASSERT(m_head == NULL,
+ ("pending mbuf for aggregating txdesc"));
+ }
+#endif
+
+ /* Sent */
+ drbr_advance(ifp, txr->hn_mbuf_br);
+ }
+
+	/* Flush pending aggregated transmission. */
+ if (txr->hn_agg_txd != NULL)
+ hn_flush_txagg(ifp, txr);
+ return (sched);
+}
+
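+/*
+ * if_transmit method.  When the transparent VF is active, the packet is
+ * handed directly to the VF's if_transmit, with optional BPF tapping on
+ * the synthetic interface.  Otherwise a TX ring is selected from the
+ * mbuf's flowid (TCP SYNs are pinned to ring 0) and the packet is
+ * queued to that ring's bufring.
+ */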
+static int
+hn_transmit(struct ifnet *ifp, struct mbuf *m)
+{
+ struct hn_softc *sc = ifp->if_softc;
+ struct hn_tx_ring *txr;
+ int error, idx = 0;
+
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
+ struct rm_priotracker pt;
+
+ rm_rlock(&sc->hn_vf_lock, &pt);
+ if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
+ struct mbuf *m_bpf = NULL;
+ int obytes, omcast;
+
+ obytes = m->m_pkthdr.len;
+ omcast = (m->m_flags & M_MCAST) != 0;
+
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
+ if (bpf_peers_present(ifp->if_bpf)) {
+ m_bpf = m_copypacket(m, M_NOWAIT);
+ if (m_bpf == NULL) {
+ /*
+ * Failed to grab a shallow
+ * copy; tap now.
+ */
+ ETHER_BPF_MTAP(ifp, m);
+ }
+ }
+ } else {
+ ETHER_BPF_MTAP(ifp, m);
+ }
+
+ error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
+ rm_runlock(&sc->hn_vf_lock, &pt);
+
+ if (m_bpf != NULL) {
+ if (!error)
+ ETHER_BPF_MTAP(ifp, m_bpf);
+ m_freem(m_bpf);
+ }
+
+ if (error == ENOBUFS) {
+ if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
+ } else if (error) {
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ } else {
+ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+ if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
+ if (omcast) {
+ if_inc_counter(ifp, IFCOUNTER_OMCASTS,
+ omcast);
+ }
+ }
+ return (error);
+ }
+ rm_runlock(&sc->hn_vf_lock, &pt);
+ }
+
+#if defined(INET6) || defined(INET)
+ /*
+ * Perform TSO packet header fixup or get l2/l3 header length now,
+ * since packet headers should be cache-hot.
+ */
+ if (m->m_pkthdr.csum_flags & CSUM_TSO) {
+ m = hn_tso_fixup(m);
+ if (__predict_false(m == NULL)) {
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return EIO;
+ }
+ } else if (m->m_pkthdr.csum_flags &
+ (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
+ m = hn_set_hlen(m);
+ if (__predict_false(m == NULL)) {
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return EIO;
+ }
+ }
+#endif
+
+ /*
+ * Select the TX ring based on flowid
+ */
+ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
+#ifdef RSS
+ uint32_t bid;
+
+ if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
+ &bid) == 0)
+ idx = bid % sc->hn_tx_ring_inuse;
+ else
+#endif
+ {
+#if defined(INET6) || defined(INET)
+ int tcpsyn = 0;
+
+ if (m->m_pkthdr.len < 128 &&
+ (m->m_pkthdr.csum_flags &
+ (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
+ (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
+ m = hn_check_tcpsyn(m, &tcpsyn);
+ if (__predict_false(m == NULL)) {
+ if_inc_counter(ifp,
+ IFCOUNTER_OERRORS, 1);
+ return (EIO);
+ }
+ }
+#else
+ const int tcpsyn = 0;
+#endif
+ if (tcpsyn)
+ idx = 0;
+ else
+ idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
+ }
+ }
+ txr = &sc->hn_tx_ring[idx];
+
+ error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
+ if (error) {
+ if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
+ return error;
+ }
+
+ if (txr->hn_oactive)
+ return 0;
+
+ if (txr->hn_sched_tx)
+ goto do_sched;
+
+ if (mtx_trylock(&txr->hn_tx_lock)) {
+ int sched;
+
+ sched = hn_xmit(txr, txr->hn_direct_tx_size);
+ mtx_unlock(&txr->hn_tx_lock);
+ if (!sched)
+ return 0;
+ }
+do_sched:
+ taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
+ return 0;
+}
+
+static void
+hn_tx_ring_qflush(struct hn_tx_ring *txr)
+{
+ struct mbuf *m;
+
+ mtx_lock(&txr->hn_tx_lock);
+ while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
+ m_freem(m);
+ mtx_unlock(&txr->hn_tx_lock);
+}
+
+static void
+hn_xmit_qflush(struct ifnet *ifp)
+{
+ struct hn_softc *sc = ifp->if_softc;
+ struct rm_priotracker pt;
+ int i;
+
+ for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
+ hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
+ if_qflush(ifp);
+
+ rm_rlock(&sc->hn_vf_lock, &pt);
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
+ sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
+ rm_runlock(&sc->hn_vf_lock, &pt);
+}
+
+static void
+hn_xmit_txeof(struct hn_tx_ring *txr)
+{
+
+ if (txr->hn_sched_tx)
+ goto do_sched;
+
+ if (mtx_trylock(&txr->hn_tx_lock)) {
+ int sched;
+
+ txr->hn_oactive = 0;
+ sched = hn_xmit(txr, txr->hn_direct_tx_size);
+ mtx_unlock(&txr->hn_tx_lock);
+ if (sched) {
+ taskqueue_enqueue(txr->hn_tx_taskq,
+ &txr->hn_tx_task);
+ }
+ } else {
+do_sched:
+ /*
+		 * Release oactive earlier, in the hope that others
+		 * can catch up.  The task will clear oactive again
+		 * with the hn_tx_lock held to avoid possible races.
+ */
+ txr->hn_oactive = 0;
+ taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
+ }
+}
+
+static void
+hn_xmit_taskfunc(void *xtxr, int pending __unused)
+{
+ struct hn_tx_ring *txr = xtxr;
+
+ mtx_lock(&txr->hn_tx_lock);
+ hn_xmit(txr, 0);
+ mtx_unlock(&txr->hn_tx_lock);
+}
+
+static void
+hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
+{
+ struct hn_tx_ring *txr = xtxr;
+
+ mtx_lock(&txr->hn_tx_lock);
+ txr->hn_oactive = 0;
+ hn_xmit(txr, 0);
+ mtx_unlock(&txr->hn_tx_lock);
+}
+
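+/*
+ * Bind a VMBUS channel (primary or sub-channel) to its RX/TX ring,
+ * pin it to the proper CPU, and open it with the ring's bufring.
+ */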
+static int
+hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
+{
+ struct vmbus_chan_br cbr;
+ struct hn_rx_ring *rxr;
+ struct hn_tx_ring *txr = NULL;
+ int idx, error;
+
+ idx = vmbus_chan_subidx(chan);
+
+ /*
+ * Link this channel to RX/TX ring.
+ */
+ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
+	    ("invalid channel index %d, should be >= 0 && < %d",
+ idx, sc->hn_rx_ring_inuse));
+ rxr = &sc->hn_rx_ring[idx];
+ KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
+ ("RX ring %d already attached", idx));
+ rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
+ rxr->hn_chan = chan;
+
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
+ idx, vmbus_chan_id(chan));
+ }
+
+ if (idx < sc->hn_tx_ring_inuse) {
+ txr = &sc->hn_tx_ring[idx];
+ KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
+ ("TX ring %d already attached", idx));
+ txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
+
+ txr->hn_chan = chan;
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
+ idx, vmbus_chan_id(chan));
+ }
+ }
+
+ /* Bind this channel to a proper CPU. */
+ vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
+
+ /*
+ * Open this channel
+ */
+ cbr.cbr = rxr->hn_br;
+ cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
+ cbr.cbr_txsz = HN_TXBR_SIZE;
+ cbr.cbr_rxsz = HN_RXBR_SIZE;
+ error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
+ if (error) {
+ if (error == EISCONN) {
+ if_printf(sc->hn_ifp, "bufring is connected after "
+ "chan%u open failure\n", vmbus_chan_id(chan));
+ rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
+ } else {
+ if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
+ vmbus_chan_id(chan), error);
+ }
+ }
+ return (error);
+}
+
+static void
+hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
+{
+ struct hn_rx_ring *rxr;
+ int idx, error;
+
+ idx = vmbus_chan_subidx(chan);
+
+ /*
+	 * Unlink this channel from the RX/TX ring.
+ */
+ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
+	    ("invalid channel index %d, should be >= 0 && < %d",
+ idx, sc->hn_rx_ring_inuse));
+ rxr = &sc->hn_rx_ring[idx];
+ KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
+ ("RX ring %d is not attached", idx));
+ rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
+
+ if (idx < sc->hn_tx_ring_inuse) {
+ struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
+
+ KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
+		    ("TX ring %d is not attached", idx));
+ txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
+ }
+
+ /*
+ * Close this channel.
+ *
+ * NOTE:
+ * Channel closing does _not_ destroy the target channel.
+ */
+ error = vmbus_chan_close_direct(chan);
+ if (error == EISCONN) {
+ if_printf(sc->hn_ifp, "chan%u bufring is connected "
+ "after being closed\n", vmbus_chan_id(chan));
+ rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
+ } else if (error) {
+ if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
+ vmbus_chan_id(chan), error);
+ }
+}
+
+static int
+hn_attach_subchans(struct hn_softc *sc)
+{
+ struct vmbus_channel **subchans;
+ int subchan_cnt = sc->hn_rx_ring_inuse - 1;
+ int i, error = 0;
+
+ KASSERT(subchan_cnt > 0, ("no sub-channels"));
+
+ /* Attach the sub-channels. */
+ subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
+ for (i = 0; i < subchan_cnt; ++i) {
+ int error1;
+
+ error1 = hn_chan_attach(sc, subchans[i]);
+ if (error1) {
+ error = error1;
+ /* Move on; all channels will be detached later. */
+ }
+ }
+ vmbus_subchan_rel(subchans, subchan_cnt);
+
+ if (error) {
+ if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
+ } else {
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "%d sub-channels attached\n",
+ subchan_cnt);
+ }
+ }
+ return (error);
+}
+
+static void
+hn_detach_allchans(struct hn_softc *sc)
+{
+ struct vmbus_channel **subchans;
+ int subchan_cnt = sc->hn_rx_ring_inuse - 1;
+ int i;
+
+ if (subchan_cnt == 0)
+ goto back;
+
+ /* Detach the sub-channels. */
+ subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
+ for (i = 0; i < subchan_cnt; ++i)
+ hn_chan_detach(sc, subchans[i]);
+ vmbus_subchan_rel(subchans, subchan_cnt);
+
+back:
+ /*
+ * Detach the primary channel, _after_ all sub-channels
+ * are detached.
+ */
+ hn_chan_detach(sc, sc->hn_prichan);
+
+ /* Wait for sub-channels to be destroyed, if any. */
+ vmbus_subchan_drain(sc->hn_prichan);
+
+#ifdef INVARIANTS
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
+ HN_RX_FLAG_ATTACHED) == 0,
+ ("%dth RX ring is still attached", i));
+ }
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+ KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
+ HN_TX_FLAG_ATTACHED) == 0,
+ ("%dth TX ring is still attached", i));
+ }
+#endif
+}
+
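+/*
+ * Ask NVS for sub-channels.  The request is capped by the number of RX
+ * rings the host offers through its RSS capabilities; on return *nsubch
+ * holds the number of sub-channels actually granted (0 means only the
+ * primary channel will be used).
+ */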
+static int
+hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
+{
+ struct vmbus_channel **subchans;
+ int nchan, rxr_cnt, error;
+
+ nchan = *nsubch + 1;
+ if (nchan == 1) {
+ /*
+ * Multiple RX/TX rings are not requested.
+ */
+ *nsubch = 0;
+ return (0);
+ }
+
+ /*
+ * Query RSS capabilities, e.g. # of RX rings, and # of indirect
+ * table entries.
+ */
+ error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
+ if (error) {
+ /* No RSS; this is benign. */
+ *nsubch = 0;
+ return (0);
+ }
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
+ rxr_cnt, nchan);
+ }
+
+ if (nchan > rxr_cnt)
+ nchan = rxr_cnt;
+ if (nchan == 1) {
+ if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
+ *nsubch = 0;
+ return (0);
+ }
+
+ /*
+ * Allocate sub-channels from NVS.
+ */
+ *nsubch = nchan - 1;
+ error = hn_nvs_alloc_subchans(sc, nsubch);
+ if (error || *nsubch == 0) {
+ /* Failed to allocate sub-channels. */
+ *nsubch = 0;
+ return (0);
+ }
+
+ /*
+ * Wait for all sub-channels to become ready before moving on.
+ */
+ subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
+ vmbus_subchan_rel(subchans, *nsubch);
+ return (0);
+}
+
+static bool
+hn_synth_attachable(const struct hn_softc *sc)
+{
+ int i;
+
+ if (sc->hn_flags & HN_FLAG_ERRORS)
+ return (false);
+
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
+ const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
+
+ if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
+ return (false);
+ }
+ return (true);
+}
+
+/*
+ * Make sure that the RX filter is zero after the successful
+ * RNDIS initialization.
+ *
+ * NOTE:
+ * Under certain conditions on certain versions of Hyper-V,
+ * the RNDIS rxfilter is _not_ zero on the hypervisor side
+ * after the successful RNDIS initialization, which breaks
+ * the assumption of any following code (well, it breaks the
+ * RNDIS API contract actually). Clear the RNDIS rxfilter
+ * explicitly, drain packets sneaking through, and drain the
+ * interrupt taskqueues scheduled due to the stealth packets.
+ */
+static void
+hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
+{
+
+ hn_disable_rx(sc);
+ hn_drain_rxtx(sc, nchan);
+}
+
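+/*
+ * Attach the synthetic parts in order: primary channel, NVS, RNDIS,
+ * then the sub-channels, and finally configure RSS (key and indirect
+ * table) once all channels are attached.  On failure the parts that
+ * were attached are torn down again.
+ */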
+static int
+hn_synth_attach(struct hn_softc *sc, int mtu)
+{
+#define ATTACHED_NVS 0x0002
+#define ATTACHED_RNDIS 0x0004
+
+ struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
+ int error, nsubch, nchan = 1, i, rndis_inited;
+ uint32_t old_caps, attached = 0;
+
+ KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
+ ("synthetic parts were attached"));
+
+ if (!hn_synth_attachable(sc))
+ return (ENXIO);
+
+ /* Save capabilities for later verification. */
+ old_caps = sc->hn_caps;
+ sc->hn_caps = 0;
+
+ /* Clear RSS stuffs. */
+ sc->hn_rss_ind_size = 0;
+ sc->hn_rss_hash = 0;
+ sc->hn_rss_hcap = 0;
+
+ /*
+ * Attach the primary channel _before_ attaching NVS and RNDIS.
+ */
+ error = hn_chan_attach(sc, sc->hn_prichan);
+ if (error)
+ goto failed;
+
+ /*
+ * Attach NVS.
+ */
+ error = hn_nvs_attach(sc, mtu);
+ if (error)
+ goto failed;
+ attached |= ATTACHED_NVS;
+
+ /*
+ * Attach RNDIS _after_ NVS is attached.
+ */
+ error = hn_rndis_attach(sc, mtu, &rndis_inited);
+ if (rndis_inited)
+ attached |= ATTACHED_RNDIS;
+ if (error)
+ goto failed;
+
+ /*
+ * Make sure capabilities are not changed.
+ */
+ if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
+ if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
+ old_caps, sc->hn_caps);
+ error = ENXIO;
+ goto failed;
+ }
+
+ /*
+ * Allocate sub-channels for multi-TX/RX rings.
+ *
+ * NOTE:
+ * The # of RX rings that can be used is equivalent to the # of
+ * channels to be requested.
+ */
+ nsubch = sc->hn_rx_ring_cnt - 1;
+ error = hn_synth_alloc_subchans(sc, &nsubch);
+ if (error)
+ goto failed;
+ /* NOTE: _Full_ synthetic parts detach is required now. */
+ sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
+
+ /*
+ * Set the # of TX/RX rings that could be used according to
+ * the # of channels that NVS offered.
+ */
+ nchan = nsubch + 1;
+ hn_set_ring_inuse(sc, nchan);
+ if (nchan == 1) {
+ /* Only the primary channel can be used; done */
+ goto back;
+ }
+
+ /*
+ * Attach the sub-channels.
+ *
+ * NOTE: hn_set_ring_inuse() _must_ have been called.
+ */
+ error = hn_attach_subchans(sc);
+ if (error)
+ goto failed;
+
+ /*
+ * Configure RSS key and indirect table _after_ all sub-channels
+ * are attached.
+ */
+ if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
+ /*
+ * RSS key is not set yet; set it to the default RSS key.
+ */
+ if (bootverbose)
+ if_printf(sc->hn_ifp, "setup default RSS key\n");
+#ifdef RSS
+ rss_getkey(rss->rss_key);
+#else
+ memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
+#endif
+ sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
+ }
+
+ if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
+ /*
+ * RSS indirect table is not set yet; set it up in round-
+ * robin fashion.
+ */
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "setup default RSS indirect "
+ "table\n");
+ }
+ for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
+ uint32_t subidx;
+
+#ifdef RSS
+ subidx = rss_get_indirection_to_bucket(i);
+#else
+ subidx = i;
+#endif
+ rss->rss_ind[i] = subidx % nchan;
+ }
+ sc->hn_flags |= HN_FLAG_HAS_RSSIND;
+ } else {
+ /*
+		 * The # of usable channels may have changed, so we have
+		 * to make sure that all entries in the RSS indirect
+		 * table are valid.
+ *
+ * NOTE: hn_set_ring_inuse() _must_ have been called.
+ */
+ hn_rss_ind_fixup(sc);
+ }
+
+ sc->hn_rss_hash = sc->hn_rss_hcap;
+ if ((sc->hn_flags & HN_FLAG_RXVF) ||
+ (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
+		/* NOTE: Don't reconfigure RSS here; it is done right below. */
+ hn_vf_rss_fixup(sc, false);
+ }
+ error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
+ if (error)
+ goto failed;
+back:
+ /*
+ * Fixup transmission aggregation setup.
+ */
+ hn_set_txagg(sc);
+ hn_rndis_init_fixat(sc, nchan);
+ return (0);
+
+failed:
+ if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
+ hn_rndis_init_fixat(sc, nchan);
+ hn_synth_detach(sc);
+ } else {
+ if (attached & ATTACHED_RNDIS) {
+ hn_rndis_init_fixat(sc, nchan);
+ hn_rndis_detach(sc);
+ }
+ if (attached & ATTACHED_NVS)
+ hn_nvs_detach(sc);
+ hn_chan_detach(sc, sc->hn_prichan);
+ /* Restore old capabilities. */
+ sc->hn_caps = old_caps;
+ }
+ return (error);
+
+#undef ATTACHED_RNDIS
+#undef ATTACHED_NVS
+}
+
+/*
+ * NOTE:
+ * The interface must have been suspended through hn_suspend() before
+ * this function gets called.
+ */
+static void
+hn_synth_detach(struct hn_softc *sc)
+{
+
+ KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
+ ("synthetic parts were not attached"));
+
+ /* Detach the RNDIS first. */
+ hn_rndis_detach(sc);
+
+ /* Detach NVS. */
+ hn_nvs_detach(sc);
+
+ /* Detach all of the channels. */
+ hn_detach_allchans(sc);
+
+ if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
+ /*
+ * Host is post-Win2016, disconnect RXBUF from primary channel here.
+ */
+ int error;
+
+ error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
+ sc->hn_rxbuf_gpadl);
+ if (error) {
+ if_printf(sc->hn_ifp,
+ "rxbuf gpadl disconn failed: %d\n", error);
+ sc->hn_flags |= HN_FLAG_RXBUF_REF;
+ }
+ sc->hn_rxbuf_gpadl = 0;
+ }
+
+ if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
+ /*
+ * Host is post-Win2016, disconnect chimney sending buffer from
+ * primary channel here.
+ */
+ int error;
+
+ error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
+ sc->hn_chim_gpadl);
+ if (error) {
+ if_printf(sc->hn_ifp,
+ "chim gpadl disconn failed: %d\n", error);
+ sc->hn_flags |= HN_FLAG_CHIM_REF;
+ }
+ sc->hn_chim_gpadl = 0;
+ }
+ sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
+}
+
+static void
+hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
+{
+ KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
+ ("invalid ring count %d", ring_cnt));
+
+ if (sc->hn_tx_ring_cnt > ring_cnt)
+ sc->hn_tx_ring_inuse = ring_cnt;
+ else
+ sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
+ sc->hn_rx_ring_inuse = ring_cnt;
+
+#ifdef RSS
+ if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
+ if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
+ "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
+ rss_getnumbuckets());
+ }
+#endif
+
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
+ sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
+ }
+}
+
+static void
+hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
+{
+
+ /*
+ * NOTE:
+	 * The TX bufring will not be drained by the hypervisor if the
+	 * primary channel is revoked.
+ */
+ while (!vmbus_chan_rx_empty(chan) ||
+ (!vmbus_chan_is_revoked(sc->hn_prichan) &&
+ !vmbus_chan_tx_empty(chan)))
+ pause("waitch", 1);
+ vmbus_chan_intr_drain(chan);
+}
+
+static void
+hn_disable_rx(struct hn_softc *sc)
+{
+
+ /*
+ * Disable RX by clearing RX filter forcefully.
+ */
+ sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
+ hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
+
+ /*
+ * Give RNDIS enough time to flush all pending data packets.
+ */
+ pause("waitrx", (200 * hz) / 1000);
+}
+
+/*
+ * NOTE:
+ * RX/TX _must_ have been suspended/disabled, before this function
+ * is called.
+ */
+static void
+hn_drain_rxtx(struct hn_softc *sc, int nchan)
+{
+ struct vmbus_channel **subch = NULL;
+ int nsubch;
+
+ /*
+ * Drain RX/TX bufrings and interrupts.
+ */
+ nsubch = nchan - 1;
+ if (nsubch > 0)
+ subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
+
+ if (subch != NULL) {
+ int i;
+
+ for (i = 0; i < nsubch; ++i)
+ hn_chan_drain(sc, subch[i]);
+ }
+ hn_chan_drain(sc, sc->hn_prichan);
+
+ if (subch != NULL)
+ vmbus_subchan_rel(subch, nsubch);
+}
+
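+/*
+ * Quiesce the data path: mark all TX rings suspended, wait for pending
+ * sends to complete, clear the RX filter, drain the RX/TX bufrings, and
+ * then drain any TX tasks that the draining itself may have scheduled.
+ */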
+static void
+hn_suspend_data(struct hn_softc *sc)
+{
+ struct hn_tx_ring *txr;
+ int i;
+
+ HN_LOCK_ASSERT(sc);
+
+ /*
+ * Suspend TX.
+ */
+ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
+ txr = &sc->hn_tx_ring[i];
+
+ mtx_lock(&txr->hn_tx_lock);
+ txr->hn_suspended = 1;
+ mtx_unlock(&txr->hn_tx_lock);
+		/* No one is able to send more packets now. */
+
+ /*
+ * Wait for all pending sends to finish.
+ *
+ * NOTE:
+		 * We will _not_ receive all pending send-dones if the
+		 * primary channel is revoked.
+ */
+ while (hn_tx_ring_pending(txr) &&
+ !vmbus_chan_is_revoked(sc->hn_prichan))
+ pause("hnwtx", 1 /* 1 tick */);
+ }
+
+ /*
+ * Disable RX.
+ */
+ hn_disable_rx(sc);
+
+ /*
+ * Drain RX/TX.
+ */
+ hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
+
+ /*
+ * Drain any pending TX tasks.
+ *
+ * NOTE:
+ * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
+ * tasks will have to be drained _after_ the above hn_drain_rxtx().
+ */
+ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
+ txr = &sc->hn_tx_ring[i];
+
+ taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
+ taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
+ }
+}
+
+static void
+hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
+{
+
+ ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
+}
+
+static void
+hn_suspend_mgmt(struct hn_softc *sc)
+{
+ struct task task;
+
+ HN_LOCK_ASSERT(sc);
+
+ /*
+	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
+ * through hn_mgmt_taskq.
+ */
+ TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
+ vmbus_chan_run_task(sc->hn_prichan, &task);
+
+ /*
+ * Make sure that all pending management tasks are completed.
+ */
+ taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
+ taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
+ taskqueue_drain_all(sc->hn_mgmt_taskq0);
+}
+
+static void
+hn_suspend(struct hn_softc *sc)
+{
+
+ /* Disable polling. */
+ hn_polling(sc, 0);
+
+ /*
+ * If the non-transparent mode VF is activated, the synthetic
+ * device is receiving packets, so the data path of the
+ * synthetic device must be suspended.
+ */
+ if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
+ (sc->hn_flags & HN_FLAG_RXVF))
+ hn_suspend_data(sc);
+ hn_suspend_mgmt(sc);
+}
+
+static void
+hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
+{
+ int i;
+
+ KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
+ ("invalid TX ring count %d", tx_ring_cnt));
+
+ for (i = 0; i < tx_ring_cnt; ++i) {
+ struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
+
+ mtx_lock(&txr->hn_tx_lock);
+ txr->hn_suspended = 0;
+ mtx_unlock(&txr->hn_tx_lock);
+ }
+}
+
+static void
+hn_resume_data(struct hn_softc *sc)
+{
+ int i;
+
+ HN_LOCK_ASSERT(sc);
+
+ /*
+ * Re-enable RX.
+ */
+ hn_rxfilter_config(sc);
+
+ /*
+ * Make sure to clear suspend status on "all" TX rings,
+ * since hn_tx_ring_inuse can be changed after
+ * hn_suspend_data().
+ */
+ hn_resume_tx(sc, sc->hn_tx_ring_cnt);
+
+#ifdef HN_IFSTART_SUPPORT
+ if (!hn_use_if_start)
+#endif
+ {
+ /*
+ * Flush unused drbrs, since hn_tx_ring_inuse may be
+ * reduced.
+ */
+ for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
+ hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
+ }
+
+ /*
+ * Kick start TX.
+ */
+ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
+ struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
+
+ /*
+ * Use txeof task, so that any pending oactive can be
+ * cleared properly.
+ */
+ taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
+ }
+}
+
+static void
+hn_resume_mgmt(struct hn_softc *sc)
+{
+
+ sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
+
+ /*
+ * Kick off network change detection, if it was pending.
+ * If no network change was pending, start link status
+	 * checks, which are more lightweight than network change
+ * detection.
+ */
+ if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
+ hn_change_network(sc);
+ else
+ hn_update_link_status(sc);
+}
+
+static void
+hn_resume(struct hn_softc *sc)
+{
+
+ /*
+ * If the non-transparent mode VF is activated, the synthetic
+	 * device has to receive packets, so the data path of the
+ * synthetic device must be resumed.
+ */
+ if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
+ (sc->hn_flags & HN_FLAG_RXVF))
+ hn_resume_data(sc);
+
+ /*
+ * Don't resume link status change if VF is attached/activated.
+ * - In the non-transparent VF mode, the synthetic device marks
+ * link down until the VF is deactivated; i.e. VF is down.
+ * - In transparent VF mode, VF's media status is used until
+ * the VF is detached.
+ */
+ if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
+ !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
+ hn_resume_mgmt(sc);
+
+ /*
+ * Re-enable polling if this interface is running and
+ * the polling is requested.
+ */
+ if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
+ hn_polling(sc, sc->hn_pollhz);
+}
+
+static void
+hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
+{
+ const struct rndis_status_msg *msg;
+ int ofs;
+
+ if (dlen < sizeof(*msg)) {
+ if_printf(sc->hn_ifp, "invalid RNDIS status\n");
+ return;
+ }
+ msg = data;
+
+ switch (msg->rm_status) {
+ case RNDIS_STATUS_MEDIA_CONNECT:
+ case RNDIS_STATUS_MEDIA_DISCONNECT:
+ hn_update_link_status(sc);
+ break;
+
+ case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
+ case RNDIS_STATUS_LINK_SPEED_CHANGE:
+ /* Not really useful; ignore. */
+ break;
+
+ case RNDIS_STATUS_NETWORK_CHANGE:
+ ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
+ if (dlen < ofs + msg->rm_stbuflen ||
+ msg->rm_stbuflen < sizeof(uint32_t)) {
+ if_printf(sc->hn_ifp, "network changed\n");
+ } else {
+ uint32_t change;
+
+ memcpy(&change, ((const uint8_t *)msg) + ofs,
+ sizeof(change));
+ if_printf(sc->hn_ifp, "network changed, change %u\n",
+ change);
+ }
+ hn_change_network(sc);
+ break;
+
+ default:
+ if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
+ msg->rm_status);
+ break;
+ }
+}
+
+static int
+hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
+{
+ const struct rndis_pktinfo *pi = info_data;
+ uint32_t mask = 0;
+
+ while (info_dlen != 0) {
+ const void *data;
+ uint32_t dlen;
+
+ if (__predict_false(info_dlen < sizeof(*pi)))
+ return (EINVAL);
+ if (__predict_false(info_dlen < pi->rm_size))
+ return (EINVAL);
+ info_dlen -= pi->rm_size;
+
+ if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
+ return (EINVAL);
+ if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
+ return (EINVAL);
+ dlen = pi->rm_size - pi->rm_pktinfooffset;
+ data = pi->rm_data;
+
+ if (pi->rm_internal == 1) {
+ switch (pi->rm_type) {
+ case NDIS_PKTINFO_IT_PKTINFO_ID:
+ if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
+ return (EINVAL);
+ info->pktinfo_id =
+ (const struct packet_info_id *)data;
+ mask |= HN_RXINFO_PKTINFO_ID;
+ break;
+
+ default:
+ goto next;
+ }
+ } else {
+ switch (pi->rm_type) {
+ case NDIS_PKTINFO_TYPE_VLAN:
+ if (__predict_false(dlen
+ < NDIS_VLAN_INFO_SIZE))
+ return (EINVAL);
+ info->vlan_info = (const uint32_t *)data;
+ mask |= HN_RXINFO_VLAN;
+ break;
+
+ case NDIS_PKTINFO_TYPE_CSUM:
+ if (__predict_false(dlen
+ < NDIS_RXCSUM_INFO_SIZE))
+ return (EINVAL);
+ info->csum_info = (const uint32_t *)data;
+ mask |= HN_RXINFO_CSUM;
+ break;
+
+ case HN_NDIS_PKTINFO_TYPE_HASHVAL:
+ if (__predict_false(dlen
+ < HN_NDIS_HASH_VALUE_SIZE))
+ return (EINVAL);
+ info->hash_value = (const uint32_t *)data;
+ mask |= HN_RXINFO_HASHVAL;
+ break;
+
+ case HN_NDIS_PKTINFO_TYPE_HASHINF:
+ if (__predict_false(dlen
+ < HN_NDIS_HASH_INFO_SIZE))
+ return (EINVAL);
+ info->hash_info = (const uint32_t *)data;
+ mask |= HN_RXINFO_HASHINF;
+ break;
+
+ default:
+ goto next;
+ }
+ }
+
+ if (mask == HN_RXINFO_ALL) {
+ /* All found; done */
+ break;
+ }
+next:
+ pi = (const struct rndis_pktinfo *)
+ ((const uint8_t *)pi + pi->rm_size);
+ }
+
+ /*
+ * Final fixup.
+ * - If there is no hash value, invalidate the hash info.
+ */
+ if ((mask & HN_RXINFO_HASHVAL) == 0)
+ info->hash_info = NULL;
+ return (0);
+}
+
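+/*
+ * Range-overlap helper. Illustrative example (values are hypothetical, not
+ * taken from the driver): hn_rndis_check_overlap(10, 5, 12, 4) compares
+ * [10, 15) against [12, 16) and returns true, while
+ * hn_rndis_check_overlap(10, 5, 15, 4) compares the disjoint ranges
+ * [10, 15) and [15, 19) and returns false.
+ */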
+static __inline bool
+hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
+{
+
+ if (off < check_off) {
+ if (__predict_true(off + len <= check_off))
+ return (false);
+ } else if (off > check_off) {
+ if (__predict_true(check_off + check_len <= off))
+ return (false);
+ }
+ return (true);
+}
+
+static __inline void
+hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
+ uint32_t len, struct hn_rxinfo *info)
+{
+ uint32_t cnt = rxr->rsc.cnt;
+
+ if (cnt) {
+ rxr->rsc.pktlen += len;
+ } else {
+ rxr->rsc.vlan_info = info->vlan_info;
+ rxr->rsc.csum_info = info->csum_info;
+ rxr->rsc.hash_info = info->hash_info;
+ rxr->rsc.hash_value = info->hash_value;
+ rxr->rsc.pktlen = len;
+ }
+
+ rxr->rsc.frag_data[cnt] = data;
+ rxr->rsc.frag_len[cnt] = len;
+ rxr->rsc.cnt++;
+}
+
+static void
+hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
+{
+ const struct rndis_packet_msg *pkt;
+ struct hn_rxinfo info;
+ int data_off, pktinfo_off, data_len, pktinfo_len;
+	bool rsc_more = false;
+
+ /*
+ * Check length.
+ */
+ if (__predict_false(dlen < sizeof(*pkt))) {
+ if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
+ return;
+ }
+ pkt = data;
+
+ if (__predict_false(dlen < pkt->rm_len)) {
+ if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
+ "dlen %d, msglen %u\n", dlen, pkt->rm_len);
+ return;
+ }
+ if (__predict_false(pkt->rm_len <
+ pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
+ if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
+ "msglen %u, data %u, oob %u, pktinfo %u\n",
+ pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
+ pkt->rm_pktinfolen);
+ return;
+ }
+ if (__predict_false(pkt->rm_datalen == 0)) {
+ if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
+ return;
+ }
+
+ /*
+	 * Check offsets.
+ */
+#define IS_OFFSET_INVALID(ofs) \
+ ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
+ ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
+
+ /* XXX Hyper-V does not meet data offset alignment requirement */
+ if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
+ if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+ "data offset %u\n", pkt->rm_dataoffset);
+ return;
+ }
+ if (__predict_false(pkt->rm_oobdataoffset > 0 &&
+ IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
+ if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+ "oob offset %u\n", pkt->rm_oobdataoffset);
+ return;
+ }
+ if (__predict_true(pkt->rm_pktinfooffset > 0) &&
+ __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
+ if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+ "pktinfo offset %u\n", pkt->rm_pktinfooffset);
+ return;
+ }
+
+#undef IS_OFFSET_INVALID
+
+ data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
+ data_len = pkt->rm_datalen;
+ pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
+ pktinfo_len = pkt->rm_pktinfolen;
+
+ /*
+ * Check OOB coverage.
+ */
+ if (__predict_false(pkt->rm_oobdatalen != 0)) {
+ int oob_off, oob_len;
+
+ if_printf(rxr->hn_ifp, "got oobdata\n");
+ oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
+ oob_len = pkt->rm_oobdatalen;
+
+ if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
+ if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+ "oob overflow, msglen %u, oob abs %d len %d\n",
+ pkt->rm_len, oob_off, oob_len);
+ return;
+ }
+
+ /*
+ * Check against data.
+ */
+ if (hn_rndis_check_overlap(oob_off, oob_len,
+ data_off, data_len)) {
+ if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+ "oob overlaps data, oob abs %d len %d, "
+ "data abs %d len %d\n",
+ oob_off, oob_len, data_off, data_len);
+ return;
+ }
+
+ /*
+ * Check against pktinfo.
+ */
+ if (pktinfo_len != 0 &&
+ hn_rndis_check_overlap(oob_off, oob_len,
+ pktinfo_off, pktinfo_len)) {
+ if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+ "oob overlaps pktinfo, oob abs %d len %d, "
+ "pktinfo abs %d len %d\n",
+ oob_off, oob_len, pktinfo_off, pktinfo_len);
+ return;
+ }
+ }
+
+ /*
+ * Check per-packet-info coverage and find useful per-packet-info.
+ */
+ info.vlan_info = NULL;
+ info.csum_info = NULL;
+	info.hash_info = NULL;
+	info.hash_value = NULL;
+ info.pktinfo_id = NULL;
+
+ if (__predict_true(pktinfo_len != 0)) {
+ bool overlap;
+ int error;
+
+ if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
+ if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+ "pktinfo overflow, msglen %u, "
+ "pktinfo abs %d len %d\n",
+ pkt->rm_len, pktinfo_off, pktinfo_len);
+ return;
+ }
+
+ /*
+ * Check packet info coverage.
+ */
+ overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
+ data_off, data_len);
+ if (__predict_false(overlap)) {
+ if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+ "pktinfo overlap data, pktinfo abs %d len %d, "
+ "data abs %d len %d\n",
+ pktinfo_off, pktinfo_len, data_off, data_len);
+ return;
+ }
+
+ /*
+ * Find useful per-packet-info.
+ */
+ error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
+ pktinfo_len, &info);
+ if (__predict_false(error)) {
+ if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
+ "pktinfo\n");
+ return;
+ }
+ }
+
+ if (__predict_false(data_off + data_len > pkt->rm_len)) {
+ if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
+ "data overflow, msglen %u, data abs %d len %d\n",
+ pkt->rm_len, data_off, data_len);
+ return;
+ }
+
+ /* Identify RSC fragments, drop invalid packets */
+ if ((info.pktinfo_id != NULL) &&
+ (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
+ if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
+ rxr->rsc.cnt = 0;
+ rxr->hn_rsc_pkts++;
+ } else if (rxr->rsc.cnt == 0)
+ goto drop;
+
+ rsc_more = true;
+
+ if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
+ rsc_more = false;
+
+ if (rsc_more && rxr->rsc.is_last)
+ goto drop;
+ } else {
+ rxr->rsc.cnt = 0;
+ }
+
+ if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
+ goto drop;
+
+ /* Store data in per rx ring structure */
+	hn_rsc_add_data(rxr, ((const uint8_t *)pkt) + data_off,
+ data_len, &info);
+
+ if (rsc_more)
+ return;
+
+ hn_rxpkt(rxr);
+ rxr->rsc.cnt = 0;
+ return;
+drop:
+ rxr->hn_rsc_drop++;
+ return;
+}
+
+static __inline void
+hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
+{
+ const struct rndis_msghdr *hdr;
+
+ if (__predict_false(dlen < sizeof(*hdr))) {
+ if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
+ return;
+ }
+ hdr = data;
+
+ if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
+ /* Hot data path. */
+ hn_rndis_rx_data(rxr, data, dlen);
+ /* Done! */
+ return;
+ }
+
+ if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
+ hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
+ else
+ hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
+}
+
+static void
+hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
+{
+ const struct hn_nvs_hdr *hdr;
+
+ if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
+ if_printf(sc->hn_ifp, "invalid nvs notify\n");
+ return;
+ }
+ hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
+
+ if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
+ /* Useless; ignore */
+ return;
+ }
+ if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
+}
+
+static void
+hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
+ const struct vmbus_chanpkt_hdr *pkt)
+{
+ struct hn_nvs_sendctx *sndc;
+
+ sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
+ sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
+ VMBUS_CHANPKT_DATALEN(pkt));
+ /*
+ * NOTE:
+ * 'sndc' CAN NOT be accessed anymore, since it can be freed by
+ * its callback.
+ */
+}
+
+static void
+hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
+ const struct vmbus_chanpkt_hdr *pkthdr)
+{
+ struct epoch_tracker et;
+ const struct vmbus_chanpkt_rxbuf *pkt;
+ const struct hn_nvs_hdr *nvs_hdr;
+ int count, i, hlen;
+
+ if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
+ if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
+ return;
+ }
+ nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
+
+ /* Make sure that this is a RNDIS message. */
+ if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
+ if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
+ nvs_hdr->nvs_type);
+ return;
+ }
+
+ hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
+ if (__predict_false(hlen < sizeof(*pkt))) {
+ if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
+ return;
+ }
+ pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
+
+ if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
+ if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
+ pkt->cp_rxbuf_id);
+ return;
+ }
+
+ count = pkt->cp_rxbuf_cnt;
+ if (__predict_false(hlen <
+ __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
+ if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
+ return;
+ }
+
+ NET_EPOCH_ENTER(et);
+ /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
+ for (i = 0; i < count; ++i) {
+ int ofs, len;
+
+ ofs = pkt->cp_rxbuf[i].rb_ofs;
+ len = pkt->cp_rxbuf[i].rb_len;
+ if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
+ if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
+ "ofs %d, len %d\n", i, ofs, len);
+ continue;
+ }
+
+ rxr->rsc.is_last = (i == (count - 1));
+ hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
+ }
+ NET_EPOCH_EXIT(et);
+
+ /*
+ * Ack the consumed RXBUF associated w/ this channel packet,
+ * so that this RXBUF can be recycled by the hypervisor.
+ */
+ hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
+}
+
+static void
+hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
+ uint64_t tid)
+{
+ struct hn_nvs_rndis_ack ack;
+ int retries, error;
+
+ ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
+ ack.nvs_status = HN_NVS_STATUS_OK;
+
+ retries = 0;
+again:
+ error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
+ VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
+ if (__predict_false(error == EAGAIN)) {
+ /*
+ * NOTE:
+ * This should _not_ happen in real world, since the
+ * consumption of the TX bufring from the TX path is
+ * controlled.
+ */
+ if (rxr->hn_ack_failed == 0)
+ if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
+ rxr->hn_ack_failed++;
+ retries++;
+ if (retries < 10) {
+ DELAY(100);
+ goto again;
+ }
+ /* RXBUF leaks! */
+ if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
+ }
+}
+
+static void
+hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
+{
+ struct hn_rx_ring *rxr = xrxr;
+ struct hn_softc *sc = rxr->hn_ifp->if_softc;
+
+ for (;;) {
+ struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
+ int error, pktlen;
+
+ pktlen = rxr->hn_pktbuf_len;
+ error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
+ if (__predict_false(error == ENOBUFS)) {
+ void *nbuf;
+ int nlen;
+
+ /*
+ * Expand channel packet buffer.
+ *
+ * XXX
+ * Use M_WAITOK here, since allocation failure
+ * is fatal.
+ */
+ nlen = rxr->hn_pktbuf_len * 2;
+ while (nlen < pktlen)
+ nlen *= 2;
+ nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
+
+ if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
+ rxr->hn_pktbuf_len, nlen);
+
+ free(rxr->hn_pktbuf, M_DEVBUF);
+ rxr->hn_pktbuf = nbuf;
+ rxr->hn_pktbuf_len = nlen;
+ /* Retry! */
+ continue;
+ } else if (__predict_false(error == EAGAIN)) {
+ /* No more channel packets; done! */
+ break;
+ }
+ KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
+
+ switch (pkt->cph_type) {
+ case VMBUS_CHANPKT_TYPE_COMP:
+ hn_nvs_handle_comp(sc, chan, pkt);
+ break;
+
+ case VMBUS_CHANPKT_TYPE_RXBUF:
+ hn_nvs_handle_rxbuf(rxr, chan, pkt);
+ break;
+
+ case VMBUS_CHANPKT_TYPE_INBAND:
+ hn_nvs_handle_notify(sc, pkt);
+ break;
+
+ default:
+ if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
+ pkt->cph_type);
+ break;
+ }
+ }
+ hn_chan_rollup(rxr, rxr->hn_txr);
+}
+
+static void
+hn_sysinit(void *arg __unused)
+{
+ int i;
+
+ hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
+
+#ifdef HN_IFSTART_SUPPORT
+ /*
+ * Don't use ifnet.if_start if transparent VF mode is requested;
+ * mainly due to the IFF_DRV_OACTIVE flag.
+ */
+ if (hn_xpnt_vf && hn_use_if_start) {
+ hn_use_if_start = 0;
+		printf("hn: transparent VF mode, if_transmit will be used, "
+ "instead of if_start\n");
+ }
+#endif
+ if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
+		printf("hn: invalid transparent VF attach routine "
+ "wait timeout %d, reset to %d\n",
+ hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
+ hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
+ }
+
+ /*
+ * Initialize VF map.
+ */
+ rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
+ hn_vfmap_size = HN_VFMAP_SIZE_DEF;
+ hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
+ M_WAITOK | M_ZERO);
+
+ /*
+ * Fix the # of TX taskqueues.
+ */
+ if (hn_tx_taskq_cnt <= 0)
+ hn_tx_taskq_cnt = 1;
+ else if (hn_tx_taskq_cnt > mp_ncpus)
+ hn_tx_taskq_cnt = mp_ncpus;
+
+ /*
+ * Fix the TX taskqueue mode.
+ */
+ switch (hn_tx_taskq_mode) {
+ case HN_TX_TASKQ_M_INDEP:
+ case HN_TX_TASKQ_M_GLOBAL:
+ case HN_TX_TASKQ_M_EVTTQ:
+ break;
+ default:
+ hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
+ break;
+ }
+
+ if (vm_guest != VM_GUEST_HV)
+ return;
+
+ if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
+ return;
+
+ hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
+ M_DEVBUF, M_WAITOK);
+ for (i = 0; i < hn_tx_taskq_cnt; ++i) {
+ hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
+ taskqueue_thread_enqueue, &hn_tx_taskque[i]);
+ taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
+ "hn tx%d", i);
+ }
+}
+SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
+
+static void
+hn_sysuninit(void *arg __unused)
+{
+
+ if (hn_tx_taskque != NULL) {
+ int i;
+
+ for (i = 0; i < hn_tx_taskq_cnt; ++i)
+ taskqueue_free(hn_tx_taskque[i]);
+ free(hn_tx_taskque, M_DEVBUF);
+ }
+
+ if (hn_vfmap != NULL)
+ free(hn_vfmap, M_DEVBUF);
+ rm_destroy(&hn_vfmap_lock);
+
+ counter_u64_free(hn_udpcs_fixup);
+}
+SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
diff --git a/sys/dev/hyperv/netvsc/if_hnreg.h b/sys/dev/hyperv/netvsc/if_hnreg.h
new file mode 100644
index 000000000000..54db556cc56d
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/if_hnreg.h
@@ -0,0 +1,270 @@
+/*-
+ * Copyright (c) 2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IF_HNREG_H_
+#define _IF_HNREG_H_
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+/*
+ * NDIS protocol version numbers
+ */
+#define HN_NDIS_VERSION_6_1 0x00060001
+#define HN_NDIS_VERSION_6_20 0x00060014
+#define HN_NDIS_VERSION_6_30 0x0006001e
+#define HN_NDIS_VERSION_MAJOR(ver) (((ver) & 0xffff0000) >> 16)
+#define HN_NDIS_VERSION_MINOR(ver) ((ver) & 0xffff)
+
+/*
+ * NVS versions.
+ */
+#define HN_NVS_VERSION_1 0x00002
+#define HN_NVS_VERSION_2 0x30002
+#define HN_NVS_VERSION_4 0x40000
+#define HN_NVS_VERSION_5 0x50000
+#define HN_NVS_VERSION_6 0x60000
+#define HN_NVS_VERSION_61 0x60001
+
+#define HN_NVS_RXBUF_SIG 0xcafe
+#define HN_NVS_CHIM_SIG 0xface
+
+#define HN_NVS_CHIM_IDX_INVALID 0xffffffff
+
+#define HN_NVS_RNDIS_MTYPE_DATA 0
+#define HN_NVS_RNDIS_MTYPE_CTRL 1
+
+/*
+ * NVS message transaction status codes.
+ */
+#define HN_NVS_STATUS_OK 1
+#define HN_NVS_STATUS_FAILED 2
+
+/*
+ * NVS request/response message types.
+ */
+#define HN_NVS_TYPE_INIT 1
+#define HN_NVS_TYPE_INIT_RESP 2
+#define HN_NVS_TYPE_NDIS_INIT 100
+#define HN_NVS_TYPE_RXBUF_CONN 101
+#define HN_NVS_TYPE_RXBUF_CONNRESP 102
+#define HN_NVS_TYPE_RXBUF_DISCONN 103
+#define HN_NVS_TYPE_CHIM_CONN 104
+#define HN_NVS_TYPE_CHIM_CONNRESP 105
+#define HN_NVS_TYPE_CHIM_DISCONN 106
+#define HN_NVS_TYPE_RNDIS 107
+#define HN_NVS_TYPE_RNDIS_ACK 108
+#define HN_NVS_TYPE_NDIS_CONF 125
+#define HN_NVS_TYPE_VFASSOC_NOTE 128 /* notification */
+#define HN_NVS_TYPE_SET_DATAPATH 129
+#define HN_NVS_TYPE_SUBCH_REQ 133
+#define HN_NVS_TYPE_SUBCH_RESP 133 /* same as SUBCH_REQ */
+#define HN_NVS_TYPE_TXTBL_NOTE 134 /* notification */
+
+/*
+ * Any request smaller than this size will _not_ work; e.g. hn_nvs_init
+ * only carries 12 bytes of valid data, but if only those 12 bytes were
+ * sent, the hypervisor would never reply.
+ */
+#define HN_NVS_REQSIZE_MIN 32
+
+/* NVS message common header */
+struct hn_nvs_hdr {
+ uint32_t nvs_type;
+} __packed;
+
+struct hn_nvs_init {
+ uint32_t nvs_type; /* HN_NVS_TYPE_INIT */
+ uint32_t nvs_ver_min;
+ uint32_t nvs_ver_max;
+ uint8_t nvs_rsvd[20];
+ uint8_t nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_init) >= HN_NVS_REQSIZE_MIN);
+
+struct hn_nvs_init_resp {
+ uint32_t nvs_type; /* HN_NVS_TYPE_INIT_RESP */
+ uint32_t nvs_ver; /* deprecated */
+ uint32_t nvs_rsvd;
+ uint32_t nvs_status; /* HN_NVS_STATUS_ */
+} __packed;
+
+/* No response */
+struct hn_nvs_ndis_conf {
+ uint32_t nvs_type; /* HN_NVS_TYPE_NDIS_CONF */
+ uint32_t nvs_mtu;
+ uint32_t nvs_rsvd;
+ uint64_t nvs_caps; /* HN_NVS_NDIS_CONF_ */
+ uint8_t nvs_rsvd1[12];
+ uint8_t nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_ndis_conf) >= HN_NVS_REQSIZE_MIN);
+
+#define HN_NVS_NDIS_CONF_SRIOV 0x0004
+#define HN_NVS_NDIS_CONF_VLAN 0x0008
+#define HN_NVS_NDIS_CONF_RSC 0x0080
+
+/* No response */
+struct hn_nvs_ndis_init {
+ uint32_t nvs_type; /* HN_NVS_TYPE_NDIS_INIT */
+ uint32_t nvs_ndis_major; /* NDIS_VERSION_MAJOR_ */
+ uint32_t nvs_ndis_minor; /* NDIS_VERSION_MINOR_ */
+ uint8_t nvs_rsvd[20];
+ uint8_t nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_ndis_init) >= HN_NVS_REQSIZE_MIN);
+
+#define HN_NVS_DATAPATH_SYNTH 0
+#define HN_NVS_DATAPATH_VF 1
+
+/* No response */
+struct hn_nvs_datapath {
+ uint32_t nvs_type; /* HN_NVS_TYPE_SET_DATAPATH */
+ uint32_t nvs_active_path;/* HN_NVS_DATAPATH_* */
+ uint32_t nvs_rsvd[6];
+ uint8_t nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_datapath) >= HN_NVS_REQSIZE_MIN);
+
+struct hn_nvs_rxbuf_conn {
+ uint32_t nvs_type; /* HN_NVS_TYPE_RXBUF_CONN */
+ uint32_t nvs_gpadl; /* RXBUF vmbus GPADL */
+ uint16_t nvs_sig; /* HN_NVS_RXBUF_SIG */
+ uint8_t nvs_rsvd[22];
+ uint8_t nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_rxbuf_conn) >= HN_NVS_REQSIZE_MIN);
+
+struct hn_nvs_rxbuf_sect {
+ uint32_t nvs_start;
+ uint32_t nvs_slotsz;
+ uint32_t nvs_slotcnt;
+ uint32_t nvs_end;
+} __packed;
+
+struct hn_nvs_rxbuf_connresp {
+ uint32_t nvs_type; /* HN_NVS_TYPE_RXBUF_CONNRESP */
+ uint32_t nvs_status; /* HN_NVS_STATUS_ */
+ uint32_t nvs_nsect; /* # of elem in nvs_sect */
+ struct hn_nvs_rxbuf_sect nvs_sect[];
+} __packed;
+
+/* No response */
+struct hn_nvs_rxbuf_disconn {
+ uint32_t nvs_type; /* HN_NVS_TYPE_RXBUF_DISCONN */
+ uint16_t nvs_sig; /* HN_NVS_RXBUF_SIG */
+ uint8_t nvs_rsvd[26];
+ uint8_t nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_rxbuf_disconn) >= HN_NVS_REQSIZE_MIN);
+
+struct hn_nvs_chim_conn {
+ uint32_t nvs_type; /* HN_NVS_TYPE_CHIM_CONN */
+ uint32_t nvs_gpadl; /* chimney buf vmbus GPADL */
+	uint16_t	nvs_sig;	/* HN_NVS_CHIM_SIG */
+ uint8_t nvs_rsvd[22];
+ uint8_t nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_chim_conn) >= HN_NVS_REQSIZE_MIN);
+
+struct hn_nvs_chim_connresp {
+ uint32_t nvs_type; /* HN_NVS_TYPE_CHIM_CONNRESP */
+ uint32_t nvs_status; /* HN_NVS_STATUS_ */
+ uint32_t nvs_sectsz; /* section size */
+} __packed;
+
+/* No response */
+struct hn_nvs_chim_disconn {
+ uint32_t nvs_type; /* HN_NVS_TYPE_CHIM_DISCONN */
+ uint16_t nvs_sig; /* HN_NVS_CHIM_SIG */
+ uint8_t nvs_rsvd[26];
+ uint8_t nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_chim_disconn) >= HN_NVS_REQSIZE_MIN);
+
+#define HN_NVS_SUBCH_OP_ALLOC 1
+
+struct hn_nvs_subch_req {
+ uint32_t nvs_type; /* HN_NVS_TYPE_SUBCH_REQ */
+ uint32_t nvs_op; /* HN_NVS_SUBCH_OP_ */
+ uint32_t nvs_nsubch;
+ uint8_t nvs_rsvd[20];
+ uint8_t nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_subch_req) >= HN_NVS_REQSIZE_MIN);
+
+struct hn_nvs_subch_resp {
+ uint32_t nvs_type; /* HN_NVS_TYPE_SUBCH_RESP */
+ uint32_t nvs_status; /* HN_NVS_STATUS_ */
+ uint32_t nvs_nsubch;
+} __packed;
+
+struct hn_nvs_rndis {
+ uint32_t nvs_type; /* HN_NVS_TYPE_RNDIS */
+ uint32_t nvs_rndis_mtype;/* HN_NVS_RNDIS_MTYPE_ */
+ /*
+ * Chimney sending buffer index and size.
+ *
+ * NOTE:
+ * If nvs_chim_idx is set to HN_NVS_CHIM_IDX_INVALID
+ * and nvs_chim_sz is set to 0, then chimney sending
+ * buffer is _not_ used by this RNDIS message.
+ */
+ uint32_t nvs_chim_idx;
+ uint32_t nvs_chim_sz;
+ uint8_t nvs_rsvd[16];
+ uint8_t nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_rndis) >= HN_NVS_REQSIZE_MIN);
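+
+/*
+ * Illustrative example (an assumption about typical use, not part of the
+ * definitions above): an RNDIS data message small enough to be copied into
+ * chimney slot 5 would carry nvs_chim_idx = 5 and nvs_chim_sz set to the
+ * message size, while a larger message would use
+ * nvs_chim_idx = HN_NVS_CHIM_IDX_INVALID and nvs_chim_sz = 0, its payload
+ * being described by the accompanying guest physical address ranges instead.
+ */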
+
+struct hn_nvs_rndis_ack {
+ uint32_t nvs_type; /* HN_NVS_TYPE_RNDIS_ACK */
+ uint32_t nvs_status; /* HN_NVS_STATUS_ */
+ uint8_t nvs_rsvd[24];
+ uint8_t nvs_msg_pad[8];
+} __packed;
+CTASSERT(sizeof(struct hn_nvs_rndis_ack) >= HN_NVS_REQSIZE_MIN);
+
+/*
+ * RNDIS extension
+ */
+
+/* Per-packet hash info */
+#define HN_NDIS_HASH_INFO_SIZE sizeof(uint32_t)
+#define HN_NDIS_PKTINFO_TYPE_HASHINF NDIS_PKTINFO_TYPE_ORIG_NBLIST
+/* NDIS_HASH_ */
+
+/* Per-packet hash value */
+#define HN_NDIS_HASH_VALUE_SIZE sizeof(uint32_t)
+#define HN_NDIS_PKTINFO_TYPE_HASHVAL NDIS_PKTINFO_TYPE_PKT_CANCELID
+
+/* Per-packet-info size */
+#define HN_RNDIS_PKTINFO_SIZE(dlen) \
+ __offsetof(struct rndis_pktinfo, rm_data[dlen])
+
+#endif /* !_IF_HNREG_H_ */
diff --git a/sys/dev/hyperv/netvsc/if_hnvar.h b/sys/dev/hyperv/netvsc/if_hnvar.h
new file mode 100644
index 000000000000..27d93db5395e
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/if_hnvar.h
@@ -0,0 +1,335 @@
+/*-
+ * Copyright (c) 2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IF_HNVAR_H_
+#define _IF_HNVAR_H_
+
+#define HN_USE_TXDESC_BUFRING
+
+#define HN_CHIM_SIZE (15 * 1024 * 1024)
+
+#define HN_RXBUF_SIZE (31 * 1024 * 1024)
+#define HN_RXBUF_SIZE_COMPAT (15 * 1024 * 1024)
+
+#define HN_MTU_MAX (65535 - ETHER_ADDR_LEN)
+
+#define HN_TXBR_SIZE (128 * PAGE_SIZE)
+#define HN_RXBR_SIZE (128 * PAGE_SIZE)
+
+#define HN_XACT_REQ_PGCNT 2
+#define HN_XACT_RESP_PGCNT 2
+#define HN_XACT_REQ_SIZE (HN_XACT_REQ_PGCNT * PAGE_SIZE)
+#define HN_XACT_RESP_SIZE (HN_XACT_RESP_PGCNT * PAGE_SIZE)
+
+#define HN_GPACNT_MAX 32
+
+struct hn_txdesc;
+#ifndef HN_USE_TXDESC_BUFRING
+SLIST_HEAD(hn_txdesc_list, hn_txdesc);
+#else
+struct buf_ring;
+#endif
+struct hn_tx_ring;
+
+#define HN_NVS_RSC_MAX 562 /* Max RSC frags in one vmbus packet */
+
+struct hn_rx_rsc {
+ const uint32_t *vlan_info;
+ const uint32_t *csum_info;
+ const uint32_t *hash_info;
+ const uint32_t *hash_value;
+ uint32_t cnt; /* fragment count */
+ uint32_t pktlen; /* full packet length */
+ uint8_t is_last; /* last fragment */
+ const void *frag_data[HN_NVS_RSC_MAX];
+ uint32_t frag_len[HN_NVS_RSC_MAX];
+};
+
+struct hn_rx_ring {
+ struct ifnet *hn_ifp;
+ struct ifnet *hn_rxvf_ifp; /* SR-IOV VF for RX */
+ struct hn_tx_ring *hn_txr;
+ void *hn_pktbuf;
+ int hn_pktbuf_len;
+ int hn_rx_flags; /* HN_RX_FLAG_ */
+ uint32_t hn_mbuf_hash; /* NDIS_HASH_ */
+ uint8_t *hn_rxbuf; /* shadow sc->hn_rxbuf */
+ int hn_rx_idx;
+ struct hn_rx_rsc rsc;
+
+ /* Trust csum verification on host side */
+ int hn_trust_hcsum; /* HN_TRUST_HCSUM_ */
+ struct lro_ctrl hn_lro;
+
+ u_long hn_csum_ip;
+ u_long hn_csum_tcp;
+ u_long hn_csum_udp;
+ u_long hn_csum_trusted;
+ u_long hn_lro_tried;
+ u_long hn_small_pkts;
+ u_long hn_pkts;
+ u_long hn_rss_pkts;
+ u_long hn_ack_failed;
+ u_long hn_rsc_pkts;
+ u_long hn_rsc_drop;
+
+	/* Rarely used stuff */
+ struct sysctl_oid *hn_rx_sysctl_tree;
+
+ void *hn_br; /* TX/RX bufring */
+ struct hyperv_dma hn_br_dma;
+
+ struct vmbus_channel *hn_chan;
+} __aligned(CACHE_LINE_SIZE);
+
+#define HN_TRUST_HCSUM_IP 0x0001
+#define HN_TRUST_HCSUM_TCP 0x0002
+#define HN_TRUST_HCSUM_UDP 0x0004
+
+#define HN_RX_FLAG_ATTACHED 0x0001
+#define HN_RX_FLAG_BR_REF 0x0002
+#define HN_RX_FLAG_XPNT_VF 0x0004
+#define HN_RX_FLAG_UDP_HASH 0x0008
+
+struct hn_tx_ring {
+#ifndef HN_USE_TXDESC_BUFRING
+ struct mtx hn_txlist_spin;
+ struct hn_txdesc_list hn_txlist;
+#else
+ struct buf_ring *hn_txdesc_br;
+#endif
+ int hn_txdesc_cnt;
+ int hn_txdesc_avail;
+ u_short hn_has_txeof;
+ u_short hn_txdone_cnt;
+
+ int hn_sched_tx;
+ void (*hn_txeof)(struct hn_tx_ring *);
+ struct taskqueue *hn_tx_taskq;
+ struct task hn_tx_task;
+ struct task hn_txeof_task;
+
+ struct buf_ring *hn_mbuf_br;
+ int hn_oactive;
+ int hn_tx_idx;
+ int hn_tx_flags;
+
+ struct mtx hn_tx_lock;
+ struct hn_softc *hn_sc;
+ struct vmbus_channel *hn_chan;
+
+ int hn_direct_tx_size;
+ int hn_chim_size;
+ bus_dma_tag_t hn_tx_data_dtag;
+ uint64_t hn_csum_assist;
+
+ /* Applied packet transmission aggregation limits. */
+ int hn_agg_szmax;
+ short hn_agg_pktmax;
+ short hn_agg_align;
+
+ /* Packet transmission aggregation states. */
+ struct hn_txdesc *hn_agg_txd;
+ int hn_agg_szleft;
+ short hn_agg_pktleft;
+ struct rndis_packet_msg *hn_agg_prevpkt;
+
+	/* Temporary stats for each send. */
+ int hn_stat_size;
+ short hn_stat_pkts;
+ short hn_stat_mcasts;
+
+ int (*hn_sendpkt)(struct hn_tx_ring *, struct hn_txdesc *);
+ int hn_suspended;
+ int hn_gpa_cnt;
+ struct vmbus_gpa hn_gpa[HN_GPACNT_MAX];
+
+ u_long hn_no_txdescs;
+ u_long hn_send_failed;
+ u_long hn_txdma_failed;
+ u_long hn_tx_collapsed;
+ u_long hn_tx_chimney_tried;
+ u_long hn_tx_chimney;
+ u_long hn_pkts;
+ u_long hn_sends;
+ u_long hn_flush_failed;
+
+	/* Rarely used stuff */
+ struct hn_txdesc *hn_txdesc;
+ bus_dma_tag_t hn_tx_rndis_dtag;
+ struct sysctl_oid *hn_tx_sysctl_tree;
+} __aligned(CACHE_LINE_SIZE);
+
+#define HN_TX_FLAG_ATTACHED 0x0001
+#define HN_TX_FLAG_HASHVAL 0x0002 /* support HASHVAL pktinfo */
+
+/*
+ * Device-specific softc structure
+ */
+struct hn_softc {
+ struct ifnet *hn_ifp;
+ struct ifmedia hn_media;
+ device_t hn_dev;
+ int hn_if_flags;
+ struct sx hn_lock;
+ struct vmbus_channel *hn_prichan;
+
+ int hn_rx_ring_cnt;
+ int hn_rx_ring_inuse;
+ struct hn_rx_ring *hn_rx_ring;
+
+ struct rmlock hn_vf_lock;
+ struct ifnet *hn_vf_ifp; /* SR-IOV VF */
+ uint32_t hn_xvf_flags; /* transparent VF flags */
+
+ int hn_tx_ring_cnt;
+ int hn_tx_ring_inuse;
+ struct hn_tx_ring *hn_tx_ring;
+
+ uint8_t *hn_chim;
+ u_long *hn_chim_bmap;
+ int hn_chim_bmap_cnt;
+ int hn_chim_cnt;
+ int hn_chim_szmax;
+
+ int hn_cpu;
+ struct taskqueue **hn_tx_taskqs;
+ struct sysctl_oid *hn_tx_sysctl_tree;
+ struct sysctl_oid *hn_rx_sysctl_tree;
+ struct vmbus_xact_ctx *hn_xact;
+ uint32_t hn_nvs_ver;
+ uint32_t hn_rx_filter;
+
+ /* Packet transmission aggregation user settings. */
+ int hn_agg_size;
+ int hn_agg_pkts;
+
+ struct taskqueue *hn_mgmt_taskq;
+ struct taskqueue *hn_mgmt_taskq0;
+ struct task hn_link_task;
+ struct task hn_netchg_init;
+ struct timeout_task hn_netchg_status;
+ uint32_t hn_link_flags; /* HN_LINK_FLAG_ */
+
+ uint32_t hn_caps; /* HN_CAP_ */
+ uint32_t hn_flags; /* HN_FLAG_ */
+ u_int hn_pollhz;
+
+ void *hn_rxbuf;
+ uint32_t hn_rxbuf_gpadl;
+ struct hyperv_dma hn_rxbuf_dma;
+
+ uint32_t hn_chim_gpadl;
+ struct hyperv_dma hn_chim_dma;
+
+ uint32_t hn_rndis_rid;
+ uint32_t hn_ndis_ver;
+ int hn_ndis_tso_szmax;
+ int hn_ndis_tso_sgmin;
+ uint32_t hn_rndis_agg_size;
+ uint32_t hn_rndis_agg_pkts;
+ uint32_t hn_rndis_agg_align;
+
+ int hn_rss_ind_size;
+ uint32_t hn_rss_hash; /* setting, NDIS_HASH_ */
+ uint32_t hn_rss_hcap; /* caps, NDIS_HASH_ */
+ struct ndis_rssprm_toeplitz hn_rss;
+
+ eventhandler_tag hn_ifaddr_evthand;
+ eventhandler_tag hn_ifnet_evthand;
+ eventhandler_tag hn_ifnet_atthand;
+ eventhandler_tag hn_ifnet_dethand;
+ eventhandler_tag hn_ifnet_lnkhand;
+
+ /*
+ * Transparent VF delayed initialization.
+ */
+ int hn_vf_rdytick; /* ticks, 0 == ready */
+ struct taskqueue *hn_vf_taskq;
+ struct timeout_task hn_vf_init;
+
+ /*
+ * Saved information for VF under transparent mode.
+ */
+ void (*hn_vf_input)
+ (struct ifnet *, struct mbuf *);
+ int hn_saved_caps;
+ u_int hn_saved_tsomax;
+ u_int hn_saved_tsosegcnt;
+ u_int hn_saved_tsosegsz;
+};
+
+#define HN_FLAG_RXBUF_CONNECTED 0x0001
+#define HN_FLAG_CHIM_CONNECTED 0x0002
+#define HN_FLAG_HAS_RSSKEY 0x0004
+#define HN_FLAG_HAS_RSSIND 0x0008
+#define HN_FLAG_SYNTH_ATTACHED 0x0010
+#define HN_FLAG_NO_SLEEPING 0x0020
+#define HN_FLAG_RXBUF_REF 0x0040
+#define HN_FLAG_CHIM_REF 0x0080
+#define HN_FLAG_RXVF 0x0100
+
+#define HN_FLAG_ERRORS (HN_FLAG_RXBUF_REF | HN_FLAG_CHIM_REF)
+
+#define HN_XVFFLAG_ENABLED 0x0001
+#define HN_XVFFLAG_ACCBPF 0x0002
+
+#define HN_NO_SLEEPING(sc) \
+do { \
+ (sc)->hn_flags |= HN_FLAG_NO_SLEEPING; \
+} while (0)
+
+#define HN_SLEEPING_OK(sc) \
+do { \
+ (sc)->hn_flags &= ~HN_FLAG_NO_SLEEPING; \
+} while (0)
+
+#define HN_CAN_SLEEP(sc) \
+ (((sc)->hn_flags & HN_FLAG_NO_SLEEPING) == 0)
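+
+/*
+ * Illustrative note (an assumption about typical usage, not a definition
+ * made by this header): code that may run with HN_FLAG_NO_SLEEPING set is
+ * expected to check HN_CAN_SLEEP(sc) and fall back to a non-sleeping code
+ * path when it returns false.
+ */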
+
+#define HN_CAP_VLAN 0x0001
+#define HN_CAP_MTU 0x0002
+#define HN_CAP_IPCS 0x0004
+#define HN_CAP_TCP4CS 0x0008
+#define HN_CAP_TCP6CS 0x0010
+#define HN_CAP_UDP4CS 0x0020
+#define HN_CAP_UDP6CS 0x0040
+#define HN_CAP_TSO4 0x0080
+#define HN_CAP_TSO6 0x0100
+#define HN_CAP_HASHVAL 0x0200
+#define HN_CAP_UDPHASH 0x0400
+
+/* Capability description for use with printf(9) %b identifier. */
+#define HN_CAP_BITS \
+ "\020\1VLAN\2MTU\3IPCS\4TCP4CS\5TCP6CS" \
+ "\6UDP4CS\7UDP6CS\10TSO4\11TSO6\12HASHVAL\13UDPHASH"
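+
+/*
+ * Illustrative example (hypothetical value): with printf(9)'s %b conversion,
+ *	if_printf(ifp, "caps %b\n", sc->hn_caps, HN_CAP_BITS);
+ * would render hn_caps == 0x3f as "caps 3f<VLAN,MTU,IPCS,TCP4CS,TCP6CS,UDP4CS>".
+ */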
+
+#define HN_LINK_FLAG_LINKUP 0x0001
+#define HN_LINK_FLAG_NETCHG 0x0002
+
+#endif /* !_IF_HNVAR_H_ */
diff --git a/sys/dev/hyperv/netvsc/ndis.h b/sys/dev/hyperv/netvsc/ndis.h
new file mode 100644
index 000000000000..c69da7807a63
--- /dev/null
+++ b/sys/dev/hyperv/netvsc/ndis.h
@@ -0,0 +1,422 @@
+/*-
+ * Copyright (c) 2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_NDIS_H_
+#define _NET_NDIS_H_
+
+#define NDIS_MEDIA_STATE_CONNECTED 0
+#define NDIS_MEDIA_STATE_DISCONNECTED 1
+
+#define NDIS_NETCHANGE_TYPE_POSSIBLE 1
+#define NDIS_NETCHANGE_TYPE_DEFINITE 2
+#define NDIS_NETCHANGE_TYPE_FROMMEDIA 3
+
+#define NDIS_OFFLOAD_SET_NOCHG 0
+#define NDIS_OFFLOAD_SET_ON 1
+#define NDIS_OFFLOAD_SET_OFF 2
+
+/* a.k.a GRE MAC */
+#define NDIS_ENCAP_TYPE_NVGRE 0x00000001
+
+#define NDIS_HASH_FUNCTION_MASK 0x000000FF /* see hash function */
+#define NDIS_HASH_TYPE_MASK 0x00FFFF00 /* see hash type */
+
+/* hash function */
+#define NDIS_HASH_FUNCTION_TOEPLITZ 0x00000001
+
+/* hash type */
+#define NDIS_HASH_IPV4 0x00000100
+#define NDIS_HASH_TCP_IPV4 0x00000200
+#define NDIS_HASH_IPV6 0x00000400
+#define NDIS_HASH_IPV6_EX 0x00000800
+#define NDIS_HASH_TCP_IPV6 0x00001000
+#define NDIS_HASH_TCP_IPV6_EX 0x00002000
+#define NDIS_HASH_UDP_IPV4_X 0x00004000 /* XXX non-standard */
+
+#define NDIS_HASH_ALL (NDIS_HASH_IPV4 | \
+ NDIS_HASH_TCP_IPV4 | \
+ NDIS_HASH_IPV6 | \
+ NDIS_HASH_IPV6_EX | \
+ NDIS_HASH_TCP_IPV6 | \
+ NDIS_HASH_TCP_IPV6_EX |\
+ NDIS_HASH_UDP_IPV4_X)
+
+#define NDIS_HASH_STD (NDIS_HASH_IPV4 | \
+ NDIS_HASH_TCP_IPV4 | \
+ NDIS_HASH_IPV6 | \
+ NDIS_HASH_IPV6_EX | \
+ NDIS_HASH_TCP_IPV6 | \
+ NDIS_HASH_TCP_IPV6_EX)
+
+/* Hash description for use with printf(9) %b identifier. */
+#define NDIS_HASH_BITS \
+ "\20\1TOEPLITZ\11IP4\12TCP4\13IP6\14IP6EX\15TCP6\16TCP6EX\17UDP4_X"
+
+#define NDIS_HASH_KEYSIZE_TOEPLITZ 40
+#define NDIS_HASH_INDCNT 128
+
+#define NDIS_OBJTYPE_DEFAULT 0x80
+#define NDIS_OBJTYPE_RSS_CAPS 0x88
+#define NDIS_OBJTYPE_RSS_PARAMS 0x89
+#define NDIS_OBJTYPE_OFFLOAD 0xa7
+
+struct ndis_object_hdr {
+ uint8_t ndis_type; /* NDIS_OBJTYPE_ */
+ uint8_t ndis_rev; /* type specific */
+ uint16_t ndis_size; /* incl. this hdr */
+};
+
+/*
+ * OID_TCP_OFFLOAD_PARAMETERS
+ * ndis_type: NDIS_OBJTYPE_DEFAULT
+ */
+struct ndis_offload_params {
+ struct ndis_object_hdr ndis_hdr;
+ uint8_t ndis_ip4csum; /* NDIS_OFFLOAD_PARAM_ */
+ uint8_t ndis_tcp4csum; /* NDIS_OFFLOAD_PARAM_ */
+ uint8_t ndis_udp4csum; /* NDIS_OFFLOAD_PARAM_ */
+ uint8_t ndis_tcp6csum; /* NDIS_OFFLOAD_PARAM_ */
+ uint8_t ndis_udp6csum; /* NDIS_OFFLOAD_PARAM_ */
+ uint8_t ndis_lsov1; /* NDIS_OFFLOAD_PARAM_ */
+ uint8_t ndis_ipsecv1; /* NDIS_OFFLOAD_IPSECV1_ */
+ uint8_t ndis_lsov2_ip4; /* NDIS_OFFLOAD_LSOV2_ */
+ uint8_t ndis_lsov2_ip6; /* NDIS_OFFLOAD_LSOV2_ */
+ uint8_t ndis_tcp4conn; /* 0 */
+ uint8_t ndis_tcp6conn; /* 0 */
+ uint32_t ndis_flags; /* 0 */
+ /* NDIS >= 6.1 */
+ uint8_t ndis_ipsecv2; /* NDIS_OFFLOAD_IPSECV2_ */
+ uint8_t ndis_ipsecv2_ip4;/* NDIS_OFFLOAD_IPSECV2_ */
+ /* NDIS >= 6.30 */
+ uint8_t ndis_rsc_ip4; /* NDIS_OFFLOAD_RSC_ */
+ uint8_t ndis_rsc_ip6; /* NDIS_OFFLOAD_RSC_ */
+ uint8_t ndis_encap; /* NDIS_OFFLOAD_SET_ */
+ uint8_t ndis_encap_types;/* NDIS_ENCAP_TYPE_ */
+};
+
+#define NDIS_OFFLOAD_PARAMS_SIZE sizeof(struct ndis_offload_params)
+#define NDIS_OFFLOAD_PARAMS_SIZE_6_1 \
+ __offsetof(struct ndis_offload_params, ndis_rsc_ip4)
+
+#define NDIS_OFFLOAD_PARAMS_REV_2 2 /* NDIS 6.1 */
+#define NDIS_OFFLOAD_PARAMS_REV_3 3 /* NDIS 6.30 */
+
+#define NDIS_OFFLOAD_PARAM_NOCHG 0 /* common */
+#define NDIS_OFFLOAD_PARAM_OFF 1
+#define NDIS_OFFLOAD_PARAM_TX 2
+#define NDIS_OFFLOAD_PARAM_RX 3
+#define NDIS_OFFLOAD_PARAM_TXRX 4
+
+/* NDIS_OFFLOAD_PARAM_NOCHG */
+#define NDIS_OFFLOAD_LSOV1_OFF 1
+#define NDIS_OFFLOAD_LSOV1_ON 2
+
+/* NDIS_OFFLOAD_PARAM_NOCHG */
+#define NDIS_OFFLOAD_IPSECV1_OFF 1
+#define NDIS_OFFLOAD_IPSECV1_AH 2
+#define NDIS_OFFLOAD_IPSECV1_ESP 3
+#define NDIS_OFFLOAD_IPSECV1_AH_ESP 4
+
+/* NDIS_OFFLOAD_PARAM_NOCHG */
+#define NDIS_OFFLOAD_LSOV2_OFF 1
+#define NDIS_OFFLOAD_LSOV2_ON 2
+
+/* NDIS_OFFLOAD_PARAM_NOCHG */
+#define NDIS_OFFLOAD_IPSECV2_OFF 1
+#define NDIS_OFFLOAD_IPSECV2_AH 2
+#define NDIS_OFFLOAD_IPSECV2_ESP 3
+#define NDIS_OFFLOAD_IPSECV2_AH_ESP 4
+
+/* NDIS_OFFLOAD_PARAM_NOCHG */
+#define NDIS_OFFLOAD_RSC_OFF 1
+#define NDIS_OFFLOAD_RSC_ON 2
+
+/*
+ * OID_GEN_RECEIVE_SCALE_CAPABILITIES
+ * ndis_type: NDIS_OBJTYPE_RSS_CAPS
+ */
+struct ndis_rss_caps {
+ struct ndis_object_hdr ndis_hdr;
+ uint32_t ndis_caps; /* NDIS_RSS_CAP_ */
+ uint32_t ndis_nmsi; /* # of MSIs */
+ uint32_t ndis_nrxr; /* # of RX rings */
+ /* NDIS >= 6.30 */
+ uint16_t ndis_nind; /* # of indtbl ent. */
+ uint16_t ndis_pad;
+};
+
+#define NDIS_RSS_CAPS_SIZE \
+ __offsetof(struct ndis_rss_caps, ndis_pad)
+#define NDIS_RSS_CAPS_SIZE_6_0 \
+ __offsetof(struct ndis_rss_caps, ndis_nind)
+
+#define NDIS_RSS_CAPS_REV_1 1 /* NDIS 6.{0,1,20} */
+#define NDIS_RSS_CAPS_REV_2 2 /* NDIS 6.30 */
+
+#define NDIS_RSS_CAP_MSI 0x01000000
+#define NDIS_RSS_CAP_CLASSIFY_ISR 0x02000000
+#define NDIS_RSS_CAP_CLASSIFY_DPC 0x04000000
+#define NDIS_RSS_CAP_MSIX 0x08000000
+#define NDIS_RSS_CAP_IPV4 0x00000100
+#define NDIS_RSS_CAP_IPV6 0x00000200
+#define NDIS_RSS_CAP_IPV6_EX 0x00000400
+#define NDIS_RSS_CAP_HASH_TOEPLITZ NDIS_HASH_FUNCTION_TOEPLITZ
+#define NDIS_RSS_CAP_HASHFUNC_MASK NDIS_HASH_FUNCTION_MASK
+
+/*
+ * OID_GEN_RECEIVE_SCALE_PARAMETERS
+ * ndis_type: NDIS_OBJTYPE_RSS_PARAMS
+ */
+struct ndis_rss_params {
+ struct ndis_object_hdr ndis_hdr;
+ uint16_t ndis_flags; /* NDIS_RSS_FLAG_ */
+ uint16_t ndis_bcpu; /* base cpu 0 */
+ uint32_t ndis_hash; /* NDIS_HASH_ */
+ uint16_t ndis_indsize; /* indirect table */
+ uint32_t ndis_indoffset;
+ uint16_t ndis_keysize; /* hash key */
+ uint32_t ndis_keyoffset;
+ /* NDIS >= 6.20 */
+ uint32_t ndis_cpumaskoffset;
+ uint32_t ndis_cpumaskcnt;
+ uint32_t ndis_cpumaskentsz;
+};
+
+#define NDIS_RSS_PARAMS_SIZE sizeof(struct ndis_rss_params)
+#define NDIS_RSS_PARAMS_SIZE_6_0 \
+ __offsetof(struct ndis_rss_params, ndis_cpumaskoffset)
+
+#define NDIS_RSS_PARAMS_REV_1 1 /* NDIS 6.0 */
+#define NDIS_RSS_PARAMS_REV_2 2 /* NDIS 6.20 */
+
+#define NDIS_RSS_FLAG_NONE 0x0000
+#define NDIS_RSS_FLAG_BCPU_UNCHG 0x0001
+#define NDIS_RSS_FLAG_HASH_UNCHG 0x0002
+#define NDIS_RSS_FLAG_IND_UNCHG 0x0004
+#define NDIS_RSS_FLAG_KEY_UNCHG 0x0008
+#define NDIS_RSS_FLAG_DISABLE 0x0010
+
+/* non-standard convenient struct */
+struct ndis_rssprm_toeplitz {
+ struct ndis_rss_params rss_params;
+ /* Toeplitz hash key */
+ uint8_t rss_key[NDIS_HASH_KEYSIZE_TOEPLITZ];
+ /* Indirect table */
+ uint32_t rss_ind[NDIS_HASH_INDCNT];
+};
+
+#define NDIS_RSSPRM_TOEPLITZ_SIZE(nind) \
+ __offsetof(struct ndis_rssprm_toeplitz, rss_ind[nind])
+
+/*
+ * OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES
+ * ndis_type: NDIS_OBJTYPE_OFFLOAD
+ */
+
+#define NDIS_OFFLOAD_ENCAP_NONE 0x0000
+#define NDIS_OFFLOAD_ENCAP_NULL 0x0001
+#define NDIS_OFFLOAD_ENCAP_8023 0x0002
+#define NDIS_OFFLOAD_ENCAP_8023PQ 0x0004
+#define NDIS_OFFLOAD_ENCAP_8023PQ_OOB 0x0008
+#define NDIS_OFFLOAD_ENCAP_RFC1483 0x0010
+
+struct ndis_csum_offload {
+ uint32_t ndis_ip4_txenc; /*NDIS_OFFLOAD_ENCAP_*/
+ uint32_t ndis_ip4_txcsum;
+#define NDIS_TXCSUM_CAP_IP4OPT 0x001
+#define NDIS_TXCSUM_CAP_TCP4OPT 0x004
+#define NDIS_TXCSUM_CAP_TCP4 0x010
+#define NDIS_TXCSUM_CAP_UDP4 0x040
+#define NDIS_TXCSUM_CAP_IP4 0x100
+ uint32_t ndis_ip4_rxenc; /*NDIS_OFFLOAD_ENCAP_*/
+ uint32_t ndis_ip4_rxcsum;
+#define NDIS_RXCSUM_CAP_IP4OPT 0x001
+#define NDIS_RXCSUM_CAP_TCP4OPT 0x004
+#define NDIS_RXCSUM_CAP_TCP4 0x010
+#define NDIS_RXCSUM_CAP_UDP4 0x040
+#define NDIS_RXCSUM_CAP_IP4 0x100
+ uint32_t ndis_ip6_txenc; /*NDIS_OFFLOAD_ENCAP_*/
+ uint32_t ndis_ip6_txcsum;
+#define NDIS_TXCSUM_CAP_IP6EXT 0x001
+#define NDIS_TXCSUM_CAP_TCP6OPT 0x004
+#define NDIS_TXCSUM_CAP_TCP6 0x010
+#define NDIS_TXCSUM_CAP_UDP6 0x040
+ uint32_t ndis_ip6_rxenc; /*NDIS_OFFLOAD_ENCAP_*/
+ uint32_t ndis_ip6_rxcsum;
+#define NDIS_RXCSUM_CAP_IP6EXT 0x001
+#define NDIS_RXCSUM_CAP_TCP6OPT 0x004
+#define NDIS_RXCSUM_CAP_TCP6 0x010
+#define NDIS_RXCSUM_CAP_UDP6 0x040
+};
+
+struct ndis_lsov1_offload {
+ uint32_t ndis_encap; /*NDIS_OFFLOAD_ENCAP_*/
+ uint32_t ndis_maxsize;
+ uint32_t ndis_minsegs;
+ uint32_t ndis_opts;
+};
+
+struct ndis_ipsecv1_offload {
+ uint32_t ndis_encap; /*NDIS_OFFLOAD_ENCAP_*/
+ uint32_t ndis_ah_esp;
+ uint32_t ndis_xport_tun;
+ uint32_t ndis_ip4_opts;
+ uint32_t ndis_flags;
+ uint32_t ndis_ip4_ah;
+ uint32_t ndis_ip4_esp;
+};
+
+struct ndis_lsov2_offload {
+ uint32_t ndis_ip4_encap; /*NDIS_OFFLOAD_ENCAP_*/
+ uint32_t ndis_ip4_maxsz;
+ uint32_t ndis_ip4_minsg;
+ uint32_t ndis_ip6_encap; /*NDIS_OFFLOAD_ENCAP_*/
+ uint32_t ndis_ip6_maxsz;
+ uint32_t ndis_ip6_minsg;
+ uint32_t ndis_ip6_opts;
+#define NDIS_LSOV2_CAP_IP6EXT 0x001
+#define NDIS_LSOV2_CAP_TCP6OPT 0x004
+};
+
+struct ndis_ipsecv2_offload {
+ uint32_t ndis_encap; /*NDIS_OFFLOAD_ENCAP_*/
+ uint8_t ndis_ip6;
+ uint8_t ndis_ip4opt;
+ uint8_t ndis_ip6ext;
+ uint8_t ndis_ah;
+ uint8_t ndis_esp;
+ uint8_t ndis_ah_esp;
+ uint8_t ndis_xport;
+ uint8_t ndis_tun;
+ uint8_t ndis_xport_tun;
+ uint8_t ndis_lso;
+ uint8_t ndis_extseq;
+ uint32_t ndis_udp_esp;
+ uint32_t ndis_auth;
+ uint32_t ndis_crypto;
+ uint32_t ndis_sa_caps;
+};
+
+struct ndis_rsc_offload {
+ uint8_t ndis_ip4;
+ uint8_t ndis_ip6;
+};
+
+struct ndis_encap_offload {
+ uint32_t ndis_flags;
+ uint32_t ndis_maxhdr;
+};
+
+struct ndis_offload {
+ struct ndis_object_hdr ndis_hdr;
+ struct ndis_csum_offload ndis_csum;
+ struct ndis_lsov1_offload ndis_lsov1;
+ struct ndis_ipsecv1_offload ndis_ipsecv1;
+ struct ndis_lsov2_offload ndis_lsov2;
+ uint32_t ndis_flags;
+ /* NDIS >= 6.1 */
+ struct ndis_ipsecv2_offload ndis_ipsecv2;
+ /* NDIS >= 6.30 */
+ struct ndis_rsc_offload ndis_rsc;
+ struct ndis_encap_offload ndis_encap_gre;
+};
+
+#define NDIS_OFFLOAD_SIZE sizeof(struct ndis_offload)
+#define NDIS_OFFLOAD_SIZE_6_0 \
+ __offsetof(struct ndis_offload, ndis_ipsecv2)
+#define NDIS_OFFLOAD_SIZE_6_1 \
+ __offsetof(struct ndis_offload, ndis_rsc)
+
+#define NDIS_OFFLOAD_REV_1 1 /* NDIS 6.0 */
+#define NDIS_OFFLOAD_REV_2 2 /* NDIS 6.1 */
+#define NDIS_OFFLOAD_REV_3 3 /* NDIS 6.30 */
+
+/*
+ * Per-packet-info
+ */
+
+/* VLAN */
+#define NDIS_VLAN_INFO_SIZE sizeof(uint32_t)
+#define NDIS_VLAN_INFO_PRI_MASK 0x0007
+#define NDIS_VLAN_INFO_CFI_MASK 0x0008
+#define NDIS_VLAN_INFO_ID_MASK 0xfff0
+#define NDIS_VLAN_INFO_MAKE(id, pri, cfi) \
+ (((pri) & NDIS_VLAN_INFO_PRI_MASK) | \
+ (((cfi) & 0x1) << 3) | (((id) & 0xfff) << 4))
+#define NDIS_VLAN_INFO_ID(inf) (((inf) & NDIS_VLAN_INFO_ID_MASK) >> 4)
+#define NDIS_VLAN_INFO_CFI(inf) (((inf) & NDIS_VLAN_INFO_CFI_MASK) >> 3)
+#define NDIS_VLAN_INFO_PRI(inf) ((inf) & NDIS_VLAN_INFO_PRI_MASK)
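+
+/*
+ * Illustrative example (hypothetical values): VLAN id 100, priority 3,
+ * CFI 0 packs as NDIS_VLAN_INFO_MAKE(100, 3, 0) == 0x643, and
+ * NDIS_VLAN_INFO_ID(0x643) == 100, NDIS_VLAN_INFO_PRI(0x643) == 3.
+ */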
+
+/* Reception checksum */
+#define NDIS_RXCSUM_INFO_SIZE sizeof(uint32_t)
+#define NDIS_RXCSUM_INFO_TCPCS_FAILED 0x0001
+#define NDIS_RXCSUM_INFO_UDPCS_FAILED 0x0002
+#define NDIS_RXCSUM_INFO_IPCS_FAILED 0x0004
+#define NDIS_RXCSUM_INFO_TCPCS_OK 0x0008
+#define NDIS_RXCSUM_INFO_UDPCS_OK 0x0010
+#define NDIS_RXCSUM_INFO_IPCS_OK 0x0020
+#define NDIS_RXCSUM_INFO_LOOPBACK 0x0040
+#define NDIS_RXCSUM_INFO_TCPCS_INVAL 0x0080
+#define NDIS_RXCSUM_INFO_IPCS_INVAL 0x0100
+
+/* LSOv2 */
+#define NDIS_LSO2_INFO_SIZE sizeof(uint32_t)
+#define NDIS_LSO2_INFO_MSS_MASK 0x000fffff
+#define NDIS_LSO2_INFO_THOFF_MASK 0x3ff00000
+#define NDIS_LSO2_INFO_ISLSO2 0x40000000
+#define NDIS_LSO2_INFO_ISIPV6 0x80000000
+
+#define NDIS_LSO2_INFO_MAKE(thoff, mss) \
+ ((((uint32_t)(mss)) & NDIS_LSO2_INFO_MSS_MASK) | \
+ ((((uint32_t)(thoff)) & 0x3ff) << 20) | \
+ NDIS_LSO2_INFO_ISLSO2)
+
+#define NDIS_LSO2_INFO_MAKEIPV4(thoff, mss) \
+ NDIS_LSO2_INFO_MAKE((thoff), (mss))
+
+#define NDIS_LSO2_INFO_MAKEIPV6(thoff, mss) \
+ (NDIS_LSO2_INFO_MAKE((thoff), (mss)) | NDIS_LSO2_INFO_ISIPV6)
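+
+/*
+ * Illustrative example (hypothetical values): an IPv4 TCP segment with a
+ * 34-byte transport header offset and an MSS of 1448 gives
+ * NDIS_LSO2_INFO_MAKEIPV4(34, 1448) == 0x422005a8
+ * (NDIS_LSO2_INFO_ISLSO2 | 34 << 20 | 1448).
+ */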
+
+/* Transmission checksum */
+#define NDIS_TXCSUM_INFO_SIZE sizeof(uint32_t)
+#define NDIS_TXCSUM_INFO_IPV4 0x00000001
+#define NDIS_TXCSUM_INFO_IPV6 0x00000002
+#define NDIS_TXCSUM_INFO_TCPCS 0x00000004
+#define NDIS_TXCSUM_INFO_UDPCS 0x00000008
+#define NDIS_TXCSUM_INFO_IPCS 0x00000010
+#define NDIS_TXCSUM_INFO_THOFF 0x03ff0000
+
+#define NDIS_TXCSUM_INFO_MKL4CS(thoff, flag) \
+ ((((uint32_t)(thoff)) << 16) | (flag))
+
+#define NDIS_TXCSUM_INFO_MKTCPCS(thoff) \
+ NDIS_TXCSUM_INFO_MKL4CS((thoff), NDIS_TXCSUM_INFO_TCPCS)
+
+#define NDIS_TXCSUM_INFO_MKUDPCS(thoff) \
+ NDIS_TXCSUM_INFO_MKL4CS((thoff), NDIS_TXCSUM_INFO_UDPCS)
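+
+/*
+ * Illustrative example (hypothetical value): requesting TCP checksum offload
+ * for a packet whose TCP header starts 34 bytes into the frame gives
+ * NDIS_TXCSUM_INFO_MKTCPCS(34) == 0x220004
+ * (thoff 34 in bits 16-25 plus NDIS_TXCSUM_INFO_TCPCS).
+ */
+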
+#endif /* !_NET_NDIS_H_ */
diff --git a/sys/dev/hyperv/pcib/vmbus_pcib.c b/sys/dev/hyperv/pcib/vmbus_pcib.c
new file mode 100644
index 000000000000..c7df32044678
--- /dev/null
+++ b/sys/dev/hyperv/pcib/vmbus_pcib.c
@@ -0,0 +1,1897 @@
+/*-
+ * Copyright (c) 2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef NEW_PCIB
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/sx.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+#include <sys/mutex.h>
+#include <sys/errno.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/pmap.h>
+
+#include <machine/atomic.h>
+#include <machine/bus.h>
+#include <machine/frame.h>
+#include <machine/pci_cfgreg.h>
+#include <machine/resource.h>
+
+#include <sys/pciio.h>
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pci_private.h>
+#include <dev/pci/pcib_private.h>
+#include "pcib_if.h"
+
+#include <machine/intr_machdep.h>
+#include <x86/apicreg.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+#include <dev/hyperv/vmbus/vmbus_reg.h>
+#include <dev/hyperv/vmbus/vmbus_chanvar.h>
+
+#include "vmbus_if.h"
+
+#if __FreeBSD_version < 1100000
+typedef u_long rman_res_t;
+#define RM_MAX_END (~(rman_res_t)0)
+#endif
+
+struct completion {
+ unsigned int done;
+ struct mtx lock;
+};
+
+static void
+init_completion(struct completion *c)
+{
+ memset(c, 0, sizeof(*c));
+ mtx_init(&c->lock, "hvcmpl", NULL, MTX_DEF);
+ c->done = 0;
+}
+
+static void
+free_completion(struct completion *c)
+{
+ mtx_destroy(&c->lock);
+}
+
+static void
+complete(struct completion *c)
+{
+ mtx_lock(&c->lock);
+ c->done++;
+ mtx_unlock(&c->lock);
+ wakeup(c);
+}
+
+static void
+wait_for_completion(struct completion *c)
+{
+ mtx_lock(&c->lock);
+ while (c->done == 0)
+ mtx_sleep(c, &c->lock, 0, "hvwfc", 0);
+ c->done--;
+ mtx_unlock(&c->lock);
+}
+
+/*
+ * Return: 0 if completed, a non-zero value if timed out.
+ */
+static int
+wait_for_completion_timeout(struct completion *c, int timeout)
+{
+ int ret;
+
+ mtx_lock(&c->lock);
+
+ if (c->done == 0)
+ mtx_sleep(c, &c->lock, 0, "hvwfc", timeout);
+
+ if (c->done > 0) {
+ c->done--;
+ ret = 0;
+ } else {
+ ret = 1;
+ }
+
+ mtx_unlock(&c->lock);
+
+ return (ret);
+}
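+
+/*
+ * Usage sketch (illustrative only, not code taken from this driver):
+ *
+ *	struct completion comp;
+ *
+ *	init_completion(&comp);
+ *	(send a request whose response handler calls complete(&comp))
+ *	if (wait_for_completion_timeout(&comp, 5 * hz) != 0)
+ *		(handle the timeout)
+ *	free_completion(&comp);
+ */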
+
+#define PCI_MAKE_VERSION(major, minor) ((uint32_t)(((major) << 16) | (minor)))
+
+enum {
+ PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),
+ PCI_PROTOCOL_VERSION_CURRENT = PCI_PROTOCOL_VERSION_1_1
+};
+
+#define PCI_CONFIG_MMIO_LENGTH 0x2000
+#define CFG_PAGE_OFFSET 0x1000
+#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
+
+/*
+ * Message Types
+ */
+
+enum pci_message_type {
+ /*
+ * Version 1.1
+ */
+ PCI_MESSAGE_BASE = 0x42490000,
+ PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0,
+ PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1,
+ PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4,
+ PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
+ PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6,
+ PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7,
+ PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8,
+ PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9,
+ PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA,
+ PCI_EJECT = PCI_MESSAGE_BASE + 0xB,
+ PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC,
+ PCI_REENABLE = PCI_MESSAGE_BASE + 0xD,
+ PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE,
+ PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF,
+ PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10,
+ PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11,
+ PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12,
+ PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13,
+ PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14,
+ PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15,
+ PCI_MESSAGE_MAXIMUM
+};
+
+/*
+ * Structures defining the virtual PCI Express protocol.
+ */
+
+union pci_version {
+ struct {
+ uint16_t minor_version;
+ uint16_t major_version;
+ } parts;
+ uint32_t version;
+} __packed;
+
+/*
+ * This representation is the one used in Windows, which is
+ * what is expected when sending this back and forth with
+ * the Hyper-V parent partition.
+ */
+union win_slot_encoding {
+ struct {
+ uint32_t slot:5;
+ uint32_t func:3;
+ uint32_t reserved:24;
+ } bits;
+ uint32_t val;
+} __packed;
+
+struct pci_func_desc {
+ uint16_t v_id; /* vendor ID */
+ uint16_t d_id; /* device ID */
+ uint8_t rev;
+ uint8_t prog_intf;
+ uint8_t subclass;
+ uint8_t base_class;
+ uint32_t subsystem_id;
+ union win_slot_encoding wslot;
+ uint32_t ser; /* serial number */
+} __packed;
+
+struct hv_msi_desc {
+ uint8_t vector;
+ uint8_t delivery_mode;
+ uint16_t vector_count;
+ uint32_t reserved;
+ uint64_t cpu_mask;
+} __packed;
+
+struct tran_int_desc {
+ uint16_t reserved;
+ uint16_t vector_count;
+ uint32_t data;
+ uint64_t address;
+} __packed;
+
+struct pci_message {
+ uint32_t type;
+} __packed;
+
+struct pci_child_message {
+ struct pci_message message_type;
+ union win_slot_encoding wslot;
+} __packed;
+
+struct pci_incoming_message {
+ struct vmbus_chanpkt_hdr hdr;
+ struct pci_message message_type;
+} __packed;
+
+struct pci_response {
+ struct vmbus_chanpkt_hdr hdr;
+ int32_t status; /* negative values are failures */
+} __packed;
+
+struct pci_packet {
+ void (*completion_func)(void *context, struct pci_response *resp,
+ int resp_packet_size);
+ void *compl_ctxt;
+
+ struct pci_message message[0];
+};
+
+/*
+ * Specific message types supporting the PCI protocol.
+ */
+
+struct pci_version_request {
+ struct pci_message message_type;
+ uint32_t protocol_version;
+ uint32_t is_last_attempt:1;
+ uint32_t reservedz:31;
+} __packed;
+
+struct pci_bus_d0_entry {
+ struct pci_message message_type;
+ uint32_t reserved;
+ uint64_t mmio_base;
+} __packed;
+
+struct pci_bus_relations {
+ struct pci_incoming_message incoming;
+ uint32_t device_count;
+ struct pci_func_desc func[0];
+} __packed;
+
+#define MAX_NUM_BARS (PCIR_MAX_BAR_0 + 1)
+struct pci_q_res_req_response {
+ struct vmbus_chanpkt_hdr hdr;
+ int32_t status; /* negative values are failures */
+ uint32_t probed_bar[MAX_NUM_BARS];
+} __packed;
+
+struct pci_resources_assigned {
+ struct pci_message message_type;
+ union win_slot_encoding wslot;
+ uint8_t memory_range[0x14][MAX_NUM_BARS]; /* unused here */
+ uint32_t msi_descriptors;
+ uint32_t reserved[4];
+} __packed;
+
+struct pci_create_interrupt {
+ struct pci_message message_type;
+ union win_slot_encoding wslot;
+ struct hv_msi_desc int_desc;
+} __packed;
+
+struct pci_create_int_response {
+ struct pci_response response;
+ uint32_t reserved;
+ struct tran_int_desc int_desc;
+} __packed;
+
+struct pci_delete_interrupt {
+ struct pci_message message_type;
+ union win_slot_encoding wslot;
+ struct tran_int_desc int_desc;
+} __packed;
+
+struct pci_dev_incoming {
+ struct pci_incoming_message incoming;
+ union win_slot_encoding wslot;
+} __packed;
+
+struct pci_eject_response {
+ struct pci_message message_type;
+ union win_slot_encoding wslot;
+ uint32_t status;
+} __packed;
+
+/*
+ * Driver specific state.
+ */
+
+enum hv_pcibus_state {
+ hv_pcibus_init = 0,
+ hv_pcibus_installed,
+};
+
+struct hv_pcibus {
+ device_t pcib;
+ device_t pci_bus;
+ struct vmbus_pcib_softc *sc;
+
+ uint16_t pci_domain;
+
+ enum hv_pcibus_state state;
+
+ struct resource *cfg_res;
+
+ struct completion query_completion, *query_comp;
+
+ struct mtx config_lock; /* Avoid two threads writing index page */
+ struct mtx device_list_lock; /* Protect lists below */
+ TAILQ_HEAD(, hv_pci_dev) children;
+ TAILQ_HEAD(, hv_dr_state) dr_list;
+
+ volatile int detaching;
+};
+
+struct hv_pci_dev {
+ TAILQ_ENTRY(hv_pci_dev) link;
+
+ struct pci_func_desc desc;
+
+ bool reported_missing;
+
+ struct hv_pcibus *hbus;
+ struct task eject_task;
+
+ TAILQ_HEAD(, hv_irq_desc) irq_desc_list;
+
+ /*
+ * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
+ * read it back, for each of the BAR offsets within config space.
+ */
+ uint32_t probed_bar[MAX_NUM_BARS];
+};
+
+/*
+ * Tracks "Device Relations" messages from the host, which must be both
+ * processed in order.
+ */
+struct hv_dr_work {
+ struct task task;
+ struct hv_pcibus *bus;
+};
+
+struct hv_dr_state {
+ TAILQ_ENTRY(hv_dr_state) link;
+ uint32_t device_count;
+ struct pci_func_desc func[0];
+};
+
+struct hv_irq_desc {
+ TAILQ_ENTRY(hv_irq_desc) link;
+ struct tran_int_desc desc;
+ int irq;
+};
+
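+/* Linux-style devfn encoding: bits 7..3 hold the slot, bits 2..0 the function. */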
+#define PCI_DEVFN(slot, func) ((((slot) & 0x1f) << 3) | ((func) & 0x07))
+#define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f)
+#define PCI_FUNC(devfn) ((devfn) & 0x07)
+
+static uint32_t
+devfn_to_wslot(unsigned int devfn)
+{
+ union win_slot_encoding wslot;
+
+ wslot.val = 0;
+ wslot.bits.slot = PCI_SLOT(devfn);
+ wslot.bits.func = PCI_FUNC(devfn);
+
+ return (wslot.val);
+}
+
+static unsigned int
+wslot_to_devfn(uint32_t wslot)
+{
+ union win_slot_encoding encoding;
+ unsigned int slot;
+ unsigned int func;
+
+ encoding.val = wslot;
+
+ slot = encoding.bits.slot;
+ func = encoding.bits.func;
+
+ return (PCI_DEVFN(slot, func));
+}
+
+struct vmbus_pcib_softc {
+ struct vmbus_channel *chan;
+ void *rx_buf;
+
+ struct taskqueue *taskq;
+
+ struct hv_pcibus *hbus;
+};
+
+/* {44C4F61D-4444-4400-9D52-802E27EDE19F} */
+static const struct hyperv_guid g_pass_through_dev_type = {
+ .hv_guid = {0x1D, 0xF6, 0xC4, 0x44, 0x44, 0x44, 0x00, 0x44,
+ 0x9D, 0x52, 0x80, 0x2E, 0x27, 0xED, 0xE1, 0x9F}
+};
+
+struct hv_pci_compl {
+ struct completion host_event;
+ int32_t completion_status;
+};
+
+struct q_res_req_compl {
+ struct completion host_event;
+ struct hv_pci_dev *hpdev;
+};
+
+struct compose_comp_ctxt {
+ struct hv_pci_compl comp_pkt;
+ struct tran_int_desc int_desc;
+};
+
+/*
+ * It is possible the device is revoked during initialization.
+ * Check if this happens during wait.
+ * Return: 0 if response arrived, ENODEV if device revoked.
+ */
+static int
+wait_for_response(struct hv_pcibus *hbus, struct completion *c)
+{
+ do {
+ if (vmbus_chan_is_revoked(hbus->sc->chan)) {
+ device_printf(hbus->pcib,
+ "The device is revoked.\n");
+ return (ENODEV);
+ }
+	} while (wait_for_completion_timeout(c, hz / 10) != 0);
+
+	return (0);
+}
+
+static void
+hv_pci_generic_compl(void *context, struct pci_response *resp,
+ int resp_packet_size)
+{
+ struct hv_pci_compl *comp_pkt = context;
+
+ if (resp_packet_size >= sizeof(struct pci_response))
+ comp_pkt->completion_status = resp->status;
+ else
+ comp_pkt->completion_status = -1;
+
+ complete(&comp_pkt->host_event);
+}
+
+static void
+q_resource_requirements(void *context, struct pci_response *resp,
+ int resp_packet_size)
+{
+ struct q_res_req_compl *completion = context;
+ struct pci_q_res_req_response *q_res_req =
+ (struct pci_q_res_req_response *)resp;
+ int i;
+
+ if (resp->status < 0) {
+ printf("vmbus_pcib: failed to query resource requirements\n");
+ } else {
+ for (i = 0; i < MAX_NUM_BARS; i++)
+ completion->hpdev->probed_bar[i] =
+ q_res_req->probed_bar[i];
+ }
+
+ complete(&completion->host_event);
+}
+
+static void
+hv_pci_compose_compl(void *context, struct pci_response *resp,
+ int resp_packet_size)
+{
+ struct compose_comp_ctxt *comp_pkt = context;
+ struct pci_create_int_response *int_resp =
+ (struct pci_create_int_response *)resp;
+
+ comp_pkt->comp_pkt.completion_status = resp->status;
+ comp_pkt->int_desc = int_resp->int_desc;
+ complete(&comp_pkt->comp_pkt.host_event);
+}
+
+static void
+hv_int_desc_free(struct hv_pci_dev *hpdev, struct hv_irq_desc *hid)
+{
+ struct pci_delete_interrupt *int_pkt;
+ struct {
+ struct pci_packet pkt;
+ uint8_t buffer[sizeof(struct pci_delete_interrupt)];
+ } ctxt;
+
+ memset(&ctxt, 0, sizeof(ctxt));
+ int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
+ int_pkt->message_type.type = PCI_DELETE_INTERRUPT_MESSAGE;
+ int_pkt->wslot.val = hpdev->desc.wslot.val;
+ int_pkt->int_desc = hid->desc;
+
+ vmbus_chan_send(hpdev->hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
+ int_pkt, sizeof(*int_pkt), 0);
+
+ free(hid, M_DEVBUF);
+}
+
+static void
+hv_pci_delete_device(struct hv_pci_dev *hpdev)
+{
+ struct hv_pcibus *hbus = hpdev->hbus;
+ struct hv_irq_desc *hid, *tmp_hid;
+ device_t pci_dev;
+ int devfn;
+
+ devfn = wslot_to_devfn(hpdev->desc.wslot.val);
+
+ mtx_lock(&Giant);
+
+ pci_dev = pci_find_dbsf(hbus->pci_domain,
+ 0, PCI_SLOT(devfn), PCI_FUNC(devfn));
+ if (pci_dev)
+ device_delete_child(hbus->pci_bus, pci_dev);
+
+ mtx_unlock(&Giant);
+
+ mtx_lock(&hbus->device_list_lock);
+ TAILQ_REMOVE(&hbus->children, hpdev, link);
+ mtx_unlock(&hbus->device_list_lock);
+
+ TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid)
+ hv_int_desc_free(hpdev, hid);
+
+ free(hpdev, M_DEVBUF);
+}
+
+static struct hv_pci_dev *
+new_pcichild_device(struct hv_pcibus *hbus, struct pci_func_desc *desc)
+{
+ struct hv_pci_dev *hpdev;
+ struct pci_child_message *res_req;
+ struct q_res_req_compl comp_pkt;
+ struct {
+ struct pci_packet pkt;
+ uint8_t buffer[sizeof(struct pci_child_message)];
+ } ctxt;
+ int ret;
+
+ hpdev = malloc(sizeof(*hpdev), M_DEVBUF, M_WAITOK | M_ZERO);
+ hpdev->hbus = hbus;
+
+ TAILQ_INIT(&hpdev->irq_desc_list);
+
+ init_completion(&comp_pkt.host_event);
+ comp_pkt.hpdev = hpdev;
+
+ ctxt.pkt.compl_ctxt = &comp_pkt;
+ ctxt.pkt.completion_func = q_resource_requirements;
+
+ res_req = (struct pci_child_message *)&ctxt.pkt.message;
+ res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
+ res_req->wslot.val = desc->wslot.val;
+
+ ret = vmbus_chan_send(hbus->sc->chan,
+ VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+ res_req, sizeof(*res_req), (uint64_t)(uintptr_t)&ctxt.pkt);
+ if (ret)
+ goto err;
+
+ if (wait_for_response(hbus, &comp_pkt.host_event))
+ goto err;
+
+ free_completion(&comp_pkt.host_event);
+
+ hpdev->desc = *desc;
+
+ mtx_lock(&hbus->device_list_lock);
+ if (TAILQ_EMPTY(&hbus->children))
+ hbus->pci_domain = desc->ser & 0xFFFF;
+ TAILQ_INSERT_TAIL(&hbus->children, hpdev, link);
+ mtx_unlock(&hbus->device_list_lock);
+ return (hpdev);
+err:
+ free_completion(&comp_pkt.host_event);
+ free(hpdev, M_DEVBUF);
+ return (NULL);
+}
+
+#if __FreeBSD_version < 1100000
+
+/* Old versions don't have BUS_RESCAN(). Let's copy it from FreeBSD 11. */
+
+static struct pci_devinfo *
+pci_identify_function(device_t pcib, device_t dev, int domain, int busno,
+ int slot, int func, size_t dinfo_size)
+{
+ struct pci_devinfo *dinfo;
+
+ dinfo = pci_read_device(pcib, domain, busno, slot, func, dinfo_size);
+ if (dinfo != NULL)
+ pci_add_child(dev, dinfo);
+
+ return (dinfo);
+}
+
+static int
+pci_rescan(device_t dev)
+{
+#define REG(n, w) PCIB_READ_CONFIG(pcib, busno, s, f, n, w)
+ device_t pcib = device_get_parent(dev);
+ struct pci_softc *sc;
+ device_t child, *devlist, *unchanged;
+ int devcount, error, i, j, maxslots, oldcount;
+ int busno, domain, s, f, pcifunchigh;
+ uint8_t hdrtype;
+
+ /* No need to check for ARI on a rescan. */
+ error = device_get_children(dev, &devlist, &devcount);
+ if (error)
+ return (error);
+ if (devcount != 0) {
+ unchanged = malloc(devcount * sizeof(device_t), M_TEMP,
+ M_NOWAIT | M_ZERO);
+ if (unchanged == NULL) {
+ free(devlist, M_TEMP);
+ return (ENOMEM);
+ }
+ } else
+ unchanged = NULL;
+
+ sc = device_get_softc(dev);
+ domain = pcib_get_domain(dev);
+ busno = pcib_get_bus(dev);
+ maxslots = PCIB_MAXSLOTS(pcib);
+ for (s = 0; s <= maxslots; s++) {
+ /* If function 0 is not present, skip to the next slot. */
+ f = 0;
+ if (REG(PCIR_VENDOR, 2) == 0xffff)
+ continue;
+ pcifunchigh = 0;
+ hdrtype = REG(PCIR_HDRTYPE, 1);
+ if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE)
+ continue;
+ if (hdrtype & PCIM_MFDEV)
+ pcifunchigh = PCIB_MAXFUNCS(pcib);
+ for (f = 0; f <= pcifunchigh; f++) {
+ if (REG(PCIR_VENDOR, 2) == 0xffff)
+ continue;
+
+ /*
+ * Found a valid function. Check if a
+ * device_t for this device already exists.
+ */
+ for (i = 0; i < devcount; i++) {
+ child = devlist[i];
+ if (child == NULL)
+ continue;
+ if (pci_get_slot(child) == s &&
+ pci_get_function(child) == f) {
+ unchanged[i] = child;
+ goto next_func;
+ }
+ }
+
+ pci_identify_function(pcib, dev, domain, busno, s, f,
+ sizeof(struct pci_devinfo));
+ next_func:;
+ }
+ }
+
+ /* Remove devices that are no longer present. */
+ for (i = 0; i < devcount; i++) {
+ if (unchanged[i] != NULL)
+ continue;
+ device_delete_child(dev, devlist[i]);
+ }
+
+ free(devlist, M_TEMP);
+ oldcount = devcount;
+
+ /* Try to attach the devices just added. */
+ error = device_get_children(dev, &devlist, &devcount);
+ if (error) {
+ free(unchanged, M_TEMP);
+ return (error);
+ }
+
+ for (i = 0; i < devcount; i++) {
+ for (j = 0; j < oldcount; j++) {
+ if (devlist[i] == unchanged[j])
+ goto next_device;
+ }
+
+ device_probe_and_attach(devlist[i]);
+ next_device:;
+ }
+
+ free(unchanged, M_TEMP);
+ free(devlist, M_TEMP);
+ return (0);
+#undef REG
+}
+
+#else
+
+static int
+pci_rescan(device_t dev)
+{
+ return (BUS_RESCAN(dev));
+}
+
+#endif
+
+static void
+pci_devices_present_work(void *arg, int pending __unused)
+{
+ struct hv_dr_work *dr_wrk = arg;
+ struct hv_dr_state *dr = NULL;
+ struct hv_pcibus *hbus;
+ uint32_t child_no;
+ bool found;
+ struct pci_func_desc *new_desc;
+ struct hv_pci_dev *hpdev, *tmp_hpdev;
+ struct completion *query_comp;
+ bool need_rescan = false;
+
+ hbus = dr_wrk->bus;
+ free(dr_wrk, M_DEVBUF);
+
+ /* Pull this off the queue and process it if it was the last one. */
+ mtx_lock(&hbus->device_list_lock);
+ while (!TAILQ_EMPTY(&hbus->dr_list)) {
+ dr = TAILQ_FIRST(&hbus->dr_list);
+ TAILQ_REMOVE(&hbus->dr_list, dr, link);
+
+ /* Throw this away if the list still has stuff in it. */
+ if (!TAILQ_EMPTY(&hbus->dr_list)) {
+ free(dr, M_DEVBUF);
+ continue;
+ }
+ }
+ mtx_unlock(&hbus->device_list_lock);
+
+ if (!dr)
+ return;
+
+ /* First, mark all existing children as reported missing. */
+ mtx_lock(&hbus->device_list_lock);
+ TAILQ_FOREACH(hpdev, &hbus->children, link)
+ hpdev->reported_missing = true;
+ mtx_unlock(&hbus->device_list_lock);
+
+ /* Next, add back any reported devices. */
+ for (child_no = 0; child_no < dr->device_count; child_no++) {
+ found = false;
+ new_desc = &dr->func[child_no];
+
+ mtx_lock(&hbus->device_list_lock);
+ TAILQ_FOREACH(hpdev, &hbus->children, link) {
+ if ((hpdev->desc.wslot.val ==
+ new_desc->wslot.val) &&
+ (hpdev->desc.v_id == new_desc->v_id) &&
+ (hpdev->desc.d_id == new_desc->d_id) &&
+ (hpdev->desc.ser == new_desc->ser)) {
+ hpdev->reported_missing = false;
+ found = true;
+ break;
+ }
+ }
+ mtx_unlock(&hbus->device_list_lock);
+
+ if (!found) {
+ if (!need_rescan)
+ need_rescan = true;
+
+ hpdev = new_pcichild_device(hbus, new_desc);
+ if (!hpdev)
+ printf("vmbus_pcib: failed to add a child\n");
+ }
+ }
+
+ /* Remove missing device(s), if any */
+ TAILQ_FOREACH_SAFE(hpdev, &hbus->children, link, tmp_hpdev) {
+ if (hpdev->reported_missing)
+ hv_pci_delete_device(hpdev);
+ }
+
+ /* Rescan the bus to find any new device, if necessary. */
+ if (hbus->state == hv_pcibus_installed && need_rescan)
+ pci_rescan(hbus->pci_bus);
+
+ /* Wake up hv_pci_query_relations(), if it's waiting. */
+ query_comp = hbus->query_comp;
+ if (query_comp) {
+ hbus->query_comp = NULL;
+ complete(query_comp);
+ }
+
+ free(dr, M_DEVBUF);
+}
+
+static struct hv_pci_dev *
+get_pcichild_wslot(struct hv_pcibus *hbus, uint32_t wslot)
+{
+ struct hv_pci_dev *hpdev, *ret = NULL;
+
+ mtx_lock(&hbus->device_list_lock);
+ TAILQ_FOREACH(hpdev, &hbus->children, link) {
+ if (hpdev->desc.wslot.val == wslot) {
+ ret = hpdev;
+ break;
+ }
+ }
+ mtx_unlock(&hbus->device_list_lock);
+
+ return (ret);
+}
+
+static void
+hv_pci_devices_present(struct hv_pcibus *hbus,
+ struct pci_bus_relations *relations)
+{
+ struct hv_dr_state *dr;
+ struct hv_dr_work *dr_wrk;
+ unsigned long dr_size;
+
+ if (hbus->detaching && relations->device_count > 0)
+ return;
+
+ dr_size = offsetof(struct hv_dr_state, func) +
+ (sizeof(struct pci_func_desc) * relations->device_count);
+ dr = malloc(dr_size, M_DEVBUF, M_WAITOK | M_ZERO);
+
+ dr->device_count = relations->device_count;
+ if (dr->device_count != 0)
+ memcpy(dr->func, relations->func,
+ sizeof(struct pci_func_desc) * dr->device_count);
+
+ mtx_lock(&hbus->device_list_lock);
+ TAILQ_INSERT_TAIL(&hbus->dr_list, dr, link);
+ mtx_unlock(&hbus->device_list_lock);
+
+ dr_wrk = malloc(sizeof(*dr_wrk), M_DEVBUF, M_WAITOK | M_ZERO);
+ dr_wrk->bus = hbus;
+ TASK_INIT(&dr_wrk->task, 0, pci_devices_present_work, dr_wrk);
+ taskqueue_enqueue(hbus->sc->taskq, &dr_wrk->task);
+}
+
+static void
+hv_eject_device_work(void *arg, int pending __unused)
+{
+ struct hv_pci_dev *hpdev = arg;
+ union win_slot_encoding wslot = hpdev->desc.wslot;
+ struct hv_pcibus *hbus = hpdev->hbus;
+ struct pci_eject_response *eject_pkt;
+ struct {
+ struct pci_packet pkt;
+ uint8_t buffer[sizeof(struct pci_eject_response)];
+ } ctxt;
+
+ hv_pci_delete_device(hpdev);
+
+ memset(&ctxt, 0, sizeof(ctxt));
+ eject_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
+ eject_pkt->message_type.type = PCI_EJECTION_COMPLETE;
+ eject_pkt->wslot.val = wslot.val;
+ vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
+ eject_pkt, sizeof(*eject_pkt), 0);
+}
+
+static void
+hv_pci_eject_device(struct hv_pci_dev *hpdev)
+{
+ struct hv_pcibus *hbus = hpdev->hbus;
+ struct taskqueue *taskq;
+
+ if (hbus->detaching)
+ return;
+
+ /*
+ * Push this task into the same taskqueue on which
+ * vmbus_pcib_attach() runs, so we're sure this task can't run
+ * concurrently with vmbus_pcib_attach().
+ */
+ TASK_INIT(&hpdev->eject_task, 0, hv_eject_device_work, hpdev);
+ taskq = vmbus_chan_mgmt_tq(hbus->sc->chan);
+ taskqueue_enqueue(taskq, &hpdev->eject_task);
+}
+
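+/*
+ * Initial size of the channel receive buffer; the callback below grows
+ * it on demand when the host sends a larger packet (ENOBUFS).
+ */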
+#define PCIB_PACKET_SIZE 0x100
+
+static void
+vmbus_pcib_on_channel_callback(struct vmbus_channel *chan, void *arg)
+{
+ struct vmbus_pcib_softc *sc = arg;
+ struct hv_pcibus *hbus = sc->hbus;
+
+ void *buffer;
+ int bufferlen = PCIB_PACKET_SIZE;
+
+ struct pci_packet *comp_packet;
+ struct pci_response *response;
+ struct pci_incoming_message *new_msg;
+ struct pci_bus_relations *bus_rel;
+ struct pci_dev_incoming *dev_msg;
+ struct hv_pci_dev *hpdev;
+
+ buffer = sc->rx_buf;
+ do {
+ struct vmbus_chanpkt_hdr *pkt = buffer;
+ uint32_t bytes_rxed;
+ int ret;
+
+ bytes_rxed = bufferlen;
+ ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed);
+
+ if (ret == ENOBUFS) {
+ /* Handle large packet */
+ if (bufferlen > PCIB_PACKET_SIZE) {
+ free(buffer, M_DEVBUF);
+ buffer = NULL;
+ }
+
+ /* alloc new buffer */
+ buffer = malloc(bytes_rxed, M_DEVBUF, M_WAITOK | M_ZERO);
+ bufferlen = bytes_rxed;
+
+ continue;
+ }
+
+ if (ret != 0) {
+ /* ignore EIO or EAGAIN */
+ break;
+ }
+
+ if (bytes_rxed <= sizeof(struct pci_response))
+ continue;
+
+ switch (pkt->cph_type) {
+ case VMBUS_CHANPKT_TYPE_COMP:
+ comp_packet =
+ (struct pci_packet *)(uintptr_t)pkt->cph_xactid;
+ response = (struct pci_response *)pkt;
+ comp_packet->completion_func(comp_packet->compl_ctxt,
+ response, bytes_rxed);
+ break;
+ case VMBUS_CHANPKT_TYPE_INBAND:
+ new_msg = (struct pci_incoming_message *)buffer;
+
+ switch (new_msg->message_type.type) {
+ case PCI_BUS_RELATIONS:
+ bus_rel = (struct pci_bus_relations *)buffer;
+
+ if (bus_rel->device_count == 0)
+ break;
+
+ if (bytes_rxed <
+ offsetof(struct pci_bus_relations, func) +
+ (sizeof(struct pci_func_desc) *
+ (bus_rel->device_count)))
+ break;
+
+ hv_pci_devices_present(hbus, bus_rel);
+ break;
+
+ case PCI_EJECT:
+ dev_msg = (struct pci_dev_incoming *)buffer;
+ hpdev = get_pcichild_wslot(hbus,
+ dev_msg->wslot.val);
+
+ if (hpdev)
+ hv_pci_eject_device(hpdev);
+
+ break;
+ default:
+ printf("vmbus_pcib: Unknown msg type 0x%x\n",
+ new_msg->message_type.type);
+ break;
+ }
+ break;
+ default:
+ printf("vmbus_pcib: Unknown VMBus msg type %hd\n",
+ pkt->cph_type);
+ break;
+ }
+ } while (1);
+
+ if (bufferlen > PCIB_PACKET_SIZE)
+ free(buffer, M_DEVBUF);
+}
+
+static int
+hv_pci_protocol_negotiation(struct hv_pcibus *hbus)
+{
+ struct pci_version_request *version_req;
+ struct hv_pci_compl comp_pkt;
+ struct {
+ struct pci_packet pkt;
+ uint8_t buffer[sizeof(struct pci_version_request)];
+ } ctxt;
+ int ret;
+
+ init_completion(&comp_pkt.host_event);
+
+ ctxt.pkt.completion_func = hv_pci_generic_compl;
+ ctxt.pkt.compl_ctxt = &comp_pkt;
+ version_req = (struct pci_version_request *)&ctxt.pkt.message;
+ version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
+ version_req->protocol_version = PCI_PROTOCOL_VERSION_CURRENT;
+ version_req->is_last_attempt = 1;
+
+ ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND,
+ VMBUS_CHANPKT_FLAG_RC, version_req, sizeof(*version_req),
+ (uint64_t)(uintptr_t)&ctxt.pkt);
+ if (!ret)
+ ret = wait_for_response(hbus, &comp_pkt.host_event);
+
+ if (ret) {
+ device_printf(hbus->pcib,
+ "vmbus_pcib failed to request version: %d\n",
+ ret);
+ goto out;
+ }
+
+ if (comp_pkt.completion_status < 0) {
+ device_printf(hbus->pcib,
+ "vmbus_pcib version negotiation failed: %x\n",
+ comp_pkt.completion_status);
+ ret = EPROTO;
+ } else {
+ ret = 0;
+ }
+out:
+ free_completion(&comp_pkt.host_event);
+ return (ret);
+}
+
+/* Ask the host to send along the list of child devices */
+static int
+hv_pci_query_relations(struct hv_pcibus *hbus)
+{
+ struct pci_message message;
+ int ret;
+
+ message.type = PCI_QUERY_BUS_RELATIONS;
+ ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
+ &message, sizeof(message), 0);
+ return (ret);
+}
+
+static int
+hv_pci_enter_d0(struct hv_pcibus *hbus)
+{
+ struct pci_bus_d0_entry *d0_entry;
+ struct hv_pci_compl comp_pkt;
+ struct {
+ struct pci_packet pkt;
+ uint8_t buffer[sizeof(struct pci_bus_d0_entry)];
+ } ctxt;
+ int ret;
+
+ /*
+ * Tell the host that the bus is ready to use, and moved into the
+ * powered-on state. This includes telling the host which region
+ * of memory-mapped I/O space has been chosen for configuration space
+ * access.
+ */
+ init_completion(&comp_pkt.host_event);
+
+ ctxt.pkt.completion_func = hv_pci_generic_compl;
+ ctxt.pkt.compl_ctxt = &comp_pkt;
+
+ d0_entry = (struct pci_bus_d0_entry *)&ctxt.pkt.message;
+ memset(d0_entry, 0, sizeof(*d0_entry));
+ d0_entry->message_type.type = PCI_BUS_D0ENTRY;
+ d0_entry->mmio_base = rman_get_start(hbus->cfg_res);
+
+ ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND,
+ VMBUS_CHANPKT_FLAG_RC, d0_entry, sizeof(*d0_entry),
+ (uint64_t)(uintptr_t)&ctxt.pkt);
+ if (!ret)
+ ret = wait_for_response(hbus, &comp_pkt.host_event);
+
+ if (ret)
+ goto out;
+
+ if (comp_pkt.completion_status < 0) {
+ device_printf(hbus->pcib, "vmbus_pcib failed to enable D0\n");
+ ret = EPROTO;
+ } else {
+ ret = 0;
+ }
+
+out:
+ free_completion(&comp_pkt.host_event);
+ return (ret);
+}
+
+/*
+ * It looks like this is only needed by Windows VMs, but send the message
+ * anyway just to keep the host happy.
+ */
+static int
+hv_send_resources_allocated(struct hv_pcibus *hbus)
+{
+ struct pci_resources_assigned *res_assigned;
+ struct hv_pci_compl comp_pkt;
+ struct hv_pci_dev *hpdev;
+ struct pci_packet *pkt;
+ uint32_t wslot;
+ int ret = 0;
+
+ pkt = malloc(sizeof(*pkt) + sizeof(*res_assigned),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+
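+	/* Walk all possible wslot encodings (5-bit slot + 3-bit function). */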
+ for (wslot = 0; wslot < 256; wslot++) {
+ hpdev = get_pcichild_wslot(hbus, wslot);
+ if (!hpdev)
+ continue;
+
+ init_completion(&comp_pkt.host_event);
+
+ memset(pkt, 0, sizeof(*pkt) + sizeof(*res_assigned));
+ pkt->completion_func = hv_pci_generic_compl;
+ pkt->compl_ctxt = &comp_pkt;
+
+ res_assigned = (struct pci_resources_assigned *)&pkt->message;
+ res_assigned->message_type.type = PCI_RESOURCES_ASSIGNED;
+ res_assigned->wslot.val = hpdev->desc.wslot.val;
+
+ ret = vmbus_chan_send(hbus->sc->chan,
+ VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+ &pkt->message, sizeof(*res_assigned),
+ (uint64_t)(uintptr_t)pkt);
+ if (!ret)
+ ret = wait_for_response(hbus, &comp_pkt.host_event);
+
+ free_completion(&comp_pkt.host_event);
+
+ if (ret)
+ break;
+
+ if (comp_pkt.completion_status < 0) {
+ ret = EPROTO;
+ device_printf(hbus->pcib,
+ "failed to send PCI_RESOURCES_ASSIGNED\n");
+ break;
+ }
+ }
+
+ free(pkt, M_DEVBUF);
+ return (ret);
+}
+
+static int
+hv_send_resources_released(struct hv_pcibus *hbus)
+{
+ struct pci_child_message pkt;
+ struct hv_pci_dev *hpdev;
+ uint32_t wslot;
+ int ret;
+
+ for (wslot = 0; wslot < 256; wslot++) {
+ hpdev = get_pcichild_wslot(hbus, wslot);
+ if (!hpdev)
+ continue;
+
+ pkt.message_type.type = PCI_RESOURCES_RELEASED;
+ pkt.wslot.val = hpdev->desc.wslot.val;
+
+ ret = vmbus_chan_send(hbus->sc->chan,
+ VMBUS_CHANPKT_TYPE_INBAND, 0, &pkt, sizeof(pkt), 0);
+ if (ret)
+ return (ret);
+ }
+
+ return (0);
+}
+
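+/*
+ * Generate hv_cfg_read_{1,2,4}() and hv_cfg_write_{1,2,4}() accessors
+ * for the memory-mapped config window.
+ */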
+#define hv_cfg_read(x, s) \
+static inline uint##x##_t hv_cfg_read_##s(struct hv_pcibus *bus, \
+ bus_size_t offset) \
+{ \
+ return (bus_read_##s(bus->cfg_res, offset)); \
+}
+
+#define hv_cfg_write(x, s) \
+static inline void hv_cfg_write_##s(struct hv_pcibus *bus, \
+ bus_size_t offset, uint##x##_t val) \
+{ \
+ return (bus_write_##s(bus->cfg_res, offset, val)); \
+}
+
+hv_cfg_read(8, 1)
+hv_cfg_read(16, 2)
+hv_cfg_read(32, 4)
+
+hv_cfg_write(8, 1)
+hv_cfg_write(16, 2)
+hv_cfg_write(32, 4)
+
+static void
+_hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, int size,
+ uint32_t *val)
+{
+ struct hv_pcibus *hbus = hpdev->hbus;
+ bus_size_t addr = CFG_PAGE_OFFSET + where;
+
+ /*
+ * If the attempt is to read the IDs or the ROM BAR, simulate that.
+ */
+ if (where + size <= PCIR_COMMAND) {
+ memcpy(val, ((uint8_t *)&hpdev->desc.v_id) + where, size);
+ } else if (where >= PCIR_REVID && where + size <=
+ PCIR_CACHELNSZ) {
+ memcpy(val, ((uint8_t *)&hpdev->desc.rev) + where -
+ PCIR_REVID, size);
+ } else if (where >= PCIR_SUBVEND_0 && where + size <=
+ PCIR_BIOS) {
+ memcpy(val, (uint8_t *)&hpdev->desc.subsystem_id + where -
+ PCIR_SUBVEND_0, size);
+ } else if (where >= PCIR_BIOS && where + size <=
+ PCIR_CAP_PTR) {
+ /* ROM BARs are unimplemented */
+ *val = 0;
+ } else if ((where >= PCIR_INTLINE && where + size <=
+	    PCIR_INTPIN) || (where == PCIR_INTPIN && size == 1)) {
+ /*
+ * Interrupt Line and Interrupt PIN are hard-wired to zero
+ * because this front-end only supports message-signaled
+ * interrupts.
+ */
+ *val = 0;
+ } else if (where + size <= CFG_PAGE_SIZE) {
+ mtx_lock(&hbus->config_lock);
+
+ /* Choose the function to be read. */
+ hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val);
+
+		/* Make sure the function was chosen before we start reading. */
+ mb();
+
+ /* Read from that function's config space. */
+ switch (size) {
+ case 1:
+ *((uint8_t *)val) = hv_cfg_read_1(hbus, addr);
+ break;
+ case 2:
+ *((uint16_t *)val) = hv_cfg_read_2(hbus, addr);
+ break;
+ default:
+ *((uint32_t *)val) = hv_cfg_read_4(hbus, addr);
+ break;
+ }
+ /*
+		 * Make sure the read was done before we release the lock,
+ * allowing consecutive reads/writes.
+ */
+ mb();
+
+ mtx_unlock(&hbus->config_lock);
+ } else {
+ /* Invalid config read: it's unlikely to reach here. */
+ memset(val, 0, size);
+ }
+}
+
+static void
+_hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, int size,
+ uint32_t val)
+{
+ struct hv_pcibus *hbus = hpdev->hbus;
+ bus_size_t addr = CFG_PAGE_OFFSET + where;
+
+ /* SSIDs and ROM BARs are read-only */
+ if (where >= PCIR_SUBVEND_0 && where + size <= PCIR_CAP_PTR)
+ return;
+
+ if (where >= PCIR_COMMAND && where + size <= CFG_PAGE_SIZE) {
+ mtx_lock(&hbus->config_lock);
+
+ /* Choose the function to be written. */
+ hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val);
+
+		/* Make sure the function was chosen before we start writing. */
+ wmb();
+
+ /* Write to that function's config space. */
+ switch (size) {
+ case 1:
+ hv_cfg_write_1(hbus, addr, (uint8_t)val);
+ break;
+ case 2:
+ hv_cfg_write_2(hbus, addr, (uint16_t)val);
+ break;
+ default:
+ hv_cfg_write_4(hbus, addr, (uint32_t)val);
+ break;
+ }
+
+ /*
+ * Make sure the write was done before we release the lock,
+ * allowing consecutive reads/writes.
+ */
+ mb();
+
+ mtx_unlock(&hbus->config_lock);
+ } else {
+ /* Invalid config write: it's unlikely to reach here. */
+ return;
+ }
+}
+
+/*
+ * The vPCI in some Hyper-V releases does not initialize the low 4
+ * bits of BAR registers.  This can cause weird problems that make the
+ * PCI code fail to configure the BARs correctly.
+ *
+ * Just write all 1's to those BARs whose probed values are not zero.
+ * This seems to make the Hyper-V vPCI and pci_write_bar() cooperate
+ * correctly.
+ */
+
+static void
+vmbus_pcib_prepopulate_bars(struct hv_pcibus *hbus)
+{
+ struct hv_pci_dev *hpdev;
+ int i;
+
+ mtx_lock(&hbus->device_list_lock);
+ TAILQ_FOREACH(hpdev, &hbus->children, link) {
+ for (i = 0; i < 6; i++) {
+ /* Ignore empty bar */
+ if (hpdev->probed_bar[i] == 0)
+ continue;
+
+ uint32_t bar_val = 0;
+
+ _hv_pcifront_read_config(hpdev, PCIR_BAR(i),
+ 4, &bar_val);
+
+ if (hpdev->probed_bar[i] != bar_val) {
+ if (bootverbose)
+ printf("vmbus_pcib: initialize bar %d "
+ "by writing all 1s\n", i);
+
+ _hv_pcifront_write_config(hpdev, PCIR_BAR(i),
+ 4, 0xffffffff);
+ }
+ }
+ }
+ mtx_unlock(&hbus->device_list_lock);
+}
+
+static void
+vmbus_pcib_set_detaching(void *arg, int pending __unused)
+{
+ struct hv_pcibus *hbus = arg;
+
+ atomic_set_int(&hbus->detaching, 1);
+}
+
+static void
+vmbus_pcib_pre_detach(struct hv_pcibus *hbus)
+{
+ struct task task;
+
+ TASK_INIT(&task, 0, vmbus_pcib_set_detaching, hbus);
+
+ /*
+ * Make sure the channel callback won't push any possible new
+ * PCI_BUS_RELATIONS and PCI_EJECT tasks to sc->taskq.
+ */
+ vmbus_chan_run_task(hbus->sc->chan, &task);
+
+ taskqueue_drain_all(hbus->sc->taskq);
+}
+
+/*
+ * Standard probe entry point.
+ */
+static int
+vmbus_pcib_probe(device_t dev)
+{
+ if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
+ &g_pass_through_dev_type) == 0) {
+ device_set_desc(dev, "Hyper-V PCI Express Pass Through");
+ return (BUS_PROBE_DEFAULT);
+ }
+ return (ENXIO);
+}
+
+/*
+ * Standard attach entry point.
+ */
+static int
+vmbus_pcib_attach(device_t dev)
+{
+ const int pci_ring_size = (4 * PAGE_SIZE);
+ const struct hyperv_guid *inst_guid;
+ struct vmbus_channel *channel;
+ struct vmbus_pcib_softc *sc;
+ struct hv_pcibus *hbus;
+ int rid = 0;
+ int ret;
+
+ hbus = malloc(sizeof(*hbus), M_DEVBUF, M_WAITOK | M_ZERO);
+ hbus->pcib = dev;
+
+ channel = vmbus_get_channel(dev);
+ inst_guid = vmbus_chan_guid_inst(channel);
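+	/*
+	 * Use two bytes of the instance GUID as the initial PCI domain;
+	 * it is replaced by the first child's serial number when the first
+	 * child device is added (see new_pcichild_device()).
+	 */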
+ hbus->pci_domain = inst_guid->hv_guid[9] |
+ (inst_guid->hv_guid[8] << 8);
+
+ mtx_init(&hbus->config_lock, "hbcfg", NULL, MTX_DEF);
+ mtx_init(&hbus->device_list_lock, "hbdl", NULL, MTX_DEF);
+ TAILQ_INIT(&hbus->children);
+ TAILQ_INIT(&hbus->dr_list);
+
+ hbus->cfg_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid,
+ 0, RM_MAX_END, PCI_CONFIG_MMIO_LENGTH,
+ RF_ACTIVE | rman_make_alignment_flags(PAGE_SIZE));
+
+ if (!hbus->cfg_res) {
+ device_printf(dev, "failed to get resource for cfg window\n");
+ ret = ENXIO;
+ goto free_bus;
+ }
+
+ sc = device_get_softc(dev);
+ sc->chan = channel;
+ sc->rx_buf = malloc(PCIB_PACKET_SIZE, M_DEVBUF, M_WAITOK | M_ZERO);
+ sc->hbus = hbus;
+
+ /*
+ * The taskq is used to handle PCI_BUS_RELATIONS and PCI_EJECT
+ * messages. NB: we can't handle the messages in the channel callback
+ * directly, because the message handlers need to send new messages
+	 * to the host and wait for the host's completion messages, which
+ * must also be handled by the channel callback.
+ */
+ sc->taskq = taskqueue_create("vmbus_pcib_tq", M_WAITOK,
+ taskqueue_thread_enqueue, &sc->taskq);
+ taskqueue_start_threads(&sc->taskq, 1, PI_NET, "vmbus_pcib_tq");
+
+ hbus->sc = sc;
+
+ init_completion(&hbus->query_completion);
+ hbus->query_comp = &hbus->query_completion;
+
+ ret = vmbus_chan_open(sc->chan, pci_ring_size, pci_ring_size,
+ NULL, 0, vmbus_pcib_on_channel_callback, sc);
+ if (ret)
+ goto free_res;
+
+ ret = hv_pci_protocol_negotiation(hbus);
+ if (ret)
+ goto vmbus_close;
+
+ ret = hv_pci_query_relations(hbus);
+ if (!ret)
+ ret = wait_for_response(hbus, hbus->query_comp);
+
+ if (ret)
+ goto vmbus_close;
+
+ ret = hv_pci_enter_d0(hbus);
+ if (ret)
+ goto vmbus_close;
+
+ ret = hv_send_resources_allocated(hbus);
+ if (ret)
+ goto vmbus_close;
+
+ vmbus_pcib_prepopulate_bars(hbus);
+
+ hbus->pci_bus = device_add_child(dev, "pci", -1);
+ if (!hbus->pci_bus) {
+ device_printf(dev, "failed to create pci bus\n");
+ ret = ENXIO;
+ goto vmbus_close;
+ }
+
+ bus_generic_attach(dev);
+
+ hbus->state = hv_pcibus_installed;
+
+ return (0);
+
+vmbus_close:
+ vmbus_pcib_pre_detach(hbus);
+ vmbus_chan_close(sc->chan);
+free_res:
+ taskqueue_free(sc->taskq);
+ free_completion(&hbus->query_completion);
+ free(sc->rx_buf, M_DEVBUF);
+ bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res);
+free_bus:
+ mtx_destroy(&hbus->device_list_lock);
+ mtx_destroy(&hbus->config_lock);
+ free(hbus, M_DEVBUF);
+ return (ret);
+}
+
+/*
+ * Standard detach entry point
+ */
+static int
+vmbus_pcib_detach(device_t dev)
+{
+ struct vmbus_pcib_softc *sc = device_get_softc(dev);
+ struct hv_pcibus *hbus = sc->hbus;
+ struct pci_message teardown_packet;
+ struct pci_bus_relations relations;
+ int ret;
+
+ vmbus_pcib_pre_detach(hbus);
+
+ if (hbus->state == hv_pcibus_installed)
+ bus_generic_detach(dev);
+
+ /* Delete any children which might still exist. */
+ memset(&relations, 0, sizeof(relations));
+ hv_pci_devices_present(hbus, &relations);
+
+ ret = hv_send_resources_released(hbus);
+ if (ret)
+ device_printf(dev, "failed to send PCI_RESOURCES_RELEASED\n");
+
+ teardown_packet.type = PCI_BUS_D0EXIT;
+ ret = vmbus_chan_send(sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
+ &teardown_packet, sizeof(struct pci_message), 0);
+ if (ret)
+ device_printf(dev, "failed to send PCI_BUS_D0EXIT\n");
+
+ taskqueue_drain_all(hbus->sc->taskq);
+ vmbus_chan_close(sc->chan);
+ taskqueue_free(sc->taskq);
+
+ free_completion(&hbus->query_completion);
+ free(sc->rx_buf, M_DEVBUF);
+ bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res);
+
+ mtx_destroy(&hbus->device_list_lock);
+ mtx_destroy(&hbus->config_lock);
+ free(hbus, M_DEVBUF);
+
+ return (0);
+}
+
+static int
+vmbus_pcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *val)
+{
+ struct vmbus_pcib_softc *sc = device_get_softc(dev);
+
+ switch (which) {
+ case PCIB_IVAR_DOMAIN:
+ *val = sc->hbus->pci_domain;
+ return (0);
+
+ case PCIB_IVAR_BUS:
+ /* There is only bus 0. */
+ *val = 0;
+ return (0);
+ }
+ return (ENOENT);
+}
+
+static int
+vmbus_pcib_write_ivar(device_t dev, device_t child, int which, uintptr_t val)
+{
+ return (ENOENT);
+}
+
+static struct resource *
+vmbus_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
+ rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
+{
+ unsigned int bar_no;
+ struct hv_pci_dev *hpdev;
+ struct vmbus_pcib_softc *sc = device_get_softc(dev);
+ struct resource *res;
+ unsigned int devfn;
+
+ if (type == PCI_RES_BUS)
+ return (pci_domain_alloc_bus(sc->hbus->pci_domain, child, rid,
+ start, end, count, flags));
+
+ /* Devices with port I/O BAR are not supported. */
+ if (type == SYS_RES_IOPORT)
+ return (NULL);
+
+ if (type == SYS_RES_MEMORY) {
+ devfn = PCI_DEVFN(pci_get_slot(child),
+ pci_get_function(child));
+ hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
+ if (!hpdev)
+ return (NULL);
+
+ bar_no = PCI_RID2BAR(*rid);
+ if (bar_no >= MAX_NUM_BARS)
+ return (NULL);
+
+ /* Make sure a 32-bit BAR gets a 32-bit address */
+ if (!(hpdev->probed_bar[bar_no] & PCIM_BAR_MEM_64))
+ end = ulmin(end, 0xFFFFFFFF);
+ }
+
+ res = bus_generic_alloc_resource(dev, child, type, rid,
+ start, end, count, flags);
+ /*
+ * If this is a request for a specific range, assume it is
+ * correct and pass it up to the parent.
+ */
+ if (res == NULL && start + count - 1 == end)
+ res = bus_generic_alloc_resource(dev, child, type, rid,
+ start, end, count, flags);
+ return (res);
+}
+
+static int
+vmbus_pcib_release_resource(device_t dev, device_t child, int type, int rid,
+ struct resource *r)
+{
+ struct vmbus_pcib_softc *sc = device_get_softc(dev);
+
+ if (type == PCI_RES_BUS)
+ return (pci_domain_release_bus(sc->hbus->pci_domain, child,
+ rid, r));
+
+ if (type == SYS_RES_IOPORT)
+ return (EINVAL);
+
+ return (bus_generic_release_resource(dev, child, type, rid, r));
+}
+
+#if __FreeBSD_version >= 1100000
+static int
+vmbus_pcib_get_cpus(device_t pcib, device_t dev, enum cpu_sets op,
+ size_t setsize, cpuset_t *cpuset)
+{
+ return (bus_get_cpus(pcib, op, setsize, cpuset));
+}
+#endif
+
+static uint32_t
+vmbus_pcib_read_config(device_t dev, u_int bus, u_int slot, u_int func,
+ u_int reg, int bytes)
+{
+ struct vmbus_pcib_softc *sc = device_get_softc(dev);
+ struct hv_pci_dev *hpdev;
+ unsigned int devfn = PCI_DEVFN(slot, func);
+ uint32_t data = 0;
+
+ KASSERT(bus == 0, ("bus should be 0, but is %u", bus));
+
+ hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
+ if (!hpdev)
+ return (~0);
+
+ _hv_pcifront_read_config(hpdev, reg, bytes, &data);
+
+ return (data);
+}
+
+static void
+vmbus_pcib_write_config(device_t dev, u_int bus, u_int slot, u_int func,
+ u_int reg, uint32_t data, int bytes)
+{
+ struct vmbus_pcib_softc *sc = device_get_softc(dev);
+ struct hv_pci_dev *hpdev;
+ unsigned int devfn = PCI_DEVFN(slot, func);
+
+ KASSERT(bus == 0, ("bus should be 0, but is %u", bus));
+
+ hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
+ if (!hpdev)
+ return;
+
+ _hv_pcifront_write_config(hpdev, reg, bytes, data);
+}
+
+static int
+vmbus_pcib_route_intr(device_t pcib, device_t dev, int pin)
+{
+	/* We only support MSI/MSI-X and don't support INTx interrupts. */
+ return (PCI_INVALID_IRQ);
+}
+
+static int
+vmbus_pcib_alloc_msi(device_t pcib, device_t dev, int count,
+ int maxcount, int *irqs)
+{
+ return (PCIB_ALLOC_MSI(device_get_parent(pcib), dev, count, maxcount,
+ irqs));
+}
+
+static int
+vmbus_pcib_release_msi(device_t pcib, device_t dev, int count, int *irqs)
+{
+ return (PCIB_RELEASE_MSI(device_get_parent(pcib), dev, count, irqs));
+}
+
+static int
+vmbus_pcib_alloc_msix(device_t pcib, device_t dev, int *irq)
+{
+ return (PCIB_ALLOC_MSIX(device_get_parent(pcib), dev, irq));
+}
+
+static int
+vmbus_pcib_release_msix(device_t pcib, device_t dev, int irq)
+{
+ return (PCIB_RELEASE_MSIX(device_get_parent(pcib), dev, irq));
+}
+
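+/*
+ * x86 MSI format: bits 12..19 of the address hold the destination APIC ID,
+ * and the low byte of the data holds the interrupt vector.
+ */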
+#define MSI_INTEL_ADDR_DEST 0x000ff000
+#define MSI_INTEL_DATA_INTVEC IOART_INTVEC /* Interrupt vector. */
+#define MSI_INTEL_DATA_DELFIXED IOART_DELFIXED
+
+static int
+vmbus_pcib_map_msi(device_t pcib, device_t child, int irq,
+ uint64_t *addr, uint32_t *data)
+{
+ unsigned int devfn;
+ struct hv_pci_dev *hpdev;
+
+ uint64_t v_addr;
+ uint32_t v_data;
+ struct hv_irq_desc *hid, *tmp_hid;
+ unsigned int cpu, vcpu_id;
+ unsigned int vector;
+
+ struct vmbus_pcib_softc *sc = device_get_softc(pcib);
+ struct pci_create_interrupt *int_pkt;
+ struct compose_comp_ctxt comp;
+ struct {
+ struct pci_packet pkt;
+ uint8_t buffer[sizeof(struct pci_create_interrupt)];
+ } ctxt;
+
+ int ret;
+
+ devfn = PCI_DEVFN(pci_get_slot(child), pci_get_function(child));
+ hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
+ if (!hpdev)
+ return (ENOENT);
+
+ ret = PCIB_MAP_MSI(device_get_parent(pcib), child, irq,
+ &v_addr, &v_data);
+ if (ret)
+ return (ret);
+
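+	/* Free any interrupt descriptor previously created for this IRQ. */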
+ TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid) {
+ if (hid->irq == irq) {
+ TAILQ_REMOVE(&hpdev->irq_desc_list, hid, link);
+ hv_int_desc_free(hpdev, hid);
+ break;
+ }
+ }
+
+ cpu = (v_addr & MSI_INTEL_ADDR_DEST) >> 12;
+ vcpu_id = VMBUS_GET_VCPU_ID(device_get_parent(pcib), pcib, cpu);
+ vector = v_data & MSI_INTEL_DATA_INTVEC;
+
+ init_completion(&comp.comp_pkt.host_event);
+
+ memset(&ctxt, 0, sizeof(ctxt));
+ ctxt.pkt.completion_func = hv_pci_compose_compl;
+ ctxt.pkt.compl_ctxt = &comp;
+
+ int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message;
+ int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
+ int_pkt->wslot.val = hpdev->desc.wslot.val;
+ int_pkt->int_desc.vector = vector;
+ int_pkt->int_desc.vector_count = 1;
+ int_pkt->int_desc.delivery_mode = MSI_INTEL_DATA_DELFIXED;
+ int_pkt->int_desc.cpu_mask = 1ULL << vcpu_id;
+
+ ret = vmbus_chan_send(sc->chan, VMBUS_CHANPKT_TYPE_INBAND,
+ VMBUS_CHANPKT_FLAG_RC, int_pkt, sizeof(*int_pkt),
+ (uint64_t)(uintptr_t)&ctxt.pkt);
+ if (ret) {
+ free_completion(&comp.comp_pkt.host_event);
+ return (ret);
+ }
+
+ wait_for_completion(&comp.comp_pkt.host_event);
+ free_completion(&comp.comp_pkt.host_event);
+
+ if (comp.comp_pkt.completion_status < 0)
+ return (EPROTO);
+
+ *addr = comp.int_desc.address;
+ *data = comp.int_desc.data;
+
+ hid = malloc(sizeof(struct hv_irq_desc), M_DEVBUF, M_WAITOK | M_ZERO);
+ hid->irq = irq;
+ hid->desc = comp.int_desc;
+ TAILQ_INSERT_TAIL(&hpdev->irq_desc_list, hid, link);
+
+ return (0);
+}
+
+static device_method_t vmbus_pcib_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, vmbus_pcib_probe),
+ DEVMETHOD(device_attach, vmbus_pcib_attach),
+ DEVMETHOD(device_detach, vmbus_pcib_detach),
+ DEVMETHOD(device_shutdown, bus_generic_shutdown),
+ DEVMETHOD(device_suspend, bus_generic_suspend),
+ DEVMETHOD(device_resume, bus_generic_resume),
+
+ /* Bus interface */
+ DEVMETHOD(bus_read_ivar, vmbus_pcib_read_ivar),
+ DEVMETHOD(bus_write_ivar, vmbus_pcib_write_ivar),
+ DEVMETHOD(bus_alloc_resource, vmbus_pcib_alloc_resource),
+ DEVMETHOD(bus_release_resource, vmbus_pcib_release_resource),
+ DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
+ DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
+ DEVMETHOD(bus_setup_intr, bus_generic_setup_intr),
+ DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr),
+#if __FreeBSD_version >= 1100000
+ DEVMETHOD(bus_get_cpus, vmbus_pcib_get_cpus),
+#endif
+
+ /* pcib interface */
+ DEVMETHOD(pcib_maxslots, pcib_maxslots),
+ DEVMETHOD(pcib_read_config, vmbus_pcib_read_config),
+ DEVMETHOD(pcib_write_config, vmbus_pcib_write_config),
+ DEVMETHOD(pcib_route_interrupt, vmbus_pcib_route_intr),
+ DEVMETHOD(pcib_alloc_msi, vmbus_pcib_alloc_msi),
+ DEVMETHOD(pcib_release_msi, vmbus_pcib_release_msi),
+ DEVMETHOD(pcib_alloc_msix, vmbus_pcib_alloc_msix),
+ DEVMETHOD(pcib_release_msix, vmbus_pcib_release_msix),
+ DEVMETHOD(pcib_map_msi, vmbus_pcib_map_msi),
+ DEVMETHOD(pcib_request_feature, pcib_request_feature_allow),
+
+ DEVMETHOD_END
+};
+
+static devclass_t pcib_devclass;
+
+DEFINE_CLASS_0(pcib, vmbus_pcib_driver, vmbus_pcib_methods,
+ sizeof(struct vmbus_pcib_softc));
+DRIVER_MODULE(vmbus_pcib, vmbus, vmbus_pcib_driver, pcib_devclass, 0, 0);
+MODULE_DEPEND(vmbus_pcib, vmbus, 1, 1, 1);
+MODULE_DEPEND(vmbus_pcib, pci, 1, 1, 1);
+
+#endif /* NEW_PCIB */
diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
new file mode 100644
index 000000000000..702308e26a1d
--- /dev/null
+++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c
@@ -0,0 +1,2515 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * StorVSC driver for Hyper-V. This driver presents a SCSI HBA interface
+ * to the Common Access Method (CAM) layer. CAM control blocks (CCBs) are
+ * converted into VSCSI protocol messages which are delivered to the parent
+ * partition StorVSP driver over the Hyper-V VMBUS.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/condvar.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/sockio.h>
+#include <sys/mbuf.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+#include <sys/bus.h>
+#include <sys/mutex.h>
+#include <sys/callout.h>
+#include <sys/smp.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/uma.h>
+#include <sys/lock.h>
+#include <sys/sema.h>
+#include <sys/sglist.h>
+#include <sys/eventhandler.h>
+#include <machine/bus.h>
+
+#include <cam/cam.h>
+#include <cam/cam_ccb.h>
+#include <cam/cam_periph.h>
+#include <cam/cam_sim.h>
+#include <cam/cam_xpt_sim.h>
+#include <cam/cam_xpt_internal.h>
+#include <cam/cam_debug.h>
+#include <cam/scsi/scsi_all.h>
+#include <cam/scsi/scsi_message.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+#include "hv_vstorage.h"
+#include "vmbus_if.h"
+
+#define STORVSC_MAX_LUNS_PER_TARGET (64)
+#define STORVSC_MAX_IO_REQUESTS (STORVSC_MAX_LUNS_PER_TARGET * 2)
+#define BLKVSC_MAX_IDE_DISKS_PER_TARGET (1)
+#define BLKVSC_MAX_IO_REQUESTS STORVSC_MAX_IO_REQUESTS
+#define STORVSC_MAX_TARGETS (2)
+
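+/*
+ * On-the-wire size of a vstor packet; vmscsi_size_delta strips the win8
+ * extension when talking to pre-win8 hosts.
+ */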
+#define VSTOR_PKT_SIZE (sizeof(struct vstor_packet) - vmscsi_size_delta)
+
+/*
+ * 33 segments are needed to allow 128KB maxio, in case the data
+ * in the first page is _not_ PAGE_SIZE aligned, e.g.
+ *
+ * |<----------- 128KB ----------->|
+ * | |
+ * 0 2K 4K 8K 16K 124K 128K 130K
+ * | | | | | | | |
+ * +--+--+-----+-----+.......+-----+--+--+
+ * | | | | | | | | | DATA
+ * | | | | | | | | |
+ * +--+--+-----+-----+.......------+--+--+
+ * | | | |
+ * | 1| 31 | 1| ...... # of segments
+ */
+#define STORVSC_DATA_SEGCNT_MAX 33
+#define STORVSC_DATA_SEGSZ_MAX PAGE_SIZE
+#define STORVSC_DATA_SIZE_MAX \
+ ((STORVSC_DATA_SEGCNT_MAX - 1) * STORVSC_DATA_SEGSZ_MAX)
+
+struct storvsc_softc;
+
+struct hv_sgl_node {
+ LIST_ENTRY(hv_sgl_node) link;
+ struct sglist *sgl_data;
+};
+
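+/*
+ * Pool of scatter/gather lists, used when data segments need to be
+ * bounce-buffered (see the bounce_sgl fields of hv_storvsc_request below).
+ */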
+struct hv_sgl_page_pool{
+ LIST_HEAD(, hv_sgl_node) in_use_sgl_list;
+ LIST_HEAD(, hv_sgl_node) free_sgl_list;
+ boolean_t is_init;
+} g_hv_sgl_page_pool;
+
+enum storvsc_request_type {
+ WRITE_TYPE,
+ READ_TYPE,
+ UNKNOWN_TYPE
+};
+
+SYSCTL_NODE(_hw, OID_AUTO, storvsc, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
+ "Hyper-V storage interface");
+
+static u_int hv_storvsc_use_win8ext_flags = 1;
+SYSCTL_UINT(_hw_storvsc, OID_AUTO, use_win8ext_flags, CTLFLAG_RW,
+ &hv_storvsc_use_win8ext_flags, 0,
+ "Use win8 extension flags or not");
+
+static u_int hv_storvsc_use_pim_unmapped = 1;
+SYSCTL_UINT(_hw_storvsc, OID_AUTO, use_pim_unmapped, CTLFLAG_RDTUN,
+ &hv_storvsc_use_pim_unmapped, 0,
+ "Optimize storvsc by using unmapped I/O");
+
+static u_int hv_storvsc_ringbuffer_size = (64 * PAGE_SIZE);
+SYSCTL_UINT(_hw_storvsc, OID_AUTO, ringbuffer_size, CTLFLAG_RDTUN,
+ &hv_storvsc_ringbuffer_size, 0, "Hyper-V storage ringbuffer size");
+
+static u_int hv_storvsc_max_io = 512;
+SYSCTL_UINT(_hw_storvsc, OID_AUTO, max_io, CTLFLAG_RDTUN,
+ &hv_storvsc_max_io, 0, "Hyper-V storage max io limit");
+
+static int hv_storvsc_chan_cnt = 0;
+SYSCTL_INT(_hw_storvsc, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
+ &hv_storvsc_chan_cnt, 0, "# of channels to use");
+#ifdef DIAGNOSTIC
+static int hv_storvsc_srb_status = -1;
+SYSCTL_INT(_hw_storvsc, OID_AUTO, srb_status, CTLFLAG_RW,
+ &hv_storvsc_srb_status, 0, "srb_status to inject");
+TUNABLE_INT("hw_storvsc.srb_status", &hv_storvsc_srb_status);
+#endif /* DIAGNOSTIC */
+
+#define STORVSC_MAX_IO \
+ vmbus_chan_prplist_nelem(hv_storvsc_ringbuffer_size, \
+ STORVSC_DATA_SEGCNT_MAX, VSTOR_PKT_SIZE)
+
+struct hv_storvsc_sysctl {
+ u_long data_bio_cnt;
+ u_long data_vaddr_cnt;
+ u_long data_sg_cnt;
+ u_long chan_send_cnt[MAXCPU];
+};
+
+struct storvsc_gpa_range {
+ struct vmbus_gpa_range gpa_range;
+ uint64_t gpa_page[STORVSC_DATA_SEGCNT_MAX];
+} __packed;
+
+struct hv_storvsc_request {
+ LIST_ENTRY(hv_storvsc_request) link;
+ struct vstor_packet vstor_packet;
+ int prp_cnt;
+ struct storvsc_gpa_range prp_list;
+ void *sense_data;
+ uint8_t sense_info_len;
+ uint8_t retries;
+ union ccb *ccb;
+ struct storvsc_softc *softc;
+ struct callout callout;
+	struct sema synch_sema; /* Synchronize the request/response if needed */
+ struct sglist *bounce_sgl;
+ unsigned int bounce_sgl_count;
+ uint64_t not_aligned_seg_bits;
+ bus_dmamap_t data_dmap;
+};
+
+struct storvsc_softc {
+ struct vmbus_channel *hs_chan;
+ LIST_HEAD(, hv_storvsc_request) hs_free_list;
+ struct mtx hs_lock;
+ struct storvsc_driver_props *hs_drv_props;
+ int hs_unit;
+ uint32_t hs_frozen;
+ struct cam_sim *hs_sim;
+ struct cam_path *hs_path;
+ uint32_t hs_num_out_reqs;
+ boolean_t hs_destroy;
+ boolean_t hs_drain_notify;
+ struct sema hs_drain_sema;
+ struct hv_storvsc_request hs_init_req;
+ struct hv_storvsc_request hs_reset_req;
+ device_t hs_dev;
+ bus_dma_tag_t storvsc_req_dtag;
+ struct hv_storvsc_sysctl sysctl_data;
+ uint32_t hs_nchan;
+ struct vmbus_channel *hs_sel_chan[MAXCPU];
+};
+
+static eventhandler_tag storvsc_handler_tag;
+/*
+ * The size of the vmscsi_request has changed in win8. The
+ * additional size is for the newly added elements in the
+ * structure. These elements are valid only when we are talking
+ * to a win8 host.
+ * Track the correct size we need to apply.
+ */
+static int vmscsi_size_delta = sizeof(struct vmscsi_win8_extension);
+
+/**
+ * HyperV storvsc timeout testing cases:
+ * a. IO returned after first timeout;
+ * b. IO returned after second timeout and queue freeze;
+ * c. IO returned while timer handler is running
+ * The first can be tested by "sg_senddiag -vv /dev/daX",
+ * and the second and third can be done by
+ * "sg_wr_mode -v -p 08 -c 0,1a -m 0,ff /dev/daX".
+ */
+#define HVS_TIMEOUT_TEST 0
+
+/*
+ * Bus/adapter reset functionality on the Hyper-V host is
+ * buggy and it will be disabled until
+ * it can be further tested.
+ */
+#define HVS_HOST_RESET 0
+
+struct storvsc_driver_props {
+ char *drv_name;
+ char *drv_desc;
+ uint8_t drv_max_luns_per_target;
+ uint32_t drv_max_ios_per_target;
+ uint32_t drv_ringbuffer_size;
+};
+
+enum hv_storage_type {
+ DRIVER_BLKVSC,
+ DRIVER_STORVSC,
+ DRIVER_UNKNOWN
+};
+
+#define HS_MAX_ADAPTERS 10
+
+#define HV_STORAGE_SUPPORTS_MULTI_CHANNEL 0x1
+
+/* {ba6163d9-04a1-4d29-b605-72e2ffb1dc7f} */
+static const struct hyperv_guid gStorVscDeviceType = {
+ .hv_guid = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d,
+ 0xb6, 0x05, 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f}
+};
+
+/* {32412632-86cb-44a2-9b5c-50d1417354f5} */
+static const struct hyperv_guid gBlkVscDeviceType = {
+ .hv_guid = {0x32, 0x26, 0x41, 0x32, 0xcb, 0x86, 0xa2, 0x44,
+ 0x9b, 0x5c, 0x50, 0xd1, 0x41, 0x73, 0x54, 0xf5}
+};
+
+static struct storvsc_driver_props g_drv_props_table[] = {
+ {"blkvsc", "Hyper-V IDE",
+ BLKVSC_MAX_IDE_DISKS_PER_TARGET, BLKVSC_MAX_IO_REQUESTS,
+ 20*PAGE_SIZE},
+ {"storvsc", "Hyper-V SCSI",
+ STORVSC_MAX_LUNS_PER_TARGET, STORVSC_MAX_IO_REQUESTS,
+ 20*PAGE_SIZE}
+};
+
+/*
+ * Sense buffer size changed in win8; have a run-time
+ * variable to track the size we should use.
+ */
+static int sense_buffer_size = PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE;
+
+/*
+ * The storage protocol version is determined during the
+ * initial exchange with the host. It will indicate which
+ * storage functionality is available in the host.
+ */
+static int vmstor_proto_version;
+
+struct vmstor_proto {
+ int proto_version;
+ int sense_buffer_size;
+ int vmscsi_size_delta;
+};
+
+static const struct vmstor_proto vmstor_proto_list[] = {
+ {
+ VMSTOR_PROTOCOL_VERSION_WIN10,
+ POST_WIN7_STORVSC_SENSE_BUFFER_SIZE,
+ 0
+ },
+ {
+ VMSTOR_PROTOCOL_VERSION_WIN8_1,
+ POST_WIN7_STORVSC_SENSE_BUFFER_SIZE,
+ 0
+ },
+ {
+ VMSTOR_PROTOCOL_VERSION_WIN8,
+ POST_WIN7_STORVSC_SENSE_BUFFER_SIZE,
+ 0
+ },
+ {
+ VMSTOR_PROTOCOL_VERSION_WIN7,
+ PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE,
+ sizeof(struct vmscsi_win8_extension),
+ },
+ {
+ VMSTOR_PROTOCOL_VERSION_WIN6,
+ PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE,
+ sizeof(struct vmscsi_win8_extension),
+ }
+};
+
+/* static functions */
+static int storvsc_probe(device_t dev);
+static int storvsc_attach(device_t dev);
+static int storvsc_detach(device_t dev);
+static void storvsc_poll(struct cam_sim * sim);
+static void storvsc_action(struct cam_sim * sim, union ccb * ccb);
+static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp);
+static void storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp);
+static enum hv_storage_type storvsc_get_storage_type(device_t dev);
+static void hv_storvsc_rescan_target(struct storvsc_softc *sc);
+static void hv_storvsc_on_channel_callback(struct vmbus_channel *chan, void *xsc);
+static void hv_storvsc_on_iocompletion( struct storvsc_softc *sc,
+ struct vstor_packet *vstor_packet,
+ struct hv_storvsc_request *request);
+static int hv_storvsc_connect_vsp(struct storvsc_softc *);
+static void storvsc_io_done(struct hv_storvsc_request *reqp);
+static void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl,
+ bus_dma_segment_t *orig_sgl,
+ unsigned int orig_sgl_count,
+ uint64_t seg_bits);
+void storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl,
+ unsigned int dest_sgl_count,
+ struct sglist* src_sgl,
+ uint64_t seg_bits);
+
+static device_method_t storvsc_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, storvsc_probe),
+ DEVMETHOD(device_attach, storvsc_attach),
+ DEVMETHOD(device_detach, storvsc_detach),
+ DEVMETHOD(device_shutdown, bus_generic_shutdown),
+ DEVMETHOD_END
+};
+
+static driver_t storvsc_driver = {
+ "storvsc", storvsc_methods, sizeof(struct storvsc_softc),
+};
+
+static devclass_t storvsc_devclass;
+DRIVER_MODULE(storvsc, vmbus, storvsc_driver, storvsc_devclass, 0, 0);
+MODULE_VERSION(storvsc, 1);
+MODULE_DEPEND(storvsc, vmbus, 1, 1, 1);
+
+static void
+storvsc_subchan_attach(struct storvsc_softc *sc,
+ struct vmbus_channel *new_channel)
+{
+ struct vmstor_chan_props props;
+ int ret = 0;
+
+ memset(&props, 0, sizeof(props));
+
+ vmbus_chan_cpu_rr(new_channel);
+ ret = vmbus_chan_open(new_channel,
+ sc->hs_drv_props->drv_ringbuffer_size,
+ sc->hs_drv_props->drv_ringbuffer_size,
+ (void *)&props,
+ sizeof(struct vmstor_chan_props),
+ hv_storvsc_on_channel_callback, sc);
+}
+
+/**
+ * @brief Send multi-channel creation request to host
+ *
+ * @param sc the storvsc softc
+ * @param max_subch the maximum number of sub-channels supported by the host
+ */
+static void
+storvsc_send_multichannel_request(struct storvsc_softc *sc, int max_subch)
+{
+ struct vmbus_channel **subchan;
+ struct hv_storvsc_request *request;
+ struct vstor_packet *vstor_packet;
+ int request_subch;
+ int ret, i;
+
+	/* Get the number of sub-channels that need to be created. */
+ request_subch = MIN(max_subch, mp_ncpus - 1);
+
+ request = &sc->hs_init_req;
+
+ /* request the host to create multi-channel */
+ memset(request, 0, sizeof(struct hv_storvsc_request));
+
+ sema_init(&request->synch_sema, 0, ("stor_synch_sema"));
+
+ vstor_packet = &request->vstor_packet;
+
+ vstor_packet->operation = VSTOR_OPERATION_CREATE_MULTI_CHANNELS;
+ vstor_packet->flags = REQUEST_COMPLETION_FLAG;
+ vstor_packet->u.multi_channels_cnt = request_subch;
+
+ ret = vmbus_chan_send(sc->hs_chan,
+ VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+ vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
+
+ sema_wait(&request->synch_sema);
+
+ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
+ vstor_packet->status != 0) {
+ printf("Storvsc_error: create multi-channel invalid operation "
+		    "(%d) or status (%u)\n",
+ vstor_packet->operation, vstor_packet->status);
+ return;
+ }
+
+ /* Update channel count */
+ sc->hs_nchan = request_subch + 1;
+
+ /* Wait for sub-channels setup to complete. */
+ subchan = vmbus_subchan_get(sc->hs_chan, request_subch);
+
+ /* Attach the sub-channels. */
+ for (i = 0; i < request_subch; ++i)
+ storvsc_subchan_attach(sc, subchan[i]);
+
+ /* Release the sub-channels. */
+ vmbus_subchan_rel(subchan, request_subch);
+
+ if (bootverbose)
+ printf("Storvsc create multi-channel success!\n");
+}
+
+/**
+ * @brief initialize channel connection to parent partition
+ *
+ * @param sc the storvsc softc
+ * @returns 0 on success, non-zero error on failure
+ */
+static int
+hv_storvsc_channel_init(struct storvsc_softc *sc)
+{
+ int ret = 0, i;
+ struct hv_storvsc_request *request;
+ struct vstor_packet *vstor_packet;
+ uint16_t max_subch;
+ boolean_t support_multichannel;
+ uint32_t version;
+
+ max_subch = 0;
+ support_multichannel = FALSE;
+
+ request = &sc->hs_init_req;
+ memset(request, 0, sizeof(struct hv_storvsc_request));
+ vstor_packet = &request->vstor_packet;
+ request->softc = sc;
+
+ /**
+ * Initiate the vsc/vsp initialization protocol on the open channel
+ */
+ sema_init(&request->synch_sema, 0, ("stor_synch_sema"));
+
+ vstor_packet->operation = VSTOR_OPERATION_BEGININITIALIZATION;
+ vstor_packet->flags = REQUEST_COMPLETION_FLAG;
+
+
+ ret = vmbus_chan_send(sc->hs_chan,
+ VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+ vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
+
+ if (ret != 0)
+ goto cleanup;
+
+ sema_wait(&request->synch_sema);
+
+ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
+ vstor_packet->status != 0) {
+ goto cleanup;
+ }
+
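+	/*
+	 * Negotiate the protocol version: try each entry in
+	 * vmstor_proto_list until the host accepts one.
+	 */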
+ for (i = 0; i < nitems(vmstor_proto_list); i++) {
+ /* reuse the packet for version range supported */
+
+ memset(vstor_packet, 0, sizeof(struct vstor_packet));
+ vstor_packet->operation = VSTOR_OPERATION_QUERYPROTOCOLVERSION;
+ vstor_packet->flags = REQUEST_COMPLETION_FLAG;
+
+ vstor_packet->u.version.major_minor =
+ vmstor_proto_list[i].proto_version;
+
+ /* revision is only significant for Windows guests */
+ vstor_packet->u.version.revision = 0;
+
+ ret = vmbus_chan_send(sc->hs_chan,
+ VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+ vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
+
+ if (ret != 0)
+ goto cleanup;
+
+ sema_wait(&request->synch_sema);
+
+ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO) {
+ ret = EINVAL;
+ goto cleanup;
+ }
+ if (vstor_packet->status == 0) {
+ vmstor_proto_version =
+ vmstor_proto_list[i].proto_version;
+ sense_buffer_size =
+ vmstor_proto_list[i].sense_buffer_size;
+ vmscsi_size_delta =
+ vmstor_proto_list[i].vmscsi_size_delta;
+ break;
+ }
+ }
+
+ if (vstor_packet->status != 0) {
+ ret = EINVAL;
+ goto cleanup;
+ }
+ /**
+ * Query channel properties
+ */
+ memset(vstor_packet, 0, sizeof(struct vstor_packet));
+ vstor_packet->operation = VSTOR_OPERATION_QUERYPROPERTIES;
+ vstor_packet->flags = REQUEST_COMPLETION_FLAG;
+
+ ret = vmbus_chan_send(sc->hs_chan,
+ VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+ vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
+
+	if (ret != 0)
+ goto cleanup;
+
+ sema_wait(&request->synch_sema);
+
+ /* TODO: Check returned version */
+ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
+ vstor_packet->status != 0) {
+ goto cleanup;
+ }
+
+ max_subch = vstor_packet->u.chan_props.max_channel_cnt;
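+	/* Honor hv_storvsc_chan_cnt, if set, by capping the sub-channel count. */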
+ if (hv_storvsc_chan_cnt > 0 && hv_storvsc_chan_cnt < (max_subch + 1))
+ max_subch = hv_storvsc_chan_cnt - 1;
+
+	/* The multi-channel feature is supported by WIN8 and later hosts. */
+ version = VMBUS_GET_VERSION(device_get_parent(sc->hs_dev), sc->hs_dev);
+ if (version != VMBUS_VERSION_WIN7 && version != VMBUS_VERSION_WS2008 &&
+ (vstor_packet->u.chan_props.flags &
+ HV_STORAGE_SUPPORTS_MULTI_CHANNEL)) {
+ support_multichannel = TRUE;
+ }
+ if (bootverbose) {
+ device_printf(sc->hs_dev, "max chans %d%s\n", max_subch + 1,
+ support_multichannel ? ", multi-chan capable" : "");
+ }
+
+ memset(vstor_packet, 0, sizeof(struct vstor_packet));
+ vstor_packet->operation = VSTOR_OPERATION_ENDINITIALIZATION;
+ vstor_packet->flags = REQUEST_COMPLETION_FLAG;
+
+ ret = vmbus_chan_send(sc->hs_chan,
+ VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+ vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
+
+ if (ret != 0) {
+ goto cleanup;
+ }
+
+ sema_wait(&request->synch_sema);
+
+ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
+ vstor_packet->status != 0)
+ goto cleanup;
+
+ /*
+ * If multi-channel is supported, send multichannel create
+ * request to host.
+ */
+ if (support_multichannel && max_subch > 0)
+ storvsc_send_multichannel_request(sc, max_subch);
+cleanup:
+ sema_destroy(&request->synch_sema);
+ return (ret);
+}
+
+/**
+ * @brief Open channel connection to parent partition StorVSP driver
+ *
+ * Open and initialize channel connection to parent partition StorVSP driver.
+ *
+ * @param sc pointer to a storvsc softc
+ * @returns 0 on success, non-zero error on failure
+ */
+static int
+hv_storvsc_connect_vsp(struct storvsc_softc *sc)
+{
+ int ret = 0;
+ struct vmstor_chan_props props;
+
+ memset(&props, 0, sizeof(struct vmstor_chan_props));
+
+ /*
+ * Open the channel
+ */
+ vmbus_chan_cpu_rr(sc->hs_chan);
+ ret = vmbus_chan_open(
+ sc->hs_chan,
+ sc->hs_drv_props->drv_ringbuffer_size,
+ sc->hs_drv_props->drv_ringbuffer_size,
+ (void *)&props,
+ sizeof(struct vmstor_chan_props),
+ hv_storvsc_on_channel_callback, sc);
+
+ if (ret != 0) {
+ return ret;
+ }
+
+ ret = hv_storvsc_channel_init(sc);
+ return (ret);
+}
+
+#if HVS_HOST_RESET
+static int
+hv_storvsc_host_reset(struct storvsc_softc *sc)
+{
+ int ret = 0;
+
+ struct hv_storvsc_request *request;
+ struct vstor_packet *vstor_packet;
+
+ request = &sc->hs_reset_req;
+ request->softc = sc;
+ vstor_packet = &request->vstor_packet;
+
+ sema_init(&request->synch_sema, 0, "stor synch sema");
+
+ vstor_packet->operation = VSTOR_OPERATION_RESETBUS;
+ vstor_packet->flags = REQUEST_COMPLETION_FLAG;
+
+	ret = vmbus_chan_send(sc->hs_chan,
+ VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+ vstor_packet, VSTOR_PKT_SIZE,
+ (uint64_t)(uintptr_t)&sc->hs_reset_req);
+
+ if (ret != 0) {
+ goto cleanup;
+ }
+
+ sema_wait(&request->synch_sema);
+
+ /*
+ * At this point, all outstanding requests in the adapter
+	 * should have been flushed out and returned to us.
+ */
+
+cleanup:
+ sema_destroy(&request->synch_sema);
+ return (ret);
+}
+#endif /* HVS_HOST_RESET */
+
+/**
+ * @brief Function to initiate an I/O request
+ *
+ * @param sc storvsc softc pointer
+ * @param request pointer to a request structure
+ * @returns 0 on success, non-zero error on failure
+ */
+static int
+hv_storvsc_io_request(struct storvsc_softc *sc,
+ struct hv_storvsc_request *request)
+{
+ struct vstor_packet *vstor_packet = &request->vstor_packet;
+ struct vmbus_channel* outgoing_channel = NULL;
+ int ret = 0, ch_sel;
+
+ vstor_packet->flags |= REQUEST_COMPLETION_FLAG;
+
+ vstor_packet->u.vm_srb.length =
+ sizeof(struct vmscsi_req) - vmscsi_size_delta;
+
+ vstor_packet->u.vm_srb.sense_info_len = sense_buffer_size;
+
+ vstor_packet->u.vm_srb.transfer_len =
+ request->prp_list.gpa_range.gpa_len;
+
+ vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB;
+
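+	/* Select the outgoing channel based on the LUN and the current CPU. */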
+ ch_sel = (vstor_packet->u.vm_srb.lun + curcpu) % sc->hs_nchan;
+ /*
+	 * If we are panicking, then we are dumping core. Since storvsc_poll()
+	 * always uses sc->hs_chan, we must send on that channel or a poll
+	 * timeout will occur.
+ */
+ if (panicstr) {
+ outgoing_channel = sc->hs_chan;
+ } else {
+ outgoing_channel = sc->hs_sel_chan[ch_sel];
+ }
+
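+	/* Drop the softc lock while the request is handed to VMBus. */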
+ mtx_unlock(&request->softc->hs_lock);
+ if (request->prp_list.gpa_range.gpa_len) {
+ ret = vmbus_chan_send_prplist(outgoing_channel,
+ &request->prp_list.gpa_range, request->prp_cnt,
+ vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
+ } else {
+ ret = vmbus_chan_send(outgoing_channel,
+ VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
+ vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
+ }
+	/* Count successful request submissions on each channel. */
+ if (!ret) {
+ sc->sysctl_data.chan_send_cnt[ch_sel]++;
+ }
+ mtx_lock(&request->softc->hs_lock);
+
+ if (ret != 0) {
+		printf("Unable to send packet %p ret %d\n", vstor_packet, ret);
+ } else {
+ atomic_add_int(&sc->hs_num_out_reqs, 1);
+ }
+
+ return (ret);
+}
+
+
+/**
+ * Process IO_COMPLETION_OPERATION and ready
+ * the result to be completed for upper layer
+ * processing by the CAM layer.
+ */
+static void
+hv_storvsc_on_iocompletion(struct storvsc_softc *sc,
+ struct vstor_packet *vstor_packet,
+ struct hv_storvsc_request *request)
+{
+ struct vmscsi_req *vm_srb;
+
+ vm_srb = &vstor_packet->u.vm_srb;
+
+ /*
+ * Copy some fields of the host's response into the request structure,
+ * because the fields will be used later in storvsc_io_done().
+ */
+ request->vstor_packet.u.vm_srb.scsi_status = vm_srb->scsi_status;
+ request->vstor_packet.u.vm_srb.srb_status = vm_srb->srb_status;
+ request->vstor_packet.u.vm_srb.transfer_len = vm_srb->transfer_len;
+
+ if (((vm_srb->scsi_status & 0xFF) == SCSI_STATUS_CHECK_COND) &&
+ (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID)) {
+ /* Autosense data available */
+
+ KASSERT(vm_srb->sense_info_len <= request->sense_info_len,
+ ("vm_srb->sense_info_len <= "
+ "request->sense_info_len"));
+
+ memcpy(request->sense_data, vm_srb->u.sense_data,
+ vm_srb->sense_info_len);
+
+ request->sense_info_len = vm_srb->sense_info_len;
+ }
+
+ /* Complete request by passing to the CAM layer */
+ storvsc_io_done(request);
+ atomic_subtract_int(&sc->hs_num_out_reqs, 1);
+ if (sc->hs_drain_notify && (sc->hs_num_out_reqs == 0)) {
+ sema_post(&sc->hs_drain_sema);
+ }
+}
+
+static void
+hv_storvsc_rescan_target(struct storvsc_softc *sc)
+{
+ path_id_t pathid;
+ target_id_t targetid;
+ union ccb *ccb;
+
+ pathid = cam_sim_path(sc->hs_sim);
+ targetid = CAM_TARGET_WILDCARD;
+
+ /*
+ * Allocate a CCB and schedule a rescan.
+ */
+ ccb = xpt_alloc_ccb_nowait();
+ if (ccb == NULL) {
+ printf("unable to alloc CCB for rescan\n");
+ return;
+ }
+
+ if (xpt_create_path(&ccb->ccb_h.path, NULL, pathid, targetid,
+ CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
+		printf("unable to create path for rescan, pathid: %u, "
+ "targetid: %u\n", pathid, targetid);
+ xpt_free_ccb(ccb);
+ return;
+ }
+
+ if (targetid == CAM_TARGET_WILDCARD)
+ ccb->ccb_h.func_code = XPT_SCAN_BUS;
+ else
+ ccb->ccb_h.func_code = XPT_SCAN_TGT;
+
+ xpt_rescan(ccb);
+}
+
+static void
+hv_storvsc_on_channel_callback(struct vmbus_channel *channel, void *xsc)
+{
+ int ret = 0;
+ struct storvsc_softc *sc = xsc;
+ uint32_t bytes_recvd;
+ uint64_t request_id;
+ uint8_t packet[roundup2(sizeof(struct vstor_packet), 8)];
+ struct hv_storvsc_request *request;
+ struct vstor_packet *vstor_packet;
+
+ bytes_recvd = roundup2(VSTOR_PKT_SIZE, 8);
+ ret = vmbus_chan_recv(channel, packet, &bytes_recvd, &request_id);
+ KASSERT(ret != ENOBUFS, ("storvsc recvbuf is not large enough"));
+ /* XXX check bytes_recvd to make sure that it contains enough data */
+
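+	/* Keep processing packets until the channel's ring buffer is drained. */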
+ while ((ret == 0) && (bytes_recvd > 0)) {
+ request = (struct hv_storvsc_request *)(uintptr_t)request_id;
+
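+		/*
+		 * The init and reset requests are completed synchronously:
+		 * copy the reply back and wake up the waiting thread.
+		 */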
+ if ((request == &sc->hs_init_req) ||
+ (request == &sc->hs_reset_req)) {
+ memcpy(&request->vstor_packet, packet,
+ sizeof(struct vstor_packet));
+ sema_post(&request->synch_sema);
+ } else {
+ vstor_packet = (struct vstor_packet *)packet;
+ switch(vstor_packet->operation) {
+ case VSTOR_OPERATION_COMPLETEIO:
+ if (request == NULL)
+ panic("VMBUS: storvsc received a "
+ "packet with NULL request id in "
+ "COMPLETEIO operation.");
+
+ hv_storvsc_on_iocompletion(sc,
+ vstor_packet, request);
+ break;
+ case VSTOR_OPERATION_REMOVEDEVICE:
+ printf("VMBUS: storvsc operation %d not "
+ "implemented.\n", vstor_packet->operation);
+ /* TODO: implement */
+ break;
+ case VSTOR_OPERATION_ENUMERATE_BUS:
+ hv_storvsc_rescan_target(sc);
+ break;
+ default:
+ break;
+ }
+ }
+
+		bytes_recvd = roundup2(VSTOR_PKT_SIZE, 8);
+ ret = vmbus_chan_recv(channel, packet, &bytes_recvd,
+ &request_id);
+ KASSERT(ret != ENOBUFS,
+ ("storvsc recvbuf is not large enough"));
+ /*
+ * XXX check bytes_recvd to make sure that it contains
+ * enough data
+ */
+ }
+}
+
+/**
+ * @brief StorVSC probe function
+ *
+ * Device probe function. Returns 0 if the input device is a StorVSC
+ * device. Otherwise, ENXIO is returned. If the input device is
+ * a BlkVSC (paravirtual IDE) device and this support is disabled in
+ * favor of the emulated ATA/IDE device, return ENXIO.
+ *
+ * @param dev a device
+ * @returns 0 on success, ENXIO if not a matching StorVSC device
+ */
+static int
+storvsc_probe(device_t dev)
+{
+ int ret = ENXIO;
+
+ switch (storvsc_get_storage_type(dev)) {
+ case DRIVER_BLKVSC:
+ if(bootverbose)
+ device_printf(dev,
+ "Enlightened ATA/IDE detected\n");
+ device_set_desc(dev, g_drv_props_table[DRIVER_BLKVSC].drv_desc);
+ ret = BUS_PROBE_DEFAULT;
+ break;
+ case DRIVER_STORVSC:
+ if(bootverbose)
+ device_printf(dev, "Enlightened SCSI device detected\n");
+ device_set_desc(dev, g_drv_props_table[DRIVER_STORVSC].drv_desc);
+ ret = BUS_PROBE_DEFAULT;
+ break;
+ default:
+ ret = ENXIO;
+ }
+ return (ret);
+}
+
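+/*
+ * Build the channel selection table used by hv_storvsc_io_request():
+ * slot 0 holds the primary channel, followed by the sub-channels.
+ */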
+static void
+storvsc_create_chan_sel(struct storvsc_softc *sc)
+{
+ struct vmbus_channel **subch;
+ int i, nsubch;
+
+ sc->hs_sel_chan[0] = sc->hs_chan;
+ nsubch = sc->hs_nchan - 1;
+ if (nsubch == 0)
+ return;
+
+ subch = vmbus_subchan_get(sc->hs_chan, nsubch);
+ for (i = 0; i < nsubch; i++)
+ sc->hs_sel_chan[i + 1] = subch[i];
+ vmbus_subchan_rel(subch, nsubch);
+}
+
+static int
+storvsc_init_requests(device_t dev)
+{
+ struct storvsc_softc *sc = device_get_softc(dev);
+ struct hv_storvsc_request *reqp;
+ int error, i;
+
+ LIST_INIT(&sc->hs_free_list);
+
+ error = bus_dma_tag_create(
+ bus_get_dma_tag(dev), /* parent */
+ 1, /* alignment */
+ PAGE_SIZE, /* boundary */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ STORVSC_DATA_SIZE_MAX, /* maxsize */
+ STORVSC_DATA_SEGCNT_MAX, /* nsegments */
+ STORVSC_DATA_SEGSZ_MAX, /* maxsegsize */
+ 0, /* flags */
+ NULL, /* lockfunc */
+ NULL, /* lockfuncarg */
+ &sc->storvsc_req_dtag);
+ if (error) {
+ device_printf(dev, "failed to create storvsc dma tag\n");
+ return (error);
+ }
+
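+	/* Pre-allocate a request structure and DMA map for each possible outstanding I/O. */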
+ for (i = 0; i < sc->hs_drv_props->drv_max_ios_per_target; ++i) {
+ reqp = malloc(sizeof(struct hv_storvsc_request),
+ M_DEVBUF, M_WAITOK|M_ZERO);
+ reqp->softc = sc;
+ error = bus_dmamap_create(sc->storvsc_req_dtag, 0,
+ &reqp->data_dmap);
+ if (error) {
+ device_printf(dev, "failed to allocate storvsc "
+ "data dmamap\n");
+ goto cleanup;
+ }
+ LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link);
+ }
+ return (0);
+
+cleanup:
+ while ((reqp = LIST_FIRST(&sc->hs_free_list)) != NULL) {
+ LIST_REMOVE(reqp, link);
+ bus_dmamap_destroy(sc->storvsc_req_dtag, reqp->data_dmap);
+ free(reqp, M_DEVBUF);
+ }
+ return (error);
+}
+
+static void
+storvsc_sysctl(device_t dev)
+{
+ struct sysctl_oid_list *child;
+ struct sysctl_ctx_list *ctx;
+ struct sysctl_oid *ch_tree, *chid_tree;
+ struct storvsc_softc *sc;
+ char name[16];
+ int i;
+
+ sc = device_get_softc(dev);
+ ctx = device_get_sysctl_ctx(dev);
+ child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+
+	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_bio_cnt", CTLFLAG_RW,
+		&sc->sysctl_data.data_bio_cnt, "# of bio data blocks");
+	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_vaddr_cnt", CTLFLAG_RW,
+		&sc->sysctl_data.data_vaddr_cnt, "# of vaddr data blocks");
+	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_sg_cnt", CTLFLAG_RW,
+		&sc->sysctl_data.data_sg_cnt, "# of sg data blocks");
+
+ /* dev.storvsc.UNIT.channel */
+ ch_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "channel",
+ CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+ if (ch_tree == NULL)
+ return;
+
+ for (i = 0; i < sc->hs_nchan; i++) {
+ uint32_t ch_id;
+
+ ch_id = vmbus_chan_id(sc->hs_sel_chan[i]);
+ snprintf(name, sizeof(name), "%d", ch_id);
+ /* dev.storvsc.UNIT.channel.CHID */
+ chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree),
+ OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+ if (chid_tree == NULL)
+ return;
+ /* dev.storvsc.UNIT.channel.CHID.send_req */
+ SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
+ "send_req", CTLFLAG_RD, &sc->sysctl_data.chan_send_cnt[i],
+		    "# of requests sent on this channel");
+ }
+}
+
+/**
+ * @brief StorVSC attach function
+ *
+ * Function responsible for allocating per-device structures,
+ * setting up CAM interfaces and scanning for available LUNs to
+ * be used for SCSI device peripherals.
+ *
+ * @param dev a device
+ * @returns 0 on success or an error on failure
+ */
+static int
+storvsc_attach(device_t dev)
+{
+ enum hv_storage_type stor_type;
+ struct storvsc_softc *sc;
+ struct cam_devq *devq;
+ int ret, i, j;
+ struct hv_storvsc_request *reqp;
+ struct root_hold_token *root_mount_token = NULL;
+ struct hv_sgl_node *sgl_node = NULL;
+ void *tmp_buff = NULL;
+
+ /*
+ * We need to serialize storvsc attach calls.
+ */
+ root_mount_token = root_mount_hold("storvsc");
+
+ sc = device_get_softc(dev);
+ sc->hs_nchan = 1;
+ sc->hs_chan = vmbus_get_channel(dev);
+
+ stor_type = storvsc_get_storage_type(dev);
+
+ if (stor_type == DRIVER_UNKNOWN) {
+ ret = ENODEV;
+ goto cleanup;
+ }
+
+ /* fill in driver specific properties */
+ sc->hs_drv_props = &g_drv_props_table[stor_type];
+ sc->hs_drv_props->drv_ringbuffer_size = hv_storvsc_ringbuffer_size;
+ sc->hs_drv_props->drv_max_ios_per_target =
+ MIN(STORVSC_MAX_IO, hv_storvsc_max_io);
+ if (bootverbose) {
+ printf("storvsc ringbuffer size: %d, max_io: %d\n",
+ sc->hs_drv_props->drv_ringbuffer_size,
+ sc->hs_drv_props->drv_max_ios_per_target);
+ }
+ /* fill in device specific properties */
+ sc->hs_unit = device_get_unit(dev);
+ sc->hs_dev = dev;
+
+ mtx_init(&sc->hs_lock, "hvslck", NULL, MTX_DEF);
+
+ ret = storvsc_init_requests(dev);
+ if (ret != 0)
+ goto cleanup;
+
+ /* create sg-list page pool */
+ if (FALSE == g_hv_sgl_page_pool.is_init) {
+ g_hv_sgl_page_pool.is_init = TRUE;
+ LIST_INIT(&g_hv_sgl_page_pool.in_use_sgl_list);
+ LIST_INIT(&g_hv_sgl_page_pool.free_sgl_list);
+
+ /*
+		 * Pre-create the SG lists: each SG list has
+		 * STORVSC_DATA_SEGCNT_MAX segments, and each
+		 * segment has a one-page buffer.
+ */
+ for (i = 0; i < sc->hs_drv_props->drv_max_ios_per_target; i++) {
+ sgl_node = malloc(sizeof(struct hv_sgl_node),
+ M_DEVBUF, M_WAITOK|M_ZERO);
+
+ sgl_node->sgl_data =
+ sglist_alloc(STORVSC_DATA_SEGCNT_MAX,
+ M_WAITOK|M_ZERO);
+
+ for (j = 0; j < STORVSC_DATA_SEGCNT_MAX; j++) {
+ tmp_buff = malloc(PAGE_SIZE,
+ M_DEVBUF, M_WAITOK|M_ZERO);
+
+ sgl_node->sgl_data->sg_segs[j].ss_paddr =
+ (vm_paddr_t)tmp_buff;
+ }
+
+ LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list,
+ sgl_node, link);
+ }
+ }
+
+ sc->hs_destroy = FALSE;
+ sc->hs_drain_notify = FALSE;
+ sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema");
+
+ ret = hv_storvsc_connect_vsp(sc);
+ if (ret != 0) {
+ goto cleanup;
+ }
+
+ /* Construct cpu to channel mapping */
+ storvsc_create_chan_sel(sc);
+
+ /*
+ * Create the device queue.
+ * Hyper-V maps each target to one SCSI HBA
+ */
+ devq = cam_simq_alloc(sc->hs_drv_props->drv_max_ios_per_target);
+ if (devq == NULL) {
+ device_printf(dev, "Failed to alloc device queue\n");
+ ret = ENOMEM;
+ goto cleanup;
+ }
+
+ sc->hs_sim = cam_sim_alloc(storvsc_action,
+ storvsc_poll,
+ sc->hs_drv_props->drv_name,
+ sc,
+ sc->hs_unit,
+ &sc->hs_lock, 1,
+ sc->hs_drv_props->drv_max_ios_per_target,
+ devq);
+
+ if (sc->hs_sim == NULL) {
+ device_printf(dev, "Failed to alloc sim\n");
+ cam_simq_free(devq);
+ ret = ENOMEM;
+ goto cleanup;
+ }
+
+ mtx_lock(&sc->hs_lock);
+ /* bus_id is set to 0, need to get it from VMBUS channel query? */
+ if (xpt_bus_register(sc->hs_sim, dev, 0) != CAM_SUCCESS) {
+ cam_sim_free(sc->hs_sim, /*free_devq*/TRUE);
+ mtx_unlock(&sc->hs_lock);
+ device_printf(dev, "Unable to register SCSI bus\n");
+ ret = ENXIO;
+ goto cleanup;
+ }
+
+ if (xpt_create_path(&sc->hs_path, /*periph*/NULL,
+ cam_sim_path(sc->hs_sim),
+ CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
+ xpt_bus_deregister(cam_sim_path(sc->hs_sim));
+ cam_sim_free(sc->hs_sim, /*free_devq*/TRUE);
+ mtx_unlock(&sc->hs_lock);
+ device_printf(dev, "Unable to create path\n");
+ ret = ENXIO;
+ goto cleanup;
+ }
+
+ mtx_unlock(&sc->hs_lock);
+
+ storvsc_sysctl(dev);
+
+ root_mount_rel(root_mount_token);
+ return (0);
+
+
+cleanup:
+ root_mount_rel(root_mount_token);
+ while (!LIST_EMPTY(&sc->hs_free_list)) {
+ reqp = LIST_FIRST(&sc->hs_free_list);
+ LIST_REMOVE(reqp, link);
+ bus_dmamap_destroy(sc->storvsc_req_dtag, reqp->data_dmap);
+ free(reqp, M_DEVBUF);
+ }
+
+ while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) {
+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list);
+ LIST_REMOVE(sgl_node, link);
+ for (j = 0; j < STORVSC_DATA_SEGCNT_MAX; j++) {
+ if (NULL !=
+ (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) {
+ free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF);
+ }
+ }
+ sglist_free(sgl_node->sgl_data);
+ free(sgl_node, M_DEVBUF);
+ }
+
+ return (ret);
+}
+
+/**
+ * @brief StorVSC device detach function
+ *
+ * This function is responsible for safely detaching a
+ * StorVSC device. This includes waiting for inbound responses
+ * to complete and freeing associated per-device structures.
+ *
+ * @param dev a device
+ * @returns 0 on success
+ */
+static int
+storvsc_detach(device_t dev)
+{
+ struct storvsc_softc *sc = device_get_softc(dev);
+ struct hv_storvsc_request *reqp = NULL;
+ struct hv_sgl_node *sgl_node = NULL;
+ int j = 0;
+
+ sc->hs_destroy = TRUE;
+
+ /*
+ * At this point, all outbound traffic should be disabled. We
+ * only allow inbound traffic (responses) to proceed so that
+ * outstanding requests can be completed.
+ */
+
+ sc->hs_drain_notify = TRUE;
+ sema_wait(&sc->hs_drain_sema);
+ sc->hs_drain_notify = FALSE;
+
+ /*
+ * Since we have already drained, we don't need to busy wait.
+ * The call to close the channel will reset the callback
+ * under the protection of the incoming channel lock.
+ */
+
+ vmbus_chan_close(sc->hs_chan);
+
+ mtx_lock(&sc->hs_lock);
+ while (!LIST_EMPTY(&sc->hs_free_list)) {
+ reqp = LIST_FIRST(&sc->hs_free_list);
+ LIST_REMOVE(reqp, link);
+ bus_dmamap_destroy(sc->storvsc_req_dtag, reqp->data_dmap);
+ free(reqp, M_DEVBUF);
+ }
+ mtx_unlock(&sc->hs_lock);
+
+ while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) {
+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list);
+ LIST_REMOVE(sgl_node, link);
+ for (j = 0; j < STORVSC_DATA_SEGCNT_MAX; j++){
+ if (NULL !=
+ (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) {
+ free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF);
+ }
+ }
+ sglist_free(sgl_node->sgl_data);
+ free(sgl_node, M_DEVBUF);
+ }
+
+ return (0);
+}
+
+#if HVS_TIMEOUT_TEST
+/**
+ * @brief unit test for timed out operations
+ *
+ * This function provides unit testing capability to simulate
+ * timed out operations. Recompilation with HVS_TIMEOUT_TEST=1
+ * is required.
+ *
+ * @param reqp pointer to a request structure
+ * @param opcode SCSI operation being performed
+ * @param wait if 1, wait for I/O to complete
+ */
+static void
+storvsc_timeout_test(struct hv_storvsc_request *reqp,
+ uint8_t opcode, int wait)
+{
+ int ret;
+ union ccb *ccb = reqp->ccb;
+ struct storvsc_softc *sc = reqp->softc;
+
+ if (reqp->vstor_packet.vm_srb.cdb[0] != opcode) {
+ return;
+ }
+
+ if (wait) {
+ mtx_lock(&reqp->event.mtx);
+ }
+ ret = hv_storvsc_io_request(sc, reqp);
+ if (ret != 0) {
+ if (wait) {
+ mtx_unlock(&reqp->event.mtx);
+ }
+ printf("%s: io_request failed with %d.\n",
+ __func__, ret);
+ ccb->ccb_h.status = CAM_PROVIDE_FAIL;
+ mtx_lock(&sc->hs_lock);
+ storvsc_free_request(sc, reqp);
+ xpt_done(ccb);
+ mtx_unlock(&sc->hs_lock);
+ return;
+ }
+
+ if (wait) {
+ xpt_print(ccb->ccb_h.path,
+ "%u: %s: waiting for IO return.\n",
+ ticks, __func__);
+ ret = cv_timedwait(&reqp->event.cv, &reqp->event.mtx, 60*hz);
+ mtx_unlock(&reqp->event.mtx);
+ xpt_print(ccb->ccb_h.path, "%u: %s: %s.\n",
+ ticks, __func__, (ret == 0)?
+ "IO return detected" :
+ "IO return not detected");
+ /*
+ * Now both the timer handler and io done are running
+ * simultaneously. We want to confirm the io done always
+ * finishes after the timer handler exits. So reqp used by
+ * timer handler is not freed or stale. Do busy loop for
+ * another 1/10 second to make sure io done does
+ * wait for the timer handler to complete.
+ */
+ DELAY(100*1000);
+ mtx_lock(&sc->hs_lock);
+ xpt_print(ccb->ccb_h.path,
+ "%u: %s: finishing, queue frozen %d, "
+ "ccb status 0x%x scsi_status 0x%x.\n",
+ ticks, __func__, sc->hs_frozen,
+ ccb->ccb_h.status,
+ ccb->csio.scsi_status);
+ mtx_unlock(&sc->hs_lock);
+ }
+}
+#endif /* HVS_TIMEOUT_TEST */
+
+#ifdef notyet
+/**
+ * @brief timeout handler for requests
+ *
+ * This function is called as a result of a callout expiring.
+ *
+ * @param arg pointer to a request
+ */
+static void
+storvsc_timeout(void *arg)
+{
+ struct hv_storvsc_request *reqp = arg;
+ struct storvsc_softc *sc = reqp->softc;
+ union ccb *ccb = reqp->ccb;
+
+ if (reqp->retries == 0) {
+ mtx_lock(&sc->hs_lock);
+ xpt_print(ccb->ccb_h.path,
+ "%u: IO timed out (req=0x%p), wait for another %u secs.\n",
+ ticks, reqp, ccb->ccb_h.timeout / 1000);
+ cam_error_print(ccb, CAM_ESF_ALL, CAM_EPF_ALL);
+ mtx_unlock(&sc->hs_lock);
+
+ reqp->retries++;
+ callout_reset_sbt(&reqp->callout, SBT_1MS * ccb->ccb_h.timeout,
+ 0, storvsc_timeout, reqp, 0);
+#if HVS_TIMEOUT_TEST
+ storvsc_timeout_test(reqp, SEND_DIAGNOSTIC, 0);
+#endif
+ return;
+ }
+
+ mtx_lock(&sc->hs_lock);
+ xpt_print(ccb->ccb_h.path,
+ "%u: IO (reqp = 0x%p) did not return for %u seconds, %s.\n",
+ ticks, reqp, ccb->ccb_h.timeout * (reqp->retries+1) / 1000,
+ (sc->hs_frozen == 0)?
+ "freezing the queue" : "the queue is already frozen");
+ if (sc->hs_frozen == 0) {
+ sc->hs_frozen = 1;
+ xpt_freeze_simq(xpt_path_sim(ccb->ccb_h.path), 1);
+ }
+ mtx_unlock(&sc->hs_lock);
+
+#if HVS_TIMEOUT_TEST
+ storvsc_timeout_test(reqp, MODE_SELECT_10, 1);
+#endif
+}
+#endif
+
+/**
+ * @brief StorVSC device poll function
+ *
+ * This function is responsible for servicing requests when
+ * interrupts are disabled (i.e when we are dumping core.)
+ *
+ * @param sim a pointer to a CAM SCSI interface module
+ */
+static void
+storvsc_poll(struct cam_sim *sim)
+{
+ struct storvsc_softc *sc = cam_sim_softc(sim);
+
+ mtx_assert(&sc->hs_lock, MA_OWNED);
+ mtx_unlock(&sc->hs_lock);
+ hv_storvsc_on_channel_callback(sc->hs_chan, sc);
+ mtx_lock(&sc->hs_lock);
+}
+
+/**
+ * @brief StorVSC device action function
+ *
+ * This function is responsible for handling SCSI operations which
+ * are passed from the CAM layer. The requests are in the form of
+ * CAM control blocks which indicate the action being performed.
+ * Not all actions require converting the request to a VSCSI protocol
+ * message - these actions can be responded to by this driver.
+ * Requests which are destined for a backend storage device are converted
+ * to a VSCSI protocol message and sent on the channel connection associated
+ * with this device.
+ *
+ * @param sim pointer to a CAM SCSI interface module
+ * @param ccb pointer to a CAM control block
+ */
+static void
+storvsc_action(struct cam_sim *sim, union ccb *ccb)
+{
+ struct storvsc_softc *sc = cam_sim_softc(sim);
+ int res;
+
+ mtx_assert(&sc->hs_lock, MA_OWNED);
+ switch (ccb->ccb_h.func_code) {
+ case XPT_PATH_INQ: {
+ struct ccb_pathinq *cpi = &ccb->cpi;
+
+ cpi->version_num = 1;
+ cpi->hba_inquiry = PI_TAG_ABLE|PI_SDTR_ABLE;
+ cpi->target_sprt = 0;
+ cpi->hba_misc = PIM_NOBUSRESET;
+ if (hv_storvsc_use_pim_unmapped)
+ cpi->hba_misc |= PIM_UNMAPPED;
+ cpi->maxio = STORVSC_DATA_SIZE_MAX;
+ cpi->hba_eng_cnt = 0;
+ cpi->max_target = STORVSC_MAX_TARGETS;
+ cpi->max_lun = sc->hs_drv_props->drv_max_luns_per_target;
+ cpi->initiator_id = cpi->max_target;
+ cpi->bus_id = cam_sim_bus(sim);
+ cpi->base_transfer_speed = 300000;
+ cpi->transport = XPORT_SAS;
+ cpi->transport_version = 0;
+ cpi->protocol = PROTO_SCSI;
+ cpi->protocol_version = SCSI_REV_SPC2;
+ strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
+ strlcpy(cpi->hba_vid, sc->hs_drv_props->drv_name, HBA_IDLEN);
+ strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
+ cpi->unit_number = cam_sim_unit(sim);
+
+ ccb->ccb_h.status = CAM_REQ_CMP;
+ xpt_done(ccb);
+ return;
+ }
+ case XPT_GET_TRAN_SETTINGS: {
+ struct ccb_trans_settings *cts = &ccb->cts;
+
+ cts->transport = XPORT_SAS;
+ cts->transport_version = 0;
+ cts->protocol = PROTO_SCSI;
+ cts->protocol_version = SCSI_REV_SPC2;
+
+ /* enable tag queuing and disconnected mode */
+ cts->proto_specific.valid = CTS_SCSI_VALID_TQ;
+ cts->proto_specific.scsi.valid = CTS_SCSI_VALID_TQ;
+ cts->proto_specific.scsi.flags = CTS_SCSI_FLAGS_TAG_ENB;
+ cts->xport_specific.valid = CTS_SPI_VALID_DISC;
+ cts->xport_specific.spi.flags = CTS_SPI_FLAGS_DISC_ENB;
+
+ ccb->ccb_h.status = CAM_REQ_CMP;
+ xpt_done(ccb);
+ return;
+ }
+ case XPT_SET_TRAN_SETTINGS: {
+ ccb->ccb_h.status = CAM_REQ_CMP;
+ xpt_done(ccb);
+ return;
+ }
+ case XPT_CALC_GEOMETRY:{
+ cam_calc_geometry(&ccb->ccg, 1);
+ xpt_done(ccb);
+ return;
+ }
+ case XPT_RESET_BUS:
+ case XPT_RESET_DEV:{
+#if HVS_HOST_RESET
+ if ((res = hv_storvsc_host_reset(sc)) != 0) {
+ xpt_print(ccb->ccb_h.path,
+ "hv_storvsc_host_reset failed with %d\n", res);
+ ccb->ccb_h.status = CAM_PROVIDE_FAIL;
+ xpt_done(ccb);
+ return;
+ }
+ ccb->ccb_h.status = CAM_REQ_CMP;
+ xpt_done(ccb);
+ return;
+#else
+ xpt_print(ccb->ccb_h.path,
+ "%s reset not supported.\n",
+ (ccb->ccb_h.func_code == XPT_RESET_BUS)?
+ "bus" : "dev");
+ ccb->ccb_h.status = CAM_REQ_INVALID;
+ xpt_done(ccb);
+ return;
+#endif /* HVS_HOST_RESET */
+ }
+ case XPT_SCSI_IO:
+ case XPT_IMMED_NOTIFY: {
+ struct hv_storvsc_request *reqp = NULL;
+ bus_dmamap_t dmap_saved;
+
+ if (ccb->csio.cdb_len == 0) {
+			panic("cdb_len is 0\n");
+ }
+
+ if (LIST_EMPTY(&sc->hs_free_list)) {
+ ccb->ccb_h.status = CAM_REQUEUE_REQ;
+ if (sc->hs_frozen == 0) {
+ sc->hs_frozen = 1;
+ xpt_freeze_simq(sim, /* count*/1);
+ }
+ xpt_done(ccb);
+ return;
+ }
+
+ reqp = LIST_FIRST(&sc->hs_free_list);
+ LIST_REMOVE(reqp, link);
+
+		/* Save the data_dmap before resetting the request. */
+ dmap_saved = reqp->data_dmap;
+
+ /* XXX this is ugly */
+ bzero(reqp, sizeof(struct hv_storvsc_request));
+
+ /* Restore necessary bits */
+ reqp->data_dmap = dmap_saved;
+ reqp->softc = sc;
+
+ ccb->ccb_h.status |= CAM_SIM_QUEUED;
+ if ((res = create_storvsc_request(ccb, reqp)) != 0) {
+ ccb->ccb_h.status = CAM_REQ_INVALID;
+ xpt_done(ccb);
+ return;
+ }
+
+#ifdef notyet
+ if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) {
+ callout_init(&reqp->callout, 1);
+ callout_reset_sbt(&reqp->callout,
+ SBT_1MS * ccb->ccb_h.timeout, 0,
+ storvsc_timeout, reqp, 0);
+#if HVS_TIMEOUT_TEST
+ cv_init(&reqp->event.cv, "storvsc timeout cv");
+ mtx_init(&reqp->event.mtx, "storvsc timeout mutex",
+ NULL, MTX_DEF);
+ switch (reqp->vstor_packet.vm_srb.cdb[0]) {
+ case MODE_SELECT_10:
+ case SEND_DIAGNOSTIC:
+ /* To have timer send the request. */
+ return;
+ default:
+ break;
+ }
+#endif /* HVS_TIMEOUT_TEST */
+ }
+#endif
+
+ if ((res = hv_storvsc_io_request(sc, reqp)) != 0) {
+ xpt_print(ccb->ccb_h.path,
+ "hv_storvsc_io_request failed with %d\n", res);
+ ccb->ccb_h.status = CAM_PROVIDE_FAIL;
+ storvsc_free_request(sc, reqp);
+ xpt_done(ccb);
+ return;
+ }
+ return;
+ }
+
+ default:
+ ccb->ccb_h.status = CAM_REQ_INVALID;
+ xpt_done(ccb);
+ return;
+ }
+}
+
+/**
+ * @brief destroy bounce buffer
+ *
+ * This function is responsible for destroying a scatter/gather list
+ * that was created by storvsc_create_bounce_buffer(); the list is
+ * returned to the free pool.
+ *
+ * @param sgl - the scatter/gather list to destroy
+ *
+ */
+static void
+storvsc_destroy_bounce_buffer(struct sglist *sgl)
+{
+ struct hv_sgl_node *sgl_node = NULL;
+ if (LIST_EMPTY(&g_hv_sgl_page_pool.in_use_sgl_list)) {
+ printf("storvsc error: not enough in use sgl\n");
+ return;
+ }
+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list);
+ LIST_REMOVE(sgl_node, link);
+ sgl_node->sgl_data = sgl;
+ LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link);
+}
+
+/**
+ * @brief create bounce buffer
+ *
+ * This function is responsible for creating a scatter/gather list
+ * whose segments each hold a page-sized bounce buffer.
+ *
+ * @param seg_count - number of segments in the SG list
+ * @param write - if WRITE_TYPE, set each segment's used size to 0,
+ * otherwise set the used size to the page size.
+ *
+ * @returns NULL if creation failed
+ */
+static struct sglist *
+storvsc_create_bounce_buffer(uint16_t seg_count, int write)
+{
+ int i = 0;
+ struct sglist *bounce_sgl = NULL;
+ unsigned int buf_len = ((write == WRITE_TYPE) ? 0 : PAGE_SIZE);
+ struct hv_sgl_node *sgl_node = NULL;
+
+ /* get struct sglist from free_sgl_list */
+ if (LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) {
+ printf("storvsc error: not enough free sgl\n");
+ return NULL;
+ }
+ sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list);
+ LIST_REMOVE(sgl_node, link);
+ bounce_sgl = sgl_node->sgl_data;
+ LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, sgl_node, link);
+
+ bounce_sgl->sg_maxseg = seg_count;
+
+ if (write == WRITE_TYPE)
+ bounce_sgl->sg_nseg = 0;
+ else
+ bounce_sgl->sg_nseg = seg_count;
+
+ for (i = 0; i < seg_count; i++)
+ bounce_sgl->sg_segs[i].ss_len = buf_len;
+
+ return bounce_sgl;
+}
+
+/**
+ * @brief copy data from SG list to bounce buffer
+ *
+ * This function is responsible for copying data from one SG list's
+ * segments to another SG list that is used as a bounce buffer.
+ *
+ * @param bounce_sgl - the destination SG list
+ * @param orig_sgl - the segments of the source SG list
+ * @param orig_sgl_count - the number of source segments
+ * @param seg_bits - bitmask of the segments that need the bounce buffer;
+ * a set bit means the segment is copied.
+ *
+ */
+static void
+storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl,
+ bus_dma_segment_t *orig_sgl,
+ unsigned int orig_sgl_count,
+ uint64_t seg_bits)
+{
+ int src_sgl_idx = 0;
+
+ for (src_sgl_idx = 0; src_sgl_idx < orig_sgl_count; src_sgl_idx++) {
+ if (seg_bits & (1 << src_sgl_idx)) {
+ memcpy((void*)bounce_sgl->sg_segs[src_sgl_idx].ss_paddr,
+ (void*)orig_sgl[src_sgl_idx].ds_addr,
+ orig_sgl[src_sgl_idx].ds_len);
+
+ bounce_sgl->sg_segs[src_sgl_idx].ss_len =
+ orig_sgl[src_sgl_idx].ds_len;
+ }
+ }
+}
+
+/**
+ * @brief copy data from an SG list used as a bounce buffer to another SG list
+ *
+ * This function is responsible for copying data from an SG list that was
+ * used as a bounce buffer back into another SG list's segments.
+ *
+ * @param dest_sgl - the destination SG list's segments
+ * @param dest_sgl_count - the number of destination segments
+ * @param src_sgl - the source SG list
+ * @param seg_bits - bitmask of source segments that used the bounce buffer
+ *
+ */
+void
+storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl,
+ unsigned int dest_sgl_count,
+ struct sglist* src_sgl,
+ uint64_t seg_bits)
+{
+ int sgl_idx = 0;
+
+ for (sgl_idx = 0; sgl_idx < dest_sgl_count; sgl_idx++) {
+ if (seg_bits & (1 << sgl_idx)) {
+ memcpy((void*)(dest_sgl[sgl_idx].ds_addr),
+ (void*)(src_sgl->sg_segs[sgl_idx].ss_paddr),
+ src_sgl->sg_segs[sgl_idx].ss_len);
+ }
+ }
+}
+
+/**
+ * @brief check whether an SG list needs a bounce buffer
+ *
+ * This function is responsible for checking whether a bounce buffer is
+ * needed for the given SG list.
+ *
+ * @param sgl - the SG list's segments
+ * @param sg_count - the number of segments in the SG list
+ * @param bits - bitmask of the segments that need a bounce buffer
+ *
+ * @returns -1 if the SG list does not need a bounce buffer, 0 otherwise
+ */
+static int
+storvsc_check_bounce_buffer_sgl(bus_dma_segment_t *sgl,
+ unsigned int sg_count,
+ uint64_t *bits)
+{
+ int i = 0;
+ int offset = 0;
+ uint64_t phys_addr = 0;
+ uint64_t tmp_bits = 0;
+ boolean_t found_hole = FALSE;
+ boolean_t pre_aligned = TRUE;
+
+ if (sg_count < 2){
+ return -1;
+ }
+
+ *bits = 0;
+
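+	/* Check whether the first segment starts on a page boundary. */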
+ phys_addr = vtophys(sgl[0].ds_addr);
+ offset = phys_addr - trunc_page(phys_addr);
+
+ if (offset != 0) {
+ pre_aligned = FALSE;
+ tmp_bits |= 1;
+ }
+
+ for (i = 1; i < sg_count; i++) {
+ phys_addr = vtophys(sgl[i].ds_addr);
+ offset = phys_addr - trunc_page(phys_addr);
+
+ if (offset == 0) {
+			if (FALSE == pre_aligned) {
+				/*
+				 * This segment is aligned; if the previous
+				 * one was not aligned, we found a hole.
+				 */
+ found_hole = TRUE;
+ }
+ pre_aligned = TRUE;
+ } else {
+ tmp_bits |= 1ULL << i;
+ if (!pre_aligned) {
+ if (phys_addr != vtophys(sgl[i-1].ds_addr +
+ sgl[i-1].ds_len)) {
+ /*
+					 * Check whether this segment is
+					 * contiguous with the previous one;
+					 * if not, we found a hole.
+ */
+ found_hole = TRUE;
+ }
+ } else {
+ found_hole = TRUE;
+ }
+ pre_aligned = FALSE;
+ }
+ }
+
+ if (!found_hole) {
+ return (-1);
+ } else {
+ *bits = tmp_bits;
+ return 0;
+ }
+}
+
+/**
+ * Copy bus_dma segments to multiple page buffer, which requires
+ * the pages are compact composed except for the 1st and last pages.
+ */
+static void
+storvsc_xferbuf_prepare(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
+{
+ struct hv_storvsc_request *reqp = arg;
+ union ccb *ccb = reqp->ccb;
+ struct ccb_scsiio *csio = &ccb->csio;
+ struct storvsc_gpa_range *prplist;
+ int i;
+
+ prplist = &reqp->prp_list;
+ prplist->gpa_range.gpa_len = csio->dxfer_len;
+ prplist->gpa_range.gpa_ofs = segs[0].ds_addr & PAGE_MASK;
+
+ for (i = 0; i < nsegs; i++) {
+#ifdef INVARIANTS
+ if (nsegs > 1) {
+ if (i == 0) {
+ KASSERT((segs[i].ds_addr & PAGE_MASK) +
+ segs[i].ds_len == PAGE_SIZE,
+ ("invalid 1st page, ofs 0x%jx, len %zu",
+ (uintmax_t)segs[i].ds_addr,
+ segs[i].ds_len));
+ } else if (i == nsegs - 1) {
+ KASSERT((segs[i].ds_addr & PAGE_MASK) == 0,
+ ("invalid last page, ofs 0x%jx",
+ (uintmax_t)segs[i].ds_addr));
+ } else {
+ KASSERT((segs[i].ds_addr & PAGE_MASK) == 0 &&
+ segs[i].ds_len == PAGE_SIZE,
+ ("not a full page, ofs 0x%jx, len %zu",
+ (uintmax_t)segs[i].ds_addr,
+ segs[i].ds_len));
+ }
+ }
+#endif
+ prplist->gpa_page[i] = atop(segs[i].ds_addr);
+ }
+ reqp->prp_cnt = nsegs;
+}
+
+/**
+ * @brief Fill in a request structure based on a CAM control block
+ *
+ * Fills in a request structure based on the contents of a CAM control
+ * block. The request structure holds the payload information for
+ * VSCSI protocol request.
+ *
+ * @param ccb pointer to a CAM control block
+ * @param reqp pointer to a request structure
+ */
+static int
+create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp)
+{
+ struct ccb_scsiio *csio = &ccb->csio;
+ uint64_t phys_addr;
+ uint32_t pfn;
+ uint64_t not_aligned_seg_bits = 0;
+ int error;
+
+ /* refer to struct vmscsi_req for meanings of these two fields */
+ reqp->vstor_packet.u.vm_srb.port =
+ cam_sim_unit(xpt_path_sim(ccb->ccb_h.path));
+ reqp->vstor_packet.u.vm_srb.path_id =
+ cam_sim_bus(xpt_path_sim(ccb->ccb_h.path));
+
+ reqp->vstor_packet.u.vm_srb.target_id = ccb->ccb_h.target_id;
+ reqp->vstor_packet.u.vm_srb.lun = ccb->ccb_h.target_lun;
+
+ reqp->vstor_packet.u.vm_srb.cdb_len = csio->cdb_len;
+ if(ccb->ccb_h.flags & CAM_CDB_POINTER) {
+ memcpy(&reqp->vstor_packet.u.vm_srb.u.cdb, csio->cdb_io.cdb_ptr,
+ csio->cdb_len);
+ } else {
+ memcpy(&reqp->vstor_packet.u.vm_srb.u.cdb, csio->cdb_io.cdb_bytes,
+ csio->cdb_len);
+ }
+
+ if (hv_storvsc_use_win8ext_flags) {
+ reqp->vstor_packet.u.vm_srb.win8_extension.time_out_value = 60;
+ reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |=
+ SRB_FLAGS_DISABLE_SYNCH_TRANSFER;
+ }
+ switch (ccb->ccb_h.flags & CAM_DIR_MASK) {
+ case CAM_DIR_OUT:
+ reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE;
+ if (hv_storvsc_use_win8ext_flags) {
+ reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |=
+ SRB_FLAGS_DATA_OUT;
+ }
+ break;
+ case CAM_DIR_IN:
+ reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE;
+ if (hv_storvsc_use_win8ext_flags) {
+ reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |=
+ SRB_FLAGS_DATA_IN;
+ }
+ break;
+ case CAM_DIR_NONE:
+ reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE;
+ if (hv_storvsc_use_win8ext_flags) {
+ reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |=
+ SRB_FLAGS_NO_DATA_TRANSFER;
+ }
+ break;
+ default:
+ printf("Error: unexpected data direction: 0x%x\n",
+ ccb->ccb_h.flags & CAM_DIR_MASK);
+ return (EINVAL);
+ }
+
+ reqp->sense_data = &csio->sense_data;
+ reqp->sense_info_len = csio->sense_len;
+
+ reqp->ccb = ccb;
+ ccb->ccb_h.spriv_ptr0 = reqp;
+
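+	/* Commands with no data transfer need no buffer setup. */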
+ if (0 == csio->dxfer_len) {
+ return (0);
+ }
+
+ switch (ccb->ccb_h.flags & CAM_DATA_MASK) {
+ case CAM_DATA_BIO:
+ case CAM_DATA_VADDR:
+ error = bus_dmamap_load_ccb(reqp->softc->storvsc_req_dtag,
+ reqp->data_dmap, ccb, storvsc_xferbuf_prepare, reqp,
+ BUS_DMA_NOWAIT);
+ if (error) {
+ xpt_print(ccb->ccb_h.path,
+ "bus_dmamap_load_ccb failed: %d\n", error);
+ return (error);
+ }
+ if ((ccb->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO)
+ reqp->softc->sysctl_data.data_bio_cnt++;
+ else
+ reqp->softc->sysctl_data.data_vaddr_cnt++;
+ break;
+
+ case CAM_DATA_SG:
+ {
+ struct storvsc_gpa_range *prplist;
+ int i = 0;
+ int offset = 0;
+ int ret;
+
+ bus_dma_segment_t *storvsc_sglist =
+ (bus_dma_segment_t *)ccb->csio.data_ptr;
+ u_int16_t storvsc_sg_count = ccb->csio.sglist_cnt;
+
+ prplist = &reqp->prp_list;
+ prplist->gpa_range.gpa_len = csio->dxfer_len;
+
+ printf("Storvsc: get SG I/O operation, %d\n",
+ reqp->vstor_packet.u.vm_srb.data_in);
+
+		if (storvsc_sg_count > STORVSC_DATA_SEGCNT_MAX) {
+			printf("Storvsc: %d segments are too many; "
+			    "only %d segments are supported\n",
+ storvsc_sg_count, STORVSC_DATA_SEGCNT_MAX);
+ return (EINVAL);
+ }
+
+ /*
+		 * We currently roll our own bounce buffer handling. Ideally
+		 * we should use the BUS_DMA(9) framework, but the current
+		 * BUS_DMA code has no callback API to check the page
+		 * alignment of the middle segments before busdma decides
+		 * whether a bounce buffer is needed for a particular segment.
+		 * There is a callback, "bus_dma_filter_t *filter", but its
+		 * parameters are not sufficient for the storvsc driver.
+		 * TODO:
+		 * Add a page alignment check to the BUS_DMA(9) callback. Once
+		 * this is complete, switch the following code to use
+		 * BUS_DMA(9) for storvsc bounce buffer support.
+ */
+ /* check if we need to create bounce buffer */
+ ret = storvsc_check_bounce_buffer_sgl(storvsc_sglist,
+ storvsc_sg_count, &not_aligned_seg_bits);
+ if (ret != -1) {
+ reqp->bounce_sgl =
+ storvsc_create_bounce_buffer(storvsc_sg_count,
+ reqp->vstor_packet.u.vm_srb.data_in);
+ if (NULL == reqp->bounce_sgl) {
+ printf("Storvsc_error: "
+ "create bounce buffer failed.\n");
+ return (ENOMEM);
+ }
+
+ reqp->bounce_sgl_count = storvsc_sg_count;
+ reqp->not_aligned_seg_bits = not_aligned_seg_bits;
+
+ /*
+			 * If this is a write, copy the original data
+			 * to the bounce buffer.
+ */
+ if (WRITE_TYPE == reqp->vstor_packet.u.vm_srb.data_in) {
+ storvsc_copy_sgl_to_bounce_buf(
+ reqp->bounce_sgl,
+ storvsc_sglist,
+ storvsc_sg_count,
+ reqp->not_aligned_seg_bits);
+ }
+
+			/* Translate virtual addresses into physical frame numbers. */
+			if (reqp->not_aligned_seg_bits & 0x1) {
+ phys_addr =
+ vtophys(reqp->bounce_sgl->sg_segs[0].ss_paddr);
+			} else {
+ phys_addr =
+ vtophys(storvsc_sglist[0].ds_addr);
+ }
+ prplist->gpa_range.gpa_ofs = phys_addr & PAGE_MASK;
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ prplist->gpa_page[0] = pfn;
+
+ for (i = 1; i < storvsc_sg_count; i++) {
+ if (reqp->not_aligned_seg_bits & (1 << i)) {
+ phys_addr =
+ vtophys(reqp->bounce_sgl->sg_segs[i].ss_paddr);
+ } else {
+ phys_addr =
+ vtophys(storvsc_sglist[i].ds_addr);
+ }
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ prplist->gpa_page[i] = pfn;
+ }
+ reqp->prp_cnt = i;
+ } else {
+ phys_addr = vtophys(storvsc_sglist[0].ds_addr);
+
+ prplist->gpa_range.gpa_ofs = phys_addr & PAGE_MASK;
+
+ for (i = 0; i < storvsc_sg_count; i++) {
+ phys_addr = vtophys(storvsc_sglist[i].ds_addr);
+ pfn = phys_addr >> PAGE_SHIFT;
+ prplist->gpa_page[i] = pfn;
+ }
+ reqp->prp_cnt = i;
+
+			/* Check whether the last segment crosses a page boundary. */
+ offset = phys_addr & PAGE_MASK;
+ if (offset) {
+ /* Add one more PRP entry */
+ phys_addr =
+ vtophys(storvsc_sglist[i-1].ds_addr +
+ PAGE_SIZE - offset);
+ pfn = phys_addr >> PAGE_SHIFT;
+ prplist->gpa_page[i] = pfn;
+ reqp->prp_cnt++;
+ }
+
+ reqp->bounce_sgl_count = 0;
+ }
+ reqp->softc->sysctl_data.data_sg_cnt++;
+ break;
+ }
+ default:
+		printf("Unknown flags: %d\n", ccb->ccb_h.flags);
+ return(EINVAL);
+ }
+
+ return(0);
+}
+
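+/*
+ * Sanity check an INQUIRY response: treat "no device" peripheral types
+ * and bad-LU qualifiers as invalid.
+ */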
+static uint32_t
+is_scsi_valid(const struct scsi_inquiry_data *inq_data)
+{
+ u_int8_t type;
+
+ type = SID_TYPE(inq_data);
+ if (type == T_NODEVICE)
+ return (0);
+ if (SID_QUAL(inq_data) == SID_QUAL_BAD_LU)
+ return (0);
+ return (1);
+}
+
+/**
+ * @brief completion function before returning to CAM
+ *
+ * I/O process has been completed and the result needs
+ * to be passed to the CAM layer.
+ * Free resources related to this request.
+ *
+ * @param reqp pointer to a request structure
+ */
+static void
+storvsc_io_done(struct hv_storvsc_request *reqp)
+{
+ union ccb *ccb = reqp->ccb;
+ struct ccb_scsiio *csio = &ccb->csio;
+ struct storvsc_softc *sc = reqp->softc;
+ struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb;
+ bus_dma_segment_t *ori_sglist = NULL;
+ int ori_sg_count = 0;
+ const struct scsi_generic *cmd;
+
+ /* destroy bounce buffer if it is used */
+ if (reqp->bounce_sgl_count) {
+ ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr;
+ ori_sg_count = ccb->csio.sglist_cnt;
+
+ /*
+ * If it is READ operation, we should copy back the data
+ * to original SG list.
+ */
+ if (READ_TYPE == reqp->vstor_packet.u.vm_srb.data_in) {
+ storvsc_copy_from_bounce_buf_to_sgl(ori_sglist,
+ ori_sg_count,
+ reqp->bounce_sgl,
+ reqp->not_aligned_seg_bits);
+ }
+
+ storvsc_destroy_bounce_buffer(reqp->bounce_sgl);
+ reqp->bounce_sgl_count = 0;
+ }
+
+ if (reqp->retries > 0) {
+ mtx_lock(&sc->hs_lock);
+#if HVS_TIMEOUT_TEST
+ xpt_print(ccb->ccb_h.path,
+ "%u: IO returned after timeout, "
+ "waking up timer handler if any.\n", ticks);
+ mtx_lock(&reqp->event.mtx);
+ cv_signal(&reqp->event.cv);
+ mtx_unlock(&reqp->event.mtx);
+#endif
+ reqp->retries = 0;
+ xpt_print(ccb->ccb_h.path,
+ "%u: IO returned after timeout, "
+ "stopping timer if any.\n", ticks);
+ mtx_unlock(&sc->hs_lock);
+ }
+
+#ifdef notyet
+ /*
+ * callout_drain() will wait for the timer handler to finish
+ * if it is running. So we don't need any lock to synchronize
+ * between this routine and the timer handler.
+ * Note that we need to make sure reqp is not freed when timer
+ * handler is using or will use it.
+ */
+ if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) {
+ callout_drain(&reqp->callout);
+ }
+#endif
+ cmd = (const struct scsi_generic *)
+ ((ccb->ccb_h.flags & CAM_CDB_POINTER) ?
+ csio->cdb_io.cdb_ptr : csio->cdb_io.cdb_bytes);
+
+ ccb->ccb_h.status &= ~CAM_SIM_QUEUED;
+ ccb->ccb_h.status &= ~CAM_STATUS_MASK;
+ int srb_status = SRB_STATUS(vm_srb->srb_status);
+#ifdef DIAGNOSTIC
+ if (hv_storvsc_srb_status != -1) {
+ srb_status = SRB_STATUS(hv_storvsc_srb_status & 0x3f);
+ hv_storvsc_srb_status = -1;
+ }
+#endif /* DIAGNOSTIC */
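+	/* Map the host's SRB and SCSI status onto CAM status codes. */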
+ if (vm_srb->scsi_status == SCSI_STATUS_OK) {
+ if (srb_status != SRB_STATUS_SUCCESS) {
+ bool log_error = true;
+ switch (srb_status) {
+ case SRB_STATUS_PENDING:
+ /* We should never get this */
+ panic("storvsc_io_done: SRB_STATUS_PENDING");
+ break;
+ case SRB_STATUS_ABORTED:
+ /*
+ * storvsc doesn't support aborts yet
+ * but if we ever get this status
+ * the I/O is complete - treat it as a
+ * timeout
+ */
+ ccb->ccb_h.status |= CAM_CMD_TIMEOUT;
+ break;
+ case SRB_STATUS_ABORT_FAILED:
+ /* We should never get this */
+ panic("storvsc_io_done: SRB_STATUS_ABORT_FAILED");
+ break;
+ case SRB_STATUS_ERROR:
+ /*
+ * We should never get this.
+ * Treat it as a CAM_UNREC_HBA_ERROR.
+ * It will be retried
+ */
+ ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+ break;
+ case SRB_STATUS_BUSY:
+ /* Host is busy. Delay and retry */
+ ccb->ccb_h.status |= CAM_BUSY;
+ break;
+ case SRB_STATUS_INVALID_REQUEST:
+ case SRB_STATUS_INVALID_PATH_ID:
+ case SRB_STATUS_NO_DEVICE:
+ case SRB_STATUS_INVALID_TARGET_ID:
+ /*
+ * These indicate an invalid address
+ * and really should never be seen.
+ * A CAM_PATH_INVALID could be
+ * used here but I want to run
+ * down retries. Do a CAM_BUSY
+ * since the host might be having issues.
+ */
+ ccb->ccb_h.status |= CAM_BUSY;
+ break;
+ case SRB_STATUS_TIMEOUT:
+ case SRB_STATUS_COMMAND_TIMEOUT:
+ /* The backend has timed this out */
+ ccb->ccb_h.status |= CAM_BUSY;
+ break;
+ /* Some old pSCSI errors below */
+ case SRB_STATUS_SELECTION_TIMEOUT:
+ case SRB_STATUS_MESSAGE_REJECTED:
+ case SRB_STATUS_PARITY_ERROR:
+ case SRB_STATUS_NO_HBA:
+ case SRB_STATUS_DATA_OVERRUN:
+ case SRB_STATUS_UNEXPECTED_BUS_FREE:
+ case SRB_STATUS_PHASE_SEQUENCE_FAILURE:
+ /*
+				 * Old pSCSI responses that we should never
+				 * see. If we do, treat them as a
+				 * CAM_UNREC_HBA_ERROR, which will be retried.
+ */
+ ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+ break;
+ case SRB_STATUS_BUS_RESET:
+ ccb->ccb_h.status |= CAM_SCSI_BUS_RESET;
+ break;
+ case SRB_STATUS_BAD_SRB_BLOCK_LENGTH:
+ /*
+ * The request block is malformed and
+ * I doubt it is from the guest. Just retry.
+ */
+ ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+ break;
+ /* Not used statuses just retry */
+ case SRB_STATUS_REQUEST_FLUSHED:
+ case SRB_STATUS_BAD_FUNCTION:
+ case SRB_STATUS_NOT_POWERED:
+ ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+ break;
+ case SRB_STATUS_INVALID_LUN:
+ /*
+ * Don't log an EMS for this response since
+ * there is no device at this LUN. This is a
+ * normal and expected response when a device
+ * is detached.
+ */
+ ccb->ccb_h.status |= CAM_DEV_NOT_THERE;
+ log_error = false;
+ break;
+ case SRB_STATUS_ERROR_RECOVERY:
+ case SRB_STATUS_LINK_DOWN:
+ /*
+				 * We never expect these from the host,
+				 * but if we ever do, retry after a delay.
+ */
+ ccb->ccb_h.status |= CAM_BUSY;
+ break;
+ default:
+ /*
+				 * An undefined response: assert on
+				 * debug builds, else retry.
+ */
+ ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR;
+ KASSERT(srb_status <= SRB_STATUS_LINK_DOWN,
+ ("storvsc: %s, unexpected srb_status of 0x%x",
+ __func__, srb_status));
+ break;
+ }
+ if (log_error) {
+ xpt_print(ccb->ccb_h.path, "The hypervisor's I/O adapter "
+ "driver received an unexpected response code 0x%x "
+ "for operation: %s. If this continues to occur, "
+ "report the condition to your hypervisor vendor so "
+ "they can rectify the issue.\n", srb_status,
+ scsi_op_desc(cmd->opcode, NULL));
+ }
+ } else {
+ ccb->ccb_h.status |= CAM_REQ_CMP;
+ }
+
+ if (cmd->opcode == INQUIRY &&
+ srb_status == SRB_STATUS_SUCCESS) {
+ int resp_xfer_len, resp_buf_len, data_len;
+ uint8_t *resp_buf = (uint8_t *)csio->data_ptr;
+ struct scsi_inquiry_data *inq_data =
+ (struct scsi_inquiry_data *)csio->data_ptr;
+
+ /* Get the buffer length reported by host */
+ resp_xfer_len = vm_srb->transfer_len;
+
+ /* Get the available buffer length */
+ resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0;
+ data_len = (resp_buf_len < resp_xfer_len) ?
+ resp_buf_len : resp_xfer_len;
+ if (bootverbose && data_len >= 5) {
+ xpt_print(ccb->ccb_h.path, "storvsc inquiry "
+ "(%d) [%x %x %x %x %x ... ]\n", data_len,
+ resp_buf[0], resp_buf[1], resp_buf[2],
+ resp_buf[3], resp_buf[4]);
+ }
+ /*
+			 * XXX: Hyper-V (since Win2012R2) responds to INQUIRY
+			 * with an unknown version (0) for the GEN-2 DVD device.
+			 * Manually set the version number to SPC3 in order to
+			 * ask CAM to continue probing with "PROBE_REPORT_LUNS";
+			 * see probedone() in scsi_xpt.c.
+ */
+ if (SID_TYPE(inq_data) == T_CDROM &&
+ inq_data->version == 0 &&
+ (vmstor_proto_version >= VMSTOR_PROTOCOL_VERSION_WIN8)) {
+ inq_data->version = SCSI_REV_SPC3;
+ if (bootverbose) {
+ xpt_print(ccb->ccb_h.path,
+ "set version from 0 to %d\n",
+ inq_data->version);
+ }
+ }
+ /*
+ * XXX: Manually fix the wrong response returned from WS2012
+ */
+ if (!is_scsi_valid(inq_data) &&
+ (vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8_1 ||
+ vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8 ||
+ vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN7)) {
+ if (data_len >= 4 &&
+ (resp_buf[2] == 0 || resp_buf[3] == 0)) {
+ resp_buf[2] = SCSI_REV_SPC3;
+ resp_buf[3] = 2; // resp fmt must be 2
+ if (bootverbose)
+ xpt_print(ccb->ccb_h.path,
+ "fix version and resp fmt for 0x%x\n",
+ vmstor_proto_version);
+ }
+ } else if (data_len >= SHORT_INQUIRY_LENGTH) {
+ char vendor[16];
+
+ cam_strvis(vendor, inq_data->vendor,
+ sizeof(inq_data->vendor), sizeof(vendor));
+ /*
+ * XXX: Upgrade SPC2 to SPC3 if host is WIN8 or
+ * WIN2012 R2 in order to support UNMAP feature.
+ */
+ if (!strncmp(vendor, "Msft", 4) &&
+ SID_ANSI_REV(inq_data) == SCSI_REV_SPC2 &&
+ (vmstor_proto_version ==
+ VMSTOR_PROTOCOL_VERSION_WIN8_1 ||
+ vmstor_proto_version ==
+ VMSTOR_PROTOCOL_VERSION_WIN8)) {
+ inq_data->version = SCSI_REV_SPC3;
+ if (bootverbose) {
+ xpt_print(ccb->ccb_h.path,
+ "storvsc upgrades "
+ "SPC2 to SPC3\n");
+ }
+ }
+ }
+ }
+ } else {
+ /**
+		 * On some Windows hosts the TEST_UNIT_READY command can return
+		 * SRB_STATUS_ERROR and sense data, for example, asc=0x3a,1
+		 * "(Medium not present - tray closed)". This error can be
+		 * ignored since the command is sent to the host periodically.
+ */
+ boolean_t unit_not_ready = \
+ vm_srb->scsi_status == SCSI_STATUS_CHECK_COND &&
+ cmd->opcode == TEST_UNIT_READY &&
+ srb_status == SRB_STATUS_ERROR;
+ if (!unit_not_ready && bootverbose) {
+ mtx_lock(&sc->hs_lock);
+ xpt_print(ccb->ccb_h.path,
+ "storvsc scsi_status = %d, srb_status = %d\n",
+ vm_srb->scsi_status, srb_status);
+ mtx_unlock(&sc->hs_lock);
+ }
+ ccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR;
+ }
+
+ ccb->csio.scsi_status = (vm_srb->scsi_status & 0xFF);
+ if (srb_status == SRB_STATUS_SUCCESS ||
+ srb_status == SRB_STATUS_DATA_OVERRUN)
+ ccb->csio.resid = ccb->csio.dxfer_len - vm_srb->transfer_len;
+ else
+ ccb->csio.resid = ccb->csio.dxfer_len;
+
+ if ((vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID) != 0 &&
+ reqp->sense_info_len != 0) {
+ csio->sense_resid = csio->sense_len - reqp->sense_info_len;
+ ccb->ccb_h.status |= CAM_AUTOSNS_VALID;
+ }
+
+ mtx_lock(&sc->hs_lock);
+ if (reqp->softc->hs_frozen == 1) {
+ xpt_print(ccb->ccb_h.path,
+ "%u: storvsc unfreezing softc 0x%p.\n",
+ ticks, reqp->softc);
+ ccb->ccb_h.status |= CAM_RELEASE_SIMQ;
+ reqp->softc->hs_frozen = 0;
+ }
+ storvsc_free_request(sc, reqp);
+ mtx_unlock(&sc->hs_lock);
+
+ xpt_done_direct(ccb);
+}
+
+/**
+ * @brief Free a request structure
+ *
+ * Free a request structure by returning it to the free list
+ *
+ * @param sc pointer to a softc
+ * @param reqp pointer to a request structure
+ */
+static void
+storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp)
+{
+
+ LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link);
+}
+
+/**
+ * @brief Determine type of storage device from GUID
+ *
+ * Using the type GUID, determine if this is a StorVSC (paravirtual
+ * SCSI) or BlkVSC (paravirtual IDE) device.
+ *
+ * @param dev a device
+ * @returns an enum hv_storage_type
+ */
+static enum hv_storage_type
+storvsc_get_storage_type(device_t dev)
+{
+ device_t parent = device_get_parent(dev);
+
+ if (VMBUS_PROBE_GUID(parent, dev, &gBlkVscDeviceType) == 0)
+ return DRIVER_BLKVSC;
+ if (VMBUS_PROBE_GUID(parent, dev, &gStorVscDeviceType) == 0)
+ return DRIVER_STORVSC;
+ return DRIVER_UNKNOWN;
+}
+
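+/* PCI IDs of the emulated Intel PIIX4 IDE controller backing the simulated ATA disks. */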
+#define PCI_VENDOR_INTEL 0x8086
+#define PCI_PRODUCT_PIIX4 0x7111
+
+static void
+storvsc_ada_probe_veto(void *arg __unused, struct cam_path *path,
+ struct ata_params *ident_buf __unused, int *veto)
+{
+
+ /*
+ * The ATA disks are shared with the controllers managed
+ * by this driver, so veto the ATA disks' attachment; the
+	 * ATA disks will be attached as SCSI disks once this driver
+	 * has attached.
+ */
+ if (path->device->protocol == PROTO_ATA) {
+ struct ccb_pathinq cpi;
+
+ xpt_path_inq(&cpi, path);
+ if (cpi.ccb_h.status == CAM_REQ_CMP &&
+ cpi.hba_vendor == PCI_VENDOR_INTEL &&
+ cpi.hba_device == PCI_PRODUCT_PIIX4) {
+ (*veto)++;
+ if (bootverbose) {
+ xpt_print(path,
+ "Disable ATA disks on "
+ "simulated ATA controller (0x%04x%04x)\n",
+ cpi.hba_device, cpi.hba_vendor);
+ }
+ }
+ }
+}
+
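+/* Register the ATA disk probe veto handler only when running as a Hyper-V guest. */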
+static void
+storvsc_sysinit(void *arg __unused)
+{
+ if (vm_guest == VM_GUEST_HV) {
+ storvsc_handler_tag = EVENTHANDLER_REGISTER(ada_probe_veto,
+ storvsc_ada_probe_veto, NULL, EVENTHANDLER_PRI_ANY);
+ }
+}
+SYSINIT(storvsc_sys_init, SI_SUB_DRIVERS, SI_ORDER_SECOND, storvsc_sysinit,
+ NULL);
+
+static void
+storvsc_sysuninit(void *arg __unused)
+{
+ if (storvsc_handler_tag != NULL)
+ EVENTHANDLER_DEREGISTER(ada_probe_veto, storvsc_handler_tag);
+}
+SYSUNINIT(storvsc_sys_uninit, SI_SUB_DRIVERS, SI_ORDER_SECOND,
+ storvsc_sysuninit, NULL);
diff --git a/sys/dev/hyperv/storvsc/hv_vstorage.h b/sys/dev/hyperv/storvsc/hv_vstorage.h
new file mode 100644
index 000000000000..f1d4c1dfd2e2
--- /dev/null
+++ b/sys/dev/hyperv/storvsc/hv_vstorage.h
@@ -0,0 +1,311 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2009-2012,2017 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __HV_VSTORAGE_H__
+#define __HV_VSTORAGE_H__
+
+/*
+ * Major/minor macros. Minor version is in LSB, meaning that earlier flat
+ * version numbers will be interpreted as "0.x" (i.e., 1 becomes 0.1).
+ */
+
+#define VMSTOR_PROTOCOL_MAJOR(VERSION_) (((VERSION_) >> 8) & 0xff)
+#define VMSTOR_PROTOCOL_MINOR(VERSION_) (((VERSION_) ) & 0xff)
+#define VMSTOR_PROTOCOL_VERSION(MAJOR_, MINOR_) ((((MAJOR_) & 0xff) << 8) | \
+ (((MINOR_) & 0xff) ))
+
+#define VMSTOR_PROTOCOL_VERSION_WIN6 VMSTOR_PROTOCOL_VERSION(2, 0)
+#define VMSTOR_PROTOCOL_VERSION_WIN7 VMSTOR_PROTOCOL_VERSION(4, 2)
+#define VMSTOR_PROTOCOL_VERSION_WIN8 VMSTOR_PROTOCOL_VERSION(5, 1)
+#define VMSTOR_PROTOCOL_VERSION_WIN8_1 VMSTOR_PROTOCOL_VERSION(6, 0)
+#define VMSTOR_PROTOCOL_VERSION_WIN10 VMSTOR_PROTOCOL_VERSION(6, 2)
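+
+/*
+ * Worked example of the encoding above (illustrative only):
+ * VMSTOR_PROTOCOL_VERSION(5, 1) packs to ((5 & 0xff) << 8) | (1 & 0xff) ==
+ * 0x0501, so VMSTOR_PROTOCOL_MAJOR(0x0501) == 5 and
+ * VMSTOR_PROTOCOL_MINOR(0x0501) == 1, i.e. the WIN8 protocol ("5.1").
+ */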
+/*
+ * Invalid version.
+ */
+#define VMSTOR_INVALID_PROTOCOL_VERSION -1
+
+/*
+ * Version history:
+ * V1 Beta 0.1
+ * V1 RC < 2008/1/31 1.0
+ * V1 RC > 2008/1/31 2.0
+ * Win7: 4.2
+ * Win8: 5.1
+ */
+
+#define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(5, 1)
+
+/**
+ * Packet operation codes describing virtual storage requests.
+ */
+enum vstor_packet_ops {
+ VSTOR_OPERATION_COMPLETEIO = 1,
+ VSTOR_OPERATION_REMOVEDEVICE = 2,
+ VSTOR_OPERATION_EXECUTESRB = 3,
+ VSTOR_OPERATION_RESETLUN = 4,
+ VSTOR_OPERATION_RESETADAPTER = 5,
+ VSTOR_OPERATION_RESETBUS = 6,
+ VSTOR_OPERATION_BEGININITIALIZATION = 7,
+ VSTOR_OPERATION_ENDINITIALIZATION = 8,
+ VSTOR_OPERATION_QUERYPROTOCOLVERSION = 9,
+ VSTOR_OPERATION_QUERYPROPERTIES = 10,
+ VSTOR_OPERATION_ENUMERATE_BUS = 11,
+ VSTOR_OPERATION_FCHBA_DATA = 12,
+ VSTOR_OPERATION_CREATE_MULTI_CHANNELS = 13,
+ VSTOR_OPERATION_MAXIMUM = 13
+};
+
+
+/*
+ * Platform-neutral description of a SCSI request -
+ * this remains the same across the wire regardless of 32/64 bit.
+ * Note: it's patterned off the Windows DDK SCSI_PASS_THROUGH structure.
+ */
+
+#define CDB16GENERIC_LENGTH 0x10
+#define SENSE_BUFFER_SIZE 0x14
+#define MAX_DATA_BUFFER_LENGTH_WITH_PADDING 0x14
+
+#define POST_WIN7_STORVSC_SENSE_BUFFER_SIZE 0x14
+#define PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE 0x12
+
+
+struct vmscsi_win8_extension {
+ /*
+ * The following were added in Windows 8
+ */
+ uint16_t reserve;
+ uint8_t queue_tag;
+ uint8_t queue_action;
+ uint32_t srb_flags;
+ uint32_t time_out_value;
+ uint32_t queue_sort_ey;
+} __packed;
+
+struct vmscsi_req {
+ uint16_t length;
+ uint8_t srb_status;
+ uint8_t scsi_status;
+
+ /* HBA number, set to the order number detected by initiator. */
+ uint8_t port;
+ /* SCSI bus number or bus_id, different from CAM's path_id. */
+ uint8_t path_id;
+
+ uint8_t target_id;
+ uint8_t lun;
+
+ uint8_t cdb_len;
+ uint8_t sense_info_len;
+ uint8_t data_in;
+ uint8_t reserved;
+
+ uint32_t transfer_len;
+
+ union {
+ uint8_t cdb[CDB16GENERIC_LENGTH];
+
+ uint8_t sense_data[SENSE_BUFFER_SIZE];
+
+ uint8_t reserved_array[MAX_DATA_BUFFER_LENGTH_WITH_PADDING];
+ } u;
+
+ /*
+ * The following was added in win8.
+ */
+ struct vmscsi_win8_extension win8_extension;
+
+} __packed;
+
+/**
+ * This structure is sent during the initialization phase to get the different
+ * properties of the channel.
+ */
+
+struct vmstor_chan_props {
+ uint16_t proto_ver;
+ uint8_t path_id;
+ uint8_t target_id;
+
+ uint16_t max_channel_cnt;
+
+ /**
+ * Note: port number is only really known on the client side
+ */
+ uint16_t port;
+ uint32_t flags;
+ uint32_t max_transfer_bytes;
+
+ /**
+ * This id is unique for each channel and will correspond with
+ * vendor specific data in the inquiry data.
+ */
+ uint64_t unique_id;
+
+} __packed;
+
+/**
+ * This structure is sent during the storage protocol negotiations.
+ */
+
+struct vmstor_proto_ver
+{
+ /**
+ * Major (MSW) and minor (LSW) version numbers.
+ */
+ uint16_t major_minor;
+
+ uint16_t revision; /* always zero */
+} __packed;
+
+/**
+ * Channel Property Flags
+ */
+
+#define STORAGE_CHANNEL_REMOVABLE_FLAG 0x1
+#define STORAGE_CHANNEL_EMULATED_IDE_FLAG 0x2
+
+
+struct vstor_packet {
+ /**
+ * Requested operation type
+ */
+ enum vstor_packet_ops operation;
+
+ /*
+ * Flags - see below for values
+ */
+ uint32_t flags;
+
+ /**
+ * Status of the request returned from the server side.
+ */
+ uint32_t status;
+
+ union
+ {
+ /**
+ * Structure used to forward SCSI commands from the client to
+ * the server.
+ */
+ struct vmscsi_req vm_srb;
+
+ /**
+ * Structure used to query channel properties.
+ */
+ struct vmstor_chan_props chan_props;
+
+ /**
+ * Used during version negotiations.
+ */
+ struct vmstor_proto_ver version;
+
+ /**
+ * Number of multichannels to create
+ */
+ uint16_t multi_channels_cnt;
+ } u;
+
+} __packed;
+
+
+/**
+ * SRB (SCSI Request Block) Status Codes
+ */
+#define SRB_STATUS_PENDING 0x00
+#define SRB_STATUS_SUCCESS 0x01
+#define SRB_STATUS_ABORTED 0x02
+#define SRB_STATUS_ABORT_FAILED 0x03
+#define SRB_STATUS_ERROR 0x04
+#define SRB_STATUS_BUSY 0x05
+#define SRB_STATUS_INVALID_REQUEST 0x06
+#define SRB_STATUS_INVALID_PATH_ID 0x07
+#define SRB_STATUS_NO_DEVICE 0x08
+#define SRB_STATUS_TIMEOUT 0x09
+#define SRB_STATUS_SELECTION_TIMEOUT 0x0A
+#define SRB_STATUS_COMMAND_TIMEOUT 0x0B
+#define SRB_STATUS_MESSAGE_REJECTED 0x0D
+#define SRB_STATUS_BUS_RESET 0x0E
+#define SRB_STATUS_PARITY_ERROR 0x0F
+#define SRB_STATUS_REQUEST_SENSE_FAILED 0x10
+#define SRB_STATUS_NO_HBA 0x11
+#define SRB_STATUS_DATA_OVERRUN 0x12
+#define SRB_STATUS_UNEXPECTED_BUS_FREE 0x13
+#define SRB_STATUS_PHASE_SEQUENCE_FAILURE 0x14
+#define SRB_STATUS_BAD_SRB_BLOCK_LENGTH 0x15
+#define SRB_STATUS_REQUEST_FLUSHED 0x16
+#define SRB_STATUS_INVALID_LUN 0x20
+#define SRB_STATUS_INVALID_TARGET_ID 0x21
+#define SRB_STATUS_BAD_FUNCTION 0x22
+#define SRB_STATUS_ERROR_RECOVERY 0x23
+#define SRB_STATUS_NOT_POWERED 0x24
+#define SRB_STATUS_LINK_DOWN 0x25
+/**
+ * SRB Status Masks (can be combined with above status codes)
+ */
+#define SRB_STATUS_QUEUE_FROZEN 0x40
+#define SRB_STATUS_AUTOSENSE_VALID 0x80
+
+#define SRB_STATUS(status) \
+ ((status) & ~(SRB_STATUS_AUTOSENSE_VALID | SRB_STATUS_QUEUE_FROZEN))
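+
+/*
+ * Example (illustrative only): a status byte of
+ * (SRB_STATUS_ERROR | SRB_STATUS_AUTOSENSE_VALID), i.e. 0x84, masks down to
+ * SRB_STATUS(0x84) == SRB_STATUS_ERROR, since SRB_STATUS() strips the
+ * QUEUE_FROZEN (0x40) and AUTOSENSE_VALID (0x80) modifier bits.
+ */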
+/*
+ * SRB Flag Bits
+ */
+
+#define SRB_FLAGS_QUEUE_ACTION_ENABLE 0x00000002
+#define SRB_FLAGS_DISABLE_DISCONNECT 0x00000004
+#define SRB_FLAGS_DISABLE_SYNCH_TRANSFER 0x00000008
+#define SRB_FLAGS_BYPASS_FROZEN_QUEUE 0x00000010
+#define SRB_FLAGS_DISABLE_AUTOSENSE 0x00000020
+#define SRB_FLAGS_DATA_IN 0x00000040
+#define SRB_FLAGS_DATA_OUT 0x00000080
+#define SRB_FLAGS_NO_DATA_TRANSFER 0x00000000
+#define SRB_FLAGS_UNSPECIFIED_DIRECTION (SRB_FLAGS_DATA_IN | SRB_FLAGS_DATA_OUT)
+#define SRB_FLAGS_NO_QUEUE_FREEZE 0x00000100
+#define SRB_FLAGS_ADAPTER_CACHE_ENABLE 0x00000200
+#define SRB_FLAGS_FREE_SENSE_BUFFER 0x00000400
+/**
+ * Packet flags
+ */
+
+/**
+ * This flag indicates that the server should send back a completion for this
+ * packet.
+ */
+#define REQUEST_COMPLETION_FLAG 0x1
+
+/**
+ * This is the set of flags that the vsc can set in any packets it sends
+ */
+#define VSC_LEGAL_FLAGS (REQUEST_COMPLETION_FLAG)
+
+#endif /* __HV_VSTORAGE_H__ */
diff --git a/sys/dev/hyperv/utilities/hv_kvp.c b/sys/dev/hyperv/utilities/hv_kvp.c
new file mode 100644
index 000000000000..8da0936f6cd7
--- /dev/null
+++ b/sys/dev/hyperv/utilities/hv_kvp.c
@@ -0,0 +1,920 @@
+/*-
+ * Copyright (c) 2014,2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Author: Sainath Varanasi.
+ * Date: 4/2012
+ * Email: bsdic@microsoft.com
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/reboot.h>
+#include <sys/lock.h>
+#include <sys/taskqueue.h>
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/kthread.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/un.h>
+#include <sys/endian.h>
+#include <sys/_null.h>
+#include <sys/sema.h>
+#include <sys/signal.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/mutex.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/utilities/hv_utilreg.h>
+#include <dev/hyperv/utilities/vmbus_icreg.h>
+#include <dev/hyperv/utilities/vmbus_icvar.h>
+
+#include "unicode.h"
+#include "hv_kvp.h"
+#include "vmbus_if.h"
+
+/* hv_kvp defines */
+#define BUFFERSIZE sizeof(struct hv_kvp_msg)
+#define kvp_hdr hdr.kvp_hdr
+
+#define KVP_FWVER_MAJOR 3
+#define KVP_FWVER VMBUS_IC_VERSION(KVP_FWVER_MAJOR, 0)
+
+#define KVP_MSGVER_MAJOR 4
+#define KVP_MSGVER VMBUS_IC_VERSION(KVP_MSGVER_MAJOR, 0)
+
+/* hv_kvp debug control */
+static int hv_kvp_log = 0;
+
+#define hv_kvp_log_error(...) do { \
+ if (hv_kvp_log > 0) \
+ log(LOG_ERR, "hv_kvp: " __VA_ARGS__); \
+} while (0)
+
+#define hv_kvp_log_info(...) do { \
+ if (hv_kvp_log > 1) \
+ log(LOG_INFO, "hv_kvp: " __VA_ARGS__); \
+} while (0)
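+
+/*
+ * hv_kvp_log is the log level: 0 is silent, 1 enables hv_kvp_log_error(),
+ * and 2 (or higher) also enables hv_kvp_log_info(). It is exposed as the
+ * per-device "hv_kvp_log" sysctl registered in hv_kvp_attach().
+ */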
+
+static const struct vmbus_ic_desc vmbus_kvp_descs[] = {
+ {
+ .ic_guid = { .hv_guid = {
+ 0xe7, 0xf4, 0xa0, 0xa9, 0x45, 0x5a, 0x96, 0x4d,
+ 0xb8, 0x27, 0x8a, 0x84, 0x1e, 0x8c, 0x3, 0xe6 } },
+ .ic_desc = "Hyper-V KVP"
+ },
+ VMBUS_IC_DESC_END
+};
+
+/* character device prototypes */
+static d_open_t hv_kvp_dev_open;
+static d_close_t hv_kvp_dev_close;
+static d_read_t hv_kvp_dev_daemon_read;
+static d_write_t hv_kvp_dev_daemon_write;
+static d_poll_t hv_kvp_dev_daemon_poll;
+
+/* hv_kvp character device structure */
+static struct cdevsw hv_kvp_cdevsw =
+{
+ .d_version = D_VERSION,
+ .d_open = hv_kvp_dev_open,
+ .d_close = hv_kvp_dev_close,
+ .d_read = hv_kvp_dev_daemon_read,
+ .d_write = hv_kvp_dev_daemon_write,
+ .d_poll = hv_kvp_dev_daemon_poll,
+ .d_name = "hv_kvp_dev",
+};
+
+
+/*
+ * Global state to track and synchronize multiple
+ * KVP transaction requests from the host.
+ */
+typedef struct hv_kvp_sc {
+ struct vmbus_ic_softc util_sc;
+ device_t dev;
+
+ /* Unless specified otherwise, the pending mutex must be held
+ * when altering the values of the following fields:
+ * 1. req_in_progress
+ * 2. req_timed_out
+ */
+ struct mtx pending_mutex;
+
+ struct task task;
+
+ /* To track if transaction is active or not */
+ boolean_t req_in_progress;
+ /* Tracks if daemon did not reply back in time */
+ boolean_t req_timed_out;
+ /* Tracks if daemon is serving a request currently */
+ boolean_t daemon_busy;
+
+ /* Length of host message */
+ uint32_t host_msg_len;
+
+ /* Host message id */
+ uint64_t host_msg_id;
+
+ /* Current kvp message from the host */
+ struct hv_kvp_msg *host_kvp_msg;
+
+ /* Current kvp message for daemon */
+ struct hv_kvp_msg daemon_kvp_msg;
+
+ /* Rcv buffer for communicating with the host */
+ uint8_t *rcv_buf;
+
+ /* Device semaphore to control communication */
+ struct sema dev_sema;
+
+ /* Indicates if daemon registered with driver */
+ boolean_t register_done;
+
+ /* Character device status */
+ boolean_t dev_accessed;
+
+ struct cdev *hv_kvp_dev;
+
+ struct proc *daemon_task;
+
+ struct selinfo hv_kvp_selinfo;
+} hv_kvp_sc;
+
+/* hv_kvp prototypes */
+static int hv_kvp_req_in_progress(hv_kvp_sc *sc);
+static void hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t, uint64_t, uint8_t *);
+static void hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc);
+static void hv_kvp_process_request(void *context, int pending);
+
+/*
+ * hv_kvp low level functions
+ */
+
+/*
+ * Check if a kvp transaction is in progress
+ */
+static int
+hv_kvp_req_in_progress(hv_kvp_sc *sc)
+{
+
+ return (sc->req_in_progress);
+}
+
+
+/*
+ * This routine is called whenever a message is received from the host
+ */
+static void
+hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t rcv_len,
+ uint64_t request_id, uint8_t *rcv_buf)
+{
+
+ /* Store all the relevant message details in the global structure */
+ /* Do not need to use mutex for req_in_progress here */
+ sc->req_in_progress = true;
+ sc->host_msg_len = rcv_len;
+ sc->host_msg_id = request_id;
+ sc->rcv_buf = rcv_buf;
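+ /*
+ * The receive buffer is laid out as the VMBUS pipe header, followed by
+ * the IC message header, followed by the KVP message proper; point
+ * host_kvp_msg at the KVP payload.
+ */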
+ sc->host_kvp_msg = (struct hv_kvp_msg *)&rcv_buf[
+ sizeof(struct hv_vmbus_pipe_hdr) +
+ sizeof(struct hv_vmbus_icmsg_hdr)];
+}
+
+/*
+ * Convert ip related info in umsg from utf8 to utf16 and store in hmsg
+ */
+static int
+hv_kvp_convert_utf8_ipinfo_to_utf16(struct hv_kvp_msg *umsg,
+ struct hv_kvp_ip_msg *host_ip_msg)
+{
+ int err_ip, err_subnet, err_gway, err_dns, err_adap;
+ int UNUSED_FLAG = 1;
+
+ utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.ip_addr,
+ MAX_IP_ADDR_SIZE,
+ (char *)umsg->body.kvp_ip_val.ip_addr,
+ strlen((char *)umsg->body.kvp_ip_val.ip_addr),
+ UNUSED_FLAG,
+ &err_ip);
+ utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.sub_net,
+ MAX_IP_ADDR_SIZE,
+ (char *)umsg->body.kvp_ip_val.sub_net,
+ strlen((char *)umsg->body.kvp_ip_val.sub_net),
+ UNUSED_FLAG,
+ &err_subnet);
+ utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.gate_way,
+ MAX_GATEWAY_SIZE,
+ (char *)umsg->body.kvp_ip_val.gate_way,
+ strlen((char *)umsg->body.kvp_ip_val.gate_way),
+ UNUSED_FLAG,
+ &err_gway);
+ utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.dns_addr,
+ MAX_IP_ADDR_SIZE,
+ (char *)umsg->body.kvp_ip_val.dns_addr,
+ strlen((char *)umsg->body.kvp_ip_val.dns_addr),
+ UNUSED_FLAG,
+ &err_dns);
+ utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.adapter_id,
+ MAX_ADAPTER_ID_SIZE,
+ (char *)umsg->body.kvp_ip_val.adapter_id,
+ strlen((char *)umsg->body.kvp_ip_val.adapter_id),
+ UNUSED_FLAG,
+ &err_adap);
+
+ host_ip_msg->kvp_ip_val.dhcp_enabled = umsg->body.kvp_ip_val.dhcp_enabled;
+ host_ip_msg->kvp_ip_val.addr_family = umsg->body.kvp_ip_val.addr_family;
+
+ return (err_ip | err_subnet | err_gway | err_dns | err_adap);
+}
+
+
+/*
+ * Convert ip related info in hmsg from utf16 to utf8 and store in umsg
+ */
+static int
+hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg,
+ struct hv_kvp_msg *umsg)
+{
+ int err_ip, err_subnet, err_gway, err_dns, err_adap;
+ int UNUSED_FLAG = 1;
+ device_t *devs;
+ int devcnt;
+
+ /* IP Address */
+ utf16_to_utf8((char *)umsg->body.kvp_ip_val.ip_addr,
+ MAX_IP_ADDR_SIZE,
+ (uint16_t *)host_ip_msg->kvp_ip_val.ip_addr,
+ MAX_IP_ADDR_SIZE,
+ UNUSED_FLAG,
+ &err_ip);
+
+ /* Adapter ID : GUID */
+ utf16_to_utf8((char *)umsg->body.kvp_ip_val.adapter_id,
+ MAX_ADAPTER_ID_SIZE,
+ (uint16_t *)host_ip_msg->kvp_ip_val.adapter_id,
+ MAX_ADAPTER_ID_SIZE,
+ UNUSED_FLAG,
+ &err_adap);
+
+ if (devclass_get_devices(devclass_find("hn"), &devs, &devcnt) == 0) {
+ for (devcnt = devcnt - 1; devcnt >= 0; devcnt--) {
+ device_t dev = devs[devcnt];
+ struct vmbus_channel *chan;
+ char buf[HYPERV_GUID_STRLEN];
+ int n;
+
+ chan = vmbus_get_channel(dev);
+ n = hyperv_guid2str(vmbus_chan_guid_inst(chan), buf,
+ sizeof(buf));
+
+ /*
+ * The string in the 'kvp_ip_val.adapter_id' has
+ * braces around the GUID; skip the leading brace
+ * in 'kvp_ip_val.adapter_id'.
+ */
+ if (strncmp(buf,
+ ((char *)&umsg->body.kvp_ip_val.adapter_id) + 1,
+ n) == 0) {
+ strlcpy((char *)umsg->body.kvp_ip_val.adapter_id,
+ device_get_nameunit(dev), MAX_ADAPTER_ID_SIZE);
+ break;
+ }
+ }
+ free(devs, M_TEMP);
+ }
+
+ /* Address Family , DHCP , SUBNET, Gateway, DNS */
+ umsg->kvp_hdr.operation = host_ip_msg->operation;
+ umsg->body.kvp_ip_val.addr_family = host_ip_msg->kvp_ip_val.addr_family;
+ umsg->body.kvp_ip_val.dhcp_enabled = host_ip_msg->kvp_ip_val.dhcp_enabled;
+ utf16_to_utf8((char *)umsg->body.kvp_ip_val.sub_net, MAX_IP_ADDR_SIZE,
+ (uint16_t *)host_ip_msg->kvp_ip_val.sub_net,
+ MAX_IP_ADDR_SIZE,
+ UNUSED_FLAG,
+ &err_subnet);
+
+ utf16_to_utf8((char *)umsg->body.kvp_ip_val.gate_way, MAX_GATEWAY_SIZE,
+ (uint16_t *)host_ip_msg->kvp_ip_val.gate_way,
+ MAX_GATEWAY_SIZE,
+ UNUSED_FLAG,
+ &err_gway);
+
+ utf16_to_utf8((char *)umsg->body.kvp_ip_val.dns_addr, MAX_IP_ADDR_SIZE,
+ (uint16_t *)host_ip_msg->kvp_ip_val.dns_addr,
+ MAX_IP_ADDR_SIZE,
+ UNUSED_FLAG,
+ &err_dns);
+
+ return (err_ip | err_subnet | err_gway | err_dns | err_adap);
+}
+
+
+/*
+ * Prepare a user kvp msg based on host kvp msg (utf16 to utf8)
+ * Ensure utf16_utf8 takes care of the additional string terminating char!!
+ */
+static void
+hv_kvp_convert_hostmsg_to_usermsg(struct hv_kvp_msg *hmsg, struct hv_kvp_msg *umsg)
+{
+ int utf_err = 0;
+ uint32_t value_type;
+ struct hv_kvp_ip_msg *host_ip_msg;
+
+ host_ip_msg = (struct hv_kvp_ip_msg*)hmsg;
+ memset(umsg, 0, sizeof(struct hv_kvp_msg));
+
+ umsg->kvp_hdr.operation = hmsg->kvp_hdr.operation;
+ umsg->kvp_hdr.pool = hmsg->kvp_hdr.pool;
+
+ switch (umsg->kvp_hdr.operation) {
+ case HV_KVP_OP_SET_IP_INFO:
+ hv_kvp_convert_utf16_ipinfo_to_utf8(host_ip_msg, umsg);
+ break;
+
+ case HV_KVP_OP_GET_IP_INFO:
+ utf16_to_utf8((char *)umsg->body.kvp_ip_val.adapter_id,
+ MAX_ADAPTER_ID_SIZE,
+ (uint16_t *)host_ip_msg->kvp_ip_val.adapter_id,
+ MAX_ADAPTER_ID_SIZE, 1, &utf_err);
+
+ umsg->body.kvp_ip_val.addr_family =
+ host_ip_msg->kvp_ip_val.addr_family;
+ break;
+
+ case HV_KVP_OP_SET:
+ value_type = hmsg->body.kvp_set.data.value_type;
+
+ switch (value_type) {
+ case HV_REG_SZ:
+ umsg->body.kvp_set.data.value_size =
+ utf16_to_utf8(
+ (char *)umsg->body.kvp_set.data.msg_value.value,
+ HV_KVP_EXCHANGE_MAX_VALUE_SIZE - 1,
+ (uint16_t *)hmsg->body.kvp_set.data.msg_value.value,
+ hmsg->body.kvp_set.data.value_size,
+ 1, &utf_err);
+ /* utf8 encoding */
+ umsg->body.kvp_set.data.value_size =
+ umsg->body.kvp_set.data.value_size / 2;
+ break;
+
+ case HV_REG_U32:
+ umsg->body.kvp_set.data.value_size =
+ sprintf(umsg->body.kvp_set.data.msg_value.value, "%d",
+ hmsg->body.kvp_set.data.msg_value.value_u32) + 1;
+ break;
+
+ case HV_REG_U64:
+ umsg->body.kvp_set.data.value_size =
+ sprintf(umsg->body.kvp_set.data.msg_value.value, "%llu",
+ (unsigned long long)
+ hmsg->body.kvp_set.data.msg_value.value_u64) + 1;
+ break;
+ }
+
+ umsg->body.kvp_set.data.key_size =
+ utf16_to_utf8(
+ umsg->body.kvp_set.data.key,
+ HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1,
+ (uint16_t *)hmsg->body.kvp_set.data.key,
+ hmsg->body.kvp_set.data.key_size,
+ 1, &utf_err);
+
+ /* utf8 encoding */
+ umsg->body.kvp_set.data.key_size =
+ umsg->body.kvp_set.data.key_size / 2;
+ break;
+
+ case HV_KVP_OP_GET:
+ umsg->body.kvp_get.data.key_size =
+ utf16_to_utf8(umsg->body.kvp_get.data.key,
+ HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1,
+ (uint16_t *)hmsg->body.kvp_get.data.key,
+ hmsg->body.kvp_get.data.key_size,
+ 1, &utf_err);
+ /* utf8 encoding */
+ umsg->body.kvp_get.data.key_size =
+ umsg->body.kvp_get.data.key_size / 2;
+ break;
+
+ case HV_KVP_OP_DELETE:
+ umsg->body.kvp_delete.key_size =
+ utf16_to_utf8(umsg->body.kvp_delete.key,
+ HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1,
+ (uint16_t *)hmsg->body.kvp_delete.key,
+ hmsg->body.kvp_delete.key_size,
+ 1, &utf_err);
+ /* utf8 encoding */
+ umsg->body.kvp_delete.key_size =
+ umsg->body.kvp_delete.key_size / 2;
+ break;
+
+ case HV_KVP_OP_ENUMERATE:
+ umsg->body.kvp_enum_data.index =
+ hmsg->body.kvp_enum_data.index;
+ break;
+
+ default:
+ hv_kvp_log_info("%s: daemon_kvp_msg: Invalid operation : %d\n",
+ __func__, umsg->kvp_hdr.operation);
+ }
+}
+
+
+/*
+ * Prepare a host kvp msg based on user kvp msg (utf8 to utf16)
+ */
+static int
+hv_kvp_convert_usermsg_to_hostmsg(struct hv_kvp_msg *umsg, struct hv_kvp_msg *hmsg)
+{
+ int hkey_len = 0, hvalue_len = 0, utf_err = 0;
+ struct hv_kvp_exchg_msg_value *host_exchg_data;
+ char *key_name, *value;
+
+ struct hv_kvp_ip_msg *host_ip_msg = (struct hv_kvp_ip_msg *)hmsg;
+
+ switch (hmsg->kvp_hdr.operation) {
+ case HV_KVP_OP_GET_IP_INFO:
+ return (hv_kvp_convert_utf8_ipinfo_to_utf16(umsg, host_ip_msg));
+
+ case HV_KVP_OP_SET_IP_INFO:
+ case HV_KVP_OP_SET:
+ case HV_KVP_OP_DELETE:
+ return (0);
+
+ case HV_KVP_OP_ENUMERATE:
+ host_exchg_data = &hmsg->body.kvp_enum_data.data;
+ key_name = umsg->body.kvp_enum_data.data.key;
+ hkey_len = utf8_to_utf16((uint16_t *)host_exchg_data->key,
+ ((HV_KVP_EXCHANGE_MAX_KEY_SIZE / 2) - 2),
+ key_name, strlen(key_name),
+ 1, &utf_err);
+ /* utf16 encoding */
+ host_exchg_data->key_size = 2 * (hkey_len + 1);
+ value = umsg->body.kvp_enum_data.data.msg_value.value;
+ hvalue_len = utf8_to_utf16(
+ (uint16_t *)host_exchg_data->msg_value.value,
+ ((HV_KVP_EXCHANGE_MAX_VALUE_SIZE / 2) - 2),
+ value, strlen(value),
+ 1, &utf_err);
+ host_exchg_data->value_size = 2 * (hvalue_len + 1);
+ host_exchg_data->value_type = HV_REG_SZ;
+
+ if ((hkey_len < 0) || (hvalue_len < 0))
+ return (EINVAL);
+
+ return (0);
+
+ case HV_KVP_OP_GET:
+ host_exchg_data = &hmsg->body.kvp_get.data;
+ value = umsg->body.kvp_get.data.msg_value.value;
+ hvalue_len = utf8_to_utf16(
+ (uint16_t *)host_exchg_data->msg_value.value,
+ ((HV_KVP_EXCHANGE_MAX_VALUE_SIZE / 2) - 2),
+ value, strlen(value),
+ 1, &utf_err);
+ /* Convert value size to utf16 */
+ host_exchg_data->value_size = 2 * (hvalue_len + 1);
+ /* Values are returned as strings (HV_REG_SZ) */
+ host_exchg_data->value_type = HV_REG_SZ;
+
+ if (hvalue_len < 0)
+ return (EINVAL);
+
+ return (0);
+
+ default:
+ return (EINVAL);
+ }
+}
+
+
+/*
+ * Send the response back to the host.
+ */
+static void
+hv_kvp_respond_host(hv_kvp_sc *sc, uint32_t error)
+{
+ struct hv_vmbus_icmsg_hdr *hv_icmsg_hdrp;
+
+ hv_icmsg_hdrp = (struct hv_vmbus_icmsg_hdr *)
+ &sc->rcv_buf[sizeof(struct hv_vmbus_pipe_hdr)];
+
+ hv_icmsg_hdrp->status = error;
+ hv_icmsg_hdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION |
+ HV_ICMSGHDRFLAG_RESPONSE;
+
+ error = vmbus_chan_send(vmbus_get_channel(sc->dev),
+ VMBUS_CHANPKT_TYPE_INBAND, 0, sc->rcv_buf, sc->host_msg_len,
+ sc->host_msg_id);
+ if (error)
+ hv_kvp_log_info("%s: hv_kvp_respond_host: sendpacket error:%d\n",
+ __func__, error);
+}
+
+
+/*
+ * This is the main kvp kernel process that interacts with both user daemon
+ * and the host
+ */
+static void
+hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc)
+{
+ struct hv_kvp_msg *hmsg = sc->host_kvp_msg;
+ struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg;
+
+ /* Prepare kvp_msg to be sent to user */
+ hv_kvp_convert_hostmsg_to_usermsg(hmsg, umsg);
+
+ /* Send the msg to user via function daemon_read - setting sema */
+ sema_post(&sc->dev_sema);
+
+ /* We should wake up the daemon, in case it's doing poll() */
+ selwakeup(&sc->hv_kvp_selinfo);
+}
+
+
+/*
+ * Function to read the kvp request buffer from host
+ * and interact with daemon
+ */
+static void
+hv_kvp_process_request(void *context, int pending)
+{
+ uint8_t *kvp_buf;
+ struct vmbus_channel *channel;
+ uint32_t recvlen = 0;
+ uint64_t requestid;
+ struct hv_vmbus_icmsg_hdr *icmsghdrp;
+ int ret = 0, error;
+ hv_kvp_sc *sc;
+
+ hv_kvp_log_info("%s: entering hv_kvp_process_request\n", __func__);
+
+ sc = (hv_kvp_sc*)context;
+ kvp_buf = sc->util_sc.ic_buf;
+ channel = vmbus_get_channel(sc->dev);
+
+ recvlen = sc->util_sc.ic_buflen;
+ ret = vmbus_chan_recv(channel, kvp_buf, &recvlen, &requestid);
+ KASSERT(ret != ENOBUFS, ("hvkvp recvbuf is not large enough"));
+ /* XXX check recvlen to make sure that it contains enough data */
+
+ while ((ret == 0) && (recvlen > 0)) {
+ icmsghdrp = (struct hv_vmbus_icmsg_hdr *)
+ &kvp_buf[sizeof(struct hv_vmbus_pipe_hdr)];
+
+ hv_kvp_transaction_init(sc, recvlen, requestid, kvp_buf);
+ if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) {
+ error = vmbus_ic_negomsg(&sc->util_sc,
+ kvp_buf, &recvlen, KVP_FWVER, KVP_MSGVER);
+ /* XXX handle vmbus_ic_negomsg failure. */
+ if (!error)
+ hv_kvp_respond_host(sc, HV_S_OK);
+ else
+ hv_kvp_respond_host(sc, HV_E_FAIL);
+ /*
+ * It is ok to not acquire the mutex before setting
+ * req_in_progress here because negotiation is the
+ * first thing that happens and hence there is no
+ * chance of a race condition.
+ */
+
+ sc->req_in_progress = false;
+ hv_kvp_log_info("%s :version negotiated\n", __func__);
+
+ } else {
+ if (!sc->daemon_busy) {
+
+ hv_kvp_log_info("%s: issuing qury to daemon\n", __func__);
+ mtx_lock(&sc->pending_mutex);
+ sc->req_timed_out = false;
+ sc->daemon_busy = true;
+ mtx_unlock(&sc->pending_mutex);
+
+ hv_kvp_send_msg_to_daemon(sc);
+ hv_kvp_log_info("%s: waiting for daemon\n", __func__);
+ }
+
+ /* Wait 5 seconds for daemon to respond back */
+ tsleep(sc, 0, "kvpworkitem", 5 * hz);
+ hv_kvp_log_info("%s: came out of wait\n", __func__);
+ }
+
+ mtx_lock(&sc->pending_mutex);
+
+ /* Notice that once req_timed_out is set to true
+ * it will remain true until the next request is
+ * sent to the daemon. The response from daemon
+ * is forwarded to host only when this flag is
+ * false.
+ */
+ sc->req_timed_out = true;
+
+ /*
+ * Cancel the request if need be.
+ */
+ if (hv_kvp_req_in_progress(sc)) {
+ hv_kvp_log_info("%s: request was still active after wait so failing\n", __func__);
+ hv_kvp_respond_host(sc, HV_E_FAIL);
+ sc->req_in_progress = false;
+ }
+
+ mtx_unlock(&sc->pending_mutex);
+
+ /*
+ * Try reading next buffer
+ */
+ recvlen = sc->util_sc.ic_buflen;
+ ret = vmbus_chan_recv(channel, kvp_buf, &recvlen, &requestid);
+ KASSERT(ret != ENOBUFS, ("hvkvp recvbuf is not large enough"));
+ /* XXX check recvlen to make sure that it contains enough data */
+
+ hv_kvp_log_info("%s: read: context %p, ret =%d, recvlen=%d\n",
+ __func__, context, ret, recvlen);
+ }
+}
+
+
+/*
+ * Callback routine that gets called whenever there is a message from host
+ */
+static void
+hv_kvp_callback(struct vmbus_channel *chan __unused, void *context)
+{
+ hv_kvp_sc *sc = (hv_kvp_sc*)context;
+ /*
+ * The first request from the host will not be handled until the
+ * daemon is registered. When the callback is triggered without a
+ * registered daemon, it simply returns. Once a new daemon registers,
+ * this callback is triggered from the _write op.
+ */
+ if (sc->register_done) {
+ hv_kvp_log_info("%s: Queuing work item\n", __func__);
+ taskqueue_enqueue(taskqueue_thread, &sc->task);
+ }
+}
+
+static int
+hv_kvp_dev_open(struct cdev *dev, int oflags, int devtype,
+ struct thread *td)
+{
+ hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1;
+
+ hv_kvp_log_info("%s: Opened device \"hv_kvp_device\" successfully.\n", __func__);
+ if (sc->dev_accessed)
+ return (EBUSY);
+
+ sc->daemon_task = curproc;
+ sc->dev_accessed = true;
+ sc->daemon_busy = false;
+ return (0);
+}
+
+
+static int
+hv_kvp_dev_close(struct cdev *dev __unused, int fflag __unused, int devtype __unused,
+ struct thread *td __unused)
+{
+ hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1;
+
+ hv_kvp_log_info("%s: Closing device \"hv_kvp_device\".\n", __func__);
+ sc->dev_accessed = false;
+ sc->register_done = false;
+ return (0);
+}
+
+
+/*
+ * hv_kvp_daemon read invokes this function
+ * acts as a send to daemon
+ */
+static int
+hv_kvp_dev_daemon_read(struct cdev *dev, struct uio *uio, int ioflag __unused)
+{
+ size_t amt;
+ int error = 0;
+ struct hv_kvp_msg *hv_kvp_dev_buf;
+ hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1;
+
+ /* Read is not allowed until registration is done. */
+ if (!sc->register_done)
+ return (EPERM);
+
+ sema_wait(&sc->dev_sema);
+
+ hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK);
+ memcpy(hv_kvp_dev_buf, &sc->daemon_kvp_msg, sizeof(struct hv_kvp_msg));
+
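+ /*
+ * Clamp the transfer: at most BUFFERSIZE + 1 bytes may be handed to
+ * the daemon; once uio_offset reaches that limit the read transfers
+ * nothing.
+ */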
+ amt = MIN(uio->uio_resid, uio->uio_offset >= BUFFERSIZE + 1 ? 0 :
+ BUFFERSIZE + 1 - uio->uio_offset);
+
+ if ((error = uiomove(hv_kvp_dev_buf, amt, uio)) != 0)
+ hv_kvp_log_info("%s: hv_kvp uiomove read failed!\n", __func__);
+
+ free(hv_kvp_dev_buf, M_TEMP);
+ return (error);
+}
+
+
+/*
+ * hv_kvp_daemon write invokes this function
+ * acts as a receive from daemon
+ */
+static int
+hv_kvp_dev_daemon_write(struct cdev *dev, struct uio *uio, int ioflag __unused)
+{
+ size_t amt;
+ int error = 0;
+ struct hv_kvp_msg *hv_kvp_dev_buf;
+ hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1;
+
+ uio->uio_offset = 0;
+ hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK);
+
+ amt = MIN(uio->uio_resid, BUFFERSIZE);
+ error = uiomove(hv_kvp_dev_buf, amt, uio);
+
+ if (error != 0) {
+ free(hv_kvp_dev_buf, M_TEMP);
+ return (error);
+ }
+ memcpy(&sc->daemon_kvp_msg, hv_kvp_dev_buf, sizeof(struct hv_kvp_msg));
+
+ free(hv_kvp_dev_buf, M_TEMP);
+ if (sc->register_done == false) {
+ if (sc->daemon_kvp_msg.kvp_hdr.operation == HV_KVP_OP_REGISTER) {
+ sc->register_done = true;
+ hv_kvp_callback(vmbus_get_channel(sc->dev), dev->si_drv1);
+ }
+ else {
+ hv_kvp_log_info("%s, KVP Registration Failed\n", __func__);
+ return (EINVAL);
+ }
+ } else {
+
+ mtx_lock(&sc->pending_mutex);
+
+ if(!sc->req_timed_out) {
+ struct hv_kvp_msg *hmsg = sc->host_kvp_msg;
+ struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg;
+
+ error = hv_kvp_convert_usermsg_to_hostmsg(umsg, hmsg);
+ hv_kvp_respond_host(sc, umsg->hdr.error);
+ wakeup(sc);
+ sc->req_in_progress = false;
+ if (umsg->hdr.error != HV_S_OK)
+ hv_kvp_log_info("%s, Error 0x%x from daemon\n",
+ __func__, umsg->hdr.error);
+ if (error)
+ hv_kvp_log_info("%s, Error from convert\n", __func__);
+ }
+
+ sc->daemon_busy = false;
+ mtx_unlock(&sc->pending_mutex);
+ }
+
+ return (error);
+}
+
+
+/*
+ * hv_kvp_daemon poll invokes this function to check if data is available
+ * for daemon to read.
+ */
+static int
+hv_kvp_dev_daemon_poll(struct cdev *dev, int events, struct thread *td)
+{
+ int revents = 0;
+ hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1;
+
+ mtx_lock(&sc->pending_mutex);
+ /*
+ * We check the global flag daemon_busy for data availability for
+ * userland to read. daemon_busy is set to true before the driver has
+ * data for the daemon to read. It is set to false after the daemon
+ * sends the response back to the driver.
+ */
+ if (sc->daemon_busy == true)
+ revents = POLLIN;
+ else
+ selrecord(td, &sc->hv_kvp_selinfo);
+
+ mtx_unlock(&sc->pending_mutex);
+
+ return (revents);
+}
+
+static int
+hv_kvp_probe(device_t dev)
+{
+
+ return (vmbus_ic_probe(dev, vmbus_kvp_descs));
+}
+
+static int
+hv_kvp_attach(device_t dev)
+{
+ int error;
+ struct sysctl_oid_list *child;
+ struct sysctl_ctx_list *ctx;
+
+ hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev);
+
+ sc->dev = dev;
+ sema_init(&sc->dev_sema, 0, "hv_kvp device semaphore");
+ mtx_init(&sc->pending_mutex, "hv-kvp pending mutex",
+ NULL, MTX_DEF);
+
+ ctx = device_get_sysctl_ctx(dev);
+ child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "hv_kvp_log",
+ CTLFLAG_RWTUN, &hv_kvp_log, 0, "Hyperv KVP service log level");
+
+ TASK_INIT(&sc->task, 0, hv_kvp_process_request, sc);
+
+ /* create character device */
+ error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
+ &sc->hv_kvp_dev,
+ &hv_kvp_cdevsw,
+ 0,
+ UID_ROOT,
+ GID_WHEEL,
+ 0640,
+ "hv_kvp_dev");
+
+ if (error != 0)
+ return (error);
+ sc->hv_kvp_dev->si_drv1 = sc;
+
+ return (vmbus_ic_attach(dev, hv_kvp_callback));
+}
+
+static int
+hv_kvp_detach(device_t dev)
+{
+ hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev);
+
+ if (sc->daemon_task != NULL) {
+ PROC_LOCK(sc->daemon_task);
+ kern_psignal(sc->daemon_task, SIGKILL);
+ PROC_UNLOCK(sc->daemon_task);
+ }
+
+ destroy_dev(sc->hv_kvp_dev);
+ return (vmbus_ic_detach(dev));
+}
+
+static device_method_t kvp_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, hv_kvp_probe),
+ DEVMETHOD(device_attach, hv_kvp_attach),
+ DEVMETHOD(device_detach, hv_kvp_detach),
+ { 0, 0 }
+};
+
+static driver_t kvp_driver = { "hvkvp", kvp_methods, sizeof(hv_kvp_sc)};
+
+static devclass_t kvp_devclass;
+
+DRIVER_MODULE(hv_kvp, vmbus, kvp_driver, kvp_devclass, NULL, NULL);
+MODULE_VERSION(hv_kvp, 1);
+MODULE_DEPEND(hv_kvp, vmbus, 1, 1, 1);
diff --git a/sys/dev/hyperv/utilities/hv_kvp.h b/sys/dev/hyperv/utilities/hv_kvp.h
new file mode 100644
index 000000000000..91e1ea404d4a
--- /dev/null
+++ b/sys/dev/hyperv/utilities/hv_kvp.h
@@ -0,0 +1,229 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014,2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _KVP_H
+#define _KVP_H
+/*
+ * An implementation of HyperV key value pair (KVP) functionality for FreeBSD
+ *
+ */
+
+/*
+ * Maximum value size - used for both key names and value data, and includes
+ * any applicable NULL terminators.
+ *
+ * Note: This limit is somewhat arbitrary, but falls easily within what is
+ * supported for all native guests (back to Win 2000) and what is reasonable
+ * for the IC KVP exchange functionality. Note that Windows Me/98/95 are
+ * limited to 255 character key names.
+ *
+ * MSDN recommends not storing data values larger than 2048 bytes in the
+ * registry.
+ *
+ * Note: This value is used in defining the KVP exchange message - this value
+ * cannot be modified without affecting the message size and compatibility.
+ */
+
+/*
+ * bytes, including any null terminators
+ */
+#define HV_KVP_EXCHANGE_MAX_VALUE_SIZE (2048)
+
+
+/*
+ * Maximum key size - the registry limit for the length of an entry name
+ * is 256 characters, including the null terminator
+ */
+#define HV_KVP_EXCHANGE_MAX_KEY_SIZE (512)
+
+
+/*
+ * In FreeBSD, we implement the KVP functionality in two components:
+ * 1) The kernel component which is packaged as part of the hv_utils driver
+ * is responsible for communicating with the host and responsible for
+ * implementing the host/guest protocol. 2) A user level daemon that is
+ * responsible for data gathering.
+ *
+ * Host/Guest Protocol: The host iterates over an index and expects the guest
+ * to assign a key name to the index and also return the value corresponding to
+ * the key. The host will have at most one KVP transaction outstanding at any
+ * given point in time. The host side iteration stops when the guest returns
+ * an error. Microsoft has specified the following mapping of key names to
+ * host specified index:
+ *
+ * Index Key Name
+ * 0 FullyQualifiedDomainName
+ * 1 IntegrationServicesVersion
+ * 2 NetworkAddressIPv4
+ * 3 NetworkAddressIPv6
+ * 4 OSBuildNumber
+ * 5 OSName
+ * 6 OSMajorVersion
+ * 7 OSMinorVersion
+ * 8 OSVersion
+ * 9 ProcessorArchitecture
+ *
+ * The Windows host expects the Key Name and Key Value to be encoded in utf16.
+ *
+ * Guest Kernel/KVP Daemon Protocol: As noted earlier, we implement all of the
+ * data gathering functionality in a user mode daemon. The user level daemon
+ * is also responsible for binding the key name to the index as well. The
+ * kernel and user-level daemon communicate using a connector channel.
+ *
+ * The user mode component first registers with the kernel component.
+ * Subsequently, the kernel component requests data
+ * for the specified keys. In response to this message the user mode component
+ * fills in the value corresponding to the specified key. We overload the
+ * sequence field in the cn_msg header to define our KVP message types.
+ *
+ *
+ * The kernel component simply acts as a conduit for communication between the
+ * Windows host and the user-level daemon. The kernel component passes up the
+ * index received from the Host to the user-level daemon. If the index is
+ * valid (supported), the corresponding key as well as its
+ * value (both are strings) is returned. If the index is invalid
+ * (not supported), a NULL key string is returned.
+ */
+
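+
+/*
+ * Minimal sketch of the expected daemon interaction with the hv_kvp
+ * character device (illustrative only, based on the handlers in hv_kvp.c;
+ * not shipped code):
+ *
+ *	int fd = open("/dev/hv_kvp_dev", O_RDWR);
+ *	struct hv_kvp_msg msg;
+ *
+ *	memset(&msg, 0, sizeof(msg));
+ *	msg.hdr.kvp_hdr.operation = HV_KVP_OP_REGISTER;
+ *	write(fd, &msg, sizeof(msg));		(register with the driver)
+ *	for (;;) {
+ *		(poll() until POLLIN is reported)
+ *		read(fd, &msg, sizeof(msg));	(fetch the host request)
+ *		(fill in msg.body for the requested operation)
+ *		write(fd, &msg, sizeof(msg));	(reply, forwarded to the host)
+ *	}
+ */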
+
+/*
+ * Registry value types.
+ */
+#define HV_REG_SZ 1
+#define HV_REG_U32 4
+#define HV_REG_U64 8
+
+
+/*
+ * Daemon code supporting IP injection.
+ */
+#define HV_KVP_OP_REGISTER 4
+
+
+enum hv_kvp_exchg_op {
+ HV_KVP_OP_GET = 0,
+ HV_KVP_OP_SET,
+ HV_KVP_OP_DELETE,
+ HV_KVP_OP_ENUMERATE,
+ HV_KVP_OP_GET_IP_INFO,
+ HV_KVP_OP_SET_IP_INFO,
+ HV_KVP_OP_COUNT /* Number of operations, must be last. */
+};
+
+enum hv_kvp_exchg_pool {
+ HV_KVP_POOL_EXTERNAL = 0,
+ HV_KVP_POOL_GUEST,
+ HV_KVP_POOL_AUTO,
+ HV_KVP_POOL_AUTO_EXTERNAL,
+ HV_KVP_POOL_AUTO_INTERNAL,
+ HV_KVP_POOL_COUNT /* Number of pools, must be last. */
+};
+
+#define ADDR_FAMILY_NONE 0x00
+#define ADDR_FAMILY_IPV4 0x01
+#define ADDR_FAMILY_IPV6 0x02
+
+#define MAX_ADAPTER_ID_SIZE 128
+#define MAX_IP_ADDR_SIZE 1024
+#define MAX_GATEWAY_SIZE 512
+
+
+struct hv_kvp_ipaddr_value {
+ uint16_t adapter_id[MAX_ADAPTER_ID_SIZE];
+ uint8_t addr_family;
+ uint8_t dhcp_enabled;
+ uint16_t ip_addr[MAX_IP_ADDR_SIZE];
+ uint16_t sub_net[MAX_IP_ADDR_SIZE];
+ uint16_t gate_way[MAX_GATEWAY_SIZE];
+ uint16_t dns_addr[MAX_IP_ADDR_SIZE];
+}__attribute__((packed));
+
+struct hv_kvp_hdr {
+ uint8_t operation;
+ uint8_t pool;
+ uint16_t pad;
+} __attribute__((packed));
+
+struct hv_kvp_exchg_msg_value {
+ uint32_t value_type;
+ uint32_t key_size;
+ uint32_t value_size;
+ uint8_t key[HV_KVP_EXCHANGE_MAX_KEY_SIZE];
+ union {
+ uint8_t value[HV_KVP_EXCHANGE_MAX_VALUE_SIZE];
+ uint32_t value_u32;
+ uint64_t value_u64;
+ } msg_value;
+} __attribute__((packed));
+
+struct hv_kvp_msg_enumerate {
+ uint32_t index;
+ struct hv_kvp_exchg_msg_value data;
+} __attribute__((packed));
+
+struct hv_kvp_msg_get {
+ struct hv_kvp_exchg_msg_value data;
+} __attribute__((packed));
+
+struct hv_kvp_msg_set {
+ struct hv_kvp_exchg_msg_value data;
+} __attribute__((packed));
+
+struct hv_kvp_msg_delete {
+ uint32_t key_size;
+ uint8_t key[HV_KVP_EXCHANGE_MAX_KEY_SIZE];
+} __attribute__((packed));
+
+struct hv_kvp_register {
+ uint8_t version[HV_KVP_EXCHANGE_MAX_KEY_SIZE];
+} __attribute__((packed));
+
+struct hv_kvp_msg {
+ union {
+ struct hv_kvp_hdr kvp_hdr;
+ uint32_t error;
+ } hdr;
+ union {
+ struct hv_kvp_msg_get kvp_get;
+ struct hv_kvp_msg_set kvp_set;
+ struct hv_kvp_msg_delete kvp_delete;
+ struct hv_kvp_msg_enumerate kvp_enum_data;
+ struct hv_kvp_ipaddr_value kvp_ip_val;
+ struct hv_kvp_register kvp_register;
+ } body;
+} __attribute__((packed));
+
+struct hv_kvp_ip_msg {
+ uint8_t operation;
+ uint8_t pool;
+ struct hv_kvp_ipaddr_value kvp_ip_val;
+} __attribute__((packed));
+
+#endif /* _KVP_H */
diff --git a/sys/dev/hyperv/utilities/hv_snapshot.c b/sys/dev/hyperv/utilities/hv_snapshot.c
new file mode 100644
index 000000000000..45defe1b0f1e
--- /dev/null
+++ b/sys/dev/hyperv/utilities/hv_snapshot.c
@@ -0,0 +1,1061 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/lock.h>
+#include <sys/taskqueue.h>
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/kthread.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/un.h>
+#include <sys/endian.h>
+#include <sys/sema.h>
+#include <sys/signal.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/mutex.h>
+#include <sys/callout.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/utilities/hv_utilreg.h>
+#include <dev/hyperv/utilities/vmbus_icreg.h>
+#include <dev/hyperv/utilities/vmbus_icvar.h>
+
+#include "hv_snapshot.h"
+#include "vmbus_if.h"
+
+#define VSS_MAJOR 5
+#define VSS_MINOR 0
+#define VSS_MSGVER VMBUS_IC_VERSION(VSS_MAJOR, VSS_MINOR)
+
+#define VSS_FWVER_MAJOR 3
+#define VSS_FWVER VMBUS_IC_VERSION(VSS_FWVER_MAJOR, 0)
+
+#define TIMEOUT_LIMIT (15) /* seconds */
+enum hv_vss_op {
+ VSS_OP_CREATE = 0,
+ VSS_OP_DELETE,
+ VSS_OP_HOT_BACKUP,
+ VSS_OP_GET_DM_INFO,
+ VSS_OP_BU_COMPLETE,
+ /*
+ * Following operations are only supported with IC version >= 5.0
+ */
+ VSS_OP_FREEZE, /* Freeze the file systems in the VM */
+ VSS_OP_THAW, /* Unfreeze the file systems */
+ VSS_OP_AUTO_RECOVER,
+ VSS_OP_COUNT /* Number of operations, must be last */
+};
+
+/*
+ * Header for all VSS messages.
+ */
+struct hv_vss_hdr {
+ struct vmbus_icmsg_hdr ic_hdr;
+ uint8_t operation;
+ uint8_t reserved[7];
+} __packed;
+
+
+/*
+ * Flag values for hv_vss_check_feature. Only one value is
+ * currently supported.
+ */
+#define VSS_HBU_NO_AUTO_RECOVERY 0x00000005
+
+struct hv_vss_check_feature {
+ uint32_t flags;
+} __packed;
+
+struct hv_vss_check_dm_info {
+ uint32_t flags;
+} __packed;
+
+struct hv_vss_msg {
+ union {
+ struct hv_vss_hdr vss_hdr;
+ } hdr;
+ union {
+ struct hv_vss_check_feature vss_cf;
+ struct hv_vss_check_dm_info dm_info;
+ } body;
+} __packed;
+
+struct hv_vss_req {
+ struct hv_vss_opt_msg opt_msg; /* used to communicate with daemon */
+ struct hv_vss_msg msg; /* used to communicate with host */
+} __packed;
+
+/* hv_vss debug control */
+static int hv_vss_log = 0;
+
+#define hv_vss_log_error(...) do { \
+ if (hv_vss_log > 0) \
+ log(LOG_ERR, "hv_vss: " __VA_ARGS__); \
+} while (0)
+
+#define hv_vss_log_info(...) do { \
+ if (hv_vss_log > 1) \
+ log(LOG_INFO, "hv_vss: " __VA_ARGS__); \
+} while (0)
+
+static const struct vmbus_ic_desc vmbus_vss_descs[] = {
+ {
+ .ic_guid = { .hv_guid = {
+ 0x29, 0x2e, 0xfa, 0x35, 0x23, 0xea, 0x36, 0x42,
+ 0x96, 0xae, 0x3a, 0x6e, 0xba, 0xcb, 0xa4, 0x40} },
+ .ic_desc = "Hyper-V VSS"
+ },
+ VMBUS_IC_DESC_END
+};
+
+static const char * vss_opt_name[] = {"None", "VSSCheck", "Freeze", "Thaw"};
+
+/* character device prototypes */
+static d_open_t hv_vss_dev_open;
+static d_close_t hv_vss_dev_close;
+static d_poll_t hv_vss_dev_daemon_poll;
+static d_ioctl_t hv_vss_dev_daemon_ioctl;
+
+static d_open_t hv_appvss_dev_open;
+static d_close_t hv_appvss_dev_close;
+static d_poll_t hv_appvss_dev_poll;
+static d_ioctl_t hv_appvss_dev_ioctl;
+
+/* hv_vss character device structure */
+static struct cdevsw hv_vss_cdevsw =
+{
+ .d_version = D_VERSION,
+ .d_open = hv_vss_dev_open,
+ .d_close = hv_vss_dev_close,
+ .d_poll = hv_vss_dev_daemon_poll,
+ .d_ioctl = hv_vss_dev_daemon_ioctl,
+ .d_name = FS_VSS_DEV_NAME,
+};
+
+static struct cdevsw hv_appvss_cdevsw =
+{
+ .d_version = D_VERSION,
+ .d_open = hv_appvss_dev_open,
+ .d_close = hv_appvss_dev_close,
+ .d_poll = hv_appvss_dev_poll,
+ .d_ioctl = hv_appvss_dev_ioctl,
+ .d_name = APP_VSS_DEV_NAME,
+};
+
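+/*
+ * Minimal sketch of how the vss daemon is expected to drive the
+ * FS_VSS_DEV_NAME device (illustrative only, based on the ioctl handlers
+ * below; not shipped code):
+ *
+ *	struct hv_vss_opt_msg opt;
+ *	int fd = open(the FS_VSS_DEV_NAME device node, O_RDWR);
+ *
+ *	for (;;) {
+ *		(poll() until POLLIN is reported)
+ *		ioctl(fd, IOCHVVSSREAD, &opt);	(fetch freeze/thaw/check request)
+ *		(perform the requested operation and set opt.status)
+ *		ioctl(fd, IOCHVVSSWRITE, &opt);	(ack, forwarded to the host)
+ *	}
+ */
+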
+struct hv_vss_sc;
+/*
+ * Global state to track cdev
+ */
+struct hv_vss_dev_sc {
+ /*
+ * A msg received from the host moves to the notify queue, then to
+ * the ack queue, and is finally recycled to the free list.
+ */
+ STAILQ_HEAD(, hv_vss_req_internal) to_notify_queue;
+ STAILQ_HEAD(, hv_vss_req_internal) to_ack_queue;
+ struct hv_vss_sc *sc;
+ struct proc *proc_task;
+ struct selinfo hv_vss_selinfo;
+};
+/*
+ * Global state to track and synchronize the transaction requests from the host.
+ * VSS allows a user to register a function to freeze/thaw an application.
+ * The VSS kernel component notifies both the vss daemon and the user
+ * application, if the latter is registered.
+ * The implementation's state transitions are illustrated by:
+ * https://clovertrail.github.io/assets/vssdot.png
+ */
+typedef struct hv_vss_sc {
+ struct vmbus_ic_softc util_sc;
+ device_t dev;
+
+ struct task task;
+
+ /*
+ * mutex is used to protect access of list/queue,
+ * callout in request is also used this mutex.
+ */
+ struct mtx pending_mutex;
+ /*
+ * req_free_list contains all free items
+ */
+ LIST_HEAD(, hv_vss_req_internal) req_free_list;
+
+ /* Indicates if daemon registered with driver */
+ boolean_t register_done;
+
+ boolean_t app_register_done;
+
+ /* cdev for file system freeze/thaw */
+ struct cdev *hv_vss_dev;
+ /* cdev for application freeze/thaw */
+ struct cdev *hv_appvss_dev;
+
+ /* sc for app */
+ struct hv_vss_dev_sc app_sc;
+ /* sc for daemon */
+ struct hv_vss_dev_sc daemon_sc;
+} hv_vss_sc;
+
+typedef struct hv_vss_req_internal {
+ LIST_ENTRY(hv_vss_req_internal) link;
+ STAILQ_ENTRY(hv_vss_req_internal) slink;
+ struct hv_vss_req vss_req;
+
+ /* Rcv buffer for communicating with the host */
+ uint8_t *rcv_buf;
+ /* Length of host message */
+ uint32_t host_msg_len;
+ /* Host message id */
+ uint64_t host_msg_id;
+
+ hv_vss_sc *sc;
+
+ struct callout callout;
+} hv_vss_req_internal;
+
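+/*
+ * Search 'queue' for the request whose msgid matches 'id' and unlink it,
+ * leaving the result in 'reqp' (NULL when no match is found). The caller
+ * must hold sc->pending_mutex, as the _LOCKED suffix indicates.
+ */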
+#define SEARCH_REMOVE_REQ_LOCKED(reqp, queue, link, tmp, id) \
+ do { \
+ STAILQ_FOREACH_SAFE(reqp, queue, link, tmp) { \
+ if (reqp->vss_req.opt_msg.msgid == id) { \
+ STAILQ_REMOVE(queue, \
+ reqp, hv_vss_req_internal, link); \
+ break; \
+ } \
+ } \
+ } while (0)
+
+static bool
+hv_vss_is_daemon_killed_after_launch(hv_vss_sc *sc)
+{
+ return (!sc->register_done && sc->daemon_sc.proc_task);
+}
+
+/*
+ * Callback routine that gets called whenever there is a message from host
+ */
+static void
+hv_vss_callback(struct vmbus_channel *chan __unused, void *context)
+{
+ hv_vss_sc *sc = (hv_vss_sc*)context;
+ if (hv_vss_is_daemon_killed_after_launch(sc))
+ hv_vss_log_info("%s: daemon was killed!\n", __func__);
+ if (sc->register_done || sc->daemon_sc.proc_task) {
+ hv_vss_log_info("%s: Queuing work item\n", __func__);
+ if (hv_vss_is_daemon_killed_after_launch(sc))
+ hv_vss_log_info("%s: daemon was killed!\n", __func__);
+ taskqueue_enqueue(taskqueue_thread, &sc->task);
+ } else {
+ hv_vss_log_info("%s: daemon has never been registered\n", __func__);
+ }
+ hv_vss_log_info("%s: received msg from host\n", __func__);
+}
+/*
+ * Send the response back to the host.
+ */
+static void
+hv_vss_respond_host(uint8_t *rcv_buf, struct vmbus_channel *ch,
+ uint32_t recvlen, uint64_t requestid, uint32_t error)
+{
+ struct vmbus_icmsg_hdr *hv_icmsg_hdrp;
+
+ hv_icmsg_hdrp = (struct vmbus_icmsg_hdr *)rcv_buf;
+
+ hv_icmsg_hdrp->ic_status = error;
+ hv_icmsg_hdrp->ic_flags = HV_ICMSGHDRFLAG_TRANSACTION | HV_ICMSGHDRFLAG_RESPONSE;
+
+ error = vmbus_chan_send(ch, VMBUS_CHANPKT_TYPE_INBAND, 0,
+ rcv_buf, recvlen, requestid);
+ if (error)
+ hv_vss_log_info("%s: hv_vss_respond_host: sendpacket error:%d\n",
+ __func__, error);
+}
+
+static void
+hv_vss_notify_host_result_locked(struct hv_vss_req_internal *reqp, uint32_t status)
+{
+ struct hv_vss_msg* msg = (struct hv_vss_msg *)reqp->rcv_buf;
+ hv_vss_sc *sc = reqp->sc;
+ if (reqp->vss_req.opt_msg.opt == HV_VSS_CHECK) {
+ msg->body.vss_cf.flags = VSS_HBU_NO_AUTO_RECOVERY;
+ }
+ hv_vss_log_info("%s, %s response %s to host\n", __func__,
+ vss_opt_name[reqp->vss_req.opt_msg.opt],
+ status == HV_S_OK ? "Success" : "Fail");
+ hv_vss_respond_host(reqp->rcv_buf, vmbus_get_channel(reqp->sc->dev),
+ reqp->host_msg_len, reqp->host_msg_id, status);
+ /* recycle the request */
+ LIST_INSERT_HEAD(&sc->req_free_list, reqp, link);
+}
+
+static void
+hv_vss_notify_host_result(struct hv_vss_req_internal *reqp, uint32_t status)
+{
+ mtx_lock(&reqp->sc->pending_mutex);
+ hv_vss_notify_host_result_locked(reqp, status);
+ mtx_unlock(&reqp->sc->pending_mutex);
+}
+
+static void
+hv_vss_cp_vssreq_to_user(struct hv_vss_req_internal *reqp,
+ struct hv_vss_opt_msg *userdata)
+{
+ struct hv_vss_req *hv_vss_dev_buf;
+ hv_vss_dev_buf = &reqp->vss_req;
+ hv_vss_dev_buf->opt_msg.opt = HV_VSS_NONE;
+ switch (reqp->vss_req.msg.hdr.vss_hdr.operation) {
+ case VSS_OP_FREEZE:
+ hv_vss_dev_buf->opt_msg.opt = HV_VSS_FREEZE;
+ break;
+ case VSS_OP_THAW:
+ hv_vss_dev_buf->opt_msg.opt = HV_VSS_THAW;
+ break;
+ case VSS_OP_HOT_BACKUP:
+ hv_vss_dev_buf->opt_msg.opt = HV_VSS_CHECK;
+ break;
+ }
+ *userdata = hv_vss_dev_buf->opt_msg;
+ hv_vss_log_info("%s, read data from user for "
+ "%s (%ju) \n", __func__, vss_opt_name[userdata->opt],
+ (uintmax_t)userdata->msgid);
+}
+
+/**
+ * Remove the request from the daemon's or app's notify/ack queue,
+ * and recycle it by inserting it into the free list.
+ *
+ * When the daemon/app has been notified but has not yet acked, the request
+ * is located in either the notify queue or the ack queue.
+ */
+static struct hv_vss_req_internal*
+hv_vss_drain_req_queue_locked(hv_vss_sc *sc, uint64_t req_id)
+{
+ struct hv_vss_req_internal *reqp, *tmp;
+ SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->daemon_sc.to_notify_queue,
+ slink, tmp, req_id);
+ if (reqp == NULL)
+ SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->daemon_sc.to_ack_queue,
+ slink, tmp, req_id);
+ if (reqp == NULL)
+ SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->app_sc.to_notify_queue,
+ slink, tmp, req_id);
+ if (reqp == NULL)
+ SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->app_sc.to_ack_queue, slink,
+ tmp, req_id);
+ return (reqp);
+}
+/**
+ * Actions for daemon who has been notified.
+ */
+static void
+hv_vss_notified(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
+{
+ struct hv_vss_req_internal *reqp;
+ mtx_lock(&dev_sc->sc->pending_mutex);
+ if (!STAILQ_EMPTY(&dev_sc->to_notify_queue)) {
+ reqp = STAILQ_FIRST(&dev_sc->to_notify_queue);
+ hv_vss_cp_vssreq_to_user(reqp, userdata);
+ STAILQ_REMOVE_HEAD(&dev_sc->to_notify_queue, slink);
+ /* insert the msg to queue for write */
+ STAILQ_INSERT_TAIL(&dev_sc->to_ack_queue, reqp, slink);
+ userdata->status = VSS_SUCCESS;
+ } else {
+ /* Timeout occurred, thus the request was removed from the queue. */
+ hv_vss_log_info("%s: notify queue is empty!\n", __func__);
+ userdata->status = VSS_FAIL;
+ }
+ mtx_unlock(&dev_sc->sc->pending_mutex);
+}
+
+static void
+hv_vss_notify(struct hv_vss_dev_sc *dev_sc, struct hv_vss_req_internal *reqp)
+{
+ uint32_t opt = reqp->vss_req.opt_msg.opt;
+ mtx_lock(&dev_sc->sc->pending_mutex);
+ STAILQ_INSERT_TAIL(&dev_sc->to_notify_queue, reqp, slink);
+ hv_vss_log_info("%s: issuing query %s (%ju) to %s\n", __func__,
+ vss_opt_name[opt], (uintmax_t)reqp->vss_req.opt_msg.msgid,
+ &dev_sc->sc->app_sc == dev_sc ? "app" : "daemon");
+ mtx_unlock(&dev_sc->sc->pending_mutex);
+ selwakeup(&dev_sc->hv_vss_selinfo);
+}
+
+/**
+ * Actions for daemon who has acknowledged.
+ */
+static void
+hv_vss_daemon_acked(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
+{
+ struct hv_vss_req_internal *reqp, *tmp;
+ uint64_t req_id;
+ int opt;
+ uint32_t status;
+
+ opt = userdata->opt;
+ req_id = userdata->msgid;
+ status = userdata->status;
+ /* make sure the reserved fields are all zeros. */
+ memset(&userdata->reserved, 0, sizeof(struct hv_vss_opt_msg) -
+ __offsetof(struct hv_vss_opt_msg, reserved));
+ mtx_lock(&dev_sc->sc->pending_mutex);
+ SEARCH_REMOVE_REQ_LOCKED(reqp, &dev_sc->to_ack_queue, slink, tmp, req_id);
+ mtx_unlock(&dev_sc->sc->pending_mutex);
+ if (reqp == NULL) {
+ hv_vss_log_info("%s Timeout: fail to find daemon ack request\n",
+ __func__);
+ userdata->status = VSS_FAIL;
+ return;
+ }
+ KASSERT(opt == reqp->vss_req.opt_msg.opt, ("Mismatched VSS operation!"));
+ hv_vss_log_info("%s, get response %d from daemon for %s (%ju) \n", __func__,
+ status, vss_opt_name[opt], (uintmax_t)req_id);
+ switch (opt) {
+ case HV_VSS_CHECK:
+ case HV_VSS_FREEZE:
+ callout_drain(&reqp->callout);
+ hv_vss_notify_host_result(reqp,
+ status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
+ break;
+ case HV_VSS_THAW:
+ if (dev_sc->sc->app_register_done) {
+ if (status == VSS_SUCCESS) {
+ hv_vss_notify(&dev_sc->sc->app_sc, reqp);
+ } else {
+ /* handle error */
+ callout_drain(&reqp->callout);
+ hv_vss_notify_host_result(reqp, HV_E_FAIL);
+ }
+ } else {
+ callout_drain(&reqp->callout);
+ hv_vss_notify_host_result(reqp,
+ status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
+ }
+ break;
+ }
+}
+
+/**
+ * Actions for app who has acknowledged.
+ */
+static void
+hv_vss_app_acked(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata)
+{
+ struct hv_vss_req_internal *reqp, *tmp;
+ uint64_t req_id;
+ int opt;
+ uint8_t status;
+
+ opt = userdata->opt;
+ req_id = userdata->msgid;
+ status = userdata->status;
+ /* make sure the reserved fields are all zeros. */
+ memset(&userdata->reserved, 0, sizeof(struct hv_vss_opt_msg) -
+ __offsetof(struct hv_vss_opt_msg, reserved));
+ mtx_lock(&dev_sc->sc->pending_mutex);
+ SEARCH_REMOVE_REQ_LOCKED(reqp, &dev_sc->to_ack_queue, slink, tmp, req_id);
+ mtx_unlock(&dev_sc->sc->pending_mutex);
+ if (reqp == NULL) {
+ hv_vss_log_info("%s Timeout: fail to find app ack request\n",
+ __func__);
+ userdata->status = VSS_FAIL;
+ return;
+ }
+ KASSERT(opt == reqp->vss_req.opt_msg.opt, ("Mismatched VSS operation!"));
+ hv_vss_log_info("%s, get response %d from app for %s (%ju) \n",
+ __func__, status, vss_opt_name[opt], (uintmax_t)req_id);
+ if (dev_sc->sc->register_done) {
+ switch (opt) {
+ case HV_VSS_CHECK:
+ case HV_VSS_FREEZE:
+ if (status == VSS_SUCCESS) {
+ hv_vss_notify(&dev_sc->sc->daemon_sc, reqp);
+ } else {
+ /* handle error */
+ callout_drain(&reqp->callout);
+ hv_vss_notify_host_result(reqp, HV_E_FAIL);
+ }
+ break;
+ case HV_VSS_THAW:
+ callout_drain(&reqp->callout);
+ hv_vss_notify_host_result(reqp,
+ status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL);
+ break;
+ }
+ } else {
+ hv_vss_log_info("%s, Fatal: vss daemon was killed\n", __func__);
+ }
+}
+
+static int
+hv_vss_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ struct proc *td_proc;
+ td_proc = td->td_proc;
+
+ struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
+ hv_vss_log_info("%s: %s opens device \"%s\" successfully.\n",
+ __func__, td_proc->p_comm, FS_VSS_DEV_NAME);
+
+ if (dev_sc->sc->register_done)
+ return (EBUSY);
+
+ dev_sc->sc->register_done = true;
+ hv_vss_callback(vmbus_get_channel(dev_sc->sc->dev), dev_sc->sc);
+
+ dev_sc->proc_task = curproc;
+ return (0);
+}
+
+static int
+hv_vss_dev_close(struct cdev *dev, int fflag __unused, int devtype __unused,
+ struct thread *td)
+{
+ struct proc *td_proc;
+ td_proc = td->td_proc;
+
+ struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
+
+ hv_vss_log_info("%s: %s closes device \"%s\"\n",
+ __func__, td_proc->p_comm, FS_VSS_DEV_NAME);
+ dev_sc->sc->register_done = false;
+ return (0);
+}
+
+static int
+hv_vss_dev_daemon_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
+ struct thread *td)
+{
+ struct proc *td_proc;
+ struct hv_vss_dev_sc *sc;
+
+ td_proc = td->td_proc;
+ sc = (struct hv_vss_dev_sc*)dev->si_drv1;
+
+ hv_vss_log_info("%s: %s invoked vss ioctl\n", __func__, td_proc->p_comm);
+
+ struct hv_vss_opt_msg* userdata = (struct hv_vss_opt_msg*)data;
+ switch(cmd) {
+ case IOCHVVSSREAD:
+ hv_vss_notified(sc, userdata);
+ break;
+ case IOCHVVSSWRITE:
+ hv_vss_daemon_acked(sc, userdata);
+ break;
+ }
+ return (0);
+}
+
+/*
+ * The VSS daemon's poll(2) on the fsvss device invokes this function to
+ * check whether data is available for the daemon to read.
+ */
+static int
+hv_vss_dev_daemon_poll(struct cdev *dev, int events, struct thread *td)
+{
+ int revent = 0;
+ struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
+
+ mtx_lock(&dev_sc->sc->pending_mutex);
+ /**
+ * if there is data ready, inform daemon's poll
+ */
+ if (!STAILQ_EMPTY(&dev_sc->to_notify_queue))
+ revent = POLLIN;
+ if (revent == 0)
+ selrecord(td, &dev_sc->hv_vss_selinfo);
+ hv_vss_log_info("%s return 0x%x\n", __func__, revent);
+ mtx_unlock(&dev_sc->sc->pending_mutex);
+ return (revent);
+}
+
+static int
+hv_appvss_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ struct proc *td_proc;
+ td_proc = td->td_proc;
+
+ struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
+ hv_vss_log_info("%s: %s opens device \"%s\" successfully.\n",
+ __func__, td_proc->p_comm, APP_VSS_DEV_NAME);
+
+ if (dev_sc->sc->app_register_done)
+ return (EBUSY);
+
+ dev_sc->sc->app_register_done = true;
+ dev_sc->proc_task = curproc;
+ return (0);
+}
+
+static int
+hv_appvss_dev_close(struct cdev *dev, int fflag __unused, int devtype __unused,
+ struct thread *td)
+{
+ struct proc *td_proc;
+ td_proc = td->td_proc;
+
+ struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
+
+ hv_vss_log_info("%s: %s closes device \"%s\".\n",
+ __func__, td_proc->p_comm, APP_VSS_DEV_NAME);
+ dev_sc->sc->app_register_done = false;
+ return (0);
+}
+
+static int
+hv_appvss_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
+ struct thread *td)
+{
+ struct proc *td_proc;
+ struct hv_vss_dev_sc *dev_sc;
+
+ td_proc = td->td_proc;
+ dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
+
+ hv_vss_log_info("%s: %s invoked vss ioctl\n", __func__, td_proc->p_comm);
+
+ struct hv_vss_opt_msg* userdata = (struct hv_vss_opt_msg*)data;
+ switch(cmd) {
+ case IOCHVVSSREAD:
+ hv_vss_notified(dev_sc, userdata);
+ break;
+ case IOCHVVSSWRITE:
+ hv_vss_app_acked(dev_sc, userdata);
+ break;
+ }
+ return (0);
+}
+
+/*
+ * The application's poll(2) on the appvss device invokes this function to
+ * check whether data is available for the application to read.
+ */
+static int
+hv_appvss_dev_poll(struct cdev *dev, int events, struct thread *td)
+{
+ int revent = 0;
+ struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1;
+
+ mtx_lock(&dev_sc->sc->pending_mutex);
+ /**
+	 * if there is data ready, inform the application's poll
+ */
+ if (!STAILQ_EMPTY(&dev_sc->to_notify_queue))
+ revent = POLLIN;
+ if (revent == 0)
+ selrecord(td, &dev_sc->hv_vss_selinfo);
+ hv_vss_log_info("%s return 0x%x\n", __func__, revent);
+ mtx_unlock(&dev_sc->sc->pending_mutex);
+ return (revent);
+}
+
+static void
+hv_vss_timeout(void *arg)
+{
+ hv_vss_req_internal *reqp = arg;
+ hv_vss_req_internal *request;
+ hv_vss_sc* sc = reqp->sc;
+ uint64_t req_id = reqp->vss_req.opt_msg.msgid;
+	/* The callout is bound to pending_mutex, so the mutex is held here. */
+ KASSERT(mtx_owned(&sc->pending_mutex), ("mutex lock is not owned!"));
+ request = hv_vss_drain_req_queue_locked(sc, req_id);
+ KASSERT(request != NULL, ("timeout but fail to find request"));
+ hv_vss_notify_host_result_locked(reqp, HV_E_FAIL);
+}
+
+/*
+ * Initialize an internal request from the VSS message received from the host.
+ */
+static void
+hv_vss_init_req(hv_vss_req_internal *reqp,
+ uint32_t recvlen, uint64_t requestid, uint8_t *vss_buf, hv_vss_sc *sc)
+{
+ struct timespec vm_ts;
+ struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf;
+
+ memset(reqp, 0, __offsetof(hv_vss_req_internal, callout));
+ reqp->host_msg_len = recvlen;
+ reqp->host_msg_id = requestid;
+ reqp->rcv_buf = vss_buf;
+ reqp->sc = sc;
+ memcpy(&reqp->vss_req.msg,
+ (struct hv_vss_msg *)vss_buf, sizeof(struct hv_vss_msg));
+ /* set the opt for users */
+ switch (msg->hdr.vss_hdr.operation) {
+ case VSS_OP_FREEZE:
+ reqp->vss_req.opt_msg.opt = HV_VSS_FREEZE;
+ break;
+ case VSS_OP_THAW:
+ reqp->vss_req.opt_msg.opt = HV_VSS_THAW;
+ break;
+ case VSS_OP_HOT_BACKUP:
+ reqp->vss_req.opt_msg.opt = HV_VSS_CHECK;
+ break;
+ }
+ /* Use a timestamp as msg request ID */
+ nanotime(&vm_ts);
+ reqp->vss_req.opt_msg.msgid = (vm_ts.tv_sec * NANOSEC) + vm_ts.tv_nsec;
+}
+
+static hv_vss_req_internal*
+hv_vss_get_new_req_locked(hv_vss_sc *sc)
+{
+ hv_vss_req_internal *reqp;
+ if (!STAILQ_EMPTY(&sc->daemon_sc.to_notify_queue) ||
+ !STAILQ_EMPTY(&sc->daemon_sc.to_ack_queue) ||
+ !STAILQ_EMPTY(&sc->app_sc.to_notify_queue) ||
+ !STAILQ_EMPTY(&sc->app_sc.to_ack_queue)) {
+		/*
+		 * A new request arrived from the host before the
+		 * previous requests were finished.
+		 */
+		hv_vss_log_info("%s: Warning: new request arrived from host "
+		    "before previous requests were finished\n", __func__);
+ return (NULL);
+ }
+ if (LIST_EMPTY(&sc->req_free_list)) {
+ /* TODO Error: no buffer */
+ hv_vss_log_info("Error: No buffer\n");
+ return (NULL);
+ }
+ reqp = LIST_FIRST(&sc->req_free_list);
+ LIST_REMOVE(reqp, link);
+ return (reqp);
+}
+
+static void
+hv_vss_start_notify(hv_vss_req_internal *reqp, uint32_t opt)
+{
+ hv_vss_sc *sc = reqp->sc;
+ /*
+ * Freeze/Check notification sequence: kernel -> app -> daemon(fs)
+ * Thaw notification sequence: kernel -> daemon(fs) -> app
+ *
+ * We should wake up the daemon, in case it's doing poll().
+ * The response should be received after 5s, otherwise, trigger timeout.
+ */
+ switch (opt) {
+ case VSS_OP_FREEZE:
+ case VSS_OP_HOT_BACKUP:
+ if (sc->app_register_done)
+ hv_vss_notify(&sc->app_sc, reqp);
+ else
+ hv_vss_notify(&sc->daemon_sc, reqp);
+ callout_reset(&reqp->callout, TIMEOUT_LIMIT * hz,
+ hv_vss_timeout, reqp);
+ break;
+ case VSS_OP_THAW:
+ hv_vss_notify(&sc->daemon_sc, reqp);
+ callout_reset(&reqp->callout, TIMEOUT_LIMIT * hz,
+ hv_vss_timeout, reqp);
+ break;
+ }
+}
+
+/*
+ * Function to read the vss request buffer from host
+ * and interact with daemon
+ */
+static void
+hv_vss_process_request(void *context, int pending __unused)
+{
+ uint8_t *vss_buf;
+ struct vmbus_channel *channel;
+ uint32_t recvlen = 0;
+ uint64_t requestid;
+ struct vmbus_icmsg_hdr *icmsghdrp;
+ int ret = 0;
+ hv_vss_sc *sc;
+ hv_vss_req_internal *reqp;
+
+ hv_vss_log_info("%s: entering hv_vss_process_request\n", __func__);
+
+ sc = (hv_vss_sc*)context;
+ vss_buf = sc->util_sc.ic_buf;
+ channel = vmbus_get_channel(sc->dev);
+
+ recvlen = sc->util_sc.ic_buflen;
+ ret = vmbus_chan_recv(channel, vss_buf, &recvlen, &requestid);
+ KASSERT(ret != ENOBUFS, ("hvvss recvbuf is not large enough"));
+ /* XXX check recvlen to make sure that it contains enough data */
+
+ while ((ret == 0) && (recvlen > 0)) {
+ icmsghdrp = (struct vmbus_icmsg_hdr *)vss_buf;
+
+ if (icmsghdrp->ic_type == HV_ICMSGTYPE_NEGOTIATE) {
+ ret = vmbus_ic_negomsg(&sc->util_sc, vss_buf,
+ &recvlen, VSS_FWVER, VSS_MSGVER);
+ hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
+ recvlen, requestid, ret);
+ hv_vss_log_info("%s: version negotiated\n", __func__);
+ } else if (!hv_vss_is_daemon_killed_after_launch(sc)) {
+ struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf;
+ switch(msg->hdr.vss_hdr.operation) {
+ case VSS_OP_FREEZE:
+ case VSS_OP_THAW:
+ case VSS_OP_HOT_BACKUP:
+ mtx_lock(&sc->pending_mutex);
+ reqp = hv_vss_get_new_req_locked(sc);
+ mtx_unlock(&sc->pending_mutex);
+ if (reqp == NULL) {
+ /* ignore this request from host */
+ break;
+ }
+ hv_vss_init_req(reqp, recvlen, requestid, vss_buf, sc);
+ hv_vss_log_info("%s: receive %s (%ju) from host\n",
+ __func__,
+ vss_opt_name[reqp->vss_req.opt_msg.opt],
+ (uintmax_t)reqp->vss_req.opt_msg.msgid);
+ hv_vss_start_notify(reqp, msg->hdr.vss_hdr.operation);
+ break;
+ case VSS_OP_GET_DM_INFO:
+ hv_vss_log_info("%s: receive GET_DM_INFO from host\n",
+ __func__);
+ msg->body.dm_info.flags = 0;
+ hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
+ recvlen, requestid, HV_S_OK);
+ break;
+ default:
+ device_printf(sc->dev, "Unknown opt from host: %d\n",
+ msg->hdr.vss_hdr.operation);
+ break;
+ }
+ } else {
+ /* daemon was killed for some reason after it was launched */
+ struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf;
+ switch(msg->hdr.vss_hdr.operation) {
+ case VSS_OP_FREEZE:
+ hv_vss_log_info("%s: response fail for FREEZE\n",
+ __func__);
+ break;
+ case VSS_OP_THAW:
+ hv_vss_log_info("%s: response fail for THAW\n",
+ __func__);
+ break;
+ case VSS_OP_HOT_BACKUP:
+ hv_vss_log_info("%s: response fail for HOT_BACKUP\n",
+ __func__);
+ msg->body.vss_cf.flags = VSS_HBU_NO_AUTO_RECOVERY;
+ break;
+ case VSS_OP_GET_DM_INFO:
+ hv_vss_log_info("%s: response fail for GET_DM_INFO\n",
+ __func__);
+ msg->body.dm_info.flags = 0;
+ break;
+ default:
+ device_printf(sc->dev, "Unknown opt from host: %d\n",
+ msg->hdr.vss_hdr.operation);
+ break;
+ }
+ hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev),
+ recvlen, requestid, HV_E_FAIL);
+ }
+ /*
+ * Try reading next buffer
+ */
+ recvlen = sc->util_sc.ic_buflen;
+ ret = vmbus_chan_recv(channel, vss_buf, &recvlen, &requestid);
+ KASSERT(ret != ENOBUFS, ("hvvss recvbuf is not large enough"));
+ /* XXX check recvlen to make sure that it contains enough data */
+
+		hv_vss_log_info("%s: read: context %p, ret=%d, recvlen=%d\n",
+ __func__, context, ret, recvlen);
+ }
+}
+
+static int
+hv_vss_probe(device_t dev)
+{
+ return (vmbus_ic_probe(dev, vmbus_vss_descs));
+}
+
+static int
+hv_vss_init_send_receive_queue(device_t dev)
+{
+ hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
+ int i;
+	const int max_list = 4; /* four preallocated requests are plenty */
+ struct hv_vss_req_internal* reqp;
+
+ LIST_INIT(&sc->req_free_list);
+ STAILQ_INIT(&sc->daemon_sc.to_notify_queue);
+ STAILQ_INIT(&sc->daemon_sc.to_ack_queue);
+ STAILQ_INIT(&sc->app_sc.to_notify_queue);
+ STAILQ_INIT(&sc->app_sc.to_ack_queue);
+
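+	/*
+	 * Preallocate the request pool.  Each callout is initialized with
+	 * callout_init_mtx(), so hv_vss_timeout() runs with pending_mutex
+	 * held.
+	 */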
+ for (i = 0; i < max_list; i++) {
+ reqp = malloc(sizeof(struct hv_vss_req_internal),
+ M_DEVBUF, M_WAITOK|M_ZERO);
+ LIST_INSERT_HEAD(&sc->req_free_list, reqp, link);
+ callout_init_mtx(&reqp->callout, &sc->pending_mutex, 0);
+ }
+ return (0);
+}
+
+static int
+hv_vss_destroy_send_receive_queue(device_t dev)
+{
+ hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
+ hv_vss_req_internal* reqp;
+
+ while (!LIST_EMPTY(&sc->req_free_list)) {
+ reqp = LIST_FIRST(&sc->req_free_list);
+ LIST_REMOVE(reqp, link);
+ free(reqp, M_DEVBUF);
+ }
+
+ while (!STAILQ_EMPTY(&sc->daemon_sc.to_notify_queue)) {
+ reqp = STAILQ_FIRST(&sc->daemon_sc.to_notify_queue);
+ STAILQ_REMOVE_HEAD(&sc->daemon_sc.to_notify_queue, slink);
+ free(reqp, M_DEVBUF);
+ }
+
+ while (!STAILQ_EMPTY(&sc->daemon_sc.to_ack_queue)) {
+ reqp = STAILQ_FIRST(&sc->daemon_sc.to_ack_queue);
+ STAILQ_REMOVE_HEAD(&sc->daemon_sc.to_ack_queue, slink);
+ free(reqp, M_DEVBUF);
+ }
+
+ while (!STAILQ_EMPTY(&sc->app_sc.to_notify_queue)) {
+ reqp = STAILQ_FIRST(&sc->app_sc.to_notify_queue);
+ STAILQ_REMOVE_HEAD(&sc->app_sc.to_notify_queue, slink);
+ free(reqp, M_DEVBUF);
+ }
+
+ while (!STAILQ_EMPTY(&sc->app_sc.to_ack_queue)) {
+ reqp = STAILQ_FIRST(&sc->app_sc.to_ack_queue);
+ STAILQ_REMOVE_HEAD(&sc->app_sc.to_ack_queue, slink);
+ free(reqp, M_DEVBUF);
+ }
+ return (0);
+}
+
+static int
+hv_vss_attach(device_t dev)
+{
+ int error;
+ struct sysctl_oid_list *child;
+ struct sysctl_ctx_list *ctx;
+
+ hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
+
+ sc->dev = dev;
+ mtx_init(&sc->pending_mutex, "hv_vss pending mutex", NULL, MTX_DEF);
+
+ ctx = device_get_sysctl_ctx(dev);
+ child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+
+ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "hv_vss_log",
+ CTLFLAG_RWTUN, &hv_vss_log, 0, "Hyperv VSS service log level");
+
+ TASK_INIT(&sc->task, 0, hv_vss_process_request, sc);
+ hv_vss_init_send_receive_queue(dev);
+ /* create character device for file system freeze/thaw */
+ error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
+ &sc->hv_vss_dev,
+ &hv_vss_cdevsw,
+ 0,
+ UID_ROOT,
+ GID_WHEEL,
+ 0640,
+ FS_VSS_DEV_NAME);
+
+ if (error != 0) {
+		hv_vss_log_info("Failed to create '%s': %d\n", FS_VSS_DEV_NAME, error);
+ return (error);
+ }
+ sc->hv_vss_dev->si_drv1 = &sc->daemon_sc;
+ sc->daemon_sc.sc = sc;
+ /* create character device for application freeze/thaw */
+ error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK,
+ &sc->hv_appvss_dev,
+ &hv_appvss_cdevsw,
+ 0,
+ UID_ROOT,
+ GID_WHEEL,
+ 0640,
+ APP_VSS_DEV_NAME);
+
+	if (error != 0) {
+		hv_vss_log_info("Failed to create '%s': %d\n", APP_VSS_DEV_NAME, error);
+		/* Do not leak the fsvss character device created above. */
+		destroy_dev(sc->hv_vss_dev);
+		return (error);
+	}
+ sc->hv_appvss_dev->si_drv1 = &sc->app_sc;
+ sc->app_sc.sc = sc;
+
+ return (vmbus_ic_attach(dev, hv_vss_callback));
+}
+
+static int
+hv_vss_detach(device_t dev)
+{
+ hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev);
+ mtx_destroy(&sc->pending_mutex);
+ if (sc->daemon_sc.proc_task != NULL) {
+ PROC_LOCK(sc->daemon_sc.proc_task);
+ kern_psignal(sc->daemon_sc.proc_task, SIGKILL);
+ PROC_UNLOCK(sc->daemon_sc.proc_task);
+ }
+ if (sc->app_sc.proc_task != NULL) {
+ PROC_LOCK(sc->app_sc.proc_task);
+ kern_psignal(sc->app_sc.proc_task, SIGKILL);
+ PROC_UNLOCK(sc->app_sc.proc_task);
+ }
+ hv_vss_destroy_send_receive_queue(dev);
+ destroy_dev(sc->hv_vss_dev);
+ destroy_dev(sc->hv_appvss_dev);
+ return (vmbus_ic_detach(dev));
+}
+
+static device_method_t vss_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, hv_vss_probe),
+ DEVMETHOD(device_attach, hv_vss_attach),
+ DEVMETHOD(device_detach, hv_vss_detach),
+ { 0, 0 }
+};
+
+static driver_t vss_driver = { "hvvss", vss_methods, sizeof(hv_vss_sc)};
+
+static devclass_t vss_devclass;
+
+DRIVER_MODULE(hv_vss, vmbus, vss_driver, vss_devclass, NULL, NULL);
+MODULE_VERSION(hv_vss, 1);
+MODULE_DEPEND(hv_vss, vmbus, 1, 1, 1);
diff --git a/sys/dev/hyperv/utilities/hv_snapshot.h b/sys/dev/hyperv/utilities/hv_snapshot.h
new file mode 100644
index 000000000000..e3c9e0c9fef2
--- /dev/null
+++ b/sys/dev/hyperv/utilities/hv_snapshot.h
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VSS_H
+#define _VSS_H
+#include <sys/ioccom.h>
+#define FS_VSS_DEV_NAME "hv_fsvss_dev"
+#define APP_VSS_DEV_NAME "hv_appvss_dev"
+
+#define VSS_DEV(VSS) "/dev/"VSS
+
+#define VSS_SUCCESS 0x00000000
+#define VSS_FAIL 0x00000001
+
+enum hv_vss_op_t {
+ HV_VSS_NONE = 0,
+ HV_VSS_CHECK,
+ HV_VSS_FREEZE,
+ HV_VSS_THAW,
+ HV_VSS_COUNT
+};
+
+struct hv_vss_opt_msg {
+ uint32_t opt; /* operation */
+ uint32_t status; /* 0 for success, 1 for error */
+ uint64_t msgid; /* an ID used to identify the transaction */
+ uint8_t reserved[48]; /* reserved values are all zeroes */
+};
+#define IOCHVVSSREAD _IOR('v', 2, struct hv_vss_opt_msg)
+#define IOCHVVSSWRITE _IOW('v', 3, struct hv_vss_opt_msg)
+#endif
diff --git a/sys/dev/hyperv/utilities/hv_utilreg.h b/sys/dev/hyperv/utilities/hv_utilreg.h
new file mode 100644
index 000000000000..b29c0f99204f
--- /dev/null
+++ b/sys/dev/hyperv/utilities/hv_utilreg.h
@@ -0,0 +1,86 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HV_UTILREG_H_
+#define _HV_UTILREG_H_
+
+/*
+ * Some Hyper-V status codes.
+ */
+#define HV_S_OK 0x00000000
+#define HV_E_FAIL 0x80004005
+#define HV_S_CONT 0x80070103
+#define HV_ERROR_NOT_SUPPORTED 0x80070032
+#define HV_ERROR_MACHINE_LOCKED 0x800704F7
+#define HV_ERROR_DEVICE_NOT_CONNECTED 0x8007048F
+#define HV_INVALIDARG 0x80070057
+#define HV_GUID_NOTFOUND 0x80041002
+
+/*
+ * Common defines for Hyper-V ICs
+ */
+#define HV_ICMSGTYPE_NEGOTIATE 0
+#define HV_ICMSGTYPE_HEARTBEAT 1
+#define HV_ICMSGTYPE_KVPEXCHANGE 2
+#define HV_ICMSGTYPE_SHUTDOWN 3
+#define HV_ICMSGTYPE_TIMESYNC 4
+#define HV_ICMSGTYPE_VSS 5
+
+#define HV_ICMSGHDRFLAG_TRANSACTION 1
+#define HV_ICMSGHDRFLAG_REQUEST 2
+#define HV_ICMSGHDRFLAG_RESPONSE 4
+
+typedef struct hv_vmbus_pipe_hdr {
+ uint32_t flags;
+ uint32_t msgsize;
+} __packed hv_vmbus_pipe_hdr;
+
+typedef struct hv_vmbus_ic_version {
+ uint16_t major;
+ uint16_t minor;
+} __packed hv_vmbus_ic_version;
+
+typedef struct hv_vmbus_icmsg_hdr {
+ hv_vmbus_ic_version icverframe;
+ uint16_t icmsgtype;
+ hv_vmbus_ic_version icvermsg;
+ uint16_t icmsgsize;
+ uint32_t status;
+ uint8_t ictransaction_id;
+ uint8_t icflags;
+ uint8_t reserved[2];
+} __packed hv_vmbus_icmsg_hdr;
+
+typedef struct hv_vmbus_icmsg_negotiate {
+ uint16_t icframe_vercnt;
+ uint16_t icmsg_vercnt;
+ uint32_t reserved;
+	hv_vmbus_ic_version icversion_data[1]; /* variable size array */
+} __packed hv_vmbus_icmsg_negotiate;
+
+#endif /* !_HV_UTILREG_H_ */
diff --git a/sys/dev/hyperv/utilities/unicode.h b/sys/dev/hyperv/utilities/unicode.h
new file mode 100644
index 000000000000..696777cbbf26
--- /dev/null
+++ b/sys/dev/hyperv/utilities/unicode.h
@@ -0,0 +1,201 @@
+/* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */
+
+/*-
+ * Copyright (c) 2007 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/types.h>
+
+#define UNICODE_DECOMPOSE 0x01
+#define UNICODE_PRECOMPOSE 0x02
+#define UNICODE_UTF8_LATIN1_FALLBACK 0x03
+
+size_t utf8_to_utf16(uint16_t *, size_t, const char *, size_t, int, int *);
+size_t utf16_to_utf8(char *, size_t, const uint16_t *, size_t, int, int *);
+
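+/*
+ * Convert a UTF-8 byte sequence to UTF-16 code units.  Returns the number
+ * of code units the complete conversion produces; only the first dst_len
+ * units are stored in dst.  Malformed input is skipped and counted in *errp.
+ */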
+size_t
+utf8_to_utf16(uint16_t *dst, size_t dst_len,
+ const char *src, size_t src_len,
+ int flags, int *errp)
+{
+ const unsigned char *s;
+ size_t spos, dpos;
+ int error;
+ uint16_t c;
+
+#define IS_CONT(c) (((c)&0xc0) == 0x80)
+
+ error = 0;
+ s = (const unsigned char *)src;
+ spos = dpos = 0;
+ while (spos<src_len) {
+ if (s[spos] < 0x80)
+ c = s[spos++];
+		else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
+			 && (spos == src_len-1 || !IS_CONT(s[spos+1]))
+			 && s[spos]>=0xa0) {
+ /* not valid UTF-8, assume ISO 8859-1 */
+ c = s[spos++];
+ }
+ else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
+ /* continuation byte without lead byte
+ or lead byte for codepoint above 0x10ffff */
+ error++;
+ spos++;
+ continue;
+ }
+ else if (s[spos] < 0xe0) {
+			if (spos == src_len-1 || !IS_CONT(s[spos+1])) {
+ spos++;
+ error++;
+ continue;
+ }
+ c = ((s[spos] & 0x3f) << 6) | (s[spos+1] & 0x3f);
+ spos += 2;
+ if (c < 0x80) {
+ /* overlong encoding */
+ error++;
+ continue;
+ }
+ }
+ else if (s[spos] < 0xf0) {
+ if (spos >= src_len-2
+ || !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
+ spos++;
+ error++;
+ continue;
+ }
+ c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
+ | (s[spos+2] & 0x3f);
+ spos += 3;
+			if (c < 0x800 || (c & 0xf800) == 0xd800) {
+ /* overlong encoding or encoded surrogate */
+ error++;
+ continue;
+ }
+ }
+ else {
+ uint32_t cc;
+ /* UTF-16 surrogate pair */
+
+ if (spos >= src_len-3 || !IS_CONT(s[spos+1])
+ || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
+ spos++;
+ error++;
+ continue;
+ }
+ cc = ((s[spos] & 0x03) << 18) | ((s[spos+1] & 0x3f) << 12)
+ | ((s[spos+2] & 0x3f) << 6) | (s[spos+3] & 0x3f);
+ spos += 4;
+ if (cc < 0x10000) {
+ /* overlong encoding */
+ error++;
+ continue;
+ }
+ if (dst && dpos < dst_len)
+ dst[dpos] = (0xd800 | ((cc-0x10000)>>10));
+ dpos++;
+			c = 0xdc00 | ((cc-0x10000) & 0x3ff);
+ }
+
+ if (dst && dpos < dst_len)
+ dst[dpos] = c;
+ dpos++;
+ }
+
+ if (errp)
+ *errp = error;
+
+ return dpos;
+
+#undef IS_CONT
+}
+
+
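+/*
+ * Convert UTF-16 code units to UTF-8.  Returns the number of bytes the
+ * complete conversion produces; output is stored in dst only while it
+ * fits.  Unpaired surrogates are skipped and counted in *errp.
+ */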
+size_t
+utf16_to_utf8(char *dst, size_t dst_len,
+ const uint16_t *src, size_t src_len,
+ int flags, int *errp)
+{
+ uint16_t spos, dpos;
+ int error;
+
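+/*
+ * CHECK_LENGTH() disables output (dst = NULL) once the next sequence would
+ * no longer fit, so ADD_BYTE() keeps counting the bytes that a complete
+ * conversion would need.
+ */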
+#define CHECK_LENGTH(l) (dpos > dst_len-(l) ? dst=NULL : NULL)
+#define ADD_BYTE(b) (dst ? dst[dpos] = (b) : 0, dpos++)
+
+ error = 0;
+ dpos = 0;
+ for (spos=0; spos<src_len; spos++) {
+ if (src[spos] < 0x80) {
+ CHECK_LENGTH(1);
+ ADD_BYTE(src[spos]);
+ }
+ else if (src[spos] < 0x800) {
+ CHECK_LENGTH(2);
+ ADD_BYTE(0xc0 | (src[spos]>>6));
+ ADD_BYTE(0x80 | (src[spos] & 0x3f));
+ }
+ else if ((src[spos] & 0xdc00) == 0xd800) {
+ uint32_t c;
+ /* first surrogate */
+			if (spos == src_len - 1 || (src[spos+1] & 0xdc00) != 0xdc00) {
+ /* no second surrogate present */
+ error++;
+ continue;
+ }
+			CHECK_LENGTH(4);
+			c = (((src[spos]&0x3ff) << 10) | (src[spos+1]&0x3ff)) + 0x10000;
+			spos++;
+ ADD_BYTE(0xf0 | (c>>18));
+ ADD_BYTE(0x80 | ((c>>12) & 0x3f));
+ ADD_BYTE(0x80 | ((c>>6) & 0x3f));
+ ADD_BYTE(0x80 | (c & 0x3f));
+ }
+ else if ((src[spos] & 0xdc00) == 0xdc00) {
+ /* second surrogate without preceding first surrogate */
+ error++;
+ }
+ else {
+ CHECK_LENGTH(3);
+ ADD_BYTE(0xe0 | src[spos]>>12);
+ ADD_BYTE(0x80 | ((src[spos]>>6) & 0x3f));
+ ADD_BYTE(0x80 | (src[spos] & 0x3f));
+ }
+ }
+
+ if (errp)
+ *errp = error;
+
+ return dpos;
+
+#undef ADD_BYTE
+#undef CHECK_LENGTH
+}
diff --git a/sys/dev/hyperv/utilities/vmbus_heartbeat.c b/sys/dev/hyperv/utilities/vmbus_heartbeat.c
new file mode 100644
index 000000000000..f15b94822aa9
--- /dev/null
+++ b/sys/dev/hyperv/utilities/vmbus_heartbeat.c
@@ -0,0 +1,152 @@
+/*-
+ * Copyright (c) 2014,2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/systm.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/utilities/vmbus_icreg.h>
+#include <dev/hyperv/utilities/vmbus_icvar.h>
+
+#define VMBUS_HEARTBEAT_FWVER_MAJOR 3
+#define VMBUS_HEARTBEAT_FWVER \
+ VMBUS_IC_VERSION(VMBUS_HEARTBEAT_FWVER_MAJOR, 0)
+
+#define VMBUS_HEARTBEAT_MSGVER_MAJOR 3
+#define VMBUS_HEARTBEAT_MSGVER \
+ VMBUS_IC_VERSION(VMBUS_HEARTBEAT_MSGVER_MAJOR, 0)
+
+static int vmbus_heartbeat_probe(device_t);
+static int vmbus_heartbeat_attach(device_t);
+
+static const struct vmbus_ic_desc vmbus_heartbeat_descs[] = {
+ {
+ .ic_guid = { .hv_guid = {
+ 0x39, 0x4f, 0x16, 0x57, 0x15, 0x91, 0x78, 0x4e,
+ 0xab, 0x55, 0x38, 0x2f, 0x3b, 0xd5, 0x42, 0x2d} },
+ .ic_desc = "Hyper-V Heartbeat"
+ },
+ VMBUS_IC_DESC_END
+};
+
+static device_method_t vmbus_heartbeat_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, vmbus_heartbeat_probe),
+ DEVMETHOD(device_attach, vmbus_heartbeat_attach),
+ DEVMETHOD(device_detach, vmbus_ic_detach),
+ DEVMETHOD_END
+};
+
+static driver_t vmbus_heartbeat_driver = {
+ "hvheartbeat",
+ vmbus_heartbeat_methods,
+ sizeof(struct vmbus_ic_softc)
+};
+
+static devclass_t vmbus_heartbeat_devclass;
+
+DRIVER_MODULE(hv_heartbeat, vmbus, vmbus_heartbeat_driver,
+ vmbus_heartbeat_devclass, NULL, NULL);
+MODULE_VERSION(hv_heartbeat, 1);
+MODULE_DEPEND(hv_heartbeat, vmbus, 1, 1, 1);
+
+static void
+vmbus_heartbeat_cb(struct vmbus_channel *chan, void *xsc)
+{
+ struct vmbus_ic_softc *sc = xsc;
+ struct vmbus_icmsg_hdr *hdr;
+ int dlen, error;
+ uint64_t xactid;
+ void *data;
+
+ /*
+ * Receive request.
+ */
+ data = sc->ic_buf;
+ dlen = sc->ic_buflen;
+ error = vmbus_chan_recv(chan, data, &dlen, &xactid);
+ KASSERT(error != ENOBUFS, ("icbuf is not large enough"));
+ if (error)
+ return;
+
+ if (dlen < sizeof(*hdr)) {
+ device_printf(sc->ic_dev, "invalid data len %d\n", dlen);
+ return;
+ }
+ hdr = data;
+
+ /*
+ * Update request, which will be echoed back as response.
+ */
+ switch (hdr->ic_type) {
+ case VMBUS_ICMSG_TYPE_NEGOTIATE:
+ error = vmbus_ic_negomsg(sc, data, &dlen,
+ VMBUS_HEARTBEAT_FWVER, VMBUS_HEARTBEAT_MSGVER);
+ if (error)
+ return;
+ break;
+
+ case VMBUS_ICMSG_TYPE_HEARTBEAT:
+		/* Only ic_seq is required. */
+ if (dlen < VMBUS_ICMSG_HEARTBEAT_SIZE_MIN) {
+ device_printf(sc->ic_dev, "invalid heartbeat len %d\n",
+ dlen);
+ return;
+ }
+ ((struct vmbus_icmsg_heartbeat *)data)->ic_seq++;
+ break;
+
+ default:
+ device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type);
+ break;
+ }
+
+ /*
+ * Send response by echoing the request back.
+ */
+ vmbus_ic_sendresp(sc, chan, data, dlen, xactid);
+}
+
+static int
+vmbus_heartbeat_probe(device_t dev)
+{
+
+ return (vmbus_ic_probe(dev, vmbus_heartbeat_descs));
+}
+
+static int
+vmbus_heartbeat_attach(device_t dev)
+{
+
+ return (vmbus_ic_attach(dev, vmbus_heartbeat_cb));
+}
diff --git a/sys/dev/hyperv/utilities/vmbus_ic.c b/sys/dev/hyperv/utilities/vmbus_ic.c
new file mode 100644
index 000000000000..574670053918
--- /dev/null
+++ b/sys/dev/hyperv/utilities/vmbus_ic.c
@@ -0,0 +1,299 @@
+/*-
+ * Copyright (c) 2014,2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/utilities/vmbus_icreg.h>
+#include <dev/hyperv/utilities/vmbus_icvar.h>
+
+#include "vmbus_if.h"
+
+#define VMBUS_IC_BRSIZE (4 * PAGE_SIZE)
+
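+/*
+ * The negotiate response carries exactly one framework version and one
+ * message version, hence the two ic_ver[] entries.
+ */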
+#define VMBUS_IC_VERCNT 2
+#define VMBUS_IC_NEGOSZ \
+ __offsetof(struct vmbus_icmsg_negotiate, ic_ver[VMBUS_IC_VERCNT])
+CTASSERT(VMBUS_IC_NEGOSZ < VMBUS_IC_BRSIZE);
+
+static int vmbus_ic_fwver_sysctl(SYSCTL_HANDLER_ARGS);
+static int vmbus_ic_msgver_sysctl(SYSCTL_HANDLER_ARGS);
+
+int
+vmbus_ic_negomsg(struct vmbus_ic_softc *sc, void *data, int *dlen0,
+ uint32_t fw_ver, uint32_t msg_ver)
+{
+ struct vmbus_icmsg_negotiate *nego;
+ int i, cnt, dlen = *dlen0, error;
+ uint32_t sel_fw_ver, sel_msg_ver;
+ bool has_fw_ver, has_msg_ver;
+
+ /*
+ * Preliminary message verification.
+ */
+ if (dlen < sizeof(*nego)) {
+ device_printf(sc->ic_dev, "truncated ic negotiate, len %d\n",
+ dlen);
+ return (EINVAL);
+ }
+ nego = data;
+
+ if (nego->ic_fwver_cnt == 0) {
+ device_printf(sc->ic_dev, "ic negotiate does not contain "
+ "framework version %u\n", nego->ic_fwver_cnt);
+ return (EINVAL);
+ }
+ if (nego->ic_msgver_cnt == 0) {
+ device_printf(sc->ic_dev, "ic negotiate does not contain "
+ "message version %u\n", nego->ic_msgver_cnt);
+ return (EINVAL);
+ }
+
+ cnt = nego->ic_fwver_cnt + nego->ic_msgver_cnt;
+ if (dlen < __offsetof(struct vmbus_icmsg_negotiate, ic_ver[cnt])) {
+ device_printf(sc->ic_dev, "ic negotiate does not contain "
+ "versions %d\n", dlen);
+ return (EINVAL);
+ }
+
+ error = EOPNOTSUPP;
+
+ /*
+	 * Find the best matching framework version.
+ */
+ has_fw_ver = false;
+ for (i = 0; i < nego->ic_fwver_cnt; ++i) {
+ if (VMBUS_ICVER_LE(nego->ic_ver[i], fw_ver)) {
+ if (!has_fw_ver) {
+ sel_fw_ver = nego->ic_ver[i];
+ has_fw_ver = true;
+ } else if (VMBUS_ICVER_GT(nego->ic_ver[i],
+ sel_fw_ver)) {
+ sel_fw_ver = nego->ic_ver[i];
+ }
+ }
+ }
+ if (!has_fw_ver) {
+ device_printf(sc->ic_dev, "failed to select framework "
+ "version\n");
+ goto done;
+ }
+
+ /*
+	 * Find the best matching message version.
+ */
+ has_msg_ver = false;
+ for (i = nego->ic_fwver_cnt;
+ i < nego->ic_fwver_cnt + nego->ic_msgver_cnt; ++i) {
+ if (VMBUS_ICVER_LE(nego->ic_ver[i], msg_ver)) {
+ if (!has_msg_ver) {
+ sel_msg_ver = nego->ic_ver[i];
+ has_msg_ver = true;
+ } else if (VMBUS_ICVER_GT(nego->ic_ver[i],
+ sel_msg_ver)) {
+ sel_msg_ver = nego->ic_ver[i];
+ }
+ }
+ }
+ if (!has_msg_ver) {
+ device_printf(sc->ic_dev, "failed to select message "
+ "version\n");
+ goto done;
+ }
+
+ error = 0;
+done:
+ if (bootverbose || !has_fw_ver || !has_msg_ver) {
+ if (has_fw_ver) {
+ device_printf(sc->ic_dev, "sel framework version: "
+ "%u.%u\n",
+ VMBUS_ICVER_MAJOR(sel_fw_ver),
+ VMBUS_ICVER_MINOR(sel_fw_ver));
+ }
+ for (i = 0; i < nego->ic_fwver_cnt; i++) {
+ device_printf(sc->ic_dev, "supp framework version: "
+ "%u.%u\n",
+ VMBUS_ICVER_MAJOR(nego->ic_ver[i]),
+ VMBUS_ICVER_MINOR(nego->ic_ver[i]));
+ }
+
+ if (has_msg_ver) {
+ device_printf(sc->ic_dev, "sel message version: "
+ "%u.%u\n",
+ VMBUS_ICVER_MAJOR(sel_msg_ver),
+ VMBUS_ICVER_MINOR(sel_msg_ver));
+ }
+ for (i = nego->ic_fwver_cnt;
+ i < nego->ic_fwver_cnt + nego->ic_msgver_cnt; i++) {
+ device_printf(sc->ic_dev, "supp message version: "
+ "%u.%u\n",
+ VMBUS_ICVER_MAJOR(nego->ic_ver[i]),
+ VMBUS_ICVER_MINOR(nego->ic_ver[i]));
+ }
+ }
+ if (error)
+ return (error);
+
+ /* Record the selected versions. */
+ sc->ic_fwver = sel_fw_ver;
+ sc->ic_msgver = sel_msg_ver;
+
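+	/*
+	 * Rewrite the request in place; the caller echoes this buffer back
+	 * to the host as the response, carrying only the selected versions.
+	 */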
+ /* One framework version. */
+ nego->ic_fwver_cnt = 1;
+ nego->ic_ver[0] = sel_fw_ver;
+
+ /* One message version. */
+ nego->ic_msgver_cnt = 1;
+ nego->ic_ver[1] = sel_msg_ver;
+
+ /* Update data size. */
+ nego->ic_hdr.ic_dsize = VMBUS_IC_NEGOSZ -
+ sizeof(struct vmbus_icmsg_hdr);
+
+ /* Update total size, if necessary. */
+ if (dlen < VMBUS_IC_NEGOSZ)
+ *dlen0 = VMBUS_IC_NEGOSZ;
+
+ return (0);
+}
+
+int
+vmbus_ic_probe(device_t dev, const struct vmbus_ic_desc descs[])
+{
+ device_t bus = device_get_parent(dev);
+ const struct vmbus_ic_desc *d;
+
+ if (resource_disabled(device_get_name(dev), 0))
+ return (ENXIO);
+
+ for (d = descs; d->ic_desc != NULL; ++d) {
+ if (VMBUS_PROBE_GUID(bus, dev, &d->ic_guid) == 0) {
+ device_set_desc(dev, d->ic_desc);
+ return (BUS_PROBE_DEFAULT);
+ }
+ }
+ return (ENXIO);
+}
+
+int
+vmbus_ic_attach(device_t dev, vmbus_chan_callback_t cb)
+{
+ struct vmbus_ic_softc *sc = device_get_softc(dev);
+ struct vmbus_channel *chan = vmbus_get_channel(dev);
+ struct sysctl_oid_list *child;
+ struct sysctl_ctx_list *ctx;
+ int error;
+
+ sc->ic_dev = dev;
+ sc->ic_buflen = VMBUS_IC_BRSIZE;
+ sc->ic_buf = malloc(VMBUS_IC_BRSIZE, M_DEVBUF, M_WAITOK | M_ZERO);
+
+ /*
+ * These services are not performance critical and do not need
+ * batched reading. Furthermore, some services such as KVP can
+ * only handle one message from the host at a time.
+ * Turn off batched reading for all util drivers before we open the
+ * channel.
+ */
+ vmbus_chan_set_readbatch(chan, false);
+
+ error = vmbus_chan_open(chan, VMBUS_IC_BRSIZE, VMBUS_IC_BRSIZE, NULL, 0,
+ cb, sc);
+ if (error) {
+ free(sc->ic_buf, M_DEVBUF);
+ return (error);
+ }
+
+ ctx = device_get_sysctl_ctx(dev);
+ child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "fw_version",
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ vmbus_ic_fwver_sysctl, "A", "framework version");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "msg_version",
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ vmbus_ic_msgver_sysctl, "A", "message version");
+
+ return (0);
+}
+
+static int
+vmbus_ic_fwver_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct vmbus_ic_softc *sc = arg1;
+ char verstr[16];
+
+ snprintf(verstr, sizeof(verstr), "%u.%u",
+ VMBUS_ICVER_MAJOR(sc->ic_fwver), VMBUS_ICVER_MINOR(sc->ic_fwver));
+ return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
+}
+
+static int
+vmbus_ic_msgver_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct vmbus_ic_softc *sc = arg1;
+ char verstr[16];
+
+ snprintf(verstr, sizeof(verstr), "%u.%u",
+ VMBUS_ICVER_MAJOR(sc->ic_msgver), VMBUS_ICVER_MINOR(sc->ic_msgver));
+ return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
+}
+
+int
+vmbus_ic_detach(device_t dev)
+{
+ struct vmbus_ic_softc *sc = device_get_softc(dev);
+
+ vmbus_chan_close(vmbus_get_channel(dev));
+ free(sc->ic_buf, M_DEVBUF);
+
+ return (0);
+}
+
+int
+vmbus_ic_sendresp(struct vmbus_ic_softc *sc, struct vmbus_channel *chan,
+ void *data, int dlen, uint64_t xactid)
+{
+ struct vmbus_icmsg_hdr *hdr;
+ int error;
+
+ KASSERT(dlen >= sizeof(*hdr), ("invalid data length %d", dlen));
+ hdr = data;
+
+ hdr->ic_flags = VMBUS_ICMSG_FLAG_XACT | VMBUS_ICMSG_FLAG_RESP;
+ error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
+ data, dlen, xactid);
+ if (error)
+ device_printf(sc->ic_dev, "resp send failed: %d\n", error);
+ return (error);
+}
diff --git a/sys/dev/hyperv/utilities/vmbus_icreg.h b/sys/dev/hyperv/utilities/vmbus_icreg.h
new file mode 100644
index 000000000000..e962102d13dd
--- /dev/null
+++ b/sys/dev/hyperv/utilities/vmbus_icreg.h
@@ -0,0 +1,135 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_ICREG_H_
+#define _VMBUS_ICREG_H_
+
+#define VMBUS_ICMSG_TYPE_NEGOTIATE 0
+#define VMBUS_ICMSG_TYPE_HEARTBEAT 1
+#define VMBUS_ICMSG_TYPE_KVP 2
+#define VMBUS_ICMSG_TYPE_SHUTDOWN 3
+#define VMBUS_ICMSG_TYPE_TIMESYNC 4
+#define VMBUS_ICMSG_TYPE_VSS 5
+
+#define VMBUS_ICMSG_STATUS_OK 0x00000000
+#define VMBUS_ICMSG_STATUS_FAIL 0x80004005
+
+#define VMBUS_IC_VERSION(major, minor) ((major) | (((uint32_t)(minor)) << 16))
+#define VMBUS_ICVER_MAJOR(ver) ((ver) & 0xffff)
+#define VMBUS_ICVER_MINOR(ver) (((ver) & 0xffff0000) >> 16)
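+/*
+ * VMBUS_IC_VERSION() keeps the major number in the low 16 bits, so the
+ * halves are swapped before comparison to make newer versions compare
+ * numerically greater.
+ */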
+#define VMBUS_ICVER_SWAP(ver) \
+ ((VMBUS_ICVER_MAJOR((ver)) << 16) | VMBUS_ICVER_MINOR((ver)))
+#define VMBUS_ICVER_LE(v1, v2) \
+ (VMBUS_ICVER_SWAP((v1)) <= VMBUS_ICVER_SWAP((v2)))
+#define VMBUS_ICVER_GT(v1, v2) \
+ (VMBUS_ICVER_SWAP((v1)) > VMBUS_ICVER_SWAP((v2)))
+
+struct vmbus_pipe_hdr {
+ uint32_t ph_flags;
+ uint32_t ph_msgsz;
+} __packed;
+
+struct vmbus_icmsg_hdr {
+ struct vmbus_pipe_hdr ic_pipe;
+ uint32_t ic_fwver; /* framework version */
+ uint16_t ic_type;
+ uint32_t ic_msgver; /* message version */
+ uint16_t ic_dsize; /* data size */
+ uint32_t ic_status; /* VMBUS_ICMSG_STATUS_ */
+ uint8_t ic_xactid;
+ uint8_t ic_flags; /* VMBUS_ICMSG_FLAG_ */
+ uint8_t ic_rsvd[2];
+} __packed;
+
+#define VMBUS_ICMSG_FLAG_XACT 0x0001
+#define VMBUS_ICMSG_FLAG_REQ 0x0002
+#define VMBUS_ICMSG_FLAG_RESP 0x0004
+
+/* VMBUS_ICMSG_TYPE_NEGOTIATE */
+struct vmbus_icmsg_negotiate {
+ struct vmbus_icmsg_hdr ic_hdr;
+ uint16_t ic_fwver_cnt;
+ uint16_t ic_msgver_cnt;
+ uint32_t ic_rsvd;
+ /*
+ * This version array contains two set of supported
+ * versions:
+ * - The first set consists of #ic_fwver_cnt supported framework
+ * versions.
+ * - The second set consists of #ic_msgver_cnt supported message
+ * versions.
+ */
+ uint32_t ic_ver[];
+} __packed;
+
+/* VMBUS_ICMSG_TYPE_HEARTBEAT */
+struct vmbus_icmsg_heartbeat {
+ struct vmbus_icmsg_hdr ic_hdr;
+ uint64_t ic_seq;
+ uint32_t ic_rsvd[8];
+} __packed;
+
+#define VMBUS_ICMSG_HEARTBEAT_SIZE_MIN \
+ __offsetof(struct vmbus_icmsg_heartbeat, ic_rsvd[0])
+
+/* VMBUS_ICMSG_TYPE_SHUTDOWN */
+struct vmbus_icmsg_shutdown {
+ struct vmbus_icmsg_hdr ic_hdr;
+ uint32_t ic_code;
+ uint32_t ic_timeo;
+ uint32_t ic_haltflags;
+ uint8_t ic_msg[2048];
+} __packed;
+
+#define VMBUS_ICMSG_SHUTDOWN_SIZE_MIN \
+ __offsetof(struct vmbus_icmsg_shutdown, ic_msg[0])
+
+/* VMBUS_ICMSG_TYPE_TIMESYNC */
+struct vmbus_icmsg_timesync {
+ struct vmbus_icmsg_hdr ic_hdr;
+ uint64_t ic_hvtime;
+ uint64_t ic_vmtime;
+ uint64_t ic_rtt;
+ uint8_t ic_tsflags; /* VMBUS_ICMSG_TS_FLAG_ */
+} __packed;
+
+/* VMBUS_ICMSG_TYPE_TIMESYNC, MSGVER4 */
+struct vmbus_icmsg_timesync4 {
+ struct vmbus_icmsg_hdr ic_hdr;
+ uint64_t ic_hvtime;
+ uint64_t ic_sent_tc;
+ uint8_t ic_tsflags; /* VMBUS_ICMSG_TS_FLAG_ */
+ uint8_t ic_rsvd[5];
+} __packed;
+
+#define VMBUS_ICMSG_TS_FLAG_SYNC 0x01
+#define VMBUS_ICMSG_TS_FLAG_SAMPLE 0x02
+
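+/*
+ * Hyper-V time is in 100ns units since January 1, 1601; this constant is
+ * that epoch's offset from the Unix epoch in the same units.
+ */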
+#define VMBUS_ICMSG_TS_BASE 116444736000000000ULL
+
+#endif /* !_VMBUS_ICREG_H_ */
diff --git a/sys/dev/hyperv/utilities/vmbus_icvar.h b/sys/dev/hyperv/utilities/vmbus_icvar.h
new file mode 100644
index 000000000000..a60ecfed58a2
--- /dev/null
+++ b/sys/dev/hyperv/utilities/vmbus_icvar.h
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_ICVAR_H_
+#define _VMBUS_ICVAR_H_
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+
+struct vmbus_ic_softc {
+ device_t ic_dev;
+ uint8_t *ic_buf;
+ int ic_buflen;
+ uint32_t ic_fwver; /* framework version */
+ uint32_t ic_msgver; /* message version */
+};
+
+struct vmbus_ic_desc {
+ const struct hyperv_guid ic_guid;
+ const char *ic_desc;
+};
+
+#define VMBUS_IC_DESC_END { .ic_desc = NULL }
+
+int vmbus_ic_attach(device_t dev, vmbus_chan_callback_t cb);
+int vmbus_ic_detach(device_t dev);
+int vmbus_ic_probe(device_t dev, const struct vmbus_ic_desc descs[]);
+int vmbus_ic_negomsg(struct vmbus_ic_softc *sc, void *data,
+ int *dlen, uint32_t fw_ver, uint32_t msg_ver);
+int vmbus_ic_sendresp(struct vmbus_ic_softc *sc,
+ struct vmbus_channel *chan, void *data, int dlen,
+ uint64_t xactid);
+
+#endif /* !_VMBUS_ICVAR_H_ */
diff --git a/sys/dev/hyperv/utilities/vmbus_shutdown.c b/sys/dev/hyperv/utilities/vmbus_shutdown.c
new file mode 100644
index 000000000000..7e54dc9866bb
--- /dev/null
+++ b/sys/dev/hyperv/utilities/vmbus_shutdown.c
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (c) 2014,2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/reboot.h>
+#include <sys/systm.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/utilities/vmbus_icreg.h>
+#include <dev/hyperv/utilities/vmbus_icvar.h>
+
+#define VMBUS_SHUTDOWN_FWVER_MAJOR 3
+#define VMBUS_SHUTDOWN_FWVER \
+ VMBUS_IC_VERSION(VMBUS_SHUTDOWN_FWVER_MAJOR, 0)
+
+#define VMBUS_SHUTDOWN_MSGVER_MAJOR 3
+#define VMBUS_SHUTDOWN_MSGVER \
+ VMBUS_IC_VERSION(VMBUS_SHUTDOWN_MSGVER_MAJOR, 0)
+
+static int vmbus_shutdown_probe(device_t);
+static int vmbus_shutdown_attach(device_t);
+
+static const struct vmbus_ic_desc vmbus_shutdown_descs[] = {
+ {
+ .ic_guid = { .hv_guid = {
+ 0x31, 0x60, 0x0b, 0x0e, 0x13, 0x52, 0x34, 0x49,
+ 0x81, 0x8b, 0x38, 0xd9, 0x0c, 0xed, 0x39, 0xdb } },
+ .ic_desc = "Hyper-V Shutdown"
+ },
+ VMBUS_IC_DESC_END
+};
+
+static device_method_t vmbus_shutdown_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, vmbus_shutdown_probe),
+ DEVMETHOD(device_attach, vmbus_shutdown_attach),
+ DEVMETHOD(device_detach, vmbus_ic_detach),
+ DEVMETHOD_END
+};
+
+static driver_t vmbus_shutdown_driver = {
+ "hvshutdown",
+ vmbus_shutdown_methods,
+ sizeof(struct vmbus_ic_softc)
+};
+
+static devclass_t vmbus_shutdown_devclass;
+
+DRIVER_MODULE(hv_shutdown, vmbus, vmbus_shutdown_driver,
+ vmbus_shutdown_devclass, NULL, NULL);
+MODULE_VERSION(hv_shutdown, 1);
+MODULE_DEPEND(hv_shutdown, vmbus, 1, 1, 1);
+
+static void
+vmbus_shutdown_cb(struct vmbus_channel *chan, void *xsc)
+{
+ struct vmbus_ic_softc *sc = xsc;
+ struct vmbus_icmsg_hdr *hdr;
+ struct vmbus_icmsg_shutdown *msg;
+ int dlen, error, do_shutdown = 0;
+ uint64_t xactid;
+ void *data;
+
+ /*
+ * Receive request.
+ */
+ data = sc->ic_buf;
+ dlen = sc->ic_buflen;
+ error = vmbus_chan_recv(chan, data, &dlen, &xactid);
+ KASSERT(error != ENOBUFS, ("icbuf is not large enough"));
+ if (error)
+ return;
+
+ if (dlen < sizeof(*hdr)) {
+ device_printf(sc->ic_dev, "invalid data len %d\n", dlen);
+ return;
+ }
+ hdr = data;
+
+ /*
+ * Update request, which will be echoed back as response.
+ */
+ switch (hdr->ic_type) {
+ case VMBUS_ICMSG_TYPE_NEGOTIATE:
+ error = vmbus_ic_negomsg(sc, data, &dlen,
+ VMBUS_SHUTDOWN_FWVER, VMBUS_SHUTDOWN_MSGVER);
+ if (error)
+ return;
+ break;
+
+ case VMBUS_ICMSG_TYPE_SHUTDOWN:
+ if (dlen < VMBUS_ICMSG_SHUTDOWN_SIZE_MIN) {
+ device_printf(sc->ic_dev, "invalid shutdown len %d\n",
+ dlen);
+ return;
+ }
+ msg = data;
+
+ /* XXX ic_flags definition? */
+ if (msg->ic_haltflags == 0 || msg->ic_haltflags == 1) {
+ device_printf(sc->ic_dev, "shutdown requested\n");
+ hdr->ic_status = VMBUS_ICMSG_STATUS_OK;
+ do_shutdown = 1;
+ } else {
+ device_printf(sc->ic_dev, "unknown shutdown flags "
+ "0x%08x\n", msg->ic_haltflags);
+ hdr->ic_status = VMBUS_ICMSG_STATUS_FAIL;
+ }
+ break;
+
+ default:
+ device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type);
+ break;
+ }
+
+ /*
+ * Send response by echoing the request back.
+ */
+ vmbus_ic_sendresp(sc, chan, data, dlen, xactid);
+
+ if (do_shutdown)
+ shutdown_nice(RB_POWEROFF);
+}
+
+static int
+vmbus_shutdown_probe(device_t dev)
+{
+
+ return (vmbus_ic_probe(dev, vmbus_shutdown_descs));
+}
+
+static int
+vmbus_shutdown_attach(device_t dev)
+{
+
+ return (vmbus_ic_attach(dev, vmbus_shutdown_cb));
+}
diff --git a/sys/dev/hyperv/utilities/vmbus_timesync.c b/sys/dev/hyperv/utilities/vmbus_timesync.c
new file mode 100644
index 000000000000..2a8d3a988b43
--- /dev/null
+++ b/sys/dev/hyperv/utilities/vmbus_timesync.c
@@ -0,0 +1,260 @@
+/*-
+ * Copyright (c) 2014,2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/utilities/vmbus_icreg.h>
+#include <dev/hyperv/utilities/vmbus_icvar.h>
+
+#define VMBUS_TIMESYNC_FWVER_MAJOR 3
+#define VMBUS_TIMESYNC_FWVER \
+ VMBUS_IC_VERSION(VMBUS_TIMESYNC_FWVER_MAJOR, 0)
+
+#define VMBUS_TIMESYNC_MSGVER_MAJOR 4
+#define VMBUS_TIMESYNC_MSGVER \
+ VMBUS_IC_VERSION(VMBUS_TIMESYNC_MSGVER_MAJOR, 0)
+
+#define VMBUS_TIMESYNC_MSGVER4(sc) \
+ VMBUS_ICVER_LE(VMBUS_IC_VERSION(4, 0), (sc)->ic_msgver)
+
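+/*
+ * RTT compensation needs the v4 timesync message (which carries the
+ * sender's timestamp) and a 64-bit Hyper-V timecounter (hyperv_tc64).
+ */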
+#define VMBUS_TIMESYNC_DORTT(sc) \
+ (VMBUS_TIMESYNC_MSGVER4((sc)) && hyperv_tc64 != NULL)
+
+static int vmbus_timesync_probe(device_t);
+static int vmbus_timesync_attach(device_t);
+
+static const struct vmbus_ic_desc vmbus_timesync_descs[] = {
+ {
+ .ic_guid = { .hv_guid = {
+ 0x30, 0xe6, 0x27, 0x95, 0xae, 0xd0, 0x7b, 0x49,
+ 0xad, 0xce, 0xe8, 0x0a, 0xb0, 0x17, 0x5c, 0xaf } },
+ .ic_desc = "Hyper-V Timesync"
+ },
+ VMBUS_IC_DESC_END
+};
+
+static device_method_t vmbus_timesync_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, vmbus_timesync_probe),
+ DEVMETHOD(device_attach, vmbus_timesync_attach),
+ DEVMETHOD(device_detach, vmbus_ic_detach),
+ DEVMETHOD_END
+};
+
+static driver_t vmbus_timesync_driver = {
+ "hvtimesync",
+ vmbus_timesync_methods,
+ sizeof(struct vmbus_ic_softc)
+};
+
+static devclass_t vmbus_timesync_devclass;
+
+DRIVER_MODULE(hv_timesync, vmbus, vmbus_timesync_driver,
+ vmbus_timesync_devclass, NULL, NULL);
+MODULE_VERSION(hv_timesync, 1);
+MODULE_DEPEND(hv_timesync, vmbus, 1, 1, 1);
+
+SYSCTL_NODE(_hw, OID_AUTO, hvtimesync, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
+ "Hyper-V timesync interface");
+
+static int vmbus_ts_ignore_sync = 0;
+SYSCTL_INT(_hw_hvtimesync, OID_AUTO, ignore_sync, CTLFLAG_RWTUN,
+ &vmbus_ts_ignore_sync, 0, "Ignore the sync request.");
+
+/*
+ * Trigger sample sync when drift exceeds threshold (ms).
+ * Ignore the sample request when set to 0.
+ */
+static int vmbus_ts_sample_thresh = 100;
+SYSCTL_INT(_hw_hvtimesync, OID_AUTO, sample_thresh, CTLFLAG_RWTUN,
+ &vmbus_ts_sample_thresh, 0,
+ "Threshold that makes sample request trigger the sync (unit: ms).");
+
+static int vmbus_ts_sample_verbose = 0;
+SYSCTL_INT(_hw_hvtimesync, OID_AUTO, sample_verbose, CTLFLAG_RWTUN,
+ &vmbus_ts_sample_verbose, 0, "Increase sample request verbosity.");
+
+static void
+vmbus_timesync(struct vmbus_ic_softc *sc, uint64_t hvtime, uint64_t sent_tc,
+ uint8_t tsflags)
+{
+ struct timespec vm_ts;
+ uint64_t hv_ns, vm_ns, rtt = 0;
+
+ if (VMBUS_TIMESYNC_DORTT(sc))
+ rtt = hyperv_tc64() - sent_tc;
+
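+	/*
+	 * Convert the host time from 100ns units since 1601 to nanoseconds
+	 * since the Unix epoch, adding the measured RTT when available.
+	 */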
+ hv_ns = (hvtime - VMBUS_ICMSG_TS_BASE + rtt) * HYPERV_TIMER_NS_FACTOR;
+ nanotime(&vm_ts);
+ vm_ns = (vm_ts.tv_sec * NANOSEC) + vm_ts.tv_nsec;
+
+ if ((tsflags & VMBUS_ICMSG_TS_FLAG_SYNC) && !vmbus_ts_ignore_sync) {
+ struct timespec hv_ts;
+
+ if (bootverbose) {
+ device_printf(sc->ic_dev, "apply sync request, "
+ "hv: %ju, vm: %ju\n",
+ (uintmax_t)hv_ns, (uintmax_t)vm_ns);
+ }
+ hv_ts.tv_sec = hv_ns / NANOSEC;
+ hv_ts.tv_nsec = hv_ns % NANOSEC;
+ kern_clock_settime(curthread, CLOCK_REALTIME, &hv_ts);
+ /* Done! */
+ return;
+ }
+
+ if ((tsflags & VMBUS_ICMSG_TS_FLAG_SAMPLE) &&
+ vmbus_ts_sample_thresh >= 0) {
+ int64_t diff;
+
+ if (vmbus_ts_sample_verbose) {
+ device_printf(sc->ic_dev, "sample request, "
+ "hv: %ju, vm: %ju\n",
+ (uintmax_t)hv_ns, (uintmax_t)vm_ns);
+ }
+
+ if (hv_ns > vm_ns)
+ diff = hv_ns - vm_ns;
+ else
+ diff = vm_ns - hv_ns;
+ /* nanosec -> millisec */
+ diff /= 1000000;
+
+ if (diff > vmbus_ts_sample_thresh) {
+ struct timespec hv_ts;
+
+ if (bootverbose) {
+ device_printf(sc->ic_dev,
+ "apply sample request, hv: %ju, vm: %ju\n",
+ (uintmax_t)hv_ns, (uintmax_t)vm_ns);
+ }
+ hv_ts.tv_sec = hv_ns / NANOSEC;
+ hv_ts.tv_nsec = hv_ns % NANOSEC;
+ kern_clock_settime(curthread, CLOCK_REALTIME, &hv_ts);
+ }
+ /* Done */
+ return;
+ }
+}
+
+static void
+vmbus_timesync_cb(struct vmbus_channel *chan, void *xsc)
+{
+ struct vmbus_ic_softc *sc = xsc;
+ struct vmbus_icmsg_hdr *hdr;
+ int dlen, error;
+ uint64_t xactid;
+ void *data;
+
+ /*
+ * Receive request.
+ */
+ data = sc->ic_buf;
+ dlen = sc->ic_buflen;
+ error = vmbus_chan_recv(chan, data, &dlen, &xactid);
+ KASSERT(error != ENOBUFS, ("icbuf is not large enough"));
+ if (error)
+ return;
+
+ if (dlen < sizeof(*hdr)) {
+ device_printf(sc->ic_dev, "invalid data len %d\n", dlen);
+ return;
+ }
+ hdr = data;
+
+ /*
+ * Update request, which will be echoed back as response.
+ */
+ switch (hdr->ic_type) {
+ case VMBUS_ICMSG_TYPE_NEGOTIATE:
+ error = vmbus_ic_negomsg(sc, data, &dlen,
+ VMBUS_TIMESYNC_FWVER, VMBUS_TIMESYNC_MSGVER);
+ if (error)
+ return;
+ if (VMBUS_TIMESYNC_DORTT(sc))
+ device_printf(sc->ic_dev, "RTT\n");
+ break;
+
+ case VMBUS_ICMSG_TYPE_TIMESYNC:
+ if (VMBUS_TIMESYNC_MSGVER4(sc)) {
+ const struct vmbus_icmsg_timesync4 *msg4;
+
+ if (dlen < sizeof(*msg4)) {
+ device_printf(sc->ic_dev, "invalid timesync4 "
+ "len %d\n", dlen);
+ return;
+ }
+ msg4 = data;
+ vmbus_timesync(sc, msg4->ic_hvtime, msg4->ic_sent_tc,
+ msg4->ic_tsflags);
+ } else {
+ const struct vmbus_icmsg_timesync *msg;
+
+ if (dlen < sizeof(*msg)) {
+ device_printf(sc->ic_dev, "invalid timesync "
+ "len %d\n", dlen);
+ return;
+ }
+ msg = data;
+ vmbus_timesync(sc, msg->ic_hvtime, 0, msg->ic_tsflags);
+ }
+ break;
+
+ default:
+ device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type);
+ break;
+ }
+
+ /*
+ * Send response by echoing the request back.
+ */
+ vmbus_ic_sendresp(sc, chan, data, dlen, xactid);
+}
+
+static int
+vmbus_timesync_probe(device_t dev)
+{
+
+ return (vmbus_ic_probe(dev, vmbus_timesync_descs));
+}
+
+static int
+vmbus_timesync_attach(device_t dev)
+{
+
+ return (vmbus_ic_attach(dev, vmbus_timesync_cb));
+}
diff --git a/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c b/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c
new file mode 100644
index 000000000000..11d549dc18d2
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c
@@ -0,0 +1,236 @@
+/*-
+ * Copyright (c) 2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/timetc.h>
+#include <sys/vdso.h>
+
+#include <machine/cpufunc.h>
+#include <machine/cputypes.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+#include <vm/vm.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/vmbus/hyperv_machdep.h>
+#include <dev/hyperv/vmbus/hyperv_reg.h>
+#include <dev/hyperv/vmbus/hyperv_var.h>
+
+struct hyperv_reftsc_ctx {
+ struct hyperv_reftsc *tsc_ref;
+ struct hyperv_dma tsc_ref_dma;
+};
+
+static uint32_t hyperv_tsc_vdso_timehands(
+ struct vdso_timehands *,
+ struct timecounter *);
+
+static d_open_t hyperv_tsc_open;
+static d_mmap_t hyperv_tsc_mmap;
+
+static struct timecounter hyperv_tsc_timecounter = {
+ .tc_get_timecount = NULL, /* based on CPU vendor. */
+ .tc_counter_mask = 0xffffffff,
+ .tc_frequency = HYPERV_TIMER_FREQ,
+ .tc_name = "Hyper-V-TSC",
+ .tc_quality = 3000,
+ .tc_fill_vdso_timehands = hyperv_tsc_vdso_timehands,
+};
+
+static struct cdevsw hyperv_tsc_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = hyperv_tsc_open,
+ .d_mmap = hyperv_tsc_mmap,
+ .d_name = HYPERV_REFTSC_DEVNAME
+};
+
+static struct hyperv_reftsc_ctx hyperv_ref_tsc;
+
+uint64_t
+hypercall_md(volatile void *hc_addr, uint64_t in_val,
+ uint64_t in_paddr, uint64_t out_paddr)
+{
+ uint64_t status;
+
+ __asm__ __volatile__ ("mov %0, %%r8" : : "r" (out_paddr): "r8");
+ __asm__ __volatile__ ("call *%3" : "=a" (status) :
+ "c" (in_val), "d" (in_paddr), "m" (hc_addr));
+ return (status);
+}
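
For readers decoding the inline assembly: per the Hyper-V hypercall calling convention, the control value is passed in RCX, the input parameter's guest physical address in RDX, the output parameter's address in R8, and the status is returned in RAX. The annotated restatement below is comments only and changes no behavior.

    /*
     * Mapping of the constraints above (per the Hyper-V hypercall
     * calling convention):
     *   "c" (in_val)    -> RCX, hypercall input/control value
     *   "d" (in_paddr)  -> RDX, guest physical address of the input
     *   explicit mov    -> R8,  guest physical address of the output
     *   "=a" (status)   -> RAX, hypercall status on return
     * R8 is loaded with a separate mov because the simple constraint
     * letters cannot name that register directly.
     */
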
+
+static int
+hyperv_tsc_open(struct cdev *dev __unused, int oflags, int devtype __unused,
+ struct thread *td __unused)
+{
+
+ if (oflags & FWRITE)
+ return (EPERM);
+ return (0);
+}
+
+static int
+hyperv_tsc_mmap(struct cdev *dev __unused, vm_ooffset_t offset,
+ vm_paddr_t *paddr, int nprot __unused, vm_memattr_t *memattr __unused)
+{
+
+ KASSERT(hyperv_ref_tsc.tsc_ref != NULL, ("reftsc has not been setup"));
+
+ /*
+ * NOTE:
+	 * 'nprot' does not carry any information of interest to us;
+	 * write access is already blocked by d_open.
+ */
+
+ if (offset != 0)
+ return (EOPNOTSUPP);
+
+ *paddr = hyperv_ref_tsc.tsc_ref_dma.hv_paddr;
+ return (0);
+}
+
+static uint32_t
+hyperv_tsc_vdso_timehands(struct vdso_timehands *vdso_th,
+ struct timecounter *tc __unused)
+{
+
+ vdso_th->th_algo = VDSO_TH_ALGO_X86_HVTSC;
+ vdso_th->th_x86_shift = 0;
+ vdso_th->th_x86_hpet_idx = 0;
+ vdso_th->th_x86_pvc_last_systime = 0;
+ vdso_th->th_x86_pvc_stable_mask = 0;
+ bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
+ return (1);
+}
+
+#define HYPERV_TSC_TIMECOUNT(fence) \
+static uint64_t \
+hyperv_tc64_tsc_##fence(void) \
+{ \
+ struct hyperv_reftsc *tsc_ref = hyperv_ref_tsc.tsc_ref; \
+ uint32_t seq; \
+ \
+ while ((seq = atomic_load_acq_int(&tsc_ref->tsc_seq)) != 0) { \
+ uint64_t disc, ret, tsc; \
+ uint64_t scale = tsc_ref->tsc_scale; \
+ int64_t ofs = tsc_ref->tsc_ofs; \
+ \
+ fence(); \
+ tsc = rdtsc(); \
+ \
+ /* ret = ((tsc * scale) >> 64) + ofs */ \
+ __asm__ __volatile__ ("mulq %3" : \
+ "=d" (ret), "=a" (disc) : \
+ "a" (tsc), "r" (scale)); \
+ ret += ofs; \
+ \
+ atomic_thread_fence_acq(); \
+ if (tsc_ref->tsc_seq == seq) \
+ return (ret); \
+ \
+ /* Sequence changed; re-sync. */ \
+ } \
+ /* Fallback to the generic timecounter, i.e. rdmsr. */ \
+ return (rdmsr(MSR_HV_TIME_REF_COUNT)); \
+} \
+ \
+static u_int \
+hyperv_tsc_timecount_##fence(struct timecounter *tc __unused) \
+{ \
+ \
+ return (hyperv_tc64_tsc_##fence()); \
+} \
+struct __hack
+
+HYPERV_TSC_TIMECOUNT(lfence);
+HYPERV_TSC_TIMECOUNT(mfence);
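
The mulq inside HYPERV_TSC_TIMECOUNT() computes the upper 64 bits of the 64x64-bit product tsc * scale, to which the signed offset is added, all under a sequence-counter retry loop. A compiler-portable sketch of the same calculation, using the GCC/Clang unsigned __int128 extension instead of inline assembly, is shown below; the struct layout is illustrative and only mirrors the fields read above.

    #include <stdint.h>

    struct reftsc_view {                    /* illustrative layout only */
        volatile uint32_t tsc_seq;          /* 0 means "not ready, use the MSR" */
        uint64_t          tsc_scale;
        int64_t           tsc_ofs;
    };

    /* Sample one consistent (scale, ofs) pair and apply it to a raw TSC read. */
    static uint64_t
    reftsc_sample(const struct reftsc_view *r, uint64_t (*read_tsc)(void))
    {
        uint32_t seq;
        uint64_t ret;

        do {
            seq = r->tsc_seq;               /* acquire load in real code */
            if (seq == 0)
                return (0);                 /* caller falls back to rdmsr */
            /* High half of the 128-bit product, plus the offset. */
            ret = (uint64_t)(((unsigned __int128)read_tsc() *
                r->tsc_scale) >> 64) + (uint64_t)r->tsc_ofs;
        } while (r->tsc_seq != seq);        /* retry if the host updated */

        return (ret);
    }
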
+
+static void
+hyperv_tsc_tcinit(void *dummy __unused)
+{
+ hyperv_tc64_t tc64 = NULL;
+ uint64_t val, orig;
+
+ if ((hyperv_features &
+ (CPUID_HV_MSR_TIME_REFCNT | CPUID_HV_MSR_REFERENCE_TSC)) !=
+ (CPUID_HV_MSR_TIME_REFCNT | CPUID_HV_MSR_REFERENCE_TSC) ||
+ (cpu_feature & CPUID_SSE2) == 0) /* SSE2 for mfence/lfence */
+ return;
+
+ switch (cpu_vendor_id) {
+ case CPU_VENDOR_AMD:
+ case CPU_VENDOR_HYGON:
+ hyperv_tsc_timecounter.tc_get_timecount =
+ hyperv_tsc_timecount_mfence;
+ tc64 = hyperv_tc64_tsc_mfence;
+ break;
+
+ case CPU_VENDOR_INTEL:
+ hyperv_tsc_timecounter.tc_get_timecount =
+ hyperv_tsc_timecount_lfence;
+ tc64 = hyperv_tc64_tsc_lfence;
+ break;
+
+ default:
+		/* Unsupported CPU vendors. */
+ return;
+ }
+
+ hyperv_ref_tsc.tsc_ref = hyperv_dmamem_alloc(NULL, PAGE_SIZE, 0,
+ sizeof(struct hyperv_reftsc), &hyperv_ref_tsc.tsc_ref_dma,
+ BUS_DMA_WAITOK | BUS_DMA_ZERO);
+ if (hyperv_ref_tsc.tsc_ref == NULL) {
+ printf("hyperv: reftsc page allocation failed\n");
+ return;
+ }
+
+ orig = rdmsr(MSR_HV_REFERENCE_TSC);
+ val = MSR_HV_REFTSC_ENABLE | (orig & MSR_HV_REFTSC_RSVD_MASK) |
+ ((hyperv_ref_tsc.tsc_ref_dma.hv_paddr >> PAGE_SHIFT) <<
+ MSR_HV_REFTSC_PGSHIFT);
+ wrmsr(MSR_HV_REFERENCE_TSC, val);
+
+ /* Register "enlightened" timecounter. */
+ tc_init(&hyperv_tsc_timecounter);
+
+	/* Install a 64-bit timecounter method for other modules to use. */
+ KASSERT(tc64 != NULL, ("tc64 is not set"));
+ hyperv_tc64 = tc64;
+
+ /* Add device for mmap(2). */
+ make_dev(&hyperv_tsc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0444,
+ HYPERV_REFTSC_DEVNAME);
+}
+SYSINIT(hyperv_tsc_init, SI_SUB_DRIVERS, SI_ORDER_FIRST, hyperv_tsc_tcinit,
+ NULL);
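
The make_dev() call at the end of hyperv_tsc_tcinit() exposes the reference-TSC page to userland so that, for example, vDSO time code can read it without entering the kernel. A rough consumer-side sketch follows; the /dev path is an assumption (the node is named by HYPERV_REFTSC_DEVNAME) and error handling is kept minimal.

    #include <sys/mman.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
        int fd;
        void *page;

        /* Path assumed for illustration; see HYPERV_REFTSC_DEVNAME. */
        fd = open("/dev/hv_tsc", O_RDONLY);
        if (fd < 0) {
            perror("open");
            return (1);
        }
        /* The driver only accepts offset 0; write mappings are refused. */
        page = mmap(NULL, getpagesize(), PROT_READ, MAP_SHARED, fd, 0);
        if (page == MAP_FAILED) {
            perror("mmap");
            close(fd);
            return (1);
        }
        printf("reference TSC page mapped at %p\n", page);
        munmap(page, getpagesize());
        close(fd);
        return (0);
    }
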
diff --git a/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S b/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S
new file mode 100644
index 000000000000..30c07348734c
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "assym.inc"
+
+#include <machine/psl.h>
+#include <machine/asmacros.h>
+#include <machine/specialreg.h>
+
+/*
+ * This is the Hyper-V vmbus channel direct callback interrupt.
+ * Only used when it is running on Hyper-V.
+ */
+ .text
+ SUPERALIGN_TEXT
+ INTR_HANDLER vmbus_isr
+ movq %rsp, %rdi
+ call vmbus_handle_intr
+ jmp doreti
diff --git a/sys/dev/hyperv/vmbus/hyperv.c b/sys/dev/hyperv/vmbus/hyperv.c
new file mode 100644
index 000000000000..01e0ad9610d9
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/hyperv.c
@@ -0,0 +1,340 @@
+/*-
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * Implements low-level interactions with Hyper-V/Azure
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/timetc.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/pmap.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/vmbus/hyperv_machdep.h>
+#include <dev/hyperv/vmbus/hyperv_reg.h>
+#include <dev/hyperv/vmbus/hyperv_var.h>
+
+#define HYPERV_FREEBSD_BUILD 0ULL
+#define HYPERV_FREEBSD_VERSION ((uint64_t)__FreeBSD_version)
+#define HYPERV_FREEBSD_OSID 0ULL
+
+#define MSR_HV_GUESTID_BUILD_FREEBSD \
+ (HYPERV_FREEBSD_BUILD & MSR_HV_GUESTID_BUILD_MASK)
+#define MSR_HV_GUESTID_VERSION_FREEBSD \
+ ((HYPERV_FREEBSD_VERSION << MSR_HV_GUESTID_VERSION_SHIFT) & \
+ MSR_HV_GUESTID_VERSION_MASK)
+#define MSR_HV_GUESTID_OSID_FREEBSD \
+ ((HYPERV_FREEBSD_OSID << MSR_HV_GUESTID_OSID_SHIFT) & \
+ MSR_HV_GUESTID_OSID_MASK)
+
+#define MSR_HV_GUESTID_FREEBSD \
+ (MSR_HV_GUESTID_BUILD_FREEBSD | \
+ MSR_HV_GUESTID_VERSION_FREEBSD | \
+ MSR_HV_GUESTID_OSID_FREEBSD | \
+ MSR_HV_GUESTID_OSTYPE_FREEBSD)
+
+struct hypercall_ctx {
+ void *hc_addr;
+ vm_paddr_t hc_paddr;
+};
+
+static u_int hyperv_get_timecount(struct timecounter *);
+static bool hyperv_identify(void);
+static void hypercall_memfree(void);
+
+u_int hyperv_ver_major;
+
+u_int hyperv_features;
+u_int hyperv_recommends;
+
+static u_int hyperv_pm_features;
+static u_int hyperv_features3;
+
+hyperv_tc64_t hyperv_tc64;
+
+static struct timecounter hyperv_timecounter = {
+ .tc_get_timecount = hyperv_get_timecount,
+ .tc_poll_pps = NULL,
+ .tc_counter_mask = 0xffffffff,
+ .tc_frequency = HYPERV_TIMER_FREQ,
+ .tc_name = "Hyper-V",
+ .tc_quality = 2000,
+ .tc_flags = 0,
+ .tc_priv = NULL
+};
+
+static struct hypercall_ctx hypercall_context;
+
+static u_int
+hyperv_get_timecount(struct timecounter *tc __unused)
+{
+ return rdmsr(MSR_HV_TIME_REF_COUNT);
+}
+
+static uint64_t
+hyperv_tc64_rdmsr(void)
+{
+
+ return (rdmsr(MSR_HV_TIME_REF_COUNT));
+}
+
+uint64_t
+hypercall_post_message(bus_addr_t msg_paddr)
+{
+ return hypercall_md(hypercall_context.hc_addr,
+ HYPERCALL_POST_MESSAGE, msg_paddr, 0);
+}
+
+uint64_t
+hypercall_signal_event(bus_addr_t monprm_paddr)
+{
+ return hypercall_md(hypercall_context.hc_addr,
+ HYPERCALL_SIGNAL_EVENT, monprm_paddr, 0);
+}
+
+int
+hyperv_guid2str(const struct hyperv_guid *guid, char *buf, size_t sz)
+{
+ const uint8_t *d = guid->hv_guid;
+
+ return snprintf(buf, sz, "%02x%02x%02x%02x-"
+ "%02x%02x-%02x%02x-%02x%02x-"
+ "%02x%02x%02x%02x%02x%02x",
+ d[3], d[2], d[1], d[0],
+ d[5], d[4], d[7], d[6], d[8], d[9],
+ d[10], d[11], d[12], d[13], d[14], d[15]);
+}
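
Note the byte order above: the first three GUID fields are stored little-endian in hv_guid[], so hyperv_guid2str() swaps bytes 0-3, 4-5 and 6-7 while emitting the remaining ten bytes in storage order. A small illustration with a made-up byte pattern (not a real interface GUID):

    /*
     * Raw bytes:  00 01 02 03  04 05  06 07  08 09  0a 0b 0c 0d 0e 0f
     * Rendered:   "03020100-0504-0706-0809-0a0b0c0d0e0f"
     * The first three groups are byte-swapped; the last two are not.
     */
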
+
+static bool
+hyperv_identify(void)
+{
+ u_int regs[4];
+ unsigned int maxleaf;
+
+ if (vm_guest != VM_GUEST_HV)
+ return (false);
+
+ do_cpuid(CPUID_LEAF_HV_MAXLEAF, regs);
+ maxleaf = regs[0];
+ if (maxleaf < CPUID_LEAF_HV_LIMITS)
+ return (false);
+
+ do_cpuid(CPUID_LEAF_HV_INTERFACE, regs);
+ if (regs[0] != CPUID_HV_IFACE_HYPERV)
+ return (false);
+
+ do_cpuid(CPUID_LEAF_HV_FEATURES, regs);
+ if ((regs[0] & CPUID_HV_MSR_HYPERCALL) == 0) {
+ /*
+ * Hyper-V w/o Hypercall is impossible; someone
+ * is faking Hyper-V.
+ */
+ return (false);
+ }
+ hyperv_features = regs[0];
+ hyperv_pm_features = regs[2];
+ hyperv_features3 = regs[3];
+
+ do_cpuid(CPUID_LEAF_HV_IDENTITY, regs);
+ hyperv_ver_major = regs[1] >> 16;
+ printf("Hyper-V Version: %d.%d.%d [SP%d]\n",
+ hyperv_ver_major, regs[1] & 0xffff, regs[0], regs[2]);
+
+ printf(" Features=0x%b\n", hyperv_features,
+ "\020"
+ "\001VPRUNTIME" /* MSR_HV_VP_RUNTIME */
+ "\002TMREFCNT" /* MSR_HV_TIME_REF_COUNT */
+ "\003SYNIC" /* MSRs for SynIC */
+ "\004SYNTM" /* MSRs for SynTimer */
+ "\005APIC" /* MSR_HV_{EOI,ICR,TPR} */
+ "\006HYPERCALL" /* MSR_HV_{GUEST_OS_ID,HYPERCALL} */
+ "\007VPINDEX" /* MSR_HV_VP_INDEX */
+ "\010RESET" /* MSR_HV_RESET */
+ "\011STATS" /* MSR_HV_STATS_ */
+ "\012REFTSC" /* MSR_HV_REFERENCE_TSC */
+ "\013IDLE" /* MSR_HV_GUEST_IDLE */
+ "\014TMFREQ" /* MSR_HV_{TSC,APIC}_FREQUENCY */
+ "\015DEBUG"); /* MSR_HV_SYNTH_DEBUG_ */
+ printf(" PM Features=0x%b [C%u]\n",
+ (hyperv_pm_features & ~CPUPM_HV_CSTATE_MASK),
+ "\020"
+ "\005C3HPET", /* HPET is required for C3 state */
+ CPUPM_HV_CSTATE(hyperv_pm_features));
+ printf(" Features3=0x%b\n", hyperv_features3,
+ "\020"
+ "\001MWAIT" /* MWAIT */
+ "\002DEBUG" /* guest debug support */
+ "\003PERFMON" /* performance monitor */
+ "\004PCPUDPE" /* physical CPU dynamic partition event */
+ "\005XMMHC" /* hypercall input through XMM regs */
+ "\006IDLE" /* guest idle support */
+ "\007SLEEP" /* hypervisor sleep support */
+ "\010NUMA" /* NUMA distance query support */
+ "\011TMFREQ" /* timer frequency query (TSC, LAPIC) */
+ "\012SYNCMC" /* inject synthetic machine checks */
+ "\013CRASH" /* MSRs for guest crash */
+ "\014DEBUGMSR" /* MSRs for guest debug */
+ "\015NPIEP" /* NPIEP */
+ "\016HVDIS"); /* disabling hypervisor */
+
+ do_cpuid(CPUID_LEAF_HV_RECOMMENDS, regs);
+ hyperv_recommends = regs[0];
+ if (bootverbose)
+ printf(" Recommends: %08x %08x\n", regs[0], regs[1]);
+
+ do_cpuid(CPUID_LEAF_HV_LIMITS, regs);
+ if (bootverbose) {
+ printf(" Limits: Vcpu:%d Lcpu:%d Int:%d\n",
+ regs[0], regs[1], regs[2]);
+ }
+
+ if (maxleaf >= CPUID_LEAF_HV_HWFEATURES) {
+ do_cpuid(CPUID_LEAF_HV_HWFEATURES, regs);
+ if (bootverbose) {
+ printf(" HW Features: %08x, AMD: %08x\n",
+ regs[0], regs[3]);
+ }
+ }
+
+ return (true);
+}
+
+static void
+hyperv_init(void *dummy __unused)
+{
+ if (!hyperv_identify()) {
+ /* Not Hyper-V; reset guest id to the generic one. */
+ if (vm_guest == VM_GUEST_HV)
+ vm_guest = VM_GUEST_VM;
+ return;
+ }
+
+ /* Set guest id */
+ wrmsr(MSR_HV_GUEST_OS_ID, MSR_HV_GUESTID_FREEBSD);
+
+ if (hyperv_features & CPUID_HV_MSR_TIME_REFCNT) {
+ /*
+ * Register Hyper-V timecounter. This should be done as early
+ * as possible to let DELAY() work, since the 8254 PIT is not
+ * reliably emulated or even available.
+ */
+ tc_init(&hyperv_timecounter);
+
+ /*
+		 * Install a 64-bit timecounter method for other modules
+		 * to use.
+ */
+ hyperv_tc64 = hyperv_tc64_rdmsr;
+ }
+}
+SYSINIT(hyperv_initialize, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, hyperv_init,
+ NULL);
+
+static void
+hypercall_memfree(void)
+{
+ kmem_free((vm_offset_t)hypercall_context.hc_addr, PAGE_SIZE);
+ hypercall_context.hc_addr = NULL;
+}
+
+static void
+hypercall_create(void *arg __unused)
+{
+ uint64_t hc, hc_orig;
+
+ if (vm_guest != VM_GUEST_HV)
+ return;
+
+ /*
+ * NOTE:
+ * - busdma(9), i.e. hyperv_dmamem APIs, can _not_ be used due to
+ * the NX bit.
+ * - Assume kmem_malloc() returns properly aligned memory.
+ */
+ hypercall_context.hc_addr = (void *)kmem_malloc(PAGE_SIZE, M_EXEC |
+ M_WAITOK);
+ hypercall_context.hc_paddr = vtophys(hypercall_context.hc_addr);
+
+	/* Get the 'reserved' bits, which require preservation. */
+ hc_orig = rdmsr(MSR_HV_HYPERCALL);
+
+ /*
+ * Setup the Hypercall page.
+ *
+ * NOTE: 'reserved' bits MUST be preserved.
+ */
+ hc = ((hypercall_context.hc_paddr >> PAGE_SHIFT) <<
+ MSR_HV_HYPERCALL_PGSHIFT) |
+ (hc_orig & MSR_HV_HYPERCALL_RSVD_MASK) |
+ MSR_HV_HYPERCALL_ENABLE;
+ wrmsr(MSR_HV_HYPERCALL, hc);
+
+ /*
+	 * Confirm that the Hypercall page did get set up.
+ */
+ hc = rdmsr(MSR_HV_HYPERCALL);
+ if ((hc & MSR_HV_HYPERCALL_ENABLE) == 0) {
+ printf("hyperv: Hypercall setup failed\n");
+ hypercall_memfree();
+ /* Can't perform any Hyper-V specific actions */
+ vm_guest = VM_GUEST_VM;
+ return;
+ }
+ if (bootverbose)
+ printf("hyperv: Hypercall created\n");
+}
+SYSINIT(hypercall_ctor, SI_SUB_DRIVERS, SI_ORDER_FIRST, hypercall_create, NULL);
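
hypercall_create() uses a pattern that recurs throughout this patch (the reference-TSC, SIMP and SIEFP setup do the same): read the MSR, preserve its reserved bits, then write back the page frame number together with the enable bit. The helper below is a hypothetical sketch of that idiom written for illustration only; it is not a function introduced by this change, and the MSR accessors are stand-ins for the kernel's rdmsr()/wrmsr().

    #include <stdint.h>

    #define EX_PAGE_SHIFT   12              /* 4 KiB pages, as used above */

    /* Stand-ins for the kernel's MSR accessors. */
    extern uint64_t ex_rdmsr(uint32_t msr);
    extern void     ex_wrmsr(uint32_t msr, uint64_t val);

    /*
     * Hypothetical helper: enable a Hyper-V MSR-controlled page while
     * preserving the MSR's reserved bits, mirroring hypercall_create().
     */
    static void
    ex_enable_page_msr(uint32_t msr, uint64_t paddr, uint64_t enable_bit,
        uint64_t rsvd_mask, int pgshift)
    {
        uint64_t val;

        val = ex_rdmsr(msr) & rsvd_mask;                /* keep reserved bits */
        val |= (paddr >> EX_PAGE_SHIFT) << pgshift;     /* install the PFN */
        val |= enable_bit;
        ex_wrmsr(msr, val);
    }
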
+
+static void
+hypercall_destroy(void *arg __unused)
+{
+ uint64_t hc;
+
+ if (hypercall_context.hc_addr == NULL)
+ return;
+
+ /* Disable Hypercall */
+ hc = rdmsr(MSR_HV_HYPERCALL);
+ wrmsr(MSR_HV_HYPERCALL, (hc & MSR_HV_HYPERCALL_RSVD_MASK));
+ hypercall_memfree();
+
+ if (bootverbose)
+ printf("hyperv: Hypercall destroyed\n");
+}
+SYSUNINIT(hypercall_dtor, SI_SUB_DRIVERS, SI_ORDER_FIRST, hypercall_destroy,
+ NULL);
diff --git a/sys/dev/hyperv/vmbus/hyperv_busdma.c b/sys/dev/hyperv/vmbus/hyperv_busdma.c
new file mode 100644
index 000000000000..9550540014c4
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/hyperv_busdma.c
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+
+#include <machine/bus.h>
+
+#include <dev/hyperv/include/hyperv_busdma.h>
+
+#define HYPERV_DMA_MASK (BUS_DMA_WAITOK | BUS_DMA_NOWAIT | BUS_DMA_ZERO)
+
+void
+hyperv_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error)
+{
+ bus_addr_t *paddr = arg;
+
+ if (error)
+ return;
+
+ KASSERT(nseg == 1, ("too many segments %d!", nseg));
+ *paddr = segs->ds_addr;
+}
+
+void *
+hyperv_dmamem_alloc(bus_dma_tag_t parent_dtag, bus_size_t alignment,
+ bus_addr_t boundary, bus_size_t size, struct hyperv_dma *dma, int flags)
+{
+ void *ret;
+ int error;
+
+ error = bus_dma_tag_create(parent_dtag, /* parent */
+ alignment, /* alignment */
+ boundary, /* boundary */
+ BUS_SPACE_MAXADDR, /* lowaddr */
+ BUS_SPACE_MAXADDR, /* highaddr */
+ NULL, NULL, /* filter, filterarg */
+ size, /* maxsize */
+ 1, /* nsegments */
+ size, /* maxsegsize */
+ 0, /* flags */
+ NULL, /* lockfunc */
+ NULL, /* lockfuncarg */
+ &dma->hv_dtag);
+ if (error)
+ return NULL;
+
+ error = bus_dmamem_alloc(dma->hv_dtag, &ret,
+ (flags & HYPERV_DMA_MASK) | BUS_DMA_COHERENT, &dma->hv_dmap);
+ if (error) {
+ bus_dma_tag_destroy(dma->hv_dtag);
+ return NULL;
+ }
+
+ error = bus_dmamap_load(dma->hv_dtag, dma->hv_dmap, ret, size,
+ hyperv_dma_map_paddr, &dma->hv_paddr, BUS_DMA_NOWAIT);
+ if (error) {
+ bus_dmamem_free(dma->hv_dtag, ret, dma->hv_dmap);
+ bus_dma_tag_destroy(dma->hv_dtag);
+ return NULL;
+ }
+ return ret;
+}
+
+void
+hyperv_dmamem_free(struct hyperv_dma *dma, void *ptr)
+{
+ bus_dmamap_unload(dma->hv_dtag, dma->hv_dmap);
+ bus_dmamem_free(dma->hv_dtag, ptr, dma->hv_dmap);
+ bus_dma_tag_destroy(dma->hv_dtag);
+}
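
Callers pair hyperv_dmamem_alloc() with hyperv_dmamem_free(), keeping the struct hyperv_dma handle next to the returned KVA so the physical address and DMA tag remain available for teardown; the reference-TSC code earlier in this patch is one such caller. Below is a minimal kernel-context sketch of that usage; the softc layout is illustrative and assumes the same headers this file includes.

    struct example_softc {                  /* illustrative only */
        void              *buf;
        struct hyperv_dma  buf_dma;
    };

    static int
    example_alloc(struct example_softc *sc)
    {
        /*
         * One page-aligned, zeroed page; the physical address is
         * stored in sc->buf_dma.hv_paddr by hyperv_dma_map_paddr().
         */
        sc->buf = hyperv_dmamem_alloc(NULL, PAGE_SIZE, 0, PAGE_SIZE,
            &sc->buf_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO);
        return (sc->buf == NULL ? ENOMEM : 0);
    }

    static void
    example_free(struct example_softc *sc)
    {
        if (sc->buf != NULL)
            hyperv_dmamem_free(&sc->buf_dma, sc->buf);
    }
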
diff --git a/sys/dev/hyperv/vmbus/hyperv_machdep.h b/sys/dev/hyperv/vmbus/hyperv_machdep.h
new file mode 100644
index 000000000000..48cf5b78dc3b
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/hyperv_machdep.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HYPERV_MACHDEP_H_
+#define _HYPERV_MACHDEP_H_
+
+#include <sys/param.h>
+
+uint64_t hypercall_md(volatile void *hc_addr, uint64_t in_val,
+ uint64_t in_paddr, uint64_t out_paddr);
+
+#endif /* !_HYPERV_MACHDEP_H_ */
diff --git a/sys/dev/hyperv/vmbus/hyperv_reg.h b/sys/dev/hyperv/vmbus/hyperv_reg.h
new file mode 100644
index 000000000000..b3b133c84881
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/hyperv_reg.h
@@ -0,0 +1,193 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HYPERV_REG_H_
+#define _HYPERV_REG_H_
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+/*
+ * Hyper-V Synthetic MSRs
+ */
+
+#define MSR_HV_GUEST_OS_ID 0x40000000
+#define MSR_HV_GUESTID_BUILD_MASK 0xffffULL
+#define MSR_HV_GUESTID_VERSION_MASK 0x0000ffffffff0000ULL
+#define MSR_HV_GUESTID_VERSION_SHIFT 16
+#define MSR_HV_GUESTID_OSID_MASK 0x00ff000000000000ULL
+#define MSR_HV_GUESTID_OSID_SHIFT 48
+#define MSR_HV_GUESTID_OSTYPE_MASK 0x7f00000000000000ULL
+#define MSR_HV_GUESTID_OSTYPE_SHIFT 56
+#define MSR_HV_GUESTID_OPENSRC 0x8000000000000000ULL
+#define MSR_HV_GUESTID_OSTYPE_LINUX \
+ ((0x01ULL << MSR_HV_GUESTID_OSTYPE_SHIFT) | MSR_HV_GUESTID_OPENSRC)
+#define MSR_HV_GUESTID_OSTYPE_FREEBSD \
+ ((0x02ULL << MSR_HV_GUESTID_OSTYPE_SHIFT) | MSR_HV_GUESTID_OPENSRC)
+
+#define MSR_HV_HYPERCALL 0x40000001
+#define MSR_HV_HYPERCALL_ENABLE 0x0001ULL
+#define MSR_HV_HYPERCALL_RSVD_MASK 0x0ffeULL
+#define MSR_HV_HYPERCALL_PGSHIFT 12
+
+#define MSR_HV_VP_INDEX 0x40000002
+
+#define MSR_HV_REFERENCE_TSC 0x40000021
+#define MSR_HV_REFTSC_ENABLE 0x0001ULL
+#define MSR_HV_REFTSC_RSVD_MASK 0x0ffeULL
+#define MSR_HV_REFTSC_PGSHIFT 12
+
+#define MSR_HV_SCONTROL 0x40000080
+#define MSR_HV_SCTRL_ENABLE 0x0001ULL
+#define MSR_HV_SCTRL_RSVD_MASK 0xfffffffffffffffeULL
+
+#define MSR_HV_SIEFP 0x40000082
+#define MSR_HV_SIEFP_ENABLE 0x0001ULL
+#define MSR_HV_SIEFP_RSVD_MASK 0x0ffeULL
+#define MSR_HV_SIEFP_PGSHIFT 12
+
+#define MSR_HV_SIMP 0x40000083
+#define MSR_HV_SIMP_ENABLE 0x0001ULL
+#define MSR_HV_SIMP_RSVD_MASK 0x0ffeULL
+#define MSR_HV_SIMP_PGSHIFT 12
+
+#define MSR_HV_EOM 0x40000084
+
+#define MSR_HV_SINT0 0x40000090
+#define MSR_HV_SINT_VECTOR_MASK 0x00ffULL
+#define MSR_HV_SINT_RSVD1_MASK 0xff00ULL
+#define MSR_HV_SINT_MASKED 0x00010000ULL
+#define MSR_HV_SINT_AUTOEOI 0x00020000ULL
+#define MSR_HV_SINT_RSVD2_MASK 0xfffffffffffc0000ULL
+#define MSR_HV_SINT_RSVD_MASK (MSR_HV_SINT_RSVD1_MASK | \
+ MSR_HV_SINT_RSVD2_MASK)
+
+#define MSR_HV_STIMER0_CONFIG 0x400000b0
+#define MSR_HV_STIMER_CFG_ENABLE 0x0001ULL
+#define MSR_HV_STIMER_CFG_PERIODIC 0x0002ULL
+#define MSR_HV_STIMER_CFG_LAZY 0x0004ULL
+#define MSR_HV_STIMER_CFG_AUTOEN 0x0008ULL
+#define MSR_HV_STIMER_CFG_SINT_MASK 0x000f0000ULL
+#define MSR_HV_STIMER_CFG_SINT_SHIFT 16
+
+#define MSR_HV_STIMER0_COUNT 0x400000b1
+
+/*
+ * CPUID leaves
+ */
+
+#define CPUID_LEAF_HV_MAXLEAF 0x40000000
+
+#define CPUID_LEAF_HV_INTERFACE 0x40000001
+#define CPUID_HV_IFACE_HYPERV 0x31237648 /* HV#1 */
+
+#define CPUID_LEAF_HV_IDENTITY 0x40000002
+
+#define CPUID_LEAF_HV_FEATURES 0x40000003
+/* EAX: features include/hyperv.h CPUID_HV_MSR */
+/* ECX: power management features */
+#define CPUPM_HV_CSTATE_MASK 0x000f /* deepest C-state */
+#define CPUPM_HV_C3_HPET 0x0010 /* C3 requires HPET */
+#define CPUPM_HV_CSTATE(f) ((f) & CPUPM_HV_CSTATE_MASK)
+/* EDX: features3 */
+#define CPUID3_HV_MWAIT 0x0001 /* MWAIT */
+#define CPUID3_HV_XMM_HYPERCALL 0x0010 /* Hypercall input through
+ * XMM regs */
+#define CPUID3_HV_GUEST_IDLE 0x0020 /* guest idle */
+#define CPUID3_HV_NUMA 0x0080 /* NUMA distance query */
+#define CPUID3_HV_TIME_FREQ 0x0100 /* timer frequency query
+ * (TSC, LAPIC) */
+#define CPUID3_HV_MSR_CRASH 0x0400 /* MSRs for guest crash */
+
+#define CPUID_LEAF_HV_RECOMMENDS 0x40000004
+#define CPUID_LEAF_HV_LIMITS 0x40000005
+#define CPUID_LEAF_HV_HWFEATURES 0x40000006
+
+/*
+ * Hyper-V Monitor Notification Facility
+ */
+struct hyperv_mon_param {
+ uint32_t mp_connid;
+ uint16_t mp_evtflag_ofs;
+ uint16_t mp_rsvd;
+} __packed;
+
+/*
+ * Hyper-V message types
+ */
+#define HYPERV_MSGTYPE_NONE 0
+#define HYPERV_MSGTYPE_CHANNEL 1
+#define HYPERV_MSGTYPE_TIMER_EXPIRED 0x80000010
+
+/*
+ * Hypercall status codes
+ */
+#define HYPERCALL_STATUS_SUCCESS 0x0000
+
+/*
+ * Hypercall input values
+ */
+#define HYPERCALL_POST_MESSAGE 0x005c
+#define HYPERCALL_SIGNAL_EVENT 0x005d
+
+/*
+ * Hypercall input parameters
+ */
+#define HYPERCALL_PARAM_ALIGN 8
+#if 0
+/*
+ * XXX
+ * <<Hypervisor Top Level Functional Specification 4.0b>> requires
+ * the input parameter size to be a multiple of 8; however, many
+ * post-message input parameters do _not_ meet this requirement.
+ */
+#define HYPERCALL_PARAM_SIZE_ALIGN 8
+#endif
+
+/*
+ * HYPERCALL_POST_MESSAGE
+ */
+#define HYPERCALL_POSTMSGIN_DSIZE_MAX 240
+#define HYPERCALL_POSTMSGIN_SIZE 256
+
+struct hypercall_postmsg_in {
+ uint32_t hc_connid;
+ uint32_t hc_rsvd;
+ uint32_t hc_msgtype; /* HYPERV_MSGTYPE_ */
+ uint32_t hc_dsize;
+ uint8_t hc_data[HYPERCALL_POSTMSGIN_DSIZE_MAX];
+} __packed;
+CTASSERT(sizeof(struct hypercall_postmsg_in) == HYPERCALL_POSTMSGIN_SIZE);
+
+/*
+ * HYPERCALL_SIGNAL_EVENT
+ *
+ * struct hyperv_mon_param.
+ */
+
+#endif /* !_HYPERV_REG_H_ */
diff --git a/sys/dev/hyperv/vmbus/hyperv_var.h b/sys/dev/hyperv/vmbus/hyperv_var.h
new file mode 100644
index 000000000000..f620e4fd64ae
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/hyperv_var.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _HYPERV_VAR_H_
+#define _HYPERV_VAR_H_
+
+extern u_int hyperv_recommends;
+
+uint64_t hypercall_post_message(bus_addr_t msg_paddr);
+uint64_t hypercall_signal_event(bus_addr_t monprm_paddr);
+
+#endif /* !_HYPERV_VAR_H_ */
diff --git a/sys/dev/hyperv/vmbus/i386/hyperv_machdep.c b/sys/dev/hyperv/vmbus/i386/hyperv_machdep.c
new file mode 100644
index 000000000000..b12bff855f63
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/i386/hyperv_machdep.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <dev/hyperv/vmbus/hyperv_machdep.h>
+
+uint64_t
+hypercall_md(volatile void *hc_addr, uint64_t in_val,
+ uint64_t in_paddr, uint64_t out_paddr)
+{
+ uint32_t in_val_hi = in_val >> 32;
+ uint32_t in_val_lo = in_val & 0xFFFFFFFF;
+ uint32_t status_hi, status_lo;
+ uint32_t in_paddr_hi = in_paddr >> 32;
+ uint32_t in_paddr_lo = in_paddr & 0xFFFFFFFF;
+ uint32_t out_paddr_hi = out_paddr >> 32;
+ uint32_t out_paddr_lo = out_paddr & 0xFFFFFFFF;
+
+ __asm__ __volatile__ ("call *%8" : "=d"(status_hi), "=a"(status_lo) :
+ "d" (in_val_hi), "a" (in_val_lo),
+ "b" (in_paddr_hi), "c" (in_paddr_lo),
+ "D"(out_paddr_hi), "S"(out_paddr_lo),
+ "m" (hc_addr));
+ return (status_lo | ((uint64_t)status_hi << 32));
+}
diff --git a/sys/dev/hyperv/vmbus/i386/vmbus_vector.S b/sys/dev/hyperv/vmbus/i386/vmbus_vector.S
new file mode 100644
index 000000000000..b1ffe89cd55d
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/i386/vmbus_vector.S
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "assym.inc"
+
+#include <machine/psl.h>
+#include <machine/asmacros.h>
+#include <machine/specialreg.h>
+
+/*
+ * This is the Hyper-V vmbus channel direct callback interrupt.
+ * Only used when it is running on Hyper-V.
+ *
+ * Note that this file is not assembled directly, it is included into
+ * i386/exception.s.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(vmbus_isr_pti)
+IDTVEC(vmbus_isr)
+ PUSH_FRAME
+ SET_KERNEL_SREGS
+ cld
+ KENTER
+ pushl %esp
+ mov $vmbus_handle_intr, %eax
+ call *%eax
+ add $4, %esp
+ jmp doreti
diff --git a/sys/dev/hyperv/vmbus/vmbus.c b/sys/dev/hyperv/vmbus/vmbus.c
new file mode 100644
index 000000000000..31951cbf4858
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus.c
@@ -0,0 +1,1679 @@
+/*-
+ * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * VM Bus Driver Implementation
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/linker.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sbuf.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/taskqueue.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+#include <machine/bus.h>
+#include <machine/intr_machdep.h>
+#include <machine/metadata.h>
+#include <machine/md_var.h>
+#include <machine/resource.h>
+#include <x86/include/apicvar.h>
+
+#include <contrib/dev/acpica/include/acpi.h>
+#include <dev/acpica/acpivar.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+#include <dev/hyperv/vmbus/hyperv_reg.h>
+#include <dev/hyperv/vmbus/hyperv_var.h>
+#include <dev/hyperv/vmbus/vmbus_reg.h>
+#include <dev/hyperv/vmbus/vmbus_var.h>
+#include <dev/hyperv/vmbus/vmbus_chanvar.h>
+
+#include "acpi_if.h"
+#include "pcib_if.h"
+#include "vmbus_if.h"
+
+#define VMBUS_GPADL_START 0xe1e10
+
+struct vmbus_msghc {
+ struct vmbus_xact *mh_xact;
+ struct hypercall_postmsg_in mh_inprm_save;
+};
+
+static void vmbus_identify(driver_t *, device_t);
+static int vmbus_probe(device_t);
+static int vmbus_attach(device_t);
+static int vmbus_detach(device_t);
+static int vmbus_read_ivar(device_t, device_t, int,
+ uintptr_t *);
+static int vmbus_child_pnpinfo(device_t, device_t, struct sbuf *);
+static struct resource *vmbus_alloc_resource(device_t dev,
+ device_t child, int type, int *rid,
+ rman_res_t start, rman_res_t end,
+ rman_res_t count, u_int flags);
+static int vmbus_alloc_msi(device_t bus, device_t dev,
+ int count, int maxcount, int *irqs);
+static int vmbus_release_msi(device_t bus, device_t dev,
+ int count, int *irqs);
+static int vmbus_alloc_msix(device_t bus, device_t dev,
+ int *irq);
+static int vmbus_release_msix(device_t bus, device_t dev,
+ int irq);
+static int vmbus_map_msi(device_t bus, device_t dev,
+ int irq, uint64_t *addr, uint32_t *data);
+static uint32_t vmbus_get_version_method(device_t, device_t);
+static int vmbus_probe_guid_method(device_t, device_t,
+ const struct hyperv_guid *);
+static uint32_t vmbus_get_vcpu_id_method(device_t bus,
+ device_t dev, int cpu);
+static struct taskqueue *vmbus_get_eventtq_method(device_t, device_t,
+ int);
+#ifdef EARLY_AP_STARTUP
+static void vmbus_intrhook(void *);
+#endif
+
+static int vmbus_init(struct vmbus_softc *);
+static int vmbus_connect(struct vmbus_softc *, uint32_t);
+static int vmbus_req_channels(struct vmbus_softc *sc);
+static void vmbus_disconnect(struct vmbus_softc *);
+static int vmbus_scan(struct vmbus_softc *);
+static void vmbus_scan_teardown(struct vmbus_softc *);
+static void vmbus_scan_done(struct vmbus_softc *,
+ const struct vmbus_message *);
+static void vmbus_chanmsg_handle(struct vmbus_softc *,
+ const struct vmbus_message *);
+static void vmbus_msg_task(void *, int);
+static void vmbus_synic_setup(void *);
+static void vmbus_synic_teardown(void *);
+static int vmbus_sysctl_version(SYSCTL_HANDLER_ARGS);
+static int vmbus_dma_alloc(struct vmbus_softc *);
+static void vmbus_dma_free(struct vmbus_softc *);
+static int vmbus_intr_setup(struct vmbus_softc *);
+static void vmbus_intr_teardown(struct vmbus_softc *);
+static int vmbus_doattach(struct vmbus_softc *);
+static void vmbus_event_proc_dummy(struct vmbus_softc *,
+ int);
+
+static struct vmbus_softc *vmbus_sc;
+
+SYSCTL_NODE(_hw, OID_AUTO, vmbus, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
+ "Hyper-V vmbus");
+
+static int vmbus_pin_evttask = 1;
+SYSCTL_INT(_hw_vmbus, OID_AUTO, pin_evttask, CTLFLAG_RDTUN,
+ &vmbus_pin_evttask, 0, "Pin event tasks to their respective CPU");
+
+extern inthand_t IDTVEC(vmbus_isr), IDTVEC(vmbus_isr_pti);
+#define VMBUS_ISR_ADDR trunc_page((uintptr_t)IDTVEC(vmbus_isr_pti))
+
+uint32_t vmbus_current_version;
+
+static const uint32_t vmbus_version[] = {
+ VMBUS_VERSION_WIN10,
+ VMBUS_VERSION_WIN8_1,
+ VMBUS_VERSION_WIN8,
+ VMBUS_VERSION_WIN7,
+ VMBUS_VERSION_WS2008
+};
+
+static const vmbus_chanmsg_proc_t
+vmbus_chanmsg_handlers[VMBUS_CHANMSG_TYPE_MAX] = {
+ VMBUS_CHANMSG_PROC(CHOFFER_DONE, vmbus_scan_done),
+ VMBUS_CHANMSG_PROC_WAKEUP(CONNECT_RESP)
+};
+
+static device_method_t vmbus_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_identify, vmbus_identify),
+ DEVMETHOD(device_probe, vmbus_probe),
+ DEVMETHOD(device_attach, vmbus_attach),
+ DEVMETHOD(device_detach, vmbus_detach),
+ DEVMETHOD(device_shutdown, bus_generic_shutdown),
+ DEVMETHOD(device_suspend, bus_generic_suspend),
+ DEVMETHOD(device_resume, bus_generic_resume),
+
+ /* Bus interface */
+ DEVMETHOD(bus_add_child, bus_generic_add_child),
+ DEVMETHOD(bus_print_child, bus_generic_print_child),
+ DEVMETHOD(bus_read_ivar, vmbus_read_ivar),
+ DEVMETHOD(bus_child_pnpinfo, vmbus_child_pnpinfo),
+ DEVMETHOD(bus_alloc_resource, vmbus_alloc_resource),
+ DEVMETHOD(bus_release_resource, bus_generic_release_resource),
+ DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
+ DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
+ DEVMETHOD(bus_setup_intr, bus_generic_setup_intr),
+ DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr),
+#if __FreeBSD_version >= 1100000
+ DEVMETHOD(bus_get_cpus, bus_generic_get_cpus),
+#endif
+
+ /* pcib interface */
+ DEVMETHOD(pcib_alloc_msi, vmbus_alloc_msi),
+ DEVMETHOD(pcib_release_msi, vmbus_release_msi),
+ DEVMETHOD(pcib_alloc_msix, vmbus_alloc_msix),
+ DEVMETHOD(pcib_release_msix, vmbus_release_msix),
+ DEVMETHOD(pcib_map_msi, vmbus_map_msi),
+
+ /* Vmbus interface */
+ DEVMETHOD(vmbus_get_version, vmbus_get_version_method),
+ DEVMETHOD(vmbus_probe_guid, vmbus_probe_guid_method),
+ DEVMETHOD(vmbus_get_vcpu_id, vmbus_get_vcpu_id_method),
+ DEVMETHOD(vmbus_get_event_taskq, vmbus_get_eventtq_method),
+
+ DEVMETHOD_END
+};
+
+static driver_t vmbus_driver = {
+ "vmbus",
+ vmbus_methods,
+ sizeof(struct vmbus_softc)
+};
+
+static devclass_t vmbus_devclass;
+
+DRIVER_MODULE(vmbus, pcib, vmbus_driver, vmbus_devclass, NULL, NULL);
+DRIVER_MODULE(vmbus, acpi_syscontainer, vmbus_driver, vmbus_devclass,
+ NULL, NULL);
+
+MODULE_DEPEND(vmbus, acpi, 1, 1, 1);
+MODULE_DEPEND(vmbus, pci, 1, 1, 1);
+MODULE_VERSION(vmbus, 1);
+
+static __inline struct vmbus_softc *
+vmbus_get_softc(void)
+{
+ return vmbus_sc;
+}
+
+void
+vmbus_msghc_reset(struct vmbus_msghc *mh, size_t dsize)
+{
+ struct hypercall_postmsg_in *inprm;
+
+ if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX)
+ panic("invalid data size %zu", dsize);
+
+ inprm = vmbus_xact_req_data(mh->mh_xact);
+ memset(inprm, 0, HYPERCALL_POSTMSGIN_SIZE);
+ inprm->hc_connid = VMBUS_CONNID_MESSAGE;
+ inprm->hc_msgtype = HYPERV_MSGTYPE_CHANNEL;
+ inprm->hc_dsize = dsize;
+}
+
+struct vmbus_msghc *
+vmbus_msghc_get(struct vmbus_softc *sc, size_t dsize)
+{
+ struct vmbus_msghc *mh;
+ struct vmbus_xact *xact;
+
+ if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX)
+ panic("invalid data size %zu", dsize);
+
+ xact = vmbus_xact_get(sc->vmbus_xc,
+ dsize + __offsetof(struct hypercall_postmsg_in, hc_data[0]));
+ if (xact == NULL)
+ return (NULL);
+
+ mh = vmbus_xact_priv(xact, sizeof(*mh));
+ mh->mh_xact = xact;
+
+ vmbus_msghc_reset(mh, dsize);
+ return (mh);
+}
+
+void
+vmbus_msghc_put(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh)
+{
+
+ vmbus_xact_put(mh->mh_xact);
+}
+
+void *
+vmbus_msghc_dataptr(struct vmbus_msghc *mh)
+{
+ struct hypercall_postmsg_in *inprm;
+
+ inprm = vmbus_xact_req_data(mh->mh_xact);
+ return (inprm->hc_data);
+}
+
+int
+vmbus_msghc_exec_noresult(struct vmbus_msghc *mh)
+{
+ sbintime_t time = SBT_1MS;
+ struct hypercall_postmsg_in *inprm;
+ bus_addr_t inprm_paddr;
+ int i;
+
+ inprm = vmbus_xact_req_data(mh->mh_xact);
+ inprm_paddr = vmbus_xact_req_paddr(mh->mh_xact);
+
+ /*
+	 * Save the input parameter so that we can restore it if the
+	 * Hypercall fails.
+ *
+ * XXX
+ * Is this really necessary?! i.e. Will the Hypercall ever
+ * overwrite the input parameter?
+ */
+ memcpy(&mh->mh_inprm_save, inprm, HYPERCALL_POSTMSGIN_SIZE);
+
+ /*
+ * In order to cope with transient failures, e.g. insufficient
+ * resources on host side, we retry the post message Hypercall
+ * several times. 20 retries seem sufficient.
+ */
+#define HC_RETRY_MAX 20
+
+ for (i = 0; i < HC_RETRY_MAX; ++i) {
+ uint64_t status;
+
+ status = hypercall_post_message(inprm_paddr);
+ if (status == HYPERCALL_STATUS_SUCCESS)
+ return 0;
+
+ pause_sbt("hcpmsg", time, 0, C_HARDCLOCK);
+ if (time < SBT_1S * 2)
+ time *= 2;
+
+ /* Restore input parameter and try again */
+ memcpy(inprm, &mh->mh_inprm_save, HYPERCALL_POSTMSGIN_SIZE);
+ }
+
+#undef HC_RETRY_MAX
+
+ return EIO;
+}
+
+int
+vmbus_msghc_exec(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh)
+{
+ int error;
+
+ vmbus_xact_activate(mh->mh_xact);
+ error = vmbus_msghc_exec_noresult(mh);
+ if (error)
+ vmbus_xact_deactivate(mh->mh_xact);
+ return error;
+}
+
+void
+vmbus_msghc_exec_cancel(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh)
+{
+
+ vmbus_xact_deactivate(mh->mh_xact);
+}
+
+const struct vmbus_message *
+vmbus_msghc_wait_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh)
+{
+ size_t resp_len;
+
+ return (vmbus_xact_wait(mh->mh_xact, &resp_len));
+}
+
+const struct vmbus_message *
+vmbus_msghc_poll_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh)
+{
+ size_t resp_len;
+
+ return (vmbus_xact_poll(mh->mh_xact, &resp_len));
+}
+
+void
+vmbus_msghc_wakeup(struct vmbus_softc *sc, const struct vmbus_message *msg)
+{
+
+ vmbus_xact_ctx_wakeup(sc->vmbus_xc, msg, sizeof(*msg));
+}
+
+uint32_t
+vmbus_gpadl_alloc(struct vmbus_softc *sc)
+{
+ uint32_t gpadl;
+
+again:
+ gpadl = atomic_fetchadd_int(&sc->vmbus_gpadl, 1);
+ if (gpadl == 0)
+ goto again;
+ return (gpadl);
+}
+
+/* Used for Hyper-V socket when guest client connects to host */
+int
+vmbus_req_tl_connect(struct hyperv_guid *guest_srv_id,
+ struct hyperv_guid *host_srv_id)
+{
+ struct vmbus_softc *sc = vmbus_get_softc();
+ struct vmbus_chanmsg_tl_connect *req;
+ struct vmbus_msghc *mh;
+ int error;
+
+ if (!sc)
+ return ENXIO;
+
+ mh = vmbus_msghc_get(sc, sizeof(*req));
+ if (mh == NULL) {
+ device_printf(sc->vmbus_dev,
+ "can not get msg hypercall for tl connect\n");
+ return ENXIO;
+ }
+
+ req = vmbus_msghc_dataptr(mh);
+ req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_TL_CONN;
+ req->guest_endpoint_id = *guest_srv_id;
+ req->host_service_id = *host_srv_id;
+
+ error = vmbus_msghc_exec_noresult(mh);
+ vmbus_msghc_put(sc, mh);
+
+ if (error) {
+ device_printf(sc->vmbus_dev,
+ "tl connect msg hypercall failed\n");
+ }
+
+ return error;
+}
+
+static int
+vmbus_connect(struct vmbus_softc *sc, uint32_t version)
+{
+ struct vmbus_chanmsg_connect *req;
+ const struct vmbus_message *msg;
+ struct vmbus_msghc *mh;
+ int error, done = 0;
+
+ mh = vmbus_msghc_get(sc, sizeof(*req));
+ if (mh == NULL)
+ return ENXIO;
+
+ req = vmbus_msghc_dataptr(mh);
+ req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CONNECT;
+ req->chm_ver = version;
+ req->chm_evtflags = sc->vmbus_evtflags_dma.hv_paddr;
+ req->chm_mnf1 = sc->vmbus_mnf1_dma.hv_paddr;
+ req->chm_mnf2 = sc->vmbus_mnf2_dma.hv_paddr;
+
+ error = vmbus_msghc_exec(sc, mh);
+ if (error) {
+ vmbus_msghc_put(sc, mh);
+ return error;
+ }
+
+ msg = vmbus_msghc_wait_result(sc, mh);
+ done = ((const struct vmbus_chanmsg_connect_resp *)
+ msg->msg_data)->chm_done;
+
+ vmbus_msghc_put(sc, mh);
+
+ return (done ? 0 : EOPNOTSUPP);
+}
+
+static int
+vmbus_init(struct vmbus_softc *sc)
+{
+ int i;
+
+ for (i = 0; i < nitems(vmbus_version); ++i) {
+ int error;
+
+ error = vmbus_connect(sc, vmbus_version[i]);
+ if (!error) {
+ vmbus_current_version = vmbus_version[i];
+ sc->vmbus_version = vmbus_version[i];
+ device_printf(sc->vmbus_dev, "version %u.%u\n",
+ VMBUS_VERSION_MAJOR(sc->vmbus_version),
+ VMBUS_VERSION_MINOR(sc->vmbus_version));
+ return 0;
+ }
+ }
+ return ENXIO;
+}
+
+static void
+vmbus_disconnect(struct vmbus_softc *sc)
+{
+ struct vmbus_chanmsg_disconnect *req;
+ struct vmbus_msghc *mh;
+ int error;
+
+ mh = vmbus_msghc_get(sc, sizeof(*req));
+ if (mh == NULL) {
+ device_printf(sc->vmbus_dev,
+ "can not get msg hypercall for disconnect\n");
+ return;
+ }
+
+ req = vmbus_msghc_dataptr(mh);
+ req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_DISCONNECT;
+
+ error = vmbus_msghc_exec_noresult(mh);
+ vmbus_msghc_put(sc, mh);
+
+ if (error) {
+ device_printf(sc->vmbus_dev,
+ "disconnect msg hypercall failed\n");
+ }
+}
+
+static int
+vmbus_req_channels(struct vmbus_softc *sc)
+{
+ struct vmbus_chanmsg_chrequest *req;
+ struct vmbus_msghc *mh;
+ int error;
+
+ mh = vmbus_msghc_get(sc, sizeof(*req));
+ if (mh == NULL)
+ return ENXIO;
+
+ req = vmbus_msghc_dataptr(mh);
+ req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHREQUEST;
+
+ error = vmbus_msghc_exec_noresult(mh);
+ vmbus_msghc_put(sc, mh);
+
+ return error;
+}
+
+static void
+vmbus_scan_done_task(void *xsc, int pending __unused)
+{
+ struct vmbus_softc *sc = xsc;
+
+ mtx_lock(&Giant);
+ sc->vmbus_scandone = true;
+ mtx_unlock(&Giant);
+ wakeup(&sc->vmbus_scandone);
+}
+
+static void
+vmbus_scan_done(struct vmbus_softc *sc,
+ const struct vmbus_message *msg __unused)
+{
+
+ taskqueue_enqueue(sc->vmbus_devtq, &sc->vmbus_scandone_task);
+}
+
+static int
+vmbus_scan(struct vmbus_softc *sc)
+{
+ int error;
+
+ /*
+ * Identify, probe and attach for non-channel devices.
+ */
+ bus_generic_probe(sc->vmbus_dev);
+ bus_generic_attach(sc->vmbus_dev);
+
+ /*
+ * This taskqueue serializes vmbus devices' attach and detach
+ * for channel offer and rescind messages.
+ */
+ sc->vmbus_devtq = taskqueue_create("vmbus dev", M_WAITOK,
+ taskqueue_thread_enqueue, &sc->vmbus_devtq);
+ taskqueue_start_threads(&sc->vmbus_devtq, 1, PI_NET, "vmbusdev");
+ TASK_INIT(&sc->vmbus_scandone_task, 0, vmbus_scan_done_task, sc);
+
+ /*
+ * This taskqueue handles sub-channel detach, so that vmbus
+ * device's detach running in vmbus_devtq can drain its sub-
+ * channels.
+ */
+ sc->vmbus_subchtq = taskqueue_create("vmbus subch", M_WAITOK,
+ taskqueue_thread_enqueue, &sc->vmbus_subchtq);
+ taskqueue_start_threads(&sc->vmbus_subchtq, 1, PI_NET, "vmbussch");
+
+ /*
+ * Start vmbus scanning.
+ */
+ error = vmbus_req_channels(sc);
+ if (error) {
+ device_printf(sc->vmbus_dev, "channel request failed: %d\n",
+ error);
+ return (error);
+ }
+
+ /*
+ * Wait for all vmbus devices from the initial channel offers to be
+ * attached.
+ */
+ GIANT_REQUIRED;
+ while (!sc->vmbus_scandone)
+ mtx_sleep(&sc->vmbus_scandone, &Giant, 0, "vmbusdev", 0);
+
+ if (bootverbose) {
+ device_printf(sc->vmbus_dev, "device scan, probe and attach "
+ "done\n");
+ }
+ return (0);
+}
+
+static void
+vmbus_scan_teardown(struct vmbus_softc *sc)
+{
+
+ GIANT_REQUIRED;
+ if (sc->vmbus_devtq != NULL) {
+ mtx_unlock(&Giant);
+ taskqueue_free(sc->vmbus_devtq);
+ mtx_lock(&Giant);
+ sc->vmbus_devtq = NULL;
+ }
+ if (sc->vmbus_subchtq != NULL) {
+ mtx_unlock(&Giant);
+ taskqueue_free(sc->vmbus_subchtq);
+ mtx_lock(&Giant);
+ sc->vmbus_subchtq = NULL;
+ }
+}
+
+static void
+vmbus_chanmsg_handle(struct vmbus_softc *sc, const struct vmbus_message *msg)
+{
+ vmbus_chanmsg_proc_t msg_proc;
+ uint32_t msg_type;
+
+ msg_type = ((const struct vmbus_chanmsg_hdr *)msg->msg_data)->chm_type;
+ if (msg_type >= VMBUS_CHANMSG_TYPE_MAX) {
+ device_printf(sc->vmbus_dev, "unknown message type 0x%x\n",
+ msg_type);
+ return;
+ }
+
+ msg_proc = vmbus_chanmsg_handlers[msg_type];
+ if (msg_proc != NULL)
+ msg_proc(sc, msg);
+
+ /* Channel specific processing */
+ vmbus_chan_msgproc(sc, msg);
+}
+
+static void
+vmbus_msg_task(void *xsc, int pending __unused)
+{
+ struct vmbus_softc *sc = xsc;
+ volatile struct vmbus_message *msg;
+
+ msg = VMBUS_PCPU_GET(sc, message, curcpu) + VMBUS_SINT_MESSAGE;
+ for (;;) {
+ if (msg->msg_type == HYPERV_MSGTYPE_NONE) {
+ /* No message */
+ break;
+ } else if (msg->msg_type == HYPERV_MSGTYPE_CHANNEL) {
+ /* Channel message */
+ vmbus_chanmsg_handle(sc,
+ __DEVOLATILE(const struct vmbus_message *, msg));
+ }
+
+ msg->msg_type = HYPERV_MSGTYPE_NONE;
+ /*
+ * Make sure the write to msg_type (i.e. set to
+ * HYPERV_MSGTYPE_NONE) happens before we read the
+ * msg_flags and EOMing. Otherwise, the EOMing will
+ * not deliver any more messages since there is no
+ * empty slot
+ *
+ * NOTE:
+ * mb() is used here, since atomic_thread_fence_seq_cst()
+ * will become compiler fence on UP kernel.
+ */
+ mb();
+ if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) {
+ /*
+ * This will cause message queue rescan to possibly
+ * deliver another msg from the hypervisor
+ */
+ wrmsr(MSR_HV_EOM, 0);
+ }
+ }
+}
+
+static __inline int
+vmbus_handle_intr1(struct vmbus_softc *sc, struct trapframe *frame, int cpu)
+{
+ volatile struct vmbus_message *msg;
+ struct vmbus_message *msg_base;
+
+ msg_base = VMBUS_PCPU_GET(sc, message, cpu);
+
+ /*
+ * Check event timer.
+ *
+ * TODO: move this to independent IDT vector.
+ */
+ msg = msg_base + VMBUS_SINT_TIMER;
+ if (msg->msg_type == HYPERV_MSGTYPE_TIMER_EXPIRED) {
+ msg->msg_type = HYPERV_MSGTYPE_NONE;
+
+ vmbus_et_intr(frame);
+
+ /*
+ * Make sure the write to msg_type (i.e. set to
+ * HYPERV_MSGTYPE_NONE) happens before we read the
+ * msg_flags and EOMing. Otherwise, the EOMing will
+ * not deliver any more messages since there is no
+ * empty slot
+ *
+ * NOTE:
+ * mb() is used here, since atomic_thread_fence_seq_cst()
+ * will become compiler fence on UP kernel.
+ */
+ mb();
+ if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) {
+ /*
+ * This will cause message queue rescan to possibly
+ * deliver another msg from the hypervisor
+ */
+ wrmsr(MSR_HV_EOM, 0);
+ }
+ }
+
+ /*
+ * Check events. Hot path for network and storage I/O data; high rate.
+ *
+ * NOTE:
+ * As recommended by the Windows guest fellows, we check events before
+ * checking messages.
+ */
+ sc->vmbus_event_proc(sc, cpu);
+
+ /*
+	 * Check messages.  Mainly management stuff; ultra low rate.
+ */
+ msg = msg_base + VMBUS_SINT_MESSAGE;
+ if (__predict_false(msg->msg_type != HYPERV_MSGTYPE_NONE)) {
+ taskqueue_enqueue(VMBUS_PCPU_GET(sc, message_tq, cpu),
+ VMBUS_PCPU_PTR(sc, message_task, cpu));
+ }
+
+ return (FILTER_HANDLED);
+}
+
+void
+vmbus_handle_intr(struct trapframe *trap_frame)
+{
+ struct vmbus_softc *sc = vmbus_get_softc();
+ int cpu = curcpu;
+
+ /*
+ * Disable preemption.
+ */
+ critical_enter();
+
+ /*
+ * Do a little interrupt counting.
+ */
+ (*VMBUS_PCPU_GET(sc, intr_cnt, cpu))++;
+
+ vmbus_handle_intr1(sc, trap_frame, cpu);
+
+ /*
+ * Enable preemption.
+ */
+ critical_exit();
+}
+
+static void
+vmbus_synic_setup(void *xsc)
+{
+ struct vmbus_softc *sc = xsc;
+ int cpu = curcpu;
+ uint64_t val, orig;
+ uint32_t sint;
+
+ if (hyperv_features & CPUID_HV_MSR_VP_INDEX) {
+ /* Save virtual processor id. */
+ VMBUS_PCPU_GET(sc, vcpuid, cpu) = rdmsr(MSR_HV_VP_INDEX);
+ } else {
+ /* Set virtual processor id to 0 for compatibility. */
+ VMBUS_PCPU_GET(sc, vcpuid, cpu) = 0;
+ }
+
+ /*
+ * Setup the SynIC message.
+ */
+ orig = rdmsr(MSR_HV_SIMP);
+ val = MSR_HV_SIMP_ENABLE | (orig & MSR_HV_SIMP_RSVD_MASK) |
+ ((VMBUS_PCPU_GET(sc, message_dma.hv_paddr, cpu) >> PAGE_SHIFT) <<
+ MSR_HV_SIMP_PGSHIFT);
+ wrmsr(MSR_HV_SIMP, val);
+
+ /*
+ * Setup the SynIC event flags.
+ */
+ orig = rdmsr(MSR_HV_SIEFP);
+ val = MSR_HV_SIEFP_ENABLE | (orig & MSR_HV_SIEFP_RSVD_MASK) |
+ ((VMBUS_PCPU_GET(sc, event_flags_dma.hv_paddr, cpu)
+ >> PAGE_SHIFT) << MSR_HV_SIEFP_PGSHIFT);
+ wrmsr(MSR_HV_SIEFP, val);
+
+ /*
+ * Configure and unmask SINT for message and event flags.
+ */
+ sint = MSR_HV_SINT0 + VMBUS_SINT_MESSAGE;
+ orig = rdmsr(sint);
+ val = sc->vmbus_idtvec | MSR_HV_SINT_AUTOEOI |
+ (orig & MSR_HV_SINT_RSVD_MASK);
+ wrmsr(sint, val);
+
+ /*
+ * Configure and unmask SINT for timer.
+ */
+ sint = MSR_HV_SINT0 + VMBUS_SINT_TIMER;
+ orig = rdmsr(sint);
+ val = sc->vmbus_idtvec | MSR_HV_SINT_AUTOEOI |
+ (orig & MSR_HV_SINT_RSVD_MASK);
+ wrmsr(sint, val);
+
+ /*
+ * All done; enable SynIC.
+ */
+ orig = rdmsr(MSR_HV_SCONTROL);
+ val = MSR_HV_SCTRL_ENABLE | (orig & MSR_HV_SCTRL_RSVD_MASK);
+ wrmsr(MSR_HV_SCONTROL, val);
+}
+
+static void
+vmbus_synic_teardown(void *arg)
+{
+ uint64_t orig;
+ uint32_t sint;
+
+ /*
+ * Disable SynIC.
+ */
+ orig = rdmsr(MSR_HV_SCONTROL);
+ wrmsr(MSR_HV_SCONTROL, (orig & MSR_HV_SCTRL_RSVD_MASK));
+
+ /*
+ * Mask message and event flags SINT.
+ */
+ sint = MSR_HV_SINT0 + VMBUS_SINT_MESSAGE;
+ orig = rdmsr(sint);
+ wrmsr(sint, orig | MSR_HV_SINT_MASKED);
+
+ /*
+ * Mask timer SINT.
+ */
+ sint = MSR_HV_SINT0 + VMBUS_SINT_TIMER;
+ orig = rdmsr(sint);
+ wrmsr(sint, orig | MSR_HV_SINT_MASKED);
+
+ /*
+ * Teardown SynIC message.
+ */
+ orig = rdmsr(MSR_HV_SIMP);
+ wrmsr(MSR_HV_SIMP, (orig & MSR_HV_SIMP_RSVD_MASK));
+
+ /*
+ * Teardown SynIC event flags.
+ */
+ orig = rdmsr(MSR_HV_SIEFP);
+ wrmsr(MSR_HV_SIEFP, (orig & MSR_HV_SIEFP_RSVD_MASK));
+}
+
+static int
+vmbus_dma_alloc(struct vmbus_softc *sc)
+{
+ bus_dma_tag_t parent_dtag;
+ uint8_t *evtflags;
+ int cpu;
+
+ parent_dtag = bus_get_dma_tag(sc->vmbus_dev);
+ CPU_FOREACH(cpu) {
+ void *ptr;
+
+ /*
+ * Per-cpu messages and event flags.
+ */
+ ptr = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0,
+ PAGE_SIZE, VMBUS_PCPU_PTR(sc, message_dma, cpu),
+ BUS_DMA_WAITOK | BUS_DMA_ZERO);
+ if (ptr == NULL)
+ return ENOMEM;
+ VMBUS_PCPU_GET(sc, message, cpu) = ptr;
+
+ ptr = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0,
+ PAGE_SIZE, VMBUS_PCPU_PTR(sc, event_flags_dma, cpu),
+ BUS_DMA_WAITOK | BUS_DMA_ZERO);
+ if (ptr == NULL)
+ return ENOMEM;
+ VMBUS_PCPU_GET(sc, event_flags, cpu) = ptr;
+ }
+
+ evtflags = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0,
+ PAGE_SIZE, &sc->vmbus_evtflags_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO);
+ if (evtflags == NULL)
+ return ENOMEM;
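+	/*
+	 * The event flags page is split in half: the first half is used
+	 * as the RX event flags (events from the host), the second half
+	 * as the TX event flags (events to the host).
+	 */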
+ sc->vmbus_rx_evtflags = (u_long *)evtflags;
+ sc->vmbus_tx_evtflags = (u_long *)(evtflags + (PAGE_SIZE / 2));
+ sc->vmbus_evtflags = evtflags;
+
+ sc->vmbus_mnf1 = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0,
+ PAGE_SIZE, &sc->vmbus_mnf1_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO);
+ if (sc->vmbus_mnf1 == NULL)
+ return ENOMEM;
+
+ sc->vmbus_mnf2 = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0,
+ sizeof(struct vmbus_mnf), &sc->vmbus_mnf2_dma,
+ BUS_DMA_WAITOK | BUS_DMA_ZERO);
+ if (sc->vmbus_mnf2 == NULL)
+ return ENOMEM;
+
+ return 0;
+}
+
+static void
+vmbus_dma_free(struct vmbus_softc *sc)
+{
+ int cpu;
+
+ if (sc->vmbus_evtflags != NULL) {
+ hyperv_dmamem_free(&sc->vmbus_evtflags_dma, sc->vmbus_evtflags);
+ sc->vmbus_evtflags = NULL;
+ sc->vmbus_rx_evtflags = NULL;
+ sc->vmbus_tx_evtflags = NULL;
+ }
+ if (sc->vmbus_mnf1 != NULL) {
+ hyperv_dmamem_free(&sc->vmbus_mnf1_dma, sc->vmbus_mnf1);
+ sc->vmbus_mnf1 = NULL;
+ }
+ if (sc->vmbus_mnf2 != NULL) {
+ hyperv_dmamem_free(&sc->vmbus_mnf2_dma, sc->vmbus_mnf2);
+ sc->vmbus_mnf2 = NULL;
+ }
+
+ CPU_FOREACH(cpu) {
+ if (VMBUS_PCPU_GET(sc, message, cpu) != NULL) {
+ hyperv_dmamem_free(
+ VMBUS_PCPU_PTR(sc, message_dma, cpu),
+ VMBUS_PCPU_GET(sc, message, cpu));
+ VMBUS_PCPU_GET(sc, message, cpu) = NULL;
+ }
+ if (VMBUS_PCPU_GET(sc, event_flags, cpu) != NULL) {
+ hyperv_dmamem_free(
+ VMBUS_PCPU_PTR(sc, event_flags_dma, cpu),
+ VMBUS_PCPU_GET(sc, event_flags, cpu));
+ VMBUS_PCPU_GET(sc, event_flags, cpu) = NULL;
+ }
+ }
+}
+
+static int
+vmbus_intr_setup(struct vmbus_softc *sc)
+{
+ int cpu;
+
+ CPU_FOREACH(cpu) {
+ char buf[MAXCOMLEN + 1];
+ cpuset_t cpu_mask;
+
+ /* Allocate an interrupt counter for Hyper-V interrupt */
+ snprintf(buf, sizeof(buf), "cpu%d:hyperv", cpu);
+ intrcnt_add(buf, VMBUS_PCPU_PTR(sc, intr_cnt, cpu));
+
+ /*
+ * Setup taskqueue to handle events. Task will be per-
+ * channel.
+ */
+ VMBUS_PCPU_GET(sc, event_tq, cpu) = taskqueue_create_fast(
+ "hyperv event", M_WAITOK, taskqueue_thread_enqueue,
+ VMBUS_PCPU_PTR(sc, event_tq, cpu));
+ if (vmbus_pin_evttask) {
+ CPU_SETOF(cpu, &cpu_mask);
+ taskqueue_start_threads_cpuset(
+ VMBUS_PCPU_PTR(sc, event_tq, cpu), 1, PI_NET,
+ &cpu_mask, "hvevent%d", cpu);
+ } else {
+ taskqueue_start_threads(
+ VMBUS_PCPU_PTR(sc, event_tq, cpu), 1, PI_NET,
+ "hvevent%d", cpu);
+ }
+
+ /*
+ * Setup tasks and taskqueues to handle messages.
+ */
+ VMBUS_PCPU_GET(sc, message_tq, cpu) = taskqueue_create_fast(
+ "hyperv msg", M_WAITOK, taskqueue_thread_enqueue,
+ VMBUS_PCPU_PTR(sc, message_tq, cpu));
+ CPU_SETOF(cpu, &cpu_mask);
+ taskqueue_start_threads_cpuset(
+ VMBUS_PCPU_PTR(sc, message_tq, cpu), 1, PI_NET, &cpu_mask,
+ "hvmsg%d", cpu);
+ TASK_INIT(VMBUS_PCPU_PTR(sc, message_task, cpu), 0,
+ vmbus_msg_task, sc);
+ }
+
+#if defined(__amd64__) && defined(KLD_MODULE)
+ pmap_pti_add_kva(VMBUS_ISR_ADDR, VMBUS_ISR_ADDR + PAGE_SIZE, true);
+#endif
+
+ /*
+	 * All resources required by the Hyper-V ISR are set up; now find
+	 * a free IDT vector for the Hyper-V ISR and set it up.
+ */
+ sc->vmbus_idtvec = lapic_ipi_alloc(pti ? IDTVEC(vmbus_isr_pti) :
+ IDTVEC(vmbus_isr));
+ if (sc->vmbus_idtvec < 0) {
+#if defined(__amd64__) && defined(KLD_MODULE)
+ pmap_pti_remove_kva(VMBUS_ISR_ADDR, VMBUS_ISR_ADDR + PAGE_SIZE);
+#endif
+ device_printf(sc->vmbus_dev, "cannot find free IDT vector\n");
+ return ENXIO;
+ }
+ if (bootverbose) {
+ device_printf(sc->vmbus_dev, "vmbus IDT vector %d\n",
+ sc->vmbus_idtvec);
+ }
+ return 0;
+}
+
+static void
+vmbus_intr_teardown(struct vmbus_softc *sc)
+{
+ int cpu;
+
+ if (sc->vmbus_idtvec >= 0) {
+ lapic_ipi_free(sc->vmbus_idtvec);
+ sc->vmbus_idtvec = -1;
+ }
+
+#if defined(__amd64__) && defined(KLD_MODULE)
+ pmap_pti_remove_kva(VMBUS_ISR_ADDR, VMBUS_ISR_ADDR + PAGE_SIZE);
+#endif
+
+ CPU_FOREACH(cpu) {
+ if (VMBUS_PCPU_GET(sc, event_tq, cpu) != NULL) {
+ taskqueue_free(VMBUS_PCPU_GET(sc, event_tq, cpu));
+ VMBUS_PCPU_GET(sc, event_tq, cpu) = NULL;
+ }
+ if (VMBUS_PCPU_GET(sc, message_tq, cpu) != NULL) {
+ taskqueue_drain(VMBUS_PCPU_GET(sc, message_tq, cpu),
+ VMBUS_PCPU_PTR(sc, message_task, cpu));
+ taskqueue_free(VMBUS_PCPU_GET(sc, message_tq, cpu));
+ VMBUS_PCPU_GET(sc, message_tq, cpu) = NULL;
+ }
+ }
+}
+
+static int
+vmbus_read_ivar(device_t dev, device_t child, int index, uintptr_t *result)
+{
+ return (ENOENT);
+}
+
+static int
+vmbus_child_pnpinfo(device_t dev, device_t child, struct sbuf *sb)
+{
+ const struct vmbus_channel *chan;
+ char guidbuf[HYPERV_GUID_STRLEN];
+
+ chan = vmbus_get_channel(child);
+ if (chan == NULL) {
+ /* Event timer device, which does not belong to a channel */
+ return (0);
+ }
+
+ hyperv_guid2str(&chan->ch_guid_type, guidbuf, sizeof(guidbuf));
+ sbuf_printf(sb, "classid=%s", guidbuf);
+
+ hyperv_guid2str(&chan->ch_guid_inst, guidbuf, sizeof(guidbuf));
+ sbuf_printf(sb, " deviceid=%s", guidbuf);
+
+ return (0);
+}
+
+int
+vmbus_add_child(struct vmbus_channel *chan)
+{
+ struct vmbus_softc *sc = chan->ch_vmbus;
+ device_t parent = sc->vmbus_dev;
+
+ mtx_lock(&Giant);
+
+ chan->ch_dev = device_add_child(parent, NULL, -1);
+ if (chan->ch_dev == NULL) {
+ mtx_unlock(&Giant);
+ device_printf(parent, "device_add_child for chan%u failed\n",
+ chan->ch_id);
+ return (ENXIO);
+ }
+ device_set_ivars(chan->ch_dev, chan);
+ device_probe_and_attach(chan->ch_dev);
+
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+int
+vmbus_delete_child(struct vmbus_channel *chan)
+{
+ int error = 0;
+
+ mtx_lock(&Giant);
+ if (chan->ch_dev != NULL) {
+ error = device_delete_child(chan->ch_vmbus->vmbus_dev,
+ chan->ch_dev);
+ chan->ch_dev = NULL;
+ }
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+static int
+vmbus_sysctl_version(SYSCTL_HANDLER_ARGS)
+{
+ struct vmbus_softc *sc = arg1;
+ char verstr[16];
+
+ snprintf(verstr, sizeof(verstr), "%u.%u",
+ VMBUS_VERSION_MAJOR(sc->vmbus_version),
+ VMBUS_VERSION_MINOR(sc->vmbus_version));
+ return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
+}
+
+/*
+ * We need this function to make sure the MMIO resource is allocated
+ * from the ranges found in _CRS.
+ *
+ * For the release function, we can use bus_generic_release_resource().
+ */
+static struct resource *
+vmbus_alloc_resource(device_t dev, device_t child, int type, int *rid,
+ rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
+{
+ device_t parent = device_get_parent(dev);
+ struct resource *res;
+
+#ifdef NEW_PCIB
+ if (type == SYS_RES_MEMORY) {
+ struct vmbus_softc *sc = device_get_softc(dev);
+
+ res = pcib_host_res_alloc(&sc->vmbus_mmio_res, child, type,
+ rid, start, end, count, flags);
+ } else
+#endif
+ {
+ res = BUS_ALLOC_RESOURCE(parent, child, type, rid, start,
+ end, count, flags);
+ }
+
+ return (res);
+}
+
+static int
+vmbus_alloc_msi(device_t bus, device_t dev, int count, int maxcount, int *irqs)
+{
+
+ return (PCIB_ALLOC_MSI(device_get_parent(bus), dev, count, maxcount,
+ irqs));
+}
+
+static int
+vmbus_release_msi(device_t bus, device_t dev, int count, int *irqs)
+{
+
+ return (PCIB_RELEASE_MSI(device_get_parent(bus), dev, count, irqs));
+}
+
+static int
+vmbus_alloc_msix(device_t bus, device_t dev, int *irq)
+{
+
+ return (PCIB_ALLOC_MSIX(device_get_parent(bus), dev, irq));
+}
+
+static int
+vmbus_release_msix(device_t bus, device_t dev, int irq)
+{
+
+ return (PCIB_RELEASE_MSIX(device_get_parent(bus), dev, irq));
+}
+
+static int
+vmbus_map_msi(device_t bus, device_t dev, int irq, uint64_t *addr,
+ uint32_t *data)
+{
+
+ return (PCIB_MAP_MSI(device_get_parent(bus), dev, irq, addr, data));
+}
+
+static uint32_t
+vmbus_get_version_method(device_t bus, device_t dev)
+{
+ struct vmbus_softc *sc = device_get_softc(bus);
+
+ return sc->vmbus_version;
+}
+
+static int
+vmbus_probe_guid_method(device_t bus, device_t dev,
+ const struct hyperv_guid *guid)
+{
+ const struct vmbus_channel *chan = vmbus_get_channel(dev);
+
+ if (memcmp(&chan->ch_guid_type, guid, sizeof(struct hyperv_guid)) == 0)
+ return 0;
+ return ENXIO;
+}
+
+static uint32_t
+vmbus_get_vcpu_id_method(device_t bus, device_t dev, int cpu)
+{
+ const struct vmbus_softc *sc = device_get_softc(bus);
+
+ return (VMBUS_PCPU_GET(sc, vcpuid, cpu));
+}
+
+static struct taskqueue *
+vmbus_get_eventtq_method(device_t bus, device_t dev __unused, int cpu)
+{
+ const struct vmbus_softc *sc = device_get_softc(bus);
+
+ KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu%d", cpu));
+ return (VMBUS_PCPU_GET(sc, event_tq, cpu));
+}
+
+#ifdef NEW_PCIB
+#define VTPM_BASE_ADDR 0xfed40000
+#define FOUR_GB (1ULL << 32)
+
+enum parse_pass { parse_64, parse_32 };
+
+struct parse_context {
+ device_t vmbus_dev;
+ enum parse_pass pass;
+};
+
+static ACPI_STATUS
+parse_crs(ACPI_RESOURCE *res, void *ctx)
+{
+ const struct parse_context *pc = ctx;
+ device_t vmbus_dev = pc->vmbus_dev;
+
+ struct vmbus_softc *sc = device_get_softc(vmbus_dev);
+ UINT64 start, end;
+
+ switch (res->Type) {
+ case ACPI_RESOURCE_TYPE_ADDRESS32:
+ start = res->Data.Address32.Address.Minimum;
+ end = res->Data.Address32.Address.Maximum;
+ break;
+
+ case ACPI_RESOURCE_TYPE_ADDRESS64:
+ start = res->Data.Address64.Address.Minimum;
+ end = res->Data.Address64.Address.Maximum;
+ break;
+
+ default:
+ /* Unused types. */
+ return (AE_OK);
+ }
+
+ /*
+ * We don't use <1MB addresses.
+ */
+ if (end < 0x100000)
+ return (AE_OK);
+
+ /* Don't conflict with vTPM. */
+ if (end >= VTPM_BASE_ADDR && start < VTPM_BASE_ADDR)
+ end = VTPM_BASE_ADDR - 1;
+
+ if ((pc->pass == parse_32 && start < FOUR_GB) ||
+ (pc->pass == parse_64 && start >= FOUR_GB))
+ pcib_host_res_decodes(&sc->vmbus_mmio_res, SYS_RES_MEMORY,
+ start, end, 0);
+
+ return (AE_OK);
+}
+
+static void
+vmbus_get_crs(device_t dev, device_t vmbus_dev, enum parse_pass pass)
+{
+ struct parse_context pc;
+ ACPI_STATUS status;
+
+ if (bootverbose)
+ device_printf(dev, "walking _CRS, pass=%d\n", pass);
+
+ pc.vmbus_dev = vmbus_dev;
+ pc.pass = pass;
+ status = AcpiWalkResources(acpi_get_handle(dev), "_CRS",
+ parse_crs, &pc);
+
+ if (bootverbose && ACPI_FAILURE(status))
+ device_printf(dev, "_CRS: not found, pass=%d\n", pass);
+}
+
+static void
+vmbus_get_mmio_res_pass(device_t dev, enum parse_pass pass)
+{
+ device_t acpi0, parent;
+
+ parent = device_get_parent(dev);
+
+ acpi0 = device_get_parent(parent);
+ if (strcmp("acpi0", device_get_nameunit(acpi0)) == 0) {
+ device_t *children;
+ int count;
+
+ /*
+ * Try to locate VMBUS resources and find _CRS on them.
+ */
+ if (device_get_children(acpi0, &children, &count) == 0) {
+ int i;
+
+ for (i = 0; i < count; ++i) {
+ if (!device_is_attached(children[i]))
+ continue;
+
+ if (strcmp("vmbus_res",
+ device_get_name(children[i])) == 0)
+ vmbus_get_crs(children[i], dev, pass);
+ }
+ free(children, M_TEMP);
+ }
+
+ /*
+ * Try to find _CRS on acpi.
+ */
+ vmbus_get_crs(acpi0, dev, pass);
+ } else {
+ device_printf(dev, "not grandchild of acpi\n");
+ }
+
+ /*
+ * Try to find _CRS on parent.
+ */
+ vmbus_get_crs(parent, dev, pass);
+}
+
+static void
+vmbus_get_mmio_res(device_t dev)
+{
+ struct vmbus_softc *sc = device_get_softc(dev);
+ /*
+	 * We walk the resources twice so that, in the resource list, the
+	 * 32-bit resources appear after the 64-bit resources.
+	 * NB: resource_list_add() uses INSERT_TAIL.  This way, when we
+	 * iterate through the list to find a range for a 64-bit BAR in
+	 * vmbus_alloc_resource(), we make sure the >4GB ranges are tried
+	 * first.
+ */
+ pcib_host_res_init(dev, &sc->vmbus_mmio_res);
+
+ vmbus_get_mmio_res_pass(dev, parse_64);
+ vmbus_get_mmio_res_pass(dev, parse_32);
+}
+
+/*
+ * On Gen2 VMs, Hyper-V provides mmio space for the framebuffer.
+ * This mmio address range is not usable by other PCI devices.
+ * Currently only the efifb and vbefb drivers use this range, without
+ * reserving it from the system.
+ * Therefore, the vmbus driver reserves it before any other PCI device
+ * drivers start to request mmio addresses.
+ */
+static struct resource *hv_fb_res;
+
+static void
+vmbus_fb_mmio_res(device_t dev)
+{
+ struct efi_fb *efifb;
+ struct vbe_fb *vbefb;
+ rman_res_t fb_start, fb_end, fb_count;
+ int fb_height, fb_width;
+ caddr_t kmdp;
+
+ struct vmbus_softc *sc = device_get_softc(dev);
+ int rid = 0;
+
+ kmdp = preload_search_by_type("elf kernel");
+ if (kmdp == NULL)
+ kmdp = preload_search_by_type("elf64 kernel");
+ efifb = (struct efi_fb *)preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_EFI_FB);
+ vbefb = (struct vbe_fb *)preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_VBE_FB);
+ if (efifb != NULL) {
+ fb_start = efifb->fb_addr;
+ fb_end = efifb->fb_addr + efifb->fb_size;
+ fb_count = efifb->fb_size;
+ fb_height = efifb->fb_height;
+ fb_width = efifb->fb_width;
+ } else if (vbefb != NULL) {
+ fb_start = vbefb->fb_addr;
+ fb_end = vbefb->fb_addr + vbefb->fb_size;
+ fb_count = vbefb->fb_size;
+ fb_height = vbefb->fb_height;
+ fb_width = vbefb->fb_width;
+ } else {
+ if (bootverbose)
+ device_printf(dev,
+ "no preloaded kernel fb information\n");
+ /* We are on Gen1 VM, just return. */
+ return;
+ }
+
+ if (bootverbose)
+ device_printf(dev,
+ "fb: fb_addr: %#jx, size: %#jx, "
+ "actual size needed: 0x%x\n",
+ fb_start, fb_count, fb_height * fb_width);
+
+ hv_fb_res = pcib_host_res_alloc(&sc->vmbus_mmio_res, dev,
+ SYS_RES_MEMORY, &rid, fb_start, fb_end, fb_count,
+ RF_ACTIVE | rman_make_alignment_flags(PAGE_SIZE));
+
+ if (hv_fb_res && bootverbose)
+ device_printf(dev,
+ "successfully reserved memory for framebuffer "
+ "starting at %#jx, size %#jx\n",
+ fb_start, fb_count);
+}
+
+static void
+vmbus_free_mmio_res(device_t dev)
+{
+ struct vmbus_softc *sc = device_get_softc(dev);
+
+ pcib_host_res_free(dev, &sc->vmbus_mmio_res);
+
+ if (hv_fb_res)
+ hv_fb_res = NULL;
+}
+#endif /* NEW_PCIB */
+
+static void
+vmbus_identify(driver_t *driver, device_t parent)
+{
+
+ if (device_get_unit(parent) != 0 || vm_guest != VM_GUEST_HV ||
+ (hyperv_features & CPUID_HV_MSR_SYNIC) == 0)
+ return;
+ device_add_child(parent, "vmbus", -1);
+}
+
+static int
+vmbus_probe(device_t dev)
+{
+
+ if (device_get_unit(dev) != 0 || vm_guest != VM_GUEST_HV ||
+ (hyperv_features & CPUID_HV_MSR_SYNIC) == 0)
+ return (ENXIO);
+
+ device_set_desc(dev, "Hyper-V Vmbus");
+ return (BUS_PROBE_DEFAULT);
+}
+
+/**
+ * @brief Main vmbus driver initialization routine.
+ *
+ * Here, we
+ * - initialize the vmbus driver context
+ * - setup various driver entry points
+ * - invoke the vmbus hv main init routine
+ * - get the irq resource
+ * - invoke the vmbus to add the vmbus root device
+ * - setup the vmbus root device
+ * - retrieve the channel offers
+ */
+static int
+vmbus_doattach(struct vmbus_softc *sc)
+{
+ struct sysctl_oid_list *child;
+ struct sysctl_ctx_list *ctx;
+ int ret;
+
+ if (sc->vmbus_flags & VMBUS_FLAG_ATTACHED)
+ return (0);
+
+#ifdef NEW_PCIB
+ vmbus_get_mmio_res(sc->vmbus_dev);
+ vmbus_fb_mmio_res(sc->vmbus_dev);
+#endif
+
+ sc->vmbus_flags |= VMBUS_FLAG_ATTACHED;
+
+ sc->vmbus_gpadl = VMBUS_GPADL_START;
+ mtx_init(&sc->vmbus_prichan_lock, "vmbus prichan", NULL, MTX_DEF);
+ TAILQ_INIT(&sc->vmbus_prichans);
+ mtx_init(&sc->vmbus_chan_lock, "vmbus channel", NULL, MTX_DEF);
+ TAILQ_INIT(&sc->vmbus_chans);
+ sc->vmbus_chmap = malloc(
+ sizeof(struct vmbus_channel *) * VMBUS_CHAN_MAX, M_DEVBUF,
+ M_WAITOK | M_ZERO);
+
+ /*
+ * Create context for "post message" Hypercalls
+ */
+ sc->vmbus_xc = vmbus_xact_ctx_create(bus_get_dma_tag(sc->vmbus_dev),
+ HYPERCALL_POSTMSGIN_SIZE, VMBUS_MSG_SIZE,
+ sizeof(struct vmbus_msghc));
+ if (sc->vmbus_xc == NULL) {
+ ret = ENXIO;
+ goto cleanup;
+ }
+
+ /*
+	 * Allocate DMA memory.
+ */
+ ret = vmbus_dma_alloc(sc);
+ if (ret != 0)
+ goto cleanup;
+
+ /*
+ * Setup interrupt.
+ */
+ ret = vmbus_intr_setup(sc);
+ if (ret != 0)
+ goto cleanup;
+
+ /*
+ * Setup SynIC.
+ */
+ if (bootverbose)
+ device_printf(sc->vmbus_dev, "smp_started = %d\n", smp_started);
+ smp_rendezvous(NULL, vmbus_synic_setup, NULL, sc);
+ sc->vmbus_flags |= VMBUS_FLAG_SYNIC;
+
+ /*
+ * Initialize vmbus, e.g. connect to Hypervisor.
+ */
+ ret = vmbus_init(sc);
+ if (ret != 0)
+ goto cleanup;
+
+ if (sc->vmbus_version == VMBUS_VERSION_WS2008 ||
+ sc->vmbus_version == VMBUS_VERSION_WIN7)
+ sc->vmbus_event_proc = vmbus_event_proc_compat;
+ else
+ sc->vmbus_event_proc = vmbus_event_proc;
+
+ ret = vmbus_scan(sc);
+ if (ret != 0)
+ goto cleanup;
+
+ ctx = device_get_sysctl_ctx(sc->vmbus_dev);
+ child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->vmbus_dev));
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "version",
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ vmbus_sysctl_version, "A", "vmbus version");
+
+ return (ret);
+
+cleanup:
+ vmbus_scan_teardown(sc);
+ vmbus_intr_teardown(sc);
+ vmbus_dma_free(sc);
+ if (sc->vmbus_xc != NULL) {
+ vmbus_xact_ctx_destroy(sc->vmbus_xc);
+ sc->vmbus_xc = NULL;
+ }
+ free(__DEVOLATILE(void *, sc->vmbus_chmap), M_DEVBUF);
+ mtx_destroy(&sc->vmbus_prichan_lock);
+ mtx_destroy(&sc->vmbus_chan_lock);
+
+ return (ret);
+}
+
+static void
+vmbus_event_proc_dummy(struct vmbus_softc *sc __unused, int cpu __unused)
+{
+}
+
+#ifdef EARLY_AP_STARTUP
+
+static void
+vmbus_intrhook(void *xsc)
+{
+ struct vmbus_softc *sc = xsc;
+
+ if (bootverbose)
+ device_printf(sc->vmbus_dev, "intrhook\n");
+ vmbus_doattach(sc);
+ config_intrhook_disestablish(&sc->vmbus_intrhook);
+}
+
+#endif /* EARLY_AP_STARTUP */
+
+static int
+vmbus_attach(device_t dev)
+{
+ vmbus_sc = device_get_softc(dev);
+ vmbus_sc->vmbus_dev = dev;
+ vmbus_sc->vmbus_idtvec = -1;
+
+ /*
+ * Event processing logic will be configured:
+ * - After the vmbus protocol version negotiation.
+ * - Before we request channel offers.
+ */
+ vmbus_sc->vmbus_event_proc = vmbus_event_proc_dummy;
+
+#ifdef EARLY_AP_STARTUP
+ /*
+	 * Defer the real attach until pause(9) works as expected.
+ */
+ vmbus_sc->vmbus_intrhook.ich_func = vmbus_intrhook;
+ vmbus_sc->vmbus_intrhook.ich_arg = vmbus_sc;
+ config_intrhook_establish(&vmbus_sc->vmbus_intrhook);
+#else /* !EARLY_AP_STARTUP */
+ /*
+	 * If the system has already booted and thread
+	 * scheduling is possible, as indicated by the
+	 * global cold being set to zero, just call the
+	 * driver initialization directly.
+ */
+ if (!cold)
+ vmbus_doattach(vmbus_sc);
+#endif /* EARLY_AP_STARTUP */
+
+ return (0);
+}
+
+static int
+vmbus_detach(device_t dev)
+{
+ struct vmbus_softc *sc = device_get_softc(dev);
+
+ bus_generic_detach(dev);
+ vmbus_chan_destroy_all(sc);
+
+ vmbus_scan_teardown(sc);
+
+ vmbus_disconnect(sc);
+
+ if (sc->vmbus_flags & VMBUS_FLAG_SYNIC) {
+ sc->vmbus_flags &= ~VMBUS_FLAG_SYNIC;
+ smp_rendezvous(NULL, vmbus_synic_teardown, NULL, NULL);
+ }
+
+ vmbus_intr_teardown(sc);
+ vmbus_dma_free(sc);
+
+ if (sc->vmbus_xc != NULL) {
+ vmbus_xact_ctx_destroy(sc->vmbus_xc);
+ sc->vmbus_xc = NULL;
+ }
+
+ free(__DEVOLATILE(void *, sc->vmbus_chmap), M_DEVBUF);
+ mtx_destroy(&sc->vmbus_prichan_lock);
+ mtx_destroy(&sc->vmbus_chan_lock);
+
+#ifdef NEW_PCIB
+ vmbus_free_mmio_res(dev);
+#endif
+
+ return (0);
+}
+
+#ifndef EARLY_AP_STARTUP
+
+static void
+vmbus_sysinit(void *arg __unused)
+{
+ struct vmbus_softc *sc = vmbus_get_softc();
+
+ if (vm_guest != VM_GUEST_HV || sc == NULL)
+ return;
+
+ /*
+	 * If the system has already booted and thread
+	 * scheduling is possible, as indicated by the
+	 * global cold being set to zero, just call the
+	 * driver initialization directly.
+ */
+ if (!cold)
+ vmbus_doattach(sc);
+}
+/*
+ * NOTE:
+ * We have to start as the last step of SI_SUB_SMP, i.e. after SMP is
+ * initialized.
+ */
+SYSINIT(vmbus_initialize, SI_SUB_SMP, SI_ORDER_ANY, vmbus_sysinit, NULL);
+
+#endif /* !EARLY_AP_STARTUP */
diff --git a/sys/dev/hyperv/vmbus/vmbus_br.c b/sys/dev/hyperv/vmbus/vmbus_br.c
new file mode 100644
index 000000000000..7311f87fd596
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_br.c
@@ -0,0 +1,720 @@
+/*-
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
+
+#include <dev/hyperv/vmbus/vmbus_reg.h>
+#include <dev/hyperv/vmbus/vmbus_brvar.h>
+
+/* Amount of space available for write */
+#define VMBUS_BR_WAVAIL(r, w, z) \
+ (((w) >= (r)) ? ((z) - ((w) - (r))) : ((r) - (w)))
+
+/* Advance the bufring index */
+#define VMBUS_BR_IDXINC(idx, inc, sz) (((idx) + (inc)) % (sz))
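+
+/*
+ * Example (illustrative): with a 4096-byte data area, rindex 100 and
+ * windex 300, VMBUS_BR_WAVAIL() yields 4096 - 200 = 3896 bytes of
+ * write space; VMBUS_BR_IDXINC(4000, 200, 4096) wraps around to 104.
+ */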
+
+static int vmbus_br_sysctl_state(SYSCTL_HANDLER_ARGS);
+static int vmbus_br_sysctl_state_bin(SYSCTL_HANDLER_ARGS);
+static void vmbus_br_setup(struct vmbus_br *, void *, int);
+
+static int
+vmbus_br_sysctl_state(SYSCTL_HANDLER_ARGS)
+{
+ const struct vmbus_br *br = arg1;
+ uint32_t rindex, windex, imask, psndsz, fvalue, ravail, wavail;
+ uint64_t intrcnt;
+ char state[256];
+
+ intrcnt = br->vbr_intrcnt;
+ rindex = br->vbr_rindex;
+ windex = br->vbr_windex;
+ imask = br->vbr_imask;
+ psndsz = br->vbr_psndsz;
+ fvalue = br->vbr_fvalue;
+ wavail = VMBUS_BR_WAVAIL(rindex, windex, br->vbr_dsize);
+ ravail = br->vbr_dsize - wavail;
+
+ snprintf(state, sizeof(state),
+ "intrcnt:%ju rindex:%u windex:%u imask:%u psndsz:%u fvalue:%u "
+ "ravail:%u wavail:%u",
+ (uintmax_t)intrcnt, rindex, windex, imask, psndsz, fvalue,
+ ravail, wavail);
+ return sysctl_handle_string(oidp, state, sizeof(state), req);
+}
+
+/*
+ * Binary bufring states.
+ */
+static int
+vmbus_br_sysctl_state_bin(SYSCTL_HANDLER_ARGS)
+{
+#define BR_STATE_RIDX 0
+#define BR_STATE_WIDX 1
+#define BR_STATE_IMSK 2
+#define BR_STATE_PSSZ 3
+#define BR_STATE_FVAL 4
+#define BR_STATE_RSPC 5
+#define BR_STATE_WSPC 6
+#define BR_STATE_MAX 7
+
+ const struct vmbus_br *br = arg1;
+ uint32_t rindex, windex, wavail, state[BR_STATE_MAX];
+
+ rindex = br->vbr_rindex;
+ windex = br->vbr_windex;
+ wavail = VMBUS_BR_WAVAIL(rindex, windex, br->vbr_dsize);
+
+ state[BR_STATE_RIDX] = rindex;
+ state[BR_STATE_WIDX] = windex;
+ state[BR_STATE_IMSK] = br->vbr_imask;
+ state[BR_STATE_PSSZ] = br->vbr_psndsz;
+ state[BR_STATE_FVAL] = br->vbr_fvalue;
+ state[BR_STATE_WSPC] = wavail;
+ state[BR_STATE_RSPC] = br->vbr_dsize - wavail;
+
+ return sysctl_handle_opaque(oidp, state, sizeof(state), req);
+}
+
+void
+vmbus_br_sysctl_create(struct sysctl_ctx_list *ctx, struct sysctl_oid *br_tree,
+ struct vmbus_br *br, const char *name)
+{
+ struct sysctl_oid *tree;
+ char desc[64];
+
+ tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(br_tree), OID_AUTO,
+ name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+ if (tree == NULL)
+ return;
+
+ snprintf(desc, sizeof(desc), "%s state", name);
+ SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "state",
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ br, 0, vmbus_br_sysctl_state, "A", desc);
+
+ snprintf(desc, sizeof(desc), "%s binary state", name);
+ SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "state_bin",
+ CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ br, 0, vmbus_br_sysctl_state_bin, "IU", desc);
+}
+
+void
+vmbus_rxbr_intr_mask(struct vmbus_rxbr *rbr)
+{
+ rbr->rxbr_imask = 1;
+ mb();
+}
+
+static __inline uint32_t
+vmbus_rxbr_avail(const struct vmbus_rxbr *rbr)
+{
+ uint32_t rindex, windex;
+
+ /* Get snapshot */
+ rindex = rbr->rxbr_rindex;
+ windex = rbr->rxbr_windex;
+
+ return (rbr->rxbr_dsize -
+ VMBUS_BR_WAVAIL(rindex, windex, rbr->rxbr_dsize));
+}
+
+uint32_t
+vmbus_rxbr_available(const struct vmbus_rxbr *rbr)
+{
+ return (vmbus_rxbr_avail(rbr));
+}
+
+uint32_t
+vmbus_rxbr_intr_unmask(struct vmbus_rxbr *rbr)
+{
+ rbr->rxbr_imask = 0;
+ mb();
+
+ /*
+ * Now check to see if the ring buffer is still empty.
+ * If it is not, we raced and we need to process new
+ * incoming channel packets.
+ */
+ return vmbus_rxbr_avail(rbr);
+}
+
+static void
+vmbus_br_setup(struct vmbus_br *br, void *buf, int blen)
+{
+ br->vbr = buf;
+ br->vbr_dsize = blen - sizeof(struct vmbus_bufring);
+}
+
+void
+vmbus_rxbr_init(struct vmbus_rxbr *rbr)
+{
+ mtx_init(&rbr->rxbr_lock, "vmbus_rxbr", NULL, MTX_SPIN);
+}
+
+void
+vmbus_rxbr_deinit(struct vmbus_rxbr *rbr)
+{
+ mtx_destroy(&rbr->rxbr_lock);
+}
+
+void
+vmbus_rxbr_setup(struct vmbus_rxbr *rbr, void *buf, int blen)
+{
+ vmbus_br_setup(&rbr->rxbr, buf, blen);
+}
+
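+/*
+ * Decide whether the host must be signaled after bytes_read bytes
+ * have been consumed from the RX bufring: signal only when the host
+ * advertised a pending send size (flow control) and this read made
+ * the writable space cross above that threshold.
+ */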
+static __inline boolean_t
+vmbus_rxbr_need_signal(const struct vmbus_rxbr *rbr, uint32_t bytes_read)
+{
+ uint32_t pending_snd_sz, canwrite_size;
+
+ /* No need to signal if host doesn't want us to */
+ if (!rbr->rxbr_fpsndsz)
+ return false;
+
+ mb();
+
+ pending_snd_sz = rbr->rxbr_psndsz;
+ /* No need to signal if host sets pending_snd_sz to 0 */
+ if (!pending_snd_sz)
+ return false;
+
+ mb();
+
+ canwrite_size = rbr->rxbr_dsize - vmbus_rxbr_avail(rbr);
+
+ /* No need to signal if br already has enough space before read */
+ if (canwrite_size - bytes_read > pending_snd_sz)
+ return false;
+
+ /*
+	 * No need to signal if the bufring still does not have as much
+	 * free space as the host asked for.
+ */
+ if (canwrite_size <= pending_snd_sz)
+ return false;
+
+ return true;
+}
+
+void
+vmbus_txbr_init(struct vmbus_txbr *tbr)
+{
+ mtx_init(&tbr->txbr_lock, "vmbus_txbr", NULL, MTX_SPIN);
+}
+
+void
+vmbus_txbr_deinit(struct vmbus_txbr *tbr)
+{
+ mtx_destroy(&tbr->txbr_lock);
+}
+
+void
+vmbus_txbr_setup(struct vmbus_txbr *tbr, void *buf, int blen)
+{
+ vmbus_br_setup(&tbr->txbr, buf, blen);
+
+ /* Set feature bit enabling flow control */
+ tbr->txbr_fpsndsz = 1;
+}
+
+uint32_t
+vmbus_txbr_get_imask(const struct vmbus_txbr *tbr)
+{
+ mb();
+
+	return (tbr->txbr_imask);
+}
+
+void
+vmbus_txbr_set_pending_snd_sz(struct vmbus_txbr *tbr, uint32_t size)
+{
+ tbr->txbr_psndsz = size;
+}
+
+/*
+ * When we write to the ring buffer, check if the host needs to be
+ * signaled.
+ *
+ * The contract:
+ * - The host guarantees that while it is draining the TX bufring,
+ * it will set the br_imask to indicate it does not need to be
+ * interrupted when new data are added.
+ * - The host guarantees that it will completely drain the TX bufring
+ * before exiting the read loop. Further, once the TX bufring is
+ * empty, it will clear the br_imask and re-check to see if new
+ * data have arrived.
+ */
+static __inline boolean_t
+vmbus_txbr_need_signal(const struct vmbus_txbr *tbr, uint32_t old_windex)
+{
+ mb();
+ if (tbr->txbr_imask)
+ return (FALSE);
+
+ __compiler_membar();
+
+ /*
+	 * This is the only case where we need to signal: when the
+	 * ring transitions from empty to non-empty.
+ */
+ if (old_windex == tbr->txbr_rindex)
+ return (TRUE);
+
+ return (FALSE);
+}
+
+static __inline uint32_t
+vmbus_txbr_avail(const struct vmbus_txbr *tbr)
+{
+ uint32_t rindex, windex;
+
+ /* Get snapshot */
+ rindex = tbr->txbr_rindex;
+ windex = tbr->txbr_windex;
+
+ return VMBUS_BR_WAVAIL(rindex, windex, tbr->txbr_dsize);
+}
+
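+/*
+ * Copy cplen bytes from src0 into the TX data area at windex, wrapping
+ * at the end of the data area if needed, and return the new windex.
+ * Illustrative example: with dsize 4096, windex 4000 and cplen 200,
+ * the first 96 bytes land at offset 4000 and the remaining 104 at
+ * offset 0.
+ */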
+static __inline uint32_t
+vmbus_txbr_copyto(const struct vmbus_txbr *tbr, uint32_t windex,
+ const void *src0, uint32_t cplen)
+{
+ const uint8_t *src = src0;
+ uint8_t *br_data = tbr->txbr_data;
+ uint32_t br_dsize = tbr->txbr_dsize;
+
+ if (cplen > br_dsize - windex) {
+ uint32_t fraglen = br_dsize - windex;
+
+ /* Wrap-around detected */
+ memcpy(br_data + windex, src, fraglen);
+ memcpy(br_data, src + fraglen, cplen - fraglen);
+ } else {
+ memcpy(br_data + windex, src, cplen);
+ }
+ return VMBUS_BR_IDXINC(windex, cplen, br_dsize);
+}
+
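+/*
+ * Like vmbus_txbr_copyto(), but the data is produced directly into the
+ * TX data area by the caller-supplied callback (cb); any callback error
+ * is returned through *ret, and the new windex is computed from cplen.
+ */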
+static __inline uint32_t
+vmbus_txbr_copyto_call(const struct vmbus_txbr *tbr, uint32_t windex,
+ uint32_t cplen, vmbus_br_copy_callback_t cb, void *cbarg, int *ret)
+{
+ uint8_t *br_data = tbr->txbr_data;
+ uint32_t br_dsize = tbr->txbr_dsize;
+ int err = 0;
+
+ if (cplen > br_dsize - windex) {
+ uint32_t fraglen = br_dsize - windex;
+
+ /* Wrap-around detected */
+ err = cb((void *)(br_data + windex), fraglen, cbarg);
+ if (!err)
+ err = cb((void *)br_data, cplen - fraglen, cbarg);
+ } else {
+ err = cb((void *)(br_data + windex), cplen, cbarg);
+ }
+
+ *ret = err;
+
+ return VMBUS_BR_IDXINC(windex, cplen, br_dsize);
+}
+
+uint32_t
+vmbus_txbr_available(const struct vmbus_txbr *tbr)
+{
+ return (vmbus_txbr_avail(tbr));
+}
+
+/*
+ * NOTE:
+ * The lock is not held while calling the user-provided callback
+ * routine.  The caller should hold a lock to serialize ring buffer
+ * accesses.
+ */
+int
+vmbus_txbr_write_call(struct vmbus_txbr *tbr,
+ const struct iovec iov[], int iovlen,
+ vmbus_br_copy_callback_t cb, void *cbarg,
+ boolean_t *need_sig)
+{
+ uint32_t old_windex, windex, total;
+ uint64_t save_windex;
+ int i;
+ int cb_ret = 0;
+
+ total = 0;
+ for (i = 0; i < iovlen; i++)
+ total += iov[i].iov_len;
+ total += sizeof(save_windex);
+
+ /*
+ * NOTE:
+	 * If this write would make br_windex equal to br_rindex, i.e.
+	 * the available write space equals the write size, we must not
+	 * do it, since br_windex == br_rindex means that the bufring
+	 * is empty.
+ */
+ if (vmbus_txbr_avail(tbr) <= total) {
+ return (EAGAIN);
+ }
+
+ /* Save br_windex for later use */
+ old_windex = tbr->txbr_windex;
+
+ /*
+ * Copy the scattered channel packet to the TX bufring.
+ */
+ windex = old_windex;
+ for (i = 0; i < iovlen; i++) {
+ if (iov[i].iov_base != NULL) {
+ windex = vmbus_txbr_copyto(tbr, windex,
+ iov[i].iov_base, iov[i].iov_len);
+ } else if (cb != NULL) {
+ windex = vmbus_txbr_copyto_call(tbr, windex,
+ iov[i].iov_len, cb, cbarg, &cb_ret);
+ /*
+ * If callback fails, return without updating
+ * write index.
+ */
+ if (cb_ret)
+ return (cb_ret);
+ }
+ }
+
+ mtx_lock_spin(&tbr->txbr_lock);
+
+ /*
+ * Set the offset of the current channel packet.
+ */
+ save_windex = ((uint64_t)old_windex) << 32;
+ windex = vmbus_txbr_copyto(tbr, windex, &save_windex,
+ sizeof(save_windex));
+
+ /*
+ * Update the write index _after_ the channel packet
+ * is copied.
+ */
+ __compiler_membar();
+ tbr->txbr_windex = windex;
+
+ mtx_unlock_spin(&tbr->txbr_lock);
+
+ if (need_sig)
+ *need_sig = vmbus_txbr_need_signal(tbr, old_windex);
+
+ return (0);
+}
+
+/*
+ * Write a scattered channel packet to the TX bufring.
+ *
+ * The offset of this channel packet is written as a 64-bit value
+ * immediately after the channel packet.
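+ *
+ * Resulting bufring layout after a successful write (sketch):
+ *
+ *   old windex -> | iov[0] | ... | iov[iovlen-1] | old_windex << 32 | <- new windex
+ *                                                  (8-byte trailer)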
+ */
+int
+vmbus_txbr_write(struct vmbus_txbr *tbr, const struct iovec iov[], int iovlen,
+ boolean_t *need_sig)
+{
+ uint32_t old_windex, windex, total;
+ uint64_t save_windex;
+ int i;
+
+ total = 0;
+ for (i = 0; i < iovlen; i++)
+ total += iov[i].iov_len;
+ total += sizeof(save_windex);
+
+ mtx_lock_spin(&tbr->txbr_lock);
+
+ /*
+ * NOTE:
+	 * If this write would make br_windex equal to br_rindex, i.e.
+	 * the available write space equals the write size, we must not
+	 * do it, since br_windex == br_rindex means that the bufring
+	 * is empty.
+ */
+ if (vmbus_txbr_avail(tbr) <= total) {
+ mtx_unlock_spin(&tbr->txbr_lock);
+ return (EAGAIN);
+ }
+
+ /* Save br_windex for later use */
+ old_windex = tbr->txbr_windex;
+
+ /*
+ * Copy the scattered channel packet to the TX bufring.
+ */
+ windex = old_windex;
+ for (i = 0; i < iovlen; i++) {
+ windex = vmbus_txbr_copyto(tbr, windex,
+ iov[i].iov_base, iov[i].iov_len);
+ }
+
+ /*
+ * Set the offset of the current channel packet.
+ */
+ save_windex = ((uint64_t)old_windex) << 32;
+ windex = vmbus_txbr_copyto(tbr, windex, &save_windex,
+ sizeof(save_windex));
+
+ /*
+ * Update the write index _after_ the channel packet
+ * is copied.
+ */
+ __compiler_membar();
+ tbr->txbr_windex = windex;
+
+ mtx_unlock_spin(&tbr->txbr_lock);
+
+ *need_sig = vmbus_txbr_need_signal(tbr, old_windex);
+
+ return (0);
+}
+
+static __inline uint32_t
+vmbus_rxbr_copyfrom(const struct vmbus_rxbr *rbr, uint32_t rindex,
+ void *dst0, int cplen)
+{
+ uint8_t *dst = dst0;
+ const uint8_t *br_data = rbr->rxbr_data;
+ uint32_t br_dsize = rbr->rxbr_dsize;
+
+ if (cplen > br_dsize - rindex) {
+ uint32_t fraglen = br_dsize - rindex;
+
+ /* Wrap-around detected. */
+ memcpy(dst, br_data + rindex, fraglen);
+ memcpy(dst + fraglen, br_data, cplen - fraglen);
+ } else {
+ memcpy(dst, br_data + rindex, cplen);
+ }
+ return VMBUS_BR_IDXINC(rindex, cplen, br_dsize);
+}
+
+static __inline uint32_t
+vmbus_rxbr_copyfrom_call(const struct vmbus_rxbr *rbr, uint32_t rindex,
+ int cplen, vmbus_br_copy_callback_t cb, void *cbarg)
+{
+ uint8_t *br_data = rbr->rxbr_data;
+ uint32_t br_dsize = rbr->rxbr_dsize;
+ int error = 0;
+
+ if (cplen > br_dsize - rindex) {
+ uint32_t fraglen = br_dsize - rindex;
+
+ /* Wrap-around detected. */
+ error = cb((void *)(br_data + rindex), fraglen, cbarg);
+ if (!error)
+ error = cb((void *)br_data, cplen - fraglen, cbarg);
+ } else {
+ error = cb((void *)(br_data + rindex), cplen, cbarg);
+ }
+ return (error);
+}
+
+int
+vmbus_rxbr_peek(struct vmbus_rxbr *rbr, void *data, int dlen)
+{
+ mtx_lock_spin(&rbr->rxbr_lock);
+
+ /*
+	 * At least the requested data plus the 64-bit channel packet
+	 * offset must be available.
+ */
+ if (vmbus_rxbr_avail(rbr) < dlen + sizeof(uint64_t)) {
+ mtx_unlock_spin(&rbr->rxbr_lock);
+ return (EAGAIN);
+ }
+ vmbus_rxbr_copyfrom(rbr, rbr->rxbr_rindex, data, dlen);
+
+ mtx_unlock_spin(&rbr->rxbr_lock);
+
+ return (0);
+}
+
+/*
+ * NOTE:
+ * We only hold the spin lock while checking the ring buffer space;
+ * it is released before calling the user-provided callback routine.
+ * The caller should hold a lock to serialize ring buffer accesses.
+ */
+int
+vmbus_rxbr_peek_call(struct vmbus_rxbr *rbr, int dlen, uint32_t skip,
+ vmbus_br_copy_callback_t cb, void *cbarg)
+{
+ uint32_t rindex, br_dsize0 = rbr->rxbr_dsize;
+ int ret;
+
+ mtx_lock_spin(&rbr->rxbr_lock);
+ /*
+	 * At least the requested data + skip plus the 64-bit channel
+	 * packet offset must be available.
+ */
+ if (vmbus_rxbr_avail(rbr) < skip + dlen + sizeof(uint64_t)) {
+ mtx_unlock_spin(&rbr->rxbr_lock);
+ return (EAGAIN);
+ }
+
+ rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex, skip, br_dsize0);
+ mtx_unlock_spin(&rbr->rxbr_lock);
+
+ ret = vmbus_rxbr_copyfrom_call(rbr, rindex, dlen, cb, cbarg);
+
+ return (ret);
+}
+
+/*
+ * NOTE:
+ * We assume idx_adv == sizeof(channel packet).
+ */
+int
+vmbus_rxbr_idxadv_peek(struct vmbus_rxbr *rbr, void *data, int dlen,
+ uint32_t idx_adv, boolean_t *need_sig)
+{
+ uint32_t rindex, br_dsize = rbr->rxbr_dsize;
+
+ mtx_lock_spin(&rbr->rxbr_lock);
+ /*
+ * Make sure it has enough data to read.
+ */
+ if (vmbus_rxbr_avail(rbr) < idx_adv + sizeof(uint64_t) + dlen) {
+ mtx_unlock_spin(&rbr->rxbr_lock);
+ return (EAGAIN);
+ }
+
+ if (idx_adv > 0) {
+ /*
+ * Advance the read index first, including the channel's 64bit
+ * previous write offset.
+ */
+ rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex,
+ idx_adv + sizeof(uint64_t), br_dsize);
+ __compiler_membar();
+ rbr->rxbr_rindex = rindex;
+ }
+
+ vmbus_rxbr_copyfrom(rbr, rbr->rxbr_rindex, data, dlen);
+
+ mtx_unlock_spin(&rbr->rxbr_lock);
+
+ if (need_sig) {
+ if (idx_adv > 0)
+ *need_sig =
+ vmbus_rxbr_need_signal(rbr, idx_adv +
+ sizeof(uint64_t));
+ else
+ *need_sig = false;
+ }
+
+ return (0);
+}
+
+/*
+ * NOTE:
+ * Just update the RX rb index.
+ */
+int
+vmbus_rxbr_idxadv(struct vmbus_rxbr *rbr, uint32_t idx_adv,
+ boolean_t *need_sig)
+{
+ uint32_t rindex, br_dsize = rbr->rxbr_dsize;
+
+ mtx_lock_spin(&rbr->rxbr_lock);
+ /*
+ * Make sure it has enough space to advance.
+ */
+ if (vmbus_rxbr_avail(rbr) < idx_adv + sizeof(uint64_t)) {
+ mtx_unlock_spin(&rbr->rxbr_lock);
+ return (EAGAIN);
+ }
+
+ /*
+ * Advance the read index, including the channel's 64bit
+ * previous write offset.
+ */
+ rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex,
+ idx_adv + sizeof(uint64_t), br_dsize);
+ __compiler_membar();
+ rbr->rxbr_rindex = rindex;
+
+ mtx_unlock_spin(&rbr->rxbr_lock);
+
+ if (need_sig) {
+ *need_sig =
+ vmbus_rxbr_need_signal(rbr, idx_adv + sizeof(uint64_t));
+ }
+
+ return (0);
+}
+
+/*
+ * NOTE:
+ * We assume (dlen + skip) == sizeof(channel packet).
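+ *
+ * One channel packet is consumed: the skip bytes are stepped over,
+ * dlen bytes are copied out, and the 8-byte write-offset trailer
+ * appended by the TX side is discarded before the read index is
+ * updated.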
+ */
+int
+vmbus_rxbr_read(struct vmbus_rxbr *rbr, void *data, int dlen, uint32_t skip)
+{
+ uint32_t rindex, br_dsize = rbr->rxbr_dsize;
+
+ KASSERT(dlen + skip > 0, ("invalid dlen %d, offset %u", dlen, skip));
+
+ mtx_lock_spin(&rbr->rxbr_lock);
+
+ if (vmbus_rxbr_avail(rbr) < dlen + skip + sizeof(uint64_t)) {
+ mtx_unlock_spin(&rbr->rxbr_lock);
+ return (EAGAIN);
+ }
+
+ /*
+ * Copy channel packet from RX bufring.
+ */
+ rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex, skip, br_dsize);
+ rindex = vmbus_rxbr_copyfrom(rbr, rindex, data, dlen);
+
+ /*
+	 * Discard this channel packet's 64-bit offset, which is useless to us.
+ */
+ rindex = VMBUS_BR_IDXINC(rindex, sizeof(uint64_t), br_dsize);
+
+ /*
+ * Update the read index _after_ the channel packet is fetched.
+ */
+ __compiler_membar();
+ rbr->rxbr_rindex = rindex;
+
+ mtx_unlock_spin(&rbr->rxbr_lock);
+
+ return (0);
+}
diff --git a/sys/dev/hyperv/vmbus/vmbus_brvar.h b/sys/dev/hyperv/vmbus/vmbus_brvar.h
new file mode 100644
index 000000000000..95bf4338ff1c
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_brvar.h
@@ -0,0 +1,157 @@
+/*-
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_BRVAR_H_
+#define _VMBUS_BRVAR_H_
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/_iovec.h>
+
+struct vmbus_br {
+ struct vmbus_bufring *vbr;
+ uint32_t vbr_dsize; /* total data size */
+};
+
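+/*
+ * Shorthand accessors for the fields of the shared vmbus_bufring that
+ * struct vmbus_br points at.
+ */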
+#define vbr_windex vbr->br_windex
+#define vbr_rindex vbr->br_rindex
+#define vbr_imask vbr->br_imask
+#define vbr_psndsz vbr->br_pending_snd_sz
+#define vbr_fpsndsz vbr->br_feature_bits.feat_pending_snd_sz
+#define vbr_fvalue vbr->br_feature_bits.value
+#define vbr_intrcnt vbr->br_g2h_intr_cnt
+#define vbr_data vbr->br_data
+
+struct vmbus_rxbr {
+ struct mtx rxbr_lock;
+ struct vmbus_br rxbr;
+};
+
+#define rxbr_windex rxbr.vbr_windex
+#define rxbr_rindex rxbr.vbr_rindex
+#define rxbr_imask rxbr.vbr_imask
+#define rxbr_psndsz rxbr.vbr_psndsz
+#define rxbr_fpsndsz rxbr.vbr_fpsndsz
+#define rxbr_fvalue rxbr.vbr_fvalue
+#define rxbr_intrcnt rxbr.vbr_intrcnt
+#define rxbr_data rxbr.vbr_data
+#define rxbr_dsize rxbr.vbr_dsize
+
+struct vmbus_txbr {
+ struct mtx txbr_lock;
+ struct vmbus_br txbr;
+};
+
+#define txbr_windex txbr.vbr_windex
+#define txbr_rindex txbr.vbr_rindex
+#define txbr_imask txbr.vbr_imask
+#define txbr_psndsz txbr.vbr_psndsz
+#define txbr_fpsndsz txbr.vbr_fpsndsz
+#define txbr_fvalue txbr.vbr_fvalue
+#define txbr_intrcnt txbr.vbr_intrcnt
+#define txbr_data txbr.vbr_data
+#define txbr_dsize txbr.vbr_dsize
+
+struct sysctl_ctx_list;
+struct sysctl_oid;
+
+static __inline int
+vmbus_txbr_maxpktsz(const struct vmbus_txbr *tbr)
+{
+
+ /*
+ * - 64 bits for the trailing start index (- sizeof(uint64_t)).
+	 *   The rindex and windex can't be the same (- 1).  See
+ * the comment near vmbus_bufring.br_{r,w}index.
+ */
+ return (tbr->txbr_dsize - sizeof(uint64_t) - 1);
+}
+
+static __inline bool
+vmbus_txbr_empty(const struct vmbus_txbr *tbr)
+{
+
+	return (tbr->txbr_windex == tbr->txbr_rindex);
+}
+
+static __inline bool
+vmbus_rxbr_empty(const struct vmbus_rxbr *rbr)
+{
+
+	return (rbr->rxbr_windex == rbr->rxbr_rindex);
+}
+
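+/*
+ * Number of elem_size byte elements that fit into a bufring of br_size
+ * bytes, after stripping the bufring header and accounting for the
+ * 8-byte trailing index stored after each element.
+ */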
+static __inline int
+vmbus_br_nelem(int br_size, int elem_size)
+{
+
+ /* Strip bufring header */
+ br_size -= sizeof(struct vmbus_bufring);
+ /* Add per-element trailing index */
+ elem_size += sizeof(uint64_t);
+ return (br_size / elem_size);
+}
+
+void vmbus_br_sysctl_create(struct sysctl_ctx_list *ctx,
+ struct sysctl_oid *br_tree, struct vmbus_br *br,
+ const char *name);
+
+void vmbus_rxbr_init(struct vmbus_rxbr *rbr);
+void vmbus_rxbr_deinit(struct vmbus_rxbr *rbr);
+void vmbus_rxbr_setup(struct vmbus_rxbr *rbr, void *buf, int blen);
+int vmbus_rxbr_peek(struct vmbus_rxbr *rbr, void *data, int dlen);
+int vmbus_rxbr_read(struct vmbus_rxbr *rbr, void *data, int dlen,
+ uint32_t skip);
+int vmbus_rxbr_idxadv(struct vmbus_rxbr *rbr, uint32_t idx_adv,
+ boolean_t *need_sig);
+int vmbus_rxbr_idxadv_peek(struct vmbus_rxbr *rbr, void *data,
+ int dlen, uint32_t idx_adv, boolean_t *need_sig);
+int vmbus_rxbr_peek_call(struct vmbus_rxbr *rbr, int dlen,
+ uint32_t skip, vmbus_br_copy_callback_t cb, void *cbarg);
+void vmbus_rxbr_intr_mask(struct vmbus_rxbr *rbr);
+uint32_t vmbus_rxbr_intr_unmask(struct vmbus_rxbr *rbr);
+uint32_t vmbus_rxbr_available(const struct vmbus_rxbr *rbr);
+
+void vmbus_txbr_init(struct vmbus_txbr *tbr);
+void vmbus_txbr_deinit(struct vmbus_txbr *tbr);
+void vmbus_txbr_setup(struct vmbus_txbr *tbr, void *buf, int blen);
+int vmbus_txbr_write(struct vmbus_txbr *tbr,
+ const struct iovec iov[], int iovlen, boolean_t *need_sig);
+int vmbus_txbr_write_call(struct vmbus_txbr *tbr,
+ const struct iovec iov[], int iovlen,
+ vmbus_br_copy_callback_t cb, void *cbarg,
+ boolean_t *need_sig);
+uint32_t vmbus_txbr_available(const struct vmbus_txbr *tbr);
+uint32_t vmbus_txbr_get_imask(const struct vmbus_txbr *tbr);
+void vmbus_txbr_set_pending_snd_sz(struct vmbus_txbr *tbr,
+ uint32_t size);
+
+#endif /* _VMBUS_BRVAR_H_ */
diff --git a/sys/dev/hyperv/vmbus/vmbus_chan.c b/sys/dev/hyperv/vmbus/vmbus_chan.c
new file mode 100644
index 000000000000..032e06c47c95
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_chan.c
@@ -0,0 +1,2390 @@
+/*-
+ * Copyright (c) 2009-2012,2016 Microsoft Corp.
+ * Copyright (c) 2012 NetApp Inc.
+ * Copyright (c) 2012 Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <machine/atomic.h>
+#include <machine/stdarg.h>
+
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+#include <dev/hyperv/vmbus/hyperv_var.h>
+#include <dev/hyperv/vmbus/vmbus_reg.h>
+#include <dev/hyperv/vmbus/vmbus_var.h>
+#include <dev/hyperv/vmbus/vmbus_brvar.h>
+#include <dev/hyperv/vmbus/vmbus_chanvar.h>
+
+struct vmbus_chan_pollarg {
+ struct vmbus_channel *poll_chan;
+ u_int poll_hz;
+};
+
+static void vmbus_chan_update_evtflagcnt(
+ struct vmbus_softc *,
+ const struct vmbus_channel *);
+static int vmbus_chan_close_internal(
+ struct vmbus_channel *);
+static int vmbus_chan_sysctl_mnf(SYSCTL_HANDLER_ARGS);
+static void vmbus_chan_sysctl_create(
+ struct vmbus_channel *);
+static struct vmbus_channel *vmbus_chan_alloc(struct vmbus_softc *);
+static void vmbus_chan_free(struct vmbus_channel *);
+static int vmbus_chan_add(struct vmbus_channel *);
+static void vmbus_chan_cpu_default(struct vmbus_channel *);
+static int vmbus_chan_release(struct vmbus_channel *);
+static void vmbus_chan_set_chmap(struct vmbus_channel *);
+static void vmbus_chan_clear_chmap(struct vmbus_channel *);
+static void vmbus_chan_detach(struct vmbus_channel *);
+static bool vmbus_chan_wait_revoke(
+ const struct vmbus_channel *, bool);
+static void vmbus_chan_poll_timeout(void *);
+static bool vmbus_chan_poll_cancel_intq(
+ struct vmbus_channel *);
+static void vmbus_chan_poll_cancel(struct vmbus_channel *);
+
+static void vmbus_chan_ins_prilist(struct vmbus_softc *,
+ struct vmbus_channel *);
+static void vmbus_chan_rem_prilist(struct vmbus_softc *,
+ struct vmbus_channel *);
+static void vmbus_chan_ins_list(struct vmbus_softc *,
+ struct vmbus_channel *);
+static void vmbus_chan_rem_list(struct vmbus_softc *,
+ struct vmbus_channel *);
+static void vmbus_chan_ins_sublist(struct vmbus_channel *,
+ struct vmbus_channel *);
+static void vmbus_chan_rem_sublist(struct vmbus_channel *,
+ struct vmbus_channel *);
+
+static void vmbus_chan_task(void *, int);
+static void vmbus_chan_task_nobatch(void *, int);
+static void vmbus_chan_poll_task(void *, int);
+static void vmbus_chan_clrchmap_task(void *, int);
+static void vmbus_chan_pollcfg_task(void *, int);
+static void vmbus_chan_polldis_task(void *, int);
+static void vmbus_chan_poll_cancel_task(void *, int);
+static void vmbus_prichan_attach_task(void *, int);
+static void vmbus_subchan_attach_task(void *, int);
+static void vmbus_prichan_detach_task(void *, int);
+static void vmbus_subchan_detach_task(void *, int);
+
+static void vmbus_chan_msgproc_choffer(struct vmbus_softc *,
+ const struct vmbus_message *);
+static void vmbus_chan_msgproc_chrescind(
+ struct vmbus_softc *,
+ const struct vmbus_message *);
+
+static int vmbus_chan_printf(const struct vmbus_channel *,
+ const char *, ...) __printflike(2, 3);
+
+/*
+ * Vmbus channel message processing.
+ */
+static const vmbus_chanmsg_proc_t
+vmbus_chan_msgprocs[VMBUS_CHANMSG_TYPE_MAX] = {
+ VMBUS_CHANMSG_PROC(CHOFFER, vmbus_chan_msgproc_choffer),
+ VMBUS_CHANMSG_PROC(CHRESCIND, vmbus_chan_msgproc_chrescind),
+
+ VMBUS_CHANMSG_PROC_WAKEUP(CHOPEN_RESP),
+ VMBUS_CHANMSG_PROC_WAKEUP(GPADL_CONNRESP),
+ VMBUS_CHANMSG_PROC_WAKEUP(GPADL_DISCONNRESP)
+};
+
+/*
+ * Notify host that there are data pending on our TX bufring or
+ * we have put some data on the TX bufring.
+ */
+static __inline void
+vmbus_chan_signal(const struct vmbus_channel *chan)
+{
+ atomic_set_long(chan->ch_evtflag, chan->ch_evtflag_mask);
+ if (chan->ch_txflags & VMBUS_CHAN_TXF_HASMNF)
+ atomic_set_int(chan->ch_montrig, chan->ch_montrig_mask);
+ else
+ hypercall_signal_event(chan->ch_monprm_dma.hv_paddr);
+}
+
+static __inline void
+vmbus_chan_signal_tx(struct vmbus_channel *chan)
+{
+	chan->ch_txbr.txbr_intrcnt++;
+
+ vmbus_chan_signal(chan);
+}
+
+static __inline void
+vmbus_chan_signal_rx(struct vmbus_channel *chan)
+{
+	chan->ch_rxbr.rxbr_intrcnt++;
+
+ vmbus_chan_signal(chan);
+}
+
+static void
+vmbus_chan_ins_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan)
+{
+
+ mtx_assert(&sc->vmbus_prichan_lock, MA_OWNED);
+ if (atomic_testandset_int(&chan->ch_stflags,
+ VMBUS_CHAN_ST_ONPRIL_SHIFT))
+ panic("channel is already on the prilist");
+ TAILQ_INSERT_TAIL(&sc->vmbus_prichans, chan, ch_prilink);
+}
+
+static void
+vmbus_chan_rem_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan)
+{
+
+ mtx_assert(&sc->vmbus_prichan_lock, MA_OWNED);
+ if (atomic_testandclear_int(&chan->ch_stflags,
+ VMBUS_CHAN_ST_ONPRIL_SHIFT) == 0)
+ panic("channel is not on the prilist");
+ TAILQ_REMOVE(&sc->vmbus_prichans, chan, ch_prilink);
+}
+
+static void
+vmbus_chan_ins_sublist(struct vmbus_channel *prichan,
+ struct vmbus_channel *chan)
+{
+
+ mtx_assert(&prichan->ch_subchan_lock, MA_OWNED);
+
+ if (atomic_testandset_int(&chan->ch_stflags,
+ VMBUS_CHAN_ST_ONSUBL_SHIFT))
+ panic("channel is already on the sublist");
+ TAILQ_INSERT_TAIL(&prichan->ch_subchans, chan, ch_sublink);
+
+ /* Bump sub-channel count. */
+ prichan->ch_subchan_cnt++;
+}
+
+static void
+vmbus_chan_rem_sublist(struct vmbus_channel *prichan,
+ struct vmbus_channel *chan)
+{
+
+ mtx_assert(&prichan->ch_subchan_lock, MA_OWNED);
+
+ KASSERT(prichan->ch_subchan_cnt > 0,
+ ("invalid subchan_cnt %d", prichan->ch_subchan_cnt));
+ prichan->ch_subchan_cnt--;
+
+ if (atomic_testandclear_int(&chan->ch_stflags,
+ VMBUS_CHAN_ST_ONSUBL_SHIFT) == 0)
+ panic("channel is not on the sublist");
+ TAILQ_REMOVE(&prichan->ch_subchans, chan, ch_sublink);
+}
+
+static void
+vmbus_chan_ins_list(struct vmbus_softc *sc, struct vmbus_channel *chan)
+{
+
+ mtx_assert(&sc->vmbus_chan_lock, MA_OWNED);
+ if (atomic_testandset_int(&chan->ch_stflags,
+ VMBUS_CHAN_ST_ONLIST_SHIFT))
+ panic("channel is already on the list");
+ TAILQ_INSERT_TAIL(&sc->vmbus_chans, chan, ch_link);
+}
+
+static void
+vmbus_chan_rem_list(struct vmbus_softc *sc, struct vmbus_channel *chan)
+{
+
+ mtx_assert(&sc->vmbus_chan_lock, MA_OWNED);
+ if (atomic_testandclear_int(&chan->ch_stflags,
+ VMBUS_CHAN_ST_ONLIST_SHIFT) == 0)
+ panic("channel is not on the list");
+ TAILQ_REMOVE(&sc->vmbus_chans, chan, ch_link);
+}
+
+static int
+vmbus_chan_sysctl_mnf(SYSCTL_HANDLER_ARGS)
+{
+ struct vmbus_channel *chan = arg1;
+ int mnf = 0;
+
+ if (chan->ch_txflags & VMBUS_CHAN_TXF_HASMNF)
+ mnf = 1;
+ return sysctl_handle_int(oidp, &mnf, 0, req);
+}
+
+static void
+vmbus_chan_sysctl_create(struct vmbus_channel *chan)
+{
+ struct sysctl_oid *ch_tree, *chid_tree, *br_tree;
+ struct sysctl_ctx_list *ctx;
+ uint32_t ch_id;
+ char name[16];
+
+ /*
+ * Add sysctl nodes related to this channel to this
+ * channel's sysctl ctx, so that they can be destroyed
+ * independently upon close of this channel, which can
+ * happen even if the device is not detached.
+ */
+ ctx = &chan->ch_sysctl_ctx;
+ sysctl_ctx_init(ctx);
+
+ /*
+ * Create dev.NAME.UNIT.channel tree.
+ */
+ ch_tree = SYSCTL_ADD_NODE(ctx,
+ SYSCTL_CHILDREN(device_get_sysctl_tree(chan->ch_dev)),
+ OID_AUTO, "channel", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+ if (ch_tree == NULL)
+ return;
+
+ /*
+ * Create dev.NAME.UNIT.channel.CHANID tree.
+ */
+ if (VMBUS_CHAN_ISPRIMARY(chan))
+ ch_id = chan->ch_id;
+ else
+ ch_id = chan->ch_prichan->ch_id;
+ snprintf(name, sizeof(name), "%d", ch_id);
+ chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree),
+ OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+ if (chid_tree == NULL)
+ return;
+
+ if (!VMBUS_CHAN_ISPRIMARY(chan)) {
+ /*
+ * Create dev.NAME.UNIT.channel.CHANID.sub tree.
+ */
+ ch_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree),
+ OID_AUTO, "sub", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+ if (ch_tree == NULL)
+ return;
+
+ /*
+ * Create dev.NAME.UNIT.channel.CHANID.sub.SUBIDX tree.
+ *
+ * NOTE:
+ * chid_tree is changed to this new sysctl tree.
+ */
+ snprintf(name, sizeof(name), "%d", chan->ch_subidx);
+ chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree),
+ OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+ if (chid_tree == NULL)
+ return;
+
+ SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
+ "chanid", CTLFLAG_RD, &chan->ch_id, 0, "channel id");
+ }
+
+ SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
+ "cpu", CTLFLAG_RD, &chan->ch_cpuid, 0, "owner CPU id");
+ SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
+ "mnf", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ chan, 0, vmbus_chan_sysctl_mnf, "I",
+ "has monitor notification facilities");
+
+ br_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
+ "br", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+ if (br_tree != NULL) {
+ /*
+ * Create sysctl tree for RX bufring.
+ */
+ vmbus_br_sysctl_create(ctx, br_tree, &chan->ch_rxbr.rxbr, "rx");
+ /*
+ * Create sysctl tree for TX bufring.
+ */
+ vmbus_br_sysctl_create(ctx, br_tree, &chan->ch_txbr.txbr, "tx");
+ }
+}
+
+int
+vmbus_chan_open(struct vmbus_channel *chan, int txbr_size, int rxbr_size,
+ const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg)
+{
+ struct vmbus_chan_br cbr;
+ int error;
+
+ /*
+ * Allocate the TX+RX bufrings.
+ */
+ KASSERT(chan->ch_bufring == NULL, ("bufrings are allocated"));
+ chan->ch_bufring = hyperv_dmamem_alloc(bus_get_dma_tag(chan->ch_dev),
+ PAGE_SIZE, 0, txbr_size + rxbr_size, &chan->ch_bufring_dma,
+ BUS_DMA_WAITOK);
+ if (chan->ch_bufring == NULL) {
+ vmbus_chan_printf(chan, "bufring allocation failed\n");
+ return (ENOMEM);
+ }
+
+ cbr.cbr = chan->ch_bufring;
+ cbr.cbr_paddr = chan->ch_bufring_dma.hv_paddr;
+ cbr.cbr_txsz = txbr_size;
+ cbr.cbr_rxsz = rxbr_size;
+
+ error = vmbus_chan_open_br(chan, &cbr, udata, udlen, cb, cbarg);
+ if (error) {
+ if (error == EISCONN) {
+ /*
+ * XXX
+			 * The bufring GPADL is still connected; abandon
+			 * this bufring, instead of risking a mysterious
+			 * crash or trashed data later on.
+ */
+ vmbus_chan_printf(chan, "chan%u bufring GPADL "
+ "is still connected upon channel open error; "
+ "leak %d bytes memory\n", chan->ch_id,
+ txbr_size + rxbr_size);
+ } else {
+ hyperv_dmamem_free(&chan->ch_bufring_dma,
+ chan->ch_bufring);
+ }
+ chan->ch_bufring = NULL;
+ }
+ return (error);
+}
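For illustration, a minimal sketch of how a driver might call vmbus_chan_open(); the 16-page ring size and the my_* names are assumptions of this example, not part of this file:

/*
 * Illustrative sketch only: open a channel with page-multiple TX/RX
 * bufrings and a per-channel RX callback.
 */
static void
my_chan_callback(struct vmbus_channel *chan, void *cbarg)
{
	/* Drain and process packets from the RX bufring here. */
}

static int
my_driver_open(struct vmbus_channel *chan, void *cbarg)
{
	/* 16 pages each for TX and RX; sizes must be page multiples. */
	return (vmbus_chan_open(chan, 16 * PAGE_SIZE, 16 * PAGE_SIZE,
	    NULL, 0, my_chan_callback, cbarg));
}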
+
+int
+vmbus_chan_open_br(struct vmbus_channel *chan, const struct vmbus_chan_br *cbr,
+ const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg)
+{
+ struct vmbus_softc *sc = chan->ch_vmbus;
+ const struct vmbus_message *msg;
+ struct vmbus_chanmsg_chopen *req;
+ struct vmbus_msghc *mh;
+ uint32_t status;
+ int error, txbr_size, rxbr_size;
+ task_fn_t *task_fn;
+ uint8_t *br;
+
+ if (udlen > VMBUS_CHANMSG_CHOPEN_UDATA_SIZE) {
+ vmbus_chan_printf(chan,
+ "invalid udata len %d for chan%u\n", udlen, chan->ch_id);
+ return (EINVAL);
+ }
+
+ br = cbr->cbr;
+ txbr_size = cbr->cbr_txsz;
+ rxbr_size = cbr->cbr_rxsz;
+ KASSERT((txbr_size & PAGE_MASK) == 0,
+ ("send bufring size is not multiple page"));
+ KASSERT((rxbr_size & PAGE_MASK) == 0,
+ ("recv bufring size is not multiple page"));
+ KASSERT((cbr->cbr_paddr & PAGE_MASK) == 0,
+ ("bufring is not page aligned"));
+
+ /*
+	 * Zero out the TX/RX bufrings, in case they were used before.
+ */
+ memset(br, 0, txbr_size + rxbr_size);
+
+ if (atomic_testandset_int(&chan->ch_stflags,
+ VMBUS_CHAN_ST_OPENED_SHIFT))
+ panic("double-open chan%u", chan->ch_id);
+
+ chan->ch_cb = cb;
+ chan->ch_cbarg = cbarg;
+
+ vmbus_chan_update_evtflagcnt(sc, chan);
+
+ chan->ch_tq = VMBUS_PCPU_GET(chan->ch_vmbus, event_tq, chan->ch_cpuid);
+ if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD)
+ task_fn = vmbus_chan_task;
+ else
+ task_fn = vmbus_chan_task_nobatch;
+ TASK_INIT(&chan->ch_task, 0, task_fn, chan);
+
+ /* TX bufring comes first */
+ vmbus_txbr_setup(&chan->ch_txbr, br, txbr_size);
+ /* RX bufring immediately follows TX bufring */
+ vmbus_rxbr_setup(&chan->ch_rxbr, br + txbr_size, rxbr_size);
+
+ /* Create sysctl tree for this channel */
+ vmbus_chan_sysctl_create(chan);
+
+ /*
+ * Connect the bufrings, both RX and TX, to this channel.
+ */
+ error = vmbus_chan_gpadl_connect(chan, cbr->cbr_paddr,
+ txbr_size + rxbr_size, &chan->ch_bufring_gpadl);
+ if (error) {
+ vmbus_chan_printf(chan,
+ "failed to connect bufring GPADL to chan%u\n", chan->ch_id);
+ goto failed;
+ }
+
+ /*
+	 * Install this channel before it is opened, but after everything
+	 * else has been set up.
+ */
+ vmbus_chan_set_chmap(chan);
+
+ /*
+ * Open channel w/ the bufring GPADL on the target CPU.
+ */
+ mh = vmbus_msghc_get(sc, sizeof(*req));
+ if (mh == NULL) {
+ vmbus_chan_printf(chan,
+ "can not get msg hypercall for chopen(chan%u)\n",
+ chan->ch_id);
+ error = ENXIO;
+ goto failed;
+ }
+
+ req = vmbus_msghc_dataptr(mh);
+ req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHOPEN;
+ req->chm_chanid = chan->ch_id;
+ req->chm_openid = chan->ch_id;
+ req->chm_gpadl = chan->ch_bufring_gpadl;
+ req->chm_vcpuid = chan->ch_vcpuid;
+ req->chm_txbr_pgcnt = txbr_size >> PAGE_SHIFT;
+ if (udlen > 0)
+ memcpy(req->chm_udata, udata, udlen);
+
+ error = vmbus_msghc_exec(sc, mh);
+ if (error) {
+ vmbus_chan_printf(chan,
+ "chopen(chan%u) msg hypercall exec failed: %d\n",
+ chan->ch_id, error);
+ vmbus_msghc_put(sc, mh);
+ goto failed;
+ }
+
+ for (;;) {
+ msg = vmbus_msghc_poll_result(sc, mh);
+ if (msg != NULL)
+ break;
+ if (vmbus_chan_is_revoked(chan)) {
+ int i;
+
+ /*
+ * NOTE:
+			 * The hypervisor does _not_ send a CHOPEN response
+			 * to a revoked channel.
+ */
+ vmbus_chan_printf(chan,
+ "chan%u is revoked, when it is being opened\n",
+ chan->ch_id);
+
+ /*
+ * XXX
+			 * Add an extra delay before canceling the hypercall
+			 * execution; mainly to close any possible
+ * CHRESCIND and CHOPEN_RESP races on the
+ * hypervisor side.
+ */
+#define REVOKE_LINGER 100
+ for (i = 0; i < REVOKE_LINGER; ++i) {
+ msg = vmbus_msghc_poll_result(sc, mh);
+ if (msg != NULL)
+ break;
+ pause("rchopen", 1);
+ }
+#undef REVOKE_LINGER
+ if (msg == NULL)
+ vmbus_msghc_exec_cancel(sc, mh);
+ break;
+ }
+ pause("chopen", 1);
+ }
+ if (msg != NULL) {
+ status = ((const struct vmbus_chanmsg_chopen_resp *)
+ msg->msg_data)->chm_status;
+ } else {
+ /* XXX any non-0 value is ok here. */
+ status = 0xff;
+ }
+
+ vmbus_msghc_put(sc, mh);
+
+ if (status == 0) {
+ if (bootverbose)
+ vmbus_chan_printf(chan, "chan%u opened\n", chan->ch_id);
+ return (0);
+ }
+
+ vmbus_chan_printf(chan, "failed to open chan%u\n", chan->ch_id);
+ error = ENXIO;
+
+failed:
+ sysctl_ctx_free(&chan->ch_sysctl_ctx);
+ vmbus_chan_clear_chmap(chan);
+ if (chan->ch_bufring_gpadl != 0) {
+ int error1;
+
+ error1 = vmbus_chan_gpadl_disconnect(chan,
+ chan->ch_bufring_gpadl);
+ if (error1) {
+ /*
+ * Give caller a hint that the bufring GPADL is still
+ * connected.
+ */
+ error = EISCONN;
+ }
+ chan->ch_bufring_gpadl = 0;
+ }
+ atomic_clear_int(&chan->ch_stflags, VMBUS_CHAN_ST_OPENED);
+ return (error);
+}
+
+int
+vmbus_chan_gpadl_connect(struct vmbus_channel *chan, bus_addr_t paddr,
+ int size, uint32_t *gpadl0)
+{
+ struct vmbus_softc *sc = chan->ch_vmbus;
+ struct vmbus_msghc *mh;
+ struct vmbus_chanmsg_gpadl_conn *req;
+ const struct vmbus_message *msg;
+ size_t reqsz;
+ uint32_t gpadl, status;
+ int page_count, range_len, i, cnt, error;
+ uint64_t page_id;
+
+ KASSERT(*gpadl0 == 0, ("GPADL is not zero"));
+
+ /*
+ * Preliminary checks.
+ */
+
+ KASSERT((size & PAGE_MASK) == 0,
+ ("invalid GPA size %d, not multiple page size", size));
+ page_count = size >> PAGE_SHIFT;
+
+ KASSERT((paddr & PAGE_MASK) == 0,
+ ("GPA is not page aligned %jx", (uintmax_t)paddr));
+ page_id = paddr >> PAGE_SHIFT;
+
+ range_len = __offsetof(struct vmbus_gpa_range, gpa_page[page_count]);
+ /*
+ * We don't support multiple GPA ranges.
+ */
+ if (range_len > UINT16_MAX) {
+ vmbus_chan_printf(chan, "GPA too large, %d pages\n",
+ page_count);
+ return EOPNOTSUPP;
+ }
+
+ /*
+ * Allocate GPADL id.
+ */
+ gpadl = vmbus_gpadl_alloc(sc);
+
+ /*
+ * Connect this GPADL to the target channel.
+ *
+ * NOTE:
+	 * Since each message can only hold a small set of page
+ * addresses, several messages may be required to
+ * complete the connection.
+ */
+ if (page_count > VMBUS_CHANMSG_GPADL_CONN_PGMAX)
+ cnt = VMBUS_CHANMSG_GPADL_CONN_PGMAX;
+ else
+ cnt = page_count;
+ page_count -= cnt;
+
+ reqsz = __offsetof(struct vmbus_chanmsg_gpadl_conn,
+ chm_range.gpa_page[cnt]);
+ mh = vmbus_msghc_get(sc, reqsz);
+ if (mh == NULL) {
+ vmbus_chan_printf(chan,
+ "can not get msg hypercall for gpadl_conn(chan%u)\n",
+ chan->ch_id);
+ return EIO;
+ }
+
+ req = vmbus_msghc_dataptr(mh);
+ req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_CONN;
+ req->chm_chanid = chan->ch_id;
+ req->chm_gpadl = gpadl;
+ req->chm_range_len = range_len;
+ req->chm_range_cnt = 1;
+ req->chm_range.gpa_len = size;
+ req->chm_range.gpa_ofs = 0;
+ for (i = 0; i < cnt; ++i)
+ req->chm_range.gpa_page[i] = page_id++;
+
+ error = vmbus_msghc_exec(sc, mh);
+ if (error) {
+ vmbus_chan_printf(chan,
+ "gpadl_conn(chan%u) msg hypercall exec failed: %d\n",
+ chan->ch_id, error);
+ vmbus_msghc_put(sc, mh);
+ return error;
+ }
+
+ while (page_count > 0) {
+ struct vmbus_chanmsg_gpadl_subconn *subreq;
+
+ if (page_count > VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX)
+ cnt = VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX;
+ else
+ cnt = page_count;
+ page_count -= cnt;
+
+ reqsz = __offsetof(struct vmbus_chanmsg_gpadl_subconn,
+ chm_gpa_page[cnt]);
+ vmbus_msghc_reset(mh, reqsz);
+
+ subreq = vmbus_msghc_dataptr(mh);
+ subreq->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_SUBCONN;
+ subreq->chm_gpadl = gpadl;
+ for (i = 0; i < cnt; ++i)
+ subreq->chm_gpa_page[i] = page_id++;
+
+ vmbus_msghc_exec_noresult(mh);
+ }
+ KASSERT(page_count == 0, ("invalid page count %d", page_count));
+
+ msg = vmbus_msghc_wait_result(sc, mh);
+ status = ((const struct vmbus_chanmsg_gpadl_connresp *)
+ msg->msg_data)->chm_status;
+
+ vmbus_msghc_put(sc, mh);
+
+ if (status != 0) {
+ vmbus_chan_printf(chan, "gpadl_conn(chan%u) failed: %u\n",
+ chan->ch_id, status);
+ return EIO;
+ }
+
+ /* Done; commit the GPADL id. */
+ *gpadl0 = gpadl;
+ if (bootverbose) {
+ vmbus_chan_printf(chan, "gpadl_conn(chan%u) succeeded\n",
+ chan->ch_id);
+ }
+ return 0;
+}
+
+static bool
+vmbus_chan_wait_revoke(const struct vmbus_channel *chan, bool can_sleep)
+{
+#define WAIT_COUNT 200 /* 200ms */
+
+ int i;
+
+ for (i = 0; i < WAIT_COUNT; ++i) {
+ if (vmbus_chan_is_revoked(chan))
+ return (true);
+ if (can_sleep)
+ pause("wchrev", 1);
+ else
+ DELAY(1000);
+ }
+ return (false);
+
+#undef WAIT_COUNT
+}
+
+/*
+ * Disconnect the GPADL from the target channel
+ */
+int
+vmbus_chan_gpadl_disconnect(struct vmbus_channel *chan, uint32_t gpadl)
+{
+ struct vmbus_softc *sc = chan->ch_vmbus;
+ struct vmbus_msghc *mh;
+ struct vmbus_chanmsg_gpadl_disconn *req;
+ int error;
+
+ KASSERT(gpadl != 0, ("GPADL is zero"));
+
+ mh = vmbus_msghc_get(sc, sizeof(*req));
+ if (mh == NULL) {
+ vmbus_chan_printf(chan,
+ "can not get msg hypercall for gpadl_disconn(chan%u)\n",
+ chan->ch_id);
+ return (EBUSY);
+ }
+
+ req = vmbus_msghc_dataptr(mh);
+ req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_DISCONN;
+ req->chm_chanid = chan->ch_id;
+ req->chm_gpadl = gpadl;
+
+ error = vmbus_msghc_exec(sc, mh);
+ if (error) {
+ vmbus_msghc_put(sc, mh);
+
+ if (vmbus_chan_wait_revoke(chan, true)) {
+ /*
+ * Error is benign; this channel is revoked,
+ * so this GPADL will not be touched anymore.
+ */
+ vmbus_chan_printf(chan,
+ "gpadl_disconn(revoked chan%u) msg hypercall "
+ "exec failed: %d\n", chan->ch_id, error);
+ return (0);
+ }
+ vmbus_chan_printf(chan,
+ "gpadl_disconn(chan%u) msg hypercall exec failed: %d\n",
+ chan->ch_id, error);
+ return (error);
+ }
+
+ vmbus_msghc_wait_result(sc, mh);
+ /* Discard result; no useful information */
+ vmbus_msghc_put(sc, mh);
+
+ return (0);
+}
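As a hedged illustration of how the connect/disconnect pair above is typically used for a driver-owned DMA buffer (the my_* names are hypothetical, and 'paddr'/'size' would come from the driver's own DMA allocation):

/*
 * Illustrative sketch only: expose a page-aligned, page-multiple buffer
 * to the host via a GPADL, and tear it down later.
 */
static int
my_gpadl_setup(struct vmbus_channel *chan, bus_addr_t paddr, int size,
    uint32_t *gpadl)
{
	*gpadl = 0;	/* vmbus_chan_gpadl_connect() asserts a zero GPADL */
	return (vmbus_chan_gpadl_connect(chan, paddr, size, gpadl));
}

static void
my_gpadl_teardown(struct vmbus_channel *chan, uint32_t *gpadl)
{
	if (*gpadl != 0) {
		vmbus_chan_gpadl_disconnect(chan, *gpadl);
		*gpadl = 0;
	}
}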
+
+static void
+vmbus_chan_detach(struct vmbus_channel *chan)
+{
+ int refs;
+
+ KASSERT(chan->ch_refs > 0, ("chan%u: invalid refcnt %d",
+ chan->ch_id, chan->ch_refs));
+ refs = atomic_fetchadd_int(&chan->ch_refs, -1);
+#ifdef INVARIANTS
+ if (VMBUS_CHAN_ISPRIMARY(chan)) {
+ KASSERT(refs == 1, ("chan%u: invalid refcnt %d for prichan",
+ chan->ch_id, refs + 1));
+ }
+#endif
+ if (refs == 1) {
+ /*
+ * Detach the target channel.
+ */
+ if (bootverbose) {
+ vmbus_chan_printf(chan, "chan%u detached\n",
+ chan->ch_id);
+ }
+ taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_detach_task);
+ }
+}
+
+static void
+vmbus_chan_clrchmap_task(void *xchan, int pending __unused)
+{
+ struct vmbus_channel *chan = xchan;
+
+ chan->ch_vmbus->vmbus_chmap[chan->ch_id] = NULL;
+}
+
+static void
+vmbus_chan_clear_chmap(struct vmbus_channel *chan)
+{
+ struct task chmap_task;
+
+ TASK_INIT(&chmap_task, 0, vmbus_chan_clrchmap_task, chan);
+ vmbus_chan_run_task(chan, &chmap_task);
+}
+
+static void
+vmbus_chan_set_chmap(struct vmbus_channel *chan)
+{
+ __compiler_membar();
+ chan->ch_vmbus->vmbus_chmap[chan->ch_id] = chan;
+}
+
+static void
+vmbus_chan_poll_cancel_task(void *xchan, int pending __unused)
+{
+
+ vmbus_chan_poll_cancel_intq(xchan);
+}
+
+static void
+vmbus_chan_poll_cancel(struct vmbus_channel *chan)
+{
+ struct task poll_cancel;
+
+ TASK_INIT(&poll_cancel, 0, vmbus_chan_poll_cancel_task, chan);
+ vmbus_chan_run_task(chan, &poll_cancel);
+}
+
+static int
+vmbus_chan_close_internal(struct vmbus_channel *chan)
+{
+ struct vmbus_softc *sc = chan->ch_vmbus;
+ struct vmbus_msghc *mh;
+ struct vmbus_chanmsg_chclose *req;
+ uint32_t old_stflags;
+ int error;
+
+ /*
+ * NOTE:
+ * Sub-channels are closed upon their primary channel closing,
+ * so they can be closed even before they are opened.
+ */
+ for (;;) {
+ old_stflags = chan->ch_stflags;
+ if (atomic_cmpset_int(&chan->ch_stflags, old_stflags,
+ old_stflags & ~VMBUS_CHAN_ST_OPENED))
+ break;
+ }
+ if ((old_stflags & VMBUS_CHAN_ST_OPENED) == 0) {
+ /* Not opened yet; done */
+ if (bootverbose) {
+ vmbus_chan_printf(chan, "chan%u not opened\n",
+ chan->ch_id);
+ }
+ return (0);
+ }
+
+ /*
+ * Free this channel's sysctl tree attached to its device's
+ * sysctl tree.
+ */
+ sysctl_ctx_free(&chan->ch_sysctl_ctx);
+
+ /*
+ * Cancel polling, if it is enabled.
+ */
+ vmbus_chan_poll_cancel(chan);
+
+ /*
+ * NOTE:
+ * Order is critical. This channel _must_ be uninstalled first,
+ * else the channel task may be enqueued by the IDT after it has
+ * been drained.
+ */
+ vmbus_chan_clear_chmap(chan);
+ taskqueue_drain(chan->ch_tq, &chan->ch_task);
+ chan->ch_tq = NULL;
+
+ /*
+ * Close this channel.
+ */
+ mh = vmbus_msghc_get(sc, sizeof(*req));
+ if (mh == NULL) {
+ vmbus_chan_printf(chan,
+ "can not get msg hypercall for chclose(chan%u)\n",
+ chan->ch_id);
+ error = ENXIO;
+ goto disconnect;
+ }
+
+ req = vmbus_msghc_dataptr(mh);
+ req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHCLOSE;
+ req->chm_chanid = chan->ch_id;
+
+ error = vmbus_msghc_exec_noresult(mh);
+ vmbus_msghc_put(sc, mh);
+
+ if (error) {
+ vmbus_chan_printf(chan,
+ "chclose(chan%u) msg hypercall exec failed: %d\n",
+ chan->ch_id, error);
+ goto disconnect;
+ }
+
+ if (bootverbose)
+ vmbus_chan_printf(chan, "chan%u closed\n", chan->ch_id);
+
+disconnect:
+ /*
+ * Disconnect the TX+RX bufrings from this channel.
+ */
+ if (chan->ch_bufring_gpadl != 0) {
+ int error1;
+
+ error1 = vmbus_chan_gpadl_disconnect(chan,
+ chan->ch_bufring_gpadl);
+ if (error1) {
+ /*
+ * XXX
+			 * The bufring GPADL is still connected; abandon
+			 * this bufring, instead of risking a mysterious
+			 * crash or trashed data later on.
+ */
+ vmbus_chan_printf(chan, "chan%u bufring GPADL "
+ "is still connected after close\n", chan->ch_id);
+ chan->ch_bufring = NULL;
+ /*
+ * Give caller a hint that the bufring GPADL is
+ * still connected.
+ */
+ error = EISCONN;
+ }
+ chan->ch_bufring_gpadl = 0;
+ }
+
+ /*
+ * Destroy the TX+RX bufrings.
+ */
+ if (chan->ch_bufring != NULL) {
+ hyperv_dmamem_free(&chan->ch_bufring_dma, chan->ch_bufring);
+ chan->ch_bufring = NULL;
+ }
+ return (error);
+}
+
+int
+vmbus_chan_close_direct(struct vmbus_channel *chan)
+{
+ int error;
+
+#ifdef INVARIANTS
+ if (VMBUS_CHAN_ISPRIMARY(chan)) {
+ struct vmbus_channel *subchan;
+
+ /*
+		 * All sub-channels _must_ have been closed, or must _not_
+		 * have been opened at all.
+ */
+ mtx_lock(&chan->ch_subchan_lock);
+ TAILQ_FOREACH(subchan, &chan->ch_subchans, ch_sublink) {
+ KASSERT(
+ (subchan->ch_stflags & VMBUS_CHAN_ST_OPENED) == 0,
+ ("chan%u: subchan%u is still opened",
+ chan->ch_id, subchan->ch_subidx));
+ }
+ mtx_unlock(&chan->ch_subchan_lock);
+ }
+#endif
+
+ error = vmbus_chan_close_internal(chan);
+ if (!VMBUS_CHAN_ISPRIMARY(chan)) {
+ /*
+ * This sub-channel is referenced, when it is linked to
+ * the primary channel; drop that reference now.
+ */
+ vmbus_chan_detach(chan);
+ }
+ return (error);
+}
+
+/*
+ * Caller should make sure that all sub-channels have
+ * been added to 'chan' and all to-be-closed channels
+ * are not being opened.
+ */
+void
+vmbus_chan_close(struct vmbus_channel *chan)
+{
+ int subchan_cnt;
+
+ if (!VMBUS_CHAN_ISPRIMARY(chan)) {
+ /*
+ * Sub-channel is closed when its primary channel
+ * is closed; done.
+ */
+ return;
+ }
+
+ /*
+ * Close all sub-channels, if any.
+ */
+ subchan_cnt = chan->ch_subchan_cnt;
+ if (subchan_cnt > 0) {
+ struct vmbus_channel **subchan;
+ int i;
+
+ subchan = vmbus_subchan_get(chan, subchan_cnt);
+ for (i = 0; i < subchan_cnt; ++i) {
+ vmbus_chan_close_internal(subchan[i]);
+ /*
+ * This sub-channel is referenced, when it is
+ * linked to the primary channel; drop that
+ * reference now.
+ */
+ vmbus_chan_detach(subchan[i]);
+ }
+ vmbus_subchan_rel(subchan, subchan_cnt);
+ }
+
+ /* Then close the primary channel. */
+ vmbus_chan_close_internal(chan);
+}
+
+void
+vmbus_chan_intr_drain(struct vmbus_channel *chan)
+{
+
+ taskqueue_drain(chan->ch_tq, &chan->ch_task);
+}
+
+uint32_t
+vmbus_chan_write_available(struct vmbus_channel *chan)
+{
+ return (vmbus_txbr_available(&chan->ch_txbr));
+}
+
+bool
+vmbus_chan_write_signal(struct vmbus_channel *chan,
+ int32_t min_signal_size)
+{
+ if (min_signal_size >= 0 &&
+ vmbus_chan_write_available(chan) > min_signal_size) {
+ return false;
+ }
+
+ if (!vmbus_txbr_get_imask(&chan->ch_txbr)) {
+ /* txbr imask is not set, signal the reader */
+ vmbus_chan_signal_tx(chan);
+ return true;
+ }
+
+ return false;
+}
+
+void
+vmbus_chan_set_pending_send_size(struct vmbus_channel *chan,
+ uint32_t size)
+{
+ if (chan)
+ vmbus_txbr_set_pending_snd_sz(&chan->ch_txbr, size);
+}
+
+int
+vmbus_chan_iov_send(struct vmbus_channel *chan,
+ const struct iovec iov[], int iovlen,
+ vmbus_br_copy_callback_t cb, void *cbarg)
+{
+ int error;
+ boolean_t send_evt;
+
+ if (iovlen == 0)
+ return (0);
+
+ error = vmbus_txbr_write_call(&chan->ch_txbr, iov, iovlen,
+ cb, cbarg, &send_evt);
+
+ if (!error && send_evt) {
+ vmbus_chan_signal_tx(chan);
+ }
+
+ return error;
+}
+
+int
+vmbus_chan_send(struct vmbus_channel *chan, uint16_t type, uint16_t flags,
+ void *data, int dlen, uint64_t xactid)
+{
+ struct vmbus_chanpkt pkt;
+ int pktlen, pad_pktlen, hlen, error;
+ uint64_t pad = 0;
+ struct iovec iov[3];
+ boolean_t send_evt;
+
+ hlen = sizeof(pkt);
+ pktlen = hlen + dlen;
+ pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen);
+ KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr),
+ ("invalid packet size %d", pad_pktlen));
+
+ pkt.cp_hdr.cph_type = type;
+ pkt.cp_hdr.cph_flags = flags;
+ VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen);
+ VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen);
+ pkt.cp_hdr.cph_xactid = xactid;
+
+ iov[0].iov_base = &pkt;
+ iov[0].iov_len = hlen;
+ iov[1].iov_base = data;
+ iov[1].iov_len = dlen;
+ iov[2].iov_base = &pad;
+ iov[2].iov_len = pad_pktlen - pktlen;
+
+ error = vmbus_txbr_write(&chan->ch_txbr, iov, 3, &send_evt);
+ if (!error && send_evt)
+ vmbus_chan_signal_tx(chan);
+ return error;
+}
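A hedged usage sketch for the sender above: VMBUS_CHANPKT_TYPE_INBAND and VMBUS_CHANPKT_FLAG_RC are assumed to be provided by dev/hyperv/include/vmbus.h, and the request buffer and transaction id are hypothetical driver-side values.

/*
 * Illustrative sketch only: send an inband request and ask the host for
 * a completion packet (RC flag), tagging it with a transaction id that
 * the completion will carry back in cph_xactid.
 */
static int
my_send_request(struct vmbus_channel *chan, void *req, int reqlen,
    uint64_t xactid)
{
	int error;

	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND,
	    VMBUS_CHANPKT_FLAG_RC, req, reqlen, xactid);
	/* EAGAIN typically means no room in the TX bufring; caller may retry. */
	return (error);
}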
+
+int
+vmbus_chan_send_sglist(struct vmbus_channel *chan,
+ struct vmbus_gpa sg[], int sglen, void *data, int dlen, uint64_t xactid)
+{
+ struct vmbus_chanpkt_sglist pkt;
+ int pktlen, pad_pktlen, hlen, error;
+ struct iovec iov[4];
+ boolean_t send_evt;
+ uint64_t pad = 0;
+
+ hlen = __offsetof(struct vmbus_chanpkt_sglist, cp_gpa[sglen]);
+ pktlen = hlen + dlen;
+ pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen);
+ KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr),
+ ("invalid packet size %d", pad_pktlen));
+
+ pkt.cp_hdr.cph_type = VMBUS_CHANPKT_TYPE_GPA;
+ pkt.cp_hdr.cph_flags = VMBUS_CHANPKT_FLAG_RC;
+ VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen);
+ VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen);
+ pkt.cp_hdr.cph_xactid = xactid;
+ pkt.cp_rsvd = 0;
+ pkt.cp_gpa_cnt = sglen;
+
+ iov[0].iov_base = &pkt;
+ iov[0].iov_len = sizeof(pkt);
+ iov[1].iov_base = sg;
+ iov[1].iov_len = sizeof(struct vmbus_gpa) * sglen;
+ iov[2].iov_base = data;
+ iov[2].iov_len = dlen;
+ iov[3].iov_base = &pad;
+ iov[3].iov_len = pad_pktlen - pktlen;
+
+ error = vmbus_txbr_write(&chan->ch_txbr, iov, 4, &send_evt);
+ if (!error && send_evt)
+ vmbus_chan_signal_tx(chan);
+ return error;
+}
+
+int
+vmbus_chan_send_prplist(struct vmbus_channel *chan,
+ struct vmbus_gpa_range *prp, int prp_cnt, void *data, int dlen,
+ uint64_t xactid)
+{
+ struct vmbus_chanpkt_prplist pkt;
+ int pktlen, pad_pktlen, hlen, error;
+ struct iovec iov[4];
+ boolean_t send_evt;
+ uint64_t pad = 0;
+
+ hlen = __offsetof(struct vmbus_chanpkt_prplist,
+ cp_range[0].gpa_page[prp_cnt]);
+ pktlen = hlen + dlen;
+ pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen);
+ KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr),
+ ("invalid packet size %d", pad_pktlen));
+
+ pkt.cp_hdr.cph_type = VMBUS_CHANPKT_TYPE_GPA;
+ pkt.cp_hdr.cph_flags = VMBUS_CHANPKT_FLAG_RC;
+ VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen);
+ VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen);
+ pkt.cp_hdr.cph_xactid = xactid;
+ pkt.cp_rsvd = 0;
+ pkt.cp_range_cnt = 1;
+
+ iov[0].iov_base = &pkt;
+ iov[0].iov_len = sizeof(pkt);
+ iov[1].iov_base = prp;
+ iov[1].iov_len = __offsetof(struct vmbus_gpa_range, gpa_page[prp_cnt]);
+ iov[2].iov_base = data;
+ iov[2].iov_len = dlen;
+ iov[3].iov_base = &pad;
+ iov[3].iov_len = pad_pktlen - pktlen;
+
+ error = vmbus_txbr_write(&chan->ch_txbr, iov, 4, &send_evt);
+ if (!error && send_evt)
+ vmbus_chan_signal_tx(chan);
+ return error;
+}
+
+int
+vmbus_chan_recv(struct vmbus_channel *chan, void *data, int *dlen0,
+ uint64_t *xactid)
+{
+ struct vmbus_chanpkt_hdr pkt;
+ int error, dlen, hlen;
+
+ error = vmbus_rxbr_peek(&chan->ch_rxbr, &pkt, sizeof(pkt));
+ if (error)
+ return (error);
+
+ if (__predict_false(pkt.cph_hlen < VMBUS_CHANPKT_HLEN_MIN)) {
+ vmbus_chan_printf(chan, "invalid hlen %u\n", pkt.cph_hlen);
+ /* XXX this channel is dead actually. */
+ return (EIO);
+ }
+ if (__predict_false(pkt.cph_hlen > pkt.cph_tlen)) {
+ vmbus_chan_printf(chan, "invalid hlen %u and tlen %u\n",
+ pkt.cph_hlen, pkt.cph_tlen);
+ /* XXX this channel is dead actually. */
+ return (EIO);
+ }
+
+ hlen = VMBUS_CHANPKT_GETLEN(pkt.cph_hlen);
+ dlen = VMBUS_CHANPKT_GETLEN(pkt.cph_tlen) - hlen;
+
+ if (*dlen0 < dlen) {
+ /* Return the size of this packet's data. */
+ *dlen0 = dlen;
+ return (ENOBUFS);
+ }
+
+ *xactid = pkt.cph_xactid;
+ *dlen0 = dlen;
+
+ /* Skip packet header */
+ error = vmbus_rxbr_read(&chan->ch_rxbr, data, dlen, hlen);
+ KASSERT(!error, ("vmbus_rxbr_read failed"));
+
+ return (0);
+}
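The ENOBUFS convention above (the required data size is returned through *dlen0) lends itself to a grow-and-retry pattern; a hedged sketch, where the caller-owned buffer and the use of kernel realloc(9) are assumptions of this example:

/*
 * Illustrative sketch only: receive one packet's data, growing the
 * caller-owned buffer when ENOBUFS reports the required length.
 */
static int
my_recv_one(struct vmbus_channel *chan, void **buf, int *buflen,
    uint64_t *xactid)
{
	int dlen, error;

	dlen = *buflen;
	error = vmbus_chan_recv(chan, *buf, &dlen, xactid);
	if (error == ENOBUFS) {
		/* 'dlen' now holds the packet's data size; grow and retry. */
		*buf = realloc(*buf, dlen, M_DEVBUF, M_WAITOK);
		*buflen = dlen;
		error = vmbus_chan_recv(chan, *buf, &dlen, xactid);
	}
	return (error);	/* may be EAGAIN when the RX bufring is empty */
}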
+
+int
+vmbus_chan_recv_pkt(struct vmbus_channel *chan,
+ struct vmbus_chanpkt_hdr *pkt, int *pktlen0)
+{
+ int error, pktlen, pkt_hlen;
+
+ pkt_hlen = sizeof(*pkt);
+ error = vmbus_rxbr_peek(&chan->ch_rxbr, pkt, pkt_hlen);
+ if (error)
+ return (error);
+
+ if (__predict_false(pkt->cph_hlen < VMBUS_CHANPKT_HLEN_MIN)) {
+ vmbus_chan_printf(chan, "invalid hlen %u\n", pkt->cph_hlen);
+ /* XXX this channel is dead actually. */
+ return (EIO);
+ }
+ if (__predict_false(pkt->cph_hlen > pkt->cph_tlen)) {
+ vmbus_chan_printf(chan, "invalid hlen %u and tlen %u\n",
+ pkt->cph_hlen, pkt->cph_tlen);
+ /* XXX this channel is dead actually. */
+ return (EIO);
+ }
+
+ pktlen = VMBUS_CHANPKT_GETLEN(pkt->cph_tlen);
+ if (*pktlen0 < pktlen) {
+ /* Return the size of this packet. */
+ *pktlen0 = pktlen;
+ return (ENOBUFS);
+ }
+ *pktlen0 = pktlen;
+
+ /*
+ * Skip the fixed-size packet header, which has been filled
+ * by the above vmbus_rxbr_peek().
+ */
+ error = vmbus_rxbr_read(&chan->ch_rxbr, pkt + 1,
+ pktlen - pkt_hlen, pkt_hlen);
+ KASSERT(!error, ("vmbus_rxbr_read failed"));
+
+ return (0);
+}
+
+uint32_t
+vmbus_chan_read_available(struct vmbus_channel *chan)
+{
+ return (vmbus_rxbr_available(&chan->ch_rxbr));
+}
+
+/*
+ * This routine:
+ *	- Advances the channel read index by 'advance' bytes
+ *	- Copies data_len bytes into the buffer pointed to by 'data'
+ * Returns 0 if the operation succeeded, or EAGAIN if it failed.
+ * On failure, the buffer pointed to by 'data' is left intact, and the
+ * channel read index is not advanced at all.
+ */
+int
+vmbus_chan_recv_peek(struct vmbus_channel *chan,
+ void *data, int data_len, uint32_t advance)
+{
+ int error;
+ boolean_t sig_event;
+
+ if (data == NULL || data_len <= 0)
+ return (EINVAL);
+
+ error = vmbus_rxbr_idxadv_peek(&chan->ch_rxbr,
+ data, data_len, advance, &sig_event);
+
+ if (!error && sig_event) {
+ vmbus_chan_signal_rx(chan);
+ }
+
+ return (error);
+}
+
+/*
+ * This routine:
+ *	- Advances the channel read index by 'advance' bytes
+ */
+int
+vmbus_chan_recv_idxadv(struct vmbus_channel *chan, uint32_t advance)
+{
+ int error;
+ boolean_t sig_event;
+
+ if (advance == 0)
+ return (EINVAL);
+
+ error = vmbus_rxbr_idxadv(&chan->ch_rxbr, advance, &sig_event);
+
+ if (!error && sig_event) {
+ vmbus_chan_signal_rx(chan);
+ }
+
+ return (error);
+}
+
+/*
+ * Caller should hold its own lock to serialize the ring buffer
+ * copy.
+ */
+int
+vmbus_chan_recv_peek_call(struct vmbus_channel *chan, int data_len,
+ uint32_t skip, vmbus_br_copy_callback_t cb, void *cbarg)
+{
+ if (!chan || data_len <= 0 || cb == NULL)
+ return (EINVAL);
+
+ return (vmbus_rxbr_peek_call(&chan->ch_rxbr, data_len, skip,
+ cb, cbarg));
+}
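Taken together, the peek and index-advance routines above allow a two-phase receive, as sketched below. The record header layout is a made-up example, and advance == 0 is assumed to behave as a pure peek, per the description of vmbus_chan_recv_peek() above.

/*
 * Illustrative sketch only: inspect a hypothetical record header in
 * place, then consume the whole record once its length is known.
 */
struct my_rec_hdr {
	uint32_t	len;	/* hypothetical total record length */
};

static int
my_recv_record(struct vmbus_channel *chan)
{
	struct my_rec_hdr hdr;
	int error;

	/* Peek the header; the read index is not advanced here. */
	error = vmbus_chan_recv_peek(chan, &hdr, sizeof(hdr), 0);
	if (error)
		return (error);

	/* ... validate hdr.len and copy out the payload ... */

	/* Consume the record by advancing the read index past it. */
	return (vmbus_chan_recv_idxadv(chan, hdr.len));
}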
+
+static void
+vmbus_chan_task(void *xchan, int pending __unused)
+{
+ struct vmbus_channel *chan = xchan;
+ vmbus_chan_callback_t cb = chan->ch_cb;
+ void *cbarg = chan->ch_cbarg;
+
+ KASSERT(chan->ch_poll_intvl == 0,
+ ("chan%u: interrupted in polling mode", chan->ch_id));
+
+ /*
+ * Optimize host to guest signaling by ensuring:
+ * 1. While reading the channel, we disable interrupts from
+ * host.
+ * 2. Ensure that we process all posted messages from the host
+ * before returning from this callback.
+ * 3. Once we return, enable signaling from the host. Once this
+ * state is set we check to see if additional packets are
+ * available to read. In this case we repeat the process.
+ *
+ * NOTE: Interrupt has been disabled in the ISR.
+ */
+ for (;;) {
+ uint32_t left;
+
+ cb(chan, cbarg);
+
+ left = vmbus_rxbr_intr_unmask(&chan->ch_rxbr);
+ if (left == 0) {
+ /* No more data in RX bufring; done */
+ break;
+ }
+ vmbus_rxbr_intr_mask(&chan->ch_rxbr);
+ }
+}
+
+static void
+vmbus_chan_task_nobatch(void *xchan, int pending __unused)
+{
+ struct vmbus_channel *chan = xchan;
+
+ KASSERT(chan->ch_poll_intvl == 0,
+ ("chan%u: interrupted in polling mode", chan->ch_id));
+ chan->ch_cb(chan, chan->ch_cbarg);
+}
+
+static void
+vmbus_chan_poll_timeout(void *xchan)
+{
+ struct vmbus_channel *chan = xchan;
+
+ KASSERT(chan->ch_poll_intvl != 0,
+ ("chan%u: polling timeout in interrupt mode", chan->ch_id));
+ taskqueue_enqueue(chan->ch_tq, &chan->ch_poll_task);
+}
+
+static void
+vmbus_chan_poll_task(void *xchan, int pending __unused)
+{
+ struct vmbus_channel *chan = xchan;
+
+ KASSERT(chan->ch_poll_intvl != 0,
+ ("chan%u: polling in interrupt mode", chan->ch_id));
+ callout_reset_sbt_curcpu(&chan->ch_poll_timeo, chan->ch_poll_intvl, 0,
+ vmbus_chan_poll_timeout, chan, chan->ch_poll_flags);
+ chan->ch_cb(chan, chan->ch_cbarg);
+}
+
+static void
+vmbus_chan_pollcfg_task(void *xarg, int pending __unused)
+{
+ const struct vmbus_chan_pollarg *arg = xarg;
+ struct vmbus_channel *chan = arg->poll_chan;
+ sbintime_t intvl;
+ int poll_flags;
+
+ /*
+ * Save polling interval.
+ */
+ intvl = SBT_1S / arg->poll_hz;
+ if (intvl == 0)
+ intvl = 1;
+ if (intvl == chan->ch_poll_intvl) {
+ /* Nothing changes; done */
+ return;
+ }
+ chan->ch_poll_intvl = intvl;
+
+ /* Adjust callout flags. */
+ poll_flags = C_DIRECT_EXEC;
+ if (arg->poll_hz <= hz)
+ poll_flags |= C_HARDCLOCK;
+ chan->ch_poll_flags = poll_flags;
+
+ /*
+	 * Disconnect this channel from the channel map to make sure that
+	 * the RX bufring interrupt-enable bit cannot be touched, and the
+	 * ISR cannot enqueue this channel's task anymore.  THEN, disable
+	 * the interrupt from the RX bufring (the TX bufring does not
+	 * generate interrupts to the VM).
+ *
+ * NOTE: order is critical.
+ */
+ chan->ch_vmbus->vmbus_chmap[chan->ch_id] = NULL;
+ __compiler_membar();
+ vmbus_rxbr_intr_mask(&chan->ch_rxbr);
+
+ /*
+ * NOTE:
+	 * At this point, this channel's task will not be enqueued by
+	 * the ISR anymore; time to cancel the pending one.
+ */
+ taskqueue_cancel(chan->ch_tq, &chan->ch_task, NULL);
+
+ /* Kick start! */
+ taskqueue_enqueue(chan->ch_tq, &chan->ch_poll_task);
+}
+
+static bool
+vmbus_chan_poll_cancel_intq(struct vmbus_channel *chan)
+{
+
+ if (chan->ch_poll_intvl == 0) {
+ /* Not enabled. */
+ return (false);
+ }
+
+ /*
+ * Stop polling callout, so that channel polling task
+ * will not be enqueued anymore.
+ */
+ callout_drain(&chan->ch_poll_timeo);
+
+ /*
+ * Disable polling by resetting polling interval.
+ *
+ * NOTE:
+	 * The polling interval MUST be reset only after the
+	 * callout has been drained; mainly to keep the proper
+	 * assertions in place.
+ */
+ chan->ch_poll_intvl = 0;
+
+ /*
+ * NOTE:
+	 * At this point, this channel's polling task will not be
+	 * enqueued by the callout anymore; time to cancel the
+ * pending one.
+ */
+ taskqueue_cancel(chan->ch_tq, &chan->ch_poll_task, NULL);
+
+ /* Polling was enabled. */
+ return (true);
+}
+
+static void
+vmbus_chan_polldis_task(void *xchan, int pending __unused)
+{
+ struct vmbus_channel *chan = xchan;
+
+ if (!vmbus_chan_poll_cancel_intq(chan)) {
+ /* Already disabled; done. */
+ return;
+ }
+
+ /*
+	 * Plug this channel back into the channel map and unmask
+ * the RX bufring interrupt.
+ */
+ chan->ch_vmbus->vmbus_chmap[chan->ch_id] = chan;
+ __compiler_membar();
+ vmbus_rxbr_intr_unmask(&chan->ch_rxbr);
+
+ /*
+ * Kick start the interrupt task, just in case unmasking
+	 * the interrupt races the ISR.
+ */
+ taskqueue_enqueue(chan->ch_tq, &chan->ch_task);
+}
+
+static __inline void
+vmbus_event_flags_proc(struct vmbus_softc *sc, volatile u_long *event_flags,
+ int flag_cnt)
+{
+ int f;
+
+ for (f = 0; f < flag_cnt; ++f) {
+ uint32_t chid_base;
+ u_long flags;
+ int chid_ofs;
+
+ if (event_flags[f] == 0)
+ continue;
+
+ flags = atomic_swap_long(&event_flags[f], 0);
+ chid_base = f << VMBUS_EVTFLAG_SHIFT;
+
+ while ((chid_ofs = ffsl(flags)) != 0) {
+ struct vmbus_channel *chan;
+
+ --chid_ofs; /* NOTE: ffsl is 1-based */
+ flags &= ~(1UL << chid_ofs);
+
+ chan = sc->vmbus_chmap[chid_base + chid_ofs];
+ if (__predict_false(chan == NULL)) {
+ /* Channel is closed. */
+ continue;
+ }
+ __compiler_membar();
+
+ if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD)
+ vmbus_rxbr_intr_mask(&chan->ch_rxbr);
+ taskqueue_enqueue(chan->ch_tq, &chan->ch_task);
+ }
+ }
+}
+
+void
+vmbus_event_proc(struct vmbus_softc *sc, int cpu)
+{
+ struct vmbus_evtflags *eventf;
+
+ /*
+	 * On hosts running Win8 or above, the event page can be checked directly
+ * to get the id of the channel that has the pending interrupt.
+ */
+ eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE;
+ vmbus_event_flags_proc(sc, eventf->evt_flags,
+ VMBUS_PCPU_GET(sc, event_flags_cnt, cpu));
+}
+
+void
+vmbus_event_proc_compat(struct vmbus_softc *sc, int cpu)
+{
+ struct vmbus_evtflags *eventf;
+
+ eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE;
+ if (atomic_testandclear_long(&eventf->evt_flags[0], 0)) {
+ vmbus_event_flags_proc(sc, sc->vmbus_rx_evtflags,
+ VMBUS_CHAN_MAX_COMPAT >> VMBUS_EVTFLAG_SHIFT);
+ }
+}
+
+static void
+vmbus_chan_update_evtflagcnt(struct vmbus_softc *sc,
+ const struct vmbus_channel *chan)
+{
+ volatile int *flag_cnt_ptr;
+ int flag_cnt;
+
+ flag_cnt = (chan->ch_id / VMBUS_EVTFLAG_LEN) + 1;
+ flag_cnt_ptr = VMBUS_PCPU_PTR(sc, event_flags_cnt, chan->ch_cpuid);
+
+ for (;;) {
+ int old_flag_cnt;
+
+ old_flag_cnt = *flag_cnt_ptr;
+ if (old_flag_cnt >= flag_cnt)
+ break;
+ if (atomic_cmpset_int(flag_cnt_ptr, old_flag_cnt, flag_cnt)) {
+ if (bootverbose) {
+ vmbus_chan_printf(chan,
+ "chan%u update cpu%d flag_cnt to %d\n",
+ chan->ch_id, chan->ch_cpuid, flag_cnt);
+ }
+ break;
+ }
+ }
+}
+
+static struct vmbus_channel *
+vmbus_chan_alloc(struct vmbus_softc *sc)
+{
+ struct vmbus_channel *chan;
+
+ chan = malloc(sizeof(*chan), M_DEVBUF, M_WAITOK | M_ZERO);
+
+ chan->ch_monprm = hyperv_dmamem_alloc(bus_get_dma_tag(sc->vmbus_dev),
+ HYPERCALL_PARAM_ALIGN, 0, sizeof(struct hyperv_mon_param),
+ &chan->ch_monprm_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO);
+ if (chan->ch_monprm == NULL) {
+ device_printf(sc->vmbus_dev, "monprm alloc failed\n");
+ free(chan, M_DEVBUF);
+ return NULL;
+ }
+
+ chan->ch_refs = 1;
+ chan->ch_vmbus = sc;
+ mtx_init(&chan->ch_subchan_lock, "vmbus subchan", NULL, MTX_DEF);
+ sx_init(&chan->ch_orphan_lock, "vmbus chorphan");
+ TAILQ_INIT(&chan->ch_subchans);
+ vmbus_rxbr_init(&chan->ch_rxbr);
+ vmbus_txbr_init(&chan->ch_txbr);
+
+ TASK_INIT(&chan->ch_poll_task, 0, vmbus_chan_poll_task, chan);
+ callout_init(&chan->ch_poll_timeo, 1);
+
+ return chan;
+}
+
+static void
+vmbus_chan_free(struct vmbus_channel *chan)
+{
+
+ KASSERT(TAILQ_EMPTY(&chan->ch_subchans) && chan->ch_subchan_cnt == 0,
+ ("still owns sub-channels"));
+ KASSERT((chan->ch_stflags &
+ (VMBUS_CHAN_ST_OPENED |
+ VMBUS_CHAN_ST_ONPRIL |
+ VMBUS_CHAN_ST_ONSUBL |
+ VMBUS_CHAN_ST_ONLIST)) == 0, ("free busy channel"));
+ KASSERT(chan->ch_orphan_xact == NULL,
+ ("still has orphan xact installed"));
+ KASSERT(chan->ch_refs == 0, ("chan%u: invalid refcnt %d",
+ chan->ch_id, chan->ch_refs));
+ KASSERT(chan->ch_poll_intvl == 0, ("chan%u: polling is activated",
+ chan->ch_id));
+
+ hyperv_dmamem_free(&chan->ch_monprm_dma, chan->ch_monprm);
+ mtx_destroy(&chan->ch_subchan_lock);
+ sx_destroy(&chan->ch_orphan_lock);
+ vmbus_rxbr_deinit(&chan->ch_rxbr);
+ vmbus_txbr_deinit(&chan->ch_txbr);
+ free(chan, M_DEVBUF);
+}
+
+static int
+vmbus_chan_add(struct vmbus_channel *newchan)
+{
+ struct vmbus_softc *sc = newchan->ch_vmbus;
+ struct vmbus_channel *prichan;
+
+ if (newchan->ch_id == 0) {
+ /*
+ * XXX
+		 * Chan0 is never processed and should not be offered;
+ * skip it.
+ */
+ device_printf(sc->vmbus_dev, "got chan0 offer, discard\n");
+ return EINVAL;
+ } else if (newchan->ch_id >= VMBUS_CHAN_MAX) {
+ device_printf(sc->vmbus_dev, "invalid chan%u offer\n",
+ newchan->ch_id);
+ return EINVAL;
+ }
+
+ mtx_lock(&sc->vmbus_prichan_lock);
+ TAILQ_FOREACH(prichan, &sc->vmbus_prichans, ch_prilink) {
+ /*
+ * Sub-channel will have the same type GUID and instance
+ * GUID as its primary channel.
+ */
+ if (memcmp(&prichan->ch_guid_type, &newchan->ch_guid_type,
+ sizeof(struct hyperv_guid)) == 0 &&
+ memcmp(&prichan->ch_guid_inst, &newchan->ch_guid_inst,
+ sizeof(struct hyperv_guid)) == 0)
+ break;
+ }
+ if (VMBUS_CHAN_ISPRIMARY(newchan)) {
+ if (prichan == NULL) {
+ /* Install the new primary channel */
+ vmbus_chan_ins_prilist(sc, newchan);
+ mtx_unlock(&sc->vmbus_prichan_lock);
+ goto done;
+ } else {
+ mtx_unlock(&sc->vmbus_prichan_lock);
+ device_printf(sc->vmbus_dev,
+ "duplicated primary chan%u\n", newchan->ch_id);
+ return EINVAL;
+ }
+ } else { /* Sub-channel */
+ if (prichan == NULL) {
+ mtx_unlock(&sc->vmbus_prichan_lock);
+ device_printf(sc->vmbus_dev,
+ "no primary chan for chan%u\n", newchan->ch_id);
+ return EINVAL;
+ }
+ /*
+		 * Found the primary channel for this sub-channel;
+		 * move on.
+ *
+ * XXX refcnt prichan
+ */
+ }
+ mtx_unlock(&sc->vmbus_prichan_lock);
+
+ /*
+ * This is a sub-channel; link it with the primary channel.
+ */
+ KASSERT(!VMBUS_CHAN_ISPRIMARY(newchan),
+ ("new channel is not sub-channel"));
+ KASSERT(prichan != NULL, ("no primary channel"));
+
+ /*
+ * Reference count this sub-channel; it will be dereferenced
+ * when this sub-channel is closed.
+ */
+ KASSERT(newchan->ch_refs == 1, ("chan%u: invalid refcnt %d",
+ newchan->ch_id, newchan->ch_refs));
+ atomic_add_int(&newchan->ch_refs, 1);
+
+ newchan->ch_prichan = prichan;
+ newchan->ch_dev = prichan->ch_dev;
+
+ mtx_lock(&prichan->ch_subchan_lock);
+ vmbus_chan_ins_sublist(prichan, newchan);
+ mtx_unlock(&prichan->ch_subchan_lock);
+ /*
+ * Notify anyone that is interested in this sub-channel,
+	 * after this sub-channel is set up.
+ */
+ wakeup(prichan);
+done:
+ /*
+ * Hook this channel up for later revocation.
+ */
+ mtx_lock(&sc->vmbus_chan_lock);
+ vmbus_chan_ins_list(sc, newchan);
+ mtx_unlock(&sc->vmbus_chan_lock);
+
+ if (bootverbose) {
+ vmbus_chan_printf(newchan, "chan%u subidx%u offer\n",
+ newchan->ch_id, newchan->ch_subidx);
+ }
+
+ /* Select default cpu for this channel. */
+ vmbus_chan_cpu_default(newchan);
+
+ return 0;
+}
+
+void
+vmbus_chan_cpu_set(struct vmbus_channel *chan, int cpu)
+{
+ KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu %d", cpu));
+
+ if (chan->ch_vmbus->vmbus_version == VMBUS_VERSION_WS2008 ||
+ chan->ch_vmbus->vmbus_version == VMBUS_VERSION_WIN7) {
+ /* Only cpu0 is supported */
+ cpu = 0;
+ }
+
+ chan->ch_cpuid = cpu;
+ chan->ch_vcpuid = VMBUS_PCPU_GET(chan->ch_vmbus, vcpuid, cpu);
+
+ if (bootverbose) {
+ vmbus_chan_printf(chan,
+ "chan%u assigned to cpu%u [vcpu%u]\n",
+ chan->ch_id, chan->ch_cpuid, chan->ch_vcpuid);
+ }
+}
+
+void
+vmbus_chan_cpu_rr(struct vmbus_channel *chan)
+{
+ static uint32_t vmbus_chan_nextcpu;
+ int cpu;
+
+ cpu = atomic_fetchadd_int(&vmbus_chan_nextcpu, 1) % mp_ncpus;
+ vmbus_chan_cpu_set(chan, cpu);
+}
+
+static void
+vmbus_chan_cpu_default(struct vmbus_channel *chan)
+{
+ /*
+ * By default, pin the channel to cpu0. Devices having
+	 * special channel-cpu mapping requirements should call
+ * vmbus_chan_cpu_{set,rr}().
+ */
+ vmbus_chan_cpu_set(chan, 0);
+}
+
+static void
+vmbus_chan_msgproc_choffer(struct vmbus_softc *sc,
+ const struct vmbus_message *msg)
+{
+ const struct vmbus_chanmsg_choffer *offer;
+ struct vmbus_channel *chan;
+ task_fn_t *detach_fn, *attach_fn;
+ int error;
+
+ offer = (const struct vmbus_chanmsg_choffer *)msg->msg_data;
+
+ chan = vmbus_chan_alloc(sc);
+ if (chan == NULL) {
+ device_printf(sc->vmbus_dev, "allocate chan%u failed\n",
+ offer->chm_chanid);
+ return;
+ }
+
+ chan->ch_id = offer->chm_chanid;
+ chan->ch_subidx = offer->chm_subidx;
+ chan->ch_guid_type = offer->chm_chtype;
+ chan->ch_guid_inst = offer->chm_chinst;
+
+ /* Batch reading is on by default */
+ chan->ch_flags |= VMBUS_CHAN_FLAG_BATCHREAD;
+
+ chan->ch_monprm->mp_connid = VMBUS_CONNID_EVENT;
+ if (sc->vmbus_version != VMBUS_VERSION_WS2008)
+ chan->ch_monprm->mp_connid = offer->chm_connid;
+
+ if (offer->chm_flags1 & VMBUS_CHOFFER_FLAG1_HASMNF) {
+ int trig_idx;
+
+ /*
+		 * Set up the MNF stuff.
+ */
+ chan->ch_txflags |= VMBUS_CHAN_TXF_HASMNF;
+
+ trig_idx = offer->chm_montrig / VMBUS_MONTRIG_LEN;
+ if (trig_idx >= VMBUS_MONTRIGS_MAX)
+ panic("invalid monitor trigger %u", offer->chm_montrig);
+ chan->ch_montrig =
+ &sc->vmbus_mnf2->mnf_trigs[trig_idx].mt_pending;
+
+ chan->ch_montrig_mask =
+ 1 << (offer->chm_montrig % VMBUS_MONTRIG_LEN);
+ }
+
+ if (offer->chm_chflags & VMBUS_CHAN_TLNPI_PROVIDER_OFFER) {
+ /* This is HyperV socket channel */
+ chan->ch_is_hvs = true;
+		/* A non-zero first byte means the host initiated the connection. */
+ chan->ch_hvs_conn_from_host =
+ offer->chm_udata.pipe.user_def[0];
+
+ if (bootverbose) {
+ device_printf(sc->vmbus_dev,
+ "chan%u is hyperv socket channel "
+ "connected %s host\n",
+ chan->ch_id,
+ (chan->ch_hvs_conn_from_host != 0) ?
+ "from" : "to");
+ }
+ } else {
+ chan->ch_is_hvs = false;
+ }
+
+ /*
+ * Setup event flag.
+ */
+ chan->ch_evtflag =
+ &sc->vmbus_tx_evtflags[chan->ch_id >> VMBUS_EVTFLAG_SHIFT];
+ chan->ch_evtflag_mask = 1UL << (chan->ch_id & VMBUS_EVTFLAG_MASK);
+
+ /*
+ * Setup attach and detach tasks.
+ */
+ if (VMBUS_CHAN_ISPRIMARY(chan)) {
+ chan->ch_mgmt_tq = sc->vmbus_devtq;
+ attach_fn = vmbus_prichan_attach_task;
+ detach_fn = vmbus_prichan_detach_task;
+ } else {
+ chan->ch_mgmt_tq = sc->vmbus_subchtq;
+ attach_fn = vmbus_subchan_attach_task;
+ detach_fn = vmbus_subchan_detach_task;
+ }
+ TASK_INIT(&chan->ch_attach_task, 0, attach_fn, chan);
+ TASK_INIT(&chan->ch_detach_task, 0, detach_fn, chan);
+
+ error = vmbus_chan_add(chan);
+ if (error) {
+ device_printf(sc->vmbus_dev, "add chan%u failed: %d\n",
+ chan->ch_id, error);
+ atomic_subtract_int(&chan->ch_refs, 1);
+ vmbus_chan_free(chan);
+ return;
+ }
+ taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_attach_task);
+}
+
+static void
+vmbus_chan_msgproc_chrescind(struct vmbus_softc *sc,
+ const struct vmbus_message *msg)
+{
+ const struct vmbus_chanmsg_chrescind *note;
+ struct vmbus_channel *chan;
+
+ note = (const struct vmbus_chanmsg_chrescind *)msg->msg_data;
+ if (note->chm_chanid > VMBUS_CHAN_MAX) {
+ device_printf(sc->vmbus_dev, "invalid revoked chan%u\n",
+ note->chm_chanid);
+ return;
+ }
+
+ /*
+ * Find and remove the target channel from the channel list.
+ */
+ mtx_lock(&sc->vmbus_chan_lock);
+ TAILQ_FOREACH(chan, &sc->vmbus_chans, ch_link) {
+ if (chan->ch_id == note->chm_chanid)
+ break;
+ }
+ if (chan == NULL) {
+ mtx_unlock(&sc->vmbus_chan_lock);
+ device_printf(sc->vmbus_dev, "chan%u is not offered\n",
+ note->chm_chanid);
+ return;
+ }
+ vmbus_chan_rem_list(sc, chan);
+ mtx_unlock(&sc->vmbus_chan_lock);
+
+ if (VMBUS_CHAN_ISPRIMARY(chan)) {
+ /*
+ * The target channel is a primary channel; remove the
+ * target channel from the primary channel list now,
+ * instead of later, so that it will not be found by
+ * other sub-channel offers, which are processed in
+ * this thread.
+ */
+ mtx_lock(&sc->vmbus_prichan_lock);
+ vmbus_chan_rem_prilist(sc, chan);
+ mtx_unlock(&sc->vmbus_prichan_lock);
+ }
+
+ /*
+ * NOTE:
+ * The following processing order is critical:
+ * Set the REVOKED state flag before orphaning the installed xact.
+ */
+
+ if (atomic_testandset_int(&chan->ch_stflags,
+ VMBUS_CHAN_ST_REVOKED_SHIFT))
+ panic("channel has already been revoked");
+
+ sx_xlock(&chan->ch_orphan_lock);
+ if (chan->ch_orphan_xact != NULL)
+ vmbus_xact_ctx_orphan(chan->ch_orphan_xact);
+ sx_xunlock(&chan->ch_orphan_lock);
+
+ if (bootverbose)
+ vmbus_chan_printf(chan, "chan%u revoked\n", note->chm_chanid);
+ vmbus_chan_detach(chan);
+}
+
+static int
+vmbus_chan_release(struct vmbus_channel *chan)
+{
+ struct vmbus_softc *sc = chan->ch_vmbus;
+ struct vmbus_chanmsg_chfree *req;
+ struct vmbus_msghc *mh;
+ int error;
+
+ mh = vmbus_msghc_get(sc, sizeof(*req));
+ if (mh == NULL) {
+ vmbus_chan_printf(chan,
+ "can not get msg hypercall for chfree(chan%u)\n",
+ chan->ch_id);
+ return (ENXIO);
+ }
+
+ req = vmbus_msghc_dataptr(mh);
+ req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHFREE;
+ req->chm_chanid = chan->ch_id;
+
+ error = vmbus_msghc_exec_noresult(mh);
+ vmbus_msghc_put(sc, mh);
+
+ if (error) {
+ vmbus_chan_printf(chan,
+ "chfree(chan%u) msg hypercall exec failed: %d\n",
+ chan->ch_id, error);
+ } else {
+ if (bootverbose)
+ vmbus_chan_printf(chan, "chan%u freed\n", chan->ch_id);
+ }
+ return (error);
+}
+
+static void
+vmbus_prichan_detach_task(void *xchan, int pending __unused)
+{
+ struct vmbus_channel *chan = xchan;
+
+ KASSERT(VMBUS_CHAN_ISPRIMARY(chan),
+ ("chan%u is not primary channel", chan->ch_id));
+
+ /* Delete and detach the device associated with this channel. */
+ vmbus_delete_child(chan);
+
+ /* Release this channel (back to vmbus). */
+ vmbus_chan_release(chan);
+
+ /* Free this channel's resource. */
+ vmbus_chan_free(chan);
+}
+
+static void
+vmbus_subchan_detach_task(void *xchan, int pending __unused)
+{
+ struct vmbus_channel *chan = xchan;
+ struct vmbus_channel *pri_chan = chan->ch_prichan;
+
+ KASSERT(!VMBUS_CHAN_ISPRIMARY(chan),
+ ("chan%u is primary channel", chan->ch_id));
+
+ /* Release this channel (back to vmbus). */
+ vmbus_chan_release(chan);
+
+ /* Unlink from its primary channel's sub-channel list. */
+ mtx_lock(&pri_chan->ch_subchan_lock);
+ vmbus_chan_rem_sublist(pri_chan, chan);
+ mtx_unlock(&pri_chan->ch_subchan_lock);
+ /* Notify anyone that is waiting for this sub-channel to vanish. */
+ wakeup(pri_chan);
+
+ /* Free this channel's resource. */
+ vmbus_chan_free(chan);
+}
+
+static void
+vmbus_prichan_attach_task(void *xchan, int pending __unused)
+{
+
+ /*
+ * Add device for this primary channel.
+ */
+ vmbus_add_child(xchan);
+}
+
+static void
+vmbus_subchan_attach_task(void *xchan __unused, int pending __unused)
+{
+
+ /* Nothing */
+}
+
+void
+vmbus_chan_destroy_all(struct vmbus_softc *sc)
+{
+
+ /*
+ * Detach all devices and destroy the corresponding primary
+ * channels.
+ */
+ for (;;) {
+ struct vmbus_channel *chan;
+
+ mtx_lock(&sc->vmbus_chan_lock);
+ TAILQ_FOREACH(chan, &sc->vmbus_chans, ch_link) {
+ if (VMBUS_CHAN_ISPRIMARY(chan))
+ break;
+ }
+ if (chan == NULL) {
+ /* No more primary channels; done. */
+ mtx_unlock(&sc->vmbus_chan_lock);
+ break;
+ }
+ vmbus_chan_rem_list(sc, chan);
+ mtx_unlock(&sc->vmbus_chan_lock);
+
+ mtx_lock(&sc->vmbus_prichan_lock);
+ vmbus_chan_rem_prilist(sc, chan);
+ mtx_unlock(&sc->vmbus_prichan_lock);
+
+ taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_detach_task);
+ }
+}
+
+struct vmbus_channel **
+vmbus_subchan_get(struct vmbus_channel *pri_chan, int subchan_cnt)
+{
+ struct vmbus_channel **ret, *chan;
+ int i;
+
+ KASSERT(subchan_cnt > 0, ("invalid sub-channel count %d", subchan_cnt));
+
+ ret = malloc(subchan_cnt * sizeof(struct vmbus_channel *), M_TEMP,
+ M_WAITOK);
+
+ mtx_lock(&pri_chan->ch_subchan_lock);
+
+ while (pri_chan->ch_subchan_cnt < subchan_cnt)
+ mtx_sleep(pri_chan, &pri_chan->ch_subchan_lock, 0, "subch", 0);
+
+ i = 0;
+ TAILQ_FOREACH(chan, &pri_chan->ch_subchans, ch_sublink) {
+ /* TODO: refcnt chan */
+ ret[i] = chan;
+
+ ++i;
+ if (i == subchan_cnt)
+ break;
+ }
+ KASSERT(i == subchan_cnt, ("invalid subchan count %d, should be %d",
+ pri_chan->ch_subchan_cnt, subchan_cnt));
+
+ mtx_unlock(&pri_chan->ch_subchan_lock);
+
+ return ret;
+}
+
+void
+vmbus_subchan_rel(struct vmbus_channel **subchan, int subchan_cnt __unused)
+{
+
+ free(subchan, M_TEMP);
+}
+
+void
+vmbus_subchan_drain(struct vmbus_channel *pri_chan)
+{
+ mtx_lock(&pri_chan->ch_subchan_lock);
+ while (pri_chan->ch_subchan_cnt > 0)
+ mtx_sleep(pri_chan, &pri_chan->ch_subchan_lock, 0, "dsubch", 0);
+ mtx_unlock(&pri_chan->ch_subchan_lock);
+}
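A hedged sketch of the usual multi-queue pattern built on vmbus_subchan_get()/vmbus_subchan_rel(): the driver requests 'nsub' sub-channels through its own device protocol (not shown), waits for the offers, spreads them over CPUs and opens each one. my_chan_callback and the 16-page ring size are the same hypothetical names used in the earlier open sketch.

/*
 * Illustrative sketch only: wait for 'nsub' sub-channel offers, assign
 * CPUs round-robin and open every sub-channel.
 */
static int
my_open_subchans(struct vmbus_channel *prichan, int nsub, void *cbarg)
{
	struct vmbus_channel **subchans;
	int i, error = 0;

	/* Blocks until all 'nsub' sub-channel offers have arrived. */
	subchans = vmbus_subchan_get(prichan, nsub);
	for (i = 0; i < nsub; ++i) {
		/* Pick the owner CPU before opening the sub-channel. */
		vmbus_chan_cpu_rr(subchans[i]);
		error = vmbus_chan_open(subchans[i], 16 * PAGE_SIZE,
		    16 * PAGE_SIZE, NULL, 0, my_chan_callback, cbarg);
		if (error)
			break;
	}
	vmbus_subchan_rel(subchans, nsub);
	return (error);
}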
+
+void
+vmbus_chan_msgproc(struct vmbus_softc *sc, const struct vmbus_message *msg)
+{
+ vmbus_chanmsg_proc_t msg_proc;
+ uint32_t msg_type;
+
+ msg_type = ((const struct vmbus_chanmsg_hdr *)msg->msg_data)->chm_type;
+ KASSERT(msg_type < VMBUS_CHANMSG_TYPE_MAX,
+ ("invalid message type %u", msg_type));
+
+ msg_proc = vmbus_chan_msgprocs[msg_type];
+ if (msg_proc != NULL)
+ msg_proc(sc, msg);
+}
+
+void
+vmbus_chan_set_readbatch(struct vmbus_channel *chan, bool on)
+{
+ if (!on)
+ chan->ch_flags &= ~VMBUS_CHAN_FLAG_BATCHREAD;
+ else
+ chan->ch_flags |= VMBUS_CHAN_FLAG_BATCHREAD;
+}
+
+uint32_t
+vmbus_chan_id(const struct vmbus_channel *chan)
+{
+ return chan->ch_id;
+}
+
+uint32_t
+vmbus_chan_subidx(const struct vmbus_channel *chan)
+{
+ return chan->ch_subidx;
+}
+
+bool
+vmbus_chan_is_primary(const struct vmbus_channel *chan)
+{
+ if (VMBUS_CHAN_ISPRIMARY(chan))
+ return true;
+ else
+ return false;
+}
+
+bool
+vmbus_chan_is_hvs(const struct vmbus_channel *chan)
+{
+ return chan->ch_is_hvs;
+}
+
+bool
+vmbus_chan_is_hvs_conn_from_host(const struct vmbus_channel *chan)
+{
+ KASSERT(vmbus_chan_is_hvs(chan) == true,
+ ("Not a HyperV Socket channel %u", chan->ch_id));
+ if (chan->ch_hvs_conn_from_host != 0)
+ return true;
+ else
+ return false;
+}
+
+struct hyperv_guid *
+vmbus_chan_guid_type(struct vmbus_channel *chan)
+{
+ return &chan->ch_guid_type;
+}
+
+struct hyperv_guid *
+vmbus_chan_guid_inst(struct vmbus_channel *chan)
+{
+ return &chan->ch_guid_inst;
+}
+
+int
+vmbus_chan_prplist_nelem(int br_size, int prpcnt_max, int dlen_max)
+{
+ int elem_size;
+
+ elem_size = __offsetof(struct vmbus_chanpkt_prplist,
+ cp_range[0].gpa_page[prpcnt_max]);
+ elem_size += dlen_max;
+ elem_size = VMBUS_CHANPKT_TOTLEN(elem_size);
+
+ return (vmbus_br_nelem(br_size, elem_size));
+}
+
+bool
+vmbus_chan_tx_empty(const struct vmbus_channel *chan)
+{
+
+ return (vmbus_txbr_empty(&chan->ch_txbr));
+}
+
+bool
+vmbus_chan_rx_empty(const struct vmbus_channel *chan)
+{
+
+ return (vmbus_rxbr_empty(&chan->ch_rxbr));
+}
+
+static int
+vmbus_chan_printf(const struct vmbus_channel *chan, const char *fmt, ...)
+{
+ va_list ap;
+ device_t dev;
+ int retval;
+
+ if (chan->ch_dev == NULL || !device_is_alive(chan->ch_dev))
+ dev = chan->ch_vmbus->vmbus_dev;
+ else
+ dev = chan->ch_dev;
+
+ retval = device_print_prettyname(dev);
+ va_start(ap, fmt);
+ retval += vprintf(fmt, ap);
+ va_end(ap);
+
+ return (retval);
+}
+
+void
+vmbus_chan_run_task(struct vmbus_channel *chan, struct task *task)
+{
+
+ taskqueue_enqueue(chan->ch_tq, task);
+ taskqueue_drain(chan->ch_tq, task);
+}
+
+struct taskqueue *
+vmbus_chan_mgmt_tq(const struct vmbus_channel *chan)
+{
+
+ return (chan->ch_mgmt_tq);
+}
+
+bool
+vmbus_chan_is_revoked(const struct vmbus_channel *chan)
+{
+
+ if (chan->ch_stflags & VMBUS_CHAN_ST_REVOKED)
+ return (true);
+ return (false);
+}
+
+void
+vmbus_chan_set_orphan(struct vmbus_channel *chan, struct vmbus_xact_ctx *xact)
+{
+
+ sx_xlock(&chan->ch_orphan_lock);
+ chan->ch_orphan_xact = xact;
+ sx_xunlock(&chan->ch_orphan_lock);
+}
+
+void
+vmbus_chan_unset_orphan(struct vmbus_channel *chan)
+{
+
+ sx_xlock(&chan->ch_orphan_lock);
+ chan->ch_orphan_xact = NULL;
+ sx_xunlock(&chan->ch_orphan_lock);
+}
+
+const void *
+vmbus_chan_xact_wait(const struct vmbus_channel *chan,
+ struct vmbus_xact *xact, size_t *resp_len, bool can_sleep)
+{
+ const void *ret;
+
+ if (can_sleep)
+ ret = vmbus_xact_wait(xact, resp_len);
+ else
+ ret = vmbus_xact_busywait(xact, resp_len);
+ if (vmbus_chan_is_revoked(chan)) {
+ /*
+		 * This xact was probably interrupted, and the
+		 * interruption can race the reply reception,
+		 * so we have to make sure that there is nothing
+		 * left on the RX bufring, i.e. this xact will
+		 * not be touched, once this function returns.
+ *
+ * Since the hypervisor will not put more data
+ * onto the RX bufring once the channel is revoked,
+ * the following loop will be terminated, once all
+ * data are drained by the driver's channel
+ * callback.
+ */
+ while (!vmbus_chan_rx_empty(chan)) {
+ if (can_sleep)
+ pause("chxact", 1);
+ else
+ DELAY(1000);
+ }
+ }
+ return (ret);
+}
+
+void
+vmbus_chan_poll_enable(struct vmbus_channel *chan, u_int pollhz)
+{
+ struct vmbus_chan_pollarg arg;
+ struct task poll_cfg;
+
+ KASSERT(chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD,
+ ("enable polling on non-batch chan%u", chan->ch_id));
+ KASSERT(pollhz >= VMBUS_CHAN_POLLHZ_MIN &&
+ pollhz <= VMBUS_CHAN_POLLHZ_MAX, ("invalid pollhz %u", pollhz));
+
+ arg.poll_chan = chan;
+ arg.poll_hz = pollhz;
+ TASK_INIT(&poll_cfg, 0, vmbus_chan_pollcfg_task, &arg);
+ vmbus_chan_run_task(chan, &poll_cfg);
+}
+
+void
+vmbus_chan_poll_disable(struct vmbus_channel *chan)
+{
+ struct task poll_dis;
+
+ KASSERT(chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD,
+ ("disable polling on non-batch chan%u", chan->ch_id));
+
+ TASK_INIT(&poll_dis, 0, vmbus_chan_polldis_task, chan);
+ vmbus_chan_run_task(chan, &poll_dis);
+}
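A hedged sketch of the polling pair above: the 10000Hz rate is an arbitrary assumption and must lie within [VMBUS_CHAN_POLLHZ_MIN, VMBUS_CHAN_POLLHZ_MAX], which are defined elsewhere in this tree; the channel must have batch reading enabled.

/*
 * Illustrative sketch only: run a latency-critical burst in polling
 * mode, then return the channel to the interrupt path.
 */
static void
my_poll_window(struct vmbus_channel *chan)
{
	/* The channel callback now runs from the polling callout/task. */
	vmbus_chan_poll_enable(chan, 10000);

	/* ... latency-critical I/O burst ... */

	/* Re-plug the channel into the interrupt path. */
	vmbus_chan_poll_disable(chan);
}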
diff --git a/sys/dev/hyperv/vmbus/vmbus_chanvar.h b/sys/dev/hyperv/vmbus/vmbus_chanvar.h
new file mode 100644
index 000000000000..b20b0119bc04
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_chanvar.h
@@ -0,0 +1,195 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_CHANVAR_H_
+#define _VMBUS_CHANVAR_H_
+
+#include <sys/param.h>
+#include <sys/callout.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/vmbus/vmbus_brvar.h>
+
+struct vmbus_channel {
+ /*
+ * NOTE:
+ * Fields before ch_txbr are only accessed on this channel's
+ * target CPU.
+ */
+ uint32_t ch_flags; /* VMBUS_CHAN_FLAG_ */
+ int ch_poll_flags; /* callout flags */
+
+ /*
+	 * RX bufring; its ring memory immediately follows the TX bufring in ch_bufring.
+ */
+ struct vmbus_rxbr ch_rxbr;
+
+ struct taskqueue *ch_tq;
+ struct task ch_task;
+ struct task ch_poll_task;
+ sbintime_t ch_poll_intvl;
+ struct callout ch_poll_timeo;
+ vmbus_chan_callback_t ch_cb;
+ void *ch_cbarg;
+
+ /*
+ * TX bufring; at the beginning of ch_bufring.
+ *
+ * NOTE:
+	 * Put TX bufring and the following MNF/evtflag on a new
+ * cacheline, since they will be accessed on all CPUs by
+ * locking ch_txbr first.
+ *
+ * XXX
+ * TX bufring and following MNF/evtflags do _not_ fit in
+ * one 64B cacheline.
+ */
+ struct vmbus_txbr ch_txbr __aligned(CACHE_LINE_SIZE);
+ uint32_t ch_txflags; /* VMBUS_CHAN_TXF_ */
+
+ /*
+ * These are based on the vmbus_chanmsg_choffer.chm_montrig.
+ * Save it here for easy access.
+ */
+ uint32_t ch_montrig_mask;/* MNF trig mask */
+ volatile uint32_t *ch_montrig; /* MNF trigger loc. */
+
+ /*
+ * These are based on the vmbus_chanmsg_choffer.chm_chanid.
+ * Save it here for easy access.
+ */
+ u_long ch_evtflag_mask;/* event flag */
+ volatile u_long *ch_evtflag; /* event flag loc. */
+
+ /*
+ * Rarely used fields.
+ */
+
+ struct hyperv_mon_param *ch_monprm;
+ struct hyperv_dma ch_monprm_dma;
+
+ uint32_t ch_id; /* channel id */
+ device_t ch_dev;
+ struct vmbus_softc *ch_vmbus;
+
+ int ch_cpuid; /* owner cpu */
+ /*
+ * Virtual cpuid for ch_cpuid; it is used to communicate cpuid
+ * related information w/ Hyper-V. If MSR_HV_VP_INDEX does not
+ * exist, ch_vcpuid will always be 0 for compatibility.
+ */
+ uint32_t ch_vcpuid;
+
+ /*
+ * If this is a primary channel, ch_subchan* fields
+ * contain sub-channels belonging to this primary
+ * channel.
+ */
+ struct mtx ch_subchan_lock;
+ TAILQ_HEAD(, vmbus_channel) ch_subchans;
+ int ch_subchan_cnt;
+
+ /* If this is a sub-channel */
+ TAILQ_ENTRY(vmbus_channel) ch_sublink; /* sub-channel link */
+ struct vmbus_channel *ch_prichan; /* owner primary chan */
+
+ void *ch_bufring; /* TX+RX bufrings */
+ struct hyperv_dma ch_bufring_dma;
+ uint32_t ch_bufring_gpadl;
+
+ struct task ch_attach_task; /* run in ch_mgmt_tq */
+ struct task ch_detach_task; /* run in ch_mgmt_tq */
+ struct taskqueue *ch_mgmt_tq;
+
+ /* If this is a primary channel */
+ TAILQ_ENTRY(vmbus_channel) ch_prilink; /* primary chan link */
+
+ TAILQ_ENTRY(vmbus_channel) ch_link; /* channel link */
+ uint32_t ch_subidx; /* subchan index */
+ volatile uint32_t ch_stflags; /* atomic-op */
+ /* VMBUS_CHAN_ST_ */
+ struct hyperv_guid ch_guid_type;
+ struct hyperv_guid ch_guid_inst;
+
+ struct sx ch_orphan_lock;
+ struct vmbus_xact_ctx *ch_orphan_xact;
+
+ int ch_refs;
+
+ /*
+	 * These are for HyperV socket channels only
+ */
+ bool ch_is_hvs;
+ uint8_t ch_hvs_conn_from_host;
+
+ struct sysctl_ctx_list ch_sysctl_ctx;
+} __aligned(CACHE_LINE_SIZE);
+
+#define VMBUS_CHAN_ISPRIMARY(chan) ((chan)->ch_subidx == 0)
+
+/*
+ * If this flag is set, this channel's interrupt will be masked in the ISR,
+ * and the RX bufring will be drained before this channel's interrupt is
+ * unmasked.
+ *
+ * This flag is turned on by default. Drivers can turn it off according
+ * to their own requirements.
+ */
+#define VMBUS_CHAN_FLAG_BATCHREAD 0x0002
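+
+/*
+ * Illustrative note (not part of this header): a channel driver that
+ * prefers to process one packet per interrupt can clear batched reading
+ * through the public channel API, e.g. something along the lines of
+ * vmbus_chan_set_readbatch(chan, false) from dev/hyperv/include/vmbus.h;
+ * the exact helper name is an assumption here, not defined by this file.
+ */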
+
+#define VMBUS_CHAN_TXF_HASMNF 0x0001
+
+#define VMBUS_CHAN_ST_OPENED_SHIFT 0
+#define VMBUS_CHAN_ST_ONPRIL_SHIFT 1
+#define VMBUS_CHAN_ST_ONSUBL_SHIFT 2
+#define VMBUS_CHAN_ST_ONLIST_SHIFT 3
+#define VMBUS_CHAN_ST_REVOKED_SHIFT 4 /* sticky */
+#define VMBUS_CHAN_ST_OPENED (1 << VMBUS_CHAN_ST_OPENED_SHIFT)
+#define VMBUS_CHAN_ST_ONPRIL (1 << VMBUS_CHAN_ST_ONPRIL_SHIFT)
+#define VMBUS_CHAN_ST_ONSUBL (1 << VMBUS_CHAN_ST_ONSUBL_SHIFT)
+#define VMBUS_CHAN_ST_ONLIST (1 << VMBUS_CHAN_ST_ONLIST_SHIFT)
+#define VMBUS_CHAN_ST_REVOKED (1 << VMBUS_CHAN_ST_REVOKED_SHIFT)
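+
+/*
+ * Informational: ch_stflags is only changed with atomic bit operations,
+ * which is why both the bit shifts and the derived masks are provided.
+ * A hypothetical open path would do roughly
+ * atomic_testandset_int(&chan->ch_stflags, VMBUS_CHAN_ST_OPENED_SHIFT)
+ * to claim the OPENED bit; the actual state transitions live in
+ * vmbus_chan.c.
+ */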
+
+struct vmbus_softc;
+struct vmbus_message;
+
+void vmbus_event_proc(struct vmbus_softc *, int);
+void vmbus_event_proc_compat(struct vmbus_softc *, int);
+void vmbus_chan_msgproc(struct vmbus_softc *,
+ const struct vmbus_message *);
+void vmbus_chan_destroy_all(struct vmbus_softc *);
+
+#endif /* !_VMBUS_CHANVAR_H_ */
diff --git a/sys/dev/hyperv/vmbus/vmbus_et.c b/sys/dev/hyperv/vmbus/vmbus_et.c
new file mode 100644
index 000000000000..d9ab2a9485e7
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_et.c
@@ -0,0 +1,201 @@
+/*-
+ * Copyright (c) 2015,2016-2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+#include <sys/timeet.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/vmbus/hyperv_reg.h>
+#include <dev/hyperv/vmbus/hyperv_var.h>
+#include <dev/hyperv/vmbus/vmbus_var.h>
+
+#define VMBUS_ET_NAME "hvet"
+
+#define MSR_HV_STIMER0_CFG_SINT \
+ ((((uint64_t)VMBUS_SINT_TIMER) << MSR_HV_STIMER_CFG_SINT_SHIFT) & \
+ MSR_HV_STIMER_CFG_SINT_MASK)
+
+/*
+ * Additionally required feature:
+ * - SynIC is needed for interrupt generation.
+ */
+#define CPUID_HV_ET_MASK (CPUID_HV_MSR_SYNIC | \
+ CPUID_HV_MSR_SYNTIMER)
+
+static void vmbus_et_identify(driver_t *, device_t);
+static int vmbus_et_probe(device_t);
+static int vmbus_et_attach(device_t);
+static int vmbus_et_detach(device_t);
+static int vmbus_et_start(struct eventtimer *, sbintime_t,
+ sbintime_t);
+
+static struct eventtimer vmbus_et;
+
+static device_method_t vmbus_et_methods[] = {
+ DEVMETHOD(device_identify, vmbus_et_identify),
+ DEVMETHOD(device_probe, vmbus_et_probe),
+ DEVMETHOD(device_attach, vmbus_et_attach),
+ DEVMETHOD(device_detach, vmbus_et_detach),
+
+ DEVMETHOD_END
+};
+
+static driver_t vmbus_et_driver = {
+ VMBUS_ET_NAME,
+ vmbus_et_methods,
+ 0
+};
+
+static devclass_t vmbus_et_devclass;
+
+DRIVER_MODULE(hv_et, vmbus, vmbus_et_driver, vmbus_et_devclass, NULL, NULL);
+MODULE_VERSION(hv_et, 1);
+
+static __inline uint64_t
+hyperv_sbintime2count(sbintime_t time)
+{
+ struct timespec val;
+
+ val = sbttots(time);
+ return (val.tv_sec * HYPERV_TIMER_FREQ) +
+ (val.tv_nsec / HYPERV_TIMER_NS_FACTOR);
+}
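+
+/*
+ * Worked example (informational, assuming the usual Hyper-V 100ns tick,
+ * i.e. HYPERV_TIMER_FREQ == 10000000 and HYPERV_TIMER_NS_FACTOR == 100):
+ * an sbintime of 1ms converts to 0 * 10000000 + 1000000 / 100 = 10000
+ * timer ticks, which vmbus_et_start() below adds to the current count.
+ */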
+
+static int
+vmbus_et_start(struct eventtimer *et __unused, sbintime_t first,
+ sbintime_t period __unused)
+{
+ uint64_t current;
+
+ current = hyperv_tc64();
+ current += hyperv_sbintime2count(first);
+ wrmsr(MSR_HV_STIMER0_COUNT, current);
+
+ return (0);
+}
+
+void
+vmbus_et_intr(struct trapframe *frame)
+{
+ struct trapframe *oldframe;
+ struct thread *td;
+
+ if (vmbus_et.et_active) {
+ td = curthread;
+ td->td_intr_nesting_level++;
+ oldframe = td->td_intr_frame;
+ td->td_intr_frame = frame;
+ vmbus_et.et_event_cb(&vmbus_et, vmbus_et.et_arg);
+ td->td_intr_frame = oldframe;
+ td->td_intr_nesting_level--;
+ }
+}
+
+static void
+vmbus_et_identify(driver_t *driver, device_t parent)
+{
+ if (device_get_unit(parent) != 0 ||
+ device_find_child(parent, VMBUS_ET_NAME, -1) != NULL ||
+ (hyperv_features & CPUID_HV_ET_MASK) != CPUID_HV_ET_MASK ||
+ hyperv_tc64 == NULL)
+ return;
+
+ device_add_child(parent, VMBUS_ET_NAME, -1);
+}
+
+static int
+vmbus_et_probe(device_t dev)
+{
+ if (resource_disabled(VMBUS_ET_NAME, 0))
+ return (ENXIO);
+
+ device_set_desc(dev, "Hyper-V event timer");
+
+ return (BUS_PROBE_NOWILDCARD);
+}
+
+static void
+vmbus_et_config(void *arg __unused)
+{
+ /*
+ * Make sure that STIMER0 is really disabled before writing
+ * to STIMER0_CONFIG.
+ *
+ * "Writing to the configuration register of a timer that
+ * is already enabled may result in undefined behaviour."
+ */
+ for (;;) {
+ uint64_t val;
+
+ /* Stop counting, and this also implies disabling STIMER0 */
+ wrmsr(MSR_HV_STIMER0_COUNT, 0);
+
+ val = rdmsr(MSR_HV_STIMER0_CONFIG);
+ if ((val & MSR_HV_STIMER_CFG_ENABLE) == 0)
+ break;
+ cpu_spinwait();
+ }
+ wrmsr(MSR_HV_STIMER0_CONFIG,
+ MSR_HV_STIMER_CFG_AUTOEN | MSR_HV_STIMER0_CFG_SINT);
+}
+
+static int
+vmbus_et_attach(device_t dev)
+{
+ /* TODO: use independent IDT vector */
+
+ vmbus_et.et_name = "Hyper-V";
+ vmbus_et.et_flags = ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU;
+ vmbus_et.et_quality = 1000;
+ vmbus_et.et_frequency = HYPERV_TIMER_FREQ;
+ vmbus_et.et_min_period = (0x00000001ULL << 32) / HYPERV_TIMER_FREQ;
+ vmbus_et.et_max_period = (0xfffffffeULL << 32) / HYPERV_TIMER_FREQ;
+ vmbus_et.et_start = vmbus_et_start;
+
+ /*
+ * Delay a bit to make sure that hyperv_tc64 will not return 0,
+ * since writing 0 to STIMER0_COUNT will disable STIMER0.
+ */
+ DELAY(100);
+ smp_rendezvous(NULL, vmbus_et_config, NULL, NULL);
+
+ return (et_register(&vmbus_et));
+}
+
+static int
+vmbus_et_detach(device_t dev)
+{
+ return (et_deregister(&vmbus_et));
+}
diff --git a/sys/dev/hyperv/vmbus/vmbus_if.m b/sys/dev/hyperv/vmbus/vmbus_if.m
new file mode 100644
index 000000000000..3b41c5148fdf
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_if.m
@@ -0,0 +1,60 @@
+#-
+# Copyright (c) 2016 Microsoft Corp.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice unmodified, this list of conditions, and the following
+# disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/param.h>
+#include <sys/bus.h>
+
+INTERFACE vmbus;
+
+HEADER {
+ struct hyperv_guid;
+ struct taskqueue;
+};
+
+METHOD uint32_t get_version {
+ device_t bus;
+ device_t dev;
+};
+
+METHOD int probe_guid {
+ device_t bus;
+ device_t dev;
+ const struct hyperv_guid *guid;
+};
+
+METHOD uint32_t get_vcpu_id {
+ device_t bus;
+ device_t dev;
+ int cpu;
+};
+
+METHOD struct taskqueue * get_event_taskq {
+ device_t bus;
+ device_t dev;
+ int cpu;
+};
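+
+# Note (informational): the kobj glue generated from this interface
+# (vmbus_if.h / vmbus_if.c) exposes the methods above to child drivers
+# as VMBUS_GET_VERSION(), VMBUS_PROBE_GUID(), VMBUS_GET_VCPU_ID() and
+# VMBUS_GET_EVENT_TASKQ(), typically invoked with the parent vmbus
+# device as the first argument.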
diff --git a/sys/dev/hyperv/vmbus/vmbus_reg.h b/sys/dev/hyperv/vmbus/vmbus_reg.h
new file mode 100644
index 000000000000..80d197c48ee4
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_reg.h
@@ -0,0 +1,427 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_REG_H_
+#define _VMBUS_REG_H_
+
+#include <sys/param.h>
+#include <dev/hyperv/include/hyperv.h> /* XXX for hyperv_guid */
+#include <dev/hyperv/include/vmbus.h>
+#include <dev/hyperv/vmbus/hyperv_reg.h>
+
+/*
+ * Hyper-V SynIC message format.
+ */
+
+#define VMBUS_MSG_DSIZE_MAX 240
+#define VMBUS_MSG_SIZE 256
+
+struct vmbus_message {
+ uint32_t msg_type; /* HYPERV_MSGTYPE_ */
+ uint8_t msg_dsize; /* data size */
+ uint8_t msg_flags; /* VMBUS_MSGFLAG_ */
+ uint16_t msg_rsvd;
+ uint64_t msg_id;
+ uint8_t msg_data[VMBUS_MSG_DSIZE_MAX];
+} __packed;
+CTASSERT(sizeof(struct vmbus_message) == VMBUS_MSG_SIZE);
+
+#define VMBUS_MSGFLAG_PENDING 0x01
+
+/*
+ * Hyper-V SynIC event flags
+ */
+
+#ifdef __LP64__
+#define VMBUS_EVTFLAGS_MAX 32
+#define VMBUS_EVTFLAG_SHIFT 6
+#else
+#define VMBUS_EVTFLAGS_MAX 64
+#define VMBUS_EVTFLAG_SHIFT 5
+#endif
+#define VMBUS_EVTFLAG_LEN (1 << VMBUS_EVTFLAG_SHIFT)
+#define VMBUS_EVTFLAG_MASK (VMBUS_EVTFLAG_LEN - 1)
+#define VMBUS_EVTFLAGS_SIZE 256
+
+struct vmbus_evtflags {
+ u_long evt_flags[VMBUS_EVTFLAGS_MAX];
+} __packed;
+CTASSERT(sizeof(struct vmbus_evtflags) == VMBUS_EVTFLAGS_SIZE);
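+
+/*
+ * Layout arithmetic (informational): the event-flag area always spans
+ * VMBUS_EVTFLAGS_SIZE (256) bytes, i.e. 2048 flag bits. On LP64 that is
+ * 32 u_longs of 64 bits each (shift 6); on 32-bit it is 64 u_longs of
+ * 32 bits each (shift 5). A channel id therefore selects word
+ * (chanid >> VMBUS_EVTFLAG_SHIFT) and bit (chanid & VMBUS_EVTFLAG_MASK).
+ */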
+
+/*
+ * Hyper-V Monitor Notification Facility
+ */
+
+struct vmbus_mon_trig {
+ uint32_t mt_pending;
+ uint32_t mt_armed;
+} __packed;
+
+#define VMBUS_MONTRIGS_MAX 4
+#define VMBUS_MONTRIG_LEN 32
+
+struct vmbus_mnf {
+ uint32_t mnf_state;
+ uint32_t mnf_rsvd1;
+
+ struct vmbus_mon_trig mnf_trigs[VMBUS_MONTRIGS_MAX];
+ uint8_t mnf_rsvd2[536];
+
+ uint16_t mnf_lat[VMBUS_MONTRIGS_MAX][VMBUS_MONTRIG_LEN];
+ uint8_t mnf_rsvd3[256];
+
+ struct hyperv_mon_param
+ mnf_param[VMBUS_MONTRIGS_MAX][VMBUS_MONTRIG_LEN];
+ uint8_t mnf_rsvd4[1984];
+} __packed;
+CTASSERT(sizeof(struct vmbus_mnf) == PAGE_SIZE);
+
+/*
+ * Buffer ring
+ */
+struct vmbus_bufring {
+ /*
+	 * If br_windex == br_rindex, this bufring is empty; this
+	 * means we must _not_ write data to the bufring if the
+	 * write would make br_windex equal to br_rindex.
+ */
+ volatile uint32_t br_windex;
+ volatile uint32_t br_rindex;
+
+ /*
+ * Interrupt mask {0,1}
+ *
+	 * For the TX bufring, the host sets this to 1 while it is
+	 * processing the TX bufring, so that we can safely skip the TX
+	 * event notification to the host.
+	 *
+	 * For the RX bufring, once we set this to 1, the host will not
+	 * dispatch further interrupts to us, even if there is data
+	 * pending on the RX bufring. This effectively disables the
+	 * interrupt of the channel to which this RX bufring is attached.
+ */
+ volatile uint32_t br_imask;
+
+ /*
+ * WS2012/Win8 and later versions of Hyper-V implement interrupt
+ * driven flow management. The feature bit feat_pending_snd_sz
+ * is set by the host on the host->guest buffer ring, and by the
+ * guest on the guest->host buffer ring.
+ *
+ * The meaning of the feature bit is a bit complex in that it has
+ * semantics that apply to both buffer rings. If the guest sets
+ * the feature bit in the guest->host buffer ring, the guest is
+ * telling the host that:
+ * 1) It will set the br_pending_snd_sz field in the guest->host buffer
+ * ring when it is waiting for space to become available, and
+ * 2) It will read the pending_send_sz field in the host->guest
+ * ring buffer and interrupt the host when it frees enough space
+ *
+ * Similarly, if the host sets the feature bit in the host->guest
+ * ring buffer, the host is telling the guest that:
+ * 1) It will set the pending_send_sz field in the host->guest ring
+ * buffer when it is waiting for space to become available, and
+ * 2) It will read the pending_send_sz field in the guest->host
+ * ring buffer and interrupt the guest when it frees enough space
+ *
+ * If either the guest or host does not set the feature bit that it
+ * owns, that guest or host must do polling if it encounters a full
+ * ring buffer, and not signal the other end with an interrupt.
+ */
+ volatile uint32_t br_pending_snd_sz;
+ uint32_t br_rsvd1[12];
+ union {
+ struct {
+ uint32_t feat_pending_snd_sz:1;
+ };
+ uint32_t value;
+ } br_feature_bits;
+
+ /* Padding to PAGE_SIZE */
+ uint8_t br_rsvd2[4020];
+
+ /*
+	 * Total guest-to-host interrupt count
+	 * - For the rx ring, this counts the times the guest signals the
+	 *   host when this rx ring changes from full to not full.
+	 *
+	 * - For the tx ring, this counts the times the guest signals the
+	 *   host when this tx ring changes from empty to non-empty.
+ */
+ uint64_t br_g2h_intr_cnt;
+
+ uint8_t br_data[];
+} __packed;
+CTASSERT(sizeof(struct vmbus_bufring) == PAGE_SIZE);
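+
+/*
+ * Flow-management sketch (informational, not part of the structure
+ * definition above): with feat_pending_snd_sz negotiated, a guest-side
+ * writer that finds the guest->host ring full records how much space it
+ * needs, roughly
+ *
+ *	br->br_pending_snd_sz = needed;		(then wait / retry)
+ *
+ * and the reader on the other side compares the space it has freed
+ * against that value before deciding whether signaling an interrupt is
+ * worthwhile. The in-tree read/write and signaling logic lives in
+ * vmbus_br.c and vmbus_chan.c.
+ */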
+
+/*
+ * Channel
+ */
+
+#define VMBUS_CHAN_MAX_COMPAT 256
+#define VMBUS_CHAN_MAX (VMBUS_EVTFLAG_LEN * VMBUS_EVTFLAGS_MAX)
+
+/*
+ * Channel packets
+ */
+
+#define VMBUS_CHANPKT_SIZE_ALIGN (1 << VMBUS_CHANPKT_SIZE_SHIFT)
+
+#define VMBUS_CHANPKT_SETLEN(pktlen, len) \
+do { \
+ (pktlen) = (len) >> VMBUS_CHANPKT_SIZE_SHIFT; \
+} while (0)
+
+#define VMBUS_CHANPKT_TOTLEN(tlen) \
+ roundup2((tlen), VMBUS_CHANPKT_SIZE_ALIGN)
+
+#define VMBUS_CHANPKT_HLEN_MIN \
+ (sizeof(struct vmbus_chanpkt_hdr) >> VMBUS_CHANPKT_SIZE_SHIFT)
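+
+/*
+ * Example (informational, assuming VMBUS_CHANPKT_SIZE_SHIFT == 3 from
+ * dev/hyperv/include/vmbus.h, i.e. 8-byte length units): the 16-byte
+ * vmbus_chanpkt_hdr makes VMBUS_CHANPKT_HLEN_MIN equal to 2, and
+ * VMBUS_CHANPKT_TOTLEN(21) rounds a 21-byte packet up to 24 bytes.
+ */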
+
+struct vmbus_chanpkt {
+ struct vmbus_chanpkt_hdr cp_hdr;
+} __packed;
+
+struct vmbus_chanpkt_sglist {
+ struct vmbus_chanpkt_hdr cp_hdr;
+ uint32_t cp_rsvd;
+ uint32_t cp_gpa_cnt;
+ struct vmbus_gpa cp_gpa[];
+} __packed;
+
+struct vmbus_chanpkt_prplist {
+ struct vmbus_chanpkt_hdr cp_hdr;
+ uint32_t cp_rsvd;
+ uint32_t cp_range_cnt;
+ struct vmbus_gpa_range cp_range[];
+} __packed;
+
+/*
+ * Channel messages
+ * - Embedded in vmbus_message.msg_data, e.g. response and notification.
+ * - Embedded in hypercall_postmsg_in.hc_data, e.g. request.
+ */
+
+#define VMBUS_CHANMSG_TYPE_CHOFFER 1 /* NOTE */
+#define VMBUS_CHANMSG_TYPE_CHRESCIND 2 /* NOTE */
+#define VMBUS_CHANMSG_TYPE_CHREQUEST 3 /* REQ */
+#define VMBUS_CHANMSG_TYPE_CHOFFER_DONE 4 /* NOTE */
+#define VMBUS_CHANMSG_TYPE_CHOPEN 5 /* REQ */
+#define VMBUS_CHANMSG_TYPE_CHOPEN_RESP 6 /* RESP */
+#define VMBUS_CHANMSG_TYPE_CHCLOSE 7 /* REQ */
+#define VMBUS_CHANMSG_TYPE_GPADL_CONN 8 /* REQ */
+#define VMBUS_CHANMSG_TYPE_GPADL_SUBCONN 9 /* REQ */
+#define VMBUS_CHANMSG_TYPE_GPADL_CONNRESP 10 /* RESP */
+#define VMBUS_CHANMSG_TYPE_GPADL_DISCONN 11 /* REQ */
+#define VMBUS_CHANMSG_TYPE_GPADL_DISCONNRESP 12 /* RESP */
+#define VMBUS_CHANMSG_TYPE_CHFREE 13 /* REQ */
+#define VMBUS_CHANMSG_TYPE_CONNECT 14 /* REQ */
+#define VMBUS_CHANMSG_TYPE_CONNECT_RESP 15 /* RESP */
+#define VMBUS_CHANMSG_TYPE_DISCONNECT 16 /* REQ */
+#define VMBUS_CHANMSG_TYPE_17 17
+#define VMBUS_CHANMSG_TYPE_18 18
+#define VMBUS_CHANMSG_TYPE_19 19
+#define VMBUS_CHANMSG_TYPE_20 20
+#define VMBUS_CHANMSG_TYPE_TL_CONN 21 /* REQ */
+#define VMBUS_CHANMSG_TYPE_22 22
+#define VMBUS_CHANMSG_TYPE_TL_RESULT 23 /* RESP */
+#define VMBUS_CHANMSG_TYPE_MAX 24
+
+struct vmbus_chanmsg_hdr {
+ uint32_t chm_type; /* VMBUS_CHANMSG_TYPE_ */
+ uint32_t chm_rsvd;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_CONNECT */
+struct vmbus_chanmsg_connect {
+ struct vmbus_chanmsg_hdr chm_hdr;
+ uint32_t chm_ver;
+ uint32_t chm_rsvd;
+ uint64_t chm_evtflags;
+ uint64_t chm_mnf1;
+ uint64_t chm_mnf2;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_CONNECT_RESP */
+struct vmbus_chanmsg_connect_resp {
+ struct vmbus_chanmsg_hdr chm_hdr;
+ uint8_t chm_done;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_CHREQUEST */
+struct vmbus_chanmsg_chrequest {
+ struct vmbus_chanmsg_hdr chm_hdr;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_DISCONNECT */
+struct vmbus_chanmsg_disconnect {
+ struct vmbus_chanmsg_hdr chm_hdr;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_TL_CONN */
+/* Hyper-V socket guest connect request */
+struct vmbus_chanmsg_tl_connect {
+ struct vmbus_chanmsg_hdr chm_hdr;
+ struct hyperv_guid guest_endpoint_id;
+ struct hyperv_guid host_service_id;
+} __packed;
+
+
+/* VMBUS_CHANMSG_TYPE_CHOPEN */
+struct vmbus_chanmsg_chopen {
+ struct vmbus_chanmsg_hdr chm_hdr;
+ uint32_t chm_chanid;
+ uint32_t chm_openid;
+ uint32_t chm_gpadl;
+ uint32_t chm_vcpuid;
+ uint32_t chm_txbr_pgcnt;
+#define VMBUS_CHANMSG_CHOPEN_UDATA_SIZE 120
+ uint8_t chm_udata[VMBUS_CHANMSG_CHOPEN_UDATA_SIZE];
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_CHOPEN_RESP */
+struct vmbus_chanmsg_chopen_resp {
+ struct vmbus_chanmsg_hdr chm_hdr;
+ uint32_t chm_chanid;
+ uint32_t chm_openid;
+ uint32_t chm_status;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_GPADL_CONN */
+struct vmbus_chanmsg_gpadl_conn {
+ struct vmbus_chanmsg_hdr chm_hdr;
+ uint32_t chm_chanid;
+ uint32_t chm_gpadl;
+ uint16_t chm_range_len;
+ uint16_t chm_range_cnt;
+ struct vmbus_gpa_range chm_range;
+} __packed;
+
+#define VMBUS_CHANMSG_GPADL_CONN_PGMAX 26
+CTASSERT(__offsetof(struct vmbus_chanmsg_gpadl_conn,
+ chm_range.gpa_page[VMBUS_CHANMSG_GPADL_CONN_PGMAX]) <=
+ HYPERCALL_POSTMSGIN_DSIZE_MAX);
+
+/* VMBUS_CHANMSG_TYPE_GPADL_SUBCONN */
+struct vmbus_chanmsg_gpadl_subconn {
+ struct vmbus_chanmsg_hdr chm_hdr;
+ uint32_t chm_msgno;
+ uint32_t chm_gpadl;
+ uint64_t chm_gpa_page[];
+} __packed;
+
+#define VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX 28
+CTASSERT(__offsetof(struct vmbus_chanmsg_gpadl_subconn,
+ chm_gpa_page[VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX]) <=
+ HYPERCALL_POSTMSGIN_DSIZE_MAX);
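+
+/*
+ * Sizing note (informational, assuming HYPERCALL_POSTMSGIN_DSIZE_MAX is
+ * 240 bytes): the initial GPADL_CONN message carries its fixed header
+ * plus a vmbus_gpa_range, leaving room for at most 26 64-bit page
+ * numbers, while each GPADL_SUBCONN follow-up has a smaller header and
+ * fits 28; the CTASSERTs above check both limits.
+ */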
+
+/* VMBUS_CHANMSG_TYPE_GPADL_CONNRESP */
+struct vmbus_chanmsg_gpadl_connresp {
+ struct vmbus_chanmsg_hdr chm_hdr;
+ uint32_t chm_chanid;
+ uint32_t chm_gpadl;
+ uint32_t chm_status;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_CHCLOSE */
+struct vmbus_chanmsg_chclose {
+ struct vmbus_chanmsg_hdr chm_hdr;
+ uint32_t chm_chanid;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_GPADL_DISCONN */
+struct vmbus_chanmsg_gpadl_disconn {
+ struct vmbus_chanmsg_hdr chm_hdr;
+ uint32_t chm_chanid;
+ uint32_t chm_gpadl;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_CHFREE */
+struct vmbus_chanmsg_chfree {
+ struct vmbus_chanmsg_hdr chm_hdr;
+ uint32_t chm_chanid;
+} __packed;
+
+/* VMBUS_CHANMSG_TYPE_CHRESCIND */
+struct vmbus_chanmsg_chrescind {
+ struct vmbus_chanmsg_hdr chm_hdr;
+ uint32_t chm_chanid;
+} __packed;
+
+/* Size of the user defined data buffer for non-pipe offers */
+#define VMBUS_CHANMSG_CHOFFER_UDATA_SIZE 120
+
+/* Size of the user defined data buffer for pipe offers. */
+#define VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE 116
+
+/* VMBUS_CHANMSG_TYPE_CHOFFER */
+struct vmbus_chanmsg_choffer {
+ struct vmbus_chanmsg_hdr chm_hdr;
+ struct hyperv_guid chm_chtype;
+ struct hyperv_guid chm_chinst;
+ uint64_t chm_chlat; /* unit: 100ns */
+ uint32_t chm_chrev;
+ uint32_t chm_svrctx_sz;
+ uint16_t chm_chflags;
+ uint16_t chm_mmio_sz; /* unit: MB */
+
+ union {
+ /* Non-pipes */
+ struct {
+ uint8_t user_def[VMBUS_CHANMSG_CHOFFER_UDATA_SIZE];
+ } std;
+ /*
+ * Pipes:
+		 * For the integrated pipe protocol, which is implemented
+		 * on top of the standard user-defined data. Pipe clients
+		 * have VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE bytes left
+		 * for their own use.
+ */
+ struct {
+ uint32_t pipe_mode;
+ uint8_t
+ user_def[VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE];
+ } pipe;
+ } chm_udata;
+
+ uint16_t chm_subidx;
+ uint16_t chm_rsvd;
+ uint32_t chm_chanid;
+ uint8_t chm_montrig;
+ uint8_t chm_flags1; /* VMBUS_CHOFFER_FLAG1_ */
+ uint16_t chm_flags2;
+ uint32_t chm_connid;
+} __packed;
+CTASSERT(sizeof(struct vmbus_chanmsg_choffer) <= VMBUS_MSG_DSIZE_MAX);
+
+/* Server Flag */
+#define VMBUS_CHAN_TLNPI_PROVIDER_OFFER 0x2000
+
+#define VMBUS_CHOFFER_FLAG1_HASMNF 0x01
+
+#endif /* !_VMBUS_REG_H_ */
diff --git a/sys/dev/hyperv/vmbus/vmbus_res.c b/sys/dev/hyperv/vmbus/vmbus_res.c
new file mode 100644
index 000000000000..fba5a732ca58
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_res.c
@@ -0,0 +1,99 @@
+/*-
+ * Copyright (c) 2017 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+#include <sys/module.h>
+
+#include <contrib/dev/acpica/include/acpi.h>
+#include <dev/acpica/acpivar.h>
+
+#include <dev/hyperv/include/hyperv.h>
+
+#include "acpi_if.h"
+#include "bus_if.h"
+
+static int vmbus_res_probe(device_t);
+static int vmbus_res_attach(device_t);
+static int vmbus_res_detach(device_t);
+
+static device_method_t vmbus_res_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, vmbus_res_probe),
+ DEVMETHOD(device_attach, vmbus_res_attach),
+ DEVMETHOD(device_detach, vmbus_res_detach),
+ DEVMETHOD(device_shutdown, bus_generic_shutdown),
+ DEVMETHOD(device_suspend, bus_generic_suspend),
+ DEVMETHOD(device_resume, bus_generic_resume),
+
+ DEVMETHOD_END
+};
+
+static driver_t vmbus_res_driver = {
+ "vmbus_res",
+ vmbus_res_methods,
+ 1
+};
+
+static devclass_t vmbus_res_devclass;
+
+DRIVER_MODULE(vmbus_res, acpi, vmbus_res_driver, vmbus_res_devclass,
+ NULL, NULL);
+MODULE_DEPEND(vmbus_res, acpi, 1, 1, 1);
+MODULE_VERSION(vmbus_res, 1);
+
+static int
+vmbus_res_probe(device_t dev)
+{
+ char *id[] = { "VMBUS", NULL };
+ int rv;
+
+ if (device_get_unit(dev) != 0 || vm_guest != VM_GUEST_HV ||
+ (hyperv_features & CPUID_HV_MSR_SYNIC) == 0)
+ return (ENXIO);
+ rv = ACPI_ID_PROBE(device_get_parent(dev), dev, id, NULL);
+ if (rv <= 0)
+ device_set_desc(dev, "Hyper-V Vmbus Resource");
+ return (rv);
+}
+
+static int
+vmbus_res_attach(device_t dev __unused)
+{
+
+ return (0);
+}
+
+static int
+vmbus_res_detach(device_t dev __unused)
+{
+
+ return (0);
+}
diff --git a/sys/dev/hyperv/vmbus/vmbus_var.h b/sys/dev/hyperv/vmbus/vmbus_var.h
new file mode 100644
index 000000000000..0e42d70d8257
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_var.h
@@ -0,0 +1,175 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_VAR_H_
+#define _VMBUS_VAR_H_
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/taskqueue.h>
+#include <sys/rman.h>
+
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcib_private.h>
+
+/*
+ * NOTE: DO NOT CHANGE THIS.
+ */
+#define VMBUS_SINT_MESSAGE 2
+/*
+ * NOTE:
+ * - DO NOT set it to the same value as VMBUS_SINT_MESSAGE.
+ * - DO NOT set it to 0.
+ */
+#define VMBUS_SINT_TIMER 4
+
+/*
+ * NOTE: DO NOT CHANGE THESE
+ */
+#define VMBUS_CONNID_MESSAGE 1
+#define VMBUS_CONNID_EVENT 2
+
+struct vmbus_message;
+struct vmbus_softc;
+
+typedef void (*vmbus_chanmsg_proc_t)(struct vmbus_softc *,
+ const struct vmbus_message *);
+
+#define VMBUS_CHANMSG_PROC(name, func) \
+ [VMBUS_CHANMSG_TYPE_##name] = func
+#define VMBUS_CHANMSG_PROC_WAKEUP(name) \
+ VMBUS_CHANMSG_PROC(name, vmbus_msghc_wakeup)
+
+struct vmbus_pcpu_data {
+ u_long *intr_cnt; /* Hyper-V interrupt counter */
+ struct vmbus_message *message; /* shared messages */
+ uint32_t vcpuid; /* virtual cpuid */
+ int event_flags_cnt;/* # of event flags */
+ struct vmbus_evtflags *event_flags; /* event flags from host */
+
+ /* Rarely used fields */
+ struct hyperv_dma message_dma; /* busdma glue */
+ struct hyperv_dma event_flags_dma;/* busdma glue */
+ struct taskqueue *event_tq; /* event taskq */
+ struct taskqueue *message_tq; /* message taskq */
+ struct task message_task; /* message task */
+} __aligned(CACHE_LINE_SIZE);
+
+#if __FreeBSD_version < 1100000
+typedef u_long rman_res_t;
+#endif
+
+struct vmbus_softc {
+ void (*vmbus_event_proc)(struct vmbus_softc *, int);
+ u_long *vmbus_tx_evtflags;
+ /* event flags to host */
+ struct vmbus_mnf *vmbus_mnf2; /* monitored by host */
+
+ u_long *vmbus_rx_evtflags;
+ /* compat evtflgs from host */
+ struct vmbus_channel *volatile *vmbus_chmap;
+ struct vmbus_xact_ctx *vmbus_xc;
+ struct vmbus_pcpu_data vmbus_pcpu[MAXCPU];
+
+ /*
+ * Rarely used fields
+ */
+
+ device_t vmbus_dev;
+ int vmbus_idtvec;
+ uint32_t vmbus_flags; /* see VMBUS_FLAG_ */
+ uint32_t vmbus_version;
+ uint32_t vmbus_gpadl;
+
+ /* Shared memory for vmbus_{rx,tx}_evtflags */
+ void *vmbus_evtflags;
+ struct hyperv_dma vmbus_evtflags_dma;
+
+ void *vmbus_mnf1; /* monitored by VM, unused */
+ struct hyperv_dma vmbus_mnf1_dma;
+ struct hyperv_dma vmbus_mnf2_dma;
+
+ bool vmbus_scandone;
+ struct task vmbus_scandone_task;
+
+ struct taskqueue *vmbus_devtq; /* for dev attach/detach */
+ struct taskqueue *vmbus_subchtq; /* for sub-chan attach/detach */
+
+ /* Primary channels */
+ struct mtx vmbus_prichan_lock;
+ TAILQ_HEAD(, vmbus_channel) vmbus_prichans;
+
+ /* Complete channel list */
+ struct mtx vmbus_chan_lock;
+ TAILQ_HEAD(, vmbus_channel) vmbus_chans;
+
+ struct intr_config_hook vmbus_intrhook;
+
+#ifdef NEW_PCIB
+ /* The list of usable MMIO ranges for PCIe pass-through */
+ struct pcib_host_resources vmbus_mmio_res;
+#endif
+};
+
+#define VMBUS_FLAG_ATTACHED 0x0001 /* vmbus was attached */
+#define VMBUS_FLAG_SYNIC 0x0002 /* SynIC was setup */
+
+#define VMBUS_PCPU_GET(sc, field, cpu) (sc)->vmbus_pcpu[(cpu)].field
+#define VMBUS_PCPU_PTR(sc, field, cpu) &(sc)->vmbus_pcpu[(cpu)].field
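+
+/*
+ * Usage sketch (informational): per-cpu state is accessed through these
+ * macros, e.g. VMBUS_PCPU_GET(sc, vcpuid, cpu) reads the virtual cpuid
+ * of 'cpu', while VMBUS_PCPU_PTR(sc, message_task, cpu) yields a pointer
+ * that can be handed to taskqueue_enqueue() for that cpu's message task.
+ */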
+
+struct vmbus_channel;
+struct trapframe;
+struct vmbus_message;
+struct vmbus_msghc;
+
+void vmbus_handle_intr(struct trapframe *);
+int vmbus_add_child(struct vmbus_channel *);
+int vmbus_delete_child(struct vmbus_channel *);
+void vmbus_et_intr(struct trapframe *);
+uint32_t vmbus_gpadl_alloc(struct vmbus_softc *);
+
+struct vmbus_msghc *
+ vmbus_msghc_get(struct vmbus_softc *, size_t);
+void vmbus_msghc_put(struct vmbus_softc *, struct vmbus_msghc *);
+void *vmbus_msghc_dataptr(struct vmbus_msghc *);
+int vmbus_msghc_exec_noresult(struct vmbus_msghc *);
+int vmbus_msghc_exec(struct vmbus_softc *, struct vmbus_msghc *);
+void vmbus_msghc_exec_cancel(struct vmbus_softc *,
+ struct vmbus_msghc *);
+const struct vmbus_message *
+ vmbus_msghc_wait_result(struct vmbus_softc *,
+ struct vmbus_msghc *);
+const struct vmbus_message *
+ vmbus_msghc_poll_result(struct vmbus_softc *,
+ struct vmbus_msghc *);
+void vmbus_msghc_wakeup(struct vmbus_softc *,
+ const struct vmbus_message *);
+void vmbus_msghc_reset(struct vmbus_msghc *, size_t);
+
+#endif /* !_VMBUS_VAR_H_ */
diff --git a/sys/dev/hyperv/vmbus/vmbus_xact.c b/sys/dev/hyperv/vmbus/vmbus_xact.c
new file mode 100644
index 000000000000..90bdba7e1058
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus_xact.c
@@ -0,0 +1,442 @@
+/*-
+ * Copyright (c) 2016 Microsoft Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+
+struct vmbus_xact {
+ struct vmbus_xact_ctx *x_ctx;
+ void *x_priv;
+
+ void *x_req;
+ struct hyperv_dma x_req_dma;
+
+ const void *x_resp;
+ size_t x_resp_len;
+ void *x_resp0;
+};
+
+struct vmbus_xact_ctx {
+ size_t xc_req_size;
+ size_t xc_resp_size;
+ size_t xc_priv_size;
+
+ struct mtx xc_lock;
+ /*
+ * Protected by xc_lock.
+ */
+ uint32_t xc_flags; /* VMBUS_XACT_CTXF_ */
+ struct vmbus_xact *xc_free;
+ struct vmbus_xact *xc_active;
+ struct vmbus_xact *xc_orphan;
+};
+
+#define VMBUS_XACT_CTXF_DESTROY 0x0001
+
+static struct vmbus_xact *vmbus_xact_alloc(struct vmbus_xact_ctx *,
+ bus_dma_tag_t);
+static void vmbus_xact_free(struct vmbus_xact *);
+static struct vmbus_xact *vmbus_xact_get1(struct vmbus_xact_ctx *,
+ uint32_t);
+static const void *vmbus_xact_wait1(struct vmbus_xact *, size_t *,
+ bool);
+static const void *vmbus_xact_return(struct vmbus_xact *,
+ size_t *);
+static void vmbus_xact_save_resp(struct vmbus_xact *,
+ const void *, size_t);
+static void vmbus_xact_ctx_free(struct vmbus_xact_ctx *);
+
+static struct vmbus_xact *
+vmbus_xact_alloc(struct vmbus_xact_ctx *ctx, bus_dma_tag_t parent_dtag)
+{
+ struct vmbus_xact *xact;
+
+ xact = malloc(sizeof(*xact), M_DEVBUF, M_WAITOK | M_ZERO);
+ xact->x_ctx = ctx;
+
+ /* XXX assume that page aligned is enough */
+ xact->x_req = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0,
+ ctx->xc_req_size, &xact->x_req_dma, BUS_DMA_WAITOK);
+ if (xact->x_req == NULL) {
+ free(xact, M_DEVBUF);
+ return (NULL);
+ }
+ if (ctx->xc_priv_size != 0)
+ xact->x_priv = malloc(ctx->xc_priv_size, M_DEVBUF, M_WAITOK);
+ xact->x_resp0 = malloc(ctx->xc_resp_size, M_DEVBUF, M_WAITOK);
+
+ return (xact);
+}
+
+static void
+vmbus_xact_free(struct vmbus_xact *xact)
+{
+
+ hyperv_dmamem_free(&xact->x_req_dma, xact->x_req);
+ free(xact->x_resp0, M_DEVBUF);
+ if (xact->x_priv != NULL)
+ free(xact->x_priv, M_DEVBUF);
+ free(xact, M_DEVBUF);
+}
+
+static struct vmbus_xact *
+vmbus_xact_get1(struct vmbus_xact_ctx *ctx, uint32_t dtor_flag)
+{
+ struct vmbus_xact *xact;
+
+ mtx_lock(&ctx->xc_lock);
+
+ while ((ctx->xc_flags & dtor_flag) == 0 && ctx->xc_free == NULL)
+ mtx_sleep(&ctx->xc_free, &ctx->xc_lock, 0, "gxact", 0);
+ if (ctx->xc_flags & dtor_flag) {
+ /* Being destroyed */
+ xact = NULL;
+ } else {
+ xact = ctx->xc_free;
+ KASSERT(xact != NULL, ("no free xact"));
+ KASSERT(xact->x_resp == NULL, ("xact has pending response"));
+ ctx->xc_free = NULL;
+ }
+
+ mtx_unlock(&ctx->xc_lock);
+
+ return (xact);
+}
+
+struct vmbus_xact_ctx *
+vmbus_xact_ctx_create(bus_dma_tag_t dtag, size_t req_size, size_t resp_size,
+ size_t priv_size)
+{
+ struct vmbus_xact_ctx *ctx;
+
+ KASSERT(req_size > 0, ("request size is 0"));
+ KASSERT(resp_size > 0, ("response size is 0"));
+
+ ctx = malloc(sizeof(*ctx), M_DEVBUF, M_WAITOK | M_ZERO);
+ ctx->xc_req_size = req_size;
+ ctx->xc_resp_size = resp_size;
+ ctx->xc_priv_size = priv_size;
+
+ ctx->xc_free = vmbus_xact_alloc(ctx, dtag);
+ if (ctx->xc_free == NULL) {
+ free(ctx, M_DEVBUF);
+ return (NULL);
+ }
+
+ mtx_init(&ctx->xc_lock, "vmbus xact", NULL, MTX_DEF);
+
+ return (ctx);
+}
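+
+/*
+ * Typical transaction flow (illustrative sketch; sending the request to
+ * the host, e.g. over a vmbus channel, is the caller's business and is
+ * not provided by this file):
+ *
+ *	xact = vmbus_xact_get(ctx, reqlen);
+ *	req = vmbus_xact_req_data(xact);	(fill in the request)
+ *	vmbus_xact_activate(xact);
+ *	... send the request to the host ...
+ *	resp = vmbus_xact_wait(xact, &resplen);	(or vmbus_xact_busywait())
+ *	... consume the response ...
+ *	vmbus_xact_put(xact);
+ *
+ * The response itself is delivered from the interrupt path through
+ * vmbus_xact_wakeup() or vmbus_xact_ctx_wakeup().
+ */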
+
+bool
+vmbus_xact_ctx_orphan(struct vmbus_xact_ctx *ctx)
+{
+ mtx_lock(&ctx->xc_lock);
+ if (ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) {
+ mtx_unlock(&ctx->xc_lock);
+ return (false);
+ }
+ ctx->xc_flags |= VMBUS_XACT_CTXF_DESTROY;
+ mtx_unlock(&ctx->xc_lock);
+
+ wakeup(&ctx->xc_free);
+ wakeup(&ctx->xc_active);
+
+ ctx->xc_orphan = vmbus_xact_get1(ctx, 0);
+ if (ctx->xc_orphan == NULL)
+ panic("can't get xact");
+ return (true);
+}
+
+static void
+vmbus_xact_ctx_free(struct vmbus_xact_ctx *ctx)
+{
+ KASSERT(ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY,
+ ("xact ctx was not orphaned"));
+ KASSERT(ctx->xc_orphan != NULL, ("no orphaned xact"));
+
+ vmbus_xact_free(ctx->xc_orphan);
+ mtx_destroy(&ctx->xc_lock);
+ free(ctx, M_DEVBUF);
+}
+
+void
+vmbus_xact_ctx_destroy(struct vmbus_xact_ctx *ctx)
+{
+
+ vmbus_xact_ctx_orphan(ctx);
+ vmbus_xact_ctx_free(ctx);
+}
+
+struct vmbus_xact *
+vmbus_xact_get(struct vmbus_xact_ctx *ctx, size_t req_len)
+{
+ struct vmbus_xact *xact;
+
+ if (req_len > ctx->xc_req_size)
+ panic("invalid request size %zu", req_len);
+
+ xact = vmbus_xact_get1(ctx, VMBUS_XACT_CTXF_DESTROY);
+ if (xact == NULL)
+ return (NULL);
+
+ memset(xact->x_req, 0, req_len);
+ return (xact);
+}
+
+void
+vmbus_xact_put(struct vmbus_xact *xact)
+{
+ struct vmbus_xact_ctx *ctx = xact->x_ctx;
+
+ KASSERT(ctx->xc_active == NULL, ("pending active xact"));
+ xact->x_resp = NULL;
+
+ mtx_lock(&ctx->xc_lock);
+ KASSERT(ctx->xc_free == NULL, ("has free xact"));
+ ctx->xc_free = xact;
+ mtx_unlock(&ctx->xc_lock);
+ wakeup(&ctx->xc_free);
+}
+
+void *
+vmbus_xact_req_data(const struct vmbus_xact *xact)
+{
+
+ return (xact->x_req);
+}
+
+bus_addr_t
+vmbus_xact_req_paddr(const struct vmbus_xact *xact)
+{
+
+ return (xact->x_req_dma.hv_paddr);
+}
+
+void *
+vmbus_xact_priv(const struct vmbus_xact *xact, size_t priv_len)
+{
+
+ if (priv_len > xact->x_ctx->xc_priv_size)
+ panic("invalid priv size %zu", priv_len);
+ return (xact->x_priv);
+}
+
+void
+vmbus_xact_activate(struct vmbus_xact *xact)
+{
+ struct vmbus_xact_ctx *ctx = xact->x_ctx;
+
+ KASSERT(xact->x_resp == NULL, ("xact has pending response"));
+
+ mtx_lock(&ctx->xc_lock);
+ KASSERT(ctx->xc_active == NULL, ("pending active xact"));
+ ctx->xc_active = xact;
+ mtx_unlock(&ctx->xc_lock);
+}
+
+void
+vmbus_xact_deactivate(struct vmbus_xact *xact)
+{
+ struct vmbus_xact_ctx *ctx = xact->x_ctx;
+
+ mtx_lock(&ctx->xc_lock);
+ KASSERT(ctx->xc_active == xact, ("xact mismatch"));
+ ctx->xc_active = NULL;
+ mtx_unlock(&ctx->xc_lock);
+}
+
+static const void *
+vmbus_xact_return(struct vmbus_xact *xact, size_t *resp_len)
+{
+ struct vmbus_xact_ctx *ctx = xact->x_ctx;
+ const void *resp;
+
+ mtx_assert(&ctx->xc_lock, MA_OWNED);
+ KASSERT(ctx->xc_active == xact, ("xact trashed"));
+
+ if ((ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) && xact->x_resp == NULL) {
+ uint8_t b = 0;
+
+ /*
+		 * Orphaned and no response was received yet; fake up
+		 * a one-byte response.
+ */
+ printf("vmbus: xact ctx was orphaned w/ pending xact\n");
+ vmbus_xact_save_resp(ctx->xc_active, &b, sizeof(b));
+ }
+ KASSERT(xact->x_resp != NULL, ("no response"));
+
+ ctx->xc_active = NULL;
+
+ resp = xact->x_resp;
+ *resp_len = xact->x_resp_len;
+
+ return (resp);
+}
+
+static const void *
+vmbus_xact_wait1(struct vmbus_xact *xact, size_t *resp_len,
+ bool can_sleep)
+{
+ struct vmbus_xact_ctx *ctx = xact->x_ctx;
+ const void *resp;
+
+ mtx_lock(&ctx->xc_lock);
+
+ KASSERT(ctx->xc_active == xact, ("xact mismatch"));
+ while (xact->x_resp == NULL &&
+ (ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) == 0) {
+ if (can_sleep) {
+ mtx_sleep(&ctx->xc_active, &ctx->xc_lock, 0,
+ "wxact", 0);
+ } else {
+ mtx_unlock(&ctx->xc_lock);
+ DELAY(1000);
+ mtx_lock(&ctx->xc_lock);
+ }
+ }
+ resp = vmbus_xact_return(xact, resp_len);
+
+ mtx_unlock(&ctx->xc_lock);
+
+ return (resp);
+}
+
+const void *
+vmbus_xact_wait(struct vmbus_xact *xact, size_t *resp_len)
+{
+
+ return (vmbus_xact_wait1(xact, resp_len, true /* can sleep */));
+}
+
+const void *
+vmbus_xact_busywait(struct vmbus_xact *xact, size_t *resp_len)
+{
+
+ return (vmbus_xact_wait1(xact, resp_len, false /* can't sleep */));
+}
+
+const void *
+vmbus_xact_poll(struct vmbus_xact *xact, size_t *resp_len)
+{
+ struct vmbus_xact_ctx *ctx = xact->x_ctx;
+ const void *resp;
+
+ mtx_lock(&ctx->xc_lock);
+
+ KASSERT(ctx->xc_active == xact, ("xact mismatch"));
+ if (xact->x_resp == NULL &&
+ (ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) == 0) {
+ mtx_unlock(&ctx->xc_lock);
+ *resp_len = 0;
+ return (NULL);
+ }
+ resp = vmbus_xact_return(xact, resp_len);
+
+ mtx_unlock(&ctx->xc_lock);
+
+ return (resp);
+}
+
+static void
+vmbus_xact_save_resp(struct vmbus_xact *xact, const void *data, size_t dlen)
+{
+ struct vmbus_xact_ctx *ctx = xact->x_ctx;
+ size_t cplen = dlen;
+
+ mtx_assert(&ctx->xc_lock, MA_OWNED);
+
+ if (cplen > ctx->xc_resp_size) {
+ printf("vmbus: xact response truncated %zu -> %zu\n",
+ cplen, ctx->xc_resp_size);
+ cplen = ctx->xc_resp_size;
+ }
+
+ KASSERT(ctx->xc_active == xact, ("xact mismatch"));
+ memcpy(xact->x_resp0, data, cplen);
+ xact->x_resp_len = cplen;
+ xact->x_resp = xact->x_resp0;
+}
+
+void
+vmbus_xact_wakeup(struct vmbus_xact *xact, const void *data, size_t dlen)
+{
+ struct vmbus_xact_ctx *ctx = xact->x_ctx;
+ int do_wakeup = 0;
+
+ mtx_lock(&ctx->xc_lock);
+ /*
+ * NOTE:
+ * xc_active could be NULL, if the ctx has been orphaned.
+ */
+ if (ctx->xc_active != NULL) {
+ vmbus_xact_save_resp(xact, data, dlen);
+ do_wakeup = 1;
+ } else {
+ KASSERT(ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY,
+ ("no active xact pending"));
+ printf("vmbus: drop xact response\n");
+ }
+ mtx_unlock(&ctx->xc_lock);
+
+ if (do_wakeup)
+ wakeup(&ctx->xc_active);
+}
+
+void
+vmbus_xact_ctx_wakeup(struct vmbus_xact_ctx *ctx, const void *data, size_t dlen)
+{
+ int do_wakeup = 0;
+
+ mtx_lock(&ctx->xc_lock);
+ /*
+ * NOTE:
+ * xc_active could be NULL, if the ctx has been orphaned.
+ */
+ if (ctx->xc_active != NULL) {
+ vmbus_xact_save_resp(ctx->xc_active, data, dlen);
+ do_wakeup = 1;
+ } else {
+ KASSERT(ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY,
+ ("no active xact pending"));
+ printf("vmbus: drop xact response\n");
+ }
+ mtx_unlock(&ctx->xc_lock);
+
+ if (do_wakeup)
+ wakeup(&ctx->xc_active);
+}