diff options
Diffstat (limited to 'sys/dev/hyperv')
52 files changed, 30566 insertions, 0 deletions
diff --git a/sys/dev/hyperv/hvsock/hv_sock.c b/sys/dev/hyperv/hvsock/hv_sock.c new file mode 100644 index 000000000000..6d5ad4fc6609 --- /dev/null +++ b/sys/dev/hyperv/hvsock/hv_sock.c @@ -0,0 +1,1762 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/domain.h> +#include <sys/lock.h> +#include <sys/kernel.h> +#include <sys/types.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/sysproto.h> +#include <sys/systm.h> +#include <sys/sockbuf.h> +#include <sys/sx.h> +#include <sys/uio.h> + +#include <net/vnet.h> + +#include <dev/hyperv/vmbus/vmbus_reg.h> + +#include "hv_sock.h" + +#define HVSOCK_DBG_NONE 0x0 +#define HVSOCK_DBG_INFO 0x1 +#define HVSOCK_DBG_ERR 0x2 +#define HVSOCK_DBG_VERBOSE 0x3 + + +SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket"); + +static int hvs_dbg_level; +SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level, + 0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose"); + + +#define HVSOCK_DBG(level, ...) 
do { \ + if (hvs_dbg_level >= (level)) \ + printf(__VA_ARGS__); \ + } while (0) + +MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures"); + +static int hvs_dom_probe(void); + +/* The MTU is 16KB per host side's design */ +#define HVSOCK_MTU_SIZE (1024 * 16) +#define HVSOCK_SEND_BUF_SZ (PAGE_SIZE - sizeof(struct vmpipe_proto_header)) + +#define HVSOCK_HEADER_LEN (sizeof(struct hvs_pkt_header)) + +#define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \ + roundup2(payload_len, 8) + \ + sizeof(uint64_t)) + + +static struct domain hv_socket_domain; + +/* + * HyperV Transport sockets + */ +static struct pr_usrreqs hvs_trans_usrreqs = { + .pru_attach = hvs_trans_attach, + .pru_bind = hvs_trans_bind, + .pru_listen = hvs_trans_listen, + .pru_accept = hvs_trans_accept, + .pru_connect = hvs_trans_connect, + .pru_peeraddr = hvs_trans_peeraddr, + .pru_sockaddr = hvs_trans_sockaddr, + .pru_soreceive = hvs_trans_soreceive, + .pru_sosend = hvs_trans_sosend, + .pru_disconnect = hvs_trans_disconnect, + .pru_close = hvs_trans_close, + .pru_detach = hvs_trans_detach, + .pru_shutdown = hvs_trans_shutdown, + .pru_abort = hvs_trans_abort, +}; + +/* + * Definitions of protocols supported in HyperV socket domain + */ +static struct protosw hv_socket_protosw[] = { +{ + .pr_type = SOCK_STREAM, + .pr_domain = &hv_socket_domain, + .pr_protocol = HYPERV_SOCK_PROTO_TRANS, + .pr_flags = PR_CONNREQUIRED, + .pr_init = hvs_trans_init, + .pr_usrreqs = &hvs_trans_usrreqs, +}, +}; + +static struct domain hv_socket_domain = { + .dom_family = AF_HYPERV, + .dom_name = "hyperv", + .dom_probe = hvs_dom_probe, + .dom_protosw = hv_socket_protosw, + .dom_protoswNPROTOSW = &hv_socket_protosw[nitems(hv_socket_protosw)] +}; + +VNET_DOMAIN_SET(hv_socket_); + +#define MAX_PORT ((uint32_t)0xFFFFFFFF) +#define MIN_PORT ((uint32_t)0x0) + +/* 00000000-facb-11e6-bd58-64006a7986d3 */ +static const struct hyperv_guid srv_id_template = { + .hv_guid = { + 0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 
0x11, + 0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 } +}; + +static int hvsock_br_callback(void *, int, void *); +static uint32_t hvsock_canread_check(struct hvs_pcb *); +static uint32_t hvsock_canwrite_check(struct hvs_pcb *); +static int hvsock_send_data(struct vmbus_channel *chan, + struct uio *uio, uint32_t to_write, struct sockbuf *sb); + + + +/* Globals */ +static struct sx hvs_trans_socks_sx; +static struct mtx hvs_trans_socks_mtx; +static LIST_HEAD(, hvs_pcb) hvs_trans_bound_socks; +static LIST_HEAD(, hvs_pcb) hvs_trans_connected_socks; +static uint32_t previous_auto_bound_port; + +static void +hvsock_print_guid(struct hyperv_guid *guid) +{ + unsigned char *p = (unsigned char *)guid; + + HVSOCK_DBG(HVSOCK_DBG_INFO, + "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n", + *(unsigned int *)p, + *((unsigned short *) &p[4]), + *((unsigned short *) &p[6]), + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); +} + +static bool +is_valid_srv_id(const struct hyperv_guid *id) +{ + return !memcmp(&id->hv_guid[4], + &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4); +} + +static unsigned int +get_port_by_srv_id(const struct hyperv_guid *srv_id) +{ + return *((const unsigned int *)srv_id); +} + +static void +set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port) +{ + *((unsigned int *)srv_id) = port; +} + + +static void +__hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list) +{ + struct hvs_pcb *p = NULL; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); + + if (!pcb) + return; + + if (list & HVS_LIST_BOUND) { + LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) + if (p == pcb) + LIST_REMOVE(p, bound_next); + } + + if (list & HVS_LIST_CONNECTED) { + LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) + if (p == pcb) + LIST_REMOVE(pcb, connected_next); + } +} + +static void +__hvs_remove_socket_from_list(struct socket *so, unsigned char list) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + 
HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); + + __hvs_remove_pcb_from_list(pcb, list); +} + +static void +__hvs_insert_socket_on_list(struct socket *so, unsigned char list) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + if (list & HVS_LIST_BOUND) + LIST_INSERT_HEAD(&hvs_trans_bound_socks, + pcb, bound_next); + + if (list & HVS_LIST_CONNECTED) + LIST_INSERT_HEAD(&hvs_trans_connected_socks, + pcb, connected_next); +} + +void +hvs_remove_socket_from_list(struct socket *so, unsigned char list) +{ + if (!so || !so->so_pcb) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: socket or so_pcb is null\n", __func__); + return; + } + + mtx_lock(&hvs_trans_socks_mtx); + __hvs_remove_socket_from_list(so, list); + mtx_unlock(&hvs_trans_socks_mtx); +} + +static void +hvs_insert_socket_on_list(struct socket *so, unsigned char list) +{ + if (!so || !so->so_pcb) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: socket or so_pcb is null\n", __func__); + return; + } + + mtx_lock(&hvs_trans_socks_mtx); + __hvs_insert_socket_on_list(so, list); + mtx_unlock(&hvs_trans_socks_mtx); +} + +static struct socket * +__hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) +{ + struct hvs_pcb *p = NULL; + + if (list & HVS_LIST_BOUND) + LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) + if (p->so != NULL && + addr->hvs_port == p->local_addr.hvs_port) + return p->so; + + if (list & HVS_LIST_CONNECTED) + LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) + if (p->so != NULL && + addr->hvs_port == p->local_addr.hvs_port) + return p->so; + + return NULL; +} + +static struct socket * +hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) +{ + struct socket *s = NULL; + + mtx_lock(&hvs_trans_socks_mtx); + s = __hvs_find_socket_on_list(addr, list); + mtx_unlock(&hvs_trans_socks_mtx); + + return s; +} + +static inline void +hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port) +{ + memset(addr, 0, sizeof(*addr)); + addr->sa_family = AF_HYPERV; + 
addr->sa_len = sizeof(*addr); + addr->hvs_port = port; +} + +void +hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id) +{ + hvs_addr_set(addr, get_port_by_srv_id(svr_id)); +} + +int +hvs_trans_lock(void) +{ + sx_xlock(&hvs_trans_socks_sx); + return (0); +} + +void +hvs_trans_unlock(void) +{ + sx_xunlock(&hvs_trans_socks_sx); +} + +static int +hvs_dom_probe(void) +{ + + /* Don't even give us a chance to attach on non-HyperV. */ + if (vm_guest != VM_GUEST_HV) + return (ENXIO); + return (0); +} + +void +hvs_trans_init(void) +{ + /* Skip initialization of globals for non-default instances. */ + if (!IS_DEFAULT_VNET(curvnet)) + return; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_init called\n", __func__); + + /* Initialize Globals */ + previous_auto_bound_port = MAX_PORT; + sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx"); + mtx_init(&hvs_trans_socks_mtx, + "hvs_trans_socks_mtx", NULL, MTX_DEF); + LIST_INIT(&hvs_trans_bound_socks); + LIST_INIT(&hvs_trans_connected_socks); +} + +/* + * Called in two cases: + * 1) When user calls socket(); + * 2) When we accept new incoming conneciton and call sonewconn(). 
+ */ +int +hvs_trans_attach(struct socket *so, int proto, struct thread *td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_attach called\n", __func__); + + if (so->so_type != SOCK_STREAM) + return (ESOCKTNOSUPPORT); + + if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS) + return (EPROTONOSUPPORT); + + if (pcb != NULL) + return (EISCONN); + pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO); + if (pcb == NULL) + return (ENOMEM); + + pcb->so = so; + so->so_pcb = (void *)pcb; + + return (0); +} + +void +hvs_trans_detach(struct socket *so) +{ + struct hvs_pcb *pcb; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_detach called\n", __func__); + + (void) hvs_trans_lock(); + pcb = so2hvspcb(so); + if (pcb == NULL) { + hvs_trans_unlock(); + return; + } + + if (SOLISTENING(so)) { + bzero(pcb, sizeof(*pcb)); + free(pcb, M_HVSOCK); + } + + so->so_pcb = NULL; + + hvs_trans_unlock(); +} + +int +hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr; + int error = 0; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_bind called\n", __func__); + + if (sa == NULL) { + return (EINVAL); + } + + if (pcb == NULL) { + return (EINVAL); + } + + if (sa->sa_family != AF_HYPERV) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: Not supported, sa_family is %u\n", + __func__, sa->sa_family); + return (EAFNOSUPPORT); + } + if (sa->sa_len != sizeof(*sa)) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: Not supported, sa_len is %u\n", + __func__, sa->sa_len); + return (EINVAL); + } + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: binding port = 0x%x\n", __func__, sa->hvs_port); + + mtx_lock(&hvs_trans_socks_mtx); + if (__hvs_find_socket_on_list(sa, + HVS_LIST_BOUND | HVS_LIST_CONNECTED)) { + error = EADDRINUSE; + } else { + /* + * The address is available for us to bind. 
+ * Add socket to the bound list. + */ + hvs_addr_set(&pcb->local_addr, sa->hvs_port); + hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY); + __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); + } + mtx_unlock(&hvs_trans_socks_mtx); + + return (error); +} + +int +hvs_trans_listen(struct socket *so, int backlog, struct thread *td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct socket *bound_so; + int error; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_listen called\n", __func__); + + if (pcb == NULL) + return (EINVAL); + + /* Check if the address is already bound and it was by us. */ + bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND); + if (bound_so == NULL || bound_so != so) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: Address not bound or not by us.\n", __func__); + return (EADDRNOTAVAIL); + } + + SOCK_LOCK(so); + error = solisten_proto_check(so); + if (error == 0) + solisten_proto(so, backlog); + SOCK_UNLOCK(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket listen error = %d\n", __func__, error); + return (error); +} + +int +hvs_trans_accept(struct socket *so, struct sockaddr **nam) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_accept called\n", __func__); + + if (pcb == NULL) + return (EINVAL); + + *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, + M_NOWAIT); + + return ((*nam == NULL) ? 
ENOMEM : 0); +} + +int +hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam; + bool found_auto_bound_port = false; + int i, error = 0; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n", + __func__, raddr->hvs_port); + + if (pcb == NULL) + return (EINVAL); + + /* Verify the remote address */ + if (raddr == NULL) + return (EINVAL); + if (raddr->sa_family != AF_HYPERV) + return (EAFNOSUPPORT); + if (raddr->sa_len != sizeof(*raddr)) + return (EINVAL); + + mtx_lock(&hvs_trans_socks_mtx); + if (so->so_state & + (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: socket connect in progress\n", + __func__); + error = EINPROGRESS; + goto out; + } + + /* + * Find an available port for us to auto bind the local + * address. + */ + hvs_addr_set(&pcb->local_addr, 0); + + for (i = previous_auto_bound_port - 1; + i != previous_auto_bound_port; i --) { + if (i == MIN_PORT) + i = MAX_PORT; + + pcb->local_addr.hvs_port = i; + + if (__hvs_find_socket_on_list(&pcb->local_addr, + HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) { + found_auto_bound_port = true; + previous_auto_bound_port = i; + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: found local bound port is %x\n", + __func__, pcb->local_addr.hvs_port); + break; + } + } + + if (found_auto_bound_port == true) { + /* Found available port for auto bound, put on list */ + __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); + /* Set VM service ID */ + pcb->vm_srv_id = srv_id_template; + set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port); + /* Set host service ID and remote port */ + pcb->host_srv_id = srv_id_template; + set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port); + hvs_addr_set(&pcb->remote_addr, raddr->hvs_port); + + /* Change the socket state to SS_ISCONNECTING */ + soisconnecting(so); + } else { + 
HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: No local port available for auto bound\n", + __func__); + error = EADDRINUSE; + } + + HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is "); + hvsock_print_guid(&pcb->vm_srv_id); + HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is "); + hvsock_print_guid(&pcb->host_srv_id); + +out: + mtx_unlock(&hvs_trans_socks_mtx); + + if (found_auto_bound_port == true) + vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id); + + return (error); +} + +int +hvs_trans_disconnect(struct socket *so) +{ + struct hvs_pcb *pcb; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_disconnect called\n", __func__); + + (void) hvs_trans_lock(); + pcb = so2hvspcb(so); + if (pcb == NULL) { + hvs_trans_unlock(); + return (EINVAL); + } + + /* If socket is already disconnected, skip this */ + if ((so->so_state & SS_ISDISCONNECTED) == 0) + soisdisconnecting(so); + + hvs_trans_unlock(); + + return (0); +} + +struct hvs_callback_arg { + struct uio *uio; + struct sockbuf *sb; +}; + +int +hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr, + struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockbuf *sb; + ssize_t orig_resid; + uint32_t canread, to_read; + int flags, error = 0; + struct hvs_callback_arg cbarg; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_soreceive called\n", __func__); + + if (so->so_type != SOCK_STREAM) + return (EINVAL); + if (pcb == NULL) + return (EINVAL); + + if (flagsp != NULL) + flags = *flagsp &~ MSG_EOR; + else + flags = 0; + + if (flags & MSG_PEEK) + return (EOPNOTSUPP); + + /* If no space to copy out anything */ + if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ) + return (EINVAL); + + orig_resid = uio->uio_resid; + + /* Prevent other readers from entering the socket. 
*/ + error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); + if (error) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: soiolock returned error = %d\n", __func__, error); + return (error); + } + + sb = &so->so_rcv; + SOCKBUF_LOCK(sb); + + cbarg.uio = uio; + cbarg.sb = sb; + /* + * If the socket is closing, there might still be some data + * in rx br to read. However we need to make sure + * the channel is still open. + */ + if ((sb->sb_state & SBS_CANTRCVMORE) && + (so->so_state & SS_ISDISCONNECTED)) { + /* Other thread already closed the channel */ + error = EPIPE; + goto out; + } + + while (true) { + while (uio->uio_resid > 0 && + (canread = hvsock_canread_check(pcb)) > 0) { + to_read = MIN(canread, uio->uio_resid); + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: to_read = %u, skip = %u\n", __func__, to_read, + (unsigned int)(sizeof(struct hvs_pkt_header) + + pcb->recv_data_off)); + + error = vmbus_chan_recv_peek_call(pcb->chan, to_read, + sizeof(struct hvs_pkt_header) + pcb->recv_data_off, + hvsock_br_callback, (void *)&cbarg); + /* + * It is possible socket is disconnected becasue + * we released lock in hvsock_br_callback. So we + * need to check the state to make sure it is not + * disconnected. + */ + if (error || so->so_state & SS_ISDISCONNECTED) { + break; + } + + pcb->recv_data_len -= to_read; + pcb->recv_data_off += to_read; + } + + if (error) + break; + + /* Abort if socket has reported problems. */ + if (so->so_error) { + if (so->so_error == ESHUTDOWN && + orig_resid > uio->uio_resid) { + /* + * Although we got a FIN, we also received + * some data in this round. Delivery it + * to user. + */ + error = 0; + } else { + if (so->so_error != ESHUTDOWN) + error = so->so_error; + } + + break; + } + + /* Cannot received more. 
*/ + if (sb->sb_state & SBS_CANTRCVMORE) + break; + + /* We are done if buffer has been filled */ + if (uio->uio_resid == 0) + break; + + if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid) + break; + + /* Buffer ring is empty and we shall not block */ + if ((so->so_state & SS_NBIO) || + (flags & (MSG_DONTWAIT|MSG_NBIO))) { + if (orig_resid == uio->uio_resid) { + /* We have not read anything */ + error = EAGAIN; + } + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: non blocked read return, error %d.\n", + __func__, error); + break; + } + + /* + * Wait and block until (more) data comes in. + * Note: Drops the sockbuf lock during wait. + */ + error = sbwait(sb); + + if (error) + break; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: wake up from sbwait, read available is %u\n", + __func__, vmbus_chan_read_available(pcb->chan)); + } + +out: + SOCKBUF_UNLOCK(sb); + SOCK_IO_RECV_UNLOCK(so); + + /* We recieved a FIN in this call */ + if (so->so_error == ESHUTDOWN) { + if (so->so_snd.sb_state & SBS_CANTSENDMORE) { + /* Send has already closed */ + soisdisconnecting(so); + } else { + /* Just close the receive side */ + socantrcvmore(so); + } + } + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: returning error = %d, so_error = %d\n", + __func__, error, so->so_error); + + return (error); +} + +int +hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, + struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockbuf *sb; + ssize_t orig_resid; + uint32_t canwrite, to_write; + int error = 0; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n", + __func__, uio->uio_resid); + + if (so->so_type != SOCK_STREAM) + return (EINVAL); + if (pcb == NULL) + return (EINVAL); + + /* If nothing to send */ + if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE) + return (EINVAL); + + orig_resid = uio->uio_resid; + + /* Prevent other writers from entering the socket. 
*/ + error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); + if (error) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: soiolocak returned error = %d\n", __func__, error); + return (error); + } + + sb = &so->so_snd; + SOCKBUF_LOCK(sb); + + if ((sb->sb_state & SBS_CANTSENDMORE) || + so->so_error == ESHUTDOWN) { + error = EPIPE; + goto out; + } + + while (uio->uio_resid > 0) { + canwrite = hvsock_canwrite_check(pcb); + if (canwrite == 0) { + /* We have sent some data */ + if (orig_resid > uio->uio_resid) + break; + /* + * We have not sent any data and it is + * non-blocked io + */ + if (so->so_state & SS_NBIO || + (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { + error = EWOULDBLOCK; + break; + } else { + /* + * We are here because there is no space on + * send buffer ring. Signal the other side + * to read and free more space. + * Sleep wait until space avaiable to send + * Note: Drops the sockbuf lock during wait. + */ + error = sbwait(sb); + + if (error) + break; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: wake up from sbwait, space avail on " + "tx ring is %u\n", + __func__, + vmbus_chan_write_available(pcb->chan)); + + continue; + } + } + to_write = MIN(canwrite, uio->uio_resid); + to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: canwrite is %u, to_write = %u\n", __func__, + canwrite, to_write); + error = hvsock_send_data(pcb->chan, uio, to_write, sb); + + if (error) + break; + } + +out: + SOCKBUF_UNLOCK(sb); + SOCK_IO_SEND_UNLOCK(so); + + return (error); +} + +int +hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__); + + if (pcb == NULL) + return (EINVAL); + + *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT); + + return ((*nam == NULL)? 
ENOMEM : 0); +} + +int +hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__); + + if (pcb == NULL) + return (EINVAL); + + *nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT); + + return ((*nam == NULL)? ENOMEM : 0); +} + +void +hvs_trans_close(struct socket *so) +{ + struct hvs_pcb *pcb; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_close called\n", __func__); + + (void) hvs_trans_lock(); + pcb = so2hvspcb(so); + if (!pcb) { + hvs_trans_unlock(); + return; + } + + if (so->so_state & SS_ISCONNECTED) { + /* Send a FIN to peer */ + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: hvs_trans_close sending a FIN to host\n", __func__); + (void) hvsock_send_data(pcb->chan, NULL, 0, NULL); + } + + if (so->so_state & + (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) + soisdisconnected(so); + + pcb->chan = NULL; + pcb->so = NULL; + + if (SOLISTENING(so)) { + mtx_lock(&hvs_trans_socks_mtx); + /* Remove from bound list */ + __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); + mtx_unlock(&hvs_trans_socks_mtx); + } + + hvs_trans_unlock(); + + return; +} + +void +hvs_trans_abort(struct socket *so) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_abort called\n", __func__); + + (void) hvs_trans_lock(); + if (pcb == NULL) { + hvs_trans_unlock(); + return; + } + + if (SOLISTENING(so)) { + mtx_lock(&hvs_trans_socks_mtx); + /* Remove from bound list */ + __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); + mtx_unlock(&hvs_trans_socks_mtx); + } + + if (so->so_state & SS_ISCONNECTED) { + (void) sodisconnect(so); + } + hvs_trans_unlock(); + + return; +} + +int +hvs_trans_shutdown(struct socket *so) +{ + struct hvs_pcb *pcb = so2hvspcb(so); + struct sockbuf *sb; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: HyperV Socket hvs_trans_shutdown called\n", __func__); 
+ + if (pcb == NULL) + return (EINVAL); + + /* + * Only get called with the shutdown method is SHUT_WR or + * SHUT_RDWR. + * When the method is SHUT_RD or SHUT_RDWR, the caller + * already set the SBS_CANTRCVMORE on receive side socket + * buffer. + */ + if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { + /* + * SHUT_WR only case. + * Receive side is still open. Just close + * the send side. + */ + socantsendmore(so); + } else { + /* SHUT_RDWR case */ + if (so->so_state & SS_ISCONNECTED) { + /* Send a FIN to peer */ + sb = &so->so_snd; + SOCKBUF_LOCK(sb); + (void) hvsock_send_data(pcb->chan, NULL, 0, sb); + SOCKBUF_UNLOCK(sb); + + soisdisconnecting(so); + } + } + + return (0); +} + +/* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is + * <port> (see struct sockaddr_hvs). + * + * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV: + * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user- + * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with + * the below sockaddr: + * + * struct SOCKADDR_HV + * { + * ADDRESS_FAMILY Family; + * USHORT Reserved; + * GUID VmId; + * GUID ServiceId; + * }; + * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via + * VMBus, because here it's obvious the host and the VM can easily identify + * each other. Though the VmID is useful on the host, especially in the case + * of Windows container, FreeBSD VM doesn't need it at all. + * + * To be compatible with similar infrastructure in Linux VMs, we have + * to limit the available GUID space of SOCKADDR_HV so that we can create + * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID. 
+ * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is: + * + **************************************************************************** + * The only valid Service GUIDs, from the perspectives of both the host and * + * FreeBSD VM, that can be connected by the other end, must conform to this * + * format: <port>-facb-11e6-bd58-64006a7986d3. * + **************************************************************************** + * + * When we write apps on the host to connect(), the GUID ServiceID is used. + * When we write apps in FreeBSD VM to connect(), we only need to specify the + * port and the driver will form the GUID and use that to request the host. + * + * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the + * auto-generated remote port for a connect request initiated by the host's + * connect()) is set to HVADDR_PORT_UNKNOWN, which is not realy used on the + * FreeBSD guest. + */ + +/* + * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before) + * restricts HyperV socket ring buffer size to six 4K pages. Newer + * HyperV hosts doen't have this limit. + */ +#define HVS_RINGBUF_RCV_SIZE (PAGE_SIZE * 6) +#define HVS_RINGBUF_SND_SIZE (PAGE_SIZE * 6) +#define HVS_RINGBUF_MAX_SIZE (PAGE_SIZE * 64) + +struct hvsock_sc { + device_t dev; + struct hvs_pcb *pcb; + struct vmbus_channel *channel; +}; + +static bool +hvsock_chan_readable(struct vmbus_channel *chan) +{ + uint32_t readable = vmbus_chan_read_available(chan); + + return (readable >= HVSOCK_PKT_LEN(0)); +} + +static void +hvsock_chan_cb(struct vmbus_channel *chan, void *context) +{ + struct hvs_pcb *pcb = (struct hvs_pcb *) context; + struct socket *so; + uint32_t canwrite; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: host send us a wakeup on rb data, pcb = %p\n", + __func__, pcb); + + /* + * Check if the socket is still attached and valid. + * Here we know channel is still open. Need to make + * sure the socket has not been closed or freed. 
+ */ + (void) hvs_trans_lock(); + so = hsvpcb2so(pcb); + + if (pcb->chan != NULL && so != NULL) { + /* + * Wake up reader if there are data to read. + */ + SOCKBUF_LOCK(&(so)->so_rcv); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: read available = %u\n", __func__, + vmbus_chan_read_available(pcb->chan)); + + if (hvsock_chan_readable(pcb->chan)) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&(so)->so_rcv); + + /* + * Wake up sender if space becomes available to write. + */ + SOCKBUF_LOCK(&(so)->so_snd); + canwrite = hvsock_canwrite_check(pcb); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: canwrite = %u\n", __func__, canwrite); + + if (canwrite > 0) { + sowwakeup_locked(so); + } else { + SOCKBUF_UNLOCK(&(so)->so_snd); + } + } + + hvs_trans_unlock(); + + return; +} + +static int +hvsock_br_callback(void *datap, int cplen, void *cbarg) +{ + struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg; + struct uio *uio = arg->uio; + struct sockbuf *sb = arg->sb; + int error = 0; + + if (cbarg == NULL || datap == NULL) + return (EINVAL); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, " + "datap = %p\n", + __func__, (uio->uio_rw == UIO_READ) ? 
"read from br":"write to br", + uio->uio_resid, cplen, datap); + + if (sb) + SOCKBUF_UNLOCK(sb); + + error = uiomove(datap, cplen, uio); + + if (sb) + SOCKBUF_LOCK(sb); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: after uiomove, uio_resid = %zd, error = %d\n", + __func__, uio->uio_resid, error); + + return (error); +} + +static int +hvsock_send_data(struct vmbus_channel *chan, struct uio *uio, + uint32_t to_write, struct sockbuf *sb) +{ + struct hvs_pkt_header hvs_pkt; + int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0; + uint64_t pad = 0; + struct iovec iov[3]; + struct hvs_callback_arg cbarg; + + if (chan == NULL) + return (ENOTCONN); + + hlen = sizeof(struct vmbus_chanpkt_hdr); + hvs_pkthlen = sizeof(struct hvs_pkt_header); + hvs_pktlen = hvs_pkthlen + to_write; + pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, " + "pad_pktlen = %u, data_len = %u\n", + __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write); + + hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND; + hvs_pkt.chan_pkt_hdr.cph_flags = 0; + VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen); + VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen); + hvs_pkt.chan_pkt_hdr.cph_xactid = 0; + + hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1; + hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write; + + cbarg.uio = uio; + cbarg.sb = sb; + + if (uio && to_write > 0) { + iov[0].iov_base = &hvs_pkt; + iov[0].iov_len = hvs_pkthlen; + iov[1].iov_base = NULL; + iov[1].iov_len = to_write; + iov[2].iov_base = &pad; + iov[2].iov_len = pad_pktlen - hvs_pktlen; + + error = vmbus_chan_iov_send(chan, iov, 3, + hvsock_br_callback, &cbarg); + } else { + if (to_write == 0) { + iov[0].iov_base = &hvs_pkt; + iov[0].iov_len = hvs_pkthlen; + iov[1].iov_base = &pad; + iov[1].iov_len = pad_pktlen - hvs_pktlen; + error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL); + } + } + + if (error) { + 
HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: error = %d\n", __func__, error); + } + + return (error); +} + +/* + * Check if we have data on current ring buffer to read + * or not. If not, advance the ring buffer read index to + * next packet. Update the recev_data_len and recev_data_off + * to new value. + * Return the number of bytes can read. + */ +static uint32_t +hvsock_canread_check(struct hvs_pcb *pcb) +{ + uint32_t advance; + uint32_t tlen, hlen, dlen; + uint32_t bytes_canread = 0; + int error; + + if (pcb == NULL || pcb->chan == NULL) { + pcb->so->so_error = EIO; + return (0); + } + + /* Still have data not read yet on current packet */ + if (pcb->recv_data_len > 0) + return (pcb->recv_data_len); + + if (pcb->rb_init) + advance = + VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); + else + advance = 0; + + bytes_canread = vmbus_chan_read_available(pcb->chan); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: bytes_canread on br = %u, advance = %u\n", + __func__, bytes_canread, advance); + + if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) { + /* + * Nothing to read. Need to advance the rindex before + * calling sbwait, so host knows to wake us up when data + * is available to read on rb. 
+ */ + error = vmbus_chan_recv_idxadv(pcb->chan, advance); + if (error) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: after calling vmbus_chan_recv_idxadv, " + "got error = %d\n", __func__, error); + return (0); + } else { + pcb->rb_init = false; + pcb->recv_data_len = 0; + pcb->recv_data_off = 0; + bytes_canread = vmbus_chan_read_available(pcb->chan); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: advanced %u bytes, " + " bytes_canread on br now = %u\n", + __func__, advance, bytes_canread); + + if (bytes_canread == 0) + return (0); + else + advance = 0; + } + } + + if (bytes_canread < + advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t))) + return (0); + + error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt, + sizeof(struct hvs_pkt_header), advance); + + /* Don't have anything to read */ + if (error) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: after calling vmbus_chan_recv_peek, got error = %d\n", + __func__, error); + return (0); + } + + /* + * We just read in a new packet header. Do some sanity checks. 
+ */ + tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); + hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen); + dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size; + if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) || + __predict_false(hlen > tlen) || + __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "invalid tlen(%u), hlen(%u) or dlen(%u)\n", + tlen, hlen, dlen); + pcb->so->so_error = EIO; + return (0); + } + if (pcb->rb_init == false) + pcb->rb_init = true; + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n", + tlen, hlen, dlen); + + /* The other side has sent a close FIN */ + if (dlen == 0) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: Received FIN from other side\n", __func__); + /* inform the caller by seting so_error to ESHUTDOWN */ + pcb->so->so_error = ESHUTDOWN; + } + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: canread on receive ring is %u \n", __func__, dlen); + + pcb->recv_data_len = dlen; + pcb->recv_data_off = 0; + + return (pcb->recv_data_len); +} + +static uint32_t +hvsock_canwrite_check(struct hvs_pcb *pcb) +{ + uint32_t writeable; + uint32_t ret; + + if (pcb == NULL || pcb->chan == NULL) + return (0); + + writeable = vmbus_chan_write_available(pcb->chan); + + /* + * We must always reserve a 0-length-payload packet for the FIN. + */ + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: writeable is %u, should be greater than %ju\n", + __func__, writeable, + (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0))); + + if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) { + /* + * The Tx ring seems full. 
+ */ + return (0); + } + + ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: available size is %u\n", __func__, rounddown2(ret, 8)); + + return (rounddown2(ret, 8)); +} + +static void +hvsock_set_chan_pending_send_size(struct vmbus_channel *chan) +{ + vmbus_chan_set_pending_send_size(chan, + HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ)); +} + +static int +hvsock_open_channel(struct vmbus_channel *chan, struct socket *so) +{ + unsigned int rcvbuf, sndbuf; + struct hvs_pcb *pcb = so2hvspcb(so); + int ret; + + if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) { + sndbuf = HVS_RINGBUF_SND_SIZE; + rcvbuf = HVS_RINGBUF_RCV_SIZE; + } else { + sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE); + sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE); + sndbuf = rounddown2(sndbuf, PAGE_SIZE); + rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE); + rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE); + rcvbuf = rounddown2(rcvbuf, PAGE_SIZE); + } + + /* + * Can only read whatever user provided size of data + * from ring buffer. Turn off batched reading. + */ + vmbus_chan_set_readbatch(chan, false); + + ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0, + hvsock_chan_cb, pcb); + + if (ret != 0) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: failed to open hvsock channel, sndbuf = %u, " + "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); + } else { + HVSOCK_DBG(HVSOCK_DBG_INFO, + "%s: hvsock channel opened, sndbuf = %u, i" + "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); + /* + * Se the pending send size so to receive wakeup + * signals from host when there is enough space on + * rx buffer ring to write. + */ + hvsock_set_chan_pending_send_size(chan); + } + + return ret; +} + +/* + * Guest is listening passively on the socket. Open channel and + * create a new socket for the conneciton. 
+ */ +static void +hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so, + struct hvsock_sc *sc) +{ + struct socket *new_so; + struct hvs_pcb *new_pcb, *pcb; + int error; + + /* Do nothing if socket is not listening */ + if (!SOLISTENING(so)) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: socket is not a listening one\n", __func__); + return; + } + + /* + * Create a new socket. This will call pru_attach to complete + * the socket initialization and put the new socket onto + * listening socket's sol_incomp list, waiting to be promoted + * to sol_comp list. + * The new socket created has ref count 0. There is no other + * thread that changes the state of this new one at the + * moment, so we don't need to hold its lock while opening + * channel and filling out its pcb information. + */ + new_so = sonewconn(so, 0); + if (!new_so) + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: creating new socket failed\n", __func__); + + /* + * Now open the vmbus channel. If it fails, the socket will be + * on the listening socket's sol_incomp queue until it is + * replaced and aborted. + */ + error = hvsock_open_channel(chan, new_so); + if (error) { + new_so->so_error = error; + return; + } + + pcb = so->so_pcb; + new_pcb = new_so->so_pcb; + + hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port); + /* Remote port is unknown to guest in this type of conneciton */ + hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN); + new_pcb->chan = chan; + new_pcb->recv_data_len = 0; + new_pcb->recv_data_off = 0; + new_pcb->rb_init = false; + + new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan); + new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan); + + hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED); + + sc->pcb = new_pcb; + + /* + * Change the socket state to SS_ISCONNECTED. This will promote + * the socket to sol_comp queue and wake up the thread which + * is accepting connection. + */ + soisconnected(new_so); +} + + +/* + * Guest is actively connecting to host. 
+ */ +static void +hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so) +{ + struct hvs_pcb *pcb; + int error; + + error = hvsock_open_channel(chan, so); + if (error) { + so->so_error = error; + return; + } + + pcb = so->so_pcb; + pcb->chan = chan; + pcb->recv_data_len = 0; + pcb->recv_data_off = 0; + pcb->rb_init = false; + + mtx_lock(&hvs_trans_socks_mtx); + __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); + __hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED); + mtx_unlock(&hvs_trans_socks_mtx); + + /* + * Change the socket state to SS_ISCONNECTED. This will wake up + * the thread sleeping in connect call. + */ + soisconnected(so); +} + +static void +hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc) +{ + struct hyperv_guid *inst_guid, *type_guid; + bool conn_from_host; + struct sockaddr_hvs addr; + struct socket *so; + struct hvs_pcb *pcb; + + type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan); + inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan); + conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan); + + HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is "); + hvsock_print_guid(type_guid); + HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is "); + hvsock_print_guid(inst_guid); + HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n", + (conn_from_host == true ) ? "from" : "to"); + + /* + * The listening port should be in [0, MAX_LISTEN_PORT] + */ + if (!is_valid_srv_id(type_guid)) + return; + + /* + * There should be a bound socket already created no matter + * it is a passive or active connection. + * For host initiated connection (passive on guest side), + * the type_guid contains the port which guest is bound and + * listening. + * For the guest initiated connection (active on guest side), + * the inst_guid contains the port that guest has auto bound + * to. + */ + hvs_addr_init(&addr, conn_from_host ? 
type_guid : inst_guid); + so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND); + if (!so) { + HVSOCK_DBG(HVSOCK_DBG_ERR, + "%s: no bound socket found for port %u\n", + __func__, addr.hvs_port); + return; + } + + if (conn_from_host) { + hvsock_open_conn_passive(chan, so, sc); + } else { + (void) hvs_trans_lock(); + pcb = so->so_pcb; + if (pcb && pcb->so) { + sc->pcb = so2hvspcb(so); + hvsock_open_conn_active(chan, so); + } else { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "%s: channel detached before open\n", __func__); + } + hvs_trans_unlock(); + } + +} + +static int +hvsock_probe(device_t dev) +{ + struct vmbus_channel *channel = vmbus_get_channel(dev); + + if (!channel || !vmbus_chan_is_hvs(channel)) { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "hvsock_probe called but not a hvsock channel id %u\n", + vmbus_chan_id(channel)); + + return ENXIO; + } else { + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "hvsock_probe got a hvsock channel id %u\n", + vmbus_chan_id(channel)); + + return BUS_PROBE_DEFAULT; + } +} + +static int +hvsock_attach(device_t dev) +{ + struct vmbus_channel *channel = vmbus_get_channel(dev); + struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n"); + + hvsock_open_connection(channel, sc); + + /* + * Always return success. On error the host will rescind the device + * in 30 seconds and we can do cleanup at that time in + * vmbus_chan_msgproc_chrescind(). 
+ */ + return (0); +} + +static int +hvsock_detach(device_t dev) +{ + struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); + struct socket *so; + int retry; + + if (bootverbose) + device_printf(dev, "hvsock_detach called.\n"); + + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n"); + + if (sc->pcb != NULL) { + (void) hvs_trans_lock(); + + so = hsvpcb2so(sc->pcb); + if (so) { + /* Close the connection */ + if (so->so_state & + (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) + soisdisconnected(so); + } + + mtx_lock(&hvs_trans_socks_mtx); + __hvs_remove_pcb_from_list(sc->pcb, + HVS_LIST_BOUND | HVS_LIST_CONNECTED); + mtx_unlock(&hvs_trans_socks_mtx); + + /* + * Close channel while no reader and sender are working + * on the buffer rings. + */ + if (so) { + retry = 0; + while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) { + /* + * Someone is reading, rx br is busy + */ + soisdisconnected(so); + DELAY(500); + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "waiting for rx reader to exit, " + "retry = %d\n", retry++); + } + retry = 0; + while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) { + /* + * Someone is sending, tx br is busy + */ + soisdisconnected(so); + DELAY(500); + HVSOCK_DBG(HVSOCK_DBG_VERBOSE, + "waiting for tx sender to exit, " + "retry = %d\n", retry++); + } + } + + + bzero(sc->pcb, sizeof(struct hvs_pcb)); + free(sc->pcb, M_HVSOCK); + sc->pcb = NULL; + + if (so) { + SOCK_IO_RECV_UNLOCK(so); + SOCK_IO_SEND_UNLOCK(so); + so->so_pcb = NULL; + } + + hvs_trans_unlock(); + } + + vmbus_chan_close(vmbus_get_channel(dev)); + + return (0); +} + +static device_method_t hvsock_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hvsock_probe), + DEVMETHOD(device_attach, hvsock_attach), + DEVMETHOD(device_detach, hvsock_detach), + DEVMETHOD_END +}; + +static driver_t hvsock_driver = { + "hv_sock", + hvsock_methods, + sizeof(struct hvsock_sc) +}; + +static devclass_t hvsock_devclass; + +DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, 
NULL); +MODULE_VERSION(hvsock, 1); +MODULE_DEPEND(hvsock, vmbus, 1, 1, 1); diff --git a/sys/dev/hyperv/hvsock/hv_sock.h b/sys/dev/hyperv/hvsock/hv_sock.h new file mode 100644 index 000000000000..877425968345 --- /dev/null +++ b/sys/dev/hyperv/hvsock/hv_sock.h @@ -0,0 +1,122 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _HVSOCK_H +#define _HVSOCK_H +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/queue.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/vmbus.h> + +/* + * HyperV Socket Protocols + */ +#define HYPERV_SOCK_PROTO_TRANS 1 /* Transport protocol */ + +#define HVADDR_PORT_ANY -1U +#define HVADDR_PORT_UNKNOWN -1U + +#define HVS_LIST_BOUND 0x01 +#define HVS_LIST_CONNECTED 0x02 +#define HVS_LIST_ALL (HVS_LIST_BOUND | HVS_LIST_CONNECTED) + +struct sockaddr_hvs { + unsigned char sa_len; + sa_family_t sa_family; + unsigned int hvs_port; + unsigned char hvs_zero[sizeof(struct sockaddr) - + sizeof(sa_family_t) - + sizeof(unsigned char) - + sizeof(unsigned int)]; +}; + +struct vmpipe_proto_header { + uint32_t vmpipe_pkt_type; + uint32_t vmpipe_data_size; +} __packed; + +struct hvs_pkt_header { + struct vmbus_chanpkt_hdr chan_pkt_hdr; + struct vmpipe_proto_header vmpipe_pkt_hdr; +} __packed; + +struct hvs_pcb { + struct socket *so; /* Pointer to socket */ + struct sockaddr_hvs local_addr; + struct sockaddr_hvs remote_addr; + + struct hyperv_guid vm_srv_id; + struct hyperv_guid host_srv_id; + + struct vmbus_channel *chan; + /* Current packet header on rx ring */ + struct hvs_pkt_header hvs_pkt; + /* Available data in receive br in current packet */ + uint32_t recv_data_len; + /* offset in the packet */ + uint32_t recv_data_off; + bool rb_init; + /* Link lists for global bound and connected sockets */ + LIST_ENTRY(hvs_pcb) bound_next; + LIST_ENTRY(hvs_pcb) connected_next; +}; + +#define so2hvspcb(so) \ + ((struct hvs_pcb *)((so)->so_pcb)) +#define hsvpcb2so(hvspcb) \ + ((struct socket *)((hvspcb)->so)) + +void hvs_addr_init(struct sockaddr_hvs *, const struct hyperv_guid *); +void hvs_trans_init(void); +void hvs_trans_close(struct socket *); +void hvs_trans_detach(struct socket *); +void hvs_trans_abort(struct socket *); +int hvs_trans_attach(struct socket *, int, struct thread *); +int hvs_trans_bind(struct 
socket *, struct sockaddr *, struct thread *); +int hvs_trans_listen(struct socket *, int, struct thread *); +int hvs_trans_accept(struct socket *, struct sockaddr **); +int hvs_trans_connect(struct socket *, + struct sockaddr *, struct thread *); +int hvs_trans_peeraddr(struct socket *, struct sockaddr **); +int hvs_trans_sockaddr(struct socket *, struct sockaddr **); +int hvs_trans_soreceive(struct socket *, struct sockaddr **, + struct uio *, struct mbuf **, struct mbuf **, int *); +int hvs_trans_sosend(struct socket *, struct sockaddr *, struct uio *, + struct mbuf *, struct mbuf *, int, struct thread *); +int hvs_trans_disconnect(struct socket *); +int hvs_trans_shutdown(struct socket *); + +int hvs_trans_lock(void); +void hvs_trans_unlock(void); + +void hvs_remove_socket_from_list(struct socket *, unsigned char); +#endif /* _HVSOCK_H */ diff --git a/sys/dev/hyperv/include/hyperv.h b/sys/dev/hyperv/include/hyperv.h new file mode 100644 index 000000000000..8b985b2f31a7 --- /dev/null +++ b/sys/dev/hyperv/include/hyperv.h @@ -0,0 +1,104 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HYPERV_H_ +#define _HYPERV_H_ + +#ifdef _KERNEL + +#include <sys/param.h> +#include <sys/systm.h> + +#define MSR_HV_TIME_REF_COUNT 0x40000020 + +#define CPUID_HV_MSR_TIME_REFCNT 0x0002 /* MSR_HV_TIME_REF_COUNT */ +#define CPUID_HV_MSR_SYNIC 0x0004 /* MSRs for SynIC */ +#define CPUID_HV_MSR_SYNTIMER 0x0008 /* MSRs for SynTimer */ +#define CPUID_HV_MSR_APIC 0x0010 /* MSR_HV_{EOI,ICR,TPR} */ +#define CPUID_HV_MSR_HYPERCALL 0x0020 /* MSR_HV_GUEST_OS_ID + * MSR_HV_HYPERCALL */ +#define CPUID_HV_MSR_VP_INDEX 0x0040 /* MSR_HV_VP_INDEX */ +#define CPUID_HV_MSR_REFERENCE_TSC 0x0200 /* MSR_HV_REFERENCE_TSC */ +#define CPUID_HV_MSR_GUEST_IDLE 0x0400 /* MSR_HV_GUEST_IDLE */ + +#ifndef NANOSEC +#define NANOSEC 1000000000ULL +#endif +#define HYPERV_TIMER_NS_FACTOR 100ULL +#define HYPERV_TIMER_FREQ (NANOSEC / HYPERV_TIMER_NS_FACTOR) + +#endif /* _KERNEL */ + +#define HYPERV_REFTSC_DEVNAME "hv_tsc" + +/* + * Hyper-V Reference TSC + */ +struct hyperv_reftsc { + volatile uint32_t tsc_seq; + volatile uint32_t tsc_rsvd1; + volatile uint64_t tsc_scale; + volatile int64_t tsc_ofs; +} __packed __aligned(PAGE_SIZE); +#ifdef CTASSERT +CTASSERT(sizeof(struct hyperv_reftsc) == PAGE_SIZE); +#endif + +#ifdef _KERNEL + +struct 
hyperv_guid { + uint8_t hv_guid[16]; +} __packed; + +#define HYPERV_GUID_STRLEN 40 + +typedef uint64_t (*hyperv_tc64_t)(void); + +int hyperv_guid2str(const struct hyperv_guid *, char *, + size_t); + +/* + * hyperv_tc64 could be NULL, if there were no suitable Hyper-V + * specific timecounter. + */ +extern hyperv_tc64_t hyperv_tc64; +extern u_int hyperv_features; /* CPUID_HV_MSR_ */ +extern u_int hyperv_ver_major; + +/* + * Vmbus version after negotiation with host. + */ +extern uint32_t vmbus_current_version; + +#endif /* _KERNEL */ + +#endif /* _HYPERV_H_ */ diff --git a/sys/dev/hyperv/include/hyperv_busdma.h b/sys/dev/hyperv/include/hyperv_busdma.h new file mode 100644 index 000000000000..ff01b3e27a95 --- /dev/null +++ b/sys/dev/hyperv/include/hyperv_busdma.h @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HYPERV_BUSDMA_H_ +#define _HYPERV_BUSDMA_H_ + +#include <sys/param.h> +#include <sys/bus.h> +#include <machine/bus.h> + +struct hyperv_dma { + bus_addr_t hv_paddr; + bus_dma_tag_t hv_dtag; + bus_dmamap_t hv_dmap; +}; + +void hyperv_dma_map_paddr(void *arg, bus_dma_segment_t *segs, + int nseg, int error); +void *hyperv_dmamem_alloc(bus_dma_tag_t parent_dtag, + bus_size_t alignment, bus_addr_t boundary, bus_size_t size, + struct hyperv_dma *dma, int flags); +void hyperv_dmamem_free(struct hyperv_dma *dma, void *ptr); + +#endif /* !_HYPERV_BUSDMA_H_ */ diff --git a/sys/dev/hyperv/include/vmbus.h b/sys/dev/hyperv/include/vmbus.h new file mode 100644 index 000000000000..76c1ad632765 --- /dev/null +++ b/sys/dev/hyperv/include/vmbus.h @@ -0,0 +1,261 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMBUS_H_ +#define _VMBUS_H_ + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/_iovec.h> + +/* + * VMBUS version is 32 bit, upper 16 bit for major_number and lower + * 16 bit for minor_number. + * + * 0.13 -- Windows Server 2008 + * 1.1 -- Windows 7 + * 2.4 -- Windows 8 + * 3.0 -- Windows 8.1 + * 4.0 -- Windows 10 + * 5.0 -- Newer Windows 10 + */ +#define VMBUS_VERSION_WS2008 ((0 << 16) | (13)) +#define VMBUS_VERSION_WIN7 ((1 << 16) | (1)) +#define VMBUS_VERSION_WIN8 ((2 << 16) | (4)) +#define VMBUS_VERSION_WIN8_1 ((3 << 16) | (0)) +#define VMBUS_VERSION_WIN10 ((4 << 16) | (0)) +#define VMBUS_VERSION_WIN10_V5 ((5 << 16) | (0)) + +#define VMBUS_VERSION_MAJOR(ver) (((uint32_t)(ver)) >> 16) +#define VMBUS_VERSION_MINOR(ver) (((uint32_t)(ver)) & 0xffff) + +#define VMBUS_CHAN_POLLHZ_MIN 100 /* 10ms interval */ +#define VMBUS_CHAN_POLLHZ_MAX 1000000 /* 1us interval */ + +/* + * GPA stuffs. 
+ */ +struct vmbus_gpa_range { + uint32_t gpa_len; + uint32_t gpa_ofs; + uint64_t gpa_page[0]; +} __packed; + +/* This is actually vmbus_gpa_range.gpa_page[1] */ +struct vmbus_gpa { + uint32_t gpa_len; + uint32_t gpa_ofs; + uint64_t gpa_page; +} __packed; + +#define VMBUS_CHANPKT_SIZE_SHIFT 3 + +#define VMBUS_CHANPKT_GETLEN(pktlen) \ + (((int)(pktlen)) << VMBUS_CHANPKT_SIZE_SHIFT) + +struct vmbus_chanpkt_hdr { + uint16_t cph_type; /* VMBUS_CHANPKT_TYPE_ */ + uint16_t cph_hlen; /* header len, in 8 bytes */ + uint16_t cph_tlen; /* total len, in 8 bytes */ + uint16_t cph_flags; /* VMBUS_CHANPKT_FLAG_ */ + uint64_t cph_xactid; +} __packed; + +#define VMBUS_CHANPKT_TYPE_INBAND 0x0006 +#define VMBUS_CHANPKT_TYPE_RXBUF 0x0007 +#define VMBUS_CHANPKT_TYPE_GPA 0x0009 +#define VMBUS_CHANPKT_TYPE_COMP 0x000b + +#define VMBUS_CHANPKT_FLAG_NONE 0 +#define VMBUS_CHANPKT_FLAG_RC 0x0001 /* report completion */ + +#define VMBUS_CHANPKT_CONST_DATA(pkt) \ + (const void *)((const uint8_t *)(pkt) + \ + VMBUS_CHANPKT_GETLEN((pkt)->cph_hlen)) + +/* Include padding */ +#define VMBUS_CHANPKT_DATALEN(pkt) \ + (VMBUS_CHANPKT_GETLEN((pkt)->cph_tlen) -\ + VMBUS_CHANPKT_GETLEN((pkt)->cph_hlen)) + +struct vmbus_rxbuf_desc { + uint32_t rb_len; + uint32_t rb_ofs; +} __packed; + +struct vmbus_chanpkt_rxbuf { + struct vmbus_chanpkt_hdr cp_hdr; + uint16_t cp_rxbuf_id; + uint16_t cp_rsvd; + uint32_t cp_rxbuf_cnt; + struct vmbus_rxbuf_desc cp_rxbuf[]; +} __packed; + +struct vmbus_chan_br { + void *cbr; + bus_addr_t cbr_paddr; + int cbr_txsz; + int cbr_rxsz; +}; + +struct vmbus_channel; +struct vmbus_xact; +struct vmbus_xact_ctx; +struct hyperv_guid; +struct task; +struct taskqueue; + +typedef void (*vmbus_chan_callback_t)(struct vmbus_channel *, void *); +typedef int (*vmbus_br_copy_callback_t)(void *, int, void *); + +static __inline struct vmbus_channel * +vmbus_get_channel(device_t dev) +{ + return device_get_ivars(dev); +} + +/* + * vmbus_chan_open_br() + * + * Return values: + * 0 Succeeded. 
+ * EISCONN Failed, and the memory passed through 'br' is still + * connected. Callers must _not_ free the memory + * passed through 'br', if this error happens. + * other values Failed. The memory passed through 'br' is no longer + * connected. Callers are free to do anything with the + * memory passed through 'br'. + * + * + * + * vmbus_chan_close_direct() + * + * NOTE: + * Callers of this function _must_ make sure to close all sub-channels before + * closing the primary channel. + * + * Return values: + * 0 Succeeded. + * EISCONN Failed, and the memory associated with the bufring + * is still connected. Callers must _not_ free the + * memory associated with the bufring, if this error + * happens. + * other values Failed. The memory associated with the bufring is + * no longer connected. Callers are free to do anything + * with the memory associated with the bufring. + */ +int vmbus_chan_open(struct vmbus_channel *chan, + int txbr_size, int rxbr_size, const void *udata, int udlen, + vmbus_chan_callback_t cb, void *cbarg); +int vmbus_chan_open_br(struct vmbus_channel *chan, + const struct vmbus_chan_br *cbr, const void *udata, + int udlen, vmbus_chan_callback_t cb, void *cbarg); +void vmbus_chan_close(struct vmbus_channel *chan); +int vmbus_chan_close_direct(struct vmbus_channel *chan); +void vmbus_chan_intr_drain(struct vmbus_channel *chan); +void vmbus_chan_run_task(struct vmbus_channel *chan, + struct task *task); +void vmbus_chan_set_orphan(struct vmbus_channel *chan, + struct vmbus_xact_ctx *); +void vmbus_chan_unset_orphan(struct vmbus_channel *chan); +const void *vmbus_chan_xact_wait(const struct vmbus_channel *chan, + struct vmbus_xact *xact, size_t *resp_len, bool can_sleep); + +int vmbus_chan_gpadl_connect(struct vmbus_channel *chan, + bus_addr_t paddr, int size, uint32_t *gpadl); +int vmbus_chan_gpadl_disconnect(struct vmbus_channel *chan, + uint32_t gpadl); + +void vmbus_chan_cpu_set(struct vmbus_channel *chan, int cpu); +void 
vmbus_chan_cpu_rr(struct vmbus_channel *chan); +void vmbus_chan_set_readbatch(struct vmbus_channel *chan, bool on); + +struct vmbus_channel ** + vmbus_subchan_get(struct vmbus_channel *pri_chan, + int subchan_cnt); +void vmbus_subchan_rel(struct vmbus_channel **subchan, + int subchan_cnt); +void vmbus_subchan_drain(struct vmbus_channel *pri_chan); + +int vmbus_chan_recv(struct vmbus_channel *chan, void *data, int *dlen, + uint64_t *xactid); +int vmbus_chan_recv_pkt(struct vmbus_channel *chan, + struct vmbus_chanpkt_hdr *pkt, int *pktlen); + +int vmbus_chan_recv_idxadv(struct vmbus_channel *chan, + uint32_t advance); +int vmbus_chan_recv_peek(struct vmbus_channel *chan, + void *data, int data_len, uint32_t advance); +int vmbus_chan_recv_peek_call(struct vmbus_channel *chan, + int data_len, uint32_t skip, + vmbus_br_copy_callback_t cb, void *cbarg); + +int vmbus_chan_send(struct vmbus_channel *chan, uint16_t type, + uint16_t flags, void *data, int dlen, uint64_t xactid); +int vmbus_chan_send_sglist(struct vmbus_channel *chan, + struct vmbus_gpa sg[], int sglen, void *data, int dlen, + uint64_t xactid); +int vmbus_chan_send_prplist(struct vmbus_channel *chan, + struct vmbus_gpa_range *prp, int prp_cnt, void *data, + int dlen, uint64_t xactid); +int vmbus_chan_iov_send(struct vmbus_channel *chan, + const struct iovec iov[], int iovlen, + vmbus_br_copy_callback_t cb, void *cbarg); +uint32_t vmbus_chan_write_available(struct vmbus_channel *chan); +uint32_t vmbus_chan_read_available(struct vmbus_channel *chan); +bool vmbus_chan_write_signal(struct vmbus_channel *chan, + int32_t min_signal_size); +void vmbus_chan_set_pending_send_size(struct vmbus_channel *chan, + uint32_t size); + +uint32_t vmbus_chan_id(const struct vmbus_channel *chan); +uint32_t vmbus_chan_subidx(const struct vmbus_channel *chan); +bool vmbus_chan_is_primary(const struct vmbus_channel *chan); +bool vmbus_chan_is_revoked(const struct vmbus_channel *chan); +bool vmbus_chan_is_hvs(const struct 
vmbus_channel *chan); +bool vmbus_chan_is_hvs_conn_from_host( + const struct vmbus_channel *chan); +int vmbus_req_tl_connect(struct hyperv_guid *, + struct hyperv_guid *); + +struct hyperv_guid * + vmbus_chan_guid_type(struct vmbus_channel *chan); +struct hyperv_guid * + vmbus_chan_guid_inst(struct vmbus_channel *chan); +int vmbus_chan_prplist_nelem(int br_size, int prpcnt_max, + int dlen_max); +bool vmbus_chan_rx_empty(const struct vmbus_channel *chan); +bool vmbus_chan_tx_empty(const struct vmbus_channel *chan); +struct taskqueue * + vmbus_chan_mgmt_tq(const struct vmbus_channel *chan); + +void vmbus_chan_poll_enable(struct vmbus_channel *chan, + u_int pollhz); +void vmbus_chan_poll_disable(struct vmbus_channel *chan); + +#endif /* !_VMBUS_H_ */ diff --git a/sys/dev/hyperv/include/vmbus_xact.h b/sys/dev/hyperv/include/vmbus_xact.h new file mode 100644 index 000000000000..90711a0be774 --- /dev/null +++ b/sys/dev/hyperv/include/vmbus_xact.h @@ -0,0 +1,65 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMBUS_XACT_H_ +#define _VMBUS_XACT_H_ + +#include <sys/param.h> +#include <sys/bus.h> + +struct vmbus_xact; +struct vmbus_xact_ctx; + +struct vmbus_xact_ctx *vmbus_xact_ctx_create(bus_dma_tag_t dtag, + size_t req_size, size_t resp_size, + size_t priv_size); +void vmbus_xact_ctx_destroy(struct vmbus_xact_ctx *ctx); +bool vmbus_xact_ctx_orphan(struct vmbus_xact_ctx *ctx); + +struct vmbus_xact *vmbus_xact_get(struct vmbus_xact_ctx *ctx, + size_t req_len); +void vmbus_xact_put(struct vmbus_xact *xact); + +void *vmbus_xact_req_data(const struct vmbus_xact *xact); +bus_addr_t vmbus_xact_req_paddr(const struct vmbus_xact *xact); +void *vmbus_xact_priv(const struct vmbus_xact *xact, + size_t priv_len); +void vmbus_xact_activate(struct vmbus_xact *xact); +void vmbus_xact_deactivate(struct vmbus_xact *xact); +const void *vmbus_xact_wait(struct vmbus_xact *xact, + size_t *resp_len); +const void *vmbus_xact_busywait(struct vmbus_xact *xact, + size_t *resp_len); +const void *vmbus_xact_poll(struct vmbus_xact *xact, + size_t *resp_len); +void vmbus_xact_wakeup(struct vmbus_xact *xact, + const void *data, size_t dlen); +void vmbus_xact_ctx_wakeup(struct vmbus_xact_ctx *ctx, + const void *data, size_t dlen); + +#endif /* !_VMBUS_XACT_H_ */ diff --git a/sys/dev/hyperv/input/hv_kbd.c b/sys/dev/hyperv/input/hv_kbd.c new file mode 100644 index 000000000000..53aacda7fbcb --- /dev/null +++ b/sys/dev/hyperv/input/hv_kbd.c @@ -0,0 +1,857 @@ +/*- + 
* Copyright (c) 2017 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_evdev.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/uio.h>
#include <sys/bus.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/taskqueue.h>
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/kthread.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/sema.h>
#include <sys/signal.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/mutex.h>
#include <sys/callout.h>

#include <sys/kbio.h>
#include <dev/kbd/kbdreg.h>
#include <dev/kbd/kbdtables.h>

#ifdef EVDEV_SUPPORT
#include <dev/evdev/evdev.h>
#include <dev/evdev/input.h>
#endif

#include "dev/hyperv/input/hv_kbdc.h"

/*
 * Thin wrappers around mtx(9) so the lock used by this driver is named
 * in one place only (see HVKBD_LOCK() below).
 */
#define HVKBD_MTX_LOCK(_m) do {		\
	mtx_lock(_m);			\
} while (0)

#define HVKBD_MTX_UNLOCK(_m) do {	\
	mtx_unlock(_m);			\
} while (0)

#define HVKBD_MTX_ASSERT(_m, _t) do {	\
	mtx_assert(_m, _t);		\
} while (0)

/* All keyboard state in this file is serialized by the Giant mutex. */
#define HVKBD_LOCK()		HVKBD_MTX_LOCK(&Giant)
#define HVKBD_UNLOCK()		HVKBD_MTX_UNLOCK(&Giant)
#define HVKBD_LOCK_ASSERT()	HVKBD_MTX_ASSERT(&Giant, MA_OWNED)

/* Bits kept in hv_kbd_sc.sc_flags. */
#define HVKBD_FLAG_COMPOSE	0x00000001	/* compose char flag */
#define HVKBD_FLAG_POLLING	0x00000002	/* set while sc_polling != 0 */

#ifdef EVDEV_SUPPORT
static evdev_event_t hvkbd_ev_event;

/* evdev callbacks; only the event handler is provided. */
static const struct evdev_methods hvkbd_evdev_methods = {
	.ev_event = hvkbd_ev_event,
};
#endif

/* early keyboard probe, not supported */
/* Always returns 0; `flags' is ignored. */
static int
hvkbd_configure(int flags)
{
	return (0);
}

/* detect a keyboard, not used */
/* Always fails with ENXIO; none of the arguments is examined. */
static int
hvkbd_probe(int unit, void *arg, int flags)
{
	return (ENXIO);
}

/* reset and initialize the device, not used */
/*
 * Always fails with ENXIO.  Note that the debug macro is handed *kbdp,
 * so kbdp itself is dereferenced before returning.
 */
static int
hvkbd_init(int unit, keyboard_t **kbdp, void *arg, int flags)
{
	DEBUG_HVKBD(*kbdp, "%s\n", __func__);
	return (ENXIO);
}

/* test the
interface to the device, not used */ +static int +hvkbd_test_if(keyboard_t *kbd) +{ + DEBUG_HVKBD(kbd, "%s\n", __func__); + return (0); +} + +/* finish using this keyboard, not used */ +static int +hvkbd_term(keyboard_t *kbd) +{ + DEBUG_HVKBD(kbd, "%s\n", __func__); + return (ENXIO); +} + +/* keyboard interrupt routine, not used */ +static int +hvkbd_intr(keyboard_t *kbd, void *arg) +{ + DEBUG_HVKBD(kbd, "%s\n", __func__); + return (0); +} + +/* lock the access to the keyboard, not used */ +static int +hvkbd_lock(keyboard_t *kbd, int lock) +{ + DEBUG_HVKBD(kbd, "%s\n", __func__); + return (1); +} + +/* save the internal state, not used */ +static int +hvkbd_get_state(keyboard_t *kbd, void *buf, size_t len) +{ + DEBUG_HVKBD(kbd,"%s\n", __func__); + return (len == 0) ? 1 : -1; +} + +/* set the internal state, not used */ +static int +hvkbd_set_state(keyboard_t *kbd, void *buf, size_t len) +{ + DEBUG_HVKBD(kbd, "%s\n", __func__); + return (EINVAL); +} + +static int +hvkbd_poll(keyboard_t *kbd, int on) +{ + hv_kbd_sc *sc = kbd->kb_data; + + HVKBD_LOCK(); + /* + * Keep a reference count on polling to allow recursive + * cngrab() during a panic for example. + */ + if (on) + sc->sc_polling++; + else if (sc->sc_polling > 0) + sc->sc_polling--; + + if (sc->sc_polling != 0) { + sc->sc_flags |= HVKBD_FLAG_POLLING; + } else { + sc->sc_flags &= ~HVKBD_FLAG_POLLING; + } + HVKBD_UNLOCK(); + return (0); +} + +/* + * Enable the access to the device; until this function is called, + * the client cannot read from the keyboard. 
+ */ +static int +hvkbd_enable(keyboard_t *kbd) +{ + HVKBD_LOCK(); + KBD_ACTIVATE(kbd); + HVKBD_UNLOCK(); + return (0); +} + +/* disallow the access to the device */ +static int +hvkbd_disable(keyboard_t *kbd) +{ + DEBUG_HVKBD(kbd, "%s\n", __func__); + HVKBD_LOCK(); + KBD_DEACTIVATE(kbd); + HVKBD_UNLOCK(); + return (0); +} + +static void +hvkbd_do_poll(hv_kbd_sc *sc, uint8_t wait) +{ + while (!hv_kbd_prod_is_ready(sc)) { + hv_kbd_read_channel(sc->hs_chan, sc); + if (!wait) + break; + } +} + +/* check if data is waiting */ +/* Currently unused. */ +static int +hvkbd_check(keyboard_t *kbd) +{ + DEBUG_HVKBD(kbd, "%s\n", __func__); + return (0); +} + +/* check if char is waiting */ +static int +hvkbd_check_char_locked(keyboard_t *kbd) +{ + HVKBD_LOCK_ASSERT(); + if (!KBD_IS_ACTIVE(kbd)) + return (FALSE); + + hv_kbd_sc *sc = kbd->kb_data; + if (!(sc->sc_flags & HVKBD_FLAG_COMPOSE) && sc->sc_composed_char != 0) + return (TRUE); + if (sc->sc_flags & HVKBD_FLAG_POLLING) + hvkbd_do_poll(sc, 0); + if (hv_kbd_prod_is_ready(sc)) { + return (TRUE); + } + return (FALSE); +} + +static int +hvkbd_check_char(keyboard_t *kbd) +{ + int result; + + HVKBD_LOCK(); + result = hvkbd_check_char_locked(kbd); + HVKBD_UNLOCK(); + + return (result); +} + +/* read char from the keyboard */ +static uint32_t +hvkbd_read_char_locked(keyboard_t *kbd, int wait) +{ + uint32_t scancode = NOKEY; + uint32_t action; + keystroke ks; + hv_kbd_sc *sc = kbd->kb_data; + int keycode; + + HVKBD_LOCK_ASSERT(); + + if (!KBD_IS_ACTIVE(kbd) || !hv_kbd_prod_is_ready(sc)) + return (NOKEY); + +next_code: + + /* do we have a composed char to return? 
*/ + if (!(sc->sc_flags & HVKBD_FLAG_COMPOSE) && sc->sc_composed_char > 0) { + action = sc->sc_composed_char; + sc->sc_composed_char = 0; + if (action > UCHAR_MAX) { + return (ERRKEY); + } + return (action); + } + + if (hv_kbd_fetch_top(sc, &ks)) { + return (NOKEY); + } + if ((ks.info & IS_E0) || (ks.info & IS_E1)) { + /** + * Emulate the generation of E0 or E1 scancode, + * the real scancode will be consumed next time. + */ + if (ks.info & IS_E0) { + scancode = XTKBD_EMUL0; + ks.info &= ~IS_E0; + } else if (ks.info & IS_E1) { + scancode = XTKBD_EMUL1; + ks.info &= ~IS_E1; + } + /** + * Change the top item to avoid encountering + * E0 or E1 twice. + */ + hv_kbd_modify_top(sc, &ks); + } else if (ks.info & IS_UNICODE) { + /** + * XXX: Hyperv host send unicode to VM through + * 'Type clipboard text', the mapping from + * unicode to scancode depends on the keymap. + * It is so complicated that we do not plan to + * support it yet. + */ + if (bootverbose) + device_printf(sc->dev, "Unsupported unicode\n"); + hv_kbd_remove_top(sc); + return (NOKEY); + } else { + scancode = ks.makecode; + if (ks.info & IS_BREAK) { + scancode |= XTKBD_RELEASE; + } + hv_kbd_remove_top(sc); + } +#ifdef EVDEV_SUPPORT + /* push evdev event */ + if (evdev_rcpt_mask & EVDEV_RCPT_HW_KBD && + sc->ks_evdev != NULL) { + keycode = evdev_scancode2key(&sc->ks_evdev_state, + scancode); + + if (keycode != KEY_RESERVED) { + evdev_push_event(sc->ks_evdev, EV_KEY, + (uint16_t)keycode, scancode & 0x80 ? 
0 : 1); + evdev_sync(sc->ks_evdev); + } + } +#endif + ++kbd->kb_count; + DEBUG_HVKBD(kbd, "read scan: 0x%x\n", scancode); + + /* return the byte as is for the K_RAW mode */ + if (sc->sc_mode == K_RAW) + return scancode; + + /* translate the scan code into a keycode */ + keycode = scancode & 0x7F; + switch (sc->sc_prefix) { + case 0x00: /* normal scancode */ + switch(scancode) { + case 0xB8: /* left alt (compose key) released */ + if (sc->sc_flags & HVKBD_FLAG_COMPOSE) { + sc->sc_flags &= ~HVKBD_FLAG_COMPOSE; + if (sc->sc_composed_char > UCHAR_MAX) + sc->sc_composed_char = 0; + } + break; + case 0x38: /* left alt (compose key) pressed */ + if (!(sc->sc_flags & HVKBD_FLAG_COMPOSE)) { + sc->sc_flags |= HVKBD_FLAG_COMPOSE; + sc->sc_composed_char = 0; + } + break; + case 0xE0: + case 0xE1: + sc->sc_prefix = scancode; + goto next_code; + } + break; + case 0xE0: /* 0xE0 prefix */ + sc->sc_prefix = 0; + switch (keycode) { + case 0x1C: /* right enter key */ + keycode = 0x59; + break; + case 0x1D: /* right ctrl key */ + keycode = 0x5A; + break; + case 0x35: /* keypad divide key */ + keycode = 0x5B; + break; + case 0x37: /* print scrn key */ + keycode = 0x5C; + break; + case 0x38: /* right alt key (alt gr) */ + keycode = 0x5D; + break; + case 0x46: /* ctrl-pause/break on AT 101 (see below) */ + keycode = 0x68; + break; + case 0x47: /* grey home key */ + keycode = 0x5E; + break; + case 0x48: /* grey up arrow key */ + keycode = 0x5F; + break; + case 0x49: /* grey page up key */ + keycode = 0x60; + break; + case 0x4B: /* grey left arrow key */ + keycode = 0x61; + break; + case 0x4D: /* grey right arrow key */ + keycode = 0x62; + break; + case 0x4F: /* grey end key */ + keycode = 0x63; + break; + case 0x50: /* grey down arrow key */ + keycode = 0x64; + break; + case 0x51: /* grey page down key */ + keycode = 0x65; + break; + case 0x52: /* grey insert key */ + keycode = 0x66; + break; + case 0x53: /* grey delete key */ + keycode = 0x67; + break; + /* the following 3 are only used 
on the MS "Natural" keyboard */ + case 0x5b: /* left Window key */ + keycode = 0x69; + break; + case 0x5c: /* right Window key */ + keycode = 0x6a; + break; + case 0x5d: /* menu key */ + keycode = 0x6b; + break; + case 0x5e: /* power key */ + keycode = 0x6d; + break; + case 0x5f: /* sleep key */ + keycode = 0x6e; + break; + case 0x63: /* wake key */ + keycode = 0x6f; + break; + default: /* ignore everything else */ + goto next_code; + } + break; + case 0xE1: /* 0xE1 prefix */ + /* + * The pause/break key on the 101 keyboard produces: + * E1-1D-45 E1-9D-C5 + * Ctrl-pause/break produces: + * E0-46 E0-C6 (See above.) + */ + sc->sc_prefix = 0; + if (keycode == 0x1D) + sc->sc_prefix = 0x1D; + goto next_code; + /* NOT REACHED */ + case 0x1D: /* pause / break */ + sc->sc_prefix = 0; + if (keycode != 0x45) + goto next_code; + keycode = 0x68; + break; + } + + /* XXX assume 101/102 keys AT keyboard */ + switch (keycode) { + case 0x5c: /* print screen */ + if (sc->sc_flags & ALTS) + keycode = 0x54; /* sysrq */ + break; + case 0x68: /* pause/break */ + if (sc->sc_flags & CTLS) + keycode = 0x6c; /* break */ + break; + } + + /* return the key code in the K_CODE mode */ + if (sc->sc_mode == K_CODE) + return (keycode | (scancode & 0x80)); + + /* compose a character code */ + if (sc->sc_flags & HVKBD_FLAG_COMPOSE) { + switch (keycode | (scancode & 0x80)) { + /* key pressed, process it */ + case 0x47: case 0x48: case 0x49: /* keypad 7,8,9 */ + sc->sc_composed_char *= 10; + sc->sc_composed_char += keycode - 0x40; + if (sc->sc_composed_char > UCHAR_MAX) + return ERRKEY; + goto next_code; + case 0x4B: case 0x4C: case 0x4D: /* keypad 4,5,6 */ + sc->sc_composed_char *= 10; + sc->sc_composed_char += keycode - 0x47; + if (sc->sc_composed_char > UCHAR_MAX) + return ERRKEY; + goto next_code; + case 0x4F: case 0x50: case 0x51: /* keypad 1,2,3 */ + sc->sc_composed_char *= 10; + sc->sc_composed_char += keycode - 0x4E; + if (sc->sc_composed_char > UCHAR_MAX) + return ERRKEY; + goto next_code; + 
case 0x52: /* keypad 0 */ + sc->sc_composed_char *= 10; + if (sc->sc_composed_char > UCHAR_MAX) + return ERRKEY; + goto next_code; + + /* key released, no interest here */ + case 0xC7: case 0xC8: case 0xC9: /* keypad 7,8,9 */ + case 0xCB: case 0xCC: case 0xCD: /* keypad 4,5,6 */ + case 0xCF: case 0xD0: case 0xD1: /* keypad 1,2,3 */ + case 0xD2: /* keypad 0 */ + goto next_code; + + case 0x38: /* left alt key */ + break; + + default: + if (sc->sc_composed_char > 0) { + sc->sc_flags &= ~HVKBD_FLAG_COMPOSE; + sc->sc_composed_char = 0; + return (ERRKEY); + } + break; + } + } + + /* keycode to key action */ + action = genkbd_keyaction(kbd, keycode, scancode & 0x80, + &sc->sc_state, &sc->sc_accents); + if (action == NOKEY) + goto next_code; + else + return (action); +} + +/* Currently wait is always false. */ +static uint32_t +hvkbd_read_char(keyboard_t *kbd, int wait) +{ + uint32_t keycode; + + HVKBD_LOCK(); + keycode = hvkbd_read_char_locked(kbd, wait); + HVKBD_UNLOCK(); + + return (keycode); +} + +/* clear the internal state of the keyboard */ +static void +hvkbd_clear_state(keyboard_t *kbd) +{ + hv_kbd_sc *sc = kbd->kb_data; + sc->sc_state &= LOCK_MASK; /* preserve locking key state */ + sc->sc_flags &= ~(HVKBD_FLAG_POLLING | HVKBD_FLAG_COMPOSE); + sc->sc_accents = 0; + sc->sc_composed_char = 0; +} + +static int +hvkbd_ioctl_locked(keyboard_t *kbd, u_long cmd, caddr_t arg) +{ + int i; +#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD4) || defined(COMPAT_43) + int ival; +#endif + hv_kbd_sc *sc = kbd->kb_data; + switch (cmd) { + case KDGKBMODE: + *(int *)arg = sc->sc_mode; + break; +#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD4) || defined(COMPAT_43) + case _IO('K', 7): + ival = IOCPARM_IVAL(arg); + arg = (caddr_t)&ival; + /* FALLTHROUGH */ +#endif + case KDSKBMODE: /* set keyboard mode */ + DEBUG_HVKBD(kbd, "expected mode: %x\n", *(int *)arg); + switch (*(int *)arg) { + case K_XLATE: + if 
(sc->sc_mode != K_XLATE) { + /* make lock key state and LED state match */ + sc->sc_state &= ~LOCK_MASK; + sc->sc_state |= KBD_LED_VAL(kbd); + } + /* FALLTHROUGH */ + case K_RAW: + case K_CODE: + if (sc->sc_mode != *(int *)arg) { + DEBUG_HVKBD(kbd, "mod changed to %x\n", *(int *)arg); + if ((sc->sc_flags & HVKBD_FLAG_POLLING) == 0) + hvkbd_clear_state(kbd); + sc->sc_mode = *(int *)arg; + } + break; + default: + return (EINVAL); + } + break; + case KDGKBSTATE: /* get lock key state */ + *(int *)arg = sc->sc_state & LOCK_MASK; + break; +#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD4) || defined(COMPAT_43) + case _IO('K', 20): + ival = IOCPARM_IVAL(arg); + arg = (caddr_t)&ival; + /* FALLTHROUGH */ +#endif + case KDSKBSTATE: /* set lock key state */ + if (*(int *)arg & ~LOCK_MASK) { + return (EINVAL); + } + sc->sc_state &= ~LOCK_MASK; + sc->sc_state |= *(int *)arg; + return hvkbd_ioctl_locked(kbd, KDSETLED, arg); + case KDGETLED: /* get keyboard LED */ + *(int *)arg = KBD_LED_VAL(kbd); + break; +#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD4) || defined(COMPAT_43) + case _IO('K', 66): + ival = IOCPARM_IVAL(arg); + arg = (caddr_t)&ival; + /* FALLTHROUGH */ +#endif + case KDSETLED: /* set keyboard LED */ + /* NOTE: lock key state in "sc_state" won't be changed */ + if (*(int *)arg & ~LOCK_MASK) + return (EINVAL); + + i = *(int *)arg; + + /* replace CAPS LED with ALTGR LED for ALTGR keyboards */ + if (sc->sc_mode == K_XLATE && + kbd->kb_keymap->n_keys > ALTGR_OFFSET) { + if (i & ALKED) + i |= CLKED; + else + i &= ~CLKED; + } + if (KBD_HAS_DEVICE(kbd)) { + DEBUG_HVSC(sc, "setled 0x%x\n", *(int *)arg); + } + +#ifdef EVDEV_SUPPORT + /* push LED states to evdev */ + if (sc->ks_evdev != NULL && + evdev_rcpt_mask & EVDEV_RCPT_HW_KBD) + evdev_push_leds(sc->ks_evdev, *(int *)arg); +#endif + KBD_LED_VAL(kbd) = *(int *)arg; + break; + case PIO_KEYMAP: /* set keyboard translation table */ + case 
OPIO_KEYMAP: /* set keyboard translation table (compat) */ + case PIO_KEYMAPENT: /* set keyboard translation table entry */ + case PIO_DEADKEYMAP: /* set accent key translation table */ + sc->sc_accents = 0; + /* FALLTHROUGH */ + default: + return (genkbd_commonioctl(kbd, cmd, arg)); + } + return (0); +} + +/* some useful control functions */ +static int +hvkbd_ioctl(keyboard_t *kbd, u_long cmd, caddr_t arg) +{ + DEBUG_HVKBD(kbd, "%s: %lx start\n", __func__, cmd); + HVKBD_LOCK(); + int ret = hvkbd_ioctl_locked(kbd, cmd, arg); + HVKBD_UNLOCK(); + DEBUG_HVKBD(kbd, "%s: %lx end %d\n", __func__, cmd, ret); + return (ret); +} + +/* read one byte from the keyboard if it's allowed */ +/* Currently unused. */ +static int +hvkbd_read(keyboard_t *kbd, int wait) +{ + DEBUG_HVKBD(kbd, "%s\n", __func__); + HVKBD_LOCK_ASSERT(); + if (!KBD_IS_ACTIVE(kbd)) + return (-1); + return hvkbd_read_char_locked(kbd, wait); +} + +#ifdef EVDEV_SUPPORT +static void +hvkbd_ev_event(struct evdev_dev *evdev, uint16_t type, uint16_t code, + int32_t value) +{ + keyboard_t *kbd = evdev_get_softc(evdev); + + if (evdev_rcpt_mask & EVDEV_RCPT_HW_KBD && + (type == EV_LED || type == EV_REP)) { + mtx_lock(&Giant); + kbd_ev_event(kbd, type, code, value); + mtx_unlock(&Giant); + } +} +#endif + +static keyboard_switch_t hvkbdsw = { + .probe = hvkbd_probe, /* not used */ + .init = hvkbd_init, + .term = hvkbd_term, /* not used */ + .intr = hvkbd_intr, /* not used */ + .test_if = hvkbd_test_if, /* not used */ + .enable = hvkbd_enable, + .disable = hvkbd_disable, + .read = hvkbd_read, + .check = hvkbd_check, + .read_char = hvkbd_read_char, + .check_char = hvkbd_check_char, + .ioctl = hvkbd_ioctl, + .lock = hvkbd_lock, /* not used */ + .clear_state = hvkbd_clear_state, + .get_state = hvkbd_get_state, /* not used */ + .set_state = hvkbd_set_state, /* not used */ + .poll = hvkbd_poll, +}; + +KEYBOARD_DRIVER(hvkbd, hvkbdsw, hvkbd_configure); + +void +hv_kbd_intr(hv_kbd_sc *sc) +{ + uint32_t c; + if ((sc->sc_flags & 
HVKBD_FLAG_POLLING) != 0) + return; + + if (KBD_IS_ACTIVE(&sc->sc_kbd) && + KBD_IS_BUSY(&sc->sc_kbd)) { + /* let the callback function process the input */ + (sc->sc_kbd.kb_callback.kc_func) (&sc->sc_kbd, KBDIO_KEYINPUT, + sc->sc_kbd.kb_callback.kc_arg); + } else { + /* read and discard the input, no one is waiting for it */ + do { + c = hvkbd_read_char(&sc->sc_kbd, 0); + } while (c != NOKEY); + } +} + +int +hvkbd_driver_load(module_t mod, int what, void *arg) +{ + switch (what) { + case MOD_LOAD: + kbd_add_driver(&hvkbd_kbd_driver); + break; + case MOD_UNLOAD: + kbd_delete_driver(&hvkbd_kbd_driver); + break; + } + return (0); +} + +int +hv_kbd_drv_attach(device_t dev) +{ + hv_kbd_sc *sc = device_get_softc(dev); + int unit = device_get_unit(dev); + keyboard_t *kbd = &sc->sc_kbd; + keyboard_switch_t *sw; +#ifdef EVDEV_SUPPORT + struct evdev_dev *evdev; +#endif + + sw = kbd_get_switch(HVKBD_DRIVER_NAME); + if (sw == NULL) { + return (ENXIO); + } + + kbd_init_struct(kbd, HVKBD_DRIVER_NAME, KB_OTHER, unit, 0, 0, 0); + kbd->kb_data = (void *)sc; + kbd_set_maps(kbd, &key_map, &accent_map, fkey_tab, nitems(fkey_tab)); + KBD_FOUND_DEVICE(kbd); + hvkbd_clear_state(kbd); + KBD_PROBE_DONE(kbd); + KBD_INIT_DONE(kbd); + sc->sc_mode = K_XLATE; + (*sw->enable)(kbd); + +#ifdef EVDEV_SUPPORT + evdev = evdev_alloc(); + evdev_set_name(evdev, "Hyper-V keyboard"); + evdev_set_phys(evdev, device_get_nameunit(dev)); + evdev_set_id(evdev, BUS_VIRTUAL, 0, 0, 0); + evdev_set_methods(evdev, kbd, &hvkbd_evdev_methods); + evdev_support_event(evdev, EV_SYN); + evdev_support_event(evdev, EV_KEY); + evdev_support_event(evdev, EV_LED); + evdev_support_event(evdev, EV_REP); + evdev_support_all_known_keys(evdev); + evdev_support_led(evdev, LED_NUML); + evdev_support_led(evdev, LED_CAPSL); + evdev_support_led(evdev, LED_SCROLLL); + if (evdev_register_mtx(evdev, &Giant)) + evdev_free(evdev); + else + sc->ks_evdev = evdev; + sc->ks_evdev_state = 0; +#endif + + if (kbd_register(kbd) < 0) { + goto 
detach; + } + KBD_CONFIG_DONE(kbd); +#ifdef KBD_INSTALL_CDEV + if (kbd_attach(kbd)) { + goto detach; + } +#endif + if (bootverbose) { + kbdd_diag(kbd, bootverbose); + } + return (0); +detach: + hv_kbd_drv_detach(dev); + return (ENXIO); +} + +int +hv_kbd_drv_detach(device_t dev) +{ + int error = 0; + hv_kbd_sc *sc = device_get_softc(dev); + hvkbd_disable(&sc->sc_kbd); +#ifdef EVDEV_SUPPORT + evdev_free(sc->ks_evdev); +#endif + if (KBD_IS_CONFIGURED(&sc->sc_kbd)) { + error = kbd_unregister(&sc->sc_kbd); + if (error) { + device_printf(dev, "WARNING: kbd_unregister() " + "returned non-zero! (ignored)\n"); + } + } +#ifdef KBD_INSTALL_CDEV + error = kbd_detach(&sc->sc_kbd); +#endif + return (error); +} + diff --git a/sys/dev/hyperv/input/hv_kbdc.c b/sys/dev/hyperv/input/hv_kbdc.c new file mode 100644 index 000000000000..7065ff3057a7 --- /dev/null +++ b/sys/dev/hyperv/input/hv_kbdc.c @@ -0,0 +1,530 @@ +/*- + * Copyright (c) 2017 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/lock.h> +#include <sys/taskqueue.h> +#include <sys/selinfo.h> +#include <sys/sysctl.h> +#include <sys/poll.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/syscallsubr.h> +#include <sys/sysproto.h> +#include <sys/systm.h> +#include <sys/mutex.h> + +#include <sys/kbio.h> +#include <dev/kbd/kbdreg.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/utilities/hv_utilreg.h> +#include <dev/hyperv/utilities/vmbus_icreg.h> +#include <dev/hyperv/utilities/vmbus_icvar.h> +#include <dev/hyperv/include/vmbus_xact.h> + +#include "dev/hyperv/input/hv_kbdc.h" +#include "vmbus_if.h" + +#define HV_KBD_VER_MAJOR (1) +#define HV_KBD_VER_MINOR (0) + +#define HV_KBD_VER (HV_KBD_VER_MINOR | (HV_KBD_VER_MAJOR) << 16) + +#define HV_KBD_PROTO_ACCEPTED (1) + +#define HV_BUFF_SIZE (4*PAGE_SIZE) +#define HV_KBD_RINGBUFF_SEND_SZ (10*PAGE_SIZE) +#define HV_KBD_RINGBUFF_RECV_SZ (10*PAGE_SIZE) + +enum hv_kbd_msg_type_t { + HV_KBD_PROTO_REQUEST = 1, + HV_KBD_PROTO_RESPONSE = 2, + HV_KBD_PROTO_EVENT = 3, + HV_KBD_PROTO_LED_INDICATORS = 4, +}; + +typedef struct hv_kbd_msg_hdr_t { + uint32_t type; +} hv_kbd_msg_hdr; + +typedef struct hv_kbd_msg_t { + hv_kbd_msg_hdr hdr; + char data[]; +} hv_kbd_msg; + 
+typedef struct hv_kbd_proto_req_t { + hv_kbd_msg_hdr hdr; + uint32_t ver; +} hv_kbd_proto_req; + +typedef struct hv_kbd_proto_resp_t { + hv_kbd_msg_hdr hdr; + uint32_t status; +} hv_kbd_proto_resp; + +#define HV_KBD_PROTO_REQ_SZ (sizeof(hv_kbd_proto_req)) +#define HV_KBD_PROTO_RESP_SZ (sizeof(hv_kbd_proto_resp)) + +/** + * the struct in win host: + * typedef struct _HK_MESSAGE_KEYSTROKE + * { + * HK_MESSAGE_HEADER Header; + * UINT16 MakeCode; + * UINT32 IsUnicode:1; + * UINT32 IsBreak:1; + * UINT32 IsE0:1; + * UINT32 IsE1:1; + * UINT32 Reserved:28; + * } HK_MESSAGE_KEYSTROKE + */ +typedef struct hv_kbd_keystroke_t { + hv_kbd_msg_hdr hdr; + keystroke ks; +} hv_kbd_keystroke; + +static const struct vmbus_ic_desc vmbus_kbd_descs[] = { + { + .ic_guid = { .hv_guid = { + 0x6d, 0xad, 0x12, 0xf9, 0x17, 0x2b, 0xea, 0x48, + 0xbd, 0x65, 0xf9, 0x27, 0xa6, 0x1c, 0x76, 0x84} }, + .ic_desc = "Hyper-V KBD" + }, + VMBUS_IC_DESC_END +}; + +static int hv_kbd_attach(device_t dev); +static int hv_kbd_detach(device_t dev); + +/** + * return 1 if producer is ready + */ +int +hv_kbd_prod_is_ready(hv_kbd_sc *sc) +{ + int ret; + mtx_lock(&sc->ks_mtx); + ret = !STAILQ_EMPTY(&sc->ks_queue); + mtx_unlock(&sc->ks_mtx); + return (ret); +} + +int +hv_kbd_produce_ks(hv_kbd_sc *sc, const keystroke *ks) +{ + int ret = 0; + keystroke_info *ksi; + mtx_lock(&sc->ks_mtx); + if (LIST_EMPTY(&sc->ks_free_list)) { + DEBUG_HVSC(sc, "NO buffer!\n"); + ret = 1; + } else { + ksi = LIST_FIRST(&sc->ks_free_list); + LIST_REMOVE(ksi, link); + ksi->ks = *ks; + STAILQ_INSERT_TAIL(&sc->ks_queue, ksi, slink); + } + mtx_unlock(&sc->ks_mtx); + return (ret); +} + +/** + * return 0 if successfully get the 1st item of queue without removing it + */ +int +hv_kbd_fetch_top(hv_kbd_sc *sc, keystroke *result) +{ + int ret = 0; + keystroke_info *ksi = NULL; + mtx_lock(&sc->ks_mtx); + if (STAILQ_EMPTY(&sc->ks_queue)) { + DEBUG_HVSC(sc, "Empty queue!\n"); + ret = 1; + } else { + ksi = STAILQ_FIRST(&sc->ks_queue); + *result = 
ksi->ks; + } + mtx_unlock(&sc->ks_mtx); + return (ret); +} + +/** + * return 0 if successfully removing the top item + */ +int +hv_kbd_remove_top(hv_kbd_sc *sc) +{ + int ret = 0; + keystroke_info *ksi = NULL; + mtx_lock(&sc->ks_mtx); + if (STAILQ_EMPTY(&sc->ks_queue)) { + DEBUG_HVSC(sc, "Empty queue!\n"); + ret = 1; + } else { + ksi = STAILQ_FIRST(&sc->ks_queue); + STAILQ_REMOVE_HEAD(&sc->ks_queue, slink); + LIST_INSERT_HEAD(&sc->ks_free_list, ksi, link); + } + mtx_unlock(&sc->ks_mtx); + return (ret); +} + +/** + * return 0 if successfully modify the 1st item of queue + */ +int +hv_kbd_modify_top(hv_kbd_sc *sc, keystroke *top) +{ + int ret = 0; + keystroke_info *ksi = NULL; + mtx_lock(&sc->ks_mtx); + if (STAILQ_EMPTY(&sc->ks_queue)) { + DEBUG_HVSC(sc, "Empty queue!\n"); + ret = 1; + } else { + ksi = STAILQ_FIRST(&sc->ks_queue); + ksi->ks = *top; + } + mtx_unlock(&sc->ks_mtx); + return (ret); +} + +static int +hv_kbd_probe(device_t dev) +{ + device_t bus = device_get_parent(dev); + const struct vmbus_ic_desc *d; + + if (resource_disabled(device_get_name(dev), 0)) + return (ENXIO); + + for (d = vmbus_kbd_descs; d->ic_desc != NULL; ++d) { + if (VMBUS_PROBE_GUID(bus, dev, &d->ic_guid) == 0) { + device_set_desc(dev, d->ic_desc); + return (BUS_PROBE_DEFAULT); + } + } + return (ENXIO); +} + +static void +hv_kbd_on_response(hv_kbd_sc *sc, struct vmbus_chanpkt_hdr *pkt) +{ + struct vmbus_xact_ctx *xact = sc->hs_xact_ctx; + if (xact != NULL) { + DEBUG_HVSC(sc, "hvkbd is ready\n"); + vmbus_xact_ctx_wakeup(xact, VMBUS_CHANPKT_CONST_DATA(pkt), + VMBUS_CHANPKT_DATALEN(pkt)); + } +} + +static void +hv_kbd_on_received(hv_kbd_sc *sc, struct vmbus_chanpkt_hdr *pkt) +{ + + const hv_kbd_msg *msg = VMBUS_CHANPKT_CONST_DATA(pkt); + const hv_kbd_proto_resp *resp = + VMBUS_CHANPKT_CONST_DATA(pkt); + const hv_kbd_keystroke *keystroke = + VMBUS_CHANPKT_CONST_DATA(pkt); + uint32_t msg_len = VMBUS_CHANPKT_DATALEN(pkt); + enum hv_kbd_msg_type_t msg_type; + uint32_t info; + uint16_t scan_code; 
+ + if (msg_len <= sizeof(hv_kbd_msg)) { + device_printf(sc->dev, "Illegal packet\n"); + return; + } + msg_type = msg->hdr.type; + switch (msg_type) { + case HV_KBD_PROTO_RESPONSE: + hv_kbd_on_response(sc, pkt); + DEBUG_HVSC(sc, "keyboard resp: 0x%x\n", + resp->status); + break; + case HV_KBD_PROTO_EVENT: + info = keystroke->ks.info; + scan_code = keystroke->ks.makecode; + DEBUG_HVSC(sc, "keystroke info: 0x%x, scan: 0x%x\n", + info, scan_code); + hv_kbd_produce_ks(sc, &keystroke->ks); + hv_kbd_intr(sc); + default: + break; + } +} + +void +hv_kbd_read_channel(struct vmbus_channel *channel, void *context) +{ + uint8_t *buf; + uint32_t buflen = 0; + int ret = 0; + + hv_kbd_sc *sc = (hv_kbd_sc*)context; + buf = sc->buf; + buflen = sc->buflen; + for (;;) { + struct vmbus_chanpkt_hdr *pkt = (struct vmbus_chanpkt_hdr *)buf; + uint32_t rxed = buflen; + + ret = vmbus_chan_recv_pkt(channel, pkt, &rxed); + if (__predict_false(ret == ENOBUFS)) { + buflen = sc->buflen * 2; + while (buflen < rxed) + buflen *= 2; + buf = malloc(buflen, M_DEVBUF, M_WAITOK | M_ZERO); + device_printf(sc->dev, "expand recvbuf %d -> %d\n", + sc->buflen, buflen); + free(sc->buf, M_DEVBUF); + sc->buf = buf; + sc->buflen = buflen; + continue; + } else if (__predict_false(ret == EAGAIN)) { + /* No more channel packets; done! 
*/ + break; + } + KASSERT(!ret, ("vmbus_chan_recv_pkt failed: %d", ret)); + + DEBUG_HVSC(sc, "event: 0x%x\n", pkt->cph_type); + switch (pkt->cph_type) { + case VMBUS_CHANPKT_TYPE_COMP: + case VMBUS_CHANPKT_TYPE_RXBUF: + device_printf(sc->dev, "unhandled event: %d\n", + pkt->cph_type); + break; + case VMBUS_CHANPKT_TYPE_INBAND: + hv_kbd_on_received(sc, pkt); + break; + default: + device_printf(sc->dev, "unknown event: %d\n", + pkt->cph_type); + break; + } + } +} + +static int +hv_kbd_connect_vsp(hv_kbd_sc *sc) +{ + int ret; + size_t resplen; + struct vmbus_xact *xact; + hv_kbd_proto_req *req; + const hv_kbd_proto_resp *resp; + + xact = vmbus_xact_get(sc->hs_xact_ctx, sizeof(*req)); + if (xact == NULL) { + device_printf(sc->dev, "no xact for kbd init"); + return (ENODEV); + } + req = vmbus_xact_req_data(xact); + req->hdr.type = HV_KBD_PROTO_REQUEST; + req->ver = HV_KBD_VER; + + vmbus_xact_activate(xact); + ret = vmbus_chan_send(sc->hs_chan, + VMBUS_CHANPKT_TYPE_INBAND, + VMBUS_CHANPKT_FLAG_RC, + req, sizeof(hv_kbd_proto_req), + (uint64_t)(uintptr_t)xact); + if (ret) { + device_printf(sc->dev, "fail to send\n"); + vmbus_xact_deactivate(xact); + return (ret); + } + resp = vmbus_chan_xact_wait(sc->hs_chan, xact, &resplen, true); + if (resplen < HV_KBD_PROTO_RESP_SZ) { + device_printf(sc->dev, "hv_kbd init communicate failed\n"); + ret = ENODEV; + goto clean; + } + + if (!(resp->status & HV_KBD_PROTO_ACCEPTED)) { + device_printf(sc->dev, "hv_kbd protocol request failed\n"); + ret = ENODEV; + } +clean: + vmbus_xact_put(xact); + DEBUG_HVSC(sc, "finish connect vsp\n"); + return (ret); +} + +static int +hv_kbd_attach1(device_t dev, vmbus_chan_callback_t cb) +{ + int ret; + hv_kbd_sc *sc; + + sc = device_get_softc(dev); + sc->buflen = HV_BUFF_SIZE; + sc->buf = malloc(sc->buflen, M_DEVBUF, M_WAITOK | M_ZERO); + vmbus_chan_set_readbatch(sc->hs_chan, false); + ret = vmbus_chan_open( + sc->hs_chan, + HV_KBD_RINGBUFF_SEND_SZ, + HV_KBD_RINGBUFF_RECV_SZ, + NULL, 0, + cb, + sc); + if 
(ret != 0) { + free(sc->buf, M_DEVBUF); + } + return (ret); +} + +static int +hv_kbd_detach1(device_t dev) +{ + hv_kbd_sc *sc = device_get_softc(dev); + vmbus_chan_close(vmbus_get_channel(dev)); + free(sc->buf, M_DEVBUF); + return (0); +} + +static void +hv_kbd_init(hv_kbd_sc *sc) +{ + const int max_list = 16; + int i; + keystroke_info *ksi; + + mtx_init(&sc->ks_mtx, "hv_kbdc mutex", NULL, MTX_DEF); + LIST_INIT(&sc->ks_free_list); + STAILQ_INIT(&sc->ks_queue); + for (i = 0; i < max_list; i++) { + ksi = malloc(sizeof(keystroke_info), + M_DEVBUF, M_WAITOK|M_ZERO); + LIST_INSERT_HEAD(&sc->ks_free_list, ksi, link); + } +} + +static void +hv_kbd_fini(hv_kbd_sc *sc) +{ + keystroke_info *ksi; + while (!LIST_EMPTY(&sc->ks_free_list)) { + ksi = LIST_FIRST(&sc->ks_free_list); + LIST_REMOVE(ksi, link); + free(ksi, M_DEVBUF); + } + while (!STAILQ_EMPTY(&sc->ks_queue)) { + ksi = STAILQ_FIRST(&sc->ks_queue); + STAILQ_REMOVE_HEAD(&sc->ks_queue, slink); + free(ksi, M_DEVBUF); + } + mtx_destroy(&sc->ks_mtx); +} + +static void +hv_kbd_sysctl(device_t dev) +{ + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + hv_kbd_sc *sc; + + sc = device_get_softc(dev); + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "debug", CTLFLAG_RW, + &sc->debug, 0, "debug hyperv keyboard"); +} + +static int +hv_kbd_attach(device_t dev) +{ + int error = 0; + hv_kbd_sc *sc; + + sc = device_get_softc(dev); + sc->hs_chan = vmbus_get_channel(dev); + sc->dev = dev; + hv_kbd_init(sc); + sc->hs_xact_ctx = vmbus_xact_ctx_create(bus_get_dma_tag(dev), + HV_KBD_PROTO_REQ_SZ, HV_KBD_PROTO_RESP_SZ, 0); + if (sc->hs_xact_ctx == NULL) { + error = ENOMEM; + goto failed; + } + + error = hv_kbd_attach1(dev, hv_kbd_read_channel); + if (error) + goto failed; + error = hv_kbd_connect_vsp(sc); + if (error) + goto failed; + + error = hv_kbd_drv_attach(dev); + if (error) + goto failed; + hv_kbd_sysctl(dev); + return (0); +failed: + 
hv_kbd_detach(dev); + return (error); +} + +static int +hv_kbd_detach(device_t dev) +{ + int ret; + hv_kbd_sc *sc = device_get_softc(dev); + hv_kbd_fini(sc); + if (sc->hs_xact_ctx != NULL) + vmbus_xact_ctx_destroy(sc->hs_xact_ctx); + ret = hv_kbd_detach1(dev); + if (!ret) + device_printf(dev, "Fail to detach\n"); + return hv_kbd_drv_detach(dev); +} + +static device_method_t kbd_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hv_kbd_probe), + DEVMETHOD(device_attach, hv_kbd_attach), + DEVMETHOD(device_detach, hv_kbd_detach), + { 0, 0 } +}; + +static driver_t kbd_driver = {HVKBD_DRIVER_NAME , kbd_methods, sizeof(hv_kbd_sc)}; + +static devclass_t kbd_devclass; + +DRIVER_MODULE(hv_kbd, vmbus, kbd_driver, kbd_devclass, hvkbd_driver_load, NULL); +MODULE_VERSION(hv_kbd, 1); +MODULE_DEPEND(hv_kbd, vmbus, 1, 1, 1); diff --git a/sys/dev/hyperv/input/hv_kbdc.h b/sys/dev/hyperv/input/hv_kbdc.h new file mode 100644 index 000000000000..f6f76035e8c3 --- /dev/null +++ b/sys/dev/hyperv/input/hv_kbdc.h @@ -0,0 +1,118 @@ +/*- + * Copyright (c) 2017 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HV_KBD_H +#define _HV_KBD_H +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/queue.h> +#include <sys/systm.h> + +#include <dev/kbd/kbdreg.h> + +#include "opt_evdev.h" +#ifdef EVDEV_SUPPORT +#include <dev/evdev/evdev.h> +#include <dev/evdev/input.h> +#endif + +#define HVKBD_DRIVER_NAME "hvkbd" +#define IS_UNICODE (1) +#define IS_BREAK (2) +#define IS_E0 (4) +#define IS_E1 (8) + +#define XTKBD_EMUL0 (0xe0) +#define XTKBD_EMUL1 (0xe1) +#define XTKBD_RELEASE (0x80) + +#define DEBUG_HVSC(sc, ...) do { \ + if (sc->debug > 0) { \ + device_printf(sc->dev, __VA_ARGS__); \ + } \ +} while (0) +#define DEBUG_HVKBD(kbd, ...) 
do { \ + hv_kbd_sc *sc = (kbd)->kb_data; \ + DEBUG_HVSC(sc, __VA_ARGS__); \ +} while (0) + +struct vmbus_channel; +struct vmbus_xact_ctx; + +typedef struct keystroke_t { + uint16_t makecode; + uint32_t info; +} keystroke; + +typedef struct keystroke_info { + LIST_ENTRY(keystroke_info) link; + STAILQ_ENTRY(keystroke_info) slink; + keystroke ks; +} keystroke_info; + +typedef struct hv_kbd_sc_t { + struct vmbus_channel *hs_chan; + device_t dev; + struct vmbus_xact_ctx *hs_xact_ctx; + int32_t buflen; + uint8_t *buf; + + struct mtx ks_mtx; + LIST_HEAD(, keystroke_info) ks_free_list; + STAILQ_HEAD(, keystroke_info) ks_queue; /* keystroke info queue */ + + keyboard_t sc_kbd; + int sc_mode; + int sc_state; + uint32_t sc_accents; /* accent key index (> 0) */ + uint32_t sc_composed_char; /* composed char code */ + uint8_t sc_prefix; /* AT scan code prefix */ + int sc_polling; /* polling recursion count */ + uint32_t sc_flags; + int debug; + +#ifdef EVDEV_SUPPORT + struct evdev_dev *ks_evdev; + int ks_evdev_state; +#endif +} hv_kbd_sc; + +int hv_kbd_produce_ks(hv_kbd_sc *sc, const keystroke *ks); +int hv_kbd_fetch_top(hv_kbd_sc *sc, keystroke *top); +int hv_kbd_modify_top(hv_kbd_sc *sc, keystroke *top); +int hv_kbd_remove_top(hv_kbd_sc *sc); +int hv_kbd_prod_is_ready(hv_kbd_sc *sc); +void hv_kbd_read_channel(struct vmbus_channel *, void *); + +int hv_kbd_drv_attach(device_t dev); +int hv_kbd_drv_detach(device_t dev); + +int hvkbd_driver_load(module_t, int, void *); +void hv_kbd_intr(hv_kbd_sc *sc); +#endif diff --git a/sys/dev/hyperv/netvsc/hn_nvs.c b/sys/dev/hyperv/netvsc/hn_nvs.c new file mode 100644 index 000000000000..4dbc28996617 --- /dev/null +++ b/sys/dev/hyperv/netvsc/hn_nvs.c @@ -0,0 +1,751 @@ +/*- + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2012 NetApp Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Network Virtualization Service. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet6.h" +#include "opt_inet.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/socket.h> +#include <sys/systm.h> +#include <sys/taskqueue.h> + +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_media.h> + +#include <netinet/in.h> +#include <netinet/tcp_lro.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/hyperv_busdma.h> +#include <dev/hyperv/include/vmbus.h> +#include <dev/hyperv/include/vmbus_xact.h> + +#include <dev/hyperv/netvsc/ndis.h> +#include <dev/hyperv/netvsc/if_hnreg.h> +#include <dev/hyperv/netvsc/if_hnvar.h> +#include <dev/hyperv/netvsc/hn_nvs.h> + +static int hn_nvs_conn_chim(struct hn_softc *); +static int hn_nvs_conn_rxbuf(struct hn_softc *); +static void hn_nvs_disconn_chim(struct hn_softc *); +static void hn_nvs_disconn_rxbuf(struct hn_softc *); +static int hn_nvs_conf_ndis(struct hn_softc *, int); +static int hn_nvs_init_ndis(struct hn_softc *); +static int hn_nvs_doinit(struct hn_softc *, uint32_t); +static int hn_nvs_init(struct hn_softc *); +static const void *hn_nvs_xact_execute(struct hn_softc *, + struct vmbus_xact *, void *, int, + size_t *, uint32_t); +static void hn_nvs_sent_none(struct hn_nvs_sendctx *, + struct hn_softc *, struct vmbus_channel *, + const void *, int); + +struct hn_nvs_sendctx hn_nvs_sendctx_none = + HN_NVS_SENDCTX_INITIALIZER(hn_nvs_sent_none, NULL); + +static const uint32_t hn_nvs_version[] = { + HN_NVS_VERSION_61, + HN_NVS_VERSION_6, + HN_NVS_VERSION_5, + HN_NVS_VERSION_4, + HN_NVS_VERSION_2, + HN_NVS_VERSION_1 +}; + +static const void * +hn_nvs_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact, + void *req, int reqlen, size_t *resplen0, uint32_t type) +{ + struct hn_nvs_sendctx sndc; + size_t resplen, min_resplen = *resplen0; + const struct hn_nvs_hdr *hdr; + int error; + + KASSERT(min_resplen >= sizeof(*hdr), + ("invalid minimum 
response len %zu", min_resplen)); + + /* + * Execute the xact setup by the caller. + */ + hn_nvs_sendctx_init(&sndc, hn_nvs_sent_xact, xact); + + vmbus_xact_activate(xact); + error = hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_RC, + req, reqlen, &sndc); + if (error) { + vmbus_xact_deactivate(xact); + return (NULL); + } + hdr = vmbus_chan_xact_wait(sc->hn_prichan, xact, &resplen, + HN_CAN_SLEEP(sc)); + + /* + * Check this NVS response message. + */ + if (resplen < min_resplen) { + if_printf(sc->hn_ifp, "invalid NVS resp len %zu\n", resplen); + return (NULL); + } + if (hdr->nvs_type != type) { + if_printf(sc->hn_ifp, "unexpected NVS resp 0x%08x, " + "expect 0x%08x\n", hdr->nvs_type, type); + return (NULL); + } + /* All pass! */ + *resplen0 = resplen; + return (hdr); +} + +static __inline int +hn_nvs_req_send(struct hn_softc *sc, void *req, int reqlen) +{ + + return (hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_NONE, + req, reqlen, &hn_nvs_sendctx_none)); +} + +static int +hn_nvs_conn_rxbuf(struct hn_softc *sc) +{ + struct vmbus_xact *xact = NULL; + struct hn_nvs_rxbuf_conn *conn; + const struct hn_nvs_rxbuf_connresp *resp; + size_t resp_len; + uint32_t status; + int error, rxbuf_size; + + /* + * Limit RXBUF size for old NVS. + */ + if (sc->hn_nvs_ver <= HN_NVS_VERSION_2) + rxbuf_size = HN_RXBUF_SIZE_COMPAT; + else + rxbuf_size = HN_RXBUF_SIZE; + + /* + * Connect the RXBUF GPADL to the primary channel. + * + * NOTE: + * Only primary channel has RXBUF connected to it. Sub-channels + * just share this RXBUF. + */ + error = vmbus_chan_gpadl_connect(sc->hn_prichan, + sc->hn_rxbuf_dma.hv_paddr, rxbuf_size, &sc->hn_rxbuf_gpadl); + if (error) { + if_printf(sc->hn_ifp, "rxbuf gpadl conn failed: %d\n", + error); + goto cleanup; + } + + /* + * Connect RXBUF to NVS. 
+ */ + + xact = vmbus_xact_get(sc->hn_xact, sizeof(*conn)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for nvs rxbuf conn\n"); + error = ENXIO; + goto cleanup; + } + conn = vmbus_xact_req_data(xact); + conn->nvs_type = HN_NVS_TYPE_RXBUF_CONN; + conn->nvs_gpadl = sc->hn_rxbuf_gpadl; + conn->nvs_sig = HN_NVS_RXBUF_SIG; + + resp_len = sizeof(*resp); + resp = hn_nvs_xact_execute(sc, xact, conn, sizeof(*conn), &resp_len, + HN_NVS_TYPE_RXBUF_CONNRESP); + if (resp == NULL) { + if_printf(sc->hn_ifp, "exec nvs rxbuf conn failed\n"); + error = EIO; + goto cleanup; + } + + status = resp->nvs_status; + vmbus_xact_put(xact); + xact = NULL; + + if (status != HN_NVS_STATUS_OK) { + if_printf(sc->hn_ifp, "nvs rxbuf conn failed: %x\n", status); + error = EIO; + goto cleanup; + } + sc->hn_flags |= HN_FLAG_RXBUF_CONNECTED; + + return (0); + +cleanup: + if (xact != NULL) + vmbus_xact_put(xact); + hn_nvs_disconn_rxbuf(sc); + return (error); +} + +static int +hn_nvs_conn_chim(struct hn_softc *sc) +{ + struct vmbus_xact *xact = NULL; + struct hn_nvs_chim_conn *chim; + const struct hn_nvs_chim_connresp *resp; + size_t resp_len; + uint32_t status, sectsz; + int error; + + /* + * Connect chimney sending buffer GPADL to the primary channel. + * + * NOTE: + * Only primary channel has chimney sending buffer connected to it. + * Sub-channels just share this chimney sending buffer. 
+ */ + error = vmbus_chan_gpadl_connect(sc->hn_prichan, + sc->hn_chim_dma.hv_paddr, HN_CHIM_SIZE, &sc->hn_chim_gpadl); + if (error) { + if_printf(sc->hn_ifp, "chim gpadl conn failed: %d\n", error); + goto cleanup; + } + + /* + * Connect chimney sending buffer to NVS + */ + + xact = vmbus_xact_get(sc->hn_xact, sizeof(*chim)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for nvs chim conn\n"); + error = ENXIO; + goto cleanup; + } + chim = vmbus_xact_req_data(xact); + chim->nvs_type = HN_NVS_TYPE_CHIM_CONN; + chim->nvs_gpadl = sc->hn_chim_gpadl; + chim->nvs_sig = HN_NVS_CHIM_SIG; + + resp_len = sizeof(*resp); + resp = hn_nvs_xact_execute(sc, xact, chim, sizeof(*chim), &resp_len, + HN_NVS_TYPE_CHIM_CONNRESP); + if (resp == NULL) { + if_printf(sc->hn_ifp, "exec nvs chim conn failed\n"); + error = EIO; + goto cleanup; + } + + status = resp->nvs_status; + sectsz = resp->nvs_sectsz; + vmbus_xact_put(xact); + xact = NULL; + + if (status != HN_NVS_STATUS_OK) { + if_printf(sc->hn_ifp, "nvs chim conn failed: %x\n", status); + error = EIO; + goto cleanup; + } + if (sectsz == 0 || sectsz % sizeof(uint32_t) != 0) { + /* + * Can't use chimney sending buffer; done! 
+ */ + if (sectsz == 0) { + if_printf(sc->hn_ifp, "zero chimney sending buffer " + "section size\n"); + } else { + if_printf(sc->hn_ifp, "misaligned chimney sending " + "buffers, section size: %u\n", sectsz); + } + sc->hn_chim_szmax = 0; + sc->hn_chim_cnt = 0; + sc->hn_flags |= HN_FLAG_CHIM_CONNECTED; + return (0); + } + + sc->hn_chim_szmax = sectsz; + sc->hn_chim_cnt = HN_CHIM_SIZE / sc->hn_chim_szmax; + if (HN_CHIM_SIZE % sc->hn_chim_szmax != 0) { + if_printf(sc->hn_ifp, "chimney sending sections are " + "not properly aligned\n"); + } + if (sc->hn_chim_cnt % LONG_BIT != 0) { + if_printf(sc->hn_ifp, "discard %d chimney sending sections\n", + sc->hn_chim_cnt % LONG_BIT); + } + + sc->hn_chim_bmap_cnt = sc->hn_chim_cnt / LONG_BIT; + sc->hn_chim_bmap = malloc(sc->hn_chim_bmap_cnt * sizeof(u_long), + M_DEVBUF, M_WAITOK | M_ZERO); + + /* Done! */ + sc->hn_flags |= HN_FLAG_CHIM_CONNECTED; + if (bootverbose) { + if_printf(sc->hn_ifp, "chimney sending buffer %d/%d\n", + sc->hn_chim_szmax, sc->hn_chim_cnt); + } + return (0); + +cleanup: + if (xact != NULL) + vmbus_xact_put(xact); + hn_nvs_disconn_chim(sc); + return (error); +} + +static void +hn_nvs_disconn_rxbuf(struct hn_softc *sc) +{ + int error; + + if (sc->hn_flags & HN_FLAG_RXBUF_CONNECTED) { + struct hn_nvs_rxbuf_disconn disconn; + + /* + * Disconnect RXBUF from NVS. + */ + memset(&disconn, 0, sizeof(disconn)); + disconn.nvs_type = HN_NVS_TYPE_RXBUF_DISCONN; + disconn.nvs_sig = HN_NVS_RXBUF_SIG; + + /* NOTE: No response. */ + error = hn_nvs_req_send(sc, &disconn, sizeof(disconn)); + if (error) { + if_printf(sc->hn_ifp, + "send nvs rxbuf disconn failed: %d\n", error); + /* + * Fine for a revoked channel, since the hypervisor + * does not drain TX bufring for a revoked channel. + */ + if (!vmbus_chan_is_revoked(sc->hn_prichan)) + sc->hn_flags |= HN_FLAG_RXBUF_REF; + } + sc->hn_flags &= ~HN_FLAG_RXBUF_CONNECTED; + + /* + * Wait for the hypervisor to receive this NVS request. 
+ * + * NOTE: + * The TX bufring will not be drained by the hypervisor, + * if the primary channel is revoked. + */ + while (!vmbus_chan_tx_empty(sc->hn_prichan) && + !vmbus_chan_is_revoked(sc->hn_prichan)) + pause("waittx", 1); + /* + * Linger long enough for NVS to disconnect RXBUF. + */ + pause("lingtx", (200 * hz) / 1000); + } + + if (vmbus_current_version < VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { + /* + * Disconnect RXBUF from primary channel. + */ + error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, + sc->hn_rxbuf_gpadl); + if (error) { + if_printf(sc->hn_ifp, + "rxbuf gpadl disconn failed: %d\n", error); + sc->hn_flags |= HN_FLAG_RXBUF_REF; + } + sc->hn_rxbuf_gpadl = 0; + } +} + +static void +hn_nvs_disconn_chim(struct hn_softc *sc) +{ + int error; + + if (sc->hn_flags & HN_FLAG_CHIM_CONNECTED) { + struct hn_nvs_chim_disconn disconn; + + /* + * Disconnect chimney sending buffer from NVS. + */ + memset(&disconn, 0, sizeof(disconn)); + disconn.nvs_type = HN_NVS_TYPE_CHIM_DISCONN; + disconn.nvs_sig = HN_NVS_CHIM_SIG; + + /* NOTE: No response. */ + error = hn_nvs_req_send(sc, &disconn, sizeof(disconn)); + if (error) { + if_printf(sc->hn_ifp, + "send nvs chim disconn failed: %d\n", error); + /* + * Fine for a revoked channel, since the hypervisor + * does not drain TX bufring for a revoked channel. + */ + if (!vmbus_chan_is_revoked(sc->hn_prichan)) + sc->hn_flags |= HN_FLAG_CHIM_REF; + } + sc->hn_flags &= ~HN_FLAG_CHIM_CONNECTED; + + /* + * Wait for the hypervisor to receive this NVS request. + * + * NOTE: + * The TX bufring will not be drained by the hypervisor, + * if the primary channel is revoked. + */ + while (!vmbus_chan_tx_empty(sc->hn_prichan) && + !vmbus_chan_is_revoked(sc->hn_prichan)) + pause("waittx", 1); + /* + * Linger long enough for NVS to disconnect chimney + * sending buffer. 
+ */ + pause("lingtx", (200 * hz) / 1000); + } + + if (vmbus_current_version < VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { + /* + * Disconnect chimney sending buffer from primary channel. + */ + error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, + sc->hn_chim_gpadl); + if (error) { + if_printf(sc->hn_ifp, + "chim gpadl disconn failed: %d\n", error); + sc->hn_flags |= HN_FLAG_CHIM_REF; + } + sc->hn_chim_gpadl = 0; + } + + if (sc->hn_chim_bmap != NULL) { + free(sc->hn_chim_bmap, M_DEVBUF); + sc->hn_chim_bmap = NULL; + sc->hn_chim_bmap_cnt = 0; + } +} + +static int +hn_nvs_doinit(struct hn_softc *sc, uint32_t nvs_ver) +{ + struct vmbus_xact *xact; + struct hn_nvs_init *init; + const struct hn_nvs_init_resp *resp; + size_t resp_len; + uint32_t status; + + xact = vmbus_xact_get(sc->hn_xact, sizeof(*init)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for nvs init\n"); + return (ENXIO); + } + init = vmbus_xact_req_data(xact); + init->nvs_type = HN_NVS_TYPE_INIT; + init->nvs_ver_min = nvs_ver; + init->nvs_ver_max = nvs_ver; + + resp_len = sizeof(*resp); + resp = hn_nvs_xact_execute(sc, xact, init, sizeof(*init), &resp_len, + HN_NVS_TYPE_INIT_RESP); + if (resp == NULL) { + if_printf(sc->hn_ifp, "exec init failed\n"); + vmbus_xact_put(xact); + return (EIO); + } + + status = resp->nvs_status; + vmbus_xact_put(xact); + + if (status != HN_NVS_STATUS_OK) { + if (bootverbose) { + /* + * Caller may try another NVS version, and will log + * error if there are no more NVS versions to try, + * so don't bark out loud here. + */ + if_printf(sc->hn_ifp, "nvs init failed for ver 0x%x\n", + nvs_ver); + } + return (EINVAL); + } + return (0); +} + +/* + * Configure MTU and enable VLAN. 
+ */ +static int +hn_nvs_conf_ndis(struct hn_softc *sc, int mtu) +{ + struct hn_nvs_ndis_conf conf; + int error; + + memset(&conf, 0, sizeof(conf)); + conf.nvs_type = HN_NVS_TYPE_NDIS_CONF; + conf.nvs_mtu = mtu + ETHER_HDR_LEN; + conf.nvs_caps = HN_NVS_NDIS_CONF_VLAN; + if (sc->hn_nvs_ver >= HN_NVS_VERSION_5) + conf.nvs_caps |= HN_NVS_NDIS_CONF_SRIOV; + if (sc->hn_nvs_ver >= HN_NVS_VERSION_61) + conf.nvs_caps |= HN_NVS_NDIS_CONF_RSC; + + + /* NOTE: No response. */ + error = hn_nvs_req_send(sc, &conf, sizeof(conf)); + if (error) { + if_printf(sc->hn_ifp, "send nvs ndis conf failed: %d\n", error); + return (error); + } + + if (bootverbose) + if_printf(sc->hn_ifp, "nvs ndis conf done\n"); + sc->hn_caps |= HN_CAP_MTU | HN_CAP_VLAN; + return (0); +} + +static int +hn_nvs_init_ndis(struct hn_softc *sc) +{ + struct hn_nvs_ndis_init ndis; + int error; + + memset(&ndis, 0, sizeof(ndis)); + ndis.nvs_type = HN_NVS_TYPE_NDIS_INIT; + ndis.nvs_ndis_major = HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver); + ndis.nvs_ndis_minor = HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver); + + /* NOTE: No response. */ + error = hn_nvs_req_send(sc, &ndis, sizeof(ndis)); + if (error) + if_printf(sc->hn_ifp, "send nvs ndis init failed: %d\n", error); + return (error); +} + +static int +hn_nvs_init(struct hn_softc *sc) +{ + int i, error; + + if (device_is_attached(sc->hn_dev)) { + /* + * NVS version and NDIS version MUST NOT be changed. + */ + if (bootverbose) { + if_printf(sc->hn_ifp, "reinit NVS version 0x%x, " + "NDIS version %u.%u\n", sc->hn_nvs_ver, + HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), + HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); + } + + error = hn_nvs_doinit(sc, sc->hn_nvs_ver); + if (error) { + if_printf(sc->hn_ifp, "reinit NVS version 0x%x " + "failed: %d\n", sc->hn_nvs_ver, error); + return (error); + } + goto done; + } + + /* + * Find the supported NVS version and set NDIS version accordingly. 
+ */ + for (i = 0; i < nitems(hn_nvs_version); ++i) { + error = hn_nvs_doinit(sc, hn_nvs_version[i]); + if (!error) { + sc->hn_nvs_ver = hn_nvs_version[i]; + + /* Set NDIS version according to NVS version. */ + sc->hn_ndis_ver = HN_NDIS_VERSION_6_30; + if (sc->hn_nvs_ver <= HN_NVS_VERSION_4) + sc->hn_ndis_ver = HN_NDIS_VERSION_6_1; + + if (bootverbose) { + if_printf(sc->hn_ifp, "NVS version 0x%x, " + "NDIS version %u.%u\n", sc->hn_nvs_ver, + HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), + HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); + } + goto done; + } + } + if_printf(sc->hn_ifp, "no NVS available\n"); + return (ENXIO); + +done: + if (sc->hn_nvs_ver >= HN_NVS_VERSION_5) + sc->hn_caps |= HN_CAP_HASHVAL; + return (0); +} + +int +hn_nvs_attach(struct hn_softc *sc, int mtu) +{ + int error; + + if (hyperv_ver_major >= 10) { + /* UDP 4-tuple hash is enforced. */ + sc->hn_caps |= HN_CAP_UDPHASH; + } + + /* + * Initialize NVS. + */ + error = hn_nvs_init(sc); + if (error) + return (error); + + if (sc->hn_nvs_ver >= HN_NVS_VERSION_2) { + /* + * Configure NDIS before initializing it. + */ + error = hn_nvs_conf_ndis(sc, mtu); + if (error) + return (error); + } + + /* + * Initialize NDIS. + */ + error = hn_nvs_init_ndis(sc); + if (error) + return (error); + + /* + * Connect RXBUF. + */ + error = hn_nvs_conn_rxbuf(sc); + if (error) + return (error); + + /* + * Connect chimney sending buffer. + */ + error = hn_nvs_conn_chim(sc); + if (error) { + hn_nvs_disconn_rxbuf(sc); + return (error); + } + return (0); +} + +void +hn_nvs_detach(struct hn_softc *sc) +{ + + /* NOTE: there are no requests to stop the NVS. 
*/ + hn_nvs_disconn_rxbuf(sc); + hn_nvs_disconn_chim(sc); +} + +void +hn_nvs_sent_xact(struct hn_nvs_sendctx *sndc, + struct hn_softc *sc __unused, struct vmbus_channel *chan __unused, + const void *data, int dlen) +{ + + vmbus_xact_wakeup(sndc->hn_cbarg, data, dlen); +} + +static void +hn_nvs_sent_none(struct hn_nvs_sendctx *sndc __unused, + struct hn_softc *sc __unused, struct vmbus_channel *chan __unused, + const void *data __unused, int dlen __unused) +{ + /* EMPTY */ +} + +int +hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch0) +{ + struct vmbus_xact *xact; + struct hn_nvs_subch_req *req; + const struct hn_nvs_subch_resp *resp; + int error, nsubch_req; + uint32_t nsubch; + size_t resp_len; + + nsubch_req = *nsubch0; + KASSERT(nsubch_req > 0, ("invalid # of sub-channels %d", nsubch_req)); + + xact = vmbus_xact_get(sc->hn_xact, sizeof(*req)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for nvs subch alloc\n"); + return (ENXIO); + } + req = vmbus_xact_req_data(xact); + req->nvs_type = HN_NVS_TYPE_SUBCH_REQ; + req->nvs_op = HN_NVS_SUBCH_OP_ALLOC; + req->nvs_nsubch = nsubch_req; + + resp_len = sizeof(*resp); + resp = hn_nvs_xact_execute(sc, xact, req, sizeof(*req), &resp_len, + HN_NVS_TYPE_SUBCH_RESP); + if (resp == NULL) { + if_printf(sc->hn_ifp, "exec nvs subch alloc failed\n"); + error = EIO; + goto done; + } + if (resp->nvs_status != HN_NVS_STATUS_OK) { + if_printf(sc->hn_ifp, "nvs subch alloc failed: %x\n", + resp->nvs_status); + error = EIO; + goto done; + } + + nsubch = resp->nvs_nsubch; + if (nsubch > nsubch_req) { + if_printf(sc->hn_ifp, "%u subchans are allocated, " + "requested %d\n", nsubch, nsubch_req); + nsubch = nsubch_req; + } + *nsubch0 = nsubch; + error = 0; +done: + vmbus_xact_put(xact); + return (error); +} + +int +hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan, + struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt) +{ + + return hn_nvs_send_rndis_sglist(chan, HN_NVS_RNDIS_MTYPE_CTRL, + sndc, gpa, gpa_cnt); +} + 
+void +hn_nvs_set_datapath(struct hn_softc *sc, uint32_t path) +{ + struct hn_nvs_datapath dp; + + memset(&dp, 0, sizeof(dp)); + dp.nvs_type = HN_NVS_TYPE_SET_DATAPATH; + dp.nvs_active_path = path; + + hn_nvs_req_send(sc, &dp, sizeof(dp)); +} diff --git a/sys/dev/hyperv/netvsc/hn_nvs.h b/sys/dev/hyperv/netvsc/hn_nvs.h new file mode 100644 index 000000000000..a14d7b765590 --- /dev/null +++ b/sys/dev/hyperv/netvsc/hn_nvs.h @@ -0,0 +1,107 @@ +/*- + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2012 NetApp Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _HN_NVS_H_ +#define _HN_NVS_H_ + +struct hn_nvs_sendctx; +struct vmbus_channel; +struct hn_softc; + +typedef void (*hn_nvs_sent_t) + (struct hn_nvs_sendctx *, struct hn_softc *, + struct vmbus_channel *, const void *, int); + +struct hn_nvs_sendctx { + hn_nvs_sent_t hn_cb; + void *hn_cbarg; +}; + +#define HN_NVS_SENDCTX_INITIALIZER(cb, cbarg) \ +{ \ + .hn_cb = cb, \ + .hn_cbarg = cbarg \ +} + +static __inline void +hn_nvs_sendctx_init(struct hn_nvs_sendctx *sndc, hn_nvs_sent_t cb, void *cbarg) +{ + + sndc->hn_cb = cb; + sndc->hn_cbarg = cbarg; +} + +static __inline int +hn_nvs_send(struct vmbus_channel *chan, uint16_t flags, + void *nvs_msg, int nvs_msglen, struct hn_nvs_sendctx *sndc) +{ + + return (vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, flags, + nvs_msg, nvs_msglen, (uint64_t)(uintptr_t)sndc)); +} + +static __inline int +hn_nvs_send_sglist(struct vmbus_channel *chan, struct vmbus_gpa sg[], int sglen, + void *nvs_msg, int nvs_msglen, struct hn_nvs_sendctx *sndc) +{ + + return (vmbus_chan_send_sglist(chan, sg, sglen, nvs_msg, nvs_msglen, + (uint64_t)(uintptr_t)sndc)); +} + +static __inline int +hn_nvs_send_rndis_sglist(struct vmbus_channel *chan, uint32_t rndis_mtype, + struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt) +{ + struct hn_nvs_rndis rndis; + + rndis.nvs_type = HN_NVS_TYPE_RNDIS; + rndis.nvs_rndis_mtype = rndis_mtype; + rndis.nvs_chim_idx = HN_NVS_CHIM_IDX_INVALID; + rndis.nvs_chim_sz = 0; + + return (hn_nvs_send_sglist(chan, gpa, gpa_cnt, + &rndis, sizeof(rndis), sndc)); +} + +int hn_nvs_attach(struct hn_softc *sc, int mtu); +void hn_nvs_detach(struct hn_softc *sc); +int hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch); +void hn_nvs_sent_xact(struct hn_nvs_sendctx *sndc, + struct hn_softc *sc, struct vmbus_channel *chan, + const void *data, int dlen); +int hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan, + struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, + int gpa_cnt); +void 
hn_nvs_set_datapath(struct hn_softc *sc, uint32_t path); + +extern struct hn_nvs_sendctx hn_nvs_sendctx_none; + +#endif /* !_HN_NVS_H_ */ diff --git a/sys/dev/hyperv/netvsc/hn_rndis.c b/sys/dev/hyperv/netvsc/hn_rndis.c new file mode 100644 index 000000000000..108950aa3f9b --- /dev/null +++ b/sys/dev/hyperv/netvsc/hn_rndis.c @@ -0,0 +1,1061 @@ +/*- + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2012 NetApp Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet6.h" +#include "opt_inet.h" + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/systm.h> +#include <sys/taskqueue.h> + +#include <machine/atomic.h> + +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_media.h> +#include <net/rndis.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp_lro.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/hyperv_busdma.h> +#include <dev/hyperv/include/vmbus.h> +#include <dev/hyperv/include/vmbus_xact.h> + +#include <dev/hyperv/netvsc/ndis.h> +#include <dev/hyperv/netvsc/if_hnreg.h> +#include <dev/hyperv/netvsc/if_hnvar.h> +#include <dev/hyperv/netvsc/hn_nvs.h> +#include <dev/hyperv/netvsc/hn_rndis.h> + +#define HN_RNDIS_RID_COMPAT_MASK 0xffff +#define HN_RNDIS_RID_COMPAT_MAX HN_RNDIS_RID_COMPAT_MASK + +#define HN_RNDIS_XFER_SIZE 2048 + +#define HN_NDIS_TXCSUM_CAP_IP4 \ + (NDIS_TXCSUM_CAP_IP4 | NDIS_TXCSUM_CAP_IP4OPT) +#define HN_NDIS_TXCSUM_CAP_TCP4 \ + (NDIS_TXCSUM_CAP_TCP4 | NDIS_TXCSUM_CAP_TCP4OPT) +#define HN_NDIS_TXCSUM_CAP_TCP6 \ + (NDIS_TXCSUM_CAP_TCP6 | NDIS_TXCSUM_CAP_TCP6OPT | \ + NDIS_TXCSUM_CAP_IP6EXT) +#define HN_NDIS_TXCSUM_CAP_UDP6 \ + (NDIS_TXCSUM_CAP_UDP6 | NDIS_TXCSUM_CAP_IP6EXT) +#define HN_NDIS_LSOV2_CAP_IP6 \ + (NDIS_LSOV2_CAP_IP6EXT | NDIS_LSOV2_CAP_TCP6OPT) + +static const void *hn_rndis_xact_exec1(struct hn_softc *, + struct vmbus_xact *, size_t, + struct hn_nvs_sendctx *, size_t *); +static const void *hn_rndis_xact_execute(struct hn_softc *, + struct vmbus_xact *, uint32_t, size_t, size_t *, + uint32_t); +static int hn_rndis_query(struct hn_softc *, uint32_t, + const void *, size_t, void *, size_t *); +static int hn_rndis_query2(struct hn_softc *, uint32_t, + const void *, size_t, void *, size_t *, size_t); +static int hn_rndis_set(struct hn_softc *, uint32_t, + const void *, size_t); +static int hn_rndis_init(struct hn_softc *); 
+static int hn_rndis_halt(struct hn_softc *); +static int hn_rndis_conf_offload(struct hn_softc *, int); +static int hn_rndis_query_hwcaps(struct hn_softc *, + struct ndis_offload *); + +/* + * Allocate the next RNDIS request id from the sc->hn_rndis_rid counter. + * + * The low 16 bits of the counter are moved into the upper 16 bits of the + * returned id, so every returned id is non-zero and greater than + * HN_RNDIS_RID_COMPAT_MAX, as hn_rndis_xact_execute() and + * hn_rndis_rx_ctrl() assert. + */ +static __inline uint32_t +hn_rndis_rid(struct hn_softc *sc) +{ + uint32_t rid; + + /* + * Skip every counter value whose low 16 bits are zero, not only 0 + * itself; e.g. 0x10000 would otherwise yield a request id of 0. + */ + do { + rid = atomic_fetchadd_int(&sc->hn_rndis_rid, 1); + } while ((rid & 0xffff) == 0); + + /* Use upper 16 bits for non-compat RNDIS messages. */ + return ((rid & 0xffff) << 16); +} + +void +hn_rndis_rx_ctrl(struct hn_softc *sc, const void *data, int dlen) +{ + const struct rndis_comp_hdr *comp; + const struct rndis_msghdr *hdr; + + KASSERT(dlen >= sizeof(*hdr), ("invalid RNDIS msg\n")); + hdr = data; + + switch (hdr->rm_type) { + case REMOTE_NDIS_INITIALIZE_CMPLT: + case REMOTE_NDIS_QUERY_CMPLT: + case REMOTE_NDIS_SET_CMPLT: + case REMOTE_NDIS_KEEPALIVE_CMPLT: /* unused */ + if (dlen < sizeof(*comp)) { + if_printf(sc->hn_ifp, "invalid RNDIS cmplt\n"); + return; + } + comp = data; + + KASSERT(comp->rm_rid > HN_RNDIS_RID_COMPAT_MAX, + ("invalid RNDIS rid 0x%08x\n", comp->rm_rid)); + vmbus_xact_ctx_wakeup(sc->hn_xact, comp, dlen); + break; + + case REMOTE_NDIS_RESET_CMPLT: + /* + * Reset completed, no rid. + * + * NOTE: + * RESET is not issued by hn(4), so this message should + * _not_ be observed.
+ */ + if_printf(sc->hn_ifp, "RESET cmplt received\n"); + break; + + default: + if_printf(sc->hn_ifp, "unknown RNDIS msg 0x%x\n", + hdr->rm_type); + break; + } +} + +int +hn_rndis_get_eaddr(struct hn_softc *sc, uint8_t *eaddr) +{ + size_t eaddr_len; + int error; + + eaddr_len = ETHER_ADDR_LEN; + error = hn_rndis_query(sc, OID_802_3_PERMANENT_ADDRESS, NULL, 0, + eaddr, &eaddr_len); + if (error) + return (error); + if (eaddr_len != ETHER_ADDR_LEN) { + if_printf(sc->hn_ifp, "invalid eaddr len %zu\n", eaddr_len); + return (EINVAL); + } + return (0); +} + +int +hn_rndis_get_linkstatus(struct hn_softc *sc, uint32_t *link_status) +{ + size_t size; + int error; + + size = sizeof(*link_status); + error = hn_rndis_query(sc, OID_GEN_MEDIA_CONNECT_STATUS, NULL, 0, + link_status, &size); + if (error) + return (error); + if (size != sizeof(uint32_t)) { + if_printf(sc->hn_ifp, "invalid link status len %zu\n", size); + return (EINVAL); + } + return (0); +} + +int +hn_rndis_get_mtu(struct hn_softc *sc, uint32_t *mtu) +{ + size_t size; + int error; + + size = sizeof(*mtu); + error = hn_rndis_query(sc, OID_GEN_MAXIMUM_FRAME_SIZE, NULL, 0, + mtu, &size); + if (error) + return (error); + if (size != sizeof(uint32_t)) { + if_printf(sc->hn_ifp, "invalid mtu len %zu\n", size); + return (EINVAL); + } + return (0); +} + +static const void * +hn_rndis_xact_exec1(struct hn_softc *sc, struct vmbus_xact *xact, size_t reqlen, + struct hn_nvs_sendctx *sndc, size_t *comp_len) +{ + struct vmbus_gpa gpa[HN_XACT_REQ_PGCNT]; + int gpa_cnt, error; + bus_addr_t paddr; + + KASSERT(reqlen <= HN_XACT_REQ_SIZE && reqlen > 0, + ("invalid request length %zu", reqlen)); + + /* + * Setup the SG list. 
+ */ + paddr = vmbus_xact_req_paddr(xact); + KASSERT((paddr & PAGE_MASK) == 0, + ("vmbus xact request is not page aligned 0x%jx", (uintmax_t)paddr)); + for (gpa_cnt = 0; gpa_cnt < HN_XACT_REQ_PGCNT; ++gpa_cnt) { + int len = PAGE_SIZE; + + if (reqlen == 0) + break; + if (reqlen < len) + len = reqlen; + + gpa[gpa_cnt].gpa_page = atop(paddr) + gpa_cnt; + gpa[gpa_cnt].gpa_len = len; + gpa[gpa_cnt].gpa_ofs = 0; + + reqlen -= len; + } + KASSERT(reqlen == 0, ("still have %zu request data left", reqlen)); + + /* + * Send this RNDIS control message and wait for its completion + * message. + */ + vmbus_xact_activate(xact); + error = hn_nvs_send_rndis_ctrl(sc->hn_prichan, sndc, gpa, gpa_cnt); + if (error) { + vmbus_xact_deactivate(xact); + if_printf(sc->hn_ifp, "RNDIS ctrl send failed: %d\n", error); + return (NULL); + } + return (vmbus_chan_xact_wait(sc->hn_prichan, xact, comp_len, + HN_CAN_SLEEP(sc))); +} + +static const void * +hn_rndis_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact, uint32_t rid, + size_t reqlen, size_t *comp_len0, uint32_t comp_type) +{ + const struct rndis_comp_hdr *comp; + size_t comp_len, min_complen = *comp_len0; + + KASSERT(rid > HN_RNDIS_RID_COMPAT_MAX, ("invalid rid %u\n", rid)); + KASSERT(min_complen >= sizeof(*comp), + ("invalid minimum complete len %zu", min_complen)); + + /* + * Execute the xact setup by the caller. + */ + comp = hn_rndis_xact_exec1(sc, xact, reqlen, &hn_nvs_sendctx_none, + &comp_len); + if (comp == NULL) + return (NULL); + + /* + * Check this RNDIS complete message. 
+ */ + if (comp_len < min_complen) { + if (comp_len >= sizeof(*comp)) { + /* rm_status field is valid */ + if_printf(sc->hn_ifp, "invalid RNDIS comp len %zu, " + "status 0x%08x\n", comp_len, comp->rm_status); + } else { + if_printf(sc->hn_ifp, "invalid RNDIS comp len %zu\n", + comp_len); + } + return (NULL); + } + if (comp->rm_len < min_complen) { + if_printf(sc->hn_ifp, "invalid RNDIS comp msglen %u\n", + comp->rm_len); + return (NULL); + } + if (comp->rm_type != comp_type) { + if_printf(sc->hn_ifp, "unexpected RNDIS comp 0x%08x, " + "expect 0x%08x\n", comp->rm_type, comp_type); + return (NULL); + } + if (comp->rm_rid != rid) { + if_printf(sc->hn_ifp, "RNDIS comp rid mismatch %u, " + "expect %u\n", comp->rm_rid, rid); + return (NULL); + } + /* All pass! */ + *comp_len0 = comp_len; + return (comp); +} + +static int +hn_rndis_query(struct hn_softc *sc, uint32_t oid, + const void *idata, size_t idlen, void *odata, size_t *odlen0) +{ + + return (hn_rndis_query2(sc, oid, idata, idlen, odata, odlen0, *odlen0)); +} + +static int +hn_rndis_query2(struct hn_softc *sc, uint32_t oid, + const void *idata, size_t idlen, void *odata, size_t *odlen0, + size_t min_odlen) +{ + struct rndis_query_req *req; + const struct rndis_query_comp *comp; + struct vmbus_xact *xact; + size_t reqlen, odlen = *odlen0, comp_len; + int error, ofs; + uint32_t rid; + + reqlen = sizeof(*req) + idlen; + xact = vmbus_xact_get(sc->hn_xact, reqlen); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for RNDIS query 0x%08x\n", oid); + return (ENXIO); + } + rid = hn_rndis_rid(sc); + req = vmbus_xact_req_data(xact); + req->rm_type = REMOTE_NDIS_QUERY_MSG; + req->rm_len = reqlen; + req->rm_rid = rid; + req->rm_oid = oid; + /* + * XXX + * This is _not_ RNDIS Spec conforming: + * "This MUST be set to 0 when there is no input data + * associated with the OID." + * + * If this field was set to 0 according to the RNDIS Spec, + * Hyper-V would set non-SUCCESS status in the query + * completion. 
+ */ + req->rm_infobufoffset = RNDIS_QUERY_REQ_INFOBUFOFFSET; + + if (idlen > 0) { + req->rm_infobuflen = idlen; + /* Input data immediately follows RNDIS query. */ + memcpy(req + 1, idata, idlen); + } + + comp_len = sizeof(*comp) + min_odlen; + comp = hn_rndis_xact_execute(sc, xact, rid, reqlen, &comp_len, + REMOTE_NDIS_QUERY_CMPLT); + if (comp == NULL) { + if_printf(sc->hn_ifp, "exec RNDIS query 0x%08x failed\n", oid); + error = EIO; + goto done; + } + + if (comp->rm_status != RNDIS_STATUS_SUCCESS) { + if_printf(sc->hn_ifp, "RNDIS query 0x%08x failed: " + "status 0x%08x\n", oid, comp->rm_status); + error = EIO; + goto done; + } + if (comp->rm_infobuflen == 0 || comp->rm_infobufoffset == 0) { + /* No output data! */ + if_printf(sc->hn_ifp, "RNDIS query 0x%08x, no data\n", oid); + *odlen0 = 0; + error = 0; + goto done; + } + + /* + * Check output data length and offset. + */ + /* ofs is the offset from the beginning of comp. */ + ofs = RNDIS_QUERY_COMP_INFOBUFOFFSET_ABS(comp->rm_infobufoffset); + if (ofs < sizeof(*comp) || ofs + comp->rm_infobuflen > comp_len) { + if_printf(sc->hn_ifp, "RNDIS query invalid comp ib off/len, " + "%u/%u\n", comp->rm_infobufoffset, comp->rm_infobuflen); + error = EINVAL; + goto done; + } + + /* + * Save output data. 
+ */ + if (comp->rm_infobuflen < odlen) + odlen = comp->rm_infobuflen; + memcpy(odata, ((const uint8_t *)comp) + ofs, odlen); + *odlen0 = odlen; + + error = 0; +done: + vmbus_xact_put(xact); + return (error); +} + +int +hn_rndis_query_rsscaps(struct hn_softc *sc, int *rxr_cnt0) +{ + struct ndis_rss_caps in, caps; + size_t caps_len; + int error, indsz, rxr_cnt, hash_fnidx; + uint32_t hash_func = 0, hash_types = 0; + + *rxr_cnt0 = 0; + + if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_20) + return (EOPNOTSUPP); + + memset(&in, 0, sizeof(in)); + in.ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_CAPS; + in.ndis_hdr.ndis_rev = NDIS_RSS_CAPS_REV_2; + in.ndis_hdr.ndis_size = NDIS_RSS_CAPS_SIZE; + + caps_len = NDIS_RSS_CAPS_SIZE; + error = hn_rndis_query2(sc, OID_GEN_RECEIVE_SCALE_CAPABILITIES, + &in, NDIS_RSS_CAPS_SIZE, &caps, &caps_len, NDIS_RSS_CAPS_SIZE_6_0); + if (error) + return (error); + + /* + * Preliminary verification. + */ + if (caps.ndis_hdr.ndis_type != NDIS_OBJTYPE_RSS_CAPS) { + if_printf(sc->hn_ifp, "invalid NDIS objtype 0x%02x\n", + caps.ndis_hdr.ndis_type); + return (EINVAL); + } + if (caps.ndis_hdr.ndis_rev < NDIS_RSS_CAPS_REV_1) { + if_printf(sc->hn_ifp, "invalid NDIS objrev 0x%02x\n", + caps.ndis_hdr.ndis_rev); + return (EINVAL); + } + if (caps.ndis_hdr.ndis_size > caps_len) { + if_printf(sc->hn_ifp, "invalid NDIS objsize %u, " + "data size %zu\n", caps.ndis_hdr.ndis_size, caps_len); + return (EINVAL); + } else if (caps.ndis_hdr.ndis_size < NDIS_RSS_CAPS_SIZE_6_0) { + if_printf(sc->hn_ifp, "invalid NDIS objsize %u\n", + caps.ndis_hdr.ndis_size); + return (EINVAL); + } + + /* + * Save information for later RSS configuration. 
+ */ + if (caps.ndis_nrxr == 0) { + if_printf(sc->hn_ifp, "0 RX rings!?\n"); + return (EINVAL); + } + if (bootverbose) + if_printf(sc->hn_ifp, "%u RX rings\n", caps.ndis_nrxr); + rxr_cnt = caps.ndis_nrxr; + + if (caps.ndis_hdr.ndis_size == NDIS_RSS_CAPS_SIZE && + caps.ndis_hdr.ndis_rev >= NDIS_RSS_CAPS_REV_2) { + if (caps.ndis_nind > NDIS_HASH_INDCNT) { + if_printf(sc->hn_ifp, + "too many RSS indirect table entries %u\n", + caps.ndis_nind); + return (EOPNOTSUPP); + } + if (!powerof2(caps.ndis_nind)) { + if_printf(sc->hn_ifp, "RSS indirect table size is not " + "power-of-2 %u\n", caps.ndis_nind); + } + + if (bootverbose) { + if_printf(sc->hn_ifp, "RSS indirect table size %u\n", + caps.ndis_nind); + } + indsz = caps.ndis_nind; + } else { + indsz = NDIS_HASH_INDCNT; + } + if (indsz < rxr_cnt) { + if_printf(sc->hn_ifp, "# of RX rings (%d) > " + "RSS indirect table size %d\n", rxr_cnt, indsz); + rxr_cnt = indsz; + } + + /* + * NOTE: + * Toeplitz is at the lowest bit, and it is preferred; so ffs(), + * instead of fls(), is used here. + */ + hash_fnidx = ffs(caps.ndis_caps & NDIS_RSS_CAP_HASHFUNC_MASK); + if (hash_fnidx == 0) { + if_printf(sc->hn_ifp, "no hash functions, caps 0x%08x\n", + caps.ndis_caps); + return (EOPNOTSUPP); + } + hash_func = 1 << (hash_fnidx - 1); /* ffs is 1-based */ + + if (caps.ndis_caps & NDIS_RSS_CAP_IPV4) + hash_types |= NDIS_HASH_IPV4 | NDIS_HASH_TCP_IPV4; + if (caps.ndis_caps & NDIS_RSS_CAP_IPV6) + hash_types |= NDIS_HASH_IPV6 | NDIS_HASH_TCP_IPV6; + if (caps.ndis_caps & NDIS_RSS_CAP_IPV6_EX) + hash_types |= NDIS_HASH_IPV6_EX | NDIS_HASH_TCP_IPV6_EX; + if (hash_types == 0) { + if_printf(sc->hn_ifp, "no hash types, caps 0x%08x\n", + caps.ndis_caps); + return (EOPNOTSUPP); + } + if (bootverbose) + if_printf(sc->hn_ifp, "RSS caps %#x\n", caps.ndis_caps); + + /* Commit! */ + sc->hn_rss_ind_size = indsz; + sc->hn_rss_hcap = hash_func | hash_types; + if (sc->hn_caps & HN_CAP_UDPHASH) { + /* UDP 4-tuple hash is unconditionally enabled. 
*/ + sc->hn_rss_hcap |= NDIS_HASH_UDP_IPV4_X; + } + *rxr_cnt0 = rxr_cnt; + return (0); +} + +static int +hn_rndis_set(struct hn_softc *sc, uint32_t oid, const void *data, size_t dlen) +{ + struct rndis_set_req *req; + const struct rndis_set_comp *comp; + struct vmbus_xact *xact; + size_t reqlen, comp_len; + uint32_t rid; + int error; + + KASSERT(dlen > 0, ("invalid dlen %zu", dlen)); + + reqlen = sizeof(*req) + dlen; + xact = vmbus_xact_get(sc->hn_xact, reqlen); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for RNDIS set 0x%08x\n", oid); + return (ENXIO); + } + rid = hn_rndis_rid(sc); + req = vmbus_xact_req_data(xact); + req->rm_type = REMOTE_NDIS_SET_MSG; + req->rm_len = reqlen; + req->rm_rid = rid; + req->rm_oid = oid; + req->rm_infobuflen = dlen; + req->rm_infobufoffset = RNDIS_SET_REQ_INFOBUFOFFSET; + /* Data immediately follows RNDIS set. */ + memcpy(req + 1, data, dlen); + + comp_len = sizeof(*comp); + comp = hn_rndis_xact_execute(sc, xact, rid, reqlen, &comp_len, + REMOTE_NDIS_SET_CMPLT); + if (comp == NULL) { + if_printf(sc->hn_ifp, "exec RNDIS set 0x%08x failed\n", oid); + error = EIO; + goto done; + } + + if (comp->rm_status != RNDIS_STATUS_SUCCESS) { + if_printf(sc->hn_ifp, "RNDIS set 0x%08x failed: " + "status 0x%08x\n", oid, comp->rm_status); + error = EIO; + goto done; + } + error = 0; +done: + vmbus_xact_put(xact); + return (error); +} + +static int +hn_rndis_conf_offload(struct hn_softc *sc, int mtu) +{ + struct ndis_offload hwcaps; + struct ndis_offload_params params; + uint32_t caps = 0; + size_t paramsz; + int error, tso_maxsz, tso_minsg; + + error = hn_rndis_query_hwcaps(sc, &hwcaps); + if (error) { + if_printf(sc->hn_ifp, "hwcaps query failed: %d\n", error); + return (error); + } + + /* NOTE: 0 means "no change" */ + memset(¶ms, 0, sizeof(params)); + + params.ndis_hdr.ndis_type = NDIS_OBJTYPE_DEFAULT; + if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_30) { + params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_2; + paramsz = 
NDIS_OFFLOAD_PARAMS_SIZE_6_1; + } else { + params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_3; + paramsz = NDIS_OFFLOAD_PARAMS_SIZE; + } + params.ndis_hdr.ndis_size = paramsz; + + /* + * TSO4/TSO6 setup. + */ + tso_maxsz = IP_MAXPACKET; + tso_minsg = 2; + if (hwcaps.ndis_lsov2.ndis_ip4_encap & NDIS_OFFLOAD_ENCAP_8023) { + caps |= HN_CAP_TSO4; + params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_ON; + + if (hwcaps.ndis_lsov2.ndis_ip4_maxsz < tso_maxsz) + tso_maxsz = hwcaps.ndis_lsov2.ndis_ip4_maxsz; + if (hwcaps.ndis_lsov2.ndis_ip4_minsg > tso_minsg) + tso_minsg = hwcaps.ndis_lsov2.ndis_ip4_minsg; + } + if ((hwcaps.ndis_lsov2.ndis_ip6_encap & NDIS_OFFLOAD_ENCAP_8023) && + (hwcaps.ndis_lsov2.ndis_ip6_opts & HN_NDIS_LSOV2_CAP_IP6) == + HN_NDIS_LSOV2_CAP_IP6) { + caps |= HN_CAP_TSO6; + params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_ON; + + if (hwcaps.ndis_lsov2.ndis_ip6_maxsz < tso_maxsz) + tso_maxsz = hwcaps.ndis_lsov2.ndis_ip6_maxsz; + if (hwcaps.ndis_lsov2.ndis_ip6_minsg > tso_minsg) + tso_minsg = hwcaps.ndis_lsov2.ndis_ip6_minsg; + } + sc->hn_ndis_tso_szmax = 0; + sc->hn_ndis_tso_sgmin = 0; + if (caps & (HN_CAP_TSO4 | HN_CAP_TSO6)) { + KASSERT(tso_maxsz <= IP_MAXPACKET, + ("invalid NDIS TSO maxsz %d", tso_maxsz)); + KASSERT(tso_minsg >= 2, + ("invalid NDIS TSO minsg %d", tso_minsg)); + if (tso_maxsz < tso_minsg * mtu) { + if_printf(sc->hn_ifp, "invalid NDIS TSO config: " + "maxsz %d, minsg %d, mtu %d; " + "disable TSO4 and TSO6\n", + tso_maxsz, tso_minsg, mtu); + caps &= ~(HN_CAP_TSO4 | HN_CAP_TSO6); + params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_OFF; + params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_OFF; + } else { + sc->hn_ndis_tso_szmax = tso_maxsz; + sc->hn_ndis_tso_sgmin = tso_minsg; + if (bootverbose) { + if_printf(sc->hn_ifp, "NDIS TSO " + "szmax %d sgmin %d\n", + sc->hn_ndis_tso_szmax, + sc->hn_ndis_tso_sgmin); + } + } + } + + /* IPv4 checksum */ + if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HN_NDIS_TXCSUM_CAP_IP4) == + HN_NDIS_TXCSUM_CAP_IP4) { + caps |= HN_CAP_IPCS; + 
params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TX; + } + if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_IP4) { + if (params.ndis_ip4csum == NDIS_OFFLOAD_PARAM_TX) + params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TXRX; + else + params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_RX; + } + + /* TCP4 checksum */ + if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HN_NDIS_TXCSUM_CAP_TCP4) == + HN_NDIS_TXCSUM_CAP_TCP4) { + caps |= HN_CAP_TCP4CS; + params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TX; + } + if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_TCP4) { + if (params.ndis_tcp4csum == NDIS_OFFLOAD_PARAM_TX) + params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TXRX; + else + params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_RX; + } + + /* UDP4 checksum */ + if (hwcaps.ndis_csum.ndis_ip4_txcsum & NDIS_TXCSUM_CAP_UDP4) { + caps |= HN_CAP_UDP4CS; + params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TX; + } + if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_UDP4) { + if (params.ndis_udp4csum == NDIS_OFFLOAD_PARAM_TX) + params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TXRX; + else + params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_RX; + } + + /* TCP6 checksum */ + if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HN_NDIS_TXCSUM_CAP_TCP6) == + HN_NDIS_TXCSUM_CAP_TCP6) { + caps |= HN_CAP_TCP6CS; + params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TX; + } + if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_TCP6) { + if (params.ndis_tcp6csum == NDIS_OFFLOAD_PARAM_TX) + params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TXRX; + else + params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_RX; + } + + /* UDP6 checksum */ + if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HN_NDIS_TXCSUM_CAP_UDP6) == + HN_NDIS_TXCSUM_CAP_UDP6) { + caps |= HN_CAP_UDP6CS; + params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TX; + } + if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_UDP6) { + if (params.ndis_udp6csum == NDIS_OFFLOAD_PARAM_TX) + params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TXRX; + else + params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_RX; + } + + /* RSC offload */ + if (hwcaps.ndis_hdr.ndis_rev 
>= NDIS_OFFLOAD_PARAMS_REV_3) { + if (hwcaps.ndis_rsc.ndis_ip4 && hwcaps.ndis_rsc.ndis_ip6) { + params.ndis_rsc_ip4 = NDIS_OFFLOAD_RSC_ON; + params.ndis_rsc_ip6 = NDIS_OFFLOAD_RSC_ON; + } else { + params.ndis_rsc_ip4 = NDIS_OFFLOAD_RSC_OFF; + params.ndis_rsc_ip6 = NDIS_OFFLOAD_RSC_OFF; + } + } + + if (bootverbose) { + if_printf(sc->hn_ifp, "offload csum: " + "ip4 %u, tcp4 %u, udp4 %u, tcp6 %u, udp6 %u\n", + params.ndis_ip4csum, + params.ndis_tcp4csum, + params.ndis_udp4csum, + params.ndis_tcp6csum, + params.ndis_udp6csum); + if_printf(sc->hn_ifp, "offload lsov2: ip4 %u, ip6 %u\n", + params.ndis_lsov2_ip4, + params.ndis_lsov2_ip6); + if (hwcaps.ndis_hdr.ndis_rev >= NDIS_OFFLOAD_PARAMS_REV_3) + if_printf(sc->hn_ifp, "offload rsc: ip4 %u, ip6 %u\n", + params.ndis_rsc_ip4, + params.ndis_rsc_ip6); + } + + error = hn_rndis_set(sc, OID_TCP_OFFLOAD_PARAMETERS, ¶ms, paramsz); + if (error) { + if_printf(sc->hn_ifp, "offload config failed: %d\n", error); + return (error); + } + + if (bootverbose) + if_printf(sc->hn_ifp, "offload config done\n"); + sc->hn_caps |= caps; + return (0); +} + +int +hn_rndis_conf_rss(struct hn_softc *sc, uint16_t flags) +{ + struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; + struct ndis_rss_params *prm = &rss->rss_params; + int error, rss_size; + + /* + * Only NDIS 6.20+ is supported: + * We only support 4bytes element in indirect table, which has been + * adopted since NDIS 6.20. + */ + KASSERT(sc->hn_ndis_ver >= HN_NDIS_VERSION_6_20, + ("NDIS 6.20+ is required, NDIS version 0x%08x", sc->hn_ndis_ver)); + + /* XXX only one can be specified through, popcnt? 
*/ + KASSERT((sc->hn_rss_hash & NDIS_HASH_FUNCTION_MASK), + ("no hash func %08x", sc->hn_rss_hash)); + KASSERT((sc->hn_rss_hash & NDIS_HASH_STD), + ("no standard hash types %08x", sc->hn_rss_hash)); + KASSERT(sc->hn_rss_ind_size > 0, ("no indirect table size")); + + if (bootverbose) { + if_printf(sc->hn_ifp, "RSS indirect table size %d, " + "hash 0x%08x\n", sc->hn_rss_ind_size, sc->hn_rss_hash); + } + + /* + * NOTE: + * DO NOT whack rss_key and rss_ind, which are setup by the caller. + */ + memset(prm, 0, sizeof(*prm)); + rss_size = NDIS_RSSPRM_TOEPLITZ_SIZE(sc->hn_rss_ind_size); + + prm->ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_PARAMS; + prm->ndis_hdr.ndis_rev = NDIS_RSS_PARAMS_REV_2; + prm->ndis_hdr.ndis_size = rss_size; + prm->ndis_flags = flags; + prm->ndis_hash = sc->hn_rss_hash & + (NDIS_HASH_FUNCTION_MASK | NDIS_HASH_STD); + prm->ndis_indsize = sizeof(rss->rss_ind[0]) * sc->hn_rss_ind_size; + prm->ndis_indoffset = + __offsetof(struct ndis_rssprm_toeplitz, rss_ind[0]); + prm->ndis_keysize = sizeof(rss->rss_key); + prm->ndis_keyoffset = + __offsetof(struct ndis_rssprm_toeplitz, rss_key[0]); + + error = hn_rndis_set(sc, OID_GEN_RECEIVE_SCALE_PARAMETERS, + rss, rss_size); + if (error) { + if_printf(sc->hn_ifp, "RSS config failed: %d\n", error); + } else { + if (bootverbose) + if_printf(sc->hn_ifp, "RSS config done\n"); + } + return (error); +} + +int +hn_rndis_set_rxfilter(struct hn_softc *sc, uint32_t filter) +{ + int error; + + error = hn_rndis_set(sc, OID_GEN_CURRENT_PACKET_FILTER, + &filter, sizeof(filter)); + if (error) { + if_printf(sc->hn_ifp, "set RX filter 0x%08x failed: %d\n", + filter, error); + } else { + if (bootverbose) { + if_printf(sc->hn_ifp, "set RX filter 0x%08x done\n", + filter); + } + } + return (error); +} + +static int +hn_rndis_init(struct hn_softc *sc) +{ + struct rndis_init_req *req; + const struct rndis_init_comp *comp; + struct vmbus_xact *xact; + size_t comp_len; + uint32_t rid; + int error; + + xact = vmbus_xact_get(sc->hn_xact, 
sizeof(*req)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for RNDIS init\n"); + return (ENXIO); + } + rid = hn_rndis_rid(sc); + req = vmbus_xact_req_data(xact); + req->rm_type = REMOTE_NDIS_INITIALIZE_MSG; + req->rm_len = sizeof(*req); + req->rm_rid = rid; + req->rm_ver_major = RNDIS_VERSION_MAJOR; + req->rm_ver_minor = RNDIS_VERSION_MINOR; + req->rm_max_xfersz = HN_RNDIS_XFER_SIZE; + + comp_len = RNDIS_INIT_COMP_SIZE_MIN; + comp = hn_rndis_xact_execute(sc, xact, rid, sizeof(*req), &comp_len, + REMOTE_NDIS_INITIALIZE_CMPLT); + if (comp == NULL) { + if_printf(sc->hn_ifp, "exec RNDIS init failed\n"); + error = EIO; + goto done; + } + + if (comp->rm_status != RNDIS_STATUS_SUCCESS) { + if_printf(sc->hn_ifp, "RNDIS init failed: status 0x%08x\n", + comp->rm_status); + error = EIO; + goto done; + } + sc->hn_rndis_agg_size = comp->rm_pktmaxsz; + sc->hn_rndis_agg_pkts = comp->rm_pktmaxcnt; + sc->hn_rndis_agg_align = 1U << comp->rm_align; + + if (sc->hn_rndis_agg_align < sizeof(uint32_t)) { + /* + * The RNDIS packet message encap assumes that the RNDIS + * packet message is at least 4 bytes aligned. Fix up the + * alignment here, if the remote side sets the alignment + * too low.
+ */ + if_printf(sc->hn_ifp, "fixup RNDIS aggpkt align: %u -> %zu\n", + sc->hn_rndis_agg_align, sizeof(uint32_t)); + sc->hn_rndis_agg_align = sizeof(uint32_t); + } + + if (bootverbose) { + if_printf(sc->hn_ifp, "RNDIS ver %u.%u, " + "aggpkt size %u, aggpkt cnt %u, aggpkt align %u\n", + comp->rm_ver_major, comp->rm_ver_minor, + sc->hn_rndis_agg_size, sc->hn_rndis_agg_pkts, + sc->hn_rndis_agg_align); + } + error = 0; +done: + vmbus_xact_put(xact); + return (error); +} + +static int +hn_rndis_halt(struct hn_softc *sc) +{ + struct vmbus_xact *xact; + struct rndis_halt_req *halt; + struct hn_nvs_sendctx sndc; + size_t comp_len; + + xact = vmbus_xact_get(sc->hn_xact, sizeof(*halt)); + if (xact == NULL) { + if_printf(sc->hn_ifp, "no xact for RNDIS halt\n"); + return (ENXIO); + } + halt = vmbus_xact_req_data(xact); + halt->rm_type = REMOTE_NDIS_HALT_MSG; + halt->rm_len = sizeof(*halt); + halt->rm_rid = hn_rndis_rid(sc); + + /* No RNDIS completion; rely on NVS message send completion */ + hn_nvs_sendctx_init(&sndc, hn_nvs_sent_xact, xact); + hn_rndis_xact_exec1(sc, xact, sizeof(*halt), &sndc, &comp_len); + + vmbus_xact_put(xact); + if (bootverbose) + if_printf(sc->hn_ifp, "RNDIS halt done\n"); + return (0); +} + +static int +hn_rndis_query_hwcaps(struct hn_softc *sc, struct ndis_offload *caps) +{ + struct ndis_offload in; + size_t caps_len, size; + int error; + + memset(&in, 0, sizeof(in)); + in.ndis_hdr.ndis_type = NDIS_OBJTYPE_OFFLOAD; + if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_30) { + in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_3; + size = NDIS_OFFLOAD_SIZE; + } else if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_1) { + in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_2; + size = NDIS_OFFLOAD_SIZE_6_1; + } else { + in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_1; + size = NDIS_OFFLOAD_SIZE_6_0; + } + in.ndis_hdr.ndis_size = size; + + caps_len = NDIS_OFFLOAD_SIZE; + error = hn_rndis_query2(sc, OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES, + &in, size, caps, &caps_len, NDIS_OFFLOAD_SIZE_6_0); + if 
(error) + return (error); + + /* + * Preliminary verification. + */ + if (caps->ndis_hdr.ndis_type != NDIS_OBJTYPE_OFFLOAD) { + if_printf(sc->hn_ifp, "invalid NDIS objtype 0x%02x\n", + caps->ndis_hdr.ndis_type); + return (EINVAL); + } + if (caps->ndis_hdr.ndis_rev < NDIS_OFFLOAD_REV_1) { + if_printf(sc->hn_ifp, "invalid NDIS objrev 0x%02x\n", + caps->ndis_hdr.ndis_rev); + return (EINVAL); + } + if (caps->ndis_hdr.ndis_size > caps_len) { + if_printf(sc->hn_ifp, "invalid NDIS objsize %u, " + "data size %zu\n", caps->ndis_hdr.ndis_size, caps_len); + return (EINVAL); + } else if (caps->ndis_hdr.ndis_size < NDIS_OFFLOAD_SIZE_6_0) { + if_printf(sc->hn_ifp, "invalid NDIS objsize %u\n", + caps->ndis_hdr.ndis_size); + return (EINVAL); + } else if (caps->ndis_hdr.ndis_rev >= NDIS_OFFLOAD_REV_3 && + caps->ndis_hdr.ndis_size < NDIS_OFFLOAD_SIZE) { + if_printf(sc->hn_ifp, "invalid NDIS rev3 objsize %u\n", + caps->ndis_hdr.ndis_size); + return (EINVAL); + } + + if (bootverbose) { + /* + * NOTE: + * caps->ndis_hdr.ndis_size MUST be checked before accessing + * NDIS 6.1+ specific fields. 
+ */ + if_printf(sc->hn_ifp, "hwcaps rev %u\n", + caps->ndis_hdr.ndis_rev); + + if_printf(sc->hn_ifp, "hwcaps csum: " + "ip4 tx 0x%x/0x%x rx 0x%x/0x%x, " + "ip6 tx 0x%x/0x%x rx 0x%x/0x%x\n", + caps->ndis_csum.ndis_ip4_txcsum, + caps->ndis_csum.ndis_ip4_txenc, + caps->ndis_csum.ndis_ip4_rxcsum, + caps->ndis_csum.ndis_ip4_rxenc, + caps->ndis_csum.ndis_ip6_txcsum, + caps->ndis_csum.ndis_ip6_txenc, + caps->ndis_csum.ndis_ip6_rxcsum, + caps->ndis_csum.ndis_ip6_rxenc); + if_printf(sc->hn_ifp, "hwcaps lsov2: " + "ip4 maxsz %u minsg %u encap 0x%x, " + "ip6 maxsz %u minsg %u encap 0x%x opts 0x%x\n", + caps->ndis_lsov2.ndis_ip4_maxsz, + caps->ndis_lsov2.ndis_ip4_minsg, + caps->ndis_lsov2.ndis_ip4_encap, + caps->ndis_lsov2.ndis_ip6_maxsz, + caps->ndis_lsov2.ndis_ip6_minsg, + caps->ndis_lsov2.ndis_ip6_encap, + caps->ndis_lsov2.ndis_ip6_opts); + if (caps->ndis_hdr.ndis_rev >= NDIS_OFFLOAD_REV_3) + if_printf(sc->hn_ifp, "hwcaps rsc: " + "ip4 %u ip6 %u\n", + caps->ndis_rsc.ndis_ip4, + caps->ndis_rsc.ndis_ip6); + } + return (0); +} + +int +hn_rndis_attach(struct hn_softc *sc, int mtu, int *init_done) +{ + int error; + + *init_done = 0; + + /* + * Initialize RNDIS. + */ + error = hn_rndis_init(sc); + if (error) + return (error); + *init_done = 1; + + /* + * Configure NDIS offload settings. + */ + hn_rndis_conf_offload(sc, mtu); + return (0); +} + +void +hn_rndis_detach(struct hn_softc *sc) +{ + + /* Halt the RNDIS. */ + hn_rndis_halt(sc); +} diff --git a/sys/dev/hyperv/netvsc/hn_rndis.h b/sys/dev/hyperv/netvsc/hn_rndis.h new file mode 100644 index 000000000000..4610d5a10526 --- /dev/null +++ b/sys/dev/hyperv/netvsc/hn_rndis.h @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2012 NetApp Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HN_RNDIS_H_ +#define _HN_RNDIS_H_ + +struct hn_softc; + +int hn_rndis_attach(struct hn_softc *sc, int mtu, int *init_done); +void hn_rndis_detach(struct hn_softc *sc); +int hn_rndis_conf_rss(struct hn_softc *sc, uint16_t flags); +int hn_rndis_query_rsscaps(struct hn_softc *sc, int *rxr_cnt); +int hn_rndis_get_eaddr(struct hn_softc *sc, uint8_t *eaddr); +/* link_status: NDIS_MEDIA_STATE_ */ +int hn_rndis_get_linkstatus(struct hn_softc *sc, + uint32_t *link_status); +int hn_rndis_get_mtu(struct hn_softc *sc, uint32_t *mtu); +/* filter: NDIS_PACKET_TYPE_. 
*/ +int hn_rndis_set_rxfilter(struct hn_softc *sc, uint32_t filter); +void hn_rndis_rx_ctrl(struct hn_softc *sc, const void *data, + int dlen); + +#endif /* !_HN_RNDIS_H_ */ diff --git a/sys/dev/hyperv/netvsc/if_hn.c b/sys/dev/hyperv/netvsc/if_hn.c new file mode 100644 index 000000000000..d562a937ecad --- /dev/null +++ b/sys/dev/hyperv/netvsc/if_hn.c @@ -0,0 +1,7717 @@ +/*- + * Copyright (c) 2010-2012 Citrix Inc. + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 2004-2006 Kip Macy + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_hn.h" +#include "opt_inet6.h" +#include "opt_inet.h" +#include "opt_rss.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/counter.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/queue.h> +#include <sys/lock.h> +#include <sys/proc.h> +#include <sys/rmlock.h> +#include <sys/sbuf.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/sx.h> +#include <sys/sysctl.h> +#include <sys/taskqueue.h> +#include <sys/buf_ring.h> +#include <sys/eventhandler.h> +#include <sys/epoch.h> + +#include <machine/atomic.h> +#include <machine/in_cksum.h> + +#include <net/bpf.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_dl.h> +#include <net/if_media.h> +#include <net/if_types.h> +#include <net/if_var.h> +#include <net/rndis.h> +#ifdef RSS +#include <net/rss_config.h> +#endif + +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/tcp.h> +#include <netinet/tcp_lro.h> +#include <netinet/udp.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/hyperv_busdma.h> +#include <dev/hyperv/include/vmbus.h> +#include <dev/hyperv/include/vmbus_xact.h> + +#include <dev/hyperv/netvsc/ndis.h> +#include <dev/hyperv/netvsc/if_hnreg.h> +#include <dev/hyperv/netvsc/if_hnvar.h> +#include <dev/hyperv/netvsc/hn_nvs.h> +#include <dev/hyperv/netvsc/hn_rndis.h> + +#include "vmbus_if.h" + +#define HN_IFSTART_SUPPORT + +#define HN_RING_CNT_DEF_MAX 8 + +#define HN_VFMAP_SIZE_DEF 8 + +#define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ + +/* YYY should get it from the underlying channel */ +#define HN_TX_DESC_CNT 512 + +#define HN_RNDIS_PKT_LEN \ + (sizeof(struct rndis_packet_msg) + \ + HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ + 
HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ + HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ + HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) +#define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE +#define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE + +#define HN_TX_DATA_BOUNDARY PAGE_SIZE +#define HN_TX_DATA_MAXSIZE IP_MAXPACKET +#define HN_TX_DATA_SEGSIZE PAGE_SIZE +/* -1 for RNDIS packet message */ +#define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) + +#define HN_DIRECT_TX_SIZE_DEF 128 + +#define HN_EARLY_TXEOF_THRESH 8 + +#define HN_PKTBUF_LEN_DEF (16 * 1024) + +#define HN_LROENT_CNT_DEF 128 + +#define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) +#define HN_LRO_LENLIM_DEF (25 * ETHERMTU) +/* YYY 2*MTU is a bit rough, but should be good enough. */ +#define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) + +#define HN_LRO_ACKCNT_DEF 1 + +#define HN_LOCK_INIT(sc) \ + sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) +#define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) +#define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) +#define HN_LOCK(sc) \ +do { \ + while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ + /* Relinquish cpu to avoid deadlock */ \ + sched_relinquish(curthread); \ + DELAY(1000); \ + } \ +} while (0) +#define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) + +#define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) +#define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) +#define HN_CSUM_IP_HWASSIST(sc) \ + ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) +#define HN_CSUM_IP6_HWASSIST(sc) \ + ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) + +#define HN_PKTSIZE_MIN(align) \ + roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ + HN_RNDIS_PKT_LEN, (align)) +#define HN_PKTSIZE(m, align) \ + roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) + +#ifdef RSS +#define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) +#else +#define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) +#endif + +struct hn_txdesc { 
+#ifndef HN_USE_TXDESC_BUFRING + SLIST_ENTRY(hn_txdesc) link; +#endif + STAILQ_ENTRY(hn_txdesc) agg_link; + + /* Aggregated txdescs, in sending order. */ + STAILQ_HEAD(, hn_txdesc) agg_list; + + /* The oldest packet, if transmission aggregation happens. */ + struct mbuf *m; + struct hn_tx_ring *txr; + int refs; + uint32_t flags; /* HN_TXD_FLAG_ */ + struct hn_nvs_sendctx send_ctx; + uint32_t chim_index; + int chim_size; + + bus_dmamap_t data_dmap; + + bus_addr_t rndis_pkt_paddr; + struct rndis_packet_msg *rndis_pkt; + bus_dmamap_t rndis_pkt_dmap; +}; + +#define HN_TXD_FLAG_ONLIST 0x0001 +#define HN_TXD_FLAG_DMAMAP 0x0002 +#define HN_TXD_FLAG_ONAGG 0x0004 + +#define HN_NDIS_PKTINFO_SUBALLOC 0x01 +#define HN_NDIS_PKTINFO_1ST_FRAG 0x02 +#define HN_NDIS_PKTINFO_LAST_FRAG 0x04 + +struct packet_info_id { + uint8_t ver; + uint8_t flag; + uint16_t pkt_id; +}; + +#define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) + + +struct hn_rxinfo { + const uint32_t *vlan_info; + const uint32_t *csum_info; + const uint32_t *hash_info; + const uint32_t *hash_value; + const struct packet_info_id *pktinfo_id; +}; + +struct hn_rxvf_setarg { + struct hn_rx_ring *rxr; + struct ifnet *vf_ifp; +}; + +#define HN_RXINFO_VLAN 0x0001 +#define HN_RXINFO_CSUM 0x0002 +#define HN_RXINFO_HASHINF 0x0004 +#define HN_RXINFO_HASHVAL 0x0008 +#define HN_RXINFO_PKTINFO_ID 0x0010 +#define HN_RXINFO_ALL \ + (HN_RXINFO_VLAN | \ + HN_RXINFO_CSUM | \ + HN_RXINFO_HASHINF | \ + HN_RXINFO_HASHVAL | \ + HN_RXINFO_PKTINFO_ID) + +static int hn_probe(device_t); +static int hn_attach(device_t); +static int hn_detach(device_t); +static int hn_shutdown(device_t); +static void hn_chan_callback(struct vmbus_channel *, + void *); + +static void hn_init(void *); +static int hn_ioctl(struct ifnet *, u_long, caddr_t); +#ifdef HN_IFSTART_SUPPORT +static void hn_start(struct ifnet *); +#endif +static int hn_transmit(struct ifnet *, struct mbuf *); +static void hn_xmit_qflush(struct ifnet *); +static int hn_ifmedia_upd(struct 
ifnet *); +static void hn_ifmedia_sts(struct ifnet *, + struct ifmediareq *); + +static void hn_ifnet_event(void *, struct ifnet *, int); +static void hn_ifaddr_event(void *, struct ifnet *); +static void hn_ifnet_attevent(void *, struct ifnet *); +static void hn_ifnet_detevent(void *, struct ifnet *); +static void hn_ifnet_lnkevent(void *, struct ifnet *, int); + +static bool hn_ismyvf(const struct hn_softc *, + const struct ifnet *); +static void hn_rxvf_change(struct hn_softc *, + struct ifnet *, bool); +static void hn_rxvf_set(struct hn_softc *, struct ifnet *); +static void hn_rxvf_set_task(void *, int); +static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *); +static int hn_xpnt_vf_iocsetflags(struct hn_softc *); +static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, + struct ifreq *); +static void hn_xpnt_vf_saveifflags(struct hn_softc *); +static bool hn_xpnt_vf_isready(struct hn_softc *); +static void hn_xpnt_vf_setready(struct hn_softc *); +static void hn_xpnt_vf_init_taskfunc(void *, int); +static void hn_xpnt_vf_init(struct hn_softc *); +static void hn_xpnt_vf_setenable(struct hn_softc *); +static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); +static void hn_vf_rss_fixup(struct hn_softc *, bool); +static void hn_vf_rss_restore(struct hn_softc *); + +static int hn_rndis_rxinfo(const void *, int, + struct hn_rxinfo *); +static void hn_rndis_rx_data(struct hn_rx_ring *, + const void *, int); +static void hn_rndis_rx_status(struct hn_softc *, + const void *, int); +static void hn_rndis_init_fixat(struct hn_softc *, int); + +static void hn_nvs_handle_notify(struct hn_softc *, + const struct vmbus_chanpkt_hdr *); +static void hn_nvs_handle_comp(struct hn_softc *, + struct vmbus_channel *, + const struct vmbus_chanpkt_hdr *); +static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, + struct vmbus_channel *, + const struct vmbus_chanpkt_hdr *); +static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, + struct vmbus_channel *, uint64_t); + +#if 
__FreeBSD_version >= 1100099 +static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); +#endif +static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); +#if __FreeBSD_version < 1100095 +static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); +#else +static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); +#endif +static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); +#ifndef RSS +static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); +#endif +static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); + +static void hn_stop(struct hn_softc *, bool); +static void hn_init_locked(struct hn_softc *); +static int hn_chan_attach(struct hn_softc *, + struct vmbus_channel *); +static void hn_chan_detach(struct hn_softc *, + struct vmbus_channel *); +static int hn_attach_subchans(struct hn_softc *); +static void 
hn_detach_allchans(struct hn_softc *); +static void hn_chan_rollup(struct hn_rx_ring *, + struct hn_tx_ring *); +static void hn_set_ring_inuse(struct hn_softc *, int); +static int hn_synth_attach(struct hn_softc *, int); +static void hn_synth_detach(struct hn_softc *); +static int hn_synth_alloc_subchans(struct hn_softc *, + int *); +static bool hn_synth_attachable(const struct hn_softc *); +static void hn_suspend(struct hn_softc *); +static void hn_suspend_data(struct hn_softc *); +static void hn_suspend_mgmt(struct hn_softc *); +static void hn_resume(struct hn_softc *); +static void hn_resume_data(struct hn_softc *); +static void hn_resume_mgmt(struct hn_softc *); +static void hn_suspend_mgmt_taskfunc(void *, int); +static void hn_chan_drain(struct hn_softc *, + struct vmbus_channel *); +static void hn_disable_rx(struct hn_softc *); +static void hn_drain_rxtx(struct hn_softc *, int); +static void hn_polling(struct hn_softc *, u_int); +static void hn_chan_polling(struct vmbus_channel *, u_int); +static void hn_mtu_change_fixup(struct hn_softc *); + +static void hn_update_link_status(struct hn_softc *); +static void hn_change_network(struct hn_softc *); +static void hn_link_taskfunc(void *, int); +static void hn_netchg_init_taskfunc(void *, int); +static void hn_netchg_status_taskfunc(void *, int); +static void hn_link_status(struct hn_softc *); + +static int hn_create_rx_data(struct hn_softc *, int); +static void hn_destroy_rx_data(struct hn_softc *); +static int hn_check_iplen(const struct mbuf *, int); +static void hn_rxpkt_proto(const struct mbuf *, int *, int *); +static int hn_set_rxfilter(struct hn_softc *, uint32_t); +static int hn_rxfilter_config(struct hn_softc *); +static int hn_rss_reconfig(struct hn_softc *); +static void hn_rss_ind_fixup(struct hn_softc *); +static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); +static int hn_rxpkt(struct hn_rx_ring *); +static uint32_t hn_rss_type_fromndis(uint32_t); +static uint32_t 
hn_rss_type_tondis(uint32_t); + +static int hn_tx_ring_create(struct hn_softc *, int); +static void hn_tx_ring_destroy(struct hn_tx_ring *); +static int hn_create_tx_data(struct hn_softc *, int); +static void hn_fixup_tx_data(struct hn_softc *); +static void hn_fixup_rx_data(struct hn_softc *); +static void hn_destroy_tx_data(struct hn_softc *); +static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); +static void hn_txdesc_gc(struct hn_tx_ring *, + struct hn_txdesc *); +static int hn_encap(struct ifnet *, struct hn_tx_ring *, + struct hn_txdesc *, struct mbuf **); +static int hn_txpkt(struct ifnet *, struct hn_tx_ring *, + struct hn_txdesc *); +static void hn_set_chim_size(struct hn_softc *, int); +static void hn_set_tso_maxsize(struct hn_softc *, int, int); +static bool hn_tx_ring_pending(struct hn_tx_ring *); +static void hn_tx_ring_qflush(struct hn_tx_ring *); +static void hn_resume_tx(struct hn_softc *, int); +static void hn_set_txagg(struct hn_softc *); +static void *hn_try_txagg(struct ifnet *, + struct hn_tx_ring *, struct hn_txdesc *, + int); +static int hn_get_txswq_depth(const struct hn_tx_ring *); +static void hn_txpkt_done(struct hn_nvs_sendctx *, + struct hn_softc *, struct vmbus_channel *, + const void *, int); +static int hn_txpkt_sglist(struct hn_tx_ring *, + struct hn_txdesc *); +static int hn_txpkt_chim(struct hn_tx_ring *, + struct hn_txdesc *); +static int hn_xmit(struct hn_tx_ring *, int); +static void hn_xmit_taskfunc(void *, int); +static void hn_xmit_txeof(struct hn_tx_ring *); +static void hn_xmit_txeof_taskfunc(void *, int); +#ifdef HN_IFSTART_SUPPORT +static int hn_start_locked(struct hn_tx_ring *, int); +static void hn_start_taskfunc(void *, int); +static void hn_start_txeof(struct hn_tx_ring *); +static void hn_start_txeof_taskfunc(void *, int); +#endif + +SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, + "Hyper-V network interface"); + +/* Trust tcp segment verification on host side. 
*/ +static int hn_trust_hosttcp = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, + &hn_trust_hosttcp, 0, + "Trust tcp segment verification on host side, " + "when csum info is missing (global setting)"); + +/* Trust udp datagrams verification on host side. */ +static int hn_trust_hostudp = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, + &hn_trust_hostudp, 0, + "Trust udp datagram verification on host side, " + "when csum info is missing (global setting)"); + +/* Trust ip packets verification on host side. */ +static int hn_trust_hostip = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, + &hn_trust_hostip, 0, + "Trust ip packet verification on host side, " + "when csum info is missing (global setting)"); + +/* + * Offload UDP/IPv4 checksum. + */ +static int hn_enable_udp4cs = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, + &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); + +/* + * Offload UDP/IPv6 checksum. + */ +static int hn_enable_udp6cs = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, + &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); + +/* Stats. */ +static counter_u64_t hn_udpcs_fixup; +SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, + &hn_udpcs_fixup, "# of UDP checksum fixup"); + +/* + * See hn_set_hlen(). + * + * This value is for Azure. For Hyper-V, set this above + * 65536 to disable UDP datagram checksum fixup. 
+ */ +static int hn_udpcs_fixup_mtu = 1420; +SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN, + &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold"); + +/* Limit TSO burst size */ +static int hn_tso_maxlen = IP_MAXPACKET; +SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, + &hn_tso_maxlen, 0, "TSO burst limit"); + +/* Limit chimney send size */ +static int hn_tx_chimney_size = 0; +SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, + &hn_tx_chimney_size, 0, "Chimney send packet size limit"); + +/* Limit the size of packet for direct transmission */ +static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; +SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, + &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); + +/* # of LRO entries per RX ring */ +#if defined(INET) || defined(INET6) +#if __FreeBSD_version >= 1100095 +static int hn_lro_entry_count = HN_LROENT_CNT_DEF; +SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, + &hn_lro_entry_count, 0, "LRO entry count"); +#endif +#endif + +static int hn_tx_taskq_cnt = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, + &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); + +#define HN_TX_TASKQ_M_INDEP 0 +#define HN_TX_TASKQ_M_GLOBAL 1 +#define HN_TX_TASKQ_M_EVTTQ 2 + +static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; +SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, + &hn_tx_taskq_mode, 0, "TX taskqueue modes: " + "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); + +#ifndef HN_USE_TXDESC_BUFRING +static int hn_use_txdesc_bufring = 0; +#else +static int hn_use_txdesc_bufring = 1; +#endif +SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, + &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); + +#ifdef HN_IFSTART_SUPPORT +/* Use ifnet.if_start instead of ifnet.if_transmit */ +static int hn_use_if_start = 0; +SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, + &hn_use_if_start, 0, "Use if_start 
TX method"); +#endif + +/* # of channels to use */ +static int hn_chan_cnt = 0; +SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, + &hn_chan_cnt, 0, + "# of channels to use; each channel has one RX ring and one TX ring"); + +/* # of transmit rings to use */ +static int hn_tx_ring_cnt = 0; +SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, + &hn_tx_ring_cnt, 0, "# of TX rings to use"); + +/* Software TX ring deptch */ +static int hn_tx_swq_depth = 0; +SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, + &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); + +/* Enable sorted LRO, and the depth of the per-channel mbuf queue */ +#if __FreeBSD_version >= 1100095 +static u_int hn_lro_mbufq_depth = 0; +SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, + &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); +#endif + +/* Packet transmission aggregation size limit */ +static int hn_tx_agg_size = -1; +SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, + &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); + +/* Packet transmission aggregation count limit */ +static int hn_tx_agg_pkts = -1; +SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, + &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); + +/* VF list */ +SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, + CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0, + hn_vflist_sysctl, "A", + "VF list"); + +/* VF mapping */ +SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, + CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0, + hn_vfmap_sysctl, "A", + "VF mapping"); + +/* Transparent VF */ +static int hn_xpnt_vf = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN, + &hn_xpnt_vf, 0, "Transparent VF mod"); + +/* Accurate BPF support for Transparent VF */ +static int hn_xpnt_vf_accbpf = 0; +SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN, + &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF"); + +/* Extra wait for transparent VF attach routing; unit 
seconds. */ +static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; +SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN, + &hn_xpnt_vf_attwait, 0, + "Extra wait for transparent VF attach routing; unit: seconds"); + +static u_int hn_cpu_index; /* next CPU for channel */ +static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */ + +static struct rmlock hn_vfmap_lock; +static int hn_vfmap_size; +static struct ifnet **hn_vfmap; + +#ifndef RSS +static const uint8_t +hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { + 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, + 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, + 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, + 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, + 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa +}; +#endif /* !RSS */ + +static const struct hyperv_guid hn_guid = { + .hv_guid = { + 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46, + 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e } +}; + +static device_method_t hn_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hn_probe), + DEVMETHOD(device_attach, hn_attach), + DEVMETHOD(device_detach, hn_detach), + DEVMETHOD(device_shutdown, hn_shutdown), + DEVMETHOD_END +}; + +static driver_t hn_driver = { + "hn", + hn_methods, + sizeof(struct hn_softc) +}; + +static devclass_t hn_devclass; + +DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0); +MODULE_VERSION(hn, 1); +MODULE_DEPEND(hn, vmbus, 1, 1, 1); + +#if __FreeBSD_version >= 1100099 +static void +hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) +{ + int i; + + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) + sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; +} +#endif + +static int +hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) +{ + + KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && + txd->chim_size == 0, ("invalid rndis sglist txd")); + return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, + &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); +} 
+ +static int +hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) +{ + struct hn_nvs_rndis rndis; + + KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && + txd->chim_size > 0, ("invalid rndis chim txd")); + + rndis.nvs_type = HN_NVS_TYPE_RNDIS; + rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; + rndis.nvs_chim_idx = txd->chim_index; + rndis.nvs_chim_sz = txd->chim_size; + + return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, + &rndis, sizeof(rndis), &txd->send_ctx)); +} + +static __inline uint32_t +hn_chim_alloc(struct hn_softc *sc) +{ + int i, bmap_cnt = sc->hn_chim_bmap_cnt; + u_long *bmap = sc->hn_chim_bmap; + uint32_t ret = HN_NVS_CHIM_IDX_INVALID; + + for (i = 0; i < bmap_cnt; ++i) { + int idx; + + idx = ffsl(~bmap[i]); + if (idx == 0) + continue; + + --idx; /* ffsl is 1-based */ + KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, + ("invalid i %d and idx %d", i, idx)); + + if (atomic_testandset_long(&bmap[i], idx)) + continue; + + ret = i * LONG_BIT + idx; + break; + } + return (ret); +} + +static __inline void +hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) +{ + u_long mask; + uint32_t idx; + + idx = chim_idx / LONG_BIT; + KASSERT(idx < sc->hn_chim_bmap_cnt, + ("invalid chimney index 0x%x", chim_idx)); + + mask = 1UL << (chim_idx % LONG_BIT); + KASSERT(sc->hn_chim_bmap[idx] & mask, + ("index bitmap 0x%lx, chimney index %u, " + "bitmap idx %d, bitmask 0x%lx", + sc->hn_chim_bmap[idx], chim_idx, idx, mask)); + + atomic_clear_long(&sc->hn_chim_bmap[idx], mask); +} + +#if defined(INET6) || defined(INET) + +#define PULLUP_HDR(m, len) \ +do { \ + if (__predict_false((m)->m_len < (len))) { \ + (m) = m_pullup((m), (len)); \ + if ((m) == NULL) \ + return (NULL); \ + } \ +} while (0) + +/* + * NOTE: If this function failed, the m_head would be freed. 
+ */ +static __inline struct mbuf * +hn_tso_fixup(struct mbuf *m_head) +{ + struct ether_vlan_header *evl; + struct tcphdr *th; + int ehlen; + + KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); + + PULLUP_HDR(m_head, sizeof(*evl)); + evl = mtod(m_head, struct ether_vlan_header *); + if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) + ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; + else + ehlen = ETHER_HDR_LEN; + m_head->m_pkthdr.l2hlen = ehlen; + +#ifdef INET + if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { + struct ip *ip; + int iphlen; + + PULLUP_HDR(m_head, ehlen + sizeof(*ip)); + ip = mtodo(m_head, ehlen); + iphlen = ip->ip_hl << 2; + m_head->m_pkthdr.l3hlen = iphlen; + + PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); + th = mtodo(m_head, ehlen + iphlen); + + ip->ip_len = 0; + ip->ip_sum = 0; + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(IPPROTO_TCP)); + } +#endif +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET6 + { + struct ip6_hdr *ip6; + + PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); + ip6 = mtodo(m_head, ehlen); + if (ip6->ip6_nxt != IPPROTO_TCP) { + m_freem(m_head); + return (NULL); + } + m_head->m_pkthdr.l3hlen = sizeof(*ip6); + + PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); + th = mtodo(m_head, ehlen + sizeof(*ip6)); + + ip6->ip6_plen = 0; + th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); + } +#endif + return (m_head); +} + +/* + * NOTE: If this function failed, the m_head would be freed. 
+ */ +static __inline struct mbuf * +hn_set_hlen(struct mbuf *m_head) +{ + const struct ether_vlan_header *evl; + int ehlen; + + PULLUP_HDR(m_head, sizeof(*evl)); + evl = mtod(m_head, const struct ether_vlan_header *); + if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) + ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; + else + ehlen = ETHER_HDR_LEN; + m_head->m_pkthdr.l2hlen = ehlen; + +#ifdef INET + if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) { + const struct ip *ip; + int iphlen; + + PULLUP_HDR(m_head, ehlen + sizeof(*ip)); + ip = mtodo(m_head, ehlen); + iphlen = ip->ip_hl << 2; + m_head->m_pkthdr.l3hlen = iphlen; + + /* + * UDP checksum offload does not work in Azure, if the + * following conditions meet: + * - sizeof(IP hdr + UDP hdr + payload) > 1420. + * - IP_DF is not set in the IP hdr. + * + * Fallback to software checksum for these UDP datagrams. + */ + if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) && + m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen && + (ntohs(ip->ip_off) & IP_DF) == 0) { + uint16_t off = ehlen + iphlen; + + counter_u64_add(hn_udpcs_fixup, 1); + PULLUP_HDR(m_head, off + sizeof(struct udphdr)); + *(uint16_t *)(m_head->m_data + off + + m_head->m_pkthdr.csum_data) = in_cksum_skip( + m_head, m_head->m_pkthdr.len, off); + m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP; + } + } +#endif +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET6 + { + const struct ip6_hdr *ip6; + + PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); + ip6 = mtodo(m_head, ehlen); + if (ip6->ip6_nxt != IPPROTO_TCP && + ip6->ip6_nxt != IPPROTO_UDP) { + m_freem(m_head); + return (NULL); + } + m_head->m_pkthdr.l3hlen = sizeof(*ip6); + } +#endif + return (m_head); +} + +/* + * NOTE: If this function failed, the m_head would be freed. 
+ */ +static __inline struct mbuf * +hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn) +{ + const struct tcphdr *th; + int ehlen, iphlen; + + *tcpsyn = 0; + ehlen = m_head->m_pkthdr.l2hlen; + iphlen = m_head->m_pkthdr.l3hlen; + + PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); + th = mtodo(m_head, ehlen + iphlen); + if (th->th_flags & TH_SYN) + *tcpsyn = 1; + return (m_head); +} + +#undef PULLUP_HDR + +#endif /* INET6 || INET */ + +static int +hn_set_rxfilter(struct hn_softc *sc, uint32_t filter) +{ + int error = 0; + + HN_LOCK_ASSERT(sc); + + if (sc->hn_rx_filter != filter) { + error = hn_rndis_set_rxfilter(sc, filter); + if (!error) + sc->hn_rx_filter = filter; + } + return (error); +} + +static int +hn_rxfilter_config(struct hn_softc *sc) +{ + struct ifnet *ifp = sc->hn_ifp; + uint32_t filter; + + HN_LOCK_ASSERT(sc); + + /* + * If the non-transparent mode VF is activated, we don't know how + * its RX filter is configured, so stick the synthetic device in + * the promiscous mode. + */ + if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { + filter = NDIS_PACKET_TYPE_PROMISCUOUS; + } else { + filter = NDIS_PACKET_TYPE_DIRECTED; + if (ifp->if_flags & IFF_BROADCAST) + filter |= NDIS_PACKET_TYPE_BROADCAST; + /* TODO: support multicast list */ + if ((ifp->if_flags & IFF_ALLMULTI) || + !CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) + filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; + } + return (hn_set_rxfilter(sc, filter)); +} + +static void +hn_set_txagg(struct hn_softc *sc) +{ + uint32_t size, pkts; + int i; + + /* + * Setup aggregation size. + */ + if (sc->hn_agg_size < 0) + size = UINT32_MAX; + else + size = sc->hn_agg_size; + + if (sc->hn_rndis_agg_size < size) + size = sc->hn_rndis_agg_size; + + /* NOTE: We only aggregate packets using chimney sending buffers. 
*/ + if (size > (uint32_t)sc->hn_chim_szmax) + size = sc->hn_chim_szmax; + + if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { + /* Disable */ + size = 0; + pkts = 0; + goto done; + } + + /* NOTE: Type of the per TX ring setting is 'int'. */ + if (size > INT_MAX) + size = INT_MAX; + + /* + * Setup aggregation packet count. + */ + if (sc->hn_agg_pkts < 0) + pkts = UINT32_MAX; + else + pkts = sc->hn_agg_pkts; + + if (sc->hn_rndis_agg_pkts < pkts) + pkts = sc->hn_rndis_agg_pkts; + + if (pkts <= 1) { + /* Disable */ + size = 0; + pkts = 0; + goto done; + } + + /* NOTE: Type of the per TX ring setting is 'short'. */ + if (pkts > SHRT_MAX) + pkts = SHRT_MAX; + +done: + /* NOTE: Type of the per TX ring setting is 'short'. */ + if (sc->hn_rndis_agg_align > SHRT_MAX) { + /* Disable */ + size = 0; + pkts = 0; + } + + if (bootverbose) { + if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", + size, pkts, sc->hn_rndis_agg_align); + } + + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; + + mtx_lock(&txr->hn_tx_lock); + txr->hn_agg_szmax = size; + txr->hn_agg_pktmax = pkts; + txr->hn_agg_align = sc->hn_rndis_agg_align; + mtx_unlock(&txr->hn_tx_lock); + } +} + +static int +hn_get_txswq_depth(const struct hn_tx_ring *txr) +{ + + KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); + if (hn_tx_swq_depth < txr->hn_txdesc_cnt) + return txr->hn_txdesc_cnt; + return hn_tx_swq_depth; +} + +static int +hn_rss_reconfig(struct hn_softc *sc) +{ + int error; + + HN_LOCK_ASSERT(sc); + + if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) + return (ENXIO); + + /* + * Disable RSS first. + * + * NOTE: + * Direct reconfiguration by setting the UNCHG flags does + * _not_ work properly. 
+ */ + if (bootverbose) + if_printf(sc->hn_ifp, "disable RSS\n"); + error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); + if (error) { + if_printf(sc->hn_ifp, "RSS disable failed\n"); + return (error); + } + + /* + * Reenable the RSS w/ the updated RSS key or indirect + * table. + */ + if (bootverbose) + if_printf(sc->hn_ifp, "reconfig RSS\n"); + error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); + if (error) { + if_printf(sc->hn_ifp, "RSS reconfig failed\n"); + return (error); + } + return (0); +} + +static void +hn_rss_ind_fixup(struct hn_softc *sc) +{ + struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; + int i, nchan; + + nchan = sc->hn_rx_ring_inuse; + KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); + + /* + * Check indirect table to make sure that all channels in it + * can be used. + */ + for (i = 0; i < NDIS_HASH_INDCNT; ++i) { + if (rss->rss_ind[i] >= nchan) { + if_printf(sc->hn_ifp, + "RSS indirect table %d fixup: %u -> %d\n", + i, rss->rss_ind[i], nchan - 1); + rss->rss_ind[i] = nchan - 1; + } + } +} + +static int +hn_ifmedia_upd(struct ifnet *ifp __unused) +{ + + return EOPNOTSUPP; +} + +static void +hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) +{ + struct hn_softc *sc = ifp->if_softc; + + ifmr->ifm_status = IFM_AVALID; + ifmr->ifm_active = IFM_ETHER; + + if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { + ifmr->ifm_active |= IFM_NONE; + return; + } + ifmr->ifm_status |= IFM_ACTIVE; + ifmr->ifm_active |= IFM_10G_T | IFM_FDX; +} + +static void +hn_rxvf_set_task(void *xarg, int pending __unused) +{ + struct hn_rxvf_setarg *arg = xarg; + + arg->rxr->hn_rxvf_ifp = arg->vf_ifp; +} + +static void +hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp) +{ + struct hn_rx_ring *rxr; + struct hn_rxvf_setarg arg; + struct task task; + int i; + + HN_LOCK_ASSERT(sc); + + TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); + + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + + if (i < sc->hn_rx_ring_inuse) { + arg.rxr = rxr; 
+ arg.vf_ifp = vf_ifp; + vmbus_chan_run_task(rxr->hn_chan, &task); + } else { + rxr->hn_rxvf_ifp = vf_ifp; + } + } +} + +static bool +hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp) +{ + const struct ifnet *hn_ifp; + + hn_ifp = sc->hn_ifp; + + if (ifp == hn_ifp) + return (false); + + if (ifp->if_alloctype != IFT_ETHER) + return (false); + + /* Ignore lagg/vlan interfaces */ + if (strcmp(ifp->if_dname, "lagg") == 0 || + strcmp(ifp->if_dname, "vlan") == 0) + return (false); + + /* + * During detach events ifp->if_addr might be NULL. + * Make sure the bcmp() below doesn't panic on that: + */ + if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL) + return (false); + + if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0) + return (false); + + return (true); +} + +static void +hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf) +{ + struct ifnet *hn_ifp; + + HN_LOCK(sc); + + if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) + goto out; + + if (!hn_ismyvf(sc, ifp)) + goto out; + hn_ifp = sc->hn_ifp; + + if (rxvf) { + if (sc->hn_flags & HN_FLAG_RXVF) + goto out; + + sc->hn_flags |= HN_FLAG_RXVF; + hn_rxfilter_config(sc); + } else { + if (!(sc->hn_flags & HN_FLAG_RXVF)) + goto out; + + sc->hn_flags &= ~HN_FLAG_RXVF; + if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING) + hn_rxfilter_config(sc); + else + hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); + } + + hn_nvs_set_datapath(sc, + rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); + + hn_rxvf_set(sc, rxvf ? ifp : NULL); + + if (rxvf) { + hn_vf_rss_fixup(sc, true); + hn_suspend_mgmt(sc); + sc->hn_link_flags &= + ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); + if_link_state_change(hn_ifp, LINK_STATE_DOWN); + } else { + hn_vf_rss_restore(sc); + hn_resume_mgmt(sc); + } + + devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname, + rxvf ? "VF_UP" : "VF_DOWN", NULL); + + if (bootverbose) { + if_printf(hn_ifp, "datapath is switched %s %s\n", + rxvf ? 
"to" : "from", ifp->if_xname); + } +out: + HN_UNLOCK(sc); +} + +static void +hn_ifnet_event(void *arg, struct ifnet *ifp, int event) +{ + + if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) + return; + hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); +} + +static void +hn_ifaddr_event(void *arg, struct ifnet *ifp) +{ + + hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP); +} + +static int +hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) +{ + struct ifnet *ifp, *vf_ifp; + uint64_t tmp; + int error; + + HN_LOCK_ASSERT(sc); + ifp = sc->hn_ifp; + vf_ifp = sc->hn_vf_ifp; + + /* + * Fix up requested capabilities w/ supported capabilities, + * since the supported capabilities could have been changed. + */ + ifr->ifr_reqcap &= ifp->if_capabilities; + /* Pass SIOCSIFCAP to VF. */ + error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr); + + /* + * NOTE: + * The error will be propagated to the callers, however, it + * is _not_ useful here. + */ + + /* + * Merge VF's enabled capabilities. 
+ */ + ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities; + + tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc); + if (ifp->if_capenable & IFCAP_TXCSUM) + ifp->if_hwassist |= tmp; + else + ifp->if_hwassist &= ~tmp; + + tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc); + if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) + ifp->if_hwassist |= tmp; + else + ifp->if_hwassist &= ~tmp; + + tmp = vf_ifp->if_hwassist & CSUM_IP_TSO; + if (ifp->if_capenable & IFCAP_TSO4) + ifp->if_hwassist |= tmp; + else + ifp->if_hwassist &= ~tmp; + + tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO; + if (ifp->if_capenable & IFCAP_TSO6) + ifp->if_hwassist |= tmp; + else + ifp->if_hwassist &= ~tmp; + + return (error); +} + +static int +hn_xpnt_vf_iocsetflags(struct hn_softc *sc) +{ + struct ifnet *vf_ifp; + struct ifreq ifr; + + HN_LOCK_ASSERT(sc); + vf_ifp = sc->hn_vf_ifp; + + memset(&ifr, 0, sizeof(ifr)); + strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); + ifr.ifr_flags = vf_ifp->if_flags & 0xffff; + ifr.ifr_flagshigh = vf_ifp->if_flags >> 16; + return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr)); +} + +static void +hn_xpnt_vf_saveifflags(struct hn_softc *sc) +{ + struct ifnet *ifp = sc->hn_ifp; + int allmulti = 0; + + HN_LOCK_ASSERT(sc); + + /* XXX vlan(4) style mcast addr maintenance */ + if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) + allmulti = IFF_ALLMULTI; + + /* Always set the VF's if_flags */ + sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti; +} + +static void +hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m) +{ + struct rm_priotracker pt; + struct ifnet *hn_ifp = NULL; + struct mbuf *mn; + + /* + * XXX racy, if hn(4) ever detached. + */ + rm_rlock(&hn_vfmap_lock, &pt); + if (vf_ifp->if_index < hn_vfmap_size) + hn_ifp = hn_vfmap[vf_ifp->if_index]; + rm_runlock(&hn_vfmap_lock, &pt); + + if (hn_ifp != NULL) { + for (mn = m; mn != NULL; mn = mn->m_nextpkt) { + /* + * Allow tapping on the VF. 
+ */ + ETHER_BPF_MTAP(vf_ifp, mn); + + /* + * Update VF stats. + */ + if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) { + if_inc_counter(vf_ifp, IFCOUNTER_IBYTES, + mn->m_pkthdr.len); + } + /* + * XXX IFCOUNTER_IMCAST + * This stat updating is kinda invasive, since it + * requires two checks on the mbuf: the length check + * and the ethernet header check. As of this write, + * all multicast packets go directly to hn(4), which + * makes imcast stat updating in the VF a try in vian. + */ + + /* + * Fix up rcvif and increase hn(4)'s ipackets. + */ + mn->m_pkthdr.rcvif = hn_ifp; + if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); + } + /* + * Go through hn(4)'s if_input. + */ + hn_ifp->if_input(hn_ifp, m); + } else { + /* + * In the middle of the transition; free this + * mbuf chain. + */ + while (m != NULL) { + mn = m->m_nextpkt; + m->m_nextpkt = NULL; + m_freem(m); + m = mn; + } + } +} + +static void +hn_mtu_change_fixup(struct hn_softc *sc) +{ + struct ifnet *ifp; + + HN_LOCK_ASSERT(sc); + ifp = sc->hn_ifp; + + hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); +#if __FreeBSD_version >= 1100099 + if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) + hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); +#endif +} + +static uint32_t +hn_rss_type_fromndis(uint32_t rss_hash) +{ + uint32_t types = 0; + + if (rss_hash & NDIS_HASH_IPV4) + types |= RSS_TYPE_IPV4; + if (rss_hash & NDIS_HASH_TCP_IPV4) + types |= RSS_TYPE_TCP_IPV4; + if (rss_hash & NDIS_HASH_IPV6) + types |= RSS_TYPE_IPV6; + if (rss_hash & NDIS_HASH_IPV6_EX) + types |= RSS_TYPE_IPV6_EX; + if (rss_hash & NDIS_HASH_TCP_IPV6) + types |= RSS_TYPE_TCP_IPV6; + if (rss_hash & NDIS_HASH_TCP_IPV6_EX) + types |= RSS_TYPE_TCP_IPV6_EX; + if (rss_hash & NDIS_HASH_UDP_IPV4_X) + types |= RSS_TYPE_UDP_IPV4; + return (types); +} + +static uint32_t +hn_rss_type_tondis(uint32_t types) +{ + uint32_t rss_hash = 0; + + KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, + ("UDP6 and UDP6EX are not 
supported")); + + if (types & RSS_TYPE_IPV4) + rss_hash |= NDIS_HASH_IPV4; + if (types & RSS_TYPE_TCP_IPV4) + rss_hash |= NDIS_HASH_TCP_IPV4; + if (types & RSS_TYPE_IPV6) + rss_hash |= NDIS_HASH_IPV6; + if (types & RSS_TYPE_IPV6_EX) + rss_hash |= NDIS_HASH_IPV6_EX; + if (types & RSS_TYPE_TCP_IPV6) + rss_hash |= NDIS_HASH_TCP_IPV6; + if (types & RSS_TYPE_TCP_IPV6_EX) + rss_hash |= NDIS_HASH_TCP_IPV6_EX; + if (types & RSS_TYPE_UDP_IPV4) + rss_hash |= NDIS_HASH_UDP_IPV4_X; + return (rss_hash); +} + +static void +hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) +{ + int i; + + HN_LOCK_ASSERT(sc); + + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) + sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; +} + +static void +hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) +{ + struct ifnet *ifp, *vf_ifp; + struct ifrsshash ifrh; + struct ifrsskey ifrk; + int error; + uint32_t my_types, diff_types, mbuf_types = 0; + + HN_LOCK_ASSERT(sc); + KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, + ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); + + if (sc->hn_rx_ring_inuse == 1) { + /* No RSS on synthetic parts; done. */ + return; + } + if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { + /* Synthetic parts do not support Toeplitz; done. */ + return; + } + + ifp = sc->hn_ifp; + vf_ifp = sc->hn_vf_ifp; + + /* + * Extract VF's RSS key. Only 40 bytes key for Toeplitz is + * supported. 
+ */ + memset(&ifrk, 0, sizeof(ifrk)); + strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name)); + error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk); + if (error) { + if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n", + vf_ifp->if_xname, error); + goto done; + } + if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) { + if_printf(ifp, "%s RSS function %u is not Toeplitz\n", + vf_ifp->if_xname, ifrk.ifrk_func); + goto done; + } + if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) { + if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n", + vf_ifp->if_xname, ifrk.ifrk_keylen); + goto done; + } + + /* + * Extract VF's RSS hash. Only Toeplitz is supported. + */ + memset(&ifrh, 0, sizeof(ifrh)); + strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name)); + error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh); + if (error) { + if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n", + vf_ifp->if_xname, error); + goto done; + } + if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) { + if_printf(ifp, "%s RSS function %u is not Toeplitz\n", + vf_ifp->if_xname, ifrh.ifrh_func); + goto done; + } + + my_types = hn_rss_type_fromndis(sc->hn_rss_hcap); + if ((ifrh.ifrh_types & my_types) == 0) { + /* This disables RSS; ignore it then */ + if_printf(ifp, "%s intersection of RSS types failed. " + "VF %#x, mine %#x\n", vf_ifp->if_xname, + ifrh.ifrh_types, my_types); + goto done; + } + + diff_types = my_types ^ ifrh.ifrh_types; + my_types &= ifrh.ifrh_types; + mbuf_types = my_types; + + /* + * Detect RSS hash value/type confliction. + * + * NOTE: + * We don't disable the hash type, but stop delivery the hash + * value/type through mbufs on RX path. + * + * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple + * hash is delivered with type of TCP_IPV4. This means if + * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at + * least to hn_mbuf_hash. However, given that _all_ of the + * NICs implement TCP_IPV4, this will _not_ impose any issues + * here. 
+ */ + if ((my_types & RSS_TYPE_IPV4) && + (diff_types & ifrh.ifrh_types & + (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) { + /* Conflict; disable IPV4 hash type/value delivery. */ + if_printf(ifp, "disable IPV4 mbuf hash delivery\n"); + mbuf_types &= ~RSS_TYPE_IPV4; + } + if ((my_types & RSS_TYPE_IPV6) && + (diff_types & ifrh.ifrh_types & + (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | + RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | + RSS_TYPE_IPV6_EX))) { + /* Conflict; disable IPV6 hash type/value delivery. */ + if_printf(ifp, "disable IPV6 mbuf hash delivery\n"); + mbuf_types &= ~RSS_TYPE_IPV6; + } + if ((my_types & RSS_TYPE_IPV6_EX) && + (diff_types & ifrh.ifrh_types & + (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | + RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | + RSS_TYPE_IPV6))) { + /* Conflict; disable IPV6_EX hash type/value delivery. */ + if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n"); + mbuf_types &= ~RSS_TYPE_IPV6_EX; + } + if ((my_types & RSS_TYPE_TCP_IPV6) && + (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) { + /* Conflict; disable TCP_IPV6 hash type/value delivery. */ + if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); + mbuf_types &= ~RSS_TYPE_TCP_IPV6; + } + if ((my_types & RSS_TYPE_TCP_IPV6_EX) && + (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { + /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ + if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); + mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; + } + if ((my_types & RSS_TYPE_UDP_IPV6) && + (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { + /* Conflict; disable UDP_IPV6 hash type/value delivery. */ + if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); + mbuf_types &= ~RSS_TYPE_UDP_IPV6; + } + if ((my_types & RSS_TYPE_UDP_IPV6_EX) && + (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { + /* Conflict; disable UDP_IPV6_EX hash type/value delivery. 
*/ + if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); + mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; + } + + /* + * Indirect table does not matter. + */ + + sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | + hn_rss_type_tondis(my_types); + memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); + sc->hn_flags |= HN_FLAG_HAS_RSSKEY; + + if (reconf) { + error = hn_rss_reconfig(sc); + if (error) { + /* XXX roll-back? */ + if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); + /* XXX keep going. */ + } + } +done: + /* Hash deliverability for mbufs. */ + hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); +} + +static void +hn_vf_rss_restore(struct hn_softc *sc) +{ + + HN_LOCK_ASSERT(sc); + KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, + ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); + + if (sc->hn_rx_ring_inuse == 1) + goto done; + + /* + * Restore hash types. Key does _not_ matter. + */ + if (sc->hn_rss_hash != sc->hn_rss_hcap) { + int error; + + sc->hn_rss_hash = sc->hn_rss_hcap; + error = hn_rss_reconfig(sc); + if (error) { + if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", + error); + /* XXX keep going. */ + } + } +done: + /* Hash deliverability for mbufs. */ + hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); +} + +static void +hn_xpnt_vf_setready(struct hn_softc *sc) +{ + struct ifnet *ifp, *vf_ifp; + struct ifreq ifr; + + HN_LOCK_ASSERT(sc); + ifp = sc->hn_ifp; + vf_ifp = sc->hn_vf_ifp; + + /* + * Mark the VF ready. + */ + sc->hn_vf_rdytick = 0; + + /* + * Save information for restoration. + */ + sc->hn_saved_caps = ifp->if_capabilities; + sc->hn_saved_tsomax = ifp->if_hw_tsomax; + sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount; + sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize; + + /* + * Intersect supported/enabled capabilities. + * + * NOTE: + * if_hwassist is not changed here. + */ + ifp->if_capabilities &= vf_ifp->if_capabilities; + ifp->if_capenable &= ifp->if_capabilities; + + /* + * Fix TSO settings. 
+ */ + if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax) + ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax; + if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount) + ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount; + if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize) + ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize; + + /* + * Change VF's enabled capabilities. + */ + memset(&ifr, 0, sizeof(ifr)); + strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); + ifr.ifr_reqcap = ifp->if_capenable; + hn_xpnt_vf_iocsetcaps(sc, &ifr); + + if (ifp->if_mtu != ETHERMTU) { + int error; + + /* + * Change VF's MTU. + */ + memset(&ifr, 0, sizeof(ifr)); + strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); + ifr.ifr_mtu = ifp->if_mtu; + error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr); + if (error) { + if_printf(ifp, "%s SIOCSIFMTU %u failed\n", + vf_ifp->if_xname, ifp->if_mtu); + if (ifp->if_mtu > ETHERMTU) { + if_printf(ifp, "change MTU to %d\n", ETHERMTU); + + /* + * XXX + * No need to adjust the synthetic parts' MTU; + * failure of the adjustment will cause us + * infinite headache. + */ + ifp->if_mtu = ETHERMTU; + hn_mtu_change_fixup(sc); + } + } + } +} + +static bool +hn_xpnt_vf_isready(struct hn_softc *sc) +{ + + HN_LOCK_ASSERT(sc); + + if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) + return (false); + + if (sc->hn_vf_rdytick == 0) + return (true); + + if (sc->hn_vf_rdytick > ticks) + return (false); + + /* Mark VF as ready. 
*/ + hn_xpnt_vf_setready(sc); + return (true); +} + +static void +hn_xpnt_vf_setenable(struct hn_softc *sc) +{ + int i; + + HN_LOCK_ASSERT(sc); + + /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ + rm_wlock(&sc->hn_vf_lock); + sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; + rm_wunlock(&sc->hn_vf_lock); + + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) + sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; +} + +static void +hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) +{ + int i; + + HN_LOCK_ASSERT(sc); + + /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ + rm_wlock(&sc->hn_vf_lock); + sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; + if (clear_vf) + sc->hn_vf_ifp = NULL; + rm_wunlock(&sc->hn_vf_lock); + + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) + sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; +} + +static void +hn_xpnt_vf_init(struct hn_softc *sc) +{ + int error; + + HN_LOCK_ASSERT(sc); + + KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, + ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); + + if (bootverbose) { + if_printf(sc->hn_ifp, "try bringing up %s\n", + sc->hn_vf_ifp->if_xname); + } + + /* + * Bring the VF up. + */ + hn_xpnt_vf_saveifflags(sc); + sc->hn_vf_ifp->if_flags |= IFF_UP; + error = hn_xpnt_vf_iocsetflags(sc); + if (error) { + if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", + sc->hn_vf_ifp->if_xname, error); + return; + } + + /* + * NOTE: + * Datapath setting must happen _after_ bringing the VF up. + */ + hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); + + /* + * NOTE: + * Fixup RSS related bits _after_ the VF is brought up, since + * many VFs generate RSS key during it's initialization. + */ + hn_vf_rss_fixup(sc, true); + + /* Mark transparent mode VF as enabled. 
*/ + hn_xpnt_vf_setenable(sc); +} + +static void +hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) +{ + struct hn_softc *sc = xsc; + + HN_LOCK(sc); + + if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) + goto done; + if (sc->hn_vf_ifp == NULL) + goto done; + if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) + goto done; + + if (sc->hn_vf_rdytick != 0) { + /* Mark VF as ready. */ + hn_xpnt_vf_setready(sc); + } + + if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) { + /* + * Delayed VF initialization. + */ + if (bootverbose) { + if_printf(sc->hn_ifp, "delayed initialize %s\n", + sc->hn_vf_ifp->if_xname); + } + hn_xpnt_vf_init(sc); + } +done: + HN_UNLOCK(sc); +} + +static void +hn_ifnet_attevent(void *xsc, struct ifnet *ifp) +{ + struct hn_softc *sc = xsc; + + HN_LOCK(sc); + + if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) + goto done; + + if (!hn_ismyvf(sc, ifp)) + goto done; + + if (sc->hn_vf_ifp != NULL) { + if_printf(sc->hn_ifp, "%s was attached as VF\n", + sc->hn_vf_ifp->if_xname); + goto done; + } + + if (hn_xpnt_vf && ifp->if_start != NULL) { + /* + * ifnet.if_start is _not_ supported by transparent + * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 
+ */ + if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " + "in transparent VF mode.\n", ifp->if_xname); + goto done; + } + + rm_wlock(&hn_vfmap_lock); + + if (ifp->if_index >= hn_vfmap_size) { + struct ifnet **newmap; + int newsize; + + newsize = ifp->if_index + HN_VFMAP_SIZE_DEF; + newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF, + M_WAITOK | M_ZERO); + + memcpy(newmap, hn_vfmap, + sizeof(struct ifnet *) * hn_vfmap_size); + free(hn_vfmap, M_DEVBUF); + hn_vfmap = newmap; + hn_vfmap_size = newsize; + } + KASSERT(hn_vfmap[ifp->if_index] == NULL, + ("%s: ifindex %d was mapped to %s", + ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); + hn_vfmap[ifp->if_index] = sc->hn_ifp; + + rm_wunlock(&hn_vfmap_lock); + + /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ + rm_wlock(&sc->hn_vf_lock); + KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, + ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); + sc->hn_vf_ifp = ifp; + rm_wunlock(&sc->hn_vf_lock); + + if (hn_xpnt_vf) { + int wait_ticks; + + /* + * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. + * Save vf_ifp's current if_input for later restoration. + */ + sc->hn_vf_input = ifp->if_input; + ifp->if_input = hn_xpnt_vf_input; + + /* + * Stop link status management; use the VF's. + */ + hn_suspend_mgmt(sc); + + /* + * Give VF sometime to complete its attach routing. + */ + wait_ticks = hn_xpnt_vf_attwait * hz; + sc->hn_vf_rdytick = ticks + wait_ticks; + + taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, + wait_ticks); + } +done: + HN_UNLOCK(sc); +} + +static void +hn_ifnet_detevent(void *xsc, struct ifnet *ifp) +{ + struct hn_softc *sc = xsc; + + HN_LOCK(sc); + + if (sc->hn_vf_ifp == NULL) + goto done; + + if (!hn_ismyvf(sc, ifp)) + goto done; + + if (hn_xpnt_vf) { + /* + * Make sure that the delayed initialization is not running. + * + * NOTE: + * - This lock _must_ be released, since the hn_vf_init task + * will try holding this lock. 
+ * - It is safe to release this lock here, since the + * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. + * + * XXX racy, if hn(4) ever detached. + */ + HN_UNLOCK(sc); + taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); + HN_LOCK(sc); + + KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", + sc->hn_ifp->if_xname)); + ifp->if_input = sc->hn_vf_input; + sc->hn_vf_input = NULL; + + if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && + (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) + hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); + + if (sc->hn_vf_rdytick == 0) { + /* + * The VF was ready; restore some settings. + */ + sc->hn_ifp->if_capabilities = sc->hn_saved_caps; + /* + * NOTE: + * There is _no_ need to fixup if_capenable and + * if_hwassist, since the if_capabilities before + * restoration was an intersection of the VF's + * if_capabilites and the synthetic device's + * if_capabilites. + */ + sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax; + sc->hn_ifp->if_hw_tsomaxsegcount = + sc->hn_saved_tsosegcnt; + sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz; + } + + if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { + /* + * Restore RSS settings. + */ + hn_vf_rss_restore(sc); + + /* + * Resume link status management, which was suspended + * by hn_ifnet_attevent(). + */ + hn_resume_mgmt(sc); + } + } + + /* Mark transparent mode VF as disabled. 
*/ + hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); + + rm_wlock(&hn_vfmap_lock); + + KASSERT(ifp->if_index < hn_vfmap_size, + ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size)); + if (hn_vfmap[ifp->if_index] != NULL) { + KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp, + ("%s: ifindex %d was mapped to %s", + ifp->if_xname, ifp->if_index, + hn_vfmap[ifp->if_index]->if_xname)); + hn_vfmap[ifp->if_index] = NULL; + } + + rm_wunlock(&hn_vfmap_lock); +done: + HN_UNLOCK(sc); +} + +static void +hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state) +{ + struct hn_softc *sc = xsc; + + if (sc->hn_vf_ifp == ifp) + if_link_state_change(sc->hn_ifp, link_state); +} + +static int +hn_probe(device_t dev) +{ + + if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { + device_set_desc(dev, "Hyper-V Network Interface"); + return BUS_PROBE_DEFAULT; + } + return ENXIO; +} + +static int +hn_attach(device_t dev) +{ + struct hn_softc *sc = device_get_softc(dev); + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + uint8_t eaddr[ETHER_ADDR_LEN]; + struct ifnet *ifp = NULL; + int error, ring_cnt, tx_ring_cnt; + uint32_t mtu; + + sc->hn_dev = dev; + sc->hn_prichan = vmbus_get_channel(dev); + HN_LOCK_INIT(sc); + rm_init(&sc->hn_vf_lock, "hnvf"); + if (hn_xpnt_vf && hn_xpnt_vf_accbpf) + sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; + + /* + * Initialize these tunables once. + */ + sc->hn_agg_size = hn_tx_agg_size; + sc->hn_agg_pkts = hn_tx_agg_pkts; + + /* + * Setup taskqueue for transmission. 
+ */ + if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { + int i; + + sc->hn_tx_taskqs = + malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), + M_DEVBUF, M_WAITOK); + for (i = 0; i < hn_tx_taskq_cnt; ++i) { + sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", + M_WAITOK, taskqueue_thread_enqueue, + &sc->hn_tx_taskqs[i]); + taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, + "%s tx%d", device_get_nameunit(dev), i); + } + } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { + sc->hn_tx_taskqs = hn_tx_taskque; + } + + /* + * Setup taskqueue for mangement tasks, e.g. link status. + */ + sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, + taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); + taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", + device_get_nameunit(dev)); + TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); + TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); + TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, + hn_netchg_status_taskfunc, sc); + + if (hn_xpnt_vf) { + /* + * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. + */ + sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, + taskqueue_thread_enqueue, &sc->hn_vf_taskq); + taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", + device_get_nameunit(dev)); + TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, + hn_xpnt_vf_init_taskfunc, sc); + } + + /* + * Allocate ifnet and setup its name earlier, so that if_printf + * can be used by functions, which will be called after + * ether_ifattach(). + */ + ifp = sc->hn_ifp = if_alloc(IFT_ETHER); + ifp->if_softc = sc; + if_initname(ifp, device_get_name(dev), device_get_unit(dev)); + + /* + * Initialize ifmedia earlier so that it can be unconditionally + * destroyed, if error happened later on. + */ + ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); + + /* + * Figure out the # of RX rings (ring_cnt) and the # of TX rings + * to use (tx_ring_cnt). 
+ * + * NOTE: + * The # of RX rings to use is same as the # of channels to use. + */ + ring_cnt = hn_chan_cnt; + if (ring_cnt <= 0) { + /* Default */ + ring_cnt = mp_ncpus; + if (ring_cnt > HN_RING_CNT_DEF_MAX) + ring_cnt = HN_RING_CNT_DEF_MAX; + } else if (ring_cnt > mp_ncpus) { + ring_cnt = mp_ncpus; + } +#ifdef RSS + if (ring_cnt > rss_getnumbuckets()) + ring_cnt = rss_getnumbuckets(); +#endif + + tx_ring_cnt = hn_tx_ring_cnt; + if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) + tx_ring_cnt = ring_cnt; +#ifdef HN_IFSTART_SUPPORT + if (hn_use_if_start) { + /* ifnet.if_start only needs one TX ring. */ + tx_ring_cnt = 1; + } +#endif + + /* + * Set the leader CPU for channels. + */ + sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; + + /* + * Create enough TX/RX rings, even if only limited number of + * channels can be allocated. + */ + error = hn_create_tx_data(sc, tx_ring_cnt); + if (error) + goto failed; + error = hn_create_rx_data(sc, ring_cnt); + if (error) + goto failed; + + /* + * Create transaction context for NVS and RNDIS transactions. + */ + sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), + HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); + if (sc->hn_xact == NULL) { + error = ENXIO; + goto failed; + } + + /* + * Install orphan handler for the revocation of this device's + * primary channel. + * + * NOTE: + * The processing order is critical here: + * Install the orphan handler, _before_ testing whether this + * device's primary channel has been revoked or not. + */ + vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); + if (vmbus_chan_is_revoked(sc->hn_prichan)) { + error = ENXIO; + goto failed; + } + + /* + * Attach the synthetic parts, i.e. NVS and RNDIS. 
+ */ + error = hn_synth_attach(sc, ETHERMTU); + if (error) + goto failed; + + error = hn_rndis_get_eaddr(sc, eaddr); + if (error) + goto failed; + + error = hn_rndis_get_mtu(sc, &mtu); + if (error) + mtu = ETHERMTU; + else if (bootverbose) + device_printf(dev, "RNDIS mtu %u\n", mtu); + +#if __FreeBSD_version >= 1100099 + if (sc->hn_rx_ring_inuse > 1) { + /* + * Reduce TCP segment aggregation limit for multiple + * RX rings to increase ACK timeliness. + */ + hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); + } +#endif + + /* + * Fixup TX/RX stuffs after synthetic parts are attached. + */ + hn_fixup_tx_data(sc); + hn_fixup_rx_data(sc); + + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, + &sc->hn_nvs_ver, 0, "NVS version"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_ndis_version_sysctl, "A", "NDIS version"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_caps_sysctl, "A", "capabilities"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_hwassist_sysctl, "A", "hwassist"); + SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", + CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); + SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", + CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, + "max # of TSO segments"); + SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", + CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, + "max size of TSO segment"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_rxfilter_sysctl, "A", "rxfilter"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_rss_hash_sysctl, "A", "RSS hash"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, 
"rss_hashcap", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", + CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); +#ifndef RSS + /* + * Don't allow RSS key/indirect table changes, if RSS is defined. + */ + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", + CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_rss_key_sysctl, "IU", "RSS key"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", + CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_rss_ind_sysctl, "IU", "RSS indirect table"); +#endif + SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", + CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, + "RNDIS offered packet transmission aggregation size limit"); + SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", + CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, + "RNDIS offered packet transmission aggregation count limit"); + SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", + CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, + "RNDIS packet transmission aggregation alignment"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_txagg_size_sysctl, "I", + "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_txagg_pkts_sysctl, "I", + "Packet transmission aggregation packets, " + "0 -- disable, -1 -- auto"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_polling_sysctl, "I", + "Polling frequency: [100,1000000], 0 disable polling"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_vf_sysctl, 
"A", "Virtual Function's name"); + if (!hn_xpnt_vf) { + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_rxvf_sysctl, "A", "activated Virtual Function's name"); + } else { + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_xpnt_vf_enabled_sysctl, "I", + "Transparent VF enabled"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_xpnt_vf_accbpf_sysctl, "I", + "Accurate BPF for transparent VF"); + } + + /* + * Setup the ifmedia, which has been initialized earlier. + */ + ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); + ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); + /* XXX ifmedia_set really should do this for us */ + sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; + + /* + * Setup the ifnet for this interface. + */ + + ifp->if_baudrate = IF_Gbps(10); + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_ioctl = hn_ioctl; + ifp->if_init = hn_init; +#ifdef HN_IFSTART_SUPPORT + if (hn_use_if_start) { + int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); + + ifp->if_start = hn_start; + IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); + ifp->if_snd.ifq_drv_maxlen = qdepth - 1; + IFQ_SET_READY(&ifp->if_snd); + } else +#endif + { + ifp->if_transmit = hn_transmit; + ifp->if_qflush = hn_xmit_qflush; + } + + ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; +#ifdef foo + /* We can't diff IPv6 packets from IPv4 packets on RX path. */ + ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; +#endif + if (sc->hn_caps & HN_CAP_VLAN) { + /* XXX not sure about VLAN_MTU. 
*/ + ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; + } + + ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; + if (ifp->if_hwassist & HN_CSUM_IP_MASK) + ifp->if_capabilities |= IFCAP_TXCSUM; + if (ifp->if_hwassist & HN_CSUM_IP6_MASK) + ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; + if (sc->hn_caps & HN_CAP_TSO4) { + ifp->if_capabilities |= IFCAP_TSO4; + ifp->if_hwassist |= CSUM_IP_TSO; + } + if (sc->hn_caps & HN_CAP_TSO6) { + ifp->if_capabilities |= IFCAP_TSO6; + ifp->if_hwassist |= CSUM_IP6_TSO; + } + + /* Enable all available capabilities by default. */ + ifp->if_capenable = ifp->if_capabilities; + + /* + * Disable IPv6 TSO and TXCSUM by default, they still can + * be enabled through SIOCSIFCAP. + */ + ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); + ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); + + if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { + /* + * Lock hn_set_tso_maxsize() to simplify its + * internal logic. + */ + HN_LOCK(sc); + hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); + HN_UNLOCK(sc); + ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; + ifp->if_hw_tsomaxsegsize = PAGE_SIZE; + } + + ether_ifattach(ifp, eaddr); + + if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { + if_printf(ifp, "TSO segcnt %u segsz %u\n", + ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); + } + if (mtu < ETHERMTU) { + if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu); + ifp->if_mtu = mtu; + } + + /* Inform the upper layer about the long frame support. */ + ifp->if_hdrlen = sizeof(struct ether_vlan_header); + + /* + * Kick off link status check. 
+ */ + sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; + hn_update_link_status(sc); + + if (!hn_xpnt_vf) { + sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, + hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); + sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, + hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); + } else { + sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, + hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); + } + + /* + * NOTE: + * Subscribe ether_ifattach event, instead of ifnet_arrival event, + * since interface's LLADDR is needed; interface LLADDR is not + * available when ifnet_arrival event is triggered. + */ + sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, + hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); + sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, + hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); + + return (0); +failed: + if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) + hn_synth_detach(sc); + hn_detach(dev); + return (error); +} + +static int +hn_detach(device_t dev) +{ + struct hn_softc *sc = device_get_softc(dev); + struct ifnet *ifp = sc->hn_ifp, *vf_ifp; + + if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { + /* + * In case that the vmbus missed the orphan handler + * installation. 
+ */ + vmbus_xact_ctx_orphan(sc->hn_xact); + } + + if (sc->hn_ifaddr_evthand != NULL) + EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); + if (sc->hn_ifnet_evthand != NULL) + EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); + if (sc->hn_ifnet_atthand != NULL) { + EVENTHANDLER_DEREGISTER(ether_ifattach_event, + sc->hn_ifnet_atthand); + } + if (sc->hn_ifnet_dethand != NULL) { + EVENTHANDLER_DEREGISTER(ifnet_departure_event, + sc->hn_ifnet_dethand); + } + if (sc->hn_ifnet_lnkhand != NULL) + EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); + + vf_ifp = sc->hn_vf_ifp; + __compiler_membar(); + if (vf_ifp != NULL) + hn_ifnet_detevent(sc, vf_ifp); + + if (device_is_attached(dev)) { + HN_LOCK(sc); + if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + hn_stop(sc, true); + /* + * NOTE: + * hn_stop() only suspends data, so managment + * stuffs have to be suspended manually here. + */ + hn_suspend_mgmt(sc); + hn_synth_detach(sc); + } + HN_UNLOCK(sc); + ether_ifdetach(ifp); + } + + ifmedia_removeall(&sc->hn_media); + hn_destroy_rx_data(sc); + hn_destroy_tx_data(sc); + + if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { + int i; + + for (i = 0; i < hn_tx_taskq_cnt; ++i) + taskqueue_free(sc->hn_tx_taskqs[i]); + free(sc->hn_tx_taskqs, M_DEVBUF); + } + taskqueue_free(sc->hn_mgmt_taskq0); + if (sc->hn_vf_taskq != NULL) + taskqueue_free(sc->hn_vf_taskq); + + if (sc->hn_xact != NULL) { + /* + * Uninstall the orphan handler _before_ the xact is + * destructed. + */ + vmbus_chan_unset_orphan(sc->hn_prichan); + vmbus_xact_ctx_destroy(sc->hn_xact); + } + + if_free(ifp); + + HN_LOCK_DESTROY(sc); + rm_destroy(&sc->hn_vf_lock); + return (0); +} + +static int +hn_shutdown(device_t dev) +{ + + return (0); +} + +static void +hn_link_status(struct hn_softc *sc) +{ + uint32_t link_status; + int error; + + error = hn_rndis_get_linkstatus(sc, &link_status); + if (error) { + /* XXX what to do? 
*/ + return; + } + + if (link_status == NDIS_MEDIA_STATE_CONNECTED) + sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; + else + sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; + if_link_state_change(sc->hn_ifp, + (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? + LINK_STATE_UP : LINK_STATE_DOWN); +} + +static void +hn_link_taskfunc(void *xsc, int pending __unused) +{ + struct hn_softc *sc = xsc; + + if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) + return; + hn_link_status(sc); +} + +static void +hn_netchg_init_taskfunc(void *xsc, int pending __unused) +{ + struct hn_softc *sc = xsc; + + /* Prevent any link status checks from running. */ + sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; + + /* + * Fake up a [link down --> link up] state change; 5 seconds + * delay is used, which closely simulates miibus reaction + * upon link down event. + */ + sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; + if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); + taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, + &sc->hn_netchg_status, 5 * hz); +} + +static void +hn_netchg_status_taskfunc(void *xsc, int pending __unused) +{ + struct hn_softc *sc = xsc; + + /* Re-allow link status checks. 
*/ + sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; + hn_link_status(sc); +} + +static void +hn_update_link_status(struct hn_softc *sc) +{ + + if (sc->hn_mgmt_taskq != NULL) + taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); +} + +static void +hn_change_network(struct hn_softc *sc) +{ + + if (sc->hn_mgmt_taskq != NULL) + taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); +} + +static __inline int +hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, + struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) +{ + struct mbuf *m = *m_head; + int error; + + KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); + + error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, + m, segs, nsegs, BUS_DMA_NOWAIT); + if (error == EFBIG) { + struct mbuf *m_new; + + m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); + if (m_new == NULL) + return ENOBUFS; + else + *m_head = m = m_new; + txr->hn_tx_collapsed++; + + error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, + txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); + } + if (!error) { + bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, + BUS_DMASYNC_PREWRITE); + txd->flags |= HN_TXD_FLAG_DMAMAP; + } + return error; +} + +static __inline int +hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) +{ + + KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, + ("put an onlist txd %#x", txd->flags)); + KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, + ("put an onagg txd %#x", txd->flags)); + + KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); + if (atomic_fetchadd_int(&txd->refs, -1) != 1) + return 0; + + if (!STAILQ_EMPTY(&txd->agg_list)) { + struct hn_txdesc *tmp_txd; + + while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { + int freed; + + KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), + ("resursive aggregation on aggregated txdesc")); + KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), + ("not aggregated txdesc")); + KASSERT((tmp_txd->flags & 
HN_TXD_FLAG_DMAMAP) == 0, + ("aggregated txdesc uses dmamap")); + KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, + ("aggregated txdesc consumes " + "chimney sending buffer")); + KASSERT(tmp_txd->chim_size == 0, + ("aggregated txdesc has non-zero " + "chimney sending size")); + + STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); + tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; + freed = hn_txdesc_put(txr, tmp_txd); + KASSERT(freed, ("failed to free aggregated txdesc")); + } + } + + if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { + KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, + ("chim txd uses dmamap")); + hn_chim_free(txr->hn_sc, txd->chim_index); + txd->chim_index = HN_NVS_CHIM_IDX_INVALID; + txd->chim_size = 0; + } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { + bus_dmamap_sync(txr->hn_tx_data_dtag, + txd->data_dmap, BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(txr->hn_tx_data_dtag, + txd->data_dmap); + txd->flags &= ~HN_TXD_FLAG_DMAMAP; + } + + if (txd->m != NULL) { + m_freem(txd->m); + txd->m = NULL; + } + + txd->flags |= HN_TXD_FLAG_ONLIST; +#ifndef HN_USE_TXDESC_BUFRING + mtx_lock_spin(&txr->hn_txlist_spin); + KASSERT(txr->hn_txdesc_avail >= 0 && + txr->hn_txdesc_avail < txr->hn_txdesc_cnt, + ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); + txr->hn_txdesc_avail++; + SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); + mtx_unlock_spin(&txr->hn_txlist_spin); +#else /* HN_USE_TXDESC_BUFRING */ +#ifdef HN_DEBUG + atomic_add_int(&txr->hn_txdesc_avail, 1); +#endif + buf_ring_enqueue(txr->hn_txdesc_br, txd); +#endif /* !HN_USE_TXDESC_BUFRING */ + + return 1; +} + +static __inline struct hn_txdesc * +hn_txdesc_get(struct hn_tx_ring *txr) +{ + struct hn_txdesc *txd; + +#ifndef HN_USE_TXDESC_BUFRING + mtx_lock_spin(&txr->hn_txlist_spin); + txd = SLIST_FIRST(&txr->hn_txlist); + if (txd != NULL) { + KASSERT(txr->hn_txdesc_avail > 0, + ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); + txr->hn_txdesc_avail--; + SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 
+ } + mtx_unlock_spin(&txr->hn_txlist_spin); +#else + txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); +#endif + + if (txd != NULL) { +#ifdef HN_USE_TXDESC_BUFRING +#ifdef HN_DEBUG + atomic_subtract_int(&txr->hn_txdesc_avail, 1); +#endif +#endif /* HN_USE_TXDESC_BUFRING */ + KASSERT(txd->m == NULL && txd->refs == 0 && + STAILQ_EMPTY(&txd->agg_list) && + txd->chim_index == HN_NVS_CHIM_IDX_INVALID && + txd->chim_size == 0 && + (txd->flags & HN_TXD_FLAG_ONLIST) && + (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && + (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); + txd->flags &= ~HN_TXD_FLAG_ONLIST; + txd->refs = 1; + } + return txd; +} + +static __inline void +hn_txdesc_hold(struct hn_txdesc *txd) +{ + + /* 0->1 transition will never work */ + KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); + atomic_add_int(&txd->refs, 1); +} + +static __inline void +hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) +{ + + KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, + ("recursive aggregation on aggregating txdesc")); + + KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, + ("already aggregated")); + KASSERT(STAILQ_EMPTY(&txd->agg_list), + ("recursive aggregation on to-be-aggregated txdesc")); + + txd->flags |= HN_TXD_FLAG_ONAGG; + STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); +} + +static bool +hn_tx_ring_pending(struct hn_tx_ring *txr) +{ + bool pending = false; + +#ifndef HN_USE_TXDESC_BUFRING + mtx_lock_spin(&txr->hn_txlist_spin); + if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) + pending = true; + mtx_unlock_spin(&txr->hn_txlist_spin); +#else + if (!buf_ring_full(txr->hn_txdesc_br)) + pending = true; +#endif + return (pending); +} + +static __inline void +hn_txeof(struct hn_tx_ring *txr) +{ + txr->hn_has_txeof = 0; + txr->hn_txeof(txr); +} + +static void +hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, + struct vmbus_channel *chan, const void *data __unused, int dlen __unused) +{ + struct hn_txdesc *txd = sndc->hn_cbarg; + 
struct hn_tx_ring *txr; + + txr = txd->txr; + KASSERT(txr->hn_chan == chan, + ("channel mismatch, on chan%u, should be chan%u", + vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); + + txr->hn_has_txeof = 1; + hn_txdesc_put(txr, txd); + + ++txr->hn_txdone_cnt; + if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { + txr->hn_txdone_cnt = 0; + if (txr->hn_oactive) + hn_txeof(txr); + } +} + +static void +hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) +{ +#if defined(INET) || defined(INET6) + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); + tcp_lro_flush_all(&rxr->hn_lro); + NET_EPOCH_EXIT(et); +#endif + + /* + * NOTE: + * 'txr' could be NULL, if multiple channels and + * ifnet.if_start method are enabled. + */ + if (txr == NULL || !txr->hn_has_txeof) + return; + + txr->hn_txdone_cnt = 0; + hn_txeof(txr); +} + +static __inline uint32_t +hn_rndis_pktmsg_offset(uint32_t ofs) +{ + + KASSERT(ofs >= sizeof(struct rndis_packet_msg), + ("invalid RNDIS packet msg offset %u", ofs)); + return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); +} + +static __inline void * +hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, + size_t pi_dlen, uint32_t pi_type) +{ + const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); + struct rndis_pktinfo *pi; + + KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, + ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); + + /* + * Per-packet-info does not move; it only grows. + * + * NOTE: + * rm_pktinfooffset in this phase counts from the beginning + * of rndis_packet_msg. 
+ */ + KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, + ("%u pktinfo overflows RNDIS packet msg", pi_type)); + pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + + pkt->rm_pktinfolen); + pkt->rm_pktinfolen += pi_size; + + pi->rm_size = pi_size; + pi->rm_type = pi_type; + pi->rm_internal = 0; + pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; + + return (pi->rm_data); +} + +static __inline int +hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) +{ + struct hn_txdesc *txd; + struct mbuf *m; + int error, pkts; + + txd = txr->hn_agg_txd; + KASSERT(txd != NULL, ("no aggregate txdesc")); + + /* + * Since hn_txpkt() will reset this temporary stat, save + * it now, so that oerrors can be updated properly, if + * hn_txpkt() ever fails. + */ + pkts = txr->hn_stat_pkts; + + /* + * Since txd's mbuf will _not_ be freed upon hn_txpkt() + * failure, save it for later freeing, if hn_txpkt() ever + * fails. + */ + m = txd->m; + error = hn_txpkt(ifp, txr, txd); + if (__predict_false(error)) { + /* txd is freed, but m is not. */ + m_freem(m); + + txr->hn_flush_failed++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); + } + + /* Reset all aggregation states. */ + txr->hn_agg_txd = NULL; + txr->hn_agg_szleft = 0; + txr->hn_agg_pktleft = 0; + txr->hn_agg_prevpkt = NULL; + + return (error); +} + +static void * +hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, + int pktsize) +{ + void *chim; + + if (txr->hn_agg_txd != NULL) { + if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { + struct hn_txdesc *agg_txd = txr->hn_agg_txd; + struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; + int olen; + + /* + * Update the previous RNDIS packet's total length, + * it can be increased due to the mandatory alignment + * padding for this RNDIS packet. And update the + * aggregating txdesc's chimney sending buffer size + * accordingly. + * + * XXX + * Zero-out the padding, as required by the RNDIS spec. 
+ */ + olen = pkt->rm_len; + pkt->rm_len = roundup2(olen, txr->hn_agg_align); + agg_txd->chim_size += pkt->rm_len - olen; + + /* Link this txdesc to the parent. */ + hn_txdesc_agg(agg_txd, txd); + + chim = (uint8_t *)pkt + pkt->rm_len; + /* Save the current packet for later fixup. */ + txr->hn_agg_prevpkt = chim; + + txr->hn_agg_pktleft--; + txr->hn_agg_szleft -= pktsize; + if (txr->hn_agg_szleft <= + HN_PKTSIZE_MIN(txr->hn_agg_align)) { + /* + * Probably can't aggregate more packets, + * flush this aggregating txdesc proactively. + */ + txr->hn_agg_pktleft = 0; + } + /* Done! */ + return (chim); + } + hn_flush_txagg(ifp, txr); + } + KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); + + txr->hn_tx_chimney_tried++; + txd->chim_index = hn_chim_alloc(txr->hn_sc); + if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) + return (NULL); + txr->hn_tx_chimney++; + + chim = txr->hn_sc->hn_chim + + (txd->chim_index * txr->hn_sc->hn_chim_szmax); + + if (txr->hn_agg_pktmax > 1 && + txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { + txr->hn_agg_txd = txd; + txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; + txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; + txr->hn_agg_prevpkt = chim; + } + return (chim); +} + +/* + * NOTE: + * If this function fails, then both txd and m_head0 will be freed. 
+ */ +static int +hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, + struct mbuf **m_head0) +{ + bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; + int error, nsegs, i; + struct mbuf *m_head = *m_head0; + struct rndis_packet_msg *pkt; + uint32_t *pi_data; + void *chim = NULL; + int pkt_hlen, pkt_size; + + pkt = txd->rndis_pkt; + pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); + if (pkt_size < txr->hn_chim_size) { + chim = hn_try_txagg(ifp, txr, txd, pkt_size); + if (chim != NULL) + pkt = chim; + } else { + if (txr->hn_agg_txd != NULL) + hn_flush_txagg(ifp, txr); + } + + pkt->rm_type = REMOTE_NDIS_PACKET_MSG; + pkt->rm_len = m_head->m_pkthdr.len; + pkt->rm_dataoffset = 0; + pkt->rm_datalen = m_head->m_pkthdr.len; + pkt->rm_oobdataoffset = 0; + pkt->rm_oobdatalen = 0; + pkt->rm_oobdataelements = 0; + pkt->rm_pktinfooffset = sizeof(*pkt); + pkt->rm_pktinfolen = 0; + pkt->rm_vchandle = 0; + pkt->rm_reserved = 0; + + if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { + /* + * Set the hash value for this packet. + */ + pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, + HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); + + if (M_HASHTYPE_ISHASH(m_head)) + /* + * The flowid field contains the hash value host + * set in the rx queue if it is a ip forwarding pkt. + * Set the same hash value so host can send on the + * cpu it was received. + */ + *pi_data = m_head->m_pkthdr.flowid; + else + /* + * Otherwise just put the tx queue index. 
+ */ + *pi_data = txr->hn_tx_idx; + } + + if (m_head->m_flags & M_VLANTAG) { + pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, + NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); + *pi_data = NDIS_VLAN_INFO_MAKE( + EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), + EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), + EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); + } + + if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { +#if defined(INET6) || defined(INET) + pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, + NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); +#ifdef INET + if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { + *pi_data = NDIS_LSO2_INFO_MAKEIPV4( + m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, + m_head->m_pkthdr.tso_segsz); + } +#endif +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET6 + { + *pi_data = NDIS_LSO2_INFO_MAKEIPV6( + m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, + m_head->m_pkthdr.tso_segsz); + } +#endif +#endif /* INET6 || INET */ + } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { + pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, + NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); + if (m_head->m_pkthdr.csum_flags & + (CSUM_IP6_TCP | CSUM_IP6_UDP)) { + *pi_data = NDIS_TXCSUM_INFO_IPV6; + } else { + *pi_data = NDIS_TXCSUM_INFO_IPV4; + if (m_head->m_pkthdr.csum_flags & CSUM_IP) + *pi_data |= NDIS_TXCSUM_INFO_IPCS; + } + + if (m_head->m_pkthdr.csum_flags & + (CSUM_IP_TCP | CSUM_IP6_TCP)) { + *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( + m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); + } else if (m_head->m_pkthdr.csum_flags & + (CSUM_IP_UDP | CSUM_IP6_UDP)) { + *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( + m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); + } + } + + pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; + /* Fixup RNDIS packet message total length */ + pkt->rm_len += pkt_hlen; + /* Convert RNDIS packet message offsets */ + pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); + 
pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); + + /* + * Fast path: Chimney sending. + */ + if (chim != NULL) { + struct hn_txdesc *tgt_txd = txd; + + if (txr->hn_agg_txd != NULL) { + tgt_txd = txr->hn_agg_txd; +#ifdef INVARIANTS + *m_head0 = NULL; +#endif + } + + KASSERT(pkt == chim, + ("RNDIS pkt not in chimney sending buffer")); + KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, + ("chimney sending buffer is not used")); + tgt_txd->chim_size += pkt->rm_len; + + m_copydata(m_head, 0, m_head->m_pkthdr.len, + ((uint8_t *)chim) + pkt_hlen); + + txr->hn_gpa_cnt = 0; + txr->hn_sendpkt = hn_txpkt_chim; + goto done; + } + + KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); + KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, + ("chimney buffer is used")); + KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); + + error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); + if (__predict_false(error)) { + int freed; + + /* + * This mbuf is not linked w/ the txd yet, so free it now. + */ + m_freem(m_head); + *m_head0 = NULL; + + freed = hn_txdesc_put(txr, txd); + KASSERT(freed != 0, + ("fail to free txd upon txdma error")); + + txr->hn_txdma_failed++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return error; + } + *m_head0 = m_head; + + /* +1 RNDIS packet message */ + txr->hn_gpa_cnt = nsegs + 1; + + /* send packet with page buffer */ + txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); + txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; + txr->hn_gpa[0].gpa_len = pkt_hlen; + + /* + * Fill the page buffers with mbuf info after the page + * buffer for RNDIS packet message. 
+ */ + for (i = 0; i < nsegs; ++i) { + struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; + + gpa->gpa_page = atop(segs[i].ds_addr); + gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; + gpa->gpa_len = segs[i].ds_len; + } + + txd->chim_index = HN_NVS_CHIM_IDX_INVALID; + txd->chim_size = 0; + txr->hn_sendpkt = hn_txpkt_sglist; +done: + txd->m = m_head; + + /* Set the completion routine */ + hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); + + /* Update temporary stats for later use. */ + txr->hn_stat_pkts++; + txr->hn_stat_size += m_head->m_pkthdr.len; + if (m_head->m_flags & M_MCAST) + txr->hn_stat_mcasts++; + + return 0; +} + +/* + * NOTE: + * If this function fails, then txd will be freed, but the mbuf + * associated w/ the txd will _not_ be freed. + */ +static int +hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) +{ + int error, send_failed = 0, has_bpf; + +again: + has_bpf = bpf_peers_present(ifp->if_bpf); + if (has_bpf) { + /* + * Make sure that this txd and any aggregated txds are not + * freed before ETHER_BPF_MTAP. + */ + hn_txdesc_hold(txd); + } + error = txr->hn_sendpkt(txr, txd); + if (!error) { + if (has_bpf) { + const struct hn_txdesc *tmp_txd; + + ETHER_BPF_MTAP(ifp, txd->m); + STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) + ETHER_BPF_MTAP(ifp, tmp_txd->m); + } + + if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); +#ifdef HN_IFSTART_SUPPORT + if (!hn_use_if_start) +#endif + { + if_inc_counter(ifp, IFCOUNTER_OBYTES, + txr->hn_stat_size); + if (txr->hn_stat_mcasts != 0) { + if_inc_counter(ifp, IFCOUNTER_OMCASTS, + txr->hn_stat_mcasts); + } + } + txr->hn_pkts += txr->hn_stat_pkts; + txr->hn_sends++; + } + if (has_bpf) + hn_txdesc_put(txr, txd); + + if (__predict_false(error)) { + int freed; + + /* + * This should "really rarely" happen. + * + * XXX Too many RX to be acked or too many sideband + * commands to run? Ask netvsc_channel_rollup() + * to kick start later. 
+ */ + txr->hn_has_txeof = 1; + if (!send_failed) { + txr->hn_send_failed++; + send_failed = 1; + /* + * Try sending again after set hn_has_txeof; + * in case that we missed the last + * netvsc_channel_rollup(). + */ + goto again; + } + if_printf(ifp, "send failed\n"); + + /* + * Caller will perform further processing on the + * associated mbuf, so don't free it in hn_txdesc_put(); + * only unload it from the DMA map in hn_txdesc_put(), + * if it was loaded. + */ + txd->m = NULL; + freed = hn_txdesc_put(txr, txd); + KASSERT(freed != 0, + ("fail to free txd upon send error")); + + txr->hn_send_failed++; + } + + /* Reset temporary stats, after this sending is done. */ + txr->hn_stat_size = 0; + txr->hn_stat_pkts = 0; + txr->hn_stat_mcasts = 0; + + return (error); +} + +/* + * Append the specified data to the indicated mbuf chain, + * Extend the mbuf chain if the new data does not fit in + * existing space. + * + * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. + * There should be an equivalent in the kernel mbuf code, + * but there does not appear to be one yet. + * + * Differs from m_append() in that additional mbufs are + * allocated with cluster size MJUMPAGESIZE, and filled + * accordingly. + * + * Return the last mbuf in the chain or NULL if failed to + * allocate new mbuf. + */ +static struct mbuf * +hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) +{ + struct mbuf *m, *n; + int remainder, space; + + for (m = m0; m->m_next != NULL; m = m->m_next) + ; + remainder = len; + space = M_TRAILINGSPACE(m); + if (space > 0) { + /* + * Copy into available space. + */ + if (space > remainder) + space = remainder; + bcopy(cp, mtod(m, caddr_t) + m->m_len, space); + m->m_len += space; + cp += space; + remainder -= space; + } + while (remainder > 0) { + /* + * Allocate a new mbuf; could check space + * and allocate a cluster instead. 
+ */ + n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); + if (n == NULL) + return NULL; + n->m_len = min(MJUMPAGESIZE, remainder); + bcopy(cp, mtod(n, caddr_t), n->m_len); + cp += n->m_len; + remainder -= n->m_len; + m->m_next = n; + m = n; + } + + return m; +} + +#if defined(INET) || defined(INET6) +static __inline int +hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) +{ +#if __FreeBSD_version >= 1100095 + if (hn_lro_mbufq_depth) { + tcp_lro_queue_mbuf(lc, m); + return 0; + } +#endif + return tcp_lro_rx(lc, m, 0); +} +#endif + +static int +hn_rxpkt(struct hn_rx_ring *rxr) +{ + struct ifnet *ifp, *hn_ifp = rxr->hn_ifp; + struct mbuf *m_new, *n; + int size, do_lro = 0, do_csum = 1, is_vf = 0; + int hash_type = M_HASHTYPE_NONE; + int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; + int i; + + ifp = hn_ifp; + if (rxr->hn_rxvf_ifp != NULL) { + /* + * Non-transparent mode VF; pretend this packet is from + * the VF. + */ + ifp = rxr->hn_rxvf_ifp; + is_vf = 1; + } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { + /* Transparent mode VF. */ + is_vf = 1; + } + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + /* + * NOTE: + * See the NOTE of hn_rndis_init_fixat(). This + * function can be reached, immediately after the + * RNDIS is initialized but before the ifnet is + * setup on the hn_attach() path; drop the unexpected + * packets. + */ + return (0); + } + + if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { + if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); + return (0); + } + + if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { + m_new = m_gethdr(M_NOWAIT, MT_DATA); + if (m_new == NULL) { + if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); + return (0); + } + memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], + rxr->rsc.frag_len[0]); + m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; + } else { + /* + * Get an mbuf with a cluster. For packets 2K or less, + * get a standard 2K cluster. For anything larger, get a + * 4K cluster. 
Any buffers larger than 4K can cause problems + * if looped around to the Hyper-V TX channel, so avoid them. + */ + size = MCLBYTES; + if (rxr->rsc.pktlen > MCLBYTES) { + /* 4096 */ + size = MJUMPAGESIZE; + } + + m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); + if (m_new == NULL) { + if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); + return (0); + } + + n = m_new; + for (i = 0; i < rxr->rsc.cnt; i++) { + n = hv_m_append(n, rxr->rsc.frag_len[i], + rxr->rsc.frag_data[i]); + if (n == NULL) { + if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); + return (0); + } else { + m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; + } + } + } + if (rxr->rsc.pktlen <= MHLEN) + rxr->hn_small_pkts++; + + m_new->m_pkthdr.rcvif = ifp; + + if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0)) + do_csum = 0; + + /* receive side checksum offload */ + if (rxr->rsc.csum_info != NULL) { + /* IP csum offload */ + if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { + m_new->m_pkthdr.csum_flags |= + (CSUM_IP_CHECKED | CSUM_IP_VALID); + rxr->hn_csum_ip++; + } + + /* TCP/UDP csum offload */ + if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | + NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { + m_new->m_pkthdr.csum_flags |= + (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + m_new->m_pkthdr.csum_data = 0xffff; + if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) + rxr->hn_csum_tcp++; + else + rxr->hn_csum_udp++; + } + + /* + * XXX + * As of this write (Oct 28th, 2016), host side will turn + * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so + * the do_lro setting here is actually _not_ accurate. We + * depend on the RSS hash type check to reset do_lro. 
+ */ + if ((*(rxr->rsc.csum_info) & + (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == + (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) + do_lro = 1; + } else { + hn_rxpkt_proto(m_new, &l3proto, &l4proto); + if (l3proto == ETHERTYPE_IP) { + if (l4proto == IPPROTO_TCP) { + if (do_csum && + (rxr->hn_trust_hcsum & + HN_TRUST_HCSUM_TCP)) { + rxr->hn_csum_trusted++; + m_new->m_pkthdr.csum_flags |= + (CSUM_IP_CHECKED | CSUM_IP_VALID | + CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + m_new->m_pkthdr.csum_data = 0xffff; + } + do_lro = 1; + } else if (l4proto == IPPROTO_UDP) { + if (do_csum && + (rxr->hn_trust_hcsum & + HN_TRUST_HCSUM_UDP)) { + rxr->hn_csum_trusted++; + m_new->m_pkthdr.csum_flags |= + (CSUM_IP_CHECKED | CSUM_IP_VALID | + CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + m_new->m_pkthdr.csum_data = 0xffff; + } + } else if (l4proto != IPPROTO_DONE && do_csum && + (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { + rxr->hn_csum_trusted++; + m_new->m_pkthdr.csum_flags |= + (CSUM_IP_CHECKED | CSUM_IP_VALID); + } + } + } + + if (rxr->rsc.vlan_info != NULL) { + m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( + NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), + NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), + NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); + m_new->m_flags |= M_VLANTAG; + } + + /* + * If VF is activated (tranparent/non-transparent mode does not + * matter here). + * + * - Disable LRO + * + * hn(4) will only receive broadcast packets, multicast packets, + * TCP SYN and SYN|ACK (in Azure), LRO is useless for these + * packet types. + * + * For non-transparent, we definitely _cannot_ enable LRO at + * all, since the LRO flush will use hn(4) as the receiving + * interface; i.e. hn_ifp->if_input(hn_ifp, m). + */ + if (is_vf) + do_lro = 0; + + /* + * If VF is activated (tranparent/non-transparent mode does not + * matter here), do _not_ mess with unsupported hash types or + * functions. 
+ */ + if (rxr->rsc.hash_info != NULL) { + rxr->hn_rss_pkts++; + m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); + if (!is_vf) + hash_type = M_HASHTYPE_OPAQUE_HASH; + if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == + NDIS_HASH_FUNCTION_TOEPLITZ) { + uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & + rxr->hn_mbuf_hash); + + /* + * NOTE: + * do_lro is resetted, if the hash types are not TCP + * related. See the comment in the above csum_flags + * setup section. + */ + switch (type) { + case NDIS_HASH_IPV4: + hash_type = M_HASHTYPE_RSS_IPV4; + do_lro = 0; + break; + + case NDIS_HASH_TCP_IPV4: + hash_type = M_HASHTYPE_RSS_TCP_IPV4; + if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { + int def_htype = M_HASHTYPE_OPAQUE_HASH; + + if (is_vf) + def_htype = M_HASHTYPE_NONE; + + /* + * UDP 4-tuple hash is delivered as + * TCP 4-tuple hash. + */ + if (l3proto == ETHERTYPE_MAX) { + hn_rxpkt_proto(m_new, + &l3proto, &l4proto); + } + if (l3proto == ETHERTYPE_IP) { + if (l4proto == IPPROTO_UDP && + (rxr->hn_mbuf_hash & + NDIS_HASH_UDP_IPV4_X)) { + hash_type = + M_HASHTYPE_RSS_UDP_IPV4; + do_lro = 0; + } else if (l4proto != + IPPROTO_TCP) { + hash_type = def_htype; + do_lro = 0; + } + } else { + hash_type = def_htype; + do_lro = 0; + } + } + break; + + case NDIS_HASH_IPV6: + hash_type = M_HASHTYPE_RSS_IPV6; + do_lro = 0; + break; + + case NDIS_HASH_IPV6_EX: + hash_type = M_HASHTYPE_RSS_IPV6_EX; + do_lro = 0; + break; + + case NDIS_HASH_TCP_IPV6: + hash_type = M_HASHTYPE_RSS_TCP_IPV6; + break; + + case NDIS_HASH_TCP_IPV6_EX: + hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; + break; + } + } + } else if (!is_vf) { + m_new->m_pkthdr.flowid = rxr->hn_rx_idx; + hash_type = M_HASHTYPE_OPAQUE; + } + M_HASHTYPE_SET(m_new, hash_type); + + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + if (hn_ifp != ifp) { + const struct ether_header *eh; + + /* + * Non-transparent mode VF is activated. + */ + + /* + * Allow tapping on hn(4). 
+ */ + ETHER_BPF_MTAP(hn_ifp, m_new); + + /* + * Update hn(4)'s stats. + */ + if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); + if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); + /* Checked at the beginning of this function. */ + KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); + eh = mtod(m_new, struct ether_header *); + if (ETHER_IS_MULTICAST(eh->ether_dhost)) + if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); + } + rxr->hn_pkts++; + + if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) { +#if defined(INET) || defined(INET6) + struct lro_ctrl *lro = &rxr->hn_lro; + + if (lro->lro_cnt) { + rxr->hn_lro_tried++; + if (hn_lro_rx(lro, m_new) == 0) { + /* DONE! */ + return 0; + } + } +#endif + } + ifp->if_input(ifp, m_new); + + return (0); +} + +static int +hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct hn_softc *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *)data, ifr_vf; + struct ifnet *vf_ifp; + int mask, error = 0; + struct ifrsskey *ifrk; + struct ifrsshash *ifrh; + uint32_t mtu; + + switch (cmd) { + case SIOCSIFMTU: + if (ifr->ifr_mtu > HN_MTU_MAX) { + error = EINVAL; + break; + } + + HN_LOCK(sc); + + if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { + HN_UNLOCK(sc); + break; + } + + if ((sc->hn_caps & HN_CAP_MTU) == 0) { + /* Can't change MTU */ + HN_UNLOCK(sc); + error = EOPNOTSUPP; + break; + } + + if (ifp->if_mtu == ifr->ifr_mtu) { + HN_UNLOCK(sc); + break; + } + + if (hn_xpnt_vf_isready(sc)) { + vf_ifp = sc->hn_vf_ifp; + ifr_vf = *ifr; + strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname, + sizeof(ifr_vf.ifr_name)); + error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, + (caddr_t)&ifr_vf); + if (error) { + HN_UNLOCK(sc); + if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", + vf_ifp->if_xname, ifr->ifr_mtu, error); + break; + } + } + + /* + * Suspend this interface before the synthetic parts + * are ripped. + */ + hn_suspend(sc); + + /* + * Detach the synthetics parts, i.e. NVS and RNDIS. 
+ */ + hn_synth_detach(sc); + + /* + * Reattach the synthetic parts, i.e. NVS and RNDIS, + * with the new MTU setting. + */ + error = hn_synth_attach(sc, ifr->ifr_mtu); + if (error) { + HN_UNLOCK(sc); + break; + } + + error = hn_rndis_get_mtu(sc, &mtu); + if (error) + mtu = ifr->ifr_mtu; + else if (bootverbose) + if_printf(ifp, "RNDIS mtu %u\n", mtu); + + /* + * Commit the requested MTU, after the synthetic parts + * have been successfully attached. + */ + if (mtu >= ifr->ifr_mtu) { + mtu = ifr->ifr_mtu; + } else { + if_printf(ifp, "fixup mtu %d -> %u\n", + ifr->ifr_mtu, mtu); + } + ifp->if_mtu = mtu; + + /* + * Synthetic parts' reattach may change the chimney + * sending size; update it. + */ + if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) + hn_set_chim_size(sc, sc->hn_chim_szmax); + + /* + * Make sure that various parameters based on MTU are + * still valid, after the MTU change. + */ + hn_mtu_change_fixup(sc); + + /* + * All done! Resume the interface now. + */ + hn_resume(sc); + + if ((sc->hn_flags & HN_FLAG_RXVF) || + (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { + /* + * Since we have reattached the NVS part, + * change the datapath to VF again; in case + * that it is lost, after the NVS was detached. + */ + hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); + } + + HN_UNLOCK(sc); + break; + + case SIOCSIFFLAGS: + HN_LOCK(sc); + + if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { + HN_UNLOCK(sc); + break; + } + + if (hn_xpnt_vf_isready(sc)) + hn_xpnt_vf_saveifflags(sc); + + if (ifp->if_flags & IFF_UP) { + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + /* + * Caller meight hold mutex, e.g. + * bpf; use busy-wait for the RNDIS + * reply. 
+ */ + HN_NO_SLEEPING(sc); + hn_rxfilter_config(sc); + HN_SLEEPING_OK(sc); + + if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) + error = hn_xpnt_vf_iocsetflags(sc); + } else { + hn_init_locked(sc); + } + } else { + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + hn_stop(sc, false); + } + sc->hn_if_flags = ifp->if_flags; + + HN_UNLOCK(sc); + break; + + case SIOCSIFCAP: + HN_LOCK(sc); + + if (hn_xpnt_vf_isready(sc)) { + ifr_vf = *ifr; + strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, + sizeof(ifr_vf.ifr_name)); + error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); + HN_UNLOCK(sc); + break; + } + + /* + * Fix up requested capabilities w/ supported capabilities, + * since the supported capabilities could have been changed. + */ + mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ + ifp->if_capenable; + + if (mask & IFCAP_TXCSUM) { + ifp->if_capenable ^= IFCAP_TXCSUM; + if (ifp->if_capenable & IFCAP_TXCSUM) + ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); + else + ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); + } + if (mask & IFCAP_TXCSUM_IPV6) { + ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; + if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) + ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); + else + ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); + } + + /* TODO: flip RNDIS offload parameters for RXCSUM. */ + if (mask & IFCAP_RXCSUM) + ifp->if_capenable ^= IFCAP_RXCSUM; +#ifdef foo + /* We can't diff IPv6 packets from IPv4 packets on RX path. 
*/ + if (mask & IFCAP_RXCSUM_IPV6) + ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; +#endif + + if (mask & IFCAP_LRO) + ifp->if_capenable ^= IFCAP_LRO; + + if (mask & IFCAP_TSO4) { + ifp->if_capenable ^= IFCAP_TSO4; + if (ifp->if_capenable & IFCAP_TSO4) + ifp->if_hwassist |= CSUM_IP_TSO; + else + ifp->if_hwassist &= ~CSUM_IP_TSO; + } + if (mask & IFCAP_TSO6) { + ifp->if_capenable ^= IFCAP_TSO6; + if (ifp->if_capenable & IFCAP_TSO6) + ifp->if_hwassist |= CSUM_IP6_TSO; + else + ifp->if_hwassist &= ~CSUM_IP6_TSO; + } + + HN_UNLOCK(sc); + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + HN_LOCK(sc); + + if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { + HN_UNLOCK(sc); + break; + } + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + /* + * Multicast uses mutex; use busy-wait for + * the RNDIS reply. + */ + HN_NO_SLEEPING(sc); + hn_rxfilter_config(sc); + HN_SLEEPING_OK(sc); + } + + /* XXX vlan(4) style mcast addr maintenance */ + if (hn_xpnt_vf_isready(sc)) { + int old_if_flags; + + old_if_flags = sc->hn_vf_ifp->if_flags; + hn_xpnt_vf_saveifflags(sc); + + if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && + ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & + IFF_ALLMULTI)) + error = hn_xpnt_vf_iocsetflags(sc); + } + + HN_UNLOCK(sc); + break; + + case SIOCSIFMEDIA: + case SIOCGIFMEDIA: + HN_LOCK(sc); + if (hn_xpnt_vf_isready(sc)) { + /* + * SIOCGIFMEDIA expects ifmediareq, so don't + * create and pass ifr_vf to the VF here; just + * replace the ifr_name. + */ + vf_ifp = sc->hn_vf_ifp; + strlcpy(ifr->ifr_name, vf_ifp->if_xname, + sizeof(ifr->ifr_name)); + error = vf_ifp->if_ioctl(vf_ifp, cmd, data); + /* Restore the ifr_name. 
*/ + strlcpy(ifr->ifr_name, ifp->if_xname, + sizeof(ifr->ifr_name)); + HN_UNLOCK(sc); + break; + } + HN_UNLOCK(sc); + error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); + break; + + case SIOCGIFRSSHASH: + ifrh = (struct ifrsshash *)data; + HN_LOCK(sc); + if (sc->hn_rx_ring_inuse == 1) { + HN_UNLOCK(sc); + ifrh->ifrh_func = RSS_FUNC_NONE; + ifrh->ifrh_types = 0; + break; + } + + if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) + ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; + else + ifrh->ifrh_func = RSS_FUNC_PRIVATE; + ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); + HN_UNLOCK(sc); + break; + + case SIOCGIFRSSKEY: + ifrk = (struct ifrsskey *)data; + HN_LOCK(sc); + if (sc->hn_rx_ring_inuse == 1) { + HN_UNLOCK(sc); + ifrk->ifrk_func = RSS_FUNC_NONE; + ifrk->ifrk_keylen = 0; + break; + } + if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) + ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; + else + ifrk->ifrk_func = RSS_FUNC_PRIVATE; + ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; + memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, + NDIS_HASH_KEYSIZE_TOEPLITZ); + HN_UNLOCK(sc); + break; + + default: + error = ether_ioctl(ifp, cmd, data); + break; + } + return (error); +} + +static void +hn_stop(struct hn_softc *sc, bool detaching) +{ + struct ifnet *ifp = sc->hn_ifp; + int i; + + HN_LOCK_ASSERT(sc); + + KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, + ("synthetic parts were not attached")); + + /* Clear RUNNING bit ASAP. */ + atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); + + /* Disable polling. */ + hn_polling(sc, 0); + + if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { + KASSERT(sc->hn_vf_ifp != NULL, + ("%s: VF is not attached", ifp->if_xname)); + + /* Mark transparent mode VF as disabled. */ + hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); + + /* + * NOTE: + * Datapath setting must happen _before_ bringing + * the VF down. + */ + hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); + + /* + * Bring the VF down. 
+ */ + hn_xpnt_vf_saveifflags(sc); + sc->hn_vf_ifp->if_flags &= ~IFF_UP; + hn_xpnt_vf_iocsetflags(sc); + } + + /* Suspend data transfers. */ + hn_suspend_data(sc); + + /* Clear OACTIVE bit. */ + atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) + sc->hn_tx_ring[i].hn_oactive = 0; + + /* + * If the non-transparent mode VF is active, make sure + * that the RX filter still allows packet reception. + */ + if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) + hn_rxfilter_config(sc); +} + +static void +hn_init_locked(struct hn_softc *sc) +{ + struct ifnet *ifp = sc->hn_ifp; + int i; + + HN_LOCK_ASSERT(sc); + + if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) + return; + + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + return; + + /* Configure RX filter */ + hn_rxfilter_config(sc); + + /* Clear OACTIVE bit. */ + atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) + sc->hn_tx_ring[i].hn_oactive = 0; + + /* Clear TX 'suspended' bit. */ + hn_resume_tx(sc, sc->hn_tx_ring_inuse); + + if (hn_xpnt_vf_isready(sc)) { + /* Initialize transparent VF. */ + hn_xpnt_vf_init(sc); + } + + /* Everything is ready; unleash! */ + atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); + + /* Re-enable polling if requested. 
*/ + if (sc->hn_pollhz > 0) + hn_polling(sc, sc->hn_pollhz); +} + +static void +hn_init(void *xsc) +{ + struct hn_softc *sc = xsc; + + HN_LOCK(sc); + hn_init_locked(sc); + HN_UNLOCK(sc); +} + +#if __FreeBSD_version >= 1100099 + +static int +hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + unsigned int lenlim; + int error; + + lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; + error = sysctl_handle_int(oidp, &lenlim, 0, req); + if (error || req->newptr == NULL) + return error; + + HN_LOCK(sc); + if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || + lenlim > TCP_LRO_LENGTH_MAX) { + HN_UNLOCK(sc); + return EINVAL; + } + hn_set_lro_lenlim(sc, lenlim); + HN_UNLOCK(sc); + + return 0; +} + +static int +hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ackcnt, error, i; + + /* + * lro_ackcnt_lim is append count limit, + * +1 to turn it into aggregation limit. + */ + ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; + error = sysctl_handle_int(oidp, &ackcnt, 0, req); + if (error || req->newptr == NULL) + return error; + + if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) + return EINVAL; + + /* + * Convert aggregation limit back to append + * count limit. 
+ */ + --ackcnt; + HN_LOCK(sc); + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) + sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; + HN_UNLOCK(sc); + return 0; +} + +#endif + +static int +hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int hcsum = arg2; + int on, error, i; + + on = 0; + if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) + on = 1; + + error = sysctl_handle_int(oidp, &on, 0, req); + if (error || req->newptr == NULL) + return error; + + HN_LOCK(sc); + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; + + if (on) + rxr->hn_trust_hcsum |= hcsum; + else + rxr->hn_trust_hcsum &= ~hcsum; + } + HN_UNLOCK(sc); + return 0; +} + +static int +hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int chim_size, error; + + chim_size = sc->hn_tx_ring[0].hn_chim_size; + error = sysctl_handle_int(oidp, &chim_size, 0, req); + if (error || req->newptr == NULL) + return error; + + if (chim_size > sc->hn_chim_szmax || chim_size <= 0) + return EINVAL; + + HN_LOCK(sc); + hn_set_chim_size(sc, chim_size); + HN_UNLOCK(sc); + return 0; +} + +#if __FreeBSD_version < 1100095 +static int +hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error; + struct hn_rx_ring *rxr; + uint64_t stat; + + stat = 0; + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + stat += *((int *)((uint8_t *)rxr + ofs)); + } + + error = sysctl_handle_64(oidp, &stat, 0, req); + if (error || req->newptr == NULL) + return error; + + /* Zero out this stat. 
*/ + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + *((int *)((uint8_t *)rxr + ofs)) = 0; + } + return 0; +} +#else +static int +hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error; + struct hn_rx_ring *rxr; + uint64_t stat; + + stat = 0; + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + stat += *((uint64_t *)((uint8_t *)rxr + ofs)); + } + + error = sysctl_handle_64(oidp, &stat, 0, req); + if (error || req->newptr == NULL) + return error; + + /* Zero out this stat. */ + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; + } + return 0; +} + +#endif + +static int +hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error; + struct hn_rx_ring *rxr; + u_long stat; + + stat = 0; + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + stat += *((u_long *)((uint8_t *)rxr + ofs)); + } + + error = sysctl_handle_long(oidp, &stat, 0, req); + if (error || req->newptr == NULL) + return error; + + /* Zero out this stat. */ + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + rxr = &sc->hn_rx_ring[i]; + *((u_long *)((uint8_t *)rxr + ofs)) = 0; + } + return 0; +} + +static int +hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error; + struct hn_tx_ring *txr; + u_long stat; + + stat = 0; + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + txr = &sc->hn_tx_ring[i]; + stat += *((u_long *)((uint8_t *)txr + ofs)); + } + + error = sysctl_handle_long(oidp, &stat, 0, req); + if (error || req->newptr == NULL) + return error; + + /* Zero out this stat. 
*/ + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + txr = &sc->hn_tx_ring[i]; + *((u_long *)((uint8_t *)txr + ofs)) = 0; + } + return 0; +} + +static int +hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int ofs = arg2, i, error, conf; + struct hn_tx_ring *txr; + + txr = &sc->hn_tx_ring[0]; + conf = *((int *)((uint8_t *)txr + ofs)); + + error = sysctl_handle_int(oidp, &conf, 0, req); + if (error || req->newptr == NULL) + return error; + + HN_LOCK(sc); + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + txr = &sc->hn_tx_ring[i]; + *((int *)((uint8_t *)txr + ofs)) = conf; + } + HN_UNLOCK(sc); + + return 0; +} + +static int +hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int error, size; + + size = sc->hn_agg_size; + error = sysctl_handle_int(oidp, &size, 0, req); + if (error || req->newptr == NULL) + return (error); + + HN_LOCK(sc); + sc->hn_agg_size = size; + hn_set_txagg(sc); + HN_UNLOCK(sc); + + return (0); +} + +static int +hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int error, pkts; + + pkts = sc->hn_agg_pkts; + error = sysctl_handle_int(oidp, &pkts, 0, req); + if (error || req->newptr == NULL) + return (error); + + HN_LOCK(sc); + sc->hn_agg_pkts = pkts; + hn_set_txagg(sc); + HN_UNLOCK(sc); + + return (0); +} + +static int +hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int pkts; + + pkts = sc->hn_tx_ring[0].hn_agg_pktmax; + return (sysctl_handle_int(oidp, &pkts, 0, req)); +} + +static int +hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int align; + + align = sc->hn_tx_ring[0].hn_agg_align; + return (sysctl_handle_int(oidp, &align, 0, req)); +} + +static void +hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) +{ + if (pollhz == 0) + vmbus_chan_poll_disable(chan); + else + vmbus_chan_poll_enable(chan, pollhz); +} + +static void +hn_polling(struct hn_softc *sc, u_int pollhz) +{ + int nsubch = 
sc->hn_rx_ring_inuse - 1; + + HN_LOCK_ASSERT(sc); + + if (nsubch > 0) { + struct vmbus_channel **subch; + int i; + + subch = vmbus_subchan_get(sc->hn_prichan, nsubch); + for (i = 0; i < nsubch; ++i) + hn_chan_polling(subch[i], pollhz); + vmbus_subchan_rel(subch, nsubch); + } + hn_chan_polling(sc->hn_prichan, pollhz); +} + +static int +hn_polling_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int pollhz, error; + + pollhz = sc->hn_pollhz; + error = sysctl_handle_int(oidp, &pollhz, 0, req); + if (error || req->newptr == NULL) + return (error); + + if (pollhz != 0 && + (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) + return (EINVAL); + + HN_LOCK(sc); + if (sc->hn_pollhz != pollhz) { + sc->hn_pollhz = pollhz; + if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && + (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) + hn_polling(sc, sc->hn_pollhz); + } + HN_UNLOCK(sc); + + return (0); +} + +static int +hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + char verstr[16]; + + snprintf(verstr, sizeof(verstr), "%u.%u", + HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), + HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); + return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); +} + +static int +hn_caps_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + char caps_str[128]; + uint32_t caps; + + HN_LOCK(sc); + caps = sc->hn_caps; + HN_UNLOCK(sc); + snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); + return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); +} + +static int +hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + char assist_str[128]; + uint32_t hwassist; + + HN_LOCK(sc); + hwassist = sc->hn_ifp->if_hwassist; + HN_UNLOCK(sc); + snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); + return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); +} + +static int +hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + char 
filter_str[128]; + uint32_t filter; + + HN_LOCK(sc); + filter = sc->hn_rx_filter; + HN_UNLOCK(sc); + snprintf(filter_str, sizeof(filter_str), "%b", filter, + NDIS_PACKET_TYPES); + return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); +} + +#ifndef RSS + +static int +hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int error; + + HN_LOCK(sc); + + error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); + if (error || req->newptr == NULL) + goto back; + + if ((sc->hn_flags & HN_FLAG_RXVF) || + (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { + /* + * RSS key is synchronized w/ VF's, don't allow users + * to change it. + */ + error = EBUSY; + goto back; + } + + error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); + if (error) + goto back; + sc->hn_flags |= HN_FLAG_HAS_RSSKEY; + + if (sc->hn_rx_ring_inuse > 1) { + error = hn_rss_reconfig(sc); + } else { + /* Not RSS capable, at least for now; just save the RSS key. */ + error = 0; + } +back: + HN_UNLOCK(sc); + return (error); +} + +static int +hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int error; + + HN_LOCK(sc); + + error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); + if (error || req->newptr == NULL) + goto back; + + /* + * Don't allow RSS indirect table change, if this interface is not + * RSS capable currently. 
+ */ + if (sc->hn_rx_ring_inuse == 1) { + error = EOPNOTSUPP; + goto back; + } + + error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); + if (error) + goto back; + sc->hn_flags |= HN_FLAG_HAS_RSSIND; + + hn_rss_ind_fixup(sc); + error = hn_rss_reconfig(sc); +back: + HN_UNLOCK(sc); + return (error); +} + +#endif /* !RSS */ + +static int +hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + char hash_str[128]; + uint32_t hash; + + HN_LOCK(sc); + hash = sc->hn_rss_hash; + HN_UNLOCK(sc); + snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); + return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); +} + +static int +hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + char hash_str[128]; + uint32_t hash; + + HN_LOCK(sc); + hash = sc->hn_rss_hcap; + HN_UNLOCK(sc); + snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); + return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); +} + +static int +hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + char hash_str[128]; + uint32_t hash; + + HN_LOCK(sc); + hash = sc->hn_rx_ring[0].hn_mbuf_hash; + HN_UNLOCK(sc); + snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); + return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); +} + +static int +hn_vf_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + char vf_name[IFNAMSIZ + 1]; + struct ifnet *vf_ifp; + + HN_LOCK(sc); + vf_name[0] = '\0'; + vf_ifp = sc->hn_vf_ifp; + if (vf_ifp != NULL) + snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); + HN_UNLOCK(sc); + return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); +} + +static int +hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + char vf_name[IFNAMSIZ + 1]; + struct ifnet *vf_ifp; + + HN_LOCK(sc); + vf_name[0] = '\0'; + vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; + if (vf_ifp != NULL) + snprintf(vf_name, sizeof(vf_name), "%s", 
vf_ifp->if_xname); + HN_UNLOCK(sc); + return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); +} + +static int +hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct rm_priotracker pt; + struct sbuf *sb; + int error, i; + bool first; + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + + sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); + if (sb == NULL) + return (ENOMEM); + + rm_rlock(&hn_vfmap_lock, &pt); + + first = true; + for (i = 0; i < hn_vfmap_size; ++i) { + struct ifnet *ifp; + + if (hn_vfmap[i] == NULL) + continue; + + ifp = ifnet_byindex(i); + if (ifp != NULL) { + if (first) + sbuf_printf(sb, "%s", ifp->if_xname); + else + sbuf_printf(sb, " %s", ifp->if_xname); + first = false; + } + } + + rm_runlock(&hn_vfmap_lock, &pt); + + error = sbuf_finish(sb); + sbuf_delete(sb); + return (error); +} + +static int +hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct rm_priotracker pt; + struct sbuf *sb; + int error, i; + bool first; + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + + sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); + if (sb == NULL) + return (ENOMEM); + + rm_rlock(&hn_vfmap_lock, &pt); + + first = true; + for (i = 0; i < hn_vfmap_size; ++i) { + struct ifnet *ifp, *hn_ifp; + + hn_ifp = hn_vfmap[i]; + if (hn_ifp == NULL) + continue; + + ifp = ifnet_byindex(i); + if (ifp != NULL) { + if (first) { + sbuf_printf(sb, "%s:%s", ifp->if_xname, + hn_ifp->if_xname); + } else { + sbuf_printf(sb, " %s:%s", ifp->if_xname, + hn_ifp->if_xname); + } + first = false; + } + } + + rm_runlock(&hn_vfmap_lock, &pt); + + error = sbuf_finish(sb); + sbuf_delete(sb); + return (error); +} + +static int +hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int error, onoff = 0; + + if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) + onoff = 1; + error = sysctl_handle_int(oidp, &onoff, 0, req); + if (error || req->newptr == NULL) + return (error); + + HN_LOCK(sc); + /* NOTE: hn_vf_lock for 
hn_transmit() */ + rm_wlock(&sc->hn_vf_lock); + if (onoff) + sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; + else + sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; + rm_wunlock(&sc->hn_vf_lock); + HN_UNLOCK(sc); + + return (0); +} + +static int +hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int enabled = 0; + + if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) + enabled = 1; + return (sysctl_handle_int(oidp, &enabled, 0, req)); +} + +static int +hn_check_iplen(const struct mbuf *m, int hoff) +{ + const struct ip *ip; + int len, iphlen, iplen; + const struct tcphdr *th; + int thoff; /* TCP data offset */ + + len = hoff + sizeof(struct ip); + + /* The packet must be at least the size of an IP header. */ + if (m->m_pkthdr.len < len) + return IPPROTO_DONE; + + /* The fixed IP header must reside completely in the first mbuf. */ + if (m->m_len < len) + return IPPROTO_DONE; + + ip = mtodo(m, hoff); + + /* Bound check the packet's stated IP header length. */ + iphlen = ip->ip_hl << 2; + if (iphlen < sizeof(struct ip)) /* minimum header length */ + return IPPROTO_DONE; + + /* The full IP header must reside completely in the one mbuf. */ + if (m->m_len < hoff + iphlen) + return IPPROTO_DONE; + + iplen = ntohs(ip->ip_len); + + /* + * Check that the amount of data in the buffers is as + * at least much as the IP header would have us expect. + */ + if (m->m_pkthdr.len < hoff + iplen) + return IPPROTO_DONE; + + /* + * Ignore IP fragments. + */ + if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) + return IPPROTO_DONE; + + /* + * The TCP/IP or UDP/IP header must be entirely contained within + * the first fragment of a packet. 
+ */ + switch (ip->ip_p) { + case IPPROTO_TCP: + if (iplen < iphlen + sizeof(struct tcphdr)) + return IPPROTO_DONE; + if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) + return IPPROTO_DONE; + th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); + thoff = th->th_off << 2; + if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) + return IPPROTO_DONE; + if (m->m_len < hoff + iphlen + thoff) + return IPPROTO_DONE; + break; + case IPPROTO_UDP: + if (iplen < iphlen + sizeof(struct udphdr)) + return IPPROTO_DONE; + if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) + return IPPROTO_DONE; + break; + default: + if (iplen < iphlen) + return IPPROTO_DONE; + break; + } + return ip->ip_p; +} + +static void +hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) +{ + const struct ether_header *eh; + uint16_t etype; + int hoff; + + hoff = sizeof(*eh); + /* Checked at the beginning of this function. */ + KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); + + eh = mtod(m_new, const struct ether_header *); + etype = ntohs(eh->ether_type); + if (etype == ETHERTYPE_VLAN) { + const struct ether_vlan_header *evl; + + hoff = sizeof(*evl); + if (m_new->m_len < hoff) + return; + evl = mtod(m_new, const struct ether_vlan_header *); + etype = ntohs(evl->evl_proto); + } + *l3proto = etype; + + if (etype == ETHERTYPE_IP) + *l4proto = hn_check_iplen(m_new, hoff); + else + *l4proto = IPPROTO_DONE; +} + +static int +hn_create_rx_data(struct hn_softc *sc, int ring_cnt) +{ + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + device_t dev = sc->hn_dev; +#if defined(INET) || defined(INET6) +#if __FreeBSD_version >= 1100095 + int lroent_cnt; +#endif +#endif + int i; + + /* + * Create RXBUF for reception. + * + * NOTE: + * - It is shared by all channels. + * - A large enough buffer is allocated, certain version of NVSes + * may further limit the usable space. 
+ */ + sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), + PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, + BUS_DMA_WAITOK | BUS_DMA_ZERO); + if (sc->hn_rxbuf == NULL) { + device_printf(sc->hn_dev, "allocate rxbuf failed\n"); + return (ENOMEM); + } + + sc->hn_rx_ring_cnt = ring_cnt; + sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; + + sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, + M_DEVBUF, M_WAITOK | M_ZERO); + +#if defined(INET) || defined(INET6) +#if __FreeBSD_version >= 1100095 + lroent_cnt = hn_lro_entry_count; + if (lroent_cnt < TCP_LRO_ENTRIES) + lroent_cnt = TCP_LRO_ENTRIES; + if (bootverbose) + device_printf(dev, "LRO: entry count %d\n", lroent_cnt); +#endif +#endif /* INET || INET6 */ + + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + /* Create dev.hn.UNIT.rx sysctl tree */ + sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", + CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; + + rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), + PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, + &rxr->hn_br_dma, BUS_DMA_WAITOK); + if (rxr->hn_br == NULL) { + device_printf(dev, "allocate bufring failed\n"); + return (ENOMEM); + } + + if (hn_trust_hosttcp) + rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; + if (hn_trust_hostudp) + rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; + if (hn_trust_hostip) + rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; + rxr->hn_mbuf_hash = NDIS_HASH_ALL; + rxr->hn_ifp = sc->hn_ifp; + if (i < sc->hn_tx_ring_cnt) + rxr->hn_txr = &sc->hn_tx_ring[i]; + rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; + rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); + rxr->hn_rx_idx = i; + rxr->hn_rxbuf = sc->hn_rxbuf; + + /* + * Initialize LRO. 
+ */ +#if defined(INET) || defined(INET6) +#if __FreeBSD_version >= 1100095 + tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, + hn_lro_mbufq_depth); +#else + tcp_lro_init(&rxr->hn_lro); + rxr->hn_lro.ifp = sc->hn_ifp; +#endif +#if __FreeBSD_version >= 1100099 + rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; + rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; +#endif +#endif /* INET || INET6 */ + + if (sc->hn_rx_sysctl_tree != NULL) { + char name[16]; + + /* + * Create per RX ring sysctl tree: + * dev.hn.UNIT.rx.RINGID + */ + snprintf(name, sizeof(name), "%d", i); + rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, + SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), + OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + + if (rxr->hn_rx_sysctl_tree != NULL) { + SYSCTL_ADD_ULONG(ctx, + SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), + OID_AUTO, "packets", CTLFLAG_RW, + &rxr->hn_pkts, "# of packets received"); + SYSCTL_ADD_ULONG(ctx, + SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), + OID_AUTO, "rss_pkts", CTLFLAG_RW, + &rxr->hn_rss_pkts, + "# of packets w/ RSS info received"); + SYSCTL_ADD_ULONG(ctx, + SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), + OID_AUTO, "rsc_pkts", CTLFLAG_RW, + &rxr->hn_rsc_pkts, + "# of RSC packets received"); + SYSCTL_ADD_ULONG(ctx, + SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), + OID_AUTO, "rsc_drop", CTLFLAG_RW, + &rxr->hn_rsc_drop, + "# of RSC fragments dropped"); + SYSCTL_ADD_INT(ctx, + SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), + OID_AUTO, "pktbuf_len", CTLFLAG_RD, + &rxr->hn_pktbuf_len, 0, + "Temporary channel packet buffer length"); + } + } + } + + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", + CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_rx_ring, hn_lro.lro_queued), +#if __FreeBSD_version < 1100095 + hn_rx_stat_int_sysctl, +#else + hn_rx_stat_u64_sysctl, +#endif + "LU", "LRO queued"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", + CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_rx_ring, 
hn_lro.lro_flushed), +#if __FreeBSD_version < 1100095 + hn_rx_stat_int_sysctl, +#else + hn_rx_stat_u64_sysctl, +#endif + "LU", "LRO flushed"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_rx_ring, hn_lro_tried), + hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); +#if __FreeBSD_version >= 1100099 + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_lro_lenlim_sysctl, "IU", + "Max # of data bytes to be aggregated by LRO"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_lro_ackcnt_sysctl, "I", + "Max # of ACKs to be aggregated by LRO"); +#endif + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, + hn_trust_hcsum_sysctl, "I", + "Trust tcp segment verification on host side, " + "when csum info is missing"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, + hn_trust_hcsum_sysctl, "I", + "Trust udp datagram verification on host side, " + "when csum info is missing"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, + hn_trust_hcsum_sysctl, "I", + "Trust ip packet verification on host side, " + "when csum info is missing"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_rx_ring, hn_csum_ip), + hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_rx_ring, hn_csum_tcp), + hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_rx_ring, 
hn_csum_udp), + hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_rx_ring, hn_csum_trusted), + hn_rx_stat_ulong_sysctl, "LU", + "# of packets that we trust host's csum verification"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_rx_ring, hn_small_pkts), + hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_rx_ring, hn_ack_failed), + hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", + CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", + CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); + + return (0); +} + +static void +hn_destroy_rx_data(struct hn_softc *sc) +{ + int i; + + if (sc->hn_rxbuf != NULL) { + if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) + hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); + else + device_printf(sc->hn_dev, "RXBUF is referenced\n"); + sc->hn_rxbuf = NULL; + } + + if (sc->hn_rx_ring_cnt == 0) + return; + + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; + + if (rxr->hn_br == NULL) + continue; + if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { + hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); + } else { + device_printf(sc->hn_dev, + "%dth channel bufring is referenced", i); + } + rxr->hn_br = NULL; + +#if defined(INET) || defined(INET6) + tcp_lro_free(&rxr->hn_lro); +#endif + free(rxr->hn_pktbuf, M_DEVBUF); + } + free(sc->hn_rx_ring, M_DEVBUF); + sc->hn_rx_ring = NULL; + + sc->hn_rx_ring_cnt = 0; + sc->hn_rx_ring_inuse = 0; +} + +static int +hn_tx_ring_create(struct hn_softc *sc, int id) +{ + struct hn_tx_ring *txr = 
&sc->hn_tx_ring[id]; + device_t dev = sc->hn_dev; + bus_dma_tag_t parent_dtag; + int error, i; + + txr->hn_sc = sc; + txr->hn_tx_idx = id; + +#ifndef HN_USE_TXDESC_BUFRING + mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); +#endif + mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); + + /* Zeroed TX descriptor array; free descriptors are tracked on an SLIST or in a buf_ring depending on HN_USE_TXDESC_BUFRING. */ txr->hn_txdesc_cnt = HN_TX_DESC_CNT; + txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, + M_DEVBUF, M_WAITOK | M_ZERO); +#ifndef HN_USE_TXDESC_BUFRING + SLIST_INIT(&txr->hn_txlist); +#else + txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, + M_WAITOK, &txr->hn_tx_lock); +#endif + + /* Taskqueue for deferred TX work: the VMBUS event taskqueue looked up with HN_RING_IDX2CPU(sc, id), or one of sc->hn_tx_taskqs picked by id modulo hn_tx_taskq_cnt. */ if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { + txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( + device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); + } else { + txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; + } + +#ifdef HN_IFSTART_SUPPORT + if (hn_use_if_start) { + txr->hn_txeof = hn_start_txeof; + TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); + TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); + } else +#endif + { + int br_depth; + + txr->hn_txeof = hn_xmit_txeof; + TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); + TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); + + /* Only this (non if_start) branch allocates the per-ring mbuf buf_ring. */ br_depth = hn_get_txswq_depth(txr); + txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, + M_WAITOK, &txr->hn_tx_lock); + } + + txr->hn_direct_tx_size = hn_direct_tx_size; + + /* + * Always schedule transmission instead of trying to do direct + * transmission.  This one gives the best performance so far. + */ + txr->hn_sched_tx = 1; + + parent_dtag = bus_get_dma_tag(dev); + + /* DMA tag for RNDIS packet messages. 
*/ + error = bus_dma_tag_create(parent_dtag, /* parent */ + HN_RNDIS_PKT_ALIGN, /* alignment */ + HN_RNDIS_PKT_BOUNDARY, /* boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + HN_RNDIS_PKT_LEN, /* maxsize */ + 1, /* nsegments */ + HN_RNDIS_PKT_LEN, /* maxsegsize */ + 0, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockfuncarg */ + &txr->hn_tx_rndis_dtag); + if (error) { + device_printf(dev, "failed to create rndis dmatag\n"); + return error; + } + + /* DMA tag for data. */ + error = bus_dma_tag_create(parent_dtag, /* parent */ + 1, /* alignment */ + HN_TX_DATA_BOUNDARY, /* boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + HN_TX_DATA_MAXSIZE, /* maxsize */ + HN_TX_DATA_SEGCNT_MAX, /* nsegments */ + HN_TX_DATA_SEGSIZE, /* maxsegsize */ + 0, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockfuncarg */ + &txr->hn_tx_data_dtag); + if (error) { + device_printf(dev, "failed to create data dmatag\n"); + return error; + } + + /* Give every TX descriptor its RNDIS message buffer and its TX data DMA map. */ for (i = 0; i < txr->hn_txdesc_cnt; ++i) { + struct hn_txdesc *txd = &txr->hn_txdesc[i]; + + txd->txr = txr; + txd->chim_index = HN_NVS_CHIM_IDX_INVALID; + STAILQ_INIT(&txd->agg_list); + + /* + * Allocate and load RNDIS packet message. + */ + error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, + (void **)&txd->rndis_pkt, + BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, + &txd->rndis_pkt_dmap); + if (error) { + device_printf(dev, + "failed to allocate rndis_packet_msg, %d\n", i); + return error; + } + + error = bus_dmamap_load(txr->hn_tx_rndis_dtag, + txd->rndis_pkt_dmap, + txd->rndis_pkt, HN_RNDIS_PKT_LEN, + hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, + BUS_DMA_NOWAIT); + if (error) { + device_printf(dev, + "failed to load rndis_packet_msg, %d\n", i); + /* Release this descriptor's message memory before returning; earlier descriptors are left as they are. */ bus_dmamem_free(txr->hn_tx_rndis_dtag, + txd->rndis_pkt, txd->rndis_pkt_dmap); + return error; + } + + /* DMA map for TX data. 
*/ + error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, + &txd->data_dmap); + if (error) { + device_printf(dev, + "failed to allocate tx data dmamap\n"); + /* Unload and free this descriptor's RNDIS message before returning. */ bus_dmamap_unload(txr->hn_tx_rndis_dtag, + txd->rndis_pkt_dmap); + bus_dmamem_free(txr->hn_tx_rndis_dtag, + txd->rndis_pkt, txd->rndis_pkt_dmap); + return error; + } + + /* All set, put it to list */ + txd->flags |= HN_TXD_FLAG_ONLIST; +#ifndef HN_USE_TXDESC_BUFRING + SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); +#else + buf_ring_enqueue(txr->hn_txdesc_br, txd); +#endif + } + /* Every descriptor starts out available. */ txr->hn_txdesc_avail = txr->hn_txdesc_cnt; + + if (sc->hn_tx_sysctl_tree != NULL) { + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + char name[16]; + + /* + * Create per TX ring sysctl tree: + * dev.hn.UNIT.tx.RINGID + */ + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); + + snprintf(name, sizeof(name), "%d", id); + txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, + name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + + if (txr->hn_tx_sysctl_tree != NULL) { + child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); + +#ifdef HN_DEBUG + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", + CTLFLAG_RD, &txr->hn_txdesc_avail, 0, + "# of available TX descs"); +#endif +#ifdef HN_IFSTART_SUPPORT + if (!hn_use_if_start) +#endif + { + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", + CTLFLAG_RD, &txr->hn_oactive, 0, + "over active"); + } + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", + CTLFLAG_RW, &txr->hn_pkts, + "# of packets transmitted"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", + CTLFLAG_RW, &txr->hn_sends, "# of sends"); + } + } + + return 0; +} + /* Release one TX descriptor's busdma resources: unload and free the RNDIS packet message memory and destroy the TX data DMA map. Asserts that no mbuf is installed and that HN_TXD_FLAG_DMAMAP is clear. */ +static void +hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) +{ + struct hn_tx_ring *txr = txd->txr; + + KASSERT(txd->m == NULL, ("still has mbuf installed")); + KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); + + bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); + 
bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, + txd->rndis_pkt_dmap); + bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); +} + /* Release a TX descriptor that is still pending (refs == 1) by dropping its reference with hn_txdesc_put(); descriptors flagged HN_TXD_FLAG_ONAGG are left alone. Asserts that refs is 0 or 1 and that hn_txdesc_put() reports the descriptor as freed. */ +static void +hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) +{ + + KASSERT(txd->refs == 0 || txd->refs == 1, + ("invalid txd refs %d", txd->refs)); + + /* Aggregated txds will be freed by their aggregating txd. */ + if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { + int freed; + + freed = hn_txdesc_put(txr, txd); + KASSERT(freed, ("can't free txdesc")); + } +} + /* Destroy one TX ring. Does nothing when the descriptor array was never allocated. Otherwise garbage-collects pending descriptors, releases their busdma resources, destroys the DMA tags, frees the descriptor array and the buf_rings, and destroys the ring's mutexes. */ +static void +hn_tx_ring_destroy(struct hn_tx_ring *txr) +{ + int i; + + if (txr->hn_txdesc == NULL) + return; + + /* + * NOTE: + * Because the freeing of aggregated txds will be deferred + * to the aggregating txd, two passes are used here: + * - The first pass GCes any pending txds.  This GC is necessary, + * since if the channels are revoked, hypervisor will not + * deliver send-done for all pending txds. + * - The second pass frees the busdma stuffs, i.e. after all txds + * were freed. + */ + for (i = 0; i < txr->hn_txdesc_cnt; ++i) + hn_txdesc_gc(txr, &txr->hn_txdesc[i]); + for (i = 0; i < txr->hn_txdesc_cnt; ++i) + hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); + + if (txr->hn_tx_data_dtag != NULL) + bus_dma_tag_destroy(txr->hn_tx_data_dtag); + if (txr->hn_tx_rndis_dtag != NULL) + bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); + +#ifdef HN_USE_TXDESC_BUFRING + buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); +#endif + + free(txr->hn_txdesc, M_DEVBUF); + txr->hn_txdesc = NULL; + + /* hn_mbuf_br may be NULL, hence the check. */ if (txr->hn_mbuf_br != NULL) + buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); + +#ifndef HN_USE_TXDESC_BUFRING + mtx_destroy(&txr->hn_txlist_spin); +#endif + mtx_destroy(&txr->hn_tx_lock); +} + /* Allocate the chimney sending buffer and ring_cnt TX rings for sc, create the dev.hn.UNIT.tx sysctl node and register the device-level TX sysctls. Returns 0 on success, ENOMEM when the chimney buffer cannot be allocated, or the error from hn_tx_ring_create(); nothing is released here on failure. */ +static int +hn_create_tx_data(struct hn_softc *sc, int ring_cnt) +{ + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + int i; + + /* + * Create TXBUF for chimney sending. + * + * NOTE: It is shared by all channels. 
+ */ + sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), + PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, + BUS_DMA_WAITOK | BUS_DMA_ZERO); + if (sc->hn_chim == NULL) { + device_printf(sc->hn_dev, "allocate txbuf failed\n"); + return (ENOMEM); + } + + /* All created rings are initially counted as in use. */ sc->hn_tx_ring_cnt = ring_cnt; + sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; + + sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, + M_DEVBUF, M_WAITOK | M_ZERO); + + ctx = device_get_sysctl_ctx(sc->hn_dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); + + /* Create dev.hn.UNIT.tx sysctl tree */ + sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", + CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + + /* Create the rings one by one; the first failure is returned as is. */ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + int error; + + error = hn_tx_ring_create(sc, i); + if (error) + return error; + } + + /* Device-level TX sysctls. Each PROC node below passes the offset of a struct hn_tx_ring field as its second handler argument. */ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_tx_ring, hn_no_txdescs), + hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_tx_ring, hn_send_failed), + hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_tx_ring, hn_txdma_failed), + hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_tx_ring, hn_flush_failed), + hn_tx_stat_ulong_sysctl, "LU", + "# of packet transmission aggregation flush failure"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_tx_ring, hn_tx_collapsed), + hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); + SYSCTL_ADD_PROC(ctx, child, 
OID_AUTO, "tx_chimney", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_tx_ring, hn_tx_chimney), + hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), + hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); + /* Exported straight from ring 0's hn_txdesc_cnt. */ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", + CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, + "# of total TX descs"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", + CTLFLAG_RD, &sc->hn_chim_szmax, 0, + "Chimney send packet size upper boundary"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_chim_size_sysctl, "I", "Chimney send packet size limit"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_tx_ring, hn_direct_tx_size), + hn_tx_conf_int_sysctl, "I", + "Size of the packet for direct transmission"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_tx_ring, hn_sched_tx), + hn_tx_conf_int_sysctl, "I", + "Always schedule transmission " + "instead of doing direct transmission"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", + CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", + CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); + /* Exported straight from ring 0's hn_agg_szmax. */ SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", + CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, + "Applied packet transmission aggregation size"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_txagg_pktmax_sysctl, "I", + "Applied packet transmission aggregation packets"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", + CTLTYPE_INT | CTLFLAG_RD | 
CTLFLAG_MPSAFE, sc, 0, + hn_txagg_align_sysctl, "I", + "Applied packet transmission aggregation alignment"); + + return 0; +} + /* Store chim_size into hn_chim_size of every created TX ring. */ +static void +hn_set_chim_size(struct hn_softc *sc, int chim_size) +{ + int i; + + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) + sc->hn_tx_ring[i].hn_chim_size = chim_size; +} + /* Compute and store ifp->if_hw_tsomax. Does nothing when the interface has neither IFCAP_TSO4 nor IFCAP_TSO6. tso_maxlen is clamped to the range [hn_ndis_tso_sgmin * mtu, IP_MAXPACKET] and then to hn_ndis_tso_szmax; the Ethernet header and VLAN encapsulation lengths are subtracted, and when hn_xpnt_vf_isready() is true the result is further limited by the VF interface's if_hw_tsomax. Asserts the HN lock via HN_LOCK_ASSERT(). */ +static void +hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) +{ + struct ifnet *ifp = sc->hn_ifp; + u_int hw_tsomax; + int tso_minlen; + + HN_LOCK_ASSERT(sc); + + if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) + return; + + KASSERT(sc->hn_ndis_tso_sgmin >= 2, + ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); + tso_minlen = sc->hn_ndis_tso_sgmin * mtu; + + KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && + sc->hn_ndis_tso_szmax <= IP_MAXPACKET, + ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); + + if (tso_maxlen < tso_minlen) + tso_maxlen = tso_minlen; + else if (tso_maxlen > IP_MAXPACKET) + tso_maxlen = IP_MAXPACKET; + if (tso_maxlen > sc->hn_ndis_tso_szmax) + tso_maxlen = sc->hn_ndis_tso_szmax; + hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + + if (hn_xpnt_vf_isready(sc)) { + if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) + hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; + } + ifp->if_hw_tsomax = hw_tsomax; + if (bootverbose) + if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); +} + /* Apply per-device settings to all TX rings: the chimney size (hn_chim_szmax, or hn_tx_chimney_size when that is positive and smaller), the checksum-assist mask derived from sc->hn_caps, and HN_TX_FLAG_HASHVAL when HN_CAP_HASHVAL is set. */ +static void +hn_fixup_tx_data(struct hn_softc *sc) +{ + uint64_t csum_assist; + int i; + + hn_set_chim_size(sc, sc->hn_chim_szmax); + if (hn_tx_chimney_size > 0 && + hn_tx_chimney_size < sc->hn_chim_szmax) + hn_set_chim_size(sc, hn_tx_chimney_size); + + /* UDP checksum assistance additionally requires hn_enable_udp4cs / hn_enable_udp6cs. */ csum_assist = 0; + if (sc->hn_caps & HN_CAP_IPCS) + csum_assist |= CSUM_IP; + if (sc->hn_caps & HN_CAP_TCP4CS) + csum_assist |= CSUM_IP_TCP; + if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) + csum_assist |= CSUM_IP_UDP; + if (sc->hn_caps & HN_CAP_TCP6CS) + csum_assist |= CSUM_IP6_TCP; + if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) + csum_assist |= CSUM_IP6_UDP; 
+ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) + sc->hn_tx_ring[i].hn_csum_assist = csum_assist; + + if (sc->hn_caps & HN_CAP_HASHVAL) { + /* + * Support HASHVAL pktinfo on TX path. + */ + if (bootverbose) + if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) + sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; + } +} + /* When sc->hn_caps has HN_CAP_UDPHASH, set HN_RX_FLAG_UDP_HASH on every created RX ring. */ +static void +hn_fixup_rx_data(struct hn_softc *sc) +{ + + if (sc->hn_caps & HN_CAP_UDPHASH) { + int i; + + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) + sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; + } +} + /* Tear down the TX-side state of sc: free the chimney sending buffer unless HN_FLAG_CHIM_REF is set (then only log and drop the pointer), destroy every created TX ring, free the ring array and zero both ring counters. */ +static void +hn_destroy_tx_data(struct hn_softc *sc) +{ + int i; + + if (sc->hn_chim != NULL) { + if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { + hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); + } else { + /* Newline-terminated so the next console message starts on its own line. */ device_printf(sc->hn_dev, + "chimney sending buffer is referenced\n"); + } + sc->hn_chim = NULL; + } + + /* Nothing else to release when no ring was created. */ if (sc->hn_tx_ring_cnt == 0) + return; + + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) + hn_tx_ring_destroy(&sc->hn_tx_ring[i]); + + free(sc->hn_tx_ring, M_DEVBUF); + sc->hn_tx_ring = NULL; + + sc->hn_tx_ring_cnt = 0; + sc->hn_tx_ring_inuse = 0; +} + +#ifdef HN_IFSTART_SUPPORT + /* Task handler: run hn_start_locked() with no length limit (len 0) while holding hn_tx_lock. */ +static void +hn_start_taskfunc(void *xtxr, int pending __unused) +{ + struct hn_tx_ring *txr = xtxr; + + mtx_lock(&txr->hn_tx_lock); + hn_start_locked(txr, 0); + mtx_unlock(&txr->hn_tx_lock); +} + /* Transmit loop of the if_start path; only valid for TX ring 0 and with hn_tx_lock held (both asserted). Dequeues packets from ifp->if_snd and sends them until the queue is empty, no TX descriptor is available, or a send/flush fails. When len > 0 and a packet is longer than len, the packet is put back and 1 is returned so that the caller schedules the TX task; otherwise 0 is returned. Returns 0 immediately when the ring is suspended or the interface is not in the RUNNING-and-not-OACTIVE state. */ +static int +hn_start_locked(struct hn_tx_ring *txr, int len) +{ + struct hn_softc *sc = txr->hn_sc; + struct ifnet *ifp = sc->hn_ifp; + int sched = 0; + + KASSERT(hn_use_if_start, + ("hn_start_locked is called, when if_start is disabled")); + KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); + mtx_assert(&txr->hn_tx_lock, MA_OWNED); + KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); + + if (__predict_false(txr->hn_suspended)) + return (0); + + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != + IFF_DRV_RUNNING) + return (0); + + while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { + struct 
hn_txdesc *txd; + struct mbuf *m_head; + int error; + + IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); + if (m_head == NULL) + break; + + if (len > 0 && m_head->m_pkthdr.len > len) { + /* + * This sending could be time consuming; let callers + * dispatch this packet sending (and sending of any + * following up packets) to tx taskqueue. + */ + IFQ_DRV_PREPEND(&ifp->if_snd, m_head); + sched = 1; + break; + } + +#if defined(INET6) || defined(INET) + /* A NULL return from either helper counts as an output error and the loop moves on to the next packet. */ if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { + m_head = hn_tso_fixup(m_head); + if (__predict_false(m_head == NULL)) { + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + continue; + } + } else if (m_head->m_pkthdr.csum_flags & + (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { + m_head = hn_set_hlen(m_head); + if (__predict_false(m_head == NULL)) { + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + continue; + } + } +#endif + + txd = hn_txdesc_get(txr); + if (txd == NULL) { + /* No TX descriptor: count it, requeue the packet at the head of if_snd and mark the interface OACTIVE. */ txr->hn_no_txdescs++; + IFQ_DRV_PREPEND(&ifp->if_snd, m_head); + atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); + break; + } + + error = hn_encap(ifp, txr, txd, &m_head); + if (error) { + /* Both txd and m_head are freed */ + KASSERT(txr->hn_agg_txd == NULL, + ("encap failed w/ pending aggregating txdesc")); + continue; + } + + /* hn_agg_pktleft is 0: flush the pending aggregating txdesc if there is one, otherwise send this txd on its own. */ if (txr->hn_agg_pktleft == 0) { + if (txr->hn_agg_txd != NULL) { + KASSERT(m_head == NULL, + ("pending mbuf for aggregating txdesc")); + error = hn_flush_txagg(ifp, txr); + if (__predict_false(error)) { + atomic_set_int(&ifp->if_drv_flags, + IFF_DRV_OACTIVE); + break; + } + } else { + KASSERT(m_head != NULL, ("mbuf was freed")); + error = hn_txpkt(ifp, txr, txd); + if (__predict_false(error)) { + /* txd is freed, but m_head is not */ + IFQ_DRV_PREPEND(&ifp->if_snd, m_head); + atomic_set_int(&ifp->if_drv_flags, + IFF_DRV_OACTIVE); + break; + } + } + } +#ifdef INVARIANTS + else { + KASSERT(txr->hn_agg_txd != NULL, + ("no aggregating txdesc")); + KASSERT(m_head == NULL, + ("pending mbuf for aggregating txdesc")); + } +#endif + 
} + + /* Flush pending aggregated transmission. */ + if (txr->hn_agg_txd != NULL) + hn_flush_txagg(ifp, txr); + return (sched); +} + /* Start transmission on TX ring 0 of the softc behind ifp. Unless hn_sched_tx is set, tries to take hn_tx_lock without blocking and transmits directly through hn_start_locked() with hn_direct_tx_size as the length limit. The TX task is enqueued instead when hn_sched_tx is set, the lock is busy, or hn_start_locked() returns non-zero. */ +static void +hn_start(struct ifnet *ifp) +{ + struct hn_softc *sc = ifp->if_softc; + struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; + + if (txr->hn_sched_tx) + goto do_sched; + + if (mtx_trylock(&txr->hn_tx_lock)) { + int sched; + + sched = hn_start_locked(txr, txr->hn_direct_tx_size); + mtx_unlock(&txr->hn_tx_lock); + if (!sched) + return; + } +do_sched: + taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); +} + /* Task handler: under hn_tx_lock, clear IFF_DRV_OACTIVE and run hn_start_locked() with no length limit (len 0). */ +static void +hn_start_txeof_taskfunc(void *xtxr, int pending __unused) +{ + struct hn_tx_ring *txr = xtxr; + + mtx_lock(&txr->hn_tx_lock); + atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); + hn_start_locked(txr, 0); + mtx_unlock(&txr->hn_tx_lock); +} + /* TX completion hook of the if_start path; asserts that txr is ring 0. When hn_sched_tx is clear and hn_tx_lock can be taken without blocking, clears IFF_DRV_OACTIVE and transmits directly, enqueueing the TX task only if hn_start_locked() asks for it. Otherwise clears IFF_DRV_OACTIVE and enqueues the txeof task. */ +static void +hn_start_txeof(struct hn_tx_ring *txr) +{ + struct hn_softc *sc = txr->hn_sc; + struct ifnet *ifp = sc->hn_ifp; + + KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); + + if (txr->hn_sched_tx) + goto do_sched; + + if (mtx_trylock(&txr->hn_tx_lock)) { + int sched; + + atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); + sched = hn_start_locked(txr, txr->hn_direct_tx_size); + mtx_unlock(&txr->hn_tx_lock); + if (sched) { + taskqueue_enqueue(txr->hn_tx_taskq, + &txr->hn_tx_task); + } + } else { +do_sched: + /* + * Release the OACTIVE earlier, with the hope, that + * others could catch up.  The task will clear the + * flag again with the hn_tx_lock to avoid possible + * races. 
+ */ + atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); + taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); + } +} + +#endif /* HN_IFSTART_SUPPORT */ + /* Transmit loop of the if_transmit path; hn_tx_lock must be held (asserted). Sends packets queued on txr->hn_mbuf_br until the ring is empty, no TX descriptor can be obtained, or a send/flush fails (hn_oactive is set in the latter two cases). When len > 0 and the next packet is longer than len, the packet is put back and 1 is returned so that the caller schedules the TX task; otherwise 0 is returned. Returns 0 immediately when the ring is suspended, the interface is not running, or hn_oactive is already set. */ +static int +hn_xmit(struct hn_tx_ring *txr, int len) +{ + struct hn_softc *sc = txr->hn_sc; + struct ifnet *ifp = sc->hn_ifp; + struct mbuf *m_head; + int sched = 0; + + mtx_assert(&txr->hn_tx_lock, MA_OWNED); +#ifdef HN_IFSTART_SUPPORT + KASSERT(hn_use_if_start == 0, + ("hn_xmit is called, when if_start is enabled")); +#endif + KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); + + if (__predict_false(txr->hn_suspended)) + return (0); + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) + return (0); + + /* Packets are peeked first and only advanced past (or put back) once their fate is known. */ while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { + struct hn_txdesc *txd; + int error; + + if (len > 0 && m_head->m_pkthdr.len > len) { + /* + * This sending could be time consuming; let callers + * dispatch this packet sending (and sending of any + * following up packets) to tx taskqueue. 
+ */ + drbr_putback(ifp, txr->hn_mbuf_br, m_head); + sched = 1; + break; + } + + txd = hn_txdesc_get(txr); + if (txd == NULL) { + /* No TX descriptor: count it, put the packet back and mark the ring oactive. */ txr->hn_no_txdescs++; + drbr_putback(ifp, txr->hn_mbuf_br, m_head); + txr->hn_oactive = 1; + break; + } + + error = hn_encap(ifp, txr, txd, &m_head); + if (error) { + /* Both txd and m_head are freed; discard */ + KASSERT(txr->hn_agg_txd == NULL, + ("encap failed w/ pending aggregating txdesc")); + drbr_advance(ifp, txr->hn_mbuf_br); + continue; + } + + /* hn_agg_pktleft is 0: flush the pending aggregating txdesc if there is one, otherwise send this txd on its own. */ if (txr->hn_agg_pktleft == 0) { + if (txr->hn_agg_txd != NULL) { + KASSERT(m_head == NULL, + ("pending mbuf for aggregating txdesc")); + error = hn_flush_txagg(ifp, txr); + if (__predict_false(error)) { + txr->hn_oactive = 1; + break; + } + } else { + KASSERT(m_head != NULL, ("mbuf was freed")); + error = hn_txpkt(ifp, txr, txd); + if (__predict_false(error)) { + /* txd is freed, but m_head is not */ + drbr_putback(ifp, txr->hn_mbuf_br, + m_head); + txr->hn_oactive = 1; + break; + } + } + } +#ifdef INVARIANTS + else { + KASSERT(txr->hn_agg_txd != NULL, + ("no aggregating txdesc")); + KASSERT(m_head == NULL, + ("pending mbuf for aggregating txdesc")); + } +#endif + + /* Sent */ + drbr_advance(ifp, txr->hn_mbuf_br); + } + + /* Flush pending aggregated transmission. */ + if (txr->hn_agg_txd != NULL) + hn_flush_txagg(ifp, txr); + return (sched); +} + /* Transmit mbuf m on ifp. When the VF is enabled (HN_XVFFLAG_ENABLED, rechecked under the hn_vf_lock read lock) the mbuf is passed to the VF interface's if_transmit and this ifp's counters are updated from the result. Otherwise the packet headers are prepared, a TX ring is chosen, the mbuf is enqueued on that ring's buf_ring and transmission is started directly or through the TX task. Returns 0 or an errno value. */ +static int +hn_transmit(struct ifnet *ifp, struct mbuf *m) +{ + struct hn_softc *sc = ifp->if_softc; + struct hn_tx_ring *txr; + int error, idx = 0; + + if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { + struct rm_priotracker pt; + + rm_rlock(&sc->hn_vf_lock, &pt); + if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { + struct mbuf *m_bpf = NULL; + int obytes, omcast; + + /* Sampled before m is handed to the VF. */ obytes = m->m_pkthdr.len; + omcast = (m->m_flags & M_MCAST) != 0; + + if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { + if (bpf_peers_present(ifp->if_bpf)) { + m_bpf = m_copypacket(m, M_NOWAIT); + if (m_bpf == NULL) { + /* + * Failed to grab a shallow + * copy; tap now. 
+ */ + ETHER_BPF_MTAP(ifp, m); + } + } + } else { + ETHER_BPF_MTAP(ifp, m); + } + + /* Hand the packet to the VF interface while the read lock is still held. */ error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); + rm_runlock(&sc->hn_vf_lock, &pt); + + /* The saved copy is tapped only when the VF transmit returned 0; it is freed either way. */ if (m_bpf != NULL) { + if (!error) + ETHER_BPF_MTAP(ifp, m_bpf); + m_freem(m_bpf); + } + + if (error == ENOBUFS) { + if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); + } else if (error) { + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + } else { + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); + if (omcast) { + if_inc_counter(ifp, IFCOUNTER_OMCASTS, + omcast); + } + } + return (error); + } + rm_runlock(&sc->hn_vf_lock, &pt); + } + +#if defined(INET6) || defined(INET) + /* + * Perform TSO packet header fixup or get l2/l3 header length now, + * since packet headers should be cache-hot. + */ + if (m->m_pkthdr.csum_flags & CSUM_TSO) { + m = hn_tso_fixup(m); + if (__predict_false(m == NULL)) { + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return EIO; + } + } else if (m->m_pkthdr.csum_flags & + (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { + m = hn_set_hlen(m); + if (__predict_false(m == NULL)) { + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return EIO; + } + } +#endif + + /* + * Select the TX ring based on flowid + */ + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { +#ifdef RSS + uint32_t bid; + + if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), + &bid) == 0) + idx = bid % sc->hn_tx_ring_inuse; + else +#endif + { +#if defined(INET6) || defined(INET) + int tcpsyn = 0; + + if (m->m_pkthdr.len < 128 && + (m->m_pkthdr.csum_flags & + (CSUM_IP_TCP | CSUM_IP6_TCP)) && + (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { + m = hn_check_tcpsyn(m, &tcpsyn); + if (__predict_false(m == NULL)) { + if_inc_counter(ifp, + IFCOUNTER_OERRORS, 1); + return (EIO); + } + } +#else + const int tcpsyn = 0; +#endif + /* Packets for which hn_check_tcpsyn() set tcpsyn use ring 0; all others are spread by flowid over the rings in use. */ if (tcpsyn) + idx = 0; + else + idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; + } + } + txr = &sc->hn_tx_ring[idx]; + + error = drbr_enqueue(ifp, 
txr->hn_mbuf_br, m); + if (error) { + if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); + return error; + } + + /* The packet stays queued; nothing is started while the ring is oactive. */ if (txr->hn_oactive) + return 0; + + if (txr->hn_sched_tx) + goto do_sched; + + if (mtx_trylock(&txr->hn_tx_lock)) { + int sched; + + sched = hn_xmit(txr, txr->hn_direct_tx_size); + mtx_unlock(&txr->hn_tx_lock); + if (!sched) + return 0; + } +do_sched: + taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); + return 0; +} + /* Free every mbuf queued on the ring's hn_mbuf_br while holding hn_tx_lock. */ +static void +hn_tx_ring_qflush(struct hn_tx_ring *txr) +{ + struct mbuf *m; + + mtx_lock(&txr->hn_tx_lock); + while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) + m_freem(m); + mtx_unlock(&txr->hn_tx_lock); +} + /* Flush queued packets: empties the buf_ring of every TX ring in use, calls if_qflush() on ifp and, under the hn_vf_lock read lock, calls the VF interface's if_qflush when HN_XVFFLAG_ENABLED is set. */ +static void +hn_xmit_qflush(struct ifnet *ifp) +{ + struct hn_softc *sc = ifp->if_softc; + struct rm_priotracker pt; + int i; + + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) + hn_tx_ring_qflush(&sc->hn_tx_ring[i]); + if_qflush(ifp); + + rm_rlock(&sc->hn_vf_lock, &pt); + if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) + sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); + rm_runlock(&sc->hn_vf_lock, &pt); +} + /* TX completion hook of the if_transmit path. When hn_sched_tx is clear and hn_tx_lock can be taken without blocking, clears hn_oactive and transmits directly, enqueueing the TX task only if hn_xmit() asks for it. Otherwise clears hn_oactive and enqueues the txeof task. */ +static void +hn_xmit_txeof(struct hn_tx_ring *txr) +{ + + if (txr->hn_sched_tx) + goto do_sched; + + if (mtx_trylock(&txr->hn_tx_lock)) { + int sched; + + txr->hn_oactive = 0; + sched = hn_xmit(txr, txr->hn_direct_tx_size); + mtx_unlock(&txr->hn_tx_lock); + if (sched) { + taskqueue_enqueue(txr->hn_tx_taskq, + &txr->hn_tx_task); + } + } else { +do_sched: + /* + * Release the oactive earlier, with the hope, that + * others could catch up.  The task will clear the + * oactive again with the hn_tx_lock to avoid possible + * races. 
+ */ + txr->hn_oactive = 0; + taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); + } +} + +static void +hn_xmit_taskfunc(void *xtxr, int pending __unused) +{ + struct hn_tx_ring *txr = xtxr; + + mtx_lock(&txr->hn_tx_lock); + hn_xmit(txr, 0); + mtx_unlock(&txr->hn_tx_lock); +} + +static void +hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) +{ + struct hn_tx_ring *txr = xtxr; + + mtx_lock(&txr->hn_tx_lock); + txr->hn_oactive = 0; + hn_xmit(txr, 0); + mtx_unlock(&txr->hn_tx_lock); +} + +static int +hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) +{ + struct vmbus_chan_br cbr; + struct hn_rx_ring *rxr; + struct hn_tx_ring *txr = NULL; + int idx, error; + + idx = vmbus_chan_subidx(chan); + + /* + * Link this channel to RX/TX ring. + */ + KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, + ("invalid channel index %d, should > 0 && < %d", + idx, sc->hn_rx_ring_inuse)); + rxr = &sc->hn_rx_ring[idx]; + KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, + ("RX ring %d already attached", idx)); + rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; + rxr->hn_chan = chan; + + if (bootverbose) { + if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", + idx, vmbus_chan_id(chan)); + } + + if (idx < sc->hn_tx_ring_inuse) { + txr = &sc->hn_tx_ring[idx]; + KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, + ("TX ring %d already attached", idx)); + txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; + + txr->hn_chan = chan; + if (bootverbose) { + if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", + idx, vmbus_chan_id(chan)); + } + } + + /* Bind this channel to a proper CPU. 
*/ + vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); + + /* + * Open this channel + */ + cbr.cbr = rxr->hn_br; + cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; + cbr.cbr_txsz = HN_TXBR_SIZE; + cbr.cbr_rxsz = HN_RXBR_SIZE; + error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); + if (error) { + if (error == EISCONN) { + if_printf(sc->hn_ifp, "bufring is connected after " + "chan%u open failure\n", vmbus_chan_id(chan)); + rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; + } else { + if_printf(sc->hn_ifp, "open chan%u failed: %d\n", + vmbus_chan_id(chan), error); + } + } + return (error); +} + +static void +hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) +{ + struct hn_rx_ring *rxr; + int idx, error; + + idx = vmbus_chan_subidx(chan); + + /* + * Link this channel to RX/TX ring. + */ + KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, + ("invalid channel index %d, should > 0 && < %d", + idx, sc->hn_rx_ring_inuse)); + rxr = &sc->hn_rx_ring[idx]; + KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), + ("RX ring %d is not attached", idx)); + rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; + + if (idx < sc->hn_tx_ring_inuse) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; + + KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), + ("TX ring %d is not attached attached", idx)); + txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; + } + + /* + * Close this channel. + * + * NOTE: + * Channel closing does _not_ destroy the target channel. 
+ */ + error = vmbus_chan_close_direct(chan); + /* EISCONN: remember on the RX ring that its bufring is still referenced. */ if (error == EISCONN) { + if_printf(sc->hn_ifp, "chan%u bufring is connected " + "after being closed\n", vmbus_chan_id(chan)); + rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; + } else if (error) { + if_printf(sc->hn_ifp, "chan%u close failed: %d\n", + vmbus_chan_id(chan), error); + } +} + /* Attach all hn_rx_ring_inuse - 1 sub-channels of the primary channel. Every sub-channel is tried even after a failure; the last non-zero hn_chan_attach() error is returned, or 0 when all attached. Asserts that there is at least one sub-channel. */ +static int +hn_attach_subchans(struct hn_softc *sc) +{ + struct vmbus_channel **subchans; + int subchan_cnt = sc->hn_rx_ring_inuse - 1; + int i, error = 0; + + KASSERT(subchan_cnt > 0, ("no sub-channels")); + + /* Attach the sub-channels. */ + subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); + for (i = 0; i < subchan_cnt; ++i) { + int error1; + + error1 = hn_chan_attach(sc, subchans[i]); + if (error1) { + error = error1; + /* Move on; all channels will be detached later. */ + } + } + vmbus_subchan_rel(subchans, subchan_cnt); + + if (error) { + if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); + } else { + if (bootverbose) { + if_printf(sc->hn_ifp, "%d sub-channels attached\n", + subchan_cnt); + } + } + return (error); +} + /* Detach every sub-channel (if any), then the primary channel, then call vmbus_subchan_drain() on the primary channel. Under INVARIANTS, asserts afterwards that no created RX or TX ring is still flagged as attached. */ +static void +hn_detach_allchans(struct hn_softc *sc) +{ + struct vmbus_channel **subchans; + int subchan_cnt = sc->hn_rx_ring_inuse - 1; + int i; + + if (subchan_cnt == 0) + goto back; + + /* Detach the sub-channels. */ + subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); + for (i = 0; i < subchan_cnt; ++i) + hn_chan_detach(sc, subchans[i]); + vmbus_subchan_rel(subchans, subchan_cnt); + +back: + /* + * Detach the primary channel, _after_ all sub-channels + * are detached. + */ + hn_chan_detach(sc, sc->hn_prichan); + + /* Wait for sub-channels to be destroyed, if any. 
*/ + vmbus_subchan_drain(sc->hn_prichan); + +#ifdef INVARIANTS + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + KASSERT((sc->hn_rx_ring[i].hn_rx_flags & + HN_RX_FLAG_ATTACHED) == 0, + ("%dth RX ring is still attached", i)); + } + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + KASSERT((sc->hn_tx_ring[i].hn_tx_flags & + HN_TX_FLAG_ATTACHED) == 0, + ("%dth TX ring is still attached", i)); + } +#endif +} + /* Negotiate the number of sub-channels. On entry *nsubch is the number wanted; on return it is the number obtained, which is 0 when multiple rings are not requested, the RSS capabilities cannot be queried, only one channel is offered, or the NVS allocation fails or yields none. Always returns 0. */ +static int +hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) +{ + struct vmbus_channel **subchans; + int nchan, rxr_cnt, error; + + nchan = *nsubch + 1; + if (nchan == 1) { + /* + * Multiple RX/TX rings are not requested. + */ + *nsubch = 0; + return (0); + } + + /* + * Query RSS capabilities, e.g. # of RX rings, and # of indirect + * table entries. + */ + error = hn_rndis_query_rsscaps(sc, &rxr_cnt); + if (error) { + /* No RSS; this is benign. */ + *nsubch = 0; + return (0); + } + if (bootverbose) { + if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", + rxr_cnt, nchan); + } + + /* Never ask for more channels than the RX ring count reported by the query. */ if (nchan > rxr_cnt) + nchan = rxr_cnt; + if (nchan == 1) { + if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); + *nsubch = 0; + return (0); + } + + /* + * Allocate sub-channels from NVS. + */ + *nsubch = nchan - 1; + error = hn_nvs_alloc_subchans(sc, nsubch); + if (error || *nsubch == 0) { + /* Failed to allocate sub-channels. */ + *nsubch = 0; + return (0); + } + + /* + * Wait for all sub-channels to become ready before moving on. + */ + subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); + vmbus_subchan_rel(subchans, *nsubch); + return (0); +} + /* Return false when HN_FLAG_ERRORS is set on sc or any created RX ring is flagged HN_RX_FLAG_BR_REF; return true otherwise. */ +static bool +hn_synth_attachable(const struct hn_softc *sc) +{ + int i; + + if (sc->hn_flags & HN_FLAG_ERRORS) + return (false); + + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; + + if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) + return (false); + } + return (true); +} + +/* + * Make sure that the RX filter is zero after the successful + * RNDIS initialization. 
+ * + * NOTE: + * Under certain conditions on certain versions of Hyper-V, + * the RNDIS rxfilter is _not_ zero on the hypervisor side + * after the successful RNDIS initialization, which breaks + * the assumption of any following code (well, it breaks the + * RNDIS API contract actually). Clear the RNDIS rxfilter + * explicitly, drain packets sneaking through, and drain the + * interrupt taskqueues scheduled due to the stealth packets. + */ +static void +hn_rndis_init_fixat(struct hn_softc *sc, int nchan) +{ + + hn_disable_rx(sc); + hn_drain_rxtx(sc, nchan); +} + +static int +hn_synth_attach(struct hn_softc *sc, int mtu) +{ +#define ATTACHED_NVS 0x0002 +#define ATTACHED_RNDIS 0x0004 + + struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; + int error, nsubch, nchan = 1, i, rndis_inited; + uint32_t old_caps, attached = 0; + + KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, + ("synthetic parts were attached")); + + if (!hn_synth_attachable(sc)) + return (ENXIO); + + /* Save capabilities for later verification. */ + old_caps = sc->hn_caps; + sc->hn_caps = 0; + + /* Clear RSS stuffs. */ + sc->hn_rss_ind_size = 0; + sc->hn_rss_hash = 0; + sc->hn_rss_hcap = 0; + + /* + * Attach the primary channel _before_ attaching NVS and RNDIS. + */ + error = hn_chan_attach(sc, sc->hn_prichan); + if (error) + goto failed; + + /* + * Attach NVS. + */ + error = hn_nvs_attach(sc, mtu); + if (error) + goto failed; + attached |= ATTACHED_NVS; + + /* + * Attach RNDIS _after_ NVS is attached. + */ + error = hn_rndis_attach(sc, mtu, &rndis_inited); + if (rndis_inited) + attached |= ATTACHED_RNDIS; + if (error) + goto failed; + + /* + * Make sure capabilities are not changed. + */ + if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { + if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", + old_caps, sc->hn_caps); + error = ENXIO; + goto failed; + } + + /* + * Allocate sub-channels for multi-TX/RX rings. 
+ * + * NOTE: + * The # of RX rings that can be used is equivalent to the # of + * channels to be requested. + */ + nsubch = sc->hn_rx_ring_cnt - 1; + error = hn_synth_alloc_subchans(sc, &nsubch); + if (error) + goto failed; + /* NOTE: _Full_ synthetic parts detach is required now. */ + sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; + + /* + * Set the # of TX/RX rings that could be used according to + * the # of channels that NVS offered. + */ + nchan = nsubch + 1; + hn_set_ring_inuse(sc, nchan); + if (nchan == 1) { + /* Only the primary channel can be used; done */ + goto back; + } + + /* + * Attach the sub-channels. + * + * NOTE: hn_set_ring_inuse() _must_ have been called. + */ + error = hn_attach_subchans(sc); + if (error) + goto failed; + + /* + * Configure RSS key and indirect table _after_ all sub-channels + * are attached. + */ + if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { + /* + * RSS key is not set yet; set it to the default RSS key. + */ + if (bootverbose) + if_printf(sc->hn_ifp, "setup default RSS key\n"); +#ifdef RSS + rss_getkey(rss->rss_key); +#else + memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); +#endif + sc->hn_flags |= HN_FLAG_HAS_RSSKEY; + } + + if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { + /* + * RSS indirect table is not set yet; set it up in round- + * robin fashion. + */ + if (bootverbose) { + if_printf(sc->hn_ifp, "setup default RSS indirect " + "table\n"); + } + for (i = 0; i < NDIS_HASH_INDCNT; ++i) { + uint32_t subidx; + +#ifdef RSS + subidx = rss_get_indirection_to_bucket(i); +#else + subidx = i; +#endif + rss->rss_ind[i] = subidx % nchan; + } + sc->hn_flags |= HN_FLAG_HAS_RSSIND; + } else { + /* + * # of usable channels may be changed, so we have to + * make sure that all entries in RSS indirect table + * are valid. + * + * NOTE: hn_set_ring_inuse() _must_ have been called. 
+ */ + hn_rss_ind_fixup(sc); + } + + sc->hn_rss_hash = sc->hn_rss_hcap; + if ((sc->hn_flags & HN_FLAG_RXVF) || + (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { + /* NOTE: Don't reconfigure RSS; will do immediately. */ + hn_vf_rss_fixup(sc, false); + } + error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); + if (error) + goto failed; +back: + /* + * Fixup transmission aggregation setup. + */ + hn_set_txagg(sc); + hn_rndis_init_fixat(sc, nchan); + return (0); + +failed: + if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { + hn_rndis_init_fixat(sc, nchan); + hn_synth_detach(sc); + } else { + if (attached & ATTACHED_RNDIS) { + hn_rndis_init_fixat(sc, nchan); + hn_rndis_detach(sc); + } + if (attached & ATTACHED_NVS) + hn_nvs_detach(sc); + hn_chan_detach(sc, sc->hn_prichan); + /* Restore old capabilities. */ + sc->hn_caps = old_caps; + } + return (error); + +#undef ATTACHED_RNDIS +#undef ATTACHED_NVS +} + +/* + * NOTE: + * The interface must have been suspended though hn_suspend(), before + * this function get called. + */ +static void +hn_synth_detach(struct hn_softc *sc) +{ + + KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, + ("synthetic parts were not attached")); + + /* Detach the RNDIS first. */ + hn_rndis_detach(sc); + + /* Detach NVS. */ + hn_nvs_detach(sc); + + /* Detach all of the channels. */ + hn_detach_allchans(sc); + + if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { + /* + * Host is post-Win2016, disconnect RXBUF from primary channel here. + */ + int error; + + error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, + sc->hn_rxbuf_gpadl); + if (error) { + if_printf(sc->hn_ifp, + "rxbuf gpadl disconn failed: %d\n", error); + sc->hn_flags |= HN_FLAG_RXBUF_REF; + } + sc->hn_rxbuf_gpadl = 0; + } + + if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { + /* + * Host is post-Win2016, disconnect chimney sending buffer from + * primary channel here. 
+ */ + int error; + + error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, + sc->hn_chim_gpadl); + if (error) { + if_printf(sc->hn_ifp, + "chim gpadl disconn failed: %d\n", error); + sc->hn_flags |= HN_FLAG_CHIM_REF; + } + sc->hn_chim_gpadl = 0; + } + sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; +} + +static void +hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) +{ + KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, + ("invalid ring count %d", ring_cnt)); + + if (sc->hn_tx_ring_cnt > ring_cnt) + sc->hn_tx_ring_inuse = ring_cnt; + else + sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; + sc->hn_rx_ring_inuse = ring_cnt; + +#ifdef RSS + if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { + if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " + "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, + rss_getnumbuckets()); + } +#endif + + if (bootverbose) { + if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", + sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); + } +} + +static void +hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) +{ + + /* + * NOTE: + * The TX bufring will not be drained by the hypervisor, + * if the primary channel is revoked. + */ + while (!vmbus_chan_rx_empty(chan) || + (!vmbus_chan_is_revoked(sc->hn_prichan) && + !vmbus_chan_tx_empty(chan))) + pause("waitch", 1); + vmbus_chan_intr_drain(chan); +} + +static void +hn_disable_rx(struct hn_softc *sc) +{ + + /* + * Disable RX by clearing RX filter forcefully. + */ + sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; + hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ + + /* + * Give RNDIS enough time to flush all pending data packets. + */ + pause("waitrx", (200 * hz) / 1000); +} + +/* + * NOTE: + * RX/TX _must_ have been suspended/disabled, before this function + * is called. + */ +static void +hn_drain_rxtx(struct hn_softc *sc, int nchan) +{ + struct vmbus_channel **subch = NULL; + int nsubch; + + /* + * Drain RX/TX bufrings and interrupts. 
+ */ + nsubch = nchan - 1; + if (nsubch > 0) + subch = vmbus_subchan_get(sc->hn_prichan, nsubch); + + if (subch != NULL) { + int i; + + for (i = 0; i < nsubch; ++i) + hn_chan_drain(sc, subch[i]); + } + hn_chan_drain(sc, sc->hn_prichan); + + if (subch != NULL) + vmbus_subchan_rel(subch, nsubch); +} + +static void +hn_suspend_data(struct hn_softc *sc) +{ + struct hn_tx_ring *txr; + int i; + + HN_LOCK_ASSERT(sc); + + /* + * Suspend TX. + */ + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { + txr = &sc->hn_tx_ring[i]; + + mtx_lock(&txr->hn_tx_lock); + txr->hn_suspended = 1; + mtx_unlock(&txr->hn_tx_lock); + /* No one is able send more packets now. */ + + /* + * Wait for all pending sends to finish. + * + * NOTE: + * We will _not_ receive all pending send-done, if the + * primary channel is revoked. + */ + while (hn_tx_ring_pending(txr) && + !vmbus_chan_is_revoked(sc->hn_prichan)) + pause("hnwtx", 1 /* 1 tick */); + } + + /* + * Disable RX. + */ + hn_disable_rx(sc); + + /* + * Drain RX/TX. + */ + hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); + + /* + * Drain any pending TX tasks. + * + * NOTE: + * The above hn_drain_rxtx() can dispatch TX tasks, so the TX + * tasks will have to be drained _after_ the above hn_drain_rxtx(). + */ + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { + txr = &sc->hn_tx_ring[i]; + + taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); + taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); + } +} + +static void +hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) +{ + + ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; +} + +static void +hn_suspend_mgmt(struct hn_softc *sc) +{ + struct task task; + + HN_LOCK_ASSERT(sc); + + /* + * Make sure that hn_mgmt_taskq0 can nolonger be accessed + * through hn_mgmt_taskq. + */ + TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); + vmbus_chan_run_task(sc->hn_prichan, &task); + + /* + * Make sure that all pending management tasks are completed. 
+ */ + taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); + taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); + taskqueue_drain_all(sc->hn_mgmt_taskq0); +} + +static void +hn_suspend(struct hn_softc *sc) +{ + + /* Disable polling. */ + hn_polling(sc, 0); + + /* + * If the non-transparent mode VF is activated, the synthetic + * device is receiving packets, so the data path of the + * synthetic device must be suspended. + */ + if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || + (sc->hn_flags & HN_FLAG_RXVF)) + hn_suspend_data(sc); + hn_suspend_mgmt(sc); +} + +static void +hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) +{ + int i; + + KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, + ("invalid TX ring count %d", tx_ring_cnt)); + + for (i = 0; i < tx_ring_cnt; ++i) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; + + mtx_lock(&txr->hn_tx_lock); + txr->hn_suspended = 0; + mtx_unlock(&txr->hn_tx_lock); + } +} + +static void +hn_resume_data(struct hn_softc *sc) +{ + int i; + + HN_LOCK_ASSERT(sc); + + /* + * Re-enable RX. + */ + hn_rxfilter_config(sc); + + /* + * Make sure to clear suspend status on "all" TX rings, + * since hn_tx_ring_inuse can be changed after + * hn_suspend_data(). + */ + hn_resume_tx(sc, sc->hn_tx_ring_cnt); + +#ifdef HN_IFSTART_SUPPORT + if (!hn_use_if_start) +#endif + { + /* + * Flush unused drbrs, since hn_tx_ring_inuse may be + * reduced. + */ + for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) + hn_tx_ring_qflush(&sc->hn_tx_ring[i]); + } + + /* + * Kick start TX. + */ + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; + + /* + * Use txeof task, so that any pending oactive can be + * cleared properly. + */ + taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); + } +} + +static void +hn_resume_mgmt(struct hn_softc *sc) +{ + + sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; + + /* + * Kick off network change detection, if it was pending. 
+ * If no network change was pending, start link status + * checks, which is more lightweight than network change + * detection. + */ + if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) + hn_change_network(sc); + else + hn_update_link_status(sc); +} + +static void +hn_resume(struct hn_softc *sc) +{ + + /* + * If the non-transparent mode VF is activated, the synthetic + * device have to receive packets, so the data path of the + * synthetic device must be resumed. + */ + if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || + (sc->hn_flags & HN_FLAG_RXVF)) + hn_resume_data(sc); + + /* + * Don't resume link status change if VF is attached/activated. + * - In the non-transparent VF mode, the synthetic device marks + * link down until the VF is deactivated; i.e. VF is down. + * - In transparent VF mode, VF's media status is used until + * the VF is detached. + */ + if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && + !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) + hn_resume_mgmt(sc); + + /* + * Re-enable polling if this interface is running and + * the polling is requested. + */ + if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) + hn_polling(sc, sc->hn_pollhz); +} + +static void +hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) +{ + const struct rndis_status_msg *msg; + int ofs; + + if (dlen < sizeof(*msg)) { + if_printf(sc->hn_ifp, "invalid RNDIS status\n"); + return; + } + msg = data; + + switch (msg->rm_status) { + case RNDIS_STATUS_MEDIA_CONNECT: + case RNDIS_STATUS_MEDIA_DISCONNECT: + hn_update_link_status(sc); + break; + + case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: + case RNDIS_STATUS_LINK_SPEED_CHANGE: + /* Not really useful; ignore. 
*/ + break; + + case RNDIS_STATUS_NETWORK_CHANGE: + ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); + if (dlen < ofs + msg->rm_stbuflen || + msg->rm_stbuflen < sizeof(uint32_t)) { + if_printf(sc->hn_ifp, "network changed\n"); + } else { + uint32_t change; + + memcpy(&change, ((const uint8_t *)msg) + ofs, + sizeof(change)); + if_printf(sc->hn_ifp, "network changed, change %u\n", + change); + } + hn_change_network(sc); + break; + + default: + if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", + msg->rm_status); + break; + } +} + +static int +hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) +{ + const struct rndis_pktinfo *pi = info_data; + uint32_t mask = 0; + + while (info_dlen != 0) { + const void *data; + uint32_t dlen; + + if (__predict_false(info_dlen < sizeof(*pi))) + return (EINVAL); + if (__predict_false(info_dlen < pi->rm_size)) + return (EINVAL); + info_dlen -= pi->rm_size; + + if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) + return (EINVAL); + if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) + return (EINVAL); + dlen = pi->rm_size - pi->rm_pktinfooffset; + data = pi->rm_data; + + if (pi->rm_internal == 1) { + switch (pi->rm_type) { + case NDIS_PKTINFO_IT_PKTINFO_ID: + if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) + return (EINVAL); + info->pktinfo_id = + (const struct packet_info_id *)data; + mask |= HN_RXINFO_PKTINFO_ID; + break; + + default: + goto next; + } + } else { + switch (pi->rm_type) { + case NDIS_PKTINFO_TYPE_VLAN: + if (__predict_false(dlen + < NDIS_VLAN_INFO_SIZE)) + return (EINVAL); + info->vlan_info = (const uint32_t *)data; + mask |= HN_RXINFO_VLAN; + break; + + case NDIS_PKTINFO_TYPE_CSUM: + if (__predict_false(dlen + < NDIS_RXCSUM_INFO_SIZE)) + return (EINVAL); + info->csum_info = (const uint32_t *)data; + mask |= HN_RXINFO_CSUM; + break; + + case HN_NDIS_PKTINFO_TYPE_HASHVAL: + if (__predict_false(dlen + < HN_NDIS_HASH_VALUE_SIZE)) + return (EINVAL); + info->hash_value = 
(const uint32_t *)data; + mask |= HN_RXINFO_HASHVAL; + break; + + case HN_NDIS_PKTINFO_TYPE_HASHINF: + if (__predict_false(dlen + < HN_NDIS_HASH_INFO_SIZE)) + return (EINVAL); + info->hash_info = (const uint32_t *)data; + mask |= HN_RXINFO_HASHINF; + break; + + default: + goto next; + } + } + + if (mask == HN_RXINFO_ALL) { + /* All found; done */ + break; + } +next: + pi = (const struct rndis_pktinfo *) + ((const uint8_t *)pi + pi->rm_size); + } + + /* + * Final fixup. + * - If there is no hash value, invalidate the hash info. + */ + if ((mask & HN_RXINFO_HASHVAL) == 0) + info->hash_info = NULL; + return (0); +} + +static __inline bool +hn_rndis_check_overlap(int off, int len, int check_off, int check_len) +{ + + if (off < check_off) { + if (__predict_true(off + len <= check_off)) + return (false); + } else if (off > check_off) { + if (__predict_true(check_off + check_len <= off)) + return (false); + } + return (true); +} + +static __inline void +hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data, + uint32_t len, struct hn_rxinfo *info) +{ + uint32_t cnt = rxr->rsc.cnt; + + if (cnt) { + rxr->rsc.pktlen += len; + } else { + rxr->rsc.vlan_info = info->vlan_info; + rxr->rsc.csum_info = info->csum_info; + rxr->rsc.hash_info = info->hash_info; + rxr->rsc.hash_value = info->hash_value; + rxr->rsc.pktlen = len; + } + + rxr->rsc.frag_data[cnt] = data; + rxr->rsc.frag_len[cnt] = len; + rxr->rsc.cnt++; +} + +static void +hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) +{ + const struct rndis_packet_msg *pkt; + struct hn_rxinfo info; + int data_off, pktinfo_off, data_len, pktinfo_len; + bool rsc_more= false; + + /* + * Check length. 
+ */ + if (__predict_false(dlen < sizeof(*pkt))) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); + return; + } + pkt = data; + + if (__predict_false(dlen < pkt->rm_len)) { + if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " + "dlen %d, msglen %u\n", dlen, pkt->rm_len); + return; + } + if (__predict_false(pkt->rm_len < + pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " + "msglen %u, data %u, oob %u, pktinfo %u\n", + pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, + pkt->rm_pktinfolen); + return; + } + if (__predict_false(pkt->rm_datalen == 0)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); + return; + } + + /* + * Check offests. + */ +#define IS_OFFSET_INVALID(ofs) \ + ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ + ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) + + /* XXX Hyper-V does not meet data offset alignment requirement */ + if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "data offset %u\n", pkt->rm_dataoffset); + return; + } + if (__predict_false(pkt->rm_oobdataoffset > 0 && + IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "oob offset %u\n", pkt->rm_oobdataoffset); + return; + } + if (__predict_true(pkt->rm_pktinfooffset > 0) && + __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "pktinfo offset %u\n", pkt->rm_pktinfooffset); + return; + } + +#undef IS_OFFSET_INVALID + + data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); + data_len = pkt->rm_datalen; + pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); + pktinfo_len = pkt->rm_pktinfolen; + + /* + * Check OOB coverage. 
+ */ + if (__predict_false(pkt->rm_oobdatalen != 0)) { + int oob_off, oob_len; + + if_printf(rxr->hn_ifp, "got oobdata\n"); + oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); + oob_len = pkt->rm_oobdatalen; + + if (__predict_false(oob_off + oob_len > pkt->rm_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "oob overflow, msglen %u, oob abs %d len %d\n", + pkt->rm_len, oob_off, oob_len); + return; + } + + /* + * Check against data. + */ + if (hn_rndis_check_overlap(oob_off, oob_len, + data_off, data_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "oob overlaps data, oob abs %d len %d, " + "data abs %d len %d\n", + oob_off, oob_len, data_off, data_len); + return; + } + + /* + * Check against pktinfo. + */ + if (pktinfo_len != 0 && + hn_rndis_check_overlap(oob_off, oob_len, + pktinfo_off, pktinfo_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "oob overlaps pktinfo, oob abs %d len %d, " + "pktinfo abs %d len %d\n", + oob_off, oob_len, pktinfo_off, pktinfo_len); + return; + } + } + + /* + * Check per-packet-info coverage and find useful per-packet-info. + */ + info.vlan_info = NULL; + info.csum_info = NULL; + info.hash_info = NULL; + info.pktinfo_id = NULL; + + if (__predict_true(pktinfo_len != 0)) { + bool overlap; + int error; + + if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "pktinfo overflow, msglen %u, " + "pktinfo abs %d len %d\n", + pkt->rm_len, pktinfo_off, pktinfo_len); + return; + } + + /* + * Check packet info coverage. + */ + overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, + data_off, data_len); + if (__predict_false(overlap)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "pktinfo overlap data, pktinfo abs %d len %d, " + "data abs %d len %d\n", + pktinfo_off, pktinfo_len, data_off, data_len); + return; + } + + /* + * Find useful per-packet-info. 
+ */ + error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, + pktinfo_len, &info); + if (__predict_false(error)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " + "pktinfo\n"); + return; + } + } + + if (__predict_false(data_off + data_len > pkt->rm_len)) { + if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " + "data overflow, msglen %u, data abs %d len %d\n", + pkt->rm_len, data_off, data_len); + return; + } + + /* Identify RSC fragments, drop invalid packets */ + if ((info.pktinfo_id != NULL) && + (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { + if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { + rxr->rsc.cnt = 0; + rxr->hn_rsc_pkts++; + } else if (rxr->rsc.cnt == 0) + goto drop; + + rsc_more = true; + + if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) + rsc_more = false; + + if (rsc_more && rxr->rsc.is_last) + goto drop; + } else { + rxr->rsc.cnt = 0; + } + + if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) + goto drop; + + /* Store data in per rx ring structure */ + hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, + data_len, &info); + + if (rsc_more) + return; + + hn_rxpkt(rxr); + rxr->rsc.cnt = 0; + return; +drop: + rxr->hn_rsc_drop++; + return; +} + +static __inline void +hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) +{ + const struct rndis_msghdr *hdr; + + if (__predict_false(dlen < sizeof(*hdr))) { + if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); + return; + } + hdr = data; + + if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { + /* Hot data path. */ + hn_rndis_rx_data(rxr, data, dlen); + /* Done! 
*/ + return; + } + + if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) + hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); + else + hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); +} + +static void +hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) +{ + const struct hn_nvs_hdr *hdr; + + if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { + if_printf(sc->hn_ifp, "invalid nvs notify\n"); + return; + } + hdr = VMBUS_CHANPKT_CONST_DATA(pkt); + + if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { + /* Useless; ignore */ + return; + } + if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); +} + +static void +hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, + const struct vmbus_chanpkt_hdr *pkt) +{ + struct hn_nvs_sendctx *sndc; + + sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; + sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), + VMBUS_CHANPKT_DATALEN(pkt)); + /* + * NOTE: + * 'sndc' CAN NOT be accessed anymore, since it can be freed by + * its callback. + */ +} + +static void +hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, + const struct vmbus_chanpkt_hdr *pkthdr) +{ + struct epoch_tracker et; + const struct vmbus_chanpkt_rxbuf *pkt; + const struct hn_nvs_hdr *nvs_hdr; + int count, i, hlen; + + if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { + if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); + return; + } + nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); + + /* Make sure that this is a RNDIS message. 
*/ + if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { + if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", + nvs_hdr->nvs_type); + return; + } + + hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); + if (__predict_false(hlen < sizeof(*pkt))) { + if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); + return; + } + pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; + + if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { + if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", + pkt->cp_rxbuf_id); + return; + } + + count = pkt->cp_rxbuf_cnt; + if (__predict_false(hlen < + __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { + if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); + return; + } + + NET_EPOCH_ENTER(et); + /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ + for (i = 0; i < count; ++i) { + int ofs, len; + + ofs = pkt->cp_rxbuf[i].rb_ofs; + len = pkt->cp_rxbuf[i].rb_len; + if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { + if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " + "ofs %d, len %d\n", i, ofs, len); + continue; + } + + rxr->rsc.is_last = (i == (count - 1)); + hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); + } + NET_EPOCH_EXIT(et); + + /* + * Ack the consumed RXBUF associated w/ this channel packet, + * so that this RXBUF can be recycled by the hypervisor. + */ + hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); +} + +static void +hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, + uint64_t tid) +{ + struct hn_nvs_rndis_ack ack; + int retries, error; + + ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; + ack.nvs_status = HN_NVS_STATUS_OK; + + retries = 0; +again: + error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, + VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); + if (__predict_false(error == EAGAIN)) { + /* + * NOTE: + * This should _not_ happen in real world, since the + * consumption of the TX bufring from the TX path is + * controlled. 
+ */ + if (rxr->hn_ack_failed == 0) + if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); + rxr->hn_ack_failed++; + retries++; + if (retries < 10) { + DELAY(100); + goto again; + } + /* RXBUF leaks! */ + if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); + } +} + +static void +hn_chan_callback(struct vmbus_channel *chan, void *xrxr) +{ + struct hn_rx_ring *rxr = xrxr; + struct hn_softc *sc = rxr->hn_ifp->if_softc; + + for (;;) { + struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; + int error, pktlen; + + pktlen = rxr->hn_pktbuf_len; + error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); + if (__predict_false(error == ENOBUFS)) { + void *nbuf; + int nlen; + + /* + * Expand channel packet buffer. + * + * XXX + * Use M_WAITOK here, since allocation failure + * is fatal. + */ + nlen = rxr->hn_pktbuf_len * 2; + while (nlen < pktlen) + nlen *= 2; + nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); + + if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", + rxr->hn_pktbuf_len, nlen); + + free(rxr->hn_pktbuf, M_DEVBUF); + rxr->hn_pktbuf = nbuf; + rxr->hn_pktbuf_len = nlen; + /* Retry! */ + continue; + } else if (__predict_false(error == EAGAIN)) { + /* No more channel packets; done! */ + break; + } + KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); + + switch (pkt->cph_type) { + case VMBUS_CHANPKT_TYPE_COMP: + hn_nvs_handle_comp(sc, chan, pkt); + break; + + case VMBUS_CHANPKT_TYPE_RXBUF: + hn_nvs_handle_rxbuf(rxr, chan, pkt); + break; + + case VMBUS_CHANPKT_TYPE_INBAND: + hn_nvs_handle_notify(sc, pkt); + break; + + default: + if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", + pkt->cph_type); + break; + } + } + hn_chan_rollup(rxr, rxr->hn_txr); +} + +static void +hn_sysinit(void *arg __unused) +{ + int i; + + hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); + +#ifdef HN_IFSTART_SUPPORT + /* + * Don't use ifnet.if_start if transparent VF mode is requested; + * mainly due to the IFF_DRV_OACTIVE flag. 
+ */ + if (hn_xpnt_vf && hn_use_if_start) { + hn_use_if_start = 0; + printf("hn: tranparent VF mode, if_transmit will be used, " + "instead of if_start\n"); + } +#endif + if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { + printf("hn: invalid transparent VF attach routing " + "wait timeout %d, reset to %d\n", + hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); + hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; + } + + /* + * Initialize VF map. + */ + rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); + hn_vfmap_size = HN_VFMAP_SIZE_DEF; + hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF, + M_WAITOK | M_ZERO); + + /* + * Fix the # of TX taskqueues. + */ + if (hn_tx_taskq_cnt <= 0) + hn_tx_taskq_cnt = 1; + else if (hn_tx_taskq_cnt > mp_ncpus) + hn_tx_taskq_cnt = mp_ncpus; + + /* + * Fix the TX taskqueue mode. + */ + switch (hn_tx_taskq_mode) { + case HN_TX_TASKQ_M_INDEP: + case HN_TX_TASKQ_M_GLOBAL: + case HN_TX_TASKQ_M_EVTTQ: + break; + default: + hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; + break; + } + + if (vm_guest != VM_GUEST_HV) + return; + + if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) + return; + + hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), + M_DEVBUF, M_WAITOK); + for (i = 0; i < hn_tx_taskq_cnt; ++i) { + hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, + taskqueue_thread_enqueue, &hn_tx_taskque[i]); + taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, + "hn tx%d", i); + } +} +SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); + +static void +hn_sysuninit(void *arg __unused) +{ + + if (hn_tx_taskque != NULL) { + int i; + + for (i = 0; i < hn_tx_taskq_cnt; ++i) + taskqueue_free(hn_tx_taskque[i]); + free(hn_tx_taskque, M_DEVBUF); + } + + if (hn_vfmap != NULL) + free(hn_vfmap, M_DEVBUF); + rm_destroy(&hn_vfmap_lock); + + counter_u64_free(hn_udpcs_fixup); +} +SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); diff --git a/sys/dev/hyperv/netvsc/if_hnreg.h 
b/sys/dev/hyperv/netvsc/if_hnreg.h new file mode 100644 index 000000000000..54db556cc56d --- /dev/null +++ b/sys/dev/hyperv/netvsc/if_hnreg.h @@ -0,0 +1,270 @@ +/*- + * Copyright (c) 2016-2017 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IF_HNREG_H_ +#define _IF_HNREG_H_ + +#include <sys/param.h> +#include <sys/systm.h> + +/* + * NDIS protocol version numbers + */ +#define HN_NDIS_VERSION_6_1 0x00060001 +#define HN_NDIS_VERSION_6_20 0x00060014 +#define HN_NDIS_VERSION_6_30 0x0006001e +#define HN_NDIS_VERSION_MAJOR(ver) (((ver) & 0xffff0000) >> 16) +#define HN_NDIS_VERSION_MINOR(ver) ((ver) & 0xffff) + +/* + * NVS versions. 
+ */ +#define HN_NVS_VERSION_1 0x00002 +#define HN_NVS_VERSION_2 0x30002 +#define HN_NVS_VERSION_4 0x40000 +#define HN_NVS_VERSION_5 0x50000 +#define HN_NVS_VERSION_6 0x60000 +#define HN_NVS_VERSION_61 0x60001 + +#define HN_NVS_RXBUF_SIG 0xcafe +#define HN_NVS_CHIM_SIG 0xface + +#define HN_NVS_CHIM_IDX_INVALID 0xffffffff + +#define HN_NVS_RNDIS_MTYPE_DATA 0 +#define HN_NVS_RNDIS_MTYPE_CTRL 1 + +/* + * NVS message transacion status codes. + */ +#define HN_NVS_STATUS_OK 1 +#define HN_NVS_STATUS_FAILED 2 + +/* + * NVS request/response message types. + */ +#define HN_NVS_TYPE_INIT 1 +#define HN_NVS_TYPE_INIT_RESP 2 +#define HN_NVS_TYPE_NDIS_INIT 100 +#define HN_NVS_TYPE_RXBUF_CONN 101 +#define HN_NVS_TYPE_RXBUF_CONNRESP 102 +#define HN_NVS_TYPE_RXBUF_DISCONN 103 +#define HN_NVS_TYPE_CHIM_CONN 104 +#define HN_NVS_TYPE_CHIM_CONNRESP 105 +#define HN_NVS_TYPE_CHIM_DISCONN 106 +#define HN_NVS_TYPE_RNDIS 107 +#define HN_NVS_TYPE_RNDIS_ACK 108 +#define HN_NVS_TYPE_NDIS_CONF 125 +#define HN_NVS_TYPE_VFASSOC_NOTE 128 /* notification */ +#define HN_NVS_TYPE_SET_DATAPATH 129 +#define HN_NVS_TYPE_SUBCH_REQ 133 +#define HN_NVS_TYPE_SUBCH_RESP 133 /* same as SUBCH_REQ */ +#define HN_NVS_TYPE_TXTBL_NOTE 134 /* notification */ + +/* + * Any size less than this one will _not_ work, e.g. hn_nvs_init + * only has 12B valid data, however, if only 12B data were sent, + * Hypervisor would never reply. 
+ */ +#define HN_NVS_REQSIZE_MIN 32 + +/* NVS message common header */ +struct hn_nvs_hdr { + uint32_t nvs_type; +} __packed; + +struct hn_nvs_init { + uint32_t nvs_type; /* HN_NVS_TYPE_INIT */ + uint32_t nvs_ver_min; + uint32_t nvs_ver_max; + uint8_t nvs_rsvd[20]; + uint8_t nvs_msg_pad[8]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_init) >= HN_NVS_REQSIZE_MIN); + +struct hn_nvs_init_resp { + uint32_t nvs_type; /* HN_NVS_TYPE_INIT_RESP */ + uint32_t nvs_ver; /* deprecated */ + uint32_t nvs_rsvd; + uint32_t nvs_status; /* HN_NVS_STATUS_ */ +} __packed; + +/* No reponse */ +struct hn_nvs_ndis_conf { + uint32_t nvs_type; /* HN_NVS_TYPE_NDIS_CONF */ + uint32_t nvs_mtu; + uint32_t nvs_rsvd; + uint64_t nvs_caps; /* HN_NVS_NDIS_CONF_ */ + uint8_t nvs_rsvd1[12]; + uint8_t nvs_msg_pad[8]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_ndis_conf) >= HN_NVS_REQSIZE_MIN); + +#define HN_NVS_NDIS_CONF_SRIOV 0x0004 +#define HN_NVS_NDIS_CONF_VLAN 0x0008 +#define HN_NVS_NDIS_CONF_RSC 0x0080 + +/* No response */ +struct hn_nvs_ndis_init { + uint32_t nvs_type; /* HN_NVS_TYPE_NDIS_INIT */ + uint32_t nvs_ndis_major; /* NDIS_VERSION_MAJOR_ */ + uint32_t nvs_ndis_minor; /* NDIS_VERSION_MINOR_ */ + uint8_t nvs_rsvd[20]; + uint8_t nvs_msg_pad[8]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_ndis_init) >= HN_NVS_REQSIZE_MIN); + +#define HN_NVS_DATAPATH_SYNTH 0 +#define HN_NVS_DATAPATH_VF 1 + +/* No response */ +struct hn_nvs_datapath { + uint32_t nvs_type; /* HN_NVS_TYPE_SET_DATAPATH */ + uint32_t nvs_active_path;/* HN_NVS_DATAPATH_* */ + uint32_t nvs_rsvd[6]; + uint8_t nvs_msg_pad[8]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_datapath) >= HN_NVS_REQSIZE_MIN); + +struct hn_nvs_rxbuf_conn { + uint32_t nvs_type; /* HN_NVS_TYPE_RXBUF_CONN */ + uint32_t nvs_gpadl; /* RXBUF vmbus GPADL */ + uint16_t nvs_sig; /* HN_NVS_RXBUF_SIG */ + uint8_t nvs_rsvd[22]; + uint8_t nvs_msg_pad[8]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_rxbuf_conn) >= HN_NVS_REQSIZE_MIN); + +struct hn_nvs_rxbuf_sect { + 
uint32_t nvs_start; + uint32_t nvs_slotsz; + uint32_t nvs_slotcnt; + uint32_t nvs_end; +} __packed; + +struct hn_nvs_rxbuf_connresp { + uint32_t nvs_type; /* HN_NVS_TYPE_RXBUF_CONNRESP */ + uint32_t nvs_status; /* HN_NVS_STATUS_ */ + uint32_t nvs_nsect; /* # of elem in nvs_sect */ + struct hn_nvs_rxbuf_sect nvs_sect[]; +} __packed; + +/* No response */ +struct hn_nvs_rxbuf_disconn { + uint32_t nvs_type; /* HN_NVS_TYPE_RXBUF_DISCONN */ + uint16_t nvs_sig; /* HN_NVS_RXBUF_SIG */ + uint8_t nvs_rsvd[26]; + uint8_t nvs_msg_pad[8]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_rxbuf_disconn) >= HN_NVS_REQSIZE_MIN); + +struct hn_nvs_chim_conn { + uint32_t nvs_type; /* HN_NVS_TYPE_CHIM_CONN */ + uint32_t nvs_gpadl; /* chimney buf vmbus GPADL */ + uint16_t nvs_sig; /* NDIS_NVS_CHIM_SIG */ + uint8_t nvs_rsvd[22]; + uint8_t nvs_msg_pad[8]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_chim_conn) >= HN_NVS_REQSIZE_MIN); + +struct hn_nvs_chim_connresp { + uint32_t nvs_type; /* HN_NVS_TYPE_CHIM_CONNRESP */ + uint32_t nvs_status; /* HN_NVS_STATUS_ */ + uint32_t nvs_sectsz; /* section size */ +} __packed; + +/* No response */ +struct hn_nvs_chim_disconn { + uint32_t nvs_type; /* HN_NVS_TYPE_CHIM_DISCONN */ + uint16_t nvs_sig; /* HN_NVS_CHIM_SIG */ + uint8_t nvs_rsvd[26]; + uint8_t nvs_msg_pad[8]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_chim_disconn) >= HN_NVS_REQSIZE_MIN); + +#define HN_NVS_SUBCH_OP_ALLOC 1 + +struct hn_nvs_subch_req { + uint32_t nvs_type; /* HN_NVS_TYPE_SUBCH_REQ */ + uint32_t nvs_op; /* HN_NVS_SUBCH_OP_ */ + uint32_t nvs_nsubch; + uint8_t nvs_rsvd[20]; + uint8_t nvs_msg_pad[8]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_subch_req) >= HN_NVS_REQSIZE_MIN); + +struct hn_nvs_subch_resp { + uint32_t nvs_type; /* HN_NVS_TYPE_SUBCH_RESP */ + uint32_t nvs_status; /* HN_NVS_STATUS_ */ + uint32_t nvs_nsubch; +} __packed; + +struct hn_nvs_rndis { + uint32_t nvs_type; /* HN_NVS_TYPE_RNDIS */ + uint32_t nvs_rndis_mtype;/* HN_NVS_RNDIS_MTYPE_ */ + /* + * Chimney sending 
buffer index and size. + * + * NOTE: + * If nvs_chim_idx is set to HN_NVS_CHIM_IDX_INVALID + * and nvs_chim_sz is set to 0, then chimney sending + * buffer is _not_ used by this RNDIS message. + */ + uint32_t nvs_chim_idx; + uint32_t nvs_chim_sz; + uint8_t nvs_rsvd[16]; + uint8_t nvs_msg_pad[8]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_rndis) >= HN_NVS_REQSIZE_MIN); + +struct hn_nvs_rndis_ack { + uint32_t nvs_type; /* HN_NVS_TYPE_RNDIS_ACK */ + uint32_t nvs_status; /* HN_NVS_STATUS_ */ + uint8_t nvs_rsvd[24]; + uint8_t nvs_msg_pad[8]; +} __packed; +CTASSERT(sizeof(struct hn_nvs_rndis_ack) >= HN_NVS_REQSIZE_MIN); + +/* + * RNDIS extension + */ + +/* Per-packet hash info */ +#define HN_NDIS_HASH_INFO_SIZE sizeof(uint32_t) +#define HN_NDIS_PKTINFO_TYPE_HASHINF NDIS_PKTINFO_TYPE_ORIG_NBLIST +/* NDIS_HASH_ */ + +/* Per-packet hash value */ +#define HN_NDIS_HASH_VALUE_SIZE sizeof(uint32_t) +#define HN_NDIS_PKTINFO_TYPE_HASHVAL NDIS_PKTINFO_TYPE_PKT_CANCELID + +/* Per-packet-info size */ +#define HN_RNDIS_PKTINFO_SIZE(dlen) \ + __offsetof(struct rndis_pktinfo, rm_data[dlen]) + +#endif /* !_IF_HNREG_H_ */ diff --git a/sys/dev/hyperv/netvsc/if_hnvar.h b/sys/dev/hyperv/netvsc/if_hnvar.h new file mode 100644 index 000000000000..27d93db5395e --- /dev/null +++ b/sys/dev/hyperv/netvsc/if_hnvar.h @@ -0,0 +1,335 @@ +/*- + * Copyright (c) 2016-2017 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IF_HNVAR_H_ +#define _IF_HNVAR_H_ + +#define HN_USE_TXDESC_BUFRING + +#define HN_CHIM_SIZE (15 * 1024 * 1024) + +#define HN_RXBUF_SIZE (31 * 1024 * 1024) +#define HN_RXBUF_SIZE_COMPAT (15 * 1024 * 1024) + +#define HN_MTU_MAX (65535 - ETHER_ADDR_LEN) + +#define HN_TXBR_SIZE (128 * PAGE_SIZE) +#define HN_RXBR_SIZE (128 * PAGE_SIZE) + +#define HN_XACT_REQ_PGCNT 2 +#define HN_XACT_RESP_PGCNT 2 +#define HN_XACT_REQ_SIZE (HN_XACT_REQ_PGCNT * PAGE_SIZE) +#define HN_XACT_RESP_SIZE (HN_XACT_RESP_PGCNT * PAGE_SIZE) + +#define HN_GPACNT_MAX 32 + +struct hn_txdesc; +#ifndef HN_USE_TXDESC_BUFRING +SLIST_HEAD(hn_txdesc_list, hn_txdesc); +#else +struct buf_ring; +#endif +struct hn_tx_ring; + +#define HN_NVS_RSC_MAX 562 /* Max RSC frags in one vmbus packet */ + +struct hn_rx_rsc { + const uint32_t *vlan_info; + const uint32_t *csum_info; + const uint32_t *hash_info; + const uint32_t *hash_value; + uint32_t cnt; /* fragment count */ + uint32_t pktlen; /* full packet length */ + uint8_t is_last; /* last fragment */ + const void *frag_data[HN_NVS_RSC_MAX]; + uint32_t frag_len[HN_NVS_RSC_MAX]; +}; + +struct hn_rx_ring { + struct ifnet *hn_ifp; + struct ifnet *hn_rxvf_ifp; /* SR-IOV VF for RX */ + struct hn_tx_ring 
*hn_txr; + void *hn_pktbuf; + int hn_pktbuf_len; + int hn_rx_flags; /* HN_RX_FLAG_ */ + uint32_t hn_mbuf_hash; /* NDIS_HASH_ */ + uint8_t *hn_rxbuf; /* shadow sc->hn_rxbuf */ + int hn_rx_idx; + struct hn_rx_rsc rsc; + + /* Trust csum verification on host side */ + int hn_trust_hcsum; /* HN_TRUST_HCSUM_ */ + struct lro_ctrl hn_lro; + + u_long hn_csum_ip; + u_long hn_csum_tcp; + u_long hn_csum_udp; + u_long hn_csum_trusted; + u_long hn_lro_tried; + u_long hn_small_pkts; + u_long hn_pkts; + u_long hn_rss_pkts; + u_long hn_ack_failed; + u_long hn_rsc_pkts; + u_long hn_rsc_drop; + + /* Rarely used stuffs */ + struct sysctl_oid *hn_rx_sysctl_tree; + + void *hn_br; /* TX/RX bufring */ + struct hyperv_dma hn_br_dma; + + struct vmbus_channel *hn_chan; +} __aligned(CACHE_LINE_SIZE); + +#define HN_TRUST_HCSUM_IP 0x0001 +#define HN_TRUST_HCSUM_TCP 0x0002 +#define HN_TRUST_HCSUM_UDP 0x0004 + +#define HN_RX_FLAG_ATTACHED 0x0001 +#define HN_RX_FLAG_BR_REF 0x0002 +#define HN_RX_FLAG_XPNT_VF 0x0004 +#define HN_RX_FLAG_UDP_HASH 0x0008 + +struct hn_tx_ring { +#ifndef HN_USE_TXDESC_BUFRING + struct mtx hn_txlist_spin; + struct hn_txdesc_list hn_txlist; +#else + struct buf_ring *hn_txdesc_br; +#endif + int hn_txdesc_cnt; + int hn_txdesc_avail; + u_short hn_has_txeof; + u_short hn_txdone_cnt; + + int hn_sched_tx; + void (*hn_txeof)(struct hn_tx_ring *); + struct taskqueue *hn_tx_taskq; + struct task hn_tx_task; + struct task hn_txeof_task; + + struct buf_ring *hn_mbuf_br; + int hn_oactive; + int hn_tx_idx; + int hn_tx_flags; + + struct mtx hn_tx_lock; + struct hn_softc *hn_sc; + struct vmbus_channel *hn_chan; + + int hn_direct_tx_size; + int hn_chim_size; + bus_dma_tag_t hn_tx_data_dtag; + uint64_t hn_csum_assist; + + /* Applied packet transmission aggregation limits. */ + int hn_agg_szmax; + short hn_agg_pktmax; + short hn_agg_align; + + /* Packet transmission aggregation states. 
*/ + struct hn_txdesc *hn_agg_txd; + int hn_agg_szleft; + short hn_agg_pktleft; + struct rndis_packet_msg *hn_agg_prevpkt; + + /* Temporary stats for each sends. */ + int hn_stat_size; + short hn_stat_pkts; + short hn_stat_mcasts; + + int (*hn_sendpkt)(struct hn_tx_ring *, struct hn_txdesc *); + int hn_suspended; + int hn_gpa_cnt; + struct vmbus_gpa hn_gpa[HN_GPACNT_MAX]; + + u_long hn_no_txdescs; + u_long hn_send_failed; + u_long hn_txdma_failed; + u_long hn_tx_collapsed; + u_long hn_tx_chimney_tried; + u_long hn_tx_chimney; + u_long hn_pkts; + u_long hn_sends; + u_long hn_flush_failed; + + /* Rarely used stuffs */ + struct hn_txdesc *hn_txdesc; + bus_dma_tag_t hn_tx_rndis_dtag; + struct sysctl_oid *hn_tx_sysctl_tree; +} __aligned(CACHE_LINE_SIZE); + +#define HN_TX_FLAG_ATTACHED 0x0001 +#define HN_TX_FLAG_HASHVAL 0x0002 /* support HASHVAL pktinfo */ + +/* + * Device-specific softc structure + */ +struct hn_softc { + struct ifnet *hn_ifp; + struct ifmedia hn_media; + device_t hn_dev; + int hn_if_flags; + struct sx hn_lock; + struct vmbus_channel *hn_prichan; + + int hn_rx_ring_cnt; + int hn_rx_ring_inuse; + struct hn_rx_ring *hn_rx_ring; + + struct rmlock hn_vf_lock; + struct ifnet *hn_vf_ifp; /* SR-IOV VF */ + uint32_t hn_xvf_flags; /* transparent VF flags */ + + int hn_tx_ring_cnt; + int hn_tx_ring_inuse; + struct hn_tx_ring *hn_tx_ring; + + uint8_t *hn_chim; + u_long *hn_chim_bmap; + int hn_chim_bmap_cnt; + int hn_chim_cnt; + int hn_chim_szmax; + + int hn_cpu; + struct taskqueue **hn_tx_taskqs; + struct sysctl_oid *hn_tx_sysctl_tree; + struct sysctl_oid *hn_rx_sysctl_tree; + struct vmbus_xact_ctx *hn_xact; + uint32_t hn_nvs_ver; + uint32_t hn_rx_filter; + + /* Packet transmission aggregation user settings. 
*/ + int hn_agg_size; + int hn_agg_pkts; + + struct taskqueue *hn_mgmt_taskq; + struct taskqueue *hn_mgmt_taskq0; + struct task hn_link_task; + struct task hn_netchg_init; + struct timeout_task hn_netchg_status; + uint32_t hn_link_flags; /* HN_LINK_FLAG_ */ + + uint32_t hn_caps; /* HN_CAP_ */ + uint32_t hn_flags; /* HN_FLAG_ */ + u_int hn_pollhz; + + void *hn_rxbuf; + uint32_t hn_rxbuf_gpadl; + struct hyperv_dma hn_rxbuf_dma; + + uint32_t hn_chim_gpadl; + struct hyperv_dma hn_chim_dma; + + uint32_t hn_rndis_rid; + uint32_t hn_ndis_ver; + int hn_ndis_tso_szmax; + int hn_ndis_tso_sgmin; + uint32_t hn_rndis_agg_size; + uint32_t hn_rndis_agg_pkts; + uint32_t hn_rndis_agg_align; + + int hn_rss_ind_size; + uint32_t hn_rss_hash; /* setting, NDIS_HASH_ */ + uint32_t hn_rss_hcap; /* caps, NDIS_HASH_ */ + struct ndis_rssprm_toeplitz hn_rss; + + eventhandler_tag hn_ifaddr_evthand; + eventhandler_tag hn_ifnet_evthand; + eventhandler_tag hn_ifnet_atthand; + eventhandler_tag hn_ifnet_dethand; + eventhandler_tag hn_ifnet_lnkhand; + + /* + * Transparent VF delayed initialization. + */ + int hn_vf_rdytick; /* ticks, 0 == ready */ + struct taskqueue *hn_vf_taskq; + struct timeout_task hn_vf_init; + + /* + * Saved information for VF under transparent mode. 
+ */ + void (*hn_vf_input) + (struct ifnet *, struct mbuf *); + int hn_saved_caps; + u_int hn_saved_tsomax; + u_int hn_saved_tsosegcnt; + u_int hn_saved_tsosegsz; +}; + +#define HN_FLAG_RXBUF_CONNECTED 0x0001 +#define HN_FLAG_CHIM_CONNECTED 0x0002 +#define HN_FLAG_HAS_RSSKEY 0x0004 +#define HN_FLAG_HAS_RSSIND 0x0008 +#define HN_FLAG_SYNTH_ATTACHED 0x0010 +#define HN_FLAG_NO_SLEEPING 0x0020 +#define HN_FLAG_RXBUF_REF 0x0040 +#define HN_FLAG_CHIM_REF 0x0080 +#define HN_FLAG_RXVF 0x0100 + +#define HN_FLAG_ERRORS (HN_FLAG_RXBUF_REF | HN_FLAG_CHIM_REF) + +#define HN_XVFFLAG_ENABLED 0x0001 +#define HN_XVFFLAG_ACCBPF 0x0002 + +#define HN_NO_SLEEPING(sc) \ +do { \ + (sc)->hn_flags |= HN_FLAG_NO_SLEEPING; \ +} while (0) + +#define HN_SLEEPING_OK(sc) \ +do { \ + (sc)->hn_flags &= ~HN_FLAG_NO_SLEEPING; \ +} while (0) + +#define HN_CAN_SLEEP(sc) \ + (((sc)->hn_flags & HN_FLAG_NO_SLEEPING) == 0) + +#define HN_CAP_VLAN 0x0001 +#define HN_CAP_MTU 0x0002 +#define HN_CAP_IPCS 0x0004 +#define HN_CAP_TCP4CS 0x0008 +#define HN_CAP_TCP6CS 0x0010 +#define HN_CAP_UDP4CS 0x0020 +#define HN_CAP_UDP6CS 0x0040 +#define HN_CAP_TSO4 0x0080 +#define HN_CAP_TSO6 0x0100 +#define HN_CAP_HASHVAL 0x0200 +#define HN_CAP_UDPHASH 0x0400 + +/* Capability description for use with printf(9) %b identifier. */ +#define HN_CAP_BITS \ + "\020\1VLAN\2MTU\3IPCS\4TCP4CS\5TCP6CS" \ + "\6UDP4CS\7UDP6CS\10TSO4\11TSO6\12HASHVAL\13UDPHASH" + +#define HN_LINK_FLAG_LINKUP 0x0001 +#define HN_LINK_FLAG_NETCHG 0x0002 + +#endif /* !_IF_HNVAR_H_ */ diff --git a/sys/dev/hyperv/netvsc/ndis.h b/sys/dev/hyperv/netvsc/ndis.h new file mode 100644 index 000000000000..c69da7807a63 --- /dev/null +++ b/sys/dev/hyperv/netvsc/ndis.h @@ -0,0 +1,422 @@ +/*- + * Copyright (c) 2016-2017 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _NET_NDIS_H_ +#define _NET_NDIS_H_ + +#define NDIS_MEDIA_STATE_CONNECTED 0 +#define NDIS_MEDIA_STATE_DISCONNECTED 1 + +#define NDIS_NETCHANGE_TYPE_POSSIBLE 1 +#define NDIS_NETCHANGE_TYPE_DEFINITE 2 +#define NDIS_NETCHANGE_TYPE_FROMMEDIA 3 + +#define NDIS_OFFLOAD_SET_NOCHG 0 +#define NDIS_OFFLOAD_SET_ON 1 +#define NDIS_OFFLOAD_SET_OFF 2 + +/* a.k.a GRE MAC */ +#define NDIS_ENCAP_TYPE_NVGRE 0x00000001 + +#define NDIS_HASH_FUNCTION_MASK 0x000000FF /* see hash function */ +#define NDIS_HASH_TYPE_MASK 0x00FFFF00 /* see hash type */ + +/* hash function */ +#define NDIS_HASH_FUNCTION_TOEPLITZ 0x00000001 + +/* hash type */ +#define NDIS_HASH_IPV4 0x00000100 +#define NDIS_HASH_TCP_IPV4 0x00000200 +#define NDIS_HASH_IPV6 0x00000400 +#define NDIS_HASH_IPV6_EX 0x00000800 +#define NDIS_HASH_TCP_IPV6 0x00001000 +#define NDIS_HASH_TCP_IPV6_EX 0x00002000 +#define NDIS_HASH_UDP_IPV4_X 0x00004000 /* XXX non-standard */ + +#define NDIS_HASH_ALL (NDIS_HASH_IPV4 | \ + NDIS_HASH_TCP_IPV4 | \ + NDIS_HASH_IPV6 | \ + NDIS_HASH_IPV6_EX | \ + NDIS_HASH_TCP_IPV6 | \ + NDIS_HASH_TCP_IPV6_EX |\ + NDIS_HASH_UDP_IPV4_X) + +#define NDIS_HASH_STD (NDIS_HASH_IPV4 | \ + NDIS_HASH_TCP_IPV4 | \ + NDIS_HASH_IPV6 | \ + NDIS_HASH_IPV6_EX | \ + NDIS_HASH_TCP_IPV6 | \ + NDIS_HASH_TCP_IPV6_EX) + +/* Hash description for use with printf(9) %b identifier. */ +#define NDIS_HASH_BITS \ + "\20\1TOEPLITZ\11IP4\12TCP4\13IP6\14IP6EX\15TCP6\16TCP6EX\17UDP4_X" + +#define NDIS_HASH_KEYSIZE_TOEPLITZ 40 +#define NDIS_HASH_INDCNT 128 + +#define NDIS_OBJTYPE_DEFAULT 0x80 +#define NDIS_OBJTYPE_RSS_CAPS 0x88 +#define NDIS_OBJTYPE_RSS_PARAMS 0x89 +#define NDIS_OBJTYPE_OFFLOAD 0xa7 + +struct ndis_object_hdr { + uint8_t ndis_type; /* NDIS_OBJTYPE_ */ + uint8_t ndis_rev; /* type specific */ + uint16_t ndis_size; /* incl. 
this hdr */ +}; + +/* + * OID_TCP_OFFLOAD_PARAMETERS + * ndis_type: NDIS_OBJTYPE_DEFAULT + */ +struct ndis_offload_params { + struct ndis_object_hdr ndis_hdr; + uint8_t ndis_ip4csum; /* NDIS_OFFLOAD_PARAM_ */ + uint8_t ndis_tcp4csum; /* NDIS_OFFLOAD_PARAM_ */ + uint8_t ndis_udp4csum; /* NDIS_OFFLOAD_PARAM_ */ + uint8_t ndis_tcp6csum; /* NDIS_OFFLOAD_PARAM_ */ + uint8_t ndis_udp6csum; /* NDIS_OFFLOAD_PARAM_ */ + uint8_t ndis_lsov1; /* NDIS_OFFLOAD_PARAM_ */ + uint8_t ndis_ipsecv1; /* NDIS_OFFLOAD_IPSECV1_ */ + uint8_t ndis_lsov2_ip4; /* NDIS_OFFLOAD_LSOV2_ */ + uint8_t ndis_lsov2_ip6; /* NDIS_OFFLOAD_LSOV2_ */ + uint8_t ndis_tcp4conn; /* 0 */ + uint8_t ndis_tcp6conn; /* 0 */ + uint32_t ndis_flags; /* 0 */ + /* NDIS >= 6.1 */ + uint8_t ndis_ipsecv2; /* NDIS_OFFLOAD_IPSECV2_ */ + uint8_t ndis_ipsecv2_ip4;/* NDIS_OFFLOAD_IPSECV2_ */ + /* NDIS >= 6.30 */ + uint8_t ndis_rsc_ip4; /* NDIS_OFFLOAD_RSC_ */ + uint8_t ndis_rsc_ip6; /* NDIS_OFFLOAD_RSC_ */ + uint8_t ndis_encap; /* NDIS_OFFLOAD_SET_ */ + uint8_t ndis_encap_types;/* NDIS_ENCAP_TYPE_ */ +}; + +#define NDIS_OFFLOAD_PARAMS_SIZE sizeof(struct ndis_offload_params) +#define NDIS_OFFLOAD_PARAMS_SIZE_6_1 \ + __offsetof(struct ndis_offload_params, ndis_rsc_ip4) + +#define NDIS_OFFLOAD_PARAMS_REV_2 2 /* NDIS 6.1 */ +#define NDIS_OFFLOAD_PARAMS_REV_3 3 /* NDIS 6.30 */ + +#define NDIS_OFFLOAD_PARAM_NOCHG 0 /* common */ +#define NDIS_OFFLOAD_PARAM_OFF 1 +#define NDIS_OFFLOAD_PARAM_TX 2 +#define NDIS_OFFLOAD_PARAM_RX 3 +#define NDIS_OFFLOAD_PARAM_TXRX 4 + +/* NDIS_OFFLOAD_PARAM_NOCHG */ +#define NDIS_OFFLOAD_LSOV1_OFF 1 +#define NDIS_OFFLOAD_LSOV1_ON 2 + +/* NDIS_OFFLOAD_PARAM_NOCHG */ +#define NDIS_OFFLOAD_IPSECV1_OFF 1 +#define NDIS_OFFLOAD_IPSECV1_AH 2 +#define NDIS_OFFLOAD_IPSECV1_ESP 3 +#define NDIS_OFFLOAD_IPSECV1_AH_ESP 4 + +/* NDIS_OFFLOAD_PARAM_NOCHG */ +#define NDIS_OFFLOAD_LSOV2_OFF 1 +#define NDIS_OFFLOAD_LSOV2_ON 2 + +/* NDIS_OFFLOAD_PARAM_NOCHG */ +#define NDIS_OFFLOAD_IPSECV2_OFF 1 +#define 
NDIS_OFFLOAD_IPSECV2_AH 2 +#define NDIS_OFFLOAD_IPSECV2_ESP 3 +#define NDIS_OFFLOAD_IPSECV2_AH_ESP 4 + +/* NDIS_OFFLOAD_PARAM_NOCHG */ +#define NDIS_OFFLOAD_RSC_OFF 1 +#define NDIS_OFFLOAD_RSC_ON 2 + +/* + * OID_GEN_RECEIVE_SCALE_CAPABILITIES + * ndis_type: NDIS_OBJTYPE_RSS_CAPS + */ +struct ndis_rss_caps { + struct ndis_object_hdr ndis_hdr; + uint32_t ndis_caps; /* NDIS_RSS_CAP_ */ + uint32_t ndis_nmsi; /* # of MSIs */ + uint32_t ndis_nrxr; /* # of RX rings */ + /* NDIS >= 6.30 */ + uint16_t ndis_nind; /* # of indtbl ent. */ + uint16_t ndis_pad; +}; + +#define NDIS_RSS_CAPS_SIZE \ + __offsetof(struct ndis_rss_caps, ndis_pad) +#define NDIS_RSS_CAPS_SIZE_6_0 \ + __offsetof(struct ndis_rss_caps, ndis_nind) + +#define NDIS_RSS_CAPS_REV_1 1 /* NDIS 6.{0,1,20} */ +#define NDIS_RSS_CAPS_REV_2 2 /* NDIS 6.30 */ + +#define NDIS_RSS_CAP_MSI 0x01000000 +#define NDIS_RSS_CAP_CLASSIFY_ISR 0x02000000 +#define NDIS_RSS_CAP_CLASSIFY_DPC 0x04000000 +#define NDIS_RSS_CAP_MSIX 0x08000000 +#define NDIS_RSS_CAP_IPV4 0x00000100 +#define NDIS_RSS_CAP_IPV6 0x00000200 +#define NDIS_RSS_CAP_IPV6_EX 0x00000400 +#define NDIS_RSS_CAP_HASH_TOEPLITZ NDIS_HASH_FUNCTION_TOEPLITZ +#define NDIS_RSS_CAP_HASHFUNC_MASK NDIS_HASH_FUNCTION_MASK + +/* + * OID_GEN_RECEIVE_SCALE_PARAMETERS + * ndis_type: NDIS_OBJTYPE_RSS_PARAMS + */ +struct ndis_rss_params { + struct ndis_object_hdr ndis_hdr; + uint16_t ndis_flags; /* NDIS_RSS_FLAG_ */ + uint16_t ndis_bcpu; /* base cpu 0 */ + uint32_t ndis_hash; /* NDIS_HASH_ */ + uint16_t ndis_indsize; /* indirect table */ + uint32_t ndis_indoffset; + uint16_t ndis_keysize; /* hash key */ + uint32_t ndis_keyoffset; + /* NDIS >= 6.20 */ + uint32_t ndis_cpumaskoffset; + uint32_t ndis_cpumaskcnt; + uint32_t ndis_cpumaskentsz; +}; + +#define NDIS_RSS_PARAMS_SIZE sizeof(struct ndis_rss_params) +#define NDIS_RSS_PARAMS_SIZE_6_0 \ + __offsetof(struct ndis_rss_params, ndis_cpumaskoffset) + +#define NDIS_RSS_PARAMS_REV_1 1 /* NDIS 6.0 */ +#define NDIS_RSS_PARAMS_REV_2 2 /* NDIS 
6.20 */ + +#define NDIS_RSS_FLAG_NONE 0x0000 +#define NDIS_RSS_FLAG_BCPU_UNCHG 0x0001 +#define NDIS_RSS_FLAG_HASH_UNCHG 0x0002 +#define NDIS_RSS_FLAG_IND_UNCHG 0x0004 +#define NDIS_RSS_FLAG_KEY_UNCHG 0x0008 +#define NDIS_RSS_FLAG_DISABLE 0x0010 + +/* non-standard convenient struct */ +struct ndis_rssprm_toeplitz { + struct ndis_rss_params rss_params; + /* Toeplitz hash key */ + uint8_t rss_key[NDIS_HASH_KEYSIZE_TOEPLITZ]; + /* Indirect table */ + uint32_t rss_ind[NDIS_HASH_INDCNT]; +}; + +#define NDIS_RSSPRM_TOEPLITZ_SIZE(nind) \ + __offsetof(struct ndis_rssprm_toeplitz, rss_ind[nind]) + +/* + * OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES + * ndis_type: NDIS_OBJTYPE_OFFLOAD + */ + +#define NDIS_OFFLOAD_ENCAP_NONE 0x0000 +#define NDIS_OFFLOAD_ENCAP_NULL 0x0001 +#define NDIS_OFFLOAD_ENCAP_8023 0x0002 +#define NDIS_OFFLOAD_ENCAP_8023PQ 0x0004 +#define NDIS_OFFLOAD_ENCAP_8023PQ_OOB 0x0008 +#define NDIS_OFFLOAD_ENCAP_RFC1483 0x0010 + +struct ndis_csum_offload { + uint32_t ndis_ip4_txenc; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_ip4_txcsum; +#define NDIS_TXCSUM_CAP_IP4OPT 0x001 +#define NDIS_TXCSUM_CAP_TCP4OPT 0x004 +#define NDIS_TXCSUM_CAP_TCP4 0x010 +#define NDIS_TXCSUM_CAP_UDP4 0x040 +#define NDIS_TXCSUM_CAP_IP4 0x100 + uint32_t ndis_ip4_rxenc; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_ip4_rxcsum; +#define NDIS_RXCSUM_CAP_IP4OPT 0x001 +#define NDIS_RXCSUM_CAP_TCP4OPT 0x004 +#define NDIS_RXCSUM_CAP_TCP4 0x010 +#define NDIS_RXCSUM_CAP_UDP4 0x040 +#define NDIS_RXCSUM_CAP_IP4 0x100 + uint32_t ndis_ip6_txenc; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_ip6_txcsum; +#define NDIS_TXCSUM_CAP_IP6EXT 0x001 +#define NDIS_TXCSUM_CAP_TCP6OPT 0x004 +#define NDIS_TXCSUM_CAP_TCP6 0x010 +#define NDIS_TXCSUM_CAP_UDP6 0x040 + uint32_t ndis_ip6_rxenc; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_ip6_rxcsum; +#define NDIS_RXCSUM_CAP_IP6EXT 0x001 +#define NDIS_RXCSUM_CAP_TCP6OPT 0x004 +#define NDIS_RXCSUM_CAP_TCP6 0x010 +#define NDIS_RXCSUM_CAP_UDP6 0x040 +}; + +struct ndis_lsov1_offload { + uint32_t 
ndis_encap; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_maxsize; + uint32_t ndis_minsegs; + uint32_t ndis_opts; +}; + +struct ndis_ipsecv1_offload { + uint32_t ndis_encap; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_ah_esp; + uint32_t ndis_xport_tun; + uint32_t ndis_ip4_opts; + uint32_t ndis_flags; + uint32_t ndis_ip4_ah; + uint32_t ndis_ip4_esp; +}; + +struct ndis_lsov2_offload { + uint32_t ndis_ip4_encap; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_ip4_maxsz; + uint32_t ndis_ip4_minsg; + uint32_t ndis_ip6_encap; /*NDIS_OFFLOAD_ENCAP_*/ + uint32_t ndis_ip6_maxsz; + uint32_t ndis_ip6_minsg; + uint32_t ndis_ip6_opts; +#define NDIS_LSOV2_CAP_IP6EXT 0x001 +#define NDIS_LSOV2_CAP_TCP6OPT 0x004 +}; + +struct ndis_ipsecv2_offload { + uint32_t ndis_encap; /*NDIS_OFFLOAD_ENCAP_*/ + uint8_t ndis_ip6; + uint8_t ndis_ip4opt; + uint8_t ndis_ip6ext; + uint8_t ndis_ah; + uint8_t ndis_esp; + uint8_t ndis_ah_esp; + uint8_t ndis_xport; + uint8_t ndis_tun; + uint8_t ndis_xport_tun; + uint8_t ndis_lso; + uint8_t ndis_extseq; + uint32_t ndis_udp_esp; + uint32_t ndis_auth; + uint32_t ndis_crypto; + uint32_t ndis_sa_caps; +}; + +struct ndis_rsc_offload { + uint8_t ndis_ip4; + uint8_t ndis_ip6; +}; + +struct ndis_encap_offload { + uint32_t ndis_flags; + uint32_t ndis_maxhdr; +}; + +struct ndis_offload { + struct ndis_object_hdr ndis_hdr; + struct ndis_csum_offload ndis_csum; + struct ndis_lsov1_offload ndis_lsov1; + struct ndis_ipsecv1_offload ndis_ipsecv1; + struct ndis_lsov2_offload ndis_lsov2; + uint32_t ndis_flags; + /* NDIS >= 6.1 */ + struct ndis_ipsecv2_offload ndis_ipsecv2; + /* NDIS >= 6.30 */ + struct ndis_rsc_offload ndis_rsc; + struct ndis_encap_offload ndis_encap_gre; +}; + +#define NDIS_OFFLOAD_SIZE sizeof(struct ndis_offload) +#define NDIS_OFFLOAD_SIZE_6_0 \ + __offsetof(struct ndis_offload, ndis_ipsecv2) +#define NDIS_OFFLOAD_SIZE_6_1 \ + __offsetof(struct ndis_offload, ndis_rsc) + +#define NDIS_OFFLOAD_REV_1 1 /* NDIS 6.0 */ +#define NDIS_OFFLOAD_REV_2 2 /* NDIS 6.1 */ +#define 
NDIS_OFFLOAD_REV_3 3 /* NDIS 6.30 */ + +/* + * Per-packet-info + */ + +/* VLAN */ +#define NDIS_VLAN_INFO_SIZE sizeof(uint32_t) +#define NDIS_VLAN_INFO_PRI_MASK 0x0007 +#define NDIS_VLAN_INFO_CFI_MASK 0x0008 +#define NDIS_VLAN_INFO_ID_MASK 0xfff0 +#define NDIS_VLAN_INFO_MAKE(id, pri, cfi) \ + (((pri) & NDIS_VLAN_INFO_PRI_MASK) | \ + (((cfi) & 0x1) << 3) | (((id) & 0xfff) << 4)) +#define NDIS_VLAN_INFO_ID(inf) (((inf) & NDIS_VLAN_INFO_ID_MASK) >> 4) +#define NDIS_VLAN_INFO_CFI(inf) (((inf) & NDIS_VLAN_INFO_CFI_MASK) >> 3) +#define NDIS_VLAN_INFO_PRI(inf) ((inf) & NDIS_VLAN_INFO_PRI_MASK) + +/* Reception checksum */ +#define NDIS_RXCSUM_INFO_SIZE sizeof(uint32_t) +#define NDIS_RXCSUM_INFO_TCPCS_FAILED 0x0001 +#define NDIS_RXCSUM_INFO_UDPCS_FAILED 0x0002 +#define NDIS_RXCSUM_INFO_IPCS_FAILED 0x0004 +#define NDIS_RXCSUM_INFO_TCPCS_OK 0x0008 +#define NDIS_RXCSUM_INFO_UDPCS_OK 0x0010 +#define NDIS_RXCSUM_INFO_IPCS_OK 0x0020 +#define NDIS_RXCSUM_INFO_LOOPBACK 0x0040 +#define NDIS_RXCSUM_INFO_TCPCS_INVAL 0x0080 +#define NDIS_RXCSUM_INFO_IPCS_INVAL 0x0100 + +/* LSOv2 */ +#define NDIS_LSO2_INFO_SIZE sizeof(uint32_t) +#define NDIS_LSO2_INFO_MSS_MASK 0x000fffff +#define NDIS_LSO2_INFO_THOFF_MASK 0x3ff00000 +#define NDIS_LSO2_INFO_ISLSO2 0x40000000 +#define NDIS_LSO2_INFO_ISIPV6 0x80000000 + +#define NDIS_LSO2_INFO_MAKE(thoff, mss) \ + ((((uint32_t)(mss)) & NDIS_LSO2_INFO_MSS_MASK) | \ + ((((uint32_t)(thoff)) & 0x3ff) << 20) | \ + NDIS_LSO2_INFO_ISLSO2) + +#define NDIS_LSO2_INFO_MAKEIPV4(thoff, mss) \ + NDIS_LSO2_INFO_MAKE((thoff), (mss)) + +#define NDIS_LSO2_INFO_MAKEIPV6(thoff, mss) \ + (NDIS_LSO2_INFO_MAKE((thoff), (mss)) | NDIS_LSO2_INFO_ISIPV6) + +/* Transmission checksum */ +#define NDIS_TXCSUM_INFO_SIZE sizeof(uint32_t) +#define NDIS_TXCSUM_INFO_IPV4 0x00000001 +#define NDIS_TXCSUM_INFO_IPV6 0x00000002 +#define NDIS_TXCSUM_INFO_TCPCS 0x00000004 +#define NDIS_TXCSUM_INFO_UDPCS 0x00000008 +#define NDIS_TXCSUM_INFO_IPCS 0x00000010 +#define NDIS_TXCSUM_INFO_THOFF 0x03ff0000 
+ +#define NDIS_TXCSUM_INFO_MKL4CS(thoff, flag) \ + ((((uint32_t)(thoff)) << 16) | (flag)) + +#define NDIS_TXCSUM_INFO_MKTCPCS(thoff) \ + NDIS_TXCSUM_INFO_MKL4CS((thoff), NDIS_TXCSUM_INFO_TCPCS) + +#define NDIS_TXCSUM_INFO_MKUDPCS(thoff) \ + NDIS_TXCSUM_INFO_MKL4CS((thoff), NDIS_TXCSUM_INFO_UDPCS) +#endif /* !_NET_NDIS_H_ */ diff --git a/sys/dev/hyperv/pcib/vmbus_pcib.c b/sys/dev/hyperv/pcib/vmbus_pcib.c new file mode 100644 index 000000000000..c7df32044678 --- /dev/null +++ b/sys/dev/hyperv/pcib/vmbus_pcib.c @@ -0,0 +1,1897 @@ +/*- + * Copyright (c) 2016-2017 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef NEW_PCIB
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/sx.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+#include <sys/mutex.h>
+#include <sys/errno.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/pmap.h>
+
+#include <machine/atomic.h>
+#include <machine/bus.h>
+#include <machine/frame.h>
+#include <machine/pci_cfgreg.h>
+#include <machine/resource.h>
+
+#include <sys/pciio.h>
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pci_private.h>
+#include <dev/pci/pcib_private.h>
+#include "pcib_if.h"
+
+#include <machine/intr_machdep.h>
+#include <x86/apicreg.h>
+
+#include <dev/hyperv/include/hyperv.h>
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+#include <dev/hyperv/vmbus/vmbus_reg.h>
+#include <dev/hyperv/vmbus/vmbus_chanvar.h>
+
+#include "vmbus_if.h"
+
+/* Compatibility definitions for kernels older than version 1100000. */
+#if __FreeBSD_version < 1100000
+typedef u_long rman_res_t;
+#define RM_MAX_END	(~(rman_res_t)0)
+#endif
+
+/*
+ * Minimal counting completion: "done" counts completions that have been
+ * posted but not yet consumed by a waiter, and is protected by "lock".
+ * The structure's own address is used as the sleep/wakeup channel.
+ */
+struct completion {
+	unsigned int done;
+	struct mtx lock;
+};
+
+/*
+ * Zero the completion and initialize its mutex.  Must be paired with
+ * free_completion().
+ */
+static void
+init_completion(struct completion *c)
+{
+	memset(c, 0, sizeof(*c));
+	mtx_init(&c->lock, "hvcmpl", NULL, MTX_DEF);
+	/* Redundant with the memset() above; kept for clarity. */
+	c->done = 0;
+}
+
+/* Destroy the mutex set up by init_completion(). */
+static void
+free_completion(struct completion *c)
+{
+	mtx_destroy(&c->lock);
+}
+
+/*
+ * Post one completion: bump the counter under the lock, then wake
+ * every thread sleeping on the completion.
+ */
+static void
+complete(struct completion *c)
+{
+	mtx_lock(&c->lock);
+	c->done++;
+	mtx_unlock(&c->lock);
+	wakeup(c);
+}
+
+/*
+ * Block until at least one completion has been posted, then consume it.
+ * The condition is re-checked after every wakeup, so a wakeup that
+ * finds "done" still at zero simply goes back to sleep.
+ */
+static void
+wait_for_completion(struct completion *c)
+{
+	mtx_lock(&c->lock);
+	while (c->done == 0)
+		mtx_sleep(c, &c->lock, 0, "hvwfc", 0);
+	c->done--;
+	mtx_unlock(&c->lock);
+}
+
+/*
+ * Return: 0 if completed, a non-zero value if timed out.
+ */ +static int +wait_for_completion_timeout(struct completion *c, int timeout) +{ + int ret; + + mtx_lock(&c->lock); + + if (c->done == 0) + mtx_sleep(c, &c->lock, 0, "hvwfc", timeout); + + if (c->done > 0) { + c->done--; + ret = 0; + } else { + ret = 1; + } + + mtx_unlock(&c->lock); + + return (ret); +} + +#define PCI_MAKE_VERSION(major, minor) ((uint32_t)(((major) << 16) | (major))) + +enum { + PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), + PCI_PROTOCOL_VERSION_CURRENT = PCI_PROTOCOL_VERSION_1_1 +}; + +#define PCI_CONFIG_MMIO_LENGTH 0x2000 +#define CFG_PAGE_OFFSET 0x1000 +#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET) + +/* + * Message Types + */ + +enum pci_message_type { + /* + * Version 1.1 + */ + PCI_MESSAGE_BASE = 0x42490000, + PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0, + PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1, + PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4, + PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5, + PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6, + PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7, + PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8, + PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9, + PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA, + PCI_EJECT = PCI_MESSAGE_BASE + 0xB, + PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC, + PCI_REENABLE = PCI_MESSAGE_BASE + 0xD, + PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE, + PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF, + PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10, + PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11, + PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12, + PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13, + PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14, + PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15, + PCI_MESSAGE_MAXIMUM +}; + +/* + * Structures defining the virtual PCI Express protocol. 
 */

/* Protocol version, viewable as one 32-bit word or as major/minor halves. */
union pci_version {
	struct {
		uint16_t minor_version;
		uint16_t major_version;
	} parts;
	uint32_t version;
} __packed;

/*
 * This representation is the one used in Windows, which is
 * what is expected when sending this back and forth with
 * the Hyper-V parent partition.
 */
union win_slot_encoding {
	struct {
		uint32_t	slot:5;
		uint32_t	func:3;
		uint32_t	reserved:24;
	} bits;
	uint32_t val;
} __packed;

/* Description of one PCI function as listed in a bus-relations message. */
struct pci_func_desc {
	uint16_t	v_id;	/* vendor ID */
	uint16_t	d_id;	/* device ID */
	uint8_t		rev;
	uint8_t		prog_intf;
	uint8_t		subclass;
	uint8_t		base_class;
	uint32_t	subsystem_id;
	union win_slot_encoding wslot;
	uint32_t	ser;	/* serial number */
} __packed;

/* Interrupt description carried in struct pci_create_interrupt. */
struct hv_msi_desc {
	uint8_t		vector;
	uint8_t		delivery_mode;
	uint16_t	vector_count;
	uint32_t	reserved;
	uint64_t	cpu_mask;
} __packed;

/*
 * Interrupt descriptor returned in struct pci_create_int_response and sent
 * back in struct pci_delete_interrupt.
 */
struct tran_int_desc {
	uint16_t	reserved;
	uint16_t	vector_count;
	uint32_t	data;
	uint64_t	address;
} __packed;

/* Common first field of every message: one of enum pci_message_type. */
struct pci_message {
	uint32_t type;
} __packed;

/* A message that addresses a single child function by slot. */
struct pci_child_message {
	struct pci_message message_type;
	union win_slot_encoding wslot;
} __packed;

/* Layout of an inbound (host to guest) message, channel header included. */
struct pci_incoming_message {
	struct vmbus_chanpkt_hdr hdr;
	struct pci_message message_type;
} __packed;

/* Layout of a completion packet, channel header included. */
struct pci_response {
	struct vmbus_chanpkt_hdr hdr;
	int32_t status;	/* negative values are failures */
} __packed;

/*
 * Guest-side bookkeeping that precedes an outgoing message.  Its address is
 * used as the VMBus transaction id, so the channel callback can recover it
 * from a completion packet and invoke completion_func(compl_ctxt, ...).
 */
struct pci_packet {
	void (*completion_func)(void *context, struct pci_response *resp,
	    int resp_packet_size);
	void *compl_ctxt;

	/* The message body follows; callers reserve the space themselves. */
	struct pci_message message[0];
};

/*
 * Specific message types supporting the PCI protocol.
 */

/* PCI_QUERY_PROTOCOL_VERSION request. */
struct pci_version_request {
	struct pci_message message_type;
	uint32_t protocol_version;
	uint32_t is_last_attempt:1;
	uint32_t reservedz:31;
} __packed;

/* PCI_BUS_D0ENTRY request: reports the base of the config MMIO window. */
struct pci_bus_d0_entry {
	struct pci_message message_type;
	uint32_t reserved;
	uint64_t mmio_base;
} __packed;

/* PCI_BUS_RELATIONS message: device_count descriptors follow in func[]. */
struct pci_bus_relations {
	struct pci_incoming_message incoming;
	uint32_t device_count;
	struct pci_func_desc func[0];
} __packed;

#define MAX_NUM_BARS	(PCIR_MAX_BAR_0 + 1)

/* Completion of PCI_QUERY_RESOURCE_REQUIREMENTS: one probed value per BAR. */
struct pci_q_res_req_response {
	struct vmbus_chanpkt_hdr hdr;
	int32_t status; /* negative values are failures */
	uint32_t probed_bar[MAX_NUM_BARS];
} __packed;

/* PCI_RESOURCES_ASSIGNED request for one child function. */
struct pci_resources_assigned {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	uint8_t memory_range[0x14][MAX_NUM_BARS]; /* unused here */
	uint32_t msi_descriptors;
	uint32_t reserved[4];
} __packed;

/* PCI_CREATE_INTERRUPT_MESSAGE request. */
struct pci_create_interrupt {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	struct hv_msi_desc int_desc;
} __packed;

/* Completion of PCI_CREATE_INTERRUPT_MESSAGE. */
struct pci_create_int_response {
	struct pci_response response;
	uint32_t reserved;
	struct tran_int_desc int_desc;
} __packed;

/* PCI_DELETE_INTERRUPT_MESSAGE request. */
struct pci_delete_interrupt {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	struct tran_int_desc int_desc;
} __packed;

/* Inbound message that names a child function (used for PCI_EJECT). */
struct pci_dev_incoming {
	struct pci_incoming_message incoming;
	union win_slot_encoding wslot;
} __packed;

/* PCI_EJECTION_COMPLETE message sent after a child has been removed. */
struct pci_eject_response {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	uint32_t status;
} __packed;

/*
 * Driver specific state.
 */

/* 'installed' is set at the end of a successful attach. */
enum hv_pcibus_state {
	hv_pcibus_init = 0,
	hv_pcibus_installed,
};

/* Per-bridge state, allocated in attach and freed in detach. */
struct hv_pcibus {
	device_t pcib;		/* the bridge device itself */
	device_t pci_bus;	/* child "pci" bus added in attach */
	struct vmbus_pcib_softc *sc;

	uint16_t pci_domain;

	enum hv_pcibus_state state;

	struct resource *cfg_res;	/* config-space MMIO window */

	/*
	 * query_comp points at query_completion until the first
	 * device-relations work item completes it and clears the pointer.
	 */
	struct completion query_completion, *query_comp;

	struct mtx config_lock; /* Avoid two threads writing index page */
	struct mtx device_list_lock;    /* Protect lists below */
	TAILQ_HEAD(, hv_pci_dev) children;
	TAILQ_HEAD(, hv_dr_state) dr_list;

	/* Set to 1 on the channel's task context before teardown. */
	volatile int detaching;
};

/* One child PCI function reported by the host. */
struct hv_pci_dev {
	TAILQ_ENTRY(hv_pci_dev) link;

	struct pci_func_desc desc;

	/* Scratch flag used while reconciling a new device-relations list. */
	bool reported_missing;

	struct hv_pcibus *hbus;
	struct task eject_task;

	TAILQ_HEAD(, hv_irq_desc) irq_desc_list;

	/*
	 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
	 * read it back, for each of the BAR offsets within config space.
	 */
	uint32_t probed_bar[MAX_NUM_BARS];
};

/*
 * Tracks "Device Relations" messages from the host, which must be
 * processed in order.
+ */ +struct hv_dr_work { + struct task task; + struct hv_pcibus *bus; +}; + +struct hv_dr_state { + TAILQ_ENTRY(hv_dr_state) link; + uint32_t device_count; + struct pci_func_desc func[0]; +}; + +struct hv_irq_desc { + TAILQ_ENTRY(hv_irq_desc) link; + struct tran_int_desc desc; + int irq; +}; + +#define PCI_DEVFN(slot, func) ((((slot) & 0x1f) << 3) | ((func) & 0x07)) +#define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f) +#define PCI_FUNC(devfn) ((devfn) & 0x07) + +static uint32_t +devfn_to_wslot(unsigned int devfn) +{ + union win_slot_encoding wslot; + + wslot.val = 0; + wslot.bits.slot = PCI_SLOT(devfn); + wslot.bits.func = PCI_FUNC(devfn); + + return (wslot.val); +} + +static unsigned int +wslot_to_devfn(uint32_t wslot) +{ + union win_slot_encoding encoding; + unsigned int slot; + unsigned int func; + + encoding.val = wslot; + + slot = encoding.bits.slot; + func = encoding.bits.func; + + return (PCI_DEVFN(slot, func)); +} + +struct vmbus_pcib_softc { + struct vmbus_channel *chan; + void *rx_buf; + + struct taskqueue *taskq; + + struct hv_pcibus *hbus; +}; + +/* {44C4F61D-4444-4400-9D52-802E27EDE19F} */ +static const struct hyperv_guid g_pass_through_dev_type = { + .hv_guid = {0x1D, 0xF6, 0xC4, 0x44, 0x44, 0x44, 0x00, 0x44, + 0x9D, 0x52, 0x80, 0x2E, 0x27, 0xED, 0xE1, 0x9F} +}; + +struct hv_pci_compl { + struct completion host_event; + int32_t completion_status; +}; + +struct q_res_req_compl { + struct completion host_event; + struct hv_pci_dev *hpdev; +}; + +struct compose_comp_ctxt { + struct hv_pci_compl comp_pkt; + struct tran_int_desc int_desc; +}; + +/* + * It is possible the device is revoked during initialization. + * Check if this happens during wait. + * Return: 0 if response arrived, ENODEV if device revoked. 
+ */ +static int +wait_for_response(struct hv_pcibus *hbus, struct completion *c) +{ + do { + if (vmbus_chan_is_revoked(hbus->sc->chan)) { + device_printf(hbus->pcib, + "The device is revoked.\n"); + return (ENODEV); + } + } while (wait_for_completion_timeout(c, hz /10) != 0); + + return 0; +} + +static void +hv_pci_generic_compl(void *context, struct pci_response *resp, + int resp_packet_size) +{ + struct hv_pci_compl *comp_pkt = context; + + if (resp_packet_size >= sizeof(struct pci_response)) + comp_pkt->completion_status = resp->status; + else + comp_pkt->completion_status = -1; + + complete(&comp_pkt->host_event); +} + +static void +q_resource_requirements(void *context, struct pci_response *resp, + int resp_packet_size) +{ + struct q_res_req_compl *completion = context; + struct pci_q_res_req_response *q_res_req = + (struct pci_q_res_req_response *)resp; + int i; + + if (resp->status < 0) { + printf("vmbus_pcib: failed to query resource requirements\n"); + } else { + for (i = 0; i < MAX_NUM_BARS; i++) + completion->hpdev->probed_bar[i] = + q_res_req->probed_bar[i]; + } + + complete(&completion->host_event); +} + +static void +hv_pci_compose_compl(void *context, struct pci_response *resp, + int resp_packet_size) +{ + struct compose_comp_ctxt *comp_pkt = context; + struct pci_create_int_response *int_resp = + (struct pci_create_int_response *)resp; + + comp_pkt->comp_pkt.completion_status = resp->status; + comp_pkt->int_desc = int_resp->int_desc; + complete(&comp_pkt->comp_pkt.host_event); +} + +static void +hv_int_desc_free(struct hv_pci_dev *hpdev, struct hv_irq_desc *hid) +{ + struct pci_delete_interrupt *int_pkt; + struct { + struct pci_packet pkt; + uint8_t buffer[sizeof(struct pci_delete_interrupt)]; + } ctxt; + + memset(&ctxt, 0, sizeof(ctxt)); + int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; + int_pkt->message_type.type = PCI_DELETE_INTERRUPT_MESSAGE; + int_pkt->wslot.val = hpdev->desc.wslot.val; + int_pkt->int_desc = hid->desc; + + 
vmbus_chan_send(hpdev->hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0, + int_pkt, sizeof(*int_pkt), 0); + + free(hid, M_DEVBUF); +} + +static void +hv_pci_delete_device(struct hv_pci_dev *hpdev) +{ + struct hv_pcibus *hbus = hpdev->hbus; + struct hv_irq_desc *hid, *tmp_hid; + device_t pci_dev; + int devfn; + + devfn = wslot_to_devfn(hpdev->desc.wslot.val); + + mtx_lock(&Giant); + + pci_dev = pci_find_dbsf(hbus->pci_domain, + 0, PCI_SLOT(devfn), PCI_FUNC(devfn)); + if (pci_dev) + device_delete_child(hbus->pci_bus, pci_dev); + + mtx_unlock(&Giant); + + mtx_lock(&hbus->device_list_lock); + TAILQ_REMOVE(&hbus->children, hpdev, link); + mtx_unlock(&hbus->device_list_lock); + + TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid) + hv_int_desc_free(hpdev, hid); + + free(hpdev, M_DEVBUF); +} + +static struct hv_pci_dev * +new_pcichild_device(struct hv_pcibus *hbus, struct pci_func_desc *desc) +{ + struct hv_pci_dev *hpdev; + struct pci_child_message *res_req; + struct q_res_req_compl comp_pkt; + struct { + struct pci_packet pkt; + uint8_t buffer[sizeof(struct pci_child_message)]; + } ctxt; + int ret; + + hpdev = malloc(sizeof(*hpdev), M_DEVBUF, M_WAITOK | M_ZERO); + hpdev->hbus = hbus; + + TAILQ_INIT(&hpdev->irq_desc_list); + + init_completion(&comp_pkt.host_event); + comp_pkt.hpdev = hpdev; + + ctxt.pkt.compl_ctxt = &comp_pkt; + ctxt.pkt.completion_func = q_resource_requirements; + + res_req = (struct pci_child_message *)&ctxt.pkt.message; + res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS; + res_req->wslot.val = desc->wslot.val; + + ret = vmbus_chan_send(hbus->sc->chan, + VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, + res_req, sizeof(*res_req), (uint64_t)(uintptr_t)&ctxt.pkt); + if (ret) + goto err; + + if (wait_for_response(hbus, &comp_pkt.host_event)) + goto err; + + free_completion(&comp_pkt.host_event); + + hpdev->desc = *desc; + + mtx_lock(&hbus->device_list_lock); + if (TAILQ_EMPTY(&hbus->children)) + hbus->pci_domain = desc->ser & 0xFFFF; 
+ TAILQ_INSERT_TAIL(&hbus->children, hpdev, link); + mtx_unlock(&hbus->device_list_lock); + return (hpdev); +err: + free_completion(&comp_pkt.host_event); + free(hpdev, M_DEVBUF); + return (NULL); +} + +#if __FreeBSD_version < 1100000 + +/* Old versions don't have BUS_RESCAN(). Let's copy it from FreeBSD 11. */ + +static struct pci_devinfo * +pci_identify_function(device_t pcib, device_t dev, int domain, int busno, + int slot, int func, size_t dinfo_size) +{ + struct pci_devinfo *dinfo; + + dinfo = pci_read_device(pcib, domain, busno, slot, func, dinfo_size); + if (dinfo != NULL) + pci_add_child(dev, dinfo); + + return (dinfo); +} + +static int +pci_rescan(device_t dev) +{ +#define REG(n, w) PCIB_READ_CONFIG(pcib, busno, s, f, n, w) + device_t pcib = device_get_parent(dev); + struct pci_softc *sc; + device_t child, *devlist, *unchanged; + int devcount, error, i, j, maxslots, oldcount; + int busno, domain, s, f, pcifunchigh; + uint8_t hdrtype; + + /* No need to check for ARI on a rescan. */ + error = device_get_children(dev, &devlist, &devcount); + if (error) + return (error); + if (devcount != 0) { + unchanged = malloc(devcount * sizeof(device_t), M_TEMP, + M_NOWAIT | M_ZERO); + if (unchanged == NULL) { + free(devlist, M_TEMP); + return (ENOMEM); + } + } else + unchanged = NULL; + + sc = device_get_softc(dev); + domain = pcib_get_domain(dev); + busno = pcib_get_bus(dev); + maxslots = PCIB_MAXSLOTS(pcib); + for (s = 0; s <= maxslots; s++) { + /* If function 0 is not present, skip to the next slot. */ + f = 0; + if (REG(PCIR_VENDOR, 2) == 0xffff) + continue; + pcifunchigh = 0; + hdrtype = REG(PCIR_HDRTYPE, 1); + if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE) + continue; + if (hdrtype & PCIM_MFDEV) + pcifunchigh = PCIB_MAXFUNCS(pcib); + for (f = 0; f <= pcifunchigh; f++) { + if (REG(PCIR_VENDOR, 2) == 0xffff) + continue; + + /* + * Found a valid function. Check if a + * device_t for this device already exists. 
+ */ + for (i = 0; i < devcount; i++) { + child = devlist[i]; + if (child == NULL) + continue; + if (pci_get_slot(child) == s && + pci_get_function(child) == f) { + unchanged[i] = child; + goto next_func; + } + } + + pci_identify_function(pcib, dev, domain, busno, s, f, + sizeof(struct pci_devinfo)); + next_func:; + } + } + + /* Remove devices that are no longer present. */ + for (i = 0; i < devcount; i++) { + if (unchanged[i] != NULL) + continue; + device_delete_child(dev, devlist[i]); + } + + free(devlist, M_TEMP); + oldcount = devcount; + + /* Try to attach the devices just added. */ + error = device_get_children(dev, &devlist, &devcount); + if (error) { + free(unchanged, M_TEMP); + return (error); + } + + for (i = 0; i < devcount; i++) { + for (j = 0; j < oldcount; j++) { + if (devlist[i] == unchanged[j]) + goto next_device; + } + + device_probe_and_attach(devlist[i]); + next_device:; + } + + free(unchanged, M_TEMP); + free(devlist, M_TEMP); + return (0); +#undef REG +} + +#else + +static int +pci_rescan(device_t dev) +{ + return (BUS_RESCAN(dev)); +} + +#endif + +static void +pci_devices_present_work(void *arg, int pending __unused) +{ + struct hv_dr_work *dr_wrk = arg; + struct hv_dr_state *dr = NULL; + struct hv_pcibus *hbus; + uint32_t child_no; + bool found; + struct pci_func_desc *new_desc; + struct hv_pci_dev *hpdev, *tmp_hpdev; + struct completion *query_comp; + bool need_rescan = false; + + hbus = dr_wrk->bus; + free(dr_wrk, M_DEVBUF); + + /* Pull this off the queue and process it if it was the last one. */ + mtx_lock(&hbus->device_list_lock); + while (!TAILQ_EMPTY(&hbus->dr_list)) { + dr = TAILQ_FIRST(&hbus->dr_list); + TAILQ_REMOVE(&hbus->dr_list, dr, link); + + /* Throw this away if the list still has stuff in it. */ + if (!TAILQ_EMPTY(&hbus->dr_list)) { + free(dr, M_DEVBUF); + continue; + } + } + mtx_unlock(&hbus->device_list_lock); + + if (!dr) + return; + + /* First, mark all existing children as reported missing. 
*/ + mtx_lock(&hbus->device_list_lock); + TAILQ_FOREACH(hpdev, &hbus->children, link) + hpdev->reported_missing = true; + mtx_unlock(&hbus->device_list_lock); + + /* Next, add back any reported devices. */ + for (child_no = 0; child_no < dr->device_count; child_no++) { + found = false; + new_desc = &dr->func[child_no]; + + mtx_lock(&hbus->device_list_lock); + TAILQ_FOREACH(hpdev, &hbus->children, link) { + if ((hpdev->desc.wslot.val == + new_desc->wslot.val) && + (hpdev->desc.v_id == new_desc->v_id) && + (hpdev->desc.d_id == new_desc->d_id) && + (hpdev->desc.ser == new_desc->ser)) { + hpdev->reported_missing = false; + found = true; + break; + } + } + mtx_unlock(&hbus->device_list_lock); + + if (!found) { + if (!need_rescan) + need_rescan = true; + + hpdev = new_pcichild_device(hbus, new_desc); + if (!hpdev) + printf("vmbus_pcib: failed to add a child\n"); + } + } + + /* Remove missing device(s), if any */ + TAILQ_FOREACH_SAFE(hpdev, &hbus->children, link, tmp_hpdev) { + if (hpdev->reported_missing) + hv_pci_delete_device(hpdev); + } + + /* Rescan the bus to find any new device, if necessary. */ + if (hbus->state == hv_pcibus_installed && need_rescan) + pci_rescan(hbus->pci_bus); + + /* Wake up hv_pci_query_relations(), if it's waiting. 
*/ + query_comp = hbus->query_comp; + if (query_comp) { + hbus->query_comp = NULL; + complete(query_comp); + } + + free(dr, M_DEVBUF); +} + +static struct hv_pci_dev * +get_pcichild_wslot(struct hv_pcibus *hbus, uint32_t wslot) +{ + struct hv_pci_dev *hpdev, *ret = NULL; + + mtx_lock(&hbus->device_list_lock); + TAILQ_FOREACH(hpdev, &hbus->children, link) { + if (hpdev->desc.wslot.val == wslot) { + ret = hpdev; + break; + } + } + mtx_unlock(&hbus->device_list_lock); + + return (ret); +} + +static void +hv_pci_devices_present(struct hv_pcibus *hbus, + struct pci_bus_relations *relations) +{ + struct hv_dr_state *dr; + struct hv_dr_work *dr_wrk; + unsigned long dr_size; + + if (hbus->detaching && relations->device_count > 0) + return; + + dr_size = offsetof(struct hv_dr_state, func) + + (sizeof(struct pci_func_desc) * relations->device_count); + dr = malloc(dr_size, M_DEVBUF, M_WAITOK | M_ZERO); + + dr->device_count = relations->device_count; + if (dr->device_count != 0) + memcpy(dr->func, relations->func, + sizeof(struct pci_func_desc) * dr->device_count); + + mtx_lock(&hbus->device_list_lock); + TAILQ_INSERT_TAIL(&hbus->dr_list, dr, link); + mtx_unlock(&hbus->device_list_lock); + + dr_wrk = malloc(sizeof(*dr_wrk), M_DEVBUF, M_WAITOK | M_ZERO); + dr_wrk->bus = hbus; + TASK_INIT(&dr_wrk->task, 0, pci_devices_present_work, dr_wrk); + taskqueue_enqueue(hbus->sc->taskq, &dr_wrk->task); +} + +static void +hv_eject_device_work(void *arg, int pending __unused) +{ + struct hv_pci_dev *hpdev = arg; + union win_slot_encoding wslot = hpdev->desc.wslot; + struct hv_pcibus *hbus = hpdev->hbus; + struct pci_eject_response *eject_pkt; + struct { + struct pci_packet pkt; + uint8_t buffer[sizeof(struct pci_eject_response)]; + } ctxt; + + hv_pci_delete_device(hpdev); + + memset(&ctxt, 0, sizeof(ctxt)); + eject_pkt = (struct pci_eject_response *)&ctxt.pkt.message; + eject_pkt->message_type.type = PCI_EJECTION_COMPLETE; + eject_pkt->wslot.val = wslot.val; + 
vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0, + eject_pkt, sizeof(*eject_pkt), 0); +} + +static void +hv_pci_eject_device(struct hv_pci_dev *hpdev) +{ + struct hv_pcibus *hbus = hpdev->hbus; + struct taskqueue *taskq; + + if (hbus->detaching) + return; + + /* + * Push this task into the same taskqueue on which + * vmbus_pcib_attach() runs, so we're sure this task can't run + * concurrently with vmbus_pcib_attach(). + */ + TASK_INIT(&hpdev->eject_task, 0, hv_eject_device_work, hpdev); + taskq = vmbus_chan_mgmt_tq(hbus->sc->chan); + taskqueue_enqueue(taskq, &hpdev->eject_task); +} + +#define PCIB_PACKET_SIZE 0x100 + +static void +vmbus_pcib_on_channel_callback(struct vmbus_channel *chan, void *arg) +{ + struct vmbus_pcib_softc *sc = arg; + struct hv_pcibus *hbus = sc->hbus; + + void *buffer; + int bufferlen = PCIB_PACKET_SIZE; + + struct pci_packet *comp_packet; + struct pci_response *response; + struct pci_incoming_message *new_msg; + struct pci_bus_relations *bus_rel; + struct pci_dev_incoming *dev_msg; + struct hv_pci_dev *hpdev; + + buffer = sc->rx_buf; + do { + struct vmbus_chanpkt_hdr *pkt = buffer; + uint32_t bytes_rxed; + int ret; + + bytes_rxed = bufferlen; + ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed); + + if (ret == ENOBUFS) { + /* Handle large packet */ + if (bufferlen > PCIB_PACKET_SIZE) { + free(buffer, M_DEVBUF); + buffer = NULL; + } + + /* alloc new buffer */ + buffer = malloc(bytes_rxed, M_DEVBUF, M_WAITOK | M_ZERO); + bufferlen = bytes_rxed; + + continue; + } + + if (ret != 0) { + /* ignore EIO or EAGAIN */ + break; + } + + if (bytes_rxed <= sizeof(struct pci_response)) + continue; + + switch (pkt->cph_type) { + case VMBUS_CHANPKT_TYPE_COMP: + comp_packet = + (struct pci_packet *)(uintptr_t)pkt->cph_xactid; + response = (struct pci_response *)pkt; + comp_packet->completion_func(comp_packet->compl_ctxt, + response, bytes_rxed); + break; + case VMBUS_CHANPKT_TYPE_INBAND: + new_msg = (struct pci_incoming_message *)buffer; + + 
switch (new_msg->message_type.type) { + case PCI_BUS_RELATIONS: + bus_rel = (struct pci_bus_relations *)buffer; + + if (bus_rel->device_count == 0) + break; + + if (bytes_rxed < + offsetof(struct pci_bus_relations, func) + + (sizeof(struct pci_func_desc) * + (bus_rel->device_count))) + break; + + hv_pci_devices_present(hbus, bus_rel); + break; + + case PCI_EJECT: + dev_msg = (struct pci_dev_incoming *)buffer; + hpdev = get_pcichild_wslot(hbus, + dev_msg->wslot.val); + + if (hpdev) + hv_pci_eject_device(hpdev); + + break; + default: + printf("vmbus_pcib: Unknown msg type 0x%x\n", + new_msg->message_type.type); + break; + } + break; + default: + printf("vmbus_pcib: Unknown VMBus msg type %hd\n", + pkt->cph_type); + break; + } + } while (1); + + if (bufferlen > PCIB_PACKET_SIZE) + free(buffer, M_DEVBUF); +} + +static int +hv_pci_protocol_negotiation(struct hv_pcibus *hbus) +{ + struct pci_version_request *version_req; + struct hv_pci_compl comp_pkt; + struct { + struct pci_packet pkt; + uint8_t buffer[sizeof(struct pci_version_request)]; + } ctxt; + int ret; + + init_completion(&comp_pkt.host_event); + + ctxt.pkt.completion_func = hv_pci_generic_compl; + ctxt.pkt.compl_ctxt = &comp_pkt; + version_req = (struct pci_version_request *)&ctxt.pkt.message; + version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION; + version_req->protocol_version = PCI_PROTOCOL_VERSION_CURRENT; + version_req->is_last_attempt = 1; + + ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, + VMBUS_CHANPKT_FLAG_RC, version_req, sizeof(*version_req), + (uint64_t)(uintptr_t)&ctxt.pkt); + if (!ret) + ret = wait_for_response(hbus, &comp_pkt.host_event); + + if (ret) { + device_printf(hbus->pcib, + "vmbus_pcib failed to request version: %d\n", + ret); + goto out; + } + + if (comp_pkt.completion_status < 0) { + device_printf(hbus->pcib, + "vmbus_pcib version negotiation failed: %x\n", + comp_pkt.completion_status); + ret = EPROTO; + } else { + ret = 0; + } +out: + 
free_completion(&comp_pkt.host_event); + return (ret); +} + +/* Ask the host to send along the list of child devices */ +static int +hv_pci_query_relations(struct hv_pcibus *hbus) +{ + struct pci_message message; + int ret; + + message.type = PCI_QUERY_BUS_RELATIONS; + ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0, + &message, sizeof(message), 0); + return (ret); +} + +static int +hv_pci_enter_d0(struct hv_pcibus *hbus) +{ + struct pci_bus_d0_entry *d0_entry; + struct hv_pci_compl comp_pkt; + struct { + struct pci_packet pkt; + uint8_t buffer[sizeof(struct pci_bus_d0_entry)]; + } ctxt; + int ret; + + /* + * Tell the host that the bus is ready to use, and moved into the + * powered-on state. This includes telling the host which region + * of memory-mapped I/O space has been chosen for configuration space + * access. + */ + init_completion(&comp_pkt.host_event); + + ctxt.pkt.completion_func = hv_pci_generic_compl; + ctxt.pkt.compl_ctxt = &comp_pkt; + + d0_entry = (struct pci_bus_d0_entry *)&ctxt.pkt.message; + memset(d0_entry, 0, sizeof(*d0_entry)); + d0_entry->message_type.type = PCI_BUS_D0ENTRY; + d0_entry->mmio_base = rman_get_start(hbus->cfg_res); + + ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, + VMBUS_CHANPKT_FLAG_RC, d0_entry, sizeof(*d0_entry), + (uint64_t)(uintptr_t)&ctxt.pkt); + if (!ret) + ret = wait_for_response(hbus, &comp_pkt.host_event); + + if (ret) + goto out; + + if (comp_pkt.completion_status < 0) { + device_printf(hbus->pcib, "vmbus_pcib failed to enable D0\n"); + ret = EPROTO; + } else { + ret = 0; + } + +out: + free_completion(&comp_pkt.host_event); + return (ret); +} + +/* + * It looks this is only needed by Windows VM, but let's send the message too + * just to make the host happy. 
+ */ +static int +hv_send_resources_allocated(struct hv_pcibus *hbus) +{ + struct pci_resources_assigned *res_assigned; + struct hv_pci_compl comp_pkt; + struct hv_pci_dev *hpdev; + struct pci_packet *pkt; + uint32_t wslot; + int ret = 0; + + pkt = malloc(sizeof(*pkt) + sizeof(*res_assigned), + M_DEVBUF, M_WAITOK | M_ZERO); + + for (wslot = 0; wslot < 256; wslot++) { + hpdev = get_pcichild_wslot(hbus, wslot); + if (!hpdev) + continue; + + init_completion(&comp_pkt.host_event); + + memset(pkt, 0, sizeof(*pkt) + sizeof(*res_assigned)); + pkt->completion_func = hv_pci_generic_compl; + pkt->compl_ctxt = &comp_pkt; + + res_assigned = (struct pci_resources_assigned *)&pkt->message; + res_assigned->message_type.type = PCI_RESOURCES_ASSIGNED; + res_assigned->wslot.val = hpdev->desc.wslot.val; + + ret = vmbus_chan_send(hbus->sc->chan, + VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, + &pkt->message, sizeof(*res_assigned), + (uint64_t)(uintptr_t)pkt); + if (!ret) + ret = wait_for_response(hbus, &comp_pkt.host_event); + + free_completion(&comp_pkt.host_event); + + if (ret) + break; + + if (comp_pkt.completion_status < 0) { + ret = EPROTO; + device_printf(hbus->pcib, + "failed to send PCI_RESOURCES_ASSIGNED\n"); + break; + } + } + + free(pkt, M_DEVBUF); + return (ret); +} + +static int +hv_send_resources_released(struct hv_pcibus *hbus) +{ + struct pci_child_message pkt; + struct hv_pci_dev *hpdev; + uint32_t wslot; + int ret; + + for (wslot = 0; wslot < 256; wslot++) { + hpdev = get_pcichild_wslot(hbus, wslot); + if (!hpdev) + continue; + + pkt.message_type.type = PCI_RESOURCES_RELEASED; + pkt.wslot.val = hpdev->desc.wslot.val; + + ret = vmbus_chan_send(hbus->sc->chan, + VMBUS_CHANPKT_TYPE_INBAND, 0, &pkt, sizeof(pkt), 0); + if (ret) + return (ret); + } + + return (0); +} + +#define hv_cfg_read(x, s) \ +static inline uint##x##_t hv_cfg_read_##s(struct hv_pcibus *bus, \ + bus_size_t offset) \ +{ \ + return (bus_read_##s(bus->cfg_res, offset)); \ +} + +#define 
hv_cfg_write(x, s) \ +static inline void hv_cfg_write_##s(struct hv_pcibus *bus, \ + bus_size_t offset, uint##x##_t val) \ +{ \ + return (bus_write_##s(bus->cfg_res, offset, val)); \ +} + +hv_cfg_read(8, 1) +hv_cfg_read(16, 2) +hv_cfg_read(32, 4) + +hv_cfg_write(8, 1) +hv_cfg_write(16, 2) +hv_cfg_write(32, 4) + +static void +_hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, int size, + uint32_t *val) +{ + struct hv_pcibus *hbus = hpdev->hbus; + bus_size_t addr = CFG_PAGE_OFFSET + where; + + /* + * If the attempt is to read the IDs or the ROM BAR, simulate that. + */ + if (where + size <= PCIR_COMMAND) { + memcpy(val, ((uint8_t *)&hpdev->desc.v_id) + where, size); + } else if (where >= PCIR_REVID && where + size <= + PCIR_CACHELNSZ) { + memcpy(val, ((uint8_t *)&hpdev->desc.rev) + where - + PCIR_REVID, size); + } else if (where >= PCIR_SUBVEND_0 && where + size <= + PCIR_BIOS) { + memcpy(val, (uint8_t *)&hpdev->desc.subsystem_id + where - + PCIR_SUBVEND_0, size); + } else if (where >= PCIR_BIOS && where + size <= + PCIR_CAP_PTR) { + /* ROM BARs are unimplemented */ + *val = 0; + } else if ((where >= PCIR_INTLINE && where + size <= + PCIR_INTPIN) ||(where == PCIR_INTPIN && size == 1)) { + /* + * Interrupt Line and Interrupt PIN are hard-wired to zero + * because this front-end only supports message-signaled + * interrupts. + */ + *val = 0; + } else if (where + size <= CFG_PAGE_SIZE) { + mtx_lock(&hbus->config_lock); + + /* Choose the function to be read. */ + hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val); + + /* Make sure the function was chosen before we start reading.*/ + mb(); + + /* Read from that function's config space. 
*/ + switch (size) { + case 1: + *((uint8_t *)val) = hv_cfg_read_1(hbus, addr); + break; + case 2: + *((uint16_t *)val) = hv_cfg_read_2(hbus, addr); + break; + default: + *((uint32_t *)val) = hv_cfg_read_4(hbus, addr); + break; + } + /* + * Make sure the write was done before we release the lock, + * allowing consecutive reads/writes. + */ + mb(); + + mtx_unlock(&hbus->config_lock); + } else { + /* Invalid config read: it's unlikely to reach here. */ + memset(val, 0, size); + } +} + +static void +_hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, int size, + uint32_t val) +{ + struct hv_pcibus *hbus = hpdev->hbus; + bus_size_t addr = CFG_PAGE_OFFSET + where; + + /* SSIDs and ROM BARs are read-only */ + if (where >= PCIR_SUBVEND_0 && where + size <= PCIR_CAP_PTR) + return; + + if (where >= PCIR_COMMAND && where + size <= CFG_PAGE_SIZE) { + mtx_lock(&hbus->config_lock); + + /* Choose the function to be written. */ + hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val); + + /* Make sure the function was chosen before we start writing.*/ + wmb(); + + /* Write to that function's config space. */ + switch (size) { + case 1: + hv_cfg_write_1(hbus, addr, (uint8_t)val); + break; + case 2: + hv_cfg_write_2(hbus, addr, (uint16_t)val); + break; + default: + hv_cfg_write_4(hbus, addr, (uint32_t)val); + break; + } + + /* + * Make sure the write was done before we release the lock, + * allowing consecutive reads/writes. + */ + mb(); + + mtx_unlock(&hbus->config_lock); + } else { + /* Invalid config write: it's unlikely to reach here. */ + return; + } +} + +/* + * The vPCI in some Hyper-V releases do not initialize the last 4 + * bit of BAR registers. This could result weird problems causing PCI + * code fail to configure BAR correctly. + * + * Just write all 1's to those BARs whose probed values are not zero. + * This seems to make the Hyper-V vPCI and pci_write_bar() to cooperate + * correctly. 
+ */ + +static void +vmbus_pcib_prepopulate_bars(struct hv_pcibus *hbus) +{ + struct hv_pci_dev *hpdev; + int i; + + mtx_lock(&hbus->device_list_lock); + TAILQ_FOREACH(hpdev, &hbus->children, link) { + for (i = 0; i < 6; i++) { + /* Ignore empty bar */ + if (hpdev->probed_bar[i] == 0) + continue; + + uint32_t bar_val = 0; + + _hv_pcifront_read_config(hpdev, PCIR_BAR(i), + 4, &bar_val); + + if (hpdev->probed_bar[i] != bar_val) { + if (bootverbose) + printf("vmbus_pcib: initialize bar %d " + "by writing all 1s\n", i); + + _hv_pcifront_write_config(hpdev, PCIR_BAR(i), + 4, 0xffffffff); + } + } + } + mtx_unlock(&hbus->device_list_lock); +} + +static void +vmbus_pcib_set_detaching(void *arg, int pending __unused) +{ + struct hv_pcibus *hbus = arg; + + atomic_set_int(&hbus->detaching, 1); +} + +static void +vmbus_pcib_pre_detach(struct hv_pcibus *hbus) +{ + struct task task; + + TASK_INIT(&task, 0, vmbus_pcib_set_detaching, hbus); + + /* + * Make sure the channel callback won't push any possible new + * PCI_BUS_RELATIONS and PCI_EJECT tasks to sc->taskq. + */ + vmbus_chan_run_task(hbus->sc->chan, &task); + + taskqueue_drain_all(hbus->sc->taskq); +} + + +/* + * Standard probe entry point. + * + */ +static int +vmbus_pcib_probe(device_t dev) +{ + if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, + &g_pass_through_dev_type) == 0) { + device_set_desc(dev, "Hyper-V PCI Express Pass Through"); + return (BUS_PROBE_DEFAULT); + } + return (ENXIO); +} + +/* + * Standard attach entry point. 
+ * + */ +static int +vmbus_pcib_attach(device_t dev) +{ + const int pci_ring_size = (4 * PAGE_SIZE); + const struct hyperv_guid *inst_guid; + struct vmbus_channel *channel; + struct vmbus_pcib_softc *sc; + struct hv_pcibus *hbus; + int rid = 0; + int ret; + + hbus = malloc(sizeof(*hbus), M_DEVBUF, M_WAITOK | M_ZERO); + hbus->pcib = dev; + + channel = vmbus_get_channel(dev); + inst_guid = vmbus_chan_guid_inst(channel); + hbus->pci_domain = inst_guid->hv_guid[9] | + (inst_guid->hv_guid[8] << 8); + + mtx_init(&hbus->config_lock, "hbcfg", NULL, MTX_DEF); + mtx_init(&hbus->device_list_lock, "hbdl", NULL, MTX_DEF); + TAILQ_INIT(&hbus->children); + TAILQ_INIT(&hbus->dr_list); + + hbus->cfg_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, + 0, RM_MAX_END, PCI_CONFIG_MMIO_LENGTH, + RF_ACTIVE | rman_make_alignment_flags(PAGE_SIZE)); + + if (!hbus->cfg_res) { + device_printf(dev, "failed to get resource for cfg window\n"); + ret = ENXIO; + goto free_bus; + } + + sc = device_get_softc(dev); + sc->chan = channel; + sc->rx_buf = malloc(PCIB_PACKET_SIZE, M_DEVBUF, M_WAITOK | M_ZERO); + sc->hbus = hbus; + + /* + * The taskq is used to handle PCI_BUS_RELATIONS and PCI_EJECT + * messages. NB: we can't handle the messages in the channel callback + * directly, because the message handlers need to send new messages + * to the host and waits for the host's completion messages, which + * must also be handled by the channel callback. 
+ */ + sc->taskq = taskqueue_create("vmbus_pcib_tq", M_WAITOK, + taskqueue_thread_enqueue, &sc->taskq); + taskqueue_start_threads(&sc->taskq, 1, PI_NET, "vmbus_pcib_tq"); + + hbus->sc = sc; + + init_completion(&hbus->query_completion); + hbus->query_comp = &hbus->query_completion; + + ret = vmbus_chan_open(sc->chan, pci_ring_size, pci_ring_size, + NULL, 0, vmbus_pcib_on_channel_callback, sc); + if (ret) + goto free_res; + + ret = hv_pci_protocol_negotiation(hbus); + if (ret) + goto vmbus_close; + + ret = hv_pci_query_relations(hbus); + if (!ret) + ret = wait_for_response(hbus, hbus->query_comp); + + if (ret) + goto vmbus_close; + + ret = hv_pci_enter_d0(hbus); + if (ret) + goto vmbus_close; + + ret = hv_send_resources_allocated(hbus); + if (ret) + goto vmbus_close; + + vmbus_pcib_prepopulate_bars(hbus); + + hbus->pci_bus = device_add_child(dev, "pci", -1); + if (!hbus->pci_bus) { + device_printf(dev, "failed to create pci bus\n"); + ret = ENXIO; + goto vmbus_close; + } + + bus_generic_attach(dev); + + hbus->state = hv_pcibus_installed; + + return (0); + +vmbus_close: + vmbus_pcib_pre_detach(hbus); + vmbus_chan_close(sc->chan); +free_res: + taskqueue_free(sc->taskq); + free_completion(&hbus->query_completion); + free(sc->rx_buf, M_DEVBUF); + bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res); +free_bus: + mtx_destroy(&hbus->device_list_lock); + mtx_destroy(&hbus->config_lock); + free(hbus, M_DEVBUF); + return (ret); +} + +/* + * Standard detach entry point + */ +static int +vmbus_pcib_detach(device_t dev) +{ + struct vmbus_pcib_softc *sc = device_get_softc(dev); + struct hv_pcibus *hbus = sc->hbus; + struct pci_message teardown_packet; + struct pci_bus_relations relations; + int ret; + + vmbus_pcib_pre_detach(hbus); + + if (hbus->state == hv_pcibus_installed) + bus_generic_detach(dev); + + /* Delete any children which might still exist. 
*/ + memset(&relations, 0, sizeof(relations)); + hv_pci_devices_present(hbus, &relations); + + ret = hv_send_resources_released(hbus); + if (ret) + device_printf(dev, "failed to send PCI_RESOURCES_RELEASED\n"); + + teardown_packet.type = PCI_BUS_D0EXIT; + ret = vmbus_chan_send(sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0, + &teardown_packet, sizeof(struct pci_message), 0); + if (ret) + device_printf(dev, "failed to send PCI_BUS_D0EXIT\n"); + + taskqueue_drain_all(hbus->sc->taskq); + vmbus_chan_close(sc->chan); + taskqueue_free(sc->taskq); + + free_completion(&hbus->query_completion); + free(sc->rx_buf, M_DEVBUF); + bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res); + + mtx_destroy(&hbus->device_list_lock); + mtx_destroy(&hbus->config_lock); + free(hbus, M_DEVBUF); + + return (0); +} + +static int +vmbus_pcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *val) +{ + struct vmbus_pcib_softc *sc = device_get_softc(dev); + + switch (which) { + case PCIB_IVAR_DOMAIN: + *val = sc->hbus->pci_domain; + return (0); + + case PCIB_IVAR_BUS: + /* There is only bus 0. */ + *val = 0; + return (0); + } + return (ENOENT); +} + +static int +vmbus_pcib_write_ivar(device_t dev, device_t child, int which, uintptr_t val) +{ + return (ENOENT); +} + +static struct resource * +vmbus_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid, + rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) +{ + unsigned int bar_no; + struct hv_pci_dev *hpdev; + struct vmbus_pcib_softc *sc = device_get_softc(dev); + struct resource *res; + unsigned int devfn; + + if (type == PCI_RES_BUS) + return (pci_domain_alloc_bus(sc->hbus->pci_domain, child, rid, + start, end, count, flags)); + + /* Devices with port I/O BAR are not supported. 
*/ + if (type == SYS_RES_IOPORT) + return (NULL); + + if (type == SYS_RES_MEMORY) { + devfn = PCI_DEVFN(pci_get_slot(child), + pci_get_function(child)); + hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn)); + if (!hpdev) + return (NULL); + + bar_no = PCI_RID2BAR(*rid); + if (bar_no >= MAX_NUM_BARS) + return (NULL); + + /* Make sure a 32-bit BAR gets a 32-bit address */ + if (!(hpdev->probed_bar[bar_no] & PCIM_BAR_MEM_64)) + end = ulmin(end, 0xFFFFFFFF); + } + + res = bus_generic_alloc_resource(dev, child, type, rid, + start, end, count, flags); + /* + * If this is a request for a specific range, assume it is + * correct and pass it up to the parent. + */ + if (res == NULL && start + count - 1 == end) + res = bus_generic_alloc_resource(dev, child, type, rid, + start, end, count, flags); + return (res); +} + +static int +vmbus_pcib_release_resource(device_t dev, device_t child, int type, int rid, + struct resource *r) +{ + struct vmbus_pcib_softc *sc = device_get_softc(dev); + + if (type == PCI_RES_BUS) + return (pci_domain_release_bus(sc->hbus->pci_domain, child, + rid, r)); + + if (type == SYS_RES_IOPORT) + return (EINVAL); + + return (bus_generic_release_resource(dev, child, type, rid, r)); +} + +#if __FreeBSD_version >= 1100000 +static int +vmbus_pcib_get_cpus(device_t pcib, device_t dev, enum cpu_sets op, + size_t setsize, cpuset_t *cpuset) +{ + return (bus_get_cpus(pcib, op, setsize, cpuset)); +} +#endif + +static uint32_t +vmbus_pcib_read_config(device_t dev, u_int bus, u_int slot, u_int func, + u_int reg, int bytes) +{ + struct vmbus_pcib_softc *sc = device_get_softc(dev); + struct hv_pci_dev *hpdev; + unsigned int devfn = PCI_DEVFN(slot, func); + uint32_t data = 0; + + KASSERT(bus == 0, ("bus should be 0, but is %u", bus)); + + hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn)); + if (!hpdev) + return (~0); + + _hv_pcifront_read_config(hpdev, reg, bytes, &data); + + return (data); +} + +static void +vmbus_pcib_write_config(device_t dev, 
u_int bus, u_int slot, u_int func, + u_int reg, uint32_t data, int bytes) +{ + struct vmbus_pcib_softc *sc = device_get_softc(dev); + struct hv_pci_dev *hpdev; + unsigned int devfn = PCI_DEVFN(slot, func); + + KASSERT(bus == 0, ("bus should be 0, but is %u", bus)); + + hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn)); + if (!hpdev) + return; + + _hv_pcifront_write_config(hpdev, reg, bytes, data); +} + +static int +vmbus_pcib_route_intr(device_t pcib, device_t dev, int pin) +{ + /* We only support MSI/MSI-X and don't support INTx interrupt. */ + return (PCI_INVALID_IRQ); +} + +static int +vmbus_pcib_alloc_msi(device_t pcib, device_t dev, int count, + int maxcount, int *irqs) +{ + return (PCIB_ALLOC_MSI(device_get_parent(pcib), dev, count, maxcount, + irqs)); +} + +static int +vmbus_pcib_release_msi(device_t pcib, device_t dev, int count, int *irqs) +{ + return (PCIB_RELEASE_MSI(device_get_parent(pcib), dev, count, irqs)); +} + +static int +vmbus_pcib_alloc_msix(device_t pcib, device_t dev, int *irq) +{ + return (PCIB_ALLOC_MSIX(device_get_parent(pcib), dev, irq)); +} + +static int +vmbus_pcib_release_msix(device_t pcib, device_t dev, int irq) +{ + return (PCIB_RELEASE_MSIX(device_get_parent(pcib), dev, irq)); +} + +#define MSI_INTEL_ADDR_DEST 0x000ff000 +#define MSI_INTEL_DATA_INTVEC IOART_INTVEC /* Interrupt vector. 
*/ +#define MSI_INTEL_DATA_DELFIXED IOART_DELFIXED + +static int +vmbus_pcib_map_msi(device_t pcib, device_t child, int irq, + uint64_t *addr, uint32_t *data) +{ + unsigned int devfn; + struct hv_pci_dev *hpdev; + + uint64_t v_addr; + uint32_t v_data; + struct hv_irq_desc *hid, *tmp_hid; + unsigned int cpu, vcpu_id; + unsigned int vector; + + struct vmbus_pcib_softc *sc = device_get_softc(pcib); + struct pci_create_interrupt *int_pkt; + struct compose_comp_ctxt comp; + struct { + struct pci_packet pkt; + uint8_t buffer[sizeof(struct pci_create_interrupt)]; + } ctxt; + + int ret; + + devfn = PCI_DEVFN(pci_get_slot(child), pci_get_function(child)); + hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn)); + if (!hpdev) + return (ENOENT); + + ret = PCIB_MAP_MSI(device_get_parent(pcib), child, irq, + &v_addr, &v_data); + if (ret) + return (ret); + + TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid) { + if (hid->irq == irq) { + TAILQ_REMOVE(&hpdev->irq_desc_list, hid, link); + hv_int_desc_free(hpdev, hid); + break; + } + } + + cpu = (v_addr & MSI_INTEL_ADDR_DEST) >> 12; + vcpu_id = VMBUS_GET_VCPU_ID(device_get_parent(pcib), pcib, cpu); + vector = v_data & MSI_INTEL_DATA_INTVEC; + + init_completion(&comp.comp_pkt.host_event); + + memset(&ctxt, 0, sizeof(ctxt)); + ctxt.pkt.completion_func = hv_pci_compose_compl; + ctxt.pkt.compl_ctxt = ∁ + + int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message; + int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; + int_pkt->wslot.val = hpdev->desc.wslot.val; + int_pkt->int_desc.vector = vector; + int_pkt->int_desc.vector_count = 1; + int_pkt->int_desc.delivery_mode = MSI_INTEL_DATA_DELFIXED; + int_pkt->int_desc.cpu_mask = 1ULL << vcpu_id; + + ret = vmbus_chan_send(sc->chan, VMBUS_CHANPKT_TYPE_INBAND, + VMBUS_CHANPKT_FLAG_RC, int_pkt, sizeof(*int_pkt), + (uint64_t)(uintptr_t)&ctxt.pkt); + if (ret) { + free_completion(&comp.comp_pkt.host_event); + return (ret); + } + + 
wait_for_completion(&comp.comp_pkt.host_event); + free_completion(&comp.comp_pkt.host_event); + + if (comp.comp_pkt.completion_status < 0) + return (EPROTO); + + *addr = comp.int_desc.address; + *data = comp.int_desc.data; + + hid = malloc(sizeof(struct hv_irq_desc), M_DEVBUF, M_WAITOK | M_ZERO); + hid->irq = irq; + hid->desc = comp.int_desc; + TAILQ_INSERT_TAIL(&hpdev->irq_desc_list, hid, link); + + return (0); +} + +static device_method_t vmbus_pcib_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, vmbus_pcib_probe), + DEVMETHOD(device_attach, vmbus_pcib_attach), + DEVMETHOD(device_detach, vmbus_pcib_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + /* Bus interface */ + DEVMETHOD(bus_read_ivar, vmbus_pcib_read_ivar), + DEVMETHOD(bus_write_ivar, vmbus_pcib_write_ivar), + DEVMETHOD(bus_alloc_resource, vmbus_pcib_alloc_resource), + DEVMETHOD(bus_release_resource, vmbus_pcib_release_resource), + DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), + DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), + DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), + DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), +#if __FreeBSD_version >= 1100000 + DEVMETHOD(bus_get_cpus, vmbus_pcib_get_cpus), +#endif + + /* pcib interface */ + DEVMETHOD(pcib_maxslots, pcib_maxslots), + DEVMETHOD(pcib_read_config, vmbus_pcib_read_config), + DEVMETHOD(pcib_write_config, vmbus_pcib_write_config), + DEVMETHOD(pcib_route_interrupt, vmbus_pcib_route_intr), + DEVMETHOD(pcib_alloc_msi, vmbus_pcib_alloc_msi), + DEVMETHOD(pcib_release_msi, vmbus_pcib_release_msi), + DEVMETHOD(pcib_alloc_msix, vmbus_pcib_alloc_msix), + DEVMETHOD(pcib_release_msix, vmbus_pcib_release_msix), + DEVMETHOD(pcib_map_msi, vmbus_pcib_map_msi), + DEVMETHOD(pcib_request_feature, pcib_request_feature_allow), + + DEVMETHOD_END +}; + +static devclass_t pcib_devclass; + 
+DEFINE_CLASS_0(pcib, vmbus_pcib_driver, vmbus_pcib_methods, + sizeof(struct vmbus_pcib_softc)); +DRIVER_MODULE(vmbus_pcib, vmbus, vmbus_pcib_driver, pcib_devclass, 0, 0); +MODULE_DEPEND(vmbus_pcib, vmbus, 1, 1, 1); +MODULE_DEPEND(vmbus_pcib, pci, 1, 1, 1); + +#endif /* NEW_PCIB */ diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c new file mode 100644 index 000000000000..702308e26a1d --- /dev/null +++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c @@ -0,0 +1,2515 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * StorVSC driver for Hyper-V. This driver presents a SCSI HBA interface + * to the Comman Access Method (CAM) layer. CAM control blocks (CCBs) are + * converted into VSCSI protocol messages which are delivered to the parent + * partition StorVSP driver over the Hyper-V VMBUS. + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/condvar.h> +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/sysctl.h> +#include <sys/sockio.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/kernel.h> +#include <sys/queue.h> +#include <sys/lock.h> +#include <sys/sx.h> +#include <sys/taskqueue.h> +#include <sys/bus.h> +#include <sys/mutex.h> +#include <sys/callout.h> +#include <sys/smp.h> +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/uma.h> +#include <sys/lock.h> +#include <sys/sema.h> +#include <sys/sglist.h> +#include <sys/eventhandler.h> +#include <machine/bus.h> + +#include <cam/cam.h> +#include <cam/cam_ccb.h> +#include <cam/cam_periph.h> +#include <cam/cam_sim.h> +#include <cam/cam_xpt_sim.h> +#include <cam/cam_xpt_internal.h> +#include <cam/cam_debug.h> +#include <cam/scsi/scsi_all.h> +#include <cam/scsi/scsi_message.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/vmbus.h> +#include "hv_vstorage.h" +#include "vmbus_if.h" + +#define STORVSC_MAX_LUNS_PER_TARGET (64) +#define STORVSC_MAX_IO_REQUESTS 
(STORVSC_MAX_LUNS_PER_TARGET * 2) +#define BLKVSC_MAX_IDE_DISKS_PER_TARGET (1) +#define BLKVSC_MAX_IO_REQUESTS STORVSC_MAX_IO_REQUESTS +#define STORVSC_MAX_TARGETS (2) + +#define VSTOR_PKT_SIZE (sizeof(struct vstor_packet) - vmscsi_size_delta) + +/* + * 33 segments are needed to allow 128KB maxio, in case the data + * in the first page is _not_ PAGE_SIZE aligned, e.g. + * + * |<----------- 128KB ----------->| + * | | + * 0 2K 4K 8K 16K 124K 128K 130K + * | | | | | | | | + * +--+--+-----+-----+.......+-----+--+--+ + * | | | | | | | | | DATA + * | | | | | | | | | + * +--+--+-----+-----+.......------+--+--+ + * | | | | + * | 1| 31 | 1| ...... # of segments + */ +#define STORVSC_DATA_SEGCNT_MAX 33 +#define STORVSC_DATA_SEGSZ_MAX PAGE_SIZE +#define STORVSC_DATA_SIZE_MAX \ + ((STORVSC_DATA_SEGCNT_MAX - 1) * STORVSC_DATA_SEGSZ_MAX) + +struct storvsc_softc; + +struct hv_sgl_node { + LIST_ENTRY(hv_sgl_node) link; + struct sglist *sgl_data; +}; + +struct hv_sgl_page_pool{ + LIST_HEAD(, hv_sgl_node) in_use_sgl_list; + LIST_HEAD(, hv_sgl_node) free_sgl_list; + boolean_t is_init; +} g_hv_sgl_page_pool; + +enum storvsc_request_type { + WRITE_TYPE, + READ_TYPE, + UNKNOWN_TYPE +}; + +SYSCTL_NODE(_hw, OID_AUTO, storvsc, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, + "Hyper-V storage interface"); + +static u_int hv_storvsc_use_win8ext_flags = 1; +SYSCTL_UINT(_hw_storvsc, OID_AUTO, use_win8ext_flags, CTLFLAG_RW, + &hv_storvsc_use_win8ext_flags, 0, + "Use win8 extension flags or not"); + +static u_int hv_storvsc_use_pim_unmapped = 1; +SYSCTL_UINT(_hw_storvsc, OID_AUTO, use_pim_unmapped, CTLFLAG_RDTUN, + &hv_storvsc_use_pim_unmapped, 0, + "Optimize storvsc by using unmapped I/O"); + +static u_int hv_storvsc_ringbuffer_size = (64 * PAGE_SIZE); +SYSCTL_UINT(_hw_storvsc, OID_AUTO, ringbuffer_size, CTLFLAG_RDTUN, + &hv_storvsc_ringbuffer_size, 0, "Hyper-V storage ringbuffer size"); + +static u_int hv_storvsc_max_io = 512; +SYSCTL_UINT(_hw_storvsc, OID_AUTO, max_io, CTLFLAG_RDTUN, + 
&hv_storvsc_max_io, 0, "Hyper-V storage max io limit"); + +static int hv_storvsc_chan_cnt = 0; +SYSCTL_INT(_hw_storvsc, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, + &hv_storvsc_chan_cnt, 0, "# of channels to use"); +#ifdef DIAGNOSTIC +static int hv_storvsc_srb_status = -1; +SYSCTL_INT(_hw_storvsc, OID_AUTO, srb_status, CTLFLAG_RW, + &hv_storvsc_srb_status, 0, "srb_status to inject"); +TUNABLE_INT("hw_storvsc.srb_status", &hv_storvsc_srb_status); +#endif /* DIAGNOSTIC */ + +#define STORVSC_MAX_IO \ + vmbus_chan_prplist_nelem(hv_storvsc_ringbuffer_size, \ + STORVSC_DATA_SEGCNT_MAX, VSTOR_PKT_SIZE) + +struct hv_storvsc_sysctl { + u_long data_bio_cnt; + u_long data_vaddr_cnt; + u_long data_sg_cnt; + u_long chan_send_cnt[MAXCPU]; +}; + +struct storvsc_gpa_range { + struct vmbus_gpa_range gpa_range; + uint64_t gpa_page[STORVSC_DATA_SEGCNT_MAX]; +} __packed; + +struct hv_storvsc_request { + LIST_ENTRY(hv_storvsc_request) link; + struct vstor_packet vstor_packet; + int prp_cnt; + struct storvsc_gpa_range prp_list; + void *sense_data; + uint8_t sense_info_len; + uint8_t retries; + union ccb *ccb; + struct storvsc_softc *softc; + struct callout callout; + struct sema synch_sema; /*Synchronize the request/response if needed */ + struct sglist *bounce_sgl; + unsigned int bounce_sgl_count; + uint64_t not_aligned_seg_bits; + bus_dmamap_t data_dmap; +}; + +struct storvsc_softc { + struct vmbus_channel *hs_chan; + LIST_HEAD(, hv_storvsc_request) hs_free_list; + struct mtx hs_lock; + struct storvsc_driver_props *hs_drv_props; + int hs_unit; + uint32_t hs_frozen; + struct cam_sim *hs_sim; + struct cam_path *hs_path; + uint32_t hs_num_out_reqs; + boolean_t hs_destroy; + boolean_t hs_drain_notify; + struct sema hs_drain_sema; + struct hv_storvsc_request hs_init_req; + struct hv_storvsc_request hs_reset_req; + device_t hs_dev; + bus_dma_tag_t storvsc_req_dtag; + struct hv_storvsc_sysctl sysctl_data; + uint32_t hs_nchan; + struct vmbus_channel *hs_sel_chan[MAXCPU]; +}; + +static 
eventhandler_tag storvsc_handler_tag; +/* + * The size of the vmscsi_request has changed in win8. The + * additional size is for the newly added elements in the + * structure. These elements are valid only when we are talking + * to a win8 host. + * Track the correct size we need to apply. + */ +static int vmscsi_size_delta = sizeof(struct vmscsi_win8_extension); + +/** + * HyperV storvsc timeout testing cases: + * a. IO returned after first timeout; + * b. IO returned after second timeout and queue freeze; + * c. IO returned while timer handler is running + * The first can be tested by "sg_senddiag -vv /dev/daX", + * and the second and third can be done by + * "sg_wr_mode -v -p 08 -c 0,1a -m 0,ff /dev/daX". + */ +#define HVS_TIMEOUT_TEST 0 + +/* + * Bus/adapter reset functionality on the Hyper-V host is + * buggy and it will be disabled until + * it can be further tested. + */ +#define HVS_HOST_RESET 0 + +struct storvsc_driver_props { + char *drv_name; + char *drv_desc; + uint8_t drv_max_luns_per_target; + uint32_t drv_max_ios_per_target; + uint32_t drv_ringbuffer_size; +}; + +enum hv_storage_type { + DRIVER_BLKVSC, + DRIVER_STORVSC, + DRIVER_UNKNOWN +}; + +#define HS_MAX_ADAPTERS 10 + +#define HV_STORAGE_SUPPORTS_MULTI_CHANNEL 0x1 + +/* {ba6163d9-04a1-4d29-b605-72e2ffb1dc7f} */ +static const struct hyperv_guid gStorVscDeviceType={ + .hv_guid = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d, + 0xb6, 0x05, 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f} +}; + +/* {32412632-86cb-44a2-9b5c-50d1417354f5} */ +static const struct hyperv_guid gBlkVscDeviceType={ + .hv_guid = {0x32, 0x26, 0x41, 0x32, 0xcb, 0x86, 0xa2, 0x44, + 0x9b, 0x5c, 0x50, 0xd1, 0x41, 0x73, 0x54, 0xf5} +}; + +static struct storvsc_driver_props g_drv_props_table[] = { + {"blkvsc", "Hyper-V IDE", + BLKVSC_MAX_IDE_DISKS_PER_TARGET, BLKVSC_MAX_IO_REQUESTS, + 20*PAGE_SIZE}, + {"storvsc", "Hyper-V SCSI", + STORVSC_MAX_LUNS_PER_TARGET, STORVSC_MAX_IO_REQUESTS, + 20*PAGE_SIZE} +}; + +/* + * Sense buffer size changed in 
win8; have a run-time + * variable to track the size we should use. + */ +static int sense_buffer_size = PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE; + +/* + * The storage protocol version is determined during the + * initial exchange with the host. It will indicate which + * storage functionality is available in the host. +*/ +static int vmstor_proto_version; + +struct vmstor_proto { + int proto_version; + int sense_buffer_size; + int vmscsi_size_delta; +}; + +static const struct vmstor_proto vmstor_proto_list[] = { + { + VMSTOR_PROTOCOL_VERSION_WIN10, + POST_WIN7_STORVSC_SENSE_BUFFER_SIZE, + 0 + }, + { + VMSTOR_PROTOCOL_VERSION_WIN8_1, + POST_WIN7_STORVSC_SENSE_BUFFER_SIZE, + 0 + }, + { + VMSTOR_PROTOCOL_VERSION_WIN8, + POST_WIN7_STORVSC_SENSE_BUFFER_SIZE, + 0 + }, + { + VMSTOR_PROTOCOL_VERSION_WIN7, + PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE, + sizeof(struct vmscsi_win8_extension), + }, + { + VMSTOR_PROTOCOL_VERSION_WIN6, + PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE, + sizeof(struct vmscsi_win8_extension), + } +}; + +/* static functions */ +static int storvsc_probe(device_t dev); +static int storvsc_attach(device_t dev); +static int storvsc_detach(device_t dev); +static void storvsc_poll(struct cam_sim * sim); +static void storvsc_action(struct cam_sim * sim, union ccb * ccb); +static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp); +static void storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp); +static enum hv_storage_type storvsc_get_storage_type(device_t dev); +static void hv_storvsc_rescan_target(struct storvsc_softc *sc); +static void hv_storvsc_on_channel_callback(struct vmbus_channel *chan, void *xsc); +static void hv_storvsc_on_iocompletion( struct storvsc_softc *sc, + struct vstor_packet *vstor_packet, + struct hv_storvsc_request *request); +static int hv_storvsc_connect_vsp(struct storvsc_softc *); +static void storvsc_io_done(struct hv_storvsc_request *reqp); +static void storvsc_copy_sgl_to_bounce_buf(struct sglist 
*bounce_sgl, + bus_dma_segment_t *orig_sgl, + unsigned int orig_sgl_count, + uint64_t seg_bits); +void storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, + unsigned int dest_sgl_count, + struct sglist* src_sgl, + uint64_t seg_bits); + +static device_method_t storvsc_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, storvsc_probe), + DEVMETHOD(device_attach, storvsc_attach), + DEVMETHOD(device_detach, storvsc_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD_END +}; + +static driver_t storvsc_driver = { + "storvsc", storvsc_methods, sizeof(struct storvsc_softc), +}; + +static devclass_t storvsc_devclass; +DRIVER_MODULE(storvsc, vmbus, storvsc_driver, storvsc_devclass, 0, 0); +MODULE_VERSION(storvsc, 1); +MODULE_DEPEND(storvsc, vmbus, 1, 1, 1); + +static void +storvsc_subchan_attach(struct storvsc_softc *sc, + struct vmbus_channel *new_channel) +{ + struct vmstor_chan_props props; + int ret = 0; + + memset(&props, 0, sizeof(props)); + + vmbus_chan_cpu_rr(new_channel); + ret = vmbus_chan_open(new_channel, + sc->hs_drv_props->drv_ringbuffer_size, + sc->hs_drv_props->drv_ringbuffer_size, + (void *)&props, + sizeof(struct vmstor_chan_props), + hv_storvsc_on_channel_callback, sc); +} + +/** + * @brief Send multi-channel creation request to host + * + * @param device a Hyper-V device pointer + * @param max_chans the max channels supported by vmbus + */ +static void +storvsc_send_multichannel_request(struct storvsc_softc *sc, int max_subch) +{ + struct vmbus_channel **subchan; + struct hv_storvsc_request *request; + struct vstor_packet *vstor_packet; + int request_subch; + int ret, i; + + /* get sub-channel count that need to create */ + request_subch = MIN(max_subch, mp_ncpus - 1); + + request = &sc->hs_init_req; + + /* request the host to create multi-channel */ + memset(request, 0, sizeof(struct hv_storvsc_request)); + + sema_init(&request->synch_sema, 0, ("stor_synch_sema")); + + vstor_packet = &request->vstor_packet; 
+ + vstor_packet->operation = VSTOR_OPERATION_CREATE_MULTI_CHANNELS; + vstor_packet->flags = REQUEST_COMPLETION_FLAG; + vstor_packet->u.multi_channels_cnt = request_subch; + + ret = vmbus_chan_send(sc->hs_chan, + VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, + vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); + + sema_wait(&request->synch_sema); + + if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || + vstor_packet->status != 0) { + printf("Storvsc_error: create multi-channel invalid operation " + "(%d) or statue (%u)\n", + vstor_packet->operation, vstor_packet->status); + return; + } + + /* Update channel count */ + sc->hs_nchan = request_subch + 1; + + /* Wait for sub-channels setup to complete. */ + subchan = vmbus_subchan_get(sc->hs_chan, request_subch); + + /* Attach the sub-channels. */ + for (i = 0; i < request_subch; ++i) + storvsc_subchan_attach(sc, subchan[i]); + + /* Release the sub-channels. */ + vmbus_subchan_rel(subchan, request_subch); + + if (bootverbose) + printf("Storvsc create multi-channel success!\n"); +} + +/** + * @brief initialize channel connection to parent partition + * + * @param dev a Hyper-V device pointer + * @returns 0 on success, non-zero error on failure + */ +static int +hv_storvsc_channel_init(struct storvsc_softc *sc) +{ + int ret = 0, i; + struct hv_storvsc_request *request; + struct vstor_packet *vstor_packet; + uint16_t max_subch; + boolean_t support_multichannel; + uint32_t version; + + max_subch = 0; + support_multichannel = FALSE; + + request = &sc->hs_init_req; + memset(request, 0, sizeof(struct hv_storvsc_request)); + vstor_packet = &request->vstor_packet; + request->softc = sc; + + /** + * Initiate the vsc/vsp initialization protocol on the open channel + */ + sema_init(&request->synch_sema, 0, ("stor_synch_sema")); + + vstor_packet->operation = VSTOR_OPERATION_BEGININITIALIZATION; + vstor_packet->flags = REQUEST_COMPLETION_FLAG; + + + ret = vmbus_chan_send(sc->hs_chan, + 
VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, + vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); + + if (ret != 0) + goto cleanup; + + sema_wait(&request->synch_sema); + + if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || + vstor_packet->status != 0) { + goto cleanup; + } + + for (i = 0; i < nitems(vmstor_proto_list); i++) { + /* reuse the packet for version range supported */ + + memset(vstor_packet, 0, sizeof(struct vstor_packet)); + vstor_packet->operation = VSTOR_OPERATION_QUERYPROTOCOLVERSION; + vstor_packet->flags = REQUEST_COMPLETION_FLAG; + + vstor_packet->u.version.major_minor = + vmstor_proto_list[i].proto_version; + + /* revision is only significant for Windows guests */ + vstor_packet->u.version.revision = 0; + + ret = vmbus_chan_send(sc->hs_chan, + VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, + vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); + + if (ret != 0) + goto cleanup; + + sema_wait(&request->synch_sema); + + if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO) { + ret = EINVAL; + goto cleanup; + } + if (vstor_packet->status == 0) { + vmstor_proto_version = + vmstor_proto_list[i].proto_version; + sense_buffer_size = + vmstor_proto_list[i].sense_buffer_size; + vmscsi_size_delta = + vmstor_proto_list[i].vmscsi_size_delta; + break; + } + } + + if (vstor_packet->status != 0) { + ret = EINVAL; + goto cleanup; + } + /** + * Query channel properties + */ + memset(vstor_packet, 0, sizeof(struct vstor_packet)); + vstor_packet->operation = VSTOR_OPERATION_QUERYPROPERTIES; + vstor_packet->flags = REQUEST_COMPLETION_FLAG; + + ret = vmbus_chan_send(sc->hs_chan, + VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, + vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); + + if ( ret != 0) + goto cleanup; + + sema_wait(&request->synch_sema); + + /* TODO: Check returned version */ + if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || + vstor_packet->status != 0) { + goto cleanup; + } + + max_subch 
= vstor_packet->u.chan_props.max_channel_cnt; + if (hv_storvsc_chan_cnt > 0 && hv_storvsc_chan_cnt < (max_subch + 1)) + max_subch = hv_storvsc_chan_cnt - 1; + + /* multi-channels feature is supported by WIN8 and above version */ + version = VMBUS_GET_VERSION(device_get_parent(sc->hs_dev), sc->hs_dev); + if (version != VMBUS_VERSION_WIN7 && version != VMBUS_VERSION_WS2008 && + (vstor_packet->u.chan_props.flags & + HV_STORAGE_SUPPORTS_MULTI_CHANNEL)) { + support_multichannel = TRUE; + } + if (bootverbose) { + device_printf(sc->hs_dev, "max chans %d%s\n", max_subch + 1, + support_multichannel ? ", multi-chan capable" : ""); + } + + memset(vstor_packet, 0, sizeof(struct vstor_packet)); + vstor_packet->operation = VSTOR_OPERATION_ENDINITIALIZATION; + vstor_packet->flags = REQUEST_COMPLETION_FLAG; + + ret = vmbus_chan_send(sc->hs_chan, + VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, + vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); + + if (ret != 0) { + goto cleanup; + } + + sema_wait(&request->synch_sema); + + if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || + vstor_packet->status != 0) + goto cleanup; + + /* + * If multi-channel is supported, send multichannel create + * request to host. + */ + if (support_multichannel && max_subch > 0) + storvsc_send_multichannel_request(sc, max_subch); +cleanup: + sema_destroy(&request->synch_sema); + return (ret); +} + +/** + * @brief Open channel connection to paraent partition StorVSP driver + * + * Open and initialize channel connection to parent partition StorVSP driver. 
+ * + * @param pointer to a Hyper-V device + * @returns 0 on success, non-zero error on failure + */ +static int +hv_storvsc_connect_vsp(struct storvsc_softc *sc) +{ + int ret = 0; + struct vmstor_chan_props props; + + memset(&props, 0, sizeof(struct vmstor_chan_props)); + + /* + * Open the channel + */ + vmbus_chan_cpu_rr(sc->hs_chan); + ret = vmbus_chan_open( + sc->hs_chan, + sc->hs_drv_props->drv_ringbuffer_size, + sc->hs_drv_props->drv_ringbuffer_size, + (void *)&props, + sizeof(struct vmstor_chan_props), + hv_storvsc_on_channel_callback, sc); + + if (ret != 0) { + return ret; + } + + ret = hv_storvsc_channel_init(sc); + return (ret); +} + +#if HVS_HOST_RESET +static int +hv_storvsc_host_reset(struct storvsc_softc *sc) +{ + int ret = 0; + + struct hv_storvsc_request *request; + struct vstor_packet *vstor_packet; + + request = &sc->hs_reset_req; + request->softc = sc; + vstor_packet = &request->vstor_packet; + + sema_init(&request->synch_sema, 0, "stor synch sema"); + + vstor_packet->operation = VSTOR_OPERATION_RESETBUS; + vstor_packet->flags = REQUEST_COMPLETION_FLAG; + + ret = vmbus_chan_send(dev->channel, + VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, + vstor_packet, VSTOR_PKT_SIZE, + (uint64_t)(uintptr_t)&sc->hs_reset_req); + + if (ret != 0) { + goto cleanup; + } + + sema_wait(&request->synch_sema); + + /* + * At this point, all outstanding requests in the adapter + * should have been flushed out and return to us + */ + +cleanup: + sema_destroy(&request->synch_sema); + return (ret); +} +#endif /* HVS_HOST_RESET */ + +/** + * @brief Function to initiate an I/O request + * + * @param device Hyper-V device pointer + * @param request pointer to a request structure + * @returns 0 on success, non-zero error on failure + */ +static int +hv_storvsc_io_request(struct storvsc_softc *sc, + struct hv_storvsc_request *request) +{ + struct vstor_packet *vstor_packet = &request->vstor_packet; + struct vmbus_channel* outgoing_channel = NULL; + int ret = 0, ch_sel; + + 
vstor_packet->flags |= REQUEST_COMPLETION_FLAG; + + vstor_packet->u.vm_srb.length = + sizeof(struct vmscsi_req) - vmscsi_size_delta; + + vstor_packet->u.vm_srb.sense_info_len = sense_buffer_size; + + vstor_packet->u.vm_srb.transfer_len = + request->prp_list.gpa_range.gpa_len; + + vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB; + + ch_sel = (vstor_packet->u.vm_srb.lun + curcpu) % sc->hs_nchan; + /* + * If we are panic'ing, then we are dumping core. Since storvsc_polls + * always uses sc->hs_chan, then we must send to that channel or a poll + * timeout will occur. + */ + if (panicstr) { + outgoing_channel = sc->hs_chan; + } else { + outgoing_channel = sc->hs_sel_chan[ch_sel]; + } + + mtx_unlock(&request->softc->hs_lock); + if (request->prp_list.gpa_range.gpa_len) { + ret = vmbus_chan_send_prplist(outgoing_channel, + &request->prp_list.gpa_range, request->prp_cnt, + vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); + } else { + ret = vmbus_chan_send(outgoing_channel, + VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, + vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); + } + /* statistic for successful request sending on each channel */ + if (!ret) { + sc->sysctl_data.chan_send_cnt[ch_sel]++; + } + mtx_lock(&request->softc->hs_lock); + + if (ret != 0) { + printf("Unable to send packet %p ret %d", vstor_packet, ret); + } else { + atomic_add_int(&sc->hs_num_out_reqs, 1); + } + + return (ret); +} + + +/** + * Process IO_COMPLETION_OPERATION and ready + * the result to be completed for upper layer + * processing by the CAM layer. + */ +static void +hv_storvsc_on_iocompletion(struct storvsc_softc *sc, + struct vstor_packet *vstor_packet, + struct hv_storvsc_request *request) +{ + struct vmscsi_req *vm_srb; + + vm_srb = &vstor_packet->u.vm_srb; + + /* + * Copy some fields of the host's response into the request structure, + * because the fields will be used later in storvsc_io_done(). 
+ */ + request->vstor_packet.u.vm_srb.scsi_status = vm_srb->scsi_status; + request->vstor_packet.u.vm_srb.srb_status = vm_srb->srb_status; + request->vstor_packet.u.vm_srb.transfer_len = vm_srb->transfer_len; + + if (((vm_srb->scsi_status & 0xFF) == SCSI_STATUS_CHECK_COND) && + (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID)) { + /* Autosense data available */ + + KASSERT(vm_srb->sense_info_len <= request->sense_info_len, + ("vm_srb->sense_info_len <= " + "request->sense_info_len")); + + memcpy(request->sense_data, vm_srb->u.sense_data, + vm_srb->sense_info_len); + + request->sense_info_len = vm_srb->sense_info_len; + } + + /* Complete request by passing to the CAM layer */ + storvsc_io_done(request); + atomic_subtract_int(&sc->hs_num_out_reqs, 1); + if (sc->hs_drain_notify && (sc->hs_num_out_reqs == 0)) { + sema_post(&sc->hs_drain_sema); + } +} + +static void +hv_storvsc_rescan_target(struct storvsc_softc *sc) +{ + path_id_t pathid; + target_id_t targetid; + union ccb *ccb; + + pathid = cam_sim_path(sc->hs_sim); + targetid = CAM_TARGET_WILDCARD; + + /* + * Allocate a CCB and schedule a rescan. 
+ */ + ccb = xpt_alloc_ccb_nowait(); + if (ccb == NULL) { + printf("unable to alloc CCB for rescan\n"); + return; + } + + if (xpt_create_path(&ccb->ccb_h.path, NULL, pathid, targetid, + CAM_LUN_WILDCARD) != CAM_REQ_CMP) { + printf("unable to create path for rescan, pathid: %u," + "targetid: %u\n", pathid, targetid); + xpt_free_ccb(ccb); + return; + } + + if (targetid == CAM_TARGET_WILDCARD) + ccb->ccb_h.func_code = XPT_SCAN_BUS; + else + ccb->ccb_h.func_code = XPT_SCAN_TGT; + + xpt_rescan(ccb); +} + +static void +hv_storvsc_on_channel_callback(struct vmbus_channel *channel, void *xsc) +{ + int ret = 0; + struct storvsc_softc *sc = xsc; + uint32_t bytes_recvd; + uint64_t request_id; + uint8_t packet[roundup2(sizeof(struct vstor_packet), 8)]; + struct hv_storvsc_request *request; + struct vstor_packet *vstor_packet; + + bytes_recvd = roundup2(VSTOR_PKT_SIZE, 8); + ret = vmbus_chan_recv(channel, packet, &bytes_recvd, &request_id); + KASSERT(ret != ENOBUFS, ("storvsc recvbuf is not large enough")); + /* XXX check bytes_recvd to make sure that it contains enough data */ + + while ((ret == 0) && (bytes_recvd > 0)) { + request = (struct hv_storvsc_request *)(uintptr_t)request_id; + + if ((request == &sc->hs_init_req) || + (request == &sc->hs_reset_req)) { + memcpy(&request->vstor_packet, packet, + sizeof(struct vstor_packet)); + sema_post(&request->synch_sema); + } else { + vstor_packet = (struct vstor_packet *)packet; + switch(vstor_packet->operation) { + case VSTOR_OPERATION_COMPLETEIO: + if (request == NULL) + panic("VMBUS: storvsc received a " + "packet with NULL request id in " + "COMPLETEIO operation."); + + hv_storvsc_on_iocompletion(sc, + vstor_packet, request); + break; + case VSTOR_OPERATION_REMOVEDEVICE: + printf("VMBUS: storvsc operation %d not " + "implemented.\n", vstor_packet->operation); + /* TODO: implement */ + break; + case VSTOR_OPERATION_ENUMERATE_BUS: + hv_storvsc_rescan_target(sc); + break; + default: + break; + } + } + + bytes_recvd = 
roundup2(VSTOR_PKT_SIZE, 8), + ret = vmbus_chan_recv(channel, packet, &bytes_recvd, + &request_id); + KASSERT(ret != ENOBUFS, + ("storvsc recvbuf is not large enough")); + /* + * XXX check bytes_recvd to make sure that it contains + * enough data + */ + } +} + +/** + * @brief StorVSC probe function + * + * Device probe function. Returns BUS_PROBE_DEFAULT if the input device is a StorVSC + * device. Otherwise, ENXIO is returned. If the input device is + * for BlkVSC (paravirtual IDE) device and this support is disabled in + * favor of the emulated ATA/IDE device, return ENXIO. + * + * @param a device + * @returns BUS_PROBE_DEFAULT on success, ENXIO if not a matching StorVSC device + */ +static int +storvsc_probe(device_t dev) +{ + int ret = ENXIO; + + switch (storvsc_get_storage_type(dev)) { + case DRIVER_BLKVSC: + if(bootverbose) + device_printf(dev, + "Enlightened ATA/IDE detected\n"); + device_set_desc(dev, g_drv_props_table[DRIVER_BLKVSC].drv_desc); + ret = BUS_PROBE_DEFAULT; + break; + case DRIVER_STORVSC: + if(bootverbose) + device_printf(dev, "Enlightened SCSI device detected\n"); + device_set_desc(dev, g_drv_props_table[DRIVER_STORVSC].drv_desc); + ret = BUS_PROBE_DEFAULT; + break; + default: + ret = ENXIO; + } + return (ret); +} + +static void +storvsc_create_chan_sel(struct storvsc_softc *sc) +{ + struct vmbus_channel **subch; + int i, nsubch; + + sc->hs_sel_chan[0] = sc->hs_chan; + nsubch = sc->hs_nchan - 1; + if (nsubch == 0) + return; + + subch = vmbus_subchan_get(sc->hs_chan, nsubch); + for (i = 0; i < nsubch; i++) + sc->hs_sel_chan[i + 1] = subch[i]; + vmbus_subchan_rel(subch, nsubch); +} + +static int +storvsc_init_requests(device_t dev) +{ + struct storvsc_softc *sc = device_get_softc(dev); + struct hv_storvsc_request *reqp; + int error, i; + + LIST_INIT(&sc->hs_free_list); + + error = bus_dma_tag_create( + bus_get_dma_tag(dev), /* parent */ + 1, /* alignment */ + PAGE_SIZE, /* boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* 
filter, filterarg */ + STORVSC_DATA_SIZE_MAX, /* maxsize */ + STORVSC_DATA_SEGCNT_MAX, /* nsegments */ + STORVSC_DATA_SEGSZ_MAX, /* maxsegsize */ + 0, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockfuncarg */ + &sc->storvsc_req_dtag); + if (error) { + device_printf(dev, "failed to create storvsc dma tag\n"); + return (error); + } + + for (i = 0; i < sc->hs_drv_props->drv_max_ios_per_target; ++i) { + reqp = malloc(sizeof(struct hv_storvsc_request), + M_DEVBUF, M_WAITOK|M_ZERO); + reqp->softc = sc; + error = bus_dmamap_create(sc->storvsc_req_dtag, 0, + &reqp->data_dmap); + if (error) { + device_printf(dev, "failed to allocate storvsc " + "data dmamap\n"); + goto cleanup; + } + LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link); + } + return (0); + +cleanup: + while ((reqp = LIST_FIRST(&sc->hs_free_list)) != NULL) { + LIST_REMOVE(reqp, link); + bus_dmamap_destroy(sc->storvsc_req_dtag, reqp->data_dmap); + free(reqp, M_DEVBUF); + } + return (error); +} + +static void +storvsc_sysctl(device_t dev) +{ + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + struct sysctl_oid *ch_tree, *chid_tree; + struct storvsc_softc *sc; + char name[16]; + int i; + + sc = device_get_softc(dev); + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_bio_cnt", CTLFLAG_RW, + &sc->sysctl_data.data_bio_cnt, "# of bio data block"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_vaddr_cnt", CTLFLAG_RW, + &sc->sysctl_data.data_vaddr_cnt, "# of vaddr data block"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_sg_cnt", CTLFLAG_RW, + &sc->sysctl_data.data_sg_cnt, "# of sg data block"); + + /* dev.storvsc.UNIT.channel */ + ch_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "channel", + CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + if (ch_tree == NULL) + return; + + for (i = 0; i < sc->hs_nchan; i++) { + uint32_t ch_id; + + ch_id = vmbus_chan_id(sc->hs_sel_chan[i]); + snprintf(name, sizeof(name), "%d", ch_id); + 
/* dev.storvsc.UNIT.channel.CHID */ + chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree), + OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + if (chid_tree == NULL) + return; + /* dev.storvsc.UNIT.channel.CHID.send_req */ + SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, + "send_req", CTLFLAG_RD, &sc->sysctl_data.chan_send_cnt[i], + "# of request sending from this channel"); + } +} + +/** + * @brief StorVSC attach function + * + * Function responsible for allocating per-device structures, + * setting up CAM interfaces and scanning for available LUNs to + * be used for SCSI device peripherals. + * + * @param a device + * @returns 0 on success or an error on failure + */ +static int +storvsc_attach(device_t dev) +{ + enum hv_storage_type stor_type; + struct storvsc_softc *sc; + struct cam_devq *devq; + int ret, i, j; + struct hv_storvsc_request *reqp; + struct root_hold_token *root_mount_token = NULL; + struct hv_sgl_node *sgl_node = NULL; + void *tmp_buff = NULL; + + /* + * We need to serialize storvsc attach calls. 
+ */ + root_mount_token = root_mount_hold("storvsc"); + + sc = device_get_softc(dev); + sc->hs_nchan = 1; + sc->hs_chan = vmbus_get_channel(dev); + + stor_type = storvsc_get_storage_type(dev); + + if (stor_type == DRIVER_UNKNOWN) { + ret = ENODEV; + goto cleanup; + } + + /* fill in driver specific properties */ + sc->hs_drv_props = &g_drv_props_table[stor_type]; + sc->hs_drv_props->drv_ringbuffer_size = hv_storvsc_ringbuffer_size; + sc->hs_drv_props->drv_max_ios_per_target = + MIN(STORVSC_MAX_IO, hv_storvsc_max_io); + if (bootverbose) { + printf("storvsc ringbuffer size: %d, max_io: %d\n", + sc->hs_drv_props->drv_ringbuffer_size, + sc->hs_drv_props->drv_max_ios_per_target); + } + /* fill in device specific properties */ + sc->hs_unit = device_get_unit(dev); + sc->hs_dev = dev; + + mtx_init(&sc->hs_lock, "hvslck", NULL, MTX_DEF); + + ret = storvsc_init_requests(dev); + if (ret != 0) + goto cleanup; + + /* create sg-list page pool */ + if (FALSE == g_hv_sgl_page_pool.is_init) { + g_hv_sgl_page_pool.is_init = TRUE; + LIST_INIT(&g_hv_sgl_page_pool.in_use_sgl_list); + LIST_INIT(&g_hv_sgl_page_pool.free_sgl_list); + + /* + * Pre-create SG list, each SG list with + * STORVSC_DATA_SEGCNT_MAX segments, each + * segment has one page buffer + */ + for (i = 0; i < sc->hs_drv_props->drv_max_ios_per_target; i++) { + sgl_node = malloc(sizeof(struct hv_sgl_node), + M_DEVBUF, M_WAITOK|M_ZERO); + + sgl_node->sgl_data = + sglist_alloc(STORVSC_DATA_SEGCNT_MAX, + M_WAITOK|M_ZERO); + + for (j = 0; j < STORVSC_DATA_SEGCNT_MAX; j++) { + tmp_buff = malloc(PAGE_SIZE, + M_DEVBUF, M_WAITOK|M_ZERO); + + sgl_node->sgl_data->sg_segs[j].ss_paddr = + (vm_paddr_t)tmp_buff; + } + + LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, + sgl_node, link); + } + } + + sc->hs_destroy = FALSE; + sc->hs_drain_notify = FALSE; + sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema"); + + ret = hv_storvsc_connect_vsp(sc); + if (ret != 0) { + goto cleanup; + } + + /* Construct cpu to channel mapping */ + 
storvsc_create_chan_sel(sc); + + /* + * Create the device queue. + * Hyper-V maps each target to one SCSI HBA + */ + devq = cam_simq_alloc(sc->hs_drv_props->drv_max_ios_per_target); + if (devq == NULL) { + device_printf(dev, "Failed to alloc device queue\n"); + ret = ENOMEM; + goto cleanup; + } + + sc->hs_sim = cam_sim_alloc(storvsc_action, + storvsc_poll, + sc->hs_drv_props->drv_name, + sc, + sc->hs_unit, + &sc->hs_lock, 1, + sc->hs_drv_props->drv_max_ios_per_target, + devq); + + if (sc->hs_sim == NULL) { + device_printf(dev, "Failed to alloc sim\n"); + cam_simq_free(devq); + ret = ENOMEM; + goto cleanup; + } + + mtx_lock(&sc->hs_lock); + /* bus_id is set to 0, need to get it from VMBUS channel query? */ + if (xpt_bus_register(sc->hs_sim, dev, 0) != CAM_SUCCESS) { + cam_sim_free(sc->hs_sim, /*free_devq*/TRUE); + mtx_unlock(&sc->hs_lock); + device_printf(dev, "Unable to register SCSI bus\n"); + ret = ENXIO; + goto cleanup; + } + + if (xpt_create_path(&sc->hs_path, /*periph*/NULL, + cam_sim_path(sc->hs_sim), + CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { + xpt_bus_deregister(cam_sim_path(sc->hs_sim)); + cam_sim_free(sc->hs_sim, /*free_devq*/TRUE); + mtx_unlock(&sc->hs_lock); + device_printf(dev, "Unable to create path\n"); + ret = ENXIO; + goto cleanup; + } + + mtx_unlock(&sc->hs_lock); + + storvsc_sysctl(dev); + + root_mount_rel(root_mount_token); + return (0); + + +cleanup: + root_mount_rel(root_mount_token); + while (!LIST_EMPTY(&sc->hs_free_list)) { + reqp = LIST_FIRST(&sc->hs_free_list); + LIST_REMOVE(reqp, link); + bus_dmamap_destroy(sc->storvsc_req_dtag, reqp->data_dmap); + free(reqp, M_DEVBUF); + } + + while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); + LIST_REMOVE(sgl_node, link); + for (j = 0; j < STORVSC_DATA_SEGCNT_MAX; j++) { + if (NULL != + (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) { + free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); + } + } + 
sglist_free(sgl_node->sgl_data); + free(sgl_node, M_DEVBUF); + } + + return (ret); +} + +/** + * @brief StorVSC device detach function + * + * This function is responsible for safely detaching a + * StorVSC device. This includes waiting for inbound responses + * to complete and freeing associated per-device structures. + * + * @param dev a device + * returns 0 on success + */ +static int +storvsc_detach(device_t dev) +{ + struct storvsc_softc *sc = device_get_softc(dev); + struct hv_storvsc_request *reqp = NULL; + struct hv_sgl_node *sgl_node = NULL; + int j = 0; + + sc->hs_destroy = TRUE; + + /* + * At this point, all outbound traffic should be disabled. We + * only allow inbound traffic (responses) to proceed so that + * outstanding requests can be completed. + */ + + sc->hs_drain_notify = TRUE; + sema_wait(&sc->hs_drain_sema); + sc->hs_drain_notify = FALSE; + + /* + * Since we have already drained, we don't need to busy wait. + * The call to close the channel will reset the callback + * under the protection of the incoming channel lock. + */ + + vmbus_chan_close(sc->hs_chan); + + mtx_lock(&sc->hs_lock); + while (!LIST_EMPTY(&sc->hs_free_list)) { + reqp = LIST_FIRST(&sc->hs_free_list); + LIST_REMOVE(reqp, link); + bus_dmamap_destroy(sc->storvsc_req_dtag, reqp->data_dmap); + free(reqp, M_DEVBUF); + } + mtx_unlock(&sc->hs_lock); + + while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); + LIST_REMOVE(sgl_node, link); + for (j = 0; j < STORVSC_DATA_SEGCNT_MAX; j++){ + if (NULL != + (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) { + free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); + } + } + sglist_free(sgl_node->sgl_data); + free(sgl_node, M_DEVBUF); + } + + return (0); +} + +#if HVS_TIMEOUT_TEST +/** + * @brief unit test for timed out operations + * + * This function provides unit testing capability to simulate + * timed out operations. Recompilation with HV_TIMEOUT_TEST=1 + * is required. 
+ * + * @param reqp pointer to a request structure + * @param opcode SCSI operation being performed + * @param wait if 1, wait for I/O to complete + */ +static void +storvsc_timeout_test(struct hv_storvsc_request *reqp, + uint8_t opcode, int wait) +{ + int ret; + union ccb *ccb = reqp->ccb; + struct storvsc_softc *sc = reqp->softc; + + if (reqp->vstor_packet.vm_srb.cdb[0] != opcode) { + return; + } + + if (wait) { + mtx_lock(&reqp->event.mtx); + } + ret = hv_storvsc_io_request(sc, reqp); + if (ret != 0) { + if (wait) { + mtx_unlock(&reqp->event.mtx); + } + printf("%s: io_request failed with %d.\n", + __func__, ret); + ccb->ccb_h.status = CAM_PROVIDE_FAIL; + mtx_lock(&sc->hs_lock); + storvsc_free_request(sc, reqp); + xpt_done(ccb); + mtx_unlock(&sc->hs_lock); + return; + } + + if (wait) { + xpt_print(ccb->ccb_h.path, + "%u: %s: waiting for IO return.\n", + ticks, __func__); + ret = cv_timedwait(&reqp->event.cv, &reqp->event.mtx, 60*hz); + mtx_unlock(&reqp->event.mtx); + xpt_print(ccb->ccb_h.path, "%u: %s: %s.\n", + ticks, __func__, (ret == 0)? + "IO return detected" : + "IO return not detected"); + /* + * Now both the timer handler and io done are running + * simultaneously. We want to confirm the io done always + * finishes after the timer handler exits. So reqp used by + * timer handler is not freed or stale. Do busy loop for + * another 1/10 second to make sure io done does + * wait for the timer handler to complete. + */ + DELAY(100*1000); + mtx_lock(&sc->hs_lock); + xpt_print(ccb->ccb_h.path, + "%u: %s: finishing, queue frozen %d, " + "ccb status 0x%x scsi_status 0x%x.\n", + ticks, __func__, sc->hs_frozen, + ccb->ccb_h.status, + ccb->csio.scsi_status); + mtx_unlock(&sc->hs_lock); + } +} +#endif /* HVS_TIMEOUT_TEST */ + +#ifdef notyet +/** + * @brief timeout handler for requests + * + * This function is called as a result of a callout expiring. 
+ * + * @param arg pointer to a request + */ +static void +storvsc_timeout(void *arg) +{ + struct hv_storvsc_request *reqp = arg; + struct storvsc_softc *sc = reqp->softc; + union ccb *ccb = reqp->ccb; + + if (reqp->retries == 0) { + mtx_lock(&sc->hs_lock); + xpt_print(ccb->ccb_h.path, + "%u: IO timed out (req=0x%p), wait for another %u secs.\n", + ticks, reqp, ccb->ccb_h.timeout / 1000); + cam_error_print(ccb, CAM_ESF_ALL, CAM_EPF_ALL); + mtx_unlock(&sc->hs_lock); + + reqp->retries++; + callout_reset_sbt(&reqp->callout, SBT_1MS * ccb->ccb_h.timeout, + 0, storvsc_timeout, reqp, 0); +#if HVS_TIMEOUT_TEST + storvsc_timeout_test(reqp, SEND_DIAGNOSTIC, 0); +#endif + return; + } + + mtx_lock(&sc->hs_lock); + xpt_print(ccb->ccb_h.path, + "%u: IO (reqp = 0x%p) did not return for %u seconds, %s.\n", + ticks, reqp, ccb->ccb_h.timeout * (reqp->retries+1) / 1000, + (sc->hs_frozen == 0)? + "freezing the queue" : "the queue is already frozen"); + if (sc->hs_frozen == 0) { + sc->hs_frozen = 1; + xpt_freeze_simq(xpt_path_sim(ccb->ccb_h.path), 1); + } + mtx_unlock(&sc->hs_lock); + +#if HVS_TIMEOUT_TEST + storvsc_timeout_test(reqp, MODE_SELECT_10, 1); +#endif +} +#endif + +/** + * @brief StorVSC device poll function + * + * This function is responsible for servicing requests when + * interrupts are disabled (i.e when we are dumping core.) + * + * @param sim a pointer to a CAM SCSI interface module + */ +static void +storvsc_poll(struct cam_sim *sim) +{ + struct storvsc_softc *sc = cam_sim_softc(sim); + + mtx_assert(&sc->hs_lock, MA_OWNED); + mtx_unlock(&sc->hs_lock); + hv_storvsc_on_channel_callback(sc->hs_chan, sc); + mtx_lock(&sc->hs_lock); +} + +/** + * @brief StorVSC device action function + * + * This function is responsible for handling SCSI operations which + * are passed from the CAM layer. The requests are in the form of + * CAM control blocks which indicate the action being performed. 
+ * Not all actions require converting the request to a VSCSI protocol + * message - these actions can be responded to by this driver. + * Requests which are destined for a backend storage device are converted + * to a VSCSI protocol message and sent on the channel connection associated + * with this device. + * + * @param sim pointer to a CAM SCSI interface module + * @param ccb pointer to a CAM control block + */ +static void +storvsc_action(struct cam_sim *sim, union ccb *ccb) +{ + struct storvsc_softc *sc = cam_sim_softc(sim); + int res; + + mtx_assert(&sc->hs_lock, MA_OWNED); + switch (ccb->ccb_h.func_code) { + case XPT_PATH_INQ: { + struct ccb_pathinq *cpi = &ccb->cpi; + + cpi->version_num = 1; + cpi->hba_inquiry = PI_TAG_ABLE|PI_SDTR_ABLE; + cpi->target_sprt = 0; + cpi->hba_misc = PIM_NOBUSRESET; + if (hv_storvsc_use_pim_unmapped) + cpi->hba_misc |= PIM_UNMAPPED; + cpi->maxio = STORVSC_DATA_SIZE_MAX; + cpi->hba_eng_cnt = 0; + cpi->max_target = STORVSC_MAX_TARGETS; + cpi->max_lun = sc->hs_drv_props->drv_max_luns_per_target; + cpi->initiator_id = cpi->max_target; + cpi->bus_id = cam_sim_bus(sim); + cpi->base_transfer_speed = 300000; + cpi->transport = XPORT_SAS; + cpi->transport_version = 0; + cpi->protocol = PROTO_SCSI; + cpi->protocol_version = SCSI_REV_SPC2; + strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN); + strlcpy(cpi->hba_vid, sc->hs_drv_props->drv_name, HBA_IDLEN); + strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN); + cpi->unit_number = cam_sim_unit(sim); + + ccb->ccb_h.status = CAM_REQ_CMP; + xpt_done(ccb); + return; + } + case XPT_GET_TRAN_SETTINGS: { + struct ccb_trans_settings *cts = &ccb->cts; + + cts->transport = XPORT_SAS; + cts->transport_version = 0; + cts->protocol = PROTO_SCSI; + cts->protocol_version = SCSI_REV_SPC2; + + /* enable tag queuing and disconnected mode */ + cts->proto_specific.valid = CTS_SCSI_VALID_TQ; + cts->proto_specific.scsi.valid = CTS_SCSI_VALID_TQ; + cts->proto_specific.scsi.flags = CTS_SCSI_FLAGS_TAG_ENB; + 
cts->xport_specific.valid = CTS_SPI_VALID_DISC; + cts->xport_specific.spi.flags = CTS_SPI_FLAGS_DISC_ENB; + + ccb->ccb_h.status = CAM_REQ_CMP; + xpt_done(ccb); + return; + } + case XPT_SET_TRAN_SETTINGS: { + ccb->ccb_h.status = CAM_REQ_CMP; + xpt_done(ccb); + return; + } + case XPT_CALC_GEOMETRY:{ + cam_calc_geometry(&ccb->ccg, 1); + xpt_done(ccb); + return; + } + case XPT_RESET_BUS: + case XPT_RESET_DEV:{ +#if HVS_HOST_RESET + if ((res = hv_storvsc_host_reset(sc)) != 0) { + xpt_print(ccb->ccb_h.path, + "hv_storvsc_host_reset failed with %d\n", res); + ccb->ccb_h.status = CAM_PROVIDE_FAIL; + xpt_done(ccb); + return; + } + ccb->ccb_h.status = CAM_REQ_CMP; + xpt_done(ccb); + return; +#else + xpt_print(ccb->ccb_h.path, + "%s reset not supported.\n", + (ccb->ccb_h.func_code == XPT_RESET_BUS)? + "bus" : "dev"); + ccb->ccb_h.status = CAM_REQ_INVALID; + xpt_done(ccb); + return; +#endif /* HVS_HOST_RESET */ + } + case XPT_SCSI_IO: + case XPT_IMMED_NOTIFY: { + struct hv_storvsc_request *reqp = NULL; + bus_dmamap_t dmap_saved; + + if (ccb->csio.cdb_len == 0) { + panic("cdl_len is 0\n"); + } + + if (LIST_EMPTY(&sc->hs_free_list)) { + ccb->ccb_h.status = CAM_REQUEUE_REQ; + if (sc->hs_frozen == 0) { + sc->hs_frozen = 1; + xpt_freeze_simq(sim, /* count*/1); + } + xpt_done(ccb); + return; + } + + reqp = LIST_FIRST(&sc->hs_free_list); + LIST_REMOVE(reqp, link); + + /* Save the data_dmap before reset request */ + dmap_saved = reqp->data_dmap; + + /* XXX this is ugly */ + bzero(reqp, sizeof(struct hv_storvsc_request)); + + /* Restore necessary bits */ + reqp->data_dmap = dmap_saved; + reqp->softc = sc; + + ccb->ccb_h.status |= CAM_SIM_QUEUED; + if ((res = create_storvsc_request(ccb, reqp)) != 0) { + ccb->ccb_h.status = CAM_REQ_INVALID; + xpt_done(ccb); + return; + } + +#ifdef notyet + if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) { + callout_init(&reqp->callout, 1); + callout_reset_sbt(&reqp->callout, + SBT_1MS * ccb->ccb_h.timeout, 0, + storvsc_timeout, reqp, 0); +#if 
HVS_TIMEOUT_TEST + cv_init(&reqp->event.cv, "storvsc timeout cv"); + mtx_init(&reqp->event.mtx, "storvsc timeout mutex", + NULL, MTX_DEF); + switch (reqp->vstor_packet.vm_srb.cdb[0]) { + case MODE_SELECT_10: + case SEND_DIAGNOSTIC: + /* To have timer send the request. */ + return; + default: + break; + } +#endif /* HVS_TIMEOUT_TEST */ + } +#endif + + if ((res = hv_storvsc_io_request(sc, reqp)) != 0) { + xpt_print(ccb->ccb_h.path, + "hv_storvsc_io_request failed with %d\n", res); + ccb->ccb_h.status = CAM_PROVIDE_FAIL; + storvsc_free_request(sc, reqp); + xpt_done(ccb); + return; + } + return; + } + + default: + ccb->ccb_h.status = CAM_REQ_INVALID; + xpt_done(ccb); + return; + } +} + +/** + * @brief destroy bounce buffer + * + * This function is responsible for destroy a Scatter/Gather list + * that create by storvsc_create_bounce_buffer() + * + * @param sgl- the Scatter/Gather need be destroy + * @param sg_count- page count of the SG list. + * + */ +static void +storvsc_destroy_bounce_buffer(struct sglist *sgl) +{ + struct hv_sgl_node *sgl_node = NULL; + if (LIST_EMPTY(&g_hv_sgl_page_pool.in_use_sgl_list)) { + printf("storvsc error: not enough in use sgl\n"); + return; + } + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list); + LIST_REMOVE(sgl_node, link); + sgl_node->sgl_data = sgl; + LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link); +} + +/** + * @brief create bounce buffer + * + * This function is responsible for create a Scatter/Gather list, + * which hold several pages that can be aligned with page size. + * + * @param seg_count- SG-list segments count + * @param write - if WRITE_TYPE, set SG list page used size to 0, + * otherwise set used size to page size. + * + * return NULL if create failed + */ +static struct sglist * +storvsc_create_bounce_buffer(uint16_t seg_count, int write) +{ + int i = 0; + struct sglist *bounce_sgl = NULL; + unsigned int buf_len = ((write == WRITE_TYPE) ? 
0 : PAGE_SIZE); + struct hv_sgl_node *sgl_node = NULL; + + /* get struct sglist from free_sgl_list */ + if (LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { + printf("storvsc error: not enough free sgl\n"); + return NULL; + } + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); + LIST_REMOVE(sgl_node, link); + bounce_sgl = sgl_node->sgl_data; + LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, sgl_node, link); + + bounce_sgl->sg_maxseg = seg_count; + + if (write == WRITE_TYPE) + bounce_sgl->sg_nseg = 0; + else + bounce_sgl->sg_nseg = seg_count; + + for (i = 0; i < seg_count; i++) + bounce_sgl->sg_segs[i].ss_len = buf_len; + + return bounce_sgl; +} + +/** + * @brief copy data from SG list to bounce buffer + * + * This function is responsible for copy data from one SG list's segments + * to another SG list which used as bounce buffer. + * + * @param bounce_sgl - the destination SG list + * @param orig_sgl - the segment of the source SG list. + * @param orig_sgl_count - the count of segments. + * @param orig_sgl_count - indicate which segment need bounce buffer, + * set 1 means need. + * + */ +static void +storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, + bus_dma_segment_t *orig_sgl, + unsigned int orig_sgl_count, + uint64_t seg_bits) +{ + int src_sgl_idx = 0; + + for (src_sgl_idx = 0; src_sgl_idx < orig_sgl_count; src_sgl_idx++) { + if (seg_bits & (1 << src_sgl_idx)) { + memcpy((void*)bounce_sgl->sg_segs[src_sgl_idx].ss_paddr, + (void*)orig_sgl[src_sgl_idx].ds_addr, + orig_sgl[src_sgl_idx].ds_len); + + bounce_sgl->sg_segs[src_sgl_idx].ss_len = + orig_sgl[src_sgl_idx].ds_len; + } + } +} + +/** + * @brief copy data from SG list which used as bounce to another SG list + * + * This function is responsible for copy data from one SG list with bounce + * buffer to another SG list's segments. + * + * @param dest_sgl - the destination SG list's segments + * @param dest_sgl_count - the count of destination SG list's segment. 
+ * @param src_sgl - the source SG list. + * @param seg_bits - indicate which segment used bounce buffer of src SG-list. + * + */ +void +storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, + unsigned int dest_sgl_count, + struct sglist* src_sgl, + uint64_t seg_bits) +{ + int sgl_idx = 0; + + for (sgl_idx = 0; sgl_idx < dest_sgl_count; sgl_idx++) { + if (seg_bits & (1 << sgl_idx)) { + memcpy((void*)(dest_sgl[sgl_idx].ds_addr), + (void*)(src_sgl->sg_segs[sgl_idx].ss_paddr), + src_sgl->sg_segs[sgl_idx].ss_len); + } + } +} + +/** + * @brief check SG list with bounce buffer or not + * + * This function is responsible for check if need bounce buffer for SG list. + * + * @param sgl - the SG list's segments + * @param sg_count - the count of SG list's segment. + * @param bits - segmengs number that need bounce buffer + * + * return -1 if SG list needless bounce buffer + */ +static int +storvsc_check_bounce_buffer_sgl(bus_dma_segment_t *sgl, + unsigned int sg_count, + uint64_t *bits) +{ + int i = 0; + int offset = 0; + uint64_t phys_addr = 0; + uint64_t tmp_bits = 0; + boolean_t found_hole = FALSE; + boolean_t pre_aligned = TRUE; + + if (sg_count < 2){ + return -1; + } + + *bits = 0; + + phys_addr = vtophys(sgl[0].ds_addr); + offset = phys_addr - trunc_page(phys_addr); + + if (offset != 0) { + pre_aligned = FALSE; + tmp_bits |= 1; + } + + for (i = 1; i < sg_count; i++) { + phys_addr = vtophys(sgl[i].ds_addr); + offset = phys_addr - trunc_page(phys_addr); + + if (offset == 0) { + if (FALSE == pre_aligned){ + /* + * This segment is aligned, if the previous + * one is not aligned, find a hole + */ + found_hole = TRUE; + } + pre_aligned = TRUE; + } else { + tmp_bits |= 1ULL << i; + if (!pre_aligned) { + if (phys_addr != vtophys(sgl[i-1].ds_addr + + sgl[i-1].ds_len)) { + /* + * Check whether connect to previous + * segment,if not, find the hole + */ + found_hole = TRUE; + } + } else { + found_hole = TRUE; + } + pre_aligned = FALSE; + } + } + + if (!found_hole) { + 
return (-1); + } else { + *bits = tmp_bits; + return 0; + } +} + +/** + * Copy bus_dma segments to multiple page buffer, which requires + * the pages are compact composed except for the 1st and last pages. + */ +static void +storvsc_xferbuf_prepare(void *arg, bus_dma_segment_t *segs, int nsegs, int error) +{ + struct hv_storvsc_request *reqp = arg; + union ccb *ccb = reqp->ccb; + struct ccb_scsiio *csio = &ccb->csio; + struct storvsc_gpa_range *prplist; + int i; + + prplist = &reqp->prp_list; + prplist->gpa_range.gpa_len = csio->dxfer_len; + prplist->gpa_range.gpa_ofs = segs[0].ds_addr & PAGE_MASK; + + for (i = 0; i < nsegs; i++) { +#ifdef INVARIANTS + if (nsegs > 1) { + if (i == 0) { + KASSERT((segs[i].ds_addr & PAGE_MASK) + + segs[i].ds_len == PAGE_SIZE, + ("invalid 1st page, ofs 0x%jx, len %zu", + (uintmax_t)segs[i].ds_addr, + segs[i].ds_len)); + } else if (i == nsegs - 1) { + KASSERT((segs[i].ds_addr & PAGE_MASK) == 0, + ("invalid last page, ofs 0x%jx", + (uintmax_t)segs[i].ds_addr)); + } else { + KASSERT((segs[i].ds_addr & PAGE_MASK) == 0 && + segs[i].ds_len == PAGE_SIZE, + ("not a full page, ofs 0x%jx, len %zu", + (uintmax_t)segs[i].ds_addr, + segs[i].ds_len)); + } + } +#endif + prplist->gpa_page[i] = atop(segs[i].ds_addr); + } + reqp->prp_cnt = nsegs; +} + +/** + * @brief Fill in a request structure based on a CAM control block + * + * Fills in a request structure based on the contents of a CAM control + * block. The request structure holds the payload information for + * VSCSI protocol request. 
+ * + * @param ccb pointer to a CAM control block + * @param reqp pointer to a request structure + */ +static int +create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) +{ + struct ccb_scsiio *csio = &ccb->csio; + uint64_t phys_addr; + uint32_t pfn; + uint64_t not_aligned_seg_bits = 0; + int error; + + /* refer to struct vmscsi_req for meanings of these two fields */ + reqp->vstor_packet.u.vm_srb.port = + cam_sim_unit(xpt_path_sim(ccb->ccb_h.path)); + reqp->vstor_packet.u.vm_srb.path_id = + cam_sim_bus(xpt_path_sim(ccb->ccb_h.path)); + + reqp->vstor_packet.u.vm_srb.target_id = ccb->ccb_h.target_id; + reqp->vstor_packet.u.vm_srb.lun = ccb->ccb_h.target_lun; + + reqp->vstor_packet.u.vm_srb.cdb_len = csio->cdb_len; + if(ccb->ccb_h.flags & CAM_CDB_POINTER) { + memcpy(&reqp->vstor_packet.u.vm_srb.u.cdb, csio->cdb_io.cdb_ptr, + csio->cdb_len); + } else { + memcpy(&reqp->vstor_packet.u.vm_srb.u.cdb, csio->cdb_io.cdb_bytes, + csio->cdb_len); + } + + if (hv_storvsc_use_win8ext_flags) { + reqp->vstor_packet.u.vm_srb.win8_extension.time_out_value = 60; + reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |= + SRB_FLAGS_DISABLE_SYNCH_TRANSFER; + } + switch (ccb->ccb_h.flags & CAM_DIR_MASK) { + case CAM_DIR_OUT: + reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; + if (hv_storvsc_use_win8ext_flags) { + reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |= + SRB_FLAGS_DATA_OUT; + } + break; + case CAM_DIR_IN: + reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE; + if (hv_storvsc_use_win8ext_flags) { + reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |= + SRB_FLAGS_DATA_IN; + } + break; + case CAM_DIR_NONE: + reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; + if (hv_storvsc_use_win8ext_flags) { + reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |= + SRB_FLAGS_NO_DATA_TRANSFER; + } + break; + default: + printf("Error: unexpected data direction: 0x%x\n", + ccb->ccb_h.flags & CAM_DIR_MASK); + return (EINVAL); + } + + reqp->sense_data = 
&csio->sense_data; + reqp->sense_info_len = csio->sense_len; + + reqp->ccb = ccb; + ccb->ccb_h.spriv_ptr0 = reqp; + + if (0 == csio->dxfer_len) { + return (0); + } + + switch (ccb->ccb_h.flags & CAM_DATA_MASK) { + case CAM_DATA_BIO: + case CAM_DATA_VADDR: + error = bus_dmamap_load_ccb(reqp->softc->storvsc_req_dtag, + reqp->data_dmap, ccb, storvsc_xferbuf_prepare, reqp, + BUS_DMA_NOWAIT); + if (error) { + xpt_print(ccb->ccb_h.path, + "bus_dmamap_load_ccb failed: %d\n", error); + return (error); + } + if ((ccb->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO) + reqp->softc->sysctl_data.data_bio_cnt++; + else + reqp->softc->sysctl_data.data_vaddr_cnt++; + break; + + case CAM_DATA_SG: + { + struct storvsc_gpa_range *prplist; + int i = 0; + int offset = 0; + int ret; + + bus_dma_segment_t *storvsc_sglist = + (bus_dma_segment_t *)ccb->csio.data_ptr; + u_int16_t storvsc_sg_count = ccb->csio.sglist_cnt; + + prplist = &reqp->prp_list; + prplist->gpa_range.gpa_len = csio->dxfer_len; + + printf("Storvsc: get SG I/O operation, %d\n", + reqp->vstor_packet.u.vm_srb.data_in); + + if (storvsc_sg_count > STORVSC_DATA_SEGCNT_MAX){ + printf("Storvsc: %d segments is too much, " + "only support %d segments\n", + storvsc_sg_count, STORVSC_DATA_SEGCNT_MAX); + return (EINVAL); + } + + /* + * We create our own bounce buffer function currently. Idealy + * we should use BUS_DMA(9) framework. But with current BUS_DMA + * code there is no callback API to check the page alignment of + * middle segments before busdma can decide if a bounce buffer + * is needed for particular segment. There is callback, + * "bus_dma_filter_t *filter", but the parrameters are not + * sufficient for storvsc driver. + * TODO: + * Add page alignment check in BUS_DMA(9) callback. Once + * this is complete, switch the following code to use + * BUS_DMA(9) for storvsc bounce buffer support. 
+ */ + /* check if we need to create bounce buffer */ + ret = storvsc_check_bounce_buffer_sgl(storvsc_sglist, + storvsc_sg_count, ¬_aligned_seg_bits); + if (ret != -1) { + reqp->bounce_sgl = + storvsc_create_bounce_buffer(storvsc_sg_count, + reqp->vstor_packet.u.vm_srb.data_in); + if (NULL == reqp->bounce_sgl) { + printf("Storvsc_error: " + "create bounce buffer failed.\n"); + return (ENOMEM); + } + + reqp->bounce_sgl_count = storvsc_sg_count; + reqp->not_aligned_seg_bits = not_aligned_seg_bits; + + /* + * if it is write, we need copy the original data + *to bounce buffer + */ + if (WRITE_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { + storvsc_copy_sgl_to_bounce_buf( + reqp->bounce_sgl, + storvsc_sglist, + storvsc_sg_count, + reqp->not_aligned_seg_bits); + } + + /* transfer virtual address to physical frame number */ + if (reqp->not_aligned_seg_bits & 0x1){ + phys_addr = + vtophys(reqp->bounce_sgl->sg_segs[0].ss_paddr); + }else{ + phys_addr = + vtophys(storvsc_sglist[0].ds_addr); + } + prplist->gpa_range.gpa_ofs = phys_addr & PAGE_MASK; + + pfn = phys_addr >> PAGE_SHIFT; + prplist->gpa_page[0] = pfn; + + for (i = 1; i < storvsc_sg_count; i++) { + if (reqp->not_aligned_seg_bits & (1 << i)) { + phys_addr = + vtophys(reqp->bounce_sgl->sg_segs[i].ss_paddr); + } else { + phys_addr = + vtophys(storvsc_sglist[i].ds_addr); + } + + pfn = phys_addr >> PAGE_SHIFT; + prplist->gpa_page[i] = pfn; + } + reqp->prp_cnt = i; + } else { + phys_addr = vtophys(storvsc_sglist[0].ds_addr); + + prplist->gpa_range.gpa_ofs = phys_addr & PAGE_MASK; + + for (i = 0; i < storvsc_sg_count; i++) { + phys_addr = vtophys(storvsc_sglist[i].ds_addr); + pfn = phys_addr >> PAGE_SHIFT; + prplist->gpa_page[i] = pfn; + } + reqp->prp_cnt = i; + + /* check the last segment cross boundary or not */ + offset = phys_addr & PAGE_MASK; + if (offset) { + /* Add one more PRP entry */ + phys_addr = + vtophys(storvsc_sglist[i-1].ds_addr + + PAGE_SIZE - offset); + pfn = phys_addr >> PAGE_SHIFT; + 
prplist->gpa_page[i] = pfn; + reqp->prp_cnt++; + } + + reqp->bounce_sgl_count = 0; + } + reqp->softc->sysctl_data.data_sg_cnt++; + break; + } + default: + printf("Unknow flags: %d\n", ccb->ccb_h.flags); + return(EINVAL); + } + + return(0); +} + +static uint32_t +is_scsi_valid(const struct scsi_inquiry_data *inq_data) +{ + u_int8_t type; + + type = SID_TYPE(inq_data); + if (type == T_NODEVICE) + return (0); + if (SID_QUAL(inq_data) == SID_QUAL_BAD_LU) + return (0); + return (1); +} + +/** + * @brief completion function before returning to CAM + * + * I/O process has been completed and the result needs + * to be passed to the CAM layer. + * Free resources related to this request. + * + * @param reqp pointer to a request structure + */ +static void +storvsc_io_done(struct hv_storvsc_request *reqp) +{ + union ccb *ccb = reqp->ccb; + struct ccb_scsiio *csio = &ccb->csio; + struct storvsc_softc *sc = reqp->softc; + struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb; + bus_dma_segment_t *ori_sglist = NULL; + int ori_sg_count = 0; + const struct scsi_generic *cmd; + + /* destroy bounce buffer if it is used */ + if (reqp->bounce_sgl_count) { + ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr; + ori_sg_count = ccb->csio.sglist_cnt; + + /* + * If it is READ operation, we should copy back the data + * to original SG list. 
+ */ + if (READ_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { + storvsc_copy_from_bounce_buf_to_sgl(ori_sglist, + ori_sg_count, + reqp->bounce_sgl, + reqp->not_aligned_seg_bits); + } + + storvsc_destroy_bounce_buffer(reqp->bounce_sgl); + reqp->bounce_sgl_count = 0; + } + + if (reqp->retries > 0) { + mtx_lock(&sc->hs_lock); +#if HVS_TIMEOUT_TEST + xpt_print(ccb->ccb_h.path, + "%u: IO returned after timeout, " + "waking up timer handler if any.\n", ticks); + mtx_lock(&reqp->event.mtx); + cv_signal(&reqp->event.cv); + mtx_unlock(&reqp->event.mtx); +#endif + reqp->retries = 0; + xpt_print(ccb->ccb_h.path, + "%u: IO returned after timeout, " + "stopping timer if any.\n", ticks); + mtx_unlock(&sc->hs_lock); + } + +#ifdef notyet + /* + * callout_drain() will wait for the timer handler to finish + * if it is running. So we don't need any lock to synchronize + * between this routine and the timer handler. + * Note that we need to make sure reqp is not freed when timer + * handler is using or will use it. + */ + if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) { + callout_drain(&reqp->callout); + } +#endif + cmd = (const struct scsi_generic *) + ((ccb->ccb_h.flags & CAM_CDB_POINTER) ? 
+ csio->cdb_io.cdb_ptr : csio->cdb_io.cdb_bytes); + + ccb->ccb_h.status &= ~CAM_SIM_QUEUED; + ccb->ccb_h.status &= ~CAM_STATUS_MASK; + int srb_status = SRB_STATUS(vm_srb->srb_status); +#ifdef DIAGNOSTIC + if (hv_storvsc_srb_status != -1) { + srb_status = SRB_STATUS(hv_storvsc_srb_status & 0x3f); + hv_storvsc_srb_status = -1; + } +#endif /* DIAGNOSTIC */ + if (vm_srb->scsi_status == SCSI_STATUS_OK) { + if (srb_status != SRB_STATUS_SUCCESS) { + bool log_error = true; + switch (srb_status) { + case SRB_STATUS_PENDING: + /* We should never get this */ + panic("storvsc_io_done: SRB_STATUS_PENDING"); + break; + case SRB_STATUS_ABORTED: + /* + * storvsc doesn't support aborts yet + * but if we ever get this status + * the I/O is complete - treat it as a + * timeout + */ + ccb->ccb_h.status |= CAM_CMD_TIMEOUT; + break; + case SRB_STATUS_ABORT_FAILED: + /* We should never get this */ + panic("storvsc_io_done: SRB_STATUS_ABORT_FAILED"); + break; + case SRB_STATUS_ERROR: + /* + * We should never get this. + * Treat it as a CAM_UNREC_HBA_ERROR. + * It will be retried + */ + ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR; + break; + case SRB_STATUS_BUSY: + /* Host is busy. Delay and retry */ + ccb->ccb_h.status |= CAM_BUSY; + break; + case SRB_STATUS_INVALID_REQUEST: + case SRB_STATUS_INVALID_PATH_ID: + case SRB_STATUS_NO_DEVICE: + case SRB_STATUS_INVALID_TARGET_ID: + /* + * These indicate an invalid address + * and really should never be seen. + * A CAM_PATH_INVALID could be + * used here but I want to run + * down retries. Do a CAM_BUSY + * since the host might be having issues. 
+ */ + ccb->ccb_h.status |= CAM_BUSY; + break; + case SRB_STATUS_TIMEOUT: + case SRB_STATUS_COMMAND_TIMEOUT: + /* The backend has timed this out */ + ccb->ccb_h.status |= CAM_BUSY; + break; + /* Some old pSCSI errors below */ + case SRB_STATUS_SELECTION_TIMEOUT: + case SRB_STATUS_MESSAGE_REJECTED: + case SRB_STATUS_PARITY_ERROR: + case SRB_STATUS_NO_HBA: + case SRB_STATUS_DATA_OVERRUN: + case SRB_STATUS_UNEXPECTED_BUS_FREE: + case SRB_STATUS_PHASE_SEQUENCE_FAILURE: + /* + * Old pSCSI responses, should never get. + * If we do treat as a CAM_UNREC_HBA_ERROR + * which will be retried + */ + ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR; + break; + case SRB_STATUS_BUS_RESET: + ccb->ccb_h.status |= CAM_SCSI_BUS_RESET; + break; + case SRB_STATUS_BAD_SRB_BLOCK_LENGTH: + /* + * The request block is malformed and + * I doubt it is from the guest. Just retry. + */ + ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR; + break; + /* Not used statuses just retry */ + case SRB_STATUS_REQUEST_FLUSHED: + case SRB_STATUS_BAD_FUNCTION: + case SRB_STATUS_NOT_POWERED: + ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR; + break; + case SRB_STATUS_INVALID_LUN: + /* + * Don't log an EMS for this response since + * there is no device at this LUN. This is a + * normal and expected response when a device + * is detached. + */ + ccb->ccb_h.status |= CAM_DEV_NOT_THERE; + log_error = false; + break; + case SRB_STATUS_ERROR_RECOVERY: + case SRB_STATUS_LINK_DOWN: + /* + * I don't ever expect these from + * the host but if we ever get + * retry after a delay + */ + ccb->ccb_h.status |= CAM_BUSY; + break; + default: + /* + * An undefined response assert on + * on debug builds else retry + */ + ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR; + KASSERT(srb_status <= SRB_STATUS_LINK_DOWN, + ("storvsc: %s, unexpected srb_status of 0x%x", + __func__, srb_status)); + break; + } + if (log_error) { + xpt_print(ccb->ccb_h.path, "The hypervisor's I/O adapter " + "driver received an unexpected response code 0x%x " + "for operation: %s. 
If this continues to occur, " + "report the condition to your hypervisor vendor so " + "they can rectify the issue.\n", srb_status, + scsi_op_desc(cmd->opcode, NULL)); + } + } else { + ccb->ccb_h.status |= CAM_REQ_CMP; + } + + if (cmd->opcode == INQUIRY && + srb_status == SRB_STATUS_SUCCESS) { + int resp_xfer_len, resp_buf_len, data_len; + uint8_t *resp_buf = (uint8_t *)csio->data_ptr; + struct scsi_inquiry_data *inq_data = + (struct scsi_inquiry_data *)csio->data_ptr; + + /* Get the buffer length reported by host */ + resp_xfer_len = vm_srb->transfer_len; + + /* Get the available buffer length */ + resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0; + data_len = (resp_buf_len < resp_xfer_len) ? + resp_buf_len : resp_xfer_len; + if (bootverbose && data_len >= 5) { + xpt_print(ccb->ccb_h.path, "storvsc inquiry " + "(%d) [%x %x %x %x %x ... ]\n", data_len, + resp_buf[0], resp_buf[1], resp_buf[2], + resp_buf[3], resp_buf[4]); + } + /* + * XXX: Hyper-V (since win2012r2) responses inquiry with + * unknown version (0) for GEN-2 DVD device. + * Manually set the version number to SPC3 in order to + * ask CAM to continue probing with "PROBE_REPORT_LUNS". 
+ * see probedone() in scsi_xpt.c + */ + if (SID_TYPE(inq_data) == T_CDROM && + inq_data->version == 0 && + (vmstor_proto_version >= VMSTOR_PROTOCOL_VERSION_WIN8)) { + inq_data->version = SCSI_REV_SPC3; + if (bootverbose) { + xpt_print(ccb->ccb_h.path, + "set version from 0 to %d\n", + inq_data->version); + } + } + /* + * XXX: Manually fix the wrong response returned from WS2012 + */ + if (!is_scsi_valid(inq_data) && + (vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8_1 || + vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8 || + vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN7)) { + if (data_len >= 4 && + (resp_buf[2] == 0 || resp_buf[3] == 0)) { + resp_buf[2] = SCSI_REV_SPC3; + resp_buf[3] = 2; // resp fmt must be 2 + if (bootverbose) + xpt_print(ccb->ccb_h.path, + "fix version and resp fmt for 0x%x\n", + vmstor_proto_version); + } + } else if (data_len >= SHORT_INQUIRY_LENGTH) { + char vendor[16]; + + cam_strvis(vendor, inq_data->vendor, + sizeof(inq_data->vendor), sizeof(vendor)); + /* + * XXX: Upgrade SPC2 to SPC3 if host is WIN8 or + * WIN2012 R2 in order to support UNMAP feature. + */ + if (!strncmp(vendor, "Msft", 4) && + SID_ANSI_REV(inq_data) == SCSI_REV_SPC2 && + (vmstor_proto_version == + VMSTOR_PROTOCOL_VERSION_WIN8_1 || + vmstor_proto_version == + VMSTOR_PROTOCOL_VERSION_WIN8)) { + inq_data->version = SCSI_REV_SPC3; + if (bootverbose) { + xpt_print(ccb->ccb_h.path, + "storvsc upgrades " + "SPC2 to SPC3\n"); + } + } + } + } + } else { + /** + * On Some Windows hosts TEST_UNIT_READY command can return + * SRB_STATUS_ERROR and sense data, for example, asc=0x3a,1 + * "(Medium not present - tray closed)". This error can be + * ignored since it will be sent to host periodically. 
+ */ + boolean_t unit_not_ready = \ + vm_srb->scsi_status == SCSI_STATUS_CHECK_COND && + cmd->opcode == TEST_UNIT_READY && + srb_status == SRB_STATUS_ERROR; + if (!unit_not_ready && bootverbose) { + mtx_lock(&sc->hs_lock); + xpt_print(ccb->ccb_h.path, + "storvsc scsi_status = %d, srb_status = %d\n", + vm_srb->scsi_status, srb_status); + mtx_unlock(&sc->hs_lock); + } + ccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR; + } + + ccb->csio.scsi_status = (vm_srb->scsi_status & 0xFF); + if (srb_status == SRB_STATUS_SUCCESS || + srb_status == SRB_STATUS_DATA_OVERRUN) + ccb->csio.resid = ccb->csio.dxfer_len - vm_srb->transfer_len; + else + ccb->csio.resid = ccb->csio.dxfer_len; + + if ((vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID) != 0 && + reqp->sense_info_len != 0) { + csio->sense_resid = csio->sense_len - reqp->sense_info_len; + ccb->ccb_h.status |= CAM_AUTOSNS_VALID; + } + + mtx_lock(&sc->hs_lock); + if (reqp->softc->hs_frozen == 1) { + xpt_print(ccb->ccb_h.path, + "%u: storvsc unfreezing softc 0x%p.\n", + ticks, reqp->softc); + ccb->ccb_h.status |= CAM_RELEASE_SIMQ; + reqp->softc->hs_frozen = 0; + } + storvsc_free_request(sc, reqp); + mtx_unlock(&sc->hs_lock); + + xpt_done_direct(ccb); +} + +/** + * @brief Free a request structure + * + * Free a request structure by returning it to the free list + * + * @param sc pointer to a softc + * @param reqp pointer to a request structure + */ +static void +storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp) +{ + + LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link); +} + +/** + * @brief Determine type of storage device from GUID + * + * Using the type GUID, determine if this is a StorVSC (paravirtual + * SCSI or BlkVSC (paravirtual IDE) device. 
+ * + * @param dev a device + * returns an enum + */ +static enum hv_storage_type +storvsc_get_storage_type(device_t dev) +{ + device_t parent = device_get_parent(dev); + + if (VMBUS_PROBE_GUID(parent, dev, &gBlkVscDeviceType) == 0) + return DRIVER_BLKVSC; + if (VMBUS_PROBE_GUID(parent, dev, &gStorVscDeviceType) == 0) + return DRIVER_STORVSC; + return DRIVER_UNKNOWN; +} + +#define PCI_VENDOR_INTEL 0x8086 +#define PCI_PRODUCT_PIIX4 0x7111 + +static void +storvsc_ada_probe_veto(void *arg __unused, struct cam_path *path, + struct ata_params *ident_buf __unused, int *veto) +{ + + /* + * The ATA disks are shared with the controllers managed + * by this driver, so veto the ATA disks' attachment; the + * ATA disks will be attached as SCSI disks once this driver + * attached. + */ + if (path->device->protocol == PROTO_ATA) { + struct ccb_pathinq cpi; + + xpt_path_inq(&cpi, path); + if (cpi.ccb_h.status == CAM_REQ_CMP && + cpi.hba_vendor == PCI_VENDOR_INTEL && + cpi.hba_device == PCI_PRODUCT_PIIX4) { + (*veto)++; + if (bootverbose) { + xpt_print(path, + "Disable ATA disks on " + "simulated ATA controller (0x%04x%04x)\n", + cpi.hba_device, cpi.hba_vendor); + } + } + } +} + +static void +storvsc_sysinit(void *arg __unused) +{ + if (vm_guest == VM_GUEST_HV) { + storvsc_handler_tag = EVENTHANDLER_REGISTER(ada_probe_veto, + storvsc_ada_probe_veto, NULL, EVENTHANDLER_PRI_ANY); + } +} +SYSINIT(storvsc_sys_init, SI_SUB_DRIVERS, SI_ORDER_SECOND, storvsc_sysinit, + NULL); + +static void +storvsc_sysuninit(void *arg __unused) +{ + if (storvsc_handler_tag != NULL) + EVENTHANDLER_DEREGISTER(ada_probe_veto, storvsc_handler_tag); +} +SYSUNINIT(storvsc_sys_uninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, + storvsc_sysuninit, NULL); diff --git a/sys/dev/hyperv/storvsc/hv_vstorage.h b/sys/dev/hyperv/storvsc/hv_vstorage.h new file mode 100644 index 000000000000..f1d4c1dfd2e2 --- /dev/null +++ b/sys/dev/hyperv/storvsc/hv_vstorage.h @@ -0,0 +1,311 @@ +/*- + * SPDX-License-Identifier: 
BSD-2-Clause-FreeBSD + * + * Copyright (c) 2009-2012,2017 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */

#ifndef __HV_VSTORAGE_H__
#define __HV_VSTORAGE_H__

/*
 * Major/minor macros.  Minor version is in LSB, meaning that earlier flat
 * version numbers will be interpreted as "0.x" (i.e., 1 becomes 0.1).
 */

#define VMSTOR_PROTOCOL_MAJOR(VERSION_)		(((VERSION_) >> 8) & 0xff)
#define VMSTOR_PROTOCOL_MINOR(VERSION_)		(((VERSION_)     ) & 0xff)
#define VMSTOR_PROTOCOL_VERSION(MAJOR_, MINOR_)	((((MAJOR_) & 0xff) << 8) | \
						 (((MINOR_) & 0xff)     ))

#define VMSTOR_PROTOCOL_VERSION_WIN6		VMSTOR_PROTOCOL_VERSION(2, 0)
#define VMSTOR_PROTOCOL_VERSION_WIN7		VMSTOR_PROTOCOL_VERSION(4, 2)
#define VMSTOR_PROTOCOL_VERSION_WIN8		VMSTOR_PROTOCOL_VERSION(5, 1)
#define VMSTOR_PROTOCOL_VERSION_WIN8_1		VMSTOR_PROTOCOL_VERSION(6, 0)
#define VMSTOR_PROTOCOL_VERSION_WIN10		VMSTOR_PROTOCOL_VERSION(6, 2)
/*
 * Invalid version.  Parenthesized so the negative constant expands
 * safely inside any surrounding expression.
 */
#define VMSTOR_INVALID_PROTOCOL_VERSION		(-1)

/*
 * Version history:
 * V1 Beta                    0.1
 * V1 RC < 2008/1/31          1.0
 * V1 RC > 2008/1/31          2.0
 * Win7: 4.2
 * Win8: 5.1
 */

#define VMSTOR_PROTOCOL_VERSION_CURRENT	VMSTOR_PROTOCOL_VERSION(5, 1)

/**
 *  Packet structure ops describing virtual storage requests.
 */
enum vstor_packet_ops {
	VSTOR_OPERATION_COMPLETEIO            = 1,
	VSTOR_OPERATION_REMOVEDEVICE          = 2,
	VSTOR_OPERATION_EXECUTESRB            = 3,
	VSTOR_OPERATION_RESETLUN              = 4,
	VSTOR_OPERATION_RESETADAPTER          = 5,
	VSTOR_OPERATION_RESETBUS              = 6,
	VSTOR_OPERATION_BEGININITIALIZATION   = 7,
	VSTOR_OPERATION_ENDINITIALIZATION     = 8,
	VSTOR_OPERATION_QUERYPROTOCOLVERSION  = 9,
	VSTOR_OPERATION_QUERYPROPERTIES       = 10,
	VSTOR_OPERATION_ENUMERATE_BUS         = 11,
	VSTOR_OPERATION_FCHBA_DATA            = 12,
	VSTOR_OPERATION_CREATE_MULTI_CHANNELS = 13,
	VSTOR_OPERATION_MAXIMUM               = 13
};


/*
 *  Platform neutral description of a scsi request -
 *  this remains the same across the wire regardless of 32/64 bit
 *  note: it's patterned off the Windows DDK SCSI_PASS_THROUGH structure
 */

#define CDB16GENERIC_LENGTH			0x10
#define SENSE_BUFFER_SIZE			0x14
#define MAX_DATA_BUFFER_LENGTH_WITH_PADDING	0x14

#define POST_WIN7_STORVSC_SENSE_BUFFER_SIZE	0x14
#define PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE	0x12


struct vmscsi_win8_extension {
	/*
	 * The following were added in Windows 8
	 */
	uint16_t reserve;
	uint8_t  queue_tag;
	uint8_t  queue_action;
	uint32_t srb_flags;
	uint32_t time_out_value;
	uint32_t queue_sort_ey;
} __packed;

struct vmscsi_req {
	uint16_t length;
	uint8_t  srb_status;
	uint8_t  scsi_status;

	/* HBA number, set to the order number detected by initiator. */
	uint8_t  port;
	/* SCSI bus number or bus_id, different from CAM's path_id. */
	uint8_t  path_id;

	uint8_t  target_id;
	uint8_t  lun;

	uint8_t  cdb_len;
	uint8_t  sense_info_len;
	uint8_t  data_in;
	uint8_t  reserved;

	uint32_t transfer_len;

	/* The CDB, the sense data and the padding share one buffer. */
	union {
	    uint8_t cdb[CDB16GENERIC_LENGTH];

	    uint8_t sense_data[SENSE_BUFFER_SIZE];

	    uint8_t reserved_array[MAX_DATA_BUFFER_LENGTH_WITH_PADDING];
	} u;

	/*
	 * The following was added in win8.
	 */
	struct vmscsi_win8_extension win8_extension;

} __packed;

/**
 *  This structure is sent during the initialization phase to get the different
 *  properties of the channel.
 */

struct vmstor_chan_props {
	uint16_t proto_ver;
	uint8_t  path_id;
	uint8_t  target_id;

	uint16_t max_channel_cnt;

	/**
	 * Note: port number is only really known on the client side
	 */
	uint16_t port;
	uint32_t flags;
	uint32_t max_transfer_bytes;

	/**
	 *  This id is unique for each channel and will correspond with
	 *  vendor specific data in the inquiry data
	 */
	uint64_t unique_id;

} __packed;

/**
 *  This structure is sent during the storage protocol negotiations.
 */

struct vmstor_proto_ver
{
	/**
	 * Major (MSW) and minor (LSW) version numbers.
	 */
	uint16_t major_minor;

	uint16_t revision;			/* always zero */
} __packed;

/**
 * Channel Property Flags
 */

#define STORAGE_CHANNEL_REMOVABLE_FLAG		0x1
#define STORAGE_CHANNEL_EMULATED_IDE_FLAG	0x2


struct vstor_packet {
	/**
	 * Requested operation type
	 */
	enum vstor_packet_ops operation;

	/*
	 * Flags - see below for values
	 */
	uint32_t flags;

	/**
	 * Status of the request returned from the server side.
	 */
	uint32_t status;

	union
	{
	    /**
	     * Structure used to forward SCSI commands from the client to
	     * the server.
	     */
	    struct vmscsi_req vm_srb;

	    /**
	     * Structure used to query channel properties.
	     */
	    struct vmstor_chan_props chan_props;

	    /**
	     * Used during version negotiations.
	     */
	    struct vmstor_proto_ver version;

	    /**
             * Number of multichannels to create
	     */
	    uint16_t multi_channels_cnt;
	} u;

} __packed;


/**
 * SRB (SCSI Request Block) Status Codes
 */
#define SRB_STATUS_PENDING		0x00
#define SRB_STATUS_SUCCESS		0x01
#define SRB_STATUS_ABORTED		0x02
#define SRB_STATUS_ABORT_FAILED		0x03
#define SRB_STATUS_ERROR		0x04
#define SRB_STATUS_BUSY			0x05
#define SRB_STATUS_INVALID_REQUEST	0x06
#define SRB_STATUS_INVALID_PATH_ID	0x07
#define SRB_STATUS_NO_DEVICE		0x08
#define SRB_STATUS_TIMEOUT		0x09
#define SRB_STATUS_SELECTION_TIMEOUT	0x0A
#define SRB_STATUS_COMMAND_TIMEOUT	0x0B
#define SRB_STATUS_MESSAGE_REJECTED	0x0D
#define SRB_STATUS_BUS_RESET		0x0E
#define SRB_STATUS_PARITY_ERROR		0x0F
#define SRB_STATUS_REQUEST_SENSE_FAILED	0x10
#define SRB_STATUS_NO_HBA		0x11
#define SRB_STATUS_DATA_OVERRUN		0x12
#define SRB_STATUS_UNEXPECTED_BUS_FREE	0x13
#define SRB_STATUS_PHASE_SEQUENCE_FAILURE	0x14
#define SRB_STATUS_BAD_SRB_BLOCK_LENGTH	0x15
#define SRB_STATUS_REQUEST_FLUSHED	0x16
#define SRB_STATUS_INVALID_LUN		0x20
#define SRB_STATUS_INVALID_TARGET_ID	0x21
#define SRB_STATUS_BAD_FUNCTION		0x22
#define SRB_STATUS_ERROR_RECOVERY	0x23
#define SRB_STATUS_NOT_POWERED		0x24
#define SRB_STATUS_LINK_DOWN		0x25
/**
 * SRB Status Masks (can be combined with above status codes)
 */
#define SRB_STATUS_QUEUE_FROZEN		0x40
#define SRB_STATUS_AUTOSENSE_VALID	0x80

/* Strip the two mask bits above, leaving only the status code. */
#define SRB_STATUS(status)	\
	((status) & ~(SRB_STATUS_AUTOSENSE_VALID | SRB_STATUS_QUEUE_FROZEN))
/*
 * SRB Flag Bits
 */

#define SRB_FLAGS_QUEUE_ACTION_ENABLE		0x00000002
#define SRB_FLAGS_DISABLE_DISCONNECT		0x00000004
#define SRB_FLAGS_DISABLE_SYNCH_TRANSFER	0x00000008
#define SRB_FLAGS_BYPASS_FROZEN_QUEUE		0x00000010
#define SRB_FLAGS_DISABLE_AUTOSENSE		0x00000020
#define SRB_FLAGS_DATA_IN			0x00000040
#define SRB_FLAGS_DATA_OUT			0x00000080
#define SRB_FLAGS_NO_DATA_TRANSFER		0x00000000
#define SRB_FLAGS_UNSPECIFIED_DIRECTION	(SRB_FLAGS_DATA_IN | SRB_FLAGS_DATA_OUT)
#define SRB_FLAGS_NO_QUEUE_FREEZE		0x00000100
#define SRB_FLAGS_ADAPTER_CACHE_ENABLE		0x00000200
#define SRB_FLAGS_FREE_SENSE_BUFFER		0x00000400
/**
 *  Packet flags
 */

/**
 *  This flag indicates that the server should send back a completion for this
 *  packet.
 */
#define REQUEST_COMPLETION_FLAG	0x1

/**
 *  This is the set of flags that the vsc can set in any packets it sends
 */
#define VSC_LEGAL_FLAGS (REQUEST_COMPLETION_FLAG)

#endif /* __HV_VSTORAGE_H__ */
diff --git a/sys/dev/hyperv/utilities/hv_kvp.c b/sys/dev/hyperv/utilities/hv_kvp.c new file mode 100644 index 000000000000..8da0936f6cd7 --- /dev/null +++ b/sys/dev/hyperv/utilities/hv_kvp.c @@ -0,0 +1,920 @@ +/*- + * Copyright (c) 2014,2016-2017 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author: Sainath Varanasi. + * Date: 4/2012 + * Email: bsdic@microsoft.com + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/reboot.h> +#include <sys/lock.h> +#include <sys/taskqueue.h> +#include <sys/selinfo.h> +#include <sys/sysctl.h> +#include <sys/poll.h> +#include <sys/proc.h> +#include <sys/kthread.h> +#include <sys/syscallsubr.h> +#include <sys/sysproto.h> +#include <sys/un.h> +#include <sys/endian.h> +#include <sys/_null.h> +#include <sys/sema.h> +#include <sys/signal.h> +#include <sys/syslog.h> +#include <sys/systm.h> +#include <sys/mutex.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/vmbus.h> +#include <dev/hyperv/utilities/hv_utilreg.h> +#include <dev/hyperv/utilities/vmbus_icreg.h> +#include <dev/hyperv/utilities/vmbus_icvar.h> + +#include "unicode.h" +#include "hv_kvp.h" +#include "vmbus_if.h" + +/* hv_kvp defines */ +#define BUFFERSIZE sizeof(struct hv_kvp_msg) +#define kvp_hdr hdr.kvp_hdr + +#define KVP_FWVER_MAJOR 3 +#define KVP_FWVER VMBUS_IC_VERSION(KVP_FWVER_MAJOR, 0) + +#define KVP_MSGVER_MAJOR 4 +#define KVP_MSGVER VMBUS_IC_VERSION(KVP_MSGVER_MAJOR, 0) + +/* hv_kvp debug control */ +static int hv_kvp_log = 0; + +#define hv_kvp_log_error(...) 
do { \ + if (hv_kvp_log > 0) \ + log(LOG_ERR, "hv_kvp: " __VA_ARGS__); \ +} while (0) + +#define hv_kvp_log_info(...) do { \ + if (hv_kvp_log > 1) \ + log(LOG_INFO, "hv_kvp: " __VA_ARGS__); \ +} while (0) + +static const struct vmbus_ic_desc vmbus_kvp_descs[] = { + { + .ic_guid = { .hv_guid = { + 0xe7, 0xf4, 0xa0, 0xa9, 0x45, 0x5a, 0x96, 0x4d, + 0xb8, 0x27, 0x8a, 0x84, 0x1e, 0x8c, 0x3, 0xe6 } }, + .ic_desc = "Hyper-V KVP" + }, + VMBUS_IC_DESC_END +}; + +/* character device prototypes */ +static d_open_t hv_kvp_dev_open; +static d_close_t hv_kvp_dev_close; +static d_read_t hv_kvp_dev_daemon_read; +static d_write_t hv_kvp_dev_daemon_write; +static d_poll_t hv_kvp_dev_daemon_poll; + +/* hv_kvp character device structure */ +static struct cdevsw hv_kvp_cdevsw = +{ + .d_version = D_VERSION, + .d_open = hv_kvp_dev_open, + .d_close = hv_kvp_dev_close, + .d_read = hv_kvp_dev_daemon_read, + .d_write = hv_kvp_dev_daemon_write, + .d_poll = hv_kvp_dev_daemon_poll, + .d_name = "hv_kvp_dev", +}; + + +/* + * Global state to track and synchronize multiple + * KVP transaction requests from the host. + */ +typedef struct hv_kvp_sc { + struct vmbus_ic_softc util_sc; + device_t dev; + + /* Unless specified the pending mutex should be + * used to alter the values of the following parameters: + * 1. req_in_progress + * 2. 
req_timed_out + */ + struct mtx pending_mutex; + + struct task task; + + /* To track if transaction is active or not */ + boolean_t req_in_progress; + /* Tracks if daemon did not reply back in time */ + boolean_t req_timed_out; + /* Tracks if daemon is serving a request currently */ + boolean_t daemon_busy; + + /* Length of host message */ + uint32_t host_msg_len; + + /* Host message id */ + uint64_t host_msg_id; + + /* Current kvp message from the host */ + struct hv_kvp_msg *host_kvp_msg; + + /* Current kvp message for daemon */ + struct hv_kvp_msg daemon_kvp_msg; + + /* Rcv buffer for communicating with the host*/ + uint8_t *rcv_buf; + + /* Device semaphore to control communication */ + struct sema dev_sema; + + /* Indicates if daemon registered with driver */ + boolean_t register_done; + + /* Character device status */ + boolean_t dev_accessed; + + struct cdev *hv_kvp_dev; + + struct proc *daemon_task; + + struct selinfo hv_kvp_selinfo; +} hv_kvp_sc; + +/* hv_kvp prototypes */ +static int hv_kvp_req_in_progress(hv_kvp_sc *sc); +static void hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t, uint64_t, uint8_t *); +static void hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc); +static void hv_kvp_process_request(void *context, int pending); + +/* + * hv_kvp low level functions + */ + +/* + * Check if kvp transaction is in progres + */ +static int +hv_kvp_req_in_progress(hv_kvp_sc *sc) +{ + + return (sc->req_in_progress); +} + + +/* + * This routine is called whenever a message is received from the host + */ +static void +hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t rcv_len, + uint64_t request_id, uint8_t *rcv_buf) +{ + + /* Store all the relevant message details in the global structure */ + /* Do not need to use mutex for req_in_progress here */ + sc->req_in_progress = true; + sc->host_msg_len = rcv_len; + sc->host_msg_id = request_id; + sc->rcv_buf = rcv_buf; + sc->host_kvp_msg = (struct hv_kvp_msg *)&rcv_buf[ + sizeof(struct hv_vmbus_pipe_hdr) + + sizeof(struct 
hv_vmbus_icmsg_hdr)]; +} + +/* + * Convert ip related info in umsg from utf8 to utf16 and store in hmsg + */ +static int +hv_kvp_convert_utf8_ipinfo_to_utf16(struct hv_kvp_msg *umsg, + struct hv_kvp_ip_msg *host_ip_msg) +{ + int err_ip, err_subnet, err_gway, err_dns, err_adap; + int UNUSED_FLAG = 1; + + utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.ip_addr, + MAX_IP_ADDR_SIZE, + (char *)umsg->body.kvp_ip_val.ip_addr, + strlen((char *)umsg->body.kvp_ip_val.ip_addr), + UNUSED_FLAG, + &err_ip); + utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.sub_net, + MAX_IP_ADDR_SIZE, + (char *)umsg->body.kvp_ip_val.sub_net, + strlen((char *)umsg->body.kvp_ip_val.sub_net), + UNUSED_FLAG, + &err_subnet); + utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.gate_way, + MAX_GATEWAY_SIZE, + (char *)umsg->body.kvp_ip_val.gate_way, + strlen((char *)umsg->body.kvp_ip_val.gate_way), + UNUSED_FLAG, + &err_gway); + utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.dns_addr, + MAX_IP_ADDR_SIZE, + (char *)umsg->body.kvp_ip_val.dns_addr, + strlen((char *)umsg->body.kvp_ip_val.dns_addr), + UNUSED_FLAG, + &err_dns); + utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.adapter_id, + MAX_ADAPTER_ID_SIZE, + (char *)umsg->body.kvp_ip_val.adapter_id, + strlen((char *)umsg->body.kvp_ip_val.adapter_id), + UNUSED_FLAG, + &err_adap); + + host_ip_msg->kvp_ip_val.dhcp_enabled = umsg->body.kvp_ip_val.dhcp_enabled; + host_ip_msg->kvp_ip_val.addr_family = umsg->body.kvp_ip_val.addr_family; + + return (err_ip | err_subnet | err_gway | err_dns | err_adap); +} + + +/* + * Convert ip related info in hmsg from utf16 to utf8 and store in umsg + */ +static int +hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg, + struct hv_kvp_msg *umsg) +{ + int err_ip, err_subnet, err_gway, err_dns, err_adap; + int UNUSED_FLAG = 1; + device_t *devs; + int devcnt; + + /* IP Address */ + utf16_to_utf8((char *)umsg->body.kvp_ip_val.ip_addr, + MAX_IP_ADDR_SIZE, + (uint16_t *)host_ip_msg->kvp_ip_val.ip_addr, + 
MAX_IP_ADDR_SIZE, + UNUSED_FLAG, + &err_ip); + + /* Adapter ID : GUID */ + utf16_to_utf8((char *)umsg->body.kvp_ip_val.adapter_id, + MAX_ADAPTER_ID_SIZE, + (uint16_t *)host_ip_msg->kvp_ip_val.adapter_id, + MAX_ADAPTER_ID_SIZE, + UNUSED_FLAG, + &err_adap); + + if (devclass_get_devices(devclass_find("hn"), &devs, &devcnt) == 0) { + for (devcnt = devcnt - 1; devcnt >= 0; devcnt--) { + device_t dev = devs[devcnt]; + struct vmbus_channel *chan; + char buf[HYPERV_GUID_STRLEN]; + int n; + + chan = vmbus_get_channel(dev); + n = hyperv_guid2str(vmbus_chan_guid_inst(chan), buf, + sizeof(buf)); + + /* + * The string in the 'kvp_ip_val.adapter_id' has + * braces around the GUID; skip the leading brace + * in 'kvp_ip_val.adapter_id'. + */ + if (strncmp(buf, + ((char *)&umsg->body.kvp_ip_val.adapter_id) + 1, + n) == 0) { + strlcpy((char *)umsg->body.kvp_ip_val.adapter_id, + device_get_nameunit(dev), MAX_ADAPTER_ID_SIZE); + break; + } + } + free(devs, M_TEMP); + } + + /* Address Family , DHCP , SUBNET, Gateway, DNS */ + umsg->kvp_hdr.operation = host_ip_msg->operation; + umsg->body.kvp_ip_val.addr_family = host_ip_msg->kvp_ip_val.addr_family; + umsg->body.kvp_ip_val.dhcp_enabled = host_ip_msg->kvp_ip_val.dhcp_enabled; + utf16_to_utf8((char *)umsg->body.kvp_ip_val.sub_net, MAX_IP_ADDR_SIZE, + (uint16_t *)host_ip_msg->kvp_ip_val.sub_net, + MAX_IP_ADDR_SIZE, + UNUSED_FLAG, + &err_subnet); + + utf16_to_utf8((char *)umsg->body.kvp_ip_val.gate_way, MAX_GATEWAY_SIZE, + (uint16_t *)host_ip_msg->kvp_ip_val.gate_way, + MAX_GATEWAY_SIZE, + UNUSED_FLAG, + &err_gway); + + utf16_to_utf8((char *)umsg->body.kvp_ip_val.dns_addr, MAX_IP_ADDR_SIZE, + (uint16_t *)host_ip_msg->kvp_ip_val.dns_addr, + MAX_IP_ADDR_SIZE, + UNUSED_FLAG, + &err_dns); + + return (err_ip | err_subnet | err_gway | err_dns | err_adap); +} + + +/* + * Prepare a user kvp msg based on host kvp msg (utf16 to utf8) + * Ensure utf16_utf8 takes care of the additional string terminating char!! 
+ */ +static void +hv_kvp_convert_hostmsg_to_usermsg(struct hv_kvp_msg *hmsg, struct hv_kvp_msg *umsg) +{ + int utf_err = 0; + uint32_t value_type; + struct hv_kvp_ip_msg *host_ip_msg; + + host_ip_msg = (struct hv_kvp_ip_msg*)hmsg; + memset(umsg, 0, sizeof(struct hv_kvp_msg)); + + umsg->kvp_hdr.operation = hmsg->kvp_hdr.operation; + umsg->kvp_hdr.pool = hmsg->kvp_hdr.pool; + + switch (umsg->kvp_hdr.operation) { + case HV_KVP_OP_SET_IP_INFO: + hv_kvp_convert_utf16_ipinfo_to_utf8(host_ip_msg, umsg); + break; + + case HV_KVP_OP_GET_IP_INFO: + utf16_to_utf8((char *)umsg->body.kvp_ip_val.adapter_id, + MAX_ADAPTER_ID_SIZE, + (uint16_t *)host_ip_msg->kvp_ip_val.adapter_id, + MAX_ADAPTER_ID_SIZE, 1, &utf_err); + + umsg->body.kvp_ip_val.addr_family = + host_ip_msg->kvp_ip_val.addr_family; + break; + + case HV_KVP_OP_SET: + value_type = hmsg->body.kvp_set.data.value_type; + + switch (value_type) { + case HV_REG_SZ: + umsg->body.kvp_set.data.value_size = + utf16_to_utf8( + (char *)umsg->body.kvp_set.data.msg_value.value, + HV_KVP_EXCHANGE_MAX_VALUE_SIZE - 1, + (uint16_t *)hmsg->body.kvp_set.data.msg_value.value, + hmsg->body.kvp_set.data.value_size, + 1, &utf_err); + /* utf8 encoding */ + umsg->body.kvp_set.data.value_size = + umsg->body.kvp_set.data.value_size / 2; + break; + + case HV_REG_U32: + umsg->body.kvp_set.data.value_size = + sprintf(umsg->body.kvp_set.data.msg_value.value, "%d", + hmsg->body.kvp_set.data.msg_value.value_u32) + 1; + break; + + case HV_REG_U64: + umsg->body.kvp_set.data.value_size = + sprintf(umsg->body.kvp_set.data.msg_value.value, "%llu", + (unsigned long long) + hmsg->body.kvp_set.data.msg_value.value_u64) + 1; + break; + } + + umsg->body.kvp_set.data.key_size = + utf16_to_utf8( + umsg->body.kvp_set.data.key, + HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1, + (uint16_t *)hmsg->body.kvp_set.data.key, + hmsg->body.kvp_set.data.key_size, + 1, &utf_err); + + /* utf8 encoding */ + umsg->body.kvp_set.data.key_size = + umsg->body.kvp_set.data.key_size / 2; + break; + 
+ case HV_KVP_OP_GET: + umsg->body.kvp_get.data.key_size = + utf16_to_utf8(umsg->body.kvp_get.data.key, + HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1, + (uint16_t *)hmsg->body.kvp_get.data.key, + hmsg->body.kvp_get.data.key_size, + 1, &utf_err); + /* utf8 encoding */ + umsg->body.kvp_get.data.key_size = + umsg->body.kvp_get.data.key_size / 2; + break; + + case HV_KVP_OP_DELETE: + umsg->body.kvp_delete.key_size = + utf16_to_utf8(umsg->body.kvp_delete.key, + HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1, + (uint16_t *)hmsg->body.kvp_delete.key, + hmsg->body.kvp_delete.key_size, + 1, &utf_err); + /* utf8 encoding */ + umsg->body.kvp_delete.key_size = + umsg->body.kvp_delete.key_size / 2; + break; + + case HV_KVP_OP_ENUMERATE: + umsg->body.kvp_enum_data.index = + hmsg->body.kvp_enum_data.index; + break; + + default: + hv_kvp_log_info("%s: daemon_kvp_msg: Invalid operation : %d\n", + __func__, umsg->kvp_hdr.operation); + } +} + + +/* + * Prepare a host kvp msg based on user kvp msg (utf8 to utf16) + */ +static int +hv_kvp_convert_usermsg_to_hostmsg(struct hv_kvp_msg *umsg, struct hv_kvp_msg *hmsg) +{ + int hkey_len = 0, hvalue_len = 0, utf_err = 0; + struct hv_kvp_exchg_msg_value *host_exchg_data; + char *key_name, *value; + + struct hv_kvp_ip_msg *host_ip_msg = (struct hv_kvp_ip_msg *)hmsg; + + switch (hmsg->kvp_hdr.operation) { + case HV_KVP_OP_GET_IP_INFO: + return (hv_kvp_convert_utf8_ipinfo_to_utf16(umsg, host_ip_msg)); + + case HV_KVP_OP_SET_IP_INFO: + case HV_KVP_OP_SET: + case HV_KVP_OP_DELETE: + return (0); + + case HV_KVP_OP_ENUMERATE: + host_exchg_data = &hmsg->body.kvp_enum_data.data; + key_name = umsg->body.kvp_enum_data.data.key; + hkey_len = utf8_to_utf16((uint16_t *)host_exchg_data->key, + ((HV_KVP_EXCHANGE_MAX_KEY_SIZE / 2) - 2), + key_name, strlen(key_name), + 1, &utf_err); + /* utf16 encoding */ + host_exchg_data->key_size = 2 * (hkey_len + 1); + value = umsg->body.kvp_enum_data.data.msg_value.value; + hvalue_len = utf8_to_utf16( + (uint16_t 
*)host_exchg_data->msg_value.value, + ((HV_KVP_EXCHANGE_MAX_VALUE_SIZE / 2) - 2), + value, strlen(value), + 1, &utf_err); + host_exchg_data->value_size = 2 * (hvalue_len + 1); + host_exchg_data->value_type = HV_REG_SZ; + + if ((hkey_len < 0) || (hvalue_len < 0)) + return (EINVAL); + + return (0); + + case HV_KVP_OP_GET: + host_exchg_data = &hmsg->body.kvp_get.data; + value = umsg->body.kvp_get.data.msg_value.value; + hvalue_len = utf8_to_utf16( + (uint16_t *)host_exchg_data->msg_value.value, + ((HV_KVP_EXCHANGE_MAX_VALUE_SIZE / 2) - 2), + value, strlen(value), + 1, &utf_err); + /* Convert value size to uft16 */ + host_exchg_data->value_size = 2 * (hvalue_len + 1); + /* Use values by string */ + host_exchg_data->value_type = HV_REG_SZ; + + if (hvalue_len < 0) + return (EINVAL); + + return (0); + + default: + return (EINVAL); + } +} + + +/* + * Send the response back to the host. + */ +static void +hv_kvp_respond_host(hv_kvp_sc *sc, uint32_t error) +{ + struct hv_vmbus_icmsg_hdr *hv_icmsg_hdrp; + + hv_icmsg_hdrp = (struct hv_vmbus_icmsg_hdr *) + &sc->rcv_buf[sizeof(struct hv_vmbus_pipe_hdr)]; + + hv_icmsg_hdrp->status = error; + hv_icmsg_hdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | + HV_ICMSGHDRFLAG_RESPONSE; + + error = vmbus_chan_send(vmbus_get_channel(sc->dev), + VMBUS_CHANPKT_TYPE_INBAND, 0, sc->rcv_buf, sc->host_msg_len, + sc->host_msg_id); + if (error) + hv_kvp_log_info("%s: hv_kvp_respond_host: sendpacket error:%d\n", + __func__, error); +} + + +/* + * This is the main kvp kernel process that interacts with both user daemon + * and the host + */ +static void +hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc) +{ + struct hv_kvp_msg *hmsg = sc->host_kvp_msg; + struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg; + + /* Prepare kvp_msg to be sent to user */ + hv_kvp_convert_hostmsg_to_usermsg(hmsg, umsg); + + /* Send the msg to user via function deamon_read - setting sema */ + sema_post(&sc->dev_sema); + + /* We should wake up the daemon, in case it's doing poll() */ + 
selwakeup(&sc->hv_kvp_selinfo); +} + + +/* + * Function to read the kvp request buffer from host + * and interact with daemon + */ +static void +hv_kvp_process_request(void *context, int pending) +{ + uint8_t *kvp_buf; + struct vmbus_channel *channel; + uint32_t recvlen = 0; + uint64_t requestid; + struct hv_vmbus_icmsg_hdr *icmsghdrp; + int ret = 0, error; + hv_kvp_sc *sc; + + hv_kvp_log_info("%s: entering hv_kvp_process_request\n", __func__); + + sc = (hv_kvp_sc*)context; + kvp_buf = sc->util_sc.ic_buf; + channel = vmbus_get_channel(sc->dev); + + recvlen = sc->util_sc.ic_buflen; + ret = vmbus_chan_recv(channel, kvp_buf, &recvlen, &requestid); + KASSERT(ret != ENOBUFS, ("hvkvp recvbuf is not large enough")); + /* XXX check recvlen to make sure that it contains enough data */ + + while ((ret == 0) && (recvlen > 0)) { + icmsghdrp = (struct hv_vmbus_icmsg_hdr *) + &kvp_buf[sizeof(struct hv_vmbus_pipe_hdr)]; + + hv_kvp_transaction_init(sc, recvlen, requestid, kvp_buf); + if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { + error = vmbus_ic_negomsg(&sc->util_sc, + kvp_buf, &recvlen, KVP_FWVER, KVP_MSGVER); + /* XXX handle vmbus_ic_negomsg failure. */ + if (!error) + hv_kvp_respond_host(sc, HV_S_OK); + else + hv_kvp_respond_host(sc, HV_E_FAIL); + /* + * It is ok to not acquire the mutex before setting + * req_in_progress here because negotiation is the + * first thing that happens and hence there is no + * chance of a race condition. 
+ */ + + sc->req_in_progress = false; + hv_kvp_log_info("%s :version negotiated\n", __func__); + + } else { + if (!sc->daemon_busy) { + + hv_kvp_log_info("%s: issuing qury to daemon\n", __func__); + mtx_lock(&sc->pending_mutex); + sc->req_timed_out = false; + sc->daemon_busy = true; + mtx_unlock(&sc->pending_mutex); + + hv_kvp_send_msg_to_daemon(sc); + hv_kvp_log_info("%s: waiting for daemon\n", __func__); + } + + /* Wait 5 seconds for daemon to respond back */ + tsleep(sc, 0, "kvpworkitem", 5 * hz); + hv_kvp_log_info("%s: came out of wait\n", __func__); + } + + mtx_lock(&sc->pending_mutex); + + /* Notice that once req_timed_out is set to true + * it will remain true until the next request is + * sent to the daemon. The response from daemon + * is forwarded to host only when this flag is + * false. + */ + sc->req_timed_out = true; + + /* + * Cancel request if so need be. + */ + if (hv_kvp_req_in_progress(sc)) { + hv_kvp_log_info("%s: request was still active after wait so failing\n", __func__); + hv_kvp_respond_host(sc, HV_E_FAIL); + sc->req_in_progress = false; + } + + mtx_unlock(&sc->pending_mutex); + + /* + * Try reading next buffer + */ + recvlen = sc->util_sc.ic_buflen; + ret = vmbus_chan_recv(channel, kvp_buf, &recvlen, &requestid); + KASSERT(ret != ENOBUFS, ("hvkvp recvbuf is not large enough")); + /* XXX check recvlen to make sure that it contains enough data */ + + hv_kvp_log_info("%s: read: context %p, ret =%d, recvlen=%d\n", + __func__, context, ret, recvlen); + } +} + + +/* + * Callback routine that gets called whenever there is a message from host + */ +static void +hv_kvp_callback(struct vmbus_channel *chan __unused, void *context) +{ + hv_kvp_sc *sc = (hv_kvp_sc*)context; + /* + The first request from host will not be handled until daemon is registered. + when callback is triggered without a registered daemon, callback just return. + When a new daemon gets regsitered, this callbcak is trigged from _write op. 
+ */ + if (sc->register_done) { + hv_kvp_log_info("%s: Queuing work item\n", __func__); + taskqueue_enqueue(taskqueue_thread, &sc->task); + } +} + +static int +hv_kvp_dev_open(struct cdev *dev, int oflags, int devtype, + struct thread *td) +{ + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; + + hv_kvp_log_info("%s: Opened device \"hv_kvp_device\" successfully.\n", __func__); + if (sc->dev_accessed) + return (-EBUSY); + + sc->daemon_task = curproc; + sc->dev_accessed = true; + sc->daemon_busy = false; + return (0); +} + + +static int +hv_kvp_dev_close(struct cdev *dev __unused, int fflag __unused, int devtype __unused, + struct thread *td __unused) +{ + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; + + hv_kvp_log_info("%s: Closing device \"hv_kvp_device\".\n", __func__); + sc->dev_accessed = false; + sc->register_done = false; + return (0); +} + + +/* + * hv_kvp_daemon read invokes this function + * acts as a send to daemon + */ +static int +hv_kvp_dev_daemon_read(struct cdev *dev, struct uio *uio, int ioflag __unused) +{ + size_t amt; + int error = 0; + struct hv_kvp_msg *hv_kvp_dev_buf; + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; + + /* Read is not allowed util registering is done. */ + if (!sc->register_done) + return (EPERM); + + sema_wait(&sc->dev_sema); + + hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK); + memcpy(hv_kvp_dev_buf, &sc->daemon_kvp_msg, sizeof(struct hv_kvp_msg)); + + amt = MIN(uio->uio_resid, uio->uio_offset >= BUFFERSIZE + 1 ? 
0 : + BUFFERSIZE + 1 - uio->uio_offset); + + if ((error = uiomove(hv_kvp_dev_buf, amt, uio)) != 0) + hv_kvp_log_info("%s: hv_kvp uiomove read failed!\n", __func__); + + free(hv_kvp_dev_buf, M_TEMP); + return (error); +} + + +/* + * hv_kvp_daemon write invokes this function + * acts as a receive from daemon + */ +static int +hv_kvp_dev_daemon_write(struct cdev *dev, struct uio *uio, int ioflag __unused) +{ + size_t amt; + int error = 0; + struct hv_kvp_msg *hv_kvp_dev_buf; + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; + + uio->uio_offset = 0; + hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK); + + amt = MIN(uio->uio_resid, BUFFERSIZE); + error = uiomove(hv_kvp_dev_buf, amt, uio); + + if (error != 0) { + free(hv_kvp_dev_buf, M_TEMP); + return (error); + } + memcpy(&sc->daemon_kvp_msg, hv_kvp_dev_buf, sizeof(struct hv_kvp_msg)); + + free(hv_kvp_dev_buf, M_TEMP); + if (sc->register_done == false) { + if (sc->daemon_kvp_msg.kvp_hdr.operation == HV_KVP_OP_REGISTER) { + sc->register_done = true; + hv_kvp_callback(vmbus_get_channel(sc->dev), dev->si_drv1); + } + else { + hv_kvp_log_info("%s, KVP Registration Failed\n", __func__); + return (EINVAL); + } + } else { + + mtx_lock(&sc->pending_mutex); + + if(!sc->req_timed_out) { + struct hv_kvp_msg *hmsg = sc->host_kvp_msg; + struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg; + + error = hv_kvp_convert_usermsg_to_hostmsg(umsg, hmsg); + hv_kvp_respond_host(sc, umsg->hdr.error); + wakeup(sc); + sc->req_in_progress = false; + if (umsg->hdr.error != HV_S_OK) + hv_kvp_log_info("%s, Error 0x%x from daemon\n", + __func__, umsg->hdr.error); + if (error) + hv_kvp_log_info("%s, Error from convert\n", __func__); + } + + sc->daemon_busy = false; + mtx_unlock(&sc->pending_mutex); + } + + return (error); +} + + +/* + * hv_kvp_daemon poll invokes this function to check if data is available + * for daemon to read. 
+ */ +static int +hv_kvp_dev_daemon_poll(struct cdev *dev, int events, struct thread *td) +{ + int revents = 0; + hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; + + mtx_lock(&sc->pending_mutex); + /* + * We check global flag daemon_busy for the data availiability for + * userland to read. Deamon_busy is set to true before driver has data + * for daemon to read. It is set to false after daemon sends + * then response back to driver. + */ + if (sc->daemon_busy == true) + revents = POLLIN; + else + selrecord(td, &sc->hv_kvp_selinfo); + + mtx_unlock(&sc->pending_mutex); + + return (revents); +} + +static int +hv_kvp_probe(device_t dev) +{ + + return (vmbus_ic_probe(dev, vmbus_kvp_descs)); +} + +static int +hv_kvp_attach(device_t dev) +{ + int error; + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + + hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev); + + sc->dev = dev; + sema_init(&sc->dev_sema, 0, "hv_kvp device semaphore"); + mtx_init(&sc->pending_mutex, "hv-kvp pending mutex", + NULL, MTX_DEF); + + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "hv_kvp_log", + CTLFLAG_RWTUN, &hv_kvp_log, 0, "Hyperv KVP service log level"); + + TASK_INIT(&sc->task, 0, hv_kvp_process_request, sc); + + /* create character device */ + error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, + &sc->hv_kvp_dev, + &hv_kvp_cdevsw, + 0, + UID_ROOT, + GID_WHEEL, + 0640, + "hv_kvp_dev"); + + if (error != 0) + return (error); + sc->hv_kvp_dev->si_drv1 = sc; + + return (vmbus_ic_attach(dev, hv_kvp_callback)); +} + +static int +hv_kvp_detach(device_t dev) +{ + hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev); + + if (sc->daemon_task != NULL) { + PROC_LOCK(sc->daemon_task); + kern_psignal(sc->daemon_task, SIGKILL); + PROC_UNLOCK(sc->daemon_task); + } + + destroy_dev(sc->hv_kvp_dev); + return (vmbus_ic_detach(dev)); +} + +static device_method_t kvp_methods[] = { + /* Device interface */ + 
DEVMETHOD(device_probe, hv_kvp_probe), + DEVMETHOD(device_attach, hv_kvp_attach), + DEVMETHOD(device_detach, hv_kvp_detach), + { 0, 0 } +}; + +static driver_t kvp_driver = { "hvkvp", kvp_methods, sizeof(hv_kvp_sc)}; + +static devclass_t kvp_devclass; + +DRIVER_MODULE(hv_kvp, vmbus, kvp_driver, kvp_devclass, NULL, NULL); +MODULE_VERSION(hv_kvp, 1); +MODULE_DEPEND(hv_kvp, vmbus, 1, 1, 1); diff --git a/sys/dev/hyperv/utilities/hv_kvp.h b/sys/dev/hyperv/utilities/hv_kvp.h new file mode 100644 index 000000000000..91e1ea404d4a --- /dev/null +++ b/sys/dev/hyperv/utilities/hv_kvp.h @@ -0,0 +1,229 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014,2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _KVP_H +#define _KVP_H +/* + * An implementation of HyperV key value pair (KVP) functionality for FreeBSD + * + */ + +/* + * Maximum value size - used for both key names and value data, and includes + * any applicable NULL terminators. + * + * Note: This limit is somewhat arbitrary, but falls easily within what is + * supported for all native guests (back to Win 2000) and what is reasonable + * for the IC KVP exchange functionality. Note that Windows Me/98/95 are + * limited to 255 character key names. + * + * MSDN recommends not storing data values larger than 2048 bytes in the + * registry. + * + * Note: This value is used in defining the KVP exchange message - this value + * cannot be modified without affecting the message size and compatibility. + */ + +/* + * bytes, including any null terminators + */ +#define HV_KVP_EXCHANGE_MAX_VALUE_SIZE (2048) + + +/* + * Maximum key size - the registry limit for the length of an entry name + * is 256 characters, including the null terminator + */ +#define HV_KVP_EXCHANGE_MAX_KEY_SIZE (512) + + +/* + * In FreeBSD, we implement the KVP functionality in two components: + * 1) The kernel component which is packaged as part of the hv_utils driver + * is responsible for communicating with the host and responsible for + * implementing the host/guest protocol. 2) A user level daemon that is + * responsible for data gathering. 
+ * + * Host/Guest Protocol: The host iterates over an index and expects the guest + * to assign a key name to the index and also return the value corresponding to + * the key. The host will have atmost one KVP transaction outstanding at any + * given point in time. The host side iteration stops when the guest returns + * an error. Microsoft has specified the following mapping of key names to + * host specified index: + * + * Index Key Name + * 0 FullyQualifiedDomainName + * 1 IntegrationServicesVersion + * 2 NetworkAddressIPv4 + * 3 NetworkAddressIPv6 + * 4 OSBuildNumber + * 5 OSName + * 6 OSMajorVersion + * 7 OSMinorVersion + * 8 OSVersion + * 9 ProcessorArchitecture + * + * The Windows host expects the Key Name and Key Value to be encoded in utf16. + * + * Guest Kernel/KVP Daemon Protocol: As noted earlier, we implement all of the + * data gathering functionality in a user mode daemon. The user level daemon + * is also responsible for binding the key name to the index as well. The + * kernel and user-level daemon communicate using a connector channel. + * + * The user mode component first registers with the + * the kernel component. Subsequently, the kernel component requests, data + * for the specified keys. In response to this message the user mode component + * fills in the value corresponding to the specified key. We overload the + * sequence field in the cn_msg header to define our KVP message types. + * + * + * The kernel component simply acts as a conduit for communication between the + * Windows host and the user-level daemon. The kernel component passes up the + * index received from the Host to the user-level daemon. If the index is + * valid (supported), the corresponding key as well as its + * value (both are strings) is returned. If the index is invalid + * (not supported), a NULL key string is returned. + */ + + +/* + * Registry value types. 
+ */ +#define HV_REG_SZ 1 +#define HV_REG_U32 4 +#define HV_REG_U64 8 + + +/* + * Daemon code supporting IP injection. + */ +#define HV_KVP_OP_REGISTER 4 + + +enum hv_kvp_exchg_op { + HV_KVP_OP_GET = 0, + HV_KVP_OP_SET, + HV_KVP_OP_DELETE, + HV_KVP_OP_ENUMERATE, + HV_KVP_OP_GET_IP_INFO, + HV_KVP_OP_SET_IP_INFO, + HV_KVP_OP_COUNT /* Number of operations, must be last. */ +}; + +enum hv_kvp_exchg_pool { + HV_KVP_POOL_EXTERNAL = 0, + HV_KVP_POOL_GUEST, + HV_KVP_POOL_AUTO, + HV_KVP_POOL_AUTO_EXTERNAL, + HV_KVP_POOL_AUTO_INTERNAL, + HV_KVP_POOL_COUNT /* Number of pools, must be last. */ +}; + +#define ADDR_FAMILY_NONE 0x00 +#define ADDR_FAMILY_IPV4 0x01 +#define ADDR_FAMILY_IPV6 0x02 + +#define MAX_ADAPTER_ID_SIZE 128 +#define MAX_IP_ADDR_SIZE 1024 +#define MAX_GATEWAY_SIZE 512 + + +struct hv_kvp_ipaddr_value { + uint16_t adapter_id[MAX_ADAPTER_ID_SIZE]; + uint8_t addr_family; + uint8_t dhcp_enabled; + uint16_t ip_addr[MAX_IP_ADDR_SIZE]; + uint16_t sub_net[MAX_IP_ADDR_SIZE]; + uint16_t gate_way[MAX_GATEWAY_SIZE]; + uint16_t dns_addr[MAX_IP_ADDR_SIZE]; +}__attribute__((packed)); + +struct hv_kvp_hdr { + uint8_t operation; + uint8_t pool; + uint16_t pad; +} __attribute__((packed)); + +struct hv_kvp_exchg_msg_value { + uint32_t value_type; + uint32_t key_size; + uint32_t value_size; + uint8_t key[HV_KVP_EXCHANGE_MAX_KEY_SIZE]; + union { + uint8_t value[HV_KVP_EXCHANGE_MAX_VALUE_SIZE]; + uint32_t value_u32; + uint64_t value_u64; + } msg_value; +} __attribute__((packed)); + +struct hv_kvp_msg_enumerate { + uint32_t index; + struct hv_kvp_exchg_msg_value data; +} __attribute__((packed)); + +struct hv_kvp_msg_get { + struct hv_kvp_exchg_msg_value data; +} __attribute__((packed)); + +struct hv_kvp_msg_set { + struct hv_kvp_exchg_msg_value data; +} __attribute__((packed)); + +struct hv_kvp_msg_delete { + uint32_t key_size; + uint8_t key[HV_KVP_EXCHANGE_MAX_KEY_SIZE]; +} __attribute__((packed)); + +struct hv_kvp_register { + uint8_t version[HV_KVP_EXCHANGE_MAX_KEY_SIZE]; +} 
__attribute__((packed)); + +struct hv_kvp_msg { + union { + struct hv_kvp_hdr kvp_hdr; + uint32_t error; + } hdr; + union { + struct hv_kvp_msg_get kvp_get; + struct hv_kvp_msg_set kvp_set; + struct hv_kvp_msg_delete kvp_delete; + struct hv_kvp_msg_enumerate kvp_enum_data; + struct hv_kvp_ipaddr_value kvp_ip_val; + struct hv_kvp_register kvp_register; + } body; +} __attribute__((packed)); + +struct hv_kvp_ip_msg { + uint8_t operation; + uint8_t pool; + struct hv_kvp_ipaddr_value kvp_ip_val; +} __attribute__((packed)); + +#endif /* _KVP_H */ diff --git a/sys/dev/hyperv/utilities/hv_snapshot.c b/sys/dev/hyperv/utilities/hv_snapshot.c new file mode 100644 index 000000000000..45defe1b0f1e --- /dev/null +++ b/sys/dev/hyperv/utilities/hv_snapshot.c @@ -0,0 +1,1061 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/lock.h> +#include <sys/taskqueue.h> +#include <sys/selinfo.h> +#include <sys/sysctl.h> +#include <sys/poll.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/kthread.h> +#include <sys/syscallsubr.h> +#include <sys/sysproto.h> +#include <sys/un.h> +#include <sys/endian.h> +#include <sys/sema.h> +#include <sys/signal.h> +#include <sys/syslog.h> +#include <sys/systm.h> +#include <sys/mutex.h> +#include <sys/callout.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/utilities/hv_utilreg.h> +#include <dev/hyperv/utilities/vmbus_icreg.h> +#include <dev/hyperv/utilities/vmbus_icvar.h> + +#include "hv_snapshot.h" +#include "vmbus_if.h" + +#define VSS_MAJOR 5 +#define VSS_MINOR 0 +#define VSS_MSGVER VMBUS_IC_VERSION(VSS_MAJOR, VSS_MINOR) + +#define VSS_FWVER_MAJOR 3 +#define VSS_FWVER VMBUS_IC_VERSION(VSS_FWVER_MAJOR, 0) + +#define TIMEOUT_LIMIT (15) // seconds +enum hv_vss_op { + VSS_OP_CREATE = 0, + VSS_OP_DELETE, + VSS_OP_HOT_BACKUP, + VSS_OP_GET_DM_INFO, + VSS_OP_BU_COMPLETE, + /* + * Following operations are only supported with IC version >= 5.0 + */ + VSS_OP_FREEZE, /* Freeze the file systems in the VM */ + VSS_OP_THAW, /* Unfreeze the file systems */ + VSS_OP_AUTO_RECOVER, + 
VSS_OP_COUNT /* Number of operations, must be last */ +}; + +/* + * Header for all VSS messages. + */ +struct hv_vss_hdr { + struct vmbus_icmsg_hdr ic_hdr; + uint8_t operation; + uint8_t reserved[7]; +} __packed; + + +/* + * Flag values for the hv_vss_check_feature. Here supports only + * one value. + */ +#define VSS_HBU_NO_AUTO_RECOVERY 0x00000005 + +struct hv_vss_check_feature { + uint32_t flags; +} __packed; + +struct hv_vss_check_dm_info { + uint32_t flags; +} __packed; + +struct hv_vss_msg { + union { + struct hv_vss_hdr vss_hdr; + } hdr; + union { + struct hv_vss_check_feature vss_cf; + struct hv_vss_check_dm_info dm_info; + } body; +} __packed; + +struct hv_vss_req { + struct hv_vss_opt_msg opt_msg; /* used to communicate with daemon */ + struct hv_vss_msg msg; /* used to communicate with host */ +} __packed; + +/* hv_vss debug control */ +static int hv_vss_log = 0; + +#define hv_vss_log_error(...) do { \ + if (hv_vss_log > 0) \ + log(LOG_ERR, "hv_vss: " __VA_ARGS__); \ +} while (0) + +#define hv_vss_log_info(...) 
do { \ + if (hv_vss_log > 1) \ + log(LOG_INFO, "hv_vss: " __VA_ARGS__); \ +} while (0) + +static const struct vmbus_ic_desc vmbus_vss_descs[] = { + { + .ic_guid = { .hv_guid = { + 0x29, 0x2e, 0xfa, 0x35, 0x23, 0xea, 0x36, 0x42, + 0x96, 0xae, 0x3a, 0x6e, 0xba, 0xcb, 0xa4, 0x40} }, + .ic_desc = "Hyper-V VSS" + }, + VMBUS_IC_DESC_END +}; + +static const char * vss_opt_name[] = {"None", "VSSCheck", "Freeze", "Thaw"}; + +/* character device prototypes */ +static d_open_t hv_vss_dev_open; +static d_close_t hv_vss_dev_close; +static d_poll_t hv_vss_dev_daemon_poll; +static d_ioctl_t hv_vss_dev_daemon_ioctl; + +static d_open_t hv_appvss_dev_open; +static d_close_t hv_appvss_dev_close; +static d_poll_t hv_appvss_dev_poll; +static d_ioctl_t hv_appvss_dev_ioctl; + +/* hv_vss character device structure */ +static struct cdevsw hv_vss_cdevsw = +{ + .d_version = D_VERSION, + .d_open = hv_vss_dev_open, + .d_close = hv_vss_dev_close, + .d_poll = hv_vss_dev_daemon_poll, + .d_ioctl = hv_vss_dev_daemon_ioctl, + .d_name = FS_VSS_DEV_NAME, +}; + +static struct cdevsw hv_appvss_cdevsw = +{ + .d_version = D_VERSION, + .d_open = hv_appvss_dev_open, + .d_close = hv_appvss_dev_close, + .d_poll = hv_appvss_dev_poll, + .d_ioctl = hv_appvss_dev_ioctl, + .d_name = APP_VSS_DEV_NAME, +}; + +struct hv_vss_sc; +/* + * Global state to track cdev + */ +struct hv_vss_dev_sc { + /* + * msg was transferred from host to notify queue, and + * ack queue. Finally, it was recyled to free list. + */ + STAILQ_HEAD(, hv_vss_req_internal) to_notify_queue; + STAILQ_HEAD(, hv_vss_req_internal) to_ack_queue; + struct hv_vss_sc *sc; + struct proc *proc_task; + struct selinfo hv_vss_selinfo; +}; +/* + * Global state to track and synchronize the transaction requests from the host. + * The VSS allows user to register their function to do freeze/thaw for application. + * VSS kernel will notify both vss daemon and user application if it is registered. 
+ * The implementation state transition is illustrated by: + * https://clovertrail.github.io/assets/vssdot.png + */ +typedef struct hv_vss_sc { + struct vmbus_ic_softc util_sc; + device_t dev; + + struct task task; + + /* + * mutex is used to protect access of list/queue, + * callout in request is also used this mutex. + */ + struct mtx pending_mutex; + /* + * req_free_list contains all free items + */ + LIST_HEAD(, hv_vss_req_internal) req_free_list; + + /* Indicates if daemon registered with driver */ + boolean_t register_done; + + boolean_t app_register_done; + + /* cdev for file system freeze/thaw */ + struct cdev *hv_vss_dev; + /* cdev for application freeze/thaw */ + struct cdev *hv_appvss_dev; + + /* sc for app */ + struct hv_vss_dev_sc app_sc; + /* sc for deamon */ + struct hv_vss_dev_sc daemon_sc; +} hv_vss_sc; + +typedef struct hv_vss_req_internal { + LIST_ENTRY(hv_vss_req_internal) link; + STAILQ_ENTRY(hv_vss_req_internal) slink; + struct hv_vss_req vss_req; + + /* Rcv buffer for communicating with the host*/ + uint8_t *rcv_buf; + /* Length of host message */ + uint32_t host_msg_len; + /* Host message id */ + uint64_t host_msg_id; + + hv_vss_sc *sc; + + struct callout callout; +} hv_vss_req_internal; + +#define SEARCH_REMOVE_REQ_LOCKED(reqp, queue, link, tmp, id) \ + do { \ + STAILQ_FOREACH_SAFE(reqp, queue, link, tmp) { \ + if (reqp->vss_req.opt_msg.msgid == id) { \ + STAILQ_REMOVE(queue, \ + reqp, hv_vss_req_internal, link); \ + break; \ + } \ + } \ + } while (0) + +static bool +hv_vss_is_daemon_killed_after_launch(hv_vss_sc *sc) +{ + return (!sc->register_done && sc->daemon_sc.proc_task); +} + +/* + * Callback routine that gets called whenever there is a message from host + */ +static void +hv_vss_callback(struct vmbus_channel *chan __unused, void *context) +{ + hv_vss_sc *sc = (hv_vss_sc*)context; + if (hv_vss_is_daemon_killed_after_launch(sc)) + hv_vss_log_info("%s: daemon was killed!\n", __func__); + if (sc->register_done || 
sc->daemon_sc.proc_task) { + hv_vss_log_info("%s: Queuing work item\n", __func__); + if (hv_vss_is_daemon_killed_after_launch(sc)) + hv_vss_log_info("%s: daemon was killed!\n", __func__); + taskqueue_enqueue(taskqueue_thread, &sc->task); + } else { + hv_vss_log_info("%s: daemon has never been registered\n", __func__); + } + hv_vss_log_info("%s: received msg from host\n", __func__); +} +/* + * Send the response back to the host. + */ +static void +hv_vss_respond_host(uint8_t *rcv_buf, struct vmbus_channel *ch, + uint32_t recvlen, uint64_t requestid, uint32_t error) +{ + struct vmbus_icmsg_hdr *hv_icmsg_hdrp; + + hv_icmsg_hdrp = (struct vmbus_icmsg_hdr *)rcv_buf; + + hv_icmsg_hdrp->ic_status = error; + hv_icmsg_hdrp->ic_flags = HV_ICMSGHDRFLAG_TRANSACTION | HV_ICMSGHDRFLAG_RESPONSE; + + error = vmbus_chan_send(ch, VMBUS_CHANPKT_TYPE_INBAND, 0, + rcv_buf, recvlen, requestid); + if (error) + hv_vss_log_info("%s: hv_vss_respond_host: sendpacket error:%d\n", + __func__, error); +} + +static void +hv_vss_notify_host_result_locked(struct hv_vss_req_internal *reqp, uint32_t status) +{ + struct hv_vss_msg* msg = (struct hv_vss_msg *)reqp->rcv_buf; + hv_vss_sc *sc = reqp->sc; + if (reqp->vss_req.opt_msg.opt == HV_VSS_CHECK) { + msg->body.vss_cf.flags = VSS_HBU_NO_AUTO_RECOVERY; + } + hv_vss_log_info("%s, %s response %s to host\n", __func__, + vss_opt_name[reqp->vss_req.opt_msg.opt], + status == HV_S_OK ? 
"Success" : "Fail"); + hv_vss_respond_host(reqp->rcv_buf, vmbus_get_channel(reqp->sc->dev), + reqp->host_msg_len, reqp->host_msg_id, status); + /* recycle the request */ + LIST_INSERT_HEAD(&sc->req_free_list, reqp, link); +} + +static void +hv_vss_notify_host_result(struct hv_vss_req_internal *reqp, uint32_t status) +{ + mtx_lock(&reqp->sc->pending_mutex); + hv_vss_notify_host_result_locked(reqp, status); + mtx_unlock(&reqp->sc->pending_mutex); +} + +static void +hv_vss_cp_vssreq_to_user(struct hv_vss_req_internal *reqp, + struct hv_vss_opt_msg *userdata) +{ + struct hv_vss_req *hv_vss_dev_buf; + hv_vss_dev_buf = &reqp->vss_req; + hv_vss_dev_buf->opt_msg.opt = HV_VSS_NONE; + switch (reqp->vss_req.msg.hdr.vss_hdr.operation) { + case VSS_OP_FREEZE: + hv_vss_dev_buf->opt_msg.opt = HV_VSS_FREEZE; + break; + case VSS_OP_THAW: + hv_vss_dev_buf->opt_msg.opt = HV_VSS_THAW; + break; + case VSS_OP_HOT_BACKUP: + hv_vss_dev_buf->opt_msg.opt = HV_VSS_CHECK; + break; + } + *userdata = hv_vss_dev_buf->opt_msg; + hv_vss_log_info("%s, read data from user for " + "%s (%ju) \n", __func__, vss_opt_name[userdata->opt], + (uintmax_t)userdata->msgid); +} + +/** + * Remove the request id from app notifiy or ack queue, + * and recyle the request by inserting it to free list. + * + * When app was notified but not yet sending ack, the request + * should locate in either notify queue or ack queue. 
+ */ +static struct hv_vss_req_internal* +hv_vss_drain_req_queue_locked(hv_vss_sc *sc, uint64_t req_id) +{ + struct hv_vss_req_internal *reqp, *tmp; + SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->daemon_sc.to_notify_queue, + slink, tmp, req_id); + if (reqp == NULL) + SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->daemon_sc.to_ack_queue, + slink, tmp, req_id); + if (reqp == NULL) + SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->app_sc.to_notify_queue, + slink, tmp, req_id); + if (reqp == NULL) + SEARCH_REMOVE_REQ_LOCKED(reqp, &sc->app_sc.to_ack_queue, slink, + tmp, req_id); + return (reqp); +} +/** + * Actions for daemon who has been notified. + */ +static void +hv_vss_notified(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata) +{ + struct hv_vss_req_internal *reqp; + mtx_lock(&dev_sc->sc->pending_mutex); + if (!STAILQ_EMPTY(&dev_sc->to_notify_queue)) { + reqp = STAILQ_FIRST(&dev_sc->to_notify_queue); + hv_vss_cp_vssreq_to_user(reqp, userdata); + STAILQ_REMOVE_HEAD(&dev_sc->to_notify_queue, slink); + /* insert the msg to queue for write */ + STAILQ_INSERT_TAIL(&dev_sc->to_ack_queue, reqp, slink); + userdata->status = VSS_SUCCESS; + } else { + /* Timeout occur, thus request was removed from queue. */ + hv_vss_log_info("%s: notify queue is empty!\n", __func__); + userdata->status = VSS_FAIL; + } + mtx_unlock(&dev_sc->sc->pending_mutex); +} + +static void +hv_vss_notify(struct hv_vss_dev_sc *dev_sc, struct hv_vss_req_internal *reqp) +{ + uint32_t opt = reqp->vss_req.opt_msg.opt; + mtx_lock(&dev_sc->sc->pending_mutex); + STAILQ_INSERT_TAIL(&dev_sc->to_notify_queue, reqp, slink); + hv_vss_log_info("%s: issuing query %s (%ju) to %s\n", __func__, + vss_opt_name[opt], (uintmax_t)reqp->vss_req.opt_msg.msgid, + &dev_sc->sc->app_sc == dev_sc ? "app" : "daemon"); + mtx_unlock(&dev_sc->sc->pending_mutex); + selwakeup(&dev_sc->hv_vss_selinfo); +} + +/** + * Actions for daemon who has acknowledged. 
+ */ +static void +hv_vss_daemon_acked(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata) +{ + struct hv_vss_req_internal *reqp, *tmp; + uint64_t req_id; + int opt; + uint32_t status; + + opt = userdata->opt; + req_id = userdata->msgid; + status = userdata->status; + /* make sure the reserved fields are all zeros. */ + memset(&userdata->reserved, 0, sizeof(struct hv_vss_opt_msg) - + __offsetof(struct hv_vss_opt_msg, reserved)); + mtx_lock(&dev_sc->sc->pending_mutex); + SEARCH_REMOVE_REQ_LOCKED(reqp, &dev_sc->to_ack_queue, slink, tmp, req_id); + mtx_unlock(&dev_sc->sc->pending_mutex); + if (reqp == NULL) { + hv_vss_log_info("%s Timeout: fail to find daemon ack request\n", + __func__); + userdata->status = VSS_FAIL; + return; + } + KASSERT(opt == reqp->vss_req.opt_msg.opt, ("Mismatched VSS operation!")); + hv_vss_log_info("%s, get response %d from daemon for %s (%ju) \n", __func__, + status, vss_opt_name[opt], (uintmax_t)req_id); + switch (opt) { + case HV_VSS_CHECK: + case HV_VSS_FREEZE: + callout_drain(&reqp->callout); + hv_vss_notify_host_result(reqp, + status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL); + break; + case HV_VSS_THAW: + if (dev_sc->sc->app_register_done) { + if (status == VSS_SUCCESS) { + hv_vss_notify(&dev_sc->sc->app_sc, reqp); + } else { + /* handle error */ + callout_drain(&reqp->callout); + hv_vss_notify_host_result(reqp, HV_E_FAIL); + } + } else { + callout_drain(&reqp->callout); + hv_vss_notify_host_result(reqp, + status == VSS_SUCCESS ? HV_S_OK : HV_E_FAIL); + } + break; + } +} + +/** + * Actions for app who has acknowledged. + */ +static void +hv_vss_app_acked(struct hv_vss_dev_sc *dev_sc, struct hv_vss_opt_msg *userdata) +{ + struct hv_vss_req_internal *reqp, *tmp; + uint64_t req_id; + int opt; + uint8_t status; + + opt = userdata->opt; + req_id = userdata->msgid; + status = userdata->status; + /* make sure the reserved fields are all zeros. 
*/ + memset(&userdata->reserved, 0, sizeof(struct hv_vss_opt_msg) - + __offsetof(struct hv_vss_opt_msg, reserved)); + mtx_lock(&dev_sc->sc->pending_mutex); + SEARCH_REMOVE_REQ_LOCKED(reqp, &dev_sc->to_ack_queue, slink, tmp, req_id); + mtx_unlock(&dev_sc->sc->pending_mutex); + if (reqp == NULL) { + hv_vss_log_info("%s Timeout: fail to find app ack request\n", + __func__); + userdata->status = VSS_FAIL; + return; + } + KASSERT(opt == reqp->vss_req.opt_msg.opt, ("Mismatched VSS operation!")); + hv_vss_log_info("%s, get response %d from app for %s (%ju) \n", + __func__, status, vss_opt_name[opt], (uintmax_t)req_id); + if (dev_sc->sc->register_done) { + switch (opt) { + case HV_VSS_CHECK: + case HV_VSS_FREEZE: + if (status == VSS_SUCCESS) { + hv_vss_notify(&dev_sc->sc->daemon_sc, reqp); + } else { + /* handle error */ + callout_drain(&reqp->callout); + hv_vss_notify_host_result(reqp, HV_E_FAIL); + } + break; + case HV_VSS_THAW: + callout_drain(&reqp->callout); + hv_vss_notify_host_result(reqp, + status == VSS_SUCCESS ? 
HV_S_OK : HV_E_FAIL); + break; + } + } else { + hv_vss_log_info("%s, Fatal: vss daemon was killed\n", __func__); + } +} + +static int +hv_vss_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +{ + struct proc *td_proc; + td_proc = td->td_proc; + + struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1; + hv_vss_log_info("%s: %s opens device \"%s\" successfully.\n", + __func__, td_proc->p_comm, FS_VSS_DEV_NAME); + + if (dev_sc->sc->register_done) + return (EBUSY); + + dev_sc->sc->register_done = true; + hv_vss_callback(vmbus_get_channel(dev_sc->sc->dev), dev_sc->sc); + + dev_sc->proc_task = curproc; + return (0); +} + +static int +hv_vss_dev_close(struct cdev *dev, int fflag __unused, int devtype __unused, + struct thread *td) +{ + struct proc *td_proc; + td_proc = td->td_proc; + + struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1; + + hv_vss_log_info("%s: %s closes device \"%s\"\n", + __func__, td_proc->p_comm, FS_VSS_DEV_NAME); + dev_sc->sc->register_done = false; + return (0); +} + +static int +hv_vss_dev_daemon_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, + struct thread *td) +{ + struct proc *td_proc; + struct hv_vss_dev_sc *sc; + + td_proc = td->td_proc; + sc = (struct hv_vss_dev_sc*)dev->si_drv1; + + hv_vss_log_info("%s: %s invoked vss ioctl\n", __func__, td_proc->p_comm); + + struct hv_vss_opt_msg* userdata = (struct hv_vss_opt_msg*)data; + switch(cmd) { + case IOCHVVSSREAD: + hv_vss_notified(sc, userdata); + break; + case IOCHVVSSWRITE: + hv_vss_daemon_acked(sc, userdata); + break; + } + return (0); +} + +/* + * hv_vss_daemon poll invokes this function to check if data is available + * for daemon to read. 
+ */ +static int +hv_vss_dev_daemon_poll(struct cdev *dev, int events, struct thread *td) +{ + int revent = 0; + struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1; + + mtx_lock(&dev_sc->sc->pending_mutex); + /** + * if there is data ready, inform daemon's poll + */ + if (!STAILQ_EMPTY(&dev_sc->to_notify_queue)) + revent = POLLIN; + if (revent == 0) + selrecord(td, &dev_sc->hv_vss_selinfo); + hv_vss_log_info("%s return 0x%x\n", __func__, revent); + mtx_unlock(&dev_sc->sc->pending_mutex); + return (revent); +} + +static int +hv_appvss_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +{ + struct proc *td_proc; + td_proc = td->td_proc; + + struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1; + hv_vss_log_info("%s: %s opens device \"%s\" successfully.\n", + __func__, td_proc->p_comm, APP_VSS_DEV_NAME); + + if (dev_sc->sc->app_register_done) + return (EBUSY); + + dev_sc->sc->app_register_done = true; + dev_sc->proc_task = curproc; + return (0); +} + +static int +hv_appvss_dev_close(struct cdev *dev, int fflag __unused, int devtype __unused, + struct thread *td) +{ + struct proc *td_proc; + td_proc = td->td_proc; + + struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1; + + hv_vss_log_info("%s: %s closes device \"%s\".\n", + __func__, td_proc->p_comm, APP_VSS_DEV_NAME); + dev_sc->sc->app_register_done = false; + return (0); +} + +static int +hv_appvss_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, + struct thread *td) +{ + struct proc *td_proc; + struct hv_vss_dev_sc *dev_sc; + + td_proc = td->td_proc; + dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1; + + hv_vss_log_info("%s: %s invoked vss ioctl\n", __func__, td_proc->p_comm); + + struct hv_vss_opt_msg* userdata = (struct hv_vss_opt_msg*)data; + switch(cmd) { + case IOCHVVSSREAD: + hv_vss_notified(dev_sc, userdata); + break; + case IOCHVVSSWRITE: + hv_vss_app_acked(dev_sc, userdata); + break; + } + return (0); +} + +/* + * hv_vss_daemon 
poll invokes this function to check if data is available + * for daemon to read. + */ +static int +hv_appvss_dev_poll(struct cdev *dev, int events, struct thread *td) +{ + int revent = 0; + struct hv_vss_dev_sc *dev_sc = (struct hv_vss_dev_sc*)dev->si_drv1; + + mtx_lock(&dev_sc->sc->pending_mutex); + /** + * if there is data ready, inform daemon's poll + */ + if (!STAILQ_EMPTY(&dev_sc->to_notify_queue)) + revent = POLLIN; + if (revent == 0) + selrecord(td, &dev_sc->hv_vss_selinfo); + hv_vss_log_info("%s return 0x%x\n", __func__, revent); + mtx_unlock(&dev_sc->sc->pending_mutex); + return (revent); +} + +static void +hv_vss_timeout(void *arg) +{ + hv_vss_req_internal *reqp = arg; + hv_vss_req_internal *request; + hv_vss_sc* sc = reqp->sc; + uint64_t req_id = reqp->vss_req.opt_msg.msgid; + /* This thread is locked */ + KASSERT(mtx_owned(&sc->pending_mutex), ("mutex lock is not owned!")); + request = hv_vss_drain_req_queue_locked(sc, req_id); + KASSERT(request != NULL, ("timeout but fail to find request")); + hv_vss_notify_host_result_locked(reqp, HV_E_FAIL); +} + +/* + * This routine is called whenever a message is received from the host + */ +static void +hv_vss_init_req(hv_vss_req_internal *reqp, + uint32_t recvlen, uint64_t requestid, uint8_t *vss_buf, hv_vss_sc *sc) +{ + struct timespec vm_ts; + struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf; + + memset(reqp, 0, __offsetof(hv_vss_req_internal, callout)); + reqp->host_msg_len = recvlen; + reqp->host_msg_id = requestid; + reqp->rcv_buf = vss_buf; + reqp->sc = sc; + memcpy(&reqp->vss_req.msg, + (struct hv_vss_msg *)vss_buf, sizeof(struct hv_vss_msg)); + /* set the opt for users */ + switch (msg->hdr.vss_hdr.operation) { + case VSS_OP_FREEZE: + reqp->vss_req.opt_msg.opt = HV_VSS_FREEZE; + break; + case VSS_OP_THAW: + reqp->vss_req.opt_msg.opt = HV_VSS_THAW; + break; + case VSS_OP_HOT_BACKUP: + reqp->vss_req.opt_msg.opt = HV_VSS_CHECK; + break; + } + /* Use a timestamp as msg request ID */ + nanotime(&vm_ts); + 
reqp->vss_req.opt_msg.msgid = (vm_ts.tv_sec * NANOSEC) + vm_ts.tv_nsec; +} + +static hv_vss_req_internal* +hv_vss_get_new_req_locked(hv_vss_sc *sc) +{ + hv_vss_req_internal *reqp; + if (!STAILQ_EMPTY(&sc->daemon_sc.to_notify_queue) || + !STAILQ_EMPTY(&sc->daemon_sc.to_ack_queue) || + !STAILQ_EMPTY(&sc->app_sc.to_notify_queue) || + !STAILQ_EMPTY(&sc->app_sc.to_ack_queue)) { + /* + * There is request coming from host before + * finishing previous requests + */ + hv_vss_log_info("%s: Warning: there is new request " + "coming before finishing previous requests\n", __func__); + return (NULL); + } + if (LIST_EMPTY(&sc->req_free_list)) { + /* TODO Error: no buffer */ + hv_vss_log_info("Error: No buffer\n"); + return (NULL); + } + reqp = LIST_FIRST(&sc->req_free_list); + LIST_REMOVE(reqp, link); + return (reqp); +} + +static void +hv_vss_start_notify(hv_vss_req_internal *reqp, uint32_t opt) +{ + hv_vss_sc *sc = reqp->sc; + /* + * Freeze/Check notification sequence: kernel -> app -> daemon(fs) + * Thaw notification sequence: kernel -> daemon(fs) -> app + * + * We should wake up the daemon, in case it's doing poll(). + * The response should be received after 5s, otherwise, trigger timeout. 
+ */ + switch (opt) { + case VSS_OP_FREEZE: + case VSS_OP_HOT_BACKUP: + if (sc->app_register_done) + hv_vss_notify(&sc->app_sc, reqp); + else + hv_vss_notify(&sc->daemon_sc, reqp); + callout_reset(&reqp->callout, TIMEOUT_LIMIT * hz, + hv_vss_timeout, reqp); + break; + case VSS_OP_THAW: + hv_vss_notify(&sc->daemon_sc, reqp); + callout_reset(&reqp->callout, TIMEOUT_LIMIT * hz, + hv_vss_timeout, reqp); + break; + } +} + +/* + * Function to read the vss request buffer from host + * and interact with daemon + */ +static void +hv_vss_process_request(void *context, int pending __unused) +{ + uint8_t *vss_buf; + struct vmbus_channel *channel; + uint32_t recvlen = 0; + uint64_t requestid; + struct vmbus_icmsg_hdr *icmsghdrp; + int ret = 0; + hv_vss_sc *sc; + hv_vss_req_internal *reqp; + + hv_vss_log_info("%s: entering hv_vss_process_request\n", __func__); + + sc = (hv_vss_sc*)context; + vss_buf = sc->util_sc.ic_buf; + channel = vmbus_get_channel(sc->dev); + + recvlen = sc->util_sc.ic_buflen; + ret = vmbus_chan_recv(channel, vss_buf, &recvlen, &requestid); + KASSERT(ret != ENOBUFS, ("hvvss recvbuf is not large enough")); + /* XXX check recvlen to make sure that it contains enough data */ + + while ((ret == 0) && (recvlen > 0)) { + icmsghdrp = (struct vmbus_icmsg_hdr *)vss_buf; + + if (icmsghdrp->ic_type == HV_ICMSGTYPE_NEGOTIATE) { + ret = vmbus_ic_negomsg(&sc->util_sc, vss_buf, + &recvlen, VSS_FWVER, VSS_MSGVER); + hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev), + recvlen, requestid, ret); + hv_vss_log_info("%s: version negotiated\n", __func__); + } else if (!hv_vss_is_daemon_killed_after_launch(sc)) { + struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf; + switch(msg->hdr.vss_hdr.operation) { + case VSS_OP_FREEZE: + case VSS_OP_THAW: + case VSS_OP_HOT_BACKUP: + mtx_lock(&sc->pending_mutex); + reqp = hv_vss_get_new_req_locked(sc); + mtx_unlock(&sc->pending_mutex); + if (reqp == NULL) { + /* ignore this request from host */ + break; + } + hv_vss_init_req(reqp, 
recvlen, requestid, vss_buf, sc); + hv_vss_log_info("%s: receive %s (%ju) from host\n", + __func__, + vss_opt_name[reqp->vss_req.opt_msg.opt], + (uintmax_t)reqp->vss_req.opt_msg.msgid); + hv_vss_start_notify(reqp, msg->hdr.vss_hdr.operation); + break; + case VSS_OP_GET_DM_INFO: + hv_vss_log_info("%s: receive GET_DM_INFO from host\n", + __func__); + msg->body.dm_info.flags = 0; + hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev), + recvlen, requestid, HV_S_OK); + break; + default: + device_printf(sc->dev, "Unknown opt from host: %d\n", + msg->hdr.vss_hdr.operation); + break; + } + } else { + /* daemon was killed for some reason after it was launched */ + struct hv_vss_msg* msg = (struct hv_vss_msg *)vss_buf; + switch(msg->hdr.vss_hdr.operation) { + case VSS_OP_FREEZE: + hv_vss_log_info("%s: response fail for FREEZE\n", + __func__); + break; + case VSS_OP_THAW: + hv_vss_log_info("%s: response fail for THAW\n", + __func__); + break; + case VSS_OP_HOT_BACKUP: + hv_vss_log_info("%s: response fail for HOT_BACKUP\n", + __func__); + msg->body.vss_cf.flags = VSS_HBU_NO_AUTO_RECOVERY; + break; + case VSS_OP_GET_DM_INFO: + hv_vss_log_info("%s: response fail for GET_DM_INFO\n", + __func__); + msg->body.dm_info.flags = 0; + break; + default: + device_printf(sc->dev, "Unknown opt from host: %d\n", + msg->hdr.vss_hdr.operation); + break; + } + hv_vss_respond_host(vss_buf, vmbus_get_channel(sc->dev), + recvlen, requestid, HV_E_FAIL); + } + /* + * Try reading next buffer + */ + recvlen = sc->util_sc.ic_buflen; + ret = vmbus_chan_recv(channel, vss_buf, &recvlen, &requestid); + KASSERT(ret != ENOBUFS, ("hvvss recvbuf is not large enough")); + /* XXX check recvlen to make sure that it contains enough data */ + + hv_vss_log_info("%s: read: context %p, ret =%d, recvlen=%d\n", + __func__, context, ret, recvlen); + } +} + +static int +hv_vss_probe(device_t dev) +{ + return (vmbus_ic_probe(dev, vmbus_vss_descs)); +} + +static int +hv_vss_init_send_receive_queue(device_t dev) +{ + 
hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev); + int i; + const int max_list = 4; /* It is big enough for the list */ + struct hv_vss_req_internal* reqp; + + LIST_INIT(&sc->req_free_list); + STAILQ_INIT(&sc->daemon_sc.to_notify_queue); + STAILQ_INIT(&sc->daemon_sc.to_ack_queue); + STAILQ_INIT(&sc->app_sc.to_notify_queue); + STAILQ_INIT(&sc->app_sc.to_ack_queue); + + for (i = 0; i < max_list; i++) { + reqp = malloc(sizeof(struct hv_vss_req_internal), + M_DEVBUF, M_WAITOK|M_ZERO); + LIST_INSERT_HEAD(&sc->req_free_list, reqp, link); + callout_init_mtx(&reqp->callout, &sc->pending_mutex, 0); + } + return (0); +} + +static int +hv_vss_destroy_send_receive_queue(device_t dev) +{ + hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev); + hv_vss_req_internal* reqp; + + while (!LIST_EMPTY(&sc->req_free_list)) { + reqp = LIST_FIRST(&sc->req_free_list); + LIST_REMOVE(reqp, link); + free(reqp, M_DEVBUF); + } + + while (!STAILQ_EMPTY(&sc->daemon_sc.to_notify_queue)) { + reqp = STAILQ_FIRST(&sc->daemon_sc.to_notify_queue); + STAILQ_REMOVE_HEAD(&sc->daemon_sc.to_notify_queue, slink); + free(reqp, M_DEVBUF); + } + + while (!STAILQ_EMPTY(&sc->daemon_sc.to_ack_queue)) { + reqp = STAILQ_FIRST(&sc->daemon_sc.to_ack_queue); + STAILQ_REMOVE_HEAD(&sc->daemon_sc.to_ack_queue, slink); + free(reqp, M_DEVBUF); + } + + while (!STAILQ_EMPTY(&sc->app_sc.to_notify_queue)) { + reqp = STAILQ_FIRST(&sc->app_sc.to_notify_queue); + STAILQ_REMOVE_HEAD(&sc->app_sc.to_notify_queue, slink); + free(reqp, M_DEVBUF); + } + + while (!STAILQ_EMPTY(&sc->app_sc.to_ack_queue)) { + reqp = STAILQ_FIRST(&sc->app_sc.to_ack_queue); + STAILQ_REMOVE_HEAD(&sc->app_sc.to_ack_queue, slink); + free(reqp, M_DEVBUF); + } + return (0); +} + +static int +hv_vss_attach(device_t dev) +{ + int error; + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + + hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev); + + sc->dev = dev; + mtx_init(&sc->pending_mutex, "hv_vss pending mutex", NULL, MTX_DEF); + + ctx = 
device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "hv_vss_log", + CTLFLAG_RWTUN, &hv_vss_log, 0, "Hyperv VSS service log level"); + + TASK_INIT(&sc->task, 0, hv_vss_process_request, sc); + hv_vss_init_send_receive_queue(dev); + /* create character device for file system freeze/thaw */ + error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, + &sc->hv_vss_dev, + &hv_vss_cdevsw, + 0, + UID_ROOT, + GID_WHEEL, + 0640, + FS_VSS_DEV_NAME); + + if (error != 0) { + hv_vss_log_info("Fail to create '%s': %d\n", FS_VSS_DEV_NAME, error); + return (error); + } + sc->hv_vss_dev->si_drv1 = &sc->daemon_sc; + sc->daemon_sc.sc = sc; + /* create character device for application freeze/thaw */ + error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, + &sc->hv_appvss_dev, + &hv_appvss_cdevsw, + 0, + UID_ROOT, + GID_WHEEL, + 0640, + APP_VSS_DEV_NAME); + + if (error != 0) { + hv_vss_log_info("Fail to create '%s': %d\n", APP_VSS_DEV_NAME, error); + return (error); + } + sc->hv_appvss_dev->si_drv1 = &sc->app_sc; + sc->app_sc.sc = sc; + + return (vmbus_ic_attach(dev, hv_vss_callback)); +} + +static int +hv_vss_detach(device_t dev) +{ + hv_vss_sc *sc = (hv_vss_sc*)device_get_softc(dev); + mtx_destroy(&sc->pending_mutex); + if (sc->daemon_sc.proc_task != NULL) { + PROC_LOCK(sc->daemon_sc.proc_task); + kern_psignal(sc->daemon_sc.proc_task, SIGKILL); + PROC_UNLOCK(sc->daemon_sc.proc_task); + } + if (sc->app_sc.proc_task != NULL) { + PROC_LOCK(sc->app_sc.proc_task); + kern_psignal(sc->app_sc.proc_task, SIGKILL); + PROC_UNLOCK(sc->app_sc.proc_task); + } + hv_vss_destroy_send_receive_queue(dev); + destroy_dev(sc->hv_vss_dev); + destroy_dev(sc->hv_appvss_dev); + return (vmbus_ic_detach(dev)); +} + +static device_method_t vss_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, hv_vss_probe), + DEVMETHOD(device_attach, hv_vss_attach), + DEVMETHOD(device_detach, hv_vss_detach), + { 0, 0 } +}; + +static driver_t 
vss_driver = { "hvvss", vss_methods, sizeof(hv_vss_sc)}; + +static devclass_t vss_devclass; + +DRIVER_MODULE(hv_vss, vmbus, vss_driver, vss_devclass, NULL, NULL); +MODULE_VERSION(hv_vss, 1); +MODULE_DEPEND(hv_vss, vmbus, 1, 1, 1); diff --git a/sys/dev/hyperv/utilities/hv_snapshot.h b/sys/dev/hyperv/utilities/hv_snapshot.h new file mode 100644 index 000000000000..e3c9e0c9fef2 --- /dev/null +++ b/sys/dev/hyperv/utilities/hv_snapshot.h @@ -0,0 +1,56 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VSS_H +#define _VSS_H +#include <sys/ioccom.h> +#define FS_VSS_DEV_NAME "hv_fsvss_dev" +#define APP_VSS_DEV_NAME "hv_appvss_dev" + +#define VSS_DEV(VSS) "/dev/"VSS + +#define VSS_SUCCESS 0x00000000 +#define VSS_FAIL 0x00000001 + +enum hv_vss_op_t { + HV_VSS_NONE = 0, + HV_VSS_CHECK, + HV_VSS_FREEZE, + HV_VSS_THAW, + HV_VSS_COUNT +}; + +struct hv_vss_opt_msg { + uint32_t opt; /* operation */ + uint32_t status; /* 0 for success, 1 for error */ + uint64_t msgid; /* an ID used to identify the transaction */ + uint8_t reserved[48]; /* reserved values are all zeroes */ +}; +#define IOCHVVSSREAD _IOR('v', 2, struct hv_vss_opt_msg) +#define IOCHVVSSWRITE _IOW('v', 3, struct hv_vss_opt_msg) +#endif diff --git a/sys/dev/hyperv/utilities/hv_utilreg.h b/sys/dev/hyperv/utilities/hv_utilreg.h new file mode 100644 index 000000000000..b29c0f99204f --- /dev/null +++ b/sys/dev/hyperv/utilities/hv_utilreg.h @@ -0,0 +1,86 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HV_UTILREG_H_ +#define _HV_UTILREG_H_ + +/* + * Some Hyper-V status codes. + */ +#define HV_S_OK 0x00000000 +#define HV_E_FAIL 0x80004005 +#define HV_S_CONT 0x80070103 +#define HV_ERROR_NOT_SUPPORTED 0x80070032 +#define HV_ERROR_MACHINE_LOCKED 0x800704F7 +#define HV_ERROR_DEVICE_NOT_CONNECTED 0x8007048F +#define HV_INVALIDARG 0x80070057 +#define HV_GUID_NOTFOUND 0x80041002 + +/* + * Common defines for Hyper-V ICs + */ +#define HV_ICMSGTYPE_NEGOTIATE 0 +#define HV_ICMSGTYPE_HEARTBEAT 1 +#define HV_ICMSGTYPE_KVPEXCHANGE 2 +#define HV_ICMSGTYPE_SHUTDOWN 3 +#define HV_ICMSGTYPE_TIMESYNC 4 +#define HV_ICMSGTYPE_VSS 5 + +#define HV_ICMSGHDRFLAG_TRANSACTION 1 +#define HV_ICMSGHDRFLAG_REQUEST 2 +#define HV_ICMSGHDRFLAG_RESPONSE 4 + +typedef struct hv_vmbus_pipe_hdr { + uint32_t flags; + uint32_t msgsize; +} __packed hv_vmbus_pipe_hdr; + +typedef struct hv_vmbus_ic_version { + uint16_t major; + uint16_t minor; +} __packed hv_vmbus_ic_version; + +typedef struct hv_vmbus_icmsg_hdr { + hv_vmbus_ic_version icverframe; + uint16_t icmsgtype; + hv_vmbus_ic_version icvermsg; + uint16_t icmsgsize; + uint32_t status; + uint8_t ictransaction_id; + uint8_t icflags; + uint8_t reserved[2]; +} __packed hv_vmbus_icmsg_hdr; + +typedef struct hv_vmbus_icmsg_negotiate { + uint16_t icframe_vercnt; + uint16_t icmsg_vercnt; + uint32_t reserved; + hv_vmbus_ic_version icversion_data[1]; /* any size array */ +} __packed hv_vmbus_icmsg_negotiate; + +#endif 
/* !_HV_UTILREG_H_ */ diff --git a/sys/dev/hyperv/utilities/unicode.h b/sys/dev/hyperv/utilities/unicode.h new file mode 100644 index 000000000000..696777cbbf26 --- /dev/null +++ b/sys/dev/hyperv/utilities/unicode.h @@ -0,0 +1,201 @@ +/* $NetBSD: unicode.h,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */ + +/*- + * Copyright (c) 2007 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Dieter Baron. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
/*
 * (The license header that starts on the previous line ends here.)
 *
 * $FreeBSD$
 */

#include <sys/types.h>

#define UNICODE_DECOMPOSE		0x01
#define UNICODE_PRECOMPOSE		0x02
#define UNICODE_UTF8_LATIN1_FALLBACK	0x03

size_t utf8_to_utf16(uint16_t *, size_t, const char *, size_t, int, int *);
size_t utf16_to_utf8(char *, size_t, const uint16_t *, size_t, int, int *);

/*
 * Convert src_len bytes of UTF-8 at src into UTF-16 code units.
 *
 * dst/dst_len	output buffer and its capacity in 16-bit units; units that do
 *		not fit are counted but not stored, and dst may be NULL to
 *		only measure.
 * flags	UNICODE_* bits; only the Latin-1 fallback is looked at here.
 * errp		if not NULL, receives the number of malformed sequences that
 *		were skipped.
 *
 * Returns the number of UTF-16 units the complete conversion needs, which
 * may be larger than dst_len.
 */
size_t
utf8_to_utf16(uint16_t *dst, size_t dst_len,
	      const char *src, size_t src_len,
	      int flags, int *errp)
{
	const unsigned char *s;
	size_t spos, dpos;
	int error;
	uint16_t c;

#define IS_CONT(c)	(((c)&0xc0) == 0x80)
/* true when byte spos+k exists inside the input AND is a continuation byte */
#define HAS_CONT(k)	(spos + (k) < src_len && IS_CONT(s[spos + (k)]))

	error = 0;
	s = (const unsigned char *)src;
	spos = dpos = 0;
	while (spos < src_len) {
		if (s[spos] < 0x80) {
			c = s[spos++];
		}
		/*
		 * NOTE(review): UNICODE_UTF8_LATIN1_FALLBACK is 0x03, so this
		 * test is also true when only UNICODE_DECOMPOSE or
		 * UNICODE_PRECOMPOSE is passed.  Left as is because existing
		 * callers may depend on it -- confirm before changing.
		 */
		else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
		    && !HAS_CONT(1) && s[spos] >= 0xa0) {
			/* not valid UTF-8, assume ISO 8859-1 */
			c = s[spos++];
		}
		else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
			/* continuation byte without lead byte
			   or lead byte for codepoint above 0x10ffff */
			error++;
			spos++;
			continue;
		}
		else if (s[spos] < 0xe0) {
			/* two-byte sequence: 110xxxxx 10xxxxxx */
			if (!HAS_CONT(1)) {
				spos++;
				error++;
				continue;
			}
			c = ((s[spos] & 0x1f) << 6) | (s[spos+1] & 0x3f);
			spos += 2;
			if (c < 0x80) {
				/* overlong encoding */
				error++;
				continue;
			}
		}
		else if (s[spos] < 0xf0) {
			/* three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx */
			if (!HAS_CONT(1) || !HAS_CONT(2)) {
				spos++;
				error++;
				continue;
			}
			c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
			    | (s[spos+2] & 0x3f);
			spos += 3;
			if (c < 0x800 || (c & 0xf800) == 0xd800) {
				/* overlong encoding or encoded surrogate
				   (U+D800..U+DFFF) */
				error++;
				continue;
			}
		}
		else {
			uint32_t cc;
			/* four-byte sequence, becomes a UTF-16 surrogate pair */

			if (!HAS_CONT(1) || !HAS_CONT(2) || !HAS_CONT(3)) {
				spos++;
				error++;
				continue;
			}
			/* the lead byte 11110xxx carries three payload bits */
			cc = ((uint32_t)(s[spos] & 0x07) << 18)
			    | ((uint32_t)(s[spos+1] & 0x3f) << 12)
			    | ((uint32_t)(s[spos+2] & 0x3f) << 6)
			    | (uint32_t)(s[spos+3] & 0x3f);
			spos += 4;
			if (cc < 0x10000 || cc > 0x10ffff) {
				/* overlong encoding or beyond U+10FFFF */
				error++;
				continue;
			}
			cc -= 0x10000;
			/* high surrogate: top ten of the twenty bits */
			if (dst && dpos < dst_len)
				dst[dpos] = (uint16_t)(0xd800 | (cc >> 10));
			dpos++;
			/* low surrogate: bottom ten bits, stored below */
			c = (uint16_t)(0xdc00 | (cc & 0x3ff));
		}

		if (dst && dpos < dst_len)
			dst[dpos] = c;
		dpos++;
	}

	if (errp)
		*errp = error;

	return dpos;

#undef HAS_CONT
#undef IS_CONT
}


/*
 * Convert src_len UTF-16 code units at src into UTF-8.
 *
 * dst/dst_len	output buffer and its capacity in bytes.  A character is
 *		stored only if all of its bytes fit; after the first
 *		character that does not fit nothing more is stored, but the
 *		bytes keep being counted.  dst may be NULL to only measure.
 * flags	not used by this function.
 * errp		if not NULL, receives the number of unpaired surrogates that
 *		were skipped.
 *
 * Returns the number of bytes the complete conversion needs, which may be
 * larger than dst_len.  No terminating NUL is written.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
	      const uint16_t *src, size_t src_len,
	      int flags, int *errp)
{
	/* size_t, not uint16_t: the indices must cover the whole input */
	size_t spos, dpos;
	int error;

/* stop storing once the next l bytes no longer fit (overflow-safe form) */
#define CHECK_LENGTH(l)	do {						\
	if (dst_len < (size_t)(l) || dpos > dst_len - (size_t)(l))	\
		dst = NULL;						\
} while (0)
#define ADD_BYTE(b)	do {						\
	if (dst != NULL)						\
		dst[dpos] = (char)(b);					\
	dpos++;								\
} while (0)

	(void)flags;
	error = 0;
	dpos = 0;
	for (spos = 0; spos < src_len; spos++) {
		if (src[spos] < 0x80) {
			CHECK_LENGTH(1);
			ADD_BYTE(src[spos]);
		}
		else if (src[spos] < 0x800) {
			CHECK_LENGTH(2);
			ADD_BYTE(0xc0 | (src[spos]>>6));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
		else if ((src[spos] & 0xfc00) == 0xd800) {
			uint32_t c;
			/* first (high) surrogate: the NEXT unit must be a
			   low surrogate */
			if (spos + 1 >= src_len
			    || (src[spos+1] & 0xfc00) != 0xdc00) {
				/* no second surrogate present */
				error++;
				continue;
			}
			c = ((((uint32_t)src[spos] & 0x3ff) << 10)
			    | ((uint32_t)src[spos+1] & 0x3ff)) + 0x10000;
			/* the low surrogate has been consumed as well */
			spos++;
			CHECK_LENGTH(4);
			ADD_BYTE(0xf0 | (c>>18));
			ADD_BYTE(0x80 | ((c>>12) & 0x3f));
			ADD_BYTE(0x80 | ((c>>6) & 0x3f));
			ADD_BYTE(0x80 | (c & 0x3f));
		}
		else if ((src[spos] & 0xfc00) == 0xdc00) {
			/* second surrogate without preceding first surrogate */
			error++;
		}
		else {
			CHECK_LENGTH(3);
			ADD_BYTE(0xe0 | src[spos]>>12);
			ADD_BYTE(0x80 | ((src[spos]>>6) & 0x3f));
			ADD_BYTE(0x80 | (src[spos] & 0x3f));
		}
	}

	if (errp)
		*errp = error;

	return dpos;

#undef ADD_BYTE
#undef CHECK_LENGTH
}

/*
 * The scraped diff continues at this point with the next file; its header
 * and the first lines of its license are kept verbatim:
 *
 * diff --git a/sys/dev/hyperv/utilities/vmbus_heartbeat.c b/sys/dev/hyperv/utilities/vmbus_heartbeat.c
 * new file mode 100644
 * index 000000000000..f15b94822aa9
 * --- /dev/null
 * +++ b/sys/dev/hyperv/utilities/vmbus_heartbeat.c
 * @@ -0,0 +1,152 @@
 * +/*-
 * + * Copyright (c) 2014,2016 Microsoft Corp.
 * + * All rights reserved.
 */
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/systm.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/vmbus.h> +#include <dev/hyperv/utilities/vmbus_icreg.h> +#include <dev/hyperv/utilities/vmbus_icvar.h> + +#define VMBUS_HEARTBEAT_FWVER_MAJOR 3 +#define VMBUS_HEARTBEAT_FWVER \ + VMBUS_IC_VERSION(VMBUS_HEARTBEAT_FWVER_MAJOR, 0) + +#define VMBUS_HEARTBEAT_MSGVER_MAJOR 3 +#define VMBUS_HEARTBEAT_MSGVER \ + VMBUS_IC_VERSION(VMBUS_HEARTBEAT_MSGVER_MAJOR, 0) + +static int vmbus_heartbeat_probe(device_t); +static int vmbus_heartbeat_attach(device_t); + +static const struct vmbus_ic_desc vmbus_heartbeat_descs[] = { + { + .ic_guid = { .hv_guid = { + 0x39, 0x4f, 0x16, 0x57, 0x15, 0x91, 0x78, 0x4e, + 0xab, 0x55, 0x38, 0x2f, 0x3b, 0xd5, 0x42, 0x2d} }, + .ic_desc = "Hyper-V Heartbeat" + }, + VMBUS_IC_DESC_END +}; + +static device_method_t vmbus_heartbeat_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, vmbus_heartbeat_probe), + DEVMETHOD(device_attach, vmbus_heartbeat_attach), + DEVMETHOD(device_detach, vmbus_ic_detach), + DEVMETHOD_END +}; + +static driver_t vmbus_heartbeat_driver = { + "hvheartbeat", + vmbus_heartbeat_methods, + sizeof(struct vmbus_ic_softc) +}; + +static devclass_t vmbus_heartbeat_devclass; + +DRIVER_MODULE(hv_heartbeat, vmbus, vmbus_heartbeat_driver, + vmbus_heartbeat_devclass, NULL, NULL); +MODULE_VERSION(hv_heartbeat, 1); +MODULE_DEPEND(hv_heartbeat, vmbus, 1, 1, 1); + +static void +vmbus_heartbeat_cb(struct vmbus_channel *chan, void *xsc) +{ + struct vmbus_ic_softc *sc = xsc; + struct vmbus_icmsg_hdr *hdr; + int dlen, error; + uint64_t xactid; + void *data; + + /* + * Receive request. 
+ */ + data = sc->ic_buf; + dlen = sc->ic_buflen; + error = vmbus_chan_recv(chan, data, &dlen, &xactid); + KASSERT(error != ENOBUFS, ("icbuf is not large enough")); + if (error) + return; + + if (dlen < sizeof(*hdr)) { + device_printf(sc->ic_dev, "invalid data len %d\n", dlen); + return; + } + hdr = data; + + /* + * Update request, which will be echoed back as response. + */ + switch (hdr->ic_type) { + case VMBUS_ICMSG_TYPE_NEGOTIATE: + error = vmbus_ic_negomsg(sc, data, &dlen, + VMBUS_HEARTBEAT_FWVER, VMBUS_HEARTBEAT_MSGVER); + if (error) + return; + break; + + case VMBUS_ICMSG_TYPE_HEARTBEAT: + /* Only ic_seq is a must */ + if (dlen < VMBUS_ICMSG_HEARTBEAT_SIZE_MIN) { + device_printf(sc->ic_dev, "invalid heartbeat len %d\n", + dlen); + return; + } + ((struct vmbus_icmsg_heartbeat *)data)->ic_seq++; + break; + + default: + device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type); + break; + } + + /* + * Send response by echoing the request back. + */ + vmbus_ic_sendresp(sc, chan, data, dlen, xactid); +} + +static int +vmbus_heartbeat_probe(device_t dev) +{ + + return (vmbus_ic_probe(dev, vmbus_heartbeat_descs)); +} + +static int +vmbus_heartbeat_attach(device_t dev) +{ + + return (vmbus_ic_attach(dev, vmbus_heartbeat_cb)); +} diff --git a/sys/dev/hyperv/utilities/vmbus_ic.c b/sys/dev/hyperv/utilities/vmbus_ic.c new file mode 100644 index 000000000000..574670053918 --- /dev/null +++ b/sys/dev/hyperv/utilities/vmbus_ic.c @@ -0,0 +1,299 @@ +/*- + * Copyright (c) 2014,2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/systm.h> +#include <sys/sysctl.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/vmbus.h> +#include <dev/hyperv/utilities/vmbus_icreg.h> +#include <dev/hyperv/utilities/vmbus_icvar.h> + +#include "vmbus_if.h" + +#define VMBUS_IC_BRSIZE (4 * PAGE_SIZE) + +#define VMBUS_IC_VERCNT 2 +#define VMBUS_IC_NEGOSZ \ + __offsetof(struct vmbus_icmsg_negotiate, ic_ver[VMBUS_IC_VERCNT]) +CTASSERT(VMBUS_IC_NEGOSZ < VMBUS_IC_BRSIZE); + +static int vmbus_ic_fwver_sysctl(SYSCTL_HANDLER_ARGS); +static int vmbus_ic_msgver_sysctl(SYSCTL_HANDLER_ARGS); + +int +vmbus_ic_negomsg(struct vmbus_ic_softc *sc, void *data, int *dlen0, + uint32_t fw_ver, uint32_t msg_ver) +{ + struct vmbus_icmsg_negotiate *nego; + int i, cnt, dlen = *dlen0, error; + uint32_t sel_fw_ver, sel_msg_ver; + bool has_fw_ver, has_msg_ver; + + /* + * Preliminary message verification. 
+ */ + if (dlen < sizeof(*nego)) { + device_printf(sc->ic_dev, "truncated ic negotiate, len %d\n", + dlen); + return (EINVAL); + } + nego = data; + + if (nego->ic_fwver_cnt == 0) { + device_printf(sc->ic_dev, "ic negotiate does not contain " + "framework version %u\n", nego->ic_fwver_cnt); + return (EINVAL); + } + if (nego->ic_msgver_cnt == 0) { + device_printf(sc->ic_dev, "ic negotiate does not contain " + "message version %u\n", nego->ic_msgver_cnt); + return (EINVAL); + } + + cnt = nego->ic_fwver_cnt + nego->ic_msgver_cnt; + if (dlen < __offsetof(struct vmbus_icmsg_negotiate, ic_ver[cnt])) { + device_printf(sc->ic_dev, "ic negotiate does not contain " + "versions %d\n", dlen); + return (EINVAL); + } + + error = EOPNOTSUPP; + + /* + * Find the best match framework version. + */ + has_fw_ver = false; + for (i = 0; i < nego->ic_fwver_cnt; ++i) { + if (VMBUS_ICVER_LE(nego->ic_ver[i], fw_ver)) { + if (!has_fw_ver) { + sel_fw_ver = nego->ic_ver[i]; + has_fw_ver = true; + } else if (VMBUS_ICVER_GT(nego->ic_ver[i], + sel_fw_ver)) { + sel_fw_ver = nego->ic_ver[i]; + } + } + } + if (!has_fw_ver) { + device_printf(sc->ic_dev, "failed to select framework " + "version\n"); + goto done; + } + + /* + * Fine the best match message version. 
+ */ + has_msg_ver = false; + for (i = nego->ic_fwver_cnt; + i < nego->ic_fwver_cnt + nego->ic_msgver_cnt; ++i) { + if (VMBUS_ICVER_LE(nego->ic_ver[i], msg_ver)) { + if (!has_msg_ver) { + sel_msg_ver = nego->ic_ver[i]; + has_msg_ver = true; + } else if (VMBUS_ICVER_GT(nego->ic_ver[i], + sel_msg_ver)) { + sel_msg_ver = nego->ic_ver[i]; + } + } + } + if (!has_msg_ver) { + device_printf(sc->ic_dev, "failed to select message " + "version\n"); + goto done; + } + + error = 0; +done: + if (bootverbose || !has_fw_ver || !has_msg_ver) { + if (has_fw_ver) { + device_printf(sc->ic_dev, "sel framework version: " + "%u.%u\n", + VMBUS_ICVER_MAJOR(sel_fw_ver), + VMBUS_ICVER_MINOR(sel_fw_ver)); + } + for (i = 0; i < nego->ic_fwver_cnt; i++) { + device_printf(sc->ic_dev, "supp framework version: " + "%u.%u\n", + VMBUS_ICVER_MAJOR(nego->ic_ver[i]), + VMBUS_ICVER_MINOR(nego->ic_ver[i])); + } + + if (has_msg_ver) { + device_printf(sc->ic_dev, "sel message version: " + "%u.%u\n", + VMBUS_ICVER_MAJOR(sel_msg_ver), + VMBUS_ICVER_MINOR(sel_msg_ver)); + } + for (i = nego->ic_fwver_cnt; + i < nego->ic_fwver_cnt + nego->ic_msgver_cnt; i++) { + device_printf(sc->ic_dev, "supp message version: " + "%u.%u\n", + VMBUS_ICVER_MAJOR(nego->ic_ver[i]), + VMBUS_ICVER_MINOR(nego->ic_ver[i])); + } + } + if (error) + return (error); + + /* Record the selected versions. */ + sc->ic_fwver = sel_fw_ver; + sc->ic_msgver = sel_msg_ver; + + /* One framework version. */ + nego->ic_fwver_cnt = 1; + nego->ic_ver[0] = sel_fw_ver; + + /* One message version. */ + nego->ic_msgver_cnt = 1; + nego->ic_ver[1] = sel_msg_ver; + + /* Update data size. */ + nego->ic_hdr.ic_dsize = VMBUS_IC_NEGOSZ - + sizeof(struct vmbus_icmsg_hdr); + + /* Update total size, if necessary. 
*/ + if (dlen < VMBUS_IC_NEGOSZ) + *dlen0 = VMBUS_IC_NEGOSZ; + + return (0); +} + +int +vmbus_ic_probe(device_t dev, const struct vmbus_ic_desc descs[]) +{ + device_t bus = device_get_parent(dev); + const struct vmbus_ic_desc *d; + + if (resource_disabled(device_get_name(dev), 0)) + return (ENXIO); + + for (d = descs; d->ic_desc != NULL; ++d) { + if (VMBUS_PROBE_GUID(bus, dev, &d->ic_guid) == 0) { + device_set_desc(dev, d->ic_desc); + return (BUS_PROBE_DEFAULT); + } + } + return (ENXIO); +} + +int +vmbus_ic_attach(device_t dev, vmbus_chan_callback_t cb) +{ + struct vmbus_ic_softc *sc = device_get_softc(dev); + struct vmbus_channel *chan = vmbus_get_channel(dev); + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + int error; + + sc->ic_dev = dev; + sc->ic_buflen = VMBUS_IC_BRSIZE; + sc->ic_buf = malloc(VMBUS_IC_BRSIZE, M_DEVBUF, M_WAITOK | M_ZERO); + + /* + * These services are not performance critical and do not need + * batched reading. Furthermore, some services such as KVP can + * only handle one message from the host at a time. + * Turn off batched reading for all util drivers before we open the + * channel. 
+ */ + vmbus_chan_set_readbatch(chan, false); + + error = vmbus_chan_open(chan, VMBUS_IC_BRSIZE, VMBUS_IC_BRSIZE, NULL, 0, + cb, sc); + if (error) { + free(sc->ic_buf, M_DEVBUF); + return (error); + } + + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "fw_version", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + vmbus_ic_fwver_sysctl, "A", "framework version"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "msg_version", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + vmbus_ic_msgver_sysctl, "A", "message version"); + + return (0); +} + +static int +vmbus_ic_fwver_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct vmbus_ic_softc *sc = arg1; + char verstr[16]; + + snprintf(verstr, sizeof(verstr), "%u.%u", + VMBUS_ICVER_MAJOR(sc->ic_fwver), VMBUS_ICVER_MINOR(sc->ic_fwver)); + return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); +} + +static int +vmbus_ic_msgver_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct vmbus_ic_softc *sc = arg1; + char verstr[16]; + + snprintf(verstr, sizeof(verstr), "%u.%u", + VMBUS_ICVER_MAJOR(sc->ic_msgver), VMBUS_ICVER_MINOR(sc->ic_msgver)); + return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); +} + +int +vmbus_ic_detach(device_t dev) +{ + struct vmbus_ic_softc *sc = device_get_softc(dev); + + vmbus_chan_close(vmbus_get_channel(dev)); + free(sc->ic_buf, M_DEVBUF); + + return (0); +} + +int +vmbus_ic_sendresp(struct vmbus_ic_softc *sc, struct vmbus_channel *chan, + void *data, int dlen, uint64_t xactid) +{ + struct vmbus_icmsg_hdr *hdr; + int error; + + KASSERT(dlen >= sizeof(*hdr), ("invalid data length %d", dlen)); + hdr = data; + + hdr->ic_flags = VMBUS_ICMSG_FLAG_XACT | VMBUS_ICMSG_FLAG_RESP; + error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, 0, + data, dlen, xactid); + if (error) + device_printf(sc->ic_dev, "resp send failed: %d\n", error); + return (error); +} diff --git a/sys/dev/hyperv/utilities/vmbus_icreg.h 
b/sys/dev/hyperv/utilities/vmbus_icreg.h new file mode 100644 index 000000000000..e962102d13dd --- /dev/null +++ b/sys/dev/hyperv/utilities/vmbus_icreg.h @@ -0,0 +1,135 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMBUS_ICREG_H_ +#define _VMBUS_ICREG_H_ + +#define VMBUS_ICMSG_TYPE_NEGOTIATE 0 +#define VMBUS_ICMSG_TYPE_HEARTBEAT 1 +#define VMBUS_ICMSG_TYPE_KVP 2 +#define VMBUS_ICMSG_TYPE_SHUTDOWN 3 +#define VMBUS_ICMSG_TYPE_TIMESYNC 4 +#define VMBUS_ICMSG_TYPE_VSS 5 + +#define VMBUS_ICMSG_STATUS_OK 0x00000000 +#define VMBUS_ICMSG_STATUS_FAIL 0x80004005 + +#define VMBUS_IC_VERSION(major, minor) ((major) | (((uint32_t)(minor)) << 16)) +#define VMBUS_ICVER_MAJOR(ver) ((ver) & 0xffff) +#define VMBUS_ICVER_MINOR(ver) (((ver) & 0xffff0000) >> 16) +#define VMBUS_ICVER_SWAP(ver) \ + ((VMBUS_ICVER_MAJOR((ver)) << 16) | VMBUS_ICVER_MINOR((ver))) +#define VMBUS_ICVER_LE(v1, v2) \ + (VMBUS_ICVER_SWAP((v1)) <= VMBUS_ICVER_SWAP((v2))) +#define VMBUS_ICVER_GT(v1, v2) \ + (VMBUS_ICVER_SWAP((v1)) > VMBUS_ICVER_SWAP((v2))) + +struct vmbus_pipe_hdr { + uint32_t ph_flags; + uint32_t ph_msgsz; +} __packed; + +struct vmbus_icmsg_hdr { + struct vmbus_pipe_hdr ic_pipe; + uint32_t ic_fwver; /* framework version */ + uint16_t ic_type; + uint32_t ic_msgver; /* message version */ + uint16_t ic_dsize; /* data size */ + uint32_t ic_status; /* VMBUS_ICMSG_STATUS_ */ + uint8_t ic_xactid; + uint8_t ic_flags; /* VMBUS_ICMSG_FLAG_ */ + uint8_t ic_rsvd[2]; +} __packed; + +#define VMBUS_ICMSG_FLAG_XACT 0x0001 +#define VMBUS_ICMSG_FLAG_REQ 0x0002 +#define VMBUS_ICMSG_FLAG_RESP 0x0004 + +/* VMBUS_ICMSG_TYPE_NEGOTIATE */ +struct vmbus_icmsg_negotiate { + struct vmbus_icmsg_hdr ic_hdr; + uint16_t ic_fwver_cnt; + uint16_t ic_msgver_cnt; + uint32_t ic_rsvd; + /* + * This version array contains two set of supported + * versions: + * - The first set consists of #ic_fwver_cnt supported framework + * versions. + * - The second set consists of #ic_msgver_cnt supported message + * versions. 
+ */ + uint32_t ic_ver[]; +} __packed; + +/* VMBUS_ICMSG_TYPE_HEARTBEAT */ +struct vmbus_icmsg_heartbeat { + struct vmbus_icmsg_hdr ic_hdr; + uint64_t ic_seq; + uint32_t ic_rsvd[8]; +} __packed; + +#define VMBUS_ICMSG_HEARTBEAT_SIZE_MIN \ + __offsetof(struct vmbus_icmsg_heartbeat, ic_rsvd[0]) + +/* VMBUS_ICMSG_TYPE_SHUTDOWN */ +struct vmbus_icmsg_shutdown { + struct vmbus_icmsg_hdr ic_hdr; + uint32_t ic_code; + uint32_t ic_timeo; + uint32_t ic_haltflags; + uint8_t ic_msg[2048]; +} __packed; + +#define VMBUS_ICMSG_SHUTDOWN_SIZE_MIN \ + __offsetof(struct vmbus_icmsg_shutdown, ic_msg[0]) + +/* VMBUS_ICMSG_TYPE_TIMESYNC */ +struct vmbus_icmsg_timesync { + struct vmbus_icmsg_hdr ic_hdr; + uint64_t ic_hvtime; + uint64_t ic_vmtime; + uint64_t ic_rtt; + uint8_t ic_tsflags; /* VMBUS_ICMSG_TS_FLAG_ */ +} __packed; + +/* VMBUS_ICMSG_TYPE_TIMESYNC, MSGVER4 */ +struct vmbus_icmsg_timesync4 { + struct vmbus_icmsg_hdr ic_hdr; + uint64_t ic_hvtime; + uint64_t ic_sent_tc; + uint8_t ic_tsflags; /* VMBUS_ICMSG_TS_FLAG_ */ + uint8_t ic_rsvd[5]; +} __packed; + +#define VMBUS_ICMSG_TS_FLAG_SYNC 0x01 +#define VMBUS_ICMSG_TS_FLAG_SAMPLE 0x02 + +#define VMBUS_ICMSG_TS_BASE 116444736000000000ULL + +#endif /* !_VMBUS_ICREG_H_ */ diff --git a/sys/dev/hyperv/utilities/vmbus_icvar.h b/sys/dev/hyperv/utilities/vmbus_icvar.h new file mode 100644 index 000000000000..a60ecfed58a2 --- /dev/null +++ b/sys/dev/hyperv/utilities/vmbus_icvar.h @@ -0,0 +1,61 @@ +/*- + * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMBUS_ICVAR_H_ +#define _VMBUS_ICVAR_H_ + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/vmbus.h> + +struct vmbus_ic_softc { + device_t ic_dev; + uint8_t *ic_buf; + int ic_buflen; + uint32_t ic_fwver; /* framework version */ + uint32_t ic_msgver; /* message version */ +}; + +struct vmbus_ic_desc { + const struct hyperv_guid ic_guid; + const char *ic_desc; +}; + +#define VMBUS_IC_DESC_END { .ic_desc = NULL } + +int vmbus_ic_attach(device_t dev, vmbus_chan_callback_t cb); +int vmbus_ic_detach(device_t dev); +int vmbus_ic_probe(device_t dev, const struct vmbus_ic_desc descs[]); +int vmbus_ic_negomsg(struct vmbus_ic_softc *sc, void *data, + int *dlen, uint32_t fw_ver, uint32_t msg_ver); +int vmbus_ic_sendresp(struct vmbus_ic_softc *sc, + struct vmbus_channel *chan, void *data, int dlen, + uint64_t xactid); + +#endif /* !_VMBUS_ICVAR_H_ */ diff --git a/sys/dev/hyperv/utilities/vmbus_shutdown.c b/sys/dev/hyperv/utilities/vmbus_shutdown.c new file mode 100644 index 
000000000000..7e54dc9866bb --- /dev/null +++ b/sys/dev/hyperv/utilities/vmbus_shutdown.c @@ -0,0 +1,167 @@ +/*- + * Copyright (c) 2014,2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/reboot.h> +#include <sys/systm.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/vmbus.h> +#include <dev/hyperv/utilities/vmbus_icreg.h> +#include <dev/hyperv/utilities/vmbus_icvar.h> + +#define VMBUS_SHUTDOWN_FWVER_MAJOR 3 +#define VMBUS_SHUTDOWN_FWVER \ + VMBUS_IC_VERSION(VMBUS_SHUTDOWN_FWVER_MAJOR, 0) + +#define VMBUS_SHUTDOWN_MSGVER_MAJOR 3 +#define VMBUS_SHUTDOWN_MSGVER \ + VMBUS_IC_VERSION(VMBUS_SHUTDOWN_MSGVER_MAJOR, 0) + +static int vmbus_shutdown_probe(device_t); +static int vmbus_shutdown_attach(device_t); + +static const struct vmbus_ic_desc vmbus_shutdown_descs[] = { + { + .ic_guid = { .hv_guid = { + 0x31, 0x60, 0x0b, 0x0e, 0x13, 0x52, 0x34, 0x49, + 0x81, 0x8b, 0x38, 0xd9, 0x0c, 0xed, 0x39, 0xdb } }, + .ic_desc = "Hyper-V Shutdown" + }, + VMBUS_IC_DESC_END +}; + +static device_method_t vmbus_shutdown_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, vmbus_shutdown_probe), + DEVMETHOD(device_attach, vmbus_shutdown_attach), + DEVMETHOD(device_detach, vmbus_ic_detach), + DEVMETHOD_END +}; + +static driver_t vmbus_shutdown_driver = { + "hvshutdown", + vmbus_shutdown_methods, + sizeof(struct vmbus_ic_softc) +}; + +static devclass_t vmbus_shutdown_devclass; + +DRIVER_MODULE(hv_shutdown, vmbus, vmbus_shutdown_driver, + vmbus_shutdown_devclass, NULL, NULL); +MODULE_VERSION(hv_shutdown, 1); +MODULE_DEPEND(hv_shutdown, vmbus, 1, 1, 1); + +static void +vmbus_shutdown_cb(struct vmbus_channel *chan, void *xsc) +{ + struct vmbus_ic_softc *sc = xsc; + struct vmbus_icmsg_hdr *hdr; + struct vmbus_icmsg_shutdown *msg; + int dlen, error, do_shutdown = 0; + uint64_t xactid; + void *data; + + /* + * Receive request. 
+ */ + data = sc->ic_buf; + dlen = sc->ic_buflen; + error = vmbus_chan_recv(chan, data, &dlen, &xactid); + KASSERT(error != ENOBUFS, ("icbuf is not large enough")); + if (error) + return; + + if (dlen < sizeof(*hdr)) { + device_printf(sc->ic_dev, "invalid data len %d\n", dlen); + return; + } + hdr = data; + + /* + * Update request, which will be echoed back as response. + */ + switch (hdr->ic_type) { + case VMBUS_ICMSG_TYPE_NEGOTIATE: + error = vmbus_ic_negomsg(sc, data, &dlen, + VMBUS_SHUTDOWN_FWVER, VMBUS_SHUTDOWN_MSGVER); + if (error) + return; + break; + + case VMBUS_ICMSG_TYPE_SHUTDOWN: + if (dlen < VMBUS_ICMSG_SHUTDOWN_SIZE_MIN) { + device_printf(sc->ic_dev, "invalid shutdown len %d\n", + dlen); + return; + } + msg = data; + + /* XXX ic_flags definition? */ + if (msg->ic_haltflags == 0 || msg->ic_haltflags == 1) { + device_printf(sc->ic_dev, "shutdown requested\n"); + hdr->ic_status = VMBUS_ICMSG_STATUS_OK; + do_shutdown = 1; + } else { + device_printf(sc->ic_dev, "unknown shutdown flags " + "0x%08x\n", msg->ic_haltflags); + hdr->ic_status = VMBUS_ICMSG_STATUS_FAIL; + } + break; + + default: + device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type); + break; + } + + /* + * Send response by echoing the request back. + */ + vmbus_ic_sendresp(sc, chan, data, dlen, xactid); + + if (do_shutdown) + shutdown_nice(RB_POWEROFF); +} + +static int +vmbus_shutdown_probe(device_t dev) +{ + + return (vmbus_ic_probe(dev, vmbus_shutdown_descs)); +} + +static int +vmbus_shutdown_attach(device_t dev) +{ + + return (vmbus_ic_attach(dev, vmbus_shutdown_cb)); +} diff --git a/sys/dev/hyperv/utilities/vmbus_timesync.c b/sys/dev/hyperv/utilities/vmbus_timesync.c new file mode 100644 index 000000000000..2a8d3a988b43 --- /dev/null +++ b/sys/dev/hyperv/utilities/vmbus_timesync.c @@ -0,0 +1,260 @@ +/*- + * Copyright (c) 2014,2016-2017 Microsoft Corp. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/syscallsubr.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/vmbus.h> +#include <dev/hyperv/utilities/vmbus_icreg.h> +#include <dev/hyperv/utilities/vmbus_icvar.h> + +#define VMBUS_TIMESYNC_FWVER_MAJOR 3 +#define VMBUS_TIMESYNC_FWVER \ + VMBUS_IC_VERSION(VMBUS_TIMESYNC_FWVER_MAJOR, 0) + +#define VMBUS_TIMESYNC_MSGVER_MAJOR 4 +#define VMBUS_TIMESYNC_MSGVER \ + VMBUS_IC_VERSION(VMBUS_TIMESYNC_MSGVER_MAJOR, 0) + +#define VMBUS_TIMESYNC_MSGVER4(sc) \ + VMBUS_ICVER_LE(VMBUS_IC_VERSION(4, 0), (sc)->ic_msgver) + +#define VMBUS_TIMESYNC_DORTT(sc) \ + (VMBUS_TIMESYNC_MSGVER4((sc)) && hyperv_tc64 != NULL) + +static int vmbus_timesync_probe(device_t); +static int vmbus_timesync_attach(device_t); + +static const struct vmbus_ic_desc vmbus_timesync_descs[] = { + { + .ic_guid = { .hv_guid = { + 0x30, 0xe6, 0x27, 0x95, 0xae, 0xd0, 0x7b, 0x49, + 0xad, 0xce, 0xe8, 0x0a, 0xb0, 0x17, 0x5c, 0xaf } }, + .ic_desc = "Hyper-V Timesync" + }, + VMBUS_IC_DESC_END +}; + +static device_method_t vmbus_timesync_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, vmbus_timesync_probe), + DEVMETHOD(device_attach, vmbus_timesync_attach), + DEVMETHOD(device_detach, vmbus_ic_detach), + DEVMETHOD_END +}; + +static driver_t vmbus_timesync_driver = { + "hvtimesync", + vmbus_timesync_methods, + sizeof(struct vmbus_ic_softc) +}; + +static devclass_t vmbus_timesync_devclass; + +DRIVER_MODULE(hv_timesync, vmbus, vmbus_timesync_driver, + vmbus_timesync_devclass, NULL, NULL); +MODULE_VERSION(hv_timesync, 1); +MODULE_DEPEND(hv_timesync, vmbus, 1, 1, 1); + +SYSCTL_NODE(_hw, OID_AUTO, hvtimesync, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + "Hyper-V timesync interface"); + +static int vmbus_ts_ignore_sync = 0; +SYSCTL_INT(_hw_hvtimesync, OID_AUTO, ignore_sync, 
CTLFLAG_RWTUN, + &vmbus_ts_ignore_sync, 0, "Ignore the sync request."); + +/* + * Trigger sample sync when drift exceeds threshold (ms). + * Ignore the sample request when set to 0. + */ +static int vmbus_ts_sample_thresh = 100; +SYSCTL_INT(_hw_hvtimesync, OID_AUTO, sample_thresh, CTLFLAG_RWTUN, + &vmbus_ts_sample_thresh, 0, + "Threshold that makes sample request trigger the sync (unit: ms)."); + +static int vmbus_ts_sample_verbose = 0; +SYSCTL_INT(_hw_hvtimesync, OID_AUTO, sample_verbose, CTLFLAG_RWTUN, + &vmbus_ts_sample_verbose, 0, "Increase sample request verbosity."); + +static void +vmbus_timesync(struct vmbus_ic_softc *sc, uint64_t hvtime, uint64_t sent_tc, + uint8_t tsflags) +{ + struct timespec vm_ts; + uint64_t hv_ns, vm_ns, rtt = 0; + + if (VMBUS_TIMESYNC_DORTT(sc)) + rtt = hyperv_tc64() - sent_tc; + + hv_ns = (hvtime - VMBUS_ICMSG_TS_BASE + rtt) * HYPERV_TIMER_NS_FACTOR; + nanotime(&vm_ts); + vm_ns = (vm_ts.tv_sec * NANOSEC) + vm_ts.tv_nsec; + + if ((tsflags & VMBUS_ICMSG_TS_FLAG_SYNC) && !vmbus_ts_ignore_sync) { + struct timespec hv_ts; + + if (bootverbose) { + device_printf(sc->ic_dev, "apply sync request, " + "hv: %ju, vm: %ju\n", + (uintmax_t)hv_ns, (uintmax_t)vm_ns); + } + hv_ts.tv_sec = hv_ns / NANOSEC; + hv_ts.tv_nsec = hv_ns % NANOSEC; + kern_clock_settime(curthread, CLOCK_REALTIME, &hv_ts); + /* Done! 
*/ + return; + } + + if ((tsflags & VMBUS_ICMSG_TS_FLAG_SAMPLE) && + vmbus_ts_sample_thresh >= 0) { + int64_t diff; + + if (vmbus_ts_sample_verbose) { + device_printf(sc->ic_dev, "sample request, " + "hv: %ju, vm: %ju\n", + (uintmax_t)hv_ns, (uintmax_t)vm_ns); + } + + if (hv_ns > vm_ns) + diff = hv_ns - vm_ns; + else + diff = vm_ns - hv_ns; + /* nanosec -> millisec */ + diff /= 1000000; + + if (diff > vmbus_ts_sample_thresh) { + struct timespec hv_ts; + + if (bootverbose) { + device_printf(sc->ic_dev, + "apply sample request, hv: %ju, vm: %ju\n", + (uintmax_t)hv_ns, (uintmax_t)vm_ns); + } + hv_ts.tv_sec = hv_ns / NANOSEC; + hv_ts.tv_nsec = hv_ns % NANOSEC; + kern_clock_settime(curthread, CLOCK_REALTIME, &hv_ts); + } + /* Done */ + return; + } +} + +static void +vmbus_timesync_cb(struct vmbus_channel *chan, void *xsc) +{ + struct vmbus_ic_softc *sc = xsc; + struct vmbus_icmsg_hdr *hdr; + int dlen, error; + uint64_t xactid; + void *data; + + /* + * Receive request. + */ + data = sc->ic_buf; + dlen = sc->ic_buflen; + error = vmbus_chan_recv(chan, data, &dlen, &xactid); + KASSERT(error != ENOBUFS, ("icbuf is not large enough")); + if (error) + return; + + if (dlen < sizeof(*hdr)) { + device_printf(sc->ic_dev, "invalid data len %d\n", dlen); + return; + } + hdr = data; + + /* + * Update request, which will be echoed back as response. 
+ */ + switch (hdr->ic_type) { + case VMBUS_ICMSG_TYPE_NEGOTIATE: + error = vmbus_ic_negomsg(sc, data, &dlen, + VMBUS_TIMESYNC_FWVER, VMBUS_TIMESYNC_MSGVER); + if (error) + return; + if (VMBUS_TIMESYNC_DORTT(sc)) + device_printf(sc->ic_dev, "RTT\n"); + break; + + case VMBUS_ICMSG_TYPE_TIMESYNC: + if (VMBUS_TIMESYNC_MSGVER4(sc)) { + const struct vmbus_icmsg_timesync4 *msg4; + + if (dlen < sizeof(*msg4)) { + device_printf(sc->ic_dev, "invalid timesync4 " + "len %d\n", dlen); + return; + } + msg4 = data; + vmbus_timesync(sc, msg4->ic_hvtime, msg4->ic_sent_tc, + msg4->ic_tsflags); + } else { + const struct vmbus_icmsg_timesync *msg; + + if (dlen < sizeof(*msg)) { + device_printf(sc->ic_dev, "invalid timesync " + "len %d\n", dlen); + return; + } + msg = data; + vmbus_timesync(sc, msg->ic_hvtime, 0, msg->ic_tsflags); + } + break; + + default: + device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type); + break; + } + + /* + * Send response by echoing the request back. + */ + vmbus_ic_sendresp(sc, chan, data, dlen, xactid); +} + +static int +vmbus_timesync_probe(device_t dev) +{ + + return (vmbus_ic_probe(dev, vmbus_timesync_descs)); +} + +static int +vmbus_timesync_attach(device_t dev) +{ + + return (vmbus_ic_attach(dev, vmbus_timesync_cb)); +} diff --git a/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c b/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c new file mode 100644 index 000000000000..11d549dc18d2 --- /dev/null +++ b/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c @@ -0,0 +1,236 @@ +/*- + * Copyright (c) 2016-2017 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/timetc.h> +#include <sys/vdso.h> + +#include <machine/cpufunc.h> +#include <machine/cputypes.h> +#include <machine/md_var.h> +#include <machine/specialreg.h> + +#include <vm/vm.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/hyperv_busdma.h> +#include <dev/hyperv/vmbus/hyperv_machdep.h> +#include <dev/hyperv/vmbus/hyperv_reg.h> +#include <dev/hyperv/vmbus/hyperv_var.h> + +struct hyperv_reftsc_ctx { + struct hyperv_reftsc *tsc_ref; + struct hyperv_dma tsc_ref_dma; +}; + +static uint32_t hyperv_tsc_vdso_timehands( + struct vdso_timehands *, + struct timecounter *); + +static d_open_t hyperv_tsc_open; +static d_mmap_t hyperv_tsc_mmap; + +static struct timecounter hyperv_tsc_timecounter = { + .tc_get_timecount = NULL, /* based on CPU vendor. 
*/ + .tc_counter_mask = 0xffffffff, + .tc_frequency = HYPERV_TIMER_FREQ, + .tc_name = "Hyper-V-TSC", + .tc_quality = 3000, + .tc_fill_vdso_timehands = hyperv_tsc_vdso_timehands, +}; + +static struct cdevsw hyperv_tsc_cdevsw = { + .d_version = D_VERSION, + .d_open = hyperv_tsc_open, + .d_mmap = hyperv_tsc_mmap, + .d_name = HYPERV_REFTSC_DEVNAME +}; + +static struct hyperv_reftsc_ctx hyperv_ref_tsc; + +uint64_t +hypercall_md(volatile void *hc_addr, uint64_t in_val, + uint64_t in_paddr, uint64_t out_paddr) +{ + uint64_t status; + + __asm__ __volatile__ ("mov %0, %%r8" : : "r" (out_paddr): "r8"); + __asm__ __volatile__ ("call *%3" : "=a" (status) : + "c" (in_val), "d" (in_paddr), "m" (hc_addr)); + return (status); +} + +static int +hyperv_tsc_open(struct cdev *dev __unused, int oflags, int devtype __unused, + struct thread *td __unused) +{ + + if (oflags & FWRITE) + return (EPERM); + return (0); +} + +static int +hyperv_tsc_mmap(struct cdev *dev __unused, vm_ooffset_t offset, + vm_paddr_t *paddr, int nprot __unused, vm_memattr_t *memattr __unused) +{ + + KASSERT(hyperv_ref_tsc.tsc_ref != NULL, ("reftsc has not been setup")); + + /* + * NOTE: + * 'nprot' does not contain information of interest to us; + * WR-open is blocked by d_open. 
+ */ + + if (offset != 0) + return (EOPNOTSUPP); + + *paddr = hyperv_ref_tsc.tsc_ref_dma.hv_paddr; + return (0); +} + +static uint32_t +hyperv_tsc_vdso_timehands(struct vdso_timehands *vdso_th, + struct timecounter *tc __unused) +{ + + vdso_th->th_algo = VDSO_TH_ALGO_X86_HVTSC; + vdso_th->th_x86_shift = 0; + vdso_th->th_x86_hpet_idx = 0; + vdso_th->th_x86_pvc_last_systime = 0; + vdso_th->th_x86_pvc_stable_mask = 0; + bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); + return (1); +} + +#define HYPERV_TSC_TIMECOUNT(fence) \ +static uint64_t \ +hyperv_tc64_tsc_##fence(void) \ +{ \ + struct hyperv_reftsc *tsc_ref = hyperv_ref_tsc.tsc_ref; \ + uint32_t seq; \ + \ + while ((seq = atomic_load_acq_int(&tsc_ref->tsc_seq)) != 0) { \ + uint64_t disc, ret, tsc; \ + uint64_t scale = tsc_ref->tsc_scale; \ + int64_t ofs = tsc_ref->tsc_ofs; \ + \ + fence(); \ + tsc = rdtsc(); \ + \ + /* ret = ((tsc * scale) >> 64) + ofs */ \ + __asm__ __volatile__ ("mulq %3" : \ + "=d" (ret), "=a" (disc) : \ + "a" (tsc), "r" (scale)); \ + ret += ofs; \ + \ + atomic_thread_fence_acq(); \ + if (tsc_ref->tsc_seq == seq) \ + return (ret); \ + \ + /* Sequence changed; re-sync. */ \ + } \ + /* Fallback to the generic timecounter, i.e. rdmsr. 
*/ \ + return (rdmsr(MSR_HV_TIME_REF_COUNT)); \ +} \ + \ +static u_int \ +hyperv_tsc_timecount_##fence(struct timecounter *tc __unused) \ +{ \ + \ + return (hyperv_tc64_tsc_##fence()); \ +} \ +struct __hack + +HYPERV_TSC_TIMECOUNT(lfence); +HYPERV_TSC_TIMECOUNT(mfence); + +static void +hyperv_tsc_tcinit(void *dummy __unused) +{ + hyperv_tc64_t tc64 = NULL; + uint64_t val, orig; + + if ((hyperv_features & + (CPUID_HV_MSR_TIME_REFCNT | CPUID_HV_MSR_REFERENCE_TSC)) != + (CPUID_HV_MSR_TIME_REFCNT | CPUID_HV_MSR_REFERENCE_TSC) || + (cpu_feature & CPUID_SSE2) == 0) /* SSE2 for mfence/lfence */ + return; + + switch (cpu_vendor_id) { + case CPU_VENDOR_AMD: + case CPU_VENDOR_HYGON: + hyperv_tsc_timecounter.tc_get_timecount = + hyperv_tsc_timecount_mfence; + tc64 = hyperv_tc64_tsc_mfence; + break; + + case CPU_VENDOR_INTEL: + hyperv_tsc_timecounter.tc_get_timecount = + hyperv_tsc_timecount_lfence; + tc64 = hyperv_tc64_tsc_lfence; + break; + + default: + /* Unsupported CPU vendors. */ + return; + } + + hyperv_ref_tsc.tsc_ref = hyperv_dmamem_alloc(NULL, PAGE_SIZE, 0, + sizeof(struct hyperv_reftsc), &hyperv_ref_tsc.tsc_ref_dma, + BUS_DMA_WAITOK | BUS_DMA_ZERO); + if (hyperv_ref_tsc.tsc_ref == NULL) { + printf("hyperv: reftsc page allocation failed\n"); + return; + } + + orig = rdmsr(MSR_HV_REFERENCE_TSC); + val = MSR_HV_REFTSC_ENABLE | (orig & MSR_HV_REFTSC_RSVD_MASK) | + ((hyperv_ref_tsc.tsc_ref_dma.hv_paddr >> PAGE_SHIFT) << + MSR_HV_REFTSC_PGSHIFT); + wrmsr(MSR_HV_REFERENCE_TSC, val); + + /* Register "enlightened" timecounter. */ + tc_init(&hyperv_tsc_timecounter); + + /* Install 64-bit timecounter method for other modules to use. */ + KASSERT(tc64 != NULL, ("tc64 is not set")); + hyperv_tc64 = tc64; + + /* Add device for mmap(2). 
*/ + make_dev(&hyperv_tsc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0444, + HYPERV_REFTSC_DEVNAME); +} +SYSINIT(hyperv_tsc_init, SI_SUB_DRIVERS, SI_ORDER_FIRST, hyperv_tsc_tcinit, + NULL); diff --git a/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S b/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S new file mode 100644 index 000000000000..30c07348734c --- /dev/null +++ b/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "assym.inc" + +#include <machine/psl.h> +#include <machine/asmacros.h> +#include <machine/specialreg.h> + +/* + * This is the Hyper-V vmbus channel direct callback interrupt. 
+ * Only used when it is running on Hyper-V. + */ + .text + SUPERALIGN_TEXT + INTR_HANDLER vmbus_isr + movq %rsp, %rdi + call vmbus_handle_intr + jmp doreti diff --git a/sys/dev/hyperv/vmbus/hyperv.c b/sys/dev/hyperv/vmbus/hyperv.c new file mode 100644 index 000000000000..01e0ad9610d9 --- /dev/null +++ b/sys/dev/hyperv/vmbus/hyperv.c @@ -0,0 +1,340 @@ +/*- + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * Implements low-level interactions with Hyper-V/Azure + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> +#include <sys/timetc.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/pmap.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/hyperv_busdma.h> +#include <dev/hyperv/vmbus/hyperv_machdep.h> +#include <dev/hyperv/vmbus/hyperv_reg.h> +#include <dev/hyperv/vmbus/hyperv_var.h> + +#define HYPERV_FREEBSD_BUILD 0ULL +#define HYPERV_FREEBSD_VERSION ((uint64_t)__FreeBSD_version) +#define HYPERV_FREEBSD_OSID 0ULL + +#define MSR_HV_GUESTID_BUILD_FREEBSD \ + (HYPERV_FREEBSD_BUILD & MSR_HV_GUESTID_BUILD_MASK) +#define MSR_HV_GUESTID_VERSION_FREEBSD \ + ((HYPERV_FREEBSD_VERSION << MSR_HV_GUESTID_VERSION_SHIFT) & \ + MSR_HV_GUESTID_VERSION_MASK) +#define MSR_HV_GUESTID_OSID_FREEBSD \ + ((HYPERV_FREEBSD_OSID << MSR_HV_GUESTID_OSID_SHIFT) & \ + MSR_HV_GUESTID_OSID_MASK) + +#define MSR_HV_GUESTID_FREEBSD \ + (MSR_HV_GUESTID_BUILD_FREEBSD | \ + MSR_HV_GUESTID_VERSION_FREEBSD | \ + MSR_HV_GUESTID_OSID_FREEBSD | \ + MSR_HV_GUESTID_OSTYPE_FREEBSD) + +struct hypercall_ctx { + void *hc_addr; + vm_paddr_t hc_paddr; +}; + +static u_int hyperv_get_timecount(struct timecounter *); +static bool hyperv_identify(void); +static void hypercall_memfree(void); + +u_int hyperv_ver_major; + +u_int hyperv_features; +u_int hyperv_recommends; + +static u_int hyperv_pm_features; +static u_int hyperv_features3; + +hyperv_tc64_t hyperv_tc64; + +static struct timecounter hyperv_timecounter = { + .tc_get_timecount = hyperv_get_timecount, + .tc_poll_pps = NULL, + .tc_counter_mask = 0xffffffff, + .tc_frequency = HYPERV_TIMER_FREQ, + .tc_name = "Hyper-V", + .tc_quality = 2000, + .tc_flags = 0, + .tc_priv = NULL +}; + +static struct hypercall_ctx hypercall_context; + +static u_int +hyperv_get_timecount(struct timecounter *tc 
__unused) +{ + return rdmsr(MSR_HV_TIME_REF_COUNT); +} + +static uint64_t +hyperv_tc64_rdmsr(void) +{ + + return (rdmsr(MSR_HV_TIME_REF_COUNT)); +} + +uint64_t +hypercall_post_message(bus_addr_t msg_paddr) +{ + return hypercall_md(hypercall_context.hc_addr, + HYPERCALL_POST_MESSAGE, msg_paddr, 0); +} + +uint64_t +hypercall_signal_event(bus_addr_t monprm_paddr) +{ + return hypercall_md(hypercall_context.hc_addr, + HYPERCALL_SIGNAL_EVENT, monprm_paddr, 0); +} + +int +hyperv_guid2str(const struct hyperv_guid *guid, char *buf, size_t sz) +{ + const uint8_t *d = guid->hv_guid; + + return snprintf(buf, sz, "%02x%02x%02x%02x-" + "%02x%02x-%02x%02x-%02x%02x-" + "%02x%02x%02x%02x%02x%02x", + d[3], d[2], d[1], d[0], + d[5], d[4], d[7], d[6], d[8], d[9], + d[10], d[11], d[12], d[13], d[14], d[15]); +} + +static bool +hyperv_identify(void) +{ + u_int regs[4]; + unsigned int maxleaf; + + if (vm_guest != VM_GUEST_HV) + return (false); + + do_cpuid(CPUID_LEAF_HV_MAXLEAF, regs); + maxleaf = regs[0]; + if (maxleaf < CPUID_LEAF_HV_LIMITS) + return (false); + + do_cpuid(CPUID_LEAF_HV_INTERFACE, regs); + if (regs[0] != CPUID_HV_IFACE_HYPERV) + return (false); + + do_cpuid(CPUID_LEAF_HV_FEATURES, regs); + if ((regs[0] & CPUID_HV_MSR_HYPERCALL) == 0) { + /* + * Hyper-V w/o Hypercall is impossible; someone + * is faking Hyper-V. 
+ */ + return (false); + } + hyperv_features = regs[0]; + hyperv_pm_features = regs[2]; + hyperv_features3 = regs[3]; + + do_cpuid(CPUID_LEAF_HV_IDENTITY, regs); + hyperv_ver_major = regs[1] >> 16; + printf("Hyper-V Version: %d.%d.%d [SP%d]\n", + hyperv_ver_major, regs[1] & 0xffff, regs[0], regs[2]); + + printf(" Features=0x%b\n", hyperv_features, + "\020" + "\001VPRUNTIME" /* MSR_HV_VP_RUNTIME */ + "\002TMREFCNT" /* MSR_HV_TIME_REF_COUNT */ + "\003SYNIC" /* MSRs for SynIC */ + "\004SYNTM" /* MSRs for SynTimer */ + "\005APIC" /* MSR_HV_{EOI,ICR,TPR} */ + "\006HYPERCALL" /* MSR_HV_{GUEST_OS_ID,HYPERCALL} */ + "\007VPINDEX" /* MSR_HV_VP_INDEX */ + "\010RESET" /* MSR_HV_RESET */ + "\011STATS" /* MSR_HV_STATS_ */ + "\012REFTSC" /* MSR_HV_REFERENCE_TSC */ + "\013IDLE" /* MSR_HV_GUEST_IDLE */ + "\014TMFREQ" /* MSR_HV_{TSC,APIC}_FREQUENCY */ + "\015DEBUG"); /* MSR_HV_SYNTH_DEBUG_ */ + printf(" PM Features=0x%b [C%u]\n", + (hyperv_pm_features & ~CPUPM_HV_CSTATE_MASK), + "\020" + "\005C3HPET", /* HPET is required for C3 state */ + CPUPM_HV_CSTATE(hyperv_pm_features)); + printf(" Features3=0x%b\n", hyperv_features3, + "\020" + "\001MWAIT" /* MWAIT */ + "\002DEBUG" /* guest debug support */ + "\003PERFMON" /* performance monitor */ + "\004PCPUDPE" /* physical CPU dynamic partition event */ + "\005XMMHC" /* hypercall input through XMM regs */ + "\006IDLE" /* guest idle support */ + "\007SLEEP" /* hypervisor sleep support */ + "\010NUMA" /* NUMA distance query support */ + "\011TMFREQ" /* timer frequency query (TSC, LAPIC) */ + "\012SYNCMC" /* inject synthetic machine checks */ + "\013CRASH" /* MSRs for guest crash */ + "\014DEBUGMSR" /* MSRs for guest debug */ + "\015NPIEP" /* NPIEP */ + "\016HVDIS"); /* disabling hypervisor */ + + do_cpuid(CPUID_LEAF_HV_RECOMMENDS, regs); + hyperv_recommends = regs[0]; + if (bootverbose) + printf(" Recommends: %08x %08x\n", regs[0], regs[1]); + + do_cpuid(CPUID_LEAF_HV_LIMITS, regs); + if (bootverbose) { + printf(" Limits: Vcpu:%d Lcpu:%d 
Int:%d\n", + regs[0], regs[1], regs[2]); + } + + if (maxleaf >= CPUID_LEAF_HV_HWFEATURES) { + do_cpuid(CPUID_LEAF_HV_HWFEATURES, regs); + if (bootverbose) { + printf(" HW Features: %08x, AMD: %08x\n", + regs[0], regs[3]); + } + } + + return (true); +} + +static void +hyperv_init(void *dummy __unused) +{ + if (!hyperv_identify()) { + /* Not Hyper-V; reset guest id to the generic one. */ + if (vm_guest == VM_GUEST_HV) + vm_guest = VM_GUEST_VM; + return; + } + + /* Set guest id */ + wrmsr(MSR_HV_GUEST_OS_ID, MSR_HV_GUESTID_FREEBSD); + + if (hyperv_features & CPUID_HV_MSR_TIME_REFCNT) { + /* + * Register Hyper-V timecounter. This should be done as early + * as possible to let DELAY() work, since the 8254 PIT is not + * reliably emulated or even available. + */ + tc_init(&hyperv_timecounter); + + /* + * Install 64-bit timecounter method for other modules + * to use. + */ + hyperv_tc64 = hyperv_tc64_rdmsr; + } +} +SYSINIT(hyperv_initialize, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, hyperv_init, + NULL); + +static void +hypercall_memfree(void) +{ + kmem_free((vm_offset_t)hypercall_context.hc_addr, PAGE_SIZE); + hypercall_context.hc_addr = NULL; +} + +static void +hypercall_create(void *arg __unused) +{ + uint64_t hc, hc_orig; + + if (vm_guest != VM_GUEST_HV) + return; + + /* + * NOTE: + * - busdma(9), i.e. hyperv_dmamem APIs, can _not_ be used due to + * the NX bit. + * - Assume kmem_malloc() returns properly aligned memory. + */ + hypercall_context.hc_addr = (void *)kmem_malloc(PAGE_SIZE, M_EXEC | + M_WAITOK); + hypercall_context.hc_paddr = vtophys(hypercall_context.hc_addr); + + /* Get the 'reserved' bits, which require preservation. */ + hc_orig = rdmsr(MSR_HV_HYPERCALL); + + /* + * Setup the Hypercall page. + * + * NOTE: 'reserved' bits MUST be preserved. 
+ */ + hc = ((hypercall_context.hc_paddr >> PAGE_SHIFT) << + MSR_HV_HYPERCALL_PGSHIFT) | + (hc_orig & MSR_HV_HYPERCALL_RSVD_MASK) | + MSR_HV_HYPERCALL_ENABLE; + wrmsr(MSR_HV_HYPERCALL, hc); + + /* + * Confirm that Hypercall page did get setup. + */ + hc = rdmsr(MSR_HV_HYPERCALL); + if ((hc & MSR_HV_HYPERCALL_ENABLE) == 0) { + printf("hyperv: Hypercall setup failed\n"); + hypercall_memfree(); + /* Can't perform any Hyper-V specific actions */ + vm_guest = VM_GUEST_VM; + return; + } + if (bootverbose) + printf("hyperv: Hypercall created\n"); +} +SYSINIT(hypercall_ctor, SI_SUB_DRIVERS, SI_ORDER_FIRST, hypercall_create, NULL); + +static void +hypercall_destroy(void *arg __unused) +{ + uint64_t hc; + + if (hypercall_context.hc_addr == NULL) + return; + + /* Disable Hypercall */ + hc = rdmsr(MSR_HV_HYPERCALL); + wrmsr(MSR_HV_HYPERCALL, (hc & MSR_HV_HYPERCALL_RSVD_MASK)); + hypercall_memfree(); + + if (bootverbose) + printf("hyperv: Hypercall destroyed\n"); +} +SYSUNINIT(hypercall_dtor, SI_SUB_DRIVERS, SI_ORDER_FIRST, hypercall_destroy, + NULL); diff --git a/sys/dev/hyperv/vmbus/hyperv_busdma.c b/sys/dev/hyperv/vmbus/hyperv_busdma.c new file mode 100644 index 000000000000..9550540014c4 --- /dev/null +++ b/sys/dev/hyperv/vmbus/hyperv_busdma.c @@ -0,0 +1,98 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> + +#include <machine/bus.h> + +#include <dev/hyperv/include/hyperv_busdma.h> + +#define HYPERV_DMA_MASK (BUS_DMA_WAITOK | BUS_DMA_NOWAIT | BUS_DMA_ZERO) + +void +hyperv_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error) +{ + bus_addr_t *paddr = arg; + + if (error) + return; + + KASSERT(nseg == 1, ("too many segments %d!", nseg)); + *paddr = segs->ds_addr; +} + +void * +hyperv_dmamem_alloc(bus_dma_tag_t parent_dtag, bus_size_t alignment, + bus_addr_t boundary, bus_size_t size, struct hyperv_dma *dma, int flags) +{ + void *ret; + int error; + + error = bus_dma_tag_create(parent_dtag, /* parent */ + alignment, /* alignment */ + boundary, /* boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + size, /* maxsize */ + 1, /* nsegments */ + size, /* maxsegsize */ + 0, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockfuncarg */ + &dma->hv_dtag); + if (error) + return NULL; + + error = bus_dmamem_alloc(dma->hv_dtag, &ret, + (flags & HYPERV_DMA_MASK) | BUS_DMA_COHERENT, &dma->hv_dmap); + if (error) { + 
bus_dma_tag_destroy(dma->hv_dtag); + return NULL; + } + + error = bus_dmamap_load(dma->hv_dtag, dma->hv_dmap, ret, size, + hyperv_dma_map_paddr, &dma->hv_paddr, BUS_DMA_NOWAIT); + if (error) { + bus_dmamem_free(dma->hv_dtag, ret, dma->hv_dmap); + bus_dma_tag_destroy(dma->hv_dtag); + return NULL; + } + return ret; +} + +void +hyperv_dmamem_free(struct hyperv_dma *dma, void *ptr) +{ + bus_dmamap_unload(dma->hv_dtag, dma->hv_dmap); + bus_dmamem_free(dma->hv_dtag, ptr, dma->hv_dmap); + bus_dma_tag_destroy(dma->hv_dtag); +} diff --git a/sys/dev/hyperv/vmbus/hyperv_machdep.h b/sys/dev/hyperv/vmbus/hyperv_machdep.h new file mode 100644 index 000000000000..48cf5b78dc3b --- /dev/null +++ b/sys/dev/hyperv/vmbus/hyperv_machdep.h @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HYPERV_MACHDEP_H_ +#define _HYPERV_MACHDEP_H_ + +#include <sys/param.h> + +uint64_t hypercall_md(volatile void *hc_addr, uint64_t in_val, + uint64_t in_paddr, uint64_t out_paddr); + +#endif /* !_HYPERV_MACHDEP_H_ */ diff --git a/sys/dev/hyperv/vmbus/hyperv_reg.h b/sys/dev/hyperv/vmbus/hyperv_reg.h new file mode 100644 index 000000000000..b3b133c84881 --- /dev/null +++ b/sys/dev/hyperv/vmbus/hyperv_reg.h @@ -0,0 +1,193 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HYPERV_REG_H_ +#define _HYPERV_REG_H_ + +#include <sys/param.h> +#include <sys/systm.h> + +/* + * Hyper-V Synthetic MSRs + */ + +#define MSR_HV_GUEST_OS_ID 0x40000000 +#define MSR_HV_GUESTID_BUILD_MASK 0xffffULL +#define MSR_HV_GUESTID_VERSION_MASK 0x0000ffffffff0000ULL +#define MSR_HV_GUESTID_VERSION_SHIFT 16 +#define MSR_HV_GUESTID_OSID_MASK 0x00ff000000000000ULL +#define MSR_HV_GUESTID_OSID_SHIFT 48 +#define MSR_HV_GUESTID_OSTYPE_MASK 0x7f00000000000000ULL +#define MSR_HV_GUESTID_OSTYPE_SHIFT 56 +#define MSR_HV_GUESTID_OPENSRC 0x8000000000000000ULL +#define MSR_HV_GUESTID_OSTYPE_LINUX \ + ((0x01ULL << MSR_HV_GUESTID_OSTYPE_SHIFT) | MSR_HV_GUESTID_OPENSRC) +#define MSR_HV_GUESTID_OSTYPE_FREEBSD \ + ((0x02ULL << MSR_HV_GUESTID_OSTYPE_SHIFT) | MSR_HV_GUESTID_OPENSRC) + +#define MSR_HV_HYPERCALL 0x40000001 +#define MSR_HV_HYPERCALL_ENABLE 0x0001ULL +#define MSR_HV_HYPERCALL_RSVD_MASK 0x0ffeULL +#define MSR_HV_HYPERCALL_PGSHIFT 12 + +#define MSR_HV_VP_INDEX 0x40000002 + +#define MSR_HV_REFERENCE_TSC 0x40000021 +#define MSR_HV_REFTSC_ENABLE 0x0001ULL +#define MSR_HV_REFTSC_RSVD_MASK 0x0ffeULL +#define MSR_HV_REFTSC_PGSHIFT 12 + +#define MSR_HV_SCONTROL 0x40000080 +#define MSR_HV_SCTRL_ENABLE 0x0001ULL +#define MSR_HV_SCTRL_RSVD_MASK 0xfffffffffffffffeULL + +#define MSR_HV_SIEFP 0x40000082 +#define MSR_HV_SIEFP_ENABLE 0x0001ULL +#define MSR_HV_SIEFP_RSVD_MASK 0x0ffeULL +#define MSR_HV_SIEFP_PGSHIFT 12 + +#define MSR_HV_SIMP 
0x40000083 +#define MSR_HV_SIMP_ENABLE 0x0001ULL +#define MSR_HV_SIMP_RSVD_MASK 0x0ffeULL +#define MSR_HV_SIMP_PGSHIFT 12 + +#define MSR_HV_EOM 0x40000084 + +#define MSR_HV_SINT0 0x40000090 +#define MSR_HV_SINT_VECTOR_MASK 0x00ffULL +#define MSR_HV_SINT_RSVD1_MASK 0xff00ULL +#define MSR_HV_SINT_MASKED 0x00010000ULL +#define MSR_HV_SINT_AUTOEOI 0x00020000ULL +#define MSR_HV_SINT_RSVD2_MASK 0xfffffffffffc0000ULL +#define MSR_HV_SINT_RSVD_MASK (MSR_HV_SINT_RSVD1_MASK | \ + MSR_HV_SINT_RSVD2_MASK) + +#define MSR_HV_STIMER0_CONFIG 0x400000b0 +#define MSR_HV_STIMER_CFG_ENABLE 0x0001ULL +#define MSR_HV_STIMER_CFG_PERIODIC 0x0002ULL +#define MSR_HV_STIMER_CFG_LAZY 0x0004ULL +#define MSR_HV_STIMER_CFG_AUTOEN 0x0008ULL +#define MSR_HV_STIMER_CFG_SINT_MASK 0x000f0000ULL +#define MSR_HV_STIMER_CFG_SINT_SHIFT 16 + +#define MSR_HV_STIMER0_COUNT 0x400000b1 + +/* + * CPUID leaves + */ + +#define CPUID_LEAF_HV_MAXLEAF 0x40000000 + +#define CPUID_LEAF_HV_INTERFACE 0x40000001 +#define CPUID_HV_IFACE_HYPERV 0x31237648 /* HV#1 */ + +#define CPUID_LEAF_HV_IDENTITY 0x40000002 + +#define CPUID_LEAF_HV_FEATURES 0x40000003 +/* EAX: features include/hyperv.h CPUID_HV_MSR */ +/* ECX: power management features */ +#define CPUPM_HV_CSTATE_MASK 0x000f /* deepest C-state */ +#define CPUPM_HV_C3_HPET 0x0010 /* C3 requires HPET */ +#define CPUPM_HV_CSTATE(f) ((f) & CPUPM_HV_CSTATE_MASK) +/* EDX: features3 */ +#define CPUID3_HV_MWAIT 0x0001 /* MWAIT */ +#define CPUID3_HV_XMM_HYPERCALL 0x0010 /* Hypercall input through + * XMM regs */ +#define CPUID3_HV_GUEST_IDLE 0x0020 /* guest idle */ +#define CPUID3_HV_NUMA 0x0080 /* NUMA distance query */ +#define CPUID3_HV_TIME_FREQ 0x0100 /* timer frequency query + * (TSC, LAPIC) */ +#define CPUID3_HV_MSR_CRASH 0x0400 /* MSRs for guest crash */ + +#define CPUID_LEAF_HV_RECOMMENDS 0x40000004 +#define CPUID_LEAF_HV_LIMITS 0x40000005 +#define CPUID_LEAF_HV_HWFEATURES 0x40000006 + +/* + * Hyper-V Monitor Notification Facility + */ +struct hyperv_mon_param { + 
uint32_t mp_connid; + uint16_t mp_evtflag_ofs; + uint16_t mp_rsvd; +} __packed; + +/* + * Hyper-V message types + */ +#define HYPERV_MSGTYPE_NONE 0 +#define HYPERV_MSGTYPE_CHANNEL 1 +#define HYPERV_MSGTYPE_TIMER_EXPIRED 0x80000010 + +/* + * Hypercall status codes + */ +#define HYPERCALL_STATUS_SUCCESS 0x0000 + +/* + * Hypercall input values + */ +#define HYPERCALL_POST_MESSAGE 0x005c +#define HYPERCALL_SIGNAL_EVENT 0x005d + +/* + * Hypercall input parameters + */ +#define HYPERCALL_PARAM_ALIGN 8 +#if 0 +/* + * XXX + * <<Hypervisor Top Level Functional Specification 4.0b>> requires the + * input parameter size to be a multiple of 8; however, many post + * message input parameters do _not_ meet this requirement. + */ +#define HYPERCALL_PARAM_SIZE_ALIGN 8 +#endif + +/* + * HYPERCALL_POST_MESSAGE + */ +#define HYPERCALL_POSTMSGIN_DSIZE_MAX 240 +#define HYPERCALL_POSTMSGIN_SIZE 256 + +struct hypercall_postmsg_in { + uint32_t hc_connid; + uint32_t hc_rsvd; + uint32_t hc_msgtype; /* HYPERV_MSGTYPE_ */ + uint32_t hc_dsize; + uint8_t hc_data[HYPERCALL_POSTMSGIN_DSIZE_MAX]; +} __packed; +CTASSERT(sizeof(struct hypercall_postmsg_in) == HYPERCALL_POSTMSGIN_SIZE); + +/* + * HYPERCALL_SIGNAL_EVENT + * + * struct hyperv_mon_param. + */ + +#endif /* !_HYPERV_REG_H_ */ diff --git a/sys/dev/hyperv/vmbus/hyperv_var.h b/sys/dev/hyperv/vmbus/hyperv_var.h new file mode 100644 index 000000000000..f620e4fd64ae --- /dev/null +++ b/sys/dev/hyperv/vmbus/hyperv_var.h @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _HYPERV_VAR_H_ +#define _HYPERV_VAR_H_ + +extern u_int hyperv_recommends; + +uint64_t hypercall_post_message(bus_addr_t msg_paddr); +uint64_t hypercall_signal_event(bus_addr_t monprm_paddr); + +#endif /* !_HYPERV_VAR_H_ */ diff --git a/sys/dev/hyperv/vmbus/i386/hyperv_machdep.c b/sys/dev/hyperv/vmbus/i386/hyperv_machdep.c new file mode 100644 index 000000000000..b12bff855f63 --- /dev/null +++ b/sys/dev/hyperv/vmbus/i386/hyperv_machdep.c @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <dev/hyperv/vmbus/hyperv_machdep.h> + +uint64_t +hypercall_md(volatile void *hc_addr, uint64_t in_val, + uint64_t in_paddr, uint64_t out_paddr) +{ + uint32_t in_val_hi = in_val >> 32; + uint32_t in_val_lo = in_val & 0xFFFFFFFF; + uint32_t status_hi, status_lo; + uint32_t in_paddr_hi = in_paddr >> 32; + uint32_t in_paddr_lo = in_paddr & 0xFFFFFFFF; + uint32_t out_paddr_hi = out_paddr >> 32; + uint32_t out_paddr_lo = out_paddr & 0xFFFFFFFF; + + __asm__ __volatile__ ("call *%8" : "=d"(status_hi), "=a"(status_lo) : + "d" (in_val_hi), "a" (in_val_lo), + "b" (in_paddr_hi), "c" (in_paddr_lo), + "D"(out_paddr_hi), "S"(out_paddr_lo), + "m" (hc_addr)); + return (status_lo | ((uint64_t)status_hi << 32)); +} diff --git a/sys/dev/hyperv/vmbus/i386/vmbus_vector.S b/sys/dev/hyperv/vmbus/i386/vmbus_vector.S new file mode 100644 index 000000000000..b1ffe89cd55d --- /dev/null +++ b/sys/dev/hyperv/vmbus/i386/vmbus_vector.S @@ -0,0 +1,54 @@ +/*- + * 
Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "assym.inc" + +#include <machine/psl.h> +#include <machine/asmacros.h> +#include <machine/specialreg.h> + +/* + * This is the Hyper-V vmbus channel direct callback interrupt. + * Only used when it is running on Hyper-V. + * + * Note that this file is not assembled directly, it is included into + * i386/exception.s. 
*/
	.text
	SUPERALIGN_TEXT
/*
 * Two vector labels share a single entry point; both fall straight
 * through into the same handler body.
 */
IDTVEC(vmbus_isr_pti)
IDTVEC(vmbus_isr)
	PUSH_FRAME
	SET_KERNEL_SREGS
	cld
	KENTER
	/*
	 * Hand the current stack pointer to vmbus_handle_intr() as its only
	 * argument (presumably the frame built by PUSH_FRAME -- confirm
	 * against the macro definition).  The call goes through a register
	 * rather than a direct call.
	 */
	pushl	%esp
	mov	$vmbus_handle_intr, %eax
	call	*%eax
	add	$4, %esp		/* drop the pushed argument */
	jmp	doreti
diff --git a/sys/dev/hyperv/vmbus/vmbus.c b/sys/dev/hyperv/vmbus/vmbus.c
new file mode 100644
index 000000000000..31951cbf4858
--- /dev/null
+++ b/sys/dev/hyperv/vmbus/vmbus.c
@@ -0,0 +1,1679 @@
/*-
 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * Copyright (c) 2012 Citrix Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +/* + * VM Bus Driver Implementation + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/linker.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/sbuf.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/taskqueue.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> + +#include <machine/bus.h> +#include <machine/intr_machdep.h> +#include <machine/metadata.h> +#include <machine/md_var.h> +#include <machine/resource.h> +#include <x86/include/apicvar.h> + +#include <contrib/dev/acpica/include/acpi.h> +#include <dev/acpica/acpivar.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/vmbus_xact.h> +#include <dev/hyperv/vmbus/hyperv_reg.h> +#include <dev/hyperv/vmbus/hyperv_var.h> +#include <dev/hyperv/vmbus/vmbus_reg.h> +#include <dev/hyperv/vmbus/vmbus_var.h> +#include <dev/hyperv/vmbus/vmbus_chanvar.h> + +#include "acpi_if.h" +#include "pcib_if.h" +#include "vmbus_if.h" + +#define VMBUS_GPADL_START 0xe1e10 + +struct vmbus_msghc { + struct vmbus_xact *mh_xact; + struct hypercall_postmsg_in mh_inprm_save; +}; + +static void vmbus_identify(driver_t *, device_t); +static int vmbus_probe(device_t); +static int vmbus_attach(device_t); +static int vmbus_detach(device_t); +static int vmbus_read_ivar(device_t, device_t, int, + uintptr_t *); +static int vmbus_child_pnpinfo(device_t, device_t, struct sbuf *); +static struct resource *vmbus_alloc_resource(device_t dev, + device_t child, int type, int *rid, + rman_res_t start, rman_res_t end, + rman_res_t count, u_int flags); +static int vmbus_alloc_msi(device_t bus, device_t dev, + int count, int maxcount, int *irqs); +static int vmbus_release_msi(device_t bus, device_t dev, + int count, int *irqs); +static int vmbus_alloc_msix(device_t bus, device_t dev, + int *irq); +static int 
vmbus_release_msix(device_t bus, device_t dev, + int irq); +static int vmbus_map_msi(device_t bus, device_t dev, + int irq, uint64_t *addr, uint32_t *data); +static uint32_t vmbus_get_version_method(device_t, device_t); +static int vmbus_probe_guid_method(device_t, device_t, + const struct hyperv_guid *); +static uint32_t vmbus_get_vcpu_id_method(device_t bus, + device_t dev, int cpu); +static struct taskqueue *vmbus_get_eventtq_method(device_t, device_t, + int); +#ifdef EARLY_AP_STARTUP +static void vmbus_intrhook(void *); +#endif + +static int vmbus_init(struct vmbus_softc *); +static int vmbus_connect(struct vmbus_softc *, uint32_t); +static int vmbus_req_channels(struct vmbus_softc *sc); +static void vmbus_disconnect(struct vmbus_softc *); +static int vmbus_scan(struct vmbus_softc *); +static void vmbus_scan_teardown(struct vmbus_softc *); +static void vmbus_scan_done(struct vmbus_softc *, + const struct vmbus_message *); +static void vmbus_chanmsg_handle(struct vmbus_softc *, + const struct vmbus_message *); +static void vmbus_msg_task(void *, int); +static void vmbus_synic_setup(void *); +static void vmbus_synic_teardown(void *); +static int vmbus_sysctl_version(SYSCTL_HANDLER_ARGS); +static int vmbus_dma_alloc(struct vmbus_softc *); +static void vmbus_dma_free(struct vmbus_softc *); +static int vmbus_intr_setup(struct vmbus_softc *); +static void vmbus_intr_teardown(struct vmbus_softc *); +static int vmbus_doattach(struct vmbus_softc *); +static void vmbus_event_proc_dummy(struct vmbus_softc *, + int); + +static struct vmbus_softc *vmbus_sc; + +SYSCTL_NODE(_hw, OID_AUTO, vmbus, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, + "Hyper-V vmbus"); + +static int vmbus_pin_evttask = 1; +SYSCTL_INT(_hw_vmbus, OID_AUTO, pin_evttask, CTLFLAG_RDTUN, + &vmbus_pin_evttask, 0, "Pin event tasks to their respective CPU"); + +extern inthand_t IDTVEC(vmbus_isr), IDTVEC(vmbus_isr_pti); +#define VMBUS_ISR_ADDR trunc_page((uintptr_t)IDTVEC(vmbus_isr_pti)) + +uint32_t vmbus_current_version; 
+ +static const uint32_t vmbus_version[] = { + VMBUS_VERSION_WIN10, + VMBUS_VERSION_WIN8_1, + VMBUS_VERSION_WIN8, + VMBUS_VERSION_WIN7, + VMBUS_VERSION_WS2008 +}; + +static const vmbus_chanmsg_proc_t +vmbus_chanmsg_handlers[VMBUS_CHANMSG_TYPE_MAX] = { + VMBUS_CHANMSG_PROC(CHOFFER_DONE, vmbus_scan_done), + VMBUS_CHANMSG_PROC_WAKEUP(CONNECT_RESP) +}; + +static device_method_t vmbus_methods[] = { + /* Device interface */ + DEVMETHOD(device_identify, vmbus_identify), + DEVMETHOD(device_probe, vmbus_probe), + DEVMETHOD(device_attach, vmbus_attach), + DEVMETHOD(device_detach, vmbus_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + /* Bus interface */ + DEVMETHOD(bus_add_child, bus_generic_add_child), + DEVMETHOD(bus_print_child, bus_generic_print_child), + DEVMETHOD(bus_read_ivar, vmbus_read_ivar), + DEVMETHOD(bus_child_pnpinfo, vmbus_child_pnpinfo), + DEVMETHOD(bus_alloc_resource, vmbus_alloc_resource), + DEVMETHOD(bus_release_resource, bus_generic_release_resource), + DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), + DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), + DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), + DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), +#if __FreeBSD_version >= 1100000 + DEVMETHOD(bus_get_cpus, bus_generic_get_cpus), +#endif + + /* pcib interface */ + DEVMETHOD(pcib_alloc_msi, vmbus_alloc_msi), + DEVMETHOD(pcib_release_msi, vmbus_release_msi), + DEVMETHOD(pcib_alloc_msix, vmbus_alloc_msix), + DEVMETHOD(pcib_release_msix, vmbus_release_msix), + DEVMETHOD(pcib_map_msi, vmbus_map_msi), + + /* Vmbus interface */ + DEVMETHOD(vmbus_get_version, vmbus_get_version_method), + DEVMETHOD(vmbus_probe_guid, vmbus_probe_guid_method), + DEVMETHOD(vmbus_get_vcpu_id, vmbus_get_vcpu_id_method), + DEVMETHOD(vmbus_get_event_taskq, vmbus_get_eventtq_method), + + DEVMETHOD_END +}; + +static driver_t 
vmbus_driver = { + "vmbus", + vmbus_methods, + sizeof(struct vmbus_softc) +}; + +static devclass_t vmbus_devclass; + +DRIVER_MODULE(vmbus, pcib, vmbus_driver, vmbus_devclass, NULL, NULL); +DRIVER_MODULE(vmbus, acpi_syscontainer, vmbus_driver, vmbus_devclass, + NULL, NULL); + +MODULE_DEPEND(vmbus, acpi, 1, 1, 1); +MODULE_DEPEND(vmbus, pci, 1, 1, 1); +MODULE_VERSION(vmbus, 1); + +static __inline struct vmbus_softc * +vmbus_get_softc(void) +{ + return vmbus_sc; +} + +void +vmbus_msghc_reset(struct vmbus_msghc *mh, size_t dsize) +{ + struct hypercall_postmsg_in *inprm; + + if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX) + panic("invalid data size %zu", dsize); + + inprm = vmbus_xact_req_data(mh->mh_xact); + memset(inprm, 0, HYPERCALL_POSTMSGIN_SIZE); + inprm->hc_connid = VMBUS_CONNID_MESSAGE; + inprm->hc_msgtype = HYPERV_MSGTYPE_CHANNEL; + inprm->hc_dsize = dsize; +} + +struct vmbus_msghc * +vmbus_msghc_get(struct vmbus_softc *sc, size_t dsize) +{ + struct vmbus_msghc *mh; + struct vmbus_xact *xact; + + if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX) + panic("invalid data size %zu", dsize); + + xact = vmbus_xact_get(sc->vmbus_xc, + dsize + __offsetof(struct hypercall_postmsg_in, hc_data[0])); + if (xact == NULL) + return (NULL); + + mh = vmbus_xact_priv(xact, sizeof(*mh)); + mh->mh_xact = xact; + + vmbus_msghc_reset(mh, dsize); + return (mh); +} + +void +vmbus_msghc_put(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) +{ + + vmbus_xact_put(mh->mh_xact); +} + +void * +vmbus_msghc_dataptr(struct vmbus_msghc *mh) +{ + struct hypercall_postmsg_in *inprm; + + inprm = vmbus_xact_req_data(mh->mh_xact); + return (inprm->hc_data); +} + +int +vmbus_msghc_exec_noresult(struct vmbus_msghc *mh) +{ + sbintime_t time = SBT_1MS; + struct hypercall_postmsg_in *inprm; + bus_addr_t inprm_paddr; + int i; + + inprm = vmbus_xact_req_data(mh->mh_xact); + inprm_paddr = vmbus_xact_req_paddr(mh->mh_xact); + + /* + * Save the input parameter so that we could restore the input + * parameter if the 
Hypercall failed. + * + * XXX + * Is this really necessary?! i.e. Will the Hypercall ever + * overwrite the input parameter? + */ + memcpy(&mh->mh_inprm_save, inprm, HYPERCALL_POSTMSGIN_SIZE); + + /* + * In order to cope with transient failures, e.g. insufficient + * resources on host side, we retry the post message Hypercall + * several times. 20 retries seem sufficient. + */ +#define HC_RETRY_MAX 20 + + for (i = 0; i < HC_RETRY_MAX; ++i) { + uint64_t status; + + status = hypercall_post_message(inprm_paddr); + if (status == HYPERCALL_STATUS_SUCCESS) + return 0; + + pause_sbt("hcpmsg", time, 0, C_HARDCLOCK); + if (time < SBT_1S * 2) + time *= 2; + + /* Restore input parameter and try again */ + memcpy(inprm, &mh->mh_inprm_save, HYPERCALL_POSTMSGIN_SIZE); + } + +#undef HC_RETRY_MAX + + return EIO; +} + +int +vmbus_msghc_exec(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) +{ + int error; + + vmbus_xact_activate(mh->mh_xact); + error = vmbus_msghc_exec_noresult(mh); + if (error) + vmbus_xact_deactivate(mh->mh_xact); + return error; +} + +void +vmbus_msghc_exec_cancel(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) +{ + + vmbus_xact_deactivate(mh->mh_xact); +} + +const struct vmbus_message * +vmbus_msghc_wait_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) +{ + size_t resp_len; + + return (vmbus_xact_wait(mh->mh_xact, &resp_len)); +} + +const struct vmbus_message * +vmbus_msghc_poll_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) +{ + size_t resp_len; + + return (vmbus_xact_poll(mh->mh_xact, &resp_len)); +} + +void +vmbus_msghc_wakeup(struct vmbus_softc *sc, const struct vmbus_message *msg) +{ + + vmbus_xact_ctx_wakeup(sc->vmbus_xc, msg, sizeof(*msg)); +} + +uint32_t +vmbus_gpadl_alloc(struct vmbus_softc *sc) +{ + uint32_t gpadl; + +again: + gpadl = atomic_fetchadd_int(&sc->vmbus_gpadl, 1); + if (gpadl == 0) + goto again; + return (gpadl); +} + +/* Used for Hyper-V socket when guest client connects to host */ +int 
+vmbus_req_tl_connect(struct hyperv_guid *guest_srv_id, + struct hyperv_guid *host_srv_id) +{ + struct vmbus_softc *sc = vmbus_get_softc(); + struct vmbus_chanmsg_tl_connect *req; + struct vmbus_msghc *mh; + int error; + + if (!sc) + return ENXIO; + + mh = vmbus_msghc_get(sc, sizeof(*req)); + if (mh == NULL) { + device_printf(sc->vmbus_dev, + "can not get msg hypercall for tl connect\n"); + return ENXIO; + } + + req = vmbus_msghc_dataptr(mh); + req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_TL_CONN; + req->guest_endpoint_id = *guest_srv_id; + req->host_service_id = *host_srv_id; + + error = vmbus_msghc_exec_noresult(mh); + vmbus_msghc_put(sc, mh); + + if (error) { + device_printf(sc->vmbus_dev, + "tl connect msg hypercall failed\n"); + } + + return error; +} + +static int +vmbus_connect(struct vmbus_softc *sc, uint32_t version) +{ + struct vmbus_chanmsg_connect *req; + const struct vmbus_message *msg; + struct vmbus_msghc *mh; + int error, done = 0; + + mh = vmbus_msghc_get(sc, sizeof(*req)); + if (mh == NULL) + return ENXIO; + + req = vmbus_msghc_dataptr(mh); + req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CONNECT; + req->chm_ver = version; + req->chm_evtflags = sc->vmbus_evtflags_dma.hv_paddr; + req->chm_mnf1 = sc->vmbus_mnf1_dma.hv_paddr; + req->chm_mnf2 = sc->vmbus_mnf2_dma.hv_paddr; + + error = vmbus_msghc_exec(sc, mh); + if (error) { + vmbus_msghc_put(sc, mh); + return error; + } + + msg = vmbus_msghc_wait_result(sc, mh); + done = ((const struct vmbus_chanmsg_connect_resp *) + msg->msg_data)->chm_done; + + vmbus_msghc_put(sc, mh); + + return (done ? 
0 : EOPNOTSUPP); +} + +static int +vmbus_init(struct vmbus_softc *sc) +{ + int i; + + for (i = 0; i < nitems(vmbus_version); ++i) { + int error; + + error = vmbus_connect(sc, vmbus_version[i]); + if (!error) { + vmbus_current_version = vmbus_version[i]; + sc->vmbus_version = vmbus_version[i]; + device_printf(sc->vmbus_dev, "version %u.%u\n", + VMBUS_VERSION_MAJOR(sc->vmbus_version), + VMBUS_VERSION_MINOR(sc->vmbus_version)); + return 0; + } + } + return ENXIO; +} + +static void +vmbus_disconnect(struct vmbus_softc *sc) +{ + struct vmbus_chanmsg_disconnect *req; + struct vmbus_msghc *mh; + int error; + + mh = vmbus_msghc_get(sc, sizeof(*req)); + if (mh == NULL) { + device_printf(sc->vmbus_dev, + "can not get msg hypercall for disconnect\n"); + return; + } + + req = vmbus_msghc_dataptr(mh); + req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_DISCONNECT; + + error = vmbus_msghc_exec_noresult(mh); + vmbus_msghc_put(sc, mh); + + if (error) { + device_printf(sc->vmbus_dev, + "disconnect msg hypercall failed\n"); + } +} + +static int +vmbus_req_channels(struct vmbus_softc *sc) +{ + struct vmbus_chanmsg_chrequest *req; + struct vmbus_msghc *mh; + int error; + + mh = vmbus_msghc_get(sc, sizeof(*req)); + if (mh == NULL) + return ENXIO; + + req = vmbus_msghc_dataptr(mh); + req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHREQUEST; + + error = vmbus_msghc_exec_noresult(mh); + vmbus_msghc_put(sc, mh); + + return error; +} + +static void +vmbus_scan_done_task(void *xsc, int pending __unused) +{ + struct vmbus_softc *sc = xsc; + + mtx_lock(&Giant); + sc->vmbus_scandone = true; + mtx_unlock(&Giant); + wakeup(&sc->vmbus_scandone); +} + +static void +vmbus_scan_done(struct vmbus_softc *sc, + const struct vmbus_message *msg __unused) +{ + + taskqueue_enqueue(sc->vmbus_devtq, &sc->vmbus_scandone_task); +} + +static int +vmbus_scan(struct vmbus_softc *sc) +{ + int error; + + /* + * Identify, probe and attach for non-channel devices. 
+ */ + bus_generic_probe(sc->vmbus_dev); + bus_generic_attach(sc->vmbus_dev); + + /* + * This taskqueue serializes vmbus devices' attach and detach + * for channel offer and rescind messages. + */ + sc->vmbus_devtq = taskqueue_create("vmbus dev", M_WAITOK, + taskqueue_thread_enqueue, &sc->vmbus_devtq); + taskqueue_start_threads(&sc->vmbus_devtq, 1, PI_NET, "vmbusdev"); + TASK_INIT(&sc->vmbus_scandone_task, 0, vmbus_scan_done_task, sc); + + /* + * This taskqueue handles sub-channel detach, so that vmbus + * device's detach running in vmbus_devtq can drain its sub- + * channels. + */ + sc->vmbus_subchtq = taskqueue_create("vmbus subch", M_WAITOK, + taskqueue_thread_enqueue, &sc->vmbus_subchtq); + taskqueue_start_threads(&sc->vmbus_subchtq, 1, PI_NET, "vmbussch"); + + /* + * Start vmbus scanning. + */ + error = vmbus_req_channels(sc); + if (error) { + device_printf(sc->vmbus_dev, "channel request failed: %d\n", + error); + return (error); + } + + /* + * Wait for all vmbus devices from the initial channel offers to be + * attached. 
+ */ + GIANT_REQUIRED; + while (!sc->vmbus_scandone) + mtx_sleep(&sc->vmbus_scandone, &Giant, 0, "vmbusdev", 0); + + if (bootverbose) { + device_printf(sc->vmbus_dev, "device scan, probe and attach " + "done\n"); + } + return (0); +} + +static void +vmbus_scan_teardown(struct vmbus_softc *sc) +{ + + GIANT_REQUIRED; + if (sc->vmbus_devtq != NULL) { + mtx_unlock(&Giant); + taskqueue_free(sc->vmbus_devtq); + mtx_lock(&Giant); + sc->vmbus_devtq = NULL; + } + if (sc->vmbus_subchtq != NULL) { + mtx_unlock(&Giant); + taskqueue_free(sc->vmbus_subchtq); + mtx_lock(&Giant); + sc->vmbus_subchtq = NULL; + } +} + +static void +vmbus_chanmsg_handle(struct vmbus_softc *sc, const struct vmbus_message *msg) +{ + vmbus_chanmsg_proc_t msg_proc; + uint32_t msg_type; + + msg_type = ((const struct vmbus_chanmsg_hdr *)msg->msg_data)->chm_type; + if (msg_type >= VMBUS_CHANMSG_TYPE_MAX) { + device_printf(sc->vmbus_dev, "unknown message type 0x%x\n", + msg_type); + return; + } + + msg_proc = vmbus_chanmsg_handlers[msg_type]; + if (msg_proc != NULL) + msg_proc(sc, msg); + + /* Channel specific processing */ + vmbus_chan_msgproc(sc, msg); +} + +static void +vmbus_msg_task(void *xsc, int pending __unused) +{ + struct vmbus_softc *sc = xsc; + volatile struct vmbus_message *msg; + + msg = VMBUS_PCPU_GET(sc, message, curcpu) + VMBUS_SINT_MESSAGE; + for (;;) { + if (msg->msg_type == HYPERV_MSGTYPE_NONE) { + /* No message */ + break; + } else if (msg->msg_type == HYPERV_MSGTYPE_CHANNEL) { + /* Channel message */ + vmbus_chanmsg_handle(sc, + __DEVOLATILE(const struct vmbus_message *, msg)); + } + + msg->msg_type = HYPERV_MSGTYPE_NONE; + /* + * Make sure the write to msg_type (i.e. set to + * HYPERV_MSGTYPE_NONE) happens before we read the + * msg_flags and EOMing. Otherwise, the EOMing will + * not deliver any more messages since there is no + * empty slot + * + * NOTE: + * mb() is used here, since atomic_thread_fence_seq_cst() + * will become compiler fence on UP kernel. 
+ */ + mb(); + if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) { + /* + * This will cause message queue rescan to possibly + * deliver another msg from the hypervisor + */ + wrmsr(MSR_HV_EOM, 0); + } + } +} + +static __inline int +vmbus_handle_intr1(struct vmbus_softc *sc, struct trapframe *frame, int cpu) +{ + volatile struct vmbus_message *msg; + struct vmbus_message *msg_base; + + msg_base = VMBUS_PCPU_GET(sc, message, cpu); + + /* + * Check event timer. + * + * TODO: move this to independent IDT vector. + */ + msg = msg_base + VMBUS_SINT_TIMER; + if (msg->msg_type == HYPERV_MSGTYPE_TIMER_EXPIRED) { + msg->msg_type = HYPERV_MSGTYPE_NONE; + + vmbus_et_intr(frame); + + /* + * Make sure the write to msg_type (i.e. set to + * HYPERV_MSGTYPE_NONE) happens before we read the + * msg_flags and EOMing. Otherwise, the EOMing will + * not deliver any more messages since there is no + * empty slot + * + * NOTE: + * mb() is used here, since atomic_thread_fence_seq_cst() + * will become compiler fence on UP kernel. + */ + mb(); + if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) { + /* + * This will cause message queue rescan to possibly + * deliver another msg from the hypervisor + */ + wrmsr(MSR_HV_EOM, 0); + } + } + + /* + * Check events. Hot path for network and storage I/O data; high rate. + * + * NOTE: + * As recommended by the Windows guest fellows, we check events before + * checking messages. + */ + sc->vmbus_event_proc(sc, cpu); + + /* + * Check messages. Mainly management stuffs; ultra low rate. + */ + msg = msg_base + VMBUS_SINT_MESSAGE; + if (__predict_false(msg->msg_type != HYPERV_MSGTYPE_NONE)) { + taskqueue_enqueue(VMBUS_PCPU_GET(sc, message_tq, cpu), + VMBUS_PCPU_PTR(sc, message_task, cpu)); + } + + return (FILTER_HANDLED); +} + +void +vmbus_handle_intr(struct trapframe *trap_frame) +{ + struct vmbus_softc *sc = vmbus_get_softc(); + int cpu = curcpu; + + /* + * Disable preemption. + */ + critical_enter(); + + /* + * Do a little interrupt counting. 
+ */ + (*VMBUS_PCPU_GET(sc, intr_cnt, cpu))++; + + vmbus_handle_intr1(sc, trap_frame, cpu); + + /* + * Enable preemption. + */ + critical_exit(); +} + +static void +vmbus_synic_setup(void *xsc) +{ + struct vmbus_softc *sc = xsc; + int cpu = curcpu; + uint64_t val, orig; + uint32_t sint; + + if (hyperv_features & CPUID_HV_MSR_VP_INDEX) { + /* Save virtual processor id. */ + VMBUS_PCPU_GET(sc, vcpuid, cpu) = rdmsr(MSR_HV_VP_INDEX); + } else { + /* Set virtual processor id to 0 for compatibility. */ + VMBUS_PCPU_GET(sc, vcpuid, cpu) = 0; + } + + /* + * Setup the SynIC message. + */ + orig = rdmsr(MSR_HV_SIMP); + val = MSR_HV_SIMP_ENABLE | (orig & MSR_HV_SIMP_RSVD_MASK) | + ((VMBUS_PCPU_GET(sc, message_dma.hv_paddr, cpu) >> PAGE_SHIFT) << + MSR_HV_SIMP_PGSHIFT); + wrmsr(MSR_HV_SIMP, val); + + /* + * Setup the SynIC event flags. + */ + orig = rdmsr(MSR_HV_SIEFP); + val = MSR_HV_SIEFP_ENABLE | (orig & MSR_HV_SIEFP_RSVD_MASK) | + ((VMBUS_PCPU_GET(sc, event_flags_dma.hv_paddr, cpu) + >> PAGE_SHIFT) << MSR_HV_SIEFP_PGSHIFT); + wrmsr(MSR_HV_SIEFP, val); + + + /* + * Configure and unmask SINT for message and event flags. + */ + sint = MSR_HV_SINT0 + VMBUS_SINT_MESSAGE; + orig = rdmsr(sint); + val = sc->vmbus_idtvec | MSR_HV_SINT_AUTOEOI | + (orig & MSR_HV_SINT_RSVD_MASK); + wrmsr(sint, val); + + /* + * Configure and unmask SINT for timer. + */ + sint = MSR_HV_SINT0 + VMBUS_SINT_TIMER; + orig = rdmsr(sint); + val = sc->vmbus_idtvec | MSR_HV_SINT_AUTOEOI | + (orig & MSR_HV_SINT_RSVD_MASK); + wrmsr(sint, val); + + /* + * All done; enable SynIC. + */ + orig = rdmsr(MSR_HV_SCONTROL); + val = MSR_HV_SCTRL_ENABLE | (orig & MSR_HV_SCTRL_RSVD_MASK); + wrmsr(MSR_HV_SCONTROL, val); +} + +static void +vmbus_synic_teardown(void *arg) +{ + uint64_t orig; + uint32_t sint; + + /* + * Disable SynIC. + */ + orig = rdmsr(MSR_HV_SCONTROL); + wrmsr(MSR_HV_SCONTROL, (orig & MSR_HV_SCTRL_RSVD_MASK)); + + /* + * Mask message and event flags SINT. 
+ */ + sint = MSR_HV_SINT0 + VMBUS_SINT_MESSAGE; + orig = rdmsr(sint); + wrmsr(sint, orig | MSR_HV_SINT_MASKED); + + /* + * Mask timer SINT. + */ + sint = MSR_HV_SINT0 + VMBUS_SINT_TIMER; + orig = rdmsr(sint); + wrmsr(sint, orig | MSR_HV_SINT_MASKED); + + /* + * Teardown SynIC message. + */ + orig = rdmsr(MSR_HV_SIMP); + wrmsr(MSR_HV_SIMP, (orig & MSR_HV_SIMP_RSVD_MASK)); + + /* + * Teardown SynIC event flags. + */ + orig = rdmsr(MSR_HV_SIEFP); + wrmsr(MSR_HV_SIEFP, (orig & MSR_HV_SIEFP_RSVD_MASK)); +} + +static int +vmbus_dma_alloc(struct vmbus_softc *sc) +{ + bus_dma_tag_t parent_dtag; + uint8_t *evtflags; + int cpu; + + parent_dtag = bus_get_dma_tag(sc->vmbus_dev); + CPU_FOREACH(cpu) { + void *ptr; + + /* + * Per-cpu messages and event flags. + */ + ptr = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, + PAGE_SIZE, VMBUS_PCPU_PTR(sc, message_dma, cpu), + BUS_DMA_WAITOK | BUS_DMA_ZERO); + if (ptr == NULL) + return ENOMEM; + VMBUS_PCPU_GET(sc, message, cpu) = ptr; + + ptr = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, + PAGE_SIZE, VMBUS_PCPU_PTR(sc, event_flags_dma, cpu), + BUS_DMA_WAITOK | BUS_DMA_ZERO); + if (ptr == NULL) + return ENOMEM; + VMBUS_PCPU_GET(sc, event_flags, cpu) = ptr; + } + + evtflags = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, + PAGE_SIZE, &sc->vmbus_evtflags_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); + if (evtflags == NULL) + return ENOMEM; + sc->vmbus_rx_evtflags = (u_long *)evtflags; + sc->vmbus_tx_evtflags = (u_long *)(evtflags + (PAGE_SIZE / 2)); + sc->vmbus_evtflags = evtflags; + + sc->vmbus_mnf1 = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, + PAGE_SIZE, &sc->vmbus_mnf1_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); + if (sc->vmbus_mnf1 == NULL) + return ENOMEM; + + sc->vmbus_mnf2 = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, + sizeof(struct vmbus_mnf), &sc->vmbus_mnf2_dma, + BUS_DMA_WAITOK | BUS_DMA_ZERO); + if (sc->vmbus_mnf2 == NULL) + return ENOMEM; + + return 0; +} + +static void +vmbus_dma_free(struct vmbus_softc *sc) +{ + int cpu; + + 
if (sc->vmbus_evtflags != NULL) { + hyperv_dmamem_free(&sc->vmbus_evtflags_dma, sc->vmbus_evtflags); + sc->vmbus_evtflags = NULL; + sc->vmbus_rx_evtflags = NULL; + sc->vmbus_tx_evtflags = NULL; + } + if (sc->vmbus_mnf1 != NULL) { + hyperv_dmamem_free(&sc->vmbus_mnf1_dma, sc->vmbus_mnf1); + sc->vmbus_mnf1 = NULL; + } + if (sc->vmbus_mnf2 != NULL) { + hyperv_dmamem_free(&sc->vmbus_mnf2_dma, sc->vmbus_mnf2); + sc->vmbus_mnf2 = NULL; + } + + CPU_FOREACH(cpu) { + if (VMBUS_PCPU_GET(sc, message, cpu) != NULL) { + hyperv_dmamem_free( + VMBUS_PCPU_PTR(sc, message_dma, cpu), + VMBUS_PCPU_GET(sc, message, cpu)); + VMBUS_PCPU_GET(sc, message, cpu) = NULL; + } + if (VMBUS_PCPU_GET(sc, event_flags, cpu) != NULL) { + hyperv_dmamem_free( + VMBUS_PCPU_PTR(sc, event_flags_dma, cpu), + VMBUS_PCPU_GET(sc, event_flags, cpu)); + VMBUS_PCPU_GET(sc, event_flags, cpu) = NULL; + } + } +} + +static int +vmbus_intr_setup(struct vmbus_softc *sc) +{ + int cpu; + + CPU_FOREACH(cpu) { + char buf[MAXCOMLEN + 1]; + cpuset_t cpu_mask; + + /* Allocate an interrupt counter for Hyper-V interrupt */ + snprintf(buf, sizeof(buf), "cpu%d:hyperv", cpu); + intrcnt_add(buf, VMBUS_PCPU_PTR(sc, intr_cnt, cpu)); + + /* + * Setup taskqueue to handle events. Task will be per- + * channel. + */ + VMBUS_PCPU_GET(sc, event_tq, cpu) = taskqueue_create_fast( + "hyperv event", M_WAITOK, taskqueue_thread_enqueue, + VMBUS_PCPU_PTR(sc, event_tq, cpu)); + if (vmbus_pin_evttask) { + CPU_SETOF(cpu, &cpu_mask); + taskqueue_start_threads_cpuset( + VMBUS_PCPU_PTR(sc, event_tq, cpu), 1, PI_NET, + &cpu_mask, "hvevent%d", cpu); + } else { + taskqueue_start_threads( + VMBUS_PCPU_PTR(sc, event_tq, cpu), 1, PI_NET, + "hvevent%d", cpu); + } + + /* + * Setup tasks and taskqueues to handle messages. 
+ */ + VMBUS_PCPU_GET(sc, message_tq, cpu) = taskqueue_create_fast( + "hyperv msg", M_WAITOK, taskqueue_thread_enqueue, + VMBUS_PCPU_PTR(sc, message_tq, cpu)); + CPU_SETOF(cpu, &cpu_mask); + taskqueue_start_threads_cpuset( + VMBUS_PCPU_PTR(sc, message_tq, cpu), 1, PI_NET, &cpu_mask, + "hvmsg%d", cpu); + TASK_INIT(VMBUS_PCPU_PTR(sc, message_task, cpu), 0, + vmbus_msg_task, sc); + } + +#if defined(__amd64__) && defined(KLD_MODULE) + pmap_pti_add_kva(VMBUS_ISR_ADDR, VMBUS_ISR_ADDR + PAGE_SIZE, true); +#endif + + /* + * All Hyper-V ISR required resources are setup, now let's find a + * free IDT vector for Hyper-V ISR and set it up. + */ + sc->vmbus_idtvec = lapic_ipi_alloc(pti ? IDTVEC(vmbus_isr_pti) : + IDTVEC(vmbus_isr)); + if (sc->vmbus_idtvec < 0) { +#if defined(__amd64__) && defined(KLD_MODULE) + pmap_pti_remove_kva(VMBUS_ISR_ADDR, VMBUS_ISR_ADDR + PAGE_SIZE); +#endif + device_printf(sc->vmbus_dev, "cannot find free IDT vector\n"); + return ENXIO; + } + if (bootverbose) { + device_printf(sc->vmbus_dev, "vmbus IDT vector %d\n", + sc->vmbus_idtvec); + } + return 0; +} + +static void +vmbus_intr_teardown(struct vmbus_softc *sc) +{ + int cpu; + + if (sc->vmbus_idtvec >= 0) { + lapic_ipi_free(sc->vmbus_idtvec); + sc->vmbus_idtvec = -1; + } + +#if defined(__amd64__) && defined(KLD_MODULE) + pmap_pti_remove_kva(VMBUS_ISR_ADDR, VMBUS_ISR_ADDR + PAGE_SIZE); +#endif + + CPU_FOREACH(cpu) { + if (VMBUS_PCPU_GET(sc, event_tq, cpu) != NULL) { + taskqueue_free(VMBUS_PCPU_GET(sc, event_tq, cpu)); + VMBUS_PCPU_GET(sc, event_tq, cpu) = NULL; + } + if (VMBUS_PCPU_GET(sc, message_tq, cpu) != NULL) { + taskqueue_drain(VMBUS_PCPU_GET(sc, message_tq, cpu), + VMBUS_PCPU_PTR(sc, message_task, cpu)); + taskqueue_free(VMBUS_PCPU_GET(sc, message_tq, cpu)); + VMBUS_PCPU_GET(sc, message_tq, cpu) = NULL; + } + } +} + +static int +vmbus_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) +{ + return (ENOENT); +} + +static int +vmbus_child_pnpinfo(device_t dev, device_t child, 
struct sbuf *sb) +{ + const struct vmbus_channel *chan; + char guidbuf[HYPERV_GUID_STRLEN]; + + chan = vmbus_get_channel(child); + if (chan == NULL) { + /* Event timer device, which does not belong to a channel */ + return (0); + } + + hyperv_guid2str(&chan->ch_guid_type, guidbuf, sizeof(guidbuf)); + sbuf_printf(sb, "classid=%s", guidbuf); + + hyperv_guid2str(&chan->ch_guid_inst, guidbuf, sizeof(guidbuf)); + sbuf_printf(sb, " deviceid=%s", guidbuf); + + return (0); +} + +int +vmbus_add_child(struct vmbus_channel *chan) +{ + struct vmbus_softc *sc = chan->ch_vmbus; + device_t parent = sc->vmbus_dev; + + mtx_lock(&Giant); + + chan->ch_dev = device_add_child(parent, NULL, -1); + if (chan->ch_dev == NULL) { + mtx_unlock(&Giant); + device_printf(parent, "device_add_child for chan%u failed\n", + chan->ch_id); + return (ENXIO); + } + device_set_ivars(chan->ch_dev, chan); + device_probe_and_attach(chan->ch_dev); + + mtx_unlock(&Giant); + return (0); +} + +int +vmbus_delete_child(struct vmbus_channel *chan) +{ + int error = 0; + + mtx_lock(&Giant); + if (chan->ch_dev != NULL) { + error = device_delete_child(chan->ch_vmbus->vmbus_dev, + chan->ch_dev); + chan->ch_dev = NULL; + } + mtx_unlock(&Giant); + return (error); +} + +static int +vmbus_sysctl_version(SYSCTL_HANDLER_ARGS) +{ + struct vmbus_softc *sc = arg1; + char verstr[16]; + + snprintf(verstr, sizeof(verstr), "%u.%u", + VMBUS_VERSION_MAJOR(sc->vmbus_version), + VMBUS_VERSION_MINOR(sc->vmbus_version)); + return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); +} + +/* + * We need the function to make sure the MMIO resource is allocated from the + * ranges found in _CRS. + * + * For the release function, we can use bus_generic_release_resource(). 
+ */ +static struct resource * +vmbus_alloc_resource(device_t dev, device_t child, int type, int *rid, + rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) +{ + device_t parent = device_get_parent(dev); + struct resource *res; + +#ifdef NEW_PCIB + if (type == SYS_RES_MEMORY) { + struct vmbus_softc *sc = device_get_softc(dev); + + res = pcib_host_res_alloc(&sc->vmbus_mmio_res, child, type, + rid, start, end, count, flags); + } else +#endif + { + res = BUS_ALLOC_RESOURCE(parent, child, type, rid, start, + end, count, flags); + } + + return (res); +} + +static int +vmbus_alloc_msi(device_t bus, device_t dev, int count, int maxcount, int *irqs) +{ + + return (PCIB_ALLOC_MSI(device_get_parent(bus), dev, count, maxcount, + irqs)); +} + +static int +vmbus_release_msi(device_t bus, device_t dev, int count, int *irqs) +{ + + return (PCIB_RELEASE_MSI(device_get_parent(bus), dev, count, irqs)); +} + +static int +vmbus_alloc_msix(device_t bus, device_t dev, int *irq) +{ + + return (PCIB_ALLOC_MSIX(device_get_parent(bus), dev, irq)); +} + +static int +vmbus_release_msix(device_t bus, device_t dev, int irq) +{ + + return (PCIB_RELEASE_MSIX(device_get_parent(bus), dev, irq)); +} + +static int +vmbus_map_msi(device_t bus, device_t dev, int irq, uint64_t *addr, + uint32_t *data) +{ + + return (PCIB_MAP_MSI(device_get_parent(bus), dev, irq, addr, data)); +} + +static uint32_t +vmbus_get_version_method(device_t bus, device_t dev) +{ + struct vmbus_softc *sc = device_get_softc(bus); + + return sc->vmbus_version; +} + +static int +vmbus_probe_guid_method(device_t bus, device_t dev, + const struct hyperv_guid *guid) +{ + const struct vmbus_channel *chan = vmbus_get_channel(dev); + + if (memcmp(&chan->ch_guid_type, guid, sizeof(struct hyperv_guid)) == 0) + return 0; + return ENXIO; +} + +static uint32_t +vmbus_get_vcpu_id_method(device_t bus, device_t dev, int cpu) +{ + const struct vmbus_softc *sc = device_get_softc(bus); + + return (VMBUS_PCPU_GET(sc, vcpuid, cpu)); +} + 
+static struct taskqueue * +vmbus_get_eventtq_method(device_t bus, device_t dev __unused, int cpu) +{ + const struct vmbus_softc *sc = device_get_softc(bus); + + KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu%d", cpu)); + return (VMBUS_PCPU_GET(sc, event_tq, cpu)); +} + +#ifdef NEW_PCIB +#define VTPM_BASE_ADDR 0xfed40000 +#define FOUR_GB (1ULL << 32) + +enum parse_pass { parse_64, parse_32 }; + +struct parse_context { + device_t vmbus_dev; + enum parse_pass pass; +}; + +static ACPI_STATUS +parse_crs(ACPI_RESOURCE *res, void *ctx) +{ + const struct parse_context *pc = ctx; + device_t vmbus_dev = pc->vmbus_dev; + + struct vmbus_softc *sc = device_get_softc(vmbus_dev); + UINT64 start, end; + + switch (res->Type) { + case ACPI_RESOURCE_TYPE_ADDRESS32: + start = res->Data.Address32.Address.Minimum; + end = res->Data.Address32.Address.Maximum; + break; + + case ACPI_RESOURCE_TYPE_ADDRESS64: + start = res->Data.Address64.Address.Minimum; + end = res->Data.Address64.Address.Maximum; + break; + + default: + /* Unused types. */ + return (AE_OK); + } + + /* + * We don't use <1MB addresses. + */ + if (end < 0x100000) + return (AE_OK); + + /* Don't conflict with vTPM. 
*/ + if (end >= VTPM_BASE_ADDR && start < VTPM_BASE_ADDR) + end = VTPM_BASE_ADDR - 1; + + if ((pc->pass == parse_32 && start < FOUR_GB) || + (pc->pass == parse_64 && start >= FOUR_GB)) + pcib_host_res_decodes(&sc->vmbus_mmio_res, SYS_RES_MEMORY, + start, end, 0); + + return (AE_OK); +} + +static void +vmbus_get_crs(device_t dev, device_t vmbus_dev, enum parse_pass pass) +{ + struct parse_context pc; + ACPI_STATUS status; + + if (bootverbose) + device_printf(dev, "walking _CRS, pass=%d\n", pass); + + pc.vmbus_dev = vmbus_dev; + pc.pass = pass; + status = AcpiWalkResources(acpi_get_handle(dev), "_CRS", + parse_crs, &pc); + + if (bootverbose && ACPI_FAILURE(status)) + device_printf(dev, "_CRS: not found, pass=%d\n", pass); +} + +static void +vmbus_get_mmio_res_pass(device_t dev, enum parse_pass pass) +{ + device_t acpi0, parent; + + parent = device_get_parent(dev); + + acpi0 = device_get_parent(parent); + if (strcmp("acpi0", device_get_nameunit(acpi0)) == 0) { + device_t *children; + int count; + + /* + * Try to locate VMBUS resources and find _CRS on them. + */ + if (device_get_children(acpi0, &children, &count) == 0) { + int i; + + for (i = 0; i < count; ++i) { + if (!device_is_attached(children[i])) + continue; + + if (strcmp("vmbus_res", + device_get_name(children[i])) == 0) + vmbus_get_crs(children[i], dev, pass); + } + free(children, M_TEMP); + } + + /* + * Try to find _CRS on acpi. + */ + vmbus_get_crs(acpi0, dev, pass); + } else { + device_printf(dev, "not grandchild of acpi\n"); + } + + /* + * Try to find _CRS on parent. + */ + vmbus_get_crs(parent, dev, pass); +} + +static void +vmbus_get_mmio_res(device_t dev) +{ + struct vmbus_softc *sc = device_get_softc(dev); + /* + * We walk the resources twice to make sure that: in the resource + * list, the 32-bit resources appear behind the 64-bit resources. + * NB: resource_list_add() uses INSERT_TAIL. 
This way, when we + * iterate through the list to find a range for a 64-bit BAR in + * vmbus_alloc_resource(), we can make sure we try to use >4GB + * ranges first. + */ + pcib_host_res_init(dev, &sc->vmbus_mmio_res); + + vmbus_get_mmio_res_pass(dev, parse_64); + vmbus_get_mmio_res_pass(dev, parse_32); +} + +/* + * On Gen2 VMs, Hyper-V provides mmio space for framebuffer. + * This mmio address range is not useable for other PCI devices. + * Currently only efifb and vbefb drivers are using this range without + * reserving it from system. + * Therefore, vmbus driver reserves it before any other PCI device + * drivers start to request mmio addresses. + */ +static struct resource *hv_fb_res; + +static void +vmbus_fb_mmio_res(device_t dev) +{ + struct efi_fb *efifb; + struct vbe_fb *vbefb; + rman_res_t fb_start, fb_end, fb_count; + int fb_height, fb_width; + caddr_t kmdp; + + struct vmbus_softc *sc = device_get_softc(dev); + int rid = 0; + + kmdp = preload_search_by_type("elf kernel"); + if (kmdp == NULL) + kmdp = preload_search_by_type("elf64 kernel"); + efifb = (struct efi_fb *)preload_search_info(kmdp, + MODINFO_METADATA | MODINFOMD_EFI_FB); + vbefb = (struct vbe_fb *)preload_search_info(kmdp, + MODINFO_METADATA | MODINFOMD_VBE_FB); + if (efifb != NULL) { + fb_start = efifb->fb_addr; + fb_end = efifb->fb_addr + efifb->fb_size; + fb_count = efifb->fb_size; + fb_height = efifb->fb_height; + fb_width = efifb->fb_width; + } else if (vbefb != NULL) { + fb_start = vbefb->fb_addr; + fb_end = vbefb->fb_addr + vbefb->fb_size; + fb_count = vbefb->fb_size; + fb_height = vbefb->fb_height; + fb_width = vbefb->fb_width; + } else { + if (bootverbose) + device_printf(dev, + "no preloaded kernel fb information\n"); + /* We are on Gen1 VM, just return. 
*/ + return; + } + + if (bootverbose) + device_printf(dev, + "fb: fb_addr: %#jx, size: %#jx, " + "actual size needed: 0x%x\n", + fb_start, fb_count, fb_height * fb_width); + + hv_fb_res = pcib_host_res_alloc(&sc->vmbus_mmio_res, dev, + SYS_RES_MEMORY, &rid, fb_start, fb_end, fb_count, + RF_ACTIVE | rman_make_alignment_flags(PAGE_SIZE)); + + if (hv_fb_res && bootverbose) + device_printf(dev, + "successfully reserved memory for framebuffer " + "starting at %#jx, size %#jx\n", + fb_start, fb_count); +} + +static void +vmbus_free_mmio_res(device_t dev) +{ + struct vmbus_softc *sc = device_get_softc(dev); + + pcib_host_res_free(dev, &sc->vmbus_mmio_res); + + if (hv_fb_res) + hv_fb_res = NULL; +} +#endif /* NEW_PCIB */ + +static void +vmbus_identify(driver_t *driver, device_t parent) +{ + + if (device_get_unit(parent) != 0 || vm_guest != VM_GUEST_HV || + (hyperv_features & CPUID_HV_MSR_SYNIC) == 0) + return; + device_add_child(parent, "vmbus", -1); +} + +static int +vmbus_probe(device_t dev) +{ + + if (device_get_unit(dev) != 0 || vm_guest != VM_GUEST_HV || + (hyperv_features & CPUID_HV_MSR_SYNIC) == 0) + return (ENXIO); + + device_set_desc(dev, "Hyper-V Vmbus"); + return (BUS_PROBE_DEFAULT); +} + +/** + * @brief Main vmbus driver initialization routine. 
+ * + * Here, we + * - initialize the vmbus driver context + * - setup various driver entry points + * - invoke the vmbus hv main init routine + * - get the irq resource + * - invoke the vmbus to add the vmbus root device + * - setup the vmbus root device + * - retrieve the channel offers + */ +static int +vmbus_doattach(struct vmbus_softc *sc) +{ + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + int ret; + + if (sc->vmbus_flags & VMBUS_FLAG_ATTACHED) + return (0); + +#ifdef NEW_PCIB + vmbus_get_mmio_res(sc->vmbus_dev); + vmbus_fb_mmio_res(sc->vmbus_dev); +#endif + + sc->vmbus_flags |= VMBUS_FLAG_ATTACHED; + + sc->vmbus_gpadl = VMBUS_GPADL_START; + mtx_init(&sc->vmbus_prichan_lock, "vmbus prichan", NULL, MTX_DEF); + TAILQ_INIT(&sc->vmbus_prichans); + mtx_init(&sc->vmbus_chan_lock, "vmbus channel", NULL, MTX_DEF); + TAILQ_INIT(&sc->vmbus_chans); + sc->vmbus_chmap = malloc( + sizeof(struct vmbus_channel *) * VMBUS_CHAN_MAX, M_DEVBUF, + M_WAITOK | M_ZERO); + + /* + * Create context for "post message" Hypercalls + */ + sc->vmbus_xc = vmbus_xact_ctx_create(bus_get_dma_tag(sc->vmbus_dev), + HYPERCALL_POSTMSGIN_SIZE, VMBUS_MSG_SIZE, + sizeof(struct vmbus_msghc)); + if (sc->vmbus_xc == NULL) { + ret = ENXIO; + goto cleanup; + } + + /* + * Allocate DMA stuffs. + */ + ret = vmbus_dma_alloc(sc); + if (ret != 0) + goto cleanup; + + /* + * Setup interrupt. + */ + ret = vmbus_intr_setup(sc); + if (ret != 0) + goto cleanup; + + /* + * Setup SynIC. + */ + if (bootverbose) + device_printf(sc->vmbus_dev, "smp_started = %d\n", smp_started); + smp_rendezvous(NULL, vmbus_synic_setup, NULL, sc); + sc->vmbus_flags |= VMBUS_FLAG_SYNIC; + + /* + * Initialize vmbus, e.g. connect to Hypervisor. 
+ */ + ret = vmbus_init(sc); + if (ret != 0) + goto cleanup; + + if (sc->vmbus_version == VMBUS_VERSION_WS2008 || + sc->vmbus_version == VMBUS_VERSION_WIN7) + sc->vmbus_event_proc = vmbus_event_proc_compat; + else + sc->vmbus_event_proc = vmbus_event_proc; + + ret = vmbus_scan(sc); + if (ret != 0) + goto cleanup; + + ctx = device_get_sysctl_ctx(sc->vmbus_dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->vmbus_dev)); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "version", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + vmbus_sysctl_version, "A", "vmbus version"); + + return (ret); + +cleanup: + vmbus_scan_teardown(sc); + vmbus_intr_teardown(sc); + vmbus_dma_free(sc); + if (sc->vmbus_xc != NULL) { + vmbus_xact_ctx_destroy(sc->vmbus_xc); + sc->vmbus_xc = NULL; + } + free(__DEVOLATILE(void *, sc->vmbus_chmap), M_DEVBUF); + mtx_destroy(&sc->vmbus_prichan_lock); + mtx_destroy(&sc->vmbus_chan_lock); + + return (ret); +} + +static void +vmbus_event_proc_dummy(struct vmbus_softc *sc __unused, int cpu __unused) +{ +} + +#ifdef EARLY_AP_STARTUP + +static void +vmbus_intrhook(void *xsc) +{ + struct vmbus_softc *sc = xsc; + + if (bootverbose) + device_printf(sc->vmbus_dev, "intrhook\n"); + vmbus_doattach(sc); + config_intrhook_disestablish(&sc->vmbus_intrhook); +} + +#endif /* EARLY_AP_STARTUP */ + +static int +vmbus_attach(device_t dev) +{ + vmbus_sc = device_get_softc(dev); + vmbus_sc->vmbus_dev = dev; + vmbus_sc->vmbus_idtvec = -1; + + /* + * Event processing logic will be configured: + * - After the vmbus protocol version negotiation. + * - Before we request channel offers. + */ + vmbus_sc->vmbus_event_proc = vmbus_event_proc_dummy; + +#ifdef EARLY_AP_STARTUP + /* + * Defer the real attach until the pause(9) works as expected. 
+ */ + vmbus_sc->vmbus_intrhook.ich_func = vmbus_intrhook; + vmbus_sc->vmbus_intrhook.ich_arg = vmbus_sc; + config_intrhook_establish(&vmbus_sc->vmbus_intrhook); +#else /* !EARLY_AP_STARTUP */ + /* + * If the system has already booted and thread + * scheduling is possible indicated by the global + * cold set to zero, we just call the driver + * initialization directly. + */ + if (!cold) + vmbus_doattach(vmbus_sc); +#endif /* EARLY_AP_STARTUP */ + + return (0); +} + +static int +vmbus_detach(device_t dev) +{ + struct vmbus_softc *sc = device_get_softc(dev); + + bus_generic_detach(dev); + vmbus_chan_destroy_all(sc); + + vmbus_scan_teardown(sc); + + vmbus_disconnect(sc); + + if (sc->vmbus_flags & VMBUS_FLAG_SYNIC) { + sc->vmbus_flags &= ~VMBUS_FLAG_SYNIC; + smp_rendezvous(NULL, vmbus_synic_teardown, NULL, NULL); + } + + vmbus_intr_teardown(sc); + vmbus_dma_free(sc); + + if (sc->vmbus_xc != NULL) { + vmbus_xact_ctx_destroy(sc->vmbus_xc); + sc->vmbus_xc = NULL; + } + + free(__DEVOLATILE(void *, sc->vmbus_chmap), M_DEVBUF); + mtx_destroy(&sc->vmbus_prichan_lock); + mtx_destroy(&sc->vmbus_chan_lock); + +#ifdef NEW_PCIB + vmbus_free_mmio_res(dev); +#endif + + return (0); +} + +#ifndef EARLY_AP_STARTUP + +static void +vmbus_sysinit(void *arg __unused) +{ + struct vmbus_softc *sc = vmbus_get_softc(); + + if (vm_guest != VM_GUEST_HV || sc == NULL) + return; + + /* + * If the system has already booted and thread + * scheduling is possible, as indicated by the + * global cold set to zero, we just call the driver + * initialization directly. + */ + if (!cold) + vmbus_doattach(sc); +} +/* + * NOTE: + * We have to start as the last step of SI_SUB_SMP, i.e. after SMP is + * initialized. 
+ */ +SYSINIT(vmbus_initialize, SI_SUB_SMP, SI_ORDER_ANY, vmbus_sysinit, NULL); + +#endif /* !EARLY_AP_STARTUP */ diff --git a/sys/dev/hyperv/vmbus/vmbus_br.c b/sys/dev/hyperv/vmbus/vmbus_br.c new file mode 100644 index 000000000000..7311f87fd596 --- /dev/null +++ b/sys/dev/hyperv/vmbus/vmbus_br.c @@ -0,0 +1,720 @@ +/*- + * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/sysctl.h> + +#include <dev/hyperv/vmbus/vmbus_reg.h> +#include <dev/hyperv/vmbus/vmbus_brvar.h> + +/* Amount of space available for write */ +#define VMBUS_BR_WAVAIL(r, w, z) \ + (((w) >= (r)) ? ((z) - ((w) - (r))) : ((r) - (w))) + +/* Increase bufing index */ +#define VMBUS_BR_IDXINC(idx, inc, sz) (((idx) + (inc)) % (sz)) + +static int vmbus_br_sysctl_state(SYSCTL_HANDLER_ARGS); +static int vmbus_br_sysctl_state_bin(SYSCTL_HANDLER_ARGS); +static void vmbus_br_setup(struct vmbus_br *, void *, int); + +static int +vmbus_br_sysctl_state(SYSCTL_HANDLER_ARGS) +{ + const struct vmbus_br *br = arg1; + uint32_t rindex, windex, imask, psndsz, fvalue, ravail, wavail; + uint64_t intrcnt; + char state[256]; + + intrcnt = br->vbr_intrcnt; + rindex = br->vbr_rindex; + windex = br->vbr_windex; + imask = br->vbr_imask; + psndsz = br->vbr_psndsz; + fvalue = br->vbr_fvalue; + wavail = VMBUS_BR_WAVAIL(rindex, windex, br->vbr_dsize); + ravail = br->vbr_dsize - wavail; + + snprintf(state, sizeof(state), + "intrcnt:%ju rindex:%u windex:%u imask:%u psndsz:%u fvalue:%u " + "ravail:%u wavail:%u", + (uintmax_t)intrcnt, rindex, windex, imask, psndsz, fvalue, + ravail, wavail); + return sysctl_handle_string(oidp, state, sizeof(state), req); +} + +/* + * Binary bufring states. 
+ */ +static int +vmbus_br_sysctl_state_bin(SYSCTL_HANDLER_ARGS) +{ +#define BR_STATE_RIDX 0 +#define BR_STATE_WIDX 1 +#define BR_STATE_IMSK 2 +#define BR_STATE_PSSZ 3 +#define BR_STATE_FVAL 4 +#define BR_STATE_RSPC 5 +#define BR_STATE_WSPC 6 +#define BR_STATE_MAX 7 + + const struct vmbus_br *br = arg1; + uint32_t rindex, windex, wavail, state[BR_STATE_MAX]; + + rindex = br->vbr_rindex; + windex = br->vbr_windex; + wavail = VMBUS_BR_WAVAIL(rindex, windex, br->vbr_dsize); + + state[BR_STATE_RIDX] = rindex; + state[BR_STATE_WIDX] = windex; + state[BR_STATE_IMSK] = br->vbr_imask; + state[BR_STATE_PSSZ] = br->vbr_psndsz; + state[BR_STATE_FVAL] = br->vbr_fvalue; + state[BR_STATE_WSPC] = wavail; + state[BR_STATE_RSPC] = br->vbr_dsize - wavail; + + return sysctl_handle_opaque(oidp, state, sizeof(state), req); +} + +void +vmbus_br_sysctl_create(struct sysctl_ctx_list *ctx, struct sysctl_oid *br_tree, + struct vmbus_br *br, const char *name) +{ + struct sysctl_oid *tree; + char desc[64]; + + tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(br_tree), OID_AUTO, + name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + if (tree == NULL) + return; + + snprintf(desc, sizeof(desc), "%s state", name); + SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "state", + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, + br, 0, vmbus_br_sysctl_state, "A", desc); + + snprintf(desc, sizeof(desc), "%s binary state", name); + SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "state_bin", + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, + br, 0, vmbus_br_sysctl_state_bin, "IU", desc); +} + +void +vmbus_rxbr_intr_mask(struct vmbus_rxbr *rbr) +{ + rbr->rxbr_imask = 1; + mb(); +} + +static __inline uint32_t +vmbus_rxbr_avail(const struct vmbus_rxbr *rbr) +{ + uint32_t rindex, windex; + + /* Get snapshot */ + rindex = rbr->rxbr_rindex; + windex = rbr->rxbr_windex; + + return (rbr->rxbr_dsize - + VMBUS_BR_WAVAIL(rindex, windex, rbr->rxbr_dsize)); +} + +uint32_t +vmbus_rxbr_available(const struct vmbus_rxbr 
*rbr) +{ + return (vmbus_rxbr_avail(rbr)); +} + +uint32_t +vmbus_rxbr_intr_unmask(struct vmbus_rxbr *rbr) +{ + rbr->rxbr_imask = 0; + mb(); + + /* + * Now check to see if the ring buffer is still empty. + * If it is not, we raced and we need to process new + * incoming channel packets. + */ + return vmbus_rxbr_avail(rbr); +} + +static void +vmbus_br_setup(struct vmbus_br *br, void *buf, int blen) +{ + br->vbr = buf; + br->vbr_dsize = blen - sizeof(struct vmbus_bufring); +} + +void +vmbus_rxbr_init(struct vmbus_rxbr *rbr) +{ + mtx_init(&rbr->rxbr_lock, "vmbus_rxbr", NULL, MTX_SPIN); +} + +void +vmbus_rxbr_deinit(struct vmbus_rxbr *rbr) +{ + mtx_destroy(&rbr->rxbr_lock); +} + +void +vmbus_rxbr_setup(struct vmbus_rxbr *rbr, void *buf, int blen) +{ + vmbus_br_setup(&rbr->rxbr, buf, blen); +} + +static __inline boolean_t +vmbus_rxbr_need_signal(const struct vmbus_rxbr *rbr, uint32_t bytes_read) +{ + uint32_t pending_snd_sz, canwrite_size; + + /* No need to signal if host doesn't want us to */ + if (!rbr->rxbr_fpsndsz) + return false; + + mb(); + + pending_snd_sz = rbr->rxbr_psndsz; + /* No need to signal if host sets pending_snd_sz to 0 */ + if (!pending_snd_sz) + return false; + + mb(); + + canwrite_size = rbr->rxbr_dsize - vmbus_rxbr_avail(rbr); + + /* No need to signal if br already has enough space before read */ + if (canwrite_size - bytes_read > pending_snd_sz) + return false; + + /* + * No need to signal if still doesn't have enough space + * asked by host + */ + if (canwrite_size <= pending_snd_sz) + return false; + + return true; +} + +void +vmbus_txbr_init(struct vmbus_txbr *tbr) +{ + mtx_init(&tbr->txbr_lock, "vmbus_txbr", NULL, MTX_SPIN); +} + +void +vmbus_txbr_deinit(struct vmbus_txbr *tbr) +{ + mtx_destroy(&tbr->txbr_lock); +} + +void +vmbus_txbr_setup(struct vmbus_txbr *tbr, void *buf, int blen) +{ + vmbus_br_setup(&tbr->txbr, buf, blen); + + /* Set feature bit enabling flow control */ + tbr->txbr_fpsndsz = 1; +} + +uint32_t +vmbus_txbr_get_imask(const 
struct vmbus_txbr *tbr) +{ + mb(); + + return(tbr->txbr_imask); +} + +void +vmbus_txbr_set_pending_snd_sz(struct vmbus_txbr *tbr, uint32_t size) +{ + tbr->txbr_psndsz = size; +} + +/* + * When we write to the ring buffer, check if the host needs to be + * signaled. + * + * The contract: + * - The host guarantees that while it is draining the TX bufring, + * it will set the br_imask to indicate it does not need to be + * interrupted when new data are added. + * - The host guarantees that it will completely drain the TX bufring + * before exiting the read loop. Further, once the TX bufring is + * empty, it will clear the br_imask and re-check to see if new + * data have arrived. + */ +static __inline boolean_t +vmbus_txbr_need_signal(const struct vmbus_txbr *tbr, uint32_t old_windex) +{ + mb(); + if (tbr->txbr_imask) + return (FALSE); + + __compiler_membar(); + + /* + * This is the only case we need to signal when the + * ring transitions from being empty to non-empty. + */ + if (old_windex == tbr->txbr_rindex) + return (TRUE); + + return (FALSE); +} + +static __inline uint32_t +vmbus_txbr_avail(const struct vmbus_txbr *tbr) +{ + uint32_t rindex, windex; + + /* Get snapshot */ + rindex = tbr->txbr_rindex; + windex = tbr->txbr_windex; + + return VMBUS_BR_WAVAIL(rindex, windex, tbr->txbr_dsize); +} + +static __inline uint32_t +vmbus_txbr_copyto(const struct vmbus_txbr *tbr, uint32_t windex, + const void *src0, uint32_t cplen) +{ + const uint8_t *src = src0; + uint8_t *br_data = tbr->txbr_data; + uint32_t br_dsize = tbr->txbr_dsize; + + if (cplen > br_dsize - windex) { + uint32_t fraglen = br_dsize - windex; + + /* Wrap-around detected */ + memcpy(br_data + windex, src, fraglen); + memcpy(br_data, src + fraglen, cplen - fraglen); + } else { + memcpy(br_data + windex, src, cplen); + } + return VMBUS_BR_IDXINC(windex, cplen, br_dsize); +} + +static __inline uint32_t +vmbus_txbr_copyto_call(const struct vmbus_txbr *tbr, uint32_t windex, + uint32_t cplen, 
vmbus_br_copy_callback_t cb, void *cbarg, int *ret) +{ + uint8_t *br_data = tbr->txbr_data; + uint32_t br_dsize = tbr->txbr_dsize; + int err = 0; + + if (cplen > br_dsize - windex) { + uint32_t fraglen = br_dsize - windex; + + /* Wrap-around detected */ + err = cb((void *)(br_data + windex), fraglen, cbarg); + if (!err) + err = cb((void *)br_data, cplen - fraglen, cbarg); + } else { + err = cb((void *)(br_data + windex), cplen, cbarg); + } + + *ret = err; + + return VMBUS_BR_IDXINC(windex, cplen, br_dsize); +} + +uint32_t +vmbus_txbr_available(const struct vmbus_txbr *tbr) +{ + return (vmbus_txbr_avail(tbr)); +} + +/* + * NOTE: + * Not holding lock when calling user provided callback routine. + * Caller should hold lock to serialize ring buffer accesses. + */ +int +vmbus_txbr_write_call(struct vmbus_txbr *tbr, + const struct iovec iov[], int iovlen, + vmbus_br_copy_callback_t cb, void *cbarg, + boolean_t *need_sig) +{ + uint32_t old_windex, windex, total; + uint64_t save_windex; + int i; + int cb_ret = 0; + + total = 0; + for (i = 0; i < iovlen; i++) + total += iov[i].iov_len; + total += sizeof(save_windex); + + + /* + * NOTE: + * If this write is going to make br_windex same as br_rindex, + * i.e. the available space for write is same as the write size, + * we can't do it then, since br_windex == br_rindex means that + * the bufring is empty. + */ + if (vmbus_txbr_avail(tbr) <= total) { + return (EAGAIN); + } + + /* Save br_windex for later use */ + old_windex = tbr->txbr_windex; + + /* + * Copy the scattered channel packet to the TX bufring. + */ + windex = old_windex; + for (i = 0; i < iovlen; i++) { + if (iov[i].iov_base != NULL) { + windex = vmbus_txbr_copyto(tbr, windex, + iov[i].iov_base, iov[i].iov_len); + } else if (cb != NULL) { + windex = vmbus_txbr_copyto_call(tbr, windex, + iov[i].iov_len, cb, cbarg, &cb_ret); + /* + * If callback fails, return without updating + * write index. 
+ */ + if (cb_ret) + return (cb_ret); + } + } + + mtx_lock_spin(&tbr->txbr_lock); + + /* + * Set the offset of the current channel packet. + */ + save_windex = ((uint64_t)old_windex) << 32; + windex = vmbus_txbr_copyto(tbr, windex, &save_windex, + sizeof(save_windex)); + + /* + * Update the write index _after_ the channel packet + * is copied. + */ + __compiler_membar(); + tbr->txbr_windex = windex; + + mtx_unlock_spin(&tbr->txbr_lock); + + if (need_sig) + *need_sig = vmbus_txbr_need_signal(tbr, old_windex); + + return (0); +} + +/* + * Write scattered channel packet to TX bufring. + * + * The offset of this channel packet is written as a 64bits value + * immediately after this channel packet. + */ +int +vmbus_txbr_write(struct vmbus_txbr *tbr, const struct iovec iov[], int iovlen, + boolean_t *need_sig) +{ + uint32_t old_windex, windex, total; + uint64_t save_windex; + int i; + + total = 0; + for (i = 0; i < iovlen; i++) + total += iov[i].iov_len; + total += sizeof(save_windex); + + mtx_lock_spin(&tbr->txbr_lock); + + /* + * NOTE: + * If this write is going to make br_windex same as br_rindex, + * i.e. the available space for write is same as the write size, + * we can't do it then, since br_windex == br_rindex means that + * the bufring is empty. + */ + if (vmbus_txbr_avail(tbr) <= total) { + mtx_unlock_spin(&tbr->txbr_lock); + return (EAGAIN); + } + + /* Save br_windex for later use */ + old_windex = tbr->txbr_windex; + + /* + * Copy the scattered channel packet to the TX bufring. + */ + windex = old_windex; + for (i = 0; i < iovlen; i++) { + windex = vmbus_txbr_copyto(tbr, windex, + iov[i].iov_base, iov[i].iov_len); + } + + /* + * Set the offset of the current channel packet. + */ + save_windex = ((uint64_t)old_windex) << 32; + windex = vmbus_txbr_copyto(tbr, windex, &save_windex, + sizeof(save_windex)); + + /* + * Update the write index _after_ the channel packet + * is copied. 
+ */ + __compiler_membar(); + tbr->txbr_windex = windex; + + mtx_unlock_spin(&tbr->txbr_lock); + + *need_sig = vmbus_txbr_need_signal(tbr, old_windex); + + return (0); +} + +static __inline uint32_t +vmbus_rxbr_copyfrom(const struct vmbus_rxbr *rbr, uint32_t rindex, + void *dst0, int cplen) +{ + uint8_t *dst = dst0; + const uint8_t *br_data = rbr->rxbr_data; + uint32_t br_dsize = rbr->rxbr_dsize; + + if (cplen > br_dsize - rindex) { + uint32_t fraglen = br_dsize - rindex; + + /* Wrap-around detected. */ + memcpy(dst, br_data + rindex, fraglen); + memcpy(dst + fraglen, br_data, cplen - fraglen); + } else { + memcpy(dst, br_data + rindex, cplen); + } + return VMBUS_BR_IDXINC(rindex, cplen, br_dsize); +} + +static __inline uint32_t +vmbus_rxbr_copyfrom_call(const struct vmbus_rxbr *rbr, uint32_t rindex, + int cplen, vmbus_br_copy_callback_t cb, void *cbarg) +{ + uint8_t *br_data = rbr->rxbr_data; + uint32_t br_dsize = rbr->rxbr_dsize; + int error = 0; + + if (cplen > br_dsize - rindex) { + uint32_t fraglen = br_dsize - rindex; + + /* Wrap-around detected. */ + error = cb((void *)(br_data + rindex), fraglen, cbarg); + if (!error) + error = cb((void *)br_data, cplen - fraglen, cbarg); + } else { + error = cb((void *)(br_data + rindex), cplen, cbarg); + } + return (error); +} + +int +vmbus_rxbr_peek(struct vmbus_rxbr *rbr, void *data, int dlen) +{ + mtx_lock_spin(&rbr->rxbr_lock); + + /* + * The requested data and the 64bits channel packet + * offset should be there at least. + */ + if (vmbus_rxbr_avail(rbr) < dlen + sizeof(uint64_t)) { + mtx_unlock_spin(&rbr->rxbr_lock); + return (EAGAIN); + } + vmbus_rxbr_copyfrom(rbr, rbr->rxbr_rindex, data, dlen); + + mtx_unlock_spin(&rbr->rxbr_lock); + + return (0); +} + +/* + * NOTE: + * We only hold spin lock to check the ring buffer space. It is + * released before calling user provided callback routine. + * Caller should hold lock to serialize ring buffer accesses. 
+ */ +int +vmbus_rxbr_peek_call(struct vmbus_rxbr *rbr, int dlen, uint32_t skip, + vmbus_br_copy_callback_t cb, void *cbarg) +{ + uint32_t rindex, br_dsize0 = rbr->rxbr_dsize; + int ret; + + mtx_lock_spin(&rbr->rxbr_lock); + /* + * The requested data + skip and the 64bits channel packet + * offset should be there at least. + */ + if (vmbus_rxbr_avail(rbr) < skip + dlen + sizeof(uint64_t)) { + mtx_unlock_spin(&rbr->rxbr_lock); + return (EAGAIN); + } + + rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex, skip, br_dsize0); + mtx_unlock_spin(&rbr->rxbr_lock); + + ret = vmbus_rxbr_copyfrom_call(rbr, rindex, dlen, cb, cbarg); + + return (ret); +} + +/* + * NOTE: + * We assume idx_adv == sizeof(channel packet). + */ +int +vmbus_rxbr_idxadv_peek(struct vmbus_rxbr *rbr, void *data, int dlen, + uint32_t idx_adv, boolean_t *need_sig) +{ + uint32_t rindex, br_dsize = rbr->rxbr_dsize; + + mtx_lock_spin(&rbr->rxbr_lock); + /* + * Make sure it has enough data to read. + */ + if (vmbus_rxbr_avail(rbr) < idx_adv + sizeof(uint64_t) + dlen) { + mtx_unlock_spin(&rbr->rxbr_lock); + return (EAGAIN); + } + + if (idx_adv > 0) { + /* + * Advance the read index first, including the channel's 64bit + * previous write offset. + */ + rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex, + idx_adv + sizeof(uint64_t), br_dsize); + __compiler_membar(); + rbr->rxbr_rindex = rindex; + } + + vmbus_rxbr_copyfrom(rbr, rbr->rxbr_rindex, data, dlen); + + mtx_unlock_spin(&rbr->rxbr_lock); + + if (need_sig) { + if (idx_adv > 0) + *need_sig = + vmbus_rxbr_need_signal(rbr, idx_adv + + sizeof(uint64_t)); + else + *need_sig = false; + } + + return (0); +} + +/* + * NOTE: + * Just update the RX rb index. + */ +int +vmbus_rxbr_idxadv(struct vmbus_rxbr *rbr, uint32_t idx_adv, + boolean_t *need_sig) +{ + uint32_t rindex, br_dsize = rbr->rxbr_dsize; + + mtx_lock_spin(&rbr->rxbr_lock); + /* + * Make sure it has enough space to advance. 
+ */ + if (vmbus_rxbr_avail(rbr) < idx_adv + sizeof(uint64_t)) { + mtx_unlock_spin(&rbr->rxbr_lock); + return (EAGAIN); + } + + /* + * Advance the read index, including the channel's 64bit + * previous write offset. + */ + rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex, + idx_adv + sizeof(uint64_t), br_dsize); + __compiler_membar(); + rbr->rxbr_rindex = rindex; + + mtx_unlock_spin(&rbr->rxbr_lock); + + if (need_sig) { + *need_sig = + vmbus_rxbr_need_signal(rbr, idx_adv + sizeof(uint64_t)); + } + + return (0); +} + +/* + * NOTE: + * We assume (dlen + skip) == sizeof(channel packet). + */ +int +vmbus_rxbr_read(struct vmbus_rxbr *rbr, void *data, int dlen, uint32_t skip) +{ + uint32_t rindex, br_dsize = rbr->rxbr_dsize; + + KASSERT(dlen + skip > 0, ("invalid dlen %d, offset %u", dlen, skip)); + + mtx_lock_spin(&rbr->rxbr_lock); + + if (vmbus_rxbr_avail(rbr) < dlen + skip + sizeof(uint64_t)) { + mtx_unlock_spin(&rbr->rxbr_lock); + return (EAGAIN); + } + + /* + * Copy channel packet from RX bufring. + */ + rindex = VMBUS_BR_IDXINC(rbr->rxbr_rindex, skip, br_dsize); + rindex = vmbus_rxbr_copyfrom(rbr, rindex, data, dlen); + + /* + * Discard this channel packet's 64bits offset, which is useless to us. + */ + rindex = VMBUS_BR_IDXINC(rindex, sizeof(uint64_t), br_dsize); + + /* + * Update the read index _after_ the channel packet is fetched. + */ + __compiler_membar(); + rbr->rxbr_rindex = rindex; + + mtx_unlock_spin(&rbr->rxbr_lock); + + return (0); +} diff --git a/sys/dev/hyperv/vmbus/vmbus_brvar.h b/sys/dev/hyperv/vmbus/vmbus_brvar.h new file mode 100644 index 000000000000..95bf4338ff1c --- /dev/null +++ b/sys/dev/hyperv/vmbus/vmbus_brvar.h @@ -0,0 +1,157 @@ +/*- + * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMBUS_BRVAR_H_
+#define _VMBUS_BRVAR_H_
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/_iovec.h>
+/*
+ * Direction-independent bufring state, embedded in both the RX and
+ * the TX wrapper below.
+ */
+struct vmbus_br {
+	struct vmbus_bufring	*vbr;	/* accessed through the vbr_* macros */
+	uint32_t		vbr_dsize;	/* total data size */
+};
+/*
+ * Field shorthands.  Each name expands to a member reached through the
+ * 'vbr' pointer above, so it must be applied to a struct vmbus_br.
+ */
+#define vbr_windex	vbr->br_windex
+#define vbr_rindex	vbr->br_rindex
+#define vbr_imask	vbr->br_imask
+#define vbr_psndsz	vbr->br_pending_snd_sz
+#define vbr_fpsndsz	vbr->br_feature_bits.feat_pending_snd_sz
+#define vbr_fvalue	vbr->br_feature_bits.value
+#define vbr_intrcnt	vbr->br_g2h_intr_cnt
+#define vbr_data	vbr->br_data
+/* RX bufring: a struct vmbus_br plus the mutex guarding it. */
+struct vmbus_rxbr {
+	struct mtx	rxbr_lock;	/* taken with mtx_lock_spin() by the readers */
+	struct vmbus_br	rxbr;
+};
+/*
+ * RX shorthands: rbr->rxbr_X expands to rbr->rxbr.vbr_X, which (except
+ * for rxbr_dsize, a direct member) expands again via the vbr_* macros.
+ */
+#define rxbr_windex	rxbr.vbr_windex
+#define rxbr_rindex	rxbr.vbr_rindex
+#define rxbr_imask	rxbr.vbr_imask
+#define rxbr_psndsz	rxbr.vbr_psndsz
+#define rxbr_fpsndsz	rxbr.vbr_fpsndsz
+#define rxbr_fvalue	rxbr.vbr_fvalue
+#define rxbr_intrcnt	rxbr.vbr_intrcnt
+#define rxbr_data	rxbr.vbr_data
+#define rxbr_dsize	rxbr.vbr_dsize
+/* TX bufring: same layout as the RX wrapper, with its own mutex. */
+struct vmbus_txbr {
+	struct mtx	txbr_lock;
+	struct vmbus_br	txbr;
+};
+/* TX shorthands, mirroring the rxbr_* set above. */
+#define txbr_windex	txbr.vbr_windex
+#define txbr_rindex	txbr.vbr_rindex
+#define txbr_imask	txbr.vbr_imask
+#define txbr_psndsz	txbr.vbr_psndsz
+#define txbr_fpsndsz	txbr.vbr_fpsndsz
+#define txbr_fvalue	txbr.vbr_fvalue
+#define txbr_intrcnt	txbr.vbr_intrcnt
+#define txbr_data	txbr.vbr_data
+#define txbr_dsize	txbr.vbr_dsize
+/* Forward declarations; only pointers to these are used in this header. */
+struct sysctl_ctx_list;
+struct sysctl_oid;
+/* Largest packet, in bytes, that a single write to 'tbr' can carry. */
+static __inline int
+vmbus_txbr_maxpktsz(const struct vmbus_txbr *tbr)
+{
+
+	/*
+	 * - 64 bits for the trailing start index (- sizeof(uint64_t)).
+	 * - The rindex and windex can't be same (- 1).  See
+	 *   the comment near vmbus_bufring.br_{r,w}index.
+	 */
+	return (tbr->txbr_dsize - sizeof(uint64_t) - 1);
+}
+/* True when the TX bufring's write index equals its read index. */
+static __inline bool
+vmbus_txbr_empty(const struct vmbus_txbr *tbr)
+{
+
+	return (tbr->txbr_windex == tbr->txbr_rindex ? true : false);
+}
+/* True when the RX bufring's write index equals its read index. */
+static __inline bool
+vmbus_rxbr_empty(const struct vmbus_rxbr *rbr)
+{
+
+	return (rbr->rxbr_windex == rbr->rxbr_rindex ? true : false);
+}
+/*
+ * Number of elements of elem_size bytes, each followed by a 64bit
+ * trailing index, that fit in a bufring of br_size bytes once the
+ * struct vmbus_bufring header has been subtracted.
+ */
+static __inline int
+vmbus_br_nelem(int br_size, int elem_size)
+{
+
+	/* Strip bufring header */
+	br_size -= sizeof(struct vmbus_bufring);
+	/* Add per-element trailing index */
+	elem_size += sizeof(uint64_t);
+	return (br_size / elem_size);
+}
+
+void		vmbus_br_sysctl_create(struct sysctl_ctx_list *ctx,
+		    struct sysctl_oid *br_tree, struct vmbus_br *br,
+		    const char *name);
+/*
+ * RX bufring interface.  As their definitions show, read, idxadv,
+ * idxadv_peek and peek_call return EAGAIN when the ring does not hold
+ * the requested number of bytes.
+ */
+void		vmbus_rxbr_init(struct vmbus_rxbr *rbr);
+void		vmbus_rxbr_deinit(struct vmbus_rxbr *rbr);
+void		vmbus_rxbr_setup(struct vmbus_rxbr *rbr, void *buf, int blen);
+int		vmbus_rxbr_peek(struct vmbus_rxbr *rbr, void *data, int dlen);
+int		vmbus_rxbr_read(struct vmbus_rxbr *rbr, void *data, int dlen,
+		    uint32_t skip);
+int		vmbus_rxbr_idxadv(struct vmbus_rxbr *rbr, uint32_t idx_adv,
+		    boolean_t *need_sig);
+int		vmbus_rxbr_idxadv_peek(struct vmbus_rxbr *rbr, void *data,
+		    int dlen, uint32_t idx_adv, boolean_t *need_sig);
+int		vmbus_rxbr_peek_call(struct vmbus_rxbr *rbr, int dlen,
+		    uint32_t skip, vmbus_br_copy_callback_t cb, void *cbarg);
+void		vmbus_rxbr_intr_mask(struct vmbus_rxbr *rbr);
+uint32_t	vmbus_rxbr_intr_unmask(struct vmbus_rxbr *rbr);
+uint32_t	vmbus_rxbr_available(const struct vmbus_rxbr *rbr);
+/* TX bufring interface. */
+void		vmbus_txbr_init(struct vmbus_txbr *tbr);
+void		vmbus_txbr_deinit(struct vmbus_txbr *tbr);
+void		vmbus_txbr_setup(struct vmbus_txbr *tbr, void *buf, int blen);
+int		vmbus_txbr_write(struct vmbus_txbr *tbr,
+		    const struct iovec iov[], int iovlen, boolean_t *need_sig);
+int		vmbus_txbr_write_call(struct vmbus_txbr *tbr,
+		    const struct iovec iov[], int iovlen,
+		    vmbus_br_copy_callback_t cb, void *cbarg,
+		    boolean_t *need_sig);
+uint32_t	vmbus_txbr_available(const struct vmbus_txbr *tbr);
+uint32_t	vmbus_txbr_get_imask(const struct vmbus_txbr *tbr);
+void		vmbus_txbr_set_pending_snd_sz(struct vmbus_txbr *tbr,
+		    uint32_t size);
+
+#endif /* _VMBUS_BRVAR_H_ */ diff --git a/sys/dev/hyperv/vmbus/vmbus_chan.c b/sys/dev/hyperv/vmbus/vmbus_chan.c new file mode 100644 index 000000000000..032e06c47c95 --- /dev/null +++ b/sys/dev/hyperv/vmbus/vmbus_chan.c @@ -0,0 +1,2390 @@ +/*- + * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <machine/atomic.h>
+#include <machine/stdarg.h>
+
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+#include <dev/hyperv/vmbus/hyperv_var.h>
+#include <dev/hyperv/vmbus/vmbus_reg.h>
+#include <dev/hyperv/vmbus/vmbus_var.h>
+#include <dev/hyperv/vmbus/vmbus_brvar.h>
+#include <dev/hyperv/vmbus/vmbus_chanvar.h>
+/*
+ * A channel paired with a polling frequency.
+ * NOTE(review): presumably the argument handed to the polling
+ * configuration task; its users are not part of this excerpt.
+ */
+struct vmbus_chan_pollarg {
+	struct vmbus_channel	*poll_chan;
+	u_int			poll_hz;
+};
+/* Forward declarations of the file-local helpers. */
+static void			vmbus_chan_update_evtflagcnt(
+				    struct vmbus_softc *,
+				    const struct vmbus_channel *);
+static int			vmbus_chan_close_internal(
+				    struct vmbus_channel *);
+static int			vmbus_chan_sysctl_mnf(SYSCTL_HANDLER_ARGS);
+static void			vmbus_chan_sysctl_create(
+				    struct vmbus_channel *);
+static struct vmbus_channel	*vmbus_chan_alloc(struct vmbus_softc *);
+static void			vmbus_chan_free(struct vmbus_channel *);
+static int			vmbus_chan_add(struct vmbus_channel *);
+static void			vmbus_chan_cpu_default(struct vmbus_channel *);
+static int			vmbus_chan_release(struct vmbus_channel *);
+static void			vmbus_chan_set_chmap(struct vmbus_channel *);
+static void			vmbus_chan_clear_chmap(struct vmbus_channel *);
+static void			vmbus_chan_detach(struct vmbus_channel *);
+static bool			vmbus_chan_wait_revoke(
+				    const struct vmbus_channel *, bool);
+static void			vmbus_chan_poll_timeout(void *);
+static bool			vmbus_chan_poll_cancel_intq(
+				    struct vmbus_channel *);
+static void			vmbus_chan_poll_cancel(struct vmbus_channel *);
+/* Channel list insert/remove helpers; each asserts its list's mutex. */
+static void			vmbus_chan_ins_prilist(struct vmbus_softc *,
+				    struct vmbus_channel *);
+static void			vmbus_chan_rem_prilist(struct vmbus_softc *,
+				    struct vmbus_channel *);
+static void			vmbus_chan_ins_list(struct vmbus_softc *,
+				    struct vmbus_channel *);
+static void			vmbus_chan_rem_list(struct vmbus_softc *,
+				    struct vmbus_channel *);
+static void			vmbus_chan_ins_sublist(struct vmbus_channel *,
+				    struct vmbus_channel *);
+static void			vmbus_chan_rem_sublist(struct vmbus_channel *,
+				    struct vmbus_channel *);
+/* Task handlers: (context pointer, pending count). */
+static void			vmbus_chan_task(void *, int);
+static void			vmbus_chan_task_nobatch(void *, int);
+static void			vmbus_chan_poll_task(void *, int);
+static void			vmbus_chan_clrchmap_task(void *, int);
+static void			vmbus_chan_pollcfg_task(void *, int);
+static void			vmbus_chan_polldis_task(void *, int);
+static void			vmbus_chan_poll_cancel_task(void *, int);
+static void			vmbus_prichan_attach_task(void *, int);
+static void			vmbus_subchan_attach_task(void *, int);
+static void			vmbus_prichan_detach_task(void *, int);
+static void			vmbus_subchan_detach_task(void *, int);
+/* Handlers installed in vmbus_chan_msgprocs[] below. */
+static void			vmbus_chan_msgproc_choffer(struct vmbus_softc *,
+				    const struct vmbus_message *);
+static void			vmbus_chan_msgproc_chrescind(
+				    struct vmbus_softc *,
+				    const struct vmbus_message *);
+/* printf-style logging helper; the format string is argument 2. */
+static int			vmbus_chan_printf(const struct vmbus_channel *,
+				    const char *, ...) __printflike(2, 3);
+
+/*
+ * Vmbus channel message processing.
+ *
+ * The table has VMBUS_CHANMSG_TYPE_MAX slots.  CHOFFER and CHRESCIND
+ * get the dedicated handlers declared above; the three response types
+ * are entered through VMBUS_CHANMSG_PROC_WAKEUP.
+ * NOTE(review): both macros are defined outside this excerpt;
+ * presumably they place each entry at its message type's index.
+ */
+static const vmbus_chanmsg_proc_t
+vmbus_chan_msgprocs[VMBUS_CHANMSG_TYPE_MAX] = {
+	VMBUS_CHANMSG_PROC(CHOFFER,	vmbus_chan_msgproc_choffer),
+	VMBUS_CHANMSG_PROC(CHRESCIND,	vmbus_chan_msgproc_chrescind),
+
+	VMBUS_CHANMSG_PROC_WAKEUP(CHOPEN_RESP),
+	VMBUS_CHANMSG_PROC_WAKEUP(GPADL_CONNRESP),
+	VMBUS_CHANMSG_PROC_WAKEUP(GPADL_DISCONNRESP)
+};
+
+/*
+ * Notify host that there are data pending on our TX bufring or
+ * we have put some data on the TX bufring.
+ */
+static __inline void
+vmbus_chan_signal(const struct vmbus_channel *chan)
+{
+	atomic_set_long(chan->ch_evtflag, chan->ch_evtflag_mask);
+	if (chan->ch_txflags & VMBUS_CHAN_TXF_HASMNF)
+		atomic_set_int(chan->ch_montrig, chan->ch_montrig_mask);
+	else
+		hypercall_signal_event(chan->ch_monprm_dma.hv_paddr);
+}
+/* Count the signal in the TX bufring's intrcnt field, then notify the host. */
+static __inline void
+vmbus_chan_signal_tx(struct vmbus_channel *chan)
+{
+	chan->ch_txbr.txbr_intrcnt ++;
+
+	vmbus_chan_signal(chan);
+}
+/* Count the signal in the RX bufring's intrcnt field, then notify the host. */
+static __inline void
+vmbus_chan_signal_rx(struct vmbus_channel *chan)
+{
+	chan->ch_rxbr.rxbr_intrcnt ++;
+
+	vmbus_chan_signal(chan);
+}
+/*
+ * Append 'chan' to sc->vmbus_prichans.  The caller must hold
+ * vmbus_prichan_lock; panics if the ONPRIL state bit is already set.
+ */
+static void
+vmbus_chan_ins_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan)
+{
+
+	mtx_assert(&sc->vmbus_prichan_lock, MA_OWNED);
+	if (atomic_testandset_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_ONPRIL_SHIFT))
+		panic("channel is already on the prilist");
+	TAILQ_INSERT_TAIL(&sc->vmbus_prichans, chan, ch_prilink);
+}
+/*
+ * Remove 'chan' from sc->vmbus_prichans.  The caller must hold
+ * vmbus_prichan_lock; panics if the ONPRIL state bit was not set.
+ */
+static void
+vmbus_chan_rem_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan)
+{
+
+	mtx_assert(&sc->vmbus_prichan_lock, MA_OWNED);
+	if (atomic_testandclear_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_ONPRIL_SHIFT) == 0)
+		panic("channel is not on the prilist");
+	TAILQ_REMOVE(&sc->vmbus_prichans, chan, ch_prilink);
+}
+/*
+ * Append 'chan' to prichan's sub-channel list and bump the count.
+ * The caller must hold prichan->ch_subchan_lock; panics if the ONSUBL
+ * state bit is already set.
+ */
+static void
+vmbus_chan_ins_sublist(struct vmbus_channel *prichan,
+    struct vmbus_channel *chan)
+{
+
+	mtx_assert(&prichan->ch_subchan_lock, MA_OWNED);
+
+	if (atomic_testandset_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_ONSUBL_SHIFT))
+		panic("channel is already on the sublist");
+	TAILQ_INSERT_TAIL(&prichan->ch_subchans, chan, ch_sublink);
+
+	/* Bump sub-channel count. */
+	prichan->ch_subchan_cnt++;
+}
+/*
+ * Remove 'chan' from prichan's sub-channel list and drop the count.
+ * The caller must hold prichan->ch_subchan_lock; panics if the ONSUBL
+ * state bit was not set.
+ */
+static void
+vmbus_chan_rem_sublist(struct vmbus_channel *prichan,
+    struct vmbus_channel *chan)
+{
+
+	mtx_assert(&prichan->ch_subchan_lock, MA_OWNED);
+
+	KASSERT(prichan->ch_subchan_cnt > 0,
+	    ("invalid subchan_cnt %d", prichan->ch_subchan_cnt));
+	prichan->ch_subchan_cnt--;
+
+	if (atomic_testandclear_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_ONSUBL_SHIFT) == 0)
+		panic("channel is not on the sublist");
+	TAILQ_REMOVE(&prichan->ch_subchans, chan, ch_sublink);
+}
+/*
+ * Append 'chan' to sc->vmbus_chans.  The caller must hold
+ * vmbus_chan_lock; panics if the ONLIST state bit is already set.
+ */
+static void
+vmbus_chan_ins_list(struct vmbus_softc *sc, struct vmbus_channel *chan)
+{
+
+	mtx_assert(&sc->vmbus_chan_lock, MA_OWNED);
+	if (atomic_testandset_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_ONLIST_SHIFT))
+		panic("channel is already on the list");
+	TAILQ_INSERT_TAIL(&sc->vmbus_chans, chan, ch_link);
+}
+/*
+ * Remove 'chan' from sc->vmbus_chans.  The caller must hold
+ * vmbus_chan_lock; panics if the ONLIST state bit was not set.
+ */
+static void
+vmbus_chan_rem_list(struct vmbus_softc *sc, struct vmbus_channel *chan)
+{
+
+	mtx_assert(&sc->vmbus_chan_lock, MA_OWNED);
+	if (atomic_testandclear_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_ONLIST_SHIFT) == 0)
+		panic("channel is not on the list");
+	TAILQ_REMOVE(&sc->vmbus_chans, chan, ch_link);
+}
+/*
+ * Sysctl handler: reports 1 if VMBUS_CHAN_TXF_HASMNF is set in the
+ * channel's ch_txflags, 0 otherwise.  arg1 is the channel.
+ */
+static int
+vmbus_chan_sysctl_mnf(SYSCTL_HANDLER_ARGS)
+{
+	struct vmbus_channel *chan = arg1;
+	int mnf = 0;
+
+	if (chan->ch_txflags & VMBUS_CHAN_TXF_HASMNF)
+		mnf = 1;
+	return sysctl_handle_int(oidp, &mnf, 0, req);
+}
+/*
+ * Create this channel's sysctl nodes below its device's sysctl tree.
+ * Every node is registered in chan->ch_sysctl_ctx.  The function
+ * returns early, without reporting an error, when a node cannot be
+ * created.
+ */
+static void
+vmbus_chan_sysctl_create(struct vmbus_channel *chan)
+{
+	struct sysctl_oid *ch_tree, *chid_tree, *br_tree;
+	struct sysctl_ctx_list *ctx;
+	uint32_t ch_id;
+	char name[16];
+
+	/*
+	 * Add sysctl nodes related to this channel to this
+	 * channel's sysctl ctx, so that they can be destroyed
+	 * independently upon close of this channel, which can
+	 * happen even if the device is not detached.
+	 */
+	ctx = &chan->ch_sysctl_ctx;
+	sysctl_ctx_init(ctx);
+
+	/*
+	 * Create dev.NAME.UNIT.channel tree.
+	 */
+	ch_tree = SYSCTL_ADD_NODE(ctx,
+	    SYSCTL_CHILDREN(device_get_sysctl_tree(chan->ch_dev)),
+	    OID_AUTO, "channel", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+	if (ch_tree == NULL)
+		return;
+
+	/*
+	 * Create dev.NAME.UNIT.channel.CHANID tree.
+	 *
+	 * A sub-channel is filed under its primary channel's id.
+	 */
+	if (VMBUS_CHAN_ISPRIMARY(chan))
+		ch_id = chan->ch_id;
+	else
+		ch_id = chan->ch_prichan->ch_id;
+	snprintf(name, sizeof(name), "%d", ch_id);
+	chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree),
+	    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+	if (chid_tree == NULL)
+		return;
+
+	if (!VMBUS_CHAN_ISPRIMARY(chan)) {
+		/*
+		 * Create dev.NAME.UNIT.channel.CHANID.sub tree.
+		 */
+		ch_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree),
+		    OID_AUTO, "sub", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+		if (ch_tree == NULL)
+			return;
+
+		/*
+		 * Create dev.NAME.UNIT.channel.CHANID.sub.SUBIDX tree.
+		 *
+		 * NOTE:
+		 * chid_tree is changed to this new sysctl tree.
+		 */
+		snprintf(name, sizeof(name), "%d", chan->ch_subidx);
+		chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree),
+		    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+		if (chid_tree == NULL)
+			return;
+
+		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
+		    "chanid", CTLFLAG_RD, &chan->ch_id, 0, "channel id");
+	}
+
+	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
+	    "cpu", CTLFLAG_RD, &chan->ch_cpuid, 0, "owner CPU id");
+	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
+	    "mnf", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
+	    chan, 0, vmbus_chan_sysctl_mnf, "I",
+	    "has monitor notification facilities");
+
+	br_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
+	    "br", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
+	if (br_tree != NULL) {
+		/*
+		 * Create sysctl tree for RX bufring.
+		 */
+		vmbus_br_sysctl_create(ctx, br_tree, &chan->ch_rxbr.rxbr, "rx");
+		/*
+		 * Create sysctl tree for TX bufring.
+		 */
+		vmbus_br_sysctl_create(ctx, br_tree, &chan->ch_txbr.txbr, "tx");
+	}
+}
+/*
+ * Allocate txbr_size + rxbr_size bytes of DMA memory for the bufrings
+ * and open 'chan' on it through vmbus_chan_open_br().
+ *
+ * Returns 0 on success, ENOMEM if the allocation fails, or the error
+ * from vmbus_chan_open_br().  On an open error the memory is freed,
+ * except for EISCONN, where it is deliberately leaked (see below).
+ */
+int
+vmbus_chan_open(struct vmbus_channel *chan, int txbr_size, int rxbr_size,
+    const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg)
+{
+	struct vmbus_chan_br cbr;
+	int error;
+
+	/*
+	 * Allocate the TX+RX bufrings.
+	 */
+	KASSERT(chan->ch_bufring == NULL, ("bufrings are allocated"));
+	chan->ch_bufring = hyperv_dmamem_alloc(bus_get_dma_tag(chan->ch_dev),
+	    PAGE_SIZE, 0, txbr_size + rxbr_size, &chan->ch_bufring_dma,
+	    BUS_DMA_WAITOK);
+	if (chan->ch_bufring == NULL) {
+		vmbus_chan_printf(chan, "bufring allocation failed\n");
+		return (ENOMEM);
+	}
+
+	cbr.cbr = chan->ch_bufring;
+	cbr.cbr_paddr = chan->ch_bufring_dma.hv_paddr;
+	cbr.cbr_txsz = txbr_size;
+	cbr.cbr_rxsz = rxbr_size;
+
+	error = vmbus_chan_open_br(chan, &cbr, udata, udlen, cb, cbarg);
+	if (error) {
+		if (error == EISCONN) {
+			/*
+			 * XXX
+			 * The bufring GPADL is still connected; abandon
+			 * this bufring, instead of having mysterious
+			 * crash or trashed data later on.
+			 */
+			vmbus_chan_printf(chan, "chan%u bufring GPADL "
+			    "is still connected upon channel open error; "
+			    "leak %d bytes memory\n", chan->ch_id,
+			    txbr_size + rxbr_size);
+		} else {
+			hyperv_dmamem_free(&chan->ch_bufring_dma,
+			    chan->ch_bufring);
+		}
+		chan->ch_bufring = NULL;
+	}
+	return (error);
+}
+/*
+ * Open 'chan' on the caller-supplied bufring memory described by
+ * 'cbr'; the TX bufring comes first and the RX bufring follows it.
+ * udata/udlen is optional data copied into the CHOPEN request.
+ * cb/cbarg are stored in ch_cb/ch_cbarg.
+ *
+ * Returns 0 on success; EINVAL if udlen exceeds
+ * VMBUS_CHANMSG_CHOPEN_UDATA_SIZE; EISCONN if the open failed and the
+ * bufring GPADL could not be disconnected during cleanup; otherwise
+ * ENXIO or the error of the failing step.  Panics if the channel is
+ * already marked opened.
+ */
+int
+vmbus_chan_open_br(struct vmbus_channel *chan, const struct vmbus_chan_br *cbr,
+    const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg)
+{
+	struct vmbus_softc *sc = chan->ch_vmbus;
+	const struct vmbus_message *msg;
+	struct vmbus_chanmsg_chopen *req;
+	struct vmbus_msghc *mh;
+	uint32_t status;
+	int error, txbr_size, rxbr_size;
+	task_fn_t *task_fn;
+	uint8_t *br;
+
+	if (udlen > VMBUS_CHANMSG_CHOPEN_UDATA_SIZE) {
+		vmbus_chan_printf(chan,
+		    "invalid udata len %d for chan%u\n", udlen, chan->ch_id);
+		return (EINVAL);
+	}
+
+	br = cbr->cbr;
+	txbr_size = cbr->cbr_txsz;
+	rxbr_size = cbr->cbr_rxsz;
+	KASSERT((txbr_size & PAGE_MASK) == 0,
+	    ("send bufring size is not multiple page"));
+	KASSERT((rxbr_size & PAGE_MASK) == 0,
+	    ("recv bufring size is not multiple page"));
+	KASSERT((cbr->cbr_paddr & PAGE_MASK) == 0,
+	    ("bufring is not page aligned"));
+
+	/*
+	 * Zero out the TX/RX bufrings, in case that they were used before.
+	 */
+	memset(br, 0, txbr_size + rxbr_size);
+
+	if (atomic_testandset_int(&chan->ch_stflags,
+	    VMBUS_CHAN_ST_OPENED_SHIFT))
+		panic("double-open chan%u", chan->ch_id);
+
+	chan->ch_cb = cb;
+	chan->ch_cbarg = cbarg;
+
+	vmbus_chan_update_evtflagcnt(sc, chan);
+	/* Bind the channel task to the event taskqueue of the channel's CPU. */
+	chan->ch_tq = VMBUS_PCPU_GET(chan->ch_vmbus, event_tq, chan->ch_cpuid);
+	if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD)
+		task_fn = vmbus_chan_task;
+	else
+		task_fn = vmbus_chan_task_nobatch;
+	TASK_INIT(&chan->ch_task, 0, task_fn, chan);
+
+	/* TX bufring comes first */
+	vmbus_txbr_setup(&chan->ch_txbr, br, txbr_size);
+	/* RX bufring immediately follows TX bufring */
+	vmbus_rxbr_setup(&chan->ch_rxbr, br + txbr_size, rxbr_size);
+
+	/* Create sysctl tree for this channel */
+	vmbus_chan_sysctl_create(chan);
+
+	/*
+	 * Connect the bufrings, both RX and TX, to this channel.
+	 */
+	error = vmbus_chan_gpadl_connect(chan, cbr->cbr_paddr,
+	    txbr_size + rxbr_size, &chan->ch_bufring_gpadl);
+	if (error) {
+		vmbus_chan_printf(chan,
+		    "failed to connect bufring GPADL to chan%u\n", chan->ch_id);
+		goto failed;
+	}
+
+	/*
+	 * Install this channel, before it is opened, but after everything
+	 * else has been setup.
+	 */
+	vmbus_chan_set_chmap(chan);
+
+	/*
+	 * Open channel w/ the bufring GPADL on the target CPU.
+	 */
+	mh = vmbus_msghc_get(sc, sizeof(*req));
+	if (mh == NULL) {
+		vmbus_chan_printf(chan,
+		    "can not get msg hypercall for chopen(chan%u)\n",
+		    chan->ch_id);
+		error = ENXIO;
+		goto failed;
+	}
+
+	req = vmbus_msghc_dataptr(mh);
+	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHOPEN;
+	req->chm_chanid = chan->ch_id;
+	req->chm_openid = chan->ch_id;
+	req->chm_gpadl = chan->ch_bufring_gpadl;
+	req->chm_vcpuid = chan->ch_vcpuid;
+	req->chm_txbr_pgcnt = txbr_size >> PAGE_SHIFT;
+	if (udlen > 0)
+		memcpy(req->chm_udata, udata, udlen);
+
+	error = vmbus_msghc_exec(sc, mh);
+	if (error) {
+		vmbus_chan_printf(chan,
+		    "chopen(chan%u) msg hypercall exec failed: %d\n",
+		    chan->ch_id, error);
+		vmbus_msghc_put(sc, mh);
+		goto failed;
+	}
+	/* Poll for the response, giving up if the channel gets revoked. */
+	for (;;) {
+		msg = vmbus_msghc_poll_result(sc, mh);
+		if (msg != NULL)
+			break;
+		if (vmbus_chan_is_revoked(chan)) {
+			int i;
+
+			/*
+			 * NOTE:
+			 * Hypervisor does _not_ send response CHOPEN to
+			 * a revoked channel.
+			 */
+			vmbus_chan_printf(chan,
+			    "chan%u is revoked, when it is being opened\n",
+			    chan->ch_id);
+
+			/*
+			 * XXX
+			 * Add extra delay before cancel the hypercall
+			 * execution; mainly to close any possible
+			 * CHRESCIND and CHOPEN_RESP races on the
+			 * hypervisor side.
+			 */
+#define REVOKE_LINGER	100
+			for (i = 0; i < REVOKE_LINGER; ++i) {
+				msg = vmbus_msghc_poll_result(sc, mh);
+				if (msg != NULL)
+					break;
+				pause("rchopen", 1);
+			}
+#undef REVOKE_LINGER
+			if (msg == NULL)
+				vmbus_msghc_exec_cancel(sc, mh);
+			break;
+		}
+		pause("chopen", 1);
+	}
+	if (msg != NULL) {
+		status = ((const struct vmbus_chanmsg_chopen_resp *)
+		    msg->msg_data)->chm_status;
+	} else {
+		/* XXX any non-0 value is ok here. */
+		status = 0xff;
+	}
+
+	vmbus_msghc_put(sc, mh);
+
+	if (status == 0) {
+		if (bootverbose)
+			vmbus_chan_printf(chan, "chan%u opened\n", chan->ch_id);
+		return (0);
+	}
+
+	vmbus_chan_printf(chan, "failed to open chan%u\n", chan->ch_id);
+	error = ENXIO;
+
+failed:
+	/* Undo, in order: sysctl nodes, chmap slot, bufring GPADL, OPENED bit. */
+	sysctl_ctx_free(&chan->ch_sysctl_ctx);
+	vmbus_chan_clear_chmap(chan);
+	if (chan->ch_bufring_gpadl != 0) {
+		int error1;
+
+		error1 = vmbus_chan_gpadl_disconnect(chan,
+		    chan->ch_bufring_gpadl);
+		if (error1) {
+			/*
+			 * Give caller a hint that the bufring GPADL is still
+			 * connected.
+			 */
+			error = EISCONN;
+		}
+		chan->ch_bufring_gpadl = 0;
+	}
+	atomic_clear_int(&chan->ch_stflags, VMBUS_CHAN_ST_OPENED);
+	return (error);
+}
+/*
+ * Allocate a GPADL id and connect the 'size' bytes starting at
+ * physical address 'paddr' to 'chan'.  Page ids sent to the host are
+ * consecutive, starting at paddr >> PAGE_SHIFT.  *gpadl0 must be 0 on
+ * entry and receives the new id only on success.
+ *
+ * Returns 0 on success; EOPNOTSUPP if the single GPA range descriptor
+ * would exceed UINT16_MAX bytes; EIO if no message hypercall could be
+ * obtained or the host reported a non-zero status; otherwise the
+ * vmbus_msghc_exec() error.
+ */
+int
+vmbus_chan_gpadl_connect(struct vmbus_channel *chan, bus_addr_t paddr,
+    int size, uint32_t *gpadl0)
+{
+	struct vmbus_softc *sc = chan->ch_vmbus;
+	struct vmbus_msghc *mh;
+	struct vmbus_chanmsg_gpadl_conn *req;
+	const struct vmbus_message *msg;
+	size_t reqsz;
+	uint32_t gpadl, status;
+	int page_count, range_len, i, cnt, error;
+	uint64_t page_id;
+
+	KASSERT(*gpadl0 == 0, ("GPADL is not zero"));
+
+	/*
+	 * Preliminary checks.
+	 */
+
+	KASSERT((size & PAGE_MASK) == 0,
+	    ("invalid GPA size %d, not multiple page size", size));
+	page_count = size >> PAGE_SHIFT;
+
+	KASSERT((paddr & PAGE_MASK) == 0,
+	    ("GPA is not page aligned %jx", (uintmax_t)paddr));
+	page_id = paddr >> PAGE_SHIFT;
+
+	range_len = __offsetof(struct vmbus_gpa_range, gpa_page[page_count]);
+	/*
+	 * We don't support multiple GPA ranges.
+	 */
+	if (range_len > UINT16_MAX) {
+		vmbus_chan_printf(chan, "GPA too large, %d pages\n",
+		    page_count);
+		return EOPNOTSUPP;
+	}
+
+	/*
+	 * Allocate GPADL id.
+	 */
+	gpadl = vmbus_gpadl_alloc(sc);
+
+	/*
+	 * Connect this GPADL to the target channel.
+	 *
+	 * NOTE:
+	 * Since each message can only hold small set of page
+	 * addresses, several messages may be required to
+	 * complete the connection.
+	 */
+	if (page_count > VMBUS_CHANMSG_GPADL_CONN_PGMAX)
+		cnt = VMBUS_CHANMSG_GPADL_CONN_PGMAX;
+	else
+		cnt = page_count;
+	page_count -= cnt;
+
+	reqsz = __offsetof(struct vmbus_chanmsg_gpadl_conn,
+	    chm_range.gpa_page[cnt]);
+	mh = vmbus_msghc_get(sc, reqsz);
+	if (mh == NULL) {
+		vmbus_chan_printf(chan,
+		    "can not get msg hypercall for gpadl_conn(chan%u)\n",
+		    chan->ch_id);
+		return EIO;
+	}
+
+	req = vmbus_msghc_dataptr(mh);
+	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_CONN;
+	req->chm_chanid = chan->ch_id;
+	req->chm_gpadl = gpadl;
+	req->chm_range_len = range_len;
+	req->chm_range_cnt = 1;
+	req->chm_range.gpa_len = size;
+	req->chm_range.gpa_ofs = 0;
+	for (i = 0; i < cnt; ++i)
+		req->chm_range.gpa_page[i] = page_id++;
+
+	error = vmbus_msghc_exec(sc, mh);
+	if (error) {
+		vmbus_chan_printf(chan,
+		    "gpadl_conn(chan%u) msg hypercall exec failed: %d\n",
+		    chan->ch_id, error);
+		vmbus_msghc_put(sc, mh);
+		return error;
+	}
+	/* Send the remaining page ids in GPADL_SUBCONN messages. */
+	while (page_count > 0) {
+		struct vmbus_chanmsg_gpadl_subconn *subreq;
+
+		if (page_count > VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX)
+			cnt = VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX;
+		else
+			cnt = page_count;
+		page_count -= cnt;
+
+		reqsz = __offsetof(struct vmbus_chanmsg_gpadl_subconn,
+		    chm_gpa_page[cnt]);
+		vmbus_msghc_reset(mh, reqsz);
+
+		subreq = vmbus_msghc_dataptr(mh);
+		subreq->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_SUBCONN;
+		subreq->chm_gpadl = gpadl;
+		for (i = 0; i < cnt; ++i)
+			subreq->chm_gpa_page[i] = page_id++;
+
+		vmbus_msghc_exec_noresult(mh);
+	}
+	KASSERT(page_count == 0, ("invalid page count %d", page_count));
+
+	msg = vmbus_msghc_wait_result(sc, mh);
+	status = ((const struct vmbus_chanmsg_gpadl_connresp *)
+	    msg->msg_data)->chm_status;
+
+	vmbus_msghc_put(sc, mh);
+
+	if (status != 0) {
+		vmbus_chan_printf(chan, "gpadl_conn(chan%u) failed: %u\n",
+		    chan->ch_id, status);
+		return EIO;
+	}
+
+	/* Done; commit the GPADL id. */
+	*gpadl0 = gpadl;
+	if (bootverbose) {
+		vmbus_chan_printf(chan, "gpadl_conn(chan%u) succeeded\n",
+		    chan->ch_id);
+	}
+	return 0;
+}
+/*
+ * Poll vmbus_chan_is_revoked() up to WAIT_COUNT times.  Between polls
+ * the function calls pause() if can_sleep is true and DELAY(1000)
+ * otherwise.  Returns true as soon as the channel is seen revoked,
+ * false if it never is.
+ */
+static bool
+vmbus_chan_wait_revoke(const struct vmbus_channel *chan, bool can_sleep)
+{
+#define WAIT_COUNT	200	/* 200ms */
+
+	int i;
+
+	for (i = 0; i < WAIT_COUNT; ++i) {
+		if (vmbus_chan_is_revoked(chan))
+			return (true);
+		if (can_sleep)
+			pause("wchrev", 1);
+		else
+			DELAY(1000);
+	}
+	return (false);
+
+#undef WAIT_COUNT
+}
+
+/*
+ * Disconnect the GPA from the target channel
+ *
+ * Returns 0 on success, and also when the request could not be
+ * executed but the channel turned out to be revoked.  Returns EBUSY
+ * if no message hypercall could be obtained; otherwise the
+ * vmbus_msghc_exec() error.  'gpadl' must not be 0.
+ */
+int
+vmbus_chan_gpadl_disconnect(struct vmbus_channel *chan, uint32_t gpadl)
+{
+	struct vmbus_softc *sc = chan->ch_vmbus;
+	struct vmbus_msghc *mh;
+	struct vmbus_chanmsg_gpadl_disconn *req;
+	int error;
+
+	KASSERT(gpadl != 0, ("GPADL is zero"));
+
+	mh = vmbus_msghc_get(sc, sizeof(*req));
+	if (mh == NULL) {
+		vmbus_chan_printf(chan,
+		    "can not get msg hypercall for gpadl_disconn(chan%u)\n",
+		    chan->ch_id);
+		return (EBUSY);
+	}
+
+	req = vmbus_msghc_dataptr(mh);
+	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_DISCONN;
+	req->chm_chanid = chan->ch_id;
+	req->chm_gpadl = gpadl;
+
+	error = vmbus_msghc_exec(sc, mh);
+	if (error) {
+		vmbus_msghc_put(sc, mh);
+
+		if (vmbus_chan_wait_revoke(chan, true)) {
+			/*
+			 * Error is benign; this channel is revoked,
+			 * so this GPADL will not be touched anymore.
+			 */
+			vmbus_chan_printf(chan,
+			    "gpadl_disconn(revoked chan%u) msg hypercall "
+			    "exec failed: %d\n", chan->ch_id, error);
+			return (0);
+		}
+		vmbus_chan_printf(chan,
+		    "gpadl_disconn(chan%u) msg hypercall exec failed: %d\n",
+		    chan->ch_id, error);
+		return (error);
+	}
+
+	vmbus_msghc_wait_result(sc, mh);
+	/* Discard result; no useful information */
+	vmbus_msghc_put(sc, mh);
+
+	return (0);
+}
+/*
+ * Drop one reference on 'chan'.  When the last reference goes away,
+ * enqueue ch_detach_task on ch_mgmt_tq.  Under INVARIANTS a primary
+ * channel is asserted to hold exactly one reference at this point.
+ */
+static void
+vmbus_chan_detach(struct vmbus_channel *chan)
+{
+	int refs;
+
+	KASSERT(chan->ch_refs > 0, ("chan%u: invalid refcnt %d",
+	    chan->ch_id, chan->ch_refs));
+	refs = atomic_fetchadd_int(&chan->ch_refs, -1);
+#ifdef INVARIANTS
+	if (VMBUS_CHAN_ISPRIMARY(chan)) {
+		KASSERT(refs == 1, ("chan%u: invalid refcnt %d for prichan",
+		    chan->ch_id, refs + 1));
+	}
+#endif
+	if (refs == 1) {
+		/*
+		 * Detach the target channel.
+		 */
+		if (bootverbose) {
+			vmbus_chan_printf(chan, "chan%u detached\n",
+			    chan->ch_id);
+		}
+		taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_detach_task);
+	}
+}
+/* Task body: clear this channel's slot in vmbus_chmap[]. */
+static void
+vmbus_chan_clrchmap_task(void *xchan, int pending __unused)
+{
+	struct vmbus_channel *chan = xchan;
+
+	chan->ch_vmbus->vmbus_chmap[chan->ch_id] = NULL;
+}
+/*
+ * Clear the channel's vmbus_chmap[] slot by handing
+ * vmbus_chan_clrchmap_task to vmbus_chan_run_task().
+ */
+static void
+vmbus_chan_clear_chmap(struct vmbus_channel *chan)
+{
+	struct task chmap_task;
+
+	TASK_INIT(&chmap_task, 0, vmbus_chan_clrchmap_task, chan);
+	vmbus_chan_run_task(chan, &chmap_task);
+}
+/*
+ * Publish 'chan' in vmbus_chmap[] under its channel id.  The compiler
+ * barrier is issued before the store.
+ */
+static void
+vmbus_chan_set_chmap(struct vmbus_channel *chan)
+{
+	__compiler_membar();
+	chan->ch_vmbus->vmbus_chmap[chan->ch_id] = chan;
+}
+/* Task body: forward to vmbus_chan_poll_cancel_intq(). */
+static void
+vmbus_chan_poll_cancel_task(void *xchan, int pending __unused)
+{
+
+	vmbus_chan_poll_cancel_intq(xchan);
+}
+/*
+ * Cancel polling by handing vmbus_chan_poll_cancel_task to
+ * vmbus_chan_run_task().
+ */
+static void
+vmbus_chan_poll_cancel(struct vmbus_channel *chan)
+{
+	struct task poll_cancel;
+
+	TASK_INIT(&poll_cancel, 0, vmbus_chan_poll_cancel_task, chan);
+	vmbus_chan_run_task(chan, &poll_cancel);
+}
+/*
+ * Close 'chan' if it is opened.  The OPENED bit is cleared first;
+ * then the sysctl nodes, polling, the chmap slot and the channel task
+ * are torn down, CHCLOSE is sent, the bufring GPADL is disconnected
+ * and the bufring memory is freed.
+ *
+ * Returns 0 if the channel was not opened or the close succeeded;
+ * EISCONN if the bufring GPADL could not be disconnected (the bufring
+ * memory is then abandoned, not freed); otherwise ENXIO or the
+ * vmbus_msghc_exec_noresult() error.
+ */
+static int
+vmbus_chan_close_internal(struct vmbus_channel *chan)
+{
+	struct vmbus_softc *sc = chan->ch_vmbus;
+	struct vmbus_msghc *mh;
+	struct vmbus_chanmsg_chclose *req;
+	uint32_t old_stflags;
+	int error;
+
+	/*
+	 * NOTE:
+	 * Sub-channels are closed upon their primary channel closing,
+	 * so they can be closed even before they are opened.
+	 *
+	 * The loop below clears the OPENED bit with a compare-and-set
+	 * retry and leaves the previous flags in old_stflags.
+	 */
+	for (;;) {
+		old_stflags = chan->ch_stflags;
+		if (atomic_cmpset_int(&chan->ch_stflags, old_stflags,
+		    old_stflags & ~VMBUS_CHAN_ST_OPENED))
+			break;
+	}
+	if ((old_stflags & VMBUS_CHAN_ST_OPENED) == 0) {
+		/* Not opened yet; done */
+		if (bootverbose) {
+			vmbus_chan_printf(chan, "chan%u not opened\n",
+			    chan->ch_id);
+		}
+		return (0);
+	}
+
+	/*
+	 * Free this channel's sysctl tree attached to its device's
+	 * sysctl tree.
+	 */
+	sysctl_ctx_free(&chan->ch_sysctl_ctx);
+
+	/*
+	 * Cancel polling, if it is enabled.
+	 */
+	vmbus_chan_poll_cancel(chan);
+
+	/*
+	 * NOTE:
+	 * Order is critical.  This channel _must_ be uninstalled first,
+	 * else the channel task may be enqueued by the IDT after it has
+	 * been drained.
+	 */
+	vmbus_chan_clear_chmap(chan);
+	taskqueue_drain(chan->ch_tq, &chan->ch_task);
+	chan->ch_tq = NULL;
+
+	/*
+	 * Close this channel.
+	 */
+	mh = vmbus_msghc_get(sc, sizeof(*req));
+	if (mh == NULL) {
+		vmbus_chan_printf(chan,
+		    "can not get msg hypercall for chclose(chan%u)\n",
+		    chan->ch_id);
+		error = ENXIO;
+		goto disconnect;
+	}
+
+	req = vmbus_msghc_dataptr(mh);
+	req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHCLOSE;
+	req->chm_chanid = chan->ch_id;
+
+	error = vmbus_msghc_exec_noresult(mh);
+	vmbus_msghc_put(sc, mh);
+
+	if (error) {
+		vmbus_chan_printf(chan,
+		    "chclose(chan%u) msg hypercall exec failed: %d\n",
+		    chan->ch_id, error);
+		goto disconnect;
+	}
+
+	if (bootverbose)
+		vmbus_chan_printf(chan, "chan%u closed\n", chan->ch_id);
+
+disconnect:
+	/*
+	 * Disconnect the TX+RX bufrings from this channel.
+	 */
+	if (chan->ch_bufring_gpadl != 0) {
+		int error1;
+
+		error1 = vmbus_chan_gpadl_disconnect(chan,
+		    chan->ch_bufring_gpadl);
+		if (error1) {
+			/*
+			 * XXX
+			 * The bufring GPADL is still connected; abandon
+			 * this bufring, instead of having mysterious
+			 * crash or trashed data later on.
+			 */
+			vmbus_chan_printf(chan, "chan%u bufring GPADL "
+			    "is still connected after close\n", chan->ch_id);
+			chan->ch_bufring = NULL;
+			/*
+			 * Give caller a hint that the bufring GPADL is
+			 * still connected.
+			 */
+			error = EISCONN;
+		}
+		chan->ch_bufring_gpadl = 0;
+	}
+
+	/*
+	 * Destroy the TX+RX bufrings.
+	 */
+	if (chan->ch_bufring != NULL) {
+		hyperv_dmamem_free(&chan->ch_bufring_dma, chan->ch_bufring);
+		chan->ch_bufring = NULL;
+	}
+	return (error);
+}
+/*
+ * Close exactly this channel, primary or sub.  For a sub-channel the
+ * reference taken when it was linked to its primary is dropped as
+ * well.  Returns the vmbus_chan_close_internal() result.
+ */
+int
+vmbus_chan_close_direct(struct vmbus_channel *chan)
+{
+	int error;
+
+#ifdef INVARIANTS
+	if (VMBUS_CHAN_ISPRIMARY(chan)) {
+		struct vmbus_channel *subchan;
+
+		/*
+		 * All sub-channels _must_ have been closed, or are _not_
+		 * opened at all.
+		 */
+		mtx_lock(&chan->ch_subchan_lock);
+		TAILQ_FOREACH(subchan, &chan->ch_subchans, ch_sublink) {
+			KASSERT(
+			   (subchan->ch_stflags & VMBUS_CHAN_ST_OPENED) == 0,
+			   ("chan%u: subchan%u is still opened",
+			    chan->ch_id, subchan->ch_subidx));
+		}
+		mtx_unlock(&chan->ch_subchan_lock);
+	}
+#endif
+
+	error = vmbus_chan_close_internal(chan);
+	if (!VMBUS_CHAN_ISPRIMARY(chan)) {
+		/*
+		 * This sub-channel is referenced, when it is linked to
+		 * the primary channel; drop that reference now.
+		 */
+		vmbus_chan_detach(chan);
+	}
+	return (error);
+}
+
+/*
+ * Caller should make sure that all sub-channels have
+ * been added to 'chan' and all to-be-closed channels
+ * are not being opened.
+ *
+ * Does nothing when called on a sub-channel.  For a primary channel,
+ * every sub-channel is closed and detached first, then the primary
+ * itself is closed.  Errors from the individual closes are discarded.
+ */
+void
+vmbus_chan_close(struct vmbus_channel *chan)
+{
+	int subchan_cnt;
+
+	if (!VMBUS_CHAN_ISPRIMARY(chan)) {
+		/*
+		 * Sub-channel is closed when its primary channel
+		 * is closed; done.
+		 */
+		return;
+	}
+
+	/*
+	 * Close all sub-channels, if any.
+	 */
+	subchan_cnt = chan->ch_subchan_cnt;
+	if (subchan_cnt > 0) {
+		struct vmbus_channel **subchan;
+		int i;
+
+		subchan = vmbus_subchan_get(chan, subchan_cnt);
+		for (i = 0; i < subchan_cnt; ++i) {
+			vmbus_chan_close_internal(subchan[i]);
+			/*
+			 * This sub-channel is referenced, when it is
+			 * linked to the primary channel; drop that
+			 * reference now.
+			 */
+			vmbus_chan_detach(subchan[i]);
+		}
+		vmbus_subchan_rel(subchan, subchan_cnt);
+	}
+
+	/* Then close the primary channel. */
+	vmbus_chan_close_internal(chan);
+}
+/* Drain the channel's task from its taskqueue. */
+void
+vmbus_chan_intr_drain(struct vmbus_channel *chan)
+{
+
+	taskqueue_drain(chan->ch_tq, &chan->ch_task);
+}
+/* Return vmbus_txbr_available() for the channel's TX bufring. */
+uint32_t
+vmbus_chan_write_available(struct vmbus_channel *chan)
+{
+	return (vmbus_txbr_available(&chan->ch_txbr));
+}
+/*
+ * Signal the host about the TX bufring.  No signal is sent when
+ * min_signal_size is non-negative and more than min_signal_size bytes
+ * are writable, or when the TX bufring's imask is set.  Returns true
+ * only if a signal was sent.
+ */
+bool
+vmbus_chan_write_signal(struct vmbus_channel *chan,
+    int32_t min_signal_size)
+{
+	if (min_signal_size >= 0 &&
+	    vmbus_chan_write_available(chan) > min_signal_size) {
+		return false;
+	}
+
+	if (!vmbus_txbr_get_imask(&chan->ch_txbr)) {
+		/* txbr imask is not set, signal the reader */
+		vmbus_chan_signal_tx(chan);
+		return true;
+	}
+
+	return false;
+}
+/*
+ * Forward 'size' to vmbus_txbr_set_pending_snd_sz() for the channel's
+ * TX bufring; a NULL 'chan' is ignored.
+ */
+void
+vmbus_chan_set_pending_send_size(struct vmbus_channel *chan,
+    uint32_t size)
+{
+	if (chan)
+		vmbus_txbr_set_pending_snd_sz(&chan->ch_txbr, size);
+}
+/*
+ * Write iov[0..iovlen-1] to the TX bufring via
+ * vmbus_txbr_write_call() with the cb/cbarg copy callback, and signal
+ * the host if the write succeeded and asked for it.  Returns 0
+ * immediately when iovlen is 0; otherwise the write's result.
+ */
+int
+vmbus_chan_iov_send(struct vmbus_channel *chan,
+    const struct iovec iov[], int iovlen,
+    vmbus_br_copy_callback_t cb, void *cbarg)
+{
+	int error;
+	boolean_t send_evt;
+
+	if (iovlen == 0)
+		return (0);
+
+	error = vmbus_txbr_write_call(&chan->ch_txbr, iov, iovlen,
+	    cb, cbarg, &send_evt);
+
+	if (!error && send_evt) {
+		vmbus_chan_signal_tx(chan);
+	}
+
+	return error;
+}
+/*
+ * Send one packet made of a struct vmbus_chanpkt header carrying
+ * type/flags/xactid, 'dlen' bytes from 'data', and zero padding up to
+ * VMBUS_CHANPKT_TOTLEN() of header plus data.  The host is signalled
+ * if the write succeeded and asked for it.  Returns the
+ * vmbus_txbr_write() result.
+ */
+int
+vmbus_chan_send(struct vmbus_channel *chan, uint16_t type, uint16_t flags,
+    void *data, int dlen, uint64_t xactid)
+{
+	struct vmbus_chanpkt pkt;
+	int pktlen, pad_pktlen, hlen, error;
+	uint64_t pad = 0;
+	struct iovec iov[3];
+	boolean_t send_evt;
+
+	hlen = sizeof(pkt);
+	pktlen = hlen + dlen;
+	pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen);
+	KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr),
+	    ("invalid packet size %d", pad_pktlen));
+
+	pkt.cp_hdr.cph_type = type;
+	pkt.cp_hdr.cph_flags = flags;
+	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen);
+	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen);
+	pkt.cp_hdr.cph_xactid = xactid;
+	/* iov layout: header, payload, zero padding. */
+	iov[0].iov_base = &pkt;
+	iov[0].iov_len = hlen;
+	iov[1].iov_base = data;
+	iov[1].iov_len = dlen;
+	iov[2].iov_base = &pad;
+	iov[2].iov_len = pad_pktlen - pktlen;
+
+	error = vmbus_txbr_write(&chan->ch_txbr, iov, 3, &send_evt);
+	if (!error && send_evt)
+		vmbus_chan_signal_tx(chan);
+	return error;
+}
+/*
+ * Send one VMBUS_CHANPKT_TYPE_GPA packet (flags VMBUS_CHANPKT_FLAG_RC)
+ * made of the fixed header, the 'sglen' entries of sg[], 'dlen' bytes
+ * from 'data', and zero padding.  The header length written into the
+ * packet covers the sg[] entries.  Returns the vmbus_txbr_write()
+ * result.
+ */
+int
+vmbus_chan_send_sglist(struct vmbus_channel *chan,
+    struct vmbus_gpa sg[], int sglen, void *data, int dlen, uint64_t xactid)
+{
+	struct vmbus_chanpkt_sglist pkt;
+	int pktlen, pad_pktlen, hlen, error;
+	struct iovec iov[4];
+	boolean_t send_evt;
+	uint64_t pad = 0;
+
+	hlen = __offsetof(struct vmbus_chanpkt_sglist, cp_gpa[sglen]);
+	pktlen = hlen + dlen;
+	pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen);
+	KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr),
+	    ("invalid packet size %d", pad_pktlen));
+
+	pkt.cp_hdr.cph_type = VMBUS_CHANPKT_TYPE_GPA;
+	pkt.cp_hdr.cph_flags = VMBUS_CHANPKT_FLAG_RC;
+	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen);
+	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen);
+	pkt.cp_hdr.cph_xactid = xactid;
+	pkt.cp_rsvd = 0;
+	pkt.cp_gpa_cnt = sglen;
+	/* iov layout: fixed header, sg[] entries, payload, zero padding. */
+	iov[0].iov_base = &pkt;
+	iov[0].iov_len = sizeof(pkt);
+	iov[1].iov_base = sg;
+	iov[1].iov_len = sizeof(struct vmbus_gpa) * sglen;
+	iov[2].iov_base = data;
+	iov[2].iov_len = dlen;
+	iov[3].iov_base = &pad;
+	iov[3].iov_len = pad_pktlen - pktlen;
+
+	error = vmbus_txbr_write(&chan->ch_txbr, iov, 4, &send_evt);
+	if (!error && send_evt)
+		vmbus_chan_signal_tx(chan);
+	return error;
+}
+/*
+ * Send one VMBUS_CHANPKT_TYPE_GPA packet (flags VMBUS_CHANPKT_FLAG_RC)
+ * describing a single GPA range: the fixed header, the range 'prp'
+ * with its 'prp_cnt' page entries, 'dlen' bytes from 'data', and zero
+ * padding.  Returns the vmbus_txbr_write() result.
+ */
+int
+vmbus_chan_send_prplist(struct vmbus_channel *chan,
+    struct vmbus_gpa_range *prp, int prp_cnt, void *data, int dlen,
+    uint64_t xactid)
+{
+	struct vmbus_chanpkt_prplist pkt;
+	int pktlen, pad_pktlen, hlen, error;
+	struct iovec iov[4];
+	boolean_t send_evt;
+	uint64_t pad = 0;
+
+	hlen = __offsetof(struct vmbus_chanpkt_prplist,
+	    cp_range[0].gpa_page[prp_cnt]);
+	pktlen = hlen + dlen;
+	pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen);
+	KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr),
+	    ("invalid packet size %d", pad_pktlen));
+
+	pkt.cp_hdr.cph_type = VMBUS_CHANPKT_TYPE_GPA;
+	pkt.cp_hdr.cph_flags = VMBUS_CHANPKT_FLAG_RC;
+	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen);
+	VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen);
+	pkt.cp_hdr.cph_xactid = xactid;
+	pkt.cp_rsvd = 0;
+	pkt.cp_range_cnt = 1;
+	/* iov layout: fixed header, GPA range, payload, zero padding. */
+	iov[0].iov_base = &pkt;
+	iov[0].iov_len = sizeof(pkt);
+	iov[1].iov_base = prp;
+	iov[1].iov_len = __offsetof(struct vmbus_gpa_range, gpa_page[prp_cnt]);
+	iov[2].iov_base = data;
+	iov[2].iov_len = dlen;
+	iov[3].iov_base = &pad;
+	iov[3].iov_len = pad_pktlen - pktlen;
+
+	error = vmbus_txbr_write(&chan->ch_txbr, iov, 4, &send_evt);
+	if (!error && send_evt)
+		vmbus_chan_signal_tx(chan);
+	return error;
+}
+/*
+ * Receive the data part of the next packet into 'data'.  On entry
+ * *dlen0 is the capacity of 'data'; on return it holds the packet's
+ * data length.  *xactid receives the packet's transaction id.
+ *
+ * Returns 0 on success; the vmbus_rxbr_peek() error if the header
+ * cannot be peeked; EIO if the header lengths are inconsistent;
+ * ENOBUFS, with *dlen0 set to the required size and nothing read, if
+ * 'data' is too small.
+ */
+int
+vmbus_chan_recv(struct vmbus_channel *chan, void *data, int *dlen0,
+    uint64_t *xactid)
+{
+	struct vmbus_chanpkt_hdr pkt;
+	int error, dlen, hlen;
+
+	error = vmbus_rxbr_peek(&chan->ch_rxbr, &pkt, sizeof(pkt));
+	if (error)
+		return (error);
+
+	if (__predict_false(pkt.cph_hlen < VMBUS_CHANPKT_HLEN_MIN)) {
+		vmbus_chan_printf(chan, "invalid hlen %u\n", pkt.cph_hlen);
+		/* XXX this channel is dead actually. */
+		return (EIO);
+	}
+	if (__predict_false(pkt.cph_hlen > pkt.cph_tlen)) {
+		vmbus_chan_printf(chan, "invalid hlen %u and tlen %u\n",
+		    pkt.cph_hlen, pkt.cph_tlen);
+		/* XXX this channel is dead actually. */
+		return (EIO);
+	}
+
+	hlen = VMBUS_CHANPKT_GETLEN(pkt.cph_hlen);
+	dlen = VMBUS_CHANPKT_GETLEN(pkt.cph_tlen) - hlen;
+
+	if (*dlen0 < dlen) {
+		/* Return the size of this packet's data. */
+		*dlen0 = dlen;
+		return (ENOBUFS);
+	}
+
+	*xactid = pkt.cph_xactid;
+	*dlen0 = dlen;
+
+	/* Skip packet header */
+	error = vmbus_rxbr_read(&chan->ch_rxbr, data, dlen, hlen);
+	KASSERT(!error, ("vmbus_rxbr_read failed"));
+
+	return (0);
+}
+/*
+ * Receive the next packet, header included, into 'pkt'.  On entry
+ * *pktlen0 is the capacity of 'pkt'; on return it holds the packet's
+ * total length.
+ *
+ * Returns 0 on success; the vmbus_rxbr_peek() error if the header
+ * cannot be peeked; EIO if the header lengths are inconsistent;
+ * ENOBUFS, with *pktlen0 set to the required size, if 'pkt' is too
+ * small.  In the ENOBUFS case the fixed-size header has already been
+ * copied into 'pkt' by the peek, but nothing has been consumed.
+ */
+int
+vmbus_chan_recv_pkt(struct vmbus_channel *chan,
+    struct vmbus_chanpkt_hdr *pkt, int *pktlen0)
+{
+	int error, pktlen, pkt_hlen;
+
+	pkt_hlen = sizeof(*pkt);
+	error = vmbus_rxbr_peek(&chan->ch_rxbr, pkt, pkt_hlen);
+	if (error)
+		return (error);
+
+	if (__predict_false(pkt->cph_hlen < VMBUS_CHANPKT_HLEN_MIN)) {
+		vmbus_chan_printf(chan, "invalid hlen %u\n", pkt->cph_hlen);
+		/* XXX this channel is dead actually. */
+		return (EIO);
+	}
+	if (__predict_false(pkt->cph_hlen > pkt->cph_tlen)) {
+		vmbus_chan_printf(chan, "invalid hlen %u and tlen %u\n",
+		    pkt->cph_hlen, pkt->cph_tlen);
+		/* XXX this channel is dead actually. */
+		return (EIO);
+	}
+
+	pktlen = VMBUS_CHANPKT_GETLEN(pkt->cph_tlen);
+	if (*pktlen0 < pktlen) {
+		/* Return the size of this packet. */
+		*pktlen0 = pktlen;
+		return (ENOBUFS);
+	}
+	*pktlen0 = pktlen;
+
+	/*
+	 * Skip the fixed-size packet header, which has been filled
+	 * by the above vmbus_rxbr_peek().
+	 */
+	error = vmbus_rxbr_read(&chan->ch_rxbr, pkt + 1,
+	    pktlen - pkt_hlen, pkt_hlen);
+	KASSERT(!error, ("vmbus_rxbr_read failed"));
+
+	return (0);
+}
+/* Return vmbus_rxbr_available() for the channel's RX bufring. */
+uint32_t
+vmbus_chan_read_available(struct vmbus_channel *chan)
+{
+	return (vmbus_rxbr_available(&chan->ch_rxbr));
+}
+
+/*
+ * This routine does:
+ *     - Advance the channel read index for 'advance' bytes
+ *     - Copy data_len bytes in to the buffer pointed by 'data'
+ * Return 0 if operation succeed. EAGAIN if operations if failed.
+ * If failed, the buffer pointed by 'data' is intact, and the
+ * channel read index is not advanced at all.
+ */ +int +vmbus_chan_recv_peek(struct vmbus_channel *chan, + void *data, int data_len, uint32_t advance) +{ + int error; + boolean_t sig_event; + + if (data == NULL || data_len <= 0) + return (EINVAL); + + error = vmbus_rxbr_idxadv_peek(&chan->ch_rxbr, + data, data_len, advance, &sig_event); + + if (!error && sig_event) { + vmbus_chan_signal_rx(chan); + } + + return (error); +} + +/* + * This routine does: + * - Advance the channel read index for 'advance' bytes + */ +int +vmbus_chan_recv_idxadv(struct vmbus_channel *chan, uint32_t advance) +{ + int error; + boolean_t sig_event; + + if (advance == 0) + return (EINVAL); + + error = vmbus_rxbr_idxadv(&chan->ch_rxbr, advance, &sig_event); + + if (!error && sig_event) { + vmbus_chan_signal_rx(chan); + } + + return (error); +} + + +/* + * Caller should hold its own lock to serialize the ring buffer + * copy. + */ +int +vmbus_chan_recv_peek_call(struct vmbus_channel *chan, int data_len, + uint32_t skip, vmbus_br_copy_callback_t cb, void *cbarg) +{ + if (!chan || data_len <= 0 || cb == NULL) + return (EINVAL); + + return (vmbus_rxbr_peek_call(&chan->ch_rxbr, data_len, skip, + cb, cbarg)); +} + +static void +vmbus_chan_task(void *xchan, int pending __unused) +{ + struct vmbus_channel *chan = xchan; + vmbus_chan_callback_t cb = chan->ch_cb; + void *cbarg = chan->ch_cbarg; + + KASSERT(chan->ch_poll_intvl == 0, + ("chan%u: interrupted in polling mode", chan->ch_id)); + + /* + * Optimize host to guest signaling by ensuring: + * 1. While reading the channel, we disable interrupts from + * host. + * 2. Ensure that we process all posted messages from the host + * before returning from this callback. + * 3. Once we return, enable signaling from the host. Once this + * state is set we check to see if additional packets are + * available to read. In this case we repeat the process. + * + * NOTE: Interrupt has been disabled in the ISR. 
+ */ + for (;;) { + uint32_t left; + + cb(chan, cbarg); + + left = vmbus_rxbr_intr_unmask(&chan->ch_rxbr); + if (left == 0) { + /* No more data in RX bufring; done */ + break; + } + vmbus_rxbr_intr_mask(&chan->ch_rxbr); + } +} + +static void +vmbus_chan_task_nobatch(void *xchan, int pending __unused) +{ + struct vmbus_channel *chan = xchan; + + KASSERT(chan->ch_poll_intvl == 0, + ("chan%u: interrupted in polling mode", chan->ch_id)); + chan->ch_cb(chan, chan->ch_cbarg); +} + +static void +vmbus_chan_poll_timeout(void *xchan) +{ + struct vmbus_channel *chan = xchan; + + KASSERT(chan->ch_poll_intvl != 0, + ("chan%u: polling timeout in interrupt mode", chan->ch_id)); + taskqueue_enqueue(chan->ch_tq, &chan->ch_poll_task); +} + +static void +vmbus_chan_poll_task(void *xchan, int pending __unused) +{ + struct vmbus_channel *chan = xchan; + + KASSERT(chan->ch_poll_intvl != 0, + ("chan%u: polling in interrupt mode", chan->ch_id)); + callout_reset_sbt_curcpu(&chan->ch_poll_timeo, chan->ch_poll_intvl, 0, + vmbus_chan_poll_timeout, chan, chan->ch_poll_flags); + chan->ch_cb(chan, chan->ch_cbarg); +} + +static void +vmbus_chan_pollcfg_task(void *xarg, int pending __unused) +{ + const struct vmbus_chan_pollarg *arg = xarg; + struct vmbus_channel *chan = arg->poll_chan; + sbintime_t intvl; + int poll_flags; + + /* + * Save polling interval. + */ + intvl = SBT_1S / arg->poll_hz; + if (intvl == 0) + intvl = 1; + if (intvl == chan->ch_poll_intvl) { + /* Nothing changes; done */ + return; + } + chan->ch_poll_intvl = intvl; + + /* Adjust callout flags. */ + poll_flags = C_DIRECT_EXEC; + if (arg->poll_hz <= hz) + poll_flags |= C_HARDCLOCK; + chan->ch_poll_flags = poll_flags; + + /* + * Disconnect this channel from the channel map to make sure that + * the RX bufring interrupt enabling bit can not be touched, and + * ISR can not enqueue this channel task anymore. THEN, disable + * interrupt from the RX bufring (TX bufring does not generate + * interrupt to VM). 
+ * + * NOTE: order is critical. + */ + chan->ch_vmbus->vmbus_chmap[chan->ch_id] = NULL; + __compiler_membar(); + vmbus_rxbr_intr_mask(&chan->ch_rxbr); + + /* + * NOTE: + * At this point, this channel task will not be enqueued by + * the ISR anymore, time to cancel the pending one. + */ + taskqueue_cancel(chan->ch_tq, &chan->ch_task, NULL); + + /* Kick start! */ + taskqueue_enqueue(chan->ch_tq, &chan->ch_poll_task); +} + +static bool +vmbus_chan_poll_cancel_intq(struct vmbus_channel *chan) +{ + + if (chan->ch_poll_intvl == 0) { + /* Not enabled. */ + return (false); + } + + /* + * Stop polling callout, so that channel polling task + * will not be enqueued anymore. + */ + callout_drain(&chan->ch_poll_timeo); + + /* + * Disable polling by resetting polling interval. + * + * NOTE: + * The polling interval resetting MUST be conducted + * after the callout is drained; mainly to keep the + * proper assertion in place. + */ + chan->ch_poll_intvl = 0; + + /* + * NOTE: + * At this point, this channel polling task will not be + * enqueued by the callout anymore, time to cancel the + * pending one. + */ + taskqueue_cancel(chan->ch_tq, &chan->ch_poll_task, NULL); + + /* Polling was enabled. */ + return (true); +} + +static void +vmbus_chan_polldis_task(void *xchan, int pending __unused) +{ + struct vmbus_channel *chan = xchan; + + if (!vmbus_chan_poll_cancel_intq(chan)) { + /* Already disabled; done. */ + return; + } + + /* + * Plug this channel back to the channel map and unmask + * the RX bufring interrupt. + */ + chan->ch_vmbus->vmbus_chmap[chan->ch_id] = chan; + __compiler_membar(); + vmbus_rxbr_intr_unmask(&chan->ch_rxbr); + + /* + * Kick start the interrupt task, just in case unmasking + * interrupt races ISR. 
+ */ + taskqueue_enqueue(chan->ch_tq, &chan->ch_task); +} + +static __inline void +vmbus_event_flags_proc(struct vmbus_softc *sc, volatile u_long *event_flags, + int flag_cnt) +{ + int f; + + for (f = 0; f < flag_cnt; ++f) { + uint32_t chid_base; + u_long flags; + int chid_ofs; + + if (event_flags[f] == 0) + continue; + + flags = atomic_swap_long(&event_flags[f], 0); + chid_base = f << VMBUS_EVTFLAG_SHIFT; + + while ((chid_ofs = ffsl(flags)) != 0) { + struct vmbus_channel *chan; + + --chid_ofs; /* NOTE: ffsl is 1-based */ + flags &= ~(1UL << chid_ofs); + + chan = sc->vmbus_chmap[chid_base + chid_ofs]; + if (__predict_false(chan == NULL)) { + /* Channel is closed. */ + continue; + } + __compiler_membar(); + + if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD) + vmbus_rxbr_intr_mask(&chan->ch_rxbr); + taskqueue_enqueue(chan->ch_tq, &chan->ch_task); + } + } +} + +void +vmbus_event_proc(struct vmbus_softc *sc, int cpu) +{ + struct vmbus_evtflags *eventf; + + /* + * On Host with Win8 or above, the event page can be checked directly + * to get the id of the channel that has the pending interrupt. 
+ */ + eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE; + vmbus_event_flags_proc(sc, eventf->evt_flags, + VMBUS_PCPU_GET(sc, event_flags_cnt, cpu)); +} + +void +vmbus_event_proc_compat(struct vmbus_softc *sc, int cpu) +{ + struct vmbus_evtflags *eventf; + + eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE; + if (atomic_testandclear_long(&eventf->evt_flags[0], 0)) { + vmbus_event_flags_proc(sc, sc->vmbus_rx_evtflags, + VMBUS_CHAN_MAX_COMPAT >> VMBUS_EVTFLAG_SHIFT); + } +} + +static void +vmbus_chan_update_evtflagcnt(struct vmbus_softc *sc, + const struct vmbus_channel *chan) +{ + volatile int *flag_cnt_ptr; + int flag_cnt; + + flag_cnt = (chan->ch_id / VMBUS_EVTFLAG_LEN) + 1; + flag_cnt_ptr = VMBUS_PCPU_PTR(sc, event_flags_cnt, chan->ch_cpuid); + + for (;;) { + int old_flag_cnt; + + old_flag_cnt = *flag_cnt_ptr; + if (old_flag_cnt >= flag_cnt) + break; + if (atomic_cmpset_int(flag_cnt_ptr, old_flag_cnt, flag_cnt)) { + if (bootverbose) { + vmbus_chan_printf(chan, + "chan%u update cpu%d flag_cnt to %d\n", + chan->ch_id, chan->ch_cpuid, flag_cnt); + } + break; + } + } +} + +static struct vmbus_channel * +vmbus_chan_alloc(struct vmbus_softc *sc) +{ + struct vmbus_channel *chan; + + chan = malloc(sizeof(*chan), M_DEVBUF, M_WAITOK | M_ZERO); + + chan->ch_monprm = hyperv_dmamem_alloc(bus_get_dma_tag(sc->vmbus_dev), + HYPERCALL_PARAM_ALIGN, 0, sizeof(struct hyperv_mon_param), + &chan->ch_monprm_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); + if (chan->ch_monprm == NULL) { + device_printf(sc->vmbus_dev, "monprm alloc failed\n"); + free(chan, M_DEVBUF); + return NULL; + } + + chan->ch_refs = 1; + chan->ch_vmbus = sc; + mtx_init(&chan->ch_subchan_lock, "vmbus subchan", NULL, MTX_DEF); + sx_init(&chan->ch_orphan_lock, "vmbus chorphan"); + TAILQ_INIT(&chan->ch_subchans); + vmbus_rxbr_init(&chan->ch_rxbr); + vmbus_txbr_init(&chan->ch_txbr); + + TASK_INIT(&chan->ch_poll_task, 0, vmbus_chan_poll_task, chan); + callout_init(&chan->ch_poll_timeo, 1); + + 
return chan; +} + +static void +vmbus_chan_free(struct vmbus_channel *chan) +{ + + KASSERT(TAILQ_EMPTY(&chan->ch_subchans) && chan->ch_subchan_cnt == 0, + ("still owns sub-channels")); + KASSERT((chan->ch_stflags & + (VMBUS_CHAN_ST_OPENED | + VMBUS_CHAN_ST_ONPRIL | + VMBUS_CHAN_ST_ONSUBL | + VMBUS_CHAN_ST_ONLIST)) == 0, ("free busy channel")); + KASSERT(chan->ch_orphan_xact == NULL, + ("still has orphan xact installed")); + KASSERT(chan->ch_refs == 0, ("chan%u: invalid refcnt %d", + chan->ch_id, chan->ch_refs)); + KASSERT(chan->ch_poll_intvl == 0, ("chan%u: polling is activated", + chan->ch_id)); + + hyperv_dmamem_free(&chan->ch_monprm_dma, chan->ch_monprm); + mtx_destroy(&chan->ch_subchan_lock); + sx_destroy(&chan->ch_orphan_lock); + vmbus_rxbr_deinit(&chan->ch_rxbr); + vmbus_txbr_deinit(&chan->ch_txbr); + free(chan, M_DEVBUF); +} + +static int +vmbus_chan_add(struct vmbus_channel *newchan) +{ + struct vmbus_softc *sc = newchan->ch_vmbus; + struct vmbus_channel *prichan; + + if (newchan->ch_id == 0) { + /* + * XXX + * Chan0 will neither be processed nor should be offered; + * skip it. + */ + device_printf(sc->vmbus_dev, "got chan0 offer, discard\n"); + return EINVAL; + } else if (newchan->ch_id >= VMBUS_CHAN_MAX) { + device_printf(sc->vmbus_dev, "invalid chan%u offer\n", + newchan->ch_id); + return EINVAL; + } + + mtx_lock(&sc->vmbus_prichan_lock); + TAILQ_FOREACH(prichan, &sc->vmbus_prichans, ch_prilink) { + /* + * Sub-channel will have the same type GUID and instance + * GUID as its primary channel. 
+ */ + if (memcmp(&prichan->ch_guid_type, &newchan->ch_guid_type, + sizeof(struct hyperv_guid)) == 0 && + memcmp(&prichan->ch_guid_inst, &newchan->ch_guid_inst, + sizeof(struct hyperv_guid)) == 0) + break; + } + if (VMBUS_CHAN_ISPRIMARY(newchan)) { + if (prichan == NULL) { + /* Install the new primary channel */ + vmbus_chan_ins_prilist(sc, newchan); + mtx_unlock(&sc->vmbus_prichan_lock); + goto done; + } else { + mtx_unlock(&sc->vmbus_prichan_lock); + device_printf(sc->vmbus_dev, + "duplicated primary chan%u\n", newchan->ch_id); + return EINVAL; + } + } else { /* Sub-channel */ + if (prichan == NULL) { + mtx_unlock(&sc->vmbus_prichan_lock); + device_printf(sc->vmbus_dev, + "no primary chan for chan%u\n", newchan->ch_id); + return EINVAL; + } + /* + * Found the primary channel for this sub-channel and + * move on. + * + * XXX refcnt prichan + */ + } + mtx_unlock(&sc->vmbus_prichan_lock); + + /* + * This is a sub-channel; link it with the primary channel. + */ + KASSERT(!VMBUS_CHAN_ISPRIMARY(newchan), + ("new channel is not sub-channel")); + KASSERT(prichan != NULL, ("no primary channel")); + + /* + * Reference count this sub-channel; it will be dereferenced + * when this sub-channel is closed. + */ + KASSERT(newchan->ch_refs == 1, ("chan%u: invalid refcnt %d", + newchan->ch_id, newchan->ch_refs)); + atomic_add_int(&newchan->ch_refs, 1); + + newchan->ch_prichan = prichan; + newchan->ch_dev = prichan->ch_dev; + + mtx_lock(&prichan->ch_subchan_lock); + vmbus_chan_ins_sublist(prichan, newchan); + mtx_unlock(&prichan->ch_subchan_lock); + /* + * Notify anyone that is interested in this sub-channel, + * after this sub-channel is setup. + */ + wakeup(prichan); +done: + /* + * Hook this channel up for later revocation. 
+ */ + mtx_lock(&sc->vmbus_chan_lock); + vmbus_chan_ins_list(sc, newchan); + mtx_unlock(&sc->vmbus_chan_lock); + + if (bootverbose) { + vmbus_chan_printf(newchan, "chan%u subidx%u offer\n", + newchan->ch_id, newchan->ch_subidx); + } + + /* Select default cpu for this channel. */ + vmbus_chan_cpu_default(newchan); + + return 0; +} + +void +vmbus_chan_cpu_set(struct vmbus_channel *chan, int cpu) +{ + KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu %d", cpu)); + + if (chan->ch_vmbus->vmbus_version == VMBUS_VERSION_WS2008 || + chan->ch_vmbus->vmbus_version == VMBUS_VERSION_WIN7) { + /* Only cpu0 is supported */ + cpu = 0; + } + + chan->ch_cpuid = cpu; + chan->ch_vcpuid = VMBUS_PCPU_GET(chan->ch_vmbus, vcpuid, cpu); + + if (bootverbose) { + vmbus_chan_printf(chan, + "chan%u assigned to cpu%u [vcpu%u]\n", + chan->ch_id, chan->ch_cpuid, chan->ch_vcpuid); + } +} + +void +vmbus_chan_cpu_rr(struct vmbus_channel *chan) +{ + static uint32_t vmbus_chan_nextcpu; + int cpu; + + cpu = atomic_fetchadd_int(&vmbus_chan_nextcpu, 1) % mp_ncpus; + vmbus_chan_cpu_set(chan, cpu); +} + +static void +vmbus_chan_cpu_default(struct vmbus_channel *chan) +{ + /* + * By default, pin the channel to cpu0. Devices having + * special channel-cpu mapping requirement should call + * vmbus_chan_cpu_{set,rr}(). 
+ */ + vmbus_chan_cpu_set(chan, 0); +} + +static void +vmbus_chan_msgproc_choffer(struct vmbus_softc *sc, + const struct vmbus_message *msg) +{ + const struct vmbus_chanmsg_choffer *offer; + struct vmbus_channel *chan; + task_fn_t *detach_fn, *attach_fn; + int error; + + offer = (const struct vmbus_chanmsg_choffer *)msg->msg_data; + + chan = vmbus_chan_alloc(sc); + if (chan == NULL) { + device_printf(sc->vmbus_dev, "allocate chan%u failed\n", + offer->chm_chanid); + return; + } + + chan->ch_id = offer->chm_chanid; + chan->ch_subidx = offer->chm_subidx; + chan->ch_guid_type = offer->chm_chtype; + chan->ch_guid_inst = offer->chm_chinst; + + /* Batch reading is on by default */ + chan->ch_flags |= VMBUS_CHAN_FLAG_BATCHREAD; + + chan->ch_monprm->mp_connid = VMBUS_CONNID_EVENT; + if (sc->vmbus_version != VMBUS_VERSION_WS2008) + chan->ch_monprm->mp_connid = offer->chm_connid; + + if (offer->chm_flags1 & VMBUS_CHOFFER_FLAG1_HASMNF) { + int trig_idx; + + /* + * Setup MNF stuffs. + */ + chan->ch_txflags |= VMBUS_CHAN_TXF_HASMNF; + + trig_idx = offer->chm_montrig / VMBUS_MONTRIG_LEN; + if (trig_idx >= VMBUS_MONTRIGS_MAX) + panic("invalid monitor trigger %u", offer->chm_montrig); + chan->ch_montrig = + &sc->vmbus_mnf2->mnf_trigs[trig_idx].mt_pending; + + chan->ch_montrig_mask = + 1 << (offer->chm_montrig % VMBUS_MONTRIG_LEN); + } + + if (offer->chm_chflags & VMBUS_CHAN_TLNPI_PROVIDER_OFFER) { + /* This is HyperV socket channel */ + chan->ch_is_hvs = true; + /* The first byte != 0 means the host initiated connection. */ + chan->ch_hvs_conn_from_host = + offer->chm_udata.pipe.user_def[0]; + + if (bootverbose) { + device_printf(sc->vmbus_dev, + "chan%u is hyperv socket channel " + "connected %s host\n", + chan->ch_id, + (chan->ch_hvs_conn_from_host != 0) ? + "from" : "to"); + } + } else { + chan->ch_is_hvs = false; + } + + /* + * Setup event flag. 
+ */ + chan->ch_evtflag = + &sc->vmbus_tx_evtflags[chan->ch_id >> VMBUS_EVTFLAG_SHIFT]; + chan->ch_evtflag_mask = 1UL << (chan->ch_id & VMBUS_EVTFLAG_MASK); + + /* + * Setup attach and detach tasks. + */ + if (VMBUS_CHAN_ISPRIMARY(chan)) { + chan->ch_mgmt_tq = sc->vmbus_devtq; + attach_fn = vmbus_prichan_attach_task; + detach_fn = vmbus_prichan_detach_task; + } else { + chan->ch_mgmt_tq = sc->vmbus_subchtq; + attach_fn = vmbus_subchan_attach_task; + detach_fn = vmbus_subchan_detach_task; + } + TASK_INIT(&chan->ch_attach_task, 0, attach_fn, chan); + TASK_INIT(&chan->ch_detach_task, 0, detach_fn, chan); + + error = vmbus_chan_add(chan); + if (error) { + device_printf(sc->vmbus_dev, "add chan%u failed: %d\n", + chan->ch_id, error); + atomic_subtract_int(&chan->ch_refs, 1); + vmbus_chan_free(chan); + return; + } + taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_attach_task); +} + +static void +vmbus_chan_msgproc_chrescind(struct vmbus_softc *sc, + const struct vmbus_message *msg) +{ + const struct vmbus_chanmsg_chrescind *note; + struct vmbus_channel *chan; + + note = (const struct vmbus_chanmsg_chrescind *)msg->msg_data; + if (note->chm_chanid > VMBUS_CHAN_MAX) { + device_printf(sc->vmbus_dev, "invalid revoked chan%u\n", + note->chm_chanid); + return; + } + + /* + * Find and remove the target channel from the channel list. + */ + mtx_lock(&sc->vmbus_chan_lock); + TAILQ_FOREACH(chan, &sc->vmbus_chans, ch_link) { + if (chan->ch_id == note->chm_chanid) + break; + } + if (chan == NULL) { + mtx_unlock(&sc->vmbus_chan_lock); + device_printf(sc->vmbus_dev, "chan%u is not offered\n", + note->chm_chanid); + return; + } + vmbus_chan_rem_list(sc, chan); + mtx_unlock(&sc->vmbus_chan_lock); + + if (VMBUS_CHAN_ISPRIMARY(chan)) { + /* + * The target channel is a primary channel; remove the + * target channel from the primary channel list now, + * instead of later, so that it will not be found by + * other sub-channel offers, which are processed in + * this thread. 
+ */ + mtx_lock(&sc->vmbus_prichan_lock); + vmbus_chan_rem_prilist(sc, chan); + mtx_unlock(&sc->vmbus_prichan_lock); + } + + /* + * NOTE: + * The following processing order is critical: + * Set the REVOKED state flag before orphaning the installed xact. + */ + + if (atomic_testandset_int(&chan->ch_stflags, + VMBUS_CHAN_ST_REVOKED_SHIFT)) + panic("channel has already been revoked"); + + sx_xlock(&chan->ch_orphan_lock); + if (chan->ch_orphan_xact != NULL) + vmbus_xact_ctx_orphan(chan->ch_orphan_xact); + sx_xunlock(&chan->ch_orphan_lock); + + if (bootverbose) + vmbus_chan_printf(chan, "chan%u revoked\n", note->chm_chanid); + vmbus_chan_detach(chan); +} + +static int +vmbus_chan_release(struct vmbus_channel *chan) +{ + struct vmbus_softc *sc = chan->ch_vmbus; + struct vmbus_chanmsg_chfree *req; + struct vmbus_msghc *mh; + int error; + + mh = vmbus_msghc_get(sc, sizeof(*req)); + if (mh == NULL) { + vmbus_chan_printf(chan, + "can not get msg hypercall for chfree(chan%u)\n", + chan->ch_id); + return (ENXIO); + } + + req = vmbus_msghc_dataptr(mh); + req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHFREE; + req->chm_chanid = chan->ch_id; + + error = vmbus_msghc_exec_noresult(mh); + vmbus_msghc_put(sc, mh); + + if (error) { + vmbus_chan_printf(chan, + "chfree(chan%u) msg hypercall exec failed: %d\n", + chan->ch_id, error); + } else { + if (bootverbose) + vmbus_chan_printf(chan, "chan%u freed\n", chan->ch_id); + } + return (error); +} + +static void +vmbus_prichan_detach_task(void *xchan, int pending __unused) +{ + struct vmbus_channel *chan = xchan; + + KASSERT(VMBUS_CHAN_ISPRIMARY(chan), + ("chan%u is not primary channel", chan->ch_id)); + + /* Delete and detach the device associated with this channel. */ + vmbus_delete_child(chan); + + /* Release this channel (back to vmbus). */ + vmbus_chan_release(chan); + + /* Free this channel's resource. 
*/ + vmbus_chan_free(chan); +} + +static void +vmbus_subchan_detach_task(void *xchan, int pending __unused) +{ + struct vmbus_channel *chan = xchan; + struct vmbus_channel *pri_chan = chan->ch_prichan; + + KASSERT(!VMBUS_CHAN_ISPRIMARY(chan), + ("chan%u is primary channel", chan->ch_id)); + + /* Release this channel (back to vmbus). */ + vmbus_chan_release(chan); + + /* Unlink from its primary channel's sub-channel list. */ + mtx_lock(&pri_chan->ch_subchan_lock); + vmbus_chan_rem_sublist(pri_chan, chan); + mtx_unlock(&pri_chan->ch_subchan_lock); + /* Notify anyone that is waiting for this sub-channel to vanish. */ + wakeup(pri_chan); + + /* Free this channel's resource. */ + vmbus_chan_free(chan); +} + +static void +vmbus_prichan_attach_task(void *xchan, int pending __unused) +{ + + /* + * Add device for this primary channel. + */ + vmbus_add_child(xchan); +} + +static void +vmbus_subchan_attach_task(void *xchan __unused, int pending __unused) +{ + + /* Nothing */ +} + +void +vmbus_chan_destroy_all(struct vmbus_softc *sc) +{ + + /* + * Detach all devices and destroy the corresponding primary + * channels. + */ + for (;;) { + struct vmbus_channel *chan; + + mtx_lock(&sc->vmbus_chan_lock); + TAILQ_FOREACH(chan, &sc->vmbus_chans, ch_link) { + if (VMBUS_CHAN_ISPRIMARY(chan)) + break; + } + if (chan == NULL) { + /* No more primary channels; done. 
*/ + mtx_unlock(&sc->vmbus_chan_lock); + break; + } + vmbus_chan_rem_list(sc, chan); + mtx_unlock(&sc->vmbus_chan_lock); + + mtx_lock(&sc->vmbus_prichan_lock); + vmbus_chan_rem_prilist(sc, chan); + mtx_unlock(&sc->vmbus_prichan_lock); + + taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_detach_task); + } +} + +struct vmbus_channel ** +vmbus_subchan_get(struct vmbus_channel *pri_chan, int subchan_cnt) +{ + struct vmbus_channel **ret, *chan; + int i; + + KASSERT(subchan_cnt > 0, ("invalid sub-channel count %d", subchan_cnt)); + + ret = malloc(subchan_cnt * sizeof(struct vmbus_channel *), M_TEMP, + M_WAITOK); + + mtx_lock(&pri_chan->ch_subchan_lock); + + while (pri_chan->ch_subchan_cnt < subchan_cnt) + mtx_sleep(pri_chan, &pri_chan->ch_subchan_lock, 0, "subch", 0); + + i = 0; + TAILQ_FOREACH(chan, &pri_chan->ch_subchans, ch_sublink) { + /* TODO: refcnt chan */ + ret[i] = chan; + + ++i; + if (i == subchan_cnt) + break; + } + KASSERT(i == subchan_cnt, ("invalid subchan count %d, should be %d", + pri_chan->ch_subchan_cnt, subchan_cnt)); + + mtx_unlock(&pri_chan->ch_subchan_lock); + + return ret; +} + +void +vmbus_subchan_rel(struct vmbus_channel **subchan, int subchan_cnt __unused) +{ + + free(subchan, M_TEMP); +} + +void +vmbus_subchan_drain(struct vmbus_channel *pri_chan) +{ + mtx_lock(&pri_chan->ch_subchan_lock); + while (pri_chan->ch_subchan_cnt > 0) + mtx_sleep(pri_chan, &pri_chan->ch_subchan_lock, 0, "dsubch", 0); + mtx_unlock(&pri_chan->ch_subchan_lock); +} + +void +vmbus_chan_msgproc(struct vmbus_softc *sc, const struct vmbus_message *msg) +{ + vmbus_chanmsg_proc_t msg_proc; + uint32_t msg_type; + + msg_type = ((const struct vmbus_chanmsg_hdr *)msg->msg_data)->chm_type; + KASSERT(msg_type < VMBUS_CHANMSG_TYPE_MAX, + ("invalid message type %u", msg_type)); + + msg_proc = vmbus_chan_msgprocs[msg_type]; + if (msg_proc != NULL) + msg_proc(sc, msg); +} + +void +vmbus_chan_set_readbatch(struct vmbus_channel *chan, bool on) +{ + if (!on) + chan->ch_flags &= 
~VMBUS_CHAN_FLAG_BATCHREAD; + else + chan->ch_flags |= VMBUS_CHAN_FLAG_BATCHREAD; +} + +uint32_t +vmbus_chan_id(const struct vmbus_channel *chan) +{ + return chan->ch_id; +} + +uint32_t +vmbus_chan_subidx(const struct vmbus_channel *chan) +{ + return chan->ch_subidx; +} + +bool +vmbus_chan_is_primary(const struct vmbus_channel *chan) +{ + if (VMBUS_CHAN_ISPRIMARY(chan)) + return true; + else + return false; +} + +bool +vmbus_chan_is_hvs(const struct vmbus_channel *chan) +{ + return chan->ch_is_hvs; +} + +bool +vmbus_chan_is_hvs_conn_from_host(const struct vmbus_channel *chan) +{ + KASSERT(vmbus_chan_is_hvs(chan) == true, + ("Not a HyperV Socket channel %u", chan->ch_id)); + if (chan->ch_hvs_conn_from_host != 0) + return true; + else + return false; +} + +struct hyperv_guid * +vmbus_chan_guid_type(struct vmbus_channel *chan) +{ + return &chan->ch_guid_type; +} + +struct hyperv_guid * +vmbus_chan_guid_inst(struct vmbus_channel *chan) +{ + return &chan->ch_guid_inst; +} + +int +vmbus_chan_prplist_nelem(int br_size, int prpcnt_max, int dlen_max) +{ + int elem_size; + + elem_size = __offsetof(struct vmbus_chanpkt_prplist, + cp_range[0].gpa_page[prpcnt_max]); + elem_size += dlen_max; + elem_size = VMBUS_CHANPKT_TOTLEN(elem_size); + + return (vmbus_br_nelem(br_size, elem_size)); +} + +bool +vmbus_chan_tx_empty(const struct vmbus_channel *chan) +{ + + return (vmbus_txbr_empty(&chan->ch_txbr)); +} + +bool +vmbus_chan_rx_empty(const struct vmbus_channel *chan) +{ + + return (vmbus_rxbr_empty(&chan->ch_rxbr)); +} + +static int +vmbus_chan_printf(const struct vmbus_channel *chan, const char *fmt, ...) 
+{ + va_list ap; + device_t dev; + int retval; + + if (chan->ch_dev == NULL || !device_is_alive(chan->ch_dev)) + dev = chan->ch_vmbus->vmbus_dev; + else + dev = chan->ch_dev; + + retval = device_print_prettyname(dev); + va_start(ap, fmt); + retval += vprintf(fmt, ap); + va_end(ap); + + return (retval); +} + +void +vmbus_chan_run_task(struct vmbus_channel *chan, struct task *task) +{ + + taskqueue_enqueue(chan->ch_tq, task); + taskqueue_drain(chan->ch_tq, task); +} + +struct taskqueue * +vmbus_chan_mgmt_tq(const struct vmbus_channel *chan) +{ + + return (chan->ch_mgmt_tq); +} + +bool +vmbus_chan_is_revoked(const struct vmbus_channel *chan) +{ + + if (chan->ch_stflags & VMBUS_CHAN_ST_REVOKED) + return (true); + return (false); +} + +void +vmbus_chan_set_orphan(struct vmbus_channel *chan, struct vmbus_xact_ctx *xact) +{ + + sx_xlock(&chan->ch_orphan_lock); + chan->ch_orphan_xact = xact; + sx_xunlock(&chan->ch_orphan_lock); +} + +void +vmbus_chan_unset_orphan(struct vmbus_channel *chan) +{ + + sx_xlock(&chan->ch_orphan_lock); + chan->ch_orphan_xact = NULL; + sx_xunlock(&chan->ch_orphan_lock); +} + +const void * +vmbus_chan_xact_wait(const struct vmbus_channel *chan, + struct vmbus_xact *xact, size_t *resp_len, bool can_sleep) +{ + const void *ret; + + if (can_sleep) + ret = vmbus_xact_wait(xact, resp_len); + else + ret = vmbus_xact_busywait(xact, resp_len); + if (vmbus_chan_is_revoked(chan)) { + /* + * This xact probably is interrupted, and the + * interruption can race the reply reception, + * so we have to make sure that there are nothing + * left on the RX bufring, i.e. this xact will + * not be touched, once this function returns. + * + * Since the hypervisor will not put more data + * onto the RX bufring once the channel is revoked, + * the following loop will be terminated, once all + * data are drained by the driver's channel + * callback. 
+ */ + while (!vmbus_chan_rx_empty(chan)) { + if (can_sleep) + pause("chxact", 1); + else + DELAY(1000); + } + } + return (ret); +} + +void +vmbus_chan_poll_enable(struct vmbus_channel *chan, u_int pollhz) +{ + struct vmbus_chan_pollarg arg; + struct task poll_cfg; + + KASSERT(chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD, + ("enable polling on non-batch chan%u", chan->ch_id)); + KASSERT(pollhz >= VMBUS_CHAN_POLLHZ_MIN && + pollhz <= VMBUS_CHAN_POLLHZ_MAX, ("invalid pollhz %u", pollhz)); + + arg.poll_chan = chan; + arg.poll_hz = pollhz; + TASK_INIT(&poll_cfg, 0, vmbus_chan_pollcfg_task, &arg); + vmbus_chan_run_task(chan, &poll_cfg); +} + +void +vmbus_chan_poll_disable(struct vmbus_channel *chan) +{ + struct task poll_dis; + + KASSERT(chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD, + ("disable polling on non-batch chan%u", chan->ch_id)); + + TASK_INIT(&poll_dis, 0, vmbus_chan_polldis_task, chan); + vmbus_chan_run_task(chan, &poll_dis); +} diff --git a/sys/dev/hyperv/vmbus/vmbus_chanvar.h b/sys/dev/hyperv/vmbus/vmbus_chanvar.h new file mode 100644 index 000000000000..b20b0119bc04 --- /dev/null +++ b/sys/dev/hyperv/vmbus/vmbus_chanvar.h @@ -0,0 +1,195 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMBUS_CHANVAR_H_ +#define _VMBUS_CHANVAR_H_ + +#include <sys/param.h> +#include <sys/callout.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/queue.h> +#include <sys/sysctl.h> +#include <sys/sx.h> +#include <sys/taskqueue.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/include/hyperv_busdma.h> +#include <dev/hyperv/include/vmbus.h> +#include <dev/hyperv/vmbus/vmbus_brvar.h> + +/* + * Per-channel state: RX/TX bufrings, the callback and polling machinery, + * MNF trigger and event flag locations, primary/sub-channel linkage, and + * the Hyper-V socket markers. The struct is cacheline aligned and + * ch_txbr starts a cacheline of its own. + */ +struct vmbus_channel { + /* + * NOTE: + * Fields before ch_txbr are only accessed on this channel's + * target CPU. + */ + uint32_t ch_flags; /* VMBUS_CHAN_FLAG_ */ + int ch_poll_flags; /* callout flags */ + + /* + * RX bufring; immediately following the TX bufring (presumably + * within ch_bufring, where the TX bufring comes first; in this + * struct ch_rxbr is declared before ch_txbr). + */ + struct vmbus_rxbr ch_rxbr; + + struct taskqueue *ch_tq; + struct task ch_task; + struct task ch_poll_task; + sbintime_t ch_poll_intvl; + struct callout ch_poll_timeo; + vmbus_chan_callback_t ch_cb; + void *ch_cbarg; + + /* + * TX bufring; at the beginning of ch_bufring. + * + * NOTE: + * Put TX bufring and the following MNF/evtflag to a new + * cacheline, since they will be accessed on all CPUs by + * locking ch_txbr first. + * + * XXX + * TX bufring and following MNF/evtflags do _not_ fit in + * one 64B cacheline. + */ + struct vmbus_txbr ch_txbr __aligned(CACHE_LINE_SIZE); + uint32_t ch_txflags; /* VMBUS_CHAN_TXF_ */ + + /* + * These are based on the vmbus_chanmsg_choffer.chm_montrig. + * Save it here for easy access. 
+ */ + uint32_t ch_montrig_mask;/* MNF trig mask */ + volatile uint32_t *ch_montrig; /* MNF trigger loc. */ + + /* + * These are based on the vmbus_chanmsg_choffer.chm_chanid. + * Save it here for easy access. + */ + u_long ch_evtflag_mask;/* event flag */ + volatile u_long *ch_evtflag; /* event flag loc. */ + + /* + * Rarely used fields. + */ + + struct hyperv_mon_param *ch_monprm; + struct hyperv_dma ch_monprm_dma; + + uint32_t ch_id; /* channel id */ + device_t ch_dev; + struct vmbus_softc *ch_vmbus; + + int ch_cpuid; /* owner cpu */ + /* + * Virtual cpuid for ch_cpuid; it is used to communicate cpuid + * related information w/ Hyper-V. If MSR_HV_VP_INDEX does not + * exist, ch_vcpuid will always be 0 for compatibility. + */ + uint32_t ch_vcpuid; + + /* + * If this is a primary channel, ch_subchan* fields + * contain sub-channels belonging to this primary + * channel. + */ + struct mtx ch_subchan_lock; + TAILQ_HEAD(, vmbus_channel) ch_subchans; + int ch_subchan_cnt; + + /* If this is a sub-channel */ + TAILQ_ENTRY(vmbus_channel) ch_sublink; /* sub-channel link */ + struct vmbus_channel *ch_prichan; /* owner primary chan */ + + void *ch_bufring; /* TX+RX bufrings */ + struct hyperv_dma ch_bufring_dma; + uint32_t ch_bufring_gpadl; + + struct task ch_attach_task; /* run in ch_mgmt_tq */ + struct task ch_detach_task; /* run in ch_mgmt_tq */ + struct taskqueue *ch_mgmt_tq; + + /* If this is a primary channel */ + TAILQ_ENTRY(vmbus_channel) ch_prilink; /* primary chan link */ + + TAILQ_ENTRY(vmbus_channel) ch_link; /* channel link */ + uint32_t ch_subidx; /* subchan index */ + volatile uint32_t ch_stflags; /* atomic-op */ + /* VMBUS_CHAN_ST_ */ + struct hyperv_guid ch_guid_type; + struct hyperv_guid ch_guid_inst; + + struct sx ch_orphan_lock; + struct vmbus_xact_ctx *ch_orphan_xact; + + int ch_refs; + + /* + * These are for HyperV socket channel only + */ + bool ch_is_hvs; + uint8_t ch_hvs_conn_from_host; + + struct sysctl_ctx_list ch_sysctl_ctx; +} 
__aligned(CACHE_LINE_SIZE); + +/* A channel is primary when its sub-channel index is 0. */ +#define VMBUS_CHAN_ISPRIMARY(chan) ((chan)->ch_subidx == 0) + +/* + * If this flag is set, this channel's interrupt will be masked in ISR, + * and the RX bufring will be drained before this channel's interrupt is + * unmasked. + * + * This flag is turned on by default. Drivers can turn it off according + * to their own requirement. + */ +#define VMBUS_CHAN_FLAG_BATCHREAD 0x0002 + +#define VMBUS_CHAN_TXF_HASMNF 0x0001 + +/* + * Bit positions (_SHIFT) and the matching single-bit masks for + * vmbus_channel.ch_stflags. + */ +#define VMBUS_CHAN_ST_OPENED_SHIFT 0 +#define VMBUS_CHAN_ST_ONPRIL_SHIFT 1 +#define VMBUS_CHAN_ST_ONSUBL_SHIFT 2 +#define VMBUS_CHAN_ST_ONLIST_SHIFT 3 +#define VMBUS_CHAN_ST_REVOKED_SHIFT 4 /* sticky */ +#define VMBUS_CHAN_ST_OPENED (1 << VMBUS_CHAN_ST_OPENED_SHIFT) +#define VMBUS_CHAN_ST_ONPRIL (1 << VMBUS_CHAN_ST_ONPRIL_SHIFT) +#define VMBUS_CHAN_ST_ONSUBL (1 << VMBUS_CHAN_ST_ONSUBL_SHIFT) +#define VMBUS_CHAN_ST_ONLIST (1 << VMBUS_CHAN_ST_ONLIST_SHIFT) +#define VMBUS_CHAN_ST_REVOKED (1 << VMBUS_CHAN_ST_REVOKED_SHIFT) + +struct vmbus_softc; +struct vmbus_message; + +void vmbus_event_proc(struct vmbus_softc *, int); +void vmbus_event_proc_compat(struct vmbus_softc *, int); +void vmbus_chan_msgproc(struct vmbus_softc *, + const struct vmbus_message *); +void vmbus_chan_destroy_all(struct vmbus_softc *); + +#endif /* !_VMBUS_CHANVAR_H_ */ diff --git a/sys/dev/hyperv/vmbus/vmbus_et.c b/sys/dev/hyperv/vmbus/vmbus_et.c new file mode 100644 index 000000000000..d9ab2a9485e7 --- /dev/null +++ b/sys/dev/hyperv/vmbus/vmbus_et.c @@ -0,0 +1,201 @@ +/*- + * Copyright (c) 2015,2016-2017 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/proc.h> +#include <sys/smp.h> +#include <sys/systm.h> +#include <sys/timeet.h> + +#include <dev/hyperv/include/hyperv.h> +#include <dev/hyperv/vmbus/hyperv_reg.h> +#include <dev/hyperv/vmbus/hyperv_var.h> +#include <dev/hyperv/vmbus/vmbus_var.h> + +#define VMBUS_ET_NAME "hvet" + +#define MSR_HV_STIMER0_CFG_SINT \ + ((((uint64_t)VMBUS_SINT_TIMER) << MSR_HV_STIMER_CFG_SINT_SHIFT) & \ + MSR_HV_STIMER_CFG_SINT_MASK) + +/* + * Additionally required feature: + * - SynIC is needed for interrupt generation. 
+ */ +#define CPUID_HV_ET_MASK (CPUID_HV_MSR_SYNIC | \ + CPUID_HV_MSR_SYNTIMER) + +static void vmbus_et_identify(driver_t *, device_t); +static int vmbus_et_probe(device_t); +static int vmbus_et_attach(device_t); +static int vmbus_et_detach(device_t); +static int vmbus_et_start(struct eventtimer *, sbintime_t, + sbintime_t); + +static struct eventtimer vmbus_et; + +static device_method_t vmbus_et_methods[] = { + DEVMETHOD(device_identify, vmbus_et_identify), + DEVMETHOD(device_probe, vmbus_et_probe), + DEVMETHOD(device_attach, vmbus_et_attach), + DEVMETHOD(device_detach, vmbus_et_detach), + + DEVMETHOD_END +}; + +static driver_t vmbus_et_driver = { + VMBUS_ET_NAME, + vmbus_et_methods, + 0 +}; + +static devclass_t vmbus_et_devclass; + +DRIVER_MODULE(hv_et, vmbus, vmbus_et_driver, vmbus_et_devclass, NULL, NULL); +MODULE_VERSION(hv_et, 1); + +static __inline uint64_t +hyperv_sbintime2count(sbintime_t time) +{ + struct timespec val; + + val = sbttots(time); + return (val.tv_sec * HYPERV_TIMER_FREQ) + + (val.tv_nsec / HYPERV_TIMER_NS_FACTOR); +} + +static int +vmbus_et_start(struct eventtimer *et __unused, sbintime_t first, + sbintime_t period __unused) +{ + uint64_t current; + + current = hyperv_tc64(); + current += hyperv_sbintime2count(first); + wrmsr(MSR_HV_STIMER0_COUNT, current); + + return (0); +} + +void +vmbus_et_intr(struct trapframe *frame) +{ + struct trapframe *oldframe; + struct thread *td; + + if (vmbus_et.et_active) { + td = curthread; + td->td_intr_nesting_level++; + oldframe = td->td_intr_frame; + td->td_intr_frame = frame; + vmbus_et.et_event_cb(&vmbus_et, vmbus_et.et_arg); + td->td_intr_frame = oldframe; + td->td_intr_nesting_level--; + } +} + +static void +vmbus_et_identify(driver_t *driver, device_t parent) +{ + if (device_get_unit(parent) != 0 || + device_find_child(parent, VMBUS_ET_NAME, -1) != NULL || + (hyperv_features & CPUID_HV_ET_MASK) != CPUID_HV_ET_MASK || + hyperv_tc64 == NULL) + return; + + device_add_child(parent, VMBUS_ET_NAME, -1); +} 
+ +static int +vmbus_et_probe(device_t dev) +{ + if (resource_disabled(VMBUS_ET_NAME, 0)) + return (ENXIO); + + device_set_desc(dev, "Hyper-V event timer"); + + return (BUS_PROBE_NOWILDCARD); +} + +static void +vmbus_et_config(void *arg __unused) +{ + /* + * Make sure that STIMER0 is really disabled before writing + * to STIMER0_CONFIG. + * + * "Writing to the configuration register of a timer that + * is already enabled may result in undefined behaviour." + */ + for (;;) { + uint64_t val; + + /* Stop counting, and this also implies disabling STIMER0 */ + wrmsr(MSR_HV_STIMER0_COUNT, 0); + + val = rdmsr(MSR_HV_STIMER0_CONFIG); + if ((val & MSR_HV_STIMER_CFG_ENABLE) == 0) + break; + cpu_spinwait(); + } + wrmsr(MSR_HV_STIMER0_CONFIG, + MSR_HV_STIMER_CFG_AUTOEN | MSR_HV_STIMER0_CFG_SINT); +} + +static int +vmbus_et_attach(device_t dev) +{ + /* TODO: use independent IDT vector */ + + vmbus_et.et_name = "Hyper-V"; + vmbus_et.et_flags = ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU; + vmbus_et.et_quality = 1000; + vmbus_et.et_frequency = HYPERV_TIMER_FREQ; + vmbus_et.et_min_period = (0x00000001ULL << 32) / HYPERV_TIMER_FREQ; + vmbus_et.et_max_period = (0xfffffffeULL << 32) / HYPERV_TIMER_FREQ; + vmbus_et.et_start = vmbus_et_start; + + /* + * Delay a bit to make sure that hyperv_tc64 will not return 0, + * since writing 0 to STIMER0_COUNT will disable STIMER0. + */ + DELAY(100); + smp_rendezvous(NULL, vmbus_et_config, NULL, NULL); + + return (et_register(&vmbus_et)); +} + +static int +vmbus_et_detach(device_t dev) +{ + return (et_deregister(&vmbus_et)); +} diff --git a/sys/dev/hyperv/vmbus/vmbus_if.m b/sys/dev/hyperv/vmbus/vmbus_if.m new file mode 100644 index 000000000000..3b41c5148fdf --- /dev/null +++ b/sys/dev/hyperv/vmbus/vmbus_if.m @@ -0,0 +1,60 @@ +#- +# Copyright (c) 2016 Microsoft Corp. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. 
Redistributions of source code must retain the above copyright +# notice unmodified, this list of conditions, and the following +# disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# $FreeBSD$ +# + +#include <sys/param.h> +#include <sys/bus.h> + +INTERFACE vmbus; + +HEADER { + struct hyperv_guid; + struct taskqueue; +}; + +METHOD uint32_t get_version { + device_t bus; + device_t dev; +}; + +METHOD int probe_guid { + device_t bus; + device_t dev; + const struct hyperv_guid *guid; +}; + +METHOD uint32_t get_vcpu_id { + device_t bus; + device_t dev; + int cpu; +}; + +METHOD struct taskqueue * get_event_taskq { + device_t bus; + device_t dev; + int cpu; +}; diff --git a/sys/dev/hyperv/vmbus/vmbus_reg.h b/sys/dev/hyperv/vmbus/vmbus_reg.h new file mode 100644 index 000000000000..80d197c48ee4 --- /dev/null +++ b/sys/dev/hyperv/vmbus/vmbus_reg.h @@ -0,0 +1,427 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMBUS_REG_H_ +#define _VMBUS_REG_H_ + +#include <sys/param.h> +#include <dev/hyperv/include/hyperv.h> /* XXX for hyperv_guid */ +#include <dev/hyperv/include/vmbus.h> +#include <dev/hyperv/vmbus/hyperv_reg.h> + +/* + * Hyper-V SynIC message format. 
+ */ + +#define VMBUS_MSG_DSIZE_MAX 240 +#define VMBUS_MSG_SIZE 256 + +struct vmbus_message { + uint32_t msg_type; /* HYPERV_MSGTYPE_ */ + uint8_t msg_dsize; /* data size */ + uint8_t msg_flags; /* VMBUS_MSGFLAG_ */ + uint16_t msg_rsvd; + uint64_t msg_id; + uint8_t msg_data[VMBUS_MSG_DSIZE_MAX]; +} __packed; +CTASSERT(sizeof(struct vmbus_message) == VMBUS_MSG_SIZE); + +#define VMBUS_MSGFLAG_PENDING 0x01 + +/* + * Hyper-V SynIC event flags + * + * evt_flags[] is an array of u_long. Either setting below gives + * VMBUS_EVTFLAGS_MAX * VMBUS_EVTFLAG_LEN = 2048 flag bits, i.e. the + * VMBUS_EVTFLAGS_SIZE (256) bytes checked by the CTASSERT. + */ + +#ifdef __LP64__ +#define VMBUS_EVTFLAGS_MAX 32 +#define VMBUS_EVTFLAG_SHIFT 6 +#else +#define VMBUS_EVTFLAGS_MAX 64 +#define VMBUS_EVTFLAG_SHIFT 5 +#endif +#define VMBUS_EVTFLAG_LEN (1 << VMBUS_EVTFLAG_SHIFT) +#define VMBUS_EVTFLAG_MASK (VMBUS_EVTFLAG_LEN - 1) +#define VMBUS_EVTFLAGS_SIZE 256 + +struct vmbus_evtflags { + u_long evt_flags[VMBUS_EVTFLAGS_MAX]; +} __packed; +CTASSERT(sizeof(struct vmbus_evtflags) == VMBUS_EVTFLAGS_SIZE); + +/* + * Hyper-V Monitor Notification Facility + */ + +struct vmbus_mon_trig { + uint32_t mt_pending; + uint32_t mt_armed; +} __packed; + +#define VMBUS_MONTRIGS_MAX 4 +#define VMBUS_MONTRIG_LEN 32 + +struct vmbus_mnf { + uint32_t mnf_state; + uint32_t mnf_rsvd1; + + struct vmbus_mon_trig mnf_trigs[VMBUS_MONTRIGS_MAX]; + uint8_t mnf_rsvd2[536]; + + uint16_t mnf_lat[VMBUS_MONTRIGS_MAX][VMBUS_MONTRIG_LEN]; + uint8_t mnf_rsvd3[256]; + + struct hyperv_mon_param + mnf_param[VMBUS_MONTRIGS_MAX][VMBUS_MONTRIG_LEN]; + uint8_t mnf_rsvd4[1984]; +} __packed; +CTASSERT(sizeof(struct vmbus_mnf) == PAGE_SIZE); + +/* + * Buffer ring + */ +struct vmbus_bufring { + /* + * If br_windex == br_rindex, this bufring is empty; this + * means we can _not_ write data to the bufring, if the + * write is going to make br_windex same as br_rindex. + */ + volatile uint32_t br_windex; + volatile uint32_t br_rindex; + + /* + * Interrupt mask {0,1} + * + * For TX bufring, host sets this to 1, when it is processing + * the TX bufring, so that we can safely skip the TX event + * notification to host. 
+ * + * For RX bufring, once this is set to 1 by us, host will not + * further dispatch interrupts to us, even if there is data + * pending on the RX bufring. This effectively disables the + * interrupt of the channel to which this RX bufring is attached. + */ + volatile uint32_t br_imask; + + /* + * WS2012/Win8 and later versions of Hyper-V implement interrupt + * driven flow management. The feature bit feat_pending_snd_sz + * is set by the host on the host->guest buffer ring, and by the + * guest on the guest->host buffer ring. + * + * The meaning of the feature bit is a bit complex in that it has + * semantics that apply to both buffer rings. If the guest sets + * the feature bit in the guest->host buffer ring, the guest is + * telling the host that: + * 1) It will set the br_pending_snd_sz field in the guest->host + * buffer ring when it is waiting for space to become available, and + * 2) It will read the br_pending_snd_sz field in the host->guest + * ring buffer and interrupt the host when it frees enough space + * + * Similarly, if the host sets the feature bit in the host->guest + * ring buffer, the host is telling the guest that: + * 1) It will set the br_pending_snd_sz field in the host->guest ring + * buffer when it is waiting for space to become available, and + * 2) It will read the br_pending_snd_sz field in the guest->host + * ring buffer and interrupt the guest when it frees enough space + * + * If either the guest or host does not set the feature bit that it + * owns, that guest or host must do polling if it encounters a full + * ring buffer, and not signal the other end with an interrupt. 
+ */ + volatile uint32_t br_pending_snd_sz; + uint32_t br_rsvd1[12]; + union { + struct { + uint32_t feat_pending_snd_sz:1; + }; + uint32_t value; + } br_feature_bits; + + /* + * Padding: the fields above take 68 bytes, so 4020 bytes of + * padding place the 8-byte br_g2h_intr_cnt at the very end of the + * PAGE_SIZE header checked by the CTASSERT below. + */ + uint8_t br_rsvd2[4020]; + + /* + * Total guest to host interrupt count + * - For rx ring, this counts the guest signaling host when this rx + * ring changes from full to not full. + * + * - For tx ring, this counts the guest signaling host when this tx + * ring changes from empty to non-empty. + */ + uint64_t br_g2h_intr_cnt; + + uint8_t br_data[]; +} __packed; +CTASSERT(sizeof(struct vmbus_bufring) == PAGE_SIZE); + +/* + * Channel + */ + +#define VMBUS_CHAN_MAX_COMPAT 256 +#define VMBUS_CHAN_MAX (VMBUS_EVTFLAG_LEN * VMBUS_EVTFLAGS_MAX) + +/* + * Channel packets + * + * Packet lengths are stored shifted right by VMBUS_CHANPKT_SIZE_SHIFT + * (see VMBUS_CHANPKT_SETLEN); total lengths are rounded up to + * VMBUS_CHANPKT_SIZE_ALIGN. + */ + +#define VMBUS_CHANPKT_SIZE_ALIGN (1 << VMBUS_CHANPKT_SIZE_SHIFT) + +#define VMBUS_CHANPKT_SETLEN(pktlen, len) \ +do { \ + (pktlen) = (len) >> VMBUS_CHANPKT_SIZE_SHIFT; \ +} while (0) + +#define VMBUS_CHANPKT_TOTLEN(tlen) \ + roundup2((tlen), VMBUS_CHANPKT_SIZE_ALIGN) + +#define VMBUS_CHANPKT_HLEN_MIN \ + (sizeof(struct vmbus_chanpkt_hdr) >> VMBUS_CHANPKT_SIZE_SHIFT) + +struct vmbus_chanpkt { + struct vmbus_chanpkt_hdr cp_hdr; +} __packed; + +struct vmbus_chanpkt_sglist { + struct vmbus_chanpkt_hdr cp_hdr; + uint32_t cp_rsvd; + uint32_t cp_gpa_cnt; + struct vmbus_gpa cp_gpa[]; +} __packed; + +struct vmbus_chanpkt_prplist { + struct vmbus_chanpkt_hdr cp_hdr; + uint32_t cp_rsvd; + uint32_t cp_range_cnt; + struct vmbus_gpa_range cp_range[]; +} __packed; + +/* + * Channel messages + * - Embedded in vmbus_message.msg_data, e.g. response and notification. + * - Embedded in hypercall_postmsg_in.hc_data, e.g. request. 
+ */ + +/* + * Message type codes. The trailing tags presumably read NOTE = + * notification, REQ = request, RESP = response -- TODO confirm. + * VMBUS_CHANMSG_TYPE_MAX is one past the highest defined type. + */ +#define VMBUS_CHANMSG_TYPE_CHOFFER 1 /* NOTE */ +#define VMBUS_CHANMSG_TYPE_CHRESCIND 2 /* NOTE */ +#define VMBUS_CHANMSG_TYPE_CHREQUEST 3 /* REQ */ +#define VMBUS_CHANMSG_TYPE_CHOFFER_DONE 4 /* NOTE */ +#define VMBUS_CHANMSG_TYPE_CHOPEN 5 /* REQ */ +#define VMBUS_CHANMSG_TYPE_CHOPEN_RESP 6 /* RESP */ +#define VMBUS_CHANMSG_TYPE_CHCLOSE 7 /* REQ */ +#define VMBUS_CHANMSG_TYPE_GPADL_CONN 8 /* REQ */ +#define VMBUS_CHANMSG_TYPE_GPADL_SUBCONN 9 /* REQ */ +#define VMBUS_CHANMSG_TYPE_GPADL_CONNRESP 10 /* RESP */ +#define VMBUS_CHANMSG_TYPE_GPADL_DISCONN 11 /* REQ */ +#define VMBUS_CHANMSG_TYPE_GPADL_DISCONNRESP 12 /* RESP */ +#define VMBUS_CHANMSG_TYPE_CHFREE 13 /* REQ */ +#define VMBUS_CHANMSG_TYPE_CONNECT 14 /* REQ */ +#define VMBUS_CHANMSG_TYPE_CONNECT_RESP 15 /* RESP */ +#define VMBUS_CHANMSG_TYPE_DISCONNECT 16 /* REQ */ +#define VMBUS_CHANMSG_TYPE_17 17 +#define VMBUS_CHANMSG_TYPE_18 18 +#define VMBUS_CHANMSG_TYPE_19 19 +#define VMBUS_CHANMSG_TYPE_20 20 +#define VMBUS_CHANMSG_TYPE_TL_CONN 21 /* REQ */ +#define VMBUS_CHANMSG_TYPE_22 22 +#define VMBUS_CHANMSG_TYPE_TL_RESULT 23 /* RESP */ +#define VMBUS_CHANMSG_TYPE_MAX 24 + +/* Common header; every vmbus_chanmsg_* struct below starts with it. */ +struct vmbus_chanmsg_hdr { + uint32_t chm_type; /* VMBUS_CHANMSG_TYPE_ */ + uint32_t chm_rsvd; +} __packed; + +/* VMBUS_CHANMSG_TYPE_CONNECT */ +struct vmbus_chanmsg_connect { + struct vmbus_chanmsg_hdr chm_hdr; + uint32_t chm_ver; + uint32_t chm_rsvd; + uint64_t chm_evtflags; + uint64_t chm_mnf1; + uint64_t chm_mnf2; +} __packed; + +/* VMBUS_CHANMSG_TYPE_CONNECT_RESP */ +struct vmbus_chanmsg_connect_resp { + struct vmbus_chanmsg_hdr chm_hdr; + uint8_t chm_done; +} __packed; + +/* VMBUS_CHANMSG_TYPE_CHREQUEST */ +struct vmbus_chanmsg_chrequest { + struct vmbus_chanmsg_hdr chm_hdr; +} __packed; + +/* VMBUS_CHANMSG_TYPE_DISCONNECT */ +struct vmbus_chanmsg_disconnect { + struct vmbus_chanmsg_hdr chm_hdr; +} __packed; + +/* VMBUS_CHANMSG_TYPE_TL_CONN */ +/* Hyper-V socket guest connect request */ +struct vmbus_chanmsg_tl_connect { + struct 
vmbus_chanmsg_hdr chm_hdr; + struct hyperv_guid guest_endpoint_id; + struct hyperv_guid host_service_id; +} __packed; + + +/* VMBUS_CHANMSG_TYPE_CHOPEN */ +struct vmbus_chanmsg_chopen { + struct vmbus_chanmsg_hdr chm_hdr; + uint32_t chm_chanid; + uint32_t chm_openid; + uint32_t chm_gpadl; + uint32_t chm_vcpuid; + uint32_t chm_txbr_pgcnt; +#define VMBUS_CHANMSG_CHOPEN_UDATA_SIZE 120 + uint8_t chm_udata[VMBUS_CHANMSG_CHOPEN_UDATA_SIZE]; +} __packed; + +/* VMBUS_CHANMSG_TYPE_CHOPEN_RESP */ +struct vmbus_chanmsg_chopen_resp { + struct vmbus_chanmsg_hdr chm_hdr; + uint32_t chm_chanid; + uint32_t chm_openid; + uint32_t chm_status; +} __packed; + +/* VMBUS_CHANMSG_TYPE_GPADL_CONN */ +struct vmbus_chanmsg_gpadl_conn { + struct vmbus_chanmsg_hdr chm_hdr; + uint32_t chm_chanid; + uint32_t chm_gpadl; + uint16_t chm_range_len; + uint16_t chm_range_cnt; + struct vmbus_gpa_range chm_range; +} __packed; + +/* + * Page count limit for one GPADL_CONN message; the CTASSERT checks that + * a message carrying this many chm_range.gpa_page[] entries still fits + * in HYPERCALL_POSTMSGIN_DSIZE_MAX. + */ +#define VMBUS_CHANMSG_GPADL_CONN_PGMAX 26 +CTASSERT(__offsetof(struct vmbus_chanmsg_gpadl_conn, + chm_range.gpa_page[VMBUS_CHANMSG_GPADL_CONN_PGMAX]) <= + HYPERCALL_POSTMSGIN_DSIZE_MAX); + +/* VMBUS_CHANMSG_TYPE_GPADL_SUBCONN */ +struct vmbus_chanmsg_gpadl_subconn { + struct vmbus_chanmsg_hdr chm_hdr; + uint32_t chm_msgno; + uint32_t chm_gpadl; + uint64_t chm_gpa_page[]; +} __packed; + +/* + * Page count limit for one GPADL_SUBCONN message; same size check as + * for VMBUS_CHANMSG_GPADL_CONN_PGMAX. + */ +#define VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX 28 +CTASSERT(__offsetof(struct vmbus_chanmsg_gpadl_subconn, + chm_gpa_page[VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX]) <= + HYPERCALL_POSTMSGIN_DSIZE_MAX); + +/* VMBUS_CHANMSG_TYPE_GPADL_CONNRESP */ +struct vmbus_chanmsg_gpadl_connresp { + struct vmbus_chanmsg_hdr chm_hdr; + uint32_t chm_chanid; + uint32_t chm_gpadl; + uint32_t chm_status; +} __packed; + +/* VMBUS_CHANMSG_TYPE_CHCLOSE */ +struct vmbus_chanmsg_chclose { + struct vmbus_chanmsg_hdr chm_hdr; + uint32_t chm_chanid; +} __packed; + +/* VMBUS_CHANMSG_TYPE_GPADL_DISCONN */ +struct vmbus_chanmsg_gpadl_disconn { + struct vmbus_chanmsg_hdr chm_hdr; + uint32_t chm_chanid; + uint32_t chm_gpadl; +} __packed; + 
+/* VMBUS_CHANMSG_TYPE_CHFREE */ +struct vmbus_chanmsg_chfree { + struct vmbus_chanmsg_hdr chm_hdr; + uint32_t chm_chanid; +} __packed; + +/* VMBUS_CHANMSG_TYPE_CHRESCIND */ +struct vmbus_chanmsg_chrescind { + struct vmbus_chanmsg_hdr chm_hdr; + uint32_t chm_chanid; +} __packed; + +/* Size of the user defined data buffer for non-pipe offers */ +#define VMBUS_CHANMSG_CHOFFER_UDATA_SIZE 120 + +/* + * Size of the user defined data buffer for pipe offers: the non-pipe + * size minus the 4-byte pipe_mode field that precedes it. + */ +#define VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE 116 + +/* VMBUS_CHANMSG_TYPE_CHOFFER */ +struct vmbus_chanmsg_choffer { + struct vmbus_chanmsg_hdr chm_hdr; + struct hyperv_guid chm_chtype; + struct hyperv_guid chm_chinst; + uint64_t chm_chlat; /* unit: 100ns */ + uint32_t chm_chrev; + uint32_t chm_svrctx_sz; + uint16_t chm_chflags; + uint16_t chm_mmio_sz; /* unit: MB */ + + union { + /* Non-pipes */ + struct { + uint8_t user_def[VMBUS_CHANMSG_CHOFFER_UDATA_SIZE]; + } std; + /* + * Pipes: + * For integrated pipe protocol, which is implemented on + * top of standard user-defined data. Pipe clients have + * VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE bytes left for + * their own use. + */ + struct { + uint32_t pipe_mode; + uint8_t + user_def[VMBUS_CHANMSG_CHOFFER_UDATA_PIPE_SIZE]; + } pipe; + } chm_udata; + + uint16_t chm_subidx; + uint16_t chm_rsvd; + uint32_t chm_chanid; + uint8_t chm_montrig; + uint8_t chm_flags1; /* VMBUS_CHOFFER_FLAG1_ */ + uint16_t chm_flags2; + uint32_t chm_connid; +} __packed; +CTASSERT(sizeof(struct vmbus_chanmsg_choffer) <= VMBUS_MSG_DSIZE_MAX); + +/* Server Flag */ +#define VMBUS_CHAN_TLNPI_PROVIDER_OFFER 0x2000 + +#define VMBUS_CHOFFER_FLAG1_HASMNF 0x01 + +#endif /* !_VMBUS_REG_H_ */ diff --git a/sys/dev/hyperv/vmbus/vmbus_res.c b/sys/dev/hyperv/vmbus/vmbus_res.c new file mode 100644 index 000000000000..fba5a732ca58 --- /dev/null +++ b/sys/dev/hyperv/vmbus/vmbus_res.c @@ -0,0 +1,99 @@ +/*- + * Copyright (c) 2017 Microsoft Corp. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> + +#include <contrib/dev/acpica/include/acpi.h> +#include <dev/acpica/acpivar.h> + +#include <dev/hyperv/include/hyperv.h> + +#include "acpi_if.h" +#include "bus_if.h" + +static int vmbus_res_probe(device_t); +static int vmbus_res_attach(device_t); +static int vmbus_res_detach(device_t); + +static device_method_t vmbus_res_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, vmbus_res_probe), + DEVMETHOD(device_attach, vmbus_res_attach), + DEVMETHOD(device_detach, vmbus_res_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + DEVMETHOD_END +}; + +static driver_t vmbus_res_driver = { + "vmbus_res", + vmbus_res_methods, + 1 +}; + +static devclass_t vmbus_res_devclass; + +DRIVER_MODULE(vmbus_res, acpi, vmbus_res_driver, vmbus_res_devclass, + NULL, NULL); +MODULE_DEPEND(vmbus_res, acpi, 1, 1, 1); +MODULE_VERSION(vmbus_res, 1); + +static int +vmbus_res_probe(device_t dev) +{ + char *id[] = { "VMBUS", NULL }; + int rv; + + if (device_get_unit(dev) != 0 || vm_guest != VM_GUEST_HV || + (hyperv_features & CPUID_HV_MSR_SYNIC) == 0) + return (ENXIO); + rv = ACPI_ID_PROBE(device_get_parent(dev), dev, id, NULL); + if (rv <= 0) + device_set_desc(dev, "Hyper-V Vmbus Resource"); + return (rv); +} + +static int +vmbus_res_attach(device_t dev __unused) +{ + + return (0); +} + +static int +vmbus_res_detach(device_t dev __unused) +{ + + return (0); +} diff --git a/sys/dev/hyperv/vmbus/vmbus_var.h b/sys/dev/hyperv/vmbus/vmbus_var.h new file mode 100644 index 000000000000..0e42d70d8257 --- /dev/null +++ b/sys/dev/hyperv/vmbus/vmbus_var.h @@ -0,0 +1,175 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMBUS_VAR_H_ +#define _VMBUS_VAR_H_ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/taskqueue.h> +#include <sys/rman.h> + +#include <dev/hyperv/include/hyperv_busdma.h> +#include <dev/pci/pcivar.h> +#include <dev/pci/pcib_private.h> + +/* + * NOTE: DO NOT CHANGE THIS. + */ +#define VMBUS_SINT_MESSAGE 2 +/* + * NOTE: + * - DO NOT set it to the same value as VMBUS_SINT_MESSAGE. + * - DO NOT set it to 0. 
+ */ +#define VMBUS_SINT_TIMER 4 + +/* + * NOTE: DO NOT CHANGE THESE + */ +#define VMBUS_CONNID_MESSAGE 1 +#define VMBUS_CONNID_EVENT 2 + +struct vmbus_message; +struct vmbus_softc; + +typedef void (*vmbus_chanmsg_proc_t)(struct vmbus_softc *, + const struct vmbus_message *); + +#define VMBUS_CHANMSG_PROC(name, func) \ + [VMBUS_CHANMSG_TYPE_##name] = func +#define VMBUS_CHANMSG_PROC_WAKEUP(name) \ + VMBUS_CHANMSG_PROC(name, vmbus_msghc_wakeup) + +struct vmbus_pcpu_data { + u_long *intr_cnt; /* Hyper-V interrupt counter */ + struct vmbus_message *message; /* shared messages */ + uint32_t vcpuid; /* virtual cpuid */ + int event_flags_cnt;/* # of event flags */ + struct vmbus_evtflags *event_flags; /* event flags from host */ + + /* Rarely used fields */ + struct hyperv_dma message_dma; /* busdma glue */ + struct hyperv_dma event_flags_dma;/* busdma glue */ + struct taskqueue *event_tq; /* event taskq */ + struct taskqueue *message_tq; /* message taskq */ + struct task message_task; /* message task */ +} __aligned(CACHE_LINE_SIZE); + +#if __FreeBSD_version < 1100000 +typedef u_long rman_res_t; +#endif + +struct vmbus_softc { + void (*vmbus_event_proc)(struct vmbus_softc *, int); + u_long *vmbus_tx_evtflags; + /* event flags to host */ + struct vmbus_mnf *vmbus_mnf2; /* monitored by host */ + + u_long *vmbus_rx_evtflags; + /* compat evtflgs from host */ + struct vmbus_channel *volatile *vmbus_chmap; + struct vmbus_xact_ctx *vmbus_xc; + struct vmbus_pcpu_data vmbus_pcpu[MAXCPU]; + + /* + * Rarely used fields + */ + + device_t vmbus_dev; + int vmbus_idtvec; + uint32_t vmbus_flags; /* see VMBUS_FLAG_ */ + uint32_t vmbus_version; + uint32_t vmbus_gpadl; + + /* Shared memory for vmbus_{rx,tx}_evtflags */ + void *vmbus_evtflags; + struct hyperv_dma vmbus_evtflags_dma; + + void *vmbus_mnf1; /* monitored by VM, unused */ + struct hyperv_dma vmbus_mnf1_dma; + struct hyperv_dma vmbus_mnf2_dma; + + bool vmbus_scandone; + struct task vmbus_scandone_task; + + struct taskqueue 
*vmbus_devtq; /* for dev attach/detach */ + struct taskqueue *vmbus_subchtq; /* for sub-chan attach/detach */ + + /* Primary channels */ + struct mtx vmbus_prichan_lock; + TAILQ_HEAD(, vmbus_channel) vmbus_prichans; + + /* Complete channel list */ + struct mtx vmbus_chan_lock; + TAILQ_HEAD(, vmbus_channel) vmbus_chans; + + struct intr_config_hook vmbus_intrhook; + +#ifdef NEW_PCIB + /* The list of usable MMIO ranges for PCIe pass-through */ + struct pcib_host_resources vmbus_mmio_res; +#endif +}; + +#define VMBUS_FLAG_ATTACHED 0x0001 /* vmbus was attached */ +#define VMBUS_FLAG_SYNIC 0x0002 /* SynIC was setup */ + +#define VMBUS_PCPU_GET(sc, field, cpu) (sc)->vmbus_pcpu[(cpu)].field +#define VMBUS_PCPU_PTR(sc, field, cpu) &(sc)->vmbus_pcpu[(cpu)].field + +struct vmbus_channel; +struct trapframe; +struct vmbus_message; +struct vmbus_msghc; + +void vmbus_handle_intr(struct trapframe *); +int vmbus_add_child(struct vmbus_channel *); +int vmbus_delete_child(struct vmbus_channel *); +void vmbus_et_intr(struct trapframe *); +uint32_t vmbus_gpadl_alloc(struct vmbus_softc *); + +struct vmbus_msghc * + vmbus_msghc_get(struct vmbus_softc *, size_t); +void vmbus_msghc_put(struct vmbus_softc *, struct vmbus_msghc *); +void *vmbus_msghc_dataptr(struct vmbus_msghc *); +int vmbus_msghc_exec_noresult(struct vmbus_msghc *); +int vmbus_msghc_exec(struct vmbus_softc *, struct vmbus_msghc *); +void vmbus_msghc_exec_cancel(struct vmbus_softc *, + struct vmbus_msghc *); +const struct vmbus_message * + vmbus_msghc_wait_result(struct vmbus_softc *, + struct vmbus_msghc *); +const struct vmbus_message * + vmbus_msghc_poll_result(struct vmbus_softc *, + struct vmbus_msghc *); +void vmbus_msghc_wakeup(struct vmbus_softc *, + const struct vmbus_message *); +void vmbus_msghc_reset(struct vmbus_msghc *, size_t); + +#endif /* !_VMBUS_VAR_H_ */ diff --git a/sys/dev/hyperv/vmbus/vmbus_xact.c b/sys/dev/hyperv/vmbus/vmbus_xact.c new file mode 100644 index 000000000000..90bdba7e1058 --- /dev/null +++ 
b/sys/dev/hyperv/vmbus/vmbus_xact.c @@ -0,0 +1,442 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+
+#include <dev/hyperv/include/hyperv_busdma.h>
+#include <dev/hyperv/include/vmbus_xact.h>
+/*
+ * One transaction: a DMA-able request buffer plus the storage that
+ * receives the matching response.
+ */
+struct vmbus_xact {
+	struct vmbus_xact_ctx		*x_ctx;		/* owning context */
+	void				*x_priv;	/* xc_priv_size bytes; NULL if that size is 0 */
+
+	void				*x_req;		/* request buffer, xc_req_size bytes */
+	struct hyperv_dma		x_req_dma;	/* DMA state backing x_req */
+
+	const void			*x_resp;	/* NULL until a response is saved, then x_resp0 */
+	size_t				x_resp_len;	/* number of valid bytes behind x_resp */
+	void				*x_resp0;	/* response storage, xc_resp_size bytes */
+};
+/*
+ * Transaction context.  vmbus_xact_ctx_create() allocates exactly one
+ * vmbus_xact for it; that xact is parked in xc_free while idle, handed
+ * out by vmbus_xact_get(), and given back by vmbus_xact_put().
+ */
+struct vmbus_xact_ctx {
+	size_t				xc_req_size;	/* size of x_req */
+	size_t				xc_resp_size;	/* size of x_resp0; larger responses are truncated */
+	size_t				xc_priv_size;	/* size of x_priv; may be 0 */
+
+	struct mtx			xc_lock;
+	/*
+	 * Protected by xc_lock.
+	 */
+	uint32_t			xc_flags;	/* VMBUS_XACT_CTXF_ */
+	struct vmbus_xact		*xc_free;	/* idle xact; NULL while it is handed out */
+	struct vmbus_xact		*xc_active;	/* xact currently expecting a response */
+	struct vmbus_xact		*xc_orphan;	/* xact reclaimed by vmbus_xact_ctx_orphan() */
+};
+/*
+ * Set once by vmbus_xact_ctx_orphan().  While set, vmbus_xact_get()
+ * returns NULL and the response waiters stop waiting.
+ */
+#define VMBUS_XACT_CTXF_DESTROY		0x0001
+
+static struct vmbus_xact	*vmbus_xact_alloc(struct vmbus_xact_ctx *,
+				    bus_dma_tag_t);
+static void			vmbus_xact_free(struct vmbus_xact *);
+static struct vmbus_xact	*vmbus_xact_get1(struct vmbus_xact_ctx *,
+				    uint32_t);
+static const void		*vmbus_xact_wait1(struct vmbus_xact *, size_t *,
+				    bool);
+static const void		*vmbus_xact_return(struct vmbus_xact *,
+				    size_t *);
+static void			vmbus_xact_save_resp(struct vmbus_xact *,
+				    const void *, size_t);
+static void			vmbus_xact_ctx_free(struct vmbus_xact_ctx *);
+/*
+ * Allocate an xact for 'ctx': the structure itself (zeroed), a
+ * PAGE_SIZE-aligned DMA request buffer of xc_req_size bytes, an optional
+ * private area and the response buffer.  The private area and the
+ * response buffer are allocated without M_ZERO.
+ *
+ * Returns NULL only when the DMA request buffer cannot be allocated; the
+ * malloc(9) results are not checked (they are M_WAITOK requests).
+ */
+static struct vmbus_xact *
+vmbus_xact_alloc(struct vmbus_xact_ctx *ctx, bus_dma_tag_t parent_dtag)
+{
+	struct vmbus_xact *xact;
+
+	xact = malloc(sizeof(*xact), M_DEVBUF, M_WAITOK | M_ZERO);
+	xact->x_ctx = ctx;
+
+	/* XXX assume that page aligned is enough */
+	xact->x_req = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0,
+	    ctx->xc_req_size, &xact->x_req_dma, BUS_DMA_WAITOK);
+	if (xact->x_req == NULL) {
+		free(xact, M_DEVBUF);
+		return (NULL);
+	}
+	if (ctx->xc_priv_size != 0)
+		xact->x_priv = malloc(ctx->xc_priv_size, M_DEVBUF, M_WAITOK);
+	xact->x_resp0 = malloc(ctx->xc_resp_size, M_DEVBUF, M_WAITOK);
+
+	return (xact);
+}
+/*
+ * Release everything vmbus_xact_alloc() set up for 'xact'.
+ *
+ * NOTE(review): the x_priv NULL check looks redundant if free(9) accepts
+ * a NULL address -- confirm against malloc(9) before removing it.
+ */
+static void
+vmbus_xact_free(struct vmbus_xact *xact)
+{
+
+	hyperv_dmamem_free(&xact->x_req_dma, xact->x_req);
+	free(xact->x_resp0, M_DEVBUF);
+	if (xact->x_priv != NULL)
+		free(xact->x_priv, M_DEVBUF);
+	free(xact, M_DEVBUF);
+}
+/*
+ * Take the context's xact out of xc_free, sleeping on &ctx->xc_free
+ * until it is available or until any bit of 'dtor_flag' shows up in
+ * xc_flags.
+ *
+ * Returns the xact (xc_free is then NULL), or NULL when a 'dtor_flag'
+ * bit is set.  With dtor_flag == 0 the flags are never consulted, so the
+ * call waits for the xact unconditionally.
+ */
+static struct vmbus_xact *
+vmbus_xact_get1(struct vmbus_xact_ctx *ctx, uint32_t dtor_flag)
+{
+	struct vmbus_xact *xact;
+
+	mtx_lock(&ctx->xc_lock);
+
+	while ((ctx->xc_flags & dtor_flag) == 0 && ctx->xc_free == NULL)
+		mtx_sleep(&ctx->xc_free, &ctx->xc_lock, 0, "gxact", 0);
+	if (ctx->xc_flags & dtor_flag) {
+		/* Being destroyed */
+		xact = NULL;
+	} else {
+		xact = ctx->xc_free;
+		KASSERT(xact != NULL, ("no free xact"));
+		KASSERT(xact->x_resp == NULL, ("xact has pending response"));
+		ctx->xc_free = NULL;
+	}
+
+	mtx_unlock(&ctx->xc_lock);
+
+	return (xact);
+}
+/*
+ * Create a transaction context whose single xact has a request buffer
+ * of 'req_size' bytes, a response buffer of 'resp_size' bytes and a
+ * private area of 'priv_size' bytes (0 for none).  'req_size' and
+ * 'resp_size' must be non-zero (KASSERT).
+ *
+ * Returns the new context, or NULL if the xact could not be allocated.
+ */
+struct vmbus_xact_ctx *
+vmbus_xact_ctx_create(bus_dma_tag_t dtag, size_t req_size, size_t resp_size,
+    size_t priv_size)
+{
+	struct vmbus_xact_ctx *ctx;
+
+	KASSERT(req_size > 0, ("request size is 0"));
+	KASSERT(resp_size > 0, ("response size is 0"));
+
+	ctx = malloc(sizeof(*ctx), M_DEVBUF, M_WAITOK | M_ZERO);
+	ctx->xc_req_size = req_size;
+	ctx->xc_resp_size = resp_size;
+	ctx->xc_priv_size = priv_size;
+
+	ctx->xc_free = vmbus_xact_alloc(ctx, dtag);
+	if (ctx->xc_free == NULL) {
+		free(ctx, M_DEVBUF);	/* xc_lock is not initialized yet */
+		return (NULL);
+	}
+
+	mtx_init(&ctx->xc_lock, "vmbus xact", NULL, MTX_DEF);
+
+	return (ctx);
+}
+/*
+ * Mark 'ctx' as being destroyed and reclaim its xact into xc_orphan.
+ *
+ * Returns false, without doing anything else, if the context was
+ * already marked; returns true otherwise.  Panics if the xact cannot be
+ * reclaimed.
+ */
+bool
+vmbus_xact_ctx_orphan(struct vmbus_xact_ctx *ctx)
+{
+	mtx_lock(&ctx->xc_lock);
+	if (ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) {
+		mtx_unlock(&ctx->xc_lock);
+		return (false);
+	}
+	ctx->xc_flags |= VMBUS_XACT_CTXF_DESTROY;
+	mtx_unlock(&ctx->xc_lock);
+	/*
+	 * Kick both sleep channels so that threads blocked in
+	 * vmbus_xact_get1() and vmbus_xact_wait1() re-check xc_flags.
+	 */
+	wakeup(&ctx->xc_free);
+	wakeup(&ctx->xc_active);
+	/*
+	 * dtor_flag is 0 here, so this waits until the xact has been put
+	 * back into xc_free even though the context is already flagged.
+	 */
+	ctx->xc_orphan = vmbus_xact_get1(ctx, 0);
+	if (ctx->xc_orphan == NULL)
+		panic("can't get xact");
+	return (true);
+}
+/*
+ * Free an orphaned context: the reclaimed xact, the mutex and the
+ * context itself.  vmbus_xact_ctx_orphan() must have run first
+ * (KASSERT).
+ */
+static void
+vmbus_xact_ctx_free(struct vmbus_xact_ctx *ctx)
+{
+	KASSERT(ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY,
+	    ("xact ctx was not orphaned"));
+	KASSERT(ctx->xc_orphan != NULL, ("no orphaned xact"));
+
+	vmbus_xact_free(ctx->xc_orphan);
+	mtx_destroy(&ctx->xc_lock);
+	free(ctx, M_DEVBUF);
+}
+/*
+ * Orphan 'ctx' and then free it.  The result of
+ * vmbus_xact_ctx_orphan() is ignored.
+ */
+void
+vmbus_xact_ctx_destroy(struct vmbus_xact_ctx *ctx)
+{
+
+	vmbus_xact_ctx_orphan(ctx);
+	vmbus_xact_ctx_free(ctx);
+}
+/*
+ * Obtain the context's xact for a request of 'req_len' bytes, sleeping
+ * until it is free.  The first 'req_len' bytes of the request buffer
+ * are zeroed.
+ *
+ * Returns NULL if the context is being destroyed.  Panics if 'req_len'
+ * exceeds the request size the context was created with.
+ */
+struct vmbus_xact *
+vmbus_xact_get(struct vmbus_xact_ctx *ctx, size_t req_len)
+{
+	struct vmbus_xact *xact;
+
+	if (req_len > ctx->xc_req_size)
+		panic("invalid request size %zu", req_len);
+
+	xact = vmbus_xact_get1(ctx, VMBUS_XACT_CTXF_DESTROY);
+	if (xact == NULL)
+		return (NULL);
+
+	memset(xact->x_req, 0, req_len);
+	return (xact);
+}
+/*
+ * Give 'xact' back to its context and wake up a thread sleeping in
+ * vmbus_xact_get1().  The context must have no active xact and no free
+ * xact at this point (KASSERT).  Note that x_resp is reset, and the
+ * xc_active assertion evaluated, before xc_lock is taken.
+ */
+void
+vmbus_xact_put(struct vmbus_xact *xact)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+
+	KASSERT(ctx->xc_active == NULL, ("pending active xact"));
+	xact->x_resp = NULL;
+
+	mtx_lock(&ctx->xc_lock);
+	KASSERT(ctx->xc_free == NULL, ("has free xact"));
+	ctx->xc_free = xact;
+	mtx_unlock(&ctx->xc_lock);
+	wakeup(&ctx->xc_free);
+}
+/*
+ * Return the request buffer of 'xact'.
+ */
+void *
+vmbus_xact_req_data(const struct vmbus_xact *xact)
+{
+
+	return (xact->x_req);
+}
+/*
+ * Return the hv_paddr recorded in the DMA state of the request buffer.
+ */
+bus_addr_t
+vmbus_xact_req_paddr(const struct vmbus_xact *xact)
+{
+
+	return (xact->x_req_dma.hv_paddr);
+}
+/*
+ * Return the private area of 'xact' (NULL when the context was created
+ * with a private size of 0).  Panics if 'priv_len' exceeds the private
+ * size of the context.
+ */
+void *
+vmbus_xact_priv(const struct vmbus_xact *xact, size_t priv_len)
+{
+
+	if (priv_len > xact->x_ctx->xc_priv_size)
+		panic("invalid priv size %zu", priv_len);
+	return (xact->x_priv);
+}
+/*
+ * Publish 'xact' as the context's active xact, i.e. the one that
+ * vmbus_xact_ctx_wakeup() stores a response into.  The xact must have
+ * no response yet and the context no other active xact (KASSERT).
+ */
+void
+vmbus_xact_activate(struct vmbus_xact *xact)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+
+	KASSERT(xact->x_resp == NULL, ("xact has pending response"));
+
+	mtx_lock(&ctx->xc_lock);
+	KASSERT(ctx->xc_active == NULL, ("pending active xact"));
+	ctx->xc_active = xact;
+	mtx_unlock(&ctx->xc_lock);
+}
+/*
+ * Undo vmbus_xact_activate() without collecting a response.  'xact'
+ * must be the active xact (KASSERT).
+ */
+void
+vmbus_xact_deactivate(struct vmbus_xact *xact)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+
+	mtx_lock(&ctx->xc_lock);
+	KASSERT(ctx->xc_active == xact, ("xact mismatch"));
+	ctx->xc_active = NULL;
+	mtx_unlock(&ctx->xc_lock);
+}
+/*
+ * Hand the response of the active 'xact' to the caller and clear
+ * xc_active.  Must be called with xc_lock held (mtx_assert).
+ *
+ * Returns the response pointer and stores its length in '*resp_len'.
+ * If the context was orphaned before any response arrived, a one-byte
+ * zero response is fabricated first, so the result is never NULL.
+ */
+static const void *
+vmbus_xact_return(struct vmbus_xact *xact, size_t *resp_len)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+	const void *resp;
+
+	mtx_assert(&ctx->xc_lock, MA_OWNED);
+	KASSERT(ctx->xc_active == xact, ("xact trashed"));
+
+	if ((ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) && xact->x_resp == NULL) {
+		uint8_t b = 0;
+
+		/*
+		 * Orphaned and no response was received yet; fake up
+		 * a one-byte response.
+		 */
+		printf("vmbus: xact ctx was orphaned w/ pending xact\n");
+		vmbus_xact_save_resp(ctx->xc_active, &b, sizeof(b));
+	}
+	KASSERT(xact->x_resp != NULL, ("no response"));
+
+	ctx->xc_active = NULL;
+
+	resp = xact->x_resp;
+	*resp_len = xact->x_resp_len;
+
+	return (resp);
+}
+/*
+ * Wait until the active 'xact' has a response or its context is
+ * orphaned, then return the response through vmbus_xact_return().
+ *
+ * With 'can_sleep' the thread sleeps on &ctx->xc_active; otherwise it
+ * polls, dropping xc_lock around a DELAY(1000) on every iteration.
+ */
+static const void *
+vmbus_xact_wait1(struct vmbus_xact *xact, size_t *resp_len,
+    bool can_sleep)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+	const void *resp;
+
+	mtx_lock(&ctx->xc_lock);
+
+	KASSERT(ctx->xc_active == xact, ("xact mismatch"));
+	while (xact->x_resp == NULL &&
+	    (ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) == 0) {
+		if (can_sleep) {
+			mtx_sleep(&ctx->xc_active, &ctx->xc_lock, 0,
+			    "wxact", 0);
+		} else {
+			mtx_unlock(&ctx->xc_lock);
+			DELAY(1000);
+			mtx_lock(&ctx->xc_lock);
+		}
+	}
+	resp = vmbus_xact_return(xact, resp_len);
+
+	mtx_unlock(&ctx->xc_lock);
+
+	return (resp);
+}
+/*
+ * Sleeping variant of vmbus_xact_wait1().
+ */
+const void *
+vmbus_xact_wait(struct vmbus_xact *xact, size_t *resp_len)
+{
+
+	return (vmbus_xact_wait1(xact, resp_len, true /* can sleep */));
+}
+/*
+ * Polling (non-sleeping) variant of vmbus_xact_wait1().
+ */
+const void *
+vmbus_xact_busywait(struct vmbus_xact *xact, size_t *resp_len)
+{
+
+	return (vmbus_xact_wait1(xact, resp_len, false /* can't sleep */));
+}
+/*
+ * Non-blocking check for the response of the active 'xact'.
+ *
+ * Returns NULL and sets '*resp_len' to 0 when no response has been
+ * saved and the context is not orphaned; 'xact' then stays active.
+ * Otherwise behaves like vmbus_xact_wait1() once its wait is over.
+ */
+const void *
+vmbus_xact_poll(struct vmbus_xact *xact, size_t *resp_len)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+	const void *resp;
+
+	mtx_lock(&ctx->xc_lock);
+
+	KASSERT(ctx->xc_active == xact, ("xact mismatch"));
+	if (xact->x_resp == NULL &&
+	    (ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) == 0) {
+		mtx_unlock(&ctx->xc_lock);
+		*resp_len = 0;
+		return (NULL);
+	}
+	resp = vmbus_xact_return(xact, resp_len);
+
+	mtx_unlock(&ctx->xc_lock);
+
+	return (resp);
+}
+/*
+ * Copy a response of 'dlen' bytes at 'data' into the response buffer of
+ * the active 'xact' and mark it as received.  A response larger than
+ * xc_resp_size is truncated to that size, with a console message.
+ * Must be called with xc_lock held (mtx_assert).
+ */
+static void
+vmbus_xact_save_resp(struct vmbus_xact *xact, const void *data, size_t dlen)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+	size_t cplen = dlen;
+
+	mtx_assert(&ctx->xc_lock, MA_OWNED);
+
+	if (cplen > ctx->xc_resp_size) {
+		printf("vmbus: xact response truncated %zu -> %zu\n",
+		    cplen, ctx->xc_resp_size);
+		cplen = ctx->xc_resp_size;
+	}
+
+	KASSERT(ctx->xc_active == xact, ("xact mismatch"));
+	memcpy(xact->x_resp0, data, cplen);
+	xact->x_resp_len = cplen;
+	xact->x_resp = xact->x_resp0;	/* non-NULL x_resp == response received */
+}
+/*
+ * Deliver the response 'data'/'dlen' to 'xact' and wake up the thread
+ * sleeping on &ctx->xc_active.  If the context has no active xact the
+ * response is dropped with a console message.
+ */
+void
+vmbus_xact_wakeup(struct vmbus_xact *xact, const void *data, size_t dlen)
+{
+	struct vmbus_xact_ctx *ctx = xact->x_ctx;
+	int do_wakeup = 0;
+
+	mtx_lock(&ctx->xc_lock);
+	/*
+	 * NOTE:
+	 * xc_active could be NULL, if the ctx has been orphaned.
+	 */
+	if (ctx->xc_active != NULL) {
+		vmbus_xact_save_resp(xact, data, dlen);
+		do_wakeup = 1;
+	} else {
+		KASSERT(ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY,
+		    ("no active xact pending"));
+		printf("vmbus: drop xact response\n");
+	}
+	mtx_unlock(&ctx->xc_lock);
+
+	if (do_wakeup)
+		wakeup(&ctx->xc_active);
+}
+/*
+ * Same as vmbus_xact_wakeup(), except that the response is delivered to
+ * whichever xact is currently active in 'ctx'.
+ */
+void
+vmbus_xact_ctx_wakeup(struct vmbus_xact_ctx *ctx, const void *data, size_t dlen)
+{
+	int do_wakeup = 0;
+
+	mtx_lock(&ctx->xc_lock);
+	/*
+	 * NOTE:
+	 * xc_active could be NULL, if the ctx has been orphaned.
+	 */
+	if (ctx->xc_active != NULL) {
+		vmbus_xact_save_resp(ctx->xc_active, data, dlen);
+		do_wakeup = 1;
+	} else {
+		KASSERT(ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY,
+		    ("no active xact pending"));
+		printf("vmbus: drop xact response\n");
+	}
+	mtx_unlock(&ctx->xc_lock);
+
+	if (do_wakeup)
+		wakeup(&ctx->xc_active);
+} |