path: root/lib/libvmmapi/vmmapi.c
author     Neel Natu <neel@FreeBSD.org>  2015-06-18 06:00:17 +0000
committer  Neel Natu <neel@FreeBSD.org>  2015-06-18 06:00:17 +0000
commit     9b1aa8d622e84cde39a5c4e1542410956e38548c (patch)
tree       34972705799ce1ec23ee45b1e6c0c7631ad7ed98 /lib/libvmmapi/vmmapi.c
parent     e2dd1974587e2713e2a9774a5858b79803c3d061 (diff)
Restructure memory allocation in bhyve to support "devmem".
devmem is used to represent MMIO devices like the boot ROM or a VESA framebuffer where doing a trap-and-emulate for every access is impractical. devmem is a hybrid of system memory (sysmem) and emulated device models.

devmem is mapped in the guest address space via nested page tables similar to sysmem. However the address range where devmem is mapped may be changed by the guest at runtime (e.g. by reprogramming a PCI BAR). Also devmem is usually mapped RO or RW as compared to RWX mappings for sysmem.

Each devmem segment is named (e.g. "bootrom") and this name is used to create a device node for the devmem segment (e.g. /dev/vmm/testvm.bootrom).

The device node supports mmap(2) and this decouples the host mapping of devmem from its mapping in the guest address space (which can change).

Reviewed by:            tychon
Discussed with:         grehan
Differential Revision:  https://reviews.freebsd.org/D2762
MFC after:              4 weeks
Notes: svn path=/head/; revision=284539
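
For context, a minimal sketch of how a device model might consume the API introduced in this change, combining vm_create_devmem() and vm_mmap_memseg() from the diff below. The VM_BOOTROM segment identifier (from the companion vmmapi.h change, not shown here), the 1 MB ROM size and the placement just below 4GB are illustrative assumptions, not part of this commit:

#include <sys/param.h>
#include <sys/mman.h>

#include <machine/vmm.h>

#include <vmmapi.h>

#define	BOOTROM_SIZE	(1024 * 1024)		/* assumed 1 MB ROM */

static int
map_bootrom(struct vmctx *ctx)
{
	vm_paddr_t gpa;
	char *ptr;

	/*
	 * Create a named devmem segment. This also creates the device node
	 * /dev/vmm/<vmname>.bootrom and mmaps it into the host process.
	 */
	ptr = vm_create_devmem(ctx, VM_BOOTROM, "bootrom", BOOTROM_SIZE);
	if (ptr == MAP_FAILED)
		return (-1);

	/* ... the ROM image would be copied into 'ptr' here ... */

	/* Map the segment read-only just below 4GB in the guest. */
	gpa = (4UL * 1024 * 1024 * 1024) - BOOTROM_SIZE;
	return (vm_mmap_memseg(ctx, gpa, VM_BOOTROM, 0, BOOTROM_SIZE,
	    PROT_READ));
}

Because the segment is backed by its own device node, the host-side mapping obtained from vm_create_devmem() stays valid even if the guest later remaps the segment (e.g. by reprogramming a PCI BAR).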
Diffstat (limited to 'lib/libvmmapi/vmmapi.c')
-rw-r--r--  lib/libvmmapi/vmmapi.c  320
1 file changed, 262 insertions(+), 58 deletions(-)
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 1e6e627011b9..d5f387aec682 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -58,15 +58,23 @@ __FBSDID("$FreeBSD$");
#define MB (1024 * 1024UL)
#define GB (1024 * 1024 * 1024UL)
+/*
+ * Size of the guard region before and after the virtual address space
+ * mapping the guest physical memory. This must be a multiple of the
+ * superpage size for performance reasons.
+ */
+#define VM_MMAP_GUARD_SIZE (4 * MB)
+
+#define PROT_RW (PROT_READ | PROT_WRITE)
+#define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC)
+
struct vmctx {
int fd;
uint32_t lowmem_limit;
- enum vm_mmap_style vms;
int memflags;
size_t lowmem;
- char *lowmem_addr;
size_t highmem;
- char *highmem_addr;
+ char *baseaddr;
char *name;
};
@@ -157,22 +165,6 @@ vm_parse_memsize(const char *optarg, size_t *ret_memsize)
return (error);
}
-int
-vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
- int *wired)
-{
- int error;
- struct vm_memory_segment seg;
-
- bzero(&seg, sizeof(seg));
- seg.gpa = gpa;
- error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
- *ret_len = seg.len;
- if (wired != NULL)
- *wired = seg.wired;
- return (error);
-}
-
uint32_t
vm_get_lowmem_limit(struct vmctx *ctx)
{
@@ -194,39 +186,184 @@ vm_set_memflags(struct vmctx *ctx, int flags)
ctx->memflags = flags;
}
+int
+vm_get_memflags(struct vmctx *ctx)
+{
+
+ return (ctx->memflags);
+}
+
+/*
+ * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
+ */
+int
+vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
+ size_t len, int prot)
+{
+ struct vm_memmap memmap;
+ int error, flags;
+
+ memmap.gpa = gpa;
+ memmap.segid = segid;
+ memmap.segoff = off;
+ memmap.len = len;
+ memmap.prot = prot;
+ memmap.flags = 0;
+
+ if (ctx->memflags & VM_MEM_F_WIRED)
+ memmap.flags |= VM_MEMMAP_F_WIRED;
+
+ /*
+ * If this mapping already exists then don't create it again. This
+ * is the common case for SYSMEM mappings created by bhyveload(8).
+ */
+ error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
+ if (error == 0 && gpa == memmap.gpa) {
+ if (segid != memmap.segid || off != memmap.segoff ||
+ prot != memmap.prot || flags != memmap.flags) {
+ errno = EEXIST;
+ return (-1);
+ } else {
+ return (0);
+ }
+ }
+
+ error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
+ return (error);
+}
+
+int
+vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
+ vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
+{
+ struct vm_memmap memmap;
+ int error;
+
+ bzero(&memmap, sizeof(struct vm_memmap));
+ memmap.gpa = *gpa;
+ error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
+ if (error == 0) {
+ *gpa = memmap.gpa;
+ *segid = memmap.segid;
+ *segoff = memmap.segoff;
+ *len = memmap.len;
+ *prot = memmap.prot;
+ *flags = memmap.flags;
+ }
+ return (error);
+}
+
+/*
+ * Return 0 if the segments are identical and non-zero otherwise.
+ *
+ * This is slightly complicated by the fact that only device memory segments
+ * are named.
+ */
+static int
+cmpseg(size_t len, const char *str, size_t len2, const char *str2)
+{
+
+ if (len == len2) {
+ if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
+ return (0);
+ }
+ return (-1);
+}
+
static int
-setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **addr)
+vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
- int error, mmap_flags;
- struct vm_memory_segment seg;
+ struct vm_memseg memseg;
+ size_t n;
+ int error;
/*
- * Create and optionally map 'len' bytes of memory at guest
- * physical address 'gpa'
+ * If the memory segment has already been created then just return.
+ * This is the usual case for the SYSMEM segment created by userspace
+ * loaders like bhyveload(8).
*/
- bzero(&seg, sizeof(seg));
- seg.gpa = gpa;
- seg.len = len;
- error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg);
- if (error == 0 && addr != NULL) {
- mmap_flags = MAP_SHARED;
- if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
- mmap_flags |= MAP_NOCORE;
- *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, mmap_flags,
- ctx->fd, gpa);
+ error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
+ sizeof(memseg.name));
+ if (error)
+ return (error);
+
+ if (memseg.len != 0) {
+ if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
+ errno = EINVAL;
+ return (-1);
+ } else {
+ return (0);
+ }
+ }
+
+ bzero(&memseg, sizeof(struct vm_memseg));
+ memseg.segid = segid;
+ memseg.len = len;
+ if (name != NULL) {
+ n = strlcpy(memseg.name, name, sizeof(memseg.name));
+ if (n >= sizeof(memseg.name)) {
+ errno = ENAMETOOLONG;
+ return (-1);
+ }
}
+
+ error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
return (error);
}
int
-vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
+vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
+ size_t bufsize)
{
- char **addr;
+ struct vm_memseg memseg;
+ size_t n;
int error;
- /* XXX VM_MMAP_SPARSE not implemented yet */
- assert(vms == VM_MMAP_NONE || vms == VM_MMAP_ALL);
- ctx->vms = vms;
+ memseg.segid = segid;
+ error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
+ if (error == 0) {
+ *lenp = memseg.len;
+ n = strlcpy(namebuf, memseg.name, bufsize);
+ if (n >= bufsize) {
+ errno = ENAMETOOLONG;
+ error = -1;
+ }
+ }
+ return (error);
+}
+
+static int
+setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
+{
+ char *ptr;
+ int error, flags;
+
+ /* Map 'len' bytes starting at 'gpa' in the guest address space */
+ error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
+ if (error)
+ return (error);
+
+ flags = MAP_SHARED | MAP_FIXED;
+ if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
+ flags |= MAP_NOCORE;
+
+ /* mmap into the process address space on the host */
+ ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
+ if (ptr == MAP_FAILED)
+ return (-1);
+
+ return (0);
+}
+
+int
+vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
+{
+ size_t objsize, len;
+ vm_paddr_t gpa;
+ char *baseaddr, *ptr;
+ int error, flags;
+
+ assert(vms == VM_MMAP_ALL);
/*
* If 'memsize' cannot fit entirely in the 'lowmem' segment then
@@ -234,46 +371,63 @@ vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
*/
if (memsize > ctx->lowmem_limit) {
ctx->lowmem = ctx->lowmem_limit;
- ctx->highmem = memsize - ctx->lowmem;
+ ctx->highmem = memsize - ctx->lowmem_limit;
+ objsize = 4*GB + ctx->highmem;
} else {
ctx->lowmem = memsize;
ctx->highmem = 0;
+ objsize = ctx->lowmem;
}
- if (ctx->lowmem > 0) {
- addr = (vms == VM_MMAP_ALL) ? &ctx->lowmem_addr : NULL;
- error = setup_memory_segment(ctx, 0, ctx->lowmem, addr);
+ error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
+ if (error)
+ return (error);
+
+ /*
+ * Stake out a contiguous region covering the guest physical memory
+ * and the adjoining guard regions.
+ */
+ len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
+ flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER;
+ ptr = mmap(NULL, len, PROT_NONE, flags, -1, 0);
+ if (ptr == MAP_FAILED)
+ return (-1);
+
+ baseaddr = ptr + VM_MMAP_GUARD_SIZE;
+ if (ctx->highmem > 0) {
+ gpa = 4*GB;
+ len = ctx->highmem;
+ error = setup_memory_segment(ctx, gpa, len, baseaddr);
if (error)
return (error);
}
- if (ctx->highmem > 0) {
- addr = (vms == VM_MMAP_ALL) ? &ctx->highmem_addr : NULL;
- error = setup_memory_segment(ctx, 4*GB, ctx->highmem, addr);
+ if (ctx->lowmem > 0) {
+ gpa = 0;
+ len = ctx->lowmem;
+ error = setup_memory_segment(ctx, gpa, len, baseaddr);
if (error)
return (error);
}
+ ctx->baseaddr = baseaddr;
+
return (0);
}
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
+ vm_paddr_t start, end, mapend;
- /* XXX VM_MMAP_SPARSE not implemented yet */
- assert(ctx->vms == VM_MMAP_ALL);
-
- if (gaddr < ctx->lowmem && gaddr + len <= ctx->lowmem)
- return ((void *)(ctx->lowmem_addr + gaddr));
-
- if (gaddr >= 4*GB) {
- gaddr -= 4*GB;
- if (gaddr < ctx->highmem && gaddr + len <= ctx->highmem)
- return ((void *)(ctx->highmem_addr + gaddr));
- }
+ start = gaddr;
+ end = gaddr + len;
+ mapend = ctx->highmem ? 4*GB + ctx->highmem : ctx->lowmem;
- return (NULL);
+ if (start <= end && end <= mapend)
+ return (ctx->baseaddr + start);
+ else
+ return (NULL);
}
size_t
@@ -290,6 +444,56 @@ vm_get_highmem_size(struct vmctx *ctx)
return (ctx->highmem);
}
+void *
+vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
+{
+ char pathname[MAXPATHLEN];
+ size_t len2;
+ char *base, *ptr;
+ int fd, error, flags;
+
+ fd = -1;
+ ptr = MAP_FAILED;
+ if (name == NULL || strlen(name) == 0) {
+ errno = EINVAL;
+ goto done;
+ }
+
+ error = vm_alloc_memseg(ctx, segid, len, name);
+ if (error)
+ goto done;
+
+ strlcpy(pathname, "/dev/vmm/", sizeof(pathname));
+ strlcat(pathname, ctx->name, sizeof(pathname));
+ strlcat(pathname, ".", sizeof(pathname));
+ strlcat(pathname, name, sizeof(pathname));
+
+ fd = open(pathname, O_RDWR);
+ if (fd < 0)
+ goto done;
+
+ /*
+ * Stake out a contiguous region covering the device memory and the
+ * adjoining guard regions.
+ */
+ len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
+ flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER;
+ base = mmap(NULL, len2, PROT_NONE, flags, -1, 0);
+ if (base == MAP_FAILED)
+ goto done;
+
+ flags = MAP_SHARED | MAP_FIXED;
+ if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
+ flags |= MAP_NOCORE;
+
+ /* mmap the devmem region in the host address space */
+ ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
+done:
+ if (fd >= 0)
+ close(fd);
+ return (ptr);
+}
+
int
vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t base, uint32_t limit, uint32_t access)