aboutsummaryrefslogtreecommitdiff
path: root/sys/amd64
diff options
context:
space:
mode:
authorPeter Wemm <peter@FreeBSD.org>2006-04-03 21:36:01 +0000
committerPeter Wemm <peter@FreeBSD.org>2006-04-03 21:36:01 +0000
commit68ac481184eec04ebb8e62f1ad1480bf6ea65253 (patch)
tree772d161a3a3e1ff398ff055a3926a44709cf5887 /sys/amd64
parent7035694536505b4d04855a2960d8a1dfc6dcd8de (diff)
downloadsrc-68ac481184eec04ebb8e62f1ad1480bf6ea65253.tar.gz
src-68ac481184eec04ebb8e62f1ad1480bf6ea65253.zip
Shrink the amd64 pv entry from 48 bytes to about 24 bytes. On a machine
with large mmap files mapped into many processes, this saves hundreds of megabytes of ram. pv entries were individually allocated and had two tailq entries and two pointers (or addresses). Each pv entry was linked to a vm_page_t and a process's address space (pmap). It had the virtual address and a pointer to the pmap. This change replaces the individual allocation with a per-process allocation system. A page ("pv chunk") is allocated and this provides 168 pv entries for that process. We can now eliminate one of the 16 byte tailq entries because we can simply iterate through the pv chunks to find all the pv entries for a process. We can eliminate one of the 8 byte pointers because the location of the pv entry implies the containing pv chunk, which has the pointer. After overheads from the pv chunk bitmap and tailq linkage, this works out that each pv entry has an effective size of 24.38 bytes. Future work still required, and other problems: * when running low on pv entries or system ram, we may need to defrag the chunk pages and free any spares. The stats (vm.pmap.*) show that this doesn't seem to be that much of a problem, but it can be done if needed. * running low on pv entries is now a much bigger problem. The old get_pv_entry() routine just needed to reclaim one other pv entry. Now, since they are per-process, we can only use pv entries that are assigned to our current process, or by stealing an entire page worth from another process. Under normal circumstances, the pmap_collect() code should be able to dislodge some pv entries from the current process. But if needed, it can still reclaim entire pv chunk pages from other processes. * This should port to i386 really easily, except there it would reduce pv entries from 24 bytes to about 12 bytes. (I have integrated Alan's recent changes.)
Notes
Notes: svn path=/head/; revision=157446
Diffstat (limited to 'sys/amd64')
-rw-r--r--sys/amd64/amd64/pmap.c441
-rw-r--r--sys/amd64/include/pmap.h19
2 files changed, 309 insertions, 151 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 7f403beb718a..009898a39801 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -158,6 +158,13 @@ __FBSDID("$FreeBSD$");
#define PMAP_INLINE
#endif
+#define PV_STATS
+#ifdef PV_STATS
+#define PV_STAT(x) do { x ; } while (0)
+#else
+#define PV_STAT(x) do { } while (0)
+#endif
+
struct pmap kernel_pmap_store;
vm_paddr_t avail_start; /* PA of first available physical page */
@@ -182,7 +189,6 @@ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
/*
* Data for the pv entry allocation mechanism
*/
-static uma_zone_t pvzone;
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static int shpgperproc = PMAP_SHPGPERPROC;
@@ -198,8 +204,8 @@ struct msgbuf *msgbufp = 0;
*/
static caddr_t crashdumpmap;
-static PMAP_INLINE void free_pv_entry(pv_entry_t pv);
-static pv_entry_t get_pv_entry(pmap_t locked_pmap);
+static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
+static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
static void pmap_clear_ptes(vm_page_t m, long bit);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
@@ -511,7 +517,7 @@ pmap_bootstrap(firstaddr)
PMAP_LOCK_INIT(kernel_pmap);
kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys);
kernel_pmap->pm_active = -1; /* don't allow deactivation */
- TAILQ_INIT(&kernel_pmap->pm_pvlist);
+ TAILQ_INIT(&kernel_pmap->pm_pvchunk);
nkpt = NKPT;
/*
@@ -571,8 +577,6 @@ pmap_init(void)
* high water mark so that the system can recover from excessive
* numbers of pv entries.
*/
- pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL,
- NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM);
TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
@@ -1065,7 +1069,7 @@ pmap_pinit0(pmap)
PMAP_LOCK_INIT(pmap);
pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys);
pmap->pm_active = 0;
- TAILQ_INIT(&pmap->pm_pvlist);
+ TAILQ_INIT(&pmap->pm_pvchunk);
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
}
@@ -1102,7 +1106,7 @@ pmap_pinit(pmap)
pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
pmap->pm_active = 0;
- TAILQ_INIT(&pmap->pm_pvlist);
+ TAILQ_INIT(&pmap->pm_pvchunk);
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
}
@@ -1439,61 +1443,78 @@ pmap_growkernel(vm_offset_t addr)
* page management routines.
***************************************************/
-/*
- * free the pv_entry back to the free list
- */
-static PMAP_INLINE void
-free_pv_entry(pv_entry_t pv)
+CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
+CTASSERT(_NPCM == 3);
+CTASSERT(_NPCPV == 168);
+
+static __inline struct pv_chunk *
+pv_to_chunk(pv_entry_t pv)
{
- pv_entry_count--;
- uma_zfree(pvzone, pv);
+
+ return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
}
+#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
+
+#define PC_FREE0 0xfffffffffffffffful
+#define PC_FREE1 0xfffffffffffffffful
+#define PC_FREE2 0x000000fffffffffful
+
+static uint64_t pc_freemask[3] = { PC_FREE0, PC_FREE1, PC_FREE2 };
+
+#ifdef PV_STATS
+static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
+ "Current number of pv entry chunks");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
+ "Current number of pv entry chunks allocated");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
+ "Current number of pv entry chunks frees");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
+ "Number of times tried to get a chunk page but failed.");
+
+static int pv_entry_frees, pv_entry_allocs, pv_entry_spare;
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
+ "Current number of pv entries");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
+ "Current number of pv entry frees");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
+ "Current number of pv entry allocs");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
+ "Current number of spare pv entries");
+
+static int pmap_collect_inactive, pmap_collect_active;
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
+ "Current number times pmap_collect called on inactive queue");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
+ "Current number times pmap_collect called on active queue");
+#endif
+
/*
- * get a new pv_entry, allocating a block from the system
- * when needed.
+ * We are in a serious low memory condition. Resort to
+ * drastic measures to free some pages so we can allocate
+ * another pv entry chunk. This is normally called to
+ * unmap inactive pages, and if necessary, active pages.
*/
-static pv_entry_t
-get_pv_entry(pmap_t locked_pmap)
+static void
+pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
{
- static const struct timeval printinterval = { 60, 0 };
- static struct timeval lastprint;
- struct vpgqueues *vpq;
pd_entry_t ptepde;
pmap_t pmap;
pt_entry_t *pte, tpte;
- pv_entry_t allocated_pv, next_pv, pv;
+ pv_entry_t next_pv, pv;
vm_offset_t va;
vm_page_t m;
- PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
- mtx_assert(&vm_page_queue_mtx, MA_OWNED);
- allocated_pv = uma_zalloc(pvzone, M_NOWAIT);
- if (allocated_pv != NULL) {
- pv_entry_count++;
- if (pv_entry_count > pv_entry_high_water)
- pagedaemon_wakeup();
- else
- return (allocated_pv);
- }
-
- /*
- * Reclaim pv entries: At first, destroy mappings to inactive
- * pages. After that, if a pv entry is still needed, destroy
- * mappings to active pages.
- */
- if (ratecheck(&lastprint, &printinterval))
- printf("Approaching the limit on PV entries, consider "
- "increasing sysctl vm.pmap.shpgperproc or "
- "vm.pmap.pv_entry_max\n");
- vpq = &vm_page_queues[PQ_INACTIVE];
-retry:
TAILQ_FOREACH(m, &vpq->pl, pageq) {
if (m->hold_count || m->busy || (m->flags & PG_BUSY))
continue;
TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
va = pv->pv_va;
- pmap = pv->pv_pmap;
+ pmap = PV_PMAP(pv);
/* Avoid deadlock and lock recursion. */
if (pmap > locked_pmap)
PMAP_LOCK(pmap);
@@ -1503,18 +1524,17 @@ retry:
pte = pmap_pte_pde(pmap, va, &ptepde);
tpte = pte_load_clear(pte);
KASSERT((tpte & PG_W) == 0,
- ("get_pv_entry: wired pte %#lx", tpte));
+ ("pmap_collect: wired pte %#lx", tpte));
if (tpte & PG_A)
vm_page_flag_set(m, PG_REFERENCED);
if (tpte & PG_M) {
KASSERT((tpte & PG_RW),
- ("get_pv_entry: modified page not writable: va: %#lx, pte: %#lx",
+ ("pmap_collect: modified page not writable: va: %#lx, pte: %#lx",
va, tpte));
if (pmap_track_modified(va))
vm_page_dirty(m);
}
pmap_invalidate_page(pmap, va);
- TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
if (TAILQ_EMPTY(&m->md.pv_list))
vm_page_flag_clear(m, PG_WRITEABLE);
@@ -1522,47 +1542,149 @@ retry:
pmap_unuse_pt(pmap, va, ptepde);
if (pmap != locked_pmap)
PMAP_UNLOCK(pmap);
- if (allocated_pv == NULL)
- allocated_pv = pv;
- else
- free_pv_entry(pv);
- }
- }
- if (allocated_pv == NULL) {
- if (vpq == &vm_page_queues[PQ_INACTIVE]) {
- vpq = &vm_page_queues[PQ_ACTIVE];
- goto retry;
+ free_pv_entry(locked_pmap, pv);
}
- panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable");
}
- return (allocated_pv);
}
+
+/*
+ * free the pv_entry back to the free list
+ */
static void
-pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
+free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
+ vm_page_t m;
+ struct pv_chunk *pc;
+ int idx, field, bit;
+
+ PV_STAT(pv_entry_frees++);
+ PV_STAT(pv_entry_spare++);
+ PV_STAT(pv_entry_count--);
+ pc = pv_to_chunk(pv);
+ idx = pv - &pc->pc_pventry[0];
+ field = idx / 64;
+ bit = idx % 64;
+ pc->pc_map[field] |= 1ul << bit;
+ /* move to head of list */
+ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+ TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+ if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
+ pc->pc_map[2] != PC_FREE2)
+ return;
+ PV_STAT(pv_entry_spare -= _NPCPV);
+ PV_STAT(pc_chunk_count--);
+ PV_STAT(pc_chunk_frees++);
+ /* entire chunk is free, return it */
+ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
+ vm_page_lock_queues();
+ vm_page_free(m);
+ vm_page_unlock_queues();
+}
+
+/*
+ * get a new pv_entry, allocating a block from the system
+ * when needed.
+ */
+static pv_entry_t
+get_pv_entry(pmap_t pmap, int try)
+{
+ static const struct timeval printinterval = { 60, 0 };
+ static struct timeval lastprint;
+ static vm_pindex_t colour;
+ int bit, field;
pv_entry_t pv;
+ struct pv_chunk *pc;
+ vm_page_t m;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
- if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
- TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
- if (pmap == pv->pv_pmap && va == pv->pv_va)
+ PV_STAT(pv_entry_allocs++);
+ PV_STAT(pv_entry_count++);
+ if (pv_entry_count > pv_entry_high_water)
+ pagedaemon_wakeup();
+ pc = TAILQ_FIRST(&pmap->pm_pvchunk);
+ if (pc != NULL) {
+ for (field = 0; field < _NPCM; field++) {
+ if (pc->pc_map[field]) {
+ bit = bsfq(pc->pc_map[field]);
break;
+ }
}
- } else {
- TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
- if (va == pv->pv_va)
- break;
+ if (field < _NPCM) {
+ pv = &pc->pc_pventry[field * 64 + bit];
+ pc->pc_map[field] &= ~(1ul << bit);
+ /* If this was the last item, move it to tail */
+ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
+ pc->pc_map[2] == 0) {
+ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+ TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+ }
+ PV_STAT(pv_entry_spare--);
+ return (pv);
+ }
+ }
+ /* No free items, allocate another chunk */
+ m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ);
+ if (m == NULL) {
+ if (try) {
+ PV_STAT(pc_chunk_tryfail++);
+ return (NULL);
+ }
+ /*
+ * Reclaim pv entries: At first, destroy mappings to inactive
+ * pages. After that, if a pv chunk entry is still needed,
+ * destroy mappings to active pages.
+ */
+ if (ratecheck(&lastprint, &printinterval))
+ printf("Approaching the limit on PV entries, consider "
+ "increasing sysctl vm.pmap.shpgperproc or "
+ "vm.pmap.pv_entry_max\n");
+ PV_STAT(pmap_collect_inactive++);
+ pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]);
+ m = vm_page_alloc(NULL, colour,
+ VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ);
+ if (m == NULL) {
+ PV_STAT(pmap_collect_active++);
+ pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]);
+ m = vm_page_alloc(NULL, colour,
+ VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ);
+ if (m == NULL)
+ panic("get_pv_entry: increase vm.pmap.shpgperproc");
}
}
+ PV_STAT(pc_chunk_count++);
+ PV_STAT(pc_chunk_allocs++);
+ colour++;
+ pc = (void *)PHYS_TO_DMAP(m->phys_addr);
+ pc->pc_pmap = pmap;
+ pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */
+ pc->pc_map[1] = PC_FREE1;
+ pc->pc_map[2] = PC_FREE2;
+ pv = &pc->pc_pventry[0];
+ TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+ PV_STAT(pv_entry_spare += _NPCPV - 1);
+ return (pv);
+}
+
+static void
+pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
+{
+ pv_entry_t pv;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ if (pmap == PV_PMAP(pv) && va == pv->pv_va)
+ break;
+ }
KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
m->md.pv_list_count--;
if (TAILQ_EMPTY(&m->md.pv_list))
vm_page_flag_clear(m, PG_WRITEABLE);
- TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
- free_pv_entry(pv);
+ free_pv_entry(pmap, pv);
}
/*
@@ -1574,13 +1696,10 @@ pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
pv_entry_t pv;
- pv = get_pv_entry(pmap);
- pv->pv_va = va;
- pv->pv_pmap = pmap;
-
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
- TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
+ pv = get_pv_entry(pmap, FALSE);
+ pv->pv_va = va;
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
m->md.pv_list_count++;
}
@@ -1596,11 +1715,8 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
if (pv_entry_count < pv_entry_high_water &&
- (pv = uma_zalloc(pvzone, M_NOWAIT)) != NULL) {
- pv_entry_count++;
+ (pv = get_pv_entry(pmap, TRUE)) != NULL) {
pv->pv_va = va;
- pv->pv_pmap = pmap;
- TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
m->md.pv_list_count++;
return (TRUE);
@@ -1791,6 +1907,7 @@ void
pmap_remove_all(vm_page_t m)
{
register pv_entry_t pv;
+ pmap_t pmap;
pt_entry_t *pte, tpte;
pd_entry_t ptepde;
@@ -1805,12 +1922,13 @@ pmap_remove_all(vm_page_t m)
#endif
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
- PMAP_LOCK(pv->pv_pmap);
- pv->pv_pmap->pm_stats.resident_count--;
- pte = pmap_pte_pde(pv->pv_pmap, pv->pv_va, &ptepde);
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ pmap->pm_stats.resident_count--;
+ pte = pmap_pte_pde(pmap, pv->pv_va, &ptepde);
tpte = pte_load_clear(pte);
if (tpte & PG_W)
- pv->pv_pmap->pm_stats.wired_count--;
+ pmap->pm_stats.wired_count--;
if (tpte & PG_A)
vm_page_flag_set(m, PG_REFERENCED);
@@ -1824,13 +1942,12 @@ pmap_remove_all(vm_page_t m)
if (pmap_track_modified(pv->pv_va))
vm_page_dirty(m);
}
- pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
- TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
+ pmap_invalidate_page(pmap, pv->pv_va);
TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
m->md.pv_list_count--;
- pmap_unuse_pt(pv->pv_pmap, pv->pv_va, ptepde);
- PMAP_UNLOCK(pv->pv_pmap);
- free_pv_entry(pv);
+ pmap_unuse_pt(pmap, pv->pv_va, ptepde);
+ PMAP_UNLOCK(pmap);
+ free_pv_entry(pmap, pv);
}
vm_page_flag_clear(m, PG_WRITEABLE);
}
@@ -2584,7 +2701,7 @@ pmap_page_exists_quick(pmap, m)
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
- if (pv->pv_pmap == pmap) {
+ if (PV_PMAP(pv) == pmap) {
return TRUE;
}
loops++;
@@ -2594,7 +2711,6 @@ pmap_page_exists_quick(pmap, m)
return (FALSE);
}
-#define PMAP_REMOVE_PAGES_CURPROC_ONLY
/*
* Remove all pages from specified address space
* this aids process exit speeds. Also, this code
@@ -2608,68 +2724,91 @@ pmap_remove_pages(pmap_t pmap)
{
pt_entry_t *pte, tpte;
vm_page_t m;
- pv_entry_t pv, npv;
+ pv_entry_t pv;
+ struct pv_chunk *pc, *npc;
+ int field, idx;
+ int64_t bit;
+ uint64_t inuse, bitmask;
+ int allfree;
-#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
printf("warning: pmap_remove_pages called with non-current pmap\n");
return;
}
-#endif
vm_page_lock_queues();
PMAP_LOCK(pmap);
- for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
-
-#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
- pte = vtopte(pv->pv_va);
-#else
- pte = pmap_pte(pmap, pv->pv_va);
-#endif
- tpte = *pte;
-
- if (tpte == 0) {
- printf("TPTE at %p IS ZERO @ VA %08lx\n",
- pte, pv->pv_va);
- panic("bad pte");
- }
+ TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
+ allfree = 1;
+ for (field = 0; field < _NPCM; field++) {
+ inuse = (~(pc->pc_map[field])) & pc_freemask[field];
+ while (inuse != 0) {
+ bit = bsfq(inuse);
+ bitmask = 1UL << bit;
+ idx = field * 64 + bit;
+ pv = &pc->pc_pventry[idx];
+ inuse &= ~bitmask;
+
+ pte = vtopte(pv->pv_va);
+ tpte = *pte;
+
+ if (tpte == 0) {
+ printf(
+ "TPTE at %p IS ZERO @ VA %08lx\n",
+ pte, pv->pv_va);
+ panic("bad pte");
+ }
/*
* We cannot remove wired pages from a process' mapping at this time
*/
- if (tpte & PG_W) {
- npv = TAILQ_NEXT(pv, pv_plist);
- continue;
- }
-
- m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
- KASSERT(m->phys_addr == (tpte & PG_FRAME),
- ("vm_page_t %p phys_addr mismatch %016jx %016jx",
- m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
-
- KASSERT(m < &vm_page_array[vm_page_array_size],
- ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
+ if (tpte & PG_W) {
+ allfree = 0;
+ continue;
+ }
- pmap->pm_stats.resident_count--;
+ m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
+ KASSERT(m->phys_addr == (tpte & PG_FRAME),
+ ("vm_page_t %p phys_addr mismatch %016jx %016jx",
+ m, (uintmax_t)m->phys_addr,
+ (uintmax_t)tpte));
- pte_clear(pte);
+ KASSERT(m < &vm_page_array[vm_page_array_size],
+ ("pmap_remove_pages: bad tpte %#jx",
+ (uintmax_t)tpte));
- /*
- * Update the vm_page_t clean and reference bits.
- */
- if (tpte & PG_M) {
- vm_page_dirty(m);
- }
+ pmap->pm_stats.resident_count--;
- npv = TAILQ_NEXT(pv, pv_plist);
- TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
+ pte_clear(pte);
- m->md.pv_list_count--;
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
- if (TAILQ_EMPTY(&m->md.pv_list))
- vm_page_flag_clear(m, PG_WRITEABLE);
+ /*
+ * Update the vm_page_t clean/reference bits.
+ */
+ if (tpte & PG_M)
+ vm_page_dirty(m);
- pmap_unuse_pt(pmap, pv->pv_va, *vtopde(pv->pv_va));
- free_pv_entry(pv);
+ /* Mark free */
+ PV_STAT(pv_entry_frees++);
+ PV_STAT(pv_entry_spare++);
+ PV_STAT(pv_entry_count--);
+ pc->pc_map[field] |= bitmask;
+ m->md.pv_list_count--;
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ if (TAILQ_EMPTY(&m->md.pv_list))
+ vm_page_flag_clear(m, PG_WRITEABLE);
+ pmap_unuse_pt(pmap, pv->pv_va,
+ *vtopde(pv->pv_va));
+ }
+ }
+ if (allfree) {
+ PV_STAT(pv_entry_spare -= _NPCPV);
+ PV_STAT(pc_chunk_count--);
+ PV_STAT(pc_chunk_frees++);
+ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
+ vm_page_lock_queues();
+ vm_page_free(m);
+ vm_page_unlock_queues();
+ }
}
pmap_invalidate_all(pmap);
PMAP_UNLOCK(pmap);
@@ -2687,6 +2826,7 @@ pmap_is_modified(vm_page_t m)
{
pv_entry_t pv;
pt_entry_t *pte;
+ pmap_t pmap;
boolean_t rv;
rv = FALSE;
@@ -2702,10 +2842,11 @@ pmap_is_modified(vm_page_t m)
*/
if (!pmap_track_modified(pv->pv_va))
continue;
- PMAP_LOCK(pv->pv_pmap);
- pte = pmap_pte(pv->pv_pmap, pv->pv_va);
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ pte = pmap_pte(pmap, pv->pv_va);
rv = (*pte & PG_M) != 0;
- PMAP_UNLOCK(pv->pv_pmap);
+ PMAP_UNLOCK(pmap);
if (rv)
break;
}
@@ -2743,6 +2884,7 @@ static __inline void
pmap_clear_ptes(vm_page_t m, long bit)
{
register pv_entry_t pv;
+ pmap_t pmap;
pt_entry_t pbits, *pte;
if ((m->flags & PG_FICTITIOUS) ||
@@ -2763,8 +2905,9 @@ pmap_clear_ptes(vm_page_t m, long bit)
continue;
}
- PMAP_LOCK(pv->pv_pmap);
- pte = pmap_pte(pv->pv_pmap, pv->pv_va);
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ pte = pmap_pte(pmap, pv->pv_va);
retry:
pbits = *pte;
if (pbits & bit) {
@@ -2778,9 +2921,9 @@ retry:
} else {
atomic_clear_long(pte, bit);
}
- pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
+ pmap_invalidate_page(pmap, pv->pv_va);
}
- PMAP_UNLOCK(pv->pv_pmap);
+ PMAP_UNLOCK(pmap);
}
if (bit == PG_RW)
vm_page_flag_clear(m, PG_WRITEABLE);
@@ -2819,6 +2962,7 @@ int
pmap_ts_referenced(vm_page_t m)
{
register pv_entry_t pv, pvf, pvn;
+ pmap_t pmap;
pt_entry_t *pte;
pt_entry_t v;
int rtval = 0;
@@ -2841,20 +2985,21 @@ pmap_ts_referenced(vm_page_t m)
if (!pmap_track_modified(pv->pv_va))
continue;
- PMAP_LOCK(pv->pv_pmap);
- pte = pmap_pte(pv->pv_pmap, pv->pv_va);
+ pmap = PV_PMAP(pv);
+ PMAP_LOCK(pmap);
+ pte = pmap_pte(pmap, pv->pv_va);
if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
atomic_clear_long(pte, PG_A);
- pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
+ pmap_invalidate_page(pmap, pv->pv_va);
rtval++;
if (rtval > 4) {
- PMAP_UNLOCK(pv->pv_pmap);
+ PMAP_UNLOCK(pmap);
break;
}
}
- PMAP_UNLOCK(pv->pv_pmap);
+ PMAP_UNLOCK(pmap);
} while ((pv = pvn) != NULL && pv != pvf);
}
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 4af3b36d5d1f..49e3139ccf40 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -222,6 +222,7 @@ extern pt_entry_t pg_nx;
* Pmap stuff
*/
struct pv_entry;
+struct pv_chunk;
struct md_page {
int pv_list_count;
@@ -231,7 +232,7 @@ struct md_page {
struct pmap {
struct mtx pm_mtx;
pml4_entry_t *pm_pml4; /* KVA of level 4 page table */
- TAILQ_HEAD(,pv_entry) pm_pvlist; /* list of mappings in pmap */
+ TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */
u_int pm_active; /* active on cpus */
/* spare u_int here due to padding */
struct pmap_statistics pm_stats; /* pmap statistics */
@@ -260,12 +261,24 @@ extern struct pmap kernel_pmap_store;
* mappings of that page. An entry is a pv_entry_t, the list is pv_table.
*/
typedef struct pv_entry {
- pmap_t pv_pmap; /* pmap where mapping lies */
vm_offset_t pv_va; /* virtual address for mapping */
TAILQ_ENTRY(pv_entry) pv_list;
- TAILQ_ENTRY(pv_entry) pv_plist;
} *pv_entry_t;
+/*
+ * pv_entries are allocated in chunks per-process. This avoids the
+ * need to track per-pmap assignments.
+ */
+#define _NPCM 3
+#define _NPCPV 168
+struct pv_chunk {
+ pmap_t pc_pmap;
+ TAILQ_ENTRY(pv_chunk) pc_list;
+ uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */
+ uint64_t pc_spare[2];
+ struct pv_entry pc_pventry[_NPCPV];
+};
+
#ifdef _KERNEL
#define NPPROVMTRR 8