diff options
author | Peter Wemm <peter@FreeBSD.org> | 2006-04-03 21:36:01 +0000 |
---|---|---|
committer | Peter Wemm <peter@FreeBSD.org> | 2006-04-03 21:36:01 +0000 |
commit | 68ac481184eec04ebb8e62f1ad1480bf6ea65253 (patch) | |
tree | 772d161a3a3e1ff398ff055a3926a44709cf5887 /sys/amd64 | |
parent | 7035694536505b4d04855a2960d8a1dfc6dcd8de (diff) | |
download | src-68ac481184eec04ebb8e62f1ad1480bf6ea65253.tar.gz src-68ac481184eec04ebb8e62f1ad1480bf6ea65253.zip |
Shrink the amd64 pv entry from 48 bytes to about 24 bytes. On a machine
with large mmap'ed files mapped into many processes, this saves hundreds of
megabytes of RAM.
pv entries were individually allocated and had two tailq entries and two
pointers (or addresses). Each pv entry was linked to a vm_page_t and
a process's address space (pmap). It had the virtual address and a
pointer to the pmap.
This change replaces the individual allocation with a per-process
allocation system. A page ("pv chunk") is allocated and this provides
168 pv entries for that process. We can now eliminate one of the 16 byte
tailq entries because we can simply iterate through the pv chunks to find
all the pv entries for a process. We can eliminate one of the 8 byte
pointers because the location of the pv entry implies the containing
pv chunk, which has the pointer. After accounting for the overhead of the
pv chunk bitmap and tailq linkage, each pv entry has an effective size of
24.38 bytes (one 4096-byte page shared among 168 pv entries).
Future work still required, and other problems:
* when running low on pv entries or system RAM, we may need to defragment
the chunk pages and free any spares. The stats (vm.pmap.*) show that
this doesn't seem to be that much of a problem, but it can be done if
needed.
* running low on pv entries is now a much bigger problem. The old
get_pv_entry() routine just needed to reclaim one other pv entry.
Now, since they are per-process, we can only use pv entries that are
assigned to our current process, or steal an entire page's worth
from another process. Under normal circumstances, the pmap_collect()
code should be able to dislodge some pv entries from the current
process. But if needed, it can still reclaim entire pv chunk pages
from other processes.
* This should port to i386 really easily; there it would reduce
pv entries from 24 bytes to about 12 bytes.
(I have integrated Alan's recent changes.)
Notes:
svn path=/head/; revision=157446
Diffstat (limited to 'sys/amd64')
-rw-r--r-- | sys/amd64/amd64/pmap.c | 441 | ||||
-rw-r--r-- | sys/amd64/include/pmap.h | 19 |
2 files changed, 309 insertions, 151 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 7f403beb718a..009898a39801 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -158,6 +158,13 @@ __FBSDID("$FreeBSD$"); #define PMAP_INLINE #endif +#define PV_STATS +#ifdef PV_STATS +#define PV_STAT(x) do { x ; } while (0) +#else +#define PV_STAT(x) do { } while (0) +#endif + struct pmap kernel_pmap_store; vm_paddr_t avail_start; /* PA of first available physical page */ @@ -182,7 +189,6 @@ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ /* * Data for the pv entry allocation mechanism */ -static uma_zone_t pvzone; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static int shpgperproc = PMAP_SHPGPERPROC; @@ -198,8 +204,8 @@ struct msgbuf *msgbufp = 0; */ static caddr_t crashdumpmap; -static PMAP_INLINE void free_pv_entry(pv_entry_t pv); -static pv_entry_t get_pv_entry(pmap_t locked_pmap); +static void free_pv_entry(pmap_t pmap, pv_entry_t pv); +static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); static void pmap_clear_ptes(vm_page_t m, long bit); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, @@ -511,7 +517,7 @@ pmap_bootstrap(firstaddr) PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys); kernel_pmap->pm_active = -1; /* don't allow deactivation */ - TAILQ_INIT(&kernel_pmap->pm_pvlist); + TAILQ_INIT(&kernel_pmap->pm_pvchunk); nkpt = NKPT; /* @@ -571,8 +577,6 @@ pmap_init(void) * high water mark so that the system can recover from excessive * numbers of pv entries. 
*/ - pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); @@ -1065,7 +1069,7 @@ pmap_pinit0(pmap) PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys); pmap->pm_active = 0; - TAILQ_INIT(&pmap->pm_pvlist); + TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } @@ -1102,7 +1106,7 @@ pmap_pinit(pmap) pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M; pmap->pm_active = 0; - TAILQ_INIT(&pmap->pm_pvlist); + TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } @@ -1439,61 +1443,78 @@ pmap_growkernel(vm_offset_t addr) * page management routines. ***************************************************/ -/* - * free the pv_entry back to the free list - */ -static PMAP_INLINE void -free_pv_entry(pv_entry_t pv) +CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); +CTASSERT(_NPCM == 3); +CTASSERT(_NPCPV == 168); + +static __inline struct pv_chunk * +pv_to_chunk(pv_entry_t pv) { - pv_entry_count--; - uma_zfree(pvzone, pv); + + return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK); } +#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) + +#define PC_FREE0 0xfffffffffffffffful +#define PC_FREE1 0xfffffffffffffffful +#define PC_FREE2 0x000000fffffffffful + +static uint64_t pc_freemask[3] = { PC_FREE0, PC_FREE1, PC_FREE2 }; + +#ifdef PV_STATS +static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; + +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, + "Current number of pv entry chunks"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, + "Current number of pv entry chunks allocated"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, + 
"Current number of pv entry chunks frees"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, + "Number of times tried to get a chunk page but failed."); + +static int pv_entry_frees, pv_entry_allocs, pv_entry_spare; + +SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, + "Current number of pv entries"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, + "Current number of pv entry frees"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, + "Current number of pv entry allocs"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, + "Current number of spare pv entries"); + +static int pmap_collect_inactive, pmap_collect_active; + +SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0, + "Current number times pmap_collect called on inactive queue"); +SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0, + "Current number times pmap_collect called on active queue"); +#endif + /* - * get a new pv_entry, allocating a block from the system - * when needed. + * We are in a serious low memory condition. Resort to + * drastic measures to free some pages so we can allocate + * another pv entry chunk. This is normally called to + * unmap inactive pages, and if necessary, active pages. 
*/ -static pv_entry_t -get_pv_entry(pmap_t locked_pmap) +static void +pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) { - static const struct timeval printinterval = { 60, 0 }; - static struct timeval lastprint; - struct vpgqueues *vpq; pd_entry_t ptepde; pmap_t pmap; pt_entry_t *pte, tpte; - pv_entry_t allocated_pv, next_pv, pv; + pv_entry_t next_pv, pv; vm_offset_t va; vm_page_t m; - PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); - mtx_assert(&vm_page_queue_mtx, MA_OWNED); - allocated_pv = uma_zalloc(pvzone, M_NOWAIT); - if (allocated_pv != NULL) { - pv_entry_count++; - if (pv_entry_count > pv_entry_high_water) - pagedaemon_wakeup(); - else - return (allocated_pv); - } - - /* - * Reclaim pv entries: At first, destroy mappings to inactive - * pages. After that, if a pv entry is still needed, destroy - * mappings to active pages. - */ - if (ratecheck(&lastprint, &printinterval)) - printf("Approaching the limit on PV entries, consider " - "increasing sysctl vm.pmap.shpgperproc or " - "vm.pmap.pv_entry_max\n"); - vpq = &vm_page_queues[PQ_INACTIVE]; -retry: TAILQ_FOREACH(m, &vpq->pl, pageq) { if (m->hold_count || m->busy || (m->flags & PG_BUSY)) continue; TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { va = pv->pv_va; - pmap = pv->pv_pmap; + pmap = PV_PMAP(pv); /* Avoid deadlock and lock recursion. 
*/ if (pmap > locked_pmap) PMAP_LOCK(pmap); @@ -1503,18 +1524,17 @@ retry: pte = pmap_pte_pde(pmap, va, &ptepde); tpte = pte_load_clear(pte); KASSERT((tpte & PG_W) == 0, - ("get_pv_entry: wired pte %#lx", tpte)); + ("pmap_collect: wired pte %#lx", tpte)); if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); if (tpte & PG_M) { KASSERT((tpte & PG_RW), - ("get_pv_entry: modified page not writable: va: %#lx, pte: %#lx", + ("pmap_collect: modified page not writable: va: %#lx, pte: %#lx", va, tpte)); if (pmap_track_modified(va)) vm_page_dirty(m); } pmap_invalidate_page(pmap, va); - TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); @@ -1522,47 +1542,149 @@ retry: pmap_unuse_pt(pmap, va, ptepde); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); - if (allocated_pv == NULL) - allocated_pv = pv; - else - free_pv_entry(pv); - } - } - if (allocated_pv == NULL) { - if (vpq == &vm_page_queues[PQ_INACTIVE]) { - vpq = &vm_page_queues[PQ_ACTIVE]; - goto retry; + free_pv_entry(locked_pmap, pv); } - panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable"); } - return (allocated_pv); } + +/* + * free the pv_entry back to the free list + */ static void -pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) +free_pv_entry(pmap_t pmap, pv_entry_t pv) { + vm_page_t m; + struct pv_chunk *pc; + int idx, field, bit; + + PV_STAT(pv_entry_frees++); + PV_STAT(pv_entry_spare++); + PV_STAT(pv_entry_count--); + pc = pv_to_chunk(pv); + idx = pv - &pc->pc_pventry[0]; + field = idx / 64; + bit = idx % 64; + pc->pc_map[field] |= 1ul << bit; + /* move to head of list */ + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || + pc->pc_map[2] != PC_FREE2) + return; + PV_STAT(pv_entry_spare -= _NPCPV); + PV_STAT(pc_chunk_count--); + PV_STAT(pc_chunk_frees++); + /* entire chunk is 
free, return it */ + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); + vm_page_lock_queues(); + vm_page_free(m); + vm_page_unlock_queues(); +} + +/* + * get a new pv_entry, allocating a block from the system + * when needed. + */ +static pv_entry_t +get_pv_entry(pmap_t pmap, int try) +{ + static const struct timeval printinterval = { 60, 0 }; + static struct timeval lastprint; + static vm_pindex_t colour; + int bit, field; pv_entry_t pv; + struct pv_chunk *pc; + vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); - if (m->md.pv_list_count < pmap->pm_stats.resident_count) { - TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { - if (pmap == pv->pv_pmap && va == pv->pv_va) + PV_STAT(pv_entry_allocs++); + PV_STAT(pv_entry_count++); + if (pv_entry_count > pv_entry_high_water) + pagedaemon_wakeup(); + pc = TAILQ_FIRST(&pmap->pm_pvchunk); + if (pc != NULL) { + for (field = 0; field < _NPCM; field++) { + if (pc->pc_map[field]) { + bit = bsfq(pc->pc_map[field]); break; + } } - } else { - TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { - if (va == pv->pv_va) - break; + if (field < _NPCM) { + pv = &pc->pc_pventry[field * 64 + bit]; + pc->pc_map[field] &= ~(1ul << bit); + /* If this was the last item, move it to tail */ + if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && + pc->pc_map[2] == 0) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + } + PV_STAT(pv_entry_spare--); + return (pv); + } + } + /* No free items, allocate another chunk */ + m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ); + if (m == NULL) { + if (try) { + PV_STAT(pc_chunk_tryfail++); + return (NULL); + } + /* + * Reclaim pv entries: At first, destroy mappings to inactive + * pages. After that, if a pv chunk entry is still needed, + * destroy mappings to active pages. 
+ */ + if (ratecheck(&lastprint, &printinterval)) + printf("Approaching the limit on PV entries, consider " + "increasing sysctl vm.pmap.shpgperproc or " + "vm.pmap.pv_entry_max\n"); + PV_STAT(pmap_collect_inactive++); + pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]); + m = vm_page_alloc(NULL, colour, + VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ); + if (m == NULL) { + PV_STAT(pmap_collect_active++); + pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]); + m = vm_page_alloc(NULL, colour, + VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ); + if (m == NULL) + panic("get_pv_entry: increase vm.pmap.shpgperproc"); } } + PV_STAT(pc_chunk_count++); + PV_STAT(pc_chunk_allocs++); + colour++; + pc = (void *)PHYS_TO_DMAP(m->phys_addr); + pc->pc_pmap = pmap; + pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ + pc->pc_map[1] = PC_FREE1; + pc->pc_map[2] = PC_FREE2; + pv = &pc->pc_pventry[0]; + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + PV_STAT(pv_entry_spare += _NPCPV - 1); + return (pv); +} + +static void +pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) +{ + pv_entry_t pv; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mtx_assert(&vm_page_queue_mtx, MA_OWNED); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { + if (pmap == PV_PMAP(pv) && va == pv->pv_va) + break; + } KASSERT(pv != NULL, ("pmap_remove_entry: pv not found")); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); - TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); - free_pv_entry(pv); + free_pv_entry(pmap, pv); } /* @@ -1574,13 +1696,10 @@ pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; - pv = get_pv_entry(pmap); - pv->pv_va = va; - pv->pv_pmap = pmap; - PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); - TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); + pv = get_pv_entry(pmap, FALSE); + pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; } @@ -1596,11 
+1715,8 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (pv_entry_count < pv_entry_high_water && - (pv = uma_zalloc(pvzone, M_NOWAIT)) != NULL) { - pv_entry_count++; + (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; - pv->pv_pmap = pmap; - TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; return (TRUE); @@ -1791,6 +1907,7 @@ void pmap_remove_all(vm_page_t m) { register pv_entry_t pv; + pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t ptepde; @@ -1805,12 +1922,13 @@ pmap_remove_all(vm_page_t m) #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { - PMAP_LOCK(pv->pv_pmap); - pv->pv_pmap->pm_stats.resident_count--; - pte = pmap_pte_pde(pv->pv_pmap, pv->pv_va, &ptepde); + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pmap->pm_stats.resident_count--; + pte = pmap_pte_pde(pmap, pv->pv_va, &ptepde); tpte = pte_load_clear(pte); if (tpte & PG_W) - pv->pv_pmap->pm_stats.wired_count--; + pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); @@ -1824,13 +1942,12 @@ pmap_remove_all(vm_page_t m) if (pmap_track_modified(pv->pv_va)) vm_page_dirty(m); } - pmap_invalidate_page(pv->pv_pmap, pv->pv_va); - TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); + pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; - pmap_unuse_pt(pv->pv_pmap, pv->pv_va, ptepde); - PMAP_UNLOCK(pv->pv_pmap); - free_pv_entry(pv); + pmap_unuse_pt(pmap, pv->pv_va, ptepde); + PMAP_UNLOCK(pmap); + free_pv_entry(pmap, pv); } vm_page_flag_clear(m, PG_WRITEABLE); } @@ -2584,7 +2701,7 @@ pmap_page_exists_quick(pmap, m) mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { - if (pv->pv_pmap == pmap) { + if (PV_PMAP(pv) == pmap) { return TRUE; } loops++; @@ -2594,7 +2711,6 @@ 
pmap_page_exists_quick(pmap, m) return (FALSE); } -#define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code @@ -2608,68 +2724,91 @@ pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m; - pv_entry_t pv, npv; + pv_entry_t pv; + struct pv_chunk *pc, *npc; + int field, idx; + int64_t bit; + uint64_t inuse, bitmask; + int allfree; -#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } -#endif vm_page_lock_queues(); PMAP_LOCK(pmap); - for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { - -#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY - pte = vtopte(pv->pv_va); -#else - pte = pmap_pte(pmap, pv->pv_va); -#endif - tpte = *pte; - - if (tpte == 0) { - printf("TPTE at %p IS ZERO @ VA %08lx\n", - pte, pv->pv_va); - panic("bad pte"); - } + TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { + allfree = 1; + for (field = 0; field < _NPCM; field++) { + inuse = (~(pc->pc_map[field])) & pc_freemask[field]; + while (inuse != 0) { + bit = bsfq(inuse); + bitmask = 1UL << bit; + idx = field * 64 + bit; + pv = &pc->pc_pventry[idx]; + inuse &= ~bitmask; + + pte = vtopte(pv->pv_va); + tpte = *pte; + + if (tpte == 0) { + printf( + "TPTE at %p IS ZERO @ VA %08lx\n", + pte, pv->pv_va); + panic("bad pte"); + } /* * We cannot remove wired pages from a process' mapping at this time */ - if (tpte & PG_W) { - npv = TAILQ_NEXT(pv, pv_plist); - continue; - } - - m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); - KASSERT(m->phys_addr == (tpte & PG_FRAME), - ("vm_page_t %p phys_addr mismatch %016jx %016jx", - m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); - - KASSERT(m < &vm_page_array[vm_page_array_size], - ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); + if (tpte & PG_W) { + allfree = 0; + continue; + } - pmap->pm_stats.resident_count--; + m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); + 
KASSERT(m->phys_addr == (tpte & PG_FRAME), + ("vm_page_t %p phys_addr mismatch %016jx %016jx", + m, (uintmax_t)m->phys_addr, + (uintmax_t)tpte)); - pte_clear(pte); + KASSERT(m < &vm_page_array[vm_page_array_size], + ("pmap_remove_pages: bad tpte %#jx", + (uintmax_t)tpte)); - /* - * Update the vm_page_t clean and reference bits. - */ - if (tpte & PG_M) { - vm_page_dirty(m); - } + pmap->pm_stats.resident_count--; - npv = TAILQ_NEXT(pv, pv_plist); - TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); + pte_clear(pte); - m->md.pv_list_count--; - TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_flag_clear(m, PG_WRITEABLE); + /* + * Update the vm_page_t clean/reference bits. + */ + if (tpte & PG_M) + vm_page_dirty(m); - pmap_unuse_pt(pmap, pv->pv_va, *vtopde(pv->pv_va)); - free_pv_entry(pv); + /* Mark free */ + PV_STAT(pv_entry_frees++); + PV_STAT(pv_entry_spare++); + PV_STAT(pv_entry_count--); + pc->pc_map[field] |= bitmask; + m->md.pv_list_count--; + TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); + if (TAILQ_EMPTY(&m->md.pv_list)) + vm_page_flag_clear(m, PG_WRITEABLE); + pmap_unuse_pt(pmap, pv->pv_va, + *vtopde(pv->pv_va)); + } + } + if (allfree) { + PV_STAT(pv_entry_spare -= _NPCPV); + PV_STAT(pc_chunk_count--); + PV_STAT(pc_chunk_frees++); + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); + vm_page_lock_queues(); + vm_page_free(m); + vm_page_unlock_queues(); + } } pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); @@ -2687,6 +2826,7 @@ pmap_is_modified(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; + pmap_t pmap; boolean_t rv; rv = FALSE; @@ -2702,10 +2842,11 @@ pmap_is_modified(vm_page_t m) */ if (!pmap_track_modified(pv->pv_va)) continue; - PMAP_LOCK(pv->pv_pmap); - pte = pmap_pte(pv->pv_pmap, pv->pv_va); + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte(pmap, pv->pv_va); rv = (*pte & PG_M) != 0; - PMAP_UNLOCK(pv->pv_pmap); + PMAP_UNLOCK(pmap); if (rv) break; } @@ -2743,6 +2884,7 @@ 
static __inline void pmap_clear_ptes(vm_page_t m, long bit) { register pv_entry_t pv; + pmap_t pmap; pt_entry_t pbits, *pte; if ((m->flags & PG_FICTITIOUS) || @@ -2763,8 +2905,9 @@ pmap_clear_ptes(vm_page_t m, long bit) continue; } - PMAP_LOCK(pv->pv_pmap); - pte = pmap_pte(pv->pv_pmap, pv->pv_va); + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte(pmap, pv->pv_va); retry: pbits = *pte; if (pbits & bit) { @@ -2778,9 +2921,9 @@ retry: } else { atomic_clear_long(pte, bit); } - pmap_invalidate_page(pv->pv_pmap, pv->pv_va); + pmap_invalidate_page(pmap, pv->pv_va); } - PMAP_UNLOCK(pv->pv_pmap); + PMAP_UNLOCK(pmap); } if (bit == PG_RW) vm_page_flag_clear(m, PG_WRITEABLE); @@ -2819,6 +2962,7 @@ int pmap_ts_referenced(vm_page_t m) { register pv_entry_t pv, pvf, pvn; + pmap_t pmap; pt_entry_t *pte; pt_entry_t v; int rtval = 0; @@ -2841,20 +2985,21 @@ pmap_ts_referenced(vm_page_t m) if (!pmap_track_modified(pv->pv_va)) continue; - PMAP_LOCK(pv->pv_pmap); - pte = pmap_pte(pv->pv_pmap, pv->pv_va); + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte = pmap_pte(pmap, pv->pv_va); if (pte && ((v = pte_load(pte)) & PG_A) != 0) { atomic_clear_long(pte, PG_A); - pmap_invalidate_page(pv->pv_pmap, pv->pv_va); + pmap_invalidate_page(pmap, pv->pv_va); rtval++; if (rtval > 4) { - PMAP_UNLOCK(pv->pv_pmap); + PMAP_UNLOCK(pmap); break; } } - PMAP_UNLOCK(pv->pv_pmap); + PMAP_UNLOCK(pmap); } while ((pv = pvn) != NULL && pv != pvf); } diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index 4af3b36d5d1f..49e3139ccf40 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -222,6 +222,7 @@ extern pt_entry_t pg_nx; * Pmap stuff */ struct pv_entry; +struct pv_chunk; struct md_page { int pv_list_count; @@ -231,7 +232,7 @@ struct md_page { struct pmap { struct mtx pm_mtx; pml4_entry_t *pm_pml4; /* KVA of level 4 page table */ - TAILQ_HEAD(,pv_entry) pm_pvlist; /* list of mappings in pmap */ + TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ u_int 
pm_active; /* active on cpus */ /* spare u_int here due to padding */ struct pmap_statistics pm_stats; /* pmap statistics */ @@ -260,12 +261,24 @@ extern struct pmap kernel_pmap_store; * mappings of that page. An entry is a pv_entry_t, the list is pv_table. */ typedef struct pv_entry { - pmap_t pv_pmap; /* pmap where mapping lies */ vm_offset_t pv_va; /* virtual address for mapping */ TAILQ_ENTRY(pv_entry) pv_list; - TAILQ_ENTRY(pv_entry) pv_plist; } *pv_entry_t; +/* + * pv_entries are allocated in chunks per-process. This avoids the + * need to track per-pmap assignments. + */ +#define _NPCM 3 +#define _NPCPV 168 +struct pv_chunk { + pmap_t pc_pmap; + TAILQ_ENTRY(pv_chunk) pc_list; + uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ + uint64_t pc_spare[2]; + struct pv_entry pc_pventry[_NPCPV]; +}; + #ifdef _KERNEL #define NPPROVMTRR 8 |