aboutsummaryrefslogtreecommitdiff
path: root/sys/amd64
diff options
context:
space:
mode:
authorKonstantin Belousov <kib@FreeBSD.org>2013-08-30 07:59:49 +0000
committerKonstantin Belousov <kib@FreeBSD.org>2013-08-30 07:59:49 +0000
commit37eed8419c9d03e5a2f8717ecaaca5c30461ca0a (patch)
tree04770c540ba8145f9288bad14ac5d4d59ac30c95 /sys/amd64
parenta0887a4c2db88710c568f5051efd20bbf8696f07 (diff)
downloadsrc-37eed8419c9d03e5a2f8717ecaaca5c30461ca0a.tar.gz
src-37eed8419c9d03e5a2f8717ecaaca5c30461ca0a.zip
Implement support for the process-context identifiers ('PCID') on
Intel CPUs. The feature tags TLB entries with the ID of the address space, which makes it possible to avoid TLB invalidation on a context switch; it is available only in long mode. In microbenchmarks, using PCID decreased the latency of context switches by ~30% on SandyBridge-class desktop CPUs, as measured with the lat_ctx program from lmbench. If available, the INVPCID instruction is used when a TLB entry in a non-current address space needs to be invalidated. The instruction is typically available on Haswell. If needed, the use of PCID can be turned off by setting the vm.pmap.pcid_enabled loader tunable to 0. The state of the feature is reported by the vm.pmap.pcid_enabled sysctl. The sysctl vm.pmap.pcid_save_cnt reports the number of context switches which avoided invalidating the TLB; compare it with the total number of context switches, available as sysctl vm.stats.sys.v_swtch. Sponsored by: The FreeBSD Foundation Reviewed by: alc Tested by: pho, bf
Notes
Notes: svn path=/head/; revision=255060
Diffstat (limited to 'sys/amd64')
-rw-r--r--sys/amd64/amd64/apic_vector.S241
-rw-r--r--sys/amd64/amd64/cpu_switch.S34
-rw-r--r--sys/amd64/amd64/genassym.c4
-rw-r--r--sys/amd64/amd64/machdep.c2
-rw-r--r--sys/amd64/amd64/mp_machdep.c62
-rw-r--r--sys/amd64/amd64/pmap.c276
-rw-r--r--sys/amd64/amd64/vm_machdep.c2
-rw-r--r--sys/amd64/include/pcpu.h2
-rw-r--r--sys/amd64/include/pmap.h2
-rw-r--r--sys/amd64/include/smp.h15
10 files changed, 527 insertions, 113 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 7551cc5c8b93..e868cf584971 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -43,6 +43,12 @@
#include "assym.s"
+#ifdef SMP
+#define LK lock ;
+#else
+#define LK
+#endif
+
/*
* I/O Interrupt Entry Point. Rather than having one entry point for
* each interrupt source, we use one entry point for each 32-bit word
@@ -149,6 +155,38 @@ IDTVEC(xen_intr_upcall)
* Global address space TLB shootdown.
*/
.text
+
+#define NAKE_INTR_CS 24
+
+ SUPERALIGN_TEXT
+global_invltlb:
+ movl %cr4,%eax
+ andl $~0x80,%eax
+ movl %eax,%cr4
+ orl $0x80,%eax
+ movl %eax,%cr4
+invltlb_ret_clear_pm_save:
+ movq smp_tlb_pmap,%rdx
+ testq %rdx,%rdx
+ jz invltlb_ret
+ testb $SEL_RPL_MASK,NAKE_INTR_CS(%rsp)
+ jz 1f
+ swapgs
+1:
+ movl PCPU(CPUID),%eax
+ jz 2f
+ swapgs
+2:
+ LK btcl %eax,PM_SAVE(%rdx)
+ SUPERALIGN_TEXT
+invltlb_ret:
+ movq lapic, %rax
+ movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
+ LK incl smp_tlb_wait
+ popq %rdx
+ popq %rax
+ jmp doreti_iret
+
SUPERALIGN_TEXT
IDTVEC(invltlb)
#if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
@@ -165,18 +203,44 @@ IDTVEC(invltlb)
#endif
pushq %rax
+ pushq %rdx
- movq %cr3, %rax /* invalidate the TLB */
- movq %rax, %cr3
-
- movq lapic, %rax
- movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
-
- lock
- incl smp_tlb_wait
-
- popq %rax
- jmp doreti_iret
+ movq %cr3,%rax
+ cmpl $0,pmap_pcid_enabled
+ je 2f
+
+ movq $smp_tlb_invpcid,%rdx
+ cmpl $0,(%rdx)
+ je global_invltlb
+ cmpl $-1,(%rdx)
+ je global_invltlb
+
+ /*
+ * Non-zero smp_tlb_invpcid, only invalidate TLB for entries with
+ * current PCID.
+ */
+ cmpl $0,invpcid_works
+ je 1f
+ /* Use invpcid if available. */
+ movl $1,%eax /* INVPCID_CTX */
+ /* invpcid (%rdx),%rax */
+ .byte 0x66,0x0f,0x38,0x82,0x02
+ jmp invltlb_ret_clear_pm_save
+1:
+ /* Otherwise reload %cr3 twice. */
+ movq pcid_cr3,%rdx
+ cmpq %rax,%rdx
+ je 2f
+ movq %rdx,%cr3 /* Invalidate, bit 63 is zero. */
+ btsq $63,%rax
+
+ /*
+ * Invalidate the TLB if PCID is not enabled.
+ * Restore the old address space.
+ */
+2:
+ movq %rax,%cr3
+ jmp invltlb_ret_clear_pm_save
/*
* Single page TLB shootdown
@@ -198,18 +262,54 @@ IDTVEC(invlpg)
#endif
pushq %rax
-
- movq smp_tlb_addr1, %rax
- invlpg (%rax) /* invalidate single page */
-
- movq lapic, %rax
- movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
-
- lock
- incl smp_tlb_wait
-
- popq %rax
- jmp doreti_iret
+ pushq %rdx
+ movq $smp_tlb_invpcid,%rdx
+ cmpl $0,pmap_pcid_enabled
+ je 3f
+ cmpl $0,invpcid_works
+ jne 2f
+
+ /* kernel pmap - use invlpg to invalidate global mapping */
+ cmpl $0,(%rdx)
+ je 3f
+ cmpl $-1,(%rdx)
+ je global_invltlb
+
+ /*
+ * PCID supported, but INVPCID is not.
+ * Temporarily switch to the target address space and do INVLPG.
+ */
+ pushq %rcx
+ movq %cr3,%rcx
+ movq pcid_cr3,%rax
+ cmp %rcx,%rax
+ je 1f
+ btsq $63,%rax
+ movq %rax,%cr3
+1: movq 8(%rdx),%rax
+ invlpg (%rax)
+ btsq $63,%rcx
+ movq %rcx,%cr3
+ popq %rcx
+ jmp invltlb_ret
+
+ /*
+ * Invalidate the TLB entry using INVPCID_ADDR.
+ */
+2:
+ xorl %eax,%eax
+/* invpcid (%rdx),%rax */
+ .byte 0x66,0x0f,0x38,0x82,0x02
+ jmp invltlb_ret
+
+ /*
+ * PCID is not supported or kernel pmap.
+ * Invalidate single page using INVLPG.
+ */
+3:
+ movq 8(%rdx),%rax
+ invlpg (%rax)
+ jmp invltlb_ret
/*
* Page range TLB shootdown.
@@ -232,23 +332,76 @@ IDTVEC(invlrng)
pushq %rax
pushq %rdx
-
- movq smp_tlb_addr1, %rdx
- movq smp_tlb_addr2, %rax
+ movq $smp_tlb_invpcid,%rdx
+ cmpl $0,pmap_pcid_enabled
+ jne invlrng_single_page
+ cmpl $0,invpcid_works
+ jne invlrng_invpcid
+
+ /* kernel pmap - use invlpg to invalidate global mapping */
+ cmpl $0,(%rdx)
+ je invlrng_single_page
+ cmpl $-1,(%rdx)
+ je global_invltlb
+
+ pushq %rcx
+ movq %cr3,%rcx
+ movq pcid_cr3,%rax
+ cmpq %rcx,%rax
+ je 1f
+ btsq $63,%rax
+ movq %rax,%cr3
+1:
+ movq 8(%rdx),%rdx
+ movq smp_tlb_addr2,%rax
+2:
+ invlpg (%rdx)
+ addq $PAGE_SIZE,%rdx
+ cmpq %rax,%rdx
+ jb 2b
+ btsq $63,%rcx
+ movq %rcx,%cr3
+ popq %rcx
+ jmp invltlb_ret
+
+invlrng_invpcid:
+ testb $SEL_RPL_MASK,NAKE_INTR_CS(%rsp)
+ jz 1f
+ swapgs
+1:
+ pushq %rcx
+ movq (%rdx),%rcx
+ movq %rcx,PCPU(INVPCID_DESCR)
+ movq 8(%rdx),%rax
+ movq %rax,PCPU(INVPCID_DESCR)+8
+ movq smp_tlb_addr2,%rcx
+ xorl %eax,%eax
+ movq $PC_INVPCID_DESCR,%rdx
+ gs
+ subq 8(%rdx),%rcx
+ shrq $PAGE_SHIFT,%rcx
+2:
+ gs
+// invpcid (%rdx),%rax
+ .byte 0x66,0x0f,0x38,0x82,0x02
+ gs
+ addq $PAGE_SIZE,8(%rdx)
+ dec %rcx
+ jne 2b
+ popq %rcx
+ testb $SEL_RPL_MASK,NAKE_INTR_CS(%rsp)
+ jz invltlb_ret
+ swapgs
+ jmp invltlb_ret
+
+invlrng_single_page:
+ movq 8(%rdx),%rdx
+ movq smp_tlb_addr2,%rax
1: invlpg (%rdx) /* invalidate single page */
- addq $PAGE_SIZE, %rdx
- cmpq %rax, %rdx
+ addq $PAGE_SIZE,%rdx
+ cmpq %rax,%rdx
jb 1b
-
- movq lapic, %rax
- movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
-
- lock
- incl smp_tlb_wait
-
- popq %rdx
- popq %rax
- jmp doreti_iret
+ jmp invltlb_ret
/*
* Invalidate cache.
@@ -265,17 +418,9 @@ IDTVEC(invlcache)
#endif
pushq %rax
-
+ pushq %rdx
wbinvd
-
- movq lapic, %rax
- movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */
-
- lock
- incl smp_tlb_wait
-
- popq %rax
- jmp doreti_iret
+ jmp invltlb_ret
/*
* Handler for IPIs sent via the per-cpu IPI bitmap.
diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S
index ed1ccb5b5c34..ac309900b844 100644
--- a/sys/amd64/amd64/cpu_switch.S
+++ b/sys/amd64/amd64/cpu_switch.S
@@ -77,8 +77,7 @@ ENTRY(cpu_throw)
LK btrl %eax,PM_ACTIVE(%rdx) /* clear old */
1:
movq TD_PCB(%rsi),%r8 /* newtd->td_pcb */
- movq PCB_CR3(%r8),%rdx
- movq %rdx,%cr3 /* new address space */
+ movq PCB_CR3(%r8),%rcx /* new address space */
jmp swact
END(cpu_throw)
@@ -145,20 +144,41 @@ ctx_switch_xsave:
SETLK %rdx, TD_LOCK(%rdi) /* Release the old thread */
jmp sw1
swinact:
- movq %rcx,%cr3 /* new address space */
- movl PCPU(CPUID), %eax
+ movl PCPU(CPUID),%eax
/* Release bit from old pmap->pm_active */
- movq PCPU(CURPMAP),%rcx
- LK btrl %eax,PM_ACTIVE(%rcx) /* clear old */
- SETLK %rdx, TD_LOCK(%rdi) /* Release the old thread */
+ movq PCPU(CURPMAP),%r12
+ LK btrl %eax,PM_ACTIVE(%r12) /* clear old */
+ SETLK %rdx,TD_LOCK(%rdi) /* Release the old thread */
swact:
/* Set bit in new pmap->pm_active */
movq TD_PROC(%rsi),%rdx /* newproc */
movq P_VMSPACE(%rdx), %rdx
addq $VM_PMAP,%rdx
+ cmpl $-1,PM_PCID(%rdx)
+ je 1f
+ LK btsl %eax,PM_SAVE(%rdx)
+ jnc 1f
+ btsq $63,%rcx /* CR3_PCID_SAVE */
+ incq PCPU(PM_SAVE_CNT)
+1:
+ movq %rcx,%cr3 /* new address space */
LK btsl %eax,PM_ACTIVE(%rdx) /* set new */
movq %rdx,PCPU(CURPMAP)
+ /*
+ * We might lose the race and other CPU might have changed
+ * the pmap after we set our bit in pmap->pm_save. Recheck.
+ * Reload %cr3 with CR3_PCID_SAVE bit cleared if pmap was
+ * modified, causing TLB flush for this pcid.
+ */
+ btrq $63,%rcx
+ jnc 1f
+ LK btsl %eax,PM_SAVE(%rdx)
+ jc 1f
+ decq PCPU(PM_SAVE_CNT)
+ movq %rcx,%cr3
+1:
+
sw1:
#if defined(SCHED_ULE) && defined(SMP)
/* Wait for the new thread to become unblocked */
diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c
index 3043bb521c92..62017e7097e9 100644
--- a/sys/amd64/amd64/genassym.c
+++ b/sys/amd64/amd64/genassym.c
@@ -76,6 +76,8 @@ __FBSDID("$FreeBSD$");
ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
+ASSYM(PM_SAVE, offsetof(struct pmap, pm_save));
+ASSYM(PM_PCID, offsetof(struct pmap, pm_pcid));
ASSYM(P_MD, offsetof(struct proc, p_md));
ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt));
@@ -225,6 +227,8 @@ ASSYM(PC_GS32P, offsetof(struct pcpu, pc_gs32p));
ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt));
ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
+ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt));
+ASSYM(PC_INVPCID_DESCR, offsetof(struct pcpu, pc_invpcid_descr));
ASSYM(LA_VER, offsetof(struct LAPIC, version));
ASSYM(LA_TPR, offsetof(struct LAPIC, tpr));
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 7f7e54a5a191..f3969d3346ad 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1909,7 +1909,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
/* setup proc 0's pcb */
thread0.td_pcb->pcb_flags = 0;
- thread0.td_pcb->pcb_cr3 = KPML4phys;
+ thread0.td_pcb->pcb_cr3 = KPML4phys; /* PCID 0 is reserved for kernel */
thread0.td_frame = &proc0_tf;
env = getenv("kernelname");
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 267b933cefd2..530aa61c7017 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -107,9 +107,11 @@ struct pcb stoppcbs[MAXCPU];
struct pcb **susppcbs;
/* Variables needed for SMP tlb shootdown. */
-vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
+struct invpcid_descr smp_tlb_invpcid;
volatile int smp_tlb_wait;
+uint64_t pcid_cr3;
+pmap_t smp_tlb_pmap;
#ifdef COUNT_IPIS
/* Interrupt counts. */
@@ -603,6 +605,8 @@ cpu_mp_announce(void)
}
}
+extern int pmap_pcid_enabled;
+
/*
* AP CPU's call this to initialize themselves.
*/
@@ -768,6 +772,8 @@ init_secondary(void)
*/
load_cr4(rcr4() | CR4_PGE);
+ if (pmap_pcid_enabled)
+ load_cr4(rcr4() | CR4_PCIDE);
load_ds(_udatasel);
load_es(_udatasel);
load_fs(_ufssel);
@@ -1119,7 +1125,8 @@ ipi_send_cpu(int cpu, u_int ipi)
* Flush the TLB on all other CPU's
*/
static void
-smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+smp_tlb_shootdown(u_int vector, pmap_t pmap, vm_offset_t addr1,
+ vm_offset_t addr2)
{
u_int ncpu;
@@ -1129,8 +1136,16 @@ smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
if (!(read_rflags() & PSL_I))
panic("%s: interrupts disabled", __func__);
mtx_lock_spin(&smp_ipi_mtx);
- smp_tlb_addr1 = addr1;
+ smp_tlb_invpcid.addr = addr1;
+ if (pmap == NULL) {
+ smp_tlb_invpcid.pcid = 0;
+ } else {
+ smp_tlb_invpcid.pcid = pmap->pm_pcid;
+ pcid_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) |
+ (pmap->pm_pcid == -1 ? 0 : pmap->pm_pcid);
+ }
smp_tlb_addr2 = addr2;
+ smp_tlb_pmap = pmap;
atomic_store_rel_int(&smp_tlb_wait, 0);
ipi_all_but_self(vector);
while (smp_tlb_wait < ncpu)
@@ -1139,7 +1154,8 @@ smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
}
static void
-smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
+ vm_offset_t addr1, vm_offset_t addr2)
{
int cpu, ncpu, othercpus;
@@ -1155,8 +1171,16 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_of
if (!(read_rflags() & PSL_I))
panic("%s: interrupts disabled", __func__);
mtx_lock_spin(&smp_ipi_mtx);
- smp_tlb_addr1 = addr1;
+ smp_tlb_invpcid.addr = addr1;
+ if (pmap == NULL) {
+ smp_tlb_invpcid.pcid = 0;
+ } else {
+ smp_tlb_invpcid.pcid = pmap->pm_pcid;
+ pcid_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) |
+ (pmap->pm_pcid == -1 ? 0 : pmap->pm_pcid);
+ }
smp_tlb_addr2 = addr2;
+ smp_tlb_pmap = pmap;
atomic_store_rel_int(&smp_tlb_wait, 0);
if (CPU_ISFULLSET(&mask)) {
ncpu = othercpus;
@@ -1182,15 +1206,15 @@ smp_cache_flush(void)
{
if (smp_started)
- smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
+ smp_tlb_shootdown(IPI_INVLCACHE, NULL, 0, 0);
}
void
-smp_invltlb(void)
+smp_invltlb(pmap_t pmap)
{
if (smp_started) {
- smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+ smp_tlb_shootdown(IPI_INVLTLB, pmap, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
ipi_global++;
#endif
@@ -1198,11 +1222,11 @@ smp_invltlb(void)
}
void
-smp_invlpg(vm_offset_t addr)
+smp_invlpg(pmap_t pmap, vm_offset_t addr)
{
if (smp_started) {
- smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+ smp_tlb_shootdown(IPI_INVLPG, pmap, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
ipi_page++;
#endif
@@ -1210,11 +1234,11 @@ smp_invlpg(vm_offset_t addr)
}
void
-smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
+smp_invlpg_range(pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2)
{
if (smp_started) {
- smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+ smp_tlb_shootdown(IPI_INVLRNG, pmap, addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
ipi_range++;
ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
@@ -1223,11 +1247,11 @@ smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
}
void
-smp_masked_invltlb(cpuset_t mask)
+smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
{
if (smp_started) {
- smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+ smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
ipi_masked_global++;
#endif
@@ -1235,11 +1259,11 @@ smp_masked_invltlb(cpuset_t mask)
}
void
-smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
+smp_masked_invlpg(cpuset_t mask, pmap_t pmap, vm_offset_t addr)
{
if (smp_started) {
- smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+ smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
ipi_masked_page++;
#endif
@@ -1247,11 +1271,13 @@ smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
}
void
-smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
+smp_masked_invlpg_range(cpuset_t mask, pmap_t pmap, vm_offset_t addr1,
+ vm_offset_t addr2)
{
if (smp_started) {
- smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+ smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1,
+ addr2);
#ifdef COUNT_XINVLTLB_HITS
ipi_masked_range++;
ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 851f92a098de..bca40f07fd5f 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -116,11 +116,8 @@ __FBSDID("$FreeBSD$");
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
-#ifdef SMP
+#include <sys/_unrhdr.h>
#include <sys/smp.h>
-#else
-#include <sys/cpuset.h>
-#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -250,6 +247,53 @@ static struct md_page *pv_table;
pt_entry_t *CMAP1 = 0;
caddr_t CADDR1 = 0;
+static struct unrhdr pcid_unr;
+static struct mtx pcid_mtx;
+int pmap_pcid_enabled = 1;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
+ 0, "Is TLB Context ID enabled ?");
+int invpcid_works = 0;
+
+/*
+ * Perform the guaranteed invalidation of all TLB entries. This
+ * includes the global entries, and entries in all PCIDs, not only the
+ * current context. The function works both on non-PCID CPUs and CPUs
+ * with the PCID turned off or on. See IA-32 SDM Vol. 3a 4.10.4.1
+ * Operations that Invalidate TLBs and Paging-Structure Caches.
+ */
+static __inline void
+invltlb_globpcid(void)
+{
+ uint64_t cr4;
+
+ cr4 = rcr4();
+ load_cr4(cr4 & ~CR4_PGE);
+ /*
+ * Although preemption at this point could be detrimental to
+ * performance, it would not lead to an error. PG_G is simply
+ * ignored if CR4.PGE is clear. Moreover, in case this block
+ * is re-entered, the load_cr4() either above or below will
+ * modify CR4.PGE flushing the TLB.
+ */
+ load_cr4(cr4 | CR4_PGE);
+}
+
+static int
+pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
+{
+ int i;
+ uint64_t res;
+
+ res = 0;
+ CPU_FOREACH(i) {
+ res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
+ }
+ return (sysctl_handle_64(oidp, &res, 0, req));
+}
+SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
+ CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
+ "Count of saved TLB context on switch");
+
/*
* Crashdump maps.
*/
@@ -685,6 +729,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
PMAP_LOCK_INIT(kernel_pmap);
kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
+ CPU_ZERO(&kernel_pmap->pm_save);
TAILQ_INIT(&kernel_pmap->pm_pvchunk);
/*
@@ -716,6 +761,21 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
/* Initialize the PAT MSR. */
pmap_init_pat();
+
+#ifdef SMP
+ /* Initialize TLB Context Id. */
+ TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
+ if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
+ load_cr4(rcr4() | CR4_PCIDE);
+ mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
+ init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
+ /* Check for INVPCID support */
+ invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
+ != 0;
+ kernel_pmap->pm_pcid = 0;
+ } else
+#endif
+ pmap_pcid_enabled = 0;
}
/*
@@ -952,7 +1012,6 @@ pmap_cache_bits(int mode, boolean_t is_pde)
static void
pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
{
- u_long cr4;
if ((newpde & PG_PS) == 0)
/* Demotion: flush a specific 2MB page mapping. */
@@ -968,19 +1027,34 @@ pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
* Promotion: flush every 4KB page mapping from the TLB,
* including any global (PG_G) mappings.
*/
- cr4 = rcr4();
- load_cr4(cr4 & ~CR4_PGE);
- /*
- * Although preemption at this point could be detrimental to
- * performance, it would not lead to an error. PG_G is simply
- * ignored if CR4.PGE is clear. Moreover, in case this block
- * is re-entered, the load_cr4() either above or below will
- * modify CR4.PGE flushing the TLB.
- */
- load_cr4(cr4 | CR4_PGE);
+ invltlb_globpcid();
}
}
#ifdef SMP
+
+static void
+pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
+{
+ struct invpcid_descr d;
+ uint64_t cr3;
+
+ if (invpcid_works) {
+ d.pcid = pmap->pm_pcid;
+ d.pad = 0;
+ d.addr = va;
+ invpcid(&d, INVPCID_ADDR);
+ return;
+ }
+
+ cr3 = rcr3();
+ critical_enter();
+ load_cr3(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) | pmap->pm_pcid |
+ CR3_PCID_SAVE);
+ invlpg(va);
+ load_cr3(cr3 | CR3_PCID_SAVE);
+ critical_exit();
+}
+
/*
* For SMP, these functions have to use the IPI mechanism for coherence.
*
@@ -1008,21 +1082,68 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
sched_pin();
if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
- invlpg(va);
- smp_invlpg(va);
+ if (!pmap_pcid_enabled) {
+ invlpg(va);
+ } else {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+ if (pmap == PCPU_GET(curpmap))
+ invlpg(va);
+ else
+ pmap_invalidate_page_pcid(pmap, va);
+ } else {
+ invltlb_globpcid();
+ }
+ }
+ smp_invlpg(pmap, va);
} else {
cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
CPU_CLR(cpuid, &other_cpus);
if (CPU_ISSET(cpuid, &pmap->pm_active))
invlpg(va);
- CPU_AND(&other_cpus, &pmap->pm_active);
+ else if (pmap_pcid_enabled) {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
+ pmap_invalidate_page_pcid(pmap, va);
+ else
+ invltlb_globpcid();
+ }
+ if (pmap_pcid_enabled)
+ CPU_AND(&other_cpus, &pmap->pm_save);
+ else
+ CPU_AND(&other_cpus, &pmap->pm_active);
if (!CPU_EMPTY(&other_cpus))
- smp_masked_invlpg(other_cpus, va);
+ smp_masked_invlpg(other_cpus, pmap, va);
}
sched_unpin();
}
+static void
+pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+ struct invpcid_descr d;
+ uint64_t cr3;
+ vm_offset_t addr;
+
+ if (invpcid_works) {
+ d.pcid = pmap->pm_pcid;
+ d.pad = 0;
+ for (addr = sva; addr < eva; addr += PAGE_SIZE) {
+ d.addr = addr;
+ invpcid(&d, INVPCID_ADDR);
+ }
+ return;
+ }
+
+ cr3 = rcr3();
+ critical_enter();
+ load_cr3(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) | pmap->pm_pcid |
+ CR3_PCID_SAVE);
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ load_cr3(cr3 | CR3_PCID_SAVE);
+ critical_exit();
+}
+
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
@@ -1032,19 +1153,43 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
sched_pin();
if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
- for (addr = sva; addr < eva; addr += PAGE_SIZE)
- invlpg(addr);
- smp_invlpg_range(sva, eva);
+ if (!pmap_pcid_enabled) {
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ } else {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+ if (pmap == PCPU_GET(curpmap)) {
+ for (addr = sva; addr < eva;
+ addr += PAGE_SIZE)
+ invlpg(addr);
+ } else {
+ pmap_invalidate_range_pcid(pmap,
+ sva, eva);
+ }
+ } else {
+ invltlb_globpcid();
+ }
+ }
+ smp_invlpg_range(pmap, sva, eva);
} else {
cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
CPU_CLR(cpuid, &other_cpus);
- if (CPU_ISSET(cpuid, &pmap->pm_active))
+ if (CPU_ISSET(cpuid, &pmap->pm_active)) {
for (addr = sva; addr < eva; addr += PAGE_SIZE)
invlpg(addr);
- CPU_AND(&other_cpus, &pmap->pm_active);
+ } else if (pmap_pcid_enabled) {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
+ pmap_invalidate_range_pcid(pmap, sva, eva);
+ else
+ invltlb_globpcid();
+ }
+ if (pmap_pcid_enabled)
+ CPU_AND(&other_cpus, &pmap->pm_save);
+ else
+ CPU_AND(&other_cpus, &pmap->pm_active);
if (!CPU_EMPTY(&other_cpus))
- smp_masked_invlpg_range(other_cpus, sva, eva);
+ smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
}
sched_unpin();
}
@@ -1053,21 +1198,63 @@ void
pmap_invalidate_all(pmap_t pmap)
{
cpuset_t other_cpus;
+ struct invpcid_descr d;
+ uint64_t cr3;
u_int cpuid;
sched_pin();
- if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
- invltlb();
- smp_invltlb();
+ cpuid = PCPU_GET(cpuid);
+ if (pmap == kernel_pmap ||
+ (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
+ !CPU_CMP(&pmap->pm_active, &all_cpus)) {
+ if (invpcid_works) {
+ bzero(&d, sizeof(d));
+ invpcid(&d, INVPCID_CTXGLOB);
+ } else {
+ invltlb_globpcid();
+ }
+ CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
+ smp_invltlb(pmap);
} else {
- cpuid = PCPU_GET(cpuid);
other_cpus = all_cpus;
CPU_CLR(cpuid, &other_cpus);
- if (CPU_ISSET(cpuid, &pmap->pm_active))
+
+ /*
+ * This logic is duplicated in the Xinvltlb shootdown
+ * IPI handler.
+ */
+ if (pmap_pcid_enabled) {
+ if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+ if (invpcid_works) {
+ d.pcid = pmap->pm_pcid;
+ d.pad = 0;
+ d.addr = 0;
+ invpcid(&d, INVPCID_CTX);
+ } else {
+ cr3 = rcr3();
+ critical_enter();
+
+ /*
+ * Bit 63 is clear, pcid TLB
+ * entries are invalidated.
+ */
+ load_cr3(DMAP_TO_PHYS((vm_offset_t)
+ pmap->pm_pml4) | pmap->pm_pcid);
+ load_cr3(cr3 | CR3_PCID_SAVE);
+ critical_exit();
+ }
+ } else {
+ invltlb_globpcid();
+ }
+ } else if (CPU_ISSET(cpuid, &pmap->pm_active))
invltlb();
- CPU_AND(&other_cpus, &pmap->pm_active);
+ CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
+ if (pmap_pcid_enabled)
+ CPU_AND(&other_cpus, &pmap->pm_save);
+ else
+ CPU_AND(&other_cpus, &pmap->pm_active);
if (!CPU_EMPTY(&other_cpus))
- smp_masked_invltlb(other_cpus);
+ smp_masked_invltlb(other_cpus, pmap);
}
sched_unpin();
}
@@ -1129,8 +1316,10 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
CPU_CLR(cpuid, &other_cpus);
if (pmap == kernel_pmap)
active = all_cpus;
- else
+ else {
active = pmap->pm_active;
+ CPU_AND_ATOMIC(&pmap->pm_save, &active);
+ }
if (CPU_OVERLAP(&active, &other_cpus)) {
act.store = cpuid;
act.invalidate = active;
@@ -1193,6 +1382,8 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
pde_store(pde, newpde);
if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
pmap_update_pde_invalidate(va, newpde);
+ else
+ CPU_ZERO(&pmap->pm_save);
}
#endif /* !SMP */
@@ -1675,6 +1866,8 @@ pmap_pinit0(pmap_t pmap)
PCPU_SET(curpmap, pmap);
TAILQ_INIT(&pmap->pm_pvchunk);
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+ pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
+ CPU_ZERO(&pmap->pm_save);
}
/*
@@ -1716,6 +1909,8 @@ pmap_pinit(pmap_t pmap)
CPU_ZERO(&pmap->pm_active);
TAILQ_INIT(&pmap->pm_pvchunk);
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+ pmap->pm_pcid = pmap_pcid_enabled ? alloc_unr(&pcid_unr) : -1;
+ CPU_ZERO(&pmap->pm_save);
return (1);
}
@@ -1957,6 +2152,14 @@ pmap_release(pmap_t pmap)
KASSERT(vm_radix_is_empty(&pmap->pm_root),
("pmap_release: pmap has reserved page table page(s)"));
+ if (pmap_pcid_enabled) {
+ /*
+ * Invalidate any left TLB entries, to allow the reuse
+ * of the pcid.
+ */
+ pmap_invalidate_all(pmap);
+ }
+
m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
for (i = 0; i < NKPML4E; i++) /* KVA */
@@ -1968,6 +2171,8 @@ pmap_release(pmap_t pmap)
m->wire_count--;
atomic_subtract_int(&cnt.v_wire_count, 1);
vm_page_free_zero(m);
+ if (pmap->pm_pcid != -1)
+ free_unr(&pcid_unr, pmap->pm_pcid);
}
static int
@@ -5734,15 +5939,20 @@ pmap_activate(struct thread *td)
critical_enter();
pmap = vmspace_pmap(td->td_proc->p_vmspace);
oldpmap = PCPU_GET(curpmap);
+ CPU_ZERO(&pmap->pm_save);
cpuid = PCPU_GET(cpuid);
#ifdef SMP
CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
+ CPU_SET_ATOMIC(cpuid, &pmap->pm_save);
#else
CPU_CLR(cpuid, &oldpmap->pm_active);
CPU_SET(cpuid, &pmap->pm_active);
+ CPU_SET(cpuid, &pmap->pm_save);
#endif
cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4);
+ if (pmap->pm_pcid != -1)
+ cr3 |= pmap->pm_pcid;
td->td_pcb->pcb_cr3 = cr3;
load_cr3(cr3);
PCPU_SET(curpmap, pmap);
diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c
index ed0e7e9b3251..3764f720c65b 100644
--- a/sys/amd64/amd64/vm_machdep.c
+++ b/sys/amd64/amd64/vm_machdep.c
@@ -221,6 +221,8 @@ cpu_fork(td1, p2, td2, flags)
*/
pmap2 = vmspace_pmap(p2->p_vmspace);
pcb2->pcb_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap2->pm_pml4);
+ if (pmap2->pm_pcid != -1)
+ pcb2->pcb_cr3 |= pmap2->pm_pcid;
pcb2->pcb_r12 = (register_t)fork_return; /* fork_trampoline argument */
pcb2->pcb_rbp = 0;
pcb2->pcb_rsp = (register_t)td2->td_frame - sizeof(void *);
diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h
index 1c83c2a625ee..0e11975c3806 100644
--- a/sys/amd64/include/pcpu.h
+++ b/sys/amd64/include/pcpu.h
@@ -67,6 +67,8 @@
struct system_segment_descriptor *pc_ldt; \
/* Pointer to the CPU TSS descriptor */ \
struct system_segment_descriptor *pc_tss; \
+ uint64_t pc_pm_save_cnt; \
+ char pc_invpcid_descr[16]; \
u_int pc_cmci_mask; /* MCx banks for CMCI */ \
uint64_t pc_dbreg[16]; /* ddb debugging regs */ \
int pc_dbreg_cmd; /* ddb debugging reg cmd */ \
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index aacb9ba94cf0..fa42389671e3 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -240,6 +240,8 @@ struct pmap {
pml4_entry_t *pm_pml4; /* KVA of level 4 page table */
TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */
cpuset_t pm_active; /* active on cpus */
+ cpuset_t pm_save; /* Context valid on cpus mask */
+ int pm_pcid; /* context id */
/* spare u_int here due to padding */
struct pmap_statistics pm_stats; /* pmap statistics */
struct vm_radix pm_root; /* spare page table pages */
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 16d87ea24cc9..d6cd476450d2 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -54,6 +54,8 @@ inthand_t
IDTVEC(cpususpend), /* CPU suspends & waits to be resumed */
IDTVEC(rendezvous); /* handle CPU rendezvous */
+struct pmap;
+
/* functions in mp_machdep.c */
void cpu_add(u_int apic_id, char boot_cpu);
void cpustop_handler(void);
@@ -67,13 +69,14 @@ int ipi_nmi_handler(void);
void ipi_selected(cpuset_t cpus, u_int ipi);
u_int mp_bootaddress(u_int);
void smp_cache_flush(void);
-void smp_invlpg(vm_offset_t addr);
-void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr);
-void smp_invlpg_range(vm_offset_t startva, vm_offset_t endva);
-void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva,
+void smp_invlpg(struct pmap *pmap, vm_offset_t addr);
+void smp_masked_invlpg(cpuset_t mask, struct pmap *pmap, vm_offset_t addr);
+void smp_invlpg_range(struct pmap *pmap, vm_offset_t startva,
vm_offset_t endva);
-void smp_invltlb(void);
-void smp_masked_invltlb(cpuset_t mask);
+void smp_masked_invlpg_range(cpuset_t mask, struct pmap *pmap,
+ vm_offset_t startva, vm_offset_t endva);
+void smp_invltlb(struct pmap *pmap);
+void smp_masked_invltlb(cpuset_t mask, struct pmap *pmap);
#endif /* !LOCORE */
#endif /* SMP */