author     Matt Macy <mmacy@FreeBSD.org>    2018-06-04 01:10:23 +0000
committer  Matt Macy <mmacy@FreeBSD.org>    2018-06-04 01:10:23 +0000
commit     5de96e33d68b091cd1c8d682799606e2a3d8f6fa
tree       908d529e4f5c3103c7c3d8c4196b2178799a7a34 /sys/dev
parent     e52d92164754cbfff84767d4c6eb3cc93e8c21ae
hwpmc: support sampling both kernel and user stacks when interrupted in kernel
This adds the -U option to pmcstat, which attributes in-kernel samples
back to the user stack that invoked the system call. It is not the default,
because when looking at kernel profiles it is generally more desirable to
merge all instances of a given system call together.
Although heavily revised, this change is directly derived from D7350 by
Jonathan T. Looney.
Obtained from: jtl
Sponsored by: Juniper Networks, Limelight Networks
Notes:
svn path=/head/; revision=334595
Diffstat (limited to 'sys/dev')
-rw-r--r--  sys/dev/hwpmc/hwpmc_logging.c |  10
-rw-r--r--  sys/dev/hwpmc/hwpmc_mod.c     | 242
2 files changed, 215 insertions, 37 deletions
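The patch below enforces a userland contract: PMC_F_USERCALLCHAIN must be
paired with PMC_F_CALLCHAIN and is accepted only for the sampling modes
(PMC_MODE_TS/PMC_MODE_SS); pmcstat's new -U flag is what requests it. As a
rough illustration, a libpmc(3) consumer might allocate such a PMC as in the
sketch below. The event name, sample count, and log path are illustrative
assumptions, not taken from this change:

/*
 * Hypothetical libpmc consumer requesting merged user+kernel callchains.
 * Event name, count, and path are placeholders; error handling is minimal.
 */
#include <sys/types.h>
#include <err.h>
#include <fcntl.h>
#include <pmc.h>

int
main(void)
{
	pmc_id_t pmcid;
	int fd;

	if (pmc_init() != 0)
		err(1, "pmc_init");

	/* Sampling PMCs drain through a log file; configure it first. */
	fd = open("/tmp/samples.out", O_WRONLY | O_CREAT | O_TRUNC, 0600);
	if (fd < 0 || pmc_configure_logfile(fd) != 0)
		err(1, "pmc_configure_logfile");

	/*
	 * PMC_F_USERCALLCHAIN without PMC_F_CALLCHAIN, or with a
	 * counting mode, is rejected with EINVAL by the checks added
	 * to pmc_syscall_handler() in this change.
	 */
	if (pmc_allocate("unhalted-cycles", PMC_MODE_SS,
	    PMC_F_CALLCHAIN | PMC_F_USERCALLCHAIN, PMC_CPU_ANY, &pmcid) != 0)
		err(1, "pmc_allocate");

	if (pmc_set(pmcid, 65536) != 0)	/* illustrative sample rate */
		err(1, "pmc_set");
	if (pmc_start(pmcid) != 0)
		err(1, "pmc_start");
	/* ... workload runs; samples land in the log, cf. pmclog(3) ... */
	if (pmc_stop(pmcid) != 0)
		err(1, "pmc_stop");
	pmc_release(pmcid);
	return (0);
}

pmcstat -U wires up this same flag combination for the sampling PMCs it
allocates.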
diff --git a/sys/dev/hwpmc/hwpmc_logging.c b/sys/dev/hwpmc/hwpmc_logging.c
index a69e9d0df813..454ac87359a8 100644
--- a/sys/dev/hwpmc/hwpmc_logging.c
+++ b/sys/dev/hwpmc/hwpmc_logging.c
@@ -250,11 +250,9 @@ pmc_plb_rele(struct pmclog_buffer *plb)
 	mtx_unlock_spin(&pmc_dom_hdrs[plb->plb_domain]->pdbh_mtx);
 }
 
-
 /*
  * Get a log buffer
  */
-
 static int
 pmclog_get_buffer(struct pmc_owner *po)
 {
@@ -345,7 +343,6 @@ pmclog_proc_ignite(void *handle, struct pmc_owner *po)
  *
  * This function is executed by each pmc owner's helper thread.
  */
-
 static void
 pmclog_loop(void *arg)
 {
@@ -846,12 +843,15 @@ pmclog_schedule_one_cond(void *arg)
 {
 	struct pmc_owner *po = arg;
 	struct pmclog_buffer *plb;
+	int cpu;
 
 	spinlock_enter();
+	cpu = curcpu;
 	/* tell hardclock not to run again */
-	if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
+	if (PMC_CPU_HAS_SAMPLES(cpu))
 		PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL);
-	plb = po->po_curbuf[curcpu];
+	pmc_flush_samples(cpu);
+	plb = po->po_curbuf[cpu];
 	if (plb && plb->plb_ptr != plb->plb_base)
 		pmclog_schedule_io(po, 1);
 	spinlock_exit();
diff --git a/sys/dev/hwpmc/hwpmc_mod.c b/sys/dev/hwpmc/hwpmc_mod.c
index 690cd31708bc..b1729f8295d0 100644
--- a/sys/dev/hwpmc/hwpmc_mod.c
+++ b/sys/dev/hwpmc/hwpmc_mod.c
@@ -207,6 +207,8 @@ static int pmc_debugflags_parse(char *newstr, char *fence);
 #endif
 
 static int	load(struct module *module, int cmd, void *arg);
+static int	pmc_add_sample(int cpu, int ring, struct pmc *pm,
+    struct trapframe *tf, int inuserspace);
 static void	pmc_add_thread_descriptors_from_proc(struct proc *p,
     struct pmc_process *pp);
 static int	pmc_attach_process(struct proc *p, struct pmc *pm);
@@ -248,6 +250,7 @@ static void pmc_process_samples(int cpu, int soft);
 static void	pmc_release_pmc_descriptor(struct pmc *pmc);
 static void	pmc_process_thread_add(struct thread *td);
 static void	pmc_process_thread_delete(struct thread *td);
+static void	pmc_process_thread_userret(struct thread *td);
 static void	pmc_remove_owner(struct pmc_owner *po);
 static void	pmc_remove_process_descriptor(struct pmc_process *pp);
 static void	pmc_restore_cpu_binding(struct pmc_binding *pb);
@@ -265,7 +268,7 @@ static int generic_switch_in(struct pmc_cpu *pc, struct pmc_process *pp);
 static int	generic_switch_out(struct pmc_cpu *pc, struct pmc_process *pp);
 static struct pmc_mdep *pmc_generic_cpu_initialize(void);
 static void	pmc_generic_cpu_finalize(struct pmc_mdep *md);
-
+static void	pmc_post_callchain_callback(void);
 /*
  * Kernel tunables and sysctl(8) interface.
  */
@@ -291,6 +294,10 @@ SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, buffer_requests_failed, CTLFLAG_
     &pmc_stats.pm_buffer_requests_failed, "# of buffer requests which failed");
 SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, log_sweeps, CTLFLAG_RW,
     &pmc_stats.pm_log_sweeps, "# of ?");
+SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, merges, CTLFLAG_RW,
+    &pmc_stats.pm_merges, "# of times kernel stack was found for user trace");
+SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, overwrites, CTLFLAG_RW,
+    &pmc_stats.pm_overwrites, "# of times a sample was overwritten before being logged");
 
 static int pmc_callchaindepth = PMC_CALLCHAIN_DEPTH;
 SYSCTL_INT(_kern_hwpmc, OID_AUTO, callchaindepth, CTLFLAG_RDTUN,
@@ -1703,6 +1710,19 @@ pmc_process_thread_delete(struct thread *td)
 }
 
 /*
+ * A userret() call for a thread.
+ */
+static void
+pmc_process_thread_userret(struct thread *td)
+{
+
+	thread_lock(td);
+	curthread->td_flags |= TDF_ASTPENDING;
+	thread_unlock(td);
+	pmc_post_callchain_callback();
+}
+
+/*
  * A mapping change for a process.
  */
@@ -2030,6 +2050,7 @@ const char *pmc_hooknames[] = {
 	"SOFTSAMPLING",
 	"THR-CREATE",
 	"THR-EXIT",
+	"THR-USERRET",
 };
 #endif
@@ -2194,6 +2215,7 @@ pmc_hook_handler(struct thread *td, int function, void *arg)
 		cpu = PCPU_GET(cpuid);
 		pmc_process_samples(cpu, PMC_HR);
 		pmc_process_samples(cpu, PMC_SR);
+		pmc_process_samples(cpu, PMC_UR);
 		break;
 
 	case PMC_FN_MMAP:
@@ -2214,6 +2236,11 @@ pmc_hook_handler(struct thread *td, int function, void *arg)
 		pmc_capture_user_callchain(PCPU_GET(cpuid), PMC_HR,
 		    (struct trapframe *) arg);
+
+		KASSERT(td->td_pinned == 1,
+		    ("[pmc,%d] invalid td_pinned value", __LINE__));
+		sched_unpin();	/* Can migrate safely now. */
+
 		td->td_pflags &= ~TDP_CALLCHAIN;
 		break;
@@ -2223,8 +2250,18 @@ pmc_hook_handler(struct thread *td, int function, void *arg)
 		 */
 		KASSERT(td == curthread, ("[pmc,%d] td != curthread",
 		    __LINE__));
-		pmc_capture_user_callchain(PCPU_GET(cpuid), PMC_SR,
+
+		cpu = PCPU_GET(cpuid);
+		pmc_capture_user_callchain(cpu, PMC_SR,
+		    (struct trapframe *) arg);
+		pmc_capture_user_callchain(cpu, PMC_UR,
 		    (struct trapframe *) arg);
+
+		KASSERT(td->td_pinned == 1,
+		    ("[pmc,%d] invalid td_pinned value", __LINE__));
+
+		sched_unpin();	/* Can migrate safely now. */
+
 		td->td_pflags &= ~TDP_CALLCHAIN;
 		break;
@@ -2245,6 +2282,12 @@ pmc_hook_handler(struct thread *td, int function, void *arg)
 		pmc_process_thread_delete(td);
 		break;
 
+	case PMC_FN_THR_USERRET:
+		KASSERT(td == curthread, ("[pmc,%d] td != curthread",
+		    __LINE__));
+		pmc_process_thread_userret(td);
+		break;
+
 	default:
 #ifdef	HWPMC_DEBUG
 		KASSERT(0, ("[pmc,%d] unknown hook %d\n", __LINE__, function));
@@ -2658,7 +2701,9 @@ pmc_wait_for_pmc_idle(struct pmc *pm)
 	 * Loop (with a forced context switch) till the PMC's runcount
 	 * comes down to zero.
 	 */
+	pmclog_flush(pm->pm_owner);
 	while (counter_u64_fetch(pm->pm_runcount) > 0) {
+		pmclog_flush(pm->pm_owner);
 #ifdef	HWPMC_DEBUG
 		maxloop--;
 		KASSERT(maxloop > 0,
@@ -3264,7 +3309,7 @@ pmc_class_to_classdep(enum pmc_class class)
 	return (NULL);
 }
 
-#ifdef	HWPMC_DEBUG
+#if defined(HWPMC_DEBUG) && defined(KTR)
 static const char *pmc_op_to_name[] = {
 #undef	__PMC_OP
 #define	__PMC_OP(N, D)	#N ,
@@ -3806,7 +3851,22 @@ pmc_syscall_handler(struct thread *td, void *syscall_args)
 		 */
 		if ((pa.pm_flags & ~(PMC_F_DESCENDANTS | PMC_F_LOG_PROCCSW |
-		    PMC_F_LOG_PROCEXIT | PMC_F_CALLCHAIN)) != 0) {
+		    PMC_F_LOG_PROCEXIT | PMC_F_CALLCHAIN |
+		    PMC_F_USERCALLCHAIN)) != 0) {
+			error = EINVAL;
+			break;
+		}
+
+		/* PMC_F_USERCALLCHAIN is only valid with PMC_F_CALLCHAIN */
+		if ((pa.pm_flags & (PMC_F_CALLCHAIN | PMC_F_USERCALLCHAIN)) ==
+		    PMC_F_USERCALLCHAIN) {
+			error = EINVAL;
+			break;
+		}
+
+		/* PMC_F_USERCALLCHAIN is only valid for sampling mode */
+		if (pa.pm_flags & PMC_F_USERCALLCHAIN &&
+		    mode != PMC_MODE_TS && mode != PMC_MODE_SS) {
 			error = EINVAL;
 			break;
 		}
@@ -4533,8 +4593,6 @@ pmc_post_callchain_callback(void)
 }
 
 /*
- * Interrupt processing.
- *
  * Find a free slot in the per-cpu array of samples and capture the
  * current callchain there.  If a sample was successfully added, a bit
  * is set in mask 'pmc_cpumask' denoting that the DO_SAMPLES hook
@@ -4544,8 +4602,8 @@ pmc_post_callchain_callback(void)
  * use any of the locking primitives supplied by the OS.
  */
 
-int
-pmc_process_interrupt(int cpu, int ring, struct pmc *pm, struct trapframe *tf,
+static int
+pmc_add_sample(int cpu, int ring, struct pmc *pm, struct trapframe *tf,
     int inuserspace)
 {
 	int error, callchaindepth;
@@ -4561,7 +4619,11 @@ pmc_add_sample(int cpu, int ring, struct pmc *pm, struct trapframe *tf,
 	psb = pmc_pcpu[cpu]->pc_sb[ring];
 	ps = psb->ps_write;
 
-	if (ps->ps_nsamples) {	/* in use, reader hasn't caught up */
+	if (ps->ps_nsamples == PMC_SAMPLE_INUSE) {
+		counter_u64_add(ps->ps_pmc->pm_runcount, -1);
+		counter_u64_add(pmc_stats.pm_overwrites, 1);
+		ps->ps_nsamples = 0;
+	} else if (ps->ps_nsamples) {	/* in use, reader hasn't caught up */
 		pm->pm_pcpu_state[cpu].pps_stalled = 1;
 		counter_u64_add(pmc_stats.pm_intr_bufferfull, 1);
 		PMCDBG6(SAM,INT,1,"(spc) cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d",
@@ -4573,7 +4635,6 @@ pmc_add_sample(int cpu, int ring, struct pmc *pm, struct trapframe *tf,
 		goto done;
 	}
 
-
 	/* Fill in entry. */
 	PMCDBG6(SAM,INT,1,"cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d",
 	    cpu, pm, (void *) tf, inuserspace,
@@ -4619,7 +4680,11 @@ pmc_add_sample(int cpu, int ring, struct pmc *pm, struct trapframe *tf,
 	}
 
-	ps->ps_nsamples = callchaindepth;	/* mark entry as in use */
+	if (ring == PMC_UR) {
+		ps->ps_nsamples_actual = callchaindepth;	/* mark entry as in use */
+		ps->ps_nsamples = PMC_SAMPLE_INUSE;
+	} else
+		ps->ps_nsamples = callchaindepth;	/* mark entry as in use */
 
 	/* increment write pointer, modulo ring buffer size */
 	ps++;
 	if (ps == psb->ps_fence)
@@ -4636,6 +4701,30 @@ pmc_add_sample(int cpu, int ring, struct pmc *pm, struct trapframe *tf,
 }
 
 /*
+ * Interrupt processing.
+ *
+ * This function is meant to be called from an NMI handler.  It cannot
+ * use any of the locking primitives supplied by the OS.
+ */
+
+int
+pmc_process_interrupt(int cpu, int ring, struct pmc *pm, struct trapframe *tf,
+    int inuserspace)
+{
+	struct thread *td;
+
+	td = curthread;
+	if ((pm->pm_flags & PMC_F_USERCALLCHAIN) &&
+	    td && td->td_proc &&
+	    (td->td_proc->p_flag & P_KPROC) == 0 &&
+	    !inuserspace) {
+		atomic_add_int(&curthread->td_pmcpend, 1);
+		return (pmc_add_sample(cpu, PMC_UR, pm, tf, 0));
+	}
+	return (pmc_add_sample(cpu, ring, pm, tf, inuserspace));
+}
+
+/*
  * Capture a user call chain.  This function will be called from ast()
  * before control returns to userland and before the process gets
  * rescheduled.
@@ -4648,6 +4737,7 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf)
 	struct thread *td;
 	struct pmc_sample *ps, *ps_end;
 	struct pmc_samplebuffer *psb;
+	int nsamples, nrecords, pass;
 #ifdef	INVARIANTS
 	int ncallchains;
 	int nfree;
@@ -4664,6 +4754,11 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf)
 	ncallchains = 0;
 	nfree = 0;
 #endif
+	nrecords = INT_MAX;
+	pass = 0;
+ restart:
+	if (ring == PMC_UR)
+		nrecords = atomic_readandclear_32(&td->td_pmcpend);
 
 	/*
 	 * Iterate through all deferred callchain requests.
@@ -4675,6 +4770,11 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf)
 	ps_end = psb->ps_write;
 	do {
 #ifdef	INVARIANTS
+		if (ps->ps_nsamples == PMC_SAMPLE_FREE) {
+			nfree++;
+			goto next;
+		}
+
 		if ((ps->ps_pmc == NULL) ||
 		    (ps->ps_pmc->pm_state != PMC_STATE_RUNNING))
 			nfree++;
@@ -4697,37 +4797,91 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf)
 		KASSERT(counter_u64_fetch(pm->pm_runcount) > 0,
 		    ("[pmc,%d] runcount %ld", __LINE__,
 		    (unsigned long)counter_u64_fetch(pm->pm_runcount)));
 
+		if (ring == PMC_UR) {
+			nsamples = ps->ps_nsamples_actual;
+			counter_u64_add(pmc_stats.pm_merges, 1);
+		} else
+			nsamples = 0;
+
 		/*
 		 * Retrieve the callchain and mark the sample buffer
 		 * as 'processable' by the timer tick sweep code.
 		 */
-		ps->ps_nsamples = pmc_save_user_callchain(ps->ps_pc,
-		    pmc_callchaindepth, tf);
 
-#ifdef	INVARIANTS
+#ifdef INVARIANTS
 		ncallchains++;
 #endif
+		if (__predict_true(nsamples < pmc_callchaindepth - 1))
+			nsamples += pmc_save_user_callchain(ps->ps_pc + nsamples,
+			    pmc_callchaindepth - nsamples - 1, tf);
+		wmb();
+		ps->ps_nsamples = nsamples;
+		if (nrecords-- == 1)
+			break;
 next:
 		/* increment the pointer, modulo sample ring size */
 		if (++ps == psb->ps_fence)
 			ps = psb->ps_samples;
 	} while (ps != ps_end);
 
+	if (__predict_false(ring == PMC_UR && td->td_pmcpend)) {
+		if (pass == 0) {
+			pass = 1;
+			goto restart;
+		}
+		/* only collect samples for this part once */
+		td->td_pmcpend = 0;
+	}
+
-#ifdef	INVARIANTS
-	KASSERT(ncallchains > 0 || nfree > 0,
-	    ("[pmc,%d] cpu %d didn't find a sample to collect", __LINE__,
-	    cpu));
+#ifdef INVARIANTS
+	if (ring == PMC_HR)
+		KASSERT(ncallchains > 0 || nfree > 0,
+		    ("[pmc,%d] cpu %d didn't find a sample to collect", __LINE__,
+		    cpu));
 #endif
 
-	KASSERT(td->td_pinned == 1,
-	    ("[pmc,%d] invalid td_pinned value", __LINE__));
-	sched_unpin();	/* Can migrate safely now. */
-
 	/* mark CPU as needing processing */
 	DPCPU_SET(pmc_sampled, 1);
 }
+
+static void
+pmc_flush_ring(int cpu, int ring)
+{
+	struct pmc *pm;
+	struct pmc_sample *ps;
+	struct pmc_samplebuffer *psb;
+	int n;
+
+	psb = pmc_pcpu[cpu]->pc_sb[ring];
+
+	for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */
+
+		ps = psb->ps_read;
+		if (ps->ps_nsamples == PMC_SAMPLE_FREE)
+			goto next;
+
+		pm = ps->ps_pmc;
+		counter_u64_add(pm->pm_runcount, -1);
+		ps->ps_nsamples = PMC_SAMPLE_FREE;
+		/* increment read pointer, modulo sample size */
+ next:
+		if (++ps == psb->ps_fence)
+			psb->ps_read = psb->ps_samples;
+		else
+			psb->ps_read = ps;
+	}
+}
+
+void
+pmc_flush_samples(int cpu)
+{
+	int n;
+
+	for (n = 0; n < PMC_NUM_SR; n++)
+		pmc_flush_ring(cpu, n);
+}
+
+
 /*
  * Process saved PC samples.
  */
@@ -4788,22 +4942,19 @@ pmc_process_samples(int cpu, int ring)
 		 * its owner, and if the PC is in user mode, update
 		 * profiling statistics like timer-based profiling
 		 * would have done.
+		 *
+		 * Otherwise, this is either a sampling-mode PMC that
+		 * is attached to a different process than its owner,
+		 * or a system-wide sampling PMC.  Dispatch a log
+		 * entry to the PMC's owner process.
 		 */
 		if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) {
 			if (ps->ps_flags & PMC_CC_F_USERSPACE) {
 				td = FIRST_THREAD_IN_PROC(po->po_owner);
 				addupc_intr(td, ps->ps_pc[0], 1);
 			}
-			goto entrydone;
-		}
-
-		/*
-		 * Otherwise, this is either a sampling mode PMC that
-		 * is attached to a different process than its owner,
-		 * or a system-wide sampling PMC. Dispatch a log
-		 * entry to the PMC's owner process.
-		 */
-		pmclog_process_callchain(pm, ps);
+		} else
+			pmclog_process_callchain(pm, ps);
 
 entrydone:
 		ps->ps_nsamples = 0; /* mark entry as free */
@@ -5282,6 +5433,8 @@ pmc_initialize(void)
 	pmc_stats.pm_buffer_requests = counter_u64_alloc(M_WAITOK);
 	pmc_stats.pm_buffer_requests_failed = counter_u64_alloc(M_WAITOK);
 	pmc_stats.pm_log_sweeps = counter_u64_alloc(M_WAITOK);
+	pmc_stats.pm_merges = counter_u64_alloc(M_WAITOK);
+	pmc_stats.pm_overwrites = counter_u64_alloc(M_WAITOK);
 
 #ifdef	HWPMC_DEBUG
 	/* parse debug flags first */
@@ -5427,6 +5580,24 @@ pmc_initialize(void)
 			    (n * pmc_callchaindepth);
 
 		pmc_pcpu[cpu]->pc_sb[PMC_SR] = sb;
+
+		sb = malloc_domain(sizeof(struct pmc_samplebuffer) +
+		    pmc_nsamples * sizeof(struct pmc_sample), M_PMC, domain,
+		    M_WAITOK|M_ZERO);
+		sb->ps_read = sb->ps_write = sb->ps_samples;
+		sb->ps_fence = sb->ps_samples + pmc_nsamples;
+
+		KASSERT(pmc_pcpu[cpu] != NULL,
+		    ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu));
+
+		sb->ps_callchains = malloc_domain(pmc_callchaindepth *
+		    pmc_nsamples * sizeof(uintptr_t), M_PMC, domain,
+		    M_WAITOK|M_ZERO);
+
+		for (n = 0, ps = sb->ps_samples; n < pmc_nsamples; n++, ps++)
+			ps->ps_pc = sb->ps_callchains +
+			    (n * pmc_callchaindepth);
+
+		pmc_pcpu[cpu]->pc_sb[PMC_UR] = sb;
 	}
 
 	/* allocate space for the row disposition array */
@@ -5637,10 +5808,15 @@ pmc_cleanup(void)
 		KASSERT(pmc_pcpu[cpu]->pc_sb[PMC_SR] != NULL,
 		    ("[pmc,%d] Null sw cpu sample buffer cpu=%d", __LINE__,
 		    cpu));
+		KASSERT(pmc_pcpu[cpu]->pc_sb[PMC_UR] != NULL,
+		    ("[pmc,%d] Null userret cpu sample buffer cpu=%d", __LINE__,
+		    cpu));
 		free_domain(pmc_pcpu[cpu]->pc_sb[PMC_HR]->ps_callchains, M_PMC);
 		free_domain(pmc_pcpu[cpu]->pc_sb[PMC_HR], M_PMC);
 		free_domain(pmc_pcpu[cpu]->pc_sb[PMC_SR]->ps_callchains, M_PMC);
 		free_domain(pmc_pcpu[cpu]->pc_sb[PMC_SR], M_PMC);
+		free_domain(pmc_pcpu[cpu]->pc_sb[PMC_UR]->ps_callchains, M_PMC);
+		free_domain(pmc_pcpu[cpu]->pc_sb[PMC_UR], M_PMC);
 		free_domain(pmc_pcpu[cpu], M_PMC);
 	}
@@ -5669,6 +5845,8 @@ pmc_cleanup(void)
 	counter_u64_free(pmc_stats.pm_buffer_requests);
 	counter_u64_free(pmc_stats.pm_buffer_requests_failed);
 	counter_u64_free(pmc_stats.pm_log_sweeps);
+	counter_u64_free(pmc_stats.pm_merges);
+	counter_u64_free(pmc_stats.pm_overwrites);
 
 	sx_xunlock(&pmc_sx);	/* we are done */
 }
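A note on the concurrency pattern above: for the new PMC_UR ring,
pmc_add_sample() stores the kernel frames and marks the slot
PMC_SAMPLE_INUSE; later, at userret, pmc_capture_user_callchain() appends
the user frames behind the kernel ones, issues wmb(), and only then stores
the final count into ps_nsamples, so the hardclock sweep in
pmc_process_samples() never observes a half-merged chain. The following is
a self-contained C11 analogue of that publication protocol; all names are
hypothetical and this is a sketch, not hwpmc code:

/*
 * C11 analogue of the wmb()/ps_nsamples publication in the diff above.
 * All names are hypothetical; this is a sketch, not hwpmc code.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

#define MAXDEPTH	32
#define SAMPLE_FREE	0		/* slot empty, like PMC_SAMPLE_FREE */
#define SAMPLE_INUSE	(MAXDEPTH + 1)	/* being merged, like PMC_SAMPLE_INUSE */

struct sample {
	uintptr_t pc[MAXDEPTH];		/* kernel frames, then user frames */
	_Atomic int nsamples;		/* FREE, INUSE, or the final depth */
};

/* Producer, step 1 (interrupt time): record kernel frames, mark INUSE. */
static void
record_kernel(struct sample *ps, const uintptr_t *kf, int nk)
{
	memcpy(ps->pc, kf, nk * sizeof(uintptr_t));
	atomic_store_explicit(&ps->nsamples, SAMPLE_INUSE,
	    memory_order_release);
}

/*
 * Producer, step 2 (userret time): append user frames after the kernel
 * ones, then publish the final count.  The release store plays the role
 * of the wmb(); ps->ps_nsamples = nsamples; pair in
 * pmc_capture_user_callchain().
 */
static void
merge_user(struct sample *ps, int nk, const uintptr_t *uf, int nu)
{
	if (nk + nu > MAXDEPTH)
		nu = MAXDEPTH - nk;
	memcpy(ps->pc + nk, uf, nu * sizeof(uintptr_t));
	atomic_store_explicit(&ps->nsamples, nk + nu, memory_order_release);
}

/*
 * Consumer (timer sweep): a count that is neither FREE nor INUSE
 * guarantees, via acquire/release pairing, that every frame store made
 * before publication is visible.
 */
static int
sweep(struct sample *ps, uintptr_t *out)
{
	int n;

	n = atomic_load_explicit(&ps->nsamples, memory_order_acquire);
	if (n == SAMPLE_FREE || n == SAMPLE_INUSE)
		return (0);
	memcpy(out, ps->pc, n * sizeof(uintptr_t));
	atomic_store_explicit(&ps->nsamples, SAMPLE_FREE,
	    memory_order_release);	/* hand the slot back */
	return (n);
}

int
main(void)
{
	struct sample s = { .nsamples = SAMPLE_FREE };
	uintptr_t kf[2] = { 0x1, 0x2 }, uf[2] = { 0x3, 0x4 }, out[MAXDEPTH];

	record_kernel(&s, kf, 2);
	assert(sweep(&s, out) == 0);	/* INUSE: the sweep must skip it */
	merge_user(&s, 2, uf, 2);
	assert(sweep(&s, out) == 4);	/* merged chain is now visible */
	return (0);
}

The kernel uses wmb() plus a plain store rather than C11 atomics, and it
also adjusts the per-PMC pm_runcount when slots are claimed or released,
which this sketch omits.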