path: root/sys/dev/hwpmc/hwpmc_mod.c
author:    Matt Macy <mmacy@FreeBSD.org>  2018-10-05 05:55:56 +0000
committer: Matt Macy <mmacy@FreeBSD.org>  2018-10-05 05:55:56 +0000
commit:    d9f1b8dbf29d1980a0f80c5c78b1b5867b5bb8d2 (patch)
tree:      4c559b1a002faddf9bd8b917cbbdf912445e800d /sys/dev/hwpmc/hwpmc_mod.c
parent:    e8bb589d5603fe7761550bf885703f3f9865af56 (diff)
hwpmc: Refactor sample ring buffer handling to fix races
Refactor sample ring buffer handling to make it more robust to long-running
callchain collection.

r338112 introduced a (now fixed) regression that exposed a number of race
conditions within the management of the sample buffers. This change simplifies
the handling and moves the decision to overwrite a callchain sample that has
taken too long out of the NMI and into the hardclock handler. With this change
the problem no longer shows up as ring corruption but as the code spending all
of its time in callchain collection.

- Makes the producer / consumer index incrementing monotonic, making it
  easier (for me at least) to reason about.
- Moves the decision to overwrite a sample from NMI context to interrupt
  context, where we can enforce serialization.
- Puts a time limit on waiting to collect a user callchain, bounding the
  head-of-line blocking that causes samples to be dropped.
- Removes the flush routine, which was previously needed to purge dangling
  references to the pmc from the sample buffers but is now only a source of
  a race condition on unload.

Previously one could lock up or crash HEAD by running
"pmcstat -S inst_retired.any_p -T" and then hitting ^C.
After this change it is no longer possible.

PR:                     231793
Reviewed by:            markj@
Approved by:            re (gjb@)
Differential Revision:  https://reviews.freebsd.org/D17011
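[Editorial note] The first bullet above is the key structural change: the ring
buffer switches from wrapping read/write pointers to producer/consumer indices
that only ever increase, with the slot obtained by masking the index against a
power-of-two size (pmc_sample_mask in the diff). The following minimal
user-space sketch illustrates the scheme; the names (ring, ring_produce,
ring_consume, RING_NSLOTS) are hypothetical and this is not the hwpmc code
itself, just a model of the invariant, assuming a power-of-two capacity.

    #include <stdbool.h>
    #include <stdint.h>

    #define RING_NSLOTS 1024                /* must be a power of two */
    #define RING_MASK   (RING_NSLOTS - 1)

    struct ring {
            uint64_t prodidx;               /* monotonically increasing write index */
            uint64_t considx;               /* monotonically increasing read index */
            int      slots[RING_NSLOTS];
    };

    /* Producer side: refuses to overwrite when the reader has not caught up. */
    static bool
    ring_produce(struct ring *r, int value)
    {
            if (r->prodidx - r->considx == RING_NSLOTS)
                    return (false);         /* full */
            r->slots[r->prodidx & RING_MASK] = value;
            r->prodidx++;
            return (true);
    }

    /* Consumer side: drains everything published so far. */
    static bool
    ring_consume(struct ring *r, int *value)
    {
            if (r->considx == r->prodidx)
                    return (false);         /* empty */
            *value = r->slots[r->considx & RING_MASK];
            r->considx++;
            return (true);
    }

Because both indices only ever increase, "empty" (considx == prodidx) and
"full" (prodidx - considx == capacity) are unambiguous, and the occupancy is
simply prodidx - considx; this is the invariant the diff asserts with
MPASS(delta <= pmc_nsamples) in pmc_process_samples().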
Notes:
    svn path=/head/; revision=339188
Diffstat (limited to 'sys/dev/hwpmc/hwpmc_mod.c')
 -rw-r--r--  sys/dev/hwpmc/hwpmc_mod.c | 220
 1 file changed, 101 insertions(+), 119 deletions(-)
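[Editorial note] The "time limit on waiting to collect a user callchain" shows
up in the diff as the (ticks - ps->ps_ticks) > (hz >> 4) check in
pmc_process_samples(); the accompanying comment calls this "more than 65 ms",
which is approximate. The small standalone program below (hypothetical,
user-space only; hz here is a plain variable, not the kernel global) shows how
that cutoff works out to roughly 1/16 of a second for common tick rates.

    #include <stdio.h>

    int
    main(void)
    {
            int hz_values[] = { 100, 250, 1000 };

            for (size_t i = 0; i < sizeof(hz_values) / sizeof(hz_values[0]); i++) {
                    int hz = hz_values[i];
                    int cutoff_ticks = hz >> 4;     /* same expression as the diff */
                    double cutoff_ms = 1000.0 * cutoff_ticks / hz;

                    printf("hz=%4d -> drop pending callchains older than %d ticks (~%.0f ms)\n",
                        hz, cutoff_ticks, cutoff_ms);
            }
            return (0);
    }

With hz = 1000 this prints a cutoff of 62 ticks (about 62 ms), consistent with
the "65 ms" figure mentioned in the code comment.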
diff --git a/sys/dev/hwpmc/hwpmc_mod.c b/sys/dev/hwpmc/hwpmc_mod.c
index 20df37d24bd6..1fad25c55702 100644
--- a/sys/dev/hwpmc/hwpmc_mod.c
+++ b/sys/dev/hwpmc/hwpmc_mod.c
@@ -211,7 +211,7 @@ static int pmc_debugflags_parse(char *newstr, char *fence);
#endif
static int load(struct module *module, int cmd, void *arg);
-static int pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf);
+static int pmc_add_sample(ring_type_t ring, struct pmc *pm, struct trapframe *tf);
static void pmc_add_thread_descriptors_from_proc(struct proc *p,
struct pmc_process *pp);
static int pmc_attach_process(struct proc *p, struct pmc *pm);
@@ -249,7 +249,7 @@ static void pmc_process_csw_out(struct thread *td);
static void pmc_process_exit(void *arg, struct proc *p);
static void pmc_process_fork(void *arg, struct proc *p1,
struct proc *p2, int n);
-static void pmc_process_samples(int cpu, int soft);
+static void pmc_process_samples(int cpu, ring_type_t soft);
static void pmc_release_pmc_descriptor(struct pmc *pmc);
static void pmc_process_thread_add(struct thread *td);
static void pmc_process_thread_delete(struct thread *td);
@@ -342,6 +342,7 @@ static int pmc_nsamples = PMC_NSAMPLES;
SYSCTL_INT(_kern_hwpmc, OID_AUTO, nsamples, CTLFLAG_RDTUN,
&pmc_nsamples, 0, "number of PC samples per CPU");
+static uint64_t pmc_sample_mask = PMC_NSAMPLES-1;
/*
* kern.hwpmc.mtxpoolsize -- number of mutexes in the mutex pool.
@@ -1402,6 +1403,10 @@ pmc_process_csw_in(struct thread *td)
if (pm->pm_state != PMC_STATE_RUNNING)
continue;
+ KASSERT(counter_u64_fetch(pm->pm_runcount) >= 0,
+ ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm,
+ (unsigned long)counter_u64_fetch(pm->pm_runcount)));
+
/* increment PMC runcount */
counter_u64_add(pm->pm_runcount, 1);
@@ -1596,6 +1601,10 @@ pmc_process_csw_out(struct thread *td)
if (pm->pm_pcpu_state[cpu].pps_stalled == 0)
pcd->pcd_stop_pmc(cpu, adjri);
+ KASSERT(counter_u64_fetch(pm->pm_runcount) > 0,
+ ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm,
+ (unsigned long)counter_u64_fetch(pm->pm_runcount)));
+
/* reduce this PMC's runcount */
counter_u64_add(pm->pm_runcount, -1);
@@ -2725,7 +2734,7 @@ pmc_destroy_pmc_descriptor(struct pmc *pm)
static void
pmc_wait_for_pmc_idle(struct pmc *pm)
{
-#ifdef HWPMC_DEBUG
+#ifdef INVARIANTS
volatile int maxloop;
maxloop = 100 * pmc_cpu_max();
@@ -2737,7 +2746,7 @@ pmc_wait_for_pmc_idle(struct pmc *pm)
pmclog_flush(pm->pm_owner, 1);
while (counter_u64_fetch(pm->pm_runcount) > 0) {
pmclog_flush(pm->pm_owner, 1);
-#ifdef HWPMC_DEBUG
+#ifdef INVARIANTS
maxloop--;
KASSERT(maxloop > 0,
("[pmc,%d] (ri%d, rc%ld) waiting too long for "
@@ -4657,7 +4666,7 @@ pmc_post_callchain_callback(void)
*/
static int
-pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf)
+pmc_add_sample(ring_type_t ring, struct pmc *pm, struct trapframe *tf)
{
int error, cpu, callchaindepth, inuserspace;
struct thread *td;
@@ -4672,18 +4681,15 @@ pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf)
cpu = curcpu;
psb = pmc_pcpu[cpu]->pc_sb[ring];
inuserspace = TRAPF_USERMODE(tf);
- ps = psb->ps_write;
- if (ps->ps_nsamples == PMC_SAMPLE_INUSE) {
- counter_u64_add(ps->ps_pmc->pm_runcount, -1);
- counter_u64_add(pmc_stats.pm_overwrites, 1);
- ps->ps_nsamples = 0;
- } else if (ps->ps_nsamples) { /* in use, reader hasn't caught up */
+ ps = PMC_PROD_SAMPLE(psb);
+ if (psb->ps_considx != psb->ps_prodidx &&
+ ps->ps_nsamples) { /* in use, reader hasn't caught up */
pm->pm_pcpu_state[cpu].pps_stalled = 1;
counter_u64_add(pmc_stats.pm_intr_bufferfull, 1);
PMCDBG6(SAM,INT,1,"(spc) cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d",
cpu, pm, (void *) tf, inuserspace,
- (int) (psb->ps_write - psb->ps_samples),
- (int) (psb->ps_read - psb->ps_samples));
+ (int) (psb->ps_prodidx & pmc_sample_mask),
+ (int) (psb->ps_considx & pmc_sample_mask));
callchaindepth = 1;
error = ENOMEM;
goto done;
@@ -4692,14 +4698,8 @@ pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf)
/* Fill in entry. */
PMCDBG6(SAM,INT,1,"cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm,
(void *) tf, inuserspace,
- (int) (psb->ps_write - psb->ps_samples),
- (int) (psb->ps_read - psb->ps_samples));
-
- KASSERT(counter_u64_fetch(pm->pm_runcount) >= 0,
- ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm,
- (unsigned long)counter_u64_fetch(pm->pm_runcount)));
-
- counter_u64_add(pm->pm_runcount, 1); /* hold onto PMC */
+ (int) (psb->ps_prodidx & pmc_sample_mask),
+ (int) (psb->ps_considx & pmc_sample_mask));
td = curthread;
ps->ps_pmc = pm;
@@ -4707,13 +4707,14 @@ pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf)
ps->ps_pid = td->td_proc->p_pid;
ps->ps_tid = td->td_tid;
ps->ps_tsc = pmc_rdtsc();
-
+ ps->ps_ticks = ticks;
ps->ps_cpu = cpu;
ps->ps_flags = inuserspace ? PMC_CC_F_USERSPACE : 0;
callchaindepth = (pm->pm_flags & PMC_F_CALLCHAIN) ?
pmc_callchaindepth : 1;
+ MPASS(ps->ps_pc != NULL);
if (callchaindepth == 1)
ps->ps_pc[0] = PMC_TRAPFRAME_TO_PC(tf);
else {
@@ -4727,26 +4728,27 @@ pmc_add_sample(int ring, struct pmc *pm, struct trapframe *tf)
callchaindepth, tf);
} else {
pmc_post_callchain_callback();
- callchaindepth = PMC_SAMPLE_INUSE;
+ callchaindepth = PMC_USER_CALLCHAIN_PENDING;
}
}
ps->ps_nsamples = callchaindepth; /* mark entry as in use */
if (ring == PMC_UR) {
ps->ps_nsamples_actual = callchaindepth; /* mark entry as in use */
- ps->ps_nsamples = PMC_SAMPLE_INUSE;
+ ps->ps_nsamples = PMC_USER_CALLCHAIN_PENDING;
} else
ps->ps_nsamples = callchaindepth; /* mark entry as in use */
- /* increment write pointer, modulo ring buffer size */
- ps++;
- if (ps == psb->ps_fence)
- psb->ps_write = psb->ps_samples;
- else
- psb->ps_write = ps;
+ KASSERT(counter_u64_fetch(pm->pm_runcount) >= 0,
+ ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm,
+ (unsigned long)counter_u64_fetch(pm->pm_runcount)));
+
+ counter_u64_add(pm->pm_runcount, 1); /* hold onto PMC */
+ /* increment write pointer */
+ psb->ps_prodidx++;
done:
/* mark CPU as needing processing */
- if (callchaindepth != PMC_SAMPLE_INUSE)
+ if (callchaindepth != PMC_USER_CALLCHAIN_PENDING)
DPCPU_SET(pmc_sampled, 1);
return (error);
@@ -4785,14 +4787,15 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf)
{
struct pmc *pm;
struct thread *td;
- struct pmc_sample *ps, *ps_end;
+ struct pmc_sample *ps;
struct pmc_samplebuffer *psb;
- int nsamples, nrecords, pass;
+ uint64_t considx, prodidx;
+ int nsamples, nrecords, pass, iter;
#ifdef INVARIANTS
int ncallchains;
int nfree;
+ int start_ticks = ticks;
#endif
-
psb = pmc_pcpu[cpu]->pc_sb[ring];
td = curthread;
@@ -4810,29 +4813,30 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf)
if (ring == PMC_UR)
nrecords = atomic_readandclear_32(&td->td_pmcpend);
+ for (iter = 0, considx = psb->ps_considx, prodidx = psb->ps_prodidx;
+ considx < prodidx && iter < pmc_nsamples; considx++, iter++) {
+ ps = PMC_CONS_SAMPLE_OFF(psb, considx);
+
/*
* Iterate through all deferred callchain requests.
* Walk from the current read pointer to the current
* write pointer.
*/
- ps = psb->ps_read;
- ps_end = psb->ps_write;
- do {
#ifdef INVARIANTS
if (ps->ps_nsamples == PMC_SAMPLE_FREE) {
nfree++;
- goto next;
+ continue;
}
if ((ps->ps_pmc == NULL) ||
(ps->ps_pmc->pm_state != PMC_STATE_RUNNING))
nfree++;
#endif
- if (ps->ps_nsamples != PMC_SAMPLE_INUSE)
- goto next;
- if (ps->ps_td != td)
- goto next;
+ if (ps->ps_td != td ||
+ ps->ps_nsamples == PMC_USER_CALLCHAIN_PENDING ||
+ ps->ps_pmc->pm_state != PMC_STATE_RUNNING)
+ continue;
KASSERT(ps->ps_cpu == cpu,
("[pmc,%d] cpu mismatch ps_cpu=%d pcpu=%d", __LINE__,
@@ -4865,15 +4869,28 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf)
if (__predict_true(nsamples < pmc_callchaindepth - 1))
nsamples += pmc_save_user_callchain(ps->ps_pc + nsamples,
pmc_callchaindepth - nsamples - 1, tf);
- wmb();
- ps->ps_nsamples = nsamples;
+
+ /*
+ * We have to prevent hardclock from potentially overwriting
+ * this sample between when we read the value and when we set
+ * it
+ */
+ spinlock_enter();
+ /*
+ * Verify that the sample hasn't been dropped in the meantime
+ */
+ if (ps->ps_nsamples == PMC_USER_CALLCHAIN_PENDING) {
+ ps->ps_nsamples = nsamples;
+ /*
+ * If we couldn't get a sample, simply drop the reference
+ */
+ if (nsamples == 0)
+ counter_u64_add(pm->pm_runcount, -1);
+ }
+ spinlock_exit();
if (nrecords-- == 1)
break;
-next:
- /* increment the pointer, modulo sample ring size */
- if (++ps == psb->ps_fence)
- ps = psb->ps_samples;
- } while (ps != ps_end);
+ }
if (__predict_false(ring == PMC_UR && td->td_pmcpend)) {
if (pass == 0) {
pass = 1;
@@ -4884,60 +4901,20 @@ next:
}
#ifdef INVARIANTS
- if (ring == PMC_HR)
- KASSERT(ncallchains > 0 || nfree > 0,
- ("[pmc,%d] cpu %d didn't find a sample to collect", __LINE__,
- cpu));
+ if ((ticks - start_ticks) > hz)
+ log(LOG_ERR, "%s took %d ticks\n", __func__, (ticks - start_ticks));
#endif
/* mark CPU as needing processing */
DPCPU_SET(pmc_sampled, 1);
}
-
-static void
-pmc_flush_ring(int cpu, int ring)
-{
- struct pmc *pm;
- struct pmc_sample *ps;
- struct pmc_samplebuffer *psb;
- int n;
-
- psb = pmc_pcpu[cpu]->pc_sb[ring];
-
- for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */
-
- ps = psb->ps_read;
- if (ps->ps_nsamples == PMC_SAMPLE_FREE)
- goto next;
- pm = ps->ps_pmc;
- counter_u64_add(pm->pm_runcount, -1);
- ps->ps_nsamples = PMC_SAMPLE_FREE;
- /* increment read pointer, modulo sample size */
- next:
- if (++ps == psb->ps_fence)
- psb->ps_read = psb->ps_samples;
- else
- psb->ps_read = ps;
- }
-}
-
-void
-pmc_flush_samples(int cpu)
-{
- int n;
-
- for (n = 0; n < PMC_NUM_SR; n++)
- pmc_flush_ring(cpu, n);
-}
-
-
/*
* Process saved PC samples.
*/
static void
-pmc_process_samples(int cpu, int ring)
+pmc_process_samples(int cpu, ring_type_t ring)
{
struct pmc *pm;
int adjri, n;
@@ -4946,20 +4923,25 @@ pmc_process_samples(int cpu, int ring)
struct pmc_sample *ps;
struct pmc_classdep *pcd;
struct pmc_samplebuffer *psb;
+ uint64_t delta;
KASSERT(PCPU_GET(cpuid) == cpu,
("[pmc,%d] not on the correct CPU pcpu=%d cpu=%d", __LINE__,
PCPU_GET(cpuid), cpu));
psb = pmc_pcpu[cpu]->pc_sb[ring];
+ delta = psb->ps_prodidx - psb->ps_considx;
+ MPASS(delta <= pmc_nsamples);
+ MPASS(psb->ps_considx <= psb->ps_prodidx);
+ for (n = 0; psb->ps_considx < psb->ps_prodidx; psb->ps_considx++, n++) {
+ ps = PMC_CONS_SAMPLE(psb);
- for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */
-
- ps = psb->ps_read;
- if (ps->ps_nsamples == PMC_SAMPLE_FREE)
- break;
-
+ if (__predict_false(ps->ps_nsamples == PMC_SAMPLE_FREE))
+ continue;
pm = ps->ps_pmc;
+ /* skip non-running samples */
+ if (pm->pm_state != PMC_STATE_RUNNING)
+ goto entrydone;
KASSERT(counter_u64_fetch(pm->pm_runcount) > 0,
("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm,
@@ -4971,12 +4953,19 @@ pmc_process_samples(int cpu, int ring)
("[pmc,%d] pmc=%p non-sampling mode=%d", __LINE__,
pm, PMC_TO_MODE(pm)));
- /* Ignore PMCs that have been switched off */
- if (pm->pm_state != PMC_STATE_RUNNING)
- goto entrydone;
/* If there is a pending AST wait for completion */
- if (ps->ps_nsamples == PMC_SAMPLE_INUSE) {
+ if (ps->ps_nsamples == PMC_USER_CALLCHAIN_PENDING) {
+ /* if sample is more than 65 ms old, drop it */
+ if (ticks - ps->ps_ticks > (hz >> 4)) {
+ /*
+ * track how often we hit this as it will
+ * preferentially lose user samples
+ * for long running system calls
+ */
+ counter_u64_add(pmc_stats.pm_overwrites, 1);
+ goto entrydone;
+ }
/* Need a rescan at a later time. */
DPCPU_SET(pmc_sampled, 1);
break;
@@ -4984,8 +4973,8 @@ pmc_process_samples(int cpu, int ring)
PMCDBG6(SAM,OPS,1,"cpu=%d pm=%p n=%d fl=%x wr=%d rd=%d", cpu,
pm, ps->ps_nsamples, ps->ps_flags,
- (int) (psb->ps_write - psb->ps_samples),
- (int) (psb->ps_read - psb->ps_samples));
+ (int) (psb->ps_prodidx & pmc_sample_mask),
+ (int) (psb->ps_considx & pmc_sample_mask));
/*
* If this is a process-mode PMC that is attached to
@@ -5008,13 +4997,11 @@ pmc_process_samples(int cpu, int ring)
entrydone:
ps->ps_nsamples = 0; /* mark entry as free */
- counter_u64_add(pm->pm_runcount, -1);
+ KASSERT(counter_u64_fetch(pm->pm_runcount) > 0,
+ ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm,
+ (unsigned long)counter_u64_fetch(pm->pm_runcount)));
- /* increment read pointer, modulo sample size */
- if (++ps == psb->ps_fence)
- psb->ps_read = psb->ps_samples;
- else
- psb->ps_read = ps;
+ counter_u64_add(pm->pm_runcount, -1);
}
counter_u64_add(pmc_stats.pm_log_sweeps, 1);
@@ -5197,11 +5184,11 @@ pmc_process_exit(void *arg __unused, struct proc *p)
}
}
- counter_u64_add(pm->pm_runcount, -1);
-
- KASSERT((int) counter_u64_fetch(pm->pm_runcount) >= 0,
+ KASSERT((int64_t) counter_u64_fetch(pm->pm_runcount) > 0,
("[pmc,%d] runcount is %d", __LINE__, ri));
+ counter_u64_add(pm->pm_runcount, -1);
+
(void) pcd->pcd_config_pmc(cpu, adjri, NULL);
}
@@ -5583,6 +5570,7 @@ pmc_initialize(void)
"range.\n", pmc_nsamples);
pmc_nsamples = PMC_NSAMPLES;
}
+ pmc_sample_mask = pmc_nsamples-1;
if (pmc_callchaindepth <= 0 ||
pmc_callchaindepth > PMC_CALLCHAIN_DEPTH_MAX) {
@@ -5658,8 +5646,6 @@ pmc_initialize(void)
sb = malloc_domain(sizeof(struct pmc_samplebuffer) +
pmc_nsamples * sizeof(struct pmc_sample), M_PMC, domain,
M_WAITOK|M_ZERO);
- sb->ps_read = sb->ps_write = sb->ps_samples;
- sb->ps_fence = sb->ps_samples + pmc_nsamples;
KASSERT(pmc_pcpu[cpu] != NULL,
("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu));
@@ -5676,8 +5662,6 @@ pmc_initialize(void)
sb = malloc_domain(sizeof(struct pmc_samplebuffer) +
pmc_nsamples * sizeof(struct pmc_sample), M_PMC, domain,
M_WAITOK|M_ZERO);
- sb->ps_read = sb->ps_write = sb->ps_samples;
- sb->ps_fence = sb->ps_samples + pmc_nsamples;
KASSERT(pmc_pcpu[cpu] != NULL,
("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu));
@@ -5694,8 +5678,6 @@ pmc_initialize(void)
sb = malloc_domain(sizeof(struct pmc_samplebuffer) +
pmc_nsamples * sizeof(struct pmc_sample), M_PMC, domain,
M_WAITOK|M_ZERO);
- sb->ps_read = sb->ps_write = sb->ps_samples;
- sb->ps_fence = sb->ps_samples + pmc_nsamples;
KASSERT(pmc_pcpu[cpu] != NULL,
("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu));